I tried to scrape thousands of pages, so I used async.timesSeries and async.waterfall. Each of the functions works very well on its own, but they don't work together. What can I do?
The logic is simple.
Because the pages I want to scrape are "http://udb.kr/local/category/390101?page=" followed by 1~1167, async.timesSeries loops from 1 to 1167.
async.waterfall scrapes the components of each page,
but messages that console shows me looks like this
info.NM values // just to explain: it shows each attribute of obj, because I inserted console.log(info.NM) for verification.
info.NM values
info.NM values
info.NM values and randomly ----- page number -----
...
['done',
'done',
'done',
'done',
'done',
...
'done']
info.NM values again
.../Users/Snark/Dev/job_apply/cheerio_job_app_list.js:29
if (tObj[m+1].children != 0) {info.nAddr = tObj[m+1].firstChild.data}else{info.nAddr = null};
^
TypeError: Cannot read property 'children' of undefined
at /Users/Snark/Dev/job_apply/cheerio_job_app_list.js:29:17
at fn (/Users/Snark/node_modules/async/lib/async.js:746:34)
at /Users/Snark/node_modules/async/lib/async.js:1212:16
at /Users/Snark/node_modules/async/lib/async.js:166:37
at /Users/Snark/node_modules/async/lib/async.js:706:43
at /Users/Snark/node_modules/async/lib/async.js:167:37
at /Users/Snark/node_modules/async/lib/async.js:1208:30
at Request._callback (/Users/Snark/Dev/job_apply/cheerio_job_app_list.js:21:6)
at Request.self.callback (/Users/Snark/node_modules/request/request.js:198:22)
at emitTwo (events.js:87:13)
And this is js code.
var request = require("request"),
cheerio = require("cheerio"),
jsonfile = require("jsonfile"),
fs = require("fs"),
async = require("async");
// Scraper script: requests url + page for 1166 consecutive page numbers via
// async.timesSeries, pulls the <td> cells out of each page and appends the
// collected rows to ./jobDB_l.json.
// NOTE: `info` is one shared object; the loop below mutates it and pushes the
// same reference into dbArray on every pass.
var info = {},
dbArray = [];
var url = "http://udb.kr/local/category/390101?page=";
async.timesSeries(1166, function(n, next) {
// n starts at 0, so the requested page numbers start at 1.
var page = n + 1
async.waterfall([
// Step 1: download the page and select every cell matching 'tbody tr td'.
function(callback) {
request(url + page, function(error, response, html) {
// NOTE: throwing inside this asynchronous callback does not reach
// async.waterfall; the error is not passed to `callback`.
if (error) {
throw error
};
var $ = cheerio.load(html),
tObj = $('tbody tr td');
callback(null, tObj);
});
},
// Step 2: walk the cells five at a time, reading cell m into info.NM and
// cell m + 1 into info.nAddr.
function(tObj, callback) {
// The upper bound 150 is hard-coded; it is not derived from tObj.length.
for (var m = 0; m < 150; m = m + 5) {
if (tObj[m]) {
info.NM = tObj[m].firstChild.children[0].data
} else {
info.NM = null
};
// NOTE: tObj[m + 1] is dereferenced without an existence check; this is the
// statement named in the "Cannot read property 'children' of undefined" trace.
if (tObj[m + 1].children != 0) {
info.nAddr = tObj[m + 1].firstChild.data
} else {
info.nAddr = null
};
console.log(info.NM);
dbArray.push(info);
}
// NOTE: dbArray is passed in the first (error) position of the waterfall
// callback, and the callback itself is passed as the value.
callback(dbArray, callback);
},
// Step 3: append the accumulated array to the JSON file.
function(dbArray, callback) {
fs.appendFile('./jobDB_l.json', JSON.stringify(dbArray), function (err) {
if (err)
throw err;
});
// NOTE: called right after appendFile is started, not from its completion callback.
callback(null, 'done');
}
], function(err, result) {
console.log('----- ' +page+ '-----');
});
// NOTE: next() runs immediately after the waterfall is started, not from the
// waterfall's completion callback, so timesSeries does not wait for the page.
next(null, 'done');
}, function(err, result) {
// Logs the array of per-iteration results collected by timesSeries.
console.log(result)
});
To get these to work together where you are using waterfall inside of each timesSeries iteration, you need to call the timesSeries done callback from the completion callback for the waterfall call. Right now, you are calling it long before that which means that timesSeries won't wait for the waterfall to be done.
You can do that by changing this:
], function(err, result) {
console.log('----- ' +page+ '-----');
});
next(null, 'done');
to this:
], function(err, result) {
console.log('----- ' +page+ '-----');
next(null, 'done');
});
It also seems odd that you have a hard-coded for loop limit of m < 150 rather than using the actual length of the content. You can easily run off the end of the content and potentially cause problems.
And, your error handling probably won't work well either. If you throw inside of the async request() callback, that's not going to go anywhere. You need much better error handling such as calling callback(error) to pass the error on to async.waterfall().
You also may want to surround all your DOM walking in a try/catch so if you throw any exceptions there, you can catch them yourself, analyze them and then fix the code.
if (tObj[m+1] && tObj[m+1].children != 0)
Related
I am trying to build a result_arr of location objects to send as a response, but I am not sure how to send the response only when the entire array has been built. The response contains an empty array, but result_arr array is filled after the response has already been sent.
/**
 * Collects location objects for the active employees of a contractor.
 *
 * Reads req.body["contractor_id"], loads the contractor's active employees,
 * then each employee's locations, then each location record, pushing a
 * {display_name, location_id} object into result_arr for every record.
 *
 * NOTE: done(result_arr) is invoked inside every innermost callback, i.e. once
 * per location record, not once after all lookups have finished.
 *
 * @param req  request; only req.body["contractor_id"] is read here
 * @param res  response; res.json is used for the error replies
 * @param done callback; called with null when there are no employees and with
 *             result_arr after each location lookup
 */
function handle_getLocations(req, res, done){
var con_id = req.body["contractor_id"];
console.log("Contractor ID :" + con_id.toString());
var result_arr = new Array();
employee.getActiveByContractor(con_id, function(err, employees){
if (err) {
console.log("Logging error in json:\n");
res.json({"code" : 100, "status" : "Error in connection database"});
return;
};
// No return here: execution continues, but the loop below has nothing to iterate.
if(employees.length === 0) done(null);
// Starts one asynchronous lookup per employee; the loop itself does not wait for them.
for(var i=0;i<employees.length;i++){
assignment.getLocationsByEmployeeID(employees[i].employee_id, function(err, locations){
if (err) {
console.log("Logging error in json:\n");
res.json({"code" : 100, "status" : "Error in connection database"});
return;
};
console.log("Number of locations: " + locations.length.toString());
// Starts one asynchronous lookup per location of this employee.
for(var j=0;j<locations.length;j++){
console.log("Assignment is: " + locations[j].assignment_id.toString());
location.getAllByID(locations[j].location_id, function(err, loc){
if (err) {
console.log("Logging error in json:\n");
res.json({"code" : 100, "status" : "Error in connection database"});
return;
};
// Only the first returned row (loc[0]) is used.
var loc_obj = {};
loc_obj.display_name = loc[0].display_name;
loc_obj.location_id = loc[0].location_id;
console.log("Location is: " + loc_obj.display_name);
console.log("Location ID is: " + loc_obj.location_id.toString());
result_arr.push(loc_obj);
console.log(result_arr);
done(result_arr);
});
};
});
};
});
};
I know that in nodejs the idea is to not make blocking calls, but I am not sure how to make sure all of the information is sent in the response.
You are calling many asynchronous functions in the loop, but you do not have any logic to check when all of them have completed so that the response can be sent back to the client.
I modified your code a bit to add that logic in plain (vanilla) JS; the code below is very messy, but it works.
Anyway, I would suggest using promise-based/asynchronous modules
like async, bluebird etc. to handle this nicely. Using them, you
can improve the readability and maintainability of your code and get
rid of callback hell and other disadvantages.
async http://caolan.github.io/async/
bluebird https://github.com/petkaantonov/bluebird
You can read more about this on the below link,
https://strongloop.com/strongblog/node-js-callback-hell-promises-generators/
/**
 * Responds with every location assigned to a contractor's active employees.
 *
 * Loads the active employees for req.body["contractor_id"], then each
 * employee's locations, then each location record, and sends the collected
 * {display_name, location_id} objects as ONE JSON array after every lookup
 * has reported back (successfully or not).
 *
 * Lookup errors below the first level are logged and collected in `errors`;
 * they do not abort the remaining lookups.
 *
 * @param {object} req - request; req.body["contractor_id"] is read.
 * @param {object} res - response; res.json is called at most once.
 * @param {function} done - called with null when there are no active employees.
 */
function handle_getLocations(req, res, done){
    var con_id = req.body["contractor_id"];
    console.log("Contractor ID :" + con_id.toString());
    var result_arr = [];
    employee.getActiveByContractor(con_id, function(err, employees){
        if (err) {
            console.log("Logging error in json:\n");
            res.json({"code" : 100, "status" : "Error in connection database"});
            return;
        }
        if (employees.length === 0) {
            // Nothing to look up; hand control back and stop here.
            done(null);
            return;
        }

        var employeesChecked = 0;
        var errors = [];
        var responded = false;

        // Sends the response once, when every employee has been fully processed.
        function sendResponse(){
            if (!responded && employeesChecked === employees.length) {
                responded = true;
                res.json(result_arr);
                //done(result_arr); // If required, uncomment this line and comment the above line
            }
        }

        // Marks one employee as finished and checks whether we can respond.
        function employeeFinished(){
            ++employeesChecked;
            sendResponse();
        }

        employees.forEach(function(emp){
            assignment.getLocationsByEmployeeID(emp.employee_id, function(err, locations){
                if (err) {
                    console.log(err);
                    errors.push(err);
                    employeeFinished();
                    return;
                }
                console.log("Number of locations: " + locations.length.toString());
                if (locations.length === 0) {
                    // No inner callbacks will ever fire for this employee, so it
                    // must be counted here or the response would never be sent.
                    employeeFinished();
                    return;
                }
                var locationsChecked = 0;
                locations.forEach(function(assigned){
                    console.log("Assignment is: " + assigned.assignment_id.toString());
                    location.getAllByID(assigned.location_id, function(err, loc){
                        ++locationsChecked;
                        if (err) {
                            console.log(err);
                            errors.push(err);
                        } else if (!loc || loc.length === 0) {
                            // Lookup succeeded but returned no row; record it instead of crashing.
                            errors.push(new Error("No location found for id " + assigned.location_id));
                        } else {
                            var loc_obj = {};
                            loc_obj.display_name = loc[0].display_name;
                            loc_obj.location_id = loc[0].location_id;
                            console.log("Location is: " + loc_obj.display_name);
                            console.log("Location ID is: " + loc_obj.location_id.toString());
                            result_arr.push(loc_obj);
                            console.log(result_arr);
                        }
                        if (locationsChecked === locations.length) {
                            employeeFinished();
                        }
                    });
                });
            });
        });
    });
}
In order not to consume too much time during the request-response lifetime, you should separate each piece of logic into its own endpoint. Sometimes, however, as in your case, you may need to hit the database more than once to fetch data that depends on other data. So, assuming that employee.getActiveByContractor returns a promise (it is an async method), you need to chain it with .then like this:
employee.getActiveByContractor(con_id)
.then(function(employees) {
Also, you may need to read about Promise.
As Basim says, this is a good time to use Promises.
getLocationsByEmployeeID and getAllByID are async so they won't be done by the time the loop is finished and you send your response.
Promises are built into the latest Node.js version.
Learn here: https://www.udacity.com/course/javascript-promises--ud898
Suggestion:
Create promise wrappers for getLocationsByEmployeeID and getAllByID
Use Promise.all to make sure every getLocationsByEmployeeID and getAllByID are complete
return your http response within Promise.all's "success" callback
I am writing a small Node js application for automatic vehicle location system.
Here is the code for where I am getting trouble.
markerData contains 4 rows but only in the log I can see the last row.
// For every entry in markerData, opens a connection and queries the latest
// position of the device whose id matches the marker.
// NOTE: `thisMarker` is declared with `var`, so it is one variable shared by
// all iterations; the asynchronous callbacks below read whatever value it
// holds when they eventually run.
for (var i = 0, len = markerData.length; i < len; i++) {
var thisMarker = markerData[i];
// sql.connect is called once per marker; `err` is not checked.
sql.connect(config, function (err) {
var request = new sql.Request();
request.input('myval', sql.Int, thisMarker.id);
request.query('SELECT d.id, d.name, d.lastupdate, p.latitude, p.longitude, p.speed, p.course FROM dbo.devices AS d INNER JOIN dbo.positions AS p ON d.positionid = p.id AND d.id = p.deviceid WHERE (d.id = #myval)', function (err, recordset2) {
// Only the first returned row is copied onto the marker.
if (typeof recordset2 != 'undefined') {
thisMarker.position.lat = recordset2[0].latitude;
thisMarker.position.long = recordset2[0].longitude;
console.log(recordset2[0].id);
}
});
});
}
Please help me to solve the issue.
Because var is not block-scoped, the synchronous loop keeps reassigning the variable while the `sql` module is still connecting to the database asynchronously. That is why only the last row is printed: by the time the connection succeeds, the variable holds a reference to the last object.
Instead of _.each, I would recommend using the async module's async.each, since you have a few asynchronous operations and need to get rid of the synchronous loop.
You can check for samples here,
http://justinklemm.com/node-js-async-tutorial/
Here is your updated code with async.each
-> Install async module with npm install async --save
-> Then add the below reference in the required place,
// Reference
var async = require('async');
-> Modified code:
// Connects once, then updates position.lat / position.long on every entry of
// markerData from the first row returned for that marker's device id.
// Query errors are logged and the remaining markers are still processed.
sql.connect(config, function (err) {
    if (err) {
        console.log('Connection error: ');
        console.log(err);
        return;
    }
    async.each(markerData, function (thisMarker, callback) {
        var request = new sql.Request();
        request.input('myval', sql.Int, thisMarker.id);
        // The input registered above is referenced in the statement as @myval.
        request.query('SELECT d.id, d.name, d.lastupdate, p.latitude, p.longitude, p.speed, p.course FROM dbo.devices AS d INNER JOIN dbo.positions AS p ON d.positionid = p.id AND d.id = p.deviceid WHERE (d.id = @myval)', function (err, recordset2) {
            if (err) {
                console.log(err);
            } else if (recordset2 && recordset2.length > 0) {
                thisMarker.position.lat = recordset2[0].latitude;
                thisMarker.position.long = recordset2[0].longitude;
                console.log(recordset2[0].id);
            } else {
                // Covers both an undefined recordset and one with zero rows.
                console.log('Recordset empty for id: ' + thisMarker.id);
            }
            // Always signal completion, without an error, so async.each keeps going.
            callback();
        });
    }, function (err) {
        if (err) {
            console.log(err);
        }
    });
});
I'm not entirely sure how your library works, but presumably recordset2 is an array of records. recordset2[0] is therefore the first record. If you want the next one you should probably try recordset2[1] and so on and so forth.
Arrays: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array
You'll probably need to loop through all the elements in the array at some point. use a for loop for that:
// Print every row the query returned, not just the first one.
for (var i = 0; i < recordset2.length; i++) {
    console.log(recordset2[i]);
}
That will print out everything your query returns.
I'm trying to use async.each function to get an array with my results from two queries. After that, I need to render this results in a web page.
The async.each function calculates the results variable properly, but I am not able to export this variable outside the function and render it, and I don't understand why.
Here I have attached the code I tested. I realized that when I call "callback1", the final function(error) is not run and I don't get the list variable in the console (so I won't be able to render it later on). I would be grateful if someone could help me with that. Thanks a lot.
// Builds `list` by iterating `data` with async.each; for elements whose
// Type_Gene is "reference" the gene's class name is looked up and added.
var list = [];
async.each(data,
function(elem, callback1){
var classgene = '';
var custom_gene = {};
custom_gene = {Name_Gene: elem['Name_Gene']};
// NOTE: there is no else branch, so callback1 is never called for elements
// whose Type_Gene is not "reference".
if (elem['Type_Gene'] == "reference") {
async.waterfall([
function(callback2){
var id = elem['Id_Genes'];
// `error` from the lookup is not checked before data2 is used.
geneModel.getGenesRefClass(id, function(error, data2){
classgene = data2[0]['Class_Name'];
custom_gene['classgene'] = classgene;
// NOTE: custom_gene is passed as the first argument of the waterfall callback.
callback2(custom_gene);
});
},
// Final waterfall handler; declared here with the value first and err second.
], function(custom_gene, err){
list.push(custom_gene);
console.log(list);
callback1();
});
}
}, function(err){
// if any of the saves produced an error, err would equal that error
if(err){
console.log(list);
}else{
console.log(list);
}
});
Your code has a few problems:
It's not calling callback2() properly. It should be callback2(null, custom_gene) (the first argument is reserved for errors, or null if there aren't any). Preferably, you should also check for error being returned by geneModel.getGenesRefClass();
The previous issue also means that you need to swap the argument of function(custom_gene, err) (it should become function(err, custom_gene));
When elem['Type_Gene'] does not equal "reference", you should still call callback1(), otherwise async.each() doesn't know that the code is done;
So the code would become something like this:
// Builds `list` with one { Name_Gene, classgene } entry per element of `data`
// whose Type_Gene is "reference"; other elements are skipped. The final
// callback logs the finished list once every element has been processed.
var list = [];
async.each(data, function(elem, callback1) {
    var custom_gene = { Name_Gene : elem['Name_Gene'] };
    if (elem['Type_Gene'] == "reference") {
        async.waterfall([
            function(callback2) {
                var id = elem['Id_Genes'];
                geneModel.getGenesRefClass(id, function(error, data2){
                    if (error) return callback2(error);
                    // A lookup that returns no rows is reported as an error
                    // instead of throwing on data2[0].
                    if (!data2 || data2.length === 0) {
                        return callback2(new Error('No class found for gene ' + id));
                    }
                    custom_gene['classgene'] = data2[0]['Class_Name'];
                    callback2(null, custom_gene);
                });
            },
        ], function(err, custom_gene) {
            if (err) {
                // Do not push the (undefined) result of a failed lookup.
                // If you want to propagate errors, replace the next two lines
                // with: return callback1(err);
                console.log('An error occurred!', err);
                return callback1();
            }
            list.push(custom_gene);
            console.log(list);
            callback1();
        });
    } else {
        // Still signal completion, otherwise async.each never finishes.
        callback1();
    }
}, function(err){
    // if any of the saves produced an error, err would equal that error
    if (err) {
        console.log('An error occurred!', err);
    }
    console.log(list);
});
I'm trying to write a Node program that populates my MySQL database with data from files I have on disk. I may or may not be going about this the right way, but it's working. What I'm having trouble with is understanding how I should be handling allowing asynchronous functions to finish before the connection to the DB is ended. Ultimately, I'll be reading lots of data files, and insert them into the database like I did below. I could just use readFileSync instead of the asynchronous version, but I need to get a better handle on asynchronous functions.
When I insert the wine categories below, it works fine since it's not using an asynchronous function. However, when I use readFile to get data from a file, I get an error that connection ended before any of the queries were executed:
// Populates the wine_categories and countries tables, then closes the connection.
connection.connect( function(err) {
if(err) {
console.log(err);
}
});
// Take a table and the values, and insert a new row into a table
// `values` may be a single value or an array; each value is wrapped in double
// quotes by plain string concatenation (no escaping is applied here).
// The query is asynchronous and this function offers no completion callback.
function insert_into( table, values ) {
if( values instanceof Array ) {
values = values.map( function( value ) {
return '"' + value + '"';
}).join(', ');
} else {
values = '"' + values + '"';
}
var statement = 'INSERT INTO ' + table + ' VALUES (NULL, ' + values + ')';
connection.query( statement, function(err, rows, fields) {
if (err) throw err;
console.log( values + " successfully added.");
});
};
// Populate the wine_categories table
var wine_categories = [
'red', 'white', 'rose', 'sparkling', 'fortified'
];
// Works fine when used alone
wine_categories.forEach( function( element ) {
insert_into( 'wine_categories', element );
});
// Populate the countries table
// connection.end() runs before this finishes its job
fs.readFile( countries, 'utf8', function (err, data) {
if (err) {
throw err;
} else {
// One entry per line of the file, each split on tab characters.
var codes = Array.prototype.map.call(
data.split('\n'), function( country ) {
return country.split('\t');
});
codes.forEach( function( country ) {
// The second field is truncated to 25 characters before inserting.
if( country[1].length > 25 ) {
country[1] = country[1].substring(0, 25);
}
insert_into( 'countries', country );
});
}
});
// NOTE: executed synchronously, right after readFile is started, i.e. before
// the readFile callback above has had a chance to run.
connection.end();
Obviously, connection.end() needs to happen after all of the inserts have completed, but I'm not sure how to handle that. I don't want it to be a callback for the readFile call because I'll ultimately have many of similar calls in this file.
How should I structure my code so that all of the queries execute and connection.end() runs when they're all finished? The answer is probably obvious to an asynchronous wiz...
Using promises it would be like this:
// Takes a connection from the pool, inserts the wine categories and the
// countries read from the `countries` file, and releases the connection only
// after every insert has settled.
pool.getConnectionAsync().then(function(connection) {
    // Populate the wine_categories table
    var wine_categories = [
        'red', 'white', 'rose', 'sparkling', 'fortified'
    ];
    var wineQueries = wine_categories.map(function(wine){
        return insert_into(connection, "wine_categories", wine);
    });
    // Populate the countries table; resolves when all country inserts are done.
    var countryQueries = fs.readFileAsync(countries, "utf-8").then(function(data) {
        var inserts = data.split("\n")
            .map(function(line) {
                return line.split("\t")[1];
            })
            // Lines without a tab (e.g. a trailing blank line) have no second field.
            .filter(function(country) {
                return country !== undefined;
            })
            .map(function(country) {
                if (country.length > 25) {
                    country = country.substring(0, 25);
                }
                return insert_into(connection, "countries", country);
            });
        // Wait for the inserts themselves; returning the bare array would only
        // hand back pending promises that the outer Promise.all does not await.
        return Promise.all(inserts);
    });
    return Promise.all(wineQueries.concat(countryQueries))
        .then(function() {
            console.log("all done");
        })
        .catch(function(e) {
            console.log("error", e);
        })
        .finally(function() {
            connection.release();
        });
});
Pre-requisite code for the above
var Promise = require("bluebird");
var fs = Promise.promisifyAll(require("fs"));
Promise.promisifyAll(require("mysql/lib/Connection").prototype);
// Connection pool created from the mysql module and wrapped with
// Promise.promisifyAll; the code above calls pool.getConnectionAsync() on it.
// The "..." credentials are placeholders to be filled in.
var pool = Promise.promisifyAll(require("mysql").createPool({
"user": "...",
"password": "...",
"database": "...",
"host": "localhost",
"port": 3306,
"debug": false
}));
/**
 * Inserts one row into `table`, letting the database generate the first column.
 *
 * @param {object} connection - connection exposing escape() and queryAsync().
 * @param {string} table - table name, used verbatim in the statement.
 * @param {*|Array} values - a single value or an array of column values; each
 *        one is escaped with connection.escape() before being concatenated.
 * @returns {Promise} settles when the INSERT has completed.
 */
function insert_into(connection, table, values) {
    // Call escape with exactly one argument. Passing it straight to map()
    // would also hand it the element index and the whole array as its
    // optional extra parameters.
    function escapeValue(value) {
        return connection.escape(value);
    }
    if (values instanceof Array) {
        values = values.map(escapeValue).join(', ');
    } else {
        values = escapeValue(values);
    }
    return connection
        .queryAsync('INSERT INTO ' + table + ' VALUES (NULL, ' + values + ')')
        .then(function() {
            console.log(values + " successfully added.");
        });
}
Assuming that insert_into is also asynchronous, you may want to use something like async.each to handle inserting your records. It has a convenient callback that will be called when all records are inserted, because only at that point do you want to close the connection:
async.each(codes, function(country, callback) {
if ( country[1].length > 25 ) {
country[1] = country[1].substring(0, 25);
}
insert_into( 'countries', country, callback ); // !! read below
}, function(err) {
// TODO: handle any errors
...
// Here, all countries are inserted.
connection.end();
});
However, this means that insert_into should also be made to accept a callback (using the common Node convention function(err, result)) that will be called when the record has been inserted. In the code above, I'm using the callback provided by async directly, meaning that once your insert_into is done, it will call the async callback signaling that this iteration of each is done.
EDIT: you can rewrite insert_into so it looks like this:
function insert_into( table, values, callback ) {
...
connection.query(..., function(err) {
callback(err);
});
}
Since you don't need the actual result from connection.query, you only have to pass err (instead of throwing it).
Tip: assuming that you're using node-mysql, you may want to take a look at the docs on how it can help you with escaping.
I have a function in my express app that makes multiple queries within a For Loop and I need to design a callback that responds with JSON when the loop is finished. But, I'm not sure how to do this in Node yet. Here is what I have so far, but it's not yet working...
// Express handler: saves every numerically-keyed entry of req.body as a
// Contact owned by the current user, then tries to respond.
exports.contacts_create = function(req, res) {
var contacts = req.body;
// NOTE: this function declares parameters named res and contacts but is
// invoked below with no arguments, so both are undefined inside it.
(function(res, contacts) {
for (var property in contacts) { // for each contact, save to db
if( !isNaN(property) ) {
// `contact` is assigned without var/let/const.
contact = contacts[property];
var newContact = new Contact(contact);
newContact.user = req.user.id
// save() is asynchronous; the loop does not wait for it.
newContact.save(function(err) {
if (err) { console.log(err) };
}); // .save
}; // if !isNAN
}; // for
// `self` is not defined in this snippet; called right after the loop,
// without waiting for the saves and without passing req/res.
self.response();
})(); // function
}; // contacts_create
// Sends the fixed string 'finished' as JSON; `req` and `success` are unused.
exports.response = function(req, res, success) {
res.json('finished');
};
There are a few problems with your code besides just the callback structure.
var contacts = req.body;
(function(res, contacts) {
...
})(); // function
^ you are redefining contacts and res in the parameter list, but not passing in any arguments, so inside your function res and contacts will be undefined.
Also, not sure where your self variable is coming from, but maybe you defined that elsewhere.
As to the callback structure, you're looking for something like this (assuming contacts is an Array):
exports.contacts_create = function(req, res) {
var contacts = req.body;
var iterator = function (i) {
if (i >= contacts.length) {
res.json('finished'); // or call self.response() or whatever
return;
}
contact = contacts[i];
var newContact = new Contact(contact);
newContact.user = req.user.id
newContact.save(function(err) {
if (err)
console.log(err); //if this is really a failure, you should call response here and return
iterator(i + 1); //re-call this function with the next index
});
};
iterator(0); //start the async "for" loop
};
However, you may want to consider performing your database saves in parallel. Something like this:
// Parallel variant: start every save immediately and respond when the last
// one reports back. `contacts` and `res` come from the enclosing handler.
// savesPending counts the saves that have not called back yet.
// NOTE: when contacts.length is 0 the loop body never runs, so nothing in
// this snippet calls res.json.
var savesPending = contacts.length;
// Bound below with the contact's key as `i`; `err` is supplied by save().
var saveCallback = function (i, err) {
if (err)
console.log('Saving contact ' + i + ' failed.');
// The response is sent by whichever save finishes last.
if (--savesPending === 0)
res.json('finished');
};
for (var i in contacts) {
// (construction of newContact elided by the author)
...
newContact.save(saveCallback.bind(null, i));
}
This way you don't have to wait for each save to complete before starting the next round-trip to the database.
If you're unfamiliar with why I used saveCallback.bind(null, i), it's basically so the callback can know which contact failed in the event of an error. See Function.prototype.bind if you need a reference.