How do I make a recursive scraper with JavaScript? - javascript

So I'm making a little scraper for learning purposes, in the end I should get a tree-like structure of the pages on the website.
I've been banging my head trying to get the requests right. This is more or less what I have:
var request = require('request');

function scanPage(url) {
    // request the page at given url:
    request.get(url, function(err, res, body) {
        var pageObject = {};
        /* [... Jquery mumbo-jumbo to
           1. Fill the page object with information and
           2. Get the links on that page and store them into arrayOfLinks
        */
        var arrayOfLinks = ['url1', 'url2', 'url3'];

        for (var i = 0; i < arrayOfLinks.length; i++) {
            pageObj[arrayOfLinks[i]] = scanPage[arrayOfLinks[i]];
        }
    });
    return pageObj;
}
I know this code is wrong on many levels, but it should give you an idea of what I'm trying to do.
How should I modify it to make it work? (without the use of promises if possible)
(You can assume that the website has a tree-like structure, so every page only has links to pages further down the tree, hence the recursive approach.)

I know that you'd rather not use promises for whatever reason (and I can't ask why in the comments because I'm new), but I believe that promises are the best way to achieve this.
Here's a solution using promises that answers your question, but might not be exactly what you need:
var request = require('request');
var Promise = require('bluebird');
var get = Promise.promisify(request.get);

var maxConnections = 1; // maximum number of concurrent connections

function scanPage(url) {
    // request the page at given url:
    return get(url).then((res) => {
        var body = res.body;
        /* [... Jquery mumbo-jumbo to
           1. Fill the page object with information and
           2. Get the links on that page and store them into arrayOfLinks
        */
        var arrayOfLinks = ['url1', 'url2', 'url3'];

        return Promise.map(arrayOfLinks, scanPage, { concurrency: maxConnections })
            .then(results => {
                var res = {};
                for (var i = 0; i < results.length; i++)
                    res[arrayOfLinks[i]] = results[i];
                return res;
            });
    });
}

scanPage("http://example.com/").then((res) => {
    // do whatever with res
});
Edit: Thanks to Bergi's comment, rewrote the code to avoid the Promise constructor antipattern.
Edit: Rewrote in a much better way. By using Bluebird's concurrency option, you can easily limit the number of simultaneous connections.
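For what it's worth, since the question asked for a version without promises: here is a minimal callback-only sketch of the same idea, using a pending counter to know when all child pages have been scanned. The link extraction is left as a placeholder, and stuffing an error object into the tree is just one possible way to handle failures.
var request = require('request');

function scanPage(url, done) {
    request.get(url, function (err, res, body) {
        if (err) return done(err);

        var pageObj = {};
        // [... fill pageObj and extract the links on the page into arrayOfLinks ...]
        var arrayOfLinks = []; // placeholder: populate from body

        if (arrayOfLinks.length === 0) return done(null, pageObj);

        // count pending child scans; call done() once the last one finishes
        var pending = arrayOfLinks.length;
        arrayOfLinks.forEach(function (link) {
            scanPage(link, function (err, childTree) {
                pageObj[link] = err ? { error: String(err) } : childTree;
                if (--pending === 0) done(null, pageObj);
            });
        });
    });
}

scanPage('http://example.com/', function (err, tree) {
    console.log(err || tree);
});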

Related

How to (Properly) add in each execution of a loop in node.js

So I'm attempting to write a Google parser.
The idea of my tool is that it takes search queries, searches Google for them and returns URLs. It is working well so far, but now I'm trying to add paging and I'm having trouble. My code is:
const needle = require("needle") //for making get request
const sp = require("serp-parser") //for parsing data from the request
const queryup = "watch movies online free" //my search data
const query = encodeURI(queryup) //my search data so google can read it
var page = 0; //initializing the page counter
let pages = 5; //setting amount of pages to loop through
for (var i = 0; i < pages; i++) { //my loop
needle.get(`https://www.google.com/search?q=${query}&start=${page}`, function(err, response){ //MY MAIN PROBLEM <<<--- The issue is its adding to the page value but its not effecting it here, why?
page += 10 //adding to page value (every 10 page value is 1 extra page)
console.log(`----- Page number: `+ page / 10+" -----") //logging the number of the page to confirm that it is indeed increasing the page value
let results = response.body; //defining the body of my request
parser = new sp.GoogleNojsSERP(results); //initializing the parser
let parsed = parser.serp //parsing the body
let objarray = parsed.organic; //parsed body (returns as an array of json objects)
for (var i = 0; i < objarray.length; i++) { //loop the logging of each url
let url = objarray[i].url //defining url
console.log(url) //logging each url
}
});
}
without a billion comments:
const needle = require("needle")
const sp = require("serp-parser")

const queryup = "watch movies online free"
const query = encodeURI(queryup)

var page = 0;
let pages = 5;

for (var i = 0; i < pages; i++) {
    needle.get(`https://www.google.com/search?q=${query}&start=${page}`, function(err, response){
        //^^^^^ MY MAIN PROBLEM <<<--- The issue is its adding to the page value but its not effecting it here, why?
        page += 10
        console.log(`----- Page number: `+ page / 10 +" -----")
        let results = response.body;
        parser = new sp.GoogleNojsSERP(results);
        let parsed = parser.serp
        let objarray = parsed.organic;
        for (var i = 0; i < objarray.length; i++) {
            let url = objarray[i].url
            console.log(url)
        }
    });
}
This seems to be an issue with async.
I'm not familiar with needle, but I know that external queries are basically never synchronous.
The problem you're experiencing is that all five request URLs are built while the loop is running, when page is still 0; the page += 10 only happens later, inside the callbacks, after the loop has already finished. So every request asks Google for the same start value.
Under the hood, the engine is essentially doing literally everything else it can possibly do first, and THEN doing your web queries.
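You can see the ordering for yourself with a tiny self-contained demo (setTimeout stands in for needle.get here; this is not part of the original code):
// demo only: setTimeout plays the role of the async web request
let page = 0;

for (let i = 0; i < 5; i++) {
    console.log('building request with start=' + page); // prints 0 five times, before any callback runs
    setTimeout(function () {
        page += 10; // only runs after the whole loop has finished
        console.log('callback sees page=' + page); // prints 10, 20, 30, 40, 50
    }, 0);
}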
A trip through the needle npm docs tells me that you can use an alternative syntax to get needle to return a promise instead, which can then be wrapped in an async function and managed with await so the requests run one after another, which is what you're after:
const needle = require('needle');
const sp = require('serp-parser');

const queryup = 'watch movies online free';
const query = encodeURI(queryup);

let page = 0;
const pages = 5;

const googler = async function () {
    for (let i = 0; i < pages; i++) {
        try {
            const response = await needle('get', `https://www.google.com/search?q=${query}&start=${page}`);
            console.log('----- Page number: ' + page / 10 + ' -----');
            const results = await response.body;
            const parser = new sp.GoogleNojsSERP(results);
            const parsed = parser.serp;
            const objarray = parsed.organic;
            for (let i = 0; i < objarray.length; i++) {
                const url = objarray[i].url;
                console.log(url);
            }
        } catch (err) {
            console.error(err);
        }
        page += 10;
    }
};

googler();
The key differences:
Per the needle docs, rather than the request method being a method on the needle object, it's instead the first argument you pass directly to invoking needle itself as a function.
When you manage promises with await, a rejected promise throws an error that should be caught with a traditional try/catch block; I've done that here. (If needle behaves anything like node-fetch, it will rarely actually throw, but catching is still good practice.)
One of my extensions automatically changed your var declarations to let and not-reassigned let declarations to const; you're welcome to change them back.
This is a classic asynchronous problem. Add another console.log() immediately before the needle.get() call (and after the for statement) and you will see what is going wrong: All of the needle.get() calls execute before any of the callbacks where you do the page += 10. Then, after the for loop completes, all of the callbacks are executed. But it is too late for this to have any effect on the start= parameter.
One way to fix this could be to move the body of this for loop (the needle.get() and its callback) into a separate function. Initialize your variables and call this function once. Then at the end of the callback, do your page += 10 and update any other variables you need to, and call this function again from there if there are more pages left that you want to load. If you have completed all of the pages, then don't make that call. The for loop is not needed with this technique.
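A rough, untested sketch of that approach, assuming the same needle/serp-parser setup as above:
const needle = require('needle');
const sp = require('serp-parser');

const query = encodeURI('watch movies online free');
const pages = 5;

function fetchPage(pageIndex) {
    if (pageIndex >= pages) return; // all pages done

    const start = pageIndex * 10;
    needle.get(`https://www.google.com/search?q=${query}&start=${start}`, function (err, response) {
        if (err) return console.error(err);

        console.log('----- Page number: ' + (pageIndex + 1) + ' -----');
        const parser = new sp.GoogleNojsSERP(response.body);
        for (const result of parser.serp.organic) {
            console.log(result.url);
        }

        fetchPage(pageIndex + 1); // only request the next page once this one has finished
    });
}

fetchPage(0);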
Or, you could keep your current code but move the page += 10 out of the callback, to the end of the loop body but still inside the outer for loop. That way the variable is incremented as you expect. I don't necessarily recommend this, as Google may get unhappy about receiving the GET requests so rapidly and may start blocking your calls or throwing CAPTCHAs at you.
There may be an issue of whether this kind of scraping is allowed by Google's Terms of Service, but I will leave that question to you and your legal advisors.
Also, I would avoid using var anywhere. Use const or let instead, and prefer const over let except when you need to reassign the variable.
One tip: in most cases where you use a numeric for loop to iterate over an array, the code will be cleaner if you use a for..of loop. For example, this bit of code:
let parsed = parser.serp
let objarray = parsed.organic;
for (var i = 0; i < objarray.length; i++) {
    let url = objarray[i].url
    console.log(url)
}
could be more simply written as:
for (const result of parser.serp.organic) {
console.log(result.url)
}
(I know that is just a bit of debug code, but this is a good habit to get into.)
Finally, watch your indentation and be sure to indent nested blocks or functions. I took the liberty of adding some indentation for you.

HTML5 FileSystem, combine FileEntry with metadata array from callback

In a Chrome extension I'm using the HTML5 FileSystem API.
I'm retrieving a list of records in a folder.
var entries = [];
var metadata = [];

listFiles(folder);

function listFiles(fs) {
    var dirReader = fs.createReader();
    entries = [];

    // Call reader.readEntries() until no more results are returned.
    var readEntries = function () {
        dirReader.readEntries(function (results) {
            if (!results.length) {
                addMeta(entries);
            } else {
                console.log(results);
                entries = entries.concat(toArray(results));
                readEntries();
            }
        });
    };

    readEntries(); // Start reading dirs.
}
The FileEntry object does not contain metadata, and I need the last modified date. I'm able to retrieve a metadata object:
function addMeta(entries) {
    for (var i = 0; i < entries.length; i++) {
        entries[i].getMetadata(function (metadata) {
            console.log(entries);
            console.log(metadata);
        });
    }
}
Problem is that I get the metadata in a callback.
How can I join the two objects, making sure the right match is made?
The simplified result I'm looking for is:
[
    ["fileName1", "modifyDate1"],
    ["fileName2", "modifyDate2"],
]
To get lastModifiedDate, you don't need to use getMetadata; as per the description of this question, you can read lastModifiedDate from the File object that entry.file() hands you, though file() is itself another callback.
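A minimal sketch of that route (note that File.lastModifiedDate is deprecated in favour of the numeric File.lastModified timestamp):
// file() is also asynchronous, so the same matching concern applies
entry.file(function (file) {
    console.log(entry.name, file.lastModifiedDate); // or file.lastModified (ms since epoch)
}, function (err) {
    console.error(err);
});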
To "join the two object making sure the right match is made", because of Closures, you could use the following code to get the right results. (Assuming the data structure is [[entry, metadata]] as you mentioned)
var ans = [];

function addMeta(entries) {
    for (var i = 0; i < entries.length; i++) {
        (function (entry) {
            entry.getMetadata(function (metadata) {
                ans.push([entry, metadata]);
            });
        })(entries[i]);
    }
}
If what you want is to wait for all the asynchronous callbacks to finish, see this answer for more details. Basically you could adjust your code to use Promise, use another approach such as setInterval, or keep a counter of how many callbacks remain.
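For example, a counter-based sketch (onAllDone is a hypothetical callback of your own, fired once every getMetadata callback has returned; modificationTime is the Metadata field holding the last modified date):
var ans = [];

function addMeta(entries, onAllDone) {
    var remaining = entries.length;

    entries.forEach(function (entry) {
        entry.getMetadata(function (metadata) {
            ans.push([entry.name, metadata.modificationTime]);
            if (--remaining === 0) onAllDone(ans); // the last callback has finished
        });
    });
}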
I suggest having a look at the Promise-based bro-fs implementation of the HTML FileSystem API.
To read all entries with metadata you can do something like this:
fs.readdir('dir')
    .then(entries => {
        const tasks = entries.map(entry => fs.stat(entry.fullPath))
        return Promise.all(tasks);
    })
    .then(results => console.log(results))

How to do an async loop n times first, then a single function, in callback style?

This is not a question about how to post a multipart form in Node.js, but about how to express this kind of logic (first an n-iteration async loop, then one async function) with callbacks.
For example, a client will post a multipart form with normal form fields:
req.files[n]: contains n images that need to be saved to the server's local filesystem
req.body: contains post.title, post.content, post.user
In a synchronous language (PHP, Java...), the sample code would be:
array savedPath = [];

// save images to local filesystem
foreach image in files
    savedPath.push(saveImageToLocal(image))

// append saved images path to post
var post = req.body;
post.images = savedPath;
Posts.insert(post)
But in Node.js, with callbacks, how can I write it?
var savedPath = [];
saveImageToLocal(files[0], function(path) {
    savedPath.push(path);
    saveImageToLocal(files[1], function(path) {
        savedPath.push(path);
        // .... it's n elements, how can I write it??
        var post = req.body;
        post.images = savedPath;
        Posts.insert(post, function(err, result) {
            res.send(err, result)
        });
    });
});
Or
var savedPath = [];
for (i = 0; i < n; i++) {
    saveImageToLocal(files[i], function(path) {
        savedPath.push(path);
    });
}

waitSaveToFinished() // ??

var post = req.body;
post.images = savedPath;
Posts.insert(post, function(err, result) {
    res.send(err, result)
});
How do I do this kind of thing the Node.js/callback way?
The best way to coordinate multiple asynchronous operations is to use promises. So, if this were my code, I would change or wrap saveImageToLocal() and Posts.insert() to return promises and then use promise features to coordinate them. If you're going to be writing much Node.js code, I'd suggest you invest right away in learning how promises work and start using them for all async behavior.
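A minimal sketch of that idea, assuming saveImageToLocal(file, cb) and Posts.insert(post, cb) keep the callback signatures used in the question:
// wrap the callback-style helpers in promises
function saveImageToLocalAsync(file) {
    return new Promise(function (resolve) {
        // note: the question's callback only passes the saved path, no error argument
        saveImageToLocal(file, function (path) {
            resolve(path);
        });
    });
}

Promise.all(req.files.map(saveImageToLocalAsync))
    .then(function (savedPath) {
        var post = req.body;
        post.images = savedPath;
        return new Promise(function (resolve, reject) {
            Posts.insert(post, function (err, result) {
                if (err) reject(err); else resolve(result);
            });
        });
    })
    .then(function (result) { res.send(result); })
    .catch(function (err) { res.send(err); });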
To solve your issue without promises, you'd have to implement a counter and keep track of when all the async operations are done:
var savedPath = [];
var doneCnt = 0;
for (i = 0; i < n; i++) {
    saveImageToLocal(files[i], function(path) {
        ++doneCnt;
        savedPath.push(path);
        // if all the requests have finished now, then do the next steps
        if (doneCnt === n) {
            var post = req.body;
            post.images = savedPath;
            Posts.insert(post, function(err, result) {
                res.send(err, result)
            });
        }
    });
}
This code looks like it is missing error handling, since most async operations can fail and will report an error.
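For instance, a sketch with error handling added; the error-first callback signature for saveImageToLocal is an assumption here, since the question's version only passes the path:
var savedPath = [];
var doneCnt = 0;
var failed = false;

for (var i = 0; i < n; i++) {
    // assumed signature: saveImageToLocal(file, function(err, path) {...})
    saveImageToLocal(files[i], function(err, path) {
        if (failed) return;        // an earlier save already failed
        if (err) {
            failed = true;
            return res.send(err);  // report the first error and stop
        }
        savedPath.push(path);
        if (++doneCnt === n) {
            var post = req.body;
            post.images = savedPath;
            Posts.insert(post, function(err, result) {
                res.send(err, result);
            });
        }
    });
}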

Attempt to use Promises to delete records

Here is a simple task I would like to accomplish on Parse.com with Cloud Code.
The task consists of deleting a Unit and everything related to it.
One Unit has several Sentences related to it, and each Sentence has one or more Translations.
So when the task is performed, the Unit as well as its Sentences and Translations should be deleted.
I have a strong feeling I should be using Promises (and chaining them) in order to do this cleanly.
Below is the code I wrote, but it only works partially (the Translations are deleted, but not the rest).
Parse.Cloud.define("deleteUnitAndDependencies", function(request, response) {
    var unitListQuery = new Parse.Query("UnitList");
    unitListQuery.equalTo("objectId", request.params.unitID);
    unitListQuery.equalTo("ownerID", request.params.userID);

    unitListQuery.find().then(function(resUnit) {
        var sentenceListQuery = new Parse.Query("SentenceList");
        sentenceListQuery.equalTo("unit", resUnit[0]);
        return sentenceListQuery.find();
    }).then(function(resSentence) {
        var translatListQuery = new Parse.Query("TranslatList");
        for (i = 0; i < resSentence.length; i++) {
            var query = new Parse.Query("TranslatList");
            query.equalTo("sentence", resSentence[i]);
            translatListQuery = Parse.Query.or(translatListQuery, query);
        }
        return translatListQuery.find();
    }).then(function(resTranslat) {
        for (iT = 0; iT < resTranslat.length; iT++) {
            resTranslat[iT].destroy({});
        }
    });
});
I surely need to add some lines of code like:
resSentence[x].destroy({});
and:
resUnit[0].destroy({});
The problem is that I do not quite see where is the adequate place for that.
Collect the objects to be deleted then use Parse.Object.destroyAll(someArray); to delete all at once.
In cases like this I like to use a scope variable to hold things for later use.
var scope = {
    sentences: [],
    units: []
};

// later inside then block...
scope.sentences.push(resSentence[i]);

// ...now we have them collected safely
.then(function() {
    return Parse.Object.destroyAll(scope.sentences);
})
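Putting those pieces together, a sketch of the full chain; it swaps the Query.or loop for containedIn and assumes the classic Cloud Code response.success/response.error API the question appears to be using:
Parse.Cloud.define("deleteUnitAndDependencies", function(request, response) {
    var scope = { unit: null, sentences: [] };

    var unitListQuery = new Parse.Query("UnitList");
    unitListQuery.equalTo("objectId", request.params.unitID);
    unitListQuery.equalTo("ownerID", request.params.userID);

    unitListQuery.find().then(function(resUnit) {
        scope.unit = resUnit[0];
        var sentenceListQuery = new Parse.Query("SentenceList");
        sentenceListQuery.equalTo("unit", scope.unit);
        return sentenceListQuery.find();
    }).then(function(resSentence) {
        scope.sentences = resSentence;
        var translatListQuery = new Parse.Query("TranslatList");
        translatListQuery.containedIn("sentence", resSentence);
        return translatListQuery.find();
    }).then(function(resTranslat) {
        // delete translations first, then sentences, then the unit itself
        return Parse.Object.destroyAll(resTranslat);
    }).then(function() {
        return Parse.Object.destroyAll(scope.sentences);
    }).then(function() {
        return scope.unit.destroy();
    }).then(function() {
        response.success("deleted");
    }, function(err) {
        response.error(err);
    });
});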

Javascript memory consumption with map() over a large set and callbacks

I don't even know how to properly ask this question, but I have concerns about the performance (mostly memory consumption) of the following code. I am anticipating that this code will consume a lot of memory, because of the map over a large set and the many 'hanging' functions that wait for an external service. Are my concerns justified here? What would be a better approach?
var list = fs.readFileSync('./mailinglist.txt', 'utf8') // say 1.000.000 records
    .split("\n")
    .map( processEntry );

var processEntry = function _processEntry(i){
    i = i.split('\t');
    getEmailBody( function(emailBody, name){
        var msg = {
            "message" : emailBody,
            "name" : i[0]
        }
        request(msg, function reqCb(err, result){
            ...
        });
    }); // getEmailBody
}

var getEmailBody = function _getEmailBody(obj, cb){
    // read email template from file;
    // v() returns the correct form for person's name with web-based service
    v(obj.name, function(v){
        cb(obj, v)
    });
}
If you're worried about submitting a million http requests in a very short time span (which you probably should be), you'll have to set up a buffer of some kind.
one simple way to do it:
var lines = fs.readFileSync('./mailinglist.txt', 'utf8').split("\n");
var entryIdx = 0;
var done = false;

var processNextEntry = function () {
    if (entryIdx < lines.length) {
        processEntry(lines[entryIdx++]);
    } else {
        done = true;
    }
};

var processEntry = function _processEntry(i){
    i = i.split('\t');
    getEmailBody( function(emailBody, name){
        var msg = {
            "message" : emailBody,
            "name" : name
        }
        request(msg, function reqCb(err, result){
            // ...
            !done && processNextEntry();
        });
    }); // getEmailBody
}

// getEmailBody didn't change

// you set the ball rolling by calling processNextEntry n times,
// where n is a sensible number of http requests to have pending at once.
for (var i = 0; i < 10; i++) processNextEntry();
Edit: according to this blog post, Node has an internal queue system and will only allow 5 simultaneous requests. But you can still use this method to avoid filling up that internal queue with a million items if you're worried about memory consumption.
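That limit comes from the default HTTP agent's socket pool (older Node versions defaulted maxSockets to 5 per host); if you want a different ceiling it can be tuned, though the pending counter above is still what actually protects memory:
var http = require('http');
var https = require('https');

// allow up to 10 concurrent sockets per host instead of the old default of 5
http.globalAgent.maxSockets = 10;
https.globalAgent.maxSockets = 10;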
Firstly I would advise against using readFileSync, and instead favour the async equivalent. Blocking on IO operations should be avoided, as reading from a disk is very expensive; and whilst that's the sole purpose of your code now, I would consider how that might change in the future - arbitrarily wasting clock cycles is never a good idea.
For large data files I would read them in defined chunks and process them. If you can come up with some scheme, either sentinels to distinguish data blocks within the file or padding to boundaries, then process the file piece by piece.
This is just rough, untested off the top of my head, but something like:
var fs = require("fs");

function doMyCoolWork(startByteIndex, endByteIndex){
    fs.open("path to your text file", 'r', function(err, fd) {
        var chunkSize = endByteIndex - startByteIndex;
        var buffer = Buffer.alloc(chunkSize);

        // read chunkSize bytes starting at startByteIndex
        fs.read(fd, buffer, 0, chunkSize, startByteIndex, function(err, byteCount) {
            var data = buffer.toString('utf-8', 0, byteCount);

            // process your data here

            if(stillWorkToDo){
                // recurse into the next chunk
                doMyCoolWork(endByteIndex, endByteIndex + chunkSize);
            }
        });
    });
}
Or look into one of the stream library functions for similar functionality.
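For the line-oriented file in the question, a stream-based sketch using the built-in readline module could look like this (handleLine is a placeholder for whatever per-record work you need):
var fs = require('fs');
var readline = require('readline');

var rl = readline.createInterface({
    input: fs.createReadStream('./mailinglist.txt', 'utf8')
});

rl.on('line', function (line) {
    // pause/resume gives you crude backpressure while async work is pending
    rl.pause();
    handleLine(line, function () {
        rl.resume();
    });
});

rl.on('close', function () {
    console.log('done');
});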
HTH
P.S. JavaScript and Node work extremely well with async and eventing; using sync calls is an antipattern in my opinion, and likely to make the code a headache in future.
