I am trying to parse some data from several web pages using JavaScript. I wrote a small parser for this purpose. The algorithm looks like this:
1. Open the first URL from my .csv file
2. Find the data I need on the page
3. Save the URL and data to a JSON file
My code executes steps 1 and 2 perfectly but sometimes messes up step 3. The output looks like this:
URL 1 + data from URL 1 (correct line)
URL 2 + data from URL 2 (correct line)
URL 3 + data from URL 3 (correct line)
URL 4 + data from URL 4 (correct line)
URL 6 (wrong URL) + data from another URL
URL 5 (wrong URL) + data from another URL
URL 7 + data from URL 7 (correct line)
URL 8 + data from URL 8 (correct line)
URL 9 + data from URL 9 (correct line)
I assume the problem is that some pages take too long to load, which messes up the whole process. But I still don't understand why it sometimes saves the wrong data.
Here's my code:
var request = require('request');
var cheerio = require('cheerio');
var cloudscraper = require('cloudscraper');
var fs = require('fs');
var path = require('path');
var csvjson = require('csvjson');

// First, we read the .csv file with our URL list
function getTheList() {
    urlList = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding : 'utf8' });
    var options = {
        delimiter : ';', // optional
        quote : '"' // optional
    };
    urlList = csvjson.toObject(urlList, options);
    end = urlList.length;
    logs = [];
    // here we start the loop, reading and saving data from each URL
    for (let p = 0; p < end; p += 1) {
        grabTheData(urlList, p);
    }
}

// this code extracts the data from the page and saves it to a JSON file
function grabTheData(urlList, p) {
    setTimeout(function() {
        url = urlList[p].ItemLink;
        cloudscraper.get(url, function(err, res, body) {
            if (err) {
                console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
                callback();
            } else {
                var $ = cheerio.load(body);
                /*
                here are the lines which extract the data I need
                dataINeed = ...;
                */
                logs.push({
                    url, dataINeed
                });
                fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
            }
        });
    // here I set a 2-second delay between each URL
    }, 2000 * p);
}

getTheList()
The reason this is happening is that url in grabTheData is assigned without var or let, so every iteration shares (and overwrites) the same variable. By the time a cloudscraper.get callback arrives, url may already hold a later iteration's link, so the wrong URL gets saved next to the data.
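To see the effect in isolation, here is a minimal sketch (the names here are made up, nothing comes from your data) of a shared variable being overwritten before delayed callbacks run, which is exactly what happens to url when a slow response comes back late:

var current;
for (let i = 0; i < 3; i++) {
    current = 'url-' + i;   // overwrites the same shared variable each iteration
    setTimeout(function () {
        console.log('processing', current);   // logs "processing url-2" three times
    }, 10);
}

Declaring the variable with let inside the loop body gives each callback its own copy, which is what the fix below does.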
Now there is a very quick fix for this: simply change the scope of the url variable, like so:
function grabTheData(urlList, p) {
    setTimeout(function() {
        // Set scope of url variable to block
        let url = urlList[p].ItemLink;
        cloudscraper.get(url, function(err, res, body) {
            if (err) {
                console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
                callback();
            } else {
                var $ = cheerio.load(body);
                /*
                here are the lines which extract the data I need
                dataINeed = ...;
                */
                logs.push({
                    url, dataINeed
                });
                fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
            }
        });
    // here I set a 2-second delay between each URL
    }, 2000 * p);
}
This should keep your results in order.
Here's another (IMHO much better) option, using promises and avoiding the use of setTimeout to separate calls. This should avoid any potential race condition, since the Promise.all call will preserve order:
async function getTheList() {
    urlList = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding : 'utf8' });
    var options = {
        delimiter : ';', // optional
        quote : '"' // optional
    };
    urlList = csvjson.toObject(urlList, options);
    let promiseList = urlList.map(urlEntry => grabTheDataUpdated(urlEntry.ItemLink));
    let logs = await Promise.all(promiseList);
    fs.writeFileSync('./new_logs.json', JSON.stringify(logs, null, 4));
}

// Promisified version of cloudscraper.get
function getCloudScraperData(url) {
    return new Promise((resolve, reject) => {
        cloudscraper.get(url, (err, res, body) => {
            if (err) {
                reject(err);
            } else {
                resolve({ url, res, body });
            }
        });
    });
}

function getDataINeed(url, body) {
    // Use cheerio to process data..
    // Return mock data for now.. replace with actual data processed by cheerio..
    return `data from ${url}`;
}

async function grabTheDataUpdated(url) {
    try {
        let result = await getCloudScraperData(url);
        let dataINeed = getDataINeed(result.url, result.body);
        return { url, dataINeed };
    } catch (error) {
        return { url, dataINeed: "Error occurred: " + error.message };
    }
}
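One design note on this version: unlike the setTimeout approach, it fires every request at once. If the target site throttles you, a simple variation (the helper name and batch size below are my own, not part of the original code) is to await the list in small chunks:

async function grabAllInBatches(urlList, batchSize) {
    let results = [];
    for (let i = 0; i < urlList.length; i += batchSize) {
        // start at most batchSize requests, wait for them, then move on
        let batch = urlList.slice(i, i + batchSize).map(urlEntry => grabTheDataUpdated(urlEntry.ItemLink));
        results = results.concat(await Promise.all(batch));
    }
    return results;
}

getTheList could then await grabAllInBatches(urlList, 5) instead of building promiseList in one go.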
Related
I'm adding a contact me section to a website. I want to be able to send the data from the forms with JS, and then receive and do something with the data with Node. I understand that there are frameworks and libraries that can handle this stuff, but I would like to build it from scratch so that I have a better understanding of what is happening.
I currently have a section of JS (see below) that takes the form data and sends it as a POST request to the Node script, but I can't seem to wrap my head around what is happening with Node, or how to receive the data in the Node script. Any help pointing me in the right direction is greatly appreciated.
const name = $(".name");
const email = $(".email");
const message = $(".message");
const submitButton = $(".submitButton");
const nameRegex = /([a-zA-Z\s-])/g;
const emailRegex = /^(([^<>()\[\]\\.,;:\s@"]+(\.[^<>()\[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/g;
const messageRegex = /([a-zA-Z\s.,?!$%&])/gm;
const url = "../node/contactMeSubmissionHandler.js";

submitButton.click(function() {
    let nameContents = name.val().match(nameRegex).join("");
    let emailContents = email.val().match(emailRegex).join("");
    let messageContents = message.val().match(messageRegex).join("");
    // if (emailRegex.test(emailContents) == true) {
    //     let emailValid = emailContents;
    // } else {
    //     console.log("Email is invalid");
    // };
    const data = {
        email: emailContents,
        name: nameContents,
        message: messageContents
    }
    $.post(url, data, function(data, status) {
        console.log(`${data} and status is ${status}`);
    })
})
I like to write from scratch too. Here is working code which is called from a command line to get a token.
// clientEx.js
var http = require('http');
var fs = require('fs');
const _SERVER = "dcsmail.net"; /* dcsmail.net */
// Callback function is used to deal with response
//
var callback = function (response)
{
// update stream with data
var body = '';
response.on('data', function(data) {
body += data;
});
response.on ('end', function()
{
// Data received completely.
fs.writeFileSync ("temp.lst", body, 'utf8');
// console.log ("clientEx.js received: " + body);
});
}
if ((process.argv[2] == null) || (process.argv[3] == null) || (process.argv[4] == null) || (process.argv[5] == null))
{
console.log ("clientEx.js usage:<user email> <user password> <destination> <GUID>");
}
else
{
var Ef_email = encodeURI (process.argv[2]);
var Ef_pass = encodeURI (process.argv[3]);
var Ef_dest = encodeURI (process.argv[4]);
var Ef_guid = encodeURI (process.argv[5]);
var post_data = ("f_email=" + Ef_email +
"\&" + "f_pass=" + Ef_pass +
"\&" + "f_dest=" + Ef_dest +
"\&" + "f_guid=" + Ef_guid);
// Options to be used by request
var options = {
host: _SERVER,
port: '80',
path: '/DCSM/tokenP10.php',
method: 'POST',
headers: {
'Content-Type': 'application/x-www-form-urlencoded',
'Content-Length': Buffer.byteLength (post_data)
}
};
// console.log ("clientEx.js using " + _SERVER + ":" + options.port + options.path);
// request the token from the host
try
{
var req = http.request (options, callback);
req.write (post_data);
req.end();
}
catch (error)
{
fs.writeFileSync ("temp.lst", "Host access failed\n", 'utf8');
}
}
You should be able to adapt that to your needs.
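For reference, a hypothetical invocation from the shell would look like this (all four arguments are placeholders, not real credentials):

node clientEx.js user@example.com myPassword someDestination 1234-abcd-5678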
Use this code to create a server and check the console log for the different request attributes.
const http = require('http');
http
.createServer((request, response) => {
console.log(request);
response.end();
})
.listen(3000);
Make GET and POST requests to http://localhost:3000/ and look at method, headers, etc.
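To actually receive the form fields on the Node side, here is a minimal sketch using only the core http and querystring modules (the port and response text are arbitrary; the field names are whatever your $.post call sends):

const http = require('http');
const querystring = require('querystring');
http
  .createServer((request, response) => {
    if (request.method === 'POST') {
      let body = '';
      // the body arrives in chunks; collect them as they stream in
      request.on('data', chunk => { body += chunk; });
      request.on('end', () => {
        // $.post sends application/x-www-form-urlencoded by default
        const fields = querystring.parse(body);
        console.log(fields.name, fields.email, fields.message);
        response.end('received');
      });
    } else {
      response.end();
    }
  })
  .listen(3000);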
So this is my dilemma. I have a list of movies, which I have scraped from a website, and then I want to add additional properties to my newly constructed object (JSON).
Now the OMDb API which I am using supports searching for a movie by title.
Then I make a GET request using the request and q modules. When I receive information from the OMDb API in the callback, I add that data to the object.
The next part is where my problem lies. I want to return a new request using data from the previous request. I make a new GET request and return it, but the then() function isn't returning anything, and I don't see what I am doing wrong.
Here is my code:
var promises = [];
films.forEach(function (film) {
// Get omdbapi information
promises.push(HttpService.getContent(configExternal.omodburl + '?t=' + film.title.trim() + '&y=' + film.year + '&plot=true&tomatoes=true&r=json').then(function (data) {
var result = JSON.parse(data);
if(Boolean(result.Response) === true) {
film.omdb.push(result);
}
var imdbid = result.imdbID;
return HttpService.getContent(configExternal.themoviedburl + imdbid + '/videos?api_key=' + configExternal.themoviedbkey);
}).then(function(data) {
film.trailers = [];
film.trailers.push(JSON.parse(data));
}).catch(function (err) {
logger.error().info('Error getting ' + film.title + ' from omdb, ErrorMessage : ' + err);
}));
});
//--------------------------------
// When all promises have finished
//--------------------------------
Promise.all(promises).then(function (data, err) {
// do stuff with the data
});
And here is my getContent function:
var Service = {
getContent: function(url) {
var deferred = q.defer();
request(url, function (error, response, body) {
if (!error && response.statusCode == 200) {
deferred.resolve(body);
} else {
deferred.reject(error);
}
});
return deferred.promise;
}
};
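For comparison, the same helper written with a native Promise instead of q.defer (a sketch; the behavior is meant to be identical):

var Service = {
    getContent: function (url) {
        return new Promise(function (resolve, reject) {
            request(url, function (error, response, body) {
                if (!error && response.statusCode == 200) {
                    resolve(body);   // hand the raw body back to the caller
                } else {
                    reject(error);
                }
            });
        });
    }
};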
Problem solved. There wasn't anything wrong with the request, as Roamer said, but The Movie Database limits you to 40 requests per 10 seconds, which I didn't know :)
I know this question has been asked many times, but I can't make it work.
Here is my situation. I have a string called data, and I want to unshorten all the links inside that string.
Code:
var Bypasser = require('node-bypasser');
var URI = require('urijs');
var data = 'multiple urls : http://example.com/foo http://example.com/bar';
var result = URI.withinString(data, function(url) {
var unshortenedUrl = null;
var w = new Bypasser(url);
w.decrypt(function(err, res) {
// How can I return res ?
unshortenedUrl = res;
});
// I know the w.decrypt function is an asynchronous function,
// so unshortenedUrl is still null at this point
return unshortenedUrl;
});
Let me walk you through the code.
URI.withinString will match all the URLs in data, manipulate them, and return the result.
You can view an example in the URI.js docs.
What I want to do with these URLs is to unshorten all of them using node-bypasser.
This is from the node-bypasser documentation:
var Bypasser = require('node-bypasser');
var w = new Bypasser('http://example.com/shortlink');
w.decrypt(function(err, result) {
console.log('Decrypted: ' + result);
});
This is the result that I want: multiple urls : http://example.com/foo_processed http://example.com/bar_processed
I created a notebook at tonicdev.com
Solution
var async = require('async'); // Bypasser, data and res come from the surrounding code

var getUrlRegEx = new RegExp(
    "(^|[ \t\r\n])((ftp|http|https|gopher|mailto|news|nntp|telnet|wais|file|prospero|aim|webcal):(([A-Za-z0-9$_.+!*(),;/?:#&~=-])|%[A-Fa-f0-9]{2}){2,}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*(),;/?:#&~=%-]*))?([A-Za-z0-9$_+!*();/?:~-]))"
    , "g"
);
var urls = data.match(getUrlRegEx);

async.forEachLimit(urls, 5, function (url, callback) {
    let w = new Bypasser(url);
    w.decrypt(function (err, res) {
        if (err == null && res != undefined) {
            data = data.replace(url, res);
        }
        // always invoke the callback, even when a link could not be unshortened,
        // so forEachLimit can move on and eventually run the final function
        callback();
    });
}, function(err) {
    res.send(data);
});
You don't really understand what a callback is. A callback allows asynchronous code to run without JavaScript waiting for it. If you add some debug logging to your code:
console.log("Started parsing");
var result = URI.withinString(data, function(url) {
console.log("URL parsed (or whatever)");
var unshortenedUrl = null;
var w = new Bypasser(url);
w.decrypt(function(err, res) {
// How can I return res ?
unshortenedUrl = res;
});
// I know the w.descrypt function is a asynchronous function
// so unshortenedUrl = null
return unshortenedUrl;
});
console.log("Call to library over");
You would (most likely) see messages in this order:
Started parsing
Call to library over
URL parsed (or whatever)
The answer: the callback is not guaranteed to run before any code you execute after registering it. You can't put data in your result variable because the data might not have been fetched yet.
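If you want the final string, one workable pattern (a sketch only; it assumes the same data, URI and Bypasser from the question) is to wrap decrypt in a Promise, resolve every URL first, and only then rebuild the string:

function unshorten(url) {
    return new Promise(function (resolve) {
        new Bypasser(url).decrypt(function (err, res) {
            // fall back to the original URL if unshortening fails
            if (err || res == undefined) { resolve(url); } else { resolve(res); }
        });
    });
}

var urls = [];
URI.withinString(data, function (url) {
    urls.push(url);   // just collect the URLs on the first pass
    return url;
});

Promise.all(urls.map(unshorten)).then(function (resolved) {
    var result = URI.withinString(data, function (url) {
        var i = urls.indexOf(url);
        return i >= 0 ? resolved[i] : url;   // swap in the unshortened version
    });
    console.log(result);
});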
I am a rookie in Node.js and asynchronous programming. I am having a problem executing a GET request inside an asynchronous function. Here I am posting the whole code. I am trying to pull a list of URLs, add them to a list, and send the list for processing to another function.
My problem is with processing them. In turn, for each URL I execute a GET request to fetch the body and look for image elements in it. I then want to pass the image URL to a third-party API as a GET param. I am unable to execute that GET request, as control doesn't seem to reach it at all.
var async = require("async"),
request = require("request"),
cheerio = require("cheerio");
async.waterfall([
function(callback) {
var url = "someSourceUrl";
var linkList = [];
request(url, function(err, resp, body) {
var $ = cheerio.load(body);
$('.list_more li').each(function() {
//Find all urls and add them to a list
$(this).find('a').each(function() {
linkList.push($(this).attr('href'));
});
});
callback(null, linkList);
});
},
//pass all the links as a list to callback
function(liksListFetched, callback) {
for (var i in liksListFetched) {
callback(null, liksListFetched[i]);
}
}],
//***********My problem is with the below code**************
function(err, curUrl) {
var cuResp = "";
console.log("Currently Processing Url : " + curUrl);
request(curUrl, function(err, resp, body) {
var $ = cheerio.load(body);
var article = $("article");
var articleImage = article.find("figure").children('img').attr('src');
var responseGrabbed = "API response : ";
//check if there is an IMG element
if (articleImage === undefined) {
console.log("No Image Found.");
articleImage = 'none';
}
else {
//if there is an img element, pass this image url to an API,
//So do a GET call by passing imageUrl to the API as a GET param
request("http://apiurl.tld?imageurl=" + articleImage, function(error, response, resp) { //code doesn't seem to reach here
// I would like to grab the response and concatenate it to the responseGrabbed var
console.log(resp);
responseGrabbed += resp;
});
}
console.log(responseGrabbed);// api response never gets concatenated :(
console.log("_=_=_=_=_=_=__=_=_=_=_=_=__=_=_=_=_=_=__=_=_=_=_=_=_");
process.exit(0);
});
});
I would appreciate it if anyone can help me understand the root cause. Thanks in advance.
request() is asynchronous, so when you console.log the string, it hasn't been built yet; you have to do the logging inside the callback:
request("http://apiurl.tld?imageurl=" + articleImage, function(error, response, resp) {
responseGrabbed += resp;
console.log(responseGrabbed);// api response never gets concatenated :(
console.log("_=_=_=_=_=_=__=_=_=_=_=_=__=_=_=_=_=_=__=_=_=_=_=_=_");
});
Same goes for terminating the process, which should be done when all the requests have finished
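A minimal sketch of that idea (assuming liksListFetched is the array produced by the first waterfall step) is to count finished requests and only wrap up once the counter reaches zero:

var remaining = liksListFetched.length;
liksListFetched.forEach(function (curUrl) {
    request(curUrl, function (err, resp, body) {
        // ... process the page and the API response as before ...
        remaining -= 1;
        if (remaining === 0) {
            console.log('all URLs processed');
            // only now is it safe to exit (or simply let Node finish on its own)
        }
    });
});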
Firstly, here is my code as I've progressed so far:
var http = require("http");
// Utility function that downloads a URL and invokes
// callback with the data.
function download(url, callback) {
http.get(url, function(res) {
var data = "";
res.on('data', function (chunk) {
data += chunk;
});
res.on("end", function() {
callback(data);
});
}).on("error", function() {
callback(null);
});
}
var cheerio = require("cheerio");
var url = "http://www.bloglovin.com/en/blogs/1/2/all";
var myArray = [];
var a = 0;
var getLinks = function(){download(url, function(data) {
if (data) {
// console.log(data);
var $ = cheerio.load(data);
$(".content").each(function(i, e) {
var blogName = $(e).find(".blog-name").text();
var followLink = $(e).find("a").attr("href");
var blogSite = $(e).find(".description").text();
myArray[a] = [a];
myArray[a]["blogName"] = blogName;
myArray[a]["followLink"] = "http://www.bloglovin.com"+followLink;
myArray[a]["blogSite"] = blogSite;
a++;
console.log(myArray);
});
}
});
}
getLinks();
As you can see, followLink is concatenated onto the base URL, and I'd like to pass each resulting URL back through the same download function, so effectively I'll be scraping each of those pages using the same CSS rules and adding the results to the multidimensional array for the corresponding blogger.
How can I go about this?
I do something similar in one of my scraping jobs, but I use the async.js library to accomplish it. Note that I'm also using the request module and cheerio.js in my scraping. I fetch and scrape rows of data from a single web page, but I suspect you could do something similar to fetch the URLs and request/scrape them in the same manner.
I also admit this is quite basic coding; it could certainly be optimized with a bit of refactoring. Hope it gives you some ideas at least...
First, I use request to fetch the page and call my parse function -
var url = 'http://www.target-website.com';

// this function sits inside my async.js flow, hence the lastCallback parameter
function(lastCallback) {
    request(url, function(err, resp, body) {
        if(!err) { parsePage(err, resp, body, lastCallback); }
        else { console.log('web request error:' + resp.statusCode); }
    });
}
Next, in my parsePage function, I load the website into Cheerio, fetch the HTML of each data row into an array, push my parseRow function and each HTML segment into another array, and use async.parallel to process each iteration -
var rows = [];

function parsePage(err, resp, body, callback1) {
    var $ = cheerio.load(body);
    $('div#targetTable tr').each(function(i, elem) {
        rows.push($(this).html());
    });
    var scrRows = [];
    rows.forEach(function(row) {
        scrRows.push(function(callback2) {
            parseRow(err, resp, row);
            callback2();
        });
    });
    async.parallel(scrRows, function() {
        callback1();
    });
}
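Adapted to the question (a rough sketch only; it assumes var async = require("async"), and that each myArray entry carries the followLink you collected), the same pattern would look something like this:

// assumes: var async = require("async"); download, cheerio and myArray come from the question
var tasks = myArray.map(function (entry) {
    return function (done) {
        download(entry.followLink, function (data) {
            if (data) {
                var $ = cheerio.load(data);
                // ... scrape the blogger's own page with the same CSS rules ...
            }
            done();
        });
    };
});
async.parallel(tasks, function () {
    console.log('all follow links scraped');
});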
Inside your loop, just create an object with the properties you scrape then push that object onto your array.
var blogInfo = {
blogName: blogName,
followLink: "http://www.bloglovin.com"+followLink;
blogSite: blogSite
};
myArray.push(blogInfo);
You have defined a = 0, so
myArray[a] = [a]; // => myArray[0] = [0]; myArray[0] becomes an array with 0 as its only member
The statements below then attach named properties to that inner array. That works syntactically, but an array is meant to hold indexed elements; the named properties won't count as array items and JSON.stringify will drop them.
myArray[a]["blogName"] = blogName;
myArray[a]["followLink"] = "http://www.bloglovin.com"+followLink;
myArray[a]["blogSite"] = blogSite;
Instead try this:
var obj = {
index: a,
blogName: blogName,
followLink: "http://www.bloglovin.com" + followLink,
blogSite: blogSite
}
myArray.push(obj);
console.log(myArray);
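As a quick sanity check (the output shape below is illustrative; values depend on what you scrape), with plain objects in the array every property survives JSON serialization, whereas named properties attached to array entries would be dropped:

console.log(JSON.stringify(myArray, null, 2));
// [
//   {
//     "index": 0,
//     "blogName": "...",
//     "followLink": "http://www.bloglovin.com/...",
//     "blogSite": "..."
//   },
//   ...
// ]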