i am an absolute beginner learning js and node.js and i am building a simple scraper. the code is scraping multiple domains . i would like to store information such as the titles, meta tags such as description, etc for each of the the domains scraped into a hash table but i have no idea how to proceed. can you shortly explain how to do it? here you have the code
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
var arr = ["http://allrecipes.com/", "http://www.gossip.fr/" ];
console.log("Visiting pages now... ");
for (var i = 0; i < arr.length; i++) {
setTimeout(request, 5000 * i, arr[i], function (error, response, body) {
if(error) {
console.log("Error: " + error);
}
console.log("Status code: " + response.statusCode);
if(response.statusCode === 200) {
var $ = cheerio.load(body);
console.log("Page title: " + $('title').text());
}
});
}
i modify the code as you can see below but instead of storing into the hash the title for each domains scraped, it store the result just for the last domains of the array. see code below
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
var output = {};
var arr = ["http://allrecipes.com/", "http://www.gossip.fr/", "http://www.clicrbs.com.br/rs/" ];
console.log("Visiting pages now... ");
for (var i = 0; i < arr.length; i++) {
var result = arr[i];
setTimeout(request, 5000 * i, arr[i], function (error, response, body) {
if(error) {
console.log("Error: " + error);
}
console.log("Status code: " + response.statusCode);
if(response.statusCode === 200) {
var $ = cheerio.load(body);
console.log("Page title: " + $('title').text());
}
{
output[result] = {error: error, title: $('title').text(), status: response.statusCode}
}
});
}
Use let in place of var. That way you create a new let variable in each iteration, instead of overwriting the old var.
Correct:
let output = {}
let a = ['a', 'b', 'c']
for(let i=0; i<a.length; i++) {
let result = a[i]; setTimeout(x => output[result] = i, i*5)
}
setTimeout(() => document.write(JSON.stringify(output)), 20)
Incorrect:
let output = {}
let a = ['a', 'b', 'c']
for(let i=0; i<a.length; i++) {
var result = a[i]; setTimeout(x => output[result] = i, i*5)
}
setTimeout(() => document.write(JSON.stringify(output)), 20)
You can read more about let and lexicographic scoping on Mozilla Developer Network.
Related
async postList(arr){
console.log(arr);
console.log(arr[0]);
Debugger + console picture: https://prnt.sc/23q4jra
as seen in the picture of the debugger console.log(arr) returns an array with a value in the [0] position, the very next line arr[0] returns "undefined" and arr.length returns "0"
how is it possible?
the function that calls this function:
async mountedCall(){
var composedArray = await this.createList();
document.getElementById('listWrap_players').appendChild(await this.postList(composedArray));
},
createList():
async createList(){
var composedArray = [];
const id = document.getElementById('joinCode').innerHTML;
var player_count = null;
await firebase.database().ref('lobbies/' + id + '/playerCount/').once('value', (snapshot) => {
const data = snapshot.val();
player_count = data;
}).then(function() {
for(var i = 1; i <= player_count; i++){
var iStr = String(i);
const player_names_snapshot = firebase.database().ref('lobbies/' + id + '/players/' + iStr);
player_names_snapshot.once('value', (snapshot) => {
const data = snapshot.val();
composedArray.push(data);
}).then(function(){return;});
}
});
this.isLeader(id);
return composedArray;
},
UPDATE:
Tried to replace console.log with console.log(JSON.stringify(arr)) as suggested below
console.log(JSON.stringify(arr)) returns an empty array so I think it means I have synchronization problem in createList() or in mountedCall(), yet I cant seem to find it. I've used await and .then() in every location possible...
here are all the functions together:
async mountedCall(){
var composedArray = await this.createList();
document.getElementById('listWrap_players').appendChild(await this.postList(composedArray));
},
async removeAllChildNodes(list) {
while(list.firstChild){
list.removeChild(list.firstChild);
}
},
async postList(arr){
console.log(JSON.stringify(arr));
console.log(arr[0]);
var list = document.createElement('ul');
for(let i = 0; i < arr.length; i++){
var item = document.createElement('li');
item.appendChild(document.createTextNode(arr[i]));
list.appendChild(item);
}
const listContainer = document.getElementById('listWrap_players');
this.removeAllChildNodes(listContainer);
return list;
},
async createList(){
var composedArray = [];
const id = document.getElementById('joinCode').innerHTML;
var player_count = null;
await firebase.database().ref('lobbies/' + id + '/playerCount/').once('value', (snapshot) => {
const data = snapshot.val();
player_count = data;
}).then(function() {
for(var i = 1; i <= player_count; i++){
var iStr = String(i);
const player_names_snapshot = firebase.database().ref('lobbies/' + id + '/players/' + iStr);
player_names_snapshot.once('value', (snapshot) => {
const data = snapshot.val();
composedArray.push(data);
}).then(function(){return;});
}
});
return composedArray;
},
I have a CSV file with ~20,000 records. I send each line using the $.post method to my server using the FileReader API.
The problem is that the browser is buffering each record before starting to send the data and this way is very slow. I want to send each line separately to show a progressbar where it counts the request number of each line.
As this solution is very slow I'm thinking there are must be other ways of doing this to make it faster. Many thanks to your ideas.
$("#form_file").change(function(e) {
if (e.target.files != undefined) {
var reader = new FileReader();
reader.onload = function(e) {
var rows = e.target.result.split("\n");
var index = rows[0];
index = index.split(";");
gesamt = rows.length - 1;
for (var i = 1; i < rows.length; i++) {
var row = rows[i];
cells = row.split(";");
var dataset = {};
for (var ii = 0; ii < cells.length; ii++) {
var value = cells[ii];
var key = index[ii]
var printError = function(error, explicit) {
console.log(`[${explicit ? 'EXPLICIT' : 'INEXPLICIT'}] ${error.name}: ${error.message}`);
}
try {
dataset[key] = value;
} catch (e) {
if (e instanceof RangeError) {
if (e.message.toLowerCase().indexOf('invalid array') !== -1) {
printError(e, true);
} else {
printError(e, false);
}
} else {
printError(e, false);
}
}
}
console.log(dataset);
row = insertrow(dataset, i);
$('#progressbar').show();
$('#progressvalue').text(i + '/' + gesamt);
$('#progresstitle').text('(' + dataset.title + ')');
}
};
var test = reader.readAsText(e.target.files.item(0));
}
});
function insertrow(mydata, step) {
var token = "{{app.request.query.get('_token')}}";
mydata = JSON.stringify(mydata);
$.post('preferences/upload?_token=' + token, {
data: mydata
}, function(data) {
$('#info').show();
var html = data.message + '<br />';
$('#info').append(html);
}, "json");
}
I have html with tables and in html I included js script bom.js:
$(document).ready(function() {
setTimeout(function() {
var nums = [];
$('#table_Serv tr td.serv-nomer').each(function (elem, ind) {
nums[parseInt($(this).text())] = elem + 1;
// nums.newNum[elem] = elem + 1;
// nums.oldNum[elem] = parseInt($(this).text());
$(this).text(elem + 1);
});
$('#test1234 tr td:nth-child(2)').each(function() {
// extract each number in an array
const numbers = $(this).html().split(',').map(x => Number(x));
// var sortednum = [];
var arrs = []
for(var i = 0; i <= numbers.length; i++) {
arrs[numbers[i]] = nums[numbers[i]];
}
// console.log(sortednum);
// // Sort the numbers
const sorted = arrs.sort((a, b) => a > b);
;
const stringArray = sorted.reduce((tmp, x) => `${tmp},${x}`);
// Insert the string back in the td
$(this).text(stringArray);
});
}, 1000);
});
When I go to html page, script working and get me table with changes by javascript. When jsdom get this html, jsdom is not executing javascript and get me clean html.
My index.js:
'use strict';
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
var fs = require('fs');
const options = {
resources: 'usable',
runScripts: 'dangerously',
};
var url = process.argv[2] !== undefined ? process.argv[2] : 'http://google.ru';
JSDOM.fromURL(url, options).then(dom => {
setTimeout(function() {
var cont = dom.window.document.getElementsByTagName('html')[0].innerHTML;
console.log(cont);
var file = url.split('/');
var filename = file[file.length - 1];
fs.writeFile("file.html" + filename, cont, function(err) {
if(err) {
return console.log(err);
}
// console.log("The file was saved!");
});
}, 1000)
});
How I can resolve this solutions? I need save html after javascript processing. Get html by javascript. How I can do it?
I tryied this:
JSDOM.fromURL(url, options).then(dom => {
dom.window.onload = function() {
var cont = dom.window.document.getElementsByTagName('html').[0].innerHTML;
console.log(cont);
var file = url.split('/');
var filename = file[file.length - 1];
fs.writeFile("file.html" + filename, cont, function(err) {
if(err) {
return console.log(err);
}
// console.log("The file was saved!");
});
};
});
Not working :( window.onload method is not want working..
I'm having a problem where for(var x=1; x < 6; x++) is getting called because too fast axios.get() is async, but I have no idea how to counter that without the solution being too complicated
const axios = require("axios");
const cheerio = require("cheerio");
function imdbGetData(id) {
var title, show, $;
var arr = [];
var airdates = [];
show = {
seasons: []
};
axios.get(`http://www.imdb.com/title/${id}/`).then((body) => {
$ = cheerio.load(body.data);
title = $("div h1").text()
});
for(var x=1; x < 6; x++) {
console.log(x); // Will count too 1,2,3,4,5,6
url = `http://www.imdb.com/title/${id}/episodes?season=${x}`
axios.get(url).then((body) => {
$ = cheerio.load(body.data);
console.log(x);// 6, 6, 6, 6
$("div .info .airdate").each(function(index, item) {
var airdate = String($(this).text());
airdates.push(airdate.trim());
});
$(".info strong a").each(function(i, item){
var airdate = airdates[i];
var epsiode_name = $(this).text()
if (epsiode_name && !epsiode_name.includes("#"))
arr.push({epsiode_name, airdate});
});
show.seasons.push(arr);
arr = []
// console.log(show.seasons);
});
setTimeout(() => {console.log(show.seasons)}, 10000) // ghetto
}
}
// season = {
// seasons: [[ {epsiode_name} ], [{Epsiode name}]]
// }
imdbGetData("tt2193021");
You can construct and push all promises to array, and then use Promise.all(arrayOfPromises). This way you will keep your asynchronous chain and you can easily handle results very similar to regular single asynchronous operation:
var promises = [];
for (var x = 1; x < 6; x++) {
url = `http://www.imdb.com/title/${id}/episodes?season=${x}`
promises.push(axios.get(url));
}
Promise.all(promises)
.then(body => {
// all results of promises will be in 'body' parameter
})
.catch(err => console.error(err));
You can also use async/await (in newer versions of Node.js), so you can make the code a little easier to read, I've made a few little changes to update progress too.
const axios = require("axios");
const cheerio = require("cheerio");
async function imdbGetData(id) {
var title, show, $;
var arr = [];
var airdates = [];
show = {
seasons: []
};
console.log('Getting from ' + `http://www.imdb.com/title/${id}/`);
let body = await axios.get(`http://www.imdb.com/title/${id}/`);
$ = cheerio.load(body.data);
title = $("div h1").text()
for(var x=1; x < 6; x++) {
console.log('Getting season: ' + x); // Will count too 1,2,3,4,5,6
url = `http://www.imdb.com/title/${id}/episodes?season=${x}`
let body = await axios.get(url);
$ = cheerio.load(body.data);
$("div .info .airdate").each(function(index, item) {
var airdate = String($(this).text());
airdates.push(airdate.trim());
});
$(".info strong a").each(function(i, item){
var airdate = airdates[i];
var epsiode_name = $(this).text()
if (epsiode_name && !epsiode_name.includes("#"))
arr.push({epsiode_name, airdate});
});
show.seasons.push(arr);
arr = []
}
console.log("Result: ", show.seasons);
}
imdbGetData("tt2193021");
You can simply use ES6 let instead of var , your code will be:
for(let i=0; i<length; i++){
asyncCall(function(){
console.log(i);// will print 0,1,2,3,...
});
}
Please check this article https://codeburst.io/asynchronous-code-inside-an-array-loop-c5d704006c99
I was following this tutorial when a wild step 9 appears.
This problem is the same as the previous problem (HTTP COLLECT) in that you need to use http.get(). However, this time you will be provided with three URLs as the first three command-line arguments.
You must collect the complete content provided to you by each of the URLs and print it to the console (stdout). You don't need to print out the length, just the data as a String; one line per URL. The catch is that you must print them out in the same order as the URLs are provided to you as command-line arguments.
My code was (It doesn't work fine just when he pleases):
http = require("http");
var url = [process.argv[2], process.argv[3], process.argv[4]];
var responses = [];
var completed_responses = 0;
for(var i in url){
http.get(url[i], function(response){
var content = "";
//if(completed_responses == url.length){
response.setEncoding("utf-8");
response.on("data", function(data){
content += data;
})
response.on("error", console.error);
response.on("end", function(end){
console.log(content);
});
})
}
And the answer was:
var http = require("http");
var bl = require("bl");
var results = [];
var count = 0;
function printResults(){
for(var i = 0; i < 3; i++)
console.log(results[i]);
}
function httpGet(index){
http.get(process.argv[2 + index], function(response){
response.pipe(bl(function(err, data){
if (err)
return console.error(err);
results[index] = data.toString();
count++;
if(count == 3)
printResults()
}))
})
}
for(var i = 0; i < 3; i++)
httpGet(i);
What is the right answer WITHOUT BL/AFTER/ETC?
Thanks to all!
I've done that tutorial myself when I was first learning node and I remember that step of the tutorial. The solution was fairly underwhelming. Anyway, for your answer:
NodeJs Asynchronous programming - Coordinating parallel calls
You can check the code in the question and make the fixes I suggested in my answer. That should solve it without BL/Async/Whatever else that tutorial mentions.
Here is my code for the Juggling Async challenge without using any third-party libraries.
var http = require("http");
var urls = [process.argv[2], process.argv[3], process.argv[4]];
var urlResults = new Array("", "", "");
var allDoneCount = 0;
urls.forEach(function (_url) {
http.get(_url, function (resp) {
resp.on("data", function (data) {
if (_url === urls[0]) {
urlResults[0] += data.toString();
} else if (_url === urls[1]) {
urlResults[1] += data.toString();
} else {
urlResults[2] += data.toString();
}
})
resp.on("end", function () {
allDoneCount++;
if (allDoneCount === 3) {
console.log(urlResults[0]);
console.log(urlResults[1]);
console.log(urlResults[2]);
}
})
resp.on("error", function (err) {
console.log(err);
})
}).on("error", function (err) {
console.log(err);
})
})
This is how you can do it without any external modules(except http ;P).
const http = require('http'); //http module
let results = ["", "", ""]; //this will store the data from http.get()
let counter = 0; //to keep a counter for no of httpget's done
//it will iterate when counter is 3 i.e. the 'end' for all
function print() {
for (let i = 0; i < 3; i++) {
console.log(results[i]);
}
}
//accept index(for process.argv) as parameter
function httpGetter(i) {
//http.get method on the url first encountered, 2+i because 2 values are reserved
http.get(process.argv[2 + i], (res) => {
//for converting (res)ponse to string/alternatively toString() method can be used
res.setEncoding('utf8');
//event data on the url, callback with recived chunk as parameter
res.on('data', function(chunk) {
//appending the recived chunk to that element of results corresponding to 'i' of httpGetter function
results[i] += chunk;
});
//event end, when no more data is read
//runs every time for each value of 'i' that is for each url
res.on('end', function() {
//to keep count
counter++;
//when 3 that is when data from all inputs receved
if (counter === 3) {
//print function simply iterating over results array
print();
}
});
})
}
//inputs are recieved from here
for (let i = 0; i < 3; i++) {
//i can be index for results
httpGetter(i);
}