request get stuck from nodejs - javascript

i wrote some code in nodejs. it's about to get html from website.
var request = require('request');
var cheerio = require('cheerio');
var path = require('path');'
var fs = require('fs');
var num = 0;
var arc = '';
for(var i=0; i < 20000; i++) {
num++;
var requrl = 'http://127.0.0.1/'+ num + '.html';
request(requrl, function (error, response, body) {
if (!error && response.statusCode == 200) {
console.log(body); //get body
}
})
}
i just sent request to get html from 1.html to 20000.html
it starts okay, but it soon get stuck in about 1100.html
the console doesn't throw any error or exceptions.
it just stuck there, any solutions?

Related

how to handle string comparison and file writing using xlsx with nodejs

This script is to get the title of the webpage where the URL of the website will be passed from an excel file, check to see if the title contains the keyword, and then store that domain in the new excel file.
There is no issue with the partial code, but the title comparison does not work as expected. Does anyone have an idea how to fix it?
here is my code
var request = require("request");
var cheerio = require("cheerio");
const xlsx = require("xlsx");
jsonData = [{ Domain: "blogger.com" }, { Domain: "stackoverflow.com" }];
function fetchTitle(url, onComplete = null) {
request(url, function (error, response, body) {
var output = url; // default to URL
if (!error && (response && response.statusCode) === 200) {
var $ = cheerio.load(body);
console.log(`URL = ${url}`);
var title = $("head > title").text().trim();
console.log(`Title = ${title}`);
output = `[${title}] (${url})`;
var keywords = ["Developers", "blog"];
var results = [];
var UrlArray = [];
for (var i = 0; i < keywords.length; i++) {
var match = title.match(new RegExp(keywords.join("|"), "g"));
results.push(keywords[i]);
}
if (match.length > 0) {
UrlArray.push({
Domain: url,
Keywords: results,
Title: output,
});
finalJsonData = JSON.stringify(UrlArray);
const ws = xlsx.utils.json_to_sheet(UrlArray);
const wb = xlsx.utils.book_new();
xlsx.utils.book_append_sheet(wb, ws, "Responses");
xlsx.writeFile(wb, "output.xlsx");
}
} else {
console.log(
`Error = ${error}, code = ${response && response.statusCode}`
);
}
console.log(`output = ${output} \n\n`);
if (onComplete) onComplete(output);
});
}
jsonData.forEach(function (table) {
var tableName = table.Domain;
var URL = "http://" + tableName;
fetchTitle(URL);
});
When I execute the script, I am able to get the title, but when I compare it with the keyword, it is not working as expected. Keywords are not being stored. You can see how the output looks after executing the script.
The script shows that both domains have keywords, but only blogger is stored in the spreadsheet, even then keywords aren't stored
you're overwriting the file on each loop,
keywords is an array, so it doesn't get saved, furthermore, keywords column will always contain all keywords, not the matching ones...
as requests are async, you need to track them all, and write results only when all requests are finished.
try this:
match case insensitive, and store only matching keywords for that site, not all (I also added "no match" for domains with no match)
store results outside the loop
move writing results into a separate function
add request counter and callback to track requests
write results when requests are done
the code:
var request = require("request");
var cheerio = require("cheerio");
const xlsx = require("xlsx");
const jsonData = [{ Domain: "blogger.com" }, { Domain: "stackoverflow.com" }];
var UrlArray = [];
function writeResults() {
const finalJsonData = JSON.stringify(UrlArray);
const ws = xlsx.utils.json_to_sheet(UrlArray);
const wb = xlsx.utils.book_new();
xlsx.utils.book_append_sheet(wb, ws, "Responses");
xlsx.writeFile(wb, "output.xlsx");
}
function fetchTitle(url, onComplete = null) {
request(url, function (error, response, body) {
var output = url; // default to URL
if (!error && (response && response.statusCode) === 200) {
var $ = cheerio.load(body);
console.log(`URL = ${url}`);
var title = $("head > title").text().trim();
console.log(`Title = ${title}`);
output = `[${title}] (${url})`;
var keywords = ["Developers", "blog"];
var results = [];
for (var i = 0; i < keywords.length; i++) {
let match = title.match(new RegExp(keywords[i], "gi"));
if (match && match.length > 0) {
results.push(keywords[i]);
}
}
UrlArray.push({
Domain: url,
Keywords: results.length > 0 ? results.join(', ') : 'no match',
Title: output,
});
} else {
console.log(
`Error = ${error}, code = ${response && response.statusCode}`
);
}
console.log(`output = ${output} \n\n`);
if (onComplete) onComplete(output);
});
}
let counter = 0;
jsonData.forEach(function (table) {
var tableName = table.Domain;
var URL = "http://" + tableName;
fetchTitle(URL, ()=>{
counter++;
if(counter === jsonData.length) {
console.log(`all ${counter} requests done`);
writeResults();
}
});
});

Download a file from web using Node js and loop

I want to download multiple files from the web using this code:
var fs = require('fs');
var http = require('http');
var request = require('request');
var file;
for(var i = 1; i <= 5; i++) {
//CHECK IF REMOTE FILE EXISTS
request('http://webaddress.com/filename' + i + '.jar', function (err, resp) {
//IF EXISTS DO
if (resp.statusCode == 200) {
//DOWNLOAD DATA AND CREATE A NEW .JAR FILE
file = fs.createWriteStream('D:\\filename' + i + '.jar');
http.get('http://webaddress.com/filename' + i + '.jar', function(response) {
response.pipe(file);
file.on('finish', function() {
file.close();
});
});
}
//FILE DOES NOT EXIST
});
}
The result I want is: multiple files downloaded with filenames filename1-5.jar. The result I am getting is just 1 file with filename filename5.jar (or the last value of the i var in the loop). What am I doing wrong?
Like #Ionut said your requests are async so you need to wait for it
let fs = require('fs');
let request = require('request');
let download = (uri, filename) => {
return new Promise ((resolve, reject) => {
request.head(uri, function(err, res) {
if (res.statusCode === 200) {
request(uri).pipe(fs.createWriteStream(filename)).on('close', resolve);
} else {
reject(res.statusCode);
}
});
});
};
let promises = [];
for(let i = 1; i <= 5; i++) {
promises.push(download('http://webaddress.com/filename' + i + '.jar', 'D:\\filename' + i + '.jar'));
}
Promise.all(promises).then(() => {
process.exit(0);
});
Your request is asynchronous and it will execute only after your loop finishes hence the 5 from the filename. A solution for this is to threat your code separately by creating a new function and call it inside the loop:
var fs = require('fs');
var http = require('http');
var request = require('request');
var file;
function customRequest(i){
//CHECK IF REMOTE FILE EXISTS
return request('http://webaddress.com/filename' + i + '.jar', function(err, resp) {
//IF EXISTS DO
if (resp.statusCode == 200) {
//DOWNLOAD DATA AND CREATE A NEW .JAR FILE
file = fs.createWriteStream('D:\\filename' + i + '.jar');
http.get('http://webaddress.com/filename' + i + '.jar', function(response) {
response.pipe(file);
file.on('finish', function() {
file.close();
});
});
}
//FILE DOES NOT EXIST
});
}
for (var i = 1; i <= 5; i++) {
customRequest(i)
}

Using Node.js to retrieve the results count from google

To give the specific details:
I am designing a Discord Bot that uses the eris library for private use. One requested feature is a googlefight command which needs to do the following:
Take user input.
Use input to generate a Google search URL.
Pull the resulting HTML page and extract from it the text from within <div class="sd" id="resultStats">.
Output this result as a message.
The currently existing related code is:
const Eris = require("eris");
const express = require('express');
const request = require('request');
const cheerio = require('cheerio');
const app = express();
...
bot.registerCommand("googlefight", (msg, args) => {
if(args.length === 0) {
return "Invalid input";
}
var arrayLength = args.length;
var searchURL = "https://www.google.com/search?q=";
for (var i = 0; i < arrayLength; i++) {
searchURL = searchURL.concat(args[i]);
if (i + 1 < arrayLength) {
searchURL = searchURL.concat("%20");
} else {
}
}
var text;
app.get('/', function(req, res){
request(searchURL, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
$(.sd).filter(function(){
var data = $(this);
text = data.children().first().text();
})
}
})
})
return text;
}, {
description: "Result counter",
fullDescription: "The bot will return the result count of a google search term.",
usage: "<text>"
});
I am unfamiliar with this type of work.
The part specifically that is broken is this section:
...
app.get('/', function(req, res){
request(searchURL, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
$(.sd).filter(function(){
var data = $(this);
text = data.children().first().text();
})
}
})
})
...

argument handle for nested nodejs requests

In some parent pages, there are some child page anchors I need.I want to crawl all the parent pages, parse them, then get the child anchor, follow the anchor, and get the result.but when i write the code, i found, before i follow the anchor, the anchor url didn't change.here's my code:
var req = require('request');
var cheerio = require('cheerio')
var model = require('./model')
function callnext(index){
var url = 'http://www.youku.com/show_episode/id_z2c9b63e691e611e2b356.html?dt=json&divid=reload_'+index+'&__rt=1&__ro=reload_21';
var result = req.get(url, function(error, response, body){
if (!error && response.statusCode == 200) {
var patt = /暂无内容/g;
var result = patt.test(body);
if(result){
return;
}
$ = cheerio.load(body);
var children = $('div').first().children();
for(var i=0;i<children.length;i++){
var item = $(children[i]);
var anchor = $(item.find('li>a')[0]).attr('href');
var labelText = $(item.find('label')[0]).text();
//TAG 1
req.get(anchor, function(error, response, body){
//TAG 2
console.log(anchor);
//here's my result
})
}
index = index+20;
callnext(index)
}
})
}
callnext(1);
In this code, if i console.log() the anchor url at TAG1 place and TAG2 place, it cames different result.
in TAG 1, it's my expected result, but at TAG 2,it seems only printout the first anchor of the parent page.
i tried to changed the code and extract the sub request function, the cames the right result.why?
var req = require('request');
var cheerio = require('cheerio')
var model = require('./model')
function crawlItem(url, text){
req.get(url, function(error, response, body){
console.log(url)
var inner = cheerio.load(body);
var text = inner('#text_long').text();
// model.Talk.create({ id: la, video: hr, youku_desc:text }).complete(function(err, album) {
// console.log(err);
// });
})
}
function callnext(index){
var url = 'http://www.youku.com/show_episode/id_z2c9b63e691e611e2b356.html?dt=json&divid=reload_'+index+'&__rt=1&__ro=reload_21';
var result = req.get(url, function(error, response, body){
if (!error && response.statusCode == 200) {
var patt = /暂无内容/g;
var result = patt.test(body);
if(result){
return;
}
$ = cheerio.load(body);
var children = $('div').first().children();
for(var i=0;i<children.length;i++){
var item = $(children[i]);
var anchor = $(item.find('li>a')[0]).attr('href');
var labelText = $(item.find('label')[0]).text();
// console.log(anchor);
crawlItem(anchor, labelText);
}
index = index+20;
callnext(index)
}
})
}
callnext(1);

How can I get node.js to return data once all operations are complete

I am just learning server-side JavaScript so please bear with any glaring mistakes I've made.
I am trying to write a file parser that operates on HTML files in a directory and returns a JSON string once all files have been parsed. I started it with a single file and it works fine. it loads the resource from Apache running on the same machine, injects jquery, does the parsing and returns my JSON.
var request = require('request'),
jsdom = require('jsdom'),
sys = require('sys'),
http = require('http');
http.createServer(function (req, res) {
request({uri:'http://localhost/tfrohe/Car3E.html'}, function (error, response, body) {
if (!error && response.statusCode == 200) {
var window = jsdom.jsdom(body).createWindow();
jsdom.jQueryify(window, 'http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js', function (window, jquery) {
// jQuery is now loaded on the jsdom window created from 'body'
var emps = {};
jquery("tr td img").parent().parent().each(function(){
var step = 0;
jquery(this).children().each(function(index){
if (jquery(this).children('img').attr('src') !== undefined) {
step++;
var name = jquery(this).parent().next().next().children('td:nth-child('+step+')').children().children().text();
var name_parts = name.split(",");
var last = name_parts[0];
var name_parts = name_parts[1].split(/\u00a0/g);
var first = name_parts[2];
emps[last + ",_" + first] = jquery(this).children('img').attr('src');
}
});
});
emps = JSON.stringify(emps);
//console.log(emps);
res.writeHead(200, {'Content-Type': 'text/plain'});
res.end(emps);
});
} else {
res.writeHead(200, {"Content-Type": "text/plain"});
res.end("empty");
//console.log(response.statusCode);
}
});
}).listen(8124);
Now I am trying to extend this to using the regular file system (fs) and get all HTML files in the directory and parse them the same way and return a single combined JSON object once all files have been parsed. Here is what I have so far but it does not work.
var sys = require("sys"),
fs = require("fs"),
jsdom = require("jsdom"),
emps = {};
//path = '/home/inet/www/media/employees/';
readDirectory = function(path) {
fs.readdir(path, function(err, files) {
var htmlfiles = [];
files.forEach(function(name) {
if(name.substr(-4) === "html") {
htmlfiles.push(name);
}
});
var count = htmlfiles.length;
htmlfiles.forEach(function(filename) {
fs.readFile(path + filename, "binary", function(err, data) {
if(err) throw err;
window = jsdom.jsdom(data).createWindow();
jsdom.jQueryify(window, 'http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js', function (window, jquery) {
jquery("tr td img").parent().parent().each(function(){
var step = 0;
jquery(this).children().each(function(index){
if (jquery(this).children('img').attr('src') !== undefined) {
step++;
var empname = jquery(this).parent().next().next().children('td:nth-child('+step+')').children().children().text();
var name_parts = empname.split(",");
var last = name_parts[0];
var name_parts = name_parts[1].split(/\u00a0/g);
var first = name_parts[2]
emps[last + ",_" + first] = jquery(this).children('img').attr('src');
}
});
});
});
});
});
});
}
readDirectory('/home/inet/www/media/employees/', function() {
console.log(emps);
});
In this particular case, there are 2 html files in the directory. If i console.log(emps) during the htmlfiles.forEach() it shows me the results from the first file then the results for both files together the way I expect. how do I get emps to be returned to readDirectory so i can output it as desired?
Completed Script
After the answers below, here is the completed script with a httpServer to serve up the detail.
var sys = require('sys'),
fs = require("fs"),
http = require('http'),
jsdom = require('jsdom'),
emps = {};
var timed = setInterval(function() {
emps = {};
readDirectory('/home/inet/www/media/employees/', function(emps) {
});
}, 3600000);
readDirectory = function(path, callback) {
fs.readdir(path, function(err, files) {
var htmlfiles = [];
files.forEach(function(name) {
if(name.substr(-4) === "html") {
htmlfiles.push(name);
}
});
var count = htmlfiles.length;
htmlfiles.forEach(function(filename) {
fs.readFile(path + filename, "binary", function(err, data) {
if(err) throw err;
window = jsdom.jsdom(data).createWindow();
jsdom.jQueryify(window, 'http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js', function (window, jquery) {
var imagecount = jquery("tr td img").length;
jquery("tr td img").parent().parent().each(function(){
var step = 0;
jquery(this).children().each(function(index){
if (jquery(this).children('img').attr('src') !== undefined) {
step += 1;
var empname = jquery(this).parent().next().next().children('td:nth-child('+step+')').children().children().text();
var name_parts = empname.split(",");
var last = name_parts[0];
var name_parts = name_parts[1].split(/\u00a0/g);
var first = name_parts[2]
emps[last + ",_" + first] = jquery(this).children('img').attr('src');
}
});
});
count -= 1;
if (count <= 0) {
callback(JSON.stringify(emps));
}
});
});
});
});
}
var init = readDirectory('/home/inet/www/media/employees/', function(emps) {
});
http.createServer(function (req, res) {
res.writeHead(200, {'Content-Type': 'text/plain'});
res.end(JSON.stringify(emps));
}).listen(8124);
That sure is a lot of code a couple of mistakes.
You're never calling the callback function you supply to readDirectory
You need to keep track of the files you have parsed, when you parsed all of them, call the callback and supply the emps
This should work:
var sys = require("sys"),
fs = require("fs"),
jsdom = require("jsdom"),
//path = '/home/inet/www/media/employees/';
// This is a nicer way
function readDirectory(path, callback) {
fs.readdir(path, function(err, files) {
// make this local
var emps = {};
var htmlfiles = [];
files.forEach(function(name) {
if(name.substr(-4) === "html") {
htmlfiles.push(name);
}
});
// Keep track of the number of files we have parsed
var count = htmlfiles.length;
var done = 0;
htmlfiles.forEach(function(filename) {
fs.readFile(path + filename, "binary", function(err, data) {
if(err) throw err;
window = jsdom.jsdom(data).createWindow();
jsdom.jQueryify(window, 'http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js', function (window, jquery) {
jquery("tr td img").parent().parent().each(function(){
var step = 0;
jquery(this).children().each(function(index){
if (jquery(this).children('img').attr('src') !== undefined) {
step++;
var empname = jquery(this).parent().next().next().children('td:nth-child('+step+')').children().children().text();
var name_parts = empname.split(",");
var last = name_parts[0];
var name_parts = name_parts[1].split(/\u00a0/g);
var first = name_parts[2]
emps[last + ",_" + first] = jquery(this).children('img').attr('src');
}
});
});
// As soon as all have finished call the callback and supply emps
done++;
if (done === count) {
callback(emps);
}
});
});
});
});
}
readDirectory('/home/inet/www/media/employees/', function(emps) {
console.log(emps);
});
You seem to be doing this a tad wrong
readDirectory('/home/inet/www/media/employees/', function() {
console.log(emps);
});
But you've defined your function as:
readDirectory = function(path) {
Where is the callback argument? Try this:
readDirectory = function(path, callback) {
then under emps[last + ",_" + first] = jquery(this).children('img').attr('src'); put
callback.call(null, emps);
Your callback function will be called however many times your loop goes on for. If you want it to return all of them at once, you'll need to get a count of how many times the loop is going to run for, count up until that number then call your callback when the emps array is full of the data you need.

Categories