The user will enter a URL in an input field on the HTML page. That URL needs to be passed to the JS program, which then fetches the data from the webpage.
This is what I have done so far:
var request = require('request');
var cheerio = require('cheerio');
var fs = require("fs");

var url = ""; // the URL from the input field should end up here

request(url, function(err, response, html) {
    if (!err) {
        var $ = cheerio.load(html);
        var allItems = $('.clearfix').parent().children();
        var items = [];
        allItems.each(function(index) {
            var result = $('.clearfix').eq(index).parent().children().eq(1).find("a").text();
            if (result !== "") {
                items.push(result);
            }
        });
        // fs.writeFile needs a callback; the stray trailing comma was a syntax error
        fs.writeFile("output1.xls", JSON.stringify(items, null, 1), function(err) {
            if (err) console.log(err);
        });
        console.log(items);
    }
});
Is this the solution to your problem?
var url = document.getElementById('myURL').value;
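One thing to watch: document.getElementById only exists in the browser, while request and cheerio run in Node, so the typed URL has to be sent from the page to the server. Below is a minimal sketch, assuming an Express app; the /scrape endpoint and the myURL id are illustrative, not part of the original code.

// Browser side: read the input field and send the URL to the server, e.g.
//   var url = document.getElementById('myURL').value;
//   fetch('/scrape?url=' + encodeURIComponent(url));

// Server side (Node): receive the URL and run the scraping code shown above.
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');

var app = express();
app.get('/scrape', function(req, res) {
    var url = req.query.url; // the URL typed into the input field
    request(url, function(err, response, html) {
        if (err) return res.status(500).send(err.message);
        var $ = cheerio.load(html);
        res.send($('title').text()); // swap in the item-scraping logic above
    });
});
app.listen(3000);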
var request = require('request'),
cheerio = require('cheerio');
var url = "https://namu.wiki/w/크롤링";
request(url, function (err, res, html) {
    if (!err) {
        var $ = cheerio.load(html);
        $('.wiki-heading-content').each(function() {
            var post = { "content": "" };
            var data = $(this);
            post['content'] = data.text();
            console.log(post);
        });
    }
});
The line of code below is not working. Why?
//$('.wiki-heading-content').each(function()
You are using jQuery in your code, and on this line you overwrite (in the function scope) your global jQuery object:
var $ = cheerio.load(html);
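If other code on the page still needs jQuery's $, one fix is simply to bind cheerio to a different name. A minimal sketch ($c is just an illustrative name):

var $c = cheerio.load(html);
$c('.wiki-heading-content').each(function() {
    var post = { content: $c(this).text() };
    console.log(post);
});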
I'm trying to save the JSON from the GET request to an object and then upload the object to the Cloudant DB
Anyone know what I'm doing wrong?
var request = require("request");
var EventEmitter = require("events").EventEmitter;
var body = new EventEmitter();
var sample = cloudant.db.use('sample')
request("http://ieeexplore.ieee.org/gateway/ipsSearch.jsp?cs=IBM&hc=1000&rs=1001", function(error, response, data) {
body.data = data;
body.emit('update');
sample.insert({ crazy: true }, body.data, function(err, body, header{
// hmm
});
console.log('hmm');
});
You have a malformed URL in the request, and the code for inserting into the Cloudant database is written incorrectly:
var request = require("request");
var EventEmitter = require("events").EventEmitter;
var body = new EventEmitter();
var sample = cloudant.db.use('sample')
request("http://ieeexplore.ieee.org/gateway/ipsSearch.jsp?cs=IBM&hc=1000&rs=1001", function(error, response, data) {
body.data = data;
body.emit('update');
//implement code for inserting in cloudant db for homework
});
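For completeness, a minimal sketch of what the insert could look like with the Cloudant (nano-style) API, whose insert callback signature is (err, body, header); wrapping the raw response in a doc field is an assumption here:

request("http://ieeexplore.ieee.org/gateway/ipsSearch.jsp?cs=IBM&hc=1000&rs=1001", function(error, response, data) {
    if (error) return console.log(error);
    body.data = data;
    body.emit('update');
    // Store the raw response as a field of a new document
    sample.insert({ crazy: true, doc: body.data }, function(err, result, header) {
        if (err) return console.log(err);
        console.log('Inserted document with id ' + result.id);
    });
});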
This question is about a crawler in node.js.
A start_url is given, from which the crawler collects URLs and "pushes" them to a .json file (output.json).
At the moment, it runs the request function only with the start_url and saves the collected URLs in output.json. I want it to use the saved URLs instead: replace the start_url with the first collected URL, collect links again, and so on.
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var start_url = ["http://stackoverflow.com/"];

var req = function(url) {
    request(url, function(error, response, html) {
        var $ = cheerio.load(html);
        var data = [];
        $("a").each(function() {
            var link = $(this);
            var exurls = { exurl: new Array(link.attr("href")) };
            data.push(exurls);
            // Queue "exurls" for "start_url" and call the same function with the new URL (endless loop)
            // save to "output.json" from time to time, so you can stop it anytime
        });
        fs.writeFile("output.json", JSON.stringify(data, null, 4), function(err) {
            if (err) {
                console.log(err);
            } else {
                console.log("File successfully written!");
            }
        });
    });
};

for (var i = 0; i < start_url.length; i++) {
    req(start_url[i]);
}
So what you can do is call the function recursively. The example below should work:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var start_url = ["http://stackoverflow.com/"];
var count = 0; // index into start_url; must live outside req so it survives recursive calls

var req = function(url) {
    request(url, function(error, response, html) {
        var $ = cheerio.load(html);
        $("a").each(function() {
            var link = $(this);
            var exurl = link.attr("href");
            // Queue the found URL so the same function can be called with it later (endless loop)
            if (exurl) {
                start_url.push(exurl);
            }
        });
        // Save the queue to "output.json" from time to time, so you can stop it anytime
        try {
            fs.writeFileSync("output.json", JSON.stringify(start_url, null, 4));
            console.log("File successfully written!");
        } catch (err) {
            console.log(err);
        }
        ++count;
        if (start_url.length > count) {
            req(start_url[count]);
        }
    });
};

req(start_url[0]);
The problem with this is that you completely rewrite the file each time. If this goes on for a while, you are going to run out of memory. Another option is to create a write stream:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var start_url = ["http://stackoverflow.com/"];
var wstream = fs.createWriteStream("output.json");

var req = function(url) {
    request(url, function(error, response, html) {
        var $ = cheerio.load(html);
        $("a").each(function() {
            var link = $(this);
            var exurl = link.attr("href");
            if (exurl) {
                // Queue the found URL and stream it to "output.json" right away,
                // so you can stop the crawler at any time
                start_url.push(exurl);
                wstream.write('"' + exurl + '",');
            }
        });
        // Basic queue: drop the URL we just processed and move on to the next one
        start_url.shift();
        if (start_url.length > 0) {
            return req(start_url[0]);
        }
        wstream.end();
    });
};

req(start_url[0]);
Edit: switched to a basic queue to combat memory problems.
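One caveat with both versions: nothing stops the crawler from queueing the same URL twice, so the "endless loop" in the comments is literal. A minimal de-duplication sketch, assuming a visited map is kept next to the queue (visited and enqueue are illustrative names):

var visited = {};

var enqueue = function(exurl) {
    // Only queue URLs we have not seen before
    if (exurl && !visited[exurl]) {
        visited[exurl] = true;
        start_url.push(exurl);
    }
};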
I have a node.js app that scrapes information from a website. I'm using the npm packages request and cheerio, and the scraping works fine, but I want to do something else when the request function is done. Here's some code:
app.js
var express = require('express');
var extractor = require("./extractor");

console.log(extractor('http://www.example.com'));

var app = express();
app.get('/', function (req, res) {
    res.send('Hello world\n');
});
app.listen(3000);
extractor.js (all the fun)
var request = require('request');
var cheerio = require('cheerio');

var Extractor = function(url) {
    var games = [];
    request(url, function (error, response, html) {
        if (!error && response.statusCode == 200) {
            var $ = cheerio.load(html);
            $('tr.game').each(function(i, v) {
                var game = { /* many attributes */ };
                games.push(game);
            });
        }
    });
    this.extractedGames = games;
};

module.exports = function(url) {
    return new Extractor(url);
};
module.exports = function(url) {
return new Extractor(url);
};
Eventually, when I run this, it shows { extractedGames: [] }. That is because the output was printed before the request was finished. So I want to add an on-success event to the extractedGames attribute for when the request job is over.
Thanks
Solved it myself! I hope this can help people in the future (though I felt like a complete noob).
The trick was to emit an event and handle it later.
var express = require('express');
var extractor = require("./extractor");

extractor('http://www.example.com');

var app = express();
app.get('/', function (req, res) {
    res.send('Hello world\n');
});
app.listen(3000);
I removed console.log(extractor('http://www.example.com')) because this would run before the request job is done. So I moved it to the event handling function.
var request = require('request');
var cheerio = require('cheerio');
var Emitter = require('events').EventEmitter;

var extractEmitter = new Emitter();
extractEmitter.on('extracted', function(extractedGames) {
    console.log(extractedGames);
});

var Extractor = function(url) {
    var games = [];
    request(url, function (error, response, html) {
        if (!error && response.statusCode == 200) {
            var $ = cheerio.load(html);
            $('tr.game').each(function(i, v) {
                var game = { /* many attributes */ };
                games.push(game);
            });
            extractEmitter.emit('extracted', games);
        }
    });
    this.extractedGames = games;
};

module.exports = function(url) {
    return new Extractor(url);
};
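A side note on this design: the emitter is global to the module, so two overlapping calls to extractor would both fire the same 'extracted' event. A callback-based sketch avoids that; the callback parameter is an assumption, not part of the original code:

// extractor.js, callback variant (sketch)
module.exports = function(url, callback) {
    request(url, function(error, response, html) {
        if (error || response.statusCode != 200) {
            return callback(error || new Error('Bad status: ' + response.statusCode));
        }
        var $ = cheerio.load(html);
        var games = [];
        $('tr.game').each(function(i, v) {
            games.push({ /* many attributes */ });
        });
        // games is complete here, inside the request callback
        callback(null, games);
    });
};

// Usage in app.js:
// extractor('http://www.example.com', function(err, extractedGames) {
//     if (err) return console.log(err);
//     console.log(extractedGames);
// });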
I'm trying to get the title tag of a URL with cheerio, but I'm getting empty string values. This is my code:
app.get('/scrape', function(req, res) {
    var url = 'http://nrabinowitz.github.io/pjscrape/';
    request(url, function(error, response, html) {
        if (!error) {
            var $ = cheerio.load(html);
            var title, release, rating;
            var json = { title: "", release: "", rating: "" };
            $('title').filter(function() {
                var data = $(this);
                title = data.children().first().text();
                release = data.children().last().children().text();
                json.title = title;
                json.release = release;
            });
            $('.star-box-giga-star').filter(function() {
                var data = $(this);
                rating = data.text();
                json.rating = rating;
            });
        }
        fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err) {
            console.log('File successfully written! - Check your project directory for the output.json file');
        });
        // Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
        res.send('Check your console!');
    });
});
request(url, function (error, response, body)
{
    if (!error && response.statusCode == 200)
    {
        var $ = cheerio.load(body);
        var title = $("title").text();
    }
});
Using JavaScript, we extract the text contained within the "title" tags.
If Robert Ryan's solution still doesn't work, I'd be suspicious of the formatting of the original page, which may be malformed somehow.
In my case, I was accepting gzip and other compression but never decoding it, so Cheerio was trying to parse compressed binary bits. When console-logging the original body, I was able to spot binary text instead of plain-text HTML.
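If that is the problem, the request package can decompress transparently via its gzip option. A minimal sketch, assuming the same request/cheerio setup as above:

var request = require('request');
var cheerio = require('cheerio');

request({ url: url, gzip: true }, function(error, response, body) {
    if (!error && response.statusCode == 200) {
        // body is now decoded HTML text, not compressed binary
        var $ = cheerio.load(body);
        console.log($('title').text());
    }
});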