var request = require('request'),
    cheerio = require('cheerio');

var url = "https://namu.wiki/w/크롤링";

request(url, function (err, res, html) {
    if (!err) {
        var $ = cheerio.load(html);
        $('.wiki-heading-content').each(function () {
            var post = { "content": "" };
            var data = $(this);
            post['content'] = data.text();
            console.log(post);
        });
    }
});
The line of code below is not working. Why?
//$('.wiki-heading-content').each(function()
You are using jQuery in your code, and on the line below you overwrite (shadow) your global jQuery object within the function scope:
var $ = cheerio.load(html);
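A minimal sketch of one way to avoid the collision, assuming the surrounding code still needs the global jQuery `$`: bind cheerio's load result to a different name so the global object is left untouched.

// Sketch only: `$page` is a name chosen here, not part of the original code.
var request = require('request'),
    cheerio = require('cheerio');

request("https://namu.wiki/w/크롤링", function (err, res, html) {
    if (err) return console.error(err);

    var $page = cheerio.load(html); // does not shadow the global `$`

    $page('.wiki-heading-content').each(function () {
        console.log({ content: $page(this).text() });
    });
});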
The user will enter a URL in an input field on the HTML page; that URL needs to be passed to the JS program, which then fetches the data from the webpage.
This is what I have done so far:
var request = require('request');
var cheerio = require('cheerio');
var fs = require("fs");

var url = "";

request(url, function (err, response, html) {
    if (!err) {
        var $ = cheerio.load(html);
        var allItems = $('.clearfix').parent().children();
        var items = [];
        allItems.each(function (index) {
            var result = $('.clearfix').eq(index).parent().children().eq(1).find("a").text();
            if (result !== "") {
                items.push(result);
            }
        });
        // fs.writeFile is asynchronous and expects a callback as its last argument
        fs.writeFile("output1.xls", JSON.stringify(items, null, 1), function (err) {
            if (err) console.error(err);
        });
        console.log(items);
    }
});
Is this the solution to your problem?
var url = document.getElementById('myURL').value
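document.getElementById only exists in the browser, so the value still has to reach the Node process somehow. A minimal sketch of one way to bridge the two, assuming an Express server and a form field named myURL posting to a /scrape route (both names are hypothetical):

// Sketch only: browser form -> Express route -> request/cheerio scrape.
var express = require('express');
var request = require('request');
var cheerio = require('cheerio');

var app = express();
app.use(express.urlencoded({ extended: false })); // parse the form body

// The HTML page would contain: <form method="post" action="/scrape"><input name="myURL"></form>
app.post('/scrape', function (req, res) {
    var url = req.body.myURL;
    request(url, function (err, response, html) {
        if (err) return res.status(500).send(err.message);
        var $ = cheerio.load(html);
        res.json({ title: $('title').text() });
    });
});

app.listen(3000);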
This question is about a crawler in node.js.
A start_url is given, from which the crawler collects URLs and "pushes" them to a .json file (output.json).
At the moment it runs the request function only with the start_url and saves the collected URLs in output.json. I want it to use the saved URLs instead: replace the start_url with the first collected URL, collect links again, and so on.
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var start_url = ["http://stackoverflow.com/"];

var req = function (url) {
    request(url, function (error, response, html) {
        var $ = cheerio.load(html);
        var data = [];
        $("a").each(function () {
            var link = $(this);
            var exurls = { exurl: new Array(link.attr("href")) };
            data.push(exurls);
            // Queue "exurls" for "start_url" and call the same function with the new URL (endless loop)
            // save to "output.json" from time to time, so you can stop it anytime
        });
        fs.writeFile("output.json", JSON.stringify(data, null, 4), function (err) {
            if (err) {
                console.log(err);
            } else {
                console.log("File successfully written!");
            }
        });
    });
};

for (var i = 0; i < start_url.length; i++) {
    req(start_url[i]);
}
So what you can do is make the function call itself recursively. The example below should work:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var start_url = ["http://stackoverflow.com/"];
var count = 0; // index of the URL currently being crawled; kept outside req() so it survives the recursion

var req = function (url) {
    request(url, function (error, response, html) {
        var $ = cheerio.load(html);
        $("a").each(function () {
            var link = $(this);
            var href = link.attr("href");
            if (href) {
                // Queue the discovered URL so the same function can crawl it next (endless loop)
                start_url.push(href);
            }
        });
        try {
            // save to "output.json" every round, so you can stop the crawler anytime
            fs.writeFileSync("output.json", JSON.stringify(start_url, null, 4));
            console.log("File successfully written!");
        } catch (err) {
            console.log(err);
        }
        ++count;
        if (start_url.length > count) {
            req(start_url[count]);
        }
    });
};

req(start_url[0]);
The problem with this is that you completely rewrite the file each time. If this goes on for a while you are going to run out of memory. Another option is to create a write stream:
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var start_url = ["http://stackoverflow.com/"];
var wstream = fs.createWriteStream("output.json");

var req = function (url) {
    request(url, function (error, response, html) {
        var $ = cheerio.load(html);
        $("a").each(function () {
            var link = $(this);
            var href = link.attr("href");
            if (href) {
                // Queue the discovered URL and stream it straight to the output file
                start_url.push(href);
                wstream.write('"' + href + '",');
            }
        });
        start_url.shift();
        if (start_url.length > 0) {
            return req(start_url[0]);
        }
        wstream.end();
    });
};

req(start_url[0]);
Edit: switched to a basic queue to combat memory problems.
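As the comments warn, blindly following every link can loop forever once pages start linking back to each other. A small sketch of one guard against that, using a Set of already-visited URLs (not part of the original answer):

// Sketch only: skip URLs that have already been queued once.
var visited = new Set();

function enqueue(href, queue) {
    if (href && !visited.has(href)) {
        visited.add(href);
        queue.push(href);
    }
}

// Usage inside the $("a").each(...) loop above:
// enqueue(link.attr("href"), start_url);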
I am trying to manipulate a string by splitting it into two.
My JS:
var request = require('request');
var cheerio = require('cheerio');

var href1, href;
var str = "https://google.co.in/search?q=okay" + "+google";

request(str, function (error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);
        var a = $('.r a');
        href = a.attr('href');
        href1 = "https://google.co.in" + href;
        console.log(href1);
        request(href1, function (error, response, html) {
            if (!error && response.statusCode == 200) {
                var $ = cheerio.load(html);
                var a1 = $('ol li a');
                var href2 = a1.attr('href');
                var href3 = href2.indexOf("/48");
                var href4 = href3.substring(0, 20);
                console.log(href4);
            }
        });
    }
});
It gives a TypeError at the line where I use the substring function:
undefined is not a function
However, href3 holds an integer and that is fine, so I know href3 is not empty or undefined.
How can I fix this?
You're trying to call a String method on a Number. href3 will not have the substring function because indexOf returns a Number.
The line:
var href4 = href3.substring(0, 20);
Should probably be:
var href4 = href2.substring(0, 20);
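Since the stated goal was to split the string into two, here is a short sketch of how the index from indexOf can be combined with slice to do that (variable names and the "/48" marker are taken from the question):

// Sketch: split href2 around the "/48" marker found by indexOf.
var cut = href2.indexOf("/48");
if (cut !== -1) {
    var before = href2.slice(0, cut); // everything before "/48"
    var after = href2.slice(cut);     // "/48" and everything after it
    console.log(before, after);
}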
I have a node.js app that scrapes information from a website. I'm using the npm packages request and cheerio and the scraping works fine, but I want to do something else when the request function is done. Here's some code:
app.js
var express = require('express');
var extractor = require("./extractor");

console.log(extractor('http://www.example.com'));

var app = express();

app.get('/', function (req, res) {
    res.send('Hello world\n');
});

app.listen(3000);
extractor.js (all the fun)
var request = require('request');
var cheerio = require('cheerio');

var Extractor = function (url) {
    var games = [];
    request(url, function (error, response, html) {
        if (!error && response.statusCode == 200) {
            var $ = cheerio.load(html);
            $('tr.game').each(function (i, v) {
                var game = { /* many attributes */ };
                games.push(game);
            });
        }
    });
    this.extractedGames = games;
};

module.exports = function (url) {
    return new Extractor(url);
};
Eventually, when I run this, it shows { extractedGames: [] }; that is because the output is printed before the request has finished. So I want to fire an "on success" event for the extractedGames attribute once the request job is over.
Thanks
Solved it myself! I hope this helps people in the future (though I felt like a complete noob).
The trick was to emit an event and handle it later.
var express = require('express');
var extractor = require("./extractor");

extractor('http://www.example.com');

var app = express();

app.get('/', function (req, res) {
    res.send('Hello world\n');
});

app.listen(3000);
I removed console.log(extractor('http://www.example.com')) because this would run before the request job is done. So I moved it to the event handling function.
var request = require('request');
var cheerio = require('cheerio');
var Emitter = require('events').EventEmitter;

var extractEmitter = new Emitter();

extractEmitter.on('extracted', function (extractedGames) {
    console.log(extractedGames);
});

var Extractor = function (url) {
    var games = [];
    request(url, function (error, response, html) {
        if (!error && response.statusCode == 200) {
            var $ = cheerio.load(html);
            $('tr.game').each(function (i, v) {
                var game = { /* many attributes */ };
                games.push(game);
            });
            extractEmitter.emit('extracted', games);
        }
    });
    this.extractedGames = games;
};

module.exports = function (url) {
    return new Extractor(url);
};
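For completeness, the same decoupling can be done without a shared emitter by having the extractor return a Promise. A minimal sketch (not the original module), assuming the same request/cheerio setup:

// Sketch only: promise-based variant of extractor.js.
var request = require('request');
var cheerio = require('cheerio');

module.exports = function (url) {
    return new Promise(function (resolve, reject) {
        request(url, function (error, response, html) {
            if (error || response.statusCode != 200) {
                return reject(error || new Error('Unexpected status: ' + response.statusCode));
            }
            var $ = cheerio.load(html);
            var games = [];
            $('tr.game').each(function () {
                games.push({ /* many attributes */ });
            });
            resolve(games);
        });
    });
};

// Usage: extractor('http://www.example.com').then(console.log).catch(console.error);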
I'm trying to build a very simple scraper function for Node.js - just a function that I can pass a URL to, and it returns the scraped data as var data.
I'm completely new to Node.js and can't work out why the following isn't working:
var request = require('request');
var cheerio = require('cheerio');

function scrape(url) {
    console.log("Scraping: " + url);
    request(url, function (err, resp, body) {
        if (err) {
            throw err;
        }
        var html = cheerio.load(body);
        return html;
    });
}

var data = scrape('http://www.stackoverflow.com');
$ = data;
var logo = $('#hlogo a').text();
console.log(logo);
The above code should return "Stack Overflow" but obviously does not. When I run this in the console I get an error:
var logo = $('#hlogo a').text();
^
TypeError: Property '$' of object #<Object> is not a function
Any ideas why this isn't working for me?
Your data will be undefined because the scrape function does not return a value; additionally, it is asynchronous.
You need to change the logic to something like this:
function scrape(url, oncomplete) {
    console.log("Scraping: " + url);
    request(url, function (err, resp, body) {
        if (err) {
            throw err;
        }
        var html = cheerio.load(body);
        oncomplete(html);
    });
}
scrape('http://www.stackoverflow.com', function(data) { /* do work here*/ });
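Tying it back to the original goal, a short sketch of what that callback could do, using the same selector as the question:

// The work that previously ran at the top level now runs once the HTML is loaded.
scrape('http://www.stackoverflow.com', function ($) {
    var logo = $('#hlogo a').text();
    console.log(logo); // expected: "Stack Overflow"
});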