Scraping with Cheerio - Cannot Extract Text [duplicate] - javascript

I am trying to scrape a website but I don't get some of the elements, because these elements are dynamically created.
I use cheerio in Node.js, and my code is below.
const request = require('request');
const cheerio = require('cheerio');

const url = 'http://www.bdtong.co.kr/index.php?c_category=C02';

// Fetch the raw HTML of `url` and print the href of the first link inside
// every `.listMain > li` item.
// NOTE: cheerio only parses the markup the server returned; it does not run
// the page's scripts, so list items appended client-side are not visible here.
request(url, (err, res, html) => {
  // Stop on transport errors instead of parsing a missing body.
  if (err) {
    console.error(err);
    return;
  }
  const $ = cheerio.load(html);
  $('.listMain > li').each((index, element) => {
    console.log($(element).find('a').attr('href'));
  });
});
This code returns an empty response, because when the page is loaded, the <ul id="store_list" class="listMain"> is empty.
The content has not been appended yet.
How can I get these elements using node.js? How can I scrape pages with dynamic content?

Here you go;
const phantom = require('phantom');

// Open the page in PhantomJS so that its scripts run, inject jQuery, read the
// link targets from the rendered DOM and print them from the Node side.
phantom.create((ph) => {
  ph.createPage((page) => {
    const url = 'http://www.bdtong.co.kr/index.php?c_category=C02';
    page.open(url, (status) => {
      // Release the PhantomJS process if the page could not be loaded.
      if (status !== 'success') {
        console.error(`Failed to open ${url} (status: ${status})`);
        ph.exit();
        return;
      }
      page.includeJs('http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js', () => {
        page.evaluate(
          // This function runs inside the page, not in Node: it cannot see
          // Node variables, and a console.log here would go to the page's
          // console. The hrefs are therefore returned to the Node callback.
          function () {
            return $('.listMain > li').map(function () {
              return $(this).find('a').attr('href');
            }).get();
          },
          (hrefs) => {
            // `hrefs` is whatever the page function returned; guard against
            // an empty result before iterating.
            (hrefs || []).forEach((href) => console.log(href));
            ph.exit();
          }
        );
      });
    });
  });
});

Check out GoogleChrome/puppeteer
Headless Chrome Node API
It makes scraping pretty trivial. The following example will scrape the headline over at npmjs.com (assuming .npm-expansions remains)
const puppeteer = require('puppeteer');

// Print the text content of the `.npm-expansions` element on npmjs.com.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://www.npmjs.com/');
    // The callback is evaluated inside the page, after its scripts have run.
    const textContent = await page.evaluate(() => {
      return document.querySelector('.npm-expansions').textContent;
    });
    console.log(textContent); /* No Problem Mate */
  } finally {
    // Always close the browser, even when navigation or evaluation fails,
    // so the Chrome process does not outlive the script.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
evaluate will allow for the inspection of the dynamic element as this will run scripts on the page.

Use the new npm module x-ray, with a pluggable web driver x-ray-phantom.
Examples in the pages above, but here's how to do dynamic scraping:
const phantom = require('x-ray-phantom');
const Xray = require('x-ray');

// x-ray instance that renders pages through the x-ray-phantom driver.
const x = Xray().driver(phantom());

// Scrape the <title> of the page and print it.
// The callback only uses names defined in this snippet; the previous version
// called `assert` and `done`, which are not defined here.
x('http://google.com', 'title')((err, str) => {
  if (err) {
    console.error(err);
    return;
  }
  console.log(str);
});

Answering this as a canonical, an alternative to Puppeteer for scraping dynamic sites which is also well-supported as of 2023 is Playwright. Here's a simple example:
const playwright = require("playwright"); // ^1.28.1

// Kept outside the function so the cleanup step can see the browser no
// matter where the run fails.
let browser;

/**
 * Launches Chromium, opens example.com and prints the text of the heading
 * matched by the locator. Errors are logged; the browser, if one was
 * launched, is closed afterwards in every case.
 */
async function run() {
  try {
    browser = await playwright.chromium.launch();
    const page = await browser.newPage();
    await page.goto("https://example.com");
    const heading = page.locator('h1:text("Example")');
    const text = await heading.textContent();
    console.log(text); // => Example Domain
  } catch (err) {
    console.error(err);
  } finally {
    await browser?.close();
  }
}

run();

The easiest and most reliable solution is to use puppeteer, as mentioned in https://pusher.com/tutorials/web-scraper-node, which is suitable for both static and dynamic scraping.
Only change the timeout in Browser.js, TimeoutSettings.js, Launcher.js 300000 to 3000000

Related

how can I upload my website with a webscrapper in it and run?

First of all, I don't know much about services, web apps and such. I've been studying JS for the past 40 days, and I managed to build a simple web scraper that scrapes data about the US dollar and some indexes and saves it to JSON files. In my HTML I have a JS script that processes that JSON information, does some math and shows the result in the browser.
The thing is, I need the index.js file to run every day at 8 a.m. I have already uploaded a static website before by just putting the files there (basically images, styles, index and JS), but since this project has index.js and Node files, and I need index.js to run automatically, I have no idea how to go about it.
const puppeteer = require('puppeteer');
const fs = require('fs');

/**
 * Opens `url` in a fresh headless browser, runs `extract` inside the page and
 * writes the extracted value, wrapped in a one-element array, to `outFile`
 * as JSON.
 *
 * @param {string} url - Page to load.
 * @param {Function} extract - Function evaluated inside the page; it must be
 *   self-contained because it cannot see variables from this Node script.
 * @param {string} outFile - Path of the JSON file to write.
 * @returns {Promise<void>} Resolves once the file is written and the browser
 *   is closed; rejects if navigation, extraction or the write fails.
 */
async function scrapeToJson(url, extract, outFile) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, {
      waitUntil: 'load',
      timeout: 0, // 0 disables the navigation timeout
    });
    const value = await page.evaluate(extract);
    try {
      await fs.promises.writeFile(outFile, JSON.stringify([value]));
    } catch (err) {
      throw new Error('algo deu errado', { cause: err });
    }
    console.log('deu certo');
  } finally {
    // Without this the Chrome child process keeps the Node process alive,
    // so a scheduled run would never terminate.
    await browser.close();
  }
}

// One entry per value to scrape: source page, in-page extractor, output file.
const jobs = [
  {
    url: 'https://br.advfn.com/investimentos/futuros/di-depositos-interfinanceiros/cotacoes',
    extract: () => document.querySelector('.even.first').innerText,
    outFile: 'arreglo2.json',
  },
  {
    // Get the DX.
    url: 'https://br.tradingview.com/symbols/TVC-DXY/',
    extract: () =>
      document.querySelector('.js-quote-ticker.tv-site-table__row.tv-widget-watch-list__row:nth-child(2)')
        .children[1].children[1].children[0].innerHTML,
    outFile: 'arreglo.json',
  },
  {
    // Get the closing quote (original note says "colar" - presumably the
    // dollar, given the USDBRL page; confirm).
    url: 'https://br.advfn.com/bolsa-de-valores/fx/USDBRL/cotacao',
    extract: () => document.querySelector('.qs-current-price').innerText,
    outFile: 'cotacaoFechamento.json',
  },
];

// The jobs are independent, so they are started together, each in its own
// browser; a failure in one is reported without stopping the others.
for (const { url, extract, outFile } of jobs) {
  scrapeToJson(url, extract, outFile).catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
}
the code above is my index.js the one that gets the info i need and save it to a json.
today when i want to test the file i go to terminal and type node index.js, wait till its ok and then refresh my browser.
Ah, I also had to install a XAMPP server and save my solution in the folder that is served through localhost (when I saved it in a normal C:// folder it didn't scrape at all because of some CORS-related error).
thank you all in advance.
This isn't code that runs in a browser. This is code that runs in the Nodejs runtime (server side). If you're on Windows, you can use the Task Scheduler to schedule a task that runs every day at 8am and executes the index.js file using node.exe as the executable.
https://joshuatz.com/posts/2020/using-windows-task-scheduler-to-automate-nodejs-scripts/

Loading html into chrome followed by jsdom instance

I'm doing some scraping after receiving html from an api. I'd like to do the following:
Open html page in chrome so I can find selectors in the console.
Immediately load the same html page into a jsdom instance
Drop into the repl - I can then find the right selectors in the console and test them out in a live jsdom environment to see if they work.
For 1, I have:
/**
 * Opens a visible (non-headless) browser window and renders `htmlString` in a
 * new tab.
 *
 * The browser is deliberately left open so the page can be inspected. The
 * handles are returned so the caller can `await browser.close()` when done;
 * callers that ignore the return value behave as before.
 *
 * @param {string} htmlString - HTML markup to load into the page.
 * @returns {Promise<{browser: object, page: object}>} The launched browser
 *   and the page showing the content.
 */
async function openHtml(htmlString) {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.setContent(htmlString);
  return { browser, page };
}
The code provided with the api is:
// Send the API request, buffer the response body, parse it as JSON, load the
// returned HTML into jsdom and hand the same HTML to openHtml().
const req = http.request(options, (res) => {
  const parts = [];
  res
    .on("data", (part) => {
      parts.push(part);
    })
    .on("end", () => {
      const body = Buffer.concat(parts);
      // `response` is not declared in this snippet; it is assigned as-is.
      response = JSON.parse(body); //response.content = html, response.cookies = cookies
      const { document } = new JSDOM(response.content).window;
      console.log(document.querySelector("p").textContent); // "Hello world"
      // Not awaited: the next line runs before openHtml() has finished.
      openHtml(response.content);
      console.log('hi');
    });
});
req.end();
If I run the code at the command line the browser opens as expected. However, if I set a breakpoint at:
console.log('hi');
It does not. How can I get this working?
openHtml is an async function, so you have to await the call (it returns a promise) and mark the enclosing callback as async as well.
// Send the API request, buffer the response body, parse it as JSON, load the
// returned HTML into jsdom and then wait for openHtml() to finish.
const req = http.request(options, (res) => {
  const chunks = [];
  res.on('data', (chunk) => {
    chunks.push(chunk);
  });
  res.on('end', async () => {
    // The event emitter ignores the promise returned by an async listener,
    // so failures are handled here rather than left as unhandled rejections.
    try {
      const body = Buffer.concat(chunks);
      // Declared locally instead of leaking an implicit global.
      const response = JSON.parse(body.toString()); // response.content = html, response.cookies = cookies
      const dom = new JSDOM(response.content);
      console.log(dom.window.document.querySelector('p').textContent); // 'Hello world'
      await openHtml(response.content);
      console.log('hi');
    } catch (err) {
      console.error(err);
    }
  });
});
// Report connection-level failures instead of crashing on an unhandled
// 'error' event.
req.on('error', (err) => {
  console.error(err);
});
req.end();

Cheerio selector after page loaded

I want to scrape a url value of iframe in this website: https://lk21online.digital/nonton-profile-2021-subtitle-indonesia/
When I search for the iframe in the page source (View Page Source), it is not found; I think the iframe is added by JavaScript after the page has loaded.
Or my selector is wrong?
Please somebody help me to check my selector or what i need to do for my code
Sorry for my poor english...
There is my code:
/**
 * Fetches the HTML for `url`, extracts the title and iframe source from the
 * player tab and sends them as JSON on `res`.
 *
 * The parsed cheerio document never changes after `cheerio.load`, so the
 * result is sent immediately; delaying the read cannot reveal content that
 * the page adds later with JavaScript.
 *
 * @param {object} res - Response object; `json` is called on it. The error
 *   path also uses `headersSent` and `status`, which assumes an Express-style
 *   response - confirm against the caller.
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<void>}
 */
async function getDetail(res, url) {
  try {
    const html = await scraping(res, url);
    const $ = cheerio.load(html);
    const article = $('#site-container #content .gmr-maincontent #primary #main .gmr-box-content #muvipro_player_content_id #player1-tab-content');
    const result = [];
    article.each((index, element) => {
      const title = $(element).find('.item-article h2').text();
      const watch = $(element).find('iframe').attr('src');
      result.push({
        title,
        watch,
      });
    });
    res.json({ result });
  } catch (err) {
    console.log(err);
    // Answer the request so the client is not left waiting, unless a reply
    // has already been started.
    if (!res.headersSent) {
      res.status(500).json({ error: 'Failed to scrape the page' });
    }
  }
}
this is video iframe
You can't use cheerio for this. Cheerio is not dynamic and just loads whatever html is coming back from the request.
Looking at your webpage, most content is loaded async, so the initial html will be pretty empty.
In addition the video source is lazily loaded when it enters the browser window. So you have to use an actual headless browser to accomplish the task. Here's an example:
// iframeUrl.js
const puppeteer = require('puppeteer');

// Load the page in headless Chrome, scroll down, wait, then print the `src`
// of the iframe inside `#player1-tab-content`.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Go to the page
    await page.goto('https://lk21online.digital/nonton-profile-2021-subtitle-indonesia/');
    // Scroll down; awaited so the wait below starts after the scroll ran
    await page.evaluate(() => window.scrollBy(0, 1000));
    // Wait a bit
    await new Promise((resolve) => setTimeout(resolve, 5000));
    // Get the src of the iframe with the DOM API, so this does not depend on
    // the page exposing jQuery as `$`
    const iframeUrl = await page.evaluate(() => {
      const iframe = document.querySelector('#player1-tab-content iframe');
      return iframe ? iframe.getAttribute('src') : undefined;
    });
    console.log(iframeUrl);
  } finally {
    // Closing the browser is what lets the Node process exit, on success
    // and on failure alike.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

Getting all images from a webpage and save the to disk programmatically (NodeJS & Javascript)

I need to get a lot of images from a few websites and download them to my disk so that I can use them (will upload them to a blob (azure) and then save the link to my DB).
GETTING THE IMAGES
I know how to get the images from the html with JS, for example one of them I would make a for-loop and do:
document.getElementsByClassName('person')[i].querySelector('div').querySelector('img').getAttribute('src')
And there I would have the links to all the images.
SAVING THE IMAGES
I also saw that I can save the files to disk using node and the fs module, by doing:
/**
 * Downloads `url` over HTTPS and writes the response body to `localPath`.
 *
 * @param {string} url - HTTPS address of the image.
 * @param {string} localPath - Destination file path.
 * @returns {Promise<string>} Resolves with `localPath` once the file has been
 *   completely written. Rejects on an invalid URL, a network or stream error,
 *   or a response whose status is not 200.
 */
function saveImageToDisk(url, localPath) {
  return new Promise((resolve, reject) => {
    https
      .get(url, (response) => {
        if (response.statusCode !== 200) {
          // Drain the body so the socket is released.
          response.resume();
          reject(new Error(`Request for ${url} failed with status ${response.statusCode}`));
          return;
        }
        // The file is opened only for a successful response, so a failed
        // download does not leave an empty file behind.
        const file = fs.createWriteStream(localPath);
        file.on('finish', () => resolve(localPath));
        file.on('error', reject);
        response.on('error', reject);
        response.pipe(file);
      })
      .on('error', reject);
  });
}
HOW TO PUT IT ALL TOGETHER
This is where I am stuck, I don't know exactly how to connect the two parts (the script and the nodejs code), I want to get the image and also the image name (alt tag in this case) and then use them in node to upload the image to a blob and put them name and image blob url in my DB.
I thought I could download the html page and then put the JS script on the bottom of the body but then I don't know how to pass the url to the nodejs code.
How can I do this?
I am not very used to using scripts, I mostly used node without them and I get a bit confused by their interactions and how to connect js scripts to my code.
Also is this the best way to go about this or is there a simpler/better way I am not seeing?
This feels like you should use a crawler. The following code should work (using the npm module crawler):
const Crawler = require('crawler');

// Crawler whose callback prints the `src` and `alt` of every image matched by
// `.person div img` on each fetched page.
const c = new Crawler({
  callback(error, res, done) {
    if (error) {
      console.log({ error });
    } else {
      const images = res.$('.person div img');
      images.each((index, image) => {
        // here you can save the file or save them in an array to download them later
        console.log({
          src: image.attribs.src,
          alt: image.attribs.alt,
        });
      });
    }
    // Tell the crawler this task is finished, on success and on error.
    done();
  },
});

c.queue('https://www.yoursite.com');
You need a bridge between Web API (for DOM parsing etc) and Node.js API. For example, some headless browser managing tool for Node.js. Say, you can use puppeteer with this script:
'use strict';

const puppeteer = require('puppeteer');
const https = require('https');
const fs = require('fs');
const path = require('path');

/**
 * Loads the page in headless Chrome and returns the `src` of every selected
 * image. The browser is closed before returning, including on failure.
 *
 * @returns {Promise<string[]>} Image URLs found on the page.
 */
async function collectImageURLs() {
  const browser = await puppeteer.launch();
  try {
    const [page] = await browser.pages();
    await page.goto('https://en.wikipedia.org/wiki/Image');
    return await page.evaluate(() =>
      Array.from(
        document.querySelectorAll('#mw-content-text img.thumbimage'),
        ({ src }) => src,
      )
    );
  } finally {
    await browser.close();
  }
}

/**
 * Downloads `imgURL` over HTTPS into `filePath`.
 *
 * @param {string} imgURL - HTTPS address of the image.
 * @param {string} filePath - Destination file path.
 * @returns {Promise<void>} Resolves when the file is fully written; rejects
 *   on a request, response or write error.
 */
function download(imgURL, filePath) {
  return new Promise((resolve, reject) => {
    https
      .get(imgURL, (response) => {
        const file = fs.createWriteStream(filePath);
        file.on('finish', resolve);
        file.on('error', reject);
        response.on('error', reject);
        response.pipe(file);
      })
      .on('error', reject);
  });
}

(async function main() {
  try {
    const imgURLs = await collectImageURLs();
    console.log(imgURLs);

    // Files are named by position; the extension comes from the URL path
    // (including the dot), so query strings and extensions that are not
    // three characters long are handled.
    const outcomes = await Promise.allSettled(
      imgURLs.map((imgURL, i) =>
        download(imgURL, `${i}${path.extname(new URL(imgURL).pathname)}`)
      )
    );
    for (const outcome of outcomes) {
      if (outcome.status === 'rejected') {
        console.error(outcome.reason);
      }
    }
  } catch (err) {
    console.error(err);
  }
})();
You can even download images just once, using pictures already downloaded by the browser. This script saves identical images, but with one session of requests, without using https Node.js module (this saves time, network traffic and server workload):
'use strict';

const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

// Save the selected images using the responses the browser already received
// while loading the page, instead of requesting each image a second time.
(async function main() {
  let browser;
  try {
    browser = await puppeteer.launch();
    const [page] = await browser.pages();

    // Every image response seen during page load, keyed by its URL.
    const imgResponses = new Map();
    page.on('response', (response) => {
      if (response.request().resourceType() === 'image') {
        imgResponses.set(response.url(), response);
      }
    });

    await page.goto('https://en.wikipedia.org/wiki/Image');

    const selectedImgURLs = await page.evaluate(() =>
      Array.from(
        document.querySelectorAll('#mw-content-text img.thumbimage'),
        ({ src }) => src,
      )
    );
    console.log(selectedImgURLs);

    for (const [i, imgURL] of selectedImgURLs.entries()) {
      const response = imgResponses.get(imgURL);
      // Skip an image for which no response was recorded rather than
      // failing the whole run with a TypeError.
      if (response === undefined) {
        console.warn(`No captured response for ${imgURL}; skipping`);
        continue;
      }
      // The extension is taken from the URL path and includes the dot.
      const extension = path.extname(new URL(imgURL).pathname);
      fs.writeFileSync(`${i}${extension}`, await response.buffer());
    }
  } catch (err) {
    console.error(err);
  } finally {
    // Close the browser on every path so the Chrome process is not leaked.
    await browser?.close();
  }
})();
I recommend you to use the dom-parser module. See here: https://www.npmjs.com/package/dom-parser
By doing so, you can download the whole html-File with http.get() and parse it using the dom-parser. Then extract all the information you need from the HTML-File. With the Image URL, use your saveImageToDisk() function.
Following your idea, you have to add the JS script to the html-File as you mentioned. But in addition you have to use Ajax (xmlHttpRequest) to post the URL to a nodeJS-Server.
You can use a Promise and, inside it, collect all the images and put their URLs in an array. Then, inside the then method, you can either iterate over the array and call saveImageToDisk each time, or send the whole array to the middle layer with a slight modification. The second option is better since it makes only one network call.
/**
 * Reads the `src` attribute of every <img> inside the first <div> of the
 * first element with class `person`.
 *
 * @returns {Promise<Array>} Promise for the list of `src` values, in
 *   document order; it rejects if the lookup throws.
 */
async function getImages() {
  const firstPerson = document.getElementsByClassName('person')[0];
  const container = firstPerson.querySelector('div');
  // Array.from copies the node list and maps each image in a single pass.
  return Array.from(
    container.querySelectorAll('img'),
    (image) => image.getAttribute('src')
  );
}
getImages()
  .then((imageUrls) => {
    // Runs only after the promise has resolved.
    console.log('****', imageUrls);
    // Visit every URL; the previous version only created a function here
    // and never called it.
    imageUrls.forEach((imageUrl) => {
      // call saveImageToDisk function
    });
  })
  .catch((err) => {
    console.error(err);
  });
/**
 * Downloads `url` over HTTPS and writes the response body to `localPath`.
 *
 * @param {string} url - HTTPS address of the image.
 * @param {string} localPath - Destination file path.
 * @returns {Promise<string>} Resolves with `localPath` once the file has been
 *   completely written. Rejects on an invalid URL, a network or stream error,
 *   or a response whose status is not 200.
 */
function saveImageToDisk(url, localPath) {
  return new Promise((resolve, reject) => {
    https
      .get(url, (response) => {
        if (response.statusCode !== 200) {
          // Drain the body so the socket is released.
          response.resume();
          reject(new Error(`Request for ${url} failed with status ${response.statusCode}`));
          return;
        }
        // The file is opened only for a successful response, so a failed
        // download does not leave an empty file behind.
        const file = fs.createWriteStream(localPath);
        file.on('finish', () => resolve(localPath));
        file.on('error', reject);
        response.on('error', reject);
        response.pipe(file);
      })
      .on('error', reject);
  });
}
<div class='person'>
<div>
<img src='https://www.fast-growing-trees.com/images/P/Leyland-Cypress-450-MAIN.jpg'>
<img src='http://cdn.shopify.com/s/files/1/2473/3486/products/Cypress_Leyland_2_Horticopia_d1b5b63a-8bf7-4897-96fb-05320bf3d81b_grande.jpg?v=1532991076'>
<img src='https://www.fast-growing-trees.com/images/P/Live-Oak-Tree-450w.jpg'>
<img src='https://www.greatgardenplants.com/images/uploads/452_1262_popup.jpg'>
<img src='https://shop.arborday.org/data/default/images/catalog/600/Turnkey/1/Leyland-Cypress_3-828.jpg'>
<img src='https://images-na.ssl-images-amazon.com/images/I/51RZkKnrlSL._SX425_.jpg'>
<img src='https://thumbs-prod.si-cdn.com/Z3JYiuJ96ReLq04NCT1B94sTd4E=/800x600/filters:no_upscale()/https://public-media.si-cdn.com/filer/06/9c/069cfb16-c46c-4742-85f0-3c7e45fa139d/mar2018_a05_talkingtrees.jpg'>
</div>
</div>

How can I scrape pages with dynamic content using node.js?

I am trying to scrape a website but I don't get some of the elements, because these elements are dynamically created.
I use cheerio in Node.js, and my code is below.
const request = require('request');
const cheerio = require('cheerio');

const url = 'http://www.bdtong.co.kr/index.php?c_category=C02';

// Fetch the raw HTML of `url` and print the href of the first link inside
// every `.listMain > li` item.
// NOTE: cheerio only parses the markup the server returned; it does not run
// the page's scripts, so list items appended client-side are not visible here.
request(url, (err, res, html) => {
  // Stop on transport errors instead of parsing a missing body.
  if (err) {
    console.error(err);
    return;
  }
  const $ = cheerio.load(html);
  $('.listMain > li').each((index, element) => {
    console.log($(element).find('a').attr('href'));
  });
});
This code returns an empty response, because when the page is loaded, the <ul id="store_list" class="listMain"> is empty.
The content has not been appended yet.
How can I get these elements using node.js? How can I scrape pages with dynamic content?
Here you go;
const phantom = require('phantom');

// Open the page in PhantomJS so that its scripts run, inject jQuery, read the
// link targets from the rendered DOM and print them from the Node side.
phantom.create((ph) => {
  ph.createPage((page) => {
    const url = 'http://www.bdtong.co.kr/index.php?c_category=C02';
    page.open(url, (status) => {
      // Release the PhantomJS process if the page could not be loaded.
      if (status !== 'success') {
        console.error(`Failed to open ${url} (status: ${status})`);
        ph.exit();
        return;
      }
      page.includeJs('http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js', () => {
        page.evaluate(
          // This function runs inside the page, not in Node: it cannot see
          // Node variables, and a console.log here would go to the page's
          // console. The hrefs are therefore returned to the Node callback.
          function () {
            return $('.listMain > li').map(function () {
              return $(this).find('a').attr('href');
            }).get();
          },
          (hrefs) => {
            // `hrefs` is whatever the page function returned; guard against
            // an empty result before iterating.
            (hrefs || []).forEach((href) => console.log(href));
            ph.exit();
          }
        );
      });
    });
  });
});
Check out GoogleChrome/puppeteer
Headless Chrome Node API
It makes scraping pretty trivial. The following example will scrape the headline over at npmjs.com (assuming .npm-expansions remains)
const puppeteer = require('puppeteer');

// Print the text content of the `.npm-expansions` element on npmjs.com.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://www.npmjs.com/');
    // The callback is evaluated inside the page, after its scripts have run.
    const textContent = await page.evaluate(() => {
      return document.querySelector('.npm-expansions').textContent;
    });
    console.log(textContent); /* No Problem Mate */
  } finally {
    // Always close the browser, even when navigation or evaluation fails,
    // so the Chrome process does not outlive the script.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
evaluate will allow for the inspection of the dynamic element as this will run scripts on the page.
Use the new npm module x-ray, with a pluggable web driver x-ray-phantom.
Examples in the pages above, but here's how to do dynamic scraping:
const phantom = require('x-ray-phantom');
const Xray = require('x-ray');

// x-ray instance that renders pages through the x-ray-phantom driver.
const x = Xray().driver(phantom());

// Scrape the <title> of the page and print it.
// The callback only uses names defined in this snippet; the previous version
// called `assert` and `done`, which are not defined here.
x('http://google.com', 'title')((err, str) => {
  if (err) {
    console.error(err);
    return;
  }
  console.log(str);
});
Answering this as a canonical, an alternative to Puppeteer for scraping dynamic sites which is also well-supported as of 2023 is Playwright. Here's a simple example:
const playwright = require("playwright"); // ^1.28.1

// Kept outside the function so the cleanup step can see the browser no
// matter where the run fails.
let browser;

/**
 * Launches Chromium, opens example.com and prints the text of the heading
 * matched by the locator. Errors are logged; the browser, if one was
 * launched, is closed afterwards in every case.
 */
async function run() {
  try {
    browser = await playwright.chromium.launch();
    const page = await browser.newPage();
    await page.goto("https://example.com");
    const heading = page.locator('h1:text("Example")');
    const text = await heading.textContent();
    console.log(text); // => Example Domain
  } catch (err) {
    console.error(err);
  } finally {
    await browser?.close();
  }
}

run();
The easiest and most reliable solution is to use puppeteer, as mentioned in https://pusher.com/tutorials/web-scraper-node, which is suitable for both static and dynamic scraping.
Only change the timeout in Browser.js, TimeoutSettings.js, Launcher.js 300000 to 3000000

Categories