I am trying to scrape a website, but I don't get some of the elements, because they are dynamically created.
I use cheerio in Node.js, and my code is below.
var request = require('request');
var cheerio = require('cheerio');
var url = "http://www.bdtong.co.kr/index.php?c_category=C02";
request(url, function (err, res, html) {
var $ = cheerio.load(html);
$('.listMain > li').each(function () {
console.log($(this).find('a').attr('href'));
});
});
This code returns an empty result, because when the page is loaded, the <ul id="store_list" class="listMain"> element is empty; the content has not been appended yet.
How can I get these elements using Node.js? How can I scrape pages with dynamic content?
Here you go:
var phantom = require('phantom');
phantom.create(function (ph) {
ph.createPage(function (page) {
var url = "http://www.bdtong.co.kr/index.php?c_category=C02";
page.open(url, function() {
page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
page.evaluate(function() {
$('.listMain > li').each(function () {
console.log($(this).find('a').attr('href'));
});
}, function(){
ph.exit()
});
});
});
});
});
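One gotcha with this approach: console.log inside page.evaluate runs in the PhantomJS page context, so its output may not show up in your Node terminal. A variant that returns the hrefs to Node instead (a sketch, using the same callback-style phantom API as the answer above, where the second argument to evaluate is a callback that receives the return value):
var phantom = require('phantom');

phantom.create(function (ph) {
  ph.createPage(function (page) {
    var url = "http://www.bdtong.co.kr/index.php?c_category=C02";
    page.open(url, function () {
      page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function () {
        page.evaluate(function () {
          // Runs inside the page: collect the hrefs and return them to Node.
          return $('.listMain > li').map(function () {
            return $(this).find('a').attr('href');
          }).get();
        }, function (hrefs) {
          console.log(hrefs); // Logged in the Node process, not the page.
          ph.exit();
        });
      });
    });
  });
});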
Check out GoogleChrome/puppeteer, the headless Chrome Node API.
It makes scraping pretty trivial. The following example will scrape the headline over at npmjs.com (assuming .npm-expansions remains):
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://www.npmjs.com/');
const textContent = await page.evaluate(() => {
return document.querySelector('.npm-expansions').textContent
});
console.log(textContent); /* No Problem Mate */
await browser.close();
})();
evaluate allows inspecting the dynamic element, since the callback runs as a script inside the page.
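Applied to the markup from the original question (assuming the .listMain list still exists on that page), the same approach might look like this sketch, using page.$$eval to collect the hrefs:
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('http://www.bdtong.co.kr/index.php?c_category=C02');

  // Runs in the page context and returns the serialized result to Node.
  const hrefs = await page.$$eval('.listMain > li a', (links) =>
    links.map((a) => a.getAttribute('href'))
  );

  console.log(hrefs);
  await browser.close();
})();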
Use the npm module x-ray with its pluggable web driver x-ray-phantom.
There are examples on the pages above, but here's how to do dynamic scraping:
var phantom = require('x-ray-phantom');
var Xray = require('x-ray');

var x = Xray()
.driver(phantom());

x('http://google.com', 'title')(function (err, str) {
if (err) return console.error(err);
console.log(str); // "Google"
});
Answering this as a canonical, an alternative to Puppeteer for scraping dynamic sites which is also well-supported as of 2023 is Playwright. Here's a simple example:
const playwright = require("playwright"); // ^1.28.1
let browser;
(async () => {
browser = await playwright.chromium.launch();
const page = await browser.newPage();
await page.goto("https://example.com");
const text = await page.locator('h1:text("Example")').textContent();
console.log(text); // => Example Domain
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
The easiest and most reliable solution is to use Puppeteer. As described in https://pusher.com/tutorials/web-scraper-node, it is suitable for both static and dynamic scraping.
The only change needed is the timeout in Browser.js, TimeoutSettings.js and Launcher.js: from 300000 to 3000000.
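If a longer timeout is all you need, recent Puppeteer versions also let you set it through the API instead of editing the library files; a sketch (the values are the ones mentioned above, and example.com is just a placeholder):
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch({ timeout: 3000000 }); // browser launch timeout
  const page = await browser.newPage();

  page.setDefaultNavigationTimeout(3000000); // applies to goto/waitForNavigation
  page.setDefaultTimeout(3000000);           // applies to the other waitFor* calls

  await page.goto('https://example.com');
  await browser.close();
})();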
First of all, I don't know much about services, web apps and such. I've been studying JS for the past 40 days, and I managed to build a simple web scraper that scrapes data about the US dollar and some indexes, saves it in a JSON file, and in my HTML I have a JS script that reads that JSON, does some math, and shows the result in the browser.
The thing is, I need the index.js file to run every day at 8 a.m. I already uploaded a static website and just put the files there (basically the images, styles, index.html and the JS), but since this involves index.js and Node files, and I need to run the script automatically, I have no idea how to go about it.
const puppeteer = require('puppeteer');
const fs = require('fs');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://br.advfn.com/investimentos/futuros/di-depositos-interfinanceiros/cotacoes',{
waitUntil: 'load',
timeout: 0
});
const textNode = await page.evaluate(()=>{
const nodeText = document.querySelector(".even.first").innerText;
const text = [nodeText];
return text
});
fs.writeFile('arreglo2.json', JSON.stringify(textNode), err =>{
if (err) throw new Error ('algo deu errado')
console.log('deu certo')
})
})();
//********************** get the DX (dollar index) **************************************************/
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://br.tradingview.com/symbols/TVC-DXY/',{
waitUntil: 'load',
timeout: 0
});
const textNode = await page.evaluate(()=>{
const nodeText = document.querySelector(".js-quote-ticker.tv-site-table__row.tv-widget-watch-list__row:nth-child(2)").children[1].children[1].children[0].innerHTML;
const text = [nodeText];
return text
});
fs.writeFile('arreglo.json', JSON.stringify(textNode), err =>{
if (err) throw new Error ('algo deu errado')
console.log('deu certo')
})
})();
/********** gets the closing quote of the dollar */
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://br.advfn.com/bolsa-de-valores/fx/USDBRL/cotacao',{
waitUntil: 'load',
timeout: 0
});
const textNode = await page.evaluate(()=>{
const nodeText = document.querySelector(".qs-current-price").innerText;
const text = [nodeText];
return text
});
fs.writeFile('cotacaoFechamento.json', JSON.stringify(textNode), err =>{
if (err) throw new Error ('algo deu errado')
console.log('deu certo')
})
})();
The code above is my index.js, the one that gets the info I need and saves it to JSON files.
Today, when I want to test it, I go to the terminal, type node index.js, wait till it's done, and then refresh my browser.
Ah, I had to download a XAMPP server and save my solution in the folder where it could be accessed through localhost (when I saved it in a normal C:\ folder it didn't scrape at all; I got some CORS-related error).
Thank you all in advance.
This isn't code that runs in a browser; it runs in the Node.js runtime (server side). If you're on Windows, you can use the Task Scheduler to schedule a task that runs every day at 8 a.m. and executes index.js with node.exe as the executable.
https://joshuatz.com/posts/2020/using-windows-task-scheduler-to-automate-nodejs-scripts/
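If you prefer the command line to the Task Scheduler GUI, the same daily 8 a.m. task can be created with schtasks (the task name and the path to index.js are placeholders; point it at wherever your script actually lives):
schtasks /Create /SC DAILY /ST 08:00 /TN "ScrapeQuotes" /TR "node C:\path\to\index.js"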
I want to scrape the URL value of an iframe on this website: https://lk21online.digital/nonton-profile-2021-subtitle-indonesia/
When I search for the iframe in the page source it isn't there, so I think the iframe is loaded by JavaScript after the page loads.
Or is my selector wrong?
Please help me check my selector, or tell me what I need to do in my code.
Sorry for my poor English...
Here is my code:
async function getDetail(res, url) {
try {
const html = await scraping(res, url)
const $ = cheerio.load(html)
const article = $('#site-container #content .gmr-maincontent #primary #main .gmr-box-content #muvipro_player_content_id #player1-tab-content')
let result = []
setTimeout(() => {
article.each(function () {
const title = $(this).find('.item-article h2').text()
const watch = $(this).find('iframe').attr('src')
result.push({
title,
watch,
})
})
res.json({ result })
}, 5000)
}
catch (err) {
console.log(err)
}
}
(Screenshot: this is the video iframe.)
You can't use cheerio for this. Cheerio is not dynamic; it just loads whatever HTML comes back from the request.
Looking at your webpage, most of the content is loaded asynchronously, so the initial HTML is pretty empty.
In addition, the video source is lazily loaded when it enters the browser viewport, so you have to use an actual headless browser to accomplish the task. Here's an example:
// iframeUrl.js
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
// Goto page
await page.goto("https://lk21online.digital/nonton-profile-2021-subtitle-indonesia/");
// Scroll down so the lazily loaded iframe gets inserted
await page.evaluate(() => window.scrollBy(0, 1000));
// Wait a bit for it to load
await new Promise((resolve) => setTimeout(resolve, 5000));
// Get the src of the iframe (this relies on jQuery already being present on the page)
const iframeUrl = await page.evaluate(`$("#player1-tab-content iframe").attr("src")`);
console.log(iframeUrl);
await browser.close();
process.exit(0);
})();
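If the fixed five-second sleep turns out to be flaky, a variant (a sketch, assuming the #player1-tab-content iframe selector above is right) is to wait for the element explicitly and read its src without relying on the page's jQuery. Keep the scroll so the lazy loading still triggers, then replace the sleep and the jQuery evaluate with:
await page.waitForSelector("#player1-tab-content iframe", { timeout: 15000 });
const iframeUrl = await page.$eval("#player1-tab-content iframe", (el) => el.getAttribute("src"));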
I need to get a lot of images from a few websites and download them to my disk so that I can use them (I will upload them to a blob (Azure) and then save the link to my DB).
GETTING THE IMAGES
I know how to get the images from the HTML with JS. For example, for one of the sites I would loop over the elements and do:
document.getElementsByClassName('person')[i].querySelector('div').querySelector('img').getAttribute('src')
And there I would have the links to all the images.
SAVING THE IMAGES
I also saw that I can save the files to disk using node and the fs module, by doing:
function saveImageToDisk(url, localPath) {
var fullUrl = url;
var file = fs.createWriteStream(localPath);
var request = https.get(url, function(response) {
response.pipe(file);
});
}
HOW TO PUT IT ALL TOGETHER
This is where I am stuck: I don't know exactly how to connect the two parts (the in-page script and the Node.js code). I want to get the image and also the image name (the alt tag in this case), and then use them in Node to upload the image to a blob and put the name and blob URL in my DB.
I thought I could download the HTML page and then put the JS script at the bottom of the body, but then I don't know how to pass the URLs to the Node.js code.
How can I do this?
I am not very used to working with in-page scripts; I have mostly used Node without them, and I get a bit confused by how to connect such scripts to my code.
Also, is this the best way to go about it, or is there a simpler/better way I am not seeing?
This feels like you should use a crawler. The following code should work (using the npm module crawler):
const Crawler = require("crawler")
const c = new Crawler({
callback: function(error, res, done) {
if (error) {
console.log({error})
} else {
const images = res.$('.person div img')
images.each(index => {
// here you can save the file or save them in an array to download them later
console.log({
src: images[index].attribs.src,
alt: images[index].attribs.alt,
})
})
}
}
})
c.queue('https://www.yoursite.com')
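For the saving part, the callback above can reuse the same https + fs approach from the question; a sketch (downloadImage is just an illustrative helper, and the base URL is the placeholder site from the snippet above, used to resolve relative src values):
const https = require("https")
const fs = require("fs")
const path = require("path")

// Illustrative helper: stream one image URL to disk.
function downloadImage(src, localPath) {
  https.get(src, (response) => response.pipe(fs.createWriteStream(localPath)))
}

// Inside the crawler callback shown above:
// images.each(index => {
//   const src = new URL(images[index].attribs.src, "https://www.yoursite.com").href
//   downloadImage(src, path.join(__dirname, `image-${index}.jpg`))
// })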
You need a bridge between the Web API (for DOM parsing etc.) and the Node.js API, for example some headless-browser management tool for Node.js. Say, you can use puppeteer with this script:
'use strict';
const puppeteer = require('puppeteer');
const https = require('https');
const fs = require('fs');
(async function main() {
try {
const browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto('https://en.wikipedia.org/wiki/Image');
const imgURLs = await page.evaluate(() =>
Array.from(
document.querySelectorAll('#mw-content-text img.thumbimage'),
({ src }) => src,
)
);
console.log(imgURLs);
await browser.close();
imgURLs.forEach((imgURL, i) => {
https.get(imgURL, (response) => {
response.pipe(fs.createWriteStream(`${i}.${imgURL.slice(-3)}`));
});
});
} catch (err) {
console.error(err);
}
})();
You can even avoid downloading the images a second time by reusing the bytes the browser has already fetched. This script saves the same images, but with a single session of requests and without the https Node.js module (which saves time, network traffic and server workload):
'use strict';
const puppeteer = require('puppeteer');
const fs = require('fs');
(async function main() {
try {
const browser = await puppeteer.launch();
const [page] = await browser.pages();
const allImgResponses = {};
page.on('response', (response) => {
if (response.request().resourceType() === 'image') {
allImgResponses[response.url()] = response;
}
});
await page.goto('https://en.wikipedia.org/wiki/Image');
const selectedImgURLs = await page.evaluate(() =>
Array.from(
document.querySelectorAll('#mw-content-text img.thumbimage'),
({ src }) => src,
)
);
console.log(selectedImgURLs);
let i = 0;
for (const imgURL of selectedImgURLs) {
fs.writeFileSync(
`${i++}.${imgURL.slice(-3)}`,
await allImgResponses[imgURL].buffer(),
);
}
await browser.close();
} catch (err) {
console.error(err);
}
})();
I recommend using the dom-parser module. See here: https://www.npmjs.com/package/dom-parser
With it, you can download the whole HTML file with http.get(), parse it using dom-parser, and then extract all the information you need from the HTML. With each image URL, use your saveImageToDisk() function.
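A rough sketch of that flow, assuming dom-parser's parseFromString / getElementsByTagName / getAttribute methods as I recall them from its README (double-check against the docs linked above), with a placeholder URL:
const https = require('https');
const DomParser = require('dom-parser');

// Download the raw HTML of a page.
function getHtml(url) {
  return new Promise((resolve, reject) => {
    https.get(url, (res) => {
      let html = '';
      res.on('data', (chunk) => (html += chunk));
      res.on('end', () => resolve(html));
    }).on('error', reject);
  });
}

(async () => {
  const html = await getHtml('https://www.yoursite.com'); // placeholder URL
  const dom = new DomParser().parseFromString(html);

  for (const img of dom.getElementsByTagName('img')) {
    const src = img.getAttribute('src');
    const alt = img.getAttribute('alt');
    console.log({ src, alt });
    // saveImageToDisk(src, `./images/${alt}.jpg`); // reuse the function from the question
  }
})();
Note that dom-parser only sees the server-rendered HTML, so this works for the parts of the pages that are not built by client-side JavaScript.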
Following your own idea, you would have to add the JS script to the HTML file as you mentioned, but in addition you would have to use Ajax (XMLHttpRequest) to POST the URLs to a Node.js server.
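A minimal sketch of that variant (Express and the /images route are my own assumptions, not something from the question): the script added to the bottom of the page collects the src/alt pairs and POSTs them, and the Node.js server receives them and saves the files.
// In the browser, at the bottom of the downloaded HTML page:
const images = Array.from(document.querySelectorAll('.person div img')).map((img) => ({
  src: img.getAttribute('src'),
  alt: img.getAttribute('alt'),
}));

const xhr = new XMLHttpRequest();
xhr.open('POST', 'http://localhost:3000/images');
xhr.setRequestHeader('Content-Type', 'application/json');
xhr.send(JSON.stringify(images));

// On the Node.js side (an assumed Express server):
const express = require('express');
const app = express();
app.use(express.json());

app.post('/images', (req, res) => {
  req.body.forEach(({ src, alt }, i) => {
    saveImageToDisk(src, `./images/${alt || i}.jpg`); // the function from the question
  });
  res.sendStatus(204);
});

app.listen(3000);
If the page is not served from the same origin as this server, you will also need to allow the request on the server side; the CORS error you mentioned is exactly this kind of problem.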
You can use a Promise and, inside it, do the job of collecting all the image URLs into an array. Then, inside the then method, you can either iterate over the array and call saveImageToDisk each time, or send the whole array to the middle layer with a slight modification. The second option is better since it makes only one network call.
function getImages() {
return new Promise((resolve, reject) => {
// Array.from will create an array
// map will return a new array with all the image url
let k = Array.from(document.getElementsByClassName('person')[0].querySelector('div')
.querySelectorAll('img'))
.map((item) => {
return item.getAttribute('src')
})
resolve(k)
})
}
getImages().then((d) => {
// this runs only after the promise is resolved
console.log('****', d);
d.forEach((item) => {
// call saveImageToDisk(item, localPath) for each url here
});
});
function saveImageToDisk(url, localPath) {
var fullUrl = url;
var file = fs.createWriteStream(localPath);
var request = https.get(url, function(response) {
response.pipe(file);
});
}
<div class='person'>
<div>
<img src='https://www.fast-growing-trees.com/images/P/Leyland-Cypress-450-MAIN.jpg'>
<img src='http://cdn.shopify.com/s/files/1/2473/3486/products/Cypress_Leyland_2_Horticopia_d1b5b63a-8bf7-4897-96fb-05320bf3d81b_grande.jpg?v=1532991076'>
<img src='https://www.fast-growing-trees.com/images/P/Live-Oak-Tree-450w.jpg'>
<img src='https://www.greatgardenplants.com/images/uploads/452_1262_popup.jpg'>
<img src='https://shop.arborday.org/data/default/images/catalog/600/Turnkey/1/Leyland-Cypress_3-828.jpg'>
<img src='https://images-na.ssl-images-amazon.com/images/I/51RZkKnrlSL._SX425_.jpg'>
<img src='https://thumbs-prod.si-cdn.com/Z3JYiuJ96ReLq04NCT1B94sTd4E=/800x600/filters:no_upscale()/https://public-media.si-cdn.com/filer/06/9c/069cfb16-c46c-4742-85f0-3c7e45fa139d/mar2018_a05_talkingtrees.jpg'>
</div>
</div>
I am attempting to scrape the HTML from this NCBI.gov page. I need to include the #see-all URL fragment so that I am guaranteed to get the search page, instead of retrieving the HTML of an incorrect gene page, https://www.ncbi.nlm.nih.gov/gene/119016.
URL fragments are not passed to the server; instead they are used by the page's JavaScript on the client side to (in this case) create entirely different HTML, which is what you get when you go to the page in a browser and use "View page source", and which is the HTML I want to retrieve. R's readLines() ignores the part of a URL after the #.
I tried using PhantomJS first, but it just returned the error described here, ReferenceError: Can't find variable: Map, which seems to result from PhantomJS not supporting some feature that NCBI uses, eliminating that route to a solution.
I had more success with Puppeteer, using the following JavaScript evaluated with Node.js:
const puppeteer = require('puppeteer');
(async() => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(
'https://www.ncbi.nlm.nih.gov/gene/?term=AGAP8#see-all');
var HTML = await page.content()
const fs = require('fs');
var ws = fs.createWriteStream(
'TempInterfaceWithChrome.js'
);
ws.write(HTML);
ws.end();
var ws2 = fs.createWriteStream(
'finishedFlag'
);
ws2.end();
browser.close();
})();
However, this returned what appeared to be the pre-rendered HTML. How do I (programmatically) get the final HTML that I see in the browser?
You can try to change this:
await page.goto(
'https://www.ncbi.nlm.nih.gov/gene/?term=AGAP8#see-all');
into this:
await page.goto(
'https://www.ncbi.nlm.nih.gov/gene/?term=AGAP8#see-all', {waitUntil: 'networkidle0'});
(In recent Puppeteer versions the option is 'networkidle0' or 'networkidle2' rather than the old 'networkidle'.)
Or, you can create a function listenFor() to listen to a custom event on page load:
function listenFor(type) {
return page.evaluateOnNewDocument(type => {
document.addEventListener(type, e => {
window.onCustomEvent({type, detail: e.detail});
});
}, type);
}
await listenFor('custom-event-ready'); // Listen for "custom-event-ready" custom event on page load.
Later edit:
This also might come in handy:
await page.waitForSelector('h3'); // replace h3 with your selector
Maybe try waiting for the navigation to finish:
await page.waitForNavigation(); // note: waitForNavigation takes an options object, not a number
and afterwards grab the content:
let html = await page.content();
I had success using the following to get HTML content that was generated after the page had loaded.
const browser = await puppeteer.launch();
try {
const page = await browser.newPage();
await page.goto(url);
await new Promise((resolve) => setTimeout(resolve, 2000)); // page.waitFor was removed in newer Puppeteer versions
let html_content = await page.evaluate(el => el.innerHTML, await page.$('.element-class-name'));
console.log(html_content);
} catch (err) {
console.log(err);
}
Hope this helps.
Waiting for network idle was not enough in my case, so I used dom loaded event:
await page.goto(url, {waitUntil: 'domcontentloaded', timeout: 60000} );
const data = await page.content();
Indeed you need innerHTML:
fs.writeFileSync("test.html", await (await page.$("html")).evaluate((el) => el.innerHTML));
If you want to actually await a custom event, you can do it this way.
const page = await browser.newPage();
/**
 * Attach an event listener to the page to capture a custom event on page load/navigation.
 * @param {string} type Event name.
 * @return {!Promise}
 */
function addListener(type) {
return page.evaluateOnNewDocument(type => {
// here we are in the browser context
document.addEventListener(type, e => {
window.onCustomEvent({ type, detail: e.detail });
});
}, type);
}
const evt = await new Promise(async resolve => {
// Define a window.onCustomEvent function on the page.
await page.exposeFunction('onCustomEvent', e => {
// here we are in the node context
resolve(e); // resolve the outer Promise here so we can await it outside
});
await addListener('app-ready'); // setup listener for "app-ready" custom event on page load
await page.goto('http://example.com'); // N.B! Do not use { waitUntil: 'networkidle0' } as that may cause a race condition
});
console.log(`${evt.type} fired`, evt.detail || '');
Built upon the example at https://github.com/GoogleChrome/puppeteer/blob/master/examples/custom-event.js