Cheerio selector after page loaded - javascript

I want to scrape the URL value of an iframe on this website: https://lk21online.digital/nonton-profile-2021-subtitle-indonesia/
When I search for the iframe in the page source it is not there, so I think the iframe is loaded by JavaScript after the page loads.
Or is my selector wrong?
Could somebody please check my selector, or tell me what I need to change in my code?
Sorry for my poor English.
Here is my code:
async function getDetail(res, url) {
  try {
    const html = await scraping(res, url)
    const $ = cheerio.load(html)
    const article = $('#site-container #content .gmr-maincontent #primary #main .gmr-box-content #muvipro_player_content_id #player1-tab-content')
    let result = []
    setTimeout(() => {
      article.each(function () {
        const title = $(this).find('.item-article h2').text()
        const watch = $(this).find('iframe').attr('src')
        result.push({
          title,
          watch,
        })
      })
      res.json({ result })
    }, 5000)
  }
  catch (err) {
    console.log(err)
  }
}
This is the video iframe I am trying to get.

You can't use cheerio for this. Cheerio is not dynamic; it just parses whatever HTML comes back from the request.
Looking at your page, most of the content is loaded asynchronously, so the initial HTML is mostly empty.
In addition, the video source is lazily loaded only once it enters the browser viewport, so you have to use an actual headless browser to accomplish the task. Here's an example:
// iframeUrl.js
const puppeteer = require("puppeteer");

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  // Go to the page
  await page.goto("https://lk21online.digital/nonton-profile-2021-subtitle-indonesia/");

  // Scroll down so the lazily loaded player enters the viewport
  await page.evaluate(() => window.scrollBy(0, 1000));

  // Wait a bit for the iframe to be injected
  await new Promise((resolve) => setTimeout(resolve, 5000));

  // Get the src of the iframe (this relies on the jQuery the page itself loads)
  const iframeUrl = await page.evaluate(`$("#player1-tab-content iframe").attr("src")`);
  console.log(iframeUrl);

  await browser.close();
  process.exit(0);
})();
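If you would rather not depend on a fixed 5-second sleep or on the page's own jQuery, here is a variation of the same idea, only a sketch and assuming the "#player1-tab-content iframe" selector from your code still matches, that waits for the iframe explicitly and reads its src with plain DOM APIs:

// Sketch: wait for the iframe instead of sleeping for a fixed time.
// Assumes the "#player1-tab-content iframe" selector from the question still matches.
const puppeteer = require("puppeteer");

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto("https://lk21online.digital/nonton-profile-2021-subtitle-indonesia/");

  // Trigger the lazy loading by scrolling, then wait until the iframe actually exists
  await page.evaluate(() => window.scrollBy(0, 1000));
  await page.waitForSelector("#player1-tab-content iframe", { timeout: 30000 });

  // Read the src attribute without relying on the page's jQuery
  const iframeUrl = await page.$eval("#player1-tab-content iframe", (el) => el.src);
  console.log(iframeUrl);

  await browser.close();
})();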

Related

How can I upload my website with a web scraper in it and run it?

First of all, I don't know much about services, web apps and such. I've been studying JS for the past 40 days and I managed to build a simple web scraper that scrapes data about the US dollar and some indexes, saves it in a JSON file, and in my HTML I have a JS file that processes that JSON, does some math and shows it in the browser.
The thing is, I need the index.js file to run every day at 8 a.m. I already uploaded a static website where I just put the files (basically img, style, index and js), but since this one has index.js and Node files, and I need to run the index automatically, I have no idea how to go about it.
const puppeteer = require('puppeteer');
const fs = require('fs');

// Get the DI (interbank deposit) futures quote
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://br.advfn.com/investimentos/futuros/di-depositos-interfinanceiros/cotacoes', {
    waitUntil: 'load',
    timeout: 0
  });
  const textNode = await page.evaluate(() => {
    const nodeText = document.querySelector(".even.first").innerText;
    const text = [nodeText];
    return text;
  });
  fs.writeFile('arreglo2.json', JSON.stringify(textNode), err => {
    if (err) throw new Error('algo deu errado');
    console.log('deu certo');
  });
})();

//********************** get the DX (dollar index) **********************
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://br.tradingview.com/symbols/TVC-DXY/', {
    waitUntil: 'load',
    timeout: 0
  });
  const textNode = await page.evaluate(() => {
    const nodeText = document.querySelector(".js-quote-ticker.tv-site-table__row.tv-widget-watch-list__row:nth-child(2)").children[1].children[1].children[0].innerHTML;
    const text = [nodeText];
    return text;
  });
  fs.writeFile('arreglo.json', JSON.stringify(textNode), err => {
    if (err) throw new Error('algo deu errado');
    console.log('deu certo');
  });
})();

/********** get the closing quote of the dollar **********/
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://br.advfn.com/bolsa-de-valores/fx/USDBRL/cotacao', {
    waitUntil: 'load',
    timeout: 0
  });
  const textNode = await page.evaluate(() => {
    const nodeText = document.querySelector(".qs-current-price").innerText;
    const text = [nodeText];
    return text;
  });
  fs.writeFile('cotacaoFechamento.json', JSON.stringify(textNode), err => {
    if (err) throw new Error('algo deu errado');
    console.log('deu certo');
  });
})();
The code above is my index.js, the one that gets the info I need and saves it to a JSON file.
Today, when I want to test it, I go to the terminal, type node index.js, wait until it finishes, and then refresh my browser.
Also, I had to download a XAMPP server and save my solution in the folder where it could be accessed through localhost (when I saved it on a normal C:\ path it didn't scrape at all, due to some CORS-related error).
Thank you all in advance.
This isn't code that runs in a browser; it runs in the Node.js runtime (server side). If you're on Windows, you can use the Task Scheduler to create a task that runs every day at 8 a.m. and executes your index.js file, with node.exe as the executable.
https://joshuatz.com/posts/2020/using-windows-task-scheduler-to-automate-nodejs-scripts/
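If you would rather keep everything in Node instead of relying on the OS scheduler, a minimal sketch using the node-cron package could look like the code below (node-cron is not part of your current setup, so this is only an assumption; you would need to npm install node-cron first):

// schedule.js -- hypothetical wrapper around your scraper, not part of the original project
const cron = require('node-cron');
const { exec } = require('child_process');

// '0 8 * * *' means: every day at 08:00, server local time
cron.schedule('0 8 * * *', () => {
  exec('node index.js', (err, stdout, stderr) => {
    if (err) return console.error('scrape failed:', err);
    console.log(stdout);
  });
});

Note that this only fires while the schedule.js process itself is running, so it has to be kept alive (for example with a process manager); the Task Scheduler route avoids that requirement.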

puppeteer - scraping content on infinite scroll pages stops working after overflow is set

I'm doing a coding exercise with Node.js and Puppeteer: I want to make a simple Instagram scraping script. I tried using axios and the HTTP module, but with that solution I was unable to fetch all the content because of the cursors used for pagination. I then switched to Puppeteer and am trying this code:
#!/usr/bin/env node
const puppeteer = require('puppeteer');

let imagesLinks = [];

function extractItems() {
  const extractedElements = document.querySelectorAll('img');
  const items = [];
  for (let element of extractedElements) {
    items.push(element.src);
  }
  console.log(items);
  return items;
}

async function scrapeInfiniteScrollItems(page, extractItems, itemTargetCount, scrollDelay = 1000) {
  let items = [];
  try {
    let previousHeight;
    while (items.length < itemTargetCount) {
      items = await page.evaluate(extractItems);
      previousHeight = await page.evaluate('document.body.scrollHeight');
      await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
      await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
      await page.waitFor(scrollDelay);
    }
  } catch (e) { }
  return items;
}

(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    //args: ['--incognito']
  });
  const page = await browser.newPage();
  page.setViewport({ height: 680, width: 1280 });
  await page.goto('https://instagram.com/username');
  page.waitForSelector('.aOOlW.bIiDR').then(el => el.click());

  // Scroll and extract items from the page.
  const items = await scrapeInfiniteScrollItems(page, extractItems, 100);
  console.log(items);
})();
When I run the script, Puppeteer opens two Chromium tabs: one is blank and the second opens the link passed to page.goto. Is it possible to make Puppeteer open only the tab that displays the desired URL?
Another problem I'm facing is with the Instagram infinite scroll. I'm able to click the cookie popup that appears after the page loads, and the script then starts grabbing the image links, but after a few scrolls it stops scrolling because a login popup appears and the body gets the overflow: hidden property. How can I keep the script scrolling until the page reaches the bottom? Is it possible to remove that CSS property and then let the script continue scrolling and scraping content?
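For the last part (the scroll lock), one possible approach, only a sketch and not tested against Instagram's current markup, is to undo the overflow: hidden and remove the popup from inside the scroll loop before each scroll:

// Sketch: call this inside the while loop in scrapeInfiniteScrollItems, before scrolling.
// The dialog selector is hypothetical; Instagram's markup and class names change frequently.
async function unlockScrolling(page) {
  await page.evaluate(() => {
    document.body.style.overflow = 'visible';                  // re-enable scrolling
    const dialog = document.querySelector('[role="dialog"]');  // possible login popup
    if (dialog) dialog.remove();                                // drop it if present
  });
}

Whether this is enough depends on how aggressively the page re-applies the style, so it may need to be repeated on every iteration.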

Retrieving JavaScript Rendered HTML with Puppeteer

I am attempting to scrape the HTML from this NCBI.gov page. I need to include the #see-all URL fragment so that I am guaranteed to get the search page instead of retrieving the HTML from an incorrect gene page, https://www.ncbi.nlm.nih.gov/gene/119016.
URL fragments are not passed to the server; they are instead used client-side by the page's JavaScript to (in this case) create entirely different HTML, which is what you get when you go to the page in a browser and "View page source", and which is the HTML I want to retrieve. R's readLines() ignores the part of a URL after #.
I tried using PhantomJS first, but it just returned the error described here, ReferenceError: Can't find variable: Map, which seems to result from PhantomJS not supporting some feature that NCBI uses, so that route is out.
I had more success with Puppeteer, using the following JavaScript run with Node.js:
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(
    'https://www.ncbi.nlm.nih.gov/gene/?term=AGAP8#see-all');
  var HTML = await page.content();

  const fs = require('fs');
  var ws = fs.createWriteStream(
    'TempInterfaceWithChrome.js'
  );
  ws.write(HTML);
  ws.end();

  var ws2 = fs.createWriteStream(
    'finishedFlag'
  );
  ws2.end();

  browser.close();
})();
However, this returned what appears to be the pre-rendered HTML. How do I programmatically get the final HTML that I see in the browser?
You can try to change this:
  await page.goto('https://www.ncbi.nlm.nih.gov/gene/?term=AGAP8#see-all');
into this:
  await page.goto('https://www.ncbi.nlm.nih.gov/gene/?term=AGAP8#see-all', { waitUntil: 'networkidle' });
(In newer Puppeteer versions the equivalent options are 'networkidle0' or 'networkidle2'.)
Or, you can create a function listenFor() to listen for a custom event on page load:

function listenFor(type) {
  return page.evaluateOnNewDocument(type => {
    document.addEventListener(type, e => {
      window.onCustomEvent({ type, detail: e.detail });
    });
  }, type);
}

await listenFor('custom-event-ready'); // Listen for "custom-event-ready" custom event on page load.
Later edit: this also might come in handy:
await page.waitForSelector('h3'); // replace h3 with your selector
Maybe try waiting for navigation first:
await page.waitForNavigation();
and afterwards:
let html = await page.content();
I had success using the following to get html content that was generated after the page has been loaded.
const browser = await puppeteer.launch();
try {
  const page = await browser.newPage();
  await page.goto(url);
  await page.waitFor(2000);
  let html_content = await page.evaluate(el => el.innerHTML, await page.$('.element-class-name'));
  console.log(html_content);
} catch (err) {
  console.log(err);
}
Hope this helps.
Waiting for network idle was not enough in my case, so I used dom loaded event:
await page.goto(url, {waitUntil: 'domcontentloaded', timeout: 60000} );
const data = await page.content();
Indeed you need innerHTML:
fs.writeFileSync(
  "test.html",
  await (await page.$("html")).evaluate(content => content.innerHTML)
);
If you want to actually await a custom event, you can do it this way.
const page = await browser.newPage();

/**
 * Attach an event listener to page to capture a custom event on page load/navigation.
 * @param {string} type Event name.
 * @return {!Promise}
 */
function addListener(type) {
  return page.evaluateOnNewDocument(type => {
    // here we are in the browser context
    document.addEventListener(type, e => {
      window.onCustomEvent({ type, detail: e.detail });
    });
  }, type);
}

const evt = await new Promise(async resolve => {
  // Define a window.onCustomEvent function on the page.
  await page.exposeFunction('onCustomEvent', e => {
    // here we are in the node context
    resolve(e); // resolve the outer Promise here so we can await it outside
  });
  await addListener('app-ready'); // set up listener for "app-ready" custom event on page load
  await page.goto('http://example.com'); // N.B.! Do not use { waitUntil: 'networkidle0' } as that may cause a race condition
});

console.log(`${evt.type} fired`, evt.detail || '');
Built upon the example at https://github.com/GoogleChrome/puppeteer/blob/master/examples/custom-event.js

How can I scrape pages with dynamic content using node.js?

I am trying to scrape a website, but I don't get some of the elements, because they are created dynamically.
I use cheerio in node.js and my code is below.
var request = require('request');
var cheerio = require('cheerio');
var url = "http://www.bdtong.co.kr/index.php?c_category=C02";

request(url, function (err, res, html) {
  var $ = cheerio.load(html);
  $('.listMain > li').each(function () {
    console.log($(this).find('a').attr('href'));
  });
});
This code returns an empty result, because when the page is first loaded the <ul id="store_list" class="listMain"> is empty; the content has not been appended yet.
How can I get these elements using node.js? How can I scrape pages with dynamic content?
Here you go:
var phantom = require('phantom');

phantom.create(function (ph) {
  ph.createPage(function (page) {
    var url = "http://www.bdtong.co.kr/index.php?c_category=C02";
    page.open(url, function () {
      page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function () {
        page.evaluate(function () {
          $('.listMain > li').each(function () {
            console.log($(this).find('a').attr('href'));
          });
        }, function () {
          ph.exit();
        });
      });
    });
  });
});
Check out GoogleChrome/puppeteer, the headless Chrome Node API.
It makes scraping pretty trivial. The following example will scrape the headline over at npmjs.com (assuming .npm-expansions remains):
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://www.npmjs.com/');

  const textContent = await page.evaluate(() => {
    return document.querySelector('.npm-expansions').textContent;
  });

  console.log(textContent); /* No Problem Mate */
  browser.close();
})();
evaluate allows you to inspect the dynamic element, since it runs scripts in the page.
Use the new npm module x-ray, with a pluggable web driver x-ray-phantom.
Examples in the pages above, but here's how to do dynamic scraping:
var phantom = require('x-ray-phantom');
var Xray = require('x-ray');

var x = Xray()
  .driver(phantom());

x('http://google.com', 'title')(function (err, str) {
  if (err) return done(err);
  assert.equal('Google', str);
  done();
});
Answering this as a canonical: an alternative to Puppeteer for scraping dynamic sites, also well-supported as of 2023, is Playwright. Here's a simple example:
const playwright = require("playwright"); // ^1.28.1

let browser;
(async () => {
  browser = await playwright.chromium.launch();
  const page = await browser.newPage();
  await page.goto("https://example.com");
  const text = await page.locator('h1:text("Example")').textContent();
  console.log(text); // => Example Domain
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
The easiest and most reliable solution is to use Puppeteer, as described in https://pusher.com/tutorials/web-scraper-node, which covers both static and dynamic scraping.
Only change the timeouts in Browser.js, TimeoutSettings.js and Launcher.js from 300000 to 3000000.
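As a side note (not from the linked tutorial): rather than editing the library's own files, recent Puppeteer versions let you raise timeouts through the API, roughly like this:

// Sketch: raising timeouts via the API instead of patching library files.
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch({ timeout: 3000000 }); // browser launch timeout in ms
  const page = await browser.newPage();
  page.setDefaultNavigationTimeout(3000000); // applies to page.goto and navigation waits
  page.setDefaultTimeout(3000000);           // applies to waitForSelector and friends
  await page.goto('https://example.com');
  await browser.close();
})();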
