puppeteer only scraping around 200 pages and don't continue - javascript

for some reason that I don't understand my node app stops scraping after few minutes without any errors while only scraping, btw its an infinity scroll website...
this is the code:
const fs = require('fs');
(async() => {
// start the browser
const browser = await puppeteer.launch({ args: ['--no-sandbox'] });
// open a new page
const page = await browser.newPage();
const pageURL = 'http://www.yad4.co.il/dogs//////////////#1';
try {
// try to go to URL
await page.goto(pageURL);
console.log(`opened the page: ${pageURL}`);
await page.setViewport({
width: 1200,
height: 800
});
await autoScroll(page);
} catch (error) {
console.log(`failed to open the page: ${pageURL} with the error: ${error}`);
}
// Find all links to dogs
const postsSelector = '.yd-search-page .container .row .col-md-9 .yd-gallery .search-handler-yd .col-xs-12 #dogs_more .col-md-4 .yd-dog-img .yd-mask a';
await page.waitForSelector(postsSelector);
const postUrls = await page.$$eval(postsSelector, postLinks => postLinks.map(link => link.href));
// Visit each page one by one
for (let postUrl of postUrls) {
// open the page
try {
await page.goto(postUrl);
console.log('opened the page: ', postUrl);
} catch (error) {
console.log(error);
console.log('failed to open the page: ', postUrl);
}
// get the name of the dog
const dogSelector = '.adopt.yd-amuta .container .yd-dog-cont .col-xs-12 .adopt-head .row .col-sm-6 .adopt-breadcrumb-title h2 span';
// await page.waitForSelector(dogSelector);
const dogName = await page.$eval(dogSelector, dogSelector => dogSelector.innerHTML);
// Writing the news inside a json file
fs.appendFile("dogtest4.json", JSON.stringify({dogName},), function(err) {
if (err) throw err;
console.log("Saved!");
});
}
// all done, close the browser
await browser.close();
async function autoScroll(page){
await page.evaluate(async () => {
await new Promise((resolve, reject) => {
var totalHeight = 0;
var distance = 100;
var timer = setInterval(() => {
var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
if(totalHeight >= scrollHeight){
clearInterval(timer);
resolve();
}
}, 100);
});
});
}
process.exit()
})();
so it gives me information but randomly, I mean sometimes it gives me 115 pages sometimes 300 pages and some times barely 90 pages and I don't understand why,
please help me.
Thank you.

I can't comment, but I suppose it could be something to do with memory limit being reached and that slows things down.
you can try adding "await" in front of fs.appendFile(...) like here explained, might work for you

Related

Cheerio selector after page loaded

I want to scrape a url value of iframe in this website: https://lk21online.digital/nonton-profile-2021-subtitle-indonesia/
When i search iframe from view page source its not found, i think iframe is loaded after page loaded by javascript
Or my selector is wrong?
Please somebody help me to check my selector or what i need to do for my code
Sorry for my poor english...
There is my code:
async function getDetail(res, url) {
try {
const html = await scraping(res, url)
const $ = cheerio.load(html)
const article = $('#site-container #content .gmr-maincontent #primary #main .gmr-box-content #muvipro_player_content_id #player1-tab-content')
let result = []
setTimeout(() => {
article.each(function () {
const title = $(this).find('.item-article h2').text()
const watch = $(this).find('iframe').attr('src')
result.push({
title,
watch,
})
})
res.json({ result })
}, 5000)
}
catch (err) {
console.log(err)
}
}
this is video iframe
You can't use cheerio for this. Cheerio is not dynamic and just loads whatever html is coming back from the request.
Looking at your webpage, most content is loaded async, so the initial html will be pretty empty.
In addition the video source is lazily loaded when it enters the browser window. So you have to use an actual headless browser to accomplish the task. Here's an example:
// iframeUrl.js
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
// Goto page
await page.goto("https://lk21online.digital/nonton-profile-2021-subtitle-indonesia/");
// Scroll down
page.evaluate((_) => window.scrollBy(0, 1000));
// Wait a bit
await new Promise((resolve) => setTimeout(resolve, 5000));
// Get the src of the iframe
const iframeUrl = await page.evaluate(`$("#player1-tab-content iframe").attr("src")`);
console.log(iframeUrl);
await browser.close();
process.exit(0);
})();

Web scraping a stream chat in real-time (puppeteer.js)

I want to get the chat from a stream in real time with web scraping.
Attempting to create a while loop inside of the .then() function of puppeeter doesn't seem to be effective, and in some implementations breaks it all together.
I am able to get the initial scrape to happen, but in all cases the program ends and does not want to follow the while loop I implemented.
Working code WITHOUT while loop
const puppeteer = require ('puppeteer');
//initiating Puppeteer
puppeteer
.launch ()
.then (async browser => {
//opening a new page and navigating to the live stream
const page = await browser.newPage ();
await page.goto ('https://www.younow.com/Ken_Nara24');
await page.waitForSelector ('body');
//manipulating the page's content
let getComments = await page.evaluate (() => {
let comments = document.body.querySelectorAll ('.comment');
let scrapeItems = [];
comments.forEach (item => {
let commentAuthor = item.querySelector ('div.user-card__header.mini-profile-launcher').innerText;
let commentContent = '';
try {
commentContent = item.querySelector ('div.user-card__body.ng-star-inserted').innerText;
} catch (err) {}
scrapeItems.push ({
commentAuthor: commentAuthor,
commentContent: commentContent,
});
});
let items = {
"userComments": scrapeItems,
};
return items;
});
//outputting the scraped data
console.log (getComments);
//closing the browser
await browser.close ();
})
//handling any errors
.catch (function (err) {
console.error (err);
});
All attempts to get that logic looping have been for naught. I cannot find a way or past issue/example that clearly defines how or if such a thing could be done. I have made a few attempts to implement it myself, but nothing has even compiled correctly.
Am I missing something significant here? I just want to essentially listen to a web page and every 3-5 seconds re-scrape it.
If you still need help you could give this way a try.
const puppeteer = require("puppeteer");
let pageScraping = false; /* set scraping to false */
const scraper = async () => {
if (pageScraping == true) return; /* check if already scraping page */
let browser, page;
let pageUrl = 'https://www.younow.com/Ken_Nara24';
try {
pageScraping = true; /* set scraping to true */
browser = await puppeteer.launch({ headless: true });
page = await browser.newPage();
await page.goto(pageUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
/* wait for chat to be visible */
await page.waitForSelector('.chat', { visible: true, timeout: 60000 });
let getComments = await page.evaluate(() => {
let scrapeComments = [];
let comments = document.querySelectorAll('.comment');
comments.forEach(comment => {
let commentContent = '';
let commentAuthor = comment.querySelector('div[class="user-card__header mini-profile-launcher"]').innerText;
commentContent = comment.querySelector('div[class="user-card__body ng-star-inserted"]').innerText;
scrapeComments.push({
'commentAuthor': commentAuthor,
'commentContent': commentContent,
});
});
return { 'userComments': scrapeComments };
});
console.log(await getComments); /* log comments */
} catch (err) {
console.log(err.message);
} finally {
if (browser) { /* check if browser is open befor trying to close */
await browser.close();
console.log('closing browser');
}
pageScraping = false; /* set scraping to false again */
await setTimeout(scraper, 5000); /* wait 5 seconds befor re-scraping */
}
}
setTimeout(scraper, 5000); /* start scraping */

Initializing a Puppeteer Browser Outside of Scraping Function

I am very new to puppeteer (I started today). I have some code that is working the way that I want it to except for an issue that I think is making it extremely inefficient. I have a function that links me through potentially thousands of urls that have incremental IDs to pull the name, position, and stats of each player and then inserts that data into a neDB database. Here is my code:
const puppeteer = require('puppeteer');
const Datastore = require('nedb');
const database = new Datastore('database.db');
database.loadDatabase();
async function scrapeProduct(url, id){
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url);
let attributes = [];
const [name] = await page.$x('//*[#id="ctl00_ctl00_ctl00_Main_Main_name"]');
const txt = await name.getProperty('innerText');
const playerName = await txt.jsonValue();
attributes.push(playerName);
//Make sure that there is a legitimate player profile before trying to pull a bunch of 'undefined' information.
if(playerName){
const [role] = await page.$x('//*[#id="ctl00_ctl00_ctl00_Main_Main_position"]');
const roleTxt = await role.getProperty('innerText');
const playerRole = await roleTxt.jsonValue();
attributes.push(playerRole);
//Loop through the 12 attributes and pull their values.
for(let i = 1; i < 13; i++){
let vLink = '//*[#id="ctl00_ctl00_ctl00_Main_Main_SectionTabBox"]/div/div/div/div[1]/table/tbody/tr['+i+']/td[2]';
const [e1] = await page.$x(vLink);
const val = await e1.getProperty('innerText');
const skillVal = await val.jsonValue();
attributes.push(skillVal);
}
//Create a player profile to be pushed into the database. (I realize this is very wordy and ugly code)
let player = {
Name: attributes[0],
Role: attributes[1],
Athleticism: attributes[2],
Speed: attributes[3],
Durability: attributes[4],
Work_Ethic: attributes[5],
Stamina: attributes[6],
Strength: attributes[7],
Blocking: attributes[8],
Tackling: attributes[9],
Hands: attributes[10],
Game_Instinct: attributes[11],
Elusiveness: attributes[12],
Technique: attributes[13],
_id: id,
};
database.insert(player);
console.log('player #' + id + " scraped.");
await browser.close();
} else {
console.log("Blank profile");
await browser.close();
}
}
//Making sure the first URL is scraped before moving on to the next URL. (i removed the URL because its unreasonably long and is not important for this part).
(async () => {
for(let i = 0; i <= 1000; i++){
let link = 'https://url.com/Ratings.aspx?rid='+i+'&section=Ratings';
await scrapeProduct(link, i);
}
})();
What I think is making this so inefficient is the fact that everytime scrapeProduct() is called, i create a new browser and create a new page. Instead I believe it would be more efficient to create 1 browser and 1 page and just change the pages URL with
await page.goto(url)
I believe that in order to do what I'm trying to accomplish here, i need to move:
const browser = await puppeteer.launch();
const page = await browser.newPage();
outside of my scrapeProduct() function but i cannot seem to get this to work. Anytime I try i get an error in my function saying that page is not defined. I am very new to puppeteer (started today), I would appreciate any guidance on how to accomplish this. Thank you very much!
TL;DR
How do i create 1 Browser instance and 1 Page instance that a function can use repeatedly by only changing the await page.goto(url) function.
About a year ago I tried to a make an React Native Pokemon Go helper app. Since there wasn't an api for pokemon nest and pokestops I created a server that scraped thesilphroad.com and I found the need to implement something like #Arkan said.
I wanted the server to be able to take multiple request, so I decided to initialize the browser when the server is booted up. When a request is received, the server checks to see if MAX_TABS have been reached. If reached, it waits, if not a new tab is opened and the scrape is performed
Here's the scraper.js
const puppeteer = require ('puppeteer')
const fs = require('fs')
const Page = require('./Page')
const exec = require('child_process').exec
const execSync = require('util').promisify(exec)
module.exports = class scraper {
constructor(){
this.browser = null
this.getPages = null
this.getTotalPages = null
this.isRunning = false
//browser permissions
this.permissions = ['geolocation']
this.MAX_TABS = 5
//when puppeteer launches
this.useFirstTab = true
}
async init(config={}){
let headless = config.headless != undefined ? config.headless : true
this.permissions = this.permissions.concat(config.permissions || [])
//get local chromium location
let browserPath = await getBrowserPath('firefox') || await getBrowserPath('chrome')
this.browser = await puppeteer.launch({
headless:headless,
executablePath:browserPath,
defaultViewport:null,
args:[
'--start-maximized',
]
})
this.getPages = this.browser.pages
this.getTotalPages = ()=>{
return this.getPages().then(pages=>pages.length).catch(err=>0)
}
this.isRunning = true
}
async waitForTab(){
let time = Date.now()
let cycles = 1
await new Promise(resolve=>{
let interval = setInterval(async()=>{
let totalPages = await this.getTotalPages()
if(totalPages < this.MAX_TABS){
clearInterval(interval)
resolve()
}
if(Date.now() - time > 100)
console.log('Waiting...')
if(Date.now() - time > 20*1000){
console.log('... ...\n'.repeat(cycle)+'Still waiting...')
cycle++
time = Date.now()
}
},500)
})
}
//open new tab and go to page
async openPage(url,waitSelector,lat,long){
await this.waitForTab()
let pg
//puppeteer launches with a blank tab, use this
// if(this.useFirstTab){
// let pages = await this.browser.pages()
// pg = pages.pop()
// this.useFirstTab = false
// }
// else
pg = await this.browser.newPage()
if(lat && long){
await this.setPermissions(url)
}
let page = await new Page()
await page.init(pg,url,waitSelector,lat,long)
return page
}
async setPermissions(url){
const context = this.browser.defaultBrowserContext();
await context.overridePermissions(url,this.permissions)
}
}
// assumes that the browser is in path
async function getBrowserPath(browserName){
return execSync('command -v chromium').then(({stdout,stderr})=>{
if(stdout.includes('not found'))
return null
return stdout
}).catch(err=>null)
}
The scraper imports Page.js, which is just wrapper for a puppeteer Page object with the functions I used most made available
const path = require('path')
const fs = require('fs')
const userAgents = require('./staticData/userAgents.json')
const cookiesPath = path.normalize('./cookies.json')
// a wrapper for a puppeteer page with pre-made functions
module.exports = class Page{
constuctor(useCookies=false){
this.page = null
this.useCookies = useCookies
this.previousSession = this.useCookies && fs.existsSync(cookiesPath)
}
async close (){
await this.page.close()
}
async init(page,url,waitSelector,lat,long){
this.page = page
let userAgent = userAgents[Math.floor(Math.random()*userAgents.length)]
await this.page.setUserAgent(userAgent)
await this.restoredSession()
if(lat && long)
await this.page.setGeolocation({
latitude: lat || 59.95, longitude:long || 30.31667, accuracy:40
})
await this.page.goto(url)
await this.wait(waitSelector)
}
async screenshotElement(selector='body',directory='./screenshots',padding=0,offset={}) {
const rect = await this.page.evaluate(selector => {
const el = document.querySelector(selector)
const {x, y, width, height} = el.getBoundingClientRect()
return {
left: x,
top: y,
width,
height,
id: el.id
}
}, selector)
let ext = 'jpeg'
let filename = path.normalize(directory+'/'+Date.now())
return await this.page.screenshot({
type:ext,
path:filename+' - '+selector.substring(5)+'.'+ext,
clip: {
x: rect.left - padding+(offset.left || 0),
y: rect.top - padding+(offset.right || 0),
width: rect.width + padding * 2+(offset.width||0),
height: rect.height + padding * 2+ (offset.height||0)
},
encoding:'base64'
})
}
async restoredSession(){
if(!this.previousSession)
return false
let cookies = require(cookiesPath)
for(let cookie of cookies){
await this.page.setCookie(cookie)
}
console.log('Loaded previous session')
return true
}
async saveSession(){
//write cookie to file
if(!this.useCookies)
return
const cookies = await this.page.cookies()
fs.writeFileSync(cookiesPath,JSON.stringify(cookies,null,2))
console.log('Wrote cookies to file')
}
//wait for text input elment and type text
async type(selector,text,options={delay:150}){
await this.wait(selector)
await this.page.type(selector,text,options)
}
//click and waits
async click(clickSelector,waitSelector=500){
await this.page.click(clickSelector)
await this.wait(waitSelector)
}
//hovers over element and waits
async hover(selector,waitSelector=500){
await this.page.hover(selector)
await this.wait(1000)
await this.wait(waitSelector)
}
//waits and suppresses timeout errors
async wait(selector=500, waitForNav=false){
try{
//waitForNav is a puppeteer's waitForNavigation function
//which for me does nothing but timeouts after 30s
waitForNav && await this.page.waitForNavigation()
await this.page.waitFor(selector)
} catch (err){
//print everything but timeout errors
if(err.name != 'Timeout Error'){
console.log('error name:',err.name)
console.log(err)
console.log('- - - '.repeat(4))
}
this.close()
}
}
}
``
To achieve this, you'll just need to separate the browser from your requests, like in a class, for example:
class PuppeteerScraper {
async launch(options = {}) {
this.browser = await puppeteer.launch(options);
// you could reuse the page instance if it was defined here
}
/**
* Pass the address and the function that will scrape your data,
* in order to mantain the page inside this object
*/
async goto(url, callback) {
const page = await this.browser.newPage();
await page.goto(url);
/**evaluate its content */
await callback(page);
await page.close();
}
async close() {
await this.browser.close();
}
}
and, to implement it:
/**
* scrape function, takes the page instance as its parameters
*/
async function evaluate_page(page) {
const titles = await page.$$eval('.col-xs-6 .star-rating ~ h3 a', (itens) => {
const text_titles = [];
for (const item of itens) {
if (item && item.textContent) {
text_titles.push(item.textContent);
}
}
return text_titles;
});
console.log('titles', titles);
}
(async () => {
const scraper = new PuppeteerScraper();
await scraper.launch({ headless: false });
for (let i = 1; i <= 6; i++) {
let link = `https://books.toscrape.com/catalogue/page-${i}.html`;
await scraper.goto(link, evaluate_page);
}
scraper.close();
})();
altho, if you want something more complex, you could take a look how they done at Apify project.

JS puppeteer using for loop to iterate over links

I am trying to iterate over unique youtube video links to get screenshot.
After debugging, I noticed for the forloop below, JS spawn 2 process threads, 1 for each index i . The processALink() function in the second thread seems to start before the processALink() in the first thread has ended fully.
Why is this happening? I thought using async/wait stops this from happening.
The forloop is inside a async function. The code below is just a snippet from the oringinal source code.
for(let i = 0; i<2; i++){
var link = linksArr[i];
var label = labelsArr[i];
await proccessALink(link, label)
}
Function def for processALink()
var proccessALink = async (link,label)=>{
//set download path
var downloadPath = 'data/train/'+label;
//parse the url
var urlToScreenshot = parseUrl(link)
//Give a URL it will take a screen shot
if (validUrl.isWebUri(urlToScreenshot)) {
// console.log('Screenshotting: ' + urlToScreenshot + '&t=' + req.query.t)
console.log('Screenshotting: ' + link)
;(async () => {
//Logic to login to youtube below
//await login();
//go to the url and wait till all the content is loaded.
await page.goto(link, {
waitUntil: 'networkidle'
//waitUntil: 'domcontentloaded'
})
//await page.waitForNavigation();
//Find the video player in the page
const video = await page.$('.html5-video-player')
await page.content();
//Run some command on consoleDev
await page.evaluate(() => {
// Hide youtube player controls.
let dom = document.querySelector('.ytp-chrome-bottom')
if(dom != null){
dom.style.display = 'none'
}
})
await video.screenshot({path: downloadPath});
})()
} else {
res.send('Invalid url: ' + urlToScreenshot)
}
}
Remove the IIFE inside processALink() and it should resolve the issue of running multiple screenshots at the same time.
const proccessALink = async(link, label) => {
//set download path
const downloadPath = 'data/train/' + label;
//parse the url
const urlToScreenshot = parseUrl(link)
//Give a URL it will take a screen shot
if (validUrl.isWebUri(urlToScreenshot)) {
// console.log('Screenshotting: ' + urlToScreenshot + '&t=' + req.query.t)
console.log('Screenshotting: ' + link);
//Logic to login to youtube below
//await login();
//go to the url and wait till all the content is loaded.
await page.goto(link, {
waitUntil: 'networkidle'
//waitUntil: 'domcontentloaded'
})
//await page.waitForNavigation();
//Find the video player in the page
const video = await page.$('.html5-video-player')
await page.content();
//Run some command on consoleDev
await page.evaluate(() => {
// Hide youtube player controls.
let dom = document.querySelector('.ytp-chrome-bottom')
if (dom != null) {
dom.style.display = 'none'
}
})
await video.screenshot({
path: downloadPath
});
} else {
res.send('Invalid url: ' + urlToScreenshot)
}
}

How do I fix this webscraper made using puppeteer which is doing nothing after scraping half data but not giving any error?

For my college project, I made a Wikipedia scraper using nodejs and puppeteer. It's working for all but one link. After scraping almost half the data of a table in that page (I am using the console.log to see which data has been scraped at that moment) it just does nothing. It does not show any error. It does not stop executing, it just does nothing after that. The puppeteer browser does not close either.
In the original scraper, I used a loop of links to generate data. As it was not working so I made a separate scraper for that link but the same thing is happening. Can anyone help me out?
const puppeteer = require('puppeteer');
const fs = require('fs');
(async () => {
try {
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.setViewport({ width: 1280, height: 800 });
link = "https://en.wikipedia.org/wiki/List_of_terrorist_incidents_in_June_2016";
console.log("==============================");
console.log("Travelling to link:", link);
console.log("==============================");
await page.goto(link, {waitUntil: 'networkidle0'});
let rowArray = await page.$$("table[class='wikitable sortable jquery-tablesorter'] > tbody > tr");
var dataA = [];
for(let row of rowArray){
let date = await row.$eval('td:nth-child(1)', element => element.textContent);
date = date.substring(0, date.length - 1);
let type = await row.$eval('td:nth-child(2)', element => element.textContent);
type = type.substring(0, type.length - 1);
let dead = await row.$eval('td:nth-child(3)', element => element.textContent);
dead = dead.substring(0, dead.length - 1);
let injured = await row.$eval('td:nth-child(4)', element => element.textContent);
injured = injured.substring(0, injured.length - 1);
let location = await row.$eval('td:nth-child(5)', element => element.textContent);
location = location.substring(0, location.length - 1);
let details = await row.$eval('td:nth-child(6)', element => element.textContent);
details = details.substring(0, details.length - 1);
let perpetrator = await row.$eval('td:nth-child(7)', element => element.textContent);
perpetrator = perpetrator.substring(0, perpetrator.length - 1);
let partOf = await row.$eval('td:nth-child(8)', element => element.textContent);
partOf = partOf.substring(0, partOf.length - 1);
console.log("==============================");
console.log({date, type, dead, injured, location, details, perpetrator, partOf});
console.log("==============================");
dataA.push({date, type, dead, injured, location, details, perpetrator, partOf});
}
console.log("==============================");
console.log("Started writing JSON file");
fs.writeFileSync(`./june.json`, JSON.stringify(dataA), 'utf-8');
console.log("Finished writing JSON file");
console.log("==============================");
await browser.close();
} catch (error) {
console.error();
}
})();
Just by looking at the point where it stops
It seems the script struggles with the next row which doesn't have a "closing cell"
My guess is if you edit that page and close it it will work (or update your script to handle that scenario)
Looking at the Wikipedia source, in that row the "part of" cell is missing, thus your code just hangs in the 'await' section
let partOf = await row.$eval('td:nth-child(8)', element => element.textContent);
Thus you get no errors.

Categories