Multiple selector search via puppeteer library - javascript

I use puppeteer to get data about the store. I search using the p.shop-page-content__text_large and span.shop-list-item__address selectors, but I ran into the problem that only one of them may be present on the page. I tried to solve the problem in the following way, but it does not work. How can this be fixed?
// Scrape store addresses; a page may use either of two layouts, so wait for
// the primary selector and fall back to the secondary one on timeout.
const puppeteer = require('puppeteer');
const browser = await puppeteer.launch({
  headless: false,
  slowMo: 150,
});
const cities = [{'CITY': 'Town1', 'LINK': '/shops/town1/'}, {'CITY': 'Town2', 'LINK': '/shops/town2/'}];

// Extract a {COUNTRY, STREET} record from every element matching `selector`.
async function getData(page, selector) {
  return page.$$eval(selector, info => info.map((data) => {
    const str = data.textContent.trim();
    const from = str.search(',');
    const to = str.length;
    return {
      'COUNTRY': 'unknow',
      'STREET' : str.substring(from, to)
    };
  }));
}

const result = [];
for (const val of cities) {
  console.log(val.LINK, val.CITY);
  const page = await browser.newPage();
  await page.goto('https://www.example-site.ru' + val.LINK);
  // BUG FIX: waitForFunction() evaluates its argument as a JS expression, so
  // a bare CSS selector string never resolves — use waitForSelector() instead.
  const data = await page
    .waitForSelector('.shop-page-content')
    .then(() => {
      console.log('ok');
      return getData(page, 'p.shop-page-content__text_large');
    })
    .catch(async () => {
      // Primary layout absent: fall back to the shop-list layout.
      console.log('fail');
      await page.waitForSelector('.shops-info__section');
      return getData(page, 'span.shop-list-item__address');
    });
  result.push(data);
  // Close the per-city page so tabs don't accumulate.
  await page.close();
}
// BUG FIX: close the browser once, AFTER the loop — closing it inside the
// loop made every iteration after the first fail.
await browser.close();
console.log(result);

It turned out like this:
// Working version: inside the page context, prefer the primary selector and
// fall back to the secondary one when the primary matches nothing.
const browser = await puppeteer.launch({
  headless: false,
  slowMo: 150,
});
const cities = [{'CITY': 'Town1', 'LINK': '/shops/town1/'}, {'CITY': 'Town2', 'LINK': '/shops/town2/'}];
const page = await browser.newPage();
const result = [];
for (const entry of cities) {
  await page.goto('https://www.example-site.ru' + entry.LINK);
  const list = await page.evaluate(() => {
    const primary = document.querySelectorAll('p.shop-page-content__text_large');
    const nodes = primary.length
      ? primary
      : document.querySelectorAll('span.shop-list-item__address');
    // Collect the visible text of whichever node set matched.
    return Array.from(nodes, (node) => node.innerText);
  });
  result.push({
    link: entry.LINK,
    city: entry.CITY,
    list,
  });
}
await browser.close();

Related

JS WebDriverIO how to check if tag h1 is present on a page

I tried doing it like this:
await browser.url('https://webdriver.io/');
expect(browser).toHaveText('h1')
So I tried:
await browser.url('https://webdriver.io/');
await browser.findElement('xpath', 'h1')
But it doesn't help.
Can someone tell me how to check for the presence of the h1 tag in the loop? The test immediately crashes and does not print the pages that do not have an h1.
const url = 'https://webdriver.io/';
const crossFetch = require('cross-fetch');
const links = [];
describe('test', async () => {
  beforeEach(async function () {
    // Collect every page URL exposed by the WordPress REST API.
    const result = await crossFetch(`${url}/wp-json/wp/v2/pages/?per_page=100`);
    const pages = await result.json();
    pages.forEach(page => links.push(page.link));
  });
  it('Check Headers', async () => {
    const withH1 = [];
    const withoutH1 = [];
    for (const elem of links) {
      await browser.url(elem);
      // BUG FIX: $() always returns an element wrapper (truthy even when the
      // element is absent), so the else branch was unreachable, and calling
      // getText() on a missing element crashed the test. Ask WebdriverIO
      // whether the element actually exists instead. Also removed the no-op
      // `elem.click;` (elem is a URL string, not an element).
      const heading = await $('h1');
      if (await heading.isExisting()) {
        console.log('HAS h1 - ' + await heading.getText());
        withH1.push(elem);
      } else {
        withoutH1.push(elem);
      }
    }
    if (withoutH1.length > 0) {
      withoutH1.forEach(function (elem1) {
        console.log('NOT h1 - ' + elem1);
      });
    }
  });
});
This is how I solved the issue:
// Accepted approach: a WebdriverIO element lookup reports a missing element
// via an `error` property instead of throwing, so probe for that property.
const url = 'https://webdriver.io/';
const crossFetch = require('cross-fetch');
const links = [];
describe('test', async () => {
  beforeEach(async function () {
    const result = await crossFetch(`${url}/wp-json/wp/v2/pages/?per_page=100`);
    const pages = await result.json();
    pages.forEach(page => links.push(page.link));
  });
  it('Check Headers', async () => {
    const errorsUrls = [];
    for (const url of links) {
      await browser.url(url);
      // NOTE(review): the question asks about <h1> but this queries 'h3' —
      // presumably intentional for the author's site; confirm.
      const classNameAndText = await $('h3');
      if (classNameAndText.error !== undefined) {
        errorsUrls.push(url);
      }
    }
    // Fail the test with the full list of offending pages.
    if (errorsUrls.length > 0) {
      throw new Error( `
Header Not Found:
${errorsUrls.join('\r\n')}
`);
    }
  });
});

Execution context was destroyed, most likely because of a navigation Puppeteer

In my Puppeteer Node JS app I need to read localStorage and cookies from a browser web page, but for some reason I'm getting the following error:
UnhandledPromiseRejectionWarning: Error: Execution context was destroyed, most likely because of a navigation.
What am I doing wrong/missing from my JS:
const dayjs = require('dayjs');
const AdvancedFormat = require('dayjs/plugin/advancedFormat');
dayjs.extend(AdvancedFormat);
const puppeteer = require('puppeteer');
const { config } = require('./config');
const helpers = require('./helpers');
const logs = require('./logs');
// Emulate a browser visit to body.url with body.userAgent and record each
// document-level request (URL, redirect status, cookies, localStorage) into
// `journey`. Exported as runEmulation.
const runEmulation = async (body) => {
logs.debug('starting emulation');
// vars
const argOptions = [], journey = [];
// sandbox config
if ((config.puppeteer.run_in_sandbox === 'true')) {
argOptions.push('--no-sandbox');
}
// initiate a Puppeteer instance with options and launch
const browser = await puppeteer.launch({
args: argOptions,
headless: (config.puppeteer.run_in_headless === 'true') ? true : false
});
// launch a new page
const page = await browser.newPage()
// initiate a new CDP session
const client = await page.target().createCDPSession();
await client.send('Network.enable');
// NOTE(review): EventEmitter.on() returns the emitter, not a promise, so
// this `await` is a no-op. More importantly, this handler fires while the
// page is mid-navigation, and the page.evaluate() below then runs against a
// document that is being torn down — the likely source of the reported
// "Execution context was destroyed, most likely because of a navigation"
// error. Consider recording the event data here and deferring the
// evaluate()/cookies() calls until after navigation settles — confirm.
await client.on('Network.requestWillBeSent', async (e) => {
// if not a document, skip
if (e.type !== "Document") return;
// Runs in the page context: snapshot all localStorage key/value pairs.
const scrapablePageData = async () => {
function getLocalStorage () {
const values = [];
const keys = Object.keys(localStorage);
let index = keys.length;
// Walk the keys back-to-front, collecting each entry.
while (index--) {
values.push({
key: keys[index],
value: localStorage.getItem(keys[index])
});
}
// NOTE(review): `values` is always an array (truthy even when empty),
// so the `: []` branch of this ternary is dead code.
return values ? values : [];
}
return {
localStorage: getLocalStorage()
}
}
const scrapable = await page.evaluate(scrapablePageData);
const cookies = await page.cookies();
// the data we want to log
journey.push({
url: e.documentURL,
type: e.redirectResponse ? e.redirectResponse.status : 'JS Redirection',
storage: {
cookies: cookies ?? [],
local: scrapable.localStorage ?? []
},
duration_in_ms: 0,
duration_in_sec: 0,
loaded_at: dayjs().valueOf()
})
})
// set userAgent and go to the URL
await page.setUserAgent(body.userAgent);
await page.goto(body.url);
// NOTE(review): goto() already waits for the navigation it triggers; this
// extra waitForNavigation() may simply time out unless the page performs a
// further (e.g. JS-driven) navigation — confirm the intent.
await page.waitForNavigation();
console.log(journey)
}
exports.runEmulation = runEmulation

how to split the sentence that is rendered from html using query select all

// Scrape Facebook Marketplace car listings (price + title per card).
const puppeteer = require('puppeteer'); // BUG FIX: require was missing

(async () => {
  let browser, page;
  let url = "https://www.facebook.com/marketplace/nyc/search/?query=cars";
  browser = await puppeteer.launch({ headless: false });
  page = await browser.newPage();
  await page.goto(url, { waitUntil: "networkidle2" });
  console.log("scraping...");
  let data = await page.evaluate(() => {
    let cars = [];
    // NOTE: querySelectorAll() always returns a (possibly empty) NodeList,
    // so the original truthiness check around it was a no-op and is dropped.
    document
      .querySelectorAll('div[class="fome6x0j tkqzz1yd aodizinl fjf4s8hc f7vcsfb0"]')
      .forEach((element) => {
        let prices = element.querySelectorAll(
          'span[class="d2edcug0 hpfvmrgz qv66sw1b"]'
        );
        let listings = element.querySelectorAll(
          'span[class="a8c37x1j ni8dbmo4 stjgntxs l9j0dhe7"]'
        );
        // BUG FIX: the loop bound referenced an undefined `addLists`;
        // iterate over the listings that were actually queried.
        for (let i = 0; i < listings.length; i++) {
          const car = {
            price: prices[i].innerText,
            listing: listings[i].innerText,
            sentence1: '', // BUG FIX: object literal was missing the commas
            sentence2: ''  // here and on the line above (syntax error)
          };
          cars.push(car);
        }
      });
    return cars;
  });
  console.log(data);
})(); // BUG FIX: removed an extra stray `}` that unbalanced the IIFE
How do I split the sentence that is rendered using listings? listings is the list of Facebook ad titles. I tried using the split() function but it doesn't work. Is there any way that I can split that sentence and store it in sentence1 and sentence2?
You can give this a try.
// Answer: pull price/listing/sentence fields out of each marketplace card.
(async () => {
  let browser, page;
  let url = 'https://www.facebook.com/marketplace/nyc/search/?query=cars';
  try {
    browser = await puppeteer.launch({ headless: false });
    page = await browser.newPage();
    await page.setViewport({ width: 1366, height: 500 });
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
    const data = await page.evaluate(() => {
      const cars = [];
      const cards = document.getElementsByClassName('sonix8o1');
      if (cards) {
        for (const card of Array.from(cards)) {
          const text = card.innerText;
          // Skip the chrome/navigation cards that are not listings.
          if (text == "" || text == "Create New Listing" || text == "Log In" || text == "Learn More") {
            continue;
          }
          const wrapper = card.getElementsByClassName('rq0escxv')[0];
          const fields = wrapper.getElementsByClassName('aahdfvyu');
          cars.push({
            price: fields[0].innerText,
            listing: fields[1].innerText,
            sentence1: fields[2].innerText,
            sentence2: fields[3].innerText
          });
        }
      }
      return cars;
    });
    data.forEach(e => console.log(e));
  } catch (error) {
    console.log(error.message);
  } finally {
    if (browser) {
      await browser.close();
      console.log('closing browser');
    }
  }
})();
Update from the comment.
// Updated answer: derive sentence1/sentence2 by splitting the listing text.
(async () => {
  let browser, page;
  let url = 'https://www.facebook.com/marketplace/nyc/search/?query=cars';
  try {
    browser = await puppeteer.launch({ headless: false });
    page = await browser.newPage();
    await page.setViewport({ width: 1366, height: 500 });
    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 });
    const data = await page.evaluate(() => {
      const results = [];
      const cards = document.getElementsByClassName('sonix8o1');
      if (cards) {
        for (const card of Array.from(cards)) {
          const text = card.innerText;
          if (text == "" || text == "Create New Listing" || text == "Log In" || text == "Learn More") {
            continue;
          }
          const wrapper = card.getElementsByClassName('rq0escxv')[0];
          const fields = wrapper.getElementsByClassName('aahdfvyu');
          // First and second whitespace-separated words of the listing title.
          const words = fields[1].innerText.split(' ');
          results.push({
            price: fields[0].innerText,
            listing: fields[1].innerText,
            sentence1: words[0],
            sentence2: words[1]
          });
        }
      }
      return results;
    });
    data.forEach(e => console.log(e));
  } catch (error) {
    console.log(error.message);
  } finally {
    if (browser) {
      await browser.close();
      console.log('closing browser');
    }
  }
})();
That returns:

How to get the selector of an element from a web page with more than one HTML document?

I try to get information from a web page using puppeteer, but I can't find the selector that I need. I suppose that's because the page contains more than one HTML document, and I can't find a way to get the data that I need.
the inspection of the page
That's the code:
const puppeteer = require('puppeteer');
(async () => {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto('https://www.arrivia.com/careers/job-openings/');
  await page.waitForSelector('.job-search-result');
  // Collect the href of every job link on the listing page.
  const data = await page.evaluate(() => {
    const elements = document.querySelectorAll('.job-search-result .job-btn-container a');
    // BUG FIX: `vacancies` and `element` were implicit globals in the page
    // context; declare them properly.
    const vacancies = [];
    for (const element of elements) {
      vacancies.push(element.href);
    }
    return vacancies;
  });
  console.log(data.length);
  const vacancies = [];
  for (let i = 0; i <= 2; i++) {
    const urljob = data[i];
    await page.goto(data[i]);
    // NOTE(review): the question reports this wait fails — '.app-title'
    // appears to live inside an embedded document (iframe), so waiting on
    // the top-level page may never resolve; confirm and consider loading the
    // iframe's own URL instead.
    await page.waitForSelector(".app-title");
    // BUG FIX: the page function referenced `urljob` but it was never passed
    // as an argument to evaluate(); also removed a stray `close` token that
    // followed the `return job;` statement (syntax error).
    const jobs = await page.evaluate((urljob) => {
      const job = {};
      job.title = document.querySelector(".app-title").innerText;
      job.location = document.querySelector(".location").innerText;
      job.url = urljob;
      return job;
    }, urljob);
    vacancies.push(jobs);
  }
  console.log(vacancies);
  //await page.screenshot({ path: 'xx1.jpg'});
  await browser.close()
})();
Iframes are not always the easiest things to deal with, in Puppeteer. But a way to bypass this could be to access directly the URL of the iframe, instead of accessing the page which hosts the iframe. It's also faster:
// Bypass the iframe entirely: visit the greenhouse.io embed URL directly,
// so the selectors resolve in the top-level document (and it's faster).
const puppeteer = require("puppeteer");
(async () => {
  const browser = await puppeteer.launch({ headless: false, defaultViewport: null });
  const page = await browser.newPage();
  await page.goto("https://www.arrivia.com/careers/job-openings/", {
    waitUntil: "domcontentloaded",
  });
  const links = await page.$$eval(".job-search-result .job-btn-container a",
    els => els.map(el => el.href));
  const vacancies = [];
  for (let idx = 0; idx < 10; idx++) { // don't forget to replace 10 with links.length later
    const url = links[idx];
    // Extract the job ID from the link.
    const token = /job_id=(\d+)/.exec(url)[1];
    // Go to the iframe's own URL.
    await page.goto(
      `https://boards.greenhouse.io/embed/job_app?token=${token}`,
      { waitUntil: "domcontentloaded" }
    );
    const title = await page.$eval(".app-title", el => el.innerText);
    const location = await page.$eval(".location", el => el.innerText);
    vacancies.push({ title, location, url });
  }
  console.log(vacancies);
  await browser.close();
})();
Output:
[
{
title: 'Director of Account Management',
location: 'Scottsdale, AZ',
url: 'https://www.arrivia.com/careers/job/?job_id=2529695'
},
{
title: "Site Admin and Director's Assistant",
location: 'Albufeira, Portugal',
url: 'https://www.arrivia.com/careers/job/?job_id=2540303'
},
...
]

Puppeteer white or gray screenshot

I am trying to take screenshots of all the table elements in a web page. For some web pages it is working perfectly fine; I am able to take pictures. But some websites are not working: I am able to take screenshots, but they are mostly white or gray. Here is the code I am using.
const puppeteer = require('puppeteer');
const jsonfile = require('jsonfile');

// Screenshot every <table> on `pageToGo`, writing one PNG per table.
// `link` is forwarded into the page context (kept for interface
// compatibility; the measurement itself does not use it).
const getWebImages = async (pageToGo, link) => {
  const browser = await puppeteer.launch({
    args: ['--start-maximized'],
    headless: false,
    defaultViewport: null
  });
  try {
    const page = await browser.newPage();
    await page.goto(pageToGo, { waitUntil: 'networkidle2', timeout: 60000 });

    // Measure every table's bounding box in the page context.
    const getData = async (link) => {
      return page.evaluate((link) => {
        // NOTE: the original wrapped this in a redundant `new Promise`;
        // page.evaluate already resolves with the returned value.
        const rects = [];
        document.querySelectorAll('table').forEach((item, index) => {
          const box = item.getBoundingClientRect();
          rects.push({
            x: box.left,
            // BUG FIX: `y` was set to box.left instead of box.top, so every
            // clip started at the wrong vertical offset — producing the
            // blank white/gray screenshots described in the question.
            y: box.top,
            width: box.width,
            height: box.height,
            id: index
          });
        });
        return rects;
      }, link);
    };

    // Clip-screenshot each measured rectangle.
    const getImages = async (rects) => {
      for (const item of rects) {
        try {
          await page.screenshot({
            path: 'data\\_table_' + item.id + '.png',
            clip: {
              x: item.x,
              y: item.y,
              width: item.width,
              height: item.height
            }
          });
        } catch (e) {
          console.log(e);
        }
      }
    };

    // BUG FIX: `boxes2`/`images` were implicit globals; `images` was unused.
    const boxes2 = await getData(link);
    await getImages(boxes2);
    console.log(boxes2);
  } finally {
    // Close the browser even if navigation or screenshotting throws.
    await browser.close();
  }
};
getWebImages("https://www.csb.gc.ca/rates/", 11);
I have tried different screen sizes and other things, like waiting for everything to load. When I watch the browser, I can clearly see the page load, and after it loads the screenshots are taken, but the images are just white screens the same size as the table area.
NOTE: Just a note that I also downloaded some of the pages offline, and even that is not working.
My problem was that I was setting my viewport after goto(). I changed the code to this:
const puppeteer = require('puppeteer');

// Fix: set the viewport BEFORE goto(), then screenshot each <table> element
// handle directly instead of clipping the page manually.
// (Also removed a stray leading backtick — a markdown paste artifact that
// made the snippet a syntax error.)
async function run(url) {
  let browser = await puppeteer.launch({ headless: true });
  let page = await browser.newPage();
  const VIEWPORT = { width: 1360, height: 780 };
  await page.setViewport(VIEWPORT);
  await page.goto(url, { waitUntil: 'domcontentloaded' });
  // BUG FIX: page.waitFor() is deprecated (removed in current Puppeteer) and
  // duplicated the waitForSelector() call that followed; one wait suffices.
  await page.waitForSelector('table');
  const el = await page.$$('table');
  for (let i = 0; i < el.length; i++) {
    // Element-handle screenshots compute their own clip region.
    await el[i].screenshot({
      path: 'link_' + i + '.png',
    });
  }
  await page.close();
  await browser.close();
}

Categories