I am trying to make a simple webscraper using Node and Puppeteer to get the titles of posts on reddit, but am having issues accessing a global variable, SUBREDDIT_NAME from within only one function, extractItems(). It works fine with every other function, but for that one I have to make a local variable with the same value for it to work.
Am I completely misunderstanding variable scope in Javascript?
I have tried everything I can think of, and the only thing that works is to create a local variable inside of extractedItems() with the value of "news", otherwise I get nothing.
const fs = require('fs');
const puppeteer = require('puppeteer');
const SUBREDDIT = (subreddit_name) => `https://reddit.com/r/${subreddit_name}/`;
const SUBREDDIT_NAME= "news";
function extractItems() {
const extractedElements = document.querySelectorAll(`a[href*='r/${SUBREDDIT_NAME}/comments/'] h3`);
const items = [];
for (let element of extractedElements) {
items.push(element.innerText);
}
return items;
}
async function scrapeInfiniteScrollItems(
page,
extractItems,
itemTargetCount,
scrollDelay = 1000,
) {
let items = [];
try {
let previousHeight;5
while (items.length < itemTargetCount) {
items = await page.evaluate(extractItems);
previousHeight = await page.evaluate('document.body.scrollHeight');
await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
await page.waitFor(scrollDelay);
}
} catch(e) { }
return items;
}
(async () => {
// Set up browser and page.
const browser = await puppeteer.launch({
headless: false,
args: ['--no-sandbox', '--disable-setuid-sandbox'],
});
const page = await browser.newPage();
page.setViewport({ width: 1280, height: 926 });
// Navigate to the demo page.
await page.goto(SUBREDDIT(SUBREDDIT_NAME));
// Scroll and extract items from the page.
const items = await scrapeInfiniteScrollItems(page, extractItems, 100);
// Save extracted items to a file.
fs.writeFileSync('./items.txt', items.join('\n') + '\n');
// Close the browser.
await browser.close();
})();
I expect a text file with the 100 first found titles, but it only works when I hardcode the subreddit into the extractItems() function.
The problem is that the extractItems function is converted to a string (without processing the template literal) and executed in the pages context where there is no SUBREDDIT_NAME variable.
You can fix that by doing something like this:
function extractItems(name) {
const extractedElements = document.querySelectorAll(`a[href*='r/${name}/comments/'] h3`);
const items = [];
for (let element of extractedElements) {
items.push(element.innerText);
}
return items;
}
page.evaluate(`(${extractItems})(${SUBREDDIT_NAME})`)
Related
I've read multiple posts on here about reading in table data, however, every post is how to read and use that data within a single test function. How do I read a table in, and use it in any test function within a test spec?
If I write this into a test function it works and gives back all the info correctly:
import { sAdmin } from "..authentication";
import TablePage from "../TablePage";
const tablePage = new TablePage();
fixture `A fixture`
.page`https://subdomain.example.com/#/table`
.beforeEach( async t => {
await t
.resizeWindow(1284, 722)
.useRole(sAdmin);
});
// this will work, but this won't share context with any other test function in the spec
// this should be initialized at the start to be able to verify search clears
test(`A test`, async t => {
const tableColumn = await tablePage.woNumber;
const tableCount = await tablePage.woNumber.count;
const cellData = [];
for (let i = 0; i < tableCount; i++) {
const text = await tableColumn.nth(i).innerText;
cellData.push(text);
}
console.log('in spec - tableCount', tableCount);
console.log('in spec - cellData', cellData);
});
The output of both console logs is correct:
in spec - tableCount 4
in spec - cellData [ '0007', '0003', '0005', '0006' ]
I've tried an async function in my test spec, and in my page object model (POM). Async function won't work in my spec unless it's within a test function. The one in the POM works, it'll get called, however i can't do const tableCount = await tablePage.woNumber.count; it will yell at me that I can't use a selector like that. It's because of the modifier .count. I adjusted the .count to be within the for loop but that just returned undefined or other data that didn't help.
Example of the async function in my page object model (TablePage)
async rowCount() {
const tableColumn = await tablePage.fooSelector;
const tableCount = await tablePage.fooSelector;
const cellData = [];
for (let i = 0; i < tableCount.count; i++) {
const text = await tableColumn.nth(i).innerText;
cellData.push(text);
}
console.log('in page - tableColumn', tableColumn);
console.log('in page - tableCount', tableCount);
console.log('in page - cellData', cellData);
return tableCount;
};
It's called with this in my spec file, not sure where to call it though:
const count = tablePage.rowCount();
I need this to run after the page has loaded to grab the cell data, and allow me to share context across all tests within this spec file at the very minimum. I'd prefer to put it in my POM, so it can be used elsewhere in other tests. But I'd settle for it working in my test spec without it being in a test function so it can be shared across all tests in the spec file.
I've tried to do a fixture context, but that also had issues and returned undefined. Here is a before with context that I tried, that didn't work.
.before( async ctx => {
const tableColumn = await tablePage.fooSelector;
const tableCount = await tablePage.fooSelector;
for (let i = 0; i < tableCount.count; i++) {
const text = await tableColumn.nth(i).innerText;
ctx.cellData.push(text);
}
// console.log('in spec - tableCount', tableCount);
// console.log('in in spec - cellData', cellData);
})
These console logs return undefined, or objects instead of the text.
Any help would be greatly appreciated. Here are resources I referenced already:
TestCafe - Storing results of Selector in variable
How do I can get a text of all the cells of the table using testcafe
Testcafe get text from element
https://testcafe.io/documentation/402670/reference/test-api/domnodestate
EDIT: I'm still looking for a way to share context of the data I get, I wasn't able to return the data back to the test spec. Maybe if I do more tinkering I can share the values I've obtained.
Here is my solution that doesn't share context, but it does let me prevent code reuse in every test function.
Page Object Model
import { Selector, t } from "testcafe";
class TablePage{
constructor() {
this.searchInput = Selector('#searchInput');
this.tableCount = Selector('.class-selector');
};
async validateSearchResults(selector, searchText) {
await t
.typeText(this.searchInput, searchText)
.pressKey('enter');
const rowCount = await this.tableCount.count;
let searchResults = []
for (let i = 0; i < rowCount; i++) {
let text = await selector.nth(i).innerText;
searchResults.push(text);
await t.expect(searchResults[i]).contains(searchText);
}
};
}
export default TablePage;
Spec File
import { sAdmin } from "..authentication";
import TablePage from "../TablePage";
const tablePage = new TablePage();
fixture `Test search functionality`
.page`https://examplepage.com`
.beforeEach( async t => {
await t
.useRole(sAdmin)
});
test(`User can search via order number`, async t => {
await tablePage.validateSearchResults(tablePage.tableCount, 'foo');
});
The const tableCount = await tablePage.selector.count; looks correct and should work. Also, it's necessary to call the async method with await keyword:
const count = await tablePage.rowCount();
Here is an example of a similar approach:
table-page.js:
import { Selector } from 'testcafe';
export class TablePage {
constructor () {
this.tableCells = Selector('#ContentHolder_grid_DXDataRow0 td');
this.cellData = [];
this.cellCount = 0;
}
async initCellData () {
this.cellCount = await this.tableCells.count;
for (let i = 0; i < this.cellCount; i++) {
const text = await this.tableCells.nth(i).innerText;
this.cellData.push(text);
}
}
}
test.js:
import { TablePage } from './table-page';
let page = null;
fixture `New Fixture`
.page `https://demos.devexpress.com/ASPxGridViewDemos/DataBinding/QueryBuilderControl.aspx`
.beforeEach(async () => {
page = new TablePage();
await page.initCellData();
});
test(`New Test`, async t => {
console.log('cells count: ', page.cellCount);
console.log('cells data: ', page.cellData);
});
I have no idea why the for-loop only run once.
My intention is making the code run through all elements in foodGroupIdsOnPage1.
But it only runs through the first element now.
Could anyone please explain to me? Thank you for your help in advance.
async function initialize() {
const browser = await playwright.chromium.launch({
headless: false
});
const context = await browser.newContext(); ////create a new browser context, which means no cookies and cache saved
const tab1 = await context.newPage();
return { tab1, context };
}
async function GotoPage2() { ////wait for function>>>only run the next command after all the commands inside the next bracket run
const page1_foodGroupButton = id.querySelector('a') ////beginning of the for loop
await page1_foodGroupButton.click();
};
async function main() {
const { tab1, context } = await initialize();
await tab1.goto('https://www.cfs.gov.hk/tc_chi/nutrient/search1.php');
const foodGroupIdsOnPage1 = await tab1.evaluate(async function getFoodGroupsOnPage1() {
return [...document.querySelector('.tableResponsive').querySelectorAll('td ')].map(e => e.id);
})
for (let id of foodGroupIdsOnPage1) {
await tab1.evaluate(id => {
const page1_foodGroupButton = document.querySelector('[id=' + `"${id}"` + ']').querySelector('a') ////beginning of the for loop
page1_foodGroupButton.click();
}, id);
await tab1.waitForTimeout(2000);
await tab1.click('[id^=grp] > a');
await tab1.waitForTimeout(2000);
const ArrayOfTabs = context.pages(); ////get how many tabs chromium are
let tab2 = ArrayOfTabs[1]; ////make active tab to the second tab
await tab2.evaluate(async function extractFoodGroupData() {
let tableOfAllFoods = [];
let rowsOnPage3 = document.querySelector(".colorTable2").querySelectorAll("tr");
for (let row_OnPage3 of rowsOnPage3) {
let arrayNutritionOfOneFood = [];
let cellsInOneRow = row_OnPage3.querySelectorAll("td");
for (let cell of cellsInOneRow) {
arrayNutritionOfOneFood.push(cell.innerText);
}
tableOfAllFoods.push(arrayNutritionOfOneFood);
}
});
tab2.close();
tab1.goBack();
return;
}
}
main();
I found out the solution.
The first problem is that return terminates the iteration. So delete it.
The second problem is that the second iteration cycle runs before
tab1.goBack();
Simply add await before tab1.goBack(); to fix the second problem.
I have the following function that clicks a link with a "rel=_nofollow" property, awaits the new page tab open and then loads the content, but occassionally the page will not load and instead of looping to the next page, the script hangs.
How can I put a timeout in here?
let clickAndWaitForTarget = async (clickSelector, page, browser) => {
const pageTarget = page.target();
await page.click(clickSelector);
const newTarget = await browser.waitForTarget(
target => target.opener() === pageTarget
);
const newPage = await newTarget.page(); //get the page object
await page.waitFor(10000);
return newPage;
};
I presume that if I can get it to timeout, I can then return false or something to check in the main code for success or failure?
newpage = await clickAndWaitForTarget("a.someSelector", page, browser);
if(newpage){
var url = await newpage.url();
...
Thanks in advance!
edit: I'm using Puppeteer version 2.0.0
waitForTarget has a timeout option. The default is 30 seconds, maybe that´s a lot for you.
You could do something like this:
let clickAndWaitForTarget = async (clickSelector, page, browser) => {
const pageTarget = page.target();
await page.click(clickSelector);
try {
const newTarget = await browser.waitForTarget(
target => target.opener() === pageTarget,
{ timeout: 3000} /*3 seconds instead*/
);
const newPage = await newTarget.page(); //get the page object
await page.waitFor(10000);
return newPage;
} catch {
return null;
}
};
So I'm trying to crawl a site using Puppeteer. All the data I'm looking to grab is in multiple tables. Specifically, I'm trying to grab the data from a single table. I was able to grab the specific table using a very verbose .querySelector(table.myclass ~ table.myclass), so now my issue is, my code is grabbing the first item of each table (starting from the correct table, which is the 2nd table), but I can't find a way to get it to just grab all the data in only the 2nd table.
const puppeteer = require('puppeteer');
const myUrl = "https://coolurl.com";
(async () => {
const browser = await puppeteer.launch({
headless: true
});
const page = (await browser.pages())[0];
await page.setViewport({
width: 1920,
height: 926
});
await page.goto(myUrl);
let gameData = await page.evaluate(() => {
let games = [];
let gamesElms = document.querySelectorAll('table.myclass ~ table.myclass');
gamesElms.forEach((gameelement) => {
let gameJson = {};
try {
gameJson.name = gameelement.querySelector('.myclass2').textContent;
} catch (exception) {
console.warn(exception);
}
games.push(gameJson);
});
return games;
})
console.log(gameData);
browser.close();
})();
You can use either of the following methods to select the second table:
let gamesElms = document.querySelectorAll('table.myclass')[1];
let gamesElms = document.querySelector('table.myclass:nth-child(2)');
Additionally, you can use the example below to push all of the data from the table to an array:
let games = Array.from(document.querySelectorAll('table.myclass:nth-child(2) tr'), e => {
return Array.from(e.querySelectorAll('th, td'), e => e.textContent);
});
// console.log(games[rowNum][cellNum]); <-- textContent
I'm having a play around with nodejs and the way we can now have a headless chrome browser and interact with it which is pretty awesome!
I have some code and I have it working for scraping 1 website without any issues. However when I want to scrape multiple my loop just seems to mess it up and i'm pretty sure it's all to do with async/await.
My loop is near the bottom of this code - does anyone have any suggestions?
Thanks heaps!
const HeadlessChrome = require('simple-headless-chrome')
const browser = new HeadlessChrome({
headless: true, // If you turn this off, you can actually see the browser navigate with your instructions,
})
async function navigateWebsite(urlToGoTo) {
try {
await browser.init()
const mainTab = await browser.newTab({
privateTab: false
})
await mainTab.inject('jquery')
let cookieName = 'li_at'
let cookieValue = 'cyzzzzzzzzz'
let cookieDomain = '.www.linkedin.com'
await mainTab.setCookie(cookieName, cookieValue, {
domain: cookieDomain
})
// Navigate to a URL
await mainTab.goTo(urlToGoTo)
await mainTab.wait(2000);
// Get a HTML tag value based on class id
let businessName = await mainTab.evaluate(function (selector) {
const selectorHtml = document.querySelector(selector)
return selectorHtml.innerHTML
}, '.org-top-card-module__name');
let industry = await mainTab.evaluate(function (selector) {
const selectorHtml = document.querySelector(selector)
return selectorHtml.innerHTML
}, '.company-industries');
let followers = await mainTab.evaluate(function (selector) {
const selectorHtml = document.querySelector(selector)
return selectorHtml.innerHTML
}, '.org-top-card-module__followers-count');
let details = {
businessName: cleanData(businessName),
industry: cleanData(industry),
followers: cleanData(followers)
}
console.log(details)
// Resize the viewport to full screen size (One use is to take full size screen shots)
await mainTab.resizeFullScreen()
// Take a screenshot
await mainTab.saveScreenshot()
// Close the browser
await browser.close()
} catch (err) {
console.log('ERROR!', err)
}
}
let websites = []
websites.push('https://www.linkedin.com/company/qrious-limited/')
websites.push('https://www.linkedin.com/company/wentworth-consulting-nz-/')
websites.push('https://www.linkedin.com/company/capita/')
websites.forEach(function (i) {
navigateWebsite(i)
})
function cleanData(a) {
return a.result.value.replace(/(\r\n|\n|\r)/gm, "").trim()
}
navigateWebsite() is asynchronous but it's not awaited. You could use Promise.all() mapping your list of websites to your nav function or make sure to await each result.
Promise.all(websites.map(w => navigateWebsite(w)));
// or
for (let w of websites) {
await navigateWebsite(w);
}