I'm doing some scraping after receiving HTML from an API. I'd like to do the following:
1. Open the HTML page in Chrome so I can find selectors in the console.
2. Immediately load the same HTML page into a jsdom instance.
3. Drop into the REPL - I can then find the right selectors in the console and test them out in a live jsdom environment to see if they work.
For 1, I have:
async function openHtml(htmlString) {
  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await page.setContent(htmlString);
  return;
  // await browser.close();
}
The code provided with the api is:
var req = http.request(options, function (res) {
  var chunks = [];
  res.on("data", function (chunk) {
    chunks.push(chunk);
  });
  res.on("end", function () {
    var body = Buffer.concat(chunks);
    response = JSON.parse(body); // response.content = html, response.cookies = cookies
    const dom = new JSDOM(response.content);
    console.log(dom.window.document.querySelector("p").textContent); // "Hello world"
    openHtml(response.content);
    console.log('hi');
  });
});
req.end();
If I run the code at the command line the browser opens as expected. However, if I set a breakpoint at:
console.log('hi');
It does not. How can I get this working?
openHtml is an async function, so you'll have to await the call (it returns a promise) and make the enclosing callback async as well.
var req = http.request(options, function (res) {
  var chunks = []
  res.on('data', function (chunk) {
    chunks.push(chunk)
  })
  res.on('end', async function () {
    var body = Buffer.concat(chunks)
    var response = JSON.parse(body) // response.content = html, response.cookies = cookies
    const dom = new JSDOM(response.content)
    console.log(dom.window.document.querySelector('p').textContent) // 'Hello world'
    await openHtml(response.content)
    console.log('hi')
  })
})
req.end()
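For the third step (dropping into a REPL with the jsdom instance available), a minimal sketch using Node's built-in repl module might look like the following; openHtml is the function from the question, and the inspect name, the prompt, and the context variable names are just examples:
const repl = require('repl');
const { JSDOM } = require('jsdom');
async function inspect(htmlString) {
  const dom = new JSDOM(htmlString);
  await openHtml(htmlString); // opens the visible Chrome window from the question
  const r = repl.start('scrape> '); // interactive prompt on stdin/stdout
  r.context.dom = dom; // e.g. dom.window.document.querySelector('p')
  r.context.document = dom.window.document;
}
Calling await inspect(response.content) in place of the two separate calls should leave you at a prompt where you can try selectors against the live jsdom instance while the same page is open in Chrome.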
I want to get the chat from a stream in real time with web scraping.
Attempting to create a while loop inside of the .then() function of Puppeteer doesn't seem to be effective, and in some implementations it breaks altogether.
I am able to get the initial scrape to happen, but in all cases the program ends and does not want to follow the while loop I implemented.
Working code WITHOUT while loop
const puppeteer = require ('puppeteer');
// initiating Puppeteer
puppeteer
  .launch ()
  .then (async browser => {
    // opening a new page and navigating to the live stream
    const page = await browser.newPage ();
    await page.goto ('https://www.younow.com/Ken_Nara24');
    await page.waitForSelector ('body');
    // manipulating the page's content
    let getComments = await page.evaluate (() => {
      let comments = document.body.querySelectorAll ('.comment');
      let scrapeItems = [];
      comments.forEach (item => {
        let commentAuthor = item.querySelector ('div.user-card__header.mini-profile-launcher').innerText;
        let commentContent = '';
        try {
          commentContent = item.querySelector ('div.user-card__body.ng-star-inserted').innerText;
        } catch (err) {}
        scrapeItems.push ({
          commentAuthor: commentAuthor,
          commentContent: commentContent,
        });
      });
      let items = {
        "userComments": scrapeItems,
      };
      return items;
    });
    // outputting the scraped data
    console.log (getComments);
    // closing the browser
    await browser.close ();
  })
  // handling any errors
  .catch (function (err) {
    console.error (err);
  });
All attempts to get that logic looping have been for naught. I cannot find a way or past issue/example that clearly defines how or if such a thing could be done. I have made a few attempts to implement it myself, but nothing has even compiled correctly.
Am I missing something significant here? I just want to essentially listen to a web page and every 3-5 seconds re-scrape it.
If you still need help, you could give this approach a try.
const puppeteer = require("puppeteer");
let pageScraping = false; /* set scraping to false */
const scraper = async () => {
if (pageScraping == true) return; /* check if already scraping page */
let browser, page;
let pageUrl = 'https://www.younow.com/Ken_Nara24';
try {
pageScraping = true; /* set scraping to true */
browser = await puppeteer.launch({ headless: true });
page = await browser.newPage();
await page.goto(pageUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
/* wait for chat to be visible */
await page.waitForSelector('.chat', { visible: true, timeout: 60000 });
let getComments = await page.evaluate(() => {
let scrapeComments = [];
let comments = document.querySelectorAll('.comment');
comments.forEach(comment => {
let commentContent = '';
let commentAuthor = comment.querySelector('div[class="user-card__header mini-profile-launcher"]').innerText;
commentContent = comment.querySelector('div[class="user-card__body ng-star-inserted"]').innerText;
scrapeComments.push({
'commentAuthor': commentAuthor,
'commentContent': commentContent,
});
});
return { 'userComments': scrapeComments };
});
console.log(await getComments); /* log comments */
} catch (err) {
console.log(err.message);
} finally {
if (browser) { /* check if browser is open befor trying to close */
await browser.close();
console.log('closing browser');
}
pageScraping = false; /* set scraping to false again */
await setTimeout(scraper, 5000); /* wait 5 seconds befor re-scraping */
}
}
setTimeout(scraper, 5000); /* start scraping */
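If you'd rather keep a single browser session open and loop inside it (closer to the while loop the question describes), a rough sketch along these lines should also work; the selectors are the ones from the question and may of course change on the site:
const puppeteer = require('puppeteer');
(async () => {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();
  await page.goto('https://www.younow.com/Ken_Nara24', { waitUntil: 'domcontentloaded' });
  while (true) { // re-scrape the live page every 5 seconds
    const comments = await page.evaluate(() =>
      Array.from(document.querySelectorAll('.comment')).map(item => ({
        commentAuthor: item.querySelector('div.user-card__header.mini-profile-launcher')?.innerText ?? '',
        commentContent: item.querySelector('div.user-card__body.ng-star-inserted')?.innerText ?? '',
      }))
    );
    console.log(comments);
    await new Promise(resolve => setTimeout(resolve, 5000)); // pause before the next pass
  }
})().catch(err => console.error(err));
The trade-off versus the setTimeout version above: relaunching the browser on every pass is slower but more resilient if the page gets into a bad state, while the in-session loop is lighter but never closes the browser.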
Using Puppeteer I'm able to navigate to a certain video src URL, and the MP4 (using a custom build of Chromium) plays fine.
NOW: I want to get the video data that's playing and send it to some kind of buffer in Node.js that can be saved as a file, sent to a client via a websocket, sent as a response, etc., but I'm not sure how to do it; all I have is the video playing.
I'm not able to just send the URL over to Node.js, because in order to view the video file you have to go through the whole Puppeteer crawling process (it's not just a static URL; it's dependent on that browser session, so only Puppeteer can view it).
SO: what can I do to get from a src URL to a file (or buffer) in Node.js? This is my current code, if it helps:
var puppeteer = require("puppeteer-core");
var http=require("https");
var fs=require("fs");
var fetch=require("fetch-node");
(async() => {
var browser = await puppeteer.launch({
executablePath:"./cobchrome/chrome.exe"
});
console.log("Got browser", browser);
var page = await browser.newPage();
console.log(page,"got page");
var agentStr = `Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0`;
var agent = await page.setUserAgent(agentStr);
console.log(agent, "Set the user agent");
// await page.goto("https://drive.google.com/file/d/17tkL8jPlBIh5XtcX_tNhyDV5nSX8v7f8/preview");
await page.goto("https://docs.google.com/file/d/1Cyuh41yNfYZU_zL-MHLf_EPJCYnlT7oJ/preview?enablejsapi=1&playerapiid=player4");
console.log("went to page..");
await page._client.send('Page.setDownloadBehavior', {behavior: 'allow', downloadPath: './downloadscob/'})
await page.screenshot({path:"shots/onopen.png"});
// var btn = await page.$(".ndfHFb-c4YZDc ndfHFb-c4YZDc-AHmuwe-Hr88gd-OWB6Me ndfHFb-c4YZDc-vyDMJf-aZ2wEe ndfHFb-c4YZDc-i5oIFb ndfHFb-c4YZDc-e1YmVc ndfHFb-c4YZDc-TSZdd");
// var tst = await page.$("#start-of-content");
var clickEl = ".ndfHFb-c4YZDc-aTv5jf-bVEB4e-RJLb9c";
var newClickID = ".ndfHFb-c4YZDc-aTv5jf-NziyQe-LgbsSe";
var clicker = await page.waitForSelector(newClickID);
console.log(clicker,"got clicker");
await page.screenshot({path:"shots/ongotclicker.png"});
await page.click(clickEl);
console.log("clicked")
await page.screenshot({path:"shots/onclicked.png"});
var frame = await page.waitForSelector("iframe[id=drive-viewer-video-player-object-0]");
console.log(frame, "got video frame");
await page.screenshot({path:"shots/ongotframe.png"});
var cf = await frame.contentFrame();
await page.screenshot({path:"shots/oncf.png"});
console.log(cf, "got content frame");
await cf.waitFor(() => !!document.querySelector("video"))
await page.screenshot({path:"shots/videoappeared.png"});
//await cf.waitFor(30000);
// var videos = await cf.$("video");
// console.log(videos, videos.length, "all videos");
var video = await cf.$("video");
await page.screenshot({path:"shots/selectedvideo.png"});
var videoEl = await cf.evaluate(
v =>{
var result = {};
for(var k in v) {
result[k] = v[k];
}
return result;
},
video
);
var src = videoEl.src;
var file = fs.createWriteStream("down.mp4");
console.log("starting to stream");
var req = http.get(src, r => {
console.log("finished pipin");
r.pipe(file); //I REALLY thought this would work but it doesn't do anything
});
var start = Date.now();
await page.screenshot({path:"shots/evalled_vido.png"});
console.log("$$###VIDEO SOURCE::", "time it took", src);
await page.goto(src);
await page.screenshot({path:"shots/wentToNewPage.png"});
// await page.waitFor(5000);
await page.screenshot({path:"shots/maybeItsPlayingNow.png"});
console.log("ABOUT t oFETHC wit H SOURCE", src)
var content = await page.content();
fs.writeFile("outputagain.txt", content, (re) => {
console.log("saved it?");
})
console.log(content);
// await browser.close();
})();
Currently, the page.content() at the end just gets the HTML content of the page, not any binary data.
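One approach that sometimes works for session-bound media like this is to capture the video response inside the same Puppeteer session instead of re-requesting the URL from Node: listen for responses on the page and save the body via response.buffer(). This is only a rough sketch under that assumption; the content-type filter is a guess at how the video response is identified, and chunked/range (HTTP 206) responses may need to be stitched together:
const fs = require("fs");
// register this before navigating / clicking play, on the same `page` as above
page.on("response", async (response) => {
  const type = response.headers()["content-type"] || "";
  if (!type.startsWith("video/")) return; // hypothetical filter; adjust for the real response
  try {
    const buffer = await response.buffer(); // body of the video response, as a Node Buffer
    fs.writeFileSync("down.mp4", buffer); // or push it to a websocket / HTTP response instead
    console.log("saved", buffer.length, "bytes from", response.url());
  } catch (err) {
    console.log("could not read response body:", err.message);
  }
});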
I am trying to scrape a website, but I don't get some of the elements, because they are created dynamically.
I use cheerio in Node.js, and my code is below.
var request = require('request');
var cheerio = require('cheerio');
var url = "http://www.bdtong.co.kr/index.php?c_category=C02";
request(url, function (err, res, html) {
  var $ = cheerio.load(html);
  $('.listMain > li').each(function () {
    console.log($(this).find('a').attr('href'));
  });
});
This code returns an empty result because, when the page is loaded, the <ul id="store_list" class="listMain"> is empty; the content has not been appended yet.
How can I get these elements using Node.js? How can I scrape pages with dynamic content?
Here you go:
var phantom = require('phantom');
phantom.create(function (ph) {
  ph.createPage(function (page) {
    var url = "http://www.bdtong.co.kr/index.php?c_category=C02";
    page.open(url, function() {
      page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
        page.evaluate(function() {
          $('.listMain > li').each(function () {
            console.log($(this).find('a').attr('href'));
          });
        }, function(){
          ph.exit()
        });
      });
    });
  });
});
Check out GoogleChrome/puppeteer
Headless Chrome Node API
It makes scraping pretty trivial. The following example will scrape the headline over at npmjs.com (assuming .npm-expansions remains):
const puppeteer = require('puppeteer');
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://www.npmjs.com/');
  const textContent = await page.evaluate(() => {
    return document.querySelector('.npm-expansions').textContent;
  });
  console.log(textContent); /* No Problem Mate */
  browser.close();
})();
evaluate allows you to inspect the dynamic elements, since it runs your script in the context of the page.
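Applied to the site from the question, the same idea would look something like the sketch below, assuming .listMain > li is still the markup the page renders once its scripts have run:
const puppeteer = require('puppeteer');
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('http://www.bdtong.co.kr/index.php?c_category=C02');
  // wait until the dynamically inserted list items actually exist
  await page.waitForSelector('.listMain > li a');
  const hrefs = await page.$$eval('.listMain > li a', links => links.map(a => a.href));
  console.log(hrefs);
  await browser.close();
})();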
Use the new npm module x-ray, with a pluggable web driver x-ray-phantom.
There are examples on the pages above, but here's how to do dynamic scraping:
var assert = require('assert');
var phantom = require('x-ray-phantom');
var Xray = require('x-ray');
var x = Xray()
  .driver(phantom());
x('http://google.com', 'title')(function (err, str) {
  if (err) throw err;
  assert.equal('Google', str); // the title, read after the page's scripts have run
  console.log(str);
})
Answering this as a canonical, an alternative to Puppeteer for scraping dynamic sites which is also well-supported as of 2023 is Playwright. Here's a simple example:
const playwright = require("playwright"); // ^1.28.1
let browser;
(async () => {
  browser = await playwright.chromium.launch();
  const page = await browser.newPage();
  await page.goto("https://example.com");
  const text = await page.locator('h1:text("Example")').textContent();
  console.log(text); // => Example Domain
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
The easiest and most reliable solution is to use Puppeteer. As mentioned in https://pusher.com/tutorials/web-scraper-node, it is suitable for both static and dynamic scraping.
The only change needed is to raise the timeout from 300000 to 3000000 in Browser.js, TimeoutSettings.js, and Launcher.js.
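Rather than patching Puppeteer's own files, the same effect can usually be had through the public API; a small sketch, assuming a recent Puppeteer version:
const puppeteer = require('puppeteer');
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  page.setDefaultTimeout(3000000); // applies to waitForSelector and similar calls
  page.setDefaultNavigationTimeout(3000000); // applies to page.goto and other navigations
  await page.goto('https://example.com', { timeout: 3000000 }); // or override per call
  await browser.close();
})();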
I have a Node app with a function that calls two other functions inside it. I want to use some async behavior for this. What is recommended in this case?
An example would be very helpful.
function myFunction(req, res) {
  // from here: this is the first part I want to wrap in a function
  var dataChunks = [],
    dataRaw,
    data;
  req.on("data", function (chunk) {
    dataChunks.push(chunk);
  });
  req.on("end", function () {
    dataRaw = Buffer.concat(dataChunks);
    data = dataRaw.toString();
    console.log(data);
    // here is the second part I want to wrap in a function and call after the first one
    var filePath = 'C://test.txt';
    var writeStream = fs.createWriteStream(filePath, {flags: 'w'});
    writeStream.write(data);
    res.status(200).send('ok');
  })
}
One more thing: as far as I've seen, async is how Node already works, so isn't adding an additional library like Q overkill?
I don't even see why you particularly need promises for this.
function myHandler(req, res) {
  var dataChunks = [],
    dataRaw,
    data;
  req.on("data", function (chunk) {
    dataChunks.push(chunk);
  });
  req.on("end", function () {
    dataRaw = Buffer.concat(dataChunks);
    data = dataRaw.toString();
    console.log(data);
    var filePath = 'C://test.txt';
    var writeStream = fs.createWriteStream(filePath, {flags: 'w'});
    writeStream.write(data);
    writeStream.on('finish', function() {
      res.status(200).send('ok');
    });
    writeStream.end();
  });
}
Or, you could probably pipe the incoming stream right into your file stream and write even less code.
function myHandler(req, res) {
  var filePath = 'C://test.txt';
  var writeStream = fs.createWriteStream(filePath, {flags: 'w'});
  req.pipe(writeStream);
  req.on("end", function() {
    res.status(200).send('ok');
  });
}
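If you do want a promise-based structure without pulling in a library like Q, Node's built-in stream pipeline helper can wrap the same flow; a minimal sketch, assuming Node 15+ for require('stream/promises'):
const fs = require('fs');
const { pipeline } = require('stream/promises');
async function myHandler(req, res) {
  try {
    // stream the request body straight into the file and await completion
    await pipeline(req, fs.createWriteStream('C://test.txt', { flags: 'w' }));
    res.status(200).send('ok');
  } catch (err) {
    res.status(500).send(err.message);
  }
}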