I am trying to crawl a website using Node.js, making an HTTP request with Axios. I can only fetch the items that are available when the webpage first loads; none of the HTML that loads as you scroll further down is fetched.
Here is my code.
const axios = require('axios');
const cheerio = require('cheerio');

// table view
const url = "https://www.usnews.com/best-colleges/search?_sort=rank&_sortDirection=asc&study=Engineering&_mode=table";

fetchData(url).then((res) => {
    const html = res.data;
    const $ = cheerio.load(html);
    const unilist = $('.TableTabular__TableContainer-febmbj-0.guaRKP > tbody > tr > td');
    unilist.each(function () {
        // each table cell holds a div whose "name" attribute is the university name
        let title = $(this).find('div').attr("name");
        if (typeof title === 'string') {
            console.log(title);
        }
    });
});

async function fetchData(url) {
    console.log("Crawling data...");
    // make http call to url
    const response = await axios(url).catch((err) => console.log(err));
    if (!response || response.status !== 200) {
        console.log("Error occurred while fetching data");
        return;
    }
    return response;
}
I am trying to get all the university names. However, I am only able to get 13 universities because the others are loaded only when the page is manually scrolled down.
How do I access all the universities on the webpage: https://www.usnews.com/best-colleges/search?_sort=rank&_sortDirection=asc&study=Engineering&_mode=table
var request = require('request');

const url = "https://www.usnews.com/best-colleges/api/search?_sort=rank&_sortDirection=asc&_page=7&study=Engineering";

let options = {
    url: url,
    headers: {
        "authority": "www.usnews.com",
        "method": "GET",
        //"path": `/best-colleges/api/search?_sort=rank&_sortDirection=asc&_page=6&study=Engineering`,
        "scheme": "https",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        // paste your own browser cookie here (copy it from your browser's dev tools)
        "cookie": `<YOUR_BROWSER_COOKIE>`,
    }
};

request(options, function (err, resp, html) {
    if (!err) {
        var res = JSON.parse(html);
        var items = res.data.items;           // the universities on this page
        var totalItems = res.data.totalItems; // total result count
        var totalPages = res.data.totalPages; // number of pages to fetch
    }
});
Please try this code. You may have to put your own browser cookie in the request headers, since this site's API is restricted for other applications. The items, totalItems and totalPages fields show where the universities live in the result.
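Building on that, a minimal sketch of paging through the API until every page has been fetched. The _page parameter and the data.items/data.totalPages fields come from the response shape above; the cookie placeholder and the item.institution.displayName field used to pull the name out of each item are assumptions of mine, so inspect one real response to confirm the exact field names.

const axios = require('axios');

// Sketch: walk the paginated API and collect every university name.
// ASSUMPTIONS: the cookie placeholder, and that each item exposes the
// name at item.institution.displayName (verify against a real response).
async function fetchAllUniversities() {
    const base = 'https://www.usnews.com/best-colleges/api/search?_sort=rank&_sortDirection=asc&study=Engineering';
    const headers = { cookie: '<YOUR_BROWSER_COOKIE>' };
    let names = [];
    let page = 1;
    let totalPages = 1;
    while (page <= totalPages) {
        // axios body -> { data: { items, totalItems, totalPages } }
        const { data } = await axios.get(`${base}&_page=${page}`, { headers });
        totalPages = data.data.totalPages; // reported by the API itself
        names = names.concat(data.data.items.map((item) => item.institution.displayName));
        page++;
    }
    return names;
}

fetchAllUniversities().then((names) => console.log(names.length, names));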
I have an API that periodically makes a request to a REST API and stores the data in my database. Now I want to add a new feature for certain data: download some data via a request to my API. My API then makes another request to the REST API, but I don't want this data stored in my database; I want it downloaded as JSON or CSV.
I managed to create the request: I coded the request to the REST API and got the JSON into a variable with all the data. I'm stuck there. How do I get this data downloaded into a directory?
I'm using JavaScript (Node.js) with bent, getJSON, mssql, and https.
The code of the function:
async function descargarDatosOpendata(anioIni, anioFin, Indicativo) {
    const https = require("https"); // require once, outside the loop
    try {
        while (anioIni <= anioFin) {
            const options = {
                "method": "GET",
                "hostname": "opendata.aemet.es",
                "path": "/opendata/api/valores/climatologicos/mensualesanuales/datos/anioini/" + anioIni + "/aniofin/" + anioIni + "/estacion/" + Indicativo,
                "headers": {
                    "cache-control": "no-cache",
                    "api_key": "MYKEY"
                }
            };
            const req = https.request(options, function (res) {
                const chunks = [];
                res.on("data", function (chunk) {
                    chunks.push(chunk);
                });
                res.on("end", async function () {
                    const body = Buffer.concat(chunks);
                    const bodyJSON = JSON.parse(body.toString());
                    console.log(bodyJSON);
                    // the API answers with a status plus a URL (bodyJSON.datos) holding the real data
                    if (bodyJSON.estado == 200 && bodyJSON.descripcion == "exito") {
                        let obj = await getJSON(bodyJSON.datos);
                        console.log(obj);
                    }
                });
            });
            anioIni++;
            req.end();
        }
    }
    catch (err) {
        console.log(err);
    }
}
The logged obj is the data in JSON format: [{data}]
If this code is running in Node, you should use the Node fs module. The appendFile method will create a new file, or append to it if the file already exists. See the fs documentation to learn more.
Example code
var fs = require('fs');
fs.appendFile('mynewfile1.txt', 'Hello content!', function (err) {
if (err) throw err;
console.log('Saved!');
});
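Applied to the question, a minimal sketch of dumping the fetched object to disk as JSON once the request resolves. The guardarComoJSON helper, the descargas directory and the filename pattern are placeholders of my own, not part of the original code:

const fs = require('fs');
const path = require('path');

// Write the parsed API response to disk as pretty-printed JSON.
// HYPOTHETICAL helper: name, directory and filename are placeholders.
function guardarComoJSON(obj, anio, indicativo) {
    const destino = path.join(__dirname, 'descargas', `datos-${indicativo}-${anio}.json`);
    fs.mkdirSync(path.dirname(destino), { recursive: true }); // make sure the folder exists
    fs.writeFile(destino, JSON.stringify(obj, null, 2), (err) => {
        if (err) throw err;
        console.log('Saved:', destino);
    });
}

// Inside the "end" handler, after obj has been fetched:
// guardarComoJSON(obj, anioIni, Indicativo);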
I am trying to create a collection and a document once a document is created in Firebase.
It works fine in the emulator, but when I deploy my functions to the Firebase project, they don't create that collection and document.
But if I create fields in the existing document (snapshot), it works.
In the function I get data from an API and write it to the new document (which is not being created at the moment).
Function:
exports.quoteEndPoint = functions.firestore.document('users/{userID}/followedStocks/{stockID}')
.onCreate((snap, context) => {
const stock_id = context.params.stockID;
const user_id = context.params.userID;
var request = require('request');
var http = require('https');
const options = {
"method": "GET",
"hostname": "alpha-vantage.p.rapidapi.com",
"port": null,
"path": '/query?function=GLOBAL_QUOTE&symbol='+stock_id+'&datatype=json',
"headers": {
"x-rapidapi-host": "%API_HOST%",
"x-rapidapi-key": "%API_KEY%",
"useQueryString": true
}
};
const req = http.request(options, function(res){
const chunks = [];
res.on('data', function(chunk){
chunks.push(chunk);
});
res.on('end', function(){
const body = Buffer.concat(chunks);
//console.log(body.toString());
const result = JSON.parse(body.toString());
console.log(result);
//set values from the json response to Firebase
return snap.ref.collection('quoteEndPoint').doc('data').set(
{
'symbol': result['Global Quote']['01. symbol'],
'open': result['Global Quote']['02. open'],
}, { merge: true }).then(()=>{
console.log('New quoteEndPoint fields for ' + stock_id + ' added to Firebase');
})
.catch(err => {
console.log(err);
});
});
})
.on('error',(err) => {
console.log('Error: '+err.message);
});
req.end();
return true;
});
I tried making the function async, but it didn't work.
In the emulator it creates and populates the values at the right path: /users/7nDGdHmZDuoDiJkxixgz/followedStocks/AMD/quoteEndPoint/data
Can anyone help?
Thanks
Your problem most probably comes from the fact that a call with request does not return a promise, while in Cloud Functions triggered by background events (like .onCreate() for Firestore) you must return a promise. Watch the official video series for more details, in particular the three videos titled "Learn JavaScript Promises".
In addition, request is deprecated.
You can use axios, which returns a promise, or node-fetch. You need to chain the promises returned by the asynchronous operations, i.e. the axios and Firestore calls, as illustrated in the following code "skeleton":
const functions = require('firebase-functions');
const admin = require('firebase-admin');
const axios = require('axios');
exports.quoteEndPoint = functions.firestore.document('users/{userID}/followedStocks/{stockID}')
.onCreate((snap, context) => {
const stock_id = context.params.stockID;
const user_id = context.params.userID;
return axios({
method: 'get',
url: 'http://....'
// ... See the doc
})
.then(response => {
// ...
return snap.ref.collection('quoteEndPoint').doc('data').set(...);
});
});
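Equivalently, the same skeleton can be written with async/await; this is just a stylistic variant of the promise chain above, with the same elisions left for you to fill in:

exports.quoteEndPoint = functions.firestore.document('users/{userID}/followedStocks/{stockID}')
    .onCreate(async (snap, context) => {
        const stock_id = context.params.stockID;
        // awaiting inside an async handler still returns a promise to Cloud Functions
        const response = await axios.get('http://....'); // same call as above
        return snap.ref.collection('quoteEndPoint').doc('data').set({
            // ... fields picked out of response.data
        }, { merge: true });
    });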
My app starts with a simple HTML form; the inputs are PIN# and date of birth.
My Express server runs on the same port, 3000. When the user submits their data, Puppeteer starts and logs into a specific webpage, then I scrape an image on that page. The Google Vision API extracts the text from that image and saves it in an array, and I then post that array to src/results.html. But as soon as the user hits submit, they are redirected to the /results route immediately, and the page says it cannot post the data. When I see in the console (roughly a minute later) that the post was successful, I refresh the page and get the array of text I wanted to see.
How can I wait for the data to finish being posted to the route before the page loads it? I'm using React for the client side; below is my server-side code. The client side is just a basic React login page and a static /results page meant for the data.
const puppeteer = require("puppeteer");
const express = require("express");
const app = express();
const morgan = require("morgan");
const fs = require("fs");
const cors = require("cors");
const request = require("request-promise-native").defaults({ jar: true }); // note: lowercase "jar"
const poll = require("promise-poller").default;
app.use(morgan("combined"));
const port = 3000;
// Imports the Google Cloud client library
const vision = require("@google-cloud/vision");
require("dotenv").config();
app.use(cors());
const textArray = [];
const App = (pinNum, dateOfB) => {
const config = {
sitekey: process.env.SITEKEY,
pageurl: process.env.PAGEURL,
apiKey: process.env.APIKEY,
apiSubmitUrl: "http://2captcha.com/in.php",
apiRetrieveUrl: "http://2captcha.com/res.php",
};
const chromeOptions = {
executablePath: "/Program Files/Google/Chrome/Application/chrome.exe",
headless: true,
slowMo: 60,
defaultViewport: null,
};
async function main() {
const browser = await puppeteer.launch(chromeOptions);
const page = await browser.newPage();
console.log(`Navigating to ${config.pageurl}`);
await page.goto(config.pageurl);
try {
const requestId = await initiateCaptchaRequest(config.apiKey);
// const pin = getPIN();
console.log(`Typing PIN ${pinNum}`);
await page.type("#PIN", pinNum);
// const dob = getDOB();
console.log(`Typing DOB ${dateOfB}`);
const input = await page.$("#DOB");
await input.click({ clickCount: 3 });
await input.type(dateOfB);
const response = await pollForRequestResults(config.apiKey, requestId);
console.log(`Entering recaptcha response ${response}`);
await page.evaluate(
`document.getElementById("g-recaptcha-response").innerHTML="${response}";`
);
console.log(`Submitting....`);
page.click("#Submit");
} catch (error) {
console.log(
"Your request could not be completed at this time, please check your pin number and date of birth. Also make sure your internet connection is working and try again."
);
console.error(error);
}
await page.waitForSelector(
"body > div.container.body-content > div:nth-child(1) > div:nth-child(2) > p"
);
const image = await page.$(
"body > div.container.body-content > div:nth-child(1) > div:nth-child(2) > p"
);
await image.screenshot({
path: "testResults.png",
});
await getImageText();
await page.close(); // Close the website
await browser.close(); //close browser
await deleteImage();
}
main();
//This section grabs the text off the image that was gathered from the web scraper.
async function getImageText() {
// Creates a client
const client = new vision.ImageAnnotatorClient();
console.log(`Looking for text in image`);
// Performs label detection on the image file
const [result] = await client.textDetection("./testResults.png");
const [annotation] = result.textAnnotations;
const text = annotation ? annotation.description : "";
console.log("Extracted text from image:", text);
//Pushed the text into a globally available array.
textArray.push(text);
//Sent a NOTIFICATION ALERT to the client with the text gathered from the image.
var axios = require("axios");
var data = JSON.stringify({
to: "dp8vGNkcYKb-k-72j7t4Mo:APA91bEfrI3_ht89t5X1f3_Y_DACZc9DbWI4VzcYehaQoXtD_IHIFSwm9H1hgXHNq46BQwDTlCKzkWNAHbBGauEXZNQtvhQc8glz4sHQr3JY3KM7OkUEcNB7qMMpCPxRe5GzzHbe3rkE",
notification: {
body: text,
title: "AverHealth Schedule",
},
});
var config = {
method: "post",
url: "https://fcm.googleapis.com/fcm/send",
headers: {
"Content-Type": "application/json",
Authorization: `key=${process.env.FCM_SERVER_KEY}`,
},
data: data,
};
axios(config)
.then(function (response) {
console.log(JSON.stringify(response.data));
})
.catch(function (error) {
console.log(error);
});
}
//Captcha Solver for the web scraper
async function initiateCaptchaRequest(apiKey) {
const formData = {
key: apiKey,
method: "userrecaptcha",
googlekey: config.sitekey,
json: 1,
pageurl: config.pageurl,
};
console.log(
`Submitting recaptcha request to 2captcha for ${config.pageurl}`
);
const response = await request.post(config.apiSubmitUrl, {
form: formData,
});
console.log(response);
return JSON.parse(response).request;
}
async function pollForRequestResults(
key,
id,
retries = 90,
interval = 5000,
delay = 1500
) {
console.log(`Waiting for ${delay} milliseconds....`);
await timeout(delay);
return poll({
taskFn: requestCaptchaResults(key, id),
interval,
retries,
});
}
function requestCaptchaResults(apiKey, requestId) {
const url = `${config.apiRetrieveUrl}?key=${apiKey}&action=get&id=${requestId}&json=1`;
console.log(url);
return async function () {
return new Promise(async function (resolve, reject) {
console.log(`Polling for response...`);
const rawResponse = await request.get(url);
console.log(rawResponse);
const resp = JSON.parse(rawResponse);
console.log(resp);
if (resp.status === 0) return reject(resp.request);
console.log("Response received");
console.log(resp);
resolve(resp.request);
});
};
}
// DELETES THE FILE CREATED BY GOOGLEAPI
function deleteImage() {
const path = "./testResults.png";
try {
fs.unlinkSync(path);
console.log("File removed:", path);
} catch (err) {
console.error(err);
}
}
const timeout = (ms) => new Promise((res) => setTimeout(res, ms));
};
app.use(express.urlencoded({ extended: false }));
// Route to results Page
app.get("/results", (req, res) => {
res.sendFile(__dirname + "/src/results.html");
res.send(textArray);
});
app.post("/results", (req, res) => {
// Insert Login Code Here
let username = req.body.username;
let password = req.body.password;
App(username, password);
});
app.listen(port, () => {
console.log(`Scraper app listening at http://localhost:${port}`);
});
I think I see the problem.
In the React app, you may not be calling e.preventDefault() when you click submit. By default, the browser redirects to the page the form action points to; if the action attribute is empty, it reloads the same page. I would recommend calling e.preventDefault() on form submission and then using the fetch API to make the request.
In the Express server, the POST "/results" route never sends a response back to the user. You should always send a response. In your case you are calling the App function, which contains many async functions, but you are not awaiting App() to complete in the POST route, so Express sends its default response to the user as soon as it has called App(); it does not wait for App() to finish.
You can make the (req, res) => { ... } handler async (async (req, res) => { ... }), make App an async function as well, and then await App(...) in the route handler. You also need to await the main() function inside App(). Once the App() call has finished, you can send a redirect response to the user, as in the sketch below.
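A minimal sketch of that fix, assuming App is rewritten to return a promise that resolves with the scraped text (the res.json call and the error handling are my own additions, not part of the original code):

app.post("/results", async (req, res) => {
    try {
        // wait for the whole scrape to finish before responding
        const text = await App(req.body.username, req.body.password);
        res.json(text); // or res.redirect("/results") once the data is ready
    } catch (err) {
        console.error(err);
        res.status(500).send("Scrape failed, please try again.");
    }
});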
What I'm trying to do:
I'm trying to scrape all the images in a Discord channel and get their URLs by requesting the attachments, but I can't seem to find a way to request them.
Code
const fs = require("fs");
const fetch = require("node-fetch");
function readFileString(path) {
return fs.readFileSync(path, {encoding: "utf8"}).replace(/\r?\n|\r/g, "");
}
const token = readFileString("token.txt");
const channel = process.argv[2];
if(!channel) {
console.error("Usage: node index.js <channel id>");
process.exit(1);
}
const headers = {authorization: token};
async function request(before) {
const options = {
method: "GET",
headers: headers
};
const request = await fetch(
`https://discord.com/api/channels/${channel}/attachments`,
options
);
return await request.json();
}
let result;
async function go() {
let page = await request();
result = page;
while(page.length >= 100) {
page = await request(page[page.length - 1].id);
result = result.concat(page);
}
console.log(`Fetched ${result.length} images`);
fs.writeFileSync("links.json", JSON.stringify(result, null, 2));
}
go();
Output: Console
Fetched undefined images
Output: links.json
{
"message": "404: Not Found",
"code": 0
}
Any help on how I would get all the image links into the links.json file would be appreciated.
Looking at the docs, it seems the API does not allow you to make a GET request for message attachments.
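What does work is paging through the channel's messages endpoint, which supports GET, and collecting the attachment URLs from each message object. A sketch, reusing the channel, headers and fetch setup from the question; the endpoint, the limit/before query parameters and the attachments field are from the public Discord API docs:

// Fetch up to 100 messages at a time; pass "before" to page backwards.
async function requestMessages(before) {
    const params = new URLSearchParams({ limit: "100" });
    if (before) params.set("before", before);
    const response = await fetch(
        `https://discord.com/api/channels/${channel}/messages?${params}`,
        { method: "GET", headers: headers }
    );
    return await response.json();
}

// Each message carries an attachments array; every attachment has a url.
// const urls = page.flatMap((msg) => msg.attachments.map((att) => att.url));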
I'm new to Node.js and I need to upload some PDFs to an external API (Zip Forms).
Right now I have the code below, but the PDF pages are blank when they arrive at the destination. I tried saving the PDF locally, using the same binary data that I'm sending to the API, and the PDFs are saved correctly.
I am also using the setTimeout method here because I cannot find a way to wait for the PDF to finish downloading before sending it to the API.
I also tried binary instead of latin1 in the readFileSync method, but it doesn't change anything.
Code:
const aws = require('aws-sdk');
const https = require('https');
const request = require('request');
const { createWriteStream, readFileSync, writeFileSync } = require('fs');
const s3 = new aws.S3(); // Pass in opts to S3 if necessary
// Look up order and related info.
var order = await Order.findOne({ id })
.populate('agent');
if (createZiplogixTransaction) {
ziplogixTransactionId = await sails.helpers.ziplogix.createZiplogixTransaction.with({
ziplogixContextId: ziplogixContextId,
transactionName: order.propertyStreetAddress + ', ' + order.propertyCity,
// FUTURE: if the transaction helper is updated, include actual order information
// e.g. Primary seller name, property street address, etc.
});
}
if (!order) {
throw 'noSuchOrder';
}
// Permissions
if (this.req.me && this.req.me.accountType !== 'agent' && !ziplogixContextId) {
throw 'forbidden';
}
let savedPdfs = await PdfOrderExternalId.find({ orderId: id });
await PdfOrderExternalId.destroy({
where: { orderId: id }
});
for (const pdf of pdfs) {
let url = await s3.getSignedUrl('getObject', {
Bucket: 'disclosure-pdfs',
Key: pdf.uploadFd,
Expires: 60 * 5
});
let file = createWriteStream(`/tmp/${pdf.slug}.pdf`);
await https.get(url, async (response) => {
await response.pipe(file);
// Need to wait for file to write on disk :|. Doesn't work with await or Promise (Why? IDK)
setTimeout(async () => {
let postData = await readFileSync(`/tmp/${pdf.slug}.pdf`, 'latin1');
let queryString = `Name=${pdf.displayName}&Description=${pdf.displayName}`;
savedPdfs.forEach(item => {
if (item.pdfTemplate === pdf.pdfTemplate) {
queryString += `Id=${item.externalId}`;
}
});
request({
method: 'POST',
url: `${sails.config.custom.ziplogixApiBaseUrl}/transactions/${ziplogixTransactionId}/documents/file?${queryString}`,
headers: {
'X-Auth-ContextID': ziplogixContextId,
'X-Auth-SharedKey': sails.config.custom.ziplogixSharedKey,
'Content-Type': ['application/pdf', 'application/pdf']
},
body: postData
}, async (error, response, body) => {
// code here ...
});
}, 1000);
});
}
await exits.success(Date.now());
Any ideas what I'm doing wrong?
Thank you
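For what it's worth, the usual way to get rid of the setTimeout is to wrap the download in a promise that resolves on the write stream's finish event. A minimal sketch, assuming the same https/createWriteStream setup as above (the downloadToFile helper name is my own):

const https = require('https');
const { createWriteStream } = require('fs');

// Resolve only once the file has been fully flushed to disk.
function downloadToFile(url, destination) {
    return new Promise((resolve, reject) => {
        const file = createWriteStream(destination);
        https.get(url, (response) => {
            response.pipe(file);
            file.on('finish', () => file.close(resolve)); // fires after the last write
            file.on('error', reject);
        }).on('error', reject);
    });
}

// Inside the loop, instead of the setTimeout:
// await downloadToFile(url, `/tmp/${pdf.slug}.pdf`);
// let postData = readFileSync(`/tmp/${pdf.slug}.pdf`, 'latin1');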