What I'm trying to do:
I'm trying to scrape all of the images in a Discord channel and collect their URLs by requesting the attachments, but I can't find an endpoint that returns them.
Code
const fs = require("fs");
const fetch = require("node-fetch");

function readFileString(path) {
  return fs.readFileSync(path, {encoding: "utf8"}).replace(/\r?\n|\r/g, "");
}

const token = readFileString("token.txt");
const channel = process.argv[2];
if (!channel) {
  console.error("Usage: node index.js <channel id>");
  process.exit(1);
}

const headers = {authorization: token};

async function request(before) {
  const options = {
    method: "GET",
    headers: headers
  };
  // pass the last seen id as `before` so the next page starts after it
  const query = before ? `?before=${before}` : "";
  const response = await fetch(
    `https://discord.com/api/channels/${channel}/attachments${query}`,
    options
  );
  return await response.json();
}

let result;

async function go() {
  let page = await request();
  result = page;
  while (page.length >= 100) {
    page = await request(page[page.length - 1].id);
    result = result.concat(page);
  }
  console.log(`Fetched ${result.length} images`);
  fs.writeFileSync("links.json", JSON.stringify(result, null, 2));
}

go();
Output: Console
Fetched undefined images
Output: links.json
{
  "message": "404: Not Found",
  "code": 0
}
Any help on how I could get all of the image links into links.json would be appreciated.
Looking at the docs, it seems the API does not expose a GET endpoint for a channel's attachments directly.
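Attachments do come back as part of message objects, though, so one workaround is to page through GET /channels/{channel.id}/messages, which the Discord API does document (with limit up to 100 and a before query parameter), and collect the attachment URLs from each message. A minimal sketch reusing the fs, fetch, token, channel, and headers from the code above:

// Fetch one page of up to 100 messages, optionally before a given message id
async function requestMessages(before) {
  const query = `?limit=100${before ? `&before=${before}` : ""}`;
  const response = await fetch(
    `https://discord.com/api/channels/${channel}/messages${query}`,
    { method: "GET", headers }
  );
  return await response.json();
}

async function go() {
  const links = [];
  let page = await requestMessages();
  while (page.length > 0) {
    // Each message carries an `attachments` array; keep only the URLs
    for (const message of page) {
      for (const attachment of message.attachments) {
        links.push(attachment.url);
      }
    }
    if (page.length < 100) break;
    // messages come back newest-first, so the last one is the oldest
    page = await requestMessages(page[page.length - 1].id);
  }
  console.log(`Fetched ${links.length} attachment links`);
  fs.writeFileSync("links.json", JSON.stringify(links, null, 2));
}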
I have never had this happen before and am not sure why it's happening.
I have a component, part of a larger application, written to display PDF files in an iframe. I am retrieving a BLOB stream from the server and attempting to create a URL for it to display in the iframe, but it keeps giving me a cross-origin error, which I thought would not be possible since the URL is created out of data.
Here is my entire component:
import React, { useState, useEffect } from 'react'
import IFrameComponent from '../Elements/IFrameComponent';

const PDFPages = (props) => {
  let [file, setFile] = useState(null)
  let [notFound, show404] = useState(false)

  useEffect(() => {
    let id = props.site?.componentFile;
    fetch(`${process.env.REACT_APP_HOST}/documents/GetPDF`,
      {
        method: 'POST'
        , headers: {
          'Content-Type': 'application/json'
        }
        , credentials: 'include'
        , body: JSON.stringify({ file: id })
      })
      .then(async response => {
        let blob;
        let b64; // declared here so the reference below is in scope, not just inside the catch
        try {
          blob = await response.blob(); // <--- this functions correctly
        }
        catch (ex) {
          b64 = await response.json()
          blob = Buffer.from(b64.fileData, 'base64')
        }
        //Create a Blob from the PDF Stream
        //Build a URL from the file
        const str = `data:application/pdf;base64,${b64.fileData}`
        const url = URL.createObjectURL(blob) //<--- ERROR IS THROWN HERE
        setFile(url);
      })
      .catch(error => {
        show404(true)
      });
  }, []);

  if (!notFound) {
    return <IFrameComponent src={file} title=''>
      Please enable iFrames in your browser for this page to function correctly
    </IFrameComponent>
  }
  else {
    return (
      <>
        <h3> File {file} could not be found on server</h3>
      </>
    )
  }
}

export default PDFPages;
For completeness, here is the GetPDF function on the server which sends the file.
router.post('/GetPDF', async (req, res, next) => {
  const props = req.body;
  let fileName = props.file;
  try {
    fileName = fileName.replace(/%20/g, " ");
    let options = {};
    if (props.base64) options.encoding = 'base64'
    let data = await dataQuery.loadFile(`./data/documentation/${fileName}`, options);
    if (!props.base64) {
      res.attachment = "filename=" + fileName
      res.contentType = 'application/pdf'
      res.send(data);
    }
    else {
      res.send({ fileData: data, fileName: fileName });
    }
  }
  catch (ex) {
    res.send({ error: true })
  }
});
I have done very little work in Node sending files, but am positive my client code is good. Where am I going wrong here?
The problem was that I was trying to be too fancy by sending a BLOB or Base64 data. After investigating, I rewrote
router.post('/GetPDF', async (req, res, next) => {
  const props = req.body;
  let fileName = props.file;
  try {
    fileName = fileName.replace(/%20/g, " ");
    let options = {};
    if (props.base64) options.encoding = 'base64'
    let data = await dataQuery.loadFile(`./data/documentation/${fileName}`, options);
    if (!props.base64) {
      res.attachment = "filename=" + fileName
      res.contentType = 'application/pdf'
      res.send(data);
    }
    else {
      res.send({ fileData: data, fileName: fileName });
    }
  }
  catch (ex) {
    res.send({ error: true })
  }
});
on the server to
router.get('/GetPDF/:fileName', async (req, res, next) => {
  let fileName = req.params.fileName
  fileName = `./data/documentation/${fileName.replace(/%20/g, " ")}`;
  try {
    let data = await dataQuery.loadFile(fileName);
    res.contentType("application/pdf");
    res.send(data);
  }
  catch (ex) {
    res.send({ error: true })
  }
});
Then calling it from the client using
const url = `${process.env.REACT_APP_HOST}/documents/GetPDF/${props.site.componentFile}`
as the iFrame src sends the PDF properly as expected.
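With the GET route in place, the client no longer needs the fetch/Blob logic at all. A minimal sketch of the simplified component body, using the same IFrameComponent and props as above:

// The iframe points straight at the server route; no Blob or object URL involved
const url = `${process.env.REACT_APP_HOST}/documents/GetPDF/${props.site.componentFile}`;
return <IFrameComponent src={url} title='' />;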
This same method also solved another problem with HTML pages sent from the server not functioning correctly.
My app starts with a simple HTML form; the inputs are PIN# and Date Of Birth.
My Express server runs on the same port, 3000. When the user submits their data, Puppeteer starts and logs into a specific webpage, and I scrape the image on that webpage. The Google API takes the text from that image and saves it in an array. I then post that array string to src/results.html. But as soon as the user hits submit, they are redirected to the /results route immediately, and the page says it cannot post the data. When I see in the console (roughly a minute later) that the post was successful, I refresh the page and get the array of text I wanted.
How can I wait for the data to finish being posted to the route before the page loads it? I'm using React for the client side; below is my server-side code. The client side is just a basic React login page and a static /results page meant for the data.
const puppeteer = require("puppeteer");
const express = require("express");
const app = express();
const morgan = require("morgan");
const fs = require("fs");
const cors = require("cors");
// `jar` (lowercase) is the request option that enables the cookie jar
const request = require("request-promise-native").defaults({ jar: true });
const poll = require("promise-poller").default;
app.use(morgan("combined"));
const port = 3000;

// Imports the Google Cloud client library
const vision = require("@google-cloud/vision");
require("dotenv").config();
app.use(cors());
const textArray = [];
const App = (pinNum, dateOfB) => {
  const config = {
    sitekey: process.env.SITEKEY,
    pageurl: process.env.PAGEURL,
    apiKey: process.env.APIKEY,
    apiSubmitUrl: "http://2captcha.com/in.php",
    apiRetrieveUrl: "http://2captcha.com/res.php",
  };
  const chromeOptions = {
    executablePath: "/Program Files/Google/Chrome/Application/chrome.exe",
    headless: true,
    slowMo: 60,
    defaultViewport: null,
  };

  async function main() {
    const browser = await puppeteer.launch(chromeOptions);
    const page = await browser.newPage();
    console.log(`Navigating to ${config.pageurl}`);
    await page.goto(config.pageurl);
    try {
      const requestId = await initiateCaptchaRequest(config.apiKey);
      // const pin = getPIN();
      console.log(`Typing PIN ${pinNum}`);
      await page.type("#PIN", pinNum);
      // const dob = getDOB();
      console.log(`Typing DOB ${dateOfB}`);
      const input = await page.$("#DOB");
      await input.click({ clickCount: 3 });
      await input.type(dateOfB);
      const response = await pollForRequestResults(config.apiKey, requestId);
      console.log(`Entering recaptcha response ${response}`);
      await page.evaluate(
        `document.getElementById("g-recaptcha-response").innerHTML="${response}";`
      );
      console.log(`Submitting....`);
      await page.click("#Submit"); // await so the click is not left dangling
    } catch (error) {
      console.log(
        "Your request could not be completed at this time, please check your pin number and date of birth. Also make sure your internet connection is working and try again."
      );
      console.error(error);
    }
    await page.waitForSelector(
      "body > div.container.body-content > div:nth-child(1) > div:nth-child(2) > p"
    );
    const image = await page.$(
      "body > div.container.body-content > div:nth-child(1) > div:nth-child(2) > p"
    );
    await image.screenshot({
      path: "testResults.png",
    });
    await getImageText();
    await page.close(); // Close the website
    await browser.close(); //close browser
    await deleteImage();
  }
  main();
  //This section grabs the text off the image that was gathered from the web scraper.
  async function getImageText() {
    // Creates a client
    const client = new vision.ImageAnnotatorClient();
    console.log(`Looking for text in image`);
    // Performs label detection on the image file
    const [result] = await client.textDetection("./testResults.png");
    const [annotation] = result.textAnnotations;
    const text = annotation ? annotation.description : "";
    console.log("Extracted text from image:", text);
    //Pushed the text into a globally available array.
    textArray.push(text);
    //Sent a NOTIFICATION ALERT to the client with the text gathered from the image.
    var axios = require("axios");
    var data = JSON.stringify({
      to: "dp8vGNkcYKb-k-72j7t4Mo:APA91bEfrI3_ht89t5X1f3_Y_DACZc9DbWI4VzcYehaQoXtD_IHIFSwm9H1hgXHNq46BQwDTlCKzkWNAHbBGauEXZNQtvhQc8glz4sHQr3JY3KM7OkUEcNB7qMMpCPxRe5GzzHbe3rkE",
      notification: {
        body: text,
        title: "AverHealth Schedule",
      },
    });
    var config = {
      method: "post",
      url: "https://fcm.googleapis.com/fcm/send",
      headers: {
        "Content-Type": "application/json",
        Authorization: `key=${process.env.FCM_SERVER_KEY}`,
      },
      data: data,
    };
    axios(config)
      .then(function (response) {
        console.log(JSON.stringify(response.data));
      })
      .catch(function (error) {
        console.log(error);
      });
  }
  //Captcha Solver for the web scraper
  async function initiateCaptchaRequest(apiKey) {
    const formData = {
      key: apiKey,
      method: "userrecaptcha",
      googlekey: config.sitekey,
      json: 1,
      pageurl: config.pageurl,
    };
    console.log(
      `Submitting recaptcha request to 2captcha for ${config.pageurl}`
    );
    const response = await request.post(config.apiSubmitUrl, {
      form: formData,
    });
    console.log(response);
    return JSON.parse(response).request;
  }

  async function pollForRequestResults(
    key,
    id,
    retries = 90,
    interval = 5000,
    delay = 1500
  ) {
    console.log(`Waiting for ${delay} milliseconds....`);
    await timeout(delay);
    return poll({
      taskFn: requestCaptchaResults(key, id),
      interval,
      retries,
    });
  }

  function requestCaptchaResults(apiKey, requestId) {
    const url = `${config.apiRetrieveUrl}?key=${apiKey}&action=get&id=${requestId}&json=1`;
    console.log(url);
    return async function () {
      return new Promise(async function (resolve, reject) {
        console.log(`Polling for response...`);
        const rawResponse = await request.get(url);
        console.log(rawResponse);
        const resp = JSON.parse(rawResponse);
        console.log(resp);
        if (resp.status === 0) return reject(resp.request);
        console.log("Response received");
        console.log(resp);
        resolve(resp.request);
      });
    };
  }

  // DELETES THE FILE CREATED BY GOOGLEAPI
  function deleteImage() {
    const path = "./testResults.png";
    try {
      fs.unlinkSync(path);
      console.log("File removed:", path);
    } catch (err) {
      console.error(err);
    }
  }

  const timeout = (ms) => new Promise((res) => setTimeout(res, ms));
};
app.use(express.urlencoded({ extended: false }));

// Route to results Page
app.get("/results", (req, res) => {
  res.sendFile(__dirname + "/src/results.html");
  res.send(textArray);
});

app.post("/results", (req, res) => {
  // Insert Login Code Here
  let username = req.body.username;
  let password = req.body.password;
  App(username, password);
});

app.listen(port, () => {
  console.log(`Scraper app listening at http://localhost:${port}`);
});
I think I found the problem.
In the React app, you may not be calling e.preventDefault() when you click submit. By default, the browser redirects to the page the form's action attribute points at; if the action attribute is empty, the browser reloads the same page. I would recommend calling e.preventDefault() on form submission and then using the fetch API to make the request, as sketched below.
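A minimal sketch of that client-side pattern, assuming username and password live in component state (the handler name and redirect are illustrative; the body is urlencoded to match the express.urlencoded middleware on the server):

// Stop the default form navigation, post the credentials,
// and only move to /results once the server has replied
async function handleSubmit(e) {
  e.preventDefault();
  const response = await fetch("http://localhost:3000/results", {
    method: "POST",
    headers: { "Content-Type": "application/x-www-form-urlencoded" },
    body: new URLSearchParams({ username, password }),
  });
  if (response.ok) {
    window.location.href = "/results";
  }
}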
In the Express server, in the POST "/results" route, you are not sending any response back to the user; you should always send a response. In your case you call the App function, which contains many async functions, but you never await App() in the POST route, so Express sends its default response as soon as it has parsed the request; it does not wait for App() to complete.
You can make the (req, res) => { ... } function in the route async, i.e. async (req, res) => { ... }, and make App an async function as well. Then you can await App(...) in the route handler. You also need to await the main() function inside App(). Once the App() call has finished, you can send a redirect response to the user.
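Put together, the route could look like this sketch (it assumes App has been made async and awaits main() internally, as described above):

app.post("/results", async (req, res) => {
  const { username, password } = req.body;
  await App(username, password); // wait for the whole scrape to finish
  res.redirect("/results");      // only then send the user to the results page
});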
Good day. I have a custom AdonisJS command that pulls from an API.
async handle (args, options) {
  // Status
  // Open = 1979
  // Get all jobs with open status.
  const pullJobController = new PullJobsFromJobAdderController;
  let token = await pullJobController.get_token();
  if (token) {
    const jobs = await this._getOpenJobs('https://jobs/open-jobs', token, 1979);
  }
}

// `rp` below is assumed to be request-promise, required elsewhere in the command
async _getOpenJobs(url, accessToken, status) {
  url = url + '?statusId=' + status
  const headers = {
    'Authorization': 'Bearer ' + accessToken
  }
  const options = {
    method: 'GET',
    url: url,
    headers: headers
  }
  return (await rp(options).then(function (result) {
    return {
      status: true,
      info: JSON.parse(result)
    }
  }).catch(function (error) {
    return {
      status: false
    }
  }));
} // _getOpenJobs()
PullJobsFromJobAdderController
async get_token()
{
  // This works if directly returning the token.
  // return "9ade34acxxa4265fxx4b5x6ss7fs61ez";
  const settings = await this.settings();
  const jobAdderObject = new this.JobAdder(settings.jobadder['client.id'], settings.jobadder['client.secret'])
  const jobadderOauthObject = this.model('JobadderOauth');
  const accessInfo = await jobadderOauthObject.jobdderLatestAccess();
  let isAccessExpired = await this.checkAccessValidity(accessInfo.created_at);
  let accessToken = accessInfo.access_token;
  let apiEndpoint = accessInfo.api_endpoint;
  if (isAccessExpired === true) {
    let refreshTokenInfo = await jobAdderObject.refrehToken(accessInfo.refresh_token)
    if (refreshTokenInfo.status === true) {
      let refreshTokenDetails = JSON.parse(refreshTokenInfo.info)
      accessToken = refreshTokenDetails.access_token
      apiEndpoint = refreshTokenDetails.api
      await jobadderOauthObject.create({
        code: accessInfo.code,
        access_token: refreshTokenDetails.access_token,
        refresh_token: refreshTokenDetails.refresh_token,
        scope: 'read write offline_access',
        api_endpoint: refreshTokenDetails.api
      })
    }
  }
  return accessToken;
} // get_token()
The function get_token works as expected: it supplies a fresh token to be used by the AdonisJS command. However, the command freezes after running.
But if I return the string token directly, the custom command handle() works as expected and terminates after running.
Scenario 1 (directly returning the token string from PullJobsFromJobAdderController):
I run my custom command "adonis pull:jobs" and it runs as expected, displaying in the terminal the result of the pulled data from the API.
The terminal is ready to accept another command.
Scenario 2 (commenting out the directly returned string token from PullJobsFromJobAdderController):
I run my custom command "adonis pull:jobs" and it runs as expected, displaying in the terminal the result of the pulled data from the API.
The terminal does not accept commands until I press Ctrl+C and terminate the current job/command.
Perhaps I am missing something regarding async/await calls.
Can someone point me in the right direction?
TIA
I got it. For anyone else having this kind of problem with Adonis commands:
wrap the task inside your handle in a try...catch block, then always call Database.close() and process.exit() in finally, as in the sketch below.
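A minimal sketch of that shape (Database is AdonisJS's use('Database') provider; the job body mirrors the handle above):

const Database = use('Database')

async handle (args, options) {
  try {
    const pullJobController = new PullJobsFromJobAdderController;
    const token = await pullJobController.get_token();
    if (token) {
      await this._getOpenJobs('https://jobs/open-jobs', token, 1979);
    }
  } catch (error) {
    console.error(error);
  } finally {
    Database.close(); // release the connection pool that keeps the event loop alive
    process.exit();   // make sure the command terminates
  }
}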
I'm new to Node.js and I need to upload some PDFs to an external API (Zip Forms).
Right now I have the code below, but the PDF pages are blank when they arrive at the destination. I tried saving the PDF locally, using the same binary data that I'm sending to the API, and the PDFs are saved correctly.
I am also using the setTimeout method here because I cannot find a way to wait for the PDF to finish being read before sending it to the API.
I also tried binary instead of latin1 in readFileSync, but it doesn't change anything.
Code:
const aws = require('aws-sdk');
const https = require('https');
const request = require('request');
const { createWriteStream, readFileSync, writeFileSync } = require('fs');

const s3 = new aws.S3(); // Pass in opts to S3 if necessary

// Look up order and related info.
var order = await Order.findOne({ id })
  .populate('agent');

if (createZiplogixTransaction) {
  ziplogixTransactionId = await sails.helpers.ziplogix.createZiplogixTransaction.with({
    ziplogixContextId: ziplogixContextId,
    transactionName: order.propertyStreetAddress + ', ' + order.propertyCity,
    // FUTURE: if the transaction helper is updated, include actual order information
    // e.g. Primary seller name, property street address, etc.
  });
}

if (!order) {
  throw 'noSuchOrder';
}

// Permissions
if (this.req.me && this.req.me.accountType !== 'agent' && !ziplogixContextId) {
  throw 'forbidden';
}

let savedPdfs = await PdfOrderExternalId.find({ orderId: id });
await PdfOrderExternalId.destroy({
  where: { orderId: id }
});

for (const pdf of pdfs) {
  let url = await s3.getSignedUrl('getObject', {
    Bucket: 'disclosure-pdfs',
    Key: pdf.uploadFd,
    Expires: 60 * 5
  });

  let file = createWriteStream(`/tmp/${pdf.slug}.pdf`);
  await https.get(url, async (response) => {
    await response.pipe(file);
    // Need to wait for file to write on disk :|. Doesn't work with await or Promise (Why? IDK)
    setTimeout(async () => {
      let postData = await readFileSync(`/tmp/${pdf.slug}.pdf`, 'latin1');
      let queryString = `Name=${pdf.displayName}&Description=${pdf.displayName}`;
      savedPdfs.forEach(item => {
        if (item.pdfTemplate === pdf.pdfTemplate) {
          queryString += `Id=${item.externalId}`;
        }
      });
      request({
        method: 'POST',
        url: `${sails.config.custom.ziplogixApiBaseUrl}/transactions/${ziplogixTransactionId}/documents/file?${queryString}`,
        headers: {
          'X-Auth-ContextID': ziplogixContextId,
          'X-Auth-SharedKey': sails.config.custom.ziplogixSharedKey,
          'Content-Type': ['application/pdf', 'application/pdf']
        },
        body: postData
      }, async (error, response, body) => {
        // code here ...
      });
    }, 1000);
  });
}

await exits.success(Date.now());
Any ideas what I'm doing wrong?
Thank you
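For what it's worth, the setTimeout can usually be replaced by a promise that resolves when the write stream finishes, so the read happens only after the download is complete. A sketch of that idea using the names from the code above (not tested against the Zip Forms API):

const { promisify } = require('util');
const stream = require('stream');
const pipelineAsync = promisify(stream.pipeline);

// Resolve only once the response has been fully written to disk
function downloadToFile(url, path) {
  return new Promise((resolve, reject) => {
    https.get(url, (response) => {
      pipelineAsync(response, createWriteStream(path)).then(resolve).catch(reject);
    }).on('error', reject);
  });
}

// Inside the loop, instead of https.get + setTimeout:
// await downloadToFile(url, `/tmp/${pdf.slug}.pdf`);
// let postData = readFileSync(`/tmp/${pdf.slug}.pdf`); // raw Buffer, no encoding

Reading the file back without an encoding (a raw Buffer) is also worth trying for the blank pages: request re-encodes a latin1 string body as UTF-8 on the wire, which would corrupt binary PDF data, while a Buffer is sent byte-for-byte.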
I am trying to crawl a website using Node.js, making the HTTP request with Axios. I am only able to fetch the items that are available when the webpage first loads; the HTML that is loaded as I scroll down further is not fetched.
Here is my code.
const axios = require('axios');
const cheerio = require('cheerio');
var request = require('request');

// table view
const url = "https://www.usnews.com/best-colleges/search?_sort=rank&_sortDirection=asc&study=Engineering&_mode=table";

fetchData(url).then((res) => {
  const html = res.data;
  const $ = cheerio.load(html);
  const unilist = $('.TableTabular__TableContainer-febmbj-0.guaRKP > tbody > tr >td ');
  unilist.each(function () {
    let title = $(this).find('div').attr("name");
    if (typeof (title) == 'string') {
      console.log(title);
    }
  });
})

async function fetchData(url) {
  console.log("Crawling data...")
  // make http call to url
  let response = await axios(url).catch((err) => console.log(err));
  // guard against a failed request, where the catch above yields undefined
  if (!response || response.status !== 200) {
    console.log("Error occurred while fetching data");
    return;
  }
  return response;
}
I am trying to get all the university names. However, I am only able to get 13 universities, because the others are loaded only when the page is scrolled down manually.
How do I access all the universities on the page: https://www.usnews.com/best-colleges/search?_sort=rank&_sortDirection=asc&study=Engineering&_mode=table
var request = require('request');
const url = "https://www.usnews.com/best-colleges/api/search?_sort=rank&_sortDirection=asc&_page=7&study=Engineering";
let options = {
url: url,
headers: {
"authority": "www.usnews.com",
"method": "GET",
//"path": `/best-colleges/api/search?_sort=rank&_sortDirection=asc&_page=6&study=Engineering`,
"scheme": "https",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"cookie": `ak_bmsc=60A136143B076291C93DD9728862F728172B301F314600004917B85E8498E04F~pl5NwmZFHheJnzgnDGIGpBb4YDDOuhPDVqrNGDysdm/dDPzFJis9zP1awrKKsxeJBlvqZWW6E3ssLbAdi/nUkIEkEiVPu1NDDQge8FegXwVN6Ren/u+X8dx6/TRgRIIXtbj2n2ieih1+SzTEccExtz3QgcXFx+ZxSM1O3Xoe5crrhltym4VHVynMHnup+h3TaL9tLmsoWiopb9GlEG1eTlXIoyPsKVt2FA+s1MJP5zVmQ=; akacd_www=2177452799~rv=53~id=9087b102caf120794dbb1eeceaf4ccc8; usn_session_id=891228875906785; usn_visitor_id=8912288759182043; optimizelyEndUserId=oeu1589122887855r0.7555247616511707; usprivacy=1YNY; s_cc=true; s_fid=6C0F54971BC55B63-31DB4C74AAF1424B; ntv_as_us_privacy=1YNY; _ga=GA1.2.1252831587.1589122893; _gid=GA1.2.1293277568.1589122893; _fbp=fb.1.1589122894850.768122457; _ntv_uid=a074b9dd-6b5b-4f4b-b257-f9e7ee116412; __gads=ID=3343601cd2e45d2f:T=1589122898:S=ALNI_MZI2Mh_V-ROYbHt3s2k1h83if7i8A; edu-page-views=2; modal-page-views=2; pageview-count-Best Colleges Q2 2020 Audience Survey=2; CUID=N,1589123026657:ALHGLuQAAAAPTiwxNTg5MTIzMDI2NjU3xMc3klevipXW6CRMhCp96C/0wAIB5hXG0/fOK/1Ol60Pak5Dv6v1GHuSJcnhwzLp/ZPAF0+w1p4ic6ZfQHqgJCnyVI1XNZdQ7uBtRQ7wisLYSy5p3bcKN45s8z0N5XX37CMtZHg8WMEvbF6Q+BNNPpjuqLZ3n2p0hJ8+nTpo1lq/vOQrVU+DCcsiC38OMawezCmWDdUxbg2PiMkU9F/WZ4MfddfaDwqQ1BBQC0QkUZeRHkOCPndfwQOCKX1IKZ81Ju7MTmN1wqFdHaHxmHICvLvD6er4q4B0o8byjDXO0M79Yt82UMi8E2sqIAzin+FaFk181KNB5Z+5LbvWhORCig==; FCCDCF=[["AKsRol8x0eLcCPRNK87LcFg96i4OohYRu7keT-wXifV77qo_eYe6uZ0ThI1Oxd2-Y4V5wtjFjZW02xgjl0IhpmE9ojyljTmH9lrVeqQI3wXUjtift1w_Dqsor4S-4hEwsOEhBLpQrx8Ijd3oIw7mqxKezHDHZiod4A=="],null,["[[],[],[],[],null,null,true]",1589123041768]]; education-compare-slideout-state=collapsed; s_sq=%5B%5BB%5D%5D; utag_main=v_id:0171ff1af36300170b586aee949903073006706b009dc$_sn:1$_ss:0$_pn:2%3Bexp-session$_st:1589125090368$ses_id:1589122888547%3Bexp-session$_prevpage:www.usnews.com%2Fbest-colleges%2Fsearch%3Bexp-1589126890272; kw.pv_session=6; sailthru_visitor=9abdf1e6-3e02-427f-9899-6c232865866f; bm_sv=C8E5F93ED4F69A94559E23D6F676C38F~k2zHi/YOvrX2jg2IjDjERaOLYsf7bu+NjQmXeUuPHueXWih3Xm6rjXIC8wg1E225YVqIN2Q3cxjMPj6wlfrOgX8K9b5WW9BLiQIddDKHAGX7gH591ibZ8/bJFn4E/h7PhohIoGJK8PpG6Vel3r3dp//PcCGwzvgJNlUWVUqki3c=; _sp_id.26f9=f626f911-80a4-4912-b0bc-ad1b520357f6.1589122896.2.1589128312.1589124442.54a5f830-9b4f-471e-b326-7e4654bf5bf1; _sp_ses.26f9=*; RT="sl=0&ss=1589123021504&tt=0&obo=0&bcn=%2F%2F684d0d40.akstat.io%2F&sh=&dm=usnews.com&si=a65156df-2f6b-4e2a-815d-f7fdf1e8928c`,
}
};
request(options, function (err, resp, html) {
  if (!err) {
    var res = JSON.parse(html);
    //var items = res.data.items
    //var totalItems = res.data.totalItems
    //var totalPages = res.data.totalPages
  }
})
Please try this code. You may have to put your own browser cookie in the request headers, since this site's API is restricted for other applications. In the result you will get data.items, data.totalItems and data.totalPages (the commented lines above), which you can use to page through everything, as in the sketch below.
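A sketch of looping that API page by page with axios (the URL pattern and the items/totalItems/totalPages fields come from the code above; the exact shape of each item, including the field holding the university name, is a guess):

const axios = require('axios');

// Hypothetical pager over the underlying JSON search API
async function fetchAllUniversities(cookie) {
  const base = "https://www.usnews.com/best-colleges/api/search?_sort=rank&_sortDirection=asc&study=Engineering";
  const headers = { cookie: cookie, accept: "application/json" };
  const names = [];
  let page = 1;
  let totalPages = 1;
  while (page <= totalPages) {
    const { data } = await axios.get(`${base}&_page=${page}`, { headers });
    totalPages = data.data.totalPages; // reported by the API itself
    for (const item of data.data.items) {
      names.push(item.institution && item.institution.displayName); // field name is a guess
    }
    page++;
  }
  return names;
}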