I'm making multiple URL requests using Axios and collecting the data with Cheerio.
Everything works great; I just can't figure out how to prevent the data from being overwritten by the previous response, which is written to a file using the createWriteStream method.
I'm trying to create a different file for each request, preferably with unique names, but haven't found any solution in the docs.
const axios = require("axios").default;
const cheerio = require('cheerio');
const fs = require('fs');

const writeStream = fs.createWriteStream('./names/names.text')

const getTitle = (res) => {
  const $ = cheerio.load(res.data);
  const names = $('.name_wrap > .name')
  names.each(function (i, el) {
    const item = $(el).text().replace(/^\s*$/g, '')
    writeStream.write(`${item}\n`)
  });
}
// URLs array
let URLS = []
for (let index = 1; index <= 3; index++) {
  let url = `https://www.example.com/name-1-${index}`
  URLS.push(axios.get(url))
}

Promise.all(URLS)
  .then(responses => {
    getTitle(responses[0])
    getTitle(responses[1])
  });
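Something like this is what I'm imagining: a rough, untested sketch where each response gets its own numbered file instead of sharing one stream (saveTitles is a made-up helper):

const saveTitles = (res, filePath) => {
  // one stream per file instead of a single shared one
  const writeStream = fs.createWriteStream(filePath);
  const $ = cheerio.load(res.data);
  $('.name_wrap > .name').each(function (i, el) {
    writeStream.write(`${$(el).text().replace(/^\s*$/g, '')}\n`);
  });
  writeStream.end();
};

Promise.all(URLS)
  .then(responses => {
    responses.forEach((res, i) => {
      // unique name per request, e.g. ./names/names-1.text
      saveTitles(res, `./names/names-${i + 1}.text`);
    });
  });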
I have a gif file stored in a directory called assets on my computer. I would like to create X number of duplicates; they should be stored in the same directory, and each of them should have a different file name.
Example:
In the assets directory there is a gif file called 0.gif. I would like to duplicate this gif file 10 times, and the duplicates should be called 1.gif, 2.gif, 3.gif and so on.
The simplest option is to use the copyFile function available in the fs module:
const fs = require("fs");
const path = require("path");
let copyMultiple = (src, count) => {
  let initCount = 0;
  while (initCount < count) {
    initCount++; // you can increment at the bottom too, according to your needs
    // e.g. 0.gif is copied to 1.gif, 2.gif, 3.gif ...
    const newFileName = `${initCount}${path.extname(src)}`;
    console.log(newFileName, "is the new file name");
    fs.copyFile(src, newFileName, (error) => {
      // if an error comes up
      if (error) {
        console.log(error);
      }
    });
  }
};
copyMultiple("0.gif", 3);
Another, more elegant way of doing this:
const util = require("util");
const fs = require("fs");
const path = require("path");
const copyFilePromise = util.promisify(fs.copyFile);
function copyFiles(srcFile, destDir, destFileNames) {
  return Promise.all(
    destFileNames.map((file) => {
      return copyFilePromise(srcFile, path.join(destDir, file));
    })
  );
}
const myDestinationFileNames = ["second.gif", "third.gif"];
const sourceFileName = "1.gif";
copyFiles(sourceFileName, "", myDestinationFileNames)
  .then(() => {
    console.log("Copying is done");
  })
  .catch((err) => {
    console.log("Got an error", err);
  });
This approach also has the advantage of telling you when the copying is done.
You can read the docs here.
const fs = require("fs")

const sourceFile = "0.gif" // file to duplicate
const [name, ext] = sourceFile.split(".") // split "0.gif" into "0" and "gif"
const times = 10 // number of times to duplicate

for (let i = 1; i <= times; i++) {
  const newFilename = `${parseInt(name) + i}.${ext}` // e.g. 0.gif -> 1.gif, 2.gif, ...
  fs.copyFileSync(sourceFile, newFilename)
}
Use writeFile and readFile from the fs module and a simple for loop.
Not sure which framework you're on, but fs.copyFile() is the standard way for Node.js: https://nodejs.org/api/fs.html#fscopyfilesrc-dest-mode-callback
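For completeness, a small sketch of the same idea with the promise-based fs API (fs/promises in modern Node), assuming the source gif sits in the working directory:

const fs = require("fs/promises");

// copy src to 1.gif, 2.gif, ... count.gif in the same directory
async function duplicate(src, count) {
  for (let i = 1; i <= count; i++) {
    await fs.copyFile(src, `${i}.gif`);
  }
}

duplicate("0.gif", 10).then(() => console.log("done"));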
I have an axios get request which takes too long to resolve. This is for a site hosted on Heroku, which has a request timeout of 30 seconds. The following code takes about 50 seconds to resolve (surprisingly long, as there are only 21 urls to loop through in playerLink). Therefore, the request is never resolved on the live site.
Here is the Promise code:
const PORT = 8000
const axios = require('axios')
const cheerio = require('cheerio')
const express = require('express')
const cors = require('cors')
const app = express()
app.use(cors())
app.listen(PORT, () => console.log(`server running on PORT ${PORT}`))
const players = 'https://www.trinethunder.com/sports/sball/2021-22/teams/trine?view=roster'
const playerStats = 'https://www.trinethunder.com'
const playerLink = []
app.get('/players', (req, res) => {
  function getPlayers() {
    return new Promise((resolve, reject) => {
      axios(players)
        .then((response) => {
          const html = response.data;
          const $ = cheerio.load(html);
          $("td.text.pinned-col > a", html).each(function () {
            var link = $(this).attr("href");
            //if link not yet in array, push to array
            if (playerLink.indexOf(playerStats + link) === -1) {
              playerLink.push(playerStats + link);
            }
          });
          resolve()
        })
        .catch((err) => {
          console.log(err);
        });
    });
  }

  function getPlayerStats() {
    setTimeout(async () => {
      const statsArray = []
      for (let i = 0; i < playerLink.length; i++) {
        await new Promise((resolve, reject) => {
          axios.get(playerLink[i])
            .then((response) => {
              const html = response.data;
              const $ = cheerio.load(html);
              const statName = [];
              const statDesc = [];
              const statNum = [];
              $("h2 > span:nth-child(1)", html).each(function () {
                var name = $(this).text();
                statName.push(name);
              });
              $(".stat-title", html).each(function () {
                var stat1 = $(this).text();
                statDesc.push(stat1);
              });
              $(".stat-value", html).each(function () {
                var stat2 = $(this).text();
                statNum.push(stat2);
              });
              //Conditional is here because sometimes statsArray
              //gets filled multiple times
              if (statsArray.length < 63) {
                statsArray.push(statName, statDesc, statNum);
              }
              resolve();
            })
            .catch((err) => console.log(err));
        });
      }
      res.json(statsArray)
    }, 400);
  }

  getPlayers()
    .then(getPlayerStats)
    .catch((err) => console.log(err));
});
Simplified Fetch statement for /players:
fetch('http://localhost:8000/players')
  .then(response => response.json())
  .then(data => {
    console.log(data)
  }).catch(err => console.log(err))
Please let me know if you see anything that may be slowing down the execution of the request.
I cleaned up the code, removed the setTimeout(), set it up for maximum parallelization, instrumented it, and made it runnable stand-alone. After doing so, the log it produces is below and I see that getPlayers() takes 2413ms and the synchronous cheerio processing of the individual player requests takes a total of 6087ms. From start to finish, the whole thing takes 9415ms on my system.
This is significantly faster than what you report. The biggest structural change I made is that all the individual getPlayerStat requests are made in parallel, not in serial which (if the target server can handle it) will shorten the total wait for network requests on getting player stats. I also removed the setTimeout() as that seemed like a hack for some other problem and once the code is structured properly for asynchronous handling, that should not be necessary.
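The structural change, in miniature (parsePage() here is a hypothetical stand-in for the cheerio work, and both versions assume they run inside an async function):

// serial: each request waits for the previous one to finish
for (let i = 0; i < playerLink.length; i++) {
  const response = await axios.get(playerLink[i]);
  parsePage(response.data);
}

// parallel: all requests are in flight at once
await Promise.all(playerLink.map(async (link) => {
  const response = await axios.get(link);
  parsePage(response.data);
}));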
Here's the detailed log if you want to see where all the detailed time is spent. You can run the code below on your own system to see what you get there:
000000: begin all
000006: begin getPlayers()
002419: end getPlayers()
002419: begin getPlayerStats
002420: begin get https://www.trinethunder.com/sports/sball/2021-22/players/makinzeromingersy0k
002423: begin get https://www.trinethunder.com/sports/sball/2021-22/players/emersynhaneyjnrb
002424: begin get https://www.trinethunder.com/sports/sball/2021-22/players/amandapratheruluw
002424: begin get https://www.trinethunder.com/sports/sball/2021-22/players/adrienneroseybff7
002425: begin get https://www.trinethunder.com/sports/sball/2021-22/players/emmabeyeri6zz
002426: begin get https://www.trinethunder.com/sports/sball/2021-22/players/aprilsellersi95s
002427: begin get https://www.trinethunder.com/sports/sball/2021-22/players/annakoeppl38q8
002427: begin get https://www.trinethunder.com/sports/sball/2021-22/players/annagilli8rl
002428: begin get https://www.trinethunder.com/sports/sball/2021-22/players/angelenaperry2scn
002429: begin get https://www.trinethunder.com/sports/sball/2021-22/players/laurenclausenfb4j
002430: begin get https://www.trinethunder.com/sports/sball/2021-22/players/emilywheaton1jym
002430: begin get https://www.trinethunder.com/sports/sball/2021-22/players/kaylyncoahranhp6r
002431: begin get https://www.trinethunder.com/sports/sball/2021-22/players/mercededaughertyiswy
002432: begin get https://www.trinethunder.com/sports/sball/2021-22/players/taylormurdockgeho
002432: begin get https://www.trinethunder.com/sports/sball/2021-22/players/lexiclark77gr
002433: begin get https://www.trinethunder.com/sports/sball/2021-22/players/ainsleyphillipsmfe9
002434: begin get https://www.trinethunder.com/sports/sball/2021-22/players/ellietrinexhe2
002434: begin get https://www.trinethunder.com/sports/sball/2021-22/players/ashleyswartouta714
002435: begin get https://www.trinethunder.com/sports/sball/2021-22/players/gisellerileybdb8
002436: begin get https://www.trinethunder.com/sports/sball/2021-22/players/elizabethkoch5umu
002436: begin get https://www.trinethunder.com/sports/sball/2021-22/players/scarlettelliott0bvt
003251: after get https://www.trinethunder.com/sports/sball/2021-22/players/kaylyncoahranhp6r
003596: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/kaylyncoahranhp6r
003599: after get https://www.trinethunder.com/sports/sball/2021-22/players/makinzeromingersy0k
003902: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/makinzeromingersy0k
003905: after get https://www.trinethunder.com/sports/sball/2021-22/players/emersynhaneyjnrb
004200: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/emersynhaneyjnrb
004203: after get https://www.trinethunder.com/sports/sball/2021-22/players/amandapratheruluw
004489: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/amandapratheruluw
004492: after get https://www.trinethunder.com/sports/sball/2021-22/players/emmabeyeri6zz
004771: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/emmabeyeri6zz
004773: after get https://www.trinethunder.com/sports/sball/2021-22/players/aprilsellersi95s
005060: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/aprilsellersi95s
005063: after get https://www.trinethunder.com/sports/sball/2021-22/players/elizabethkoch5umu
005345: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/elizabethkoch5umu
005348: after get https://www.trinethunder.com/sports/sball/2021-22/players/emilywheaton1jym
005638: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/emilywheaton1jym
005643: after get https://www.trinethunder.com/sports/sball/2021-22/players/ashleyswartouta714
005943: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/ashleyswartouta714
005951: after get https://www.trinethunder.com/sports/sball/2021-22/players/ainsleyphillipsmfe9
006243: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/ainsleyphillipsmfe9
006245: after get https://www.trinethunder.com/sports/sball/2021-22/players/adrienneroseybff7
006541: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/adrienneroseybff7
006545: after get https://www.trinethunder.com/sports/sball/2021-22/players/annagilli8rl
006821: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/annagilli8rl
006824: after get https://www.trinethunder.com/sports/sball/2021-22/players/mercededaughertyiswy
007111: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/mercededaughertyiswy
007118: after get https://www.trinethunder.com/sports/sball/2021-22/players/lexiclark77gr
007402: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/lexiclark77gr
007411: after get https://www.trinethunder.com/sports/sball/2021-22/players/angelenaperry2scn
007681: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/angelenaperry2scn
007685: after get https://www.trinethunder.com/sports/sball/2021-22/players/laurenclausenfb4j
007974: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/laurenclausenfb4j
007976: after get https://www.trinethunder.com/sports/sball/2021-22/players/scarlettelliott0bvt
008265: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/scarlettelliott0bvt
008267: after get https://www.trinethunder.com/sports/sball/2021-22/players/ellietrinexhe2
008553: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/ellietrinexhe2
008555: after get https://www.trinethunder.com/sports/sball/2021-22/players/gisellerileybdb8
008838: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/gisellerileybdb8
008840: after get https://www.trinethunder.com/sports/sball/2021-22/players/annakoeppl38q8
009129: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/annakoeppl38q8
009131: after get https://www.trinethunder.com/sports/sball/2021-22/players/taylormurdockgeho
009415: after cheerio parse https://www.trinethunder.com/sports/sball/2021-22/players/taylormurdockgeho
009415: end all
... data here
getPlayers() took 2413ms
cheerio processing took 6087ms
And, here's the stand-alone code that anyone can run:
const axios = require('axios');
const cheerio = require('cheerio');

const players = 'https://www.trinethunder.com/sports/sball/2021-22/teams/trine?view=roster'
const playerStats = 'https://www.trinethunder.com'

const zeroes = "000000000000000000000000000000";

function zeroPad(num, padLen) {
  let str = num + "";
  let padNum = padLen - str.length;
  if (padNum > 0) {
    str = zeroes.slice(0, padNum) + str;
  }
  return str;
}

const base = Date.now();

function log(...args) {
  let delta = Date.now() - base;
  let deltaPad = zeroPad(delta, 6);
  console.log(deltaPad + ": ", ...args);
}

let getPlayersT = 0;
let cheerioT = 0;

async function run() {
  async function getPlayers() {
    log("begin getPlayers()");
    let startT = Date.now();
    const playerLink = [];
    const response = await axios(players);
    const html = response.data;
    const $ = cheerio.load(html);
    $("td.text.pinned-col > a", html).each(function () {
      const link = $(this).attr("href");
      //if link not yet in array, push to array
      if (playerLink.indexOf(playerStats + link) === -1) {
        playerLink.push(playerStats + link);
      }
    });
    log("end getPlayers()")
    getPlayersT += Date.now() - startT;
    return playerLink;
  }

  async function getPlayerStats(playerLink) {
    log("begin getPlayerStats");
    const statsArray = [];
    await Promise.all(playerLink.map(async link => {
      log(`begin get ${link}`)
      const response = await axios.get(link);
      log(`after get ${link}`)
      const html = response.data;
      const startT = Date.now();
      const $ = cheerio.load(html);
      const statName = [];
      const statDesc = [];
      const statNum = [];
      $("h2 > span:nth-child(1)", html).each(function () {
        var name = $(this).text();
        statName.push(name);
      });
      $(".stat-title", html).each(function () {
        var stat1 = $(this).text();
        statDesc.push(stat1);
      });
      $(".stat-value", html).each(function () {
        var stat2 = $(this).text();
        statNum.push(stat2);
      });
      //Conditional is here because sometimes statsArray
      //gets filled multiple times
      if (statsArray.length < 63) {
        statsArray.push(statName, statDesc, statNum);
      }
      cheerioT += Date.now() - startT;
      log(`after cheerio parse ${link}`);
    }));
    return statsArray;
  }

  try {
    log("begin all")
    const playerLink = await getPlayers();
    const statsArray = await getPlayerStats(playerLink);
    log("end all")
    return statsArray;
  } catch (e) {
    console.log(e);
  }
}

run().then(result => {
  console.log(result);
  console.log(`getPlayers() took ${getPlayersT}ms`);
  console.log(`cheerio processing took ${cheerioT}ms`);
}).catch(err => {
  console.log("error", err);
});
const $ = require('cheerio');
const fetch = require('node-fetch');

const url = "https://fr.wikipedia.org/wiki/The_Legend_of_Zelda";

function extractionBrut(url) {
  return fetch(url)
    .then((reponse) => reponse.text())
    .then((data) => {
      return data;
    })
}

const getFormationList = async () => {
  const data = await extractionBrut(url);
  const num = $.parseHTML(data).length;
  console.log(num);
  for (let i = 0; i < num; i++) {
    const numTable = $('<div id="" class="bandeau-container homonymie plainlinks hatnote" style="">')[i];
    console.log(numTable);
  }
}

getFormationList();
I want to show the different sections of this div, but I get this error and don't know how to solve it.
The $ is supposed to be loaded with data. You would do something like:
const cheerio = require ('cheerio');
And later:
const $ = cheerio.load(someHtmlThatYouGotFromNodeFetch);
Now you can use the $ as if it's jQuery; otherwise, how would cheerio know what HTML you're working with?
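For example, a minimal sketch of the corrected approach, keeping the same URL and a selector for those divs (untested):

const cheerio = require('cheerio');
const fetch = require('node-fetch');

const url = "https://fr.wikipedia.org/wiki/The_Legend_of_Zelda";

const getFormationList = async () => {
  const reponse = await fetch(url);
  const data = await reponse.text();
  const $ = cheerio.load(data); // $ is now bound to this HTML
  // query the loaded document instead of calling $.parseHTML
  $('div.bandeau-container.homonymie.plainlinks.hatnote').each((i, el) => {
    console.log($(el).text().trim());
  });
};

getFormationList();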
I'm trying to merge two pdf files on the frontend using JavaScript and the pdf-lib library. I found this snippet in the pdf-lib GitHub repository:
async function mergePdfs(pdfsToMerge: string[]) {
  const mergedPdf = await PDFDocument.create();
  for (const pdfCopyDoc of pdfsToMerge) {
    const pdfBytes = fs.readFileSync(pdfCopyDoc);
    const pdf = await PDFDocument.load(pdfBytes);
    const copiedPages = await mergedPdf.copyPages(pdf, pdf.getPageIndices());
    copiedPages.forEach((page) => {
      mergedPdf.addPage(page);
    });
  }
  const mergedPdfFile = await mergedPdf.save();
  return mergedPdfFile;
}
But as I see it, this snippet is for Node.js (there's no fs.readFileSync in browser JavaScript). So I have 2 questions:
What should I put in pdfsToMerge (string[])? I have variables containing URLs to pdf1 and pdf2.
I also have two variables containing the base64 code of these pdfs. How can I use this snippet on the frontend, without fs.readFileSync as in Node.js?
Many thanks in advance!
The PDFDocument.load() method will accept base64 strings as the parameter, so you don't need to transform those at all.
As for your variables storing URL paths to pdf documents, you can use fetch instead of Node's file system. As described in the pdf-lib docs, you can store the ArrayBuffer and pass that into PDFDocument.load() like so:
const url = 'https://pdf-lib.js.org/assets/with_update_sections.pdf'
const arrayBuffer = await fetch(url).then(res => res.arrayBuffer())
const pdfDoc = await PDFDocument.load(arrayBuffer)
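Putting that together with the snippet from the question, a browser-side sketch might look like this (assuming the URLs are fetchable from the page, i.e. CORS allows it; your base64 strings could be passed straight to PDFDocument.load() instead):

async function mergePdfsFromUrls(urls) {
  const mergedPdf = await PDFDocument.create();
  for (const url of urls) {
    const bytes = await fetch(url).then((res) => res.arrayBuffer());
    const pdf = await PDFDocument.load(bytes);
    const pages = await mergedPdf.copyPages(pdf, pdf.getPageIndices());
    pages.forEach((page) => mergedPdf.addPage(page));
  }
  return mergedPdf.save(); // Uint8Array of the merged document
}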
Your pdf-lib version should be the newest one. Then, the sequence of events matters: here is the function I use, and it must run in this order of events.
I use it with data or empty, to get filled or non-filled pdf files.
async copyPages(sale: Sale, url1, urlArray, isWithData, isEmptyForm) {
  this.pdfService.getIsEmpty().subscribe(data => { isEmptyForm = data; });
  this.pdfService.getIsWithData().subscribe(data => { isWithData = data; });
  console.log(urlArray);
  let donorBytes = [];
  let donorBytesFInal = [];
  let donorPage = [];
  let donorDoc = [];
  /**
   * first page: get bytes from url,
   * then load the data,
   * then convert the data bytes to a PDFDocument.
   * Later in the routine, this first donor doc's pages are inserted, not added.
   */
  let firstDonorPdfBytes = await fetch(url1).then(res => res.arrayBuffer());
  await this.loadDataTodocument(firstDonorPdfBytes, sale, isWithData, isEmptyForm)
    .then(data => {
      firstDonorPdfBytes = data;
    });
  /**
   * load first document
   */
  const firstDonorPdfDoc = await PDFDocument.load(firstDonorPdfBytes);
  /**
   * load the url array, convert to bytes, send bytes off to populate text fields with data
   */
  for (let i = 0; i < urlArray.length; ++i) {
    console.log(urlArray.length);
    donorBytes[i] = await fetch(urlArray[i].url).then(res => res.arrayBuffer());
  }
  /* Insert data into donorBytes and create the donorBytesFInal array with data */
  // tslint:disable-next-line:prefer-for-of
  for (let i = 0; i < donorBytes.length; ++i) {
    await this.loadDataTodocument(donorBytes[i], sale, isWithData, isEmptyForm)
      .then(data => {
        donorBytesFInal.push(data);
      });
  }
  // console.log(donorBytesFInal);
  /*
   * convert donor bytes to PDFDocuments, now that the bytes include data (donorBytesFInal)
   */
  for (let i = 0; i < donorBytesFInal.length; ++i) {
    donorDoc[i] = await PDFDocument.load(donorBytesFInal[i]);
  }
  /* create the output document */
  const pdfDoc = await PDFDocument.create();
  /**
   * copy the first page... not in the array
   */
  const [firstDonorPage] = await pdfDoc.copyPages(firstDonorPdfDoc, [0]);
  /**
   * copy all array pages of the singular documents into the output pdfDoc.
   * Notice these are insertPage, not addPage.
   */
  for (let i = 0; i < donorBytes.length; ++i) {
    [donorPage[i]] = await pdfDoc.copyPages(donorDoc[i], [0]);
    pdfDoc.insertPage(0, donorPage[i]);
  }
  /** the first page is an addPage, not an insert */
  pdfDoc.addPage(firstDonorPage);
  /** create base64 and uint8 versions and update them globally */
  const u8 = await pdfDoc.save();
  const n64 = await pdfDoc.saveAsBase64();
  this.pdfService.changeUint8ByteArray(u8);
  this.pdfService.changeBase64Array(n64);
  const pdfBytes = u8;
  /** redundant: empty urlArray */
  urlArray = [];
}
I need to extract links from the URL in a loop, so basically I need to execute the function another time, but I don't know how to do this with Node.js.
var request = require('request');
var cheerio = require('cheerio');

var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;

request(url, function (err, resp, body) {
  $ = cheerio.load(body);
  links = $('a');
  $(links).each(function (i, link) {
    console.log(url + $(link).attr('href'));
  });
});
My question is how to extract the links from this array: this code works correctly (it prints the links to the console), but I also need to scrape those links. The result should be scraping the URLs found inside each one.
var request = require('request');
var cheerio = require('cheerio');

var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;

request(url, function (err, resp, body) {
  $ = cheerio.load(body)
  var allLinks = []
  links = $('a');
  $(links).each(function (i, link) {
    console.log(url + $(link).attr('href'))
    var currentLink = url + $(link).attr('href')
    allLinks.push(currentLink)
    if (i == links.length - 1) {
      useLinks(allLinks)
    }
  });
});

function useLinks(allLinks) {
  console.log(allLinks)
}
If you're asking how to extract the URL from the links received from cheerio, you're already doing it. If you'd like to use them elsewhere after the request is finished (e.g. for scraping again), then store them in an array and call a function that uses the array after you iterate through the last link.
It should look something like this:
let links = $('a').get().map(a => $(a).attr('href'))
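If the goal is to actually scrape each collected link, useLinks() could issue one request per URL, reusing the same request/cheerio pattern (a rough sketch; the title selector is just an example):

function useLinks(allLinks) {
  allLinks.forEach(function (link) {
    request(link, function (err, resp, body) {
      if (err) return console.error(err);
      const $ = cheerio.load(body);
      // scrape whatever you need from each page here
      console.log(link, $('title').text());
    });
  });
}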
I share my solution. It is similar to the question's code, but with a few changes: I don't extract all links, only the ones matching the search term I pass in the URL.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var app = express();

var searchTerm = 'baloncesto';
var url = 'http://mismarcadores.com/' + searchTerm;
var arr2 = [];

app.get('/webscrape', function (req, res, body) {
  request(url, function (err, resp, body) {
    var array2 = [];
    var array3 = [];
    $ = cheerio.load(body);
    links = $('a'); // jquery-style: get all hyperlinks
    $(links).each(function (i, link) {
      if ($(link).attr('href').includes("baloncesto")) {
        array2.push($(link).attr('href'));
      }
    });
    const uniqueLinks = new Set([...array2]);
    uniqueLinks.forEach((d) => {
      const row = []; // a new array for each row of data
      row.push(d);
      array3.push(row.join()); // by default, join() uses a ','
    });
    fs.writeFile('raaga_output.json', JSON.stringify(array3, null, 4), function (err) {
      console.log('File successfully written! - Check your project directory for the raaga_output.json file');
    });
    res.send('File successfully written! - Check your project directory for the raaga_output.json file');
  });
});

app.listen('3000');
console.log('Web Scrape happens on port 3000');

exports = module.exports = app;
Anyone can use this without any problem.