How to handle string comparison and file writing using xlsx with Node.js - JavaScript

This script fetches the title of each webpage (the URLs are passed in from an Excel file), checks whether the title contains a keyword, and then stores the matching domain in a new Excel file.
The partial code below runs without errors, but the title comparison does not work as expected. Does anyone have an idea how to fix it?
Here is my code:
var request = require("request");
var cheerio = require("cheerio");
const xlsx = require("xlsx");
jsonData = [{ Domain: "blogger.com" }, { Domain: "stackoverflow.com" }];
function fetchTitle(url, onComplete = null) {
  request(url, function (error, response, body) {
    var output = url; // default to URL
    if (!error && (response && response.statusCode) === 200) {
      var $ = cheerio.load(body);
      console.log(`URL = ${url}`);
      var title = $("head > title").text().trim();
      console.log(`Title = ${title}`);
      output = `[${title}] (${url})`;
      var keywords = ["Developers", "blog"];
      var results = [];
      var UrlArray = [];
      for (var i = 0; i < keywords.length; i++) {
        var match = title.match(new RegExp(keywords.join("|"), "g"));
        results.push(keywords[i]);
      }
      if (match.length > 0) {
        UrlArray.push({
          Domain: url,
          Keywords: results,
          Title: output,
        });
        finalJsonData = JSON.stringify(UrlArray);
        const ws = xlsx.utils.json_to_sheet(UrlArray);
        const wb = xlsx.utils.book_new();
        xlsx.utils.book_append_sheet(wb, ws, "Responses");
        xlsx.writeFile(wb, "output.xlsx");
      }
    } else {
      console.log(
        `Error = ${error}, code = ${response && response.statusCode}`
      );
    }
    console.log(`output = ${output} \n\n`);
    if (onComplete) onComplete(output);
  });
}
jsonData.forEach(function (table) {
  var tableName = table.Domain;
  var URL = "http://" + tableName;
  fetchTitle(URL);
});
When I execute the script, I am able to get the titles, but the keyword comparison does not work as expected and the keywords are not being stored. You can see how the output looks after executing the script.
The output shows that both domains have keyword matches, but only blogger.com is stored in the spreadsheet, and even then the keywords aren't saved.

You're overwriting the output file on each loop iteration.
Keywords is an array, so it doesn't get saved as a cell value; furthermore, the Keywords column would always contain all the keywords, not just the matching ones.
As the requests are asynchronous, you need to track them all and write the results only when all requests are finished.
Try this:
match case-insensitively, and store only the keywords that match for that site, not all of them (I also added "no match" for domains with no match)
store the results outside the loop
move writing the results into a separate function
add a request counter and a callback to track the requests
write the results once all requests are done
The code:
var request = require("request");
var cheerio = require("cheerio");
const xlsx = require("xlsx");
const jsonData = [{ Domain: "blogger.com" }, { Domain: "stackoverflow.com" }];
var UrlArray = [];
function writeResults() {
  const finalJsonData = JSON.stringify(UrlArray);
  const ws = xlsx.utils.json_to_sheet(UrlArray);
  const wb = xlsx.utils.book_new();
  xlsx.utils.book_append_sheet(wb, ws, "Responses");
  xlsx.writeFile(wb, "output.xlsx");
}
function fetchTitle(url, onComplete = null) {
  request(url, function (error, response, body) {
    var output = url; // default to URL
    if (!error && (response && response.statusCode) === 200) {
      var $ = cheerio.load(body);
      console.log(`URL = ${url}`);
      var title = $("head > title").text().trim();
      console.log(`Title = ${title}`);
      output = `[${title}] (${url})`;
      var keywords = ["Developers", "blog"];
      var results = [];
      for (var i = 0; i < keywords.length; i++) {
        let match = title.match(new RegExp(keywords[i], "gi"));
        if (match && match.length > 0) {
          results.push(keywords[i]);
        }
      }
      UrlArray.push({
        Domain: url,
        Keywords: results.length > 0 ? results.join(', ') : 'no match',
        Title: output,
      });
    } else {
      console.log(
        `Error = ${error}, code = ${response && response.statusCode}`
      );
    }
    console.log(`output = ${output} \n\n`);
    if (onComplete) onComplete(output);
  });
}
let counter = 0;
jsonData.forEach(function (table) {
  var tableName = table.Domain;
  var URL = "http://" + tableName;
  fetchTitle(URL, () => {
    counter++;
    if (counter === jsonData.length) {
      console.log(`all ${counter} requests done`);
      writeResults();
    }
  });
});
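As an alternative to the manual counter, the same "wait for all requests" step can be done with Promise.all. This is only a sketch, assuming fetchTitle keeps the onComplete callback shown above:
function fetchTitleAsync(url) {
  // resolve once fetchTitle's onComplete callback fires
  return new Promise(function (resolve) {
    fetchTitle(url, resolve);
  });
}
Promise.all(jsonData.map(function (table) {
  return fetchTitleAsync("http://" + table.Domain);
})).then(function () {
  console.log(`all ${jsonData.length} requests done`);
  writeResults();
});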

Related

Undefined Header in Node.js using axios

I am trying to pass a token in the header from a CLI to a REST API. However, the header is undefined on the server side. The code of the CLI file is the following:
import cli from 'cli-ux'
// just prompt for input
import {Command} from '@oclif/command'
import {createConnection} from "typeorm";
import {flags} from '@oclif/command'
const bcrypt = require('bcrypt');
var fs=require('fs');
const https = require('https')
const axios=require('axios');
const client_cert = fs.readFileSync('ca-crt.pem')
axios.defaults.httpsAgent = new https.Agent({ca : client_cert, keepAlive: true})
export class AdminCommand extends Command {
static flags = {
newuser: flags.string({dependsOn:['passw'],exclusive:['newdata','userstatus','moduser']}),
moduser: flags.string({dependsOn:['passw'],exclusive:['newuser','newdata','userstatus']}),
passw: flags.string({dependsOn:['email']}),
email: flags.string({dependsOn:['quota']}),
quota: flags.string(),
userstatus: flags.string({exclusive:['newdata','newuser','moduser']}),
newdata: flags.string({dependsOn:['source'],exclusive:['newdata','newuser','moduser']}),
source: flags.string()
}
async run() {
const {flags} = this.parse(AdminCommand);
var fs=require('fs');
var jwt=require('jsonwebtoken');
var token = fs.readFileSync('softeng19bAPI.token');
axios.defaults.headers.common['X-OBSERVATORY-AUTH']="Bearer " + token;
await cli.anykey();
//create new user
if (`${flags.newuser}` !== "undefined" && `${flags.passw}` !== "undefined" && `${flags.email}` !== "undefined" && `${flags.quota}` !== "undefined" ){
let hash = bcrypt.hashSync(`${flags.passw}`,10);
await axios.post('https://localhost:8765/energy/api/Admin/users?username=' +`${flags.newuser}` +'&passw=' + hash +'&email=' + `${flags.email}` +'&quota=' + `${flags.quota}`);
}
I am passing the token in the header. But when I print the header on the server side, it is undefined. The server side code is the following:
module.exports = app => {
const entry = require("../controlers/entry.controller.js");
const sql = require("../models/db.js");
const bcrypt = require('bcrypt');
const isloggedin=require('../routes/isloggedin.js');
var express=require('express');
// Retrieve a single Entry with Id
var fs=require('fs');
var privateKey = fs.readFileSync('private.key');
app.post("/energy/api/Admin/users",async function(req,res){
const token = req.headers['X-OBSERVATORY-AUTH'];
console.log(token);
var a = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890".split("");
var b = [];
for (var i=0; i<12; i++) {
var j = (Math.random() * (a.length-1)).toFixed(0);
b[i] = a[j];
}
var rest=b.join("");
var count=0;
const resul = [rest[0]];
for(let x=1; x<rest.length; x++){
count=count+1;
if(count==4)
{
resul.push('-', rest[x]);
count=0;
}
else
{
resul.push(rest[x]);
}
}
const apik=resul.join('');
//console.log(apikey);
//insert into table
//console.log(req.query.passw);
//console.log(hash);
if (req.query.username !== "undefined" && req.query.email !== "undefined" && req.query.quota !== "undefined"){
sql.query(`INSERT INTO users VALUES (?,?,?,?,?,?)`,[req.query.username,req.query.passw,req.query.email,req.query.quota,apik,'user'],(err,res2) => {
if (err) {
console.log("error: ", err);
result(err, null);
return;
}
});
}
res.send("Succesful "+apik);
});
Why is it arriving as undefined? When I print it in the CLI, the token is not empty. I am writing in Node.js with Express.
Instead of passing the bearer token in the axios default header like this:
axios.defaults.headers.common['X-OBSERVATORY-AUTH']="Bearer " + token;
try posting it like this:
await axios.post('https://localhost:8765/energy/api/Admin/users?username=' +`${flags.newuser}` +'&passw=' + hash +'&email=' + `${flags.email}` +'&quota=' + `${flags.quota}`, {}, {
headers: {"Authorization": `Bearer ` + token}})
Then, on the server side, retrieve the token from the request for verification:
const token = req.headers.authorization.split(" ")[1];
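Note that req.headers.authorization may be undefined if the client did not send the header, in which case .split() throws. A minimal defensive sketch (assuming Express, which exposes incoming header names in lowercase):
app.post("/energy/api/Admin/users", async function (req, res) {
  const authHeader = req.headers.authorization; // e.g. "Bearer <token>"
  if (!authHeader) {
    return res.status(401).send("Missing Authorization header");
  }
  const token = authHeader.split(" ")[1];
  // ... verify the token, then continue with the insert logic above
});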

JavaScript - FileReader: how can I read and process one file at a time among multiple files

I am trying to let the user drop multiple Excel files, extract the desired values from each file, and upload them to the website ONE FILE AT A TIME.
My code is not working, and I am assuming this is because of a callback problem.
Could anybody help?
Edit: I have also added my uploadFile function. I very much appreciate your help.
for(var i = 0; i < fileList.length; i++) {
//console.log(fileList[i]["file"]);
var reader = new FileReader();
var f = fileList[i]["file"];
//var fName = fileList[i]["fileName"];
var excelObject = fileList[i];
reader.onload = function(ev) {
var data = ev.target.result;
if(!rABS) data = new Uint8Array(data);
var wb = XLSX.read(data, {type: rABS ? 'binary' : 'array'});
var einAddress = "B3";
var engCodeAddress = "B1";
var goAddress = "B2";
var errMsg = tabName + " tab or required value is missing";
// Worksheet with the necessary info
try{
var ws = wb.Sheets[tabName];
var ein_cell = ws[einAddress];
ein = (ein_cell ? ein_cell.v.toString() : undefined);
var eng_cell = ws[engCodeAddress];
engCode = (eng_cell ? eng_cell.v.toString() : undefined);
var go_cell = ws[goAddress];
goLocator = (go_cell ? go_cell.v.toString() : undefined);
if(ein == undefined || engCode == undefined || goLocator == undefined){
hasValues = false;
}
excelObject["EngagementCode"] = engCode;
excelObject["GoSystem"] = goLocator;
excelObject["EIN"] = ein;
if(hasValues && isValid){
uploadFile(fileList[i], userInfo);
} else {
noValueErrorHandler(errMsg);
}
} catch(err){
hasValues = false;
}
};
if(rABS) reader.readAsBinaryString(f); else reader.readAsArrayBuffer(f);
}
function uploadFile(f, userInfo) {
// Define the folder path for this example.
var serverRelativeUrlToFolder = listName;
// Get info of the file to be uploaded
var file = f;
var fileInput = file["file"];
var newName = file["fileName"];
var ein = file["EIN"];
var engCode = file["EngagementCode"];
var email = userInfo;
var goLocator = file["GoSystem"];
console.log("file: " + file);
// Get the server URL.
var serverUrl = _spPageContextInfo.siteAbsoluteUrl + "/StatusTracker";
// Initiate method calls using jQuery promises.
// Get the local file as an array buffer.
var getFile = getFileBuffer(fileInput);
getFile.done(function (arrayBuffer) {
// Add the file to the SharePoint folder.
var addFile = addFileToFolder(arrayBuffer, newName);
addFile.done(function (file, status, xhr) {
// Get the list item that corresponds to the uploaded file.
var getItem = getListItem(file.d.ListItemAllFields.__deferred.uri);
getItem.done(function (listItem, status, xhr) {
// Change the display name and title of the list item.
var changeItem = updateListItem(listItem.d.__metadata);
changeItem.done(function (data, status, xhr) {
processedCount += 1;
if (processedCount < fileCount) {
uploadFile(fileList[processedCount], email);
} else if (processedCount == fileCount){
$("#dropbox").text("Done, drop your next file");
$("#ADMNGrid").data("kendoGrid").dataSource.read();
fileList = [];
alert("Total of " + processedCount + " items are processed!");
}
// Refresh kendo grid and change back the message and empty fileList
//$("#dropbox").text("Drag your Fund/Lower Tier workpaper here ...");
//location.reload(true);
});
changeItem.fail(onError);
});
getItem.fail(onError);
});
addFile.fail(onError);
});
getFile.fail(onError);
You might put the whole thing into an async function and await a Promise for each iteration, forcing the files to be processed in serial. You didn't post your uploadFile, but if you have it return a Promise that resolves once it's done, you could do the following:
async function fn() {
for (var i = 0; i < fileList.length; i++) {
await new Promise((resolve, reject) => {
//console.log(fileList[i]["file"]);
var reader = new FileReader();
var f = fileList[i]["file"];
//var fName = fileList[i]["fileName"];
var excelObject = fileList[i];
reader.onload = function(ev) {
var data = ev.target.result;
if (!rABS) data = new Uint8Array(data);
var wb = XLSX.read(data, {
type: rABS ? 'binary' : 'array'
});
var einAddress = "B3";
var engCodeAddress = "B1";
var goAddress = "B2";
var errMsg = tabName + " tab or required value is missing";
// Worksheet with the necessary info
try {
var ws = wb.Sheets[tabName];
var ein_cell = ws[einAddress];
ein = (ein_cell ? ein_cell.v.toString() : undefined);
var eng_cell = ws[engCodeAddress];
engCode = (eng_cell ? eng_cell.v.toString() : undefined);
var go_cell = ws[goAddress];
goLocator = (go_cell ? go_cell.v.toString() : undefined);
if (ein == undefined || engCode == undefined || goLocator == undefined) {
hasValues = false;
}
excelObject["EngagementCode"] = engCode;
excelObject["GoSystem"] = goLocator;
excelObject["EIN"] = ein;
if (hasValues && isValid) {
uploadFile(fileList[i], userInfo)
.then(resolve);
} else {
noValueErrorHandler(errMsg);
reject();
}
} catch (err) {
hasValues = false;
reject();
}
};
if (rABS) reader.readAsBinaryString(f);
else reader.readAsArrayBuffer(f);
});
}
}
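If uploadFile should stay based on the jQuery-style deferreds shown in the question, one way to give it a Promise to await is to wrap the completion path. This is only a sketch: it omits the list-item update and chaining steps for brevity, and assumes getFileBuffer and addFileToFolder keep their current signatures:
function uploadFile(f, userInfo) {
  // userInfo kept for parity with the original signature
  return new Promise(function (resolve, reject) {
    var getFile = getFileBuffer(f["file"]);
    getFile.done(function (arrayBuffer) {
      var addFile = addFileToFolder(arrayBuffer, f["fileName"]);
      addFile.done(resolve); // upload finished for this file
      addFile.fail(reject);
    });
    getFile.fail(reject);
  });
}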

Using Node.js to retrieve the results count from Google

To give the specific details:
I am designing a Discord Bot that uses the eris library for private use. One requested feature is a googlefight command which needs to do the following:
Take user input.
Use input to generate a Google search URL.
Pull the resulting HTML page and extract from it the text from within <div class="sd" id="resultStats">.
Output this result as a message.
The currently existing related code is:
const Eris = require("eris");
const express = require('express');
const request = require('request');
const cheerio = require('cheerio');
const app = express();
...
bot.registerCommand("googlefight", (msg, args) => {
if(args.length === 0) {
return "Invalid input";
}
var arrayLength = args.length;
var searchURL = "https://www.google.com/search?q=";
for (var i = 0; i < arrayLength; i++) {
searchURL = searchURL.concat(args[i]);
if (i + 1 < arrayLength) {
searchURL = searchURL.concat("%20");
} else {
}
}
var text;
app.get('/', function(req, res){
request(searchURL, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
$(.sd).filter(function(){
var data = $(this);
text = data.children().first().text();
})
}
})
})
return text;
}, {
description: "Result counter",
fullDescription: "The bot will return the result count of a google search term.",
usage: "<text>"
});
I am unfamiliar with this type of work.
The part specifically that is broken is this section:
...
app.get('/', function(req, res){
request(searchURL, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
$(.sd).filter(function(){
var data = $(this);
text = data.children().first().text();
})
}
})
})
...
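For reference, cheerio selectors must be quoted strings, so $(.sd) is a syntax error, and text cannot be returned synchronously because request is asynchronous. A minimal sketch of the extraction step (assuming the results page still contains an element with id resultStats; Google changes this markup from time to time):
request(searchURL, function (error, response, html) {
  if (error) return;
  var $ = cheerio.load(html);
  // quote the selector: '#resultStats' (or '.sd')
  var resultText = $('#resultStats').first().text();
  console.log(resultText); // use this inside the callback, e.g. send it as a message, rather than returning it
});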

Argument handling for nested Node.js requests

Some parent pages contain child-page anchors that I need. I want to crawl all the parent pages, parse them, get the child anchors, follow each anchor, and get the result. But when I wrote the code, I found that the anchor URL didn't change before I followed it. Here's my code:
var req = require('request');
var cheerio = require('cheerio')
var model = require('./model')
function callnext(index){
var url = 'http://www.youku.com/show_episode/id_z2c9b63e691e611e2b356.html?dt=json&divid=reload_'+index+'&__rt=1&__ro=reload_21';
var result = req.get(url, function(error, response, body){
if (!error && response.statusCode == 200) {
var patt = /暂无内容/g;
var result = patt.test(body);
if(result){
return;
}
$ = cheerio.load(body);
var children = $('div').first().children();
for(var i=0;i<children.length;i++){
var item = $(children[i]);
var anchor = $(item.find('li>a')[0]).attr('href');
var labelText = $(item.find('label')[0]).text();
//TAG 1
req.get(anchor, function(error, response, body){
//TAG 2
console.log(anchor);
//here's my result
})
}
index = index+20;
callnext(index)
}
})
}
callnext(1);
In this code, if I console.log() the anchor URL at the TAG 1 and TAG 2 places, I get different results.
At TAG 1 it's the result I expect, but at TAG 2 it seems to print out only the first anchor of the parent page.
I tried changing the code and extracting the sub-request into its own function, and then I got the right result. Why?
var req = require('request');
var cheerio = require('cheerio')
var model = require('./model')
function crawlItem(url, text){
req.get(url, function(error, response, body){
console.log(url)
var inner = cheerio.load(body);
var text = inner('#text_long').text();
// model.Talk.create({ id: la, video: hr, youku_desc:text }).complete(function(err, album) {
// console.log(err);
// });
})
}
function callnext(index){
var url = 'http://www.youku.com/show_episode/id_z2c9b63e691e611e2b356.html?dt=json&divid=reload_'+index+'&__rt=1&__ro=reload_21';
var result = req.get(url, function(error, response, body){
if (!error && response.statusCode == 200) {
var patt = /暂无内容/g;
var result = patt.test(body);
if(result){
return;
}
$ = cheerio.load(body);
var children = $('div').first().children();
for(var i=0;i<children.length;i++){
var item = $(children[i]);
var anchor = $(item.find('li>a')[0]).attr('href');
var labelText = $(item.find('label')[0]).text();
// console.log(anchor);
crawlItem(anchor, labelText);
}
index = index+20;
callnext(index)
}
})
}
callnext(1);
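For what it's worth, the difference comes from var being function-scoped: there is only one anchor variable shared by every loop iteration, so by the time a req.get callback at TAG 2 runs, anchor no longer holds the value it had when that request was issued. Passing anchor into crawlItem (as above) captures the value per call; declaring it with let inside the loop does the same, as in this sketch of the inner loop:
for (let i = 0; i < children.length; i++) {
  const item = $(children[i]);
  const anchor = $(item.find('li>a')[0]).attr('href'); // fresh binding per iteration
  req.get(anchor, function (error, response, body) {
    console.log(anchor); // logs the anchor this request was made for
  });
}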

How can I get Node.js to return data once all operations are complete

I am just learning server-side JavaScript, so please bear with any glaring mistakes I've made.
I am trying to write a file parser that operates on HTML files in a directory and returns a JSON string once all files have been parsed. I started with a single file and it works fine: it loads the resource from Apache running on the same machine, injects jQuery, does the parsing, and returns my JSON.
var request = require('request'),
jsdom = require('jsdom'),
sys = require('sys'),
http = require('http');
http.createServer(function (req, res) {
request({uri:'http://localhost/tfrohe/Car3E.html'}, function (error, response, body) {
if (!error && response.statusCode == 200) {
var window = jsdom.jsdom(body).createWindow();
jsdom.jQueryify(window, 'http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js', function (window, jquery) {
// jQuery is now loaded on the jsdom window created from 'body'
var emps = {};
jquery("tr td img").parent().parent().each(function(){
var step = 0;
jquery(this).children().each(function(index){
if (jquery(this).children('img').attr('src') !== undefined) {
step++;
var name = jquery(this).parent().next().next().children('td:nth-child('+step+')').children().children().text();
var name_parts = name.split(",");
var last = name_parts[0];
var name_parts = name_parts[1].split(/\u00a0/g);
var first = name_parts[2];
emps[last + ",_" + first] = jquery(this).children('img').attr('src');
}
});
});
emps = JSON.stringify(emps);
//console.log(emps);
res.writeHead(200, {'Content-Type': 'text/plain'});
res.end(emps);
});
} else {
res.writeHead(200, {"Content-Type": "text/plain"});
res.end("empty");
//console.log(response.statusCode);
}
});
}).listen(8124);
Now I am trying to extend this to using the regular file system (fs) and get all HTML files in the directory and parse them the same way and return a single combined JSON object once all files have been parsed. Here is what I have so far but it does not work.
var sys = require("sys"),
fs = require("fs"),
jsdom = require("jsdom"),
emps = {};
//path = '/home/inet/www/media/employees/';
readDirectory = function(path) {
fs.readdir(path, function(err, files) {
var htmlfiles = [];
files.forEach(function(name) {
if(name.substr(-4) === "html") {
htmlfiles.push(name);
}
});
var count = htmlfiles.length;
htmlfiles.forEach(function(filename) {
fs.readFile(path + filename, "binary", function(err, data) {
if(err) throw err;
window = jsdom.jsdom(data).createWindow();
jsdom.jQueryify(window, 'http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js', function (window, jquery) {
jquery("tr td img").parent().parent().each(function(){
var step = 0;
jquery(this).children().each(function(index){
if (jquery(this).children('img').attr('src') !== undefined) {
step++;
var empname = jquery(this).parent().next().next().children('td:nth-child('+step+')').children().children().text();
var name_parts = empname.split(",");
var last = name_parts[0];
var name_parts = name_parts[1].split(/\u00a0/g);
var first = name_parts[2]
emps[last + ",_" + first] = jquery(this).children('img').attr('src');
}
});
});
});
});
});
});
}
readDirectory('/home/inet/www/media/employees/', function() {
console.log(emps);
});
In this particular case, there are 2 HTML files in the directory. If I console.log(emps) during the htmlfiles.forEach() it shows me the results from the first file, then the results for both files together, the way I expect. How do I get emps returned to readDirectory so I can output it as desired?
Completed Script
After the answers below, here is the completed script with an HTTP server to serve up the detail.
var sys = require('sys'),
fs = require("fs"),
http = require('http'),
jsdom = require('jsdom'),
emps = {};
var timed = setInterval(function() {
emps = {};
readDirectory('/home/inet/www/media/employees/', function(emps) {
});
}, 3600000);
readDirectory = function(path, callback) {
fs.readdir(path, function(err, files) {
var htmlfiles = [];
files.forEach(function(name) {
if(name.substr(-4) === "html") {
htmlfiles.push(name);
}
});
var count = htmlfiles.length;
htmlfiles.forEach(function(filename) {
fs.readFile(path + filename, "binary", function(err, data) {
if(err) throw err;
window = jsdom.jsdom(data).createWindow();
jsdom.jQueryify(window, 'http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js', function (window, jquery) {
var imagecount = jquery("tr td img").length;
jquery("tr td img").parent().parent().each(function(){
var step = 0;
jquery(this).children().each(function(index){
if (jquery(this).children('img').attr('src') !== undefined) {
step += 1;
var empname = jquery(this).parent().next().next().children('td:nth-child('+step+')').children().children().text();
var name_parts = empname.split(",");
var last = name_parts[0];
var name_parts = name_parts[1].split(/\u00a0/g);
var first = name_parts[2]
emps[last + ",_" + first] = jquery(this).children('img').attr('src');
}
});
});
count -= 1;
if (count <= 0) {
callback(JSON.stringify(emps));
}
});
});
});
});
}
var init = readDirectory('/home/inet/www/media/employees/', function(emps) {
});
http.createServer(function (req, res) {
res.writeHead(200, {'Content-Type': 'text/plain'});
res.end(JSON.stringify(emps));
}).listen(8124);
That sure is a lot of code, and there are a couple of mistakes:
You're never calling the callback function you supply to readDirectory
You need to keep track of the files you have parsed; when you have parsed all of them, call the callback and supply the emps
This should work:
var sys = require("sys"),
fs = require("fs"),
jsdom = require("jsdom");
//path = '/home/inet/www/media/employees/';
// This is a nicer way
function readDirectory(path, callback) {
fs.readdir(path, function(err, files) {
// make this local
var emps = {};
var htmlfiles = [];
files.forEach(function(name) {
if(name.substr(-4) === "html") {
htmlfiles.push(name);
}
});
// Keep track of the number of files we have parsed
var count = htmlfiles.length;
var done = 0;
htmlfiles.forEach(function(filename) {
fs.readFile(path + filename, "binary", function(err, data) {
if(err) throw err;
window = jsdom.jsdom(data).createWindow();
jsdom.jQueryify(window, 'http://ajax.googleapis.com/ajax/libs/jquery/1.4.4/jquery.min.js', function (window, jquery) {
jquery("tr td img").parent().parent().each(function(){
var step = 0;
jquery(this).children().each(function(index){
if (jquery(this).children('img').attr('src') !== undefined) {
step++;
var empname = jquery(this).parent().next().next().children('td:nth-child('+step+')').children().children().text();
var name_parts = empname.split(",");
var last = name_parts[0];
var name_parts = name_parts[1].split(/\u00a0/g);
var first = name_parts[2]
emps[last + ",_" + first] = jquery(this).children('img').attr('src');
}
});
});
// As soon as all have finished call the callback and supply emps
done++;
if (done === count) {
callback(emps);
}
});
});
});
});
}
readDirectory('/home/inet/www/media/employees/', function(emps) {
console.log(emps);
});
You seem to be doing this a tad wrong
readDirectory('/home/inet/www/media/employees/', function() {
console.log(emps);
});
But you've defined your function as:
readDirectory = function(path) {
Where is the callback argument? Try this:
readDirectory = function(path, callback) {
then under emps[last + ",_" + first] = jquery(this).children('img').attr('src'); put
callback.call(null, emps);
Your callback function will then be called once per loop iteration. If you want everything returned at once, count how many times the loop will run, count up as each iteration finishes, and call your callback only when emps contains all the data you need.
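A minimal sketch of that counting pattern (parseFile here is a hypothetical stand-in for the jsdom/jQueryify parsing step, which calls back once its file is done, and emps is the same object the parsing fills in):
readDirectory = function (path, callback) {
  fs.readdir(path, function (err, files) {
    var htmlfiles = files.filter(function (name) {
      return name.substr(-4) === "html";
    });
    var remaining = htmlfiles.length; // files still being parsed
    htmlfiles.forEach(function (filename) {
      parseFile(path + filename, function () {
        remaining -= 1;
        if (remaining === 0) callback(emps); // every file has been parsed
      });
    });
  });
};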
