I am writing a web scraper that makes multiple requests based on a list that looks like this
1. Category1
1a. categoryItem1
1b. categoryItem2
2. Category2
2a. categoryItem1
2b. categoryItem2
2c. categoryItem3
3. Category3
3a. categoryItem1
Both Category and categoryItem are links. Only 1 Category can be expanded at a time. The amount of Categories and categoryItems can change so I don't know the exact amount before hand.
I am gathering the data on each categoryItem page to be saved in a json that looks like this
{
"Category1": [
"categoryItem1: {
// Details saved here
},
"categoryItem2: {
// Details saved here
}
],
"Category2": [
"categoryItem1: {
// Details saved here
},
"categoryItem2: {
// Details saved here
},
"categoryItem3: {
// Details saved here
}
],
"Category3": [
"categoryItem1: {
// Details saved here
}
]
}
The only thing left for me is to figure out how to make this act synchronous
Get the opening page
Open each Category list
Open each categoryItem details page
THIS was the web scraper tutorial that I followed, if you would like to know. Due to async calls I don't know when the very last page is parsed, so here is the structure of the script
server.js
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
app.get('/scrape', function (req, res) {
globalJSON = {};
baseUrl = 'http://...';
// 1.) open page with list
request.get(baseUrl, function (error, response, html) {
if (!error) {
var $ = cheerio.load(html);
// select the list
$('#categoryListSelector').filter(function () {
var data = $(this);
var listItem = data.find('#listItemSelector');
var expansionLink = listItem.find('a').attr('href'); //
var category = listItem.find('font').text();
// Save category to global json
globalJSON[category] = [];
// 2.) Expand the list by opening expansionLink
request.get(baseUrl + expansionLink, function (error, response, html) {
if (!error) {
var $ = cheerio.load(html);
// Select the sub items of each list item
$('#subItem selector').filter(function () {
var data = $(this);
var categoryItemPageLinkElement = data.find('a');
var categoryItemName = categoryItemPageLinkElement.text();
var categoryItemLink = $(categoryItemPageLinkElement).attr('href');
if (typeof categoryItemLink != "undefinded" && categoryItemLink != null && categoryItemLink != "") {
categoryItemObject = {}; // { categoryItemName: categoryItemDetails }
categoryItemDetails = {};
// 3.) Open the categoryItem page to start gathering data
request.get(baseUrl + categoryItemLink, function (error, response, html) {
if (!error) {
var $ = cheerio.load(html);
// GATHER and save data here
// Done gathering data save to global json
categoryItemObject[categoryItemName] = categoryItemDetails;
globalJSON[category].push(categoryItemObject);
}
});
}
});
}
});
});
fs.writeFile('output.json', JSON.stringify(globalJSON, null, 4), function (err) {
console.log('File successfully written!');
});
res.send(globalJSON);
}//END if(!error)
});
})//END app.get()
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
Update
I did get my issue solved with some help from the feller below, and this is what I came up with. Now, there might be a better way, feel free to let me know.
Basic Layout
Promise.all(categoriesArr.map(categoryObj => new Promise((resolve, reject)=>{
request.get(baseUrl + categoryObj.categoryItemLink, (error, response, html)=>{
if(error){
return reject(error);
}
//build an array of ALL the categoryItemLinks
return resolve(res, html);
});
}))).then(function(statesArray) {
Promise.all(allCategoryItems.map(categoryItemObject => new Promise((resolve, reject)=>{
request.get(baseUrl + categoryItemObject.categoryItemPageLink, (error, response, html)=>{
if(error){
return reject(error);
}
// Gather Data and put into dataJson
return resolve(response, html);
});
}))).then(function(data) {
// Do finishing stuff
}).catch(/*error*/);
}).catch(/*error*/);
server.js
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
app.get('/scrape', function (req, res) {
categoriesArr = [];
allCategoryItems = [];
dataJson = {}; // Global json to hold all the data
baseUrl = 'http://www.blahblah.org';
request.get(baseUrl, function(error, response, html) {
if (!error) {
var $ = cheerio.load(html);
$('#categorySelector').filter(function() {
var data = $(this);
var categoryItemLink = data.find('a').attr('href');
categoriesArr.push({
"categoryItemLink": categoryItemLink
});
});
Promise.all(categoriesArr.map(categoryObj => new Promise((resolve, reject)=>{
request.get(baseUrl + categoryObj.categoryItemLink, (error, response, html)=>{
if(error){
return reject(error);
}
var $ = cheerio.load(html);
$('#categoryItemSelector').filter(function() {
var data = $(this);
var categoryItemPageLinkElement = data.find('a');
var categoryItemPageLink = $(categoryItemPageLinkElement).attr('href');
if(typeof categoryItemPageLink != "undefinded" && categoryItemPageLink != null && categoryItemPageLink != "") {
allCategoryItems.push({
"categoryItemPageLink": categoryItemPageLink
});
}
});
return resolve(res, html);
});
}))).then(function(statesArray) {
Promise.all(allCategoryItems.map(categoryItemObject => new Promise((resolve, reject)=>{
request.get(baseUrl + categoryItemObject.categoryItemPageLink, (error, response, html)=>{
if(error){
return reject(error);
}
var $ = cheerio.load(html);
// Gather Data and put into dataJson
return resolve(response, html);
});
}))).then(function(data) {
// Do finishing stuff
}).catch(/*error*/);
}).catch(/*error*/);
}//END if(!error)
});
})//END app.get()
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
You can use Promise.all(), so something like:
Promise.all(urls.map(url => new Promise((resolve, reject)=>{
request.get(url, (err, res, html)=>{
if(err){
return reject(err);
}
return resolve(res, html);
});
}))).then(/*success*/).catch(/*error*/);
In that code, the .then() executes after all requests have come back with a response.
Related
In the following node function, it is returning an empty array. Not sure why its doing that. Could this be a async await issue? Would appreciate any help. Thank you
const folderPath = '/public/home.html'
function getCircuitAndFuse(folderPath){
//List containing circuit name with its fuse
let temporaryList = [];
let finalCircuitAndFuseList = []
fs.readFile(__dirname + folderPath, (error, data)=>{
if(error){
console.log(`Unable to read file: ${error}`)
}else{
var $ = cheerio.load(data)
$('img').each(function(index, element){
let getClassAtr = element.attribs.class
temporaryList.push(getClassAtr.slice(0, getClassAtr.lastIndexOf(" ")))
})
finalCircuitAndFuseList = [...new Set(temporaryList)]
}
})
return finalCircuitAndFuseList;
}
let getInfo = getCircuitAndFuse(folderPath)
// Returning empty array
console.log(getInfo)
***Server code****
const server = http.createServer(function(req, res){
res.writeHead(200, {'Content-Type': 'text/plain'})
res.end()
}).listen(port, ()=>{
console.log(`Server listening on port ${port}. Press Ctrl-C to terminate...`)
})
getCircuitAndFuse must return Promise like this:
function getCircuitAndFuse(folderPath) {
return new Promise((resolve, reject) => {
//List containing circuit name with its fuse
let temporaryList = [];
fs.readFile(__dirname + folderPath, (error, data) => {
if (error) {
console.log(`Unable to read file: ${error}`);
} else {
var $ = cheerio.load(data);
$('img').each(function (index, element) {
let getClassAtr = element.attribs.class;
temporaryList.push(
getClassAtr.slice(0, getClassAtr.lastIndexOf(' '))
);
});
resolve([...new Set(temporaryList)]);
}
});
});
}
getCircuitAndFuse(folderPath).then((getInfo) => {
// do something with `getInfo`
});
Another alternative to Faruk's answer would be to just use fs.readFileSync instead of wrapping your function in a promise and requiring some of that extra ceremony. Using fs.readFileSync will ensure that your function doesn't return prematurely.
Here is your code rewritten with that in mind:
function getCircuitAndFuse(folderPath) {
try {
let temporaryList = [];
const data = fs.readFileSync(__dirname + folderPath);
const $ = cheerio.load(data);
$("img").each(function (index, element) {
let getClassAtr = element.attribs.class;
temporaryList.push(getClassAtr.slice(0, getClassAtr.lastIndexOf(" ")));
});
return [...new Set(temporaryList)];
} catch (error) {
console.log(error);
}
}
I use request module to download a zip file that cointain a .csv file, then i use pipe to read the content with unzip and split modules and then i parse and write result into mongodb with mongoose-object-stream module.
My code:
//index.js
var request = require('request');
var bun = require('bun');
var split = require('split');
var unzip = require('./lib/unzip');
var tomongo = require('./lib/tomongo');
var pipeline = bun([ unzip(), split()]);
request.get( "http://someurl/somefile.zip" )
.pipe( pipeline )
.pipe( tomongo() );
//tomongo.js
var mySchema = require('../schema.json');
var through = require('through2');
var mos = require('mongoose-object-stream');
var mongoose = require('mongoose');
var models = require('../models')
const dbpath = "mongodb://localhost:27017/test";
const mongo = mongoose.connect(dbpath, {useNewUrlParser: true });
mongo.then(() => {
console.log('mongoDB connected');
}).catch((err) => {
console.log('err', err);
});
var db = mongoose.connection;
db.on('error', console.error.bind(console, 'connection error:'));
var modelStream = new mos(models.books);
function parser(){
var columns = mySchema;
var parseandwrite = function( chunk, _, cb ){
var row = {}, cells = chunk.toString('utf-8').split('\t');
cells.forEach( function( cell, i ){
row[ columns[ i ] ] = ( cell || '' ).trim();
});
if( !!chunk ){
modelStream.write( row );
}
cb();
};
return through.obj( parseandwrite );
}
module.exports = parser;
I want to do something when the stream ends and all records are stored in the db.
I tried adding to pipe .on('finish', function(){process.exit()}) or .on('end', function(){process.exit()}) but node continue running.
I did it! Through2 need .on("data", function(){}) before the .on("end"...
Now the process gracefully disconnect the database and exit.
var request = require('request');
var bun = require('bun');
var split = require('split');
var unzip = require('./lib/unzip');
var tomongo = require('./lib/tomongo');
var pipeline = bun([unzip(), split()]);
function streamToDB(url) {
return new Promise((resolve, reject) => {
request.get(url)
.pipe(pipeline)
.pipe(tomongo())
.on("data", function(data){
new aModel( data ).save();}) //here i save to the db
.on("error", reject)
.on("end", resolve);
});
}
mongoose.connect("mongodb://localhost:27017/test", {
useNewUrlParser: true
}).then(() => {
console.log('mongoDB connected');
return streamToDB("http://someurl/somefile.zip")
}).catch((err) => {
console.log('err', err);
}).then(() => {
return mongoose.disconnect();
});
//tomongo.js
var parseandwrite = function( chunk, _, cb ){
var row = {}, cells = chunk.toString('utf-8').split('\t');
cells.forEach( function( cell, i ){
row[ columns[ i ] ] = ( cell || '' ).trim();
});
if( !!chunk ){
this.push( row ); //here i push the row to the stream
}
cb();
};
Assuming that your parser method is not the problem here I would suggest moving the database connection logic into your index, you should connect to the DB before attempting to stream data to it. If you wrap the streaming logic in a Promise you can do DB connection handling logic in one Promise chain.
Here's an example of what that might look like:
var Promise = require('bluebird');
var mongoose = require('mongoose');
var MongooseObjectStream = require('mongoose-object-stream');
var request = require('request');
var split = require('split');
var through = require('through2');
var unzip = require('unzip-stream');
function streamToDB(url) {
return new Promise((resolve, reject) => {
request.get(url)
.pipe(unzip.Parse())
.pipe(through.obj(function (entry, enc, cb) {
if (entry.path === 'file_with_content') {
entry.on('end', cb)
.on('error', cb)
.on('data', (data) => this.push(data));
} else {
entry.autodrain()
.on('error', cb)
.on('finish', cb);
}
}))
.pipe(split())
.pipe(through.obj((line, enc, cb) => {
cb(null, line.split('\t')); // Convert to "real" object here
}))
.pipe(new MongooseObjectStream(mongoose, 'Model', {}, { strict: false }))
.on('error', reject)
.on('finish', resolve);
});
}
mongoose.connect('mongodb://localhost:27017/test', {
useNewUrlParser: true,
promiseLibrary: Promise
}).then(() => {
return streamToDB('http://someurl/somefile.zip')
.finally(() => mongoose.disconnect());
}).catch((err) => {
console.error(err);
});
I am trying to find a way to get the currently logged in user and than append them to a JSON file. Below is my code to first read the dir, then get the most recent file, return it and then append the current user that is logged in.
I can append a string to the file but when trying to perform req.user it states
Cannot read property 'user' of undefined
What would I need to include in this file so that it knows what user is?
let fs = require("fs"),
express = require("express"),
_ = require("underscore"),
User = require("./models/user"),
path = require("path");
let getFileAddUser = () => {
let filePath = '../automation_projects/wss-automation-u/results/temp/';
fs.readdir(filePath, (err, files) => {
if (err) { throw err; }
let file = getMostRecentFile(files, filePath);
console.log(file);
fs.readFile(filePath + file, 'utf8', (err, data) => {
let json = JSON.parse(data);
if(err){
console.error(err);
return;
} else {
//Un-comment to write to most recent file.
//==================================================
//This should find the currently logged in user and append them to the most recent file found.
json.currentuser = req.user;
fs.writeFile(filePath + file, JSON.stringify(json), (error) => {
if(error){
console.error(error);
return;
} else {
console.log(json);
}
});
//==================================================
console.log(data);
}
});
});
};
//Get the most recent file from the results folder.
function getMostRecentFile(files, path) {
let out = [];
files.forEach(function(file) {
let stats = fs.statSync(path + "/" +file);
if(stats.isFile()) {
out.push({"file":file, "mtime": stats.mtime.getTime()});
}
});
out.sort(function(a,b) {
return b.mtime - a.mtime;
})
return (out.length>0) ? out[0].file : "";
}
module.exports = getFileAddUser;
Thanks to a knowledgeable co-worker and some further research we were able to get this working. I'd like to share the code we came up with to append the currently logged in user to our results file. You will also notice we got some help using the Ramada.js library.
let fs = require("fs"),
express = require("express"),
_ = require("underscore"),
User = require("./models/user"),
r = require("ramda"),
path = require("path");
//This will be our function to get the most recent file from our dir and
//return it to us. We than user this function below.
function getMostRecentFile(files, path) {
let out = [];
let f = r.tail(files);
console.log(files);
f.forEach(function(file) {
let stats = fs.statSync(path + "/" +file);
if(stats.isFile()) {
out.push({"file":file, "mtime": stats.mtime.getTime()});
}
});
out.sort(function(a,b) {
return b.mtime - a.mtime;
})
return (out.length>0) ? out[0].file : "";
}
//Passing in 'u' as a argument which can than be used in a route and pass in
//anything that we want it to be. In our case it was the currently logged
//in user.
let getUser = (u) => {
let user = u;
let filePath = '../automation_projects/wss-automation-u/results/temp/';
//Comment above and uncomment below for testing locally.
// let filePath = "./temp/";
let file = "";
//Below we read our dir then get the most recent file using the
//getMostRecentfile function above.
read_directory(filePath).then( files => {
file = getMostRecentFile(files, filePath)
console.log(file);
return(read_file(filePath + file))
}).then( x => {
// Here we parse through our data with x representing the data that we
//returned above.
let json = JSON.parse(x);
return new Promise(function(resolve, reject) {
json.currentuser = u;
//And finally we write to the end of the latest file.
fs.writeFile(filePath + file, JSON.stringify(json), (error) => {
if(error) reject(error);
else resolve(json);
// console.log(json);
});
});
});
}
let read_directory = (path) => {
return new Promise((resolve, reject) => {
fs.readdir(path, (err, items) => {
if (err){
return reject(err)
}
return resolve([path, ...items])
})
})
}
let read_file = (path) => {
return new Promise((resolve, reject) => {
fs.readFile(path, "utf8", (err, items) => {
if (err){
return reject(err)
}
return resolve(items)
})
})
}
module.exports = getUser;
Than below is an example route with how to use the getUser module. You will want to require it like you do everything else with node.js and dependencies. Hope this helps someone in the future.
let getUser = require("getuser");
//Make a route to use the getUser module and pass in our argument value.
app.get("/", (req, res) => {
//With in the get user function pass in whatever you want to equal 'u' from the getuser module.
getUser(req.user.username);
res.render("index", { username: req.user });
});
Hi I'm used to work with c#, I'm new with node js. I'm trying to create some kind of web service using Node Js. I'm using VS 2017 community with node js version 8. I have next code:
Express App1
apps.js
app.param('phone', function (request, response, next, phone) {
// ... Perform database query and
// ... Store the user object from the database in the req object
req.phone = phone;
return next();});
index.js
'use strict';
var express = require('express');
var router = express.Router();
var Utils = require("./JavaScript1");
/* GET home page. */
router.get('/', function (req, res) {
res.render('index', { title: 'Express' });});
router.get('/byPhone/:phone', function (req, res) {
var t = Utils.Phone(req.params.phone).then(value => { return value });
//At this point if i try to use await or consume by using web
//http://localhost:1337/byPhone/777777 i only get a promise ...
res.send(t);
});
module.exports = router;
JavaScript1.js
function sAdd(sPhone) {
return new Promise((resolve, reject) => { // (A)
setTimeout(() => resolve("01800" + sPhone), 5000); // (B)
});}
var utils = {
//at this point in the temp and temp2 variables only have promises
Phone:
async function (sPhone) {
var temp = await sAdd(sPhone).then(value => { return value });
var temp2 = await temp;
return temp2;
}
};
module.exports = utils;
I'm trying to consume it with the next Node Js Console App:
'use strict';
async function main()
{
console.log('Hello world');
var url = "http://localhost:1337/byPhone/777777";
request(url, function (err, response, body) {
if (err) { console.log(err); callback(true); return; }
var tt = body;
console.log(tt);
});
}
main();
i get the below answer:
(node:19492) UnhandledPromiseRejectionWarning: Unhandled promise
rejection (rejection id: 1): ReferenceError: request is not defined
I forgot to answer myself in this question, but I resolved what I wanted to do at that time this way:
I created a solution with 2 projects:
c# controllers webapp
on the c# side:
[Produces("application/json")]
[Route("test/Mondb")]
public class MondbController : Controller
{
// GET: api/Mondb
[HttpGet]
public string Get()
{
var client = new MongoClient("mongodb://localhost:27017");
var database = client.GetDatabase("upixTest");
var collection = database.GetCollection<BsonDocument>("contacts");
var document = collection.Find(new BsonDocument()).ToList();
return document.ToJson(new JsonWriterSettings { OutputMode = JsonOutputMode.Strict });
}
// GET: api/Mondb/5
[HttpGet("{id}", Name = "Getmdb")]
public string Get(int id)
{
var filter = Builders<BsonDocument>.Filter.Eq("phone", id.ToString());
var client = new MongoClient("mongodb://localhost:27017");
var database = client.GetDatabase("upixTest");
var collection = database.GetCollection<BsonDocument>("contacts");
var document = collection.Find(filter).FirstOrDefault();
return (document==null) ? "nothing":document.ToJson(new JsonWriterSettings { OutputMode = JsonOutputMode.Strict });
}
on the other side the node js project
functions.js
thefunctions = {
add: function (a, b) { return a + b; }, // test function
mongo: function () {
var axios = require('axios');
var tresult;
return axios.get('http://localhost:55384/test/Mondb/');
}
};
module.exports = thefunctions;
then in the server.js
'use strict';
var http = require('http');
var fs = require('fs');
var url = require('url');
var port = process.env.PORT || 1337;
var thefunctions = require("./thefunctions");
var dataToShow = "";
var JSON = require('JSON');
http.createServer(function (req, res) {
var hostname = req.headers.host;
var pathname = url.parse(req.url).pathname;
var fullurl = 'http://' + hostname + pathname;
var search = url.parse(req.url).search ? url.parse(req.url).search : "";
if (pathname === "/index.html") {
res.writeHead(200, { 'Content-Type': 'text/plain' });
res.write('hi my friend\n');
var tout = thefunctions.mongo();
tout.then(function (response) {
res.write(JSON.stringify(response.data).toString());
res.write('\n');
res.end('\n\nThe End\n');
})
.catch(function (error) {
console.log(error);
res.write("error");
res.end('\n\nThe End\n');
});
}
else {
res.writeHead(400, { 'Content-Type': 'text/plain' });
res.write('error my friend\n');
res.end('404 isnt available');
return;
}
}
).listen(port);
this way you will consume c# wb services from node js app
I am using the request library in javascript.
If I use
function executeYQL(q) {
const uri = 'http://query.yahooapis.com/v1/public/yql'
const qs = {
q: encodeURIComponent(q),
format: 'json',
env: 'http://datatables.org/alltables.env'
};
request(uri, qs, (err, res, body) => {
if (!err && res.statusCode === 200) {
return JSON.parse(body);
} else {
console.log(res.statusCode);
console.log(err);
}
});
};
exports.getStocks = (req, res) => {
const q = 'select * from yahoo.finance.historicaldata where symbol = "YHOO" AND startDate = "2009-09-11" and endDate = "2010-03-10"';
const json = executeYQL(q);
res.json(json);
};
it results in status code 400, but if I use
const uri = 'http://query.yahooapis.com/v1/public/yql?q=' + encodeURIComponent(q) + '&format=json&env=http://datatables.org/alltables.env'
it retrieves the data correctly; however, the variable json inside getStocks() is undefined.
I don't know why it's not populated. Do I have to use Promise? I guess it's because of synchronous call, so it runs res.json(json) before const json = executeYQL(q) is done?
How do I make my code in each line wait until the code previous lines are executed?