Can't seem to get the pipe function working properly, as I am getting a "TypeError: Object # has no method 'join'" error. My input file is a very basic comma-delimited CSV, nothing too fancy.
I'd like to pipe the output directly to my response.
// (requires assumed from the csv toolkit: csv-parse and stream-transform)
var fs = require('fs');
var parse = require('csv-parse');
var transform = require('stream-transform');

var output = [];
var parser = parse({auto_parse: true, columns: true});
var input = fs.createReadStream('./uploads/' + req.body.file);
var transformer = transform(function (record, callback) {
    setTimeout(function () {
        callback(null, record.join(' ') + '\n');
    }, 500);
}, {parallel: 10});

transformer.on('error', function (err) {
    res.send(500, err);
    console.log(output);
});

transformer.on('finish', function () {
    console.log('finish');
    console.log(output);
});

input.pipe(parser).pipe(transformer).pipe(process.stdout);
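Note: with columns: true, csv-parse emits each record as a plain object keyed by column name, not an array, which is why record.join fails. A minimal sketch of a workaround, assuming the csv-parse and stream-transform modules required above:

// Join the record's values instead of calling .join on the record itself,
// since {columns: true} turns each row into a plain object.
var transformer = transform(function (record, callback) {
    var line = Object.keys(record).map(function (k) { return record[k]; }).join(' ');
    callback(null, line + '\n');
}, {parallel: 10});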
I am trying to parse some data from several web pages using JavaScript. I wrote a small parser for this purpose. The algorithm looks like this:
1. Open the first URL from my .csv file
2. Find the data I need on the page
3. Save the URL and data to a JSON file
My code executes steps 1 and 2 perfectly but sometimes messes up step 3. The output looks like this:
URL 1 + data from URL 1 (correct line)
URL 2 + data from URL 2 (correct line)
URL 3 + data from URL 3 (correct line)
URL 4 + data from URL 4 (correct line)
URL 6 (wrong URL) + data from another URL
URL 5 (wrong URL) + data from another URL
URL 7 + data from URL 7 (correct line)
URL 8 + data from URL 8 (correct line)
URL 9 + data from URL 9 (correct line)
I assume the problem is that some pages take too long to load, which messes up the whole process. But I still don't understand why it sometimes saves the wrong data.
Here's my code:
var request = require('request');
var cheerio = require('cheerio');
var cloudscraper = require('cloudscraper');
var fs = require('fs');
var path = require('path');
var csvjson = require('csvjson');
//First, we read .csv file with our URL list
function getTheList() {
    urlList = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding : 'utf8'});
    var options = {
        delimiter : ';', // optional
        quote : '"' // optional
    };
    urlList = csvjson.toObject(urlList, options);
    end = urlList.length;
    logs = [];
    //here we start the loop reading and saving data from each url
    for (let p = 0; p < end; p += 1){
        grabTheData(urlList, p)
    }
}
//this code extracts the data from the page and saves it to a json file
function grabTheData(urlList, p){
    setTimeout(function() {
        url = urlList[p].ItemLink;
        cloudscraper.get(url, function(err, res, body){
            if (err) {
                console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
                callback();
            } else {
                var $ = cheerio.load(body);
                /*
                here are the lines which extract the data I need
                dataINeed = ...;
                */
                logs.push({
                    url, dataINeed
                });
                fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
            }
        });
    //here I set a 2-second delay between each URL
    }, 2000 * p);
}
getTheList()
The reason this is happening is that url in grabTheData is an implicit global, shared by every setTimeout callback: by the time a slow response comes back, a later iteration may already have overwritten it, so the result gets paired with the wrong URL.
There is a very quick fix for this: simply change the scope of the url variable so each callback closes over its own copy, like so:
function grabTheData(urlList, p){
    setTimeout(function() {
        // Declare url with let so each callback gets its own block-scoped copy
        let url = urlList[p].ItemLink;
        cloudscraper.get(url, function(err, res, body){
            if (err) {
                console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
                callback();
            } else {
                var $ = cheerio.load(body);
                /*
                here are the lines which extract the data I need
                dataINeed = ...;
                */
                logs.push({
                    url, dataINeed
                });
                fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
            }
        });
    //here I set a 2-second delay between each URL
    }, 2000 * p);
}
This should keep your results in order.
Here's another (IMHO much better) option, using promises and avoiding the use of setTimeout to separate calls. This should avoid any potential race condition, since the Promise.all call will preserve order:
async function getTheList() {
    let urlList = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding : 'utf8'});
    var options = {
        delimiter : ';', // optional
        quote : '"' // optional
    };
    urlList = csvjson.toObject(urlList, options);
    let promiseList = urlList.map(urlEntry => grabTheDataUpdated(urlEntry.ItemLink));
    let logs = await Promise.all(promiseList);
    fs.writeFileSync('./new_logs.json', JSON.stringify(logs, null, 4));
}
// Promisified version of cloudscraper.get
function getCloudScraperData(url) {
    return new Promise((resolve, reject) => {
        cloudscraper.get(url, (err, res, body) => {
            if (err) {
                reject(err);
            } else {
                resolve({ url, res, body });
            }
        });
    });
}

function getDataINeed(url, body) {
    // Use cheerio to process body...
    // Return mock data for now; replace with actual data processed by cheerio.
    return `data from ${url}`;
}

async function grabTheDataUpdated(url) {
    try {
        let result = await getCloudScraperData(url);
        let dataINeed = getDataINeed(result.url, result.body);
        return { url, dataINeed };
    } catch (error) {
        return { url, dataINeed: "Error occurred: " + error.message };
    }
}
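A sketch of how you might kick this off from a plain script (getTheList is async, so it returns a promise):

getTheList()
    .then(() => console.log('All URLs processed; results written to new_logs.json'))
    .catch(err => console.error('Failed to process the list:', err));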
I am quite new to AngularJS and Cordova, and am having some trouble solving simple issues.
I was studying the Cordova-Plugin-File and am not able to get it running in my AngularJS mobile application. Could someone help me?
Here is the code, which I took from examples on the web.
javascript:
var logOb = "";
window.resolveLocalFileSystemURL(cordova.file.dataDirectory, function (dir) {
    console.log("got main dir", dir);
    dir.getFile("log.txt", { create: true }, function (file) {
        console.log("got the file", file);
        logOb = file;
        writelog("App started");
    });
});
function writelog(str) {
    if (!logOb) alert('teste2');
    var log = str + " [" + (new Date()) + "]\n";
    console.log("going to log " + log);
    logOb.createWriter(function (fileWriter) {
        fileWriter.seek(fileWriter.length);
        var blob = new Blob([log], { type: 'text/plain' });
        fileWriter.write(blob);
        console.log("ok, in theory it worked");
    }, fail);
}
// fail was missing from the snippet; a minimal handler so createWriter's error path works
function fail(error) {
    console.log("createWriter error", error);
}
Note that I have already tested the plugin and it is working well. I used this tutorial to test it:
https://www.youtube.com/watch?v=bizW561ALsY
The commands returned the folders on my phone as expected. I am only having a problem with the example above: when it reaches the 'createWriter' call, it is unable to get past it.
If anyone could help me I would be really grateful.
Update:
While I was waiting for a response, I found some ways to start doing what I need. First, I was able to create the .csv file:
$scope.criaCSV = function () {
    // Create object
    var items = [
        { name: "Item 1", color: "Green", size: "X-Large" },
        { name: "Item 2", color: "Green", size: "X-Large" },
        { name: "Item 3", color: "Green", size: "X-Large" }];
    // Convert object to JSON
    var jsonObject = JSON.stringify(items);
    // Convert JSON to CSV & display CSV
    ConvertToCSV(jsonObject);
};
function ConvertToCSV(objArray) {
    var array = typeof objArray != 'object' ? JSON.parse(objArray) : objArray;
    var str = '';
    for (var i = 0; i < array.length; i++) {
        var line = '';
        for (var index in array[i]) {
            if (line != '') line += ','
            line += array[i][index];
        }
        str += line + '\r\n';
    }
    return str; // the snippet was missing the return and closing brace
}
And later I was able to save a .txt file on my phone:
document.addEventListener("deviceready", onDeviceReady, false);
function onDeviceReady() {
    window.requestFileSystem(LocalFileSystem.PERSISTENT, 0, onSuccess, onError);
}
function onSuccess(fileSystem) {
    alert(fileSystem);
    var directoryEntry = fileSystem.root;
    // Let's create a file named readme.txt. The getFile method creates the file
    // and returns a pointer (FileEntry) if it doesn't exist; otherwise it just
    // returns a pointer to it, passed as a callback parameter.
    directoryEntry.getFile("readme.txt", { create: true, exclusive: false }, function (fileEntry) {
        // Let's write something into the file
        fileEntry.createWriter(function (writer) {
            writer.write("This is the text inside readme file"); // write the text (the snippet only alerted it)
        }, function (error) {
            alert("Error occurred while writing to file. Error code is: " + error.code);
        });
    }, function (error) {
        alert("Error occurred while getting a pointer to file. Error code is: " + error.code);
    });
}
function onError(evt) {
    alert("Error occurred during request to file system pointer. Error code is: " + evt.code);
}
Could someone help me pass that CSV string to the code where I save the .txt file, writing the CSV instead of the .txt content?
Thank you, everyone.
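Not a definitive answer, but a minimal sketch of how the two pieces could be glued together, assuming the File plugin is available and ConvertToCSV returns the CSV string (the file name data.csv is just an example):

function saveCSV(csvString) {
    window.resolveLocalFileSystemURL(cordova.file.dataDirectory, function (dir) {
        // getFile creates data.csv if it does not exist yet
        dir.getFile("data.csv", { create: true, exclusive: false }, function (fileEntry) {
            fileEntry.createWriter(function (writer) {
                var blob = new Blob([csvString], { type: 'text/csv' });
                writer.write(blob);
            }, function (error) {
                alert("Error writing CSV file: " + error.code);
            });
        });
    });
}

// Usage, reusing the pieces above:
// saveCSV(ConvertToCSV(jsonObject));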
I know this question has been asked many times, but I can't make it work.
Here is my situation: I have a string called data, and I want to unshorten all the links inside that string.
Code:
var Bypasser = require('node-bypasser');
var URI = require('urijs');
var data = 'multiple urls : http://example.com/foo http://example.com/bar';
var result = URI.withinString(data, function(url) {
    var unshortenedUrl = null;
    var w = new Bypasser(url);
    w.decrypt(function(err, res) {
        // How can I return res?
        unshortenedUrl = res;
    });
    // I know w.decrypt is an asynchronous function,
    // so unshortenedUrl = null
    return unshortenedUrl;
});
Let me walk you through the code.
URI.withinString will match all the URLs in data, manipulate them and return the result.
You can view an example in the URI.js docs.
What I want to do with these URLs is to unshorten all of them using node-bypasser.
This is from the node-bypasser documentation:
var Bypasser = require('node-bypasser');
var w = new Bypasser('http://example.com/shortlink');
w.decrypt(function(err, result) {
    console.log('Decrypted: ' + result);
});
This is the result that I want: multiple urls : http://example.com/foo_processed http://example.com/bar_processed
I created a notebook at tonicdev.com
Solution
var async = require('async');

var getUrlRegEx = new RegExp(
    "(^|[ \t\r\n])((ftp|http|https|gopher|mailto|news|nntp|telnet|wais|file|prospero|aim|webcal):(([A-Za-z0-9$_.+!*(),;/?:#&~=-])|%[A-Fa-f0-9]{2}){2,}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*(),;/?:#&~=%-]*))?([A-Za-z0-9$_+!*();/?:~-]))"
    , "g"
);
var urls = data.match(getUrlRegEx);
async.forEachLimit(urls, 5, function (url, callback) {
    let w = new Bypasser(url);
    w.decrypt(function (err, res) {
        if (err == null && res != undefined) {
            data = data.replace(url, res);
        }
        callback(); // always signal completion, or forEachLimit never finishes on an error
    });
}, function (err) {
    res.send(data); // res here is the surrounding Express response
});
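Note that forEachLimit keeps at most 5 decrypt calls in flight at a time, and the final callback only fires once every URL has been handled, so data is fully rewritten before res.send runs.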
You don't really understand what a callback is. A callback lets asynchronous code run without JavaScript waiting for it to finish. If you add some debug logging to your code:
console.log("Started parsing");
var result = URI.withinString(data, function(url) {
    console.log("URL parsed (or whatever)");
    var unshortenedUrl = null;
    var w = new Bypasser(url);
    w.decrypt(function(err, res) {
        // How can I return res?
        unshortenedUrl = res;
    });
    // I know w.decrypt is an asynchronous function,
    // so unshortenedUrl = null
    return unshortenedUrl;
});
console.log("Call to library over");
You would (most likely) see messages in this order:
Started parsing
Call to library over
URL parsed (or whatever)
The answer: the callback is not guaranteed to run before any code you execute after registering it. You can't put data in your result variable because the data might not have been fetched yet.
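One way around this is to collect the URLs first, unshorten them all, and only then do the replacements. A sketch, assuming Promise support; the decryptAsync helper is my own wrapper around node-bypasser's callback API:

function decryptAsync(url) {
    return new Promise(function (resolve, reject) {
        var w = new Bypasser(url);
        w.decrypt(function (err, res) {
            if (err) reject(err);
            else resolve(res);
        });
    });
}

// Gather the URLs without modifying the string yet
var urls = [];
URI.withinString(data, function (url) {
    urls.push(url);
    return url;
});

Promise.all(urls.map(decryptAsync)).then(function (unshortened) {
    urls.forEach(function (url, i) {
        data = data.replace(url, unshortened[i]);
    });
    console.log(data); // every link replaced by its unshortened form
});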
I would like to scrape a URL:
1 request to get a list of elements
1 request on each result to get details
Here is what I have:
var request = require('request')
  , cheerio = require('cheerio')
  , async = require('async')
  , format = require('util').format;

var baseurl = 'http://magiccards.info';

async.waterfall([
    function (callback) {
        request(baseurl + '/sitemap.html', function (err, response, body) {
            var sets = [];
            var $ = cheerio.load(body);
            $('a[href$="/en.html"]').each(function () {
                sets.push({"name": $(this).text(), "code": $(this).attr('href').match(/\/([^)]+)\//)[1], "path": $(this).attr('href'), "translations": []});
            });
            callback(null, sets);
        });
    },
    function (sets, callback) {
        console.log(sets);
        async.eachSeries(sets, function (set, callback) {
            console.log('SET ' + set.code.toUpperCase());
            request(baseurl + set.path, function (err, response, body) {
                var $ = cheerio.load(body);
                $('body > a[href^="/' + set.code + '/"]').each(function () {
                    console.log(' %s (%s)', $(this).text(), $(this).attr('href'));
                });
            });
        });
    }
], function (err, result) {
    console.log('ERR');
    // result now equals 'done'
});
The problem is that the second waterfall function runs only once; if I replace the eachSeries with an each, the loop does run X times (but I need to wait for the results).
What am I missing?
You need to call the eachSeries callback function. Otherwise async won't know that you are done. (1)
You also need to tell the waterfall function that you are done with that step, also by calling the callback function. (2)
function (sets, waterfallCallback) {
    async.eachSeries(sets, function (set, seriesCallback) {
        console.log('SET ' + set.code.toUpperCase());
        request(baseurl + set.path, function (err, response, body) {
            var $ = cheerio.load(body);
            $('body > a[href^="/' + set.code + '/"]').each(function () {
                console.log(' %s (%s)', $(this).text(), $(this).attr('href'));
            });
            seriesCallback(null); /* 1 */
        });
    }, waterfallCallback /* 2 */);
}
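If you also want the scraped cards to reach the waterfall's final callback instead of only being logged, async.mapSeries collects one result per set. A sketch along the same lines (the result shape is my own invention):

function (sets, waterfallCallback) {
    async.mapSeries(sets, function (set, seriesCallback) {
        request(baseurl + set.path, function (err, response, body) {
            if (err) { return seriesCallback(err); }
            var $ = cheerio.load(body);
            var cards = [];
            $('body > a[href^="/' + set.code + '/"]').each(function () {
                cards.push({ name: $(this).text(), path: $(this).attr('href') });
            });
            seriesCallback(null, { code: set.code, cards: cards });
        });
    }, waterfallCallback); // the results array arrives in the waterfall's final callback
}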
Firstly, here is my code as I've progressed so far:
var http = require("http");

// Utility function that downloads a URL and invokes
// callback with the data.
function download(url, callback) {
    http.get(url, function(res) {
        var data = "";
        res.on('data', function (chunk) {
            data += chunk;
        });
        res.on("end", function() {
            callback(data);
        });
    }).on("error", function() {
        callback(null);
    });
}
var cheerio = require("cheerio");
var url = "http://www.bloglovin.com/en/blogs/1/2/all";
var myArray = [];
var a = 0;

var getLinks = function() {
    download(url, function(data) {
        if (data) {
            // console.log(data);
            var $ = cheerio.load(data);
            $(".content").each(function(i, e) {
                var blogName = $(e).find(".blog-name").text();
                var followLink = $(e).find("a").attr("href");
                var blogSite = $(e).find(".description").text();
                myArray[a] = [a];
                myArray[a]["blogName"] = blogName;
                myArray[a]["followLink"] = "http://www.bloglovin.com" + followLink;
                myArray[a]["blogSite"] = blogSite;
                a++;
                console.log(myArray);
            });
        }
    });
}
getLinks();
As you can see, followLink is concatenated onto the base URL to form a full URL, which I'd like to pass through the download function, so effectively I'll be scraping each of those pages using the same CSS rules and adding the results to the multidimensional array for the corresponding blogger.
How can I go about this?
I do something similar in one of my scraping jobs, but I use the async.js library to accomplish it. Note that I'm also using the request module and cheerio.js in my scraping. I fetch and scrape rows of data from a single webpage, but I suspect you could do something similar to fetch URLs and request/scrape them in the same manner.
I also admit this is quite basic coding; it could certainly be optimized with a bit of refactoring. Hope it gives you some ideas at least...
First, I use request to fetch the page and call my parse function -
var url = 'http://www.target-website.com';

// (this anonymous function is one task in a larger async flow)
function(lastCallback) {
    request(url, function(err, resp, body) {
        if (!err) { parsePage(err, resp, body, lastCallback); }
        else { console.log('web request error: ' + resp.statusCode); }
    });
}
Next, in my parsePage function, I load the website into Cheerio, fetch the HTML of each data row into an array, push my parseRow function and each HTML segment into another array, and use async.parallel to process each iteration -
var rows = [];

function parsePage(err, resp, body, callback1) {
    var $ = cheerio.load(body);
    $('div#targetTable tr').each(function(i, elem) {
        rows.push($(this).html());
    });
    var scrRows = [];
    rows.forEach(function(row) {
        scrRows.push(function(callback2) {
            parseRow(err, resp, row);
            callback2();
        });
    }); // this closing was missing from the snippet
    async.parallel(scrRows, function() {
        callback1();
    });
}
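One thing to note about this design: async.parallel starts all the scrRows functions at once, and since parseRow here is synchronous, the parallelism mostly serves to signal callback1 once every row has been handled rather than to speed anything up.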
Inside your loop, just create an object with the properties you scrape, then push that object onto your array.
var blogInfo = {
    blogName: blogName,
    followLink: "http://www.bloglovin.com" + followLink, // comma, not semicolon
    blogSite: blogSite
};
myArray.push(blogInfo);
You have defined a = 0, so
myArray[a] = [a]; // => myArray[0] = [0]; myArray[0] becomes an array with 0 as its only member
The statements below then don't populate that array the way you expect: arrays are meant to be indexed with integers, so these string keys end up as named properties on the inner array, which JSON.stringify ignores and which don't affect its length.
myArray[a]["blogName"] = blogName;
myArray[a]["followLink"] = "http://www.bloglovin.com"+followLink;
myArray[a]["blogSite"] = blogSite;
Instead try this:
var obj = {
    index: a,
    blogName: blogName,
    followLink: "http://www.bloglovin.com" + followLink,
    blogSite: blogSite
}
myArray.push(obj);
console.log(myArray);