I'm writing a program that scrapes a site for links, then scrapes these links for information. In order to scrape the site, it is necessary to log in first. And so the order is: Log in -> Scrape the index for links -> Scrape the links for info
The callback to the login function prints an empty array { results: [], hasMore: true }, so something is wrong with my code (the scraping part works):
var request = require('request');
var request = request.defaults({jar: true}); // necessary for persistent login
var cheerio = require('cheerio');
var url1 = "https://example.org/torrents/browse/index/";
var loginUrl = "https://example.org/user/account/login/";
var credentials = {
username: 'user1',
password: 'passpass'
};
login(function (result) {
console.log(result);
});
function login(callback) {
request.post({
uri: loginUrl,
headers: { 'content-type': 'application/x-www-form-urlencoded' },
body: require('querystring').stringify(credentials)
}, function(err, res, body){
if(err) {
console.log("Login error");
return;
}
scrapeTorrents(url1, function (result) {
callback(result);
});
});
}
function scrapeTorrents(url, callback) {
request(url, function(err, res, body) {
if(err) {
console.log("Main scrape error");
return;
}
var links = []
var $ = cheerio.load(body);
$('span.title').each(function(i, element){
var title = $(this);
var a = $(this).children().eq(0);
var detailsUrl = a.attr('href');
//console.log(detailsUrl);
links.push(detailsUrl);
});
scrapeTorrentDetails(links, function (result) {
callback(result);
});
});
}
function scrapeTorrentDetails(links, callback) {
var results = [];
function getDetails(url) {
request(url, function(err, res, body) {
if(err) {
console.log("Detail scrape error");
return;
}
console.log("Scraping: " + url);
var $ = cheerio.load(body);
var tds = $('td');
var title = $(tds).get(1).firstChild.data;
var hash = $(tds).get(3).firstChild.data.trim();
var size = $(tds).get(9).firstChild.data;
// console.log(tds.length);
if (tds.length > 23) {
var rlsDate = $(tds).get(23).firstChild.data || '';;
var genres = $(tds).get(27).firstChild.data || '';;
var runtime = $(tds).get(31).firstChild.data || '';;
if ( $(tds).get(33).firstChild != null) {
var plot = $(tds).get(33).firstChild.data || '';;
}
var rating = $('#imdb_rating').parent().next().text() || '';; // of 10
var imdb_id = $('[name=imdbID]').get(0).attribs.value || '';;
var cover = $('#cover').children().eq(0).get(0).attribs.href || '';;
var thumb = $('[alt=Cover]').get(0).attribs.src || '';;
if (typeof cover == 'undefined') {
cover = thumb;
}
} else {
var rlsDate = "notfound";
var genres = "notfound";
var runtime = "notfound";
var plot = "notfound";
var rating = "notfound"; // of 10
var imdb_id = "notfound";
var cover = "notfound";
var thumb = "notfound";
}
var movie = {
type: 'movie',
imdb_id: imdb_id,
title: title,
year: rlsDate,
genre: genres,
rating: rating,
runtime: runtime,
image: thumb,
cover: cover,
synopsis: plot,
torrents: {
magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
filesize: size
}
};
results.push(movie);
});
}
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i]);
}
callback( {
results: results,
hasMore: true
});
}
Maybe Q promises would be better. How would I implement that in the code above?
If you're wondering what the code is for, I'm planning to modify Popcorn-time to use another torrent-tracker (without an API).
Thanks
A main problem is with this code:
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i]);
}
callback( {
results: results,
hasMore: true
});
getDetails() is async, but you just call it links.length times and move on - acting like they have all completed. So, none of the requests in getDetails() is done before you call the callback and try to pass the results. But, none of the results have yet been filled in so they will be empty.
You have all these other nested callbacks everywhere through your code (as required), yet you dropped the ball in this one place. You need to know when all the getDetails() calls are done before you call the final callback with the results.
In addition, you also have to decide if you're OK calling all the getDetails() calls in parallel (all in flight at once) or if what you really want to do is to call one, wait for it to finish, then call the next, etc... Right now you are putting them all in-flight at once which can work if the destination server doesn't object to that many requests all at once.
There are several potential strategies for fixing this.
Add a callback to getDetails() and then keep a count of when you've gotten links.length callbacks from getDetails() and only when the entire count has finished so you call the final callback.
Change getDetails() to return a promise. You can then use something like links.map(getDetails) to create an array of promises that you can then use Promise.all() with to know when they are all done.
Personally, I would change all of your code to use promises and I'd use the Bluebird promises library for it's extra features such as Promise.map() to make this even simpler.
Here's a fix that adds a callback to getDetails() and then counts how many are done:
function scrapeTorrentDetails(links, callback) {
var results = [];
function getDetails(url, done) {
request(url, function(err, res, body) {
if(err) {
console.log("Detail scrape error");
done(err);
return;
}
console.log("Scraping: " + url);
var $ = cheerio.load(body);
var tds = $('td');
var title = $(tds).get(1).firstChild.data;
var hash = $(tds).get(3).firstChild.data.trim();
var size = $(tds).get(9).firstChild.data;
// console.log(tds.length);
if (tds.length > 23) {
var rlsDate = $(tds).get(23).firstChild.data || '';;
var genres = $(tds).get(27).firstChild.data || '';;
var runtime = $(tds).get(31).firstChild.data || '';;
if ( $(tds).get(33).firstChild != null) {
var plot = $(tds).get(33).firstChild.data || '';;
}
var rating = $('#imdb_rating').parent().next().text() || '';; // of 10
var imdb_id = $('[name=imdbID]').get(0).attribs.value || '';;
var cover = $('#cover').children().eq(0).get(0).attribs.href || '';;
var thumb = $('[alt=Cover]').get(0).attribs.src || '';;
if (typeof cover == 'undefined') {
cover = thumb;
}
} else {
var rlsDate = "notfound";
var genres = "notfound";
var runtime = "notfound";
var plot = "notfound";
var rating = "notfound"; // of 10
var imdb_id = "notfound";
var cover = "notfound";
var thumb = "notfound";
}
var movie = {
type: 'movie',
imdb_id: imdb_id,
title: title,
year: rlsDate,
genre: genres,
rating: rating,
runtime: runtime,
image: thumb,
cover: cover,
synopsis: plot,
torrents: {
magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
filesize: size
}
};
results.push(movie);
done();
});
}
var doneCnt = 0;
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i], function() {
++doneCnt;
if (doneCnt === links.length) {
callback( {
results: results,
hasMore: true
});
}
});
}
}
The following is the given sample code rewritten to use bind, a custom this object and a count of the requests that have yet to complete (I think promises obscure the execution path).
The reason that the callback is returning an empty array seems to be that there are no spans in the document with a title attribute, so as a result no further requests are triggered.
var
request = require('request').defaults({
jar: true
}), // necessary for persistent login
cheerio = require('cheerio'),
process = require('process'),
url1 = "https://example.org/torrents/browse/index/",
loginUrl = "https://example.org/user/account/login/",
login = function(callback) {
request.post({
uri: loginUrl,
headers: {
'content-type': 'application/x-www-form-urlencoded'
},
body: require('querystring').stringify({
username: 'user1',
password: 'passpass'
})
}, fna.bind({
callback: callback
}));
},
fna = function(err, res, body) {
if (err) {
console.log("Login error");
return;
}
request(url1, fnb.bind(this));
},
fnb = function(err, res, body) {
if (err) {
console.log("Main scrape error");
return;
}
var
$ = cheerio.load(body),
links = [],
fnd = fne.bind(this);
$('span.title').each(function() {
links.push($(this).children().first().attr('href'));
});
this.results = [];
this.resultCount = links.length;
if (this.resultCount) {
fnd = fnc.bind(this);
for (var i = 0; i < links.length; i++) {
request("https://example.org" + links[i], fnd);
}
} else {
process.nextTick(fnd);
}
},
fnc = function(err, res, body) {
if (err) {
console.log("Detail scrape error");
return;
}
console.log("Scraping: " + url);
var
$ = cheerio.load(body),
tds = $('td'),
title = $(tds).get(1).firstChild.data,
hash = $(tds).get(3).firstChild.data.trim(),
size = $(tds).get(9).firstChild.data,
rlsDate = "notfound",
genres = "notfound",
runtime = "notfound",
plot = "notfound",
rating = "notfound", // of 10
imdb_id = "notfound",
cover = "notfound",
thumb = "notfound";
if (tds.length > 23) {
rlsDate = $(tds).get(23).firstChild.data || '';
genres = $(tds).get(27).firstChild.data || '';
runtime = $(tds).get(31).firstChild.data || '';
if ($(tds).get(33).firstChild != null) {
plot = $(tds).get(33).firstChild.data || '';
}
rating = $('#imdb_rating').parent().next().text() || ''; // of 10
imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
cover = $('#cover').children().eq(0).get(0).attribs.href || '';
thumb = $('[alt=Cover]').get(0).attribs.src || '';
if (typeof cover == 'undefined') {
cover = thumb;
}
}
this.results.push({
type: 'movie',
imdb_id: imdb_id,
title: title,
year: rlsDate,
genre: genres,
rating: rating,
runtime: runtime,
image: thumb,
cover: cover,
synopsis: plot,
torrents: {
magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
filesize: size
}
});
this.resultCount--;
if (this.resultCount === 0) {
this.callback({
results: this.results,
hasMore: true
});
}
},
fne = function() {
this.callback({
results: this.results,
hasMore: true
});
};
login(function(result) {
console.log(result);
});
Related
In code below, I have a main loop to get each 'assunto' in records(jsonAssuntos). After that, I want to send the 'assunto' to your appropriate queue. The problem is about idSequence. The queue receive the wrong value, sometimes the last number generated.
module.exports.consumeMainQueue = async (event, context, callback) => {
var AssuntoQueueAURL = 'http://localhost:9324/queue/Queue';
for (let recordIndex in event.Records){
var record = event.Records[recordIndex];
var jsonAssuntos = JSON.parse(record.body);
var idMessage = record.messageId;
var idSequence = 0; // initialize *********************
for (var assuntoIndex in jsonAssuntos.dat){
idSequence++; // increment ********************
var assunto = jsonAssuntos.dat[assuntoIndex];
if(assunto.typ == "typ1" || assunto.typ == "typ2" || assunto.typ == "typ3"){
var infoAssunto = {
ParentID: idMessage,
Asssunto: assunto
IdSequence : idSequence; // send to queue **************
};
var paramsAssuntoQueue = {
MessageBody: JSON.stringify(infoAssunto),
QueueUrl: assuntoTyp
};
queue.sendMessage(paramsAssuntoQueue, function(err, data) {
if (err) {
console.log("Error", err);
}
else{
console.log("OK");
//End Log
};
});
}
else{
}
}
};
I'm currently using AWS Lambda JavaScript code to try and search a DynamoDB table this is then implemented into an Amazon Alexa application, but that isn't really important for what I'm asking. Here is the code I'm struggling with:
function readDynamoItem(params2, callback) {
var AWS = require('aws-sdk');
AWS.config.update({region: AWSregion});
var dynamodb = new AWS.DynamoDB();
console.log('reading item from DynamoDB table');
dynamodb.scan(params2, function (err, data){
if (err) {
callback("error");
//console.log(err, err.stack); // an error occurred
}
else{
callback(data);
}
});
}
So when an error occurs I want it to callback the message "error" and then use it here:
const params2 = {
TableName: 'Fixtures',
FilterExpression: 'team1 = :value',
ExpressionAttributeValues: {':value': {"S": MyQuestion.toLowerCase()}}
};
readDynamoItem(params2, myResult=>{
say = myResult;
this.response.speak(say).listen('try again');
this.emit(':responseReady');
});
All I'm getting at the moment is this response when I test, I think due to err just ending the program instead of calling the error back to use in the implementation:
Response:
{
"errorMessage": "RequestId: 0f586880-2ddb-11e8-bdf7-07b4c224b25d Process exited before completing request"
}
Any help would be greatly appreciated.
Here's the full code for my project for further reference:
const AWSregion = 'eu-west-1';
const Alexa = require('alexa-sdk');
const AWS = require('aws-sdk');
AWS.config.update({
region: AWSregion
});
exports.handler = function(event, context, callback) {
var alexa = Alexa.handler(event, context);
// alexa.appId = 'amzn1.echo-sdk-ams.app.1234';
// alexa.dynamoDBTableName = 'YourTableName'; // creates new table for session.attributes
alexa.registerHandlers(handlers);
alexa.execute();
};
const handlers = {
'LaunchRequest': function () {
this.response.speak('welcome to magic answers. ask me a yes or no question.').listen('try again');
this.emit(':responseReady');
},
'MyIntent': function () {
var MyQuestion = this.event.request.intent.slots.MyQuestion.value;
console.log('MyQuestion : ' + MyQuestion);
const params2 = {
TableName: 'Fixtures',
FilterExpression: 'team1 = :value',
ExpressionAttributeValues: {':value': {"S": MyQuestion.toLowerCase()}}
};
const params3 = {
TableName: 'Fixtures',
FilterExpression: 'team2 = :value',
ExpressionAttributeValues: {':value': {"S": MyQuestion.toLowerCase()}}
};
readDynamoItem(params2, myResult=>{
var say = MyQuestion;
//if nothing is found when scanning for team1, scan team2
if (myResult == "error"){
readDynamoItem(params3, myResult2=>{
say = myResult2;
say = 'The top scorer for ' + MyQuestion + ' is ' + myResult2;
this.response.speak(say).listen('try again');
this.emit(':responseReady');
});
}
else{
say = myResult;
say = 'The top scorer for ' + MyQuestion + ' is ' + myResult;
this.response.speak(say).listen('try again');
this.emit(':responseReady');
}
});
},
'AMAZON.HelpIntent': function () {
this.response.speak('ask me a yes or no question.').listen('try again');
this.emit(':responseReady');
},
'AMAZON.CancelIntent': function () {
this.response.speak('Goodbye!');
this.emit(':responseReady');
},
'AMAZON.StopIntent': function () {
this.response.speak('Goodbye!');
this.emit(':responseReady');
}
};
// END of Intent Handlers {} ========================================================================================
// Helper Function =================================================================================================
//reading the Fixtures table
function readDynamoItem(params2, callback) {
var AWS = require('aws-sdk');
AWS.config.update({region: AWSregion});
var dynamodb = new AWS.DynamoDB();
var team1;
var team2;
console.log('reading item from DynamoDB table');
dynamodb.scan(params2, function (err, data){
if (err) {
callback("error");
//callback("error");
//console.log(err, err.stack); // an error occurred
}
else{
console.log(data); // successful response
team1 = jsonToString(data.Items[0].team1);
team2 = jsonToString(data.Items[0].team2);
var t1goals = jsonToString(data.Items[0].t1goals);
var t2goals = jsonToString(data.Items[0].t2goals);
t1goals = parseInt(t1goals);
t2goals = parseInt(t2goals);
var search;
var chosenValue = Math.random() < 0.5 ? team1 : team2;
// if goals are equal in a match then it is random which team will score next
if(t1goals == t2goals){
search = chosenValue;
}
//if a team has 1 goal more than the other then it is a 3rd more likely they will score next
else if(t1goals > t2goals && t1goals == 1){
if(randomInt(1, 3) == 1){
search = team2;
}
else{
search = team1;
}
}
else if(t2goals > t1goals && t2goals == 1){
if(randomInt(1, 3) == 1){
search = team1;
}
else{
search = team2;
}
}
//if a team has more than 1 goal more than the other then it is a 5th more likely they will score next
else if(t1goals > t2goals && t1goals > 1){
if(randomInt(1, 5) == 1){
search = team2;
}
else{
search = team1;
}
}
else if(t2goals > t1goals && t2goals > 1){
if(randomInt(1, 5) == 1){
search = team1;
}
else{
search = team2;
}
}
var params = {
TableName: 'yesno',
FilterExpression: 'team = :value',
ExpressionAttributeValues: {':value': {"S": search}}
};
readDynamoFixtures(params, myResult=>{
callback(myResult);
});
}
});
}
//read player details from the the yesno table
function readDynamoFixtures(params, callback) {
var goals = new Array();
var playing = new Array();
var messages = new Array();
var most = 0;
var mostMessage;
var dynamodb = new AWS.DynamoDB();
dynamodb.scan(params, function (err, data) {
if (err) console.log(err, err.stack); // an error occurred
else{
for(var i = 0; i <= (data.Count - 1); i++){
console.log(data); // successful response
var temp = jsonToString(data.Items[i].playername);
messages[i] = temp;
temp = jsonToString(data.Items[i].goals);
temp = parseInt(temp);
goals[i] = temp;
temp = jsonToString(data.Items[i].playing);
playing[i] = temp;
//compare each players goals
if (goals[i] > most && playing[i] == "true"){
most = goals[i];
mostMessage = messages[i];
}
}
}
callback(mostMessage);
});
}
//convert database items from json format to string
function jsonToString(str){
str = JSON.stringify(str);
str = str.replace('{\"S\":\"', '');
str = str.replace('\"}', '');
return str;
}
//get a random int between min and max
function randomInt(min,max)
{
return Math.floor(Math.random()*(max-min+1)+min);
}
Edit:
I have tried testing this code with .query instead of .scan and the error callback works perfectly which is strange but obviously for this implementation I need to use .scan
When you get the "Process exited" response from the Lambda it is helpful to log heavily to see where the Lambda is getting stuck and then check the Cloudwatch Logs to get to the detail.
Then you can pinpoint the exception and focus on it. At least for me, the root cause was many times unexpected as Lambdas force a different way of thinking.
So, I made a method fetchStories that fetches stories from my API in batches of 10, and the new stories can be fetched once you've scrolled to the bottom of the screen. This is what part of my Vue instance looks like:
var discoveryFeed = new Vue({ // eslint-disable-line no-unused-vars
el: '#discoveryFeed',
data: {
userId: userId,
username: username,
tags: [],
selectedTag: null,
stories: [],
checklist: [],
limit: 10,
page: 1,
isEnd: false,
busy: false,
isFollowing: true
},
beforeMount: function() {
var self = this;
self.$http.get('/api/tags')
.then(function(res) {
self.tags = res.body;
}, function(err) {
self.tags = [];
});
self.$http.get('/api/user/' + self.username + '/checklist')
.then(function(res) {
self.checklist = res.body || [];
}, function(err) {
self.checklist = [];
});
self.fetchStories();
},
methods: {
fetchStories: function(isNew) {
var self = this;
isNew = Boolean(isNew);
if(self.isEnd) { return; }
self.busy = true;
var url = '/api/discover';
if(self.selectedTag) {
url = `/api/tags/${self.selectedTag.code}/stories`;
}
url += '?p=' + self.page;
url += '&l=' + self.limit;
self.page += 1;
self.$http.get(url)
.then(function(res) {
self.busy = false;
if(res.body.length === 0) {
self.isEnd = true;
return;
}
if(isNew) {
self.stories = [];
}
self.stories = self.stories.concat(res.body);
}, function(err) {
self.busy = false;
self.stories = [];
});
},
setTag: function(tag) {
var self = this;
self.selectedTag = tag;
for(var i = 0; i < self.tags.length; i++) {
self.tags[i].selected = false;
}
self.selectedTag.selected = true;
self.page = 1;
self.fetchStories(true);
}
In my pug, I'm using the v-infinite-scroll directive to call the method fetchStories. Also note that I'm working with a list of tags, and clicking a new tag will load different sets of stories through the method setTag(tag).
nav.col-lg-3.col-md-4.d-none.d-md-block.d-lg-block.bg-sidebar(v-cloak)
.feed-sidebar.feed-sidebar-interests.border-top-0.border-bottom-0.border-left-0.position-sticky
strong Your Interests
div.list-group.list-unstyled.mt-2(v-if="tags.length > 0")
a.tag-btn.mb-2.align-middle.text-black(v-for="tag, index in tags" v-bind:class="{ active: selectedTag && selectedTag.id === tag.id }" #click="setTag(tag); scrollToTop();")
i.fa-fw.mr-2(v-bind:class="tag.icon + ' fa-lg'"
v-bind:style="'color:' + tag.hexColor")
span {{ tag.name }}
.col-lg-6.col-md-8(v-cloak
v-infinite-scroll="fetchStories"
infinite-scroll-disabled="busy"
infinite-scroll-distance="50")
Upon checking the data response at the initial load, the ten stories are fetched at /stories?p=1&l=10. However, upon reaching the bottom the data response array of /stories?p=2&l=10 is empty. This may have something to do with my use of booleans to set flags when choosing a tag.
Apparently, I should have reset the value of isEnd when I'm setting the new tag.
setTag: function(tag) {
var self = this;
self.selectedTag = tag;
for(var i = 0; i < self.tags.length; i++) {
self.tags[i].selected = false;
}
self.selectedTag.selected = true;
self.page = 1;
self.isEnd = false;
self.fetchStories(true);
}
I'm quite new to the topic and i'm still having some issues with my mailparser. Though searching and finding emails in the email header (mail.from) does work, it doesn't work in the email body. Does anybody have some experience with that and is willing to help? You can find the function i'm talking about under the "// Check for other addresses in Mail-Body (Doesn't work yet)"-comment. I think, that my Regex is correct. Also if the matchAll-Function give back an array and it can't be saved in the the subscriber.email-object, it shall be at least logged to the console. Also i checked manually in the inbox if there are mails with email adresses in the mail body. There are at least two, which shall be found..
The part of the App.js, that does the mailparsing:
const simpleParser = require('mailparser').simpleParser;
//const htmlparser = require("htmlparser2");
var fs = require('fs');
var config = require('./config');
var Imap = require('imap');
var imap = new Imap(config.imap);
var blacklistString = '';
String.prototype.matchAll = function(regexp) {
var matches = [];
this.replace(regexp, function() {
var arr = ([]).slice.call(arguments, 0);
var extras = arr.splice(-2);
arr.index = extras[0];
arr.input = extras[1];
matches.push(arr);
});
return matches.length ? matches : null;
};
function openInbox(subbox,cb) {
imap.openBox('INBOX.'+subbox, true, cb);
}
function getBoxes(cb) {
imap.getBoxes(cb);
}
function showBoxes(boxes) {
imap.end();
}
function logArrayElements(element) {
if(element[1].indexOf('placeholder.de')==-1){
addToBlacklistString(element[1]);
}
}
function addToBlacklistString(str) {
blacklistString += str+"\n";
}
function writeBlacklistFile() {
fs.appendFile('data/data.csv', blacklistString, function (err) {
if (err) throw err;
console.log('Saved!');
});
}
function search(searchArray, regex){
imap.search(searchArray, function(err, results) {
if (err) throw err;
var temp = 0;
var mailtemp = [];
var f = imap.fetch(results, { bodies: '' });
f.on('message', function(msg, seqno) {
console.log('Message #%d', seqno);
var prefix = '(#' + seqno + ') ';
msg.on('body', function(stream, info) {
simpleParser(stream, (err, mail)=>{
//console.log(temp);
//console.log(mail.subject);
/*fs.writeFile('data/'+seqno+'.txt',mail.text, function(err){
console.log(err);
});*/
//var text = mail.text;
// New Subscriber Object
var subscr = new Subscriber({nr: '', mailIdent: '', from: '', emails: '', text:'', uLink: '', anwalt: false });
subscr.nr = seqno;
//Check for From-Address
if(!!mail.from) {
//console.log(mail.from.value);
for(var i = 0; i < mail.from.value.length; i++) {
mailtemp = mail.from.value[i].address.matchAll(regex);
mailtemp.forEach(function(element){
/*fs.appendFile('data/data.csv', element[0] + "\n", function(error){
console.log(error);
});*/
subscr.from = element[0];
});
if(!!mailtemp) {
mailtemp.forEach(logArrayElements);
}
}
}else{
//console.log(mail.text);
}
// Message-ID
if(!!mail.messageId) {
subscr.mailIdent = mail.messageId;
}
console.log(mail.messageId);
// Check for other addresses in Mail-Body (Doesn't work yet)
var regexEmails = new RegExp('/([\w\.\-\_\#\+]+#[\w\.\-\_äüö]+\.[a-zA-Z]+)/g');
if(!!mail.text){
if(mail.text.matchAll(regexEmails)!=null) {
subscr.emails = mail.text.matchAll(regexEmails);
console.log(subscr.emails);
}
}
/* Split mail.text at substrings in substr-array. Extend if necessary..
*
* Also check for 'Anwalt'-Expression in splitted Substring
*
* If mail.text doesn't exist -> Check for html body and convert it to text-format
*/
//var regexLink = new RegExp('\.de\/(unsubscribe|austragen)\/([^\"]+)');
var regexAnwalt = new RegExp('nwalt|echtsanwalt|rechtlicher');
if(!!mail.text) {
var substr = ["schrieb pplaceholder.de", "Von: \"placeholder.de", "Von: pplaceholder.de", "From: placeholder.de", "Ursprüngliche Nachricht"];
for (var i = 0; i<substr.length; i++) {
if(mail.text.indexOf(substr[i]) > -1) {
var textTemp = mail.text;
var arr = textTemp.split(substr[i]);
if(arr[0].matchAll(regexAnwalt)!=null) {
subscr.anwalt = true;
};
subscr.text = arr[0];
break;
} else {
subscr.text = mail.text;
}
}
//console.log(arr);
}
else
{
var html = mail.html;
var text = htmlToText.fromString(html, {
noLinkBrackets: true,
ignoreImage: true,
uppercaseHeadings: false,
preserveNewlines: false,
wordwrap:130,
format: {
heading: function (node, fn, options) {
var h = fn(node.children, options);
return '\n==== ' + h + ' ====\n\n';
}
}
});
subscr.text = text;
}
mail.headers.forEach(function(value, key) {
//console.log(value);
});
subscr.save();
//console.log(subscr);
temp++;
});
});
msg.once('end', function() {
console.log(prefix + 'Finished');
});
});
f.once('error', function(err) {
console.log('Fetch error: ' + err);
});
f.once('end', function() {
console.log('Done fetching all messages!');
//writeBlacklistFile();
imap.end();
});
});
}
imap.once('ready', function() {
openInbox('Test',function(err, box) {
var searchArray = [['FROM', '#']];
search(searchArray,/([\w\.\-\_\#\+]+#[\w\.\-\_äüö]+\.[a-zA-Z]+)/g);
});
});
imap.once('error', function(err) {
console.log(err);
});
imap.once('end', function() {
console.log('Connection ended');
});
imap.connect();
app.listen(2700, function(){
console.log("Listening on Port 2700")
});
module.exports = app;
subscriber.js
const mongoose = require('mongoose');
var subscriberSchema = mongoose.Schema({
nr: Number,
mailIdent: String,
from: String,
emails: String,
text: String,
uLink: String,
anwalt: Boolean
});
var Subscriber = module.exports = mongoose.model('Subscriber', subscriberSchema);
//get Subscriber
module.exports.getSubscribers = function(callback, limit){
Subscriber.find(callback).limit(limit);
};
module.exports.getSubscriberByID = function(_id, callback){
Subscriber.findById(_id, callback);
};
The Regex for the Emails was a little bit wrong.
Also i didn't noticed that the matchAll-Fct. is giving back a two-dimensional Array. Here is the changed part of the code:
var regexEmails = new RegExp("([\\w\\.\\-\\_\\#\\+]+#[\\w\\.\\-\\_äüö]+\\.[a-zA-Z]+)");
var temp1 = mail.text.matchAll(regexEmails);
if(!!temp1){
//console.log(temp1);
for(var i =0; i<temp1.length; i++) {
if(temp1[0][i]!=='info#service.placeholder.de' && temp1[0][i] !== "info#placeholder.de"){
subscr.emails += temp1[0][i];
}
}
}
I'm building a small web scraper and I have stumbled into the following problem: my applications needs to scrape different parts of a website and put the information into the database. Sometimes it gives crazy results such as duplicated entries, or it returns undefined from a function getPhoto(). However, if I only call that function (and don't run the rest of the script), it returns a correct result!
I have a for loop, that loops through different URL. It goes to each URL and scrapes the following information: 1. title, 2.description, 3. internal link, 4. calls a function that generates an image according to the title (getPhoto(...) ), 5. saves the results to the DB. Everything happens on the server (I'm using Cron jobs, no client interaction)
for (i = 0; i < AllLinks.length; i++) {
if (AllLinks[i] != undefined && AllLinks[i] != null && sepLink[2] == "www.fly4free.pl") {
var t2 = {
travelTitle: null,
travelTitle2: null,
travelTitle3: null,
travelDescription: null,
travelDescription2: null,
travelDescription3: null,
travelBuy: null,
travelBuy2: null,
travelImage: null
};
var TravelLink1 = AllLinks[i];
result = HTTP.get(AllLinks[i], {});
$ = cheerio.load(result.content);
t2.travelTitle = $('.article__title').text();
t2.travelDescription = $('.article__content').find('p').first().text();
if ($("img[src$='//www.fly4free.pl/wp-content/uploads/2016/09/lotJm.png']").parent().attr('href') != null) {
t2.travelBuy = $("img[src$='//www.fly4free.pl/wp-content/uploads/2016/09/lotJm.png']").parent().attr('href'); // Link to buy
}
if (t2.travelBuy) {
if (t2.travelBuy.split('https://').pop().split('http://').pop() != null) {
t2.travelBuy2 = t2.travelBuy.split('https://').pop().split('http://').pop(); // link ready for DB
} else {
t2.travelBuy2 = t2.travelBuy;
}
}
t2.travelTitle3 = convertCurrencyInText(t2.travelTitle, 'PLN');
t2.travelDescription3 = convertCurrencyInText(t2.travelDescription, 'PLN');
translate(t2.travelTitle3, {from: 'pl', to: 'en'}).then(res => {
t2.travelTitle2 = res.text; // title for DB
if (t2.travelTitle2) { t2.travelImage = getPhoto(t2.travelTitle2); }
translate(t2.travelDescription3, {from: 'pl', to: 'en'}).then(response => {
t2.travelDescription2 = response.text; // description for DB
if (t2.travelDescription2 != null && t2.travelTitle2 != null && t2.travelBuy2 != null && TravelLink1 != null && t2.travelImage != null) {
Links.insert({ title: t2.travelTitle2, description:t2.travelDescription2, image: t2.travelImage, buyLink:t2.travelBuy2, link: TravelLink1, datetime: new Date() });
}
}).catch(err => {
console.error(err);
});
}).catch(err => {
console.error(err);
});
}
}
"AllLinks" contains different URLs. I have problems scraping this URL: http://www.fly4free.pl/na-wakacje-do-toskanii-tanie-loty-do-pizy-z-gdanska-za-170-pln/
getPhoto() function
function getPhoto(title) {
var travelPlace = nlp(title).match('to *').out('text').replace('to','').trim();
if (travelPlace) {var travelPlace2 = travelPlace.split(' '); }
if (travelPlace2) {var travelPlace3 = travelPlace2[0] + "+" + travelPlace2[1]; }
if (travelPlace3) {
var URL = "https://pixabay.com/api/?key="+API_KEY+"&q="+travelPlace3+"&category=travel&orientation=horizontal";
var images = (HTTP.get(URL, {}));
if (images.data.totalHits > 0) {
var imageLink = images.data.hits[0].webformatURL;
return imageLink;
} else if (images.data.totalHits == 0) {
var URL = "https://pixabay.com/api/?key="+API_KEY+"&q="+travelPlace2[0]+"&category=travel&orientation=horizontal";
var images = (HTTP.get(URL, {}));
if (images.data.totalHits > 0) {
var imageLink = images.data.hits[0].webformatURL;
return imageLink;
}
}
} else if (nlp(title).places().data().length > 0) {
var result = nlp(title).places().data()[0].text.replace(/[^a-zA-Z ]/g, "").trim();
var URL = "https://pixabay.com/api/?key="+API_KEY+"&q="+result+"&category=travel&orientation=horizontal";
var images = (HTTP.get(URL, {}));
if (images.data.totalHits > 0) {
var imageLink = images.data.hits[0].webformatURL;
return imageLink;
}
} else {
var title2 = title.replace(/[^a-zA-Z ]/g, "").split(" ");
if (title2) {
for(i = 0; i < title2.length; i++) {
if (cities[title2[i]] == 1) {
var URL = "https://pixabay.com/api/?key="+API_KEY+"&q="+title2[i]+"&category=travel&orientation=horizontal";
var images = (HTTP.get(URL, {}));
if (images.data.totalHits > 0) {
var imageLink = images.data.hits[0].webformatURL;
return imageLink;
}
} else {
var URL = "https://pixabay.com/api/?key="+API_KEY+"&q=travel&category=travel&orientation=horizontal";
var images = (HTTP.get(URL, {}));
if (images.data.totalHits > 0) {
var imageLink = images.data.hits[0].webformatURL;
return imageLink;
}
}
}
}
}
}
I try to console log the results - sometimes I get a correct image from getPhoto(), but an undefined link from t2.travelBuy, sometimes vice versa. Can you tell me what I'm doing wrong? I saw some people are using Promises or async/await functions on that kind of problems. Do you think that would help me? How should I change my code in order to scrape the website without getting "undefined"?
"translate" comes from "google-translate-api" package
you can try var new_func = Meteor.wrapAsync(YOUR FUNCTION THAT HAVE CALLBACK) and the when you use new_func() it will return the result as you would expect from normal function instead of waiting for callback