I'm currently developing a CoffeeScript script with Node.js that would let me dump all the images on a Tumblr blog.
In a first loop, it executes a request that gets a list of all the images on each page via the API.
In a second loop, it downloads all the images.
The problem is that, for a reason I can't figure out, the first callback is never executed, and the start counter is never incremented.
Here is the code:
xml2json = require "xml2json"
fs = require "fs"
util = require "util"
request = require "request"
tumblr_name = process.argv[2]
api_endpoint = util.format "http://%s.tumblr.com/api/read", tumblr_name
start = 0
num = 50
post_count = 50
download = (uri, filename) ->
  request(uri).pipe(fs.createWriteStream(filename))

while post_count > 0
  console.log util.format "post_count: %s - num: %s", post_count, num
  page_uri = util.format "%s?type=photo&start=%s&num=%s", api_endpoint, start, num
  do (page_uri) ->
    request page_uri, (error, response, body) ->
      console.log util.format "Downloading %s", page_uri
      data_xml = body
      data_json = JSON.parse xml2json.toJson data_xml
      post_count = data_json["tumblr"]["posts"]["post"].length
      for post in data_json["tumblr"]["posts"]["post"]
        post_id = post["id"]
        post_date = post["date-gmt"].split(" ")[0]
        for photo in post["photo-url"]
          if photo["max-width"] == 1280
            outname = util.format "%s-%s-%s.%s", tumblr_name, post_date, post_id, photo["$t"].split(".").slice(-1)[0]
            download photo["$t"], outname
  start = start + num
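For context on what goes wrong in this pattern: Node.js is single-threaded, so a while loop whose exit condition only changes inside an asynchronous callback never yields to the event loop, and the callback never gets a chance to run. A minimal sketch of callback-driven pagination that avoids this, in plain JavaScript rather than CoffeeScript, where handlePosts is a hypothetical helper standing in for the XML parsing and downloading above:

var request = require('request');

// fetch one page, and only schedule the next once this one has returned;
// this replaces the blocking `while` loop with a chain of callbacks
function fetchPage(start, num) {
    var pageUri = api_endpoint + '?type=photo&start=' + start + '&num=' + num; // api_endpoint as above
    request(pageUri, function (error, response, body) {
        var postCount = handlePosts(body); // hypothetical: parse the XML and download the photos
        if (postCount > 0) fetchPage(start + num, num);
    });
}

fetchPage(0, 50);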
I have this function in my API controller that executes a Node file, and it works normally on localhost. Here is the script.
GifApiController.php
public function saveAndDissectGif(Request $request) {
    $gif = $request->file;
    // save the file temporarily in the asset-uploader directory
    Storage::disk('roblogif')->put('/', $gif);
    // dissect the gif into frames and get the frame count
    $output = $this->executeNodeFile('dissectGif.js');
    $frame_count = $output[0];
    // there's a 6.5 second delay between each frame, divide by 60 to get it in minutes
    // assume that 20 percent of the frames will fail, which means 20 percent will have a 1 minute delay
    $time_in_minutes = $frame_count * ( 6.5 / 60 ) + ( $frame_count * 0.2 );
    return ceil($time_in_minutes);
}

private function executeNodeFile($javascript_file_name) {
    exec("cd ".__DIR__."; cd ../../asset-uploader; node ".$javascript_file_name, $output, $err);
    if ($err)
        return $err;
    return $output;
}
dissectGif.js gets all the frames from a gif and saves them in a folder (using the gif-frames library). To get the output from the file, I am console logging it.
dissectGif.js
const fs = require("fs")
const gifFrames = require('gif-frames');

async function startApp() {
    // get the gif
    let gif = fs.readdirSync("./gif");
    gif = gif[Object.keys(gif)[0]];
    // dissect gif into frames
    await gifFrames({ url: './gif/' + gif, frames: 'all', outputType: 'png', cumulative: true }).then(function (frameData) {
        frameData.forEach(async function (frame) {
            await frame.getImage().pipe(fs.createWriteStream('./gif-frames/' + frame.frameIndex + '.png'));
        });
    });
    // get and return the frame count so we can estimate loading time
    return fs.readdir('./gif-frames', (err, files) => {
        console.log(files.length)
        return files.length
    });
}

startApp()
This works great on localhost: $output[0] in GifApiController.php receives the files.length logged by dissectGif.js. But when I host the site, I get an error saying Undefined array key 0, which makes me think that in production it's not finding the fs and gif-frames libraries.
I tried using the full path to the libraries, such as:
const fs = require("../../node_modules/fs")
but that didn't work. I also tried running npm install before executing the script, but that didn't work either.
I thought the issue could be the gif not getting saved, but I checked and it is being saved.
Does anybody have an idea on how to solve this issue?
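Not necessarily the cause of the production error, but one thing worth flagging in dissectGif.js: piping streams inside forEach doesn't actually wait for the writes to finish, so counting ./gif-frames can race the writes. A minimal sketch of a version that waits for every frame to flush to disk before printing the count (same gif-frames usage as above; treat it as an untested sketch):

const fs = require("fs");
const gifFrames = require("gif-frames");

async function startApp() {
    const gif = fs.readdirSync("./gif")[0];
    const frameData = await gifFrames({ url: "./gif/" + gif, frames: "all", outputType: "png", cumulative: true });
    // wait until every frame file has actually been flushed to disk
    await Promise.all(frameData.map(frame => new Promise((resolve, reject) => {
        frame.getImage()
            .pipe(fs.createWriteStream("./gif-frames/" + frame.frameIndex + ".png"))
            .on("finish", resolve)
            .on("error", reject);
    })));
    // now the count is stable; print it so the PHP side can read it from stdout
    console.log(fs.readdirSync("./gif-frames").length);
}

startApp();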
I am currently trying to run a Python file from a Deno backend using the following code:
const cmd = Deno.run({
  cmd: ["python", "python.py"],
  stdout: "piped",
  stderr: "piped"
});

const output = await cmd.output(); // "piped" must be set
const outStr = new TextDecoder().decode(output);

const error = await cmd.stderrOutput();
const errorStr = new TextDecoder().decode(error);

cmd.close();

console.log(outStr, errorStr);

const resultsAlgorithm = outStr;
console.log('This is a test, python result is...', outStr);
console.log('Finished');
The code works for basic scripts like print("Hello"), but it fails on more complex scripts that use imports, such as:
import pandas as pd  # Changes call up name to pd
from yahoofinancials import YahooFinancials
from datetime import date, datetime, timedelta, time

Yahoo_Forex = pd.DataFrame()
Currency_Pair_Prices = pd.DataFrame()
print('Running')

def DataExtract(FileName, DataFrameName, DataFrameName_2, time_range, Interval, ColumnName):
    print('Function started')
    start = date.today() - timedelta(days=2)
    end = date.today() - timedelta(days=time_range)
    DataFrameName = pd.read_excel(FileName, header=None)
    DataFrameName.columns = ColumnName
    n = 0
    for ticker in DataFrameName[ColumnName[0]]:
        Currency_Pair = DataFrameName.iloc[n, 1]
        Currency_Pair_Ticker = YahooFinancials(ticker)
        data = Currency_Pair_Ticker.get_historical_price_data(str(end), str(start), Interval)
        Extracted_Data = pd.DataFrame(data[ticker]["prices"])
        Currency_Close_Price = (Extracted_Data["close"])
        DataFrameName_2[str(Currency_Pair)] = Currency_Close_Price
        n = n + 1
    print(DataFrameName_2)
    print("DataExtract Completed")

DataExtract("yahoo_Forex.xlsx", Yahoo_Forex, Currency_Pair_Prices, int(10), "daily", ["Ticker", "Pair", "Exchange"])
The Python code runs successfully on its own, so it must be something with Deno, but I'm not sure what I would need to change, so any help would be appreciated!
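One common cause of this symptom: the python binary that Deno.run finds on PATH is not the interpreter where pandas and yahoofinancials are installed, so the imports fail and the traceback lands on stderr. A minimal sketch that pins the interpreter and surfaces stderr (the venv path here is a hypothetical example, not from the original):

const cmd = Deno.run({
  // pin the interpreter that actually has pandas/yahoofinancials installed;
  // "./venv/bin/python" is a hypothetical path, adjust for your setup
  cmd: ["./venv/bin/python", "python.py"],
  stdout: "piped",
  stderr: "piped"
});

const rawOutput = await cmd.output();
const rawError = await cmd.stderrOutput();
cmd.close();

console.log(new TextDecoder().decode(rawOutput));
// any ImportError traceback from the script will show up here
console.error(new TextDecoder().decode(rawError));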
I have a Node.js application, and I need to run a Python script from it in order to get a certain response. I am using python-shell to do that, but I am getting no response.
I have also tried using a child process, with the same result.
Here is where I call the Python script:
var ps = require('python-shell');

ps.PythonShell.run('./face_detect.py', array1, function (err, data) {
    if (err) req.send(err);
    req.send(data.toString())
});
This is a snippet of my Python script:
import cv2
import sys
import os
import numpy as np

students = sys.argv[1]
# get the names and put them in an array ---> subjects

imagePath = "class/welcome.jpg"
cascPath = "haarcascade_frontalface_alt.xml"
faceCascade = cv2.CascadeClassifier(cascPath)

.....

for (x, y, w, h) in faces:
    num = 0
    crop_img = cv2.UMat(image[y-40:y+h+100, x-40:x+h+40])
    cv2.imwrite("face" + str(num) + ".jpg", crop_img)
    test_img = cv2.imread("face" + str(num) + ".jpg")
    num = num + 1
    predicted_img1 = predict(test_img)

absences["A"] = 1
for name, a in absences.items():
    if a == 0:
        noshow.append(name)

print(noshow)
cv2.waitKey(0)
I expect it to return an array.
Can anyone help me with this?
The correct syntax for passing arguments from Node.js python-shell to a Python script is:
ps.PythonShell.run('./face_detect.py', { args: array1 }, function (err, data) { ... })
As written, the value of sys.argv[1] in your Python script will not contain the Node.js array1 value, because you don't set the args property in your PythonShell options.
Note also that this should probably be res.send instead of req.send, depending on your program, and I advise you to return if there is an error, to prevent a "headers already sent" exception.
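Put together, a minimal sketch of the corrected call (assuming array1 holds the script arguments and res is the response object of your handler):

var ps = require('python-shell');

ps.PythonShell.run('./face_detect.py', { args: array1 }, function (err, data) {
    // return early on error so we never try to send two responses
    if (err) return res.send(err);
    res.send(data.toString());
});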
I'm trying to make a .js file that constantly has the price of bitcoin updated (every five minutes or so). I've tried tons of different ways to web-scrape it, but they always output either null or nothing. Here is my latest code; any ideas?
var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');

var app = express();
var url = 'https://blockchain.info/charts/';
var port = 9945;

function BTC() {
    request(url, function (err, res, body) {
        var $ = cheerio.load(body);
        var a = $(".market-price");
        var b = a.text();
        console.log(b);
    });
    setInterval(BTC, 300000);
}

BTC();
app.listen(port);
console.log('server is running on ' + port);
It successfully logs which port it's running on, so that's not the problem. When this example outputs, it just prints a blank line every time the function runs.
UPDATE:
I changed the new code I got from Wartoshika and it stopped working, but I'm not sure why. Here it is:
function BTCPrice() {
    request('https://blockchain.info/de/ticker', (error, response, body) => {
        const data = JSON.parse(body);
        var value = (parseInt(data.USD.buy, 10) + parseInt(data.USD.sell, 10)) / 2;
        return value;
    });
}

console.log(BTCPrice());
If I have it console.log directly from inside the function it works, but when I have it console.log the output of the function it outputs undefined. Any ideas?
I would rather use a JSON API to get the current bitcoin value instead of an HTML parser. With the JSON API you get a straightforward result set that is easy to parse.
Check out the Exchange Rates API.
The URL will look like https://blockchain.info/de/ticker
Working script:
const request = require('request');

function BTC() {
    // send a request to blockchain
    request('https://blockchain.info/de/ticker', (error, response, body) => {
        // parse the json answer and get the current bitcoin value
        const data = JSON.parse(body);
        const value = (parseInt(data.THB.buy, 10) + parseInt(data.THB.sell, 10)) / 2;
        console.log(value);
    });
}

BTC();
Getting the value out asynchronously, via a Promise:
const request = require('request');

function BTC() {
    return new Promise((resolve) => {
        // send a request to blockchain
        request('https://blockchain.info/de/ticker', (error, response, body) => {
            // parse the json answer and get the current bitcoin value
            const data = JSON.parse(body);
            const value = (parseInt(data.THB.buy, 10) + parseInt(data.THB.sell, 10)) / 2;
            resolve(value);
        });
    });
}

BTC().then(val => console.log(val));
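The same consumer reads a little more directly with async/await, which may be what the update in the question was reaching for (a sketch using the BTC function above):

const run = async () => {
    // await resolves once the ticker request has come back
    const val = await BTC();
    console.log(val);
};

run();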
As the other answer stated, you should really use an API. You should also think about what type of price you want to request. If you just want a sort of index price that aggregates prices from multiple exchanges, use something like the CoinGecko API. Also, if you need real-time data, you need a WebSocket-based API, not a REST API.
If you need prices for a particular exchange, for example because you're building a trading bot for one or more exchanges, you'll need to communicate with each exchange's WebSocket API directly. For that I would recommend something like the Coygo API, a Node.js package that connects you directly to each exchange's real-time data feeds. You want something that doesn't add a middleman, since that would add latency to your data.
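For a feel of what the WebSocket approach looks like, here is a minimal sketch using the ws package; the endpoint URL and the subscribe-message shape are placeholders, since every exchange defines its own:

const WebSocket = require('ws'); // npm install ws

// placeholder endpoint; real exchanges each publish their own feed URL
const socket = new WebSocket('wss://example-exchange.com/feed');

socket.on('open', () => {
    // most feeds expect a subscribe message; the shape below is illustrative only
    socket.send(JSON.stringify({ type: 'subscribe', channel: 'ticker', pair: 'BTC-USD' }));
});

socket.on('message', (msg) => {
    // each incoming message is a price tick (format varies per exchange)
    const tick = JSON.parse(msg);
    console.log('tick:', tick);
});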
I have about 66 million domains in a MySQL table. I need to run a crawler on all of them and set count = 1 on each row when the crawler completes.
The crawler script is in PHP, using a PHP crawler library.
Here is the script:
set_time_limit(10000);

try {
    $strWebURL = $_POST['url'];
    $crawler = new MyCrawler();
    $crawler->setURL($strWebURL);
    $crawler->addContentTypeReceiveRule("#text/html#");
    $crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
    $crawler->enableCookieHandling(true);
    $crawler->setTrafficLimit(1000 * 1024);
    $crawler->setConnectionTimeout(10);

    // start of the table
    echo '<table border="1" style="margin-bottom:10px;width:100% !important;">';
    echo '<tr>';
    echo '<th>URL</th>';
    echo '<th>Status</th>';
    echo '<th>Size (bytes)</th>';
    echo '<th>Page</th>';
    echo '</tr>';

    $crawler->go();
    echo '</table>';

    $this->load->model('urls');
    $this->urls->incrementCount($_POST['id'], 'urls');
} catch (Exception $e) {
}
$this->urls->incrementCount() just updates the row, marking the count column as 1.
Because I have 66M domains, I needed to run a cron job on my server, and since cron jobs run on the command line, I needed a headless browser, so I chose PhantomJS; the crawler doesn't work the way I want without one.
The first problem I faced was loading the domains from the MySQL DB and running the crawler script from a JS script.
I tried this: creating a PHP script that returns the domains as JSON, loading it from a JS file, looping over the domains, and running the crawler. But it didn't work very well and got stuck after some time.
The next thing I tried, which I'm still using, is a Python script that loads the domains directly from the MySQL DB and runs the PhantomJS script on each domain.
Here is the code:
import MySQLdb
import httplib
import sys
import subprocess
import json

args = sys.argv
db = MySQLdb.connect("HOST", "USER", "PW", "DB")
cursor = db.cursor()

#tablecount = args[1]
frm = args[1]
limit = args[2]

try:
    sql = "SELECT * FROM urls WHERE count = 0 LIMIT %s,%s" % (frm, limit)
    cursor.execute(sql)
    print "TOTAL RECORDS: " + str(cursor.rowcount)
    results = cursor.fetchall()
    count = 0
    for row in results:
        try:
            domain = row[1].lower()
            idd = row[0]
            command = "/home/wasif/public_html/phantomjs /home/wasif/public_html/crawler2.js %s %s" % (domain, idd)
            print command
            proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
            script_response = proc.stdout.read()
            print script_response
        except:
            print "error running crawler: " + domain
except:
    print "Error: unable to fetch data"

db.close()
It takes two arguments that set the LIMIT clause for selecting domains from the database. For each domain, it runs this command using subprocess:

command = "/home/wasif/public_html/phantomjs /home/wasif/public_html/crawler2.js %s %s" % (domain, idd)
proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
script_response = proc.stdout.read()
print script_response
The crawler2.js file also takes two arguments: the first is the domain, and the second is the id used to set count = 1 when the crawler completes.
This is crawler2.js:
var args = require('system').args;
var address = '';
var id = '';

args.forEach(function (arg, i) {
    if (i == 1) {
        address = arg;
    }
    if (i == 2) {
        id = arg;
    }
});

address = "http://www." + address;

var page = require('webpage').create(),
    server = 'http://www.EXAMPLE.net/main/crawler',
    data = 'url=' + address + '&id=' + id;

console.log(data);

page.open(server, 'post', data, function (status) {
    if (status !== 'success') {
        console.log(address + ' Unable to post!');
    } else {
        console.log(address + ' : done');
    }
    phantom.exit();
});
It works well, but my script gets stuck after some time and needs to be restarted, and the log shows nothing wrong.
I need to optimize this process and run the crawler as fast as I can. Any help would be appreciated.
Web crawler programmer here. :)
Your Python script executes PhantomJS serially. You should do it in parallel. To do that, launch PhantomJS and move on; don't wait for it.
In PHP, that would look like this:
exec("/your_executable_path > /dev/null &");
Don't use PhantomJS if you don't need to. It renders everything, so more than 50 MB of memory will be needed per instance.
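If the runner were moved to Node.js, the same fire-in-parallel idea with a concurrency cap might look like this sketch (the phantomjs and crawler2.js paths are reused from the question; the job list and cap are illustrative):

const { spawn } = require('child_process');

// illustrative (domain, id) pairs; in practice these would come from the MySQL table
const jobs = [['example.com', 1], ['example.org', 2]];
const MAX_PARALLEL = 10; // cap concurrency so millions of domains don't exhaust the box
let running = 0;

function next() {
    if (running >= MAX_PARALLEL || jobs.length === 0) return;
    const [domain, id] = jobs.shift();
    running++;
    const proc = spawn('/home/wasif/public_html/phantomjs',
        ['/home/wasif/public_html/crawler2.js', domain, String(id)]);
    proc.on('exit', () => {
        running--;
        next(); // a slot freed up, start the next job
    });
    next(); // keep filling slots up to the cap
}

next();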