This is a proxy grabber and tester script.
I have tried many things. I like Rolling Curl but I can not seem to get it working with this. Is there any way to speed this up or maybe throttle it with JavaScript? The processing time and resources are way too high!
Right now it will work only if there are one or two sources otherwise it just runs forever. There are tons of applications like this in PYTHON and Windows applications that harvest and check thousands of proxies. I just wonder if this is even possible for PHP to do the same.
// Settings
error_reporting(E_ALL);
ini_set('max_execution_time', 0);
require_once ('classes/class.multicurl.php');
set_time_limit(0);
// Short Delay
$delay = rand(2, 4);
// Long Delay
$longdelay = rand(4, 7);
// Checking Proxies
$fileName = "leeched/proxies.txt";
// where to save successful proxies
$success = "goodproxies/success.txt";
$source = file('sources/sources.txt');
// SET Cookie
$c = new Curl;
foreach($source as $sources) {
// Request To Delete Duplicate Proxies
$c->addRequest(trim($sources));
}
$c->chunk(25);
$c->perform();
$proxies = array();
foreach($c->results as $url => $res) {
// REGEX MATCH
preg_match_all('#[0-9]{1,4}\.[0-9]{1,4}\.[0-9]{1,4}\.[0-9]{1,4}:[0-9] {1,6}#', $res, $m);
$eachproxy = stream_get_contents($res);
$proxies[$url] = $m[0]; {
while ($proxies == time() && $eachproxy > 4) { // go into "waiting" when we going to fast
usleep(100000); // wait .1 second and ask again
}
if ($proxies != time()) { // remember to reset this second and the cnt
$proxies = time();
$eachproxy = 0;
}
}
foreach($proxies as $url => $parr) {
$str = implode("\n", $parr);
file_put_contents('leeched/proxies.txt', $str);
$k = count($parr);
$str2 = date('h:i:s d m') . " | \t" . $k . "\t" . $url . "\n";
file_put_contents('logs/counts.txt', $str2, FILE_APPEND);
}
$uar = file('leeched/proxies.txt');
$uar = array_unique($uar);
$str = implode("\n", $uar) . "\n";
$str = preg_replace('/^\h*\v+/m', '', $str);
file_put_contents('leeched/proxies.txt', $str);
}
// Proxy Testing
if (!is_file($fileName)) die('Proxy file not available');
$proxies = file($fileName);
for ($p = 0; $p < count($proxies); $p++) {
$ch = curl_init(); //initizlize and set url
curl_setopt($ch, CURLOPT_URL, "http://www.yordomain.com/check.php");
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_HTTPGET, 1);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_HEADER, FALSE);
curl_setopt($ch, CURLOPT_VERBOSE, TRUE);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 7);
curl_setopt($ch, CURLOPT_TIMEOUT, 5);
curl_setopt($ch, CURLOPT_PROXY, trim($proxies[$p]));
$data = curl_exec($ch);
usleep(100000);
if (strpos($data, 'Anonymous') !== false) {
usleep(100000);
echo "<img src=\"images/good.png\"> <font color=\"#7CFC00\"><strong>" . $proxies[$p] . " </font></strong><font color=\"#FFFFE0\"><strong> THIS IS A WORKING ANONYMOUS PROXY SAVED TO /goodproxies/success.txt</font></strong><font color=\"yellow\"><strong> " . "Total time: " . curl_getinfo($ch, CURLINFO_TOTAL_TIME) . " seconds!</font></strong><img src=\"images/small.png\"> <br/><br/>";
$f = fopen($success, "a");
fwrite($f, $proxies[$p]);
fclose($f);
}
elseif (curl_errno($ch)) {
usleep(100000);
echo "<img src=\"images/bad.png\"> <font color=\"white\"><strong>" . $proxies[$p] . " </font></strong><font color=\"red\"><strong>ERROR:</font></strong><font color=\"#00FFFF\"><strong> " . curl_error($ch) . " </font></strong><img src=\"images/redx.png\"> <br/><br/>";
}
else {
echo "<img src=\"images/warning.png\"> <font color=\"#7CFC00\"><strong> " . $proxies[$p] . " </font></strong><font color=\"white\"><strong> THERE WAS NO ERROR CONNECTING BUT THIS PROXY IS NOT ANONYMOUS! NOT SAVED</font></strong> <font color=\"#FF69B4\"><strong>(No content from source)</font></strong><img src=\"images/redx.png\"> <br/><br/>";
}
flush();
curl_close($ch);
}
$done = "done";
echo $done;
it'd go faster if you'd do curl_setopt($ch, CURLOPT_ENCODING, ''); and your libcurl is compiled with gzip/deflate support, and the target website supports at least 1 of those (which is pretty much always the case)
Related
I'm creating modals to provide information about various countries.
One of the modals I am trying to create is to show the weather and I found this codepen example which is very nice! https://codepen.io/irwingb1979/pen/XWMbqmP
Is there any way to make such a call using javaScript AJAX to a PHP file like such
<?php
// remove for production
ini_set('display_errors', 'On');
error_reporting(E_ALL);
$executionStartTime = microtime(true);
$url='https://api.openweathermap.org/data/2.5/onecall?lat=' . $_REQUEST['lat'] . '&lon=' . $_REQUEST['lng'] . '&units=metric&appid=01a69c43fa692a1e67ae4c9bdabb8fdc';
$ch = curl_init();
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_URL,$url);
$result=curl_exec($ch);
curl_close($ch);
echo var_dump($result, $url);
$decode = json_decode($result,true);
$output['status']['code'] = "200";
$output['status']['name'] = "ok";
$output['status']['description'] = "success";
$output['status']['returnedIn'] = intval((microtime(true) - $executionStartTime) * 1000) . " ms";
$output['data'] = $decode['geonames'];
header('Content-Type: application/json; charset=UTF-8');
echo json_encode($output);
?>
I'm not quite sure how to alter it to fit my modal and call like above
I am creating a personal website for vacation rentals (a joomla website).
The owner has created an ad here: https://www.armor-vacances.com/locat...tml#calendrier
Do you know if there is a way to extract the entire "calendar" portion to display on my website site?
I try some scripts find with "file_get_html" for example but I don't arrive to my goals.
Thanks for your help.
php can do it, as long as you're not treading on site copyright or robot control issues, and that you can rely on the site you're reading will always have the container you're after, but it's not going to be easy.
It would be good if the site you're scraping has the information in machine ready format using meta tags that it knows programs will be looking for.
Here's a starting point for some scraping code for you (I've cached the page content to a local file so that you don't hit the website too many times each day):
<?php
// php7.0
$src ="https://stackoverflow.com/questions/52678213/extract-a-portion-code-from-an-external-page";
$tmpfn="C:/temp/temp.$srcX.$now.html";
$findDivId="Place your ID here";
$now = date('Y-m-d', time());
$srcX = preg_replace("/[^a-zA-Z0-9]+/", "", $src);
$srcX = substr($srcX, 0, 155);
if ( file_exists($tmpfn) ) {
$html=file_get_contents($tmpfn);
}
else {
$ch = curl_init($src);
curl_setopt($ch, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'GET');
curl_setopt($ch, CURLOPT_HEADER, true);
$html = curl_exec($ch);
if ( !$html ) {
echo curl_error($ch);
exit;
}
curl_close($ch);
file_put_contents($tmpfn, $html);
}
echo "<LI>html size = ".strlen($html)." bytes";
if ( strcmp($html, "") != 0 ) {
$dom = new DOMDocument;
#$dom->loadHTML($html);
$divs = $dom->getElementsByTagName("div"); // or ->getElementsById($id);
if ( $divs ) {
echo "<UL>";
foreach ($divs as $div) {
echo "<LI>Tag::".$div->nodeName;
if ( $div->hasAttributes() ) {
foreach ($div->attributes as $attr) {
echo "<BR>Attribute::".$attr->nodeName . "=" . $attr->nodeValue . " ";
if ( strcmp($attr->nodeName,'id')==0
and strcmp($attr->nodeValue,$findDivId)==0 ) {
echo "<LI>Found $findDivId!!";
}
}
}
echo "<BR>Value::".$div->nodeValue."<BR><BR>";
}
echo "</UL>";
}
}
?>
I am trying to connect to the UK Companies House API. Ideally, I am looking for a JavaScript solution.
But I am trying to get this PHP variant up and running. How do I gain access to the API, with the API key?
PHP:
public function GetCompanyHouse(){
$int = '00928555';
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'http://data.companieshouse.gov.uk/doc/company/' . $int . '.json');
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HEADER, false);
curl_setopt($ch, CURLOPT_TIMEOUT, '10');
$result = curl_exec($ch);
$status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);
switch($status)
{
case '200':
return json_decode($result);
break;
default:
return false;
break;
}
}
https://developer.companieshouse.gov.uk/api/docs/index/gettingStarted/apikey_authorisation.html
curl -XGET -u my_api_key:
https://api.companieshouse.gov.uk/company/00000006
Do I set this as a header?
curl_setopt($ch, CURLOPT_HTTPHEADER, array(
'u: my_api_key'
));
Is it possible to just do a JSON call like this? https://api.companieshouse.gov.uk/company/00000006?api_key=xxxxxx
Here is a jQuery model in the works
http://jsfiddle.net/NYEaX/1412/
$(document).ready(function() {
console.log("api");
xhr = new XMLHttpRequest();
$(document).ready(function() {
$.ajax({
url: 'https://api.companieshouse.gov.uk/search?q=subway',
type: 'GET',
datatype: 'json',
success: function() { alert("Success"); },
error: function() { alert('Failed!'); },
beforeSend: setHeader
});
});
function setHeader(xhr) {
xhr.setRequestHeader('Authorization', 'Basic bXlfYXBpX2tleTo=');
// xhr.setRequestHeader('X-GET', 'xx');
}
});
for me
curl -u "token": https://api.companieshouse.gov.uk/company/SC185088
in command prompt works
Here is some sample code, modify as needed...
$my_api_key = "blablablablabla";
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'https://api.company-information.service.gov.uk/company/11263172');
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HEADER, false);
curl_setopt($ch, CURLOPT_TIMEOUT, '10');
curl_setopt($ch, CURLOPT_HTTPHEADER, array(
'Authorization: '.$my_api_key
));
$result = curl_exec($ch);
curl_close($ch);
print_r($result);
Not an expert but the following has worked for me
//Function thanks to Stackoverflow member:
//https://stackoverflow.com/questions/7393719/human-readable-json-aka-add-spaces-and-breaks-to-json-dump
//https://stackoverflow.com/users/720204/som
function jsonToReadable($json){
$tc = 0; //tab count
$r = ''; //result
$q = false; //quotes
$t = "\t"; //tab
$nl = "\n"; //new line
for($i=0;$i<strlen($json);$i++){
$c = $json[$i];
if($c=='"' && $json[$i-1]!='\\') $q = !$q;
if($q){
$r .= $c;
continue;
}
switch($c){
case '{':
case '[':
$r .= $c . $nl . str_repeat($t, ++$tc);
break;
case '}':
case ']':
$r .= $nl . str_repeat($t, --$tc) . $c;
break;
case ',':
$r .= $c;
if($json[$i+1]!='{' && $json[$i+1]!='[') $r .= $nl . str_repeat($t, $tc);
break;
case ':':
$r .= $c . ' ';
break;
default:
$r .= $c;
}
}
return $r;
}
//I'm searching time company incorporated, adjust your search variables/query to your needs
//I'm using the Advanced Search here but again adjust to your needs
//https://developer-specs.company-information.service.gov.uk/companies-house-public-data-api/reference/search/advanced-company-search
//My main issue was getting connected and rid of "Invalid authorisation" trying to connect to the service.
//Why they don't just issue a multi language toolkit I'll never know, it may save them some bandwith...
$incorporated_from= urlencode('2022-11-14');
$incorporated_to= urlencode('2022-11-14');
$curl = curl_init("https://api.company-information.service.gov.uk/advanced-search/companies?incorporated_from=".$incorporated_from."&incorporated_to=".$incorporated_to."&size=100");
//IMPORTANT!!! BASE64 Encode your API key but also add a colon ':' at the end of it before encoding.
//I found this finally worked if I pre-prepared the BASE64 encoding of my API key rather than let PHP do it on the fly
//So your headers variable will look something like
//$headers = array('Authorization: Basic BLAHBLAHBLAHBLAHBLAHBLAHUfsd5436ghZWY4Og==');
$headers = array('Authorization: Basic MY_COMPANIES_HOUSE_REST_API_KEY_WITH_COLON_AT_END_BASE64_ENCODED');
curl_setopt($curl, CURLOPT_HTTPHEADER, $headers);
curl_setopt($curl, CURLOPT_HEADER, true); // Comment out or use when debugging to show returning header
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
$results = curl_exec($curl);
//Make the JSON returned readable - thanks to Stackoverflow member "som", that created the function!
echo jsonToReadable($results);
if(curl_errno($curl))
{
echo 'Curl error : ' . curl_error($curl);
}
curl_close($curl);
I'm currently using whateverorigin.org in some javascript to retrieve a URL as a JSON object because a 3rd party site hasn't made one of their functions available via their JSON API.
I'd like to remove this dependancy from my website as whateverorigin.org breaks the HTTPS/SSL browser checks for secure content because it's a clear http call.
Has anyone done this? I haven't found an example of it anywhere.
Thanks in advance for a response!
Ok, so since I first typed up this question, I've now already found some examples and cobbled together a working proxy function in php... Feel free to use it for your own purposes!
<?php
// Sourced from: http://stackoverflow.com/questions/2511410/curl-follow-location-error
function curl_exec_follow(/*resource*/ &$ch, /*int*/ $redirects = 20, /*bool*/ $curlopt_header = false) {
if ((!ini_get('open_basedir') && !ini_get('safe_mode')) || $redirects < 1) {
curl_setopt($ch, CURLOPT_HEADER, $curlopt_header);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $redirects > 0);
curl_setopt($ch, CURLOPT_MAXREDIRS, $redirects);
return curl_exec($ch);
} else {
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FORBID_REUSE, false);
do {
$data = curl_exec($ch);
if (curl_errno($ch))
break;
$code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
if ($code != 301 && $code != 302)
break;
$header_start = strpos($data, "\r\n")+2;
$headers = substr($data, $header_start, strpos($data,"\r\n\r\n", $header_start)+2-$header_start);
if (!preg_match("!\r\n(?:Location|URI): *(.*?) *\r\n!",$headers, $matches))
break;
curl_setopt($ch, CURLOPT_URL, $matches[1]);
} while (--$redirects);
if (!$redirects)
trigger_error('Too many redirects. When following redirects, libcurl hit the maximum amount.', E_USER_WARNING);
if (!$curlopt_header)
$data = substr($data, strpos($data, "\r\n\r\n")+4);
return $data;
}
}
header('Content-Type: application/json');
$retrieveurl = curl_init(urldecode($_GET['url']));
$callbackname = $_GET['callback'];
$htmldata = curl_exec_follow($retrieveurl);
if (curl_error($retrieveurl))
die(curl_error($retrieveurl));
$status = curl_getinfo($retrieveurl, CURLINFO_HTTP_CODE);
curl_close($retrieveurl);
$data = array('contents' => $htmldata, 'status' => $status);
$jsonresult = json_encode($data);
echo $callbackname . '(' . $jsonresult . ')';
?>
Hope this helps someone!
I'm having an issue trying to submit a form for a medical website to determine a user's name based on their registration number or their national ID number. An example Registration Number is MPS0753602
Site in question: http://isystems.hpcsa.co.za/iregister/
I can do an initial query to generate the second input form which is generated using a dopostback when clicking on one of the options, in this case clicking on "Registration Number". My issue is that if I fill in all the fields as examined by FireCurl I get a server error, even if I fill them in identically. I think it has something to do with the Javascript on the page.
My code:
<?php
// Specify a cookie file
$cookiefile = '/var/www/hpcsa/cookies.txt';
$client = new Login($cookiefile);
// Retrieve page first to store cookies
$page = file_get_contents("http://isystems.hpcsa.co.za/iregister/");
// scrape __VIEWSTATE value
$start = strpos($page, '__VIEWSTATE" value="') + 20;
$end = strpos($page, '"', $start);
$viewstate = substr($page, $start, $end - $start);
// scrape __EVENTVALIDATION value
$start = strpos($page, '__EVENTVALIDATION" value="') + 26;
$end = strpos($page, '"', $start);
$eventvalidation = substr($page, $start, $end - $start);
// Do our actual query
$form_data = array(
'SearchChkb$0' => 'on',
'__EVENTARGUMENT' => '',
'__EVENTTARGET' => '',
'__EVENTVALIDATION' => $eventvalidation,
'__LASTFOCUS' => '',
'__VIEWSTATE' => $viewstate,
'rgReg_No' => 'rbReg_NoExact',
'txtReg_No' => 'MPS0753602'
);
$page = $client -> get("http://isystems.hpcsa.co.za/iregister/", $form_data);
echo($page);
// cURL wrapper class
class Login {
private $_cookiefile;
public function __construct($cookiefile) {
if (!is_writable($cookiefile)) {
throw new Exception('Cannot write cookiefile: ' . $cookiefile);
}
$this -> _cookiefile = $cookiefile;
}
public function get($url, $data = false) {
// Setup cURL
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_REFERER, 'http://isystems.hpcsa.co.za/iregister/');
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
// Is there data to post
if (!empty($data)) {
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($data));
}
return curl_exec($ch);
}
}
?>