I'm trying to scrape a website that has a "load more" button, but I can't get a recursive function to work within Nightmare. My code is something like this:
const Nightmare = require('nightmare');
const nightmare = Nightmare({
  show: true
});
const request = require('request');
const cheerio = require('cheerio');

let url = 'https://www.housers.com/es/proyectos/avanzado';
let propertyArray = [];
var getThePage = function() {
  nightmare
    .goto('https://www.housers.com/es/proyectos/avanzado')
    .wait(1500)
    .click('#loadMore')
    .evaluate(() => {
      return document.querySelector('.all-info').innerHTML;
    })
    .end()
    .then((result) => {
      let $ = cheerio.load(result);
      let loadMore = $('#loadMore');
      if (loadMore) {
        getThePage();
      }
      return result;
    })
    .catch((error) => {
      console.error('Search failed:', error);
    });
}
getThePage()
I don't know if there is a way to do it with this method, or if anyone has another idea.
If you want to scrape the data in the table, you don't need to use Nightmare. In the network tab, you can see that the page calls this endpoint:
https://www.housers.com/es/proyectos/avanzado/scroll
with some pagination and a page size; let's take 200 per page (I don't know if that's above the limit).
Then you just have to parse the HTML and put the data in an array:
const axios = require('axios');
const querystring = require('querystring');
const cheerio = require('cheerio');
const entities = require("entities");

const url = 'https://www.housers.com/es/proyectos/avanzado/scroll';
const prices = [];

function doRequest(url, page) {
  return axios.post(url + '?page=' + page + '&size=200', querystring.stringify({
    word: "",
    country: "",
    type: "",
    order: "STOCK_PRICE_VARIATION",
    orderDirection: "DESC"
  }));
}
async function getPrices() {
  var empty = false;
  var page = 0;
  while (!empty) {
    // call the API
    console.log("GET page n°" + page);
    var res = await doRequest(url, page);
    page++;
    // parse the returned HTML fragment
    const $ = cheerio.load(res.data, {
      xmlMode: true,
      normalizeWhitespace: true,
      decodeEntities: true
    });
    if (res.data.trim() !== "") {
      // extract prices: one array entry per table row
      $('tr').map(function() {
        var obj = [];
        $(this).children('td').map(function() {
          obj.push(entities.decodeHTML($(this).text().trim()));
        });
        prices.push(obj);
      });
    } else {
      // an empty response means we are past the last page
      empty = true;
    }
  }
  console.log(prices);
  console.log("total length : " + prices.length);
}
getPrices();
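That said, if you'd rather keep the Nightmare approach from the question: the recursion there can never fire a second time, because .end() destroys the instance before the recursive call runs. A minimal, untested sketch of a loop-based alternative, assuming a Nightmare version whose instance can be awaited repeatedly:
const Nightmare = require('nightmare');
const nightmare = Nightmare({ show: true });

async function getThePage() {
  await nightmare
    .goto('https://www.housers.com/es/proyectos/avanzado')
    .wait(1500);

  // Keep clicking while the button is still present; looping inside one
  // async function avoids recursing on an instance that has been ended.
  while (await nightmare.exists('#loadMore')) {
    await nightmare.click('#loadMore').wait(1500);
  }

  const html = await nightmare.evaluate(
    () => document.querySelector('.all-info').innerHTML
  );
  await nightmare.end();
  return html;
}

getThePage().then((html) => console.log(html.length));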
I am trying to create a middleware that receives form-data and returns the fieldname, content type, and value. When I send the first POST, the data shows up in the terminal, but if I send the same request again, no data shows up. And if I toggle the image, the data shows up in the terminal again.
This is my code:
server:
const express = require("express");
const Upes = require("../upes");

const app = express();
const start = new Upes();

app.post("/", start.setup.bind(start), (req, res) => {
  res.send("all right");
});

app.listen(3000, () => {
  console.log("The server is active");
});
The index of my middleware:
const getData = require("./utils/getData");
const parseContentType = require("./utils/parseContentType");

class Upes {
  setup(req, res, next) {
    const contentType = parseContentType(req.headers["content-type"]);
    if (!contentType) {
      throw new Error("Malformed content-type");
    }
    const SUBTYPES = ["form-data", "x-www-form-urlencoded"];
    if (!SUBTYPES.includes(contentType.subtype)) {
      throw new Error(
        "The subtypes does not match the following subtypes: " + SUBTYPES
      );
    }
    getData(req, contentType.params.boundary, (data) => {
      console.log(data);
    });
    next();
  }
}

module.exports = Upes;
The function that receives the data and processes it:
function getData(req, boundary, callback) {
  let chunk = "";
  let data = [];

  req.on("data", (buffer) => {
    chunk += buffer.toString();
  });

  req.on("end", () => {
    // Split the chunk into blocks
    const blocks = getBlock(chunk, boundary);
    blocks.forEach((block) => {
      let [params, value] = block.split("\r\n\r\n");
      params = params.split(";");
      let fieldname = params[1].split("=")[1].replaceAll('"', "");
      let contentType = () => {
        const condition = params.length === 3;
        if (condition) {
          let type = params[2].split(":")[1].replace(" ", "");
          return type;
        }
        return "text-plain";
      };
      const payload = {
        fieldname: fieldname,
        contentType: contentType(),
        value: "", // value.replace("\r\n", "")
      };
      data.push(payload);
    });
    callback(data);
  });
}

function getBlock(body, boundary) {
  boundary = boundary.replaceAll("-", "");
  return body.replaceAll("-", "").split(`${boundary}`).slice(1, -1);
}

module.exports = getData;
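For context on what getBlock is splitting, here is a sketch that feeds a hand-built multipart body (boundary and field name invented for illustration) through getData, using a plain Readable stream in place of the real request:
const { Readable } = require("stream");
const getData = require("./utils/getData");

// Hypothetical boundary and field, just to exercise the parser.
const boundary = "----WebKitFormBoundaryabc123";
const body =
  `--${boundary}\r\n` +
  `Content-Disposition: form-data; name="username"\r\n` +
  `\r\n` +
  `alice\r\n` +
  `--${boundary}--\r\n`;

// Any stream that emits "data" and "end" can stand in for req here.
const fakeReq = Readable.from([body]);

getData(fakeReq, boundary, (data) => {
  console.log(data);
  // => [ { fieldname: 'username', contentType: 'text-plain', value: '' } ]
});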
I have sent the same request 20 times and the data still doesn't show.
I don't know what is happening; can someone please help me?
I am currently trying to get the page count of a Word document in OpenXML format. I have managed to get the XML structure of the document into a readable format, but I can't seem to find where the page count property is. Any guidance would be appreciated.
UPDATE:
You can access the page count and other metadata through the docProps/app.xml file. All you have to do is separate out and extract the data you want. I got the page count by doing this:
const XMLData = fs.readFileSync(data, { encoding: "utf-8" });
let pageCount = XMLData.split("<Pages>")
  .join(",")
  .split("</Pages>")
  .join(",")
  .split(",")[1];
const fs = require("fs");
const path = require("path");
const axios = require("axios");
const unzipper = require("unzipper");

let noRepeatDocs = ['somewebsite.com/somedocument.docx'];

const writeTheFile = async (data) => {
  fs.writeFileSync("read_word_doc", data);
};

const unzipTheFile = async (data) => {
  fs.createReadStream(data)
    .pipe(unzipper.Parse())
    .on("entry", function (entry) {
      const fileName = entry.path;
      const type = entry.type;
      const size = entry.vars.uncompressedSize;
      if (fileName === "word/document.xml") {
        entry.pipe(fs.createWriteStream("./output"));
      } else {
        entry.autodrain();
      }
    });
};

const getWordBuffer = async (arr) => {
  for (const wordDocLink of arr) {
    const response = await axios({
      url: wordDocLink,
      method: "GET",
      responseType: "arraybuffer",
    });
    const data = response.data;
    await writeTheFile(data);
    await unzipTheFile("./read_word_doc");
  }
};

getWordBuffer(noRepeatDocs);
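Putting the update and the download code together: the page count lives in docProps/app.xml, not word/document.xml, so the unzip step has to target that entry instead. A minimal sketch under the same unzipper/fs assumptions (output filename invented for illustration):
const fs = require("fs");
const unzipper = require("unzipper");

// Extract docProps/app.xml, the entry that contains <Pages>.
const extractAppXml = (docxPath, outPath) =>
  new Promise((resolve, reject) => {
    fs.createReadStream(docxPath)
      .pipe(unzipper.Parse())
      .on("entry", (entry) => {
        if (entry.path === "docProps/app.xml") {
          entry.pipe(fs.createWriteStream(outPath)).on("finish", resolve);
        } else {
          entry.autodrain();
        }
      })
      .on("error", reject);
  });

extractAppXml("./read_word_doc", "./app_xml_output").then(() => {
  const xml = fs.readFileSync("./app_xml_output", { encoding: "utf-8" });
  const match = xml.match(/<Pages>(\d+)<\/Pages>/);
  console.log("page count:", match ? match[1] : "not found");
});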
I am creating a new actor in Apify with Cheerio to read an input file of URLs and return primarily two items: (1) the HTTP status code and (2) the HTML title. As part of our process, I would like to be able to try up to 4 variations of each input URL, such as:
HTTP://WWW.SOMEURL.COM
HTTPS://WWW.SOMEURL.COM
HTTP://SOMEURL.COM
HTTPS://SOMEURL.COM
If one of the 4 variations is successful, then the process should ignore the other variations and move to the next input URL.
I read the original input list into a RequestList, and then would like to create the variations in a RequestQueue. Is this the most efficient way to do it? Please see code below, and thank you!
const Apify = require('apify');
const {
  utils: { enqueueLinks },
} = Apify;
const urlParse = require('url');

Apify.main(async () => {
  const input = await Apify.getInput();
  const inputFile = input.inputFile;
  console.log('INPUT FILE: ' + inputFile);

  const requestList = await Apify.openRequestList('urls', [
    { requestsFromUrl: inputFile, userData: { isFromUrl: true } },
  ]);
  const requestQueue = await Apify.openRequestQueue();
  const proxyConfiguration = await Apify.createProxyConfiguration();

  const handlePageFunction = async ({ $, request, response }) => {
    let parsedHost = urlParse.parse(request.url).host;
    let simplifiedHost = parsedHost.replace('www.', '');
    const urlPrefixes = ['HTTP://WWW.', 'HTTPS://WWW.', 'HTTP://', 'HTTPS://'];
    let i;
    for (i = 0; i < urlPrefixes.length; i++) {
      let newUrl = urlPrefixes[i] + simplifiedHost;
      console.log('NEW URL: ' + newUrl);
      await requestQueue.addRequest({ url: newUrl });
    }
    console.log(`Processing ${request.url}`);
    const results = {
      inputUrl: request.url,
      httpCode: response.statusCode,
      title: $('title').first().text().trim(),
      responseUrl: response.url
    };
    await Apify.pushData(results);
  };

  const crawler = new Apify.CheerioCrawler({
    proxyConfiguration,
    maxRequestRetries: 0,
    handlePageTimeoutSecs: 60,
    requestTimeoutSecs: 60,
    requestList,
    requestQueue,
    handlePageFunction,
    handleFailedRequestFunction: async ({ request }) => {
      await Apify.pushData({ inputUrl: request.url, httpCode: '000', title: '', responseUrl: '' });
    }
  });

  await crawler.run();
});
You should create your URL list beforehand. The handlePageFunction is only used for the actual scraping part, and you should only have the Apify.pushData call there:
//...
const initRequestList = await Apify.openRequestList('urls', [
  { requestsFromUrl: inputFile },
]);

const parsedRequests = [];
let req;
while (req = await initRequestList.fetchNextRequest()) {
  const parsedHost = urlParse.parse(req.url).host;
  const simplifiedHost = parsedHost.replace('www.', '');
  const urlPrefixes = ['HTTP://WWW.', 'HTTPS://WWW.', 'HTTP://', 'HTTPS://'];
  for (let i = 0; i < urlPrefixes.length; i++) {
    let newUrl = urlPrefixes[i] + simplifiedHost;
    console.log('NEW URL: ' + newUrl);
    parsedRequests.push({
      url: newUrl,
      userData: { isFromUrl: true }
    });
  }
}
const requestList = await Apify.openRequestList('starturls', parsedRequests);
//...
const crawler = new Apify.CheerioCrawler({
  proxyConfiguration,
  maxRequestRetries: 0,
  handlePageTimeoutSecs: 60,
  requestTimeoutSecs: 60,
  handlePageFunction,
  requestList,
  handleFailedRequestFunction: async ({ request }) => {
    await Apify.pushData({ inputUrl: request.url, httpCode: '000', title: '', responseUrl: '' });
  }
});
//...
requestsFromUrl is a greedy function that tries to parse all URLs from the given resource, so you have to perform the variation processing as an additional step.
I am trying to retrieve data (scraping) from a URL with a GET request (axios), but it doesn't retrieve the data I'm looking for (piano) and no error is shown. I use JSON to retrieve the data. Any idea what I'm doing wrong?
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
const baseUrl = '**********/';
const axiosCookieJarSupport = require('axios-cookiejar-support').default;
const tough = require('tough-cookie');

axiosCookieJarSupport(axios);
const cookieJar = new tough.CookieJar();

function scrapeUrl(url, items = []) {
  const params = {
    keywords: "piano",
    latitude: 40.489353,
    longitude: -3.6827461,
  };
  return axios
    .get(baseUrl + "search", {
      params,
      jar: cookieJar,
      withCredentials: true,
    })
    .then(response => {
      const $ = cheerio.load(response.data);
      const pageItems = $('.container-wall .card-product-content').toArray()
        .map(item => {
          const $item = $(item);
          // console.log('todo', $item);
          return {
            // id: $item.attr('data-adid'),
            title: $item.find('.card-product-image').attr('alt'),
            link: baseUrl + $item.parent('a').attr('href'),
            image: $item.find('.card-product-image').attr('src'),
            price: $item.find('.product-info-price').text(),
          };
        });
      const allItems = items.concat(pageItems);
      console.log(pageItems.length, 'items retrieved', allItems.length, 'accumulated');
      const nextUrl = $('.pagination .next a').attr('href');
      return nextUrl ? scrapeUrl(baseUrl + nextUrl, allItems) : allItems;
    })
    .catch(error => {
      console.log('error', error);
      return items;
    });
}

scrapeUrl(baseUrl + initialUrl)
  .then(items => {
    process.stdout.write(JSON.stringify(items));
    fs.writeFile('./items.json', JSON.stringify(items), 'utf8', function(error) {
      if (error) return console.log('error', error);
      console.log(items.length, 'items saved');
    });
  });
I switched to Puppeteer and headless Chrome, as Chris said it's better than axios/cheerio, because many websites nowadays are built as single-page applications and are rendered dynamically on the client, so it might not be possible to get the content otherwise.
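A minimal sketch of the Puppeteer equivalent, keeping the same selectors as the axios/cheerio version (the 'search' path and query parameter are assumptions carried over from the code above):
const puppeteer = require('puppeteer');

const baseUrl = '**********/'; // masked in the question

(async () => {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();

  // Assumed URL shape, mirroring the axios params above.
  await page.goto(baseUrl + 'search?keywords=piano', { waitUntil: 'networkidle2' });

  // Evaluate in the page context, so dynamically rendered cards are visible.
  const items = await page.evaluate(() => {
    return Array.from(
      document.querySelectorAll('.container-wall .card-product-content')
    ).map((card) => {
      const img = card.querySelector('.card-product-image');
      const price = card.querySelector('.product-info-price');
      return {
        title: img && img.getAttribute('alt'),
        image: img && img.getAttribute('src'),
        price: price && price.textContent.trim(),
      };
    });
  });

  console.log(items);
  await browser.close();
})();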
I have an API call in api.js:
import axios from 'axios';

export const getGraphData = (domain, userId, testId) => {
  return axios({
    url: `${domain}/api/${c.embedConfig.apiVersion}/member/${userId}/utests/${testId}`,
    method: 'get',
  });
};
I have a React helper that takes that data and transforms it.
import { List } from 'immutable'; // assumed source of List, based on its usage below
import { getGraphData } from './api';

const dataObj = (domain, userId, testId) => {
  const steps = getGraphData(domain, userId, testId)
    .then((result) => {
      return result.attributes;
    });
  console.log(steps);
  // const steps = test.get('steps');
  const expr = /select/;
  // build array of steps that we have results in
  const resultsSteps = [];
  steps.forEach((step) => {
    // check for types that contain 'select', and add them to array
    if (expr.test(step.get('type'))) {
      resultsSteps.push(step);
    }
  });
  const newResultsSteps = [];
  resultsSteps.forEach((item, i) => {
    const newMapStep = new Map();
    const itemDescription = item.get('description');
    const itemId = item.get('id');
    const itemOptions = item.get('options');
    const itemAnswers = item.get('userAnswers');
    const newOptArray = [];
    itemOptions.forEach((element) => {
      const optionsMap = new Map();
      let elemName = element.get('value');
      if (!element.get('value')) { elemName = element.get('caption'); }
      const elemPosition = element.get('position');
      const elemCount = element.get('count');
      optionsMap.name = elemName;
      optionsMap.position = elemPosition;
      optionsMap.value = elemCount;
      newOptArray.push(optionsMap);
    });
    newMapStep.chartType = 'horizontalBar';
    newMapStep.description = itemDescription;
    newMapStep.featured = 'false';
    newMapStep.detailUrl = '';
    newMapStep.featuredStepIndex = i + 1;
    newMapStep.id = itemId;
    newMapStep.isValid = 'false';
    newMapStep.type = 'results';
    const listForNewOptArray = List(newOptArray);
    newMapStep.data = listForNewOptArray;
    newMapStep.userAnswers = itemAnswers;
    newResultsSteps.push(newMapStep);
  });
  return newResultsSteps;
};

export default dataObj;
The issue is that steps, when logged outside the .then(), is a Promise {<pending>}. If I log result.attributes inside the .then(), I see the data fully returned.
You need to wait until your async call is resolved. You can do this by chaining on another then:
getGraphData(domain, userId, testId)
  .then((result) => {
    return result.attributes;
  })
  .then(steps => {
    // put the rest of your method here
  });
You can also look at async/await if your platform supports it, which would allow code closer to your original:
const steps = await getGraphData(domain, userId, testId)
  .then((result) => {
    return result.attributes;
  });
// can use steps here
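Note that await is only valid inside an async function, so in practice the snippet above needs an async wrapper along these lines (buildSteps is just an illustrative name):
async function buildSteps(domain, userId, testId) {
  const result = await getGraphData(domain, userId, testId);
  const steps = result.attributes;
  // the rest of the transformation can use steps synchronously here
  return steps;
}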
You have 2 options to transform your fetched data.
1st option: create an async function that returns a promise with the modified data:
const dataObj = (domain, userId, testId) => {
  return getGraphData(domain, userId, testId).then((result) => {
    const steps = result.attributes;
    const expr = /select/;
    // build array of steps that we have results in
    const resultsSteps = [];
    steps.forEach((step) => {
      // check for types that contain 'select', and add them to array
      if (expr.test(step.get('type'))) {
        resultsSteps.push(step);
      }
    });
    const newResultsSteps = [];
    resultsSteps.forEach((item, i) => {
      const newMapStep = new Map();
      const itemDescription = item.get('description');
      const itemId = item.get('id');
      const itemOptions = item.get('options');
      const itemAnswers = item.get('userAnswers');
      const newOptArray = [];
      itemOptions.forEach((element) => {
        const optionsMap = new Map();
        let elemName = element.get('value');
        if (!element.get('value')) {
          elemName = element.get('caption');
        }
        const elemPosition = element.get('position');
        const elemCount = element.get('count');
        optionsMap.name = elemName;
        optionsMap.position = elemPosition;
        optionsMap.value = elemCount;
        newOptArray.push(optionsMap);
      });
      newMapStep.chartType = 'horizontalBar';
      newMapStep.description = itemDescription;
      newMapStep.featured = 'false';
      newMapStep.detailUrl = '';
      newMapStep.featuredStepIndex = i + 1;
      newMapStep.id = itemId;
      newMapStep.isValid = 'false';
      newMapStep.type = 'results';
      const listForNewOptArray = List(newOptArray);
      newMapStep.data = listForNewOptArray;
      newMapStep.userAnswers = itemAnswers;
      newResultsSteps.push(newMapStep);
    });
    return newResultsSteps;
  });
};
With ES7 async/await syntax it would be:
const dataObj = async (domain, userId, testId) => {
  const result = await getGraphData(domain, userId, testId);
  const steps = result.attributes;
  // ... modify the data
};
Then keep in mind that this function returns a promise; you'll need to wait for it to get the result. For example, in a React component:
componentDidMount() {
  dataObj('mydomain', 'myuserId', 'mytestId').then((res) => {
    this.setState({ data: res });
  });
}
The component will update when the promise resolves; you can then use the data (you'll need to handle the undefined-data state in the render method).
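A minimal sketch of that guard in render (the fallback markup and the Chart child are placeholders):
render() {
  const { data } = this.state;
  // data is undefined until the promise resolves, so render a fallback first.
  if (!data) {
    return <p>Loading...</p>;
  }
  return <Chart steps={data} />; // hypothetical child component
}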
2nd option: create a sync function to modify the data:
const dataObj = (steps) => {
  const expr = /select/;
  const resultsSteps = [];
  steps.forEach((step) => {
    // ...
  });
  return newResultsSteps;
};
To get the same result as option 1, we use it in our component like this:
componentDidMount() {
  getGraphData('mydomain', 'myuserId', 'mytestId').then((res) => {
    const modifiedData = dataObj(res);
    this.setState({ data: modifiedData });
  });
}
That's how promises work. The data is not ready when you are trying to use it, so you should move all of your processing into the .then. The reason your variable is a Promise {<pending>} is that you can chain other things onto it.
Something like:
steps.then((steps) => {
  // ...
});