Web scraping an HTML page, but how do I repeat it for many links? - javascript

I wrote the following code to parse part of the HTML for one URL, i.e. the page const URL = 'https://www.example.com/1'.
Now I want to parse the next page, 'https://www.example.com/2', and so on, so I want to apply a for-loop here.
What is the easiest way to iterate so that the URL changes automatically (covering pages 1, 2, 3, ...) and this code runs repeatedly to parse the other pages?
const PORT = 8000
const axios = require('axios')
const cheerio = require('cheerio')
const express = require('express')
const app = express()
const cors = require('cors')
app.use(cors())

const url = 'https://www.example.com/1'

app.get('/', function (req, res) {
  res.json('This is my parser')
})

app.get('/results', (req, res) => {
  axios(url)
    .then(response => {
      const html = response.data
      const $ = cheerio.load(html)
      const articles = []
      $('.fc-item__title', html).each(function () {
        const title = $(this).text()
        const url = $(this).find('a').attr('href')
        articles.push({
          title,
          url
        })
      })
      res.json(articles)
    }).catch(err => console.log(err))
})

app.listen(PORT, () => console.log(`server running on PORT ${PORT}`))

Some considerations: you added CORS to your app so that you can GET the data, but that's unnecessary. You add CORS when you want to SEND data, i.e. when your app is going to receive requests; CORS enables other people's browsers to use your app, and it does nothing when you are the one consuming someone else's app. Also, CORS problems only happen in the browser; since Node runs on the server, it will never get a CORS error.
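To illustrate (a minimal sketch; the /data route name is made up): cors() belongs on a server that browser pages will call, while the outgoing server-side axios request needs no CORS handling at all:
const express = require('express')
const cors = require('cors')
const axios = require('axios')
const app = express()

// cors() matters only because browser pages from other origins will call this API.
app.use(cors())

app.get('/data', async (req, res) => {
  // Server-to-server request: the browser is not involved, so CORS never applies here.
  const response = await axios('https://example.com')
  res.send(response.data)
})

app.listen(8000)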
The first problem with your code is that https://www.example.com/1, even though it opens in the browser, returns a 404 Not Found error to axios, because that page doesn't really exist; only https://www.example.com would work.
I added an example using the comic site https://xkcd.com/, which does accept page numbers.
I push each axios request into an array of promises, then use Promise.all to wait for all of them.
The code gets the image link:
const PORT = 8000;
const axios = require("axios");
const cheerio = require("cheerio");
const express = require("express");
const app = express();
const url = "https://xkcd.com/";

app.get("/", function (req, res) {
  res.json("This is my parser");
});

let pagesToScrape = 50;

app.get("/results", (req, res) => {
  const promisesArray = [];
  for (let pageNumber = 1; pageNumber <= pagesToScrape; pageNumber++) {
    // Each request becomes one promise that resolves with the scraped HTML snippet.
    const promise = axios(url + pageNumber).then((response) => {
      const $ = cheerio.load(response.data);
      // On xkcd, the comic image sits right before the #transcript element.
      return $("#transcript").prev().html();
    });
    promisesArray.push(promise);
  }
  // Wait for every page before responding; one rejection fails the whole batch.
  Promise.all(promisesArray)
    .then((result) => res.json(result))
    .catch((err) => {
      res.json(err);
    });
});

app.listen(PORT, () => console.log(`server running on PORT ${PORT}`));
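One caveat: with Promise.all, a single failed page rejects the whole batch. If you'd rather keep the pages that did succeed, Promise.allSettled is an option. A minimal sketch reusing the same url and pagesToScrape as above (the /results-settled route name is made up for illustration):
app.get("/results-settled", (req, res) => {
  const promisesArray = [];
  for (let pageNumber = 1; pageNumber <= pagesToScrape; pageNumber++) {
    promisesArray.push(
      axios(url + pageNumber).then((response) => {
        const $ = cheerio.load(response.data);
        return $("#transcript").prev().html();
      })
    );
  }
  // allSettled never rejects: each entry is { status, value } or { status, reason }.
  Promise.allSettled(promisesArray).then((results) => {
    const images = results
      .filter((r) => r.status === "fulfilled")
      .map((r) => r.value);
    res.json(images);
  });
});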

Related

App deployed on Heroku but api calls are now failing (Failed to load resource: net::ERR_CONNECTION_REFUSED, TypeError: Failed to fetch)

My app is successfully deployed on Heroku. It works while I have VSCode open and run npm run start manually, but when I close VSCode it can no longer successfully call any APIs on the backend, and the console shows a bunch of errors like the one in the title.
my console (ONLY happens when I close VSCode):
my code in the backend:
const PORT = 8000
const express = require('express')
const cors = require('cors')
const {TwitterApi} = require('twitter-api-v2')
const axios = require('axios')
const cheerio = require('cheerio')
require('dotenv').config()
const snoowrap = require('snoowrap')
const linkPreviewGenerator = require('link-preview-generator')
const spotify = require('spotify-web-api-node')
const fetch = require('node-fetch')
var request = require('request')

const app = express()
app.use(cors())

app.get('/', async (req, res) => {
  const client = new TwitterApi(process.env.twt_bearer_token)
  const trendsInternational = await client.v1.trendsByPlace(1);
  const trendList = []
  for (const {trends} of trendsInternational) {
    for (const trend of trends) {
      trendList.push({
        name: trend.name,
        url: trend.url
      })
    }
  }
  res.json(trendList)
})

app.get('/reddit', async (req, res) => {
  const r = new snoowrap({
    userAgent: process.env.user_agent,
    clientId: process.env.client_id,
    clientSecret: process.env.client_secret,
    refreshToken: process.env.refresh_token
  })
  const topPosts = [];
  (await r.getHot()).forEach(post => {
    topPosts.push({
      title: post.title,
      url: post.url
    })
  })
  res.json(topPosts);
})

app.get('/news', async (req, res) => {
  const news_url = 'https://www.theguardian.com/international'
  axios(news_url)
    .then(response => {
      const html = response.data;
      const $ = cheerio.load(html);
      const articles = [];
      const values = new Set();
      $('.fc-item__title', html).each(function () { // <-- must stay a regular function (not an arrow) so $(this) works
        const title = $(this).text().trim();
        const url = $(this).find('a').attr('href');
        if (!values.has(url)) {
          values.add(url);
          articles.push({
            title,
            url
          });
        }
      })
      res.json(articles)
    }).catch(err => console.log(err))
})

app.listen(PORT, () => console.log(`Server is running on port ${PORT}`))
Heroku runs your app on a port it assigns itself, so try setting the port like this:
const PORT = Number(process.env["PORT"]) || 8000
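A minimal sketch of where that line would sit, with the rest of the file unchanged. On Heroku the app must listen on the port the platform injects via the PORT environment variable; a hard-coded 8000 will not receive traffic there, while 8000 remains the local fallback:
const PORT = Number(process.env["PORT"]) || 8000 // Heroku sets PORT; 8000 is for local runs

// ...middleware and routes as before...

app.listen(PORT, () => console.log(`Server is running on port ${PORT}`))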

POST data passed from frontend JS to Nodejs/Expressjs is always undefined

I have a frontend JS script that takes text input from an HTML text box and sends it to an Express server. The body of the POST request, though, is always undefined, or, depending on how I tweak things, comes back as "{ }" when I view it via console.log(). As I'm new to this, I can't seem to see what's going wrong.
Front end js:
async function submitCity() {
  let x = document.getElementById("wg_input").value;
  console.log("Successfully captured city name:", x);
  let toWeather = JSON.stringify(x);
  console.log("Input data successfully converted to JSON string:", toWeather);
  const options = {
    method: 'POST',
    mode: 'cors',
    headers: {'Content-Type': 'text/plain'},
    body: toWeather
  }
  fetch('http://localhost:3000', options)
    .then(res => console.log(res))
    .catch(error => console.log(error))
}
Backend:
// Dependencies
const express = require('express');
const bp = require("body-parser");
const request = require("request");
const jimp = require('jimp');
const cors = require('cors');
const wgServer = express();
const port = 3000;

// Dotenv package
require("dotenv").config();

// OpenWeatherMap API_KEY
const apiKey = `${process.env.API_KEY}`;

// Basic server initialization
wgServer.use(cors())
wgServer.use(bp.json())
wgServer.use(bp.urlencoded({ extended: true }))

wgServer.listen(port, function() {
  console.log(`Example app listening on port ${port}!`)
});

wgServer.post('/', async function (req, res) {
  res.set('Content-Type', 'text/plain');
  console.log(req.body)
  res.send('Hello World');
  //const data = await req.body;
  // let jsonData = JSON.stringify(req.body);
  // res.status(201);
  //res.json();
});
The returned data is supposed to be a string of about 15 characters, give or take a few (a city and state). I thank you in advance.
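A likely cause, for what it's worth: bp.json() only parses requests whose Content-Type is application/json, so a body sent as text/plain is skipped entirely and req.body stays empty. A minimal sketch of one way to line the two sides up (the city field name is made up for illustration):
// Front end: send a JSON object and label it as JSON.
const options = {
  method: 'POST',
  mode: 'cors',
  headers: {'Content-Type': 'application/json'},
  body: JSON.stringify({ city: x })
}
fetch('http://localhost:3000', options)
  .then(res => res.text())
  .then(text => console.log(text))
  .catch(error => console.log(error))

// Back end: bp.json() now populates req.body.
wgServer.post('/', function (req, res) {
  console.log(req.body.city)
  res.send('Hello World')
})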

Modifying data from API and save in postgreSQL

I want to fetch some data from a public API and then modify it; for example, I will add a Point(long, lat) as a PostGIS geography. Before I get there, though, I need to fetch the data from the public API. I have tried the following, but the logic doesn't seem to work.
Inserting data into the database works fine, and I have set that up correctly. The problem happens when I try to do the insert inside the JSON callback.
require('dotenv').config()
const express = require('express')
const fetch = require('node-fetch');
const app = express();
const db = require("./db");

app.use(express.json());

async function fetchDummyJSON() {
  fetch('https://jsonplaceholder.typicode.com/todos/1')
    .then(res => res.json())
    .then((json) => {
      await db.query("INSERT INTO logictest(userId,id,title,completed) values($1,$2,$3,$4)", [json.userId+1, json.id, json.title, json.completed])
    });
}
fetchDummyJSON()

app.get('/', (req, res) => {
  res.send('Hello World!')
});

const port = process.env.PORT || 3001
app.listen(port, () => {
  console.log(`Example app listening on port ${port}`)
})
I keep getting SyntaxError: await is only valid in async functions and the top-level bodies of modules; however, fetchDummyJSON() is an async function as far as I can tell. Is there a way to make sense of this or make it work? I am going for a PERN stack.
The .then callback here is a function as well, and it is not async, so the await inside it is what triggers the error:
.then((json) => {
  await db.query("INSERT INTO logictest(userId,id,title,completed) values($1,$2,$3,$4)", [json.userId+1, json.id, json.title, json.completed])
});
Try making the callback itself async:
.then(async (json) => {
  await db.query("INSERT INTO logictest(userId,id,title,completed) values($1,$2,$3,$4)", [json.userId+1, json.id, json.title, json.completed])
});
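For completeness, a sketch of the whole function written with async/await throughout, which avoids mixing .then chains with await at all (same table and query as above; the try/catch is added for illustration):
async function fetchDummyJSON() {
  try {
    const res = await fetch('https://jsonplaceholder.typicode.com/todos/1')
    const json = await res.json()
    // await is legal here because fetchDummyJSON itself is the enclosing async function.
    await db.query(
      "INSERT INTO logictest(userId,id,title,completed) values($1,$2,$3,$4)",
      [json.userId + 1, json.id, json.title, json.completed]
    )
  } catch (err) {
    console.error(err)
  }
}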

Express not rendering my React Front End?

I have two repos for the Front End and Back End portions of my project.
The Front End is a simple create-react-app project that hits my Express Back End and receives responses from API calls.
I ran npm run build in my Front End project and moved that build folder to the root of my express backend repo.
However, when I try to reach the root page (i.e. localhost:3001), for some reason the response only returns the static html from index.html and doesn't actually render anything.
But if I go to something that has a path like localhost:3001/pokedex/1 then at least I see a correct response coming from the API.
I have a feeling that there is something wrong with the way I'm declaring my paths.
Here is the code on the Front End that is reaching out to the Back End:
import axios from 'axios'

const baseUrl = '/'

const getAll = () => {
  const request = axios.get(baseUrl)
  return request.then(response => response.data)
}

const getPkm = (id) => {
  const request = axios.get(`${baseUrl}pokedex/${id}`)
  return request.then(response => response.data)
}

export default { getAll, getPkm }
This is my Express Back End entry index.js:
const express = require('express')
const app = express()
const cors = require('cors')
const axios = require('axios')

// Middleware
app.use(cors())
app.use(express.json())
app.use(express.static('build'))

const unknownEndpoint = (request, response) => {
  response.status(404).send({ error: 'unknown endpoint' })
}

let fullPkmList = require('./fullPkmList.json')

function ignoreFavicon(req, res, next) {
  if (req.originalUrl.includes('favicon.ico')) {
    res.status(204).end()
  }
  next();
}

app.get('/', (req, res) => {
  axios.get(`https://pokeapi.co/api/v2/pokemon/?limit=100`)
    .then((list) => res.json(list.data.results))
})

app.get('/pokedex/:id', (request, response) => {
  const id = Number(request.params.id)
  const pokemon = fullPkmList[id - 1]
  if (pokemon) {
    axios.all([
      axios.get(`https://pokeapi.co/api/v2/pokemon/${id}`),
      axios.get(`https://pokeapi.co/api/v2/pokemon-species/${id}`)
    ])
      .then(axios.spread((pokemonResponse, speciesResponse) => {
        let pkmData = pokemonResponse.data
        let speciesData = speciesResponse.data
        response.json({ pkm: pkmData, species: speciesData })
      }))
  } else {
    response.status(404).end()
  }
})

app.use(unknownEndpoint)

const PORT = process.env.PORT || 3001
app.listen(PORT, () => {
  console.log(`this is a test ${PORT}`)
})
Code for the Front End: https://github.com/rohithpalagiri/pocketdex
Code for the Back End: https://github.com/rohithpalagiri/pocketdex-backend
To see the issue, you only need to run the backend. I console.log the response, and in it you will see the index.html markup being returned. My goal is to have all the paths relative so that the root URL doesn't really matter. I think that's the part I'm stuck on.
I'd appreciate any help!
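One pattern worth trying, offered as a sketch rather than a confirmed diagnosis: express.static('build') and app.get('/') both claim the root path, and whichever is registered first wins, so the JSON route and the React index.html collide. Giving the API its own prefix (the /api paths below are made up for illustration) keeps the two apart:
// Serve the React build at the root...
app.use(express.static('build'))

// ...and keep data endpoints under a distinct prefix.
app.get('/api/pokemon', (req, res) => {
  axios.get(`https://pokeapi.co/api/v2/pokemon/?limit=100`)
    .then((list) => res.json(list.data.results))
})

app.get('/api/pokedex/:id', (request, response) => {
  // same handler body as /pokedex/:id above
})
The front end service would then point baseUrl at '/api/' instead of '/' so the calls line up.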

Prevent Apify from shutting down my express server

I have an express server with a POST endpoint that starts a crawler. When the crawler finishes, it shuts down the whole server. Am I doing something wrong? How can I prevent this from happening?
The project looks something like this:
// server.js
const express = require('express')
const bodyParser = require('body-parser')
const startSearch = require('./crawler.js')

const PORT = process.env.PORT || 3000 // PORT was undefined in the original snippet

const app = express()
app.use(bodyParser.json())

app.post('/crawl', async (req, res) => {
  const { foo, bar } = req.body
  startSearch({ foo, bar })
  res.end()
})

app.listen(PORT, () => console.log(`listening on port ${PORT}`))
// crawler.js
const Apify = require('apify')

const startSearch = ({ foo, bar }) => {
  Apify.main(async () => {
    const sources = [{
      url: 'https://path_to_website.com',
      userData: { foo, bar }
    }]
    const requestList = await Apify.openRequestList(null, sources)
    const crawler = new Apify.PuppeteerCrawler({
      requestList,
      handlePageFunction: async ({ request, page }) => {
        // do things using puppeteer
      }
    })
    await crawler.run()
  })
}
Just avoid using Apify.main(). For details, see How to use Apify on Google Cloud Functions
(I thought I'm sending the answer, but it seems it was just a comment)
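A sketch of what that could look like, assuming the Apify SDK version used in the question: Apify.main() is a convenience wrapper meant for standalone actors and it exits the process once its body finishes, which is what takes the server down. Calling the crawler from a plain async function avoids that:
// crawler.js, without Apify.main()
const Apify = require('apify')

const startSearch = async ({ foo, bar }) => {
  const sources = [{
    url: 'https://path_to_website.com',
    userData: { foo, bar }
  }]
  const requestList = await Apify.openRequestList(null, sources)
  const crawler = new Apify.PuppeteerCrawler({
    requestList,
    handlePageFunction: async ({ request, page }) => {
      // do things using puppeteer
    }
  })
  await crawler.run() // resolves when crawling is done; the process keeps running
}

module.exports = startSearch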
