Apify - Extract url in html colde - javascript

I would like to extract url in html code with Apify scrapper.
Here is the html code :
<a class="app-aware-link profile-rail-card__profile-link t-16 t-black t-bold tap-target" href="https://www.linkedin.com/in/benjaminejzenberg?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAj58zYBTN8loEzvrFJhh-16iFZ8gnfPSGU" data-test-app-aware-link="">
<div class="single-line-truncate t-16 t-black t-bold mt2">
Voir le profil complet
</div>
</a>
And this my input code in Apify :
async function pageFunction(context) {
const $ = context.jQuery;
const pageTitle = $('title').first().text();
const h1 = $('h1').first().text();
const first_h2 = $('h2').first().text();
const random_text_from_the_page = $('p').first().text();
const author_profile_link = $('div.scaffold-layout.scaffold-layout--breakpoint-xl.scaffold-layout--sidebar-main-aside.scaffold-layout--reflow > div > div > div > div > div > div > div.pt3.ph3.pb4.break-words > a:nth-child(5) a[href]').text();
context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);
await context.enqueueRequest({ url: 'http://www.example.com' });
return {
url: context.request.url,
pageTitle,
h1,
first_h2,
random_text_from_the_page,
author_profile_link
};
}
Regards :)

Related

Scraping table from iFrame with playwright

hi i'm triying to create a $$eval function on a website that work with iFrames using playwrite. but I am not getting the information from the table that Im trying to scrape.
here is the snippet of my code.
tableData = await frame.$$eval('body > app-root > document-list > div.container-document-list > div.table-document-list > mat-card > mat-card-content > mat-table > mat-row ',(users) => {
return users.map(user => {
const folio = user.querySelector('mat-cell:nth-child(1)')
const fecha = user.qu.querySelector('mat-cell:nth-child(2)')
const cliente = user.querySelector('mat-cell:nth-child(3)')
const vendedor = user.querySelector('mat-cell:nth-child(4)')
const cierre = user.querySelector('mat-cell:nth-child(5)')
const estado = user.querySelector('mat-cell:nth-child(6)')
const total = user.querySelector('mat-cell:nth-child(7)')
return {
Folio: folio ? folio.textContent.trim() : '',
Fecha: fecha ? fecha.textContent.trim(): '',
Cliente: cliente ? cliente.textContent.trim(): '',
Vendedor: vendedor ? vendedor.textContent.trim(): '',
Cierre: cierre ? cierre.textContent.trim(): '',
Estado: estado ? estado.textContent.trim():'',
Total: total ? total.textContent.trim():'',
}
})
})
frame is defined this way:
const frame = page.mainFrame()
now the full path to get a reading of 1 value on the row is this
const valores1 = page.frameLocator('iframe[name="main"]').locator('body > app-root > document-list > div.container-document-list > div.table-document-list > mat-card > mat-card-content > mat-table > mat-row:nth-child(2) > mat-cell.mat-cell.cdk-column-IDNumCot.mat-column-IDNumCot.ng-star-inserted');
texts1 = await valores1.allTextContents();
i am not sure what the problem is that the $$eval code it's not reading the table and getting the information. if someone can help orientate me to the solution i would be thankful.

getting null value in span element

When i try to run my index.html file
<div class="quote-text">
<i class="fas fa-quote-left"></i>
<span id="quote"></span>
</div>
<!-- author element -->
<div class="quote-author">
<span id="author"></span>
</div>
my textContent method is not showing anything.
const quoteText = document.getElementById("quote");
const authorText = document.getElementById("author");
let apiQuotes = []
function newQuote(){
const ranQuote = apiQuotes[Math.floor(Math.random() * apiQuotes.length)];
console.log(ranQuote);
console.log(ranQuote.text);
console.log(ranQuote.author);
console.log(quoteText);<!-- this shows null -->
console.log(authorText); <!-- this shows null -->
quoteText.textContent = ranQuote.text;
authorText.textContent = ranQuote.author;
};
async function getQuotes(){
const apiUrl = 'https://type.fit/api/quotes';
try{
const response = await fetch(apiUrl);
apiQuotes = await response.json()
newQuote();
}
catch(error){
}
}
getQuotes();
I console logged span element value and its logging null. And i can't change my desired value in span element
this is console screeenshot
Your code is working as expected, as you can see in the snippet below.
Make sure you are running this script after DOM elements are created.
Put your script just before </body>, or make sure it executes after DOMContentLoaded (Stack snippet below does this ^)
const quoteText = document.getElementById("quote");
const authorText = document.getElementById("author");
let apiQuotes = []
function newQuote() {
const ranQuote = apiQuotes[Math.floor(Math.random() * apiQuotes.length)];
console.log(ranQuote);
console.log(ranQuote.text);
console.log(ranQuote.author);
console.log(quoteText); <!-- this shows null -->
console.log(authorText); <!-- this shows null -->
quoteText.textContent = ranQuote.text;
authorText.textContent = ranQuote.author;
};
async function getQuotes() {
const apiUrl = 'https://type.fit/api/quotes';
try {
const response = await fetch(apiUrl);
apiQuotes = await response.json()
newQuote();
} catch (error) {}
}
getQuotes();
<div class="quote-text">
<i class="fas fa-quote-left"></i>
<span id="quote">Wait for the request to finish...</span>
</div>
<!-- author element -->
<div class="quote-author">
<span id="author"></span>
</div>

Navbar TypeError: Cannot read property 'classList' of null

I am trying to implement a navbar to html which has the effect dynamically switch pages instead of changing links via href. The trick I'm using to accomplish this is by adding a classList of is-active to the div elements in the section tag.
Here is an example code of the generated HTML :
navBar.js
const navbarItem = [
{
navitem : "About",
link: 1
},
{
navitem : "Legal",
link: 2
},
{
navitem : "Contact",
link: 3
}
];
window.onload = function navLoad() {
const navbar = document.getElementById('navbar');
navbar.innerHTML =
`
<div class="toggle">
<i class="fa fa-bars menu" aria-hidden="true"></i>
</div>
<ul class="nav-list">
<li class="tab is-active">
<a onclick="renderNav()">Home</a>
</li>
${navbarItem.map(loadNavitems).join(" ")}
</ul>
`
}
function loadNavitems(navItems) {
return `
<li class="tab">
<a data-switcher data-id="${navItems.link}" onclick="renderNav(); navSwap();">${navItems.navitem}</a>
</li>
`
}
function renderNav(){
const pages = document.getElementById('main');
document.getElementById('alphabetButtons').style.display = "block";
pages.innerHTML =
`
<section class="pages">
${navbarItem.map(item => `
<div class="page" data-page="${item.link}">
<h1>${item.navitem}</h1>
</div>
`).join('')}
</section>
`
};
And here is the code which takes care of the page switching:
navSwitcher.js
function navSwap() {
const tab_switchers = document.querySelectorAll('[data-switcher]');
for (let input of tab_switchers) {
const page_id = input.dataset.switcher;
console.log(page_id);
input.addEventListener('click', function () {
if(document.querySelector('.nav-list .tab.is-active')){
document.querySelector('.nav-list .tab.is-active').classList.remove('is-active');
console.log('removed');
}
if(input.parentNode.classList.contains('tab')){
input.parentNode.classList.add('is-active');
}
//SwitchNav(page_id);
});
}
}
function SwitchNav(page_id) {
const currentPage = document.querySelector('.pages .page');
const next_page = document.querySelector(`.pages .page[data-page="${page_id}"]`);
console.log(next_page);
if(document.querySelector('.pages .page.is-active')){
document.querySelector('.pages .page.is-active').classList.remove('is-active');
}
next_page.classList.add('is-active');
}
Update : The issue is that causes the error comes when attempting to place [data-page="${page_id}"] after .page inside the querySelector. When console logging, [data-page="${page_id}"] will return null. This is weird because data-page is named correctly in the renderNav function which holds the div of class page.
Hence my hypothesis now is that the issue comes from [data-page="${page_id}"] in the SwitchNav(page_id) function. My question is why is this occuring if everything is named correctly?
Fixes tried:
Attempted to change the for/of loop to a regular for loop inside the navSwap function.
Inside the navSwap function, const page_id = input.dataset.tab; was changed to const page_id = input.dataset.switcher; which now returns 3 items.
So after some digging around, in the end it was actually const page_id = tab_switcher.dataset.tab; which was returning undefined, hence causing the error to happen. In reality it was returning id so I changed it to tab_switcher.dataset.id;.
Here is the final attempt at the code (i placed everything in one single file):
navBar.js:
const navbarItem = [
{
navitem : "About",
link: 1
},
{
navitem : "Legal",
link: 2
},
{
navitem : "Contact",
link: 3
}
];
window.onload = function navLoad() {
const navbar = document.getElementById('navbar');
navbar.innerHTML =
`
<div class="toggle">
<i class="fa fa-bars menu" aria-hidden="true"></i>
</div>
<ul class="nav-list">
<li class="tab is-active">
<a onclick="renderNav()">Home</a>
</li>
${navbarItem.map(loadNavitems).join(" ")}
</ul>
`
}
function loadNavitems(navItems) {
return `
<li class="tab">
<a data-switcher data-id="${navItems.link}" onclick="renderNav(); navSwap();">${navItems.navitem}</a>
</li>
`
}
function renderNav(){
const pages = document.getElementById('main');
document.getElementById('alphabetButtons').style.display = "block";
pages.innerHTML =
`
<section class="pages">
${navbarItem.map(item => `
<div class="page" data-page="${item.link}">
<h1>${item.navitem}</h1>
</div>
`).join('')}
</section>
`
};
function navSwap() {
const tab_switchers = document.querySelectorAll('[data-switcher]');
for (let i = 0; i < tab_switchers.length; i++) {
const tab_switcher = tab_switchers[i];
const page_id = tab_switcher.dataset.id;
tab_switcher.addEventListener('click', function(){
document.querySelector('.nav-list .tab.is-active').classList.remove('is-active');
tab_switcher.parentNode.classList.add('is-active');
SwitchNav(page_id);
});
}
}
function SwitchNav(page_id) {
const currentPage = document.querySelector('.pages .page');
const next_page = document.querySelector(`.pages .page[data-page="${page_id}"]`);
console.log(next_page);
if(document.querySelector('.pages .page.is-active')){
currentPage.classList.remove('is-active');
}
if(document.querySelector('.pages .page')){
next_page.classList.add('is-active');
}
}
Although the code generates the intended results, I have to click twice on the tag to fire it's onclick event. If anybody knows why this occurs it would be of much help to know in the comments.
Also let me know if this is a viable fix. Thanks again :)

How to create multiple HTML elements from an Array?

I have a simple site that is getting a list of books from the Google Books API.
I have a separate file called scripts.js that is getting all the book information (title, author, ISBN, link to the image).
I want to create a div for each book in a gallery style page, where there is a picture of the book and on top of the book is the Title, Author, and ISBN.
I've tried creating the DIV's in Javascript but I want there to be an h3, p, and img inside of each DIV and I can't seem to wrap my head around how I could do that in Javascript.
My HTML code for the gallery:
<div id="content">
<h2>My Bookshelf</h2>
<div class="book">
<!-- The book image is the background of the div -->
<h3 class="book-title">Title</h3>
<p class="book-isbn">ISBN: 000000</p>
<p class="book-author">Authors: ABC</p>
</div>
</div>
My Javascript code that cycles through the JSON file and returns the needed information.
// Returns an array with the book title, ISBN, author, bookmark icon, description, image
apiRequest.onreadystatechange = () => {
if (apiRequest.readyState === 4) {
const response = JSON.parse(apiRequest.response);
var bookList = response.items;
// Removes old search results before display new ones
bookSection.innerHTML = "";
for (let i = 0; i < bookList.length; i++) {
console.log(i);
var title = (bookList[i]["volumeInfo"]["title"]);
try {
var isbn = (bookList[i]["volumeInfo"]["industryIdentifiers"][0]["identifier"]);
} catch (TypeError) {
var isbn = "ISBN Not Available";
}
var author = (bookList[i]["volumeInfo"]["authors"]);
var description = (bookList[i]["description"]);
try {
var image = (bookList[i]["volumeInfo"]["imageLinks"]["thumbnail"]);
} catch (TypeError) {
var image = "img/unavailable.png";
}
}
}
}
You can use template literals to make your job easier.
You can do it like this:
var bookSection = `<div id="content">
<h2>My Bookshelf</h2>
<div class="book">
<!-- The book image is the background of the div -->
<h3 class="book-title">${titleVar}</h3>
<p class="book-isbn">ISBN: ${ISBNVar}</p>
<p class="book-author">Authors: ${AuthorsVar}</p>
</div>
</div>`;
Learn more about template literals from here: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Template_literals
Your code should look something like this
apiRequest.onreadystatechange = () => {
if (apiRequest.readyState === 4) {
const response = JSON.parse(apiRequest.response);
var bookList = response.items;
// Removes old search results before display new ones
bookSection.innerHTML = "";
let bookListHtmlMarkup = '';
for (let i = 0; i < bookList.length; i++) {
console.log(i);
// Declaring book object
const book = {};
const bookListHtmlMarkup = '';
book['title'] = (bookList[i]["volumeInfo"]["title"]);
try {
book['isbn'] = (bookList[i]["volumeInfo"]["industryIdentifiers"][0]["identifier"]);
} catch (TypeError) {
book['isbn'] = "ISBN Not Available";
}
book['author'] = (bookList[i]["volumeInfo"]["authors"]);
book['description'] = (bookList[i]["description"]);
try {
book['image'] = (bookList[i]["volumeInfo"]["imageLinks"]["thumbnail"]);
} catch (TypeError) {
book['image'] = "img/unavailable.png";
}
bookListHtmlMarkup += `
<div class="book">
<div class="book-image">
<img src="${book.image}" alt="Image unavailable" />
</div>
<div class="book-info">
<h3 class="book-title">${book.title}</h3>
<p class="book-isbn">ISBN: ${book.isbn}</p>
<p class="book-author">Author: ${book.author}</p>
<p class="book-description">Author: ${book.description}</p>
</div>
</div>
`;
}
// Assigning generated markup to innerHTML of bookSection
bookSection.innerHTML = bookListHtmlMarkup;
}
}

JQuery Get method running only on one div

Well, i have a $.get method to get data from other page and load it to a div. But even with the .each, only one div is affected.
Here is the HTML:
$(document).ready(function() {
$(".post1").each(function() {
var categlink = $(this).find(".post-area > a").attr("href");
var pAvatar = $(this).find("img.postavatar");
var pTTitle = $(this).find(".topictitle");
var pContent = $(this).find(".postcontent");
var pType = $(this).find("img.post-type");
var pOwnerlink = $(this).find(".p-ownerlink");
var pTime = $(this).find(".post-time");
$.get(categlink, function(data) {
var tLink = $(data).find(".main-content:nth-child(5) tr:first-child a.topictitle").attr("href");
$.get(tLink, function(data) {
var fAvatar = $(data).find(".post:first-child .user-basic-info > a > img").attr("src");
pAvatar.attr("src", fAvatar);
var fTopicTitle = $(data).find(".paged-head.clearfix > h1").text();
pTTitle.text(fTopicTitle);
var fTContent = $(data).find(".post:first-child .entry-content > div:first-child").html();
pContent.html(fTContent);
var fType = $(data).find(".post:first-child .posthead > h2 > img").attr("src");
pType.attr("src", fType);
var fOwner = $(data).find(".post:first-child h4.username > a > span > strong").text();
pOwnerlink.text(fOwner);
var fOwnerlink = $(data).find(".post:first-child h4.username > a").attr("href");
pOwnerlink.attr("href", fOwnerlink);
var fTTime = $(data).find(".post:first-child .posttime").text();
pTime.text(fTTime);
$(".postcontent img").each(function() {
if ($(this).height() > 70) {
$(this).addClass("imgresize");
$('.main-forum').isotope({
itemSelector: '.c-post',
percentPosition: true,
});
}
});
});
});
});
});
<div class="c-post post1">
<div class="post-info">
<img class="postavatar">
<div class="topictitle"></div>
</div>
<div class="postcontent"></div>
<div class="post-info2">
<img class="post-type">
<div class="post-owner">Por
<a class="p-ownerlink"></a>
</div>
<div class="post-time"></div>
<div class="post-area">{catrow.forumrow.FORUM_NAME}
</div>
</div>
</div>
<div class="c-post post2">
<div class="post-info">
<img class="postavatar">
<div class="topictitle"></div>
</div>
<div class="postcontent"></div>
<div class="post-info2">
<img class="post-type">
<div class="post-owner">Por
<a class="p-ownerlink"></a>
</div>
<div class="post-time"></div>
<div class="post-area">{catrow.forumrow.FORUM_NAME}
</div>
</div>
</div>
<div class="c-post post3">
<div class="post-info">
<img class="postavatar">
<div class="topictitle"></div>
</div>
<div class="postcontent"></div>
<div class="post-info2">
<img class="post-type">
<div class="post-owner">Por
<a class="p-ownerlink"></a>
</div>
<div class="post-time"></div>
<div class="post-area">{catrow.forumrow.FORUM_NAME}
</div>
</div>
</div>
Actually, my page has various .post1 divs, but only one of all is affected.
Thanks.
EDIT
So, the problem is: we cant use a $.get inside other $.get. The thing is: every .post1 has a link, that i call tLink. After getting tLink of the .post1, i need to run another $.get method using the tLink value to get info from other page. I've tried that, no sucess:
function loadpostinfo() {
$(".post1").each(function () {
var categlink = $(this).find(".post-area > a").attr("href");
var pAvatar = $(this).find("img.postavatar");
var pTTitle = $(this).find(".topictitle");
var pContent = $(this).find(".postcontent");
var pType = $(this).find("img.post-type");
var pOwnerlink = $(this).find(".p-ownerlink");
var pTime = $(this).find(".post-time");
$.get(categlink, function (data) {
var tLink = $(data).find(".main-content:nth-child(5) tr:first-child a.topictitle").attr("href");
});
if (typeof tLink === "undefined") {
} else {
console.log("checked if tLink is valid");
$.get(tLink, function (data) {
console.log("running tLink get method");
var fAvatar = $(data).find(".post:first-child .user-basic-info > a > img").attr("src");
pAvatar.attr("src", fAvatar);
var fTopicTitle = $(data).find(".paged-head.clearfix > h1").text();
pTTitle.text(fTopicTitle);
var fType = $(data).find(".post:first-child .posthead > h2 > img").attr("src");
pType.attr("src", fType);
var fOwner = $(data).find(".post:first-child h4.username > a > span > strong").text();
pOwnerlink.text(fOwner);
var fOwnerlink = $(data).find(".post:first-child h4.username > a").attr("href");
pOwnerlink.attr("href", fOwnerlink);
var fTTime = $(data).find(".post:first-child .posttime").text();
pTime.text(fTTime);
var fTContent = $(data).find(".post:first-child .entry-content > div:first-child").html();
pContent.html(fTContent);
setTimeout(alignposts, 5000);
});
}
});
}
$(document).ready(function () {
loadpostinfo();
});
How do i proceed?
OH MY GOSH, i'm the dumbest man in Earth!
The problem: only the first .post1 of page has content loaded. Each of then have a categlink url value, wich is used to get the TLink url value inside the categlink page. Then i use the TLink with $.get to load content inside this .post1 div.
The fix: only the categlink page from the first .post1 div actually has a TLink inside to use. The other ones was having no content matching my filters. So, this mess was all about data on server, and not JQuery or HTML.
Sorry for wasting your time, guys!
It's all fixed now, thanks!

Categories