pdf.js: looping through pages searching for text - javascript

I guess this is more of a question regarding how to use Promises correctly, which i don't grok:
According to this site (https://ourcodeworld.com/articles/read/405/how-to-convert-pdf-to-text-extract-text-from-pdf-with-javascript), we extract text from a page this way:
// assume pdf file has been loaded
function getPageText(pageNum, PDFDocumentInstance) {
// Return a Promise that is solved once the text of the page is retrieven
return new Promise(function (resolve, reject) {
PDFDocumentInstance.getPage(pageNum).then(function (pdfPage) {
// The main trick to obtain the text of the PDF page, use the getTextContent method
pdfPage.getTextContent().then(function (textContent) {
var textItems = textContent.items;
var finalString = "";
// Concatenate the string of the item to the final string
for (var i = 0; i < textItems.length; i++) {
var item = textItems[i];
finalString += item.str + " ";
}
// Solve promise with the text retrieven from the page
resolve(finalString);
});
});
});
}
I want to search for a certain string through all the pages till i find the page with that string. i tried the obviously wrong way of calling the above function in a for loop, but didn't know how to end when the string was found.
Thanks for the assist!

here's a lame-ish attempt. it's recursive (wish it wasn't), and although it finds the text and gets to the resolve() call, i have no idea where execution goes from there, because it doesn't log to console as i had hoped:
function findText() {
var textToFind = document.getElementById('textToFind').value;
findIt( 1, textToFind ).then( function( pageIndex ) {
// the line below never gets called. i expected the resolve() method
// further down to come here.
console.log( 'Found ' + textToFind + ' on page ' + pageIndex );
},
function(reason) {
console.log(reason);
});
}
function findIt( pageIndex, textToFind ) {
return new Promise( function( resolve, reject ) {
if ( pageIndex > pdfObject.numPages-1 ) {
reject("Couldn't find " + textToFind);
}
getPageText( pageIndex ).then( function( pageText ) {
if ( pageText.indexOf(textToFind) === -1 ) {
findIt( pageIndex+1, textToFind );
}
else {
resolve(pageIndex); // in the debugger, i get here
}
});
});
}

Related

Synchronously retrieve a random number of record IDs, change button text and Asynchronously retrieve the corresponding records from the "server"

I need to change the text and style of the "Get next" button to "Loading...",
Synchronously retrieve a random number of record IDs from a "server" and Asynchronously retrieve the corresponding records from the "server", only proceeding when all records have been received.
Sort the records in date order, oldest first and at the end reset the button to its original state
The code is as follows
let loading = true;
const buttonHandler = function () {
loading = !loading;
toggleButton(loading);
getRecords();
};
const btn = document.getElementById('get-records');
btn.addEventListener('click', buttonHandler);
function toggleButton(loaded) {
btn.innerHTML = loaded ? 'Loading...' : 'Get next';
btn.classList.toggle('button-not-loading');
btn.classList.toggle('button-loading');
}
function getRecords() {
// getting the IDs of the records to fetch is a synchronous operation
// you don't need to change this call, it should return the IDs
const ids = Server.getIds();
const allTheRecords = [];
// getting each corresponding record is an async operation
ids.forEach(function (recordId) {
Server.getRecord(recordId, function (error, data) {
// if the fetch is unsuccessful the callback function is invoked with the error only
// if the fetch is successful the callback is invoked with error variable set to null,
// and data variable will hold the response (i.e. the record you wanted to retrieve)
if (error) {
console.log(error);
} else {
error = null;
allTheRecords.push(data);
}
});
// you can get a SINGLE record by calling Server.getRecord(recordId, callbackFunction)
// callbackFunction takes 2 parameters, error and data
// invocation as follows
// you need to make sure the list is not rendered until we have the records...
//but need to allow for any fetch errors or app will hang
// i.e. a record you request might not exist - how would you allow for this?
// when you have the records, call processRecords as follows
processRecords(allTheRecords);
});
}
function processRecords(records) {
toggleButton(true);
const sortedRecords = sortRecords(records);
let html = '';
let tr;
sortedRecords.forEach(function (index, value, array) {
tr = '';
tr +=
'<tr>' +
'<td>' + value.date + '</td>' +
'<td>' + value.name + '</td>' +
'<td>' + value.natInsNumber + '</td>' +
'<td>' + value.hoursWorked + '</td>' +
'<td>' + value.hourlyRate + '</td>' +
'<td>' + (value.hoursWorked * value.hourlyRate) + '</td>' +
'</tr>';
html += tr;
});
document.getElementById('results-body').innerHTML = html;
addTotals(sortedRecords);
}
function sortRecords(records) {
let sorted = records.sort(function (a, b) {
return new Date(a.date) - new Date(b.date);
});
// sort results in date order, most recent last
return sorted;
}
function addTotals(records) {
let hours = 0;
let paid = 0;
records.forEach(function (value, index) {
hours += value.hoursWorked;
paid += (value.hoursWorked * value.hourlyRate);
});
document.getElementById('totals-annot').innerHTML = 'TOTALS';
document.getElementById('totals-hours').innerHTML = hours;
document.getElementById('totals-paid').innerHTML = paid;
}
there is no question there, but ill give a vague pseudo code answer which should be enough to point you in the right direction.
Keyword = Promise.
const loadRecordIds = () => {
return new Promise((resolve, reject) => {
jQuery.get('http://localhost/recordIds').then((data) => {
// do something with the data ... e.g parse/validate
resolve(data);
});
});
};
const loadRecords = (recordIds) => {
return new Promise((resolve, reject) => {
jQuery.get('http://localhost/records?recordIds='+recordIds).then((data) => {
// check the data for errors etc
resolve(data);
});
});
};
const toggleButton = () => {
// toggle your button styles
};
// and you use the functions in sequence using .then() or async keyword(if you have a preprocessor or dont care about old browsers)
loadRecordIds().then((recordIds) => {
// now you have your recordIds loaded
toggleButton();
loadRecords(recordIds).then((records) => {
// now you have your records available for further processing
});
});
// with async await keywords you could do the same like this.
try {
const recordIds = await loadRecordIds();
toggleButton();
const records = await loadRecords(recordIds);
} catch (error) {
// handle errors
}
If you dont know what promises are, google them.
// ok, ill throw in a quick sample of an async code that runs in "sync" using promises.
step1 = () => {
return new Promise((resolve, reject) => {
setTimeout(() => {
// time has run out now, and its time for the second step
// calling "resolve" will call the "then" function and allows the code to continue
// whatever you pass in as the argument for resolve() will be a parameter in the "then()" function callback.
resolve('3000 seconds has passed, time to continue.');
}, 3000);
});
};
step2 = () => {
return new Promise((resolve, reject) => {
setTimeout(() => {
resolve('2000 seconds has passed, time to continue.');
}, 2000);
});
};
step1().then((message) => {
console.log(message);
step2().then((message) => {
console.log(message);
setTimeout(() => {
console.log('and now the script is done...all in sequence');
}, 2000);
});
});
/*
this will output
3000 seconds has passed, time to continue.
2000 seconds has passed, time to continue.
and now the script is done...all in sequence
*/

My recursive function isnt in infinite loop. Why?

The nightmare is working, of course I'm testing this tool but nway, the mainly problem is why my function isnt on a infinite loop? Since i didnt make a condition to page. May I'm doing this wrong?
The case that I wanted was: whenever page loaded, I get the tittle with page then call the function again to next page till the last page.
I tried without success the setTimeout too.
My console log just print 1 then finish.
The code snippet is here:-
var pagn = 1;
function ab(page){
nightmare.goto(url_base+"&page="+page)
.evaluate(() => {
return document.title;
})
.end()
.then((title) => {
console.log(title + ":" + page);
ab(++pagn);
//setTimeout("page(" + page + ")", 5000);
}).catch(()=>{console.log("Error");});
}
ab(pagn);
The problem is that you are ending your nightmare session with the .end() statement, which stops the nightmare engine, and so node exits after running through the remaining .then statements.
To test your code, I rewrote your function a bit, so that it scrapes a particular website, and exits when it finds the same page multiple times (which is kinda my test scenario, so you might have to adapt it for your code)
const Nightmare = require('nightmare')
const nightmare = Nightmare({ show: true })
function scrapePages( targetUrl, curPage = 0, transform = (url, page) => `${url}?page=${page}`, pageSet = new Set() ) {
console.info('Trying to scrape page ' + transform( targetUrl, curPage ) );
return nightmare
.goto( transform( targetUrl, curPage ) )
.evaluate( () => document.title )
.then( (title) => {
if (pageSet.has( title )) {
throw 'page already exists';
}
pageSet.add( title );
console.info( title + ':' + curPage );
return scrapePages( targetUrl, curPage + 1, transform, pageSet );
})
.catch( ( err ) => {
console.error( err );
return { maxPages: curPage, pages: pageSet };
} );
}
scrapePages( 'some-paged-url', 0, (url, page) => url + '/' + (page + 1) )
.then( ({ maxPages, pages }) => {
// end nightmare process
nightmare.end().then( () => {
console.info(`Found ${maxPages} pages`);
});
} )
.catch( err => console.error('Error occured', err ) );
The biggest difference, as you can see, is that the ending of the nightmare process only occurs once the scraping ran through. At that time, you would have the total pages available and all pages that were called successfully
you should not pass the page variable when defining a global variable else it will be overwritten everytime..
var page = 1;
function ab(){
nightmare.goto(url_base+"&page="+page)
.evaluate(() => {
return document.title;
})
.end()
.then((title) => {
console.log(title + ":" + page);
ab(page++);
//setTimeout("page(" + page + ")", 5000);
});
}
ab();
What if there is a reject thrown by nightmare.goto(). You should implement catch()
page++ didn't pass the incremented value as its post increment operator. page + 1 or ++page should do the tricks.
var page = 1;
function ab(page){
nightmare.goto(url_base+"&page="+page)
.evaluate(() => {
return document.title;
})
.end()
.then((title) => {
console.log(title + ":" + page);
ab(page+1);
//setTimeout("page(" + page + ")", 5000);
}).catch(error => {
console.error('Search failed:', error)
ab(page);
});
}
ab(page);

chain promises in map loop

i know that there are plenty of posts on this topic, but i couldn't figure it out. I'm new to all this promise-thing, but i got a specific problem that needs to be solved. In the code below i loop through an array 'fnFoes' and within this fnFoes trough another array 'avatar'. For each element in avatar an svg-Code from an indexedDB is loaded and appended to html. My problem is: I want this function to fully load and append all svg before finishing the function and going on to the next function.
I tried to wrap the entire function into a promis but it still went on before having loaded all svg. I assume that this has to do with the indexedDB, but honestly, i just don't get it.
After 2 days of post reading and trial and error, Any help is much appreciated!!
Thx
Fritz
$.map(fnFoes, function(i,obj){
console.log(obj);
console.log('loadAllAvatars: checking foe: ' + obj);
if (!(obj in foes) || fnFoes[obj].avatar != foes[obj].avatar || document.getElementById(obj) === null){ // compare avatar information of both variables. if changed, render new
console.log('loadAllAvatars: foe '+ obj +' will be rendered');
foeRenderIds.push(obj); // push foe id in list
if (obj in foes) {
foes[obj].avatar = fnFoes[obj].avatar; // change avatar items in foes;
}
var avatar = fnFoes[obj].avatar.split(','); // split avatar string in array
console.log(avatar);
console.log(fnFoes[obj]);
if (document.getElementById('#'+obj)){ // if foe div already exists in avatarDoubles, just delete it
$('#'+obj).html(); // delete old avatar from doubles
} else {
var html ='<div id="foe' + obj + '"></div>'; // if avatar doesn't exist, create it in avatarDoubles
$('#avatarDoubles').append(html);
}
//render avatar
if (typeof avatar !== 'undefined' && avatar.length > 1) {
$.map(avatar, function(j,key){
console.log(avatar[key]);
var name = avatar[key];
db.svgs.get(name).then(function(obj2){
var html = "<div class='" + obj2.category + "' data-name='" + obj2.name + "' data-category='" + obj2.category + "'>" + obj2.svg + "</div>";
$('#foe' + obj).append(html);
console.log(obj2.name);
});
});
}
} else {
console.log('loadAllAvatars: foe '+ obj +' already exists');
}
});
I am struggling to understand your code but you probably want something like this:
function connect() {
return new Promise(function(resolve, reject){
var request = indexedDB.open();
request.onsuccess = () => resolve(request.result);
});
}
function dothings(db, things) {
var promises = [];
for(var thing of things) {
var promise = dothing(db, thing);
promises.push(promise);
}
return Promise.all(promises);
}
function dothing(db, thing) {
return new Promise(function(resolve, reject) {
var tx = db.transaction('svgs');
var store = tx.objectStore('svgs');
var request = store.get(thing);
request.onsuccess = () => resolve(request.result);
});
}
function foo() {
var things = getthings();
connect().then(function(db) {
return dothings(db, things);
});
}

How to wait for the first function to finish executing before the second is executed?

I'm trying to figure out a way to wait until the first function (relatedVids(...)) to finish executing and then in turn execute the second function (relatedVidsDetails()). What the code does is simply loop through a single $.get(...) request from the youtube api and loop through each of its item retrieved.
The problem with this as tested in the debugger, is that it will go in the first function (up to the $.get() without actually getting anything yet) then skip into the second function (once again, up to the $.get()). Then, it will proceed to execute the first function till it's finished retrieving all items, then when it gets into the second function, it will do the same thing but for some mysterious reason, the videoIdChainStr which holds all the video ids in a string from the first function is never retrieved or being executed since I suspected it executed the second function's $.get(...) already and never did it again a "second time" when it had the values.
So, my next step is trying to use $.Deferred() which is said to help resolve the first function first before even stepping into executing the second function so it will guaranteed values from the first function to be used in the second without skipping anything. But I'm not sure if I'm doing this right as it still does the same thing with or without using $.Deferred().
First function (relatedVids(...)):
var relatedVidsDefer = function relatedVids(videoId)
{
var r = $.Deferred();
$.get( // get related videos related to videoId
"https://www.googleapis.com/youtube/v3/search",
{
part: 'snippet',
maxResults: vidResults,
relatedToVideoId: videoId,
order: 'relevance',
type: 'video',
key: 'XXXXXXXXXX'
},
function(data)
{
debugger;
$.each(data.items,
function(i, item)
{
try
{
console.log(item);
var vidTitle = item.snippet.title; // video title
var vidThumbUrl = item.snippet.thumbnails.default.url; // video thumbnail url
var channelTitle = item.snippet.channelTitle; // channel of uploaded video
var extractVideoId = null; // var to extract video id string from vidThumbUrl
// check if vidThumbUrl is not null, empty string, or undefined
if(vidThumbUrl)
{
var split = vidThumbUrl.split("/"); // split string when '/' seen
extractVideoId = split[4]; // retrieve the fourth index on the fourth '/'
}
else console.error("vidThumbUrl is either undefined or null or empty string.");
// if video title is longer than 25 characters, insert the three-dotted ellipse
if(vidTitle.length > 25)
{
var strNewVidTitle = vidTitle.substr(0, 25) + "...";
vidTitle = strNewVidTitle;
}
// check whether channelTitle is March of Dimes
if(channelTitle === "March of Dimes")
{
extractedVideoIdArr.push(extractVideoId); // add the extracted video id to the array
// check if extractedVideoIdArr is not empty
if(extractedVideoIdArr !== 'undefined' && extractedVideoIdArr.length > 0)
{
console.log("before join(): ", extractedVideoIdArr);
videoIdChainStr = extractedVideoIdArr.join(", "); // change from an array to a chain string of videoIds for the relatedVidsDetails()
console.log("after join(): ", videoIdChainStr);
}
var vidThumbnail = '<div class="video-thumbnail"><a class="thumb-link" href="single-video.html"><div class="video-overlay"><img src="imgs/video-play-button.png"/></div><img src="' + vidThumbUrl + '" alt="No Image Available." style="width:204px;height:128px"/></a><p><a class="thumb-link" href="single-video.html">' + vidTitle + '</a><br/></div>'; //+ convert_time(vidDuration) + ' / Views: ' + viewCount + '</p></div>';
// print results
$('.thumb-related').append(vidThumbnail);
$(item).show(); // show current video thumbnail item
}
else $(item).hide(); // hide current video thumbnail item
}
catch(err)
{
console.error(err.message); // log error but continue operation
}
}
);
}
);
return r;
};
Second function (relatedVidsDetails(...)):
var relatedVidsDetailsDefer = function relatedVidsDetails()
{
// change extractvideoid into a string by tostring() or join() for param to recognize
console.log("initial: ", extractedVideoIdArr);
$.get(
"https://www.googleapis.com/youtube/v3/videos",
{
part: 'snippet, contentDetails, statistics',
id: videoIdChainStr, // chain string of video ids to be called upon in a single request
key: 'XXXXXXXXXX',
},
function(data)
{
debugger;
$.each(data.items,
function(i, item)
{
try
{
console.log("relatedVidsDetails()", item);
console.log("extractedvideoidarr: ", extractedVideoIdArr[i]);
var _vidDuration = item.contentDetails.duration;
var _viewCount = item.statistics.viewCount;
console.log("id: " + extractedVideoIdArr[i] + " duration: " + _vidDuration);
console.log("id: " + extractedVideoIdArr[i] + " viewCount: " + _viewCount);
}
catch(err)
{
console.error(err.message); // log error but continue operation
}
}
);
}
);
};
Code being skipped when the second function has been tapped into the second time:
$.each(data.items,
function(i, item)
{
try
{
console.log("relatedVidsDetails()", item);
console.log("extractedvideoidarr: ", extractedVideoIdArr[i]);
var _vidDuration = item.contentDetails.duration;
var _viewCount = item.statistics.viewCount;
console.log("id: " + extractedVideoIdArr[i] + " duration: " + _vidDuration);
console.log("id: " + extractedVideoIdArr[i] + " viewCount: " + _viewCount);
}
catch(err)
{
console.error(err.message); // log error but continue operation
}
}
);
The code being skipped due to the second function being stepped into when videoIdChainStr was empty and then skipped when first function completed and had the values ready for the second to use. I couldn't get this videoIdChainStr when it has values to then execute.
Well the answer to the question: "How to wait for the first function to finish executing before the second is executed" would be:
USE CALLBACK OR PROMISES.
Example:
function firstMethod() {
// half of first method actions ...
secondMethod(function() {
// other half of first method actions
});
}
function secondMethod(callBack) {
// Second method actions
setTimeout(function() { // The timeout is just a way to simulate the time that a response make us wait
callBack();
}, 5000);
}

Cannot break recursion with $.Deffered() object and $.then()

I have to search through word index tables with potentially hundreds of thousands rows. I can restrict my search by passing a list of documents to the search. Request to search for words in many documents return very slowly. So...to improve the UX we're chunking the request into several groups of documents. So, if a user asks to search 90 documents, and the chunk size is 10 documents per query, then we send out 90 / 10 = 9 independent $.ajax() calls. We want the results to come in the order they were sent.
We implement this recursion:
var SearchFunction = function () {
$.ajax(/* ... */);
}
var RecursiveSearch = function () {
var deferred = $.Deferred();
if (arrTransSearch.length > 0) {
deferred = SearchDocuments(arrTransSearch.shift());
}
else {
deferred.reject();
}
return deferred.promise().then(RecursiveSearch);
}
if (arrTransSearch.length > 1) {
RecursiveSearch().fail(SomeFunction);
}
var SomeFunction = function () {
alert("Failed. Yes!");
}
When I debug the code, it appears that deferred.reject() does not change the state of deferred.promise(). That is, when the next line
return deferred.promise().then(RecursiveSearch)
is executed, it just loops back into the recursive function, instead of exiting the recursion and falling into
RecursiveSearch().fail(SomeFunction);
Important Note:
I'm using jQuery-1.7.1. I ran analogous recursion in JSFiddle (Thank you Beeetroot-Beetroot) and it failed on jQuery-1.7.2 while on jQuery-2.1.0 it ran without a problem.
Any idea on how to get the recursion to work in jQuery-1.7.1?
A pattern partly covering what you are looking for is provided here under the heading "The Collection Kerfuffle". You actually need slightly more than that because you want to address your list of document referneces in chunks (groups).
The code will be something like this :
$(function() {
//General ajax options for searching a document group
var ajaxOptions = {
url: '...',
type: 'POST',
//data: ... //added dynamically
dataType: 'JSON',
// etc.
};
//
function searchDocumentsInGroups(arr, n) {
//Pre-process arr to create an array of arrays, where each inner array is a group of document references
var groups = [];
$.each(arr, function (i) {
if (!(i % n)) groups.push(arr.slice(i, i + n));
});
//Ajax serializer (from the Collection Kerfuffle reference)
return groups.reduce(function (promise, group) {
return promise.then(function () {
return $.ajax($.extend({}, ajaxOptions, {
data: JSON.stringify(group);//or whatever, compatible with the server-side script
})).then(function (groupResults) {
//display groupResults here
});
});
}, $.when(0));
}
// data
var myDocumentArray = [ 'doc1', 'doc2', 'doc3', 'doc4', 'etc.' ], //Your array of 90 document references.
groupSize = 10; //Number of documents per "chunk".
// Event handler to kick off the process.
$("#searchDocuments").on('click', function () {
// display "in progress" message or spinner here
searchDocumentsInGroups(myDocumentArray, groupSize).then(function () {
// display "complete" message or hide spinner here
});
});
});
You also need the Polyfill for Array.prototype.reduce, as .reduce is relied on above and older browsers (pre ECMAScript5) don't have it.
if ( 'function' !== typeof Array.prototype.reduce ) {
Array.prototype.reduce = function( callback /*, initialValue*/ ) {
'use strict';
if ( null === this || 'undefined' === typeof this ) {
throw new TypeError(
'Array.prototype.reduce called on null or undefined' );
}
if ( 'function' !== typeof callback ) {
throw new TypeError( callback + ' is not a function' );
}
var t = Object( this ), len = t.length >>> 0, k = 0, value;
if ( arguments.length >= 2 ) {
value = arguments[1];
} else {
while ( k < len && ! k in t ) k++;
if ( k >= len )
throw new TypeError('Reduce of empty array with no initial value');
value = t[ k++ ];
}
for ( ; k < len ; k++ ) {
if ( k in t ) {
value = callback( value, t[k], k, t );
}
}
return value;
};
}
All untested but I recently answered a similar question here, with a link to a fiddle.
It turns out that until jQuery-1.8, calling $.then() with one argument was the equivalent of calling $.then(successFunction, successFunction). Since I was using jQuery-1.7.1, a rejected promise would still invoke the recursion.

Categories