Preserving indentation with Tesseract.js library - javascript

I am using the tesseract.js library in my Angular code.
I want to preserve the whitespace and the indentation exactly as they are. How can I do that?
Currently I am using this piece of code to run the OCR.
async doOCR {
const worker = createWorker({
logger: m => console.log(m),
});
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const value = await worker.recognize(this.selectedFile);
}
I am looking for a way to do this on the client side only; that's why I am not using the Python library.

Tesseract added the `preserve_interword_spaces` option in version 3.04, so you can give it a try. Try this and check whether it works:
async doOCR {
const worker = createWorker({
logger: m => console.log(m),
});
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
// there is no proper documentation, but they have added this flag
// to run it as a command
await worker.setParameters({
preserve_interword_spaces: 1,
});
const value = await worker.recognize(this.selectedFile);
}

Related

puppeteer: How to intercept ServiceWorker/WebWorker request?

There are some older solutions to this question: one on GitHub, the other on Stack Overflow.
Older versions of Puppeteer expose a `_client` property on the page.
The solution for older versions is as follows:
// Sends the Network.setBypassServiceWorker command (bypass: true) through the
// underscore-prefixed `_client` property of `page`.
page._client.send('Network.setBypassServiceWorker', {bypass: true})
My Puppeteer version is 18.0.5, in which Page no longer has the `_client` property.
So the equivalent solution for newer versions is as follows:
// Open a CDP session on the page's target and send protocol commands through it.
const client = await page.target().createCDPSession();
// Request that service workers be bypassed (bypass: true).
await client.send("Network.setBypassServiceWorker", { bypass: true });
But it is not working.
So how can this problem be resolved?
We should add a new line, await client.send("Network.enable"), to enable the network. The code is as follows:
...
// Open a CDP session on the page's target.
const client = await page.target().createCDPSession();
await client.send("Network.enable"); // Must enable network.
// Sent only after Network.enable has been issued on the same session.
await client.send("Network.setBypassServiceWorker", { bypass: true });
// Turn on request interception for the page.
await page.setRequestInterception(true);
...
So we can handle the response in page.on().
...
// Register an async listener for the page's "response" event; `res` is the
// argument the listener receives. The body is a placeholder.
page.on("response", async (res) => {
// do somethings.
})
...

Trying to use puppeteer inside async function inside async function which already has puppeteer

I'm trying to build a Telegram bot that parses a page on user request. My parsing code works fine inside one async function, but completely falls on its face if I try to put it inside another async function.
Here is the relevant code I have:
const puppeteer = require('puppeteer');
const fs = require('fs/promises');
const { Console } = require('console');
async function start(){
async function searcher(input) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
const url = ; //here is a long url combining logic, that works fine
await page.goto(url);
const currentUrl = requestPage.url();
console.log(currentUrl); //returns nothing.
//here is some long parsing logic
await browser.close();
return combinedResult;
}
//here is a bot code
const { Telegraf } = require('telegraf');
const bot = new Telegraf('my bot ID');
bot.command('start', ctx => {
console.log(ctx.from);
bot.telegram.sendMessage(ctx.chat.id, 'Greatings message', {});
bot.telegram.sendMessage(ctx.chat.id, 'request prompt ', {});
})
bot.on('text', (ctx) => {
console.log(ctx.message.text);
const queryOutput = searcher(ctx.message.text);
bot.telegram.sendMessage(ctx.chat.id, queryOutput, {});
});
bot.launch()
}
start();
Here is an error message:
/Users/a.rassanov/Desktop/Fetch/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:218
return Promise.reject(new Error(`Protocol error (${method}): Session closed. Most likely the ${this._targetType} has been closed.`));
^
Error: Protocol error (Page.navigate): Session closed. Most likely the page has been closed.
I'm very new to this, and your help is really appreciated.

Using wappalyzer and puppeteer in node.js

I am trying to build a scraper to monitor web projects automatically.
So far so good, the script is running, but now I want to add a feature that automatically analyses which libraries I used in the projects. The most powerful script for this job is Wappalyzer. They have a Node package (https://www.npmjs.com/package/wappalyzer), and its documentation says you can use it combined with Puppeteer.
I managed to run Puppeteer and to log the source code of the sites in the console, but I can't figure out the right way to pass the source code to the Wappalyzer analyse function.
Do you guys have a hint for me?
I tried this code but I am getting a TypeError: url.split is not a function
// Launches a headless Puppeteer browser, loads `url`, reads the page HTML and
// then tries to run a Wappalyzer analysis, logging the results as JSON.
// All work happens inside un-awaited async IIFEs, so the function itself
// returns nothing.
function getLibarys(url) {
(async () => {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.goto(url);
// get source code with puppeteer
// (`html` is not read again anywhere in this function)
const html = await page.content();
const wappalyzer = new Wappalyzer();
// Inner IIFE is invoked without `await`, so the `browser.close()` further
// down does not wait for the analysis to finish.
(async function () {
try {
await wappalyzer.init()
// Optionally set additional request headers
const headers = {}
// NOTE(review): `page` is the Puppeteer Page object, not the `url` string;
// presumably this is what triggers the reported
// `url.split is not a function` TypeError — confirm against the wappalyzer API.
const site = await wappalyzer.open(page, headers)
// Optionally capture and output errors
site.on('error', console.error)
const results = await site.analyze()
console.log(JSON.stringify(results, null, 2))
} catch (error) {
console.error(error)
}
await wappalyzer.destroy()
})()
await browser.close()
})()
}
Fixed it by using the sample code from wappalyzer.
/**
 * Analyses `url` with Wappalyzer and prints the results as pretty-printed JSON.
 *
 * The analysis is started but not awaited: the function returns immediately
 * with no value. Errors raised during init/open/analyze are logged to the
 * console, and the Wappalyzer instance is destroyed afterwards.
 *
 * @param {string} url - Address handed to `wappalyzer.open`.
 */
function getLibarys(url) {
  const Wappalyzer = require('wappalyzer');

  const wappalyzer = new Wappalyzer({
    debug: false,
    delay: 500,
    headers: {},
    maxDepth: 3,
    maxUrls: 10,
    maxWait: 5000,
    recursive: true,
    probe: true,
    proxy: false,
    userAgent: 'Wappalyzer',
    htmlMaxCols: 2000,
    htmlMaxRows: 2000,
    noScripts: false,
    noRedirect: false,
  });

  const runAnalysis = async () => {
    try {
      await wappalyzer.init();
      // Additional request headers may be supplied here.
      const requestHeaders = {};
      const site = await wappalyzer.open(url, requestHeaders);
      // Surface errors emitted by the site object.
      site.on('error', console.error);
      const report = await site.analyze();
      console.log(JSON.stringify(report, null, 2));
    } catch (err) {
      console.error(err);
    }
    await wappalyzer.destroy();
  };

  // Fire and forget, as in the original snippet.
  runAnalysis();
}
I do not know if you still need an answer to this. But this is what a wappalyzer collaborator told me:
Normally you'd run Wappalyzer like this:
// Default usage: Wappalyzer starts its own browser via init().
// NOTE(review): the `await` calls need an enclosing async function.
const Wappalyzer = require('wappalyzer')
const wappalyzer = new Wappalyzer()
await wappalyzer.init() // Launches a Puppeteer instance
// Open `url`; the returned `site` object is kept for further use.
const site = await wappalyzer.open(url)
If you want to use your own browser instance, you can skip wappalyzer.init() and assign the instance to wappalyzer.browser:
// Custom-browser usage: init() is not called; a browser launched by the
// caller is assigned to `wappalyzer.browser` instead.
// NOTE(review): `puppeteer` is not required in this snippet and must already
// be in scope; the `await` calls need an enclosing async function.
const Wappalyzer = require('wappalyzer')
const wappalyzer = new Wappalyzer()
wappalyzer.browser = await puppeteer.launch() // Use your own Puppeteer launch logic
const site = await wappalyzer.open(url)
You can find the discussion here.
Hope this helps.

How to get passed or failed test case name in the puppeteer

I need to integrate the puppeteer-jest test framework with TestRail using the TestRail API. But for that, I need to know which tests failed and which tests passed.
I searched for information in the official GitHub repository and on the Jest site, but found nothing about it.
Test:
/**
 * End-to-end suite for the single company page.
 *
 * `beforeAll` obtains an authenticated page via `addTokenToBrowser`; the test
 * creates a list for a company through the "add company name" popup, opens
 * the company page and checks that each expected tab is present.
 */
describe('Single company page Tests:', () => {
  let homePage;

  beforeAll(async () => {
    homePage = await addTokenToBrowser(browser);
  }, LOGIN_FLOW_MAX_TIME);

  // Close the page in a hook so it is released even when the test fails.
  afterAll(async () => {
    if (homePage) {
      await homePage.close();
    }
  });

  // Plain async test: Jest waits for the returned promise, so no `done`
  // callback is taken (an async test must not also accept `done`).
  it('Open the company page from the list', async () => {
    await goto(homePage, LIST_PAGE_RELATIVE_PATH);
    await listPage.clickSearchByCompanyName(homePage);
    await addCompanyNamePopup.isPopupDisplayed(homePage);
    await addCompanyNamePopup.fillCompanyName(homePage, companies.century.link);
    await addCompanyNamePopup.clickNext(homePage);
    await addCompanyNamePopup.fillListName(homePage, listNames[0]);
    await addCompanyNamePopup.clickSave(homePage);
    await addCompanyNamePopup.clickViewList(homePage);
    const nextPage = await clickCompanyName(homePage, browser, companies.century.name);
    await companyPage.isOverviewTabPresent(nextPage);
    await companyPage.isPeopleTabPresent(nextPage);
    await companyPage.isSocialTabPresent(nextPage);
    await companyPage.isFinanceTabPresent(nextPage);
    await companyPage.isLeaseTabPresent(nextPage);
  });
});
I expected to get the names of all passed and failed test cases and write them to a JSON file together with the result of each test case.
Actually, I have none of this.
You can use a true/false assertion approach, like I do in my GitHub project.
For example, try anchoring the test case to some final selector with a simple assert:
/**
 * Example end-to-end test: waits for the colour button, clicks it, and
 * asserts that the click path was reached.
 */
describe('E2E testing', () => {
  it('[Random Color Picker] color button clickable', async () => {
    // Setup
    const expected = true;
    const expectedCssLocator = '#color-button';
    let actual = false;

    // Execute
    // The awaited value is the resolved result of waitForSelector, not a promise.
    const button = await page.waitForSelector(expectedCssLocator);
    if (button != null) {
      await page.click(expectedCssLocator);
      actual = true;
    }

    // Verify
    assert.strictEqual(actual, expected);
  });
});

I can't go from a page to another using page.goto() - Puppeteer

I'm trying to make an Instagram bot that logs in and then goes to some profile. My code worked yesterday for a while and then it just stopped working.
I've tried to clone my repository from GitHub, but it doesn't work either. Sometimes it works again, but if I try to create another function, the code just ignores the line that changes the page.
I've also tried to create a new page and then use the goto function in this new page, and it worked, but the account doesn't stay logged in.
The version of puppeteer that I'm using: 1.16.0
The version of node.js that I'm using: v10.15.3
const puppeteer = require('puppeteer');
const BASE_URL = "https://www.instagram.com/accounts/login/?hl=en&source=auth_switcher";
/**
 * Small Puppeteer-driven helper for logging in to Instagram and opening a
 * profile. State (browser, page, profile URL) is kept on the object itself.
 */
const instagram = {
  browser: null,
  page: null,
  profile_url: null,

  /**
   * Launches a non-headless browser, remembers the profile URL for `profile`
   * and opens the login page (BASE_URL).
   *
   * @param {string} profile - Profile name appended to the Instagram base address.
   */
  initialize: async (profile) => {
    instagram.browser = await puppeteer.launch({
      headless: false,
    });
    // Plain string building; nothing to await here.
    instagram.profile_url = `https://www.instagram.com/${profile}`;
    instagram.page = await instagram.browser.newPage();
    await instagram.page.goto(BASE_URL, { waitUntil: 'networkidle2' });
  },

  /**
   * Fills in and submits the login form, then navigates to the stored
   * profile URL.
   *
   * @param {string} username - Value typed into the username input.
   * @param {string} password - Value typed into the password input.
   */
  login: async (username, password) => {
    await instagram.page.waitFor(1000);
    await instagram.page.type('input[name="username"]', username);
    // Attribute selector is now properly closed with `]`.
    await instagram.page.type('input[name="password"]', password);
    await instagram.page.click('button[type="submit"]');
    // NOTE(review): a fixed delay does not guarantee that the navigation
    // started by the submit has finished before the goto below runs.
    await instagram.page.waitFor(1500);
    console.log(instagram.profile_url);
    await instagram.page.goto(instagram.profile_url, { timeout: 0, waitUntil: 'domcontentloaded' }); // the code just ignore this line
    await instagram.page.waitFor(1000);
  },

  /** Placeholder: only logs a message for now. */
  getPhotosLinks: async () => {
    console.log("Do something here");
  },
};
module.exports = instagram;
It doesn't give any error message, just doesn't work
Replace
await instagram.page.click('button[type="submit"]');
await instagram.page.waitFor(1500);
with
// Wait for the navigation and issue the click together, so the navigation
// triggered by submitting the form is not missed.
await Promise.all([
  instagram.page.waitForNavigation(),
  instagram.page.click('button[type="submit"]'),
]);
and see if it works

Categories