Does this iframe buster script look safe? - javascript

We're being asked to host a number of iframe buster scripts on our site - they allow ads which are served from external domains into iframes to expand outside of them into the host page. Our hosting provider's warned us to watch out for security holes in these scripts. Specifically, they say some of them create cross-site scripting holes by allowing a piece of Javascript to be loaded into our site from any URL.
To implement the script, you host an HTML page on your site. I'm looking at an example from the ad provider Atlas. In this case the URL is like http://domain.com/atlas/atlas_rm.htm. That page contains a script tag with src at an external URL, and here's the JS it includes:
var ARMIfbLib = function () {
function documentWrite(htmlString) {
document.write(htmlString);
}
function writeIframeBustingScript() {
var imgSrvPath = getTlDirectoryFromQueryString(getParameterString());
if (imgSrvPath != "") {
var scriptURL = imgSrvPath + getScriptFileName();
ARMIfbLib.DocumentWrite("<script language='javascript' type='text/javascript' src='" + scriptURL + "'></scr" + "ipt>");
}
}
return {
WriteIframeBustingScript: writeIframeBustingScript,
DocumentWrite: documentWrite
}
}();
function getValueFromDelimitedString(paramKey, delimiter, queryString) {
if (paramKey == "imgSrv")
return getValueFromProperties();
var re = new RegExp(paramKey + "=" + "(.*?)" + "(" + delimiter + "|$)");
var matchArray = queryString.match(re);
if (matchArray == null)
return "";
else
return matchArray[1];
}
function getValueFromProperties() {
var iframename = unescape(self.name);
if (iframename.indexOf("<form") >= 0) {
var params = iframename.split("<input ");
for (var i = 1; i < params.length; i++) {
var parts = params[i].split(" ");
for (var j = 0; j < parts.length; j++) {
var param = parts[j].split("=");
if (param[0].indexOf("name") >= 0 && param[1].indexOf("TL_files_path") >= 0) {
param = parts[j + 1].split("=");
if (param[0].indexOf("value") >= 0) {
var value = param[1].substr(1, param[1].indexOf(">"));
value = value.substr(value, value.lastIndexOf("/"));
value = value.substr(value, value.lastIndexOf("/") + 1);
return unescape(value);
}
}
}
}
}
else if (iframename.indexOf("adparamdelim") >= 0) {
var params = iframename.split("adparamdelim");
for (var i = 0; i < params.length; i++) {
var param = params[i].split("=");
if (param[0].indexOf("TL_files_path") >= 0) {
var value = param[1];
value = value.substr(value, value.lastIndexOf("/"));
value = value.substr(value, value.lastIndexOf("/") + 1);
return value;
}
}
}
else if (/^\{.*\}$/.test(iframename)) {
try {
eval('var results = ' + iframename);
var value = results.TL_files_path;
value = value.substr(value, value.lastIndexOf("/"));
value = value.substr(value, value.lastIndexOf("/") + 1);
return value;
} catch (e) {
return "";
}
} else {
var params = iframename.split("&");
for (var i = 0; i < params.length; i++) {
var param = params[i].split("=");
if (param[0].indexOf("TL_files_path") >= 0) {
var value = unescape(param[1]);
value = value.substr(value, value.lastIndexOf("/"));
value = value.substr(value, value.lastIndexOf("/") + 1);
return value;
}
}
}
return "";
}
function getTlDirectoryFromQueryString(sLocation) {
var queryVar = getValueFromDelimitedString("imgSrv", "a4edelim", sLocation);
var temp = queryVar.substr(0, queryVar.lastIndexOf("/"));
var tlDir = temp.substr(0, temp.lastIndexOf("/") + 1);
return tlDir;
}
function getDocumentQueryString() {
return window.location.search;
}
function getIframeParameterString() {
var ret = "";
var qs = getDocumentQueryString();
if (qs.length > 0)
ret = qs.substring(1);
return ret;
}
function getScriptParameterString() {
var ret = "";
var scripts = document.getElementsByTagName('script');
for (var i = 0; i < scripts.length; i++) {
var scriptSrc = scripts[i].src;
if (scriptSrc.toLowerCase().indexOf("newiframescript") != -1 && scriptSrc.indexOf("?") != -1) {
ret = scriptSrc.substr(scriptSrc.indexOf("?") + 1);
break;
}
}
return ret;
}
function getParameterString() {
var qs = getIframeParameterString();
if (qs.length > 0 && qs.indexOf("a4edelim") > 0)
return qs;
return getScriptParameterString();
}
function getScriptFileName() {
var armdelim = ",";
var fileName = "ifb.0";
var queryString = getParameterString();
var parmValue = "";
if (queryString.length > 0) {
parmValue = getValueFromDelimitedString("armver", "a4edelim", queryString);
}
if (parmValue.length > 0) {
var fileNames = parmValue.split(armdelim);
for (var i = 0; i < fileNames.length; i++) {
if (fileNames[i].toLowerCase().indexOf("ifb") != -1) {
fileName = fileNames[i];
break;
}
}
}
return fileName + ".js";
}
if (typeof(armTestMode) == "undefined") {
ARMIfbLib.WriteIframeBustingScript();
}
I've spent a couple of hours studying this to try and work out what it's doing, but I've got bogged down in the different function calls. It seems to be grabbing a query string parameter or else a value from the name of an iframe, presumably the iframe the contains the ad.
Can anyone understand what this JS is doing? Does it look fairly safe from a XSS point of view?
=========================================
EDIT
In case useful to anybody else, we mentioned this concern to the providers, and their response was:
The iframe buster page will only work if it is in an iframe
The code in the ftlocal.html file will only work if the domain of the iframe is already the same as the domain of the parent page – So any code would already have access to the parent page anyway

The the JS script creates a dynamically generated script tag in your page.
ARMIfbLib.DocumentWrite("<script language='javascript' type='text/javascript' src='" + scriptURL + "'></scr" + "ipt>");
If you dig into where scriptURL comes from, it appears to be a parameter passed to window.location.search (the query string).
From what I can see this effectively allows any script to be passed to your page on the query string rendering it vulnerable to DOM XSS, unless it is effectively secured to allow the domain to be set by the frame name in your page. I'd do some testing using your own domains and passing the query string variables that are searched for (the string literals in the JS).

Related

Comparing JPG files with Photoshop Layers

Is it possible to compare filenames for a set of files that are imported as Photoshop layers ?
I have a folder of 50 jpg images which I have used in a PSD file.
Now I want to check whether all the JPG files are used or not ?
Is it possible to do so ?
As I've said, Photoshop scripting can help you achieve this by using File Objects and basic javascript knowledge. I've modified my old script as you've desired and now it should work well with any nested groups and images.
I highly encourage you to learn scripting and ask questions here wherever you feels confused.
Save below code as 'Script.jsx' and run it from 'File > Scripts > Browse'
Update 2 : Now it saves log.txt file too as per you requested. P.S. Learn from this script and tweak it to your desired result.
// Managing Document
var docs = app.documents;
// Progress Bar
var win = new Window("window{text:'Progress',bounds:[100,100,400,150],bar:Progressbar{bounds:[20,20,280,31] , value:0,maxvalue:100}};");
// assigning activeDocument
if (docs.length != 0) {
var docRef = app.activeDocument;
// Defining the folder
alert("You will be prompted for the folder containing your images.\n" +
"Files will be selected with a '.png'/'.jpg/.jpeg' on the end in the same folder.");
var folder = Folder.selectDialog();
if (!folder) {
exit;
}
var photoFiles = folder.getFiles(/\.(jpg|jpeg|png)$/i);
var matchFiles = [];
var photoFilesName = [];
//Searching for used images
var increment = parseFloat(0);
var divider = parseFloat(100/photoFiles.length);
win.show();
for (var i = 0; i < photoFiles.length; i++) {
increment = increment + divider;
var indexPhotoName = removeExtension(photoFiles[i].displayName);
photoFilesName.push(indexPhotoName);
var doc = activeDocument;
var curLayer;
goThroughLayers(doc, indexPhotoName);
}
function goThroughLayers(parentLayer, targetName) {
for (var i = 0; i < parentLayer.layers.length; i++) {
curLayer = parentLayer.layers[i];
doc.activeLayer = curLayer;
if (curLayer.typename == 'LayerSet') {
goThroughLayers(curLayer, targetName)
} else {
if (curLayer.name == targetName) {
// if (curLayer.name.match(/[e]/ig)) {
matchFiles.push(targetName);
// }
} //end if
} //end else
} //end loop
} //end function
function arr_diff(a1, a2) {
var a = [],
diff = [];
for (var i = 0; i < a1.length; i++) {
a[a1[i]] = true;
}
for (var i = 0; i < a2.length; i++) {
if (a[a2[i]]) {
delete a[a2[i]];
} else {
a[a2[i]] = true;
}
}
for (var k in a) {
diff.push(k);
}
return diff;
}
function removeExtension(str) {
return str.split('.').slice(0, -1).join('.');
}
var missItems = arr_diff(matchFiles, photoFilesName);
if (missItems.length > 0) {
var missFolder = new Folder(photoFiles[0].path + '/Missed%20Files');
if(!missFolder.exists){
missFolder.create();
}
for (var y = 0; y < photoFiles.length; y++) {
var photoTrimName = removeExtension(photoFiles[y].displayName);
for( var x = 0; x < missItems.length ; x++){
if(photoTrimName == missItems[x]){
photoFiles[y].copy(new File(missFolder+'/'+photoFiles[y].displayName));
}
}
};
win.close();
alert("You've missed total " + missItems.length + " files. Press OK to open folder containing missing files. Log report is generated wherever PSD is saved.");
var FileStr = "";
for(var m=0; m<missItems.length; m++){
FileStr = FileStr + '\n' + (m+1) + '. ' + missItems[m];
}
var str = "Your missed files are : " + FileStr;
saveTxt(str);
missFolder.execute();
} else {
win.close();
saveTxt('All Photos are used');
alert('All Photos are used');
}
} else {
alert('Open atleast one document');
}
function saveTxt(txt)
{
var Name = "LogReport_" + app.activeDocument.name.replace(/\.[^\.]+$/, '');
var Ext = decodeURI(app.activeDocument.name).replace(/^.*\./,'');
if (Ext.toLowerCase() != 'psd')
return;
var Path = app.activeDocument.path;
var saveFile = File(Path + "/" + Name +".txt");
if(saveFile.exists)
saveFile.remove();
saveFile.encoding = "UTF8";
saveFile.open("e", "TEXT", "????");
saveFile.writeln(txt);
saveFile.close();
}
In Javascript, it is possible to get some information related to PSD file layers using PSD.js library

How can get only the last element from an array javascript

I'm trying to get the only last visited page by the user, it should update on every page. I tried to save the URL in session cookie from there I'm retrieving the last visited page.
This is my code:
function getSecondCookie(cSname) {
var name = cSname + "=";
var ca = document.cookie.split(';');
for ( var i = 0; i < ca.length; i++) {
var c = ca[i].trim();
if (c.indexOf(name) == 0)
return c.substring(name.length, c.length);
}
return "";
}
function checkHistory(targetId) {
var history = getSecondCookie("history");
var htmlContent = '';
if (history != "") {
var insert = true;
var sp = history.toString().split(",");
for ( var i = sp.length - 1; i >= 0; i--) {
htmlContent += '<a class="previous_url" href="'
+ sp[i]
+ '">'
+ sp[i].substring(sp[i].lastIndexOf('/') + 1) + '</a><br>';
if (sp[i] == document.URL) {
insert = false;
}
document.getElementById(targetId).innerHTML = htmlContent;
console.log(sp[i]);
}
if (insert) {
sp.push(document.URL);
}
setSecondCookie("history", sp.toString(), 30);
} else {
var stack = new Array();
stack.push(document.URL);
setSecondCookie("history", stack.toString(), 30);
}
}
This is working fine how it has to.
At the moment it is showing like this:
https://www.example.com,https://www.example.com/about.html
and so on.
But I want here show only last element from an array. How can I do this?
let lastItem = array[array.length-1];
You can also do:
const last = array.slice(-1)[0]

How to add or replace a query parameter in a URL using Javascript/jQuery?

I'm using jQuery 1.12. I want to replace a query string parameter in my window's URL query string, or add the parameter if doesn't exist. I tried the below:
new_url = window.location.href.replace( /[\?#].*|$/, "?order_by=" + data_val )
window.location.href = new_url
but what I'm discovering is that this wipes out all previous parameters in the query string, which I don't want. If the query string is:
?a=1&b=2
I would want the new query string to be:
?a=2&b=2&order_by=data
and if the query string was:
?a=2&b=3&order_by=old_data
it would become:
?a=2&b=3&order_by=data
You could use a jQuery plugin to do the all the heavy lifting for you. It will parse the query string, and also reconstruct the updated query string for you. Much less code to deal with.
Plugin Download Page
Github Repo
// URL: ?a=2&b=3&order_by=old_data
var order_by = $.query.get('order_by');
//=> old_data
// Conditionally modify parameter value
if (order_by) {
order_by = “data”;
}
// Inject modified parameter back into query string
var newUrl = $.query.set(“order_by”, order_by).toString();
//=> ?a=2&b=3&order_by=data
For those using Node.js, there is a package for this available in NPM.
NPM Package
Github Repo
var queryString = require('query-string');
var parsed = queryString.parse('?a=2&b=3&order_by=old_data'); // location.search
// Conditionally modify parameter value
if (parsed.order_by) {
parsed.order_by = 'data';
}
// Inject modified parameter back into query string
const newQueryString = queryString.stringify(parsed);
//=> a=2&b=3&order_by=data
A good solution ought to handle all of the following:
A URL that already has an order_by query parameter, optionally with whitespace before the equals sign. This can be further divided into cases where the order_by appears at the start, middle or end of the query string.
A URL that doesn't already have and order_by query parameter but does already have a question mark to delimit the query string.
A URL that doesn't already have and order_by query parameter and doesn't already have a question mark to delimit the query string.
The following will handle the cases above:
if (/[?&]order_by\s*=/.test(oldUrl)) {
newUrl = oldUrl.replace(/(?:([?&])order_by\s*=[^?&]*)/, "$1order_by=" + data_val);
} else if (/\?/.test(oldUrl)) {
newUrl = oldUrl + "&order_by=" + data_val;
} else {
newUrl = oldUrl + "?order_by=" + data_val;
}
as demonstrated below:
getNewUrl("?a=1&b=2");
getNewUrl("?a=2&b=3&order_by=old_data");
getNewUrl("?a=2&b=3&order_by = old_data&c=4");
getNewUrl("?order_by=old_data&a=2&b=3");
getNewUrl("http://www.stackoverflow.com");
function getNewUrl(oldUrl) {
var data_val = "new_data";
var newUrl;
if (/[?&]order_by\s*=/.test(oldUrl)) {
newUrl = oldUrl.replace(/(?:([?&])order_by\s*=[^?&]*)/, "$1order_by=" + data_val);
} else if (/\?/.test(oldUrl)) {
newUrl = oldUrl + "&order_by=" + data_val;
} else {
newUrl = oldUrl + "?order_by=" + data_val;
}
console.log(oldUrl + "\n...becomes...\n" + newUrl);
}
something like this?
let new_url = "";
if (window.location.search && window.location.search.indexOf('order_by=') != -1) {
new_url = window.location.search.replace( /order_by=\w*\d*/, "order_by=" + data_val);
} else if (window.location.search) {
new_url = window.location.search + "&order_by=" + data_val;
} else {
new_url = window.location.search + "?order_by=" + data_val;
}
window.location.href = new_url;
function addOrReplaceOrderBy(newData) {
var stringToAdd = "order_by=" + newData;
if (window.location.search == "")
return window.location.href + stringToAdd;
if (window.location.search.indexOf('order_by=') == -1)
return window.location.href + stringToAdd;
var newSearchString = "";
var searchParams = window.location.search.substring(1).split("&");
for (var i = 0; i < searchParams.length; i++) {
if (searchParams[i].indexOf('order_by=') > -1) {
searchParams[i] = "order_by=" + newData;
break;
}
}
return window.location.href.split("?")[0] + "?" + searchParams.join("&");
}
window.location.href = addOrReplaceOrderBy("new_order_by");
A little long but I think it works as intended.
You can remove parameter from query string using URLSearchParams https://developer.mozilla.org/ru/docs/Web/API/URLSearchParams?param11=val
It is not yet supported by IE and Safari, but you can use it by adding polyfill https://github.com/jerrybendy/url-search-params-polyfill
And for accessing or modifying query part of the URI you should use "search" property of the window.location.
Working code example:
var a = document.createElement("a")
a.href = "http://localhost.com?param1=val&param2=val2&param3=val3#myHashCode";
var queryParams = new URLSearchParams(a.search)
queryParams.delete("param2")
a.search = queryParams.toString();
console.log(a.href);
Try this:
For reading parameters:
const data = ['example.com?var1=value1&var2=value2&var3=value3', 'example.com?a=2&b=2&order_by=data']
const getParameters = url => {
const parameters = url.split('?')[1],
regex = /(\w+)=(\w+)/g,
obj = {}
let temp
while (temp = regex.exec(parameters)){
obj[temp[1]] = decodeURIComponent(temp[2])
}
return obj
}
for(let url of data){
console.log(getParameters(url))
}
For placing only this parameters:
const data = ['example.com?zzz=asd']
const parameters = {a:1, b:2, add: "abs"}
const setParameters = (url, parameters) => {
const keys = Object.keys(parameters)
let temp = url.split('?')[0] += '?'
for (let i = 0; i < keys.length; i++) {
temp += `${keys[i]}=${parameters[keys[i]]}${i == keys.length - 1 ? '' : '&'}`
}
return temp
}
for (let url of data){
console.log(setParameters(url, parameters))
}
And finaly for inserting (or replace while exists)
const data = ['example.com?a=123&b=3&sum=126']
const parameters = {order_by: 'abc', a: 11}
const insertParameters = (url, parameters) => {
const keys = Object.keys(parameters)
let result = url
for (let i = 0; i < keys.length; i++){
if (result.indexOf(keys[i]) === -1) {
result += `&${keys[i]}=${encodeURIComponent(parameters[keys[i]])}`
} else {
let regex = new RegExp(`${keys[i]}=(\\w+)`)
result = result.replace(regex, `&${keys[i]}=${encodeURIComponent(parameters[keys[i]])}`)
}
}
return result
}
for (let url of data){
console.log(insertParameters(url, parameters))
}
Hope this works for you ;)
After using function just replace window.location.href
This small function could help.
function changeSearchQueryParameter(oldParameter,newParameter,newValue) {
var parameters = location.search.replace("?", "").split("&").filter(function(el){ return el !== "" });
var out = "";
var count = 0;
if(oldParameter.length>0) {
if(newParameter.length>0 && (newValue.length>0 || newValue>=0)){
out += "?";
var params = [];
parameters.forEach(function(v){
var vA = v.split("=");
if(vA[0]==oldParameter) {
vA[0]=newParameter;
if((newValue.length>0 || newValue>=0)) {
vA[1] = newValue;
}
} else {
count++;
}
params.push(vA.join("="));
});
if(count==parameters.length) {
params.push([newParameter,newValue].join("="));
}
params = params.filter(function(el){ return el !== "" });
if(params.length>1) {
out += params.join("&");
}
if(params.length==1) {
out += params[0];
}
}
} else {
if((newParameter.length>0) && (newValue.length>0 || newValue>=0)){
if(location.href.indexOf("?")!==-1) {
var out = "&"+newParameter+"="+newValue;
} else {
var out = "?"+newParameter+"="+newValue;
}
}
}
return location.href+out;
}
// if old query parameter is declared but does not exist in url then new parameter and value is simply added if it exists it will be replaced
console.log(changeSearchQueryParameter("ib","idx",5));
// add new parameter and value in url
console.log(changeSearchQueryParameter("","idy",5));
// if no new or old parameter are present url does not change
console.log(changeSearchQueryParameter("","",5));
console.log(changeSearchQueryParameter("","",""));
Maybe you could try tweaking the regular expression to retrieve only the values you're looking for, then add or update them in a helper function, something like this:
function paramUpdate(param) {
var url = window.location.href,
regExp = new RegExp(param.key + '=([a-z0-9\-\_]+)(?:&)?'),
existsMatch = url.match(regExp);
if (!existsMatch) {
return url + '&' + param.key + '=' + param.value
}
var paramToUpdate = existsMatch[0],
valueToReplace = existsMatch[1],
updatedParam = paramToUpdate.replace(valueToReplace, param.value);
return url.replace(paramToUpdate, updatedParam);
}
var new_url = paramUpdate({
key: 'order_by',
value: 'id'
});
window.location.href = new_url;
Hope it works well for your needs!
To use Regex pattern, I prefer this one:
var oldUrl = "http://stackoverflow.com/";
var data_val = "newORDER" ;
var r = /^(.+order_by=).+?(&|$)(.*)$/i ;
var newUrl = "";
var matches = oldUrl.match(r) ;
if(matches===null){
newUrl = oldUrl + ((oldUrl.indexOf("?")>-1)?"&":"?") + "order_by=" + data_val ;
}else{
newUrl = matches[1]+data_val+matches[2]+matches[3] ;
}
conole.log(newUrl);
If no order_by exist, matches is null and order_by=.. should come after ? or & (if other parameters exist, new one needs &).
If order_by exist, matches has 3 items, see here
Based on AVAVT´s answer I improved it so it takes any key, and I also fixed the missing "?" if there was no querystring
function addOrReplace(key, value) {
var stringToAdd = key+"=" + value;
if (window.location.search == "")
return window.location.href + '?'+stringToAdd;
if (window.location.search.indexOf(key+'=') == -1)
return window.location.href + stringToAdd;
var newSearchString = "";
var searchParams = window.location.search.substring(1).split("&");
for (var i = 0; i < searchParams.length; i++) {
if (searchParams[i].indexOf(key+'=') > -1) {
searchParams[i] = key+"=" + value;
break;
}
}
return window.location.href.split("?")[0] + "?" + searchParams.join("&");
}
usuage:
window.location.href = addOrReplace('order_by', 'date_created');
if you would not want to reload the page you can use pushState Api
if (history.pushState) {
var newurl = addOrReplace('order_by', 'date_created');
window.history.pushState({path:newurl},'',newurl);
}
function myFunction() {
var str = "https://www.citicards.com/cards/credit/application/flow.action?app=UNSOL&HKOP=828cca70910b4fe25e118bd0b59b89c3c7c560df877909495d8252d20026cf8d&cmp=afa|acquire|2003|comparecards&ranMID=44660&ranEAID=2759285&ProspectID=516511657A844EF3A6F0C2B1E85FEFB0&ID=3000";
var res = str.split("&");
var myKey;
if (!str.includes("ranSiteID")) {
console.log("key not found ");
res.push('ranSiteID=samplearsdyfguh.090-nuvbknlmc0.gvyhbjknl')
console.log(res.join("&"));
} else {
res.map(function(key) {
console.log("my keys", key);
if (key.includes("ranSiteID")) {
console.log("my required-->key", key);
mykey = key.split("=");
console.log(mykey);
}
})
}
document.getElementById("demo").innerHTML = res;
}
<!DOCTYPE html>
<html>
<body>
<p>Click the button to display the array values after the split.</p>
<button onclick="myFunction()">Try it</button>
<p id="demo"></p>
</body>
</html>

window.location does not get updated

I have a Javascript function as shown below for my HTML form. I am getting the window.location.href as source. Then I am removing two unwanted querystring parmaeters from that. This part works fine (as can be seen from first and second alert screenshots).
But setting the location with new value doesn’t work. The third alert (from window.location or even document.location) still includes the unwanted query string parameters. When I observed the POST url, it has the unwanted query string parameter.
How to get the window.location updated by removing the unwanted query string parameters (so that the POST will not have the unwanted query string parameters)?
function sort_table(strHead)
{
var source = window.location.href;
alert("SOURCE-----"+source)
var removed = excludeQueryString (source,"SortColumnAutoRefresh");
removed = excludeQueryString (removed,"SortOrderAutoRefresh");
alert("REMOVED-----"+removed);
//location.replace(removed);
document.location.replace(removed);
alert("LATESTLoc-----"+window.location);
//alert("***"+document.location);
document.getElementById('hidSort').value=strHead;
document.frmCountList.submit();
}
function excludeQueryString(url, parameter)
{
var urlparts= url.split('?');
if (urlparts.length>=2) {
var prefix= encodeURIComponent(parameter)+'=';
var pars= urlparts[1].split(/[&;]/g);
for (var i= pars.length; i-- > 0;)
{
if (pars[i].lastIndexOf(prefix, 0) !== -1)
{
pars.splice(i, 1);
}
}
url= urlparts[0] + (pars.length > 0 ? '?' + pars.join('&') : "");
return url;
}
else
{
return url;
}
}
Alerts
This is a function I whipped up right now:
function redirectOnBadArgs(badargs)
{
var i, j, newloc, args, segs, isbad, redirect;
redirect = false;
newloc = location.href.split("?");
if (newloc.length === 2)
{
args = newloc[1].split("&");
i = 0;
while (i < args.length)
{
isbad = false;
for (j = 0; j < badargs.length; j++)
{
if (args[i].indexOf(badargs[j] + "=") === 0)
{
isbad = true;
break;
}
}
if (isbad)
{
args.splice(i, 1);
redirect = true;
continue;
}
i++;
}
newloc[1] = args.join("&");
}
newloc = newloc.join("?");
if (redirect)
{
// DEBUG: alert("redirecting to " + newloc);
location.href = newloc;
}
}
Pass it an array of bad arguments, like this:
onload = function () {
redirectOnBadArgs([ "badarg1", "badarg2" ]);
};
Remember, you should redirect on page load, not on submit, or you won't POST any data!
Thanks for the comments.
I commented the submit (POST) and used location.href for GET redirect. Now the querystrings are working as expected.
function sort_table(strHead)
{
var source = window.location.href;
var removed = excludeQueryString(source,"SortColumnAutoRefresh");
removed = excludeQueryString(removed,"SortOrderAutoRefresh");
removed = excludeQueryString(removed,"SortColumnClick");
removed = excludeQueryString(removed,"SortOrderClick");
var sortColumn = strHead;
var clickOrder = getClickOrder(sortColumn);
removed = removed + "&SortColumnClick="+sortColumn+"&SortOrderClick="+clickOrder;
//GET - Redirect
location.href = removed;
//document.getElementById('hidSort').value=strHead;
//document.frmCountList.submit();
}

How to extract text from a PDF in JavaScript

I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is an string array textContent.bidiTexts[]. You concatenate them to get the text of 1 page. Text blocks' coordinates are used to judge whether newline or space need to be inserted. (This may not be totally robust, but from my test it seems ok.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
function Pdf2TextClass(){
var self = this;
this.complete = 0;
/**
*
* #param data ArrayBuffer of the pdf file content
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done;
* 2) total number of pages in file.
* #param callbackAllDone The input parameter of callback function is
* the result of extracted text from pdf file.
*
*/
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
PDFJS.getDocument( data ).then( function(pdf) {
var div = document.getElementById('viewer');
var total = pdf.numPages;
callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
if( null != textContent.bidiTexts ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.bidiTexts.length; k++ ){
var block = textContent.bidiTexts[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere and you nee pdf.js and pdf.worker.js to get this working.
/**
* Extract text from PDFs with PDF.js
* Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
*/
this.pdfToText = function(data) {
PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
PDFJS.cMapPacked = true;
return PDFJS.getDocument(data).then(function(pdf) {
var pages = [];
for (var i = 0; i < pdf.numPages; i++) {
pages.push(i);
}
return Promise.all(pages.map(function(pageNumber) {
return pdf.getPage(pageNumber + 1).then(function(page) {
return page.getTextContent().then(function(textContent) {
return textContent.items.map(function(item) {
return item.str;
}).join(' ');
});
});
})).then(function(pages) {
return pages.join("\r\n");
});
});
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
function convert() {
var fr=new FileReader();
var pdff = new Pdf2TextClass();
fr.onload=function(){
pdff.pdfToText(fr.result, null, (text) => { document.getElementById('result').innerText += text; });
}
fr.readAsDataURL(document.getElementById('pdffile').files[0])
}
function Pdf2TextClass() {
var self = this;
this.complete = 0;
this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
console.assert(data instanceof ArrayBuffer || typeof data == 'string');
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function (pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++) {
pdf.getPage(i).then(function (page) {
var n = page.pageNumber;
page.getTextContent().then(function (textContent) {
//console.log(textContent.items[0]);0
if (null != textContent.items) {
var page_text = "";
var last_block = null;
for (var k = 0; k < textContent.items.length; k++) {
var block = textContent.items[k];
if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
if (block.x < last_block.x)
page_text += "\r\n";
else if (last_block.y != block.y && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total) {
window.setTimeout(function () {
var full_text = "";
var num_pages = Object.keys(layers).length;
for (var j = 1; j <= num_pages; j++)
full_text += layers[j];
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");
// listen for messages from the processor
window.addEventListener("message", function(event){
if (event.source != processor.contentWindow) return;
switch (event.data){
// "ready" = the processor is ready, so fetch the PDF file
case "ready":
var xhr = new XMLHttpRequest;
xhr.open('GET', input.getAttribute("src"), true);
xhr.responseType = "arraybuffer";
xhr.onload = function(event) {
processor.contentWindow.postMessage(this.response, "*");
};
xhr.send();
break;
// anything else = the processor has returned the text of the PDF
default:
output.textContent = event.data.replace(/\s+/g, " ");
break;
}
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
#gm2008's answer was a great starting point (please read it and its comments for more info), but needed some updates (08/19) and had some unused code. I also like examples that are more full. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');
let pathToPDF = 'path/to/myPDFfileToText.pdf';
let toText = Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText) };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
function Pdf2TextObj() {
let self = this;
this.complete = 0;
/**
*
* #param path Path to the pdf file.
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done.
* 2) total number of pages in file.
* 3) the `page` object itself or null.
* #param callbackAllDone Called after all text has been collected. Input parameters:
* 1) full text of parsed pdf.
*
*/
this.pdfToText = function(path, callbackPageDone, callbackAllDone) {
// console.assert(typeof path == 'string');
PDFJS.getDocument(path).promise.then(function(pdf) {
let total = pdf.numPages;
callbackPageDone(0, total, null);
let pages = {};
// For some (pdf?) reason these don't all come in consecutive
// order. That's why they're stored as an object and then
// processed one final time at the end.
for (let pagei = 1; pagei <= total; pagei++) {
pdf.getPage(pagei).then(function(page) {
let pageNumber = page.pageNumber;
page.getTextContent().then(function(textContent) {
if (null != textContent.items) {
let page_text = "";
let last_item = null;
for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
let item = textContent.items[itemsi];
// I think to add whitespace properly would be more complex and
// would require two loops.
if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
let itemX = item.transform[5]
let lastItemX = last_item.transform[5]
let itemY = item.transform[4]
let lastItemY = last_item.transform[4]
if (itemX < lastItemX)
page_text += "\r\n";
else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
} // ends if may need to add whitespace
page_text += item.str;
last_item = item;
} // ends for every item of text
textContent != null && console.log("page " + pageNumber + " finished.") // " content: \n" + page_text);
pages[pageNumber] = page_text + "\n\n";
} // ends if has items
++self.complete;
callbackPageDone(self.complete, total, page);
// If all done, put pages in order and combine all
// text, then pass that to the callback
if (self.complete == total) {
// Using `setTimeout()` isn't a stable way of making sure
// the process has finished. Watch out for missed pages.
// A future version might do this with promises.
setTimeout(function() {
let full_text = "";
let num_pages = Object.keys(pages).length;
for (let pageNum = 1; pageNum <= num_pages; pageNum++)
full_text += pages[pageNum];
callbackAllDone(full_text);
}, 1000);
}
}); // ends page.getTextContent().then
}); // ends page.then
} // ends for every page
});
}; // Ends pdfToText()
return self;
}; // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
function Pdf2TextClass(){
var self = this;
this.complete = 0;
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function(pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
//console.log(textContent.items[0]);0
if( null != textContent.items ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.items.length; k++ ){
var block = textContent.items[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
console.log(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
* Created by velten on 25.04.16.
*/
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
var pdfParser = require('pdf2json');
let pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);
pdfPipe.on("pdfParser_dataError", err => console.error(err) );
pdfPipe.on("pdfParser_dataReady", pdf => {
//optionally:
//let pdf = pdfParser.getMergedTextBlocksIfNeeded();
let count1 = 0;
//get text on a particular page
for (let page of pdf.formImage.Pages) {
count1 += page.Texts.length;
}
console.log(count1);
pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway, there's no way you can get content of a file on user computer without transferring it to server and back
I don't thing anyone has written such library yet
So if you have some free time you can learn pdf format and write such a library yourself, or you can just use server side library of course.

Categories