I am using https://github.com/jakubfiala/soundtouch-js soundtouch library to do some pitch shifting on audio files. the lib works fine with pitch shifting but I don't know how to control the audio file's channels to switch them on/off.
I am following this pen https://codepen.io/ezzatly/pen/LLqOgJ?editors=1010
var source = {
extract: function (target, numFrames, position) {
$("#current-time").html(minsSecs(position / (context.sampleRate)));
console.log('width: ', 100 * position / (bufferDuration * context.sampleRate));
$("#progress").width(100 * position / (bufferDuration * context.sampleRate) + "%");
if (Math.round(100 * position / (bufferDuration * context.sampleRate)) == 100 && is_playing) {
//stop recorder
recorder && recorder.stop();
__log('Recording complete.');
// create WAV download link using audio data blob
createDownloadLink();
if (typeof recorder != "undefined") {
recorder.clear();
}
is_playing = false;
}
var l = buffer.getChannelData(0);
if (buffer.numberofChannels > 1) {
var r = buffer.getChannelData(1);
} else {
var r = buffer.getChannelData(0);
}
for (var i = 0; i < numFrames; i++) {
target[i * 2] = l[i + position];
target[i * 2 + 1] = r[i + position];
}
return Math.min(numFrames, l.length - position);
}
};
node.onaudioprocess = function (e) {
if (buffer.getChannelData) {
pos += BUFFER_SIZE / context.sampleRate;
console.log('position: ', pos);
var l = e.outputBuffer.getChannelData(0);
var r = e.outputBuffer.getChannelData(1);
var framesExtracted = f.extract(samples, BUFFER_SIZE);
if (framesExtracted == 0) {
node.disconnect();
}
for (var i = 0; i < framesExtracted; i++) {
l[i] = samples[i * 2];
r[i] = samples[i * 2 + 1];
}
leftchannel.push(new Float32Array(l));
rightchannel.push(new Float32Array(r));
recordingLength += BUFFER_SIZE;
}
};
any help?
First, I have a slightly more modernized version of SoundTouchJs in my GitHub. Might make it slightly easier to hack around.
Second, you probably want to attach a ChannelSplitterNode to your connected audio context. I haven't played with this, but I would think that's what you would need to independently control the gain on each channel. Again, I'm guessing, but hopefully that'll push you in the right direction.
Related
I have an issue with BiquadFilterNode (Web Audio API).
I made a simple audio program that repeats noise in every one bar.
The issue is when I connect a BiquadFilterNode to the noise, its volume keeps increasing gradually.
Could anyone kindly point out why the gradual volume increase is happening?
Many thanks!
Webpage running the program:
https://abirdwhale.github.io/javascript-web-audio-api-sketches/sequencers/test-biquadfilter-issue/
GitHub repository to recreate the issue:
https://github.com/abirdwhale/javascript-web-audio-api-sketches/tree/main/sequencers/test-biquadfilter-issue
HTML:
<!-- <!DOCTYPE html> -->
<html>
<head>
<meta charset="UTF-8">
<title></title>
<link rel="stylesheet" href="css/app.css">
</head>
<body>
<button id="play-button">Play/Pause</button>
<script type="module" src="js/app.js"></script>
</body>
</html>
JavaScript:
"use strict";
// for cross browser
const AudioContext = window.AudioContext || window.webkitaudioCtx;
const audioCtx = new AudioContext();
let futureTickTime = audioCtx.currentTime;
let counter = 1;
let tempo = 120;
let secondsPerBeat = 60 / tempo;
let counterTimeValue = (secondsPerBeat / 4); // 16th note
// Noise volume
let noiseVolume = audioCtx.createGain();
noiseVolume.gain.value = 0.001; //Noise volume before send to FX
// Noise parameters
let noiseDuration = 1.; //Duration of Noise
let bandHz = 100;
function playNoise(time, playing) {
if (playing) {
const bufferSize = audioCtx.sampleRate * noiseDuration; // set the time of the note
const buffer = audioCtx.createBuffer(1, bufferSize, audioCtx.sampleRate); // create an empty buffer
const data = buffer.getChannelData(0); // get data
// fill the buffer with noise
for (let i = 0; i < bufferSize; i++) {
data[i] = Math.random() * 2 - 1;
}
// create a buffer source for our created data
const noise = audioCtx.createBufferSource();
noise.buffer = buffer;
const bandpass = audioCtx.createBiquadFilter();
bandpass.type = 'bandpass';
bandpass.frequency.value = bandHz;
// connect our graph
noise.connect(noiseVolume);
// 1. without a bandpass filter
// noiseVolume.connect(audioCtx.destination);
// 2. with a bandpass filter
noiseVolume.connect(bandpass).connect(audioCtx.destination);
if (counter === 1) {
noise.start(time);
// noise.stop(time + noiseDuration);
}
}
}
const lookahead = 25.0; // How frequently to call scheduling function (in milliseconds)
const scheduleAheadTime = 0.1; // How far ahead to schedule audio (sec)
function playTick() {
console.log("This 16th note is: " + counter);
console.log("16th is: " + counterTimeValue);
console.log("futureTickTime: " + futureTickTime);
console.log("Web Audio Time: " + audioCtx.currentTime);
counter += 1;
futureTickTime += counterTimeValue;
console.log("futureTickTime: " + futureTickTime);
if (counter > 16) {
counter = 1;
}
}
function scheduler() {
if (futureTickTime < audioCtx.currentTime + scheduleAheadTime) {
playNoise(futureTickTime, true);
playTick();
}
window.setTimeout(scheduler, lookahead);
}
scheduler();
document.getElementById("play-button").addEventListener("click", function () {
if (audioCtx.state !== "running") {
console.log("it's not running well");
audioCtx.resume();
} else {
console.log("it's running");
audioCtx.suspend();
console.log(audioCtx.state);
}
});
OK, taking BiquadFilterNode setup out of the playNoise function solved the issue. I don't understand why placing the setup inside the function was causing the issue, though.
Fixed JavaScript:
"use strict";
// for cross browser
const AudioContext = window.AudioContext || window.webkitaudioCtx;
const audioCtx = new AudioContext();
let futureTickTime = audioCtx.currentTime;
let counter = 1;
let tempo = 120;
let secondsPerBeat = 60 / tempo;
let counterTimeValue = (secondsPerBeat / 4); // 16th note
// Noise volume
let noiseVolume = audioCtx.createGain();
noiseVolume.gain.value = 0.1; //Noise volume before send to FX
// Noise parameters
let noiseDuration = 1.; //Duration of Noise
let bandHz = 100;
// Biquad filter setup
const bandpass = audioCtx.createBiquadFilter();
bandpass.type = 'bandpass';
bandpass.frequency.value = bandHz;
function playNoise(time, playing) {
if (playing) {
const bufferSize = audioCtx.sampleRate * noiseDuration; // set the time of the note
const buffer = audioCtx.createBuffer(1, bufferSize, audioCtx.sampleRate); // create an empty buffer
const data = buffer.getChannelData(0); // get data
// fill the buffer with noise
for (let i = 0; i < bufferSize; i++) {
data[i] = Math.random() * 2 - 1;
}
// create a buffer source for our created data
const noise = audioCtx.createBufferSource();
noise.buffer = buffer;
// connect our graph
noise.connect(noiseVolume);
// 1. without a bandpass filter
// noiseVolume.connect(audioCtx.destination);
// 2. with a bandpass filter
noiseVolume.connect(bandpass).connect(audioCtx.destination);
if (counter === 1) {
// bandpass.gain.setValueAtTime(-40, time);
noise.start(time);
// noise.stop(time + noiseDuration);
}
}
}
const lookahead = 25.0; // How frequently to call scheduling function (in milliseconds)
const scheduleAheadTime = 0.1; // How far ahead to schedule audio (sec)
function playTick() {
console.log("This 16th note is: " + counter);
console.log("16th is: " + counterTimeValue);
console.log("futureTickTime: " + futureTickTime);
console.log("Web Audio Time: " + audioCtx.currentTime);
counter += 1;
futureTickTime += counterTimeValue;
console.log("futureTickTime: " + futureTickTime);
if (counter > 16) {
counter = 1;
}
}
function scheduler() {
if (futureTickTime < audioCtx.currentTime + scheduleAheadTime) {
playNoise(futureTickTime, true);
playTick();
}
window.setTimeout(scheduler, lookahead);
}
scheduler();
document.getElementById("play-button").addEventListener("click", function () {
if (audioCtx.state !== "running") {
console.log("it's not running well");
audioCtx.resume();
} else {
console.log("it's running");
audioCtx.suspend();
console.log(audioCtx.state);
}
});
I am trying to make it to where a user can upload a file to the site and convert their file. Then download it back. I used JavaScripts Web audio api to do so. Now I just need to let the user download the file back. I create a blob but I am currently stuck.
ConverterSec2.jsx:
class ConverterSec2 extends Component {
render() {
return (
<div className="sec2">
<form>
<input type="file" id="audio-file" name="file" accept="audio/mpeg, audio/ogg, audio/*" />
<button type="Submit" id="convert_btn">Convert to 432Hz</button>
<script src="ihertz_website/src/pages/Converter/compressor.js"></script>
</form>
</div>
)
}
}
export default ConverterSec2
converting.js:
//render proccessed audio
offlineAudioCtx.startRendering().then(function(renderBuffer){
make_download(renderBuffer, offlineAudioCtx.length)
console.log("ERROR: PROCCESSING AUDIO")
})
//download Audio buffer as downloadable WAV file
function make_download(abuffer, total_samples) {
//get duration and sample rate
var duration = abuffer.duration,
rate = abuffer.sampleRate,
offset = 0;
var new_file = URL.createObjectURL(bufferToWave(abuffer, total_samples));
var download_link = document.getElementById("download_link");
download_link.href = new_file;
var name = generateFileName();
download_link.download = name;
}
//generate name of file
function generateFileName() {
var origin_name = fileInput.files[0].name;
var pos = origin_name.lastIndexOf('.');
var no_exit = origin_name.slice(0, pos);
return no_exit + ".wav";
}
//Convert an AudioBuffer to a Blob using WAVE representation
function bufferToWave(abuffer, len) {
var numOfChan = abuffer.numberOfChannels,
length = len * numOfChan * 2 + 44,
buffer = new ArrayBuffer(length),
view = new DataView(buffer),
channels = [], i, sample,
offset = 0,
pos = 0;
//write WAVE header
setUnit32(0x464664952);
setUnit32(length - 8);
setUint32(0x45564157); // "WAVE"
setUint32(0x20746d66); // "fmt " chunk
setUint32(16); // length = 16
setUint16(1); // PCM (uncompressed)
setUint16(numOfChan);
setUint32(abuffer.sampleRate);
setUint32(abuffer.sampleRate * 2 * numOfChan); // avg. bytes/sec
setUint16(numOfChan * 2); // block-align
setUint16(16); // 16-bit (hardcoded in this demo)
setUint32(0x61746164); // "data" - chunk
setUint32(length - pos - 4); // chunk length
// write interleaved data
for(i = 0; i < abuffer.numberOfChannels; i++)
channels.push(abuffer.getChannelData(i));
while(pos < length) {
for(i = 0; i < numOfChan; i++) { // interleave channels
sample = Math.max(-1, Math.min(1, channels[i][offset])); // clamp
sample = (0.5 + sample < 0 ? sample * 32768 : sample * 32767)|0; // scale to 16-bit signed int
view.setInt16(pos, sample, true); // write 16-bit sample
pos += 2;
}
offset++ // next source sample
}
//create Blob
return new Blob([buffer], {type: "audio/wav"});
function setUint16(data) {
view.setUint16(pos, data, true);
pos += 2;
}
function setUnit32(data) {
view.setUint32(pos, data, true);
pos += 4;
}
}
Is it possible to achieve low latency audio playback using HTML5? I'm currently using AudioContext API. However, I am getting latency of about 4 seconds. Which is way to much for my use case.
if (!window.audioContextInstance) {
window.audioContextInstance = new webkitAudioContext();
}
var context = window.audioContextInstance;
context.sampleRate = 48000;
var buffers = [];
var src = new Float32Array();
var srcIdx = 0;
var bufferSize = 2048;
var sourceNode = context.createScriptProcessor(bufferSize, 1, 2);
sourceNode.onaudioprocess = function(evt) {
var c0 = evt.outputBuffer.getChannelData(0);
var c1 = evt.outputBuffer.getChannelData(1);
var sample = 0;
while(sample < bufferSize) {
if (srcIdx >= src.length) {
if (!buffers.length) {
console.log("Warning: Audio Buffer Underflow")
return;
}
src = buffers.shift();
srcIdx = 0;
}
while(sample < bufferSize && srcIdx < src.length) {
c0[sample] = src[srcIdx++];
c1[sample] = src[srcIdx++];
sample++;
}
}
};
scope.$on('frame', function (event, frame) {
while (buffers.length > 1) {
buffers.shift();
}
buffers.push(new Float32Array(frame.data));
if (buffers.length > 0) {
sourceNode.connect(context.destination);
}
});
}
You may be interested in riffwave.js which appears to have much lower than 4s latency.
I have a folder with a lot of subfolders, each with a bunch of TIFs and PSD files inside. Some of these have transparency in them while some don't. These files vary massively in size.
I need all the files to be turned into JPGs, or if they contain transparency, PNGs. I require the files to be 200kb or less and don't really mind how large they are as long as they aren't scaled up.
Someone on a forum (who I'm insanely thankful for) wrote a fair bit of code for it, which my friend modified to suit exactly what I was asking and we're nearly there now.
It worked fine, the only problem being that a lot of images came out 1x1 pixel and a solid block of colour.
We've found this was consistently happening with the same images for some reason, but couldn't work out what exactly it was in these images.
Now Mr forum blokey ( http://www.photoshopgurus.com/forum/members/paul-mr.html ) modified the script and it now seems to work fine with PSDs.
It's working with TIFs with transparency but some of the TIFs with 100% opacity it just won't work on. I can't find much that's consistent with these files other than the colour blue, though this just could be a massive coincidence and probably is (there's a lot of blue in the images I've been dealing with).
Below is a link to the thread in which the code was first written. Paul MR seems to think the colorsampler bit is a little suspect so perhaps that's what's causing the problems (blueness?).
http://www.photoshopgurus.com/forum/photoshop-actions-automation/34745-batching-tiffs-jpg-png-w-automatic-resize-based-filesize.html
I wish I could do a little more to try and work this out myself but I've barely a speck of understanding on this stuff, I just know when there's a situation where a bit of scripting could help out.
Below is the script as it currently stands:
#target PhotoshopString.prototype.endsWith = function(str) {
return (this.match(str + "$") == str)
} String.prototype.startsWith = function(str) {
return this.indexOf(str) == 0;
};
var desiredFileSize = 200000;
app.bringToFront();
app.displayDialogs = DialogModes.NO;
main();
//app.displayDialogs = DialogModes.YES;
function main() {
var topLevelFolder = Folder.selectDialog("Please select top level folder.");
if (topLevelFolder == null)return;
var FileList = [];
getFileList(topLevelFolder);
var startRulerUnits = app.preferences.rulerUnits;
app.preferences.rulerUnits = Units.PIXELS;
for (var f in FileList) {
app.open(FileList[f]);
activeDocument.changeMode(ChangeMode.RGB);
try {
activeDocument.mergeVisibleLayers();
} catch(e) {} var Name = decodeURI(app.activeDocument.name).replace(/.[^.] + $ /, '');
if (hasTransparency(FileList[f])) {
var saveFile = File(FileList[f].path + "/" + Name + ".png");
SavePNG(saveFile);
app.activeDocument.close(SaveOptions.DONOTSAVECHANGES);
} else {
var saveFile = File(FileList[f].path + "/" + Name + ".jpg");
SaveForWeb(saveFile, 80);
app.activeDocument.close(SaveOptions.DONOTSAVECHANGES);
} app.preferences.rulerUnits = startRulerUnits;
} function getFileList(folder) {
var fileList = folder.getFiles();
for (var i = 0; i < fileList.length; i++) {
var file = fileList[i];
if (file instanceof Folder) {
getFileList(file);
} else {
if ((file.name.endsWith("tiff") || file.name.endsWith("tif") || file.name.endsWith("psd")) && ! file.name.startsWith("._"))FileList.push(file);
}
}
} alert(FileList.length + " files have been modified.");
} function hasTransparency(file) {
if (file.name.endsWith("tiff") || file.name.endsWith("tif")) {
var sample = app.activeDocument.colorSamplers.add([new UnitValue(1.5, 'px'), new UnitValue(1.5, 'px')]);
try {
sample.color.rgb.hexValue;
sample.remove();
return false;
} catch(e) {
sample.remove();
return true;
}
} var doc = activeDocument;
if (doc.activeLayer.isBackgroundLayer)return false;
var desc = new ActionDescriptor();
var ref = new ActionReference();
ref.putProperty(charIDToTypeID("Chnl"), charIDToTypeID("fsel"));
desc.putReference(charIDToTypeID("null"), ref);
var ref1 = new ActionReference();
ref1.putEnumerated(charIDToTypeID("Chnl"), charIDToTypeID("Chnl"), charIDToTypeID("Trsp"));
desc.putReference(charIDToTypeID("T "), ref1);
executeAction(charIDToTypeID("setd"), desc, DialogModes.NO);
var w = doc.width.as('px');
var h = doc.height.as('px');
var transChannel = doc.channels.add();
doc.selection.store(transChannel);
if (transChannel.histogram[255] != (h * w)) {
transChannel.remove();
return true;
} else {
transChannel.remove();
return false;
}
};
function SavePNG(saveFile) {
pngSaveOptions = new PNGSaveOptions();
activeDocument.saveAs(saveFile, pngSaveOptions, true, Extension.LOWERCASE);
var actualFilesize = saveFile.length;
var ratio = desiredFileSize / actualFilesize;
if (ratio < 1) {
var imageScale = Math.sqrt(ratio);
activeDocument.resizeImage(activeDocument.width * imageScale, activeDocument.height * imageScale, activeDocument.resolution, ResampleMethod.BICUBICSMOOTHER);
activeDocument.saveAs(saveFile, pngSaveOptions, true, Extension.LOWERCASE);
}
}
function SaveForWeb(saveFile, jpegQuality) {
var sfwOptions = new ExportOptionsSaveForWeb();
sfwOptions.format = SaveDocumentType.JPEG;
sfwOptions.includeProfile = false;
sfwOptions.interlaced = 0;
sfwOptions.optimized = true;
sfwOptions.quality = jpegQuality;
activeDocument.exportDocument(saveFile, ExportType.SAVEFORWEB, sfwOptions);
var actualFilesize = saveFile.length;
var ratio = desiredFileSize / actualFilesize;
if (ratio < 1) {
var imageScale = Math.sqrt(ratio);
activeDocument.resizeImage(activeDocument.width * imageScale, activeDocument.height * imageScale, activeDocument.resolution, ResampleMethod.BICUBICSMOOTHER);
activeDocument.exportDocument(saveFile, ExportType.SAVEFORWEB, sfwOptions);
}
}
I wonder if is possible to get the text inside of a PDF file by using only Javascript?
If yes, can anyone show me how?
I know there are some server-side java, c#, etc libraries but I would prefer not using a server.
thanks
Because pdf.js has been developing over the years, I would like to give a new answer. That is, it can be done locally without involving any server or external service. The new pdf.js has a function: page.getTextContent(). You can get the text content from that. I've done it successfully with the following code.
What you get in each step is a promise. You need to code this way: .then( function(){...}) to proceed to the next step.
PDFJS.getDocument( data ).then( function(pdf) {
pdf.getPage(i).then( function(page){
page.getTextContent().then( function(textContent){
What you finally get is an string array textContent.bidiTexts[]. You concatenate them to get the text of 1 page. Text blocks' coordinates are used to judge whether newline or space need to be inserted. (This may not be totally robust, but from my test it seems ok.)
The input parameter data needs to be either a URL or ArrayBuffer type data. I used the ReadAsArrayBuffer(file) function in FileReader API to get the data.
Note: According to some other user, the library has updated and caused the code to break. According to the comment by async5 below, you need to replace textContent.bidiTexts with textContent.items.
function Pdf2TextClass(){
var self = this;
this.complete = 0;
/**
*
* #param data ArrayBuffer of the pdf file content
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done;
* 2) total number of pages in file.
* #param callbackAllDone The input parameter of callback function is
* the result of extracted text from pdf file.
*
*/
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
PDFJS.getDocument( data ).then( function(pdf) {
var div = document.getElementById('viewer');
var total = pdf.numPages;
callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
if( null != textContent.bidiTexts ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.bidiTexts.length; k++ ){
var block = textContent.bidiTexts[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
I couldn't get gm2008's example to work (the internal data structure on pdf.js has changed apparently), so I wrote my own fully promise-based solution that doesn't use any DOM elements, queryselectors or canvas, using the updated pdf.js from the example at mozilla
It eats a file path for the upload since i'm using it with node-webkit.
You need to make sure you have the cmaps downloaded and pointed somewhere and you nee pdf.js and pdf.worker.js to get this working.
/**
* Extract text from PDFs with PDF.js
* Uses the demo pdf.js from https://mozilla.github.io/pdf.js/getting_started/
*/
this.pdfToText = function(data) {
PDFJS.workerSrc = 'js/vendor/pdf.worker.js';
PDFJS.cMapUrl = 'js/vendor/pdfjs/cmaps/';
PDFJS.cMapPacked = true;
return PDFJS.getDocument(data).then(function(pdf) {
var pages = [];
for (var i = 0; i < pdf.numPages; i++) {
pages.push(i);
}
return Promise.all(pages.map(function(pageNumber) {
return pdf.getPage(pageNumber + 1).then(function(page) {
return page.getTextContent().then(function(textContent) {
return textContent.items.map(function(item) {
return item.str;
}).join(' ');
});
});
})).then(function(pages) {
return pages.join("\r\n");
});
});
}
usage:
self.pdfToText(files[0].path).then(function(result) {
console.log("PDF done!", result);
})
Just leaving here a full working sample.
<html>
<head>
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
</head>
<body>
<input id="pdffile" name="pdffile" type="file" />
<button id="btn" onclick="convert()">Process</button>
<div id="result"></div>
</body>
</html>
<script>
function convert() {
var fr=new FileReader();
var pdff = new Pdf2TextClass();
fr.onload=function(){
pdff.pdfToText(fr.result, null, (text) => { document.getElementById('result').innerText += text; });
}
fr.readAsDataURL(document.getElementById('pdffile').files[0])
}
function Pdf2TextClass() {
var self = this;
this.complete = 0;
this.pdfToText = function (data, callbackPageDone, callbackAllDone) {
console.assert(data instanceof ArrayBuffer || typeof data == 'string');
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function (pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++) {
pdf.getPage(i).then(function (page) {
var n = page.pageNumber;
page.getTextContent().then(function (textContent) {
//console.log(textContent.items[0]);0
if (null != textContent.items) {
var page_text = "";
var last_block = null;
for (var k = 0; k < textContent.items.length; k++) {
var block = textContent.items[k];
if (last_block != null && last_block.str[last_block.str.length - 1] != ' ') {
if (block.x < last_block.x)
page_text += "\r\n";
else if (last_block.y != block.y && (last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total) {
window.setTimeout(function () {
var full_text = "";
var num_pages = Object.keys(layers).length;
for (var j = 1; j <= num_pages; j++)
full_text += layers[j];
callbackAllDone(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
</script>
Here's some JavaScript code that does what you want using Pdf.js from http://hublog.hubmed.org/archives/001948.html:
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");
// listen for messages from the processor
window.addEventListener("message", function(event){
if (event.source != processor.contentWindow) return;
switch (event.data){
// "ready" = the processor is ready, so fetch the PDF file
case "ready":
var xhr = new XMLHttpRequest;
xhr.open('GET', input.getAttribute("src"), true);
xhr.responseType = "arraybuffer";
xhr.onload = function(event) {
processor.contentWindow.postMessage(this.response, "*");
};
xhr.send();
break;
// anything else = the processor has returned the text of the PDF
default:
output.textContent = event.data.replace(/\s+/g, " ");
break;
}
}, true);
...and here's an example:
http://git.macropus.org/2011/11/pdftotext/example/
Note: This code assumes you're using nodejs. That means you're parsing a local file instead of one from a web page since the original question doesn't explicitly ask about parsing pdfs on a web page.
#gm2008's answer was a great starting point (please read it and its comments for more info), but needed some updates (08/19) and had some unused code. I also like examples that are more full. There's more refactoring and tweaking that could be done (e.g. with await), but for now it's as close to that original answer as it could be.
As before, this uses Mozilla's PDFjs library. The npmjs package is at https://www.npmjs.com/package/pdfjs-dist.
In my experience, this doesn't do well in finding where to put spaces, but that's a problem for another time.
[Edit: I believe the update to the use of .transform has restored the whitespace as it originally behaved.]
// This file is called myPDFfileToText.js and is in the root folder
let PDFJS = require('pdfjs-dist');
let pathToPDF = 'path/to/myPDFfileToText.pdf';
let toText = Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText) };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
function Pdf2TextObj() {
let self = this;
this.complete = 0;
/**
*
* #param path Path to the pdf file.
* #param callbackPageDone To inform the progress each time
* when a page is finished. The callback function's input parameters are:
* 1) number of pages done.
* 2) total number of pages in file.
* 3) the `page` object itself or null.
* #param callbackAllDone Called after all text has been collected. Input parameters:
* 1) full text of parsed pdf.
*
*/
this.pdfToText = function(path, callbackPageDone, callbackAllDone) {
// console.assert(typeof path == 'string');
PDFJS.getDocument(path).promise.then(function(pdf) {
let total = pdf.numPages;
callbackPageDone(0, total, null);
let pages = {};
// For some (pdf?) reason these don't all come in consecutive
// order. That's why they're stored as an object and then
// processed one final time at the end.
for (let pagei = 1; pagei <= total; pagei++) {
pdf.getPage(pagei).then(function(page) {
let pageNumber = page.pageNumber;
page.getTextContent().then(function(textContent) {
if (null != textContent.items) {
let page_text = "";
let last_item = null;
for (let itemsi = 0; itemsi < textContent.items.length; itemsi++) {
let item = textContent.items[itemsi];
// I think to add whitespace properly would be more complex and
// would require two loops.
if (last_item != null && last_item.str[last_item.str.length - 1] != ' ') {
let itemX = item.transform[5]
let lastItemX = last_item.transform[5]
let itemY = item.transform[4]
let lastItemY = last_item.transform[4]
if (itemX < lastItemX)
page_text += "\r\n";
else if (itemY != lastItemY && (last_item.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null))
page_text += ' ';
} // ends if may need to add whitespace
page_text += item.str;
last_item = item;
} // ends for every item of text
textContent != null && console.log("page " + pageNumber + " finished.") // " content: \n" + page_text);
pages[pageNumber] = page_text + "\n\n";
} // ends if has items
++self.complete;
callbackPageDone(self.complete, total, page);
// If all done, put pages in order and combine all
// text, then pass that to the callback
if (self.complete == total) {
// Using `setTimeout()` isn't a stable way of making sure
// the process has finished. Watch out for missed pages.
// A future version might do this with promises.
setTimeout(function() {
let full_text = "";
let num_pages = Object.keys(pages).length;
for (let pageNum = 1; pageNum <= num_pages; pageNum++)
full_text += pages[pageNum];
callbackAllDone(full_text);
}, 1000);
}
}); // ends page.getTextContent().then
}); // ends page.then
} // ends for every page
});
}; // Ends pdfToText()
return self;
}; // Ends object factory
Run in the terminal:
node myPDFfileToText.js
Updated 02/2021
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
<script>
function Pdf2TextClass(){
var self = this;
this.complete = 0;
this.pdfToText = function(data, callbackPageDone, callbackAllDone){
console.assert( data instanceof ArrayBuffer || typeof data == 'string' );
var loadingTask = pdfjsLib.getDocument(data);
loadingTask.promise.then(function(pdf) {
var total = pdf._pdfInfo.numPages;
//callbackPageDone( 0, total );
var layers = {};
for (i = 1; i <= total; i++){
pdf.getPage(i).then( function(page){
var n = page.pageNumber;
page.getTextContent().then( function(textContent){
//console.log(textContent.items[0]);0
if( null != textContent.items ){
var page_text = "";
var last_block = null;
for( var k = 0; k < textContent.items.length; k++ ){
var block = textContent.items[k];
if( last_block != null && last_block.str[last_block.str.length-1] != ' '){
if( block.x < last_block.x )
page_text += "\r\n";
else if ( last_block.y != block.y && ( last_block.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) == null ))
page_text += ' ';
}
page_text += block.str;
last_block = block;
}
textContent != null && console.log("page " + n + " finished."); //" content: \n" + page_text);
layers[n] = page_text + "\n\n";
}
++ self.complete;
//callbackPageDone( self.complete, total );
if (self.complete == total){
window.setTimeout(function(){
var full_text = "";
var num_pages = Object.keys(layers).length;
for( var j = 1; j <= num_pages; j++)
full_text += layers[j] ;
console.log(full_text);
}, 1000);
}
}); // end of page.getTextContent().then
}); // end of page.then
} // of for
});
}; // end of pdfToText()
}; // end of class
var pdff = new Pdf2TextClass();
pdff.pdfToText('PDF_URL');
</script>
For all the people who actually want to use it on a node server:
/**
* Created by velten on 25.04.16.
*/
"use strict";
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
var pdfParser = require('pdf2json');
let pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);
pdfPipe.on("pdfParser_dataError", err => console.error(err) );
pdfPipe.on("pdfParser_dataReady", pdf => {
//optionally:
//let pdf = pdfParser.getMergedTextBlocksIfNeeded();
let count1 = 0;
//get text on a particular page
for (let page of pdf.formImage.Pages) {
count1 += page.Texts.length;
}
console.log(count1);
pdfParser.destroy();
});
It is possible but:
you would have to use the server anyway, there's no way you can get content of a file on user computer without transferring it to server and back
I don't thing anyone has written such library yet
So if you have some free time you can learn pdf format and write such a library yourself, or you can just use server side library of course.