Replace regular expression in text file with file contents using node.js

This question would apply to any text file, but as I want to use it for HTML replacements I will use HTML files for the examples. I have looked at things like gulp-inject and replace on npm, but neither seemed to do quite what I needed.
I would like to have some placeholder text that references another file. When run through this replacement function, the placeholder text is replaced by the contents of that file.
main.html
<script><replace src="./other.js" /></script>
other.js
console.log("Hello, world!");
After the transformation the output file should be:
<script>console.log("Hello, world!");</script>
I have got as far as the following, but I don't know how to make it work with file streams in Node:
var REGEX = /<replace src="(.+)" \/>/;
function replace(file) {
  var match = file.match(REGEX);
  if (match) {
    // toUpperCase is just an example; instead this should look up
    // the file at match[1] and return its contents
    return file.replace(match[0], match[1].toUpperCase());
  }
  return file;
}

If your files are reasonable in size, you can avoid using streams and go with a custom replacer for string.replace():
var fs = require('fs');
function replace(path) {
  var REGEX = /<replace src="(.+)" \/>/g;
  // load the html file
  var fileContent = fs.readFileSync(path, 'utf8');
  // replacePath is your match[1]
  fileContent = fileContent.replace(REGEX, function replacer(match, replacePath) {
    // load and return the replacement file
    return fs.readFileSync(replacePath, 'utf8');
  });
  // this will overwrite the original html file, change the path for test
  fs.writeFileSync(path, fileContent);
}
replace('./main.html');
Using streams
var fs = require('fs');
var es = require('event-stream');
function replaceWithStreams(path) {
  var REGEX = /<replace src="(.+)" \/>/g;
  fs.createReadStream(path, 'utf8')
    .pipe(es.split()) // split the input file into lines
    .pipe(es.map(function (line, next) {
      line = line.replace(REGEX, function replacer(match, replacePath) {
        // better to keep a readFileSync here for clarity
        return fs.readFileSync(replacePath, 'utf8');
      });
      next(null, line);
    }))
    .pipe(fs.createWriteStream(path)); // change path if needed; writing to the same file you are reading from is unreliable
}
replaceWithStreams('./main.html');

You can try this: https://gist.github.com/thebergamo/5ee9f589757ee904f882
Basically you will read your index.html, find the replace tag, read the contents of the referenced file, and replace the tag with those contents.
And this snippet is promisified to help you handle exceptions =D
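For reference, a minimal promise-based sketch of the same idea using Node's built-in fs.promises, so you can see the shape without depending on the gist (the replaceTags name is just for illustration):
const fs = require('fs').promises;
const REGEX = /<replace src="(.+)" \/>/;

async function replaceTags(path) {
  let html = await fs.readFile(path, 'utf8');
  let match;
  // replace one tag at a time until none are left
  while ((match = html.match(REGEX)) !== null) {
    const contents = await fs.readFile(match[1], 'utf8');
    html = html.replace(match[0], contents);
  }
  return html;
}

replaceTags('./main.html').then(console.log).catch(console.error);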

Getting all the text content from a HTML string in NodeJS

I need to get only the text content from an HTML string, with a space or a line break separating the text content of different elements.
For example, the HTML string might be:
<ul>
<li>First</li>
<li>Second</li>
</ul>
What I want:
First Second
or
First
Second
I've tried wrapping the entire string in a div and then getting the textContent using third-party libraries. But there are no spaces or line breaks between the text content of different elements, which I specifically require (i.e. I get FirstSecond, which is not what I want).
The only solution I can think of right now is to build a DOM tree and then recurse over the nodes that contain text, appending each element's text to a string with spaces.
Is there any cleaner, neater, simpler solution than this?
Convert HTML to Plain Text:
In your terminal, install the html-to-text npm package:
npm install html-to-text
Then in JavaScript:
const { convert } = require('html-to-text'); // Import the library
var htmlString = `
<ul>
<li>First</li>
<li>Second</li>
</ul>
`;
var text = convert(htmlString, { wordwrap: 130 })
// Out:
// First
// Second
Hope this helps!
You can try getting rid of the HTML tags using a regex; for your example, try the following:
let str = `<ul>
<li>First</li>
<li>Second</li>
</ul>`
console.log(str)
let regex = '<\/?!?(li|ul)[^>]*>'
var re = new RegExp(regex, 'g');
str = str.replace(re, '');
console.log(str)
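If you also want to collapse the blank lines and leftover whitespace the stripped tags leave behind, a minimal follow-up sketch:
str = str.split('\n').map(s => s.trim()).filter(Boolean).join('\n');
console.log(str);
// First
// Second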
You can also try this example, which uses the jsdom module:
https://www.npmjs.com/package/jsdom
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
console.log(dom.window.document.querySelector("p").textContent);
This approach helped me; I think this code can help here too :)
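Building on jsdom, a rough sketch of the recursive walk the question itself proposes, collecting each text node and joining with line breaks (the collectText helper is just for illustration):
const { JSDOM } = require("jsdom");

// Recursively gather the text of every text node in the tree
function collectText(node, parts) {
  if (node.nodeType === 3) { // text node
    const text = node.textContent.trim();
    if (text) parts.push(text);
  }
  for (const child of node.childNodes) {
    collectText(child, parts);
  }
  return parts;
}

const dom = new JSDOM(`<ul><li>First</li><li>Second</li></ul>`);
console.log(collectText(dom.window.document.body, []).join('\n'));
// First
// Second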
In the browser you could use Node.textContent from the DOM. However, Node.js doesn't have textContent (since it has no native access to the DOM), so you should use external packages. You could install request and cheerio using npm. cheerio, suggested by Jon Church, is maybe the easiest web-scraping tool to use (there are also more complex ones like jsdom).
With power of cheerio and request in your hands, you could write
const request = require("request");
const cheerio = require("cheerio");
const fs = require("fs");

//taken from https://stackoverflow.com/a/19709846/10713877
function is_absolute(url) {
  var r = new RegExp('^(?:[a-z]+:)?//', 'i');
  return r.test(url);
}

function is_local(url) {
  var r = new RegExp('^(?:file:)?//', 'i');
  return (r.test(url) || !is_absolute(url));
}

function send_request(URL) {
  if (is_local(URL)) {
    var url_tmp;
    if (URL.slice(0, 7) === "file://")
      url_tmp = URL.slice(7, URL.length);
    else
      url_tmp = URL;
    //taken from https://stackoverflow.com/a/20665078/10713877
    const $ = cheerio.load(fs.readFileSync(url_tmp));
    //Do something
    console.log($.text());
  }
  else {
    var options = {
      url: URL,
      headers: {
        'User-Agent': 'Your-User-Agent'
      }
    };
    request(options, function(error, response, html) {
      //no error
      if (!error && response.statusCode == 200) {
        console.log("Success");
        const $ = cheerio.load(html);
        return Promise.resolve().then(() => {
          //Do something
          console.log($.text());
        });
      }
      else {
        console.log(`Failure: ${error}`);
      }
    });
  }
}
Let me explain the code. You pass a URL to the send_request function. It checks whether the URL string is a path to a local file (a relative path, or a path starting with file://). If it is a local file, it loads it directly with the cheerio module; otherwise it sends a request to the website using the request module, then hands the HTML to cheerio. Regular expressions are used in is_absolute and is_local. You get the text using the text() method provided by cheerio. Under the comments //Do something, you can do whatever you want with the text.
There are websites that tell you your user agent; copy and paste yours into the 'Your-User-Agent' field.
The lines below will work:
//your local file
send_request("/absolute/path/to/your/local/index.html");
send_request("/relative/path/to/your/local/index.html");
send_request("file:///absolute/path/to/your/local/index.html");
//website
send_request("https://stackoverflow.com/");
EDIT: I am on a linux system.
You can try using the npm library htmlparser2. It makes this very simple:
const htmlparser2 = require('htmlparser2');
const htmlString = ''; //your html string goes here
const parser = new htmlparser2.Parser({
  ontext(text) {
    if (text && text.trim().length > 0) {
      //do as you need, you can concatenate or collect as string array
    }
  }
});
parser.write(htmlString);
parser.end();
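For the question's example, a minimal sketch that collects the text nodes into an array (the parts name is just for illustration):
const htmlparser2 = require('htmlparser2');

const parts = [];
const parser = new htmlparser2.Parser({
  ontext(text) {
    if (text.trim().length > 0) {
      parts.push(text.trim()); // collect each non-empty text node
    }
  }
});
parser.write('<ul><li>First</li><li>Second</li></ul>');
parser.end();
console.log(parts.join('\n'));
// First
// Second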

How do I write a LZ compressed string to text file using JXA?

I am trying to write a JXA script in Apple's Script Editor that compresses a string using the LZ algorithm and writes it to a text (JSON) file:
var story = "Once upon a time in Silicon Valley..."
var storyC = LZString.compress(story)
var data_to_write = "{\x22test\x22\x20:\x20\x22"+storyC+"\x22}"
app.displayAlert(data_to_write)
var desktopString = app.pathTo("desktop").toString()
var file = `${desktopString}/test.json`
writeTextToFile(data_to_write, file, true)
Everything works, except that the LZ-compressed string has been reduced to a string of "?" characters by the time it reaches the output file, test.json.
It should look like:
{"test" : "㲃냆੠Њޱᐈ攀렒삶퓲ٔ쀛䳂䨀푖㢈Ӱນꀀ"}
Instead it looks like:
{"test" : "????????????????????"}
I have a feeling the conversion is happening in the app.write command used by the writeTextToFile() function (which I pulled from an example in Apple's Mac Automation Scripting Guide):
var app = Application.currentApplication()
app.includeStandardAdditions = true

function writeTextToFile(text, file, overwriteExistingContent) {
    try {
        // Convert the file to a string
        var fileString = file.toString()
        // Open the file for writing
        var openedFile = app.openForAccess(Path(fileString), { writePermission: true })
        // Clear the file if content should be overwritten
        if (overwriteExistingContent) {
            app.setEof(openedFile, { to: 0 })
        }
        // Write the new content to the file
        app.write(text, { to: openedFile, startingAt: app.getEof(openedFile) })
        // Close the file
        app.closeAccess(openedFile)
        // Return a boolean indicating that writing was successful
        return true
    }
    catch(error) {
        try {
            // Close the file
            app.closeAccess(file)
        }
        catch(error) {
            // Report the error if closing failed
            console.log(`Couldn't close file: ${error}`)
        }
        // Return a boolean indicating that writing failed
        return false
    }
}
Is there a substitute command for app.write that maintains the LZ compressed string / a better way to accomplish what I am trying to do?
In addition, I am using the readFile() function (also from the Scripting Guide) to load the LZ string back into the script:
function readFile(file) {
    // Convert the file to a string
    var fileString = file.toString()
    // Read the file and return its contents
    return app.read(Path(fileString))
}
But rather than returning:
{"test" : "㲃냆੠Њޱᐈ攀렒삶퓲ٔ쀛䳂䨀푖㢈Ӱນꀀ"}
It is returning:
"{\"test\" : \"㲃냆੠Њޱᐈ攀렒삶퓲ٔ쀛䳂䨀푖㢈Ӱນꀀ\"}"
Does anybody know a fix for this too?
I know that it is possible to use Cocoa in JXA scripts, so maybe the solution lies therein?
I am just getting to grips with JavaScript so I'll admit trying to grasp Objective-C or Swift is way beyond me right now.
I look forward to any solutions and/or pointers that you might be able to provide me. Thanks in advance!
After some further Googling, I came across these two posts:
How can I write UTF-8 files using JavaScript for Mac Automation?
read file as class utf8
I have thus altered my script accordingly.
writeTextToFile() now looks like:
function writeTextToFile(text, file) {
    // source: https://stackoverflow.com/a/44293869/11616368
    var nsStr = $.NSString.alloc.initWithUTF8String(text)
    var nsPath = $(file).stringByStandardizingPath
    var successBool = nsStr.writeToFileAtomicallyEncodingError(nsPath, false, $.NSUTF8StringEncoding, null)
    if (!successBool) {
        throw new Error("function writeFile ERROR:\nWrite to File FAILED for:\n" + file)
    }
    return successBool
};
While readFile() looks like:
ObjC.import('Foundation')
const readFile = function (path, encoding) {
    // source: https://github.com/JXA-Cookbook/JXA-Cookbook/issues/25#issuecomment-271204038
    const pathString = path.toString()
    !encoding && (encoding = $.NSUTF8StringEncoding)
    const fm = $.NSFileManager.defaultManager
    const data = fm.contentsAtPath(pathString)
    const str = $.NSString.alloc.initWithDataEncoding(data, encoding)
    return ObjC.unwrap(str)
};
Both use Objective-C to overcome app.write and app.read's inability to handle UTF-8.
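A quick round trip with the two helpers, reusing the original example (this assumes LZString is already loaded in the script; note that lz-string's raw compress() output is not guaranteed to survive every encoding, which is why the library also offers compressToUTF16() for strings destined for text files):
var desktopString = app.pathTo("desktop").toString()
var file = `${desktopString}/test.json`

var storyC = LZString.compress("Once upon a time in Silicon Valley...")
writeTextToFile('{"test" : "' + storyC + '"}', file)

var readBack = JSON.parse(readFile(file))
app.displayAlert(LZString.decompress(readBack.test))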

How to get a value in a callback function out of the callback

I am writing a Webpack loader for Pug files. It loads all the dependent images referenced in the Pug file, uses file-loader to copy them to the appropriate directory, and replaces the require() calls in the Pug file with the URLs of the resulting files.
I am trying to add functionality that allows data URLs to be interpolated into the Pug file in lieu of file URLs. I am using this.loadModule() to execute file-loader now.
This function takes a callback as an argument. One of the arguments passed to this callback contains the result of url-loader, the data URL. So I need this value written into the Pug file that is output.
The problem is, the entire loader finishes running before that callback ever runs. So, anything I do with the data in there doesn't end up in the final file. The overall structure of my program looks like this:
module.exports = function pugDepLoader(inputFile) {
  /* Setting up various constants and variables. */

  /* A while() loop that continually tests inputFile with a
     regex to find require() statements.
     It creates a copy of inputFile called outputFile,
     which it iterates over again and again,
     replacing require() statements with file paths
     one at a time.
     The call to this.loadModule() is within this while() loop. */

  /* End of while() loop */

  /* Return outputFile to Webpack, which is now a
     string with the Pug file, but with the
     require() statements replaced with
     file paths.
     This is meant to be fed to file-loader to
     get written out to disc. */
}
That return statement at the end will get called repeatedly before this.loadModule()'s callback ever runs once, as I have discovered with debugging statements. I need to be able to get that data URL that is provided within the callback into outputFile before I return it, so that it will end up in the file that is ultimately written to disc.
Entire source code follows:
module.exports = function pugDepLoader(inputFile) {
  const fs = require('fs');
  const path = require('path');
  const loaderUtils = require('loader-utils');
  var options = loaderUtils.getOptions(this);
  //This option is required. It specifies
  //the path to the root of the Pug file's
  //dependencies relative to the location
  //of the Pug files.
  var contextualRoot;
  if (!options || !options.hasOwnProperty('contextualRoot')) {
    throw new Error('You must specify a contextual root');
  }
  else {
    contextualRoot = options.contextualRoot;
    //Ensure there is a trailing slash.
    if (contextualRoot[contextualRoot.length - 1] !== '/') {
      contextualRoot = contextualRoot + '/';
    }
  }
  //Determines whether paths should begin with
  //a leading slash. Useful for Express.js
  //compatibility.
  if (!options.hasOwnProperty('absolute')) {
    options.absolute = false;
  }
  //Set up regex to search for require() statements
  const reqRE = new RegExp(/require\(/, 'g');
  //outputFile will be returned containing the
  //appropriately processed Pug
  var outputFile = inputFile.slice();
  //We need to execute reqRE once to kick things
  //off, and we need to save it to a variable
  //because we need information from it.
  let regexResult = reqRE.exec(inputFile);
  //regexResult will be null when there
  //are no more matches to be found in
  //the file.
  while (regexResult != null) {
    //pathStartIndex is the beginning of
    //the path to the required file.
    //We add 1 to skip the opening
    //single quote.
    let pathStartIndex = reqRE.lastIndex + 1;
    //We require the path to be wrapped in
    //single quotes, so that we can easily
    //be certain about where the require
    //statement ends.
    if (inputFile[reqRE.lastIndex] !== "'") {
      console.log('FATAL ERROR: File path must be wrapped in single quotes!');
      break;
    }
    //inputPath will hold the actual file path itself.
    let inputPath = inputFile.slice(pathStartIndex, inputFile.indexOf("'", pathStartIndex));
    //pathArray is used to split the
    //path so we can easily extract
    //the file name and path
    //separately.
    let pathArray = inputPath.split('/');
    //Just the file name, with extension.
    let fileName = pathArray.pop();
    //outputPath will define what path should be
    //written into the output Pug file.
    //The user may optionally specify a
    //custom output path.
    let outputPath;
    if (options.outputPath) {
      outputPath = options.outputPath;
    }
    else {
      outputPath = pathArray.join('/');
    }
    //Ensure a trailing slash.
    if (outputPath[outputPath.length - 1] !== '/') {
      outputPath = (outputPath + '/');
    }
    //reqStart holds the index of the letter
    //"r" in require(), so that we can remove
    //the require() call and replace it
    //with the file path.
    let reqStart = inputFile.indexOf('require', regexResult.index);
    //reqStmt is the require() statement in
    //full. This will be used with replace()
    //to replace the require() call with a
    //file path in the output file.
    let reqStmt = inputFile.slice(reqStart, inputFile.indexOf('"', pathStartIndex));
    //The final file path, with file name.
    //This will be written into the output
    //Pug file in place of the require()
    //calls.
    let filePath = outputPath + fileName;
    if (options.absolute && filePath[0] !== '/') {
      if (filePath[0] === '.' && filePath[1] === '/') {
        filePath = filePath.slice(1);
      }
      else {
        filePath = '/' + filePath;
      }
    }
    else if (!options.absolute && filePath[0] === '/') {
      filePath = filePath.slice(1);
    }
    this.loadModule(contextualRoot + inputPath, (err, res, srcmap, module) => {
      if (err) {
        console.log(err);
      }
      if (new RegExp(/data:image/).test(res)) {
        filePath = res.slice(res.indexOf('data:image'), res.lastIndexOf('"'));
      }
    });
    //This takes care of require() calls
    //inside of url() CSS functions,
    //such as are used to declare
    //background images inline.
    //If the word "require" is
    //preceded by a single quote,
    //it is within a url() function,
    //and so we add appropriate closure
    //for that function to the end of
    //the path.
    if (inputFile[reqStart - 1] === "'") {
      filePath = filePath + "');";
    }
    //Write the output.
    outputFile = outputFile.replace(reqStmt, filePath);
    //Run the next iteration of the regex search.
    regexResult = reqRE.exec(inputFile);
  }
  //Return output as a string to Webpack.
  return outputFile;
}
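For what it's worth, Webpack's loader API has a standard pattern for exactly this situation: mark the loader asynchronous with this.async(), wrap this.loadModule() in a Promise, and await each result before handing the finished string back. A rough sketch of the shape, not a drop-in replacement for the loader above:
module.exports = async function pugDepLoader(inputFile) {
  const callback = this.async(); // tell Webpack not to expect a synchronous return

  // Promise wrapper so loadModule() can be awaited inside the loop
  const loadModule = (request) =>
    new Promise((resolve, reject) => {
      this.loadModule(request, (err, res) => (err ? reject(err) : resolve(res)));
    });

  let outputFile = inputFile.slice();
  try {
    // ...same while() loop as above, but replacing the bare
    // this.loadModule() call with:
    //
    //   const res = await loadModule(contextualRoot + inputPath);
    //   if (/data:image/.test(res)) {
    //     filePath = res.slice(res.indexOf('data:image'), res.lastIndexOf('"'));
    //   }
    //
    // so filePath is final before outputFile.replace(reqStmt, filePath) runs.
    callback(null, outputFile); // deliver the result asynchronously
  } catch (err) {
    callback(err);
  }
};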

NodeJS stream parse and write json line to line upon Promise result

I have a large JSON file that looks like this:
[
  {"name": "item1"},
  {"name": "item2"},
  {"name": "item3"}
]
I want to stream this file (pretty easy so far) and, for each line, run an asynchronous function (that returns a promise); when it resolves or rejects, I want to edit that line.
Partway through processing, the input file could look like:
[
  {"name": "item1", "response": 200},
  {"name": "item2", "response": 404},
  {"name": "item3"} // not processed yet
]
I do not wish to create another file; I want to edit the SAME FILE on the fly (if possible!).
Thanks :)
I don't really answer the question, but I don't think it can be answered in a satisfactory way anyway, so here are my 2 cents.
I assume that you know how to stream line by line and run the function, and that the only problem you have is editing the file that you are reading from.
Consequences of inserting
It is not possible to natively insert data into the middle of any file (which is what you want to do by changing the JSON live). A file can only grow at its end.
So inserting 10 bytes of data at the beginning of a 1GB file means that you need to write 1GB to the disk (to move all the data 10 bytes further).
Your filesystem does not understand JSON; it just sees that you are inserting bytes in the middle of a big file, so this is going to be very slow.
So, yes, it is possible to do:
Write a wrapper over the file API in NodeJS with an insert() method.
Then write some more code to work out where to insert bytes into a JSON file without loading the whole file and without producing invalid JSON at the end.
But I would not recommend it :)
=> Read this question: Is it possible to prepend data to a file without rewriting?
Why do it then?
I assume that you want to either:
Be able to kill your process at any time, and easily resume work by reading the file again.
Retry partially processed files to fill in only the missing bits.
First solution: Use a database
Abstracting the work that needs to be done to live-edit files at random places is the sole reason databases exist.
They all exist only to abstract the magic behind UPDATE mytable SET name = 'a_longer_name_than_the_name_that_was_there_before' WHERE name = 'short_name'.
Have a look at LevelUP/Down, sqlite, etc...
They will abstract all the magic that needs to be done in your JSON file!
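For illustration, a minimal sketch of that idea, assuming the better-sqlite3 package and hypothetical table and column names:
const Database = require('better-sqlite3');

const db = new Database('items.db');
db.exec('CREATE TABLE IF NOT EXISTS items (name TEXT PRIMARY KEY, response INTEGER)');

// load the items once...
db.prepare('INSERT OR IGNORE INTO items (name) VALUES (?)').run('item1');

// ...then update each row in place as its promise resolves
db.prepare('UPDATE items SET response = ? WHERE name = ?').run(200, 'item1');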
Second solution: Use multiple files
When you stream your file, write two new files!
One contains the current position in the input file and the lines that need to be retried.
The other contains the expected result.
You will also be able to kill your process at any time and restart it.
According to this answer, writing to the same file while reading from it is not reliable. As a commenter there says, it is better to write to a temporary file, and then delete the original and rename the temp file over it.
To create a stream of lines you can use byline. Then for each line, apply some operation and pipe it out to the output file.
Something like this:
var fs = require('fs');
var stream = require('stream');
var util = require('util');
var LineStream = require('byline').LineStream;

function Modify(options) {
  stream.Transform.call(this, options);
}
util.inherits(Modify, stream.Transform);

Modify.prototype._transform = function(chunk, encoding, done) {
  var self = this;
  setTimeout(function() {
    // your modifications here, note that the exact regex depends on
    // your json format and is probably the most brittle part of this
    var modifiedChunk = chunk.toString();
    if (modifiedChunk.search('response:[^,}]+') === -1) {
      modifiedChunk = modifiedChunk
        .replace('}', ', response: ' + new Date().getTime() + '}') + '\n';
    }
    self.push(modifiedChunk);
    done();
  }, Math.random() * 2000 + 1000); // to simulate an async modification
};

var inPath = './data.json';
var outPath = './out.txt';

fs.createReadStream(inPath)
  .pipe(new LineStream())
  .pipe(new Modify())
  .pipe(fs.createWriteStream(outPath))
  .on('close', function() {
    // replace input with output
    fs.unlink(inPath, function() {
      fs.rename(outPath, inPath);
    });
  });
Note that the above results in only one async operation happening at a time. You could also save the modifications to an array and, once all of them are done, write the lines from the array to a file, like this:
var fs = require('fs');
var stream = require('stream');
var LineStream = require('byline').LineStream;

var modifiedLines = [];
var modifiedCount = 0;
var inPath = './data.json';

var allModified = new Promise(function(resolve, reject) {
  fs.createReadStream(inPath).pipe(new LineStream()).on('data', function(chunk) {
    modifiedLines.length++;
    var index = modifiedLines.length - 1;
    setTimeout(function() {
      // your modifications here
      var modifiedChunk = chunk.toString();
      if (modifiedChunk.search('response:[^,}]+') === -1) {
        modifiedChunk = modifiedChunk
          .replace('}', ', response: ' + new Date().getTime() + '}');
      }
      modifiedLines[index] = modifiedChunk;
      modifiedCount++;
      if (modifiedCount === modifiedLines.length) {
        resolve();
      }
    }, Math.random() * 2000 + 1000);
  });
}).then(function() {
  fs.writeFile(inPath, modifiedLines.join('\n'));
}).catch(function(reason) {
  console.error(reason);
});
If, instead of lines, you wish to stream chunks of valid JSON, which would be a more robust approach, take a look at JSONStream.
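A minimal sketch of that approach, assuming the JSONStream and event-stream packages and writing to a separate output file as recommended above:
var fs = require('fs');
var es = require('event-stream');
var JSONStream = require('JSONStream');

fs.createReadStream('./data.json')
  .pipe(JSONStream.parse('*')) // emit each array element as an object
  .pipe(es.mapSync(function (obj) {
    obj.response = 200; // your modification here
    return obj;
  }))
  .pipe(JSONStream.stringify()) // reassemble a valid JSON array
  .pipe(fs.createWriteStream('./out.json'));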
As mentioned in the comment, the file you have is not proper JSON, although it is valid in JavaScript. In order to generate proper JSON, JSON.stringify() could be used. I think it would also make life difficult for others to parse nonstandard JSON, so I would recommend producing a new output file instead of keeping the original one.
However, it is still possible to parse the original file as JSON. This is possible via eval('(' + procline + ')'); however, it is not secure to take external data into Node.js like this.
const fs = require('fs');
const readline = require('readline');

const fr = fs.createReadStream('file1');
const rl = readline.createInterface({
  input: fr
});

rl.on('line', function (line) {
  if (line.match(new RegExp('\\{"name'))) { // match lines like {"name": ...}
    var procline = "";
    if (line.trim().split('').pop() === ',') {
      procline = line.trim().substring(0, line.trim().length - 1);
    }
    else {
      procline = line.trim();
    }
    var lineObj = eval('(' + procline + ')');
    lineObj.response = 200;
    console.log(JSON.stringify(lineObj));
  }
});
The output would be like this:
{"name":"item1","response":200}
{"name":"item2","response":200}
{"name":"item3","response":200}
This is line-delimited JSON (LDJSON), which can be useful for streaming, without the need for leading and trailing [, ], or ,. There is an ldjson-stream package for it as well.
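As a side note, once the trailing comma has been stripped, each trimmed line above is valid JSON on its own, so JSON.parse can stand in for eval in the line handler:
var lineObj = JSON.parse(procline); // safer than eval for external data
lineObj.response = 200;
console.log(JSON.stringify(lineObj));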

nodejs prepending to a file

For Node.js, what is the best way to prepend to a file in a way SIMILAR to:
fs.appendFile(path.join(__dirname, 'app.log'), 'appendme', 'utf8')
Personally, the best way really revolves around an asynchronous solution to create a log where I can basically push onto the file from the top.
This solution isn't mine and I don't know where it's from but it works.
const fs = require('fs')

// read the existing contents, then reopen the file truncated for writing
const data = fs.readFileSync('message.txt')
const fd = fs.openSync('message.txt', 'w+')
const insert = Buffer.from("text to prepend \n")
// write the new text at position 0, then the old contents right after it
fs.writeSync(fd, insert, 0, insert.length, 0)
fs.writeSync(fd, data, 0, data.length, insert.length)
fs.close(fd, (err) => {
  if (err) throw err;
});
It is impossible to add to the beginning of a file. See this question for the similar problem in C or this question for the similar problem in C#.
I suggest you do your logging in the conventional way (that is, log to the end of file).
Otherwise, there is no way around reading the file, adding the text to the start and writing it back to the file which can get really costly really fast.
It seems it is indeed possible with https://www.npmjs.com/package/prepend-file
Here is an example of how to prepend text to a file using gulp and a custom built function.
var gulp = require('gulp');
var through = require('through2');

gulp.src('somefile.js')
  .pipe(insert('text to prepend with'))
  .pipe(gulp.dest('Destination/Path/'))

function insert(text) {
  function prefixStream(prefixText) {
    var stream = through();
    stream.write(prefixText);
    return stream;
  }

  let prefixText = Buffer.from(text + "\n\n"); // allocate ahead of time

  // creating a stream through which each file will pass
  var stream = through.obj(function (file, enc, cb) {
    //console.log(file.contents.toString());
    if (file.isBuffer()) {
      file.contents = Buffer.from(prefixText.toString() + file.contents.toString());
    }
    if (file.isStream()) {
      throw new Error('stream files are not supported for insertion, they must be buffered');
    }
    // make sure the file goes through the next gulp plugin
    this.push(file);
    // tell the stream engine that we are done with this file
    cb();
  });
  // returning the file stream
  return stream;
}
Source: cole_gentry_github_dealingWithStreams
It's possible by using the prepend-file node module. Do the following:
npm i prepend-file -S
Import the prepend-file module in your code.
Example:
const prependFile = require('prepend-file');

let file = 'first.txt';
let data = 'text to prepend\n';
prependFile(file, data, () => {
  console.log('file prepended successfully');
});
