Javascript regex to find forward slashes between > and < - javascript

In a Javascript function I need to replace all forward slashes not part of an HTML tag with &#x2F.
Is there any way using a regular expression to find all forward slashes between a > and a <?

Not exactly, but if you're in this kind of a fix, I guess you'll be happy with a quick-and-dirty solution: Find a / if the next occurring angle bracket is not a closing angle bracket.
result = subject.replace(/\/(?![^<>]*>)/g, "&#x2F");
Of course, this is highly brittle - for example it doesn't care at all about comments, strings etc. (yet, and it would be very difficult to pull this off with regex).

You can test this:
html ='toto/tata';
html = html.replace(/(<[^>]+>)|\//g,
function (match, p1) { return (p1)?match:"&#x2f"; });
console.log (html);
The idea is to capture all html tags (and replace by themselves) before trying to match slashes. Then a callback function tests if the first capture group exist and return the full match or the replacement.
You can improve the safety of this pattern to deal with style and script content, like this:
html = html.replace(/(<s(tyle|cript)\b[\s\S]*?<\/s\2>|<[^>]+>)|\//gi,
function (match, p1, p2) { return (p1)?match:"&#x2f"; });

here is a good example. First hit on google: http://james.padolsey.com/javascript/find-and-replace-text-with-javascript/
The basic idea is to iterate through all the nodes in the DOM and replace text in text nodes. Also, don't replace any text in nodes in script, style, metadata type tags. While you might be able to do this with one big regex, it doesn't make much sense to implement a dom parser in regex when there is one built into every browser.
function findAndReplace(searchText, replacement, searchNode) {
if (!searchText || typeof replacement === 'undefined') {
// Throw error here if you want...
return;
}
var regex = typeof searchText === 'string' ?
new RegExp(searchText, 'g') : searchText,
childNodes = (searchNode || document.body).childNodes,
cnLength = childNodes.length,
excludes = 'html,head,style,title,link,meta,script,object,iframe';
while (cnLength--) {
var currentNode = childNodes[cnLength];
if (currentNode.nodeType === 1 &&
(excludes + ',').indexOf(currentNode.nodeName.toLowerCase() + ',') === -1) {
arguments.callee(searchText, replacement, currentNode);
}
if (currentNode.nodeType !== 3 || !regex.test(currentNode.data) ) {
continue;
}
var parent = currentNode.parentNode,
frag = (function(){
var html = currentNode.data.replace(regex, replacement),
wrap = document.createElement('div'),
frag = document.createDocumentFragment();
wrap.innerHTML = html;
while (wrap.firstChild) {
frag.appendChild(wrap.firstChild);
}
return frag;
})();
parent.insertBefore(frag, currentNode);
parent.removeChild(currentNode);
}
}
Then use it
findAndReplace('\\/', '&#x2F');

Related

Javascript regex neglect div span tags

I have the below text
<span> is an </span>
And I wanted to change the an into a and I use the below regex pattern to do that.
const regExFinder = new RegExp("an", 'gi');
const sourceHTML = "<span> is an </span>";
sourceHTML.replace(regExFinder, `$&`);
But the output is something like this. Can anybody give me an idea of how to neglect any tag and only change the text inside the tag.
<spa> is a </spa>
And what if my source HTML looks like this:
<div> an <span> is an </span></div>
You have a couple of options.
const str = "<div> an <span> is an </span></div>";
// method 1: negative lookaheads (probably the best for regex)
str.replace(/an(?![^<>]*>)/gi, "a");
// method 2: rely on having a space after the "an" (not reliable)
str.replace(/an /gi, "a ")
// method 3: rely on "an" being its own word (depends on the situation)
str.replace(/\ban/gi, "a")
I parse the whole string into a DOM element and then go through all span elements to change their content from "an" to "a", The metacharacter \b in the regular expression denotes a word boundary.
Edit:
After digging a bit deeper I can now operate on all text nodes and change the strings in question:
var html='<div> an <span> is an </span>apple and this <span> is a </span> banana.</div>';
var b=document.createElement('body');
b.innerHTML=html;
// use the "optional filter function" to do the changes:
getTextNodesIn(b,n=>n.textContent=n.textContent.replace(/\ban\b/g,'a'));
// output:
console.log(b.innerHTML);
// I just realised that I can also use Chris West's original function:
// https://cwestblog.com/2014/03/14/javascript-getting-all-text-nodes/
function getTextNodesIn(elem, opt_fnFilter) {
var textNodes = [];
if (elem) {
for (var nodes = elem.childNodes, i = nodes.length; i--;) {
var node = nodes[i], nodeType = node.nodeType;
if (nodeType == 3) {
if (!opt_fnFilter || opt_fnFilter(node, elem)) {
textNodes.push(node);
}
}
else if (nodeType == 1 || nodeType == 9 || nodeType == 11) {
textNodes = textNodes.concat(getTextNodesIn(node, opt_fnFilter));
}
}
}
return textNodes;
}
"Fun fact": In ES6 notation the function can be re-written in an even shorter way as:
function getTN(elem, opt_flt) {
if (elem) return [...elem.childNodes].reduce((tn,node)=>{
var nty = node.nodeType;
if (nty==3 && (!opt_flt || opt_flt(node, elem))) tn.push(node);
else if (nty==1 || nty==9 || nty==11) tn=tn.concat(getTN(node, opt_flt));
return tn
}, []);
}
You can check this solution. I've removed all html tag from the string and then applied the replacement operation. It'll work for both of your test cases.
const regExFinder = new RegExp("an", 'gi');
let sourceHTML = "<div> an <span> is an </span></div>";
sourceHTML = sourceHTML.replace(/<[^>]*>?/gm, '').trim(); // removing HTML tags
sourceHTML = sourceHTML.replace(regExFinder, 'a');
console.log(sourceHTML)

Find word in HTML

I am trying to find given word in HTML string and add a span around it.
What I am doing now is this:
function find(what:String,where:String)
{
var regexp:RegExp=new RegExp(what,'gi');
return where.replace(regexp,'<span>$&</span>');
}
It works well on words that are not inside HTML tags.
What I want is to ignore those that are inside HTML tags.
Example: find("spain")
Input:
The rain in <b class="spain">Spain</b> stays mainly in the <i data-test="Spain">plain</i>.
Output:
The rain in <b class="spain"><span>Spain</span></b> stays mainly in the <i data-test="Spain">plain</i>.
How can I achieve this, please?
To account for html tags and attributes that could match, you are going to need to parse that HTML one way or another. The easiest way is to add it to the DOM (or just to a new element):
var container = document.createElement("div");
container.style.display = "none";
document.body.appendChild(container); // this step is optional
container.innerHTML = where;
Once parsed, you can now iterate the nodes using DOM methods and find just the text nodes and search on those. Use a recursive function to walk the nodes:
function wrapWord(el, word)
{
var expr = new RegExp(word, "i");
var nodes = [].slice.call(el.childNodes, 0);
for (var i = 0; i < nodes.length; i++)
{
var node = nodes[i];
if (node.nodeType == 3) // textNode
{
var matches = node.nodeValue.match(expr);
if (matches)
{
var parts = node.nodeValue.split(expr);
for (var n = 0; n < parts.length; n++)
{
if (n)
{
var span = el.insertBefore(document.createElement("span"), node);
span.appendChild(document.createTextNode(matches[n - 1]));
}
if (parts[n])
{
el.insertBefore(document.createTextNode(parts[n]), node);
}
}
el.removeChild(node);
}
}
else
{
wrapWord(node, word);
}
}
}
Here's a working demo: http://jsfiddle.net/gilly3/J8JJm/3
You won't be able to process HTML in any reliable way using regex. Instead, parse the HTML into a DOM tree and iterate the Text nodes checking their data for content.
If you are using JavaScript in a web browser, the parsing will have already have been done for you. See this question for example wrap-word-in-span code. It's much trickier if you need to match phrases that might be split across different elements.
function find(what:String,where:String)
{
what = what.replace(/(\[|\\|\^|\$|\.|\||\?|\*|\+|\(|\)|\{|\})/g, "\\$1")
.replace(/[^a-zA-Z0-9\s:;'"~[\]\{\}\-_+=(),.<>*\/!##$%^&|\\?]/g, "(?:&[0-9A-Za-z]{3,25};|&#[0-9]{1,10};?|[^\s<])")
.replace(/</g,"<?").replace(/>/g,">?").replace(/"/g,"(?:\"|"?)")
.replace(/\s/g, "(?:\\s| ?)");
what = "(>[^<]*|^[^<]*)(" + what + ")";
var regexp:RegExp=new RegExp(what,'gi');
return where.replace(regexp,'$1<span>$2</span>');
}
The first replace function adds a backslash before characters which have a special meaning in a RE, to prevent errors or unexpected results.
The second replace function replaces every occurrence of unknown characters in the search query by (?:&[0-9A-Za-z]{3,25};|&#[0-9]{1,10};?|[^\s<]). This RE consists of three parts: First, it tries to match a HTML entity. Second, it attempts to match a HTML numeric entity. Finally, it matches any non-whitespace character (in case the creator of the HTML document didn't properly encode the characters).
The third, fourth and fifth replace functions replaces <, > and " by the corresponding HTML entities, so that the search query will not search through tags.
The sixth replace function replaces white-space by a RE (\s| ?), which match white-space characters and the HTML entity.
The only shortcoming of this function is that undocumented special characters (such as €) match any HTML entity/character (following the example, not only € and € are valid matches, but also £ and #).
This proposed solution suits in most cases. It can be inaccurate in complex situations, which is probably not worse than a DOM iteration (which is very susceptible to memory leaks and requires more computing power).
When you work with HTML elements which have Event listeners assigned through DOM, you should iterate through all (child) elements, and apply this function to every Text node.
Pure JavaScript (based on Sizzle.getText from jQuery); Demo: http://jsfiddle.net/vol7ron/U8LLv/
var wrapText = function ( elems,regex ) {
var re = new RegExp(regex);
var elem;
for ( var i = 0; elems[i]; i++ ) {
elem = elems[i];
// Get the text from text nodes and CDATA nodes
if ( elem.nodeType === 3 || elem.nodeType === 4 ) {
parent = elem.parentNode;
re.lastIndex = 0;
if(re.test(elem.nodeValue)){
var span = document.createElement('span');
span.innerHTML = RegExp.$1;
if (RegExp.leftContext != ''){
parent.insertBefore(document.createTextNode(RegExp.leftContext),elem); i++;
}
parent.insertBefore(span,elem); i++;
if (RegExp.rightContext != ''){
parent.insertBefore(document.createTextNode(RegExp.rightContext),elem); i++;
}
parent.removeChild(elem);
}
// Traverse everything else, except comment nodes
} else if ( elem.nodeType !== 8 ) {
wrapText( elem.childNodes, regex );
}
}
return;
};
var obj = document.getElementById('wrapper');
wrapText([obj],/(spain)/gi);

How can I improve the performance of my JavaScript text formatter?

I am allowing my users to wrap words with "*", "/", "_", and "-" as a shorthand way to indicate they'd like to bold, italicize, underline, or strikethrough their text. Unfortunately, when the page is filled with text using this markup, I'm seeing a noticeable (borderline acceptable) slow down.
Here's the JavaScript I wrote to handle this task. Can you please provide feedback on how I could speed things up?
function handleContentFormatting(content) {
content = handleLineBreaks(content);
var bold_object = {'regex': /\*(.|\n)+?\*/i, 'open': '<b>', 'close': '</b>'};
var italic_object = {'regex': /\/(?!\D>|>)(.|\n)+?\//i, 'open': '<i>', 'close': '</i>'};
var underline_object = {'regex': /\_(.|\n)+?\_/i, 'open': '<u>', 'close': '</u>'};
var strikethrough_object = {'regex': /\-(.|\n)+?\-/i, 'open': '<del>', 'close': '</del>'};
var format_objects = [bold_object, italic_object, underline_object, strikethrough_object];
for( obj in format_objects ) {
content = handleTextFormatIndicators(content, format_objects[obj]);
}
return content;
}
//#param obj --- an object with 3 properties:
// 1.) the regex to search with
// 2.) the opening HTML tag that will replace the opening format indicator
// 3.) the closing HTML tag that will replace the closing format indicator
function handleTextFormatIndicators(content, obj) {
while(content.search(obj.regex) > -1) {
var matches = content.match(obj.regex);
if( matches && matches.length > 0) {
var new_segment = obj.open + matches[0].slice(1,matches[0].length-1) + obj.close;
content = content.replace(matches[0],new_segment);
}
}
return content;
}
Change your regex with the flags /ig and remove the while loop.
Change your for(obj in format_objects) loop with a normal for loop, because format_objects is an array.
Update
Okay, I took the time to write an even faster and simplified solution, based on your code:
function handleContentFormatting(content) {
content = handleLineBreaks(content);
var bold_object = {'regex': /\*([^*]+)\*/ig, 'replace': '<b>$1</b>'},
italic_object = {'regex': /\/(?!\D>|>)([^\/]+)\//ig, 'replace': '<i>$1</i>'},
underline_object = {'regex': /\_([^_]+)\_/ig, 'replace': '<u>$1</u>'},
strikethrough_object = {'regex': /\-([^-]+)\-/ig, 'replace': '<del>$1</del>'};
var format_objects = [bold_object, italic_object, underline_object, strikethrough_object],
i = 0, foObjSize = format_objects.length;
for( i; i < foObjSize; i++ ) {
content = handleTextFormatIndicators(content, format_objects[i]);
}
return content;
}
//#param obj --- an object with 2 properties:
// 1.) the regex to search with
// 2.) the replace string
function handleTextFormatIndicators(content, obj) {
return content.replace(obj.regex, obj.replace);
}
Here is a demo.
This will work with nested and/or not nested formatting boundaries. You can omit the function handleTextFormatIndicators altogether if you want to, and do the replacements inline inside handleContentFormatting.
Your code is forcing the browser to do a whole lot of repeated, wasted work. The approach you should be taking is this:
Concoct a regex that combines all of your "target" regexes with another that matches a leading string of characters that are not your special meta-characters.
Change the loop so that it does the following:
Grab the next match from the source string. That match, due to the way you changed your regex, will be a string of non-meta characters followed by your matched portion.
Append the non-meta characters and the replacement for the target portion onto a separate array of strings.
At the end of that process, the separate accumulator array can be joined and used to replace the content.
As to how to combine the regular expressions, well, it's not very pretty in JavaScript but it looks like this. First, you need a regex for a string of zero or more "uninteresting" characters. That should be the first capturing group in the regex. Next should be the alternates for the target strings you're looking for. Thus the general form is:
var tokenizer = /(uninteresting pattern)?(?:(target 1)|(target 2)|(target 3)| ... )?/;
When you match that against the source string, you'll get back a result array that will contain the following:
result[0] - entire chunk of string (not used)
result[1] - run of uninteresting characters
result[2] - either an instance of target type 1, or null
result[3] - either an instance of target type 2, or null
...
Thus you'll know which kind of replacement target you saw by checking which of the target regexes are non empty. (Note that in your case the targets can conceivably overlap; if you intend for that to work, then you'll have to approach this as a full-blown parsing problem I suspect.)
You can do things like:
function formatText(text){
return text.replace(
/\*([^*]*)\*|\/([^\/]*)\/|_([^_]*)_|-([^-]*)-/gi,
function(m, tb, ti, tu, ts){
if(typeof(tb) != 'undefined')
return '<b>' + formatText(tb) + '</b>';
if(typeof(ti) != 'undefined')
return '<i>' + formatText(ti) + '</i>';
if(typeof(tu) != 'undefined')
return '<u>' + formatText(tu) + '</u>';
if(typeof(ts) != 'undefined')
return '<del>' + formatText(ts) + '</del>';
return 'ERR('+m+')';
}
);
}
This will work fine on nested tags, but will not with overlapping tags, which are invalid anyway.
Example at http://jsfiddle.net/m5Rju/

RegEx for match/replacing JavaScript comments (both multiline and inline)

I need to remove all JavaScript comments from a JavaScript source using the JavaScript RegExp object.
What I need is the pattern for the RegExp.
So far, I've found this:
compressed = compressed.replace(/\/\*.+?\*\/|\/\/.*(?=[\n\r])/g, '');
This pattern works OK for:
/* I'm a comment */
or for:
/*
* I'm a comment aswell
*/
But doesn't seem to work for the inline:
// I'm an inline comment
I'm not quite an expert for RegEx and it's patterns, so I need help.
Also, I' would like to have a RegEx pattern which would remove all those HTML-like comments.
<!-- HTML Comment //--> or <!-- HTML Comment -->
And also those conditional HTML comments, which can be found in various JavaScript sources.
Thanks.
NOTE: Regex is not a lexer or a parser. If you have some weird edge case where you need some oddly nested comments parsed out of a string, use a parser. For the other 98% of the time this regex should work.
I had pretty complex block comments going on with nested asterisks, slashes, etc. The regular expression at the following site worked like a charm:
http://upshots.org/javascript/javascript-regexp-to-remove-comments
(see below for original)
Some modifications have been made, but the integrity of the original regex has been preserved. In order to allow certain double-slash (//) sequences (such as URLs), you must use back reference $1 in your replacement value instead of an empty string. Here it is:
/\/\*[\s\S]*?\*\/|([^\\:]|^)\/\/.*$/gm
// JavaScript:
// source_string.replace(/\/\*[\s\S]*?\*\/|([^\\:]|^)\/\/.*$/gm, '$1');
// PHP:
// preg_replace("/\/\*[\s\S]*?\*\/|([^\\:]|^)\/\/.*$/m", "$1", $source_string);
DEMO: https://regex101.com/r/B8WkuX/1
FAILING USE CASES: There are a few edge cases where this regex fails. An ongoing list of those cases is documented in this public gist. Please update the gist if you can find other cases.
...and if you also want to remove <!-- html comments --> use this:
/\/\*[\s\S]*?\*\/|([^\\:]|^)\/\/.*|<!--[\s\S]*?-->$/
(original - for historical reference only)
// DO NOT USE THIS - SEE ABOVE
/(\/\*([\s\S]*?)\*\/)|(\/\/(.*)$)/gm
try this,
(\/\*[\w\'\s\r\n\*]*\*\/)|(\/\/[\w\s\']*)|(\<![\-\-\s\w\>\/]*\>)
should work :)
I have been putting togethor an expression that needs to do something similar.
the finished product is:
/(?:((["'])(?:(?:\\\\)|\\\2|(?!\\\2)\\|(?!\2).|[\n\r])*\2)|(\/\*(?:(?!\*\/).|[\n\r])*\*\/)|(\/\/[^\n\r]*(?:[\n\r]+|$))|((?:=|:)\s*(?:\/(?:(?:(?!\\*\/).)|\\\\|\\\/|[^\\]\[(?:\\\\|\\\]|[^]])+\])+\/))|((?:\/(?:(?:(?!\\*\/).)|\\\\|\\\/|[^\\]\[(?:\\\\|\\\]|[^]])+\])+\/)[gimy]?\.(?:exec|test|match|search|replace|split)\()|(\.(?:exec|test|match|search|replace|split)\((?:\/(?:(?:(?!\\*\/).)|\\\\|\\\/|[^\\]\[(?:\\\\|\\\]|[^]])+\])+\/))|(<!--(?:(?!-->).)*-->))/g
Scary right?
To break it down, the first part matches anything within single or double quotation marks
This is necessary to avoid matching quoted strings
((["'])(?:(?:\\\\)|\\\2|(?!\\\2)\\|(?!\2).|[\n\r])*\2)
the second part matches multiline comments delimited by /* */
(\/\*(?:(?!\*\/).|[\n\r])*\*\/)
The third part matches single line comments starting anywhere in the line
(\/\/[^\n\r]*(?:[\n\r]+|$))
The fourth through sixth parts matchs anything within a regex literal
This relies on a preceding equals sign or the literal being before or after a regex call
((?:=|:)\s*(?:\/(?:(?:(?!\\*\/).)|\\\\|\\\/|[^\\]\[(?:\\\\|\\\]|[^]])+\])+\/))
((?:\/(?:(?:(?!\\*\/).)|\\\\|\\\/|[^\\]\[(?:\\\\|\\\]|[^]])+\])+\/)[gimy]?\.(?:exec|test|match|search|replace|split)\()
(\.(?:exec|test|match|search|replace|split)\((?:\/(?:(?:(?!\\*\/).)|\\\\|\\\/|[^\\]\[(?:\\\\|\\\]|[^]])+\])+\/))
and the seventh which I originally forgot removes the html comments
(<!--(?:(?!-->).)*-->)
I had an issue with my dev environment issuing errors for a regex that broke a line, so I used the following solution
var ADW_GLOBALS = new Object
ADW_GLOBALS = {
quotations : /((["'])(?:(?:\\\\)|\\\2|(?!\\\2)\\|(?!\2).|[\n\r])*\2)/,
multiline_comment : /(\/\*(?:(?!\*\/).|[\n\r])*\*\/)/,
single_line_comment : /(\/\/[^\n\r]*[\n\r]+)/,
regex_literal : /(?:\/(?:(?:(?!\\*\/).)|\\\\|\\\/|[^\\]\[(?:\\\\|\\\]|[^]])+\])+\/)/,
html_comments : /(<!--(?:(?!-->).)*-->)/,
regex_of_doom : ''
}
ADW_GLOBALS.regex_of_doom = new RegExp(
'(?:' + ADW_GLOBALS.quotations.source + '|' +
ADW_GLOBALS.multiline_comment.source + '|' +
ADW_GLOBALS.single_line_comment.source + '|' +
'((?:=|:)\\s*' + ADW_GLOBALS.regex_literal.source + ')|(' +
ADW_GLOBALS.regex_literal.source + '[gimy]?\\.(?:exec|test|match|search|replace|split)\\(' + ')|(' +
'\\.(?:exec|test|match|search|replace|split)\\(' + ADW_GLOBALS.regex_literal.source + ')|' +
ADW_GLOBALS.html_comments.source + ')' , 'g'
);
changed_text = code_to_test.replace(ADW_GLOBALS.regex_of_doom, function(match, $1, $2, $3, $4, $5, $6, $7, $8, offset, original){
if (typeof $1 != 'undefined') return $1;
if (typeof $5 != 'undefined') return $5;
if (typeof $6 != 'undefined') return $6;
if (typeof $7 != 'undefined') return $7;
return '';
}
This returns anything captured by the quoted string text and anything found in a regex literal intact but returns an empty string for all the comment captures.
I know this is excessive and rather difficult to maintain but it does appear to work for me so far.
This works for almost all cases:
var RE_BLOCKS = new RegExp([
/\/(\*)[^*]*\*+(?:[^*\/][^*]*\*+)*\//.source, // $1: multi-line comment
/\/(\/)[^\n]*$/.source, // $2 single-line comment
/"(?:[^"\\]*|\\[\S\s])*"|'(?:[^'\\]*|\\[\S\s])*'/.source, // - string, don't care about embedded eols
/(?:[$\w\)\]]|\+\+|--)\s*\/(?![*\/])/.source, // - division operator
/\/(?=[^*\/])[^[/\\]*(?:(?:\[(?:\\.|[^\]\\]*)*\]|\\.)[^[/\\]*)*?\/[gim]*/.source
].join('|'), // - regex
'gm' // note: global+multiline with replace() need test
);
// remove comments, keep other blocks
function stripComments(str) {
return str.replace(RE_BLOCKS, function (match, mlc, slc) {
return mlc ? ' ' : // multiline comment (replace with space)
slc ? '' : // single/multiline comment
match; // divisor, regex, or string, return as-is
});
}
The code is based on regexes from jspreproc, I wrote this tool for the riot compiler.
See http://github.com/aMarCruz/jspreproc
In plain simple JS regex, this:
my_string_or_obj.replace(/\/\*[\s\S]*?\*\/|([^:]|^)\/\/.*$/gm, ' ')
a bit simpler -
this works also for multiline - (<!--.*?-->)|(<!--[\w\W\n\s]+?-->)
Simple regex ONLY for multi-lines:
/\*((.|\n)(?!/))+\*/
The accepted solution does not capture all common use cases. See examples here: https://regex101.com/r/38dIQk/1.
The following regular expression should match JavaScript comments more reliably:
/(?:\/\*(?:[^\*]|\**[^\*\/])*\*+\/)|(?:\/\/[\S ]*)/g
For demonstration, visit the following link: https://regex101.com/r/z99Nq5/1/.
This is late to be of much use to the original question, but maybe it will help someone.
Based on #Ryan Wheale's answer, I've found this to work as a comprehensive capture to ensure that matches exclude anything found inside a string literal.
/(?:\r\n|\n|^)(?:[^'"])*?(?:'(?:[^\r\n\\']|\\'|[\\]{2})*'|"(?:[^\r\n\\"]|\\"|[\\]{2})*")*?(?:[^'"])*?(\/\*(?:[\s\S]*?)\*\/|\/\/.*)/g
The last group (all others are discarded) is based on Ryan's answer. Example here.
This assumes code is well structured and valid javascript.
Note: this has not been tested on poorly structured code which may or may not be recoverable depending on the javascript engine's own heuristics.
Note: this should hold for valid javascript < ES6, however, ES6 allows multi-line string literals, in which case this regex will almost certainly break, though that case has not been tested.
However, it is still possible to match something that looks like a comment inside a regex literal (see comments/results in the Example above).
I use the above capture after replacing all regex literals using the following comprehensive capture extracted from es5-lexer here and here, as referenced in Mike Samuel's answer to this question:
/(?:(?:break|case|continue|delete|do|else|finally|in|instanceof|return|throw|try|typeof|void|[+]|-|[.]|[/]|,|[*])|[!%&(:;<=>?[^{|}~])?(\/(?![*/])(?:[^\\\[/\r\n\u2028\u2029]|\[(?:[^\]\\\r\n\u2028\u2029]|\\(?:[^\r\n\u2028\u2029ux]|u[0-9A-Fa-f]{4}|x[0-9A-Fa-f]{2}))+\]|\\(?:[^\r\n\u2028\u2029ux]|u[0-9A-Fa-f]{4}|x[0-9A-Fa-f]{2}))*\/[gim]*)/g
For completeness, see also this trivial caveat.
If you click on the link below you find a comment removal script written in regex.
These are 112 lines off code that work together also works with mootools and Joomla and drupal and other cms websites.
Tested it on 800.000 lines of code and comments. works fine.
This one also selects multiple parenthetical like ( abc(/nn/('/xvx/'))"// testing line") and comments that are between colons and protect them.
23-01-2016..! This is the code with the comments in it.!!!!
Click Here
I was looking for a quick Regex solution too, but none of the answers provided work 100%. Each one ends up breaking the source code in some way, mostly due to comments detected inside string literals. E.g.
var string = "https://www.google.com/";
Becomes
var string = "https:
For the benefit of those coming in from google, I ended up writing a short function (in Javascript) that achieves what the Regex couldn't do. Modify for whatever language you are using to parse Javascript.
function removeCodeComments(code) {
var inQuoteChar = null;
var inBlockComment = false;
var inLineComment = false;
var inRegexLiteral = false;
var newCode = '';
for (var i=0; i<code.length; i++) {
if (!inQuoteChar && !inBlockComment && !inLineComment && !inRegexLiteral) {
if (code[i] === '"' || code[i] === "'" || code[i] === '`') {
inQuoteChar = code[i];
}
else if (code[i] === '/' && code[i+1] === '*') {
inBlockComment = true;
}
else if (code[i] === '/' && code[i+1] === '/') {
inLineComment = true;
}
else if (code[i] === '/' && code[i+1] !== '/') {
inRegexLiteral = true;
}
}
else {
if (inQuoteChar && ((code[i] === inQuoteChar && code[i-1] != '\\') || (code[i] === '\n' && inQuoteChar !== '`'))) {
inQuoteChar = null;
}
if (inRegexLiteral && ((code[i] === '/' && code[i-1] !== '\\') || code[i] === '\n')) {
inRegexLiteral = false;
}
if (inBlockComment && code[i-1] === '/' && code[i-2] === '*') {
inBlockComment = false;
}
if (inLineComment && code[i] === '\n') {
inLineComment = false;
}
}
if (!inBlockComment && !inLineComment) {
newCode += code[i];
}
}
return newCode;
}
2019:
All other answers are incomplete and full of shortcomings. I take the time to write complete answer that WORK
function stripComments(code){
const savedText = [];
return code
.replace(/(['"`]).*?\1/gm,function (match) {
var i = savedText.push(match);
return (i-1)+'###';
})
// remove // comments
.replace(/\/\/.*/gm,'')
// now extract all regex and save them
.replace(/\/[^*\n].*\//gm,function (match) {
var i = savedText.push(match);
return (i-1)+'###';
})
// remove /* */ comments
.replace(/\/\*[\s\S]*\*\//gm,'')
// remove <!-- --> comments
.replace(/<!--[\s\S]*-->/gm, '')
.replace(/\d+###/gm,function(match){
var i = Number.parseInt(match);
return savedText[i];
})
}
var cleancode = stripComments(stripComments.toString())
console.log(cleancode)
Other answers not working on samples code like that:
// won't execute the creative code ("Can't execute code form a freed script"),
navigator.userAgent.match(/\b(MSIE |Trident.*?rv:|Edge\/)(\d+)/);
function stripComments(code){
const savedText = [];
return code
// extract strings and regex
.replace(/(['"`]).*?\1/gm,function (match) {
savedText.push(match);
return '###';
})
// remove // comments
.replace(/\/\/.*/gm,'')
// now extract all regex and save them
.replace(/\/[^*\n].*\//gm,function (match) {
savedText.push(match);
return '###';
})
// remove /* */ comments
.replace(/\/\*[\s\S]*\*\//gm,'')
// remove <!-- --> comments
.replace(/<!--[\s\S]*-->/gm, '')
/*replace \ with \\ so we not lost \b && \t*/
.replace(/###/gm,function(){
return savedText.shift();
})
}
var cleancode = stripComments(stripComments.toString())
console.log(cleancode)
for /**/ and //
/(?:(?:\/\*(?:[^*]|(?:\*+[^*\/]))*\*+\/)|(?:(?<!\:|\\\|\')\/\/.*))/gm
I wonder if this was a trick question given by
a professor to students. Why? Because it seems
to me it is IMPOSSIBLE to do this, with
Regular Expressions, in the general case.
Your (or whoever's code it is) can contain
valid JavaScript like this:
let a = "hello /* ";
let b = 123;
let c = "world */ ";
Now if you have a regexp which removes everything
between a pair of /* and */, it would break the code
above, it would remove the executable code in the
middle as well.
If you try to devise a regexp that would not
remove comments which contain quotes then
you cannot remove such comments. That applies
to single-quote, double-quotes and back-quotes.
You can not remove (all) comments with Regular
Expressions in JavaScript, it seems to me,
maybe someone can point out a way how to do
it for the case above.
What you can do is build a small parser which
goes through the code character by character
and knows when it is inside a string and when
it is inside a comment, and when it is inside
a comment inside a string and so on.
I'm sure there are good open source JavaScript
parsers that can do this. Maybe some of the
packaging and minifying tools can do this for
you as well.
For block comment:
https://regex101.com/r/aepSSj/1
Matches slash character (the \1) only if slash character is followed by asterisk.
(\/)(?=\*)
maybe followed by another asterisk
(?:\*)
followed by first group of match, or zero or more times from something...maybe, without remember the match but capture as a group.
((?:\1|[\s\S])*?)
followed by asterisk and first group
(?:\*)\1
For block and/or inline comment:
https://regex101.com/r/aepSSj/2
where | mean or and (?=\/\/(.*)) capture anything after any //
or https://regex101.com/r/aepSSj/3
to capture the third part too
all in: https://regex101.com/r/aepSSj/8
DEMO: https://onecompiler.com/javascript/3y825u3d5
const context = `
<html>
<script type="module">
/* I'm a comment */
/*
* I'm a comment aswell url="https://example.com/";
*/
var re = /\\/*not a comment!*/;
var m = /\\//.test("\"not a comment!\"");
var re = /"/; // " thiscommentishandledasascode!
const s1 = "multi String \\
\\"double quote\\" \\
// single commet in str \\
/* multiple lines commet in str \\
secend line */ \\
last line";
const s2 = 's2"s';
const url = "https://example.com/questions/5989315/";
let a = "hello /* ";
let b = 123;
let c = "world */ ";
//public static final String LETTERS_WORK_FOLDER = "/Letters/Generated/Work";
console.log(/*comment in
console.log*/ "!message at console.log");
function displayMsg( // the end comment
/*commet arg1*/ a, ...args) {
console.log("Hello World!", a, ...args)
}
<\/script>
<body>
<!-- HTML Comment //--> or <!-- HTML Comment -->
<!--
function displayMsg() {
alert("Hello World!")
}
//-->
</body>
</html>
`;
console.log("before:\n" + context);
console.log("<".repeat(100));
const save = {'txt':[], 'comment':[], 'regex': []};
const context2 =
context.replace(/(['"`]|\/[\*\/]{0,1}|<!\-\-)(?:(?=(?<=\/\*))[\s\S]*?\*\/|(?=(?<=\/\/)).*|(?=(?<=<!\-\-))[\s\S]*?\-\->|(?=(?<=[\s\=]\/)).+?(?<!\\)\/|(?=(?<=['"`]))[\s\S]*?(?<!\\)\1)/g,
function (m) {
const t = (m[0].match(/["'`]/) && 'txt') || (m.match(/^(\/\/|\/\*|<)/) && 'comment') || 'regex';
save[t].push(m);
return '${save.'+t+'['+(save[t].length - 1)+']}';
}).replace(/[\S\s]*/, function(m) {
console.log("watch:\n"+m);
console.log(">".repeat(100));
/*
##remove comment
save.comment = save.comment.map(_ => _.replace(/[\S\s]+/,""));
##replace comment
save.comment = save.comment.map(_ => _.replace(/console\.log/g, 'CONSOLE.LOG'));
##replace text
save.txt = save.txt.map(_ => _.replace(/console\.log/g, 'CONSOLE.LOG'));
##replace your code
m = m.replace(/console\.log/g, 'console.warn');
*/
// console.warn("##remove comment -> save.comment.fill('');");
save.comment.fill('');
return m;
}).replace(/\$\{save.(\w+)\[(\d+)\]\}/g, function(m, t, id) {
return save[t][id];
}).replace(/[\S\s]*/, function(m) {
console.log("result:", m);
// console.log("compare:", (context === m));
return m;
})
My English is not good, can someone help translate what I have written, I will be very grateful
Consider some problems
A.There may be strings in comments, or comments in strings, like
/*
const url="https://example.com/";
*/
const str = "i am s string and /*commet in string*/";
B. " or ' or ` in a string will be escaped with
like
const str = "my name is \"john\"";
const str2 = 'i am "john\'s" friend';
Combining the above multiple regex replaces will cause some problems
Consider regex find to the beginning part
" ' ` // /* <!--
use regex
(['"`]|\/[\*\/]|<!\-\-)
(['"`]|/[*/]|<!\-\-) result as \1
\1 is one of ' or " or
`
or /* or // or <!--
use If-Then-Else Conditionals in Regular Expressions
https://www.regular-expressions.info/conditional.html
(?:(?=(?<=\/\*))[\s\S]*?\*\/|(?=(?<=\/\/)).*|(?=(?<=<!\-\-))[\s\S]*?\-\->|[^\1]*?(?<!\\)\1)
if (?=(?<=\/\*))[\s\S]*?\*\/
(?=(?<=\/\*)) positive lookbehind (?<=\/\*) beacuse/*
It's a multi-line comment, so it should be followed by the latest one */
[\s\S]*?\*\/ match complete /*..\n..\n. */
elseif (?=(?<=\/\/)).*
(?=(?<=//)).* positive lookbehind
(?<=\/\/) catch // single line commet
.* match complete // any single commet
elseif (?=(?<=<!\-\-))[\s\S]*?\-\->
(?=(?<=<!--)) positive lookbehind (?<=<!\-\-) ,
[\s\S]*?\-\-> match complete
<!--..\n..\n. /*/*\-\->
else [^\1]*?(?<!\\)\1
Finally need to process the string
use regex [\s\S]*?\1
maybe the wrong result with "STR\" or 'STR"S\'
at [\s\S]*?we can use "positive lookbehind"
add this [\s\S]*?(?<!\\)\1 to filter escape quotes
end
Based on above attempts and using UltraEdit , mostly Abhishek Simon, I found this to work for inline comments and handles all of the characters within the comment.
(\s\/\/|$\/\/)[\w\s\W\S.]*
This matches comments at the start of the line or with a space before //
//public static final String LETTERS_WORK_FOLDER =
"/Letters/Generated/Work";
but not
"http://schemas.us.com.au/hub/'>" +
so it is only not good for something like
if(x){f(x)}//where f is some function
it just needs to be
if(x){f(x)} //where f is function

How to replace text not within a specific-Tag in JavaScript

I have a string (partly HTML) where I want to replace the string :-) into bbcode :wink:. But this replacement should not happen within <pre>, but in any other tag (or even not within a tag).
For example, I want to replace
:-)<pre>:-)</pre><blockquote>:-)</blockquote>
to:
:wink:<pre>:-)</pre><blockquote>:wink:</blockquote>
I already tried it with the following RegEx, but it does not work (nothing gets replaced):
var s = ':-)<pre>:-)</pre><blockquote>:-)</blockquote>';
var regex = /:\-\)(?!(^<pre>).*<\/pre>)/g;
var r = s.replace(regex, ':wink:');
Can someone please help me? :-)
This ought to do it:-
var src = ":-)<pre>:-)</pre><blockquote>:-)</blockquote>"
var result = src.replace(/(<pre>(?:[^<](?!\/pre))*<\/pre>)|(\:\-\))/gi, fnCallback)
function fnCallback(s)
{
if (s == ":-)") return ":wink:"
return s;
}
alert(result);
It works because any pre element will get picked up by the first option in the regex and once consumed means that any contained :-) can't be matched since the processor will have moved beyond it.
You could avoid hellish regexes altogether if you use a suitable library such as jQuery, e.g.:
var excludeThese = ['pre'];
// loop over all elements on page, replacing :-) with :wink: for anything
// that is *not* a tag name in the excludeThese array
$('* not:(' + excludeThese.join(',') + ')').each(function() {
$(this).html($(this).html().replace(/:\-\)/,':wink:'));
});
Just thought it'd be worth offering a DOM solution:
E.g.
var div = document.createElement('div');
div.innerHTML = ":-)<pre>:-)</pre><blockquote>:-)</blockquote>";
replace(div, /:-\)/g, ":wink:", function(){
// Custom filter function.
// Returns false for <pre> elements.
return this.nodeName.toLowerCase() !== 'pre';
});
div.innerHTML; // <== here's your new string!
And here's the replace function:
function replace(element, regex, replacement, filter) {
var cur = element.firstChild;
if (cur) do {
if ( !filter || filter.call(cur) ) {
if ( cur.nodeType == 1 ) {
replace( cur, regex, replacement );
} else {
cur.data = cur.data.replace( regex, replacement );
}
}
} while ( cur = cur.nextSibling );
}
Almost good: Your negative lookbehind and lookahead where not in the right position and need a slight adjustment:
/(?<!(<pre>)):-\)(?!(<\/pre>))/g
Looks for all ":-)"
...but not if there is a <pre> behind (the regex cursor is!)
...but not if there is a </pre> before (the regex cursor is!)
as a side effect though: <pre>:-):-)</pre> works too but not <pre>:-):-):-)</pre>
https://regex101.com/r/CO0DAD/1
ps. this is a firefox 104 browser (could be different in others)
try with
var regex = /:-)(?!(^)*</pre>)/g;

Categories