Find word in HTML - javascript

I am trying to find given word in HTML string and add a span around it.
What I am doing now is this:
// Wraps every case-insensitive match of `what` inside `where` in a <span>.
// The pattern is applied to the raw markup string, so occurrences inside tags
// and attribute values are matched as well as occurrences in visible text.
function find(what:String,where:String)
{
// `what` is handed to RegExp as a pattern, not as a literal string.
var regexp:RegExp=new RegExp(what,'gi');
// `$&` in the replacement stands for the matched text itself.
return where.replace(regexp,'<span>$&</span>');
}
It works well on words that are not inside HTML tags.
What I want is to ignore those that are inside HTML tags.
Example: find("spain")
Input:
The rain in <b class="spain">Spain</b> stays mainly in the <i data-test="Spain">plain</i>.
Output:
The rain in <b class="spain"><span>Spain</span></b> stays mainly in the <i data-test="Spain">plain</i>.
How can I achieve this, please?

To account for html tags and attributes that could match, you are going to need to parse that HTML one way or another. The easiest way is to add it to the DOM (or just to a new element):
// Let the browser parse the HTML string by assigning it to a scratch element.
var container = document.createElement("div");
// Keep the scratch element invisible in case it is attached to the page.
container.style.display = "none";
document.body.appendChild(container); // this step is optional
// `where` is not defined in this snippet; presumably it is the HTML string to search.
container.innerHTML = where;
Once parsed, you can now iterate the nodes using DOM methods and find just the text nodes and search on those. Use a recursive function to walk the nodes:
/**
 * Wraps every case-insensitive match of `word` found in the text nodes below
 * `el` in a new <span> element. The tree is modified in place.
 *
 * @param {Node} el - Node whose child text nodes are searched; non-text
 *   children are searched recursively.
 * @param {string} word - Pattern to look for. It is passed to the RegExp
 *   constructor unchanged, so regex metacharacters keep their meaning.
 */
function wrapWord(el, word)
{
    // The "g" flag is required so that every occurrence in a text node is
    // found; a non-global pattern only ever reports the first match.
    var expr = new RegExp(word, "gi");
    // Snapshot the children: the loop inserts and removes siblings as it goes.
    var nodes = [].slice.call(el.childNodes, 0);
    for (var i = 0; i < nodes.length; i++)
    {
        var node = nodes[i];
        if (node.nodeType === 3) // textNode
        {
            var text = node.nodeValue;
            var cursor = 0; // end offset of the previous wrapped match
            var match;
            expr.lastIndex = 0;
            while ((match = expr.exec(text)) !== null)
            {
                if (match[0] === "")
                {
                    // A zero-length match would never advance on its own.
                    expr.lastIndex++;
                    continue;
                }
                if (match.index > cursor)
                {
                    el.insertBefore(document.createTextNode(text.slice(cursor, match.index)), node);
                }
                var span = el.insertBefore(document.createElement("span"), node);
                span.appendChild(document.createTextNode(match[0]));
                cursor = match.index + match[0].length;
            }
            // cursor only moves when something was wrapped; in that case the
            // original node has been fully re-created in front of itself.
            if (cursor > 0)
            {
                if (cursor < text.length)
                {
                    el.insertBefore(document.createTextNode(text.slice(cursor)), node);
                }
                el.removeChild(node);
            }
        }
        else
        {
            wrapWord(node, word);
        }
    }
}
Here's a working demo: http://jsfiddle.net/gilly3/J8JJm/3

You won't be able to process HTML in any reliable way using regex. Instead, parse the HTML into a DOM tree and iterate the Text nodes checking their data for content.
If you are using JavaScript in a web browser, the parsing will already have been done for you. See this question for example wrap-word-in-span code. It's much trickier if you need to match phrases that might be split across different elements.

/**
 * Wraps occurrences of `what` that appear in the text portions of the HTML
 * string `where` in a <span>, using a regular expression only (no DOM).
 *
 * The search term is turned into a pattern in six replace steps:
 *   1. escape regex metacharacters;
 *   2. let any character outside the known set match an HTML entity, a
 *      numeric entity, or any single non-whitespace character;
 *   3-5. let <, > and " match their HTML entity forms;
 *   6. let white-space match white-space or the &nbsp; entity.
 *
 * @param {string} what - Text to search for (treated literally).
 * @param {string} where - HTML markup to search in.
 * @returns {string} `where` with the matches wrapped in <span>…</span>.
 */
function find(what, where)
{
    var pattern = what.replace(/(\[|\\|\^|\$|\.|\||\?|\*|\+|\(|\)|\{|\})/g, "\\$1")
        // "\\s" (not "\s") so the backslash reaches the regex, and \x3c instead
        // of a literal "<" so the next replace step does not rewrite this class.
        .replace(/[^a-zA-Z0-9\s:;'"~[\]\{\}\-_+=(),.<>*\/!##$%^&|\\?]/g, "(?:&[0-9A-Za-z]{3,25};|&#[0-9]{1,10};?|[^\\s\\x3c])")
        .replace(/</g, "&lt;?").replace(/>/g, "&gt;?").replace(/"/g, "(?:\"|&quot;?)")
        .replace(/\s/g, "(?:\\s|&nbsp;?)");
    // Group 1 consumes text that follows a ">" (or the start of the string)
    // without crossing a "<", so the match in group 2 cannot lie inside a tag.
    var regexp = new RegExp("(>[^<]*|^[^<]*)(" + pattern + ")", 'gi');
    return where.replace(regexp, '$1<span>$2</span>');
}
The first replace function adds a backslash before characters which have a special meaning in a RE, to prevent errors or unexpected results.
The second replace function replaces every occurrence of unknown characters in the search query by (?:&[0-9A-Za-z]{3,25};|&#[0-9]{1,10};?|[^\s<]). This RE consists of three parts: First, it tries to match a HTML entity. Second, it attempts to match a HTML numeric entity. Finally, it matches any non-whitespace character (in case the creator of the HTML document didn't properly encode the characters).
The third, fourth and fifth replace functions replace <, > and " with patterns for the corresponding HTML entities (&lt;, &gt; and &quot;), so that the search query will not search through tags.
The sixth replace function replaces white-space with the RE (?:\s|&nbsp;?), which matches white-space characters and the &nbsp; HTML entity.
The only shortcoming of this function is that undocumented special characters (such as €) match any HTML entity/character (following the example, not only € and &euro; are valid matches, but also £ and #).
This proposed solution suits most cases. It can be inaccurate in complex situations, which is probably not worse than a DOM iteration (which is very susceptible to memory leaks and requires more computing power).
When you work with HTML elements which have Event listeners assigned through DOM, you should iterate through all (child) elements, and apply this function to every Text node.

Pure JavaScript (based on Sizzle.getText from jQuery); Demo: http://jsfiddle.net/vol7ron/U8LLv/
/**
 * Walks the given nodes and wraps regex matches found in their text (and
 * CDATA) nodes in <span> elements. Comment nodes are skipped; every other
 * node is searched recursively. The tree is modified in place.
 *
 * With a global (`g`) regex every match in a text node is wrapped; otherwise
 * only the first one is. The span receives the whole matched text.
 *
 * @param {ArrayLike<Node>} elems - Nodes to process (array or NodeList).
 * @param {RegExp|string} regex - Pattern to search for.
 */
var wrapText = function ( elems, regex ) {
    // Private copy, so the caller's lastIndex is never touched.
    var re = new RegExp(regex);
    // Snapshot: elems may be a live childNodes list that changes while
    // matched text nodes are being replaced.
    var nodes = Array.prototype.slice.call(elems, 0);

    for ( var i = 0; i < nodes.length; i++ ) {
        var elem = nodes[i];
        if ( !elem ) {
            break;
        }

        // Get the text from text nodes and CDATA nodes
        if ( elem.nodeType === 3 || elem.nodeType === 4 ) {
            var parent = elem.parentNode;
            if ( !parent ) {
                continue; // a detached text node has nowhere to put the spans
            }
            var text = elem.nodeValue;
            var cursor = 0; // end offset of the previous wrapped match
            var match;
            re.lastIndex = 0;
            while ( (match = re.exec(text)) !== null ) {
                if ( match[0] !== '' ) {
                    if ( match.index > cursor ) {
                        parent.insertBefore(document.createTextNode(text.slice(cursor, match.index)), elem);
                    }
                    var span = document.createElement('span');
                    // Inserted as text, so characters such as "<" in the
                    // match are not parsed as markup.
                    span.appendChild(document.createTextNode(match[0]));
                    parent.insertBefore(span, elem);
                    cursor = match.index + match[0].length;
                } else if ( re.global ) {
                    re.lastIndex++; // step over a zero-length match
                }
                if ( !re.global ) {
                    break;
                }
            }
            if ( cursor > 0 ) {
                if ( cursor < text.length ) {
                    parent.insertBefore(document.createTextNode(text.slice(cursor)), elem);
                }
                parent.removeChild(elem);
            }
        // Traverse everything else, except comment nodes
        } else if ( elem.nodeType !== 8 ) {
            wrapText( elem.childNodes, regex );
        }
    }
    return;
};
// Example call: search the element with id "wrapper" for "spain", ignoring case.
var obj = document.getElementById('wrapper');
// The element is wrapped in an array because wrapText expects a list of nodes.
wrapText([obj],/(spain)/gi);

Related

Javascript regex neglect div span tags

I have the below text
<span> is an </span>
And I wanted to change the an into a and I use the below regex pattern to do that.
const regExFinder = new RegExp("an", 'gi');
const sourceHTML = "<span> is an </span>";
sourceHTML.replace(regExFinder, `$&`);
But the output is something like this. Can anybody give me an idea of how to neglect any tag and only change the text inside the tag.
<spa> is a </spa>
And what if my source HTML looks like this:
<div> an <span> is an </span></div>
You have a couple of options.
const str = "<div> an <span> is an </span></div>";
// method 1: negative lookaheads (probably the best for regex)
str.replace(/an(?![^<>]*>)/gi, "a");
// method 2: rely on having a space after the "an" (not reliable)
str.replace(/an /gi, "a ")
// method 3: rely on "an" being its own word (depends on the situation)
str.replace(/\ban/gi, "a")
I parse the whole string into a DOM element and then go through all span elements to change their content from "an" to "a". The metacharacter \b in the regular expression denotes a word boundary.
Edit:
After digging a bit deeper I can now operate on all text nodes and change the strings in question:
var html='<div> an <span> is an </span>apple and this <span> is a </span> banana.</div>';
var b=document.createElement('body');
b.innerHTML=html;
// use the "optional filter function" to do the changes:
getTextNodesIn(b,n=>n.textContent=n.textContent.replace(/\ban\b/g,'a'));
// output:
console.log(b.innerHTML);
// I just realised that I can also use Chris West's original function:
// https://cwestblog.com/2014/03/14/javascript-getting-all-text-nodes/
/**
 * Collects the text nodes found below `elem`.
 *
 * Children are visited from last to first, so the result is in reverse
 * order; text nodes of a nested container are appended when that container
 * is reached.
 *
 * @param {Node} elem - Node to search; a falsy value yields an empty array.
 * @param {function(Node, Node)=} opt_fnFilter - Optional predicate called
 *   with (textNode, parent); the text node is kept when it returns truthy.
 * @returns {Node[]} The collected text nodes.
 */
function getTextNodesIn(elem, opt_fnFilter) {
  if (!elem) {
    return [];
  }
  var collected = [];
  var children = elem.childNodes;
  var index = children.length;
  while (index-- > 0) {
    var child = children[index];
    var kind = child.nodeType;
    if (kind == 3) {
      // Text node: keep it unless a filter is present and rejects it.
      var accepted = !opt_fnFilter || opt_fnFilter(child, elem);
      if (accepted) {
        collected.push(child);
      }
      continue;
    }
    // Element (1), document (9) and document fragment (11) can hold text.
    var isContainer = kind == 1 || kind == 9 || kind == 11;
    if (isContainer) {
      collected = collected.concat(getTextNodesIn(child, opt_fnFilter));
    }
  }
  return collected;
}
"Fun fact": In ES6 notation the function can be re-written in an even shorter way as:
/**
 * Collects the text nodes below `elem`, visiting children first to last.
 *
 * @param {Node} elem - Node to search.
 * @param {function(Node, Node)=} opt_flt - Optional predicate called with
 *   (textNode, parent); the text node is kept when it returns truthy.
 * @returns {Node[]|undefined} The text nodes, or undefined when `elem` is falsy.
 */
function getTN(elem, opt_flt) {
  if (!elem) {
    return undefined;
  }
  let found = [];
  // Iterate over a copy of the child list.
  for (const child of [...elem.childNodes]) {
    const kind = child.nodeType;
    if (kind == 3 && (!opt_flt || opt_flt(child, elem))) {
      found.push(child);
    } else if (kind == 1 || kind == 9 || kind == 11) {
      found = found.concat(getTN(child, opt_flt));
    }
  }
  return found;
}
You can check this solution. I've removed all html tag from the string and then applied the replacement operation. It'll work for both of your test cases.
// Strip the markup first, then replace "an" in what remains.
// Note that the tags are discarded: the result is plain text, not HTML.
const regExFinder = new RegExp("an", 'gi');
let sourceHTML = "<div> an <span> is an </span></div>";
sourceHTML = sourceHTML.replace(/<[^>]*>?/gm, '').trim(); // removing HTML tags
// Every "an" is replaced, including one that is part of a longer word.
sourceHTML = sourceHTML.replace(regExFinder, 'a');
console.log(sourceHTML)

JS RegExp finding word that is not in tag and replace string [duplicate]

This question already has answers here:
How can I change an element's text without changing its child elements?
(16 answers)
Closed 5 years ago.
I need to write a second RegExp to find variable d inside sentence that is not in tags. So variable in tags should be skipped.
Regex '(?:^|\\b)('+d+')(?=\\b|$)' will find d variable but i need to exclude <span> tag with class="description".
New sentence is wrapped in a new tag.
sentence = "This is some word. <span class='description'>word</span> in tag should be skipped"
d = 'word'
re = new RegExp('(?:^|\\b)('+d+')(?=\\b|$)', 'gi')
sentence = sentence.replace(re, "<span>$1</span>")
Result I'm trying to achieve is:
"This is some <span>word</span>. <span class='description'>word</span> in tag should be skipped"
I'm using coffeescript, thanks for the help.
Try this one: (word)(?![^<>]*<\/)
Full code:
var sentence = "This is some word. <span class='description'>word</span> in tag should be skipped"
var d = 'word'
var re = new RegExp('('+d+')(?![^<>]*<\/)', 'gi')
sentence = sentence.replace(re, "<span>$1</span>")
I based this answer on this snippet: https://regex101.com/library/gN4vI6
Trying to manipulate HTML with regular expressions is not a good idea: sooner or later you'll bump into some boundary condition where it fails. Maybe some < or > occur inside attribute values, or even inside text nodes, while the searched term may also occur at unexpected places, like in HTML comments, attribute values, or script tags, ... The list of boundary cases is long.
Furthermore, your search term may contain characters that have a special meaning in regular expression syntax, so you should at least escape those.
Here is a solution that interprets the string as HTML, using the DOM capabilities, and only replaces text in text nodes:
/**
 * Escapes the characters that are special in a regular expression so that
 * `str` can be embedded in a pattern and matched literally.
 *
 * @param {string} str - Text to escape.
 * @returns {string} `str` with a backslash in front of each special character.
 */
function escapeRegExp(str) {
  const specialChars = /[\[\]\/{}()*+?.\\^$|-]/g;
  return str.replace(specialChars, (ch) => "\\" + ch);
}
/**
 * Parses `sentence` as HTML and wraps each whole-word, case-insensitive
 * occurrence of `word` found in the top-level text nodes in
 * <span class="someClass">. Text inside nested elements is left alone.
 *
 * @param {string} sentence - HTML markup to process.
 * @param {string} word - Word to wrap; it is matched literally.
 * @returns {string} The resulting HTML.
 */
function wrapText(sentence, word) {
    // The capture group makes split() return the matched words too, at the
    // odd indices of its result.
    const re = new RegExp("\\b(" + escapeRegExp(word) + ")\\b", "gi"),
        span = document.createElement('span');
    span.innerHTML = sentence;
    // Iterate over a copy: childNodes is live and the loop below inserts and
    // removes nodes, which would otherwise make freshly inserted nodes be
    // visited again.
    Array.from(span.childNodes).forEach(function (node) {
        if (node.nodeType !== 3) return;
        node.nodeValue.split(re).forEach(function (part, i) {
            let add;
            if (i%2) {
                add = document.createElement('span');
                add.textContent = part;
                add.className = 'someClass';
            } else {
                // Skip the empty strings split() yields next to a match.
                if (part === '') return;
                add = document.createTextNode(part);
            }
            span.insertBefore(add, node);
        });
        span.removeChild(node);
    });
    return span.innerHTML;
}
const html = 'This is some word. <span class="word">word</span> should stay',
result = wrapText(html, 'word');
console.log(result);
Recursing into elements
In comments you mentioned that you would now also like to have the replacements happening within some tags, like p.
I'll assume that you want this to happen for all elements, except those that have a certain class, e.g. the class that you use for the wrapping span elements, but you can of course customise the condition to your needs (like only recursing into p, or ...).
The code needs only a few modifications:
/**
 * Escapes the characters that are special in a regular expression so that
 * `str` can be embedded in a pattern and matched literally.
 *
 * @param {string} str - Text to escape.
 * @returns {string} `str` with a backslash in front of each special character.
 */
function escapeRegExp(str) {
  const specialChars = /[\[\]\/{}()*+?.\\^$|-]/g;
  return str.replace(specialChars, (ch) => "\\" + ch);
}
/**
 * Parses `sentence` as HTML and wraps each whole-word, case-insensitive
 * occurrence of `word` in <span class="someClass">, descending into every
 * element except those that already carry the class "someClass".
 *
 * @param {string} sentence - HTML markup to process.
 * @param {string} word - Word to wrap; it is matched literally.
 * @returns {string} The resulting HTML.
 */
function wrapText(sentence, word) {
    // The capture group makes split() return the matched words too, at the
    // odd indices of its result.
    const re = new RegExp("\\b(" + escapeRegExp(word) + ")\\b", "gi"),
        doc = document.createElement('span');
    doc.innerHTML = sentence;
    (function recurse(elem) {
        // Iterate over a copy: childNodes is live and the loop below inserts
        // and removes nodes, which would otherwise make freshly inserted
        // nodes be visited again.
        Array.from(elem.childNodes).forEach(function (node) {
            // Customise this condition as needed:
            if (node.classList && !node.classList.contains('someClass')) recurse(node);
            if (node.nodeType !== 3) return;
            node.nodeValue.split(re).forEach(function (part, i) {
                let add;
                if (i%2) {
                    add = document.createElement('span');
                    add.textContent = part;
                    add.className = 'someClass';
                } else {
                    // Skip the empty strings split() yields next to a match.
                    if (part === '') return;
                    add = document.createTextNode(part);
                }
                elem.insertBefore(add, node);
            });
            elem.removeChild(node);
        });
    })(doc);
    return doc.innerHTML;
}
const html = '<p><b>Some word</b></p>. <span class="someClass">word</span> should stay',
result = wrapText(html, 'word');
console.log(result);

Javascript regex to find forward slashes between > and <

In a Javascript function I need to replace all forward slashes not part of an HTML tag with &#x2F.
Is there any way using a regular expression to find all forward slashes between a > and a <?
Not exactly, but if you're in this kind of a fix, I guess you'll be happy with a quick-and-dirty solution: Find a / if the next occurring angle bracket is not a closing angle bracket.
result = subject.replace(/\/(?![^<>]*>)/g, "&#x2F");
Of course, this is highly brittle - for example it doesn't care at all about comments, strings etc. (yet, and it would be very difficult to pull this off with regex).
You can test this:
html ='toto/tata';
html = html.replace(/(<[^>]+>)|\//g,
function (match, p1) { return (p1)?match:"&#x2f"; });
console.log (html);
The idea is to capture all html tags (and replace by themselves) before trying to match slashes. Then a callback function tests if the first capture group exist and return the full match or the replacement.
You can improve the safety of this pattern to deal with style and script content, like this:
html = html.replace(/(<s(tyle|cript)\b[\s\S]*?<\/s\2>|<[^>]+>)|\//gi,
function (match, p1, p2) { return (p1)?match:"&#x2f"; });
Here is a good example, the first hit on Google: http://james.padolsey.com/javascript/find-and-replace-text-with-javascript/
The basic idea is to iterate through all the nodes in the DOM and replace text in text nodes. Also, don't replace any text in nodes in script, style, metadata type tags. While you might be able to do this with one big regex, it doesn't make much sense to implement a dom parser in regex when there is one built into every browser.
/**
 * Replaces text in the text nodes below `searchNode`, leaving tags and
 * attributes untouched. Elements listed in `excludes` are not descended into.
 *
 * @param {string|RegExp} searchText - Pattern to search for; a string is
 *   compiled as a global regular expression.
 * @param {string|Function} replacement - Passed to String.prototype.replace.
 *   The replaced text is parsed as HTML, so it may contain markup.
 * @param {Node=} searchNode - Root of the search; defaults to document.body.
 */
function findAndReplace(searchText, replacement, searchNode) {
    if (!searchText || typeof replacement === 'undefined') {
        // Throw error here if you want...
        return;
    }
    var regex = typeof searchText === 'string' ?
            new RegExp(searchText, 'g') : searchText,
        childNodes = (searchNode || document.body).childNodes,
        cnLength = childNodes.length,
        // Compared by exact name: a substring test would also exclude <a>,
        // because "a," occurs inside "meta,".
        excludes = ['html', 'head', 'style', 'title', 'link', 'meta', 'script', 'object', 'iframe'];
    // Walk backwards so that nodes inserted during replacement do not shift
    // the positions that are still to be visited.
    while (cnLength--) {
        var currentNode = childNodes[cnLength];
        if (currentNode.nodeType === 1) {
            if (excludes.indexOf(currentNode.nodeName.toLowerCase()) === -1) {
                // Named recursion: arguments.callee throws in strict mode.
                findAndReplace(regex, replacement, currentNode);
            }
            continue;
        }
        if (currentNode.nodeType !== 3) {
            continue;
        }
        // test() on a global regex starts at lastIndex; start from the top.
        regex.lastIndex = 0;
        if (!regex.test(currentNode.data)) {
            continue;
        }
        var parent = currentNode.parentNode,
            // NOTE(review): the whole replaced text goes through innerHTML,
            // so a literal "<" or "&" in the original text is parsed as
            // markup as well.
            html = currentNode.data.replace(regex, replacement),
            wrap = document.createElement('div'),
            frag = document.createDocumentFragment();
        wrap.innerHTML = html;
        while (wrap.firstChild) {
            frag.appendChild(wrap.firstChild);
        }
        parent.insertBefore(frag, currentNode);
        parent.removeChild(currentNode);
    }
}
Then use it
findAndReplace('\\/', '&#x2F');

Match characters at start of string, ignore strings in html tags

A little help required please...
I have a regular expression that matches characters at the start of a string as follows:
If I have a set of strings like so:
Ray Fox
Foster Joe
Finding Forrester
REGEX
/\bfo[^\b]*?\b/gi
This will match 'FO' in Fox, Foster, and Forrester as expected:
However, I am faced with an issue where if the set of strings are wrapped in html tags like so;-
<span class="fontColor1">Ray Fox</span>
<span class="fontColor2">Foster Joe</span>
<span class="fontColor3">Finding Forrester</span>
This will match 'FO' in fontColor* as well.
I'm fairly green with Regular expressions, I need a little help updating the query so that it only searches values between HTML tags where HTML tags exist, but still works correctly if HTML tags do not exist.
You can use a html parser and extract pure text, and match that.
// Parse the markup into a <body> element and read back only its text.
var root;
try {
// Use the body of a newly created HTML document as the parsing container.
root = document.implementation.createHTMLDocument("").body;
}
catch(e) {
// Fallback when the call above throws: a plain <body> element.
root = document.createElement("body");
}
root.innerHTML = '<span class="fontColor1">Ray Fox</span>\
<span class="fontColor2">Foster Joe</span>\
<span class="fontColor3">Finding Forrester</span>';
// If you are using jQuery: .text() yields the text content without the tags.
var text = $(root).text();
// Proceed as normal with the text variable
If you are not using jQuery, you can replace $(root).text() with findText(root), where findText:
/**
 * Returns the concatenated text of all text nodes below `root`, in document
 * order. Only text nodes and elements contribute; other node types are
 * ignored.
 *
 * @param {Node} root - Node whose descendants are read.
 * @returns {string} The collected text.
 */
function findText(root) {
    return Array.prototype.reduce.call(root.childNodes, function (soFar, child) {
        switch (child.nodeType) {
            case 3: // text node
                return soFar + child.nodeValue;
            case 1: // element: descend
                return soFar + findText(child);
            default:
                return soFar;
        }
    }, "");
}
What about
<.*?span.*?>(.*?)<\s?\/.*?span.*?>
And where do you have text where html tags don't exist? That makes no sense.
EDIT:
This solution will not match nested tags, but as the question is written, that doesn't seem to be an issue.

In JavaScript, how can I replace text in an HTML page without affecting the tags?

I'm trying to figure out how to do a replace with Javascript. I'm looking at the entire body of the page and would like to replace the keyword matches NOT within an HTML tag.
Here is an example:
<body>
<span id="keyword">blah</span>
<div>
blah blah keyword blah<br />
whatever keyword whatever
</div>
</body>
<script type="text/javascript">
var replace_terms = {
'keyword':{'url':'http://en.wikipedia.org/','target':'_blank'}
}
jQuery.each(replace_terms, function(i, val) {
var re = new RegExp(i, "gi");
$('body').html(
$('body').html().replace(re, '' + i + '')
);
});
</script>
I'm looking to replace all instances of the "keyword" that isn't within an HTML tag (between < and >).
I guess I also need to ignore if "keyword" is within a script or style element.
Don't use regex to parse HTML. [X][HT]ML is not a regular language and cannot reliably be processed using regex. Your browser has a good HTML parser built-in; let that take the strain of working out where the tags are.
Also you don't really want to work on html()/innerHTML on body. This will serialise and re-parse the entire page, which will be slow and will lose any information that cannot be serialised in HTML, such as event handlers, form values and other JavaScript references.
Here's a method using DOM that seems to work for me:
/**
 * Applies replaceInText to every text node below `element`, descending into
 * all child elements except <style> and <script>.
 *
 * @param {Node} element - Root of the subtree to process.
 * @param {RegExp} find - Pattern handed on to replaceInText.
 * @param {Function} replace - Callback handed on to replaceInText.
 */
function replaceInElement(element, find, replace) {
    // Go through the children back to front, since a replacement may make
    // the child node list longer.
    var index = element.childNodes.length;
    while (index-- > 0) {
        var current = element.childNodes[index];
        if (current.nodeType == 3) { // TEXT_NODE
            replaceInText(current, find, replace);
            continue;
        }
        if (current.nodeType != 1) { // anything that is not an ELEMENT_NODE
            continue;
        }
        var name = current.nodeName.toLowerCase();
        // special case, don't touch CDATA elements
        var isRawText = name == 'style' || name == 'script';
        if (!isRawText) {
            replaceInElement(current, find, replace);
        }
    }
}
/**
 * Replaces each match of `find` inside a text node with the node returned by
 * `replace(match)`, splitting the text node around the match.
 *
 * A global (or sticky) regex replaces every match; any other regex replaces
 * only the first one. Zero-length matches are ignored.
 *
 * @param {Text} text - Text node to process; it must have a parent.
 * @param {RegExp} find - Pattern to search for.
 * @param {function(Array): Node} replace - Receives the exec() match array
 *   and returns the node to put in place of the matched text.
 */
function replaceInText(text, find, replace) {
    var match;
    var matches = [];
    // exec() only advances for global or sticky patterns; for any other
    // pattern it would return the same match forever.
    var advances = find.global || find.sticky;
    // Start from the top even if the caller used the regex before.
    find.lastIndex = 0;
    while ((match = find.exec(text.data)) !== null) {
        if (match[0] !== '') {
            matches.push(match);
        } else if (advances) {
            find.lastIndex++; // step over a zero-length match
        }
        if (!advances) {
            break;
        }
    }
    // Replace back to front so that the recorded indices, which refer to the
    // original data, stay valid while the node is being split.
    for (var i = matches.length; i-- > 0;) {
        match = matches[i];
        text.splitText(match.index);
        text.nextSibling.splitText(match[0].length);
        text.parentNode.replaceChild(replace(match), text.nextSibling);
    }
}
// keywords to match. This *must* be a 'g'lobal regexp or it'll fail bad
var find= /\b(keyword|whatever)\b/gi;
// replace matched strings with wiki links
replaceInElement(document.body, find, function(match) {
var link= document.createElement('a');
link.href= 'http://en.wikipedia.org/wiki/'+match[0];
link.appendChild(document.createTextNode(match[0]));
return link;
});

Categories