Javascript Regex to replace text NOT in html attributes [duplicate] - javascript

This question already has answers here:
Highlight search terms (select only leaf nodes)
(7 answers)
Closed 9 years ago.
I'd like a Javascript Regex to wrap a given list of words in a given start tag (i.e. <span>) and end tag (i.e. </span>), but only if the word is actually "visible text" on the page, and not inside of an html attribute (such as a link's title attribute) or inside of a <script></script> block.
I've created a JS Fiddle with the basics setup: http://jsfiddle.net/4YCR6/1/

HTML is too complex to reliably parse with a regular expression.
If you're looking to do this client-side, you can create a document fragment and/or disconnected DOM node (neither of which is displayed anywhere) and initialize it with your HTML string, then walk through the resulting DOM tree and process the text nodes. (Or use a library to help you do that, although it's actually quite simple.)
Here's a DOM walking example. This example is slightly simpler than your problem because it just updates the text, it doesn't add new elements to the structure (wrapping parts of the text in spans involves updating the structure), but it should get you going. Notes on what you'll need to change at the end.
// Sample markup: "test" occurs in visible text, in an input's value
// attribute and in a class name. Only the visible text should change.
var html = [
  "<p>This is a test.</p>",
  "<form><input type='text' value='test value'></form>",
  "<p class='testing test'>Testing here too</p>"
].join("");
var frag = document.createDocumentFragment();
var body = document.createElement('body');
var node, next;

// Let the browser parse the string into a detached DOM tree
body.innerHTML = html;

// Rewrite the text nodes of that detached tree
walk(body);

// Move every parsed top-level node into a fragment (remembering the
// successor first, because appendChild detaches the node), then add the
// fragment to the live document in one operation
for (node = body.firstChild; node; node = next) {
  next = node.nextSibling;
  frag.appendChild(node);
}
document.body.appendChild(frag);
// Recursive DOM walker.
// Text nodes (type 3) are passed to handleText; elements (1), documents (9)
// and document fragments (11) have each of their children walked in turn.
// Every other node type is ignored.
function walk(node) {
  var type = node.nodeType;
  var current, following;

  if (type === 3) {
    handleText(node);
    return;
  }
  if (type !== 1 && type !== 9 && type !== 11) {
    return;
  }
  // Read the successor before recursing so the traversal does not depend
  // on what happens to the current child while it is being processed
  for (current = node.firstChild; current; current = following) {
    following = current.nextSibling;
    walk(current);
  }
}
// Upper-cases every occurrence of "test" (in any letter case) within the
// given text node by rewriting its nodeValue in place.
function handleText(textNode) {
  var original = textNode.nodeValue;
  var updated = original.replace(/test/gi, "TEST");
  textNode.nodeValue = updated;
}
Live example
The changes you'll need to make will be in handleText. Specifically, rather than updating nodeValue, you'll need to:
Find the index of the beginning of each word within the nodeValue string.
Use Node#splitText to split the text node into up to three text nodes (the part before your matching text, the part that is your matching text, and the part following your matching text).
Use document.createElement to create the new span (this is literally just span = document.createElement('span')).
Use Node#insertBefore to insert the new span in front of the third text node (the one containing the text following your matched text); it's okay if you didn't need to create a third node because your matched text was at the end of the text node, just pass in null as the refChild.
Use Node#appendChild to move the second text node (the one with the matching text) into the span. (No need to remove it from its parent first; appendChild does that for you.)

T.J. Crowder's answer is correct. I've gone a little further code-wise: here's a fully-formed example that works in all major browsers. I've posted variations of this code on Stack Overflow before (here and here, for example), and made it nice and generic so I (or anyone else) don't have to change it much to reuse it.
jsFiddle example: http://jsfiddle.net/7Vf5J/38/
Code:
// Reusable generic function.
// Recursively visits every text node below `el` and hands it to
// surroundMatchingText together with `regex` and `surrounderCreateFunc`.
//   el                   - element whose descendants are processed
//   regex                - pattern identifying the text to surround
//   surrounderCreateFunc - callback that builds the surrounding node
function surroundInElement(el, regex, surrounderCreateFunc) {
  // script and style elements are left alone. The test is case-insensitive
  // because tagName is reported in upper case for HTML documents.
  if (/^(script|style)$/i.test(el.tagName)) {
    return;
  }
  // Iterate from the last child backwards: surroundMatchingText only adds
  // nodes after the text node it is given, so previousSibling stays valid.
  var child = el.lastChild;
  while (child) {
    if (child.nodeType == 1) {
      surroundInElement(child, regex, surrounderCreateFunc);
    } else if (child.nodeType == 3) {
      surroundMatchingText(child, regex, surrounderCreateFunc);
    }
    child = child.previousSibling;
  }
}
// Reusable generic function.
// Splits `textNode` around every match of `regex` and replaces each matched
// piece with the node returned by `surrounderCreateFunc`, which receives a
// detached clone of the matched text node.
//   textNode             - text node to search (must have a parent)
//   regex                - pattern to look for
//   surrounderCreateFunc - callback: (matchedTextNodeClone) -> replacement node
function surroundMatchingText(textNode, regex, surrounderCreateFunc) {
  var parent = textNode.parentNode;
  var remaining = textNode;
  var result, surroundingNode, matchedTextNode, matchLength;

  // A global/sticky regex keeps state between calls; always start at 0
  regex.lastIndex = 0;
  while (remaining && (result = regex.exec(remaining.data))) {
    matchLength = result[0].length;
    // A zero-length match consumes no text, so the same position would be
    // matched again forever. Stop instead of looping.
    if (matchLength === 0) {
      regex.lastIndex = 0;
      break;
    }
    matchedTextNode = remaining.splitText(result.index);
    // Whatever follows the match becomes the text searched next
    remaining = (matchedTextNode.length > matchLength) ?
      matchedTextNode.splitText(matchLength) : null;
    // Ensure searching starts at the beginning of the text node
    regex.lastIndex = 0;
    surroundingNode = surrounderCreateFunc(matchedTextNode.cloneNode(true));
    parent.insertBefore(surroundingNode, matchedTextNode);
    parent.removeChild(matchedTextNode);
  }
}
// Default surrounder: builds the element that replaces each matched piece
// of text. Puts the given text node inside a new red <span> and returns
// that span. Swap this function out to customize the highlighting.
function createSpan(matchedTextNode) {
  var span = document.createElement("span");
  span.appendChild(matchedTextNode);
  span.style.color = "red";
  return span;
}
// The main function.
// Wraps every occurrence of each word in `words` found in the text below
// `container`, using createSpan to build the wrapper.
//   container - element whose text is searched
//   words     - array of literal words; each one is matched verbatim
function wrapWords(container, words) {
  var literal;
  // Replace the words one at a time to ensure "test2" gets matched
  for (var i = 0, len = words.length; i < len; ++i) {
    // Escape regex metacharacters so a word such as "c++" or "a.b" is
    // matched literally rather than being interpreted as a pattern
    literal = String(words[i]).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // An empty word would produce a regex that matches the empty string
    // at every position, so there is nothing meaningful to wrap
    if (literal) {
      surroundInElement(container, new RegExp(literal), createSpan);
    }
  }
}
// Example usage: wrap occurrences of "test2" and then "test" (one pass per
// word, in this order) inside the element with id "container"
wrapWords(document.getElementById("container"), ["test2", "test"]);

Related

Using Javascript to create a link on raw text [duplicate]

How do I find every word on a page beginning with http:// and wrap tags around it?
Can I use something like regex perhaps?
I disagree heavily that jQuery can't be of much use in finding a solution here. Granted, you have to get down and dirty with some of the text node properties, but putting the DOM back together again after you split your matched node can be made a wee bit easier using the jQuery library.
The following code is documented inline to explain the action taken. I've written it as a jQuery plugin in case you just want to take this and move it around elsewhere. This way you can scope which elements you want to convert URLs for or you can simply use the $("body") selector.
(function($) {
  // jQuery plugin: turns every http:// or https:// URL found in the text
  // nodes of the selected elements (and of all their descendants) into an
  // <a> element pointing at that URL. Returns the jQuery set for chaining.
  $.fn.anchorTextUrls = function() {
    // URLs end at whitespace, a double quote or the end of the text; one
    // trailing punctuation character is kept out of the URL
    // (don't look too hard or you will be consumed)
    var urlPattern = /(https?:\/\/.*?)[.!?;,]?(\s+|"|$)/;

    // Test a text node's contents for URLs and split and rebuild it with an anchor
    var testAndTag = function(el) {
      var m = el.nodeValue.match(urlPattern);
      // If we've found a valid URL, m[1] contains the URL
      if (!m) {
        return;
      }
      var url = m[1];
      // Clone the text node to hold the "tail end" of the split node
      var tail = $(el).clone()[0];
      // Keep the text before the URL in el and the text after it in tail
      var start = el.nodeValue.indexOf(url);
      el.nodeValue = el.nodeValue.substring(0, start);
      tail.nodeValue = tail.nodeValue.substring(start + url.length);
      // Rebuild the DOM inserting the new anchor element between the split
      // text nodes. The link label is set with .text(), not .html(): the
      // URL comes from page text and must never be re-parsed as markup.
      $(el).after(tail).after($("<a></a>").attr("href", url).text(url));
      // Recurse on the new tail node to check for more URLs
      testAndTag(tail);
    };

    // For each element selected by jQuery
    return this.each(function() {
      // Select all descendant nodes of the element and pick out only text
      // nodes, skipping text that is already inside a link so anchors are
      // never nested
      var textNodes = $(this).add("*", this).contents().filter(function() {
        return this.nodeType === 3 && $(this).parent().closest("a").length === 0;
      });
      // Take action on each text node
      $.each(textNodes, function(i, el) {
        testAndTag(el);
      });
    });
  };
}(jQuery));
$("body").anchorTextUrls(); //Sample call
Please keep in mind that given the way I wrote this to populate the textNodes array, the method will find ALL descendant text nodes, not just immediate children text nodes. If you want it to replace URLs only amongst the text within a specific selector, remove the .add("*", this) call that adds all the descendants of the selected element.
Here's a fiddle example.
This is one of those few things that jQuery doesn't directly help you with much. You basically have to walk through the DOM tree and examine the text nodes (nodeType === 3); if you find a text node containing the target text you want to wrap ("http://.....", whatever rules you want to apply), you then split the text node (using splitText) into three parts (the part before the string, the part that is the string, and the part following the string), then put the a element around the second of those.
That sounds a bit complicated, but it isn't really all that bad. It's just a recursive descent walker function (for working through the DOM), a regex match to find the things you want to replace, and then a couple of calls to splitText, createElement, insertBefore, appendChild.
Here's an example that searches for a fixed string; just add your regex matching for "http://":
// Example usage: process every text node below <body>, looking for the
// fixed string "foo"
walk(document.body, "foo");
// Recursive DOM walker: descends through elements and passes every text
// node it finds to handleText together with `targetString`.
//   node         - node to start from
//   targetString - string to look for, forwarded unchanged to handleText
function walk(node, targetString) {
  var child, next;
  switch (node.nodeType) {
    case 1: // Element
      child = node.firstChild;
      while (child) {
        // Remember the successor BEFORE processing the child. handleText
        // may remove the child or insert a wrapper right after it; reading
        // nextSibling afterwards would then either end the loop early
        // (skipping the remaining siblings) or descend into the freshly
        // created wrapper and wrap the same text a second time.
        next = child.nextSibling;
        walk(child, targetString);
        child = next;
      }
      break;
    case 3: // Text node
      handleText(node, targetString);
      break;
  }
}
// Wraps the first occurrence of `targetString` inside the text node `node`
// in a <span class="wrapper">. Does nothing when the string is not present.
//   node         - text node to search
//   targetString - fixed string to wrap
//                  (this would be a regex test in the http://... case)
function handleText(node, targetString) {
  var matchAt = node.nodeValue.indexOf(targetString);
  if (matchAt < 0) {
    return;
  }

  // Cut the text into three nodes: before / match / after
  var hit = node.splitText(matchAt);
  var rest = hit.splitText(targetString.length);

  // Put the wrapper where the match is, then move the match into it
  // (here a `span` with a class; for links you'd use an `a`)
  var wrapper = document.createElement('span');
  wrapper.className = "wrapper";
  hit.parentNode.insertBefore(wrapper, hit);
  wrapper.appendChild(hit);

  // Splitting at the very start or end of the text leaves an empty text
  // node behind; drop those
  var leftovers = [node, rest];
  for (var k = 0; k < leftovers.length; k++) {
    if (leftovers[k].nodeValue.length == 0) {
      leftovers[k].parentNode.removeChild(leftovers[k]);
    }
  }
}
Live example
Update: The above didn't handle it if there were multiple matches in the same text node (doh!). And oh what the heck, I did a regexp match — you will have to adjust the regexp, and probably do some post-processing on each match, because what's here is too simplistic. But it's a start:
// Example usage: link every "http://..." run of non-space characters found
// in the text below <body>.
// The regexp should have a capture group that
// will be the href. In our case below, we just
// make it the whole thing, but that's up to you.
// THIS REGEXP IS ALMOST CERTAINLY TOO SIMPLISTIC
// AND WILL NEED ADJUSTING (for instance: what if
// the link appears at the end of a sentence and
// it shouldn't include the ending punctuation?).
walk(document.body, /(http:\/\/[^ ]+)/i);
// Recursive DOM walker: descends through elements and passes every text
// node it finds to handleText together with `targetRe`.
//   node     - node to start from
//   targetRe - regular expression identifying the text to turn into links
function walk(node, targetRe) {
  var child, next;
  switch (node.nodeType) {
    case 1: // Element
      child = node.firstChild;
      while (child) {
        // Remember the successor BEFORE processing the child: handleText
        // removes emptied text nodes and inserts <a> wrappers, so reading
        // nextSibling afterwards could skip the remaining siblings or
        // descend into a wrapper that was just created.
        next = child.nextSibling;
        walk(child, targetRe);
        child = next;
      }
      break;
    case 3: // Text node
      handleText(node, targetRe);
      break;
  }
}

// Wraps EVERY match of `targetRe` inside the text node `node` in an `a`
// element. The first capture group is used as the href; if the regexp has
// no capture group, the whole match is used instead.
//   node     - text node to search (must have a parent)
//   targetRe - regular expression to match
function handleText(node, targetRe) {
  var current = node;
  var match, targetNode, followingNode, wrapper;

  // A global/sticky regexp carries lastIndex between calls; start at 0
  targetRe.lastIndex = 0;
  match = targetRe.exec(current.nodeValue);
  // A zero-length match consumes no text and would repeat forever
  while (match && match[0].length > 0) {
    // Split at the beginning of the match
    targetNode = current.splitText(match.index);
    // Split at the end of the match.
    // match[0] is the full text that was matched.
    followingNode = targetNode.splitText(match[0].length);
    // Wrap the target in an `a` element.
    // First we create the wrapper and insert it in front
    // of the target text.
    wrapper = document.createElement('a');
    wrapper.href = (match[1] !== undefined) ? match[1] : match[0];
    targetNode.parentNode.insertBefore(wrapper, targetNode);
    // Now we move the target text inside it
    wrapper.appendChild(targetNode);
    // Clean up any empty nodes (in case the target text
    // was at the beginning or end of a text node)
    if (current.nodeValue.length == 0) {
      current.parentNode.removeChild(current);
    }
    if (followingNode.nodeValue.length == 0) {
      followingNode.parentNode.removeChild(followingNode);
      break; // nothing left after this match
    }
    // Continue with the next match in the remaining text, if any
    current = followingNode;
    targetRe.lastIndex = 0;
    match = targetRe.exec(current.nodeValue);
  }
}
Live example
I haven't tested this in practice, but you can try it:
// Runs the callback once for every existing <a> element whose href
// attribute starts with "http://". The attribute filter is attached
// directly to the tag name; wrapping it in parentheses is not valid
// selector syntax.
$('a[href^="http://"]').each(function () {
  //perform your task
});

how to make a chrome extension that turns specific words into hyperlinks?

I am trying to make a chrome extension that finds certain words on pages and turns them into hyperlinks. So for example, if I visit a website that has the word "search" written somewhere, then that word will turn into a link that I can click on (it will still appear as the word but maybe in a different colour or something) and be redirected to "www.google.com".
I have a code that finds words and changes them to other words but I don't know how to change them to hyperlinks. Here is the JavaScript I have:
// Entry point: process every text node below <body>
walk(document.body);
// Recursively visits `node` and its descendants. Text nodes (type 3) go to
// handleText; elements (1), documents (9) and document fragments (11) have
// their children visited; all other node types are skipped.
function walk(node) {
  var kind = node.nodeType;
  var isContainer = kind === 1 || kind === 9 || kind === 11;
  var child, next;

  if (isContainer) {
    child = node.firstChild;
    while (child) {
      // fetch the successor first so the loop does not depend on the
      // state of the child after it has been processed
      next = child.nextSibling;
      walk(child);
      child = next;
    }
  } else if (kind === 3) {
    handleText(node);
  }
}
// Replaces every whole-word occurrence of "search" in the text node's value
// with anchor markup pointing at https://www.google.com.
// NOTE: the result is assigned to nodeValue, so the markup is stored as
// literal text. To get a clickable link, the string has to be parsed into
// DOM nodes that replace this text node.
function handleText(textNode)
{
  var v = textNode.nodeValue;
  // The replacement callback receives the matched word. (The previous
  // version referenced the undefined identifiers `str` and `asdf`, which
  // threw a ReferenceError on every call.)
  v = v.replace(/\bsearch\b/g, function (word)
  {
    return '<a href="https://www.google.com">' + word + '</a>';
  });
  textNode.nodeValue = v;
}
Can somebody helpful help me out please?
HTML is parsed into the DOM, which is what you are traversing. You are not really replacing the DOM, but instead replacing the text node with unparsed HTML, which won't work. Instead, you need to replace the text node with a <a> node that has the URL and text you want.
You have to replace the text node with the parsed HTML elements.
Change
textNode.nodeValue = v;
to something like
// Let the browser parse the markup string `v` inside a scratch <div>
var htmlParser = document.createElement('div');
htmlParser.innerHTML = v;
// replace text node with parsed nodes: insertBefore moves each node out of
// the scratch element, so its first child changes on every iteration until
// the scratch element is empty
var newNodes = htmlParser.childNodes;
var host = textNode.parentNode;
while (htmlParser.firstChild) {
  host.insertBefore(htmlParser.firstChild, textNode);
}
host.removeChild(textNode);
Also, make sure to ignore anchors (node.tagName === 'A') while traversing the DOM, or you could end up replacing the text inside existing anchors.

A generic way to extract and replace text from the DOM [duplicate]

This question already has an answer here:
Replace each word in webpage's paragraphs with a button containing that text
(1 answer)
Closed 5 years ago.
I have two wrappers:
// Wraps every sentence of `str` (a run of characters other than . ! ?
// followed by one or more of . ! ?) using the replacement `tmpl`.
// A missing or falsy `tmpl` falls back to "<sentence>$&</sentence>".
// Returns the resulting string.
function wrapSentences(str, tmpl) {
  var sentencePattern = /[^\.!\?]+[\.!\?]+/g;
  var replacement = tmpl ? tmpl : "<sentence>$&</sentence>";
  return str.replace(sentencePattern, replacement);
}
and
// Wraps every run of word characters (\w+) in `str` using the replacement
// `tmpl`. A missing or falsy `tmpl` falls back to "<word>$&</word>".
// Returns the resulting string.
function wrapWords(str, tmpl) {
  var wordPattern = /\w+/g;
  var replacement = tmpl ? tmpl : "<word>$&</word>";
  return str.replace(wordPattern, replacement);
}
I use these in our extension to wrap every word and sentence on any webpage the user visits for TTS and settings purposes.
document.body is the most atomic element on every website, but doing body.innerHTML = wrapWords(body.innerText) will (obviously) replace any element that was in between the different text nodes, thus breaking (the visual part of) the website. I'm looking for a way to find any closest element around any text without knowing anything specific about that element, so I can replace it with a wrapped equivalent without altering the website in any way.
I found several examples that go to the deepest child, but they all rely on passing something (node or id) the extension has no way of knowing about. We will use rangy for highlighting, but have the same issue... I always end up having to pass a node or id that the extension is unable to be aware of when visiting random sites.
One of the examples that needs a node passed:
// Sets the text of every non-blank text node at or below `node` to
// `newText`. Text nodes containing only whitespace are left untouched.
function replaceTextNodes(node, newText) {
  if (node.nodeType === 3) {
    // Filter out text nodes that contain only whitespace
    const isBlank = /^\s*$/.test(node.data);
    if (!isBlank) {
      node.data = newText;
    }
    return;
  }
  if (!node.hasChildNodes()) {
    return;
  }
  // The child count is read once, before any child is visited
  const total = node.childNodes.length;
  let index = 0;
  while (index < total) {
    replaceTextNodes(node.childNodes[index], newText);
    index += 1;
  }
}
I'll be happy to explain it better if needed. I fear my wording may not always be the best, I'm aware of that.
It looks like what you want is all the text nodes on the page... This question might have your answer.
Using the function from the first answer:
Edit: now wrapping text in <word> nodes, not just their textContent
// Returns an array with every node that a TreeWalker restricted to
// NodeFilter.SHOW_TEXT yields for the subtree rooted at `el`, in the order
// the walker returns them.
function textNodesUnder(el) {
  var found = [];
  var walker = document.createTreeWalker(el, NodeFilter.SHOW_TEXT, null, false);
  for (var current = walker.nextNode(); current; current = walker.nextNode()) {
    found.push(current);
  }
  return found;
}
// Matches either a run of non-word characters (group 1) or a run of word
// characters (group 2), so consecutive matches cover the whole string.
// Declared explicitly: assigning to an undeclared name creates an implicit
// global and throws a ReferenceError in strict mode / ES modules.
const exp = /(?:(\W+)|(\w+))/g;

// Replace every non-blank text node below <body> with a sequence of nodes:
// each word becomes a <word> element, everything between words stays text.
textNodesUnder(document.body)
  .filter((t) => !/^\s*$/.test(t.textContent))
  .forEach((t) => {
    const parent = t.parentNode;
    const s = t.textContent;
    let match;
    // The regexp is global and shared between text nodes: start at 0
    exp.lastIndex = 0;
    while ((match = exp.exec(s)) !== null) {
      let el;
      if (match[1] !== undefined) {
        // Non-word characters are re-inserted as plain text
        el = document.createTextNode(match[1]);
      } else {
        el = document.createElement("word");
        el.textContent = match[2];
      }
      // Insert in front of the original node to preserve the order
      parent.insertBefore(el, t);
    }
    // The original text node is now fully duplicated; drop it
    parent.removeChild(t);
  });

CSS styling a single character within a word

My client has asked for the letter 4 to appear in red, wherever it is used in his website navigation.
For instance, where he has 'bikes4kids' as a menu item.
Unfortunately, I am using a 'mega menu' style plugin for his Magento site that only allows for plain text menu items - I cannot use HTML code in the menu item title box, which takes away the chance of me using <span>.
Is there a way of achieving this with JS? I assume not with CSS alone.
EDIT: The mega menu I am working with can be seen here: http://www.magentech.com/extensions/commercial-extensions/item/246-sm-mega-menu-responsive-magento-module
I did it.
Please have a look at this Link
<div class="title">menu1</div>
<div class="title">bike4kids</div>
<div class="title">menu2</div>
// Colour every "4" in the second .title element red.
// Splitting on "4" and joining the pieces with the coloured span handles
// any number of occurrences. Concatenating only the first two pieces
// dropped everything after a second "4", and produced "undefined" when the
// text contained no "4" at all.
var avno = $(".title:nth-child(2)").text();
var avn = avno.split('4');
var item = avn.join("<span style='color:red'>4</span>");
$(".title:nth-child(2)").html(item);
No, within “plain text menu items” (as described in the question) you cannot style one character differently from others (except in a few very special cases, which do not apply here: styling the first letter, and setting the font of some characters different from others). JavaScript won’t help, because you would still need to make the character an element, and anything containing an element is by definition not plain text.
So you need to consider other approaches, like menus with items that allow some markup.
If you can process the document after it's finished loading, or sometime after magento has finished doing its thing, you can try the following. It will wrap a provided character in a span with a supplied class. A root element can be provided to limit the scope of the replace. If no root is provided, it searches the entire document.
// Copies an indexed, array-like object (such as a NodeList) into a real
// array by reading obj[0] .. obj[obj.length - 1].
// Deliberately minimal; not suitable for general application.
function toArray(obj) {
  var copy = [];
  var total = obj.length;
  var index = 0;
  while (index < total) {
    copy.push(obj[index]);
    index += 1;
  }
  return copy;
}
// Highlight character c by wrapping every occurrence in a span with class
// className, starting with element root.
//   c         - character (or literal string) to highlight
//   className - class given to each span; defaults to 'highlightChar'
//   root      - element to search; if not provided, document.body is used
function highlightChar(c, className, root) {
  var needle = String(c);
  // Nothing to highlight; an empty needle would split between every character
  if (!needle) return;
  var container = root || document.body;
  var frag, pieces, node, j;
  // Tag names to ignore: rewriting their text would corrupt code or CSS
  var ignoreTags = {'script': true, 'style': true};
  // Child nodes is a live NodeList, convert to array
  // so don't have to deal with changing as nodes are added
  var nodes = toArray(container.childNodes);
  // Template span that is cloned for every occurrence
  var span = document.createElement('span');
  span.appendChild(document.createTextNode(needle));
  span.className = className || 'highlightChar';

  for (var i = 0, iLen = nodes.length; i < iLen; i++) {
    node = nodes[i];
    if (node.nodeType == 3) {
      // Plain string split: the needle is matched literally, so
      // characters such as "." or "$" need no escaping
      pieces = node.data.split(needle);
      // A single piece means the text does not contain the needle
      if (pieces.length < 2) continue;
      frag = document.createDocumentFragment();
      for (j = 0; j < pieces.length; j++) {
        // One highlight span between each pair of neighbouring pieces
        if (j > 0) {
          frag.appendChild(span.cloneNode(true));
        }
        // Skip empty pieces (needle at the start/end or repeated)
        if (pieces[j]) {
          frag.appendChild(document.createTextNode(pieces[j]));
        }
      }
      // Replace the original text node with the highlighted fragment
      node.parentNode.replaceChild(frag, node);
    // Otherwise, if node is an element, process it
    } else if (node.nodeType == 1 &&
        !Object.prototype.hasOwnProperty.call(ignoreTags, node.tagName.toLowerCase())) {
      highlightChar(needle, className, node);
    }
  }
}
It can be used to process the entire document using:
// Highlight every "4" once the page has finished loading.
// Registered with addEventListener so that load handlers installed by
// other scripts are kept; assigning window.onload would replace them.
window.addEventListener('load', function() {
  highlightChar('4','highlightChar');
});
Edit:
Modified to find menu-items in 'mega menu'... I hope. In the demo site the "$" variable isn't jQuery so I modified the answer as well to use the jQuery function.
Testing in the demo site I found that the letter I modified did color yellow, but there was a bullet added to the left of it - apparently their css adds a bullet to the left (ie. :before) every span...
After the plugin completes its DOM modifications - simply run over the menu items and search-and-replace "4" with a colored span
eg.
// loop over all menu item spans of the mega menu
// - I assume here below them exist only text
jQuery('.sm-megamenu-child span').each(function() {
  var $item = jQuery(this);
  var text = $item.text();
  // Leave items without a "4" untouched instead of rewriting their markup
  if (text.indexOf('4') === -1) {
    return;
  }
  // .text() returns unescaped text; escape it again before it goes back
  // through .html(), so characters such as "<" or "&" in a menu title are
  // not interpreted as markup
  var escaped = jQuery('<div>').text(text).html();
  var modified = escaped.replace(/4/g, "<span style='color:yellow'>4</span>");
  $item.html(modified);
});

Broken HTML tags when using .innerHTML

As part of a larger script, I've been trying to make a page that would take a block of text from another function and "type" it out onto the screen:
// "Types" `page` into the element with id "text": appends the character at
// index `nChar` to the element's innerHTML and schedules itself for the
// next character 20 ms later. Stops once `nChar` reaches page.length.
function typeOut(page, nChar) {
  var txt = document.getElementById("text");
  if (!(nChar < page.length)) {
    return;
  }
  txt.innerHTML += page[nChar];
  setTimeout(function () {
    typeOut(page, nChar + 1);
  }, 20);
}
This basically works the way I want it to, but if the block of text I pass it has any html tags in it (like links), those show up as plain-text instead of being interpreted. Is there any way to get around that and force it to display the html elements correctly?
The problem is that you will create invalid HTML in the process, which the browser will try to correct. So apparently when you add < or >, it will automatically encode that character to not break the structure.
A proper solution would not work literally with every character of the text, but would process the HTML element by element. I.e. whenever you encounter an element in the source HTML, you would clone the element and add it to target element. Then you would process its text nodes character by character.
Here is a solution I hacked together (meaning, it can probably be improved a lot):
function typeOut(html, target) {
var d = document.createElement('div');
d.innerHTML = html;
var source = d.firstChild;
var i = 0;
(function process() {
if (source) {
if (source.nodeType === 3) { // process text node
if (i === 0) { // create new text node
target = target.appendChild(document.createTextNode(''));
target.nodeValue = source.nodeValue.charAt(i++);
// stop and continue to next node
} else if (i === source.nodeValue.length) {
if (source.nextSibling) {
source = source.nextSibling;
target = target.parentNode;
}
else {
source = source.parentNode.nextSibling;
target = target.parentNode.parentNode;
}
i = 0;
} else { // add to text node
target.nodeValue += source.nodeValue.charAt(i++);
}
} else if (source.nodeType === 1) { // clone element node
var clone = source.cloneNode();
clone.innerHTML = '';
target.appendChild(clone);
if (source.firstChild) {
source = source.firstChild;
target = clone;
} else {
source = source.nextSibling;
}
}
setTimeout(process, 20);
}
}());
}
DEMO
Your code should work. Example here : http://jsfiddle.net/hqKVe/2/
The issue is probably that the content of page[nChar] has HTML chars escaped.
The easiest solution is to use the html() function of jQuery (if you use jQuery). There a good example given by Canavar here : How to decode HTML entities using jQuery?
If you are not using jQuery, you have to unescape the string by yourself. In practice, just do the opposite of what is described here : Fastest method to escape HTML tags as HTML entities?

Categories