Javascript RegEx - Split Html-string

Javascript RegEx - Split Html-string - javascript

I'm working on a script and need to split strings which contain both HTML tags and text. I'm trying to isolate the text and eliminate the tags.
For example, I want this:
string = '<p><span style="color:#ff3366;">A</span></p><p><span style="color:#ff3366;text-decoration:underline;">B</span></p><p><span style="color:#ff3366;text-decoration:underline;"><em>C</em></span></p>';
to be split like this:
separation = string.split(/some RegExp/);
and become:
separation[0] = "<span style="color:#ff3366;">A</span>";
separation[1] = "<span style="color:#ff3366;text-decoration:underline;">B</span>";
separation[2] = "<span style="color:#ff3366;text-decoration:underline;"><em>C</em></span>";
After that I would like to split each separation string like this:
stringNew = '<span style="color:#ff3366;">A</span>';
extendedSeperation = stringNew.split(/some RegExp/);
extendedSeperation[0] = "A";
extendedSeperation[1] = "style="color:#ff3366;";

Don't use RegEx for reasons explained in comments.
Instead, do this:
Create an invisible node:
node = $("<div>").css("display", "none");
Attach it to the body:
$("body").append(node);
Now inject your HTML into the node:
node.html(myHTMLString);
Now you can traverse the DOM tree and extract/render it as you like, much like this:
ptags = node.find("p") // will return all <p> tags
To get the content of a tag use:
ptags.eq(0).html() // .eq(0) keeps the jQuery wrapper; ptags[0] is a raw DOM element with no .html() method
Finally, to clear the node do:
node.html("");
This should be enough to get you going.
This way you leverage the internal parser of the browser, as suggested in the comments.

Your exact expectations are a little unclear, but based only on the information given here is an example that may give you ideas.
Does not use RegExp
Does not use jQuery or any other library
Does not append and remove elements from the DOM
Is well supported across browsers
/**
 * Visits `node` and then every descendant, depth-first in document order.
 *
 * @param {Object} node - Root of the subtree; read through `firstChild` / `nextSibling`.
 * @param {Function} func - Called once per visited node, parent before children.
 */
function walkTheDOM(node, func) {
  func(node);
  for (var child = node.firstChild; child; child = child.nextSibling) {
    walkTheDOM(child, func);
  }
}
/**
 * Returns the text of `node`.
 *
 * Uses the node's own `textContent` when it is neither undefined nor null;
 * otherwise concatenates the `nodeValue` of every text node (nodeType 3)
 * found by walkTheDOM.
 *
 * @param {Object} node - Node whose text is wanted.
 * @returns {string} The collected text.
 */
function textContent(node) {
  var builtIn = node.textContent;
  if (typeof builtIn !== "undefined" && builtIn !== null) {
    return builtIn;
  }

  var collected = "";
  walkTheDOM(node, function (current) {
    if (current.nodeType !== 3) {
      return;
    }
    collected += current.nodeValue;
  });
  return collected;
}
/**
 * Parses an HTML string into a detached <div> (it is never added to the page).
 *
 * @param {string} text - HTML markup to parse.
 * @returns {Object} The <div> element whose innerHTML was set to `text`.
 */
function dominate(text) {
  var holder = document.createElement('div');
  holder.innerHTML = text;
  return holder;
}
/**
 * Lists the outerHTML of every <span> found in the given markup.
 *
 * @param {string} htmlText - HTML markup to scan.
 * @returns {string[]} One outerHTML string per <span>, in document order.
 */
function toSeparation(htmlText) {
  var found = dominate(htmlText).getElementsByTagName('span');
  var markup = [];
  var total = found.length;
  var pos = 0;
  while (pos < total) {
    markup.push(found[pos].outerHTML);
    pos += 1;
  }
  return markup;
}
/**
 * Splits one HTML fragment into its text and its inline style.
 *
 * The fragment is parsed with dominate() and only its first child node is
 * inspected.
 *
 * @param {string} node - HTML markup for a single element (the parameter name
 *   is historical; it is a string, not a DOM node).
 * @returns {string[]} The first node's text (if non-empty), followed by
 *   "style=<value>" when that node carries a style attribute. Empty markup
 *   yields an empty array.
 */
function toExtendedSeperation(node) {
  var child = dominate(node).firstChild,
    result = [],
    attributes,
    text,
    index,
    attr;

  // Empty markup has no first child; nothing to report.
  if (!child) {
    return result;
  }

  text = textContent(child);
  if (text) {
    result.push(text);
  }

  // Text and comment nodes have no attribute list; treat them as attribute-less.
  attributes = child.attributes || [];
  for (index = 0; index < attributes.length; index += 1) {
    attr = attributes[index];
    if (attr.name === 'style') {
      result.push(attr.name + '=' + attr.value);
      break;
    }
  }
  return result;
}
// Demo: split the sample markup into its <span> fragments, break the first
// fragment into text + style, and print both results as JSON into #out.
var strHTML = '<p><span style="color:#ff3366;">A</span></p><p><span style="color:#ff3366;text-decoration:underline;">B</span></p><p><span style="color:#ff3366;text-decoration:underline;"><em>C</em></span></p>',
separation = toSeparation(strHTML),
extendedSeperation = toExtendedSeperation(separation[0]),
// The page must contain an element with id "out" to receive the output.
pre = document.getElementById('out');
pre.appendChild(document.createTextNode(JSON.stringify(separation, null, 2)));
// Blank line between the two JSON dumps.
pre.appendChild(document.createTextNode('\n\n'));
pre.appendChild(document.createTextNode(JSON.stringify(extendedSeperation, null, 2)));
<pre id="out"></pre>
Of course you will need to make modifications to suit your exact needs.

Related

JavaScript - Extract specific nodes along with the node start position

Following is a sample node,
<div>Hell<span class="locate">Q1</span>o <b>w<span class="locate">Q2</span>or</b>ld</div>
My goal is to extract all the 'locate' class nodes along with their start character/text position (So later, i can reuse that text/character position to inject the node)
Ex:
<div>Hell<span class="locate">Q1</span>o <b>w<span class="locate">Q2</span>or</b>ld</div>
Extract something like
Output:
1. Extraction
[
{
"start": 5,
"node": "<span class='locate'>Q1</span>"
},
{
"start": 9,
"node": "<span class='locate'>Q2</span>"
}
]
2. Removal of the locate nodes, leaving `Hello world`
So far what I've tried:
// Walks every node under `input` and tries to record each "locate" element
// together with a running text length.
// NOTE(review): input, totalText, collectorArray and updatedNode are not
// declared in this snippet — presumably defined by the surrounding code.
treeWalker = document.createTreeWalker(input, NodeFilter.SHOW_ALL);
while(treeWalker.nextNode()) {
temp = {};
currentNode = treeWalker.currentNode;
// Only nodes whose direct parent is a <div> are considered.
if (currentNode.parentNode.tagName.toLowerCase() === 'div') {
// Adds the node's full textContent length, for elements as well as text nodes.
totalText += currentNode.textContent.length;
if (
currentNode.className &&
currentNode.className.toLowerCase() === 'locate'
) {
temp.startPosition = totalText;
temp.node = currentNode.cloneNode(true);
collectorArray.push(temp);
console.log(currentNode, totalText);
} else {
updatedNode.appendChild(currentNode.cloneNode(true));
console.log(currentNode, totalText);
}
}
}
I tried to use TreeWalker to collect the nodes and their start positions, but I'm not getting anywhere.
I also feel the way i calculate the text length is wrong. May be a better way?
The idea of the whole process is, reapply the collected locate nodes after some text change occurs, using the start position and node collected.

Try this (sorry for bad indentation in the snippet):
// Collect every element with class "locate" under #someId together with the
// offset (0-based, measured in the root's textContent) at which it starts.
//
// The walker visits elements and text nodes in document order, so an element
// is reached before any of its own text. The running total of text seen so
// far is therefore exactly the element's start offset. Unlike searching the
// text with indexOf, this cannot be fooled by the same text appearing earlier
// in the document.
var root = document.getElementById('someId');
var walker = document.createTreeWalker(
  root,
  NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT
);
var result = [];
var offset = 0;
var node;
while ((node = walker.nextNode())) {
  if (node.nodeType === Node.TEXT_NODE) {
    offset += node.nodeValue.length;
  } else if (node.classList.contains('locate')) {
    // classList also matches elements that carry additional classes.
    result.push({ start: offset, node: node.outerHTML });
  }
}
console.log(result);
<div id="someId">Hell<span class="locate">Q1</span>o <b>w<span class="locate">Q1</span>or</b>ld<span class="locate">Q1</span></div>
Note, that I use SHOW_ELEMENT because you distinguish what is needed by elements class name. So later you can just set a rule in iterator that accepts only those nodes.
And then I get textContent of the root element and get index value of accepted nodes from iterator object.

Javascript replace function error

I have a problem with the javascript replace function and I don't succeed to resolve it.
This is my code : https://jsfiddle.net/r36k20sa/1/
// For each tag, run a case-insensitive, global, multiline replace over `content`.
// NOTE(review): `content` is not declared in this snippet — presumably a
// string of HTML defined by the surrounding page.
var tags = ['zazie', 'johnny'];
tags.forEach(function(element) {
content = content.replace(
// Whole-word match of the tag, guarded by two negative lookaheads that mention <a ...> and </a>.
new RegExp("(?!<a.*?>.*?)(\\b" + element + "\\b)(?!.*?<\\/a>)", "igm"),
// '$1' is the only capture group (the tag itself), so each match is replaced by its own text.
'$1'
);
});
If I reverse the tags array ("johnny" first, then "zazie"), all tags are matched correctly; otherwise some tags are missed (the last one in this example). What is the trick?
How can this be explained? It looks as if the JavaScript replace function runs asynchronously.
Thanks for your help.

Are you seriously using regex to process HTML when you have a DOM parser at your fingertips?
var content = document.getElementById('content');
/**
 * Recursively collects the text nodes under `root`, skipping anything inside
 * an <a> element.
 *
 * @param {Object} root - Node whose `childNodes` are scanned.
 * @param {Array} [ret] - Accumulator to append to; a new array is used when omitted.
 * @returns {Array} The accumulator, holding the text nodes in document order.
 */
function findTextNodes(root, ret) {
  var collected = ret || [];
  var kids = root.childNodes;
  var total = kids.length;
  for (var pos = 0; pos < total; pos++) {
    var kid = kids[pos];
    if (kid.nodeType == 3) { // text node
      collected.push(kid);
      continue;
    }
    // Descend into elements, except <a> (you might also want to exclude BUTTON).
    if (kid.nodeType == 1 && kid.nodeName != "A") {
      findTextNodes(kid, collected);
    }
  }
  return collected;
}
// Wrap every whole-word occurrence of a tag in a highlighted <a>, editing the
// DOM in place. Text already inside an <a> is never seen, because
// findTextNodes skips those subtrees.
var textNodes = findTextNodes(content);
// Build one case-insensitive, whole-word regex per tag. The backslash must be
// doubled: inside a string literal "\b" is a backspace character, not a
// word-boundary assertion.
var tags = ['zazie', 'johnny'], tagcount = tags.length, regexes = [], tag;
for (tag = 0; tag < tagcount; tag++) {
  regexes[tag] = new RegExp("\\b" + tags[tag] + "\\b", "i");
}
var node, match, index, tagtext, newnode;
while ((node = textNodes.shift())) {
  for (tag = 0; tag < tagcount; tag++) {
    if ((match = node.nodeValue.match(regexes[tag]))) {
      index = match.index;
      // Split off the text after the match and queue it for another pass,
      // since it may contain further occurrences.
      textNodes.unshift(node.splitText(index + match[0].length));
      // Split off the match itself; `node` keeps only the text before it.
      tagtext = node.splitText(index);
      newnode = document.createElement('a');
      newnode.href = "";
      newnode.className = "esk-seo-plu-link";
      newnode.style.cssText = "background:red;color:white";
      // Put the link where the matched text was, then move the text into it.
      tagtext.parentNode.replaceChild(newnode, tagtext);
      newnode.appendChild(tagtext);
    }
  }
}
// and done - no more action needed since it was in-place.
See it in action

Please replace . with \\.
// Variant of the question's loop with `.` replaced by `\\.` in both lookaheads.
// NOTE(review): `\\.*?` in the string becomes `\.*?` in the regex, which
// matches literal dots rather than arbitrary characters — confirm this is
// the intended behaviour.
var tags = ['zazie', 'johnny'];
tags.forEach(function(element) {
content = content.replace(
new RegExp("(?!<a.*?>\\.*?)(\\b" + element + "\\b)(?!\\.*?<\\/a>)", "igm"),
// '$1' re-inserts the captured tag unchanged.
'$1'
);
});

Simple javascript find and replace

is there a straightforward method for searching within a div for a specific string and replacing it with another? I cannot use .replaceWith alone because there are other elements within the div I need to preserve. I've tried various javascript methods found here to no avail.
So something like:
$('#foo').find('this string').replaceWith('this other string');
for:
<div id="foo"><div id="child">Other Element</div>this string</div>
Thanks.

Try this:
// Read the element's markup as a string, replace in the string, write it back.
var foo = $('#foo').html();
// A string pattern replaces only the first occurrence.
foo = foo.replace('this string', 'this other string');
// Writing the markup back re-creates every child element of #foo.
$('#foo').html(foo);
Fiddle: http://jsfiddle.net/maniator/w9GzF/

This replaces all occurrences:
// Same approach, but the /g flag replaces every occurrence in the markup.
var $foo = $('#foo'),
fooHtml = $foo.html();
$foo.html(fooHtml.replace(/this string/g, 'this other string'));

Just using html().replace() will also match text inside element attributes and tag names.
I face this issue also, my solution is similar to findAndReplace() function from http://james.padolsey.com/javascript/find-and-replace-text-with-javascript/ but using regular expression to get all textNode and search in each of them.
/**
 * Highlights every case-insensitive match of `query` in the page body.
 *
 * Works on the body's innerHTML string: only runs of characters that sit
 * between a '>' and the next '<' or '>' are searched, so tag names and
 * attributes are left alone. Each match is wrapped in a red
 * <span class="search-result">.
 *
 * @param {string} query - Pattern source, passed directly to RegExp.
 */
function epubSearch(query) {
  var bodyEl = document.getElementsByTagName("body")[0];
  var keyword = new RegExp(query, "gi");
  var textRun = /[>][^><]*[><]/gi;
  var wrapMatches = function (chunk) {
    return chunk.replace(keyword, "<span class=\"search-result\" style=\"background-color:red;\">$&</span>");
  };
  bodyEl.innerHTML = bodyEl.innerHTML.replace(textRun, wrapMatches);
}

Here's a jQuery plugin I just wrote that provides safeReplace for collections.
(function ($) {
  /**
   * Runs String.prototype.replace on every text node beneath each element of
   * the collection, leaving the element structure untouched.
   *
   * @param {RegExp|string} find - Pattern handed to String.prototype.replace.
   * @param {string|Function} replacement - Replacement handed to String.prototype.replace.
   * @returns {jQuery} The collection, for chaining.
   */
  $.fn.safeReplace = function (find, replacement) {
    return this.each(function (index, elem) {
      // Work queue: elements enqueue their children, text nodes are rewritten.
      var pending = [elem];
      while (pending.length) {
        var current = pending.shift();
        switch (current.nodeType) {
          case 1:
            // Children are queued last-to-first.
            for (var k = current.childNodes.length - 1; k >= 0; k--) {
              pending.push(current.childNodes[k]);
            }
            break;
          case 3:
            current.nodeValue = current.nodeValue.replace(find, replacement);
            break;
        }
      }
    });
  };
})(jQuery);
And here's how you use it:
$('#foo').safeReplace( /this string/g, 'something else' );
I've only tested in FF 4, and only on the sample HTML input - more testing is recommended.
Hope this helps!

What's wrong with String.replace();?
e.g.
$("#div").html($("#div").html().replace("search string", "replace string"));
Or Exploded:
var $divElement = $("#div"); //Find the div to perform replace on
var divContent = $divElement.html(); //Get the div's content
divContent = divContent.replace("search string", "replace string"); //Perform replace
$divElement.html(divContent); //Replace contents of div element.

This one works as many times as your term appears and will not kill any of the important things that shouldn't be changed (stored in the excludes array).
usage: findAndReplace('dog','cat', document.getElementById('content'));
/* js find andreplace Based on http://james.padolsey.com/javascript/find-and-replace-text-with-javascript/ */
/**
 * Replaces text in every text node beneath `searchNode`, without touching
 * tags or attributes.
 *
 * @param {string|RegExp} searchText - Text to find. A string is compiled to a
 *   global RegExp, so regex metacharacters in it are significant.
 * @param {string} replacement - Replacement text; it may contain HTML, which
 *   is parsed and inserted as nodes.
 * @param {Node} [searchNode=document.body] - Root of the subtree to process.
 * @returns {undefined} Nothing; the DOM is modified in place. Returns early
 *   when `searchText` is falsy or `replacement` is undefined.
 */
function findAndReplace(searchText, replacement, searchNode) {
  if (!searchText || typeof replacement === 'undefined') {
    return;
  }
  var regex = typeof searchText === 'string' ?
      new RegExp(searchText, 'g') : searchText,
    childNodes = (searchNode || document.body).childNodes,
    cnLength = childNodes.length,
    // Elements whose content must never be rewritten.
    excludes = ['html', 'head', 'style', 'link', 'meta', 'script', 'object', 'iframe'],
    currentNode,
    parent,
    wrap,
    frag;

  // Iterate backwards: replacing a text node changes the live child list, but
  // only at or after the current index.
  while (cnLength--) {
    currentNode = childNodes[cnLength];

    if (currentNode.nodeType === 1) {
      if (excludes.indexOf(currentNode.nodeName.toLowerCase()) === -1) {
        // Named recursion (arguments.callee throws in strict mode); pass the
        // compiled regex so it is not rebuilt at every level.
        findAndReplace(regex, replacement, currentNode);
      }
      continue;
    }
    if (currentNode.nodeType !== 3) {
      continue;
    }

    // A global/sticky regex remembers lastIndex between test() calls; reset it
    // so every text node is searched from its start.
    regex.lastIndex = 0;
    if (!regex.test(currentNode.data)) {
      continue;
    }
    regex.lastIndex = 0;

    // Parse the replaced text as HTML and swap the resulting nodes in.
    // Note: the whole text node is re-parsed, so any '<' or '&' already in
    // the text is interpreted as markup as well.
    wrap = document.createElement('div');
    frag = document.createDocumentFragment();
    wrap.innerHTML = currentNode.data.replace(regex, replacement);
    while (wrap.firstChild) {
      frag.appendChild(wrap.firstChild);
    }
    parent = currentNode.parentNode;
    parent.insertBefore(frag, currentNode);
    parent.removeChild(currentNode);
  }
}

Getting a jQuery selector for an element

In pseudocode, this is what I want.
var selector = $(this).cssSelectorAsString(); // Made up method...
// selector is now something like: "html>body>ul>li>img[3]"
var element = $(selector);
The reason is that I need to pass this off to an external environment, where a string is my only way to exchange data. This external environment then needs to send back a result, along with what element to update. So I need to be able to serialize a unique CSS selector for every element on the page.
I noticed jquery has a selector method, but it does not appear to work in this context. It only works if the object was created with a selector. It does not work if the object was created with an HTML node object.

I see now that a plugin existed (with the same name I thought of too), but here's just some quick JavaScript I wrote. It takes no consideration to the ids or classes of elements – only the structure (and adds :eq(x) where a node name is ambiguous).
/**
 * Builds a structural selector ("html>body>ul>li:eq(2)") for the single
 * element in this collection. Ids and classes are ignored; `:eq(n)` is added
 * wherever a tag name is ambiguous among its siblings.
 *
 * @returns {string|undefined} The selector path, or undefined when no named
 *   ancestor chain exists.
 * @throws {string} 'Requires one element.' unless the collection holds exactly one element.
 */
jQuery.fn.getPath = function () {
  if (this.length != 1) throw 'Requires one element.';

  var path, node = this;
  while (node.length) {
    // Use the element's tag name. `name` is a form-control attribute and is
    // undefined on most elements, which would end the walk immediately.
    var realNode = node[0], name = realNode.localName || realNode.tagName;
    // The document node has neither property: top of the tree reached.
    if (!name) break;
    name = name.toLowerCase();

    var parent = node.parent();
    var siblings = parent.children(name);
    if (siblings.length > 1) {
      name += ':eq(' + siblings.index(realNode) + ')';
    }

    path = name + (path ? '>' + path : '');
    node = parent;
  }

  return path;
};
(License: MIT)

TL;DR - this is a more complex problem than it seems and you should use a library.
This problem appears easy at the first glance, but it's trickier than it seems, just as replacing plain URLs with links is non-trivial. Some considerations:
Using descendant selectors vs. child selectors can lead to cases where the selector isn't unique.
Using :eq() limits the usefulness of the solution, as it will require jQuery
Using tag+nth-child selectors can result in unnecessarily long selectors
Not taking advantage of ids makes the selector less robust to changes in the page structure.
Further proof that the problem isn't as easy as it seems: there are 10+ libraries that generate CSS selectors, and the author of one of them has published this comparison.

jQuery-GetPath is a good starting point: it'll give you the item's ancestors, like this:
var path = $('#foo').getPath();
// e.g., "html > body > div#bar > ul#abc.def.ghi > li#foo"

Here's a version of Blixt's answer that works in IE:
/**
 * Builds a selector path for the single element in this collection, using
 * tag names, class names and `:eq(n)`. The walk stops at the first ancestor
 * (or the element itself) that has an id.
 *
 * @returns {string|undefined} The selector path.
 * @throws {string} 'Requires one element.' unless the collection holds exactly one element.
 */
jQuery.fn.getPath = function () {
  if (this.length != 1) throw 'Requires one element.';
  var path, node = this;
  while (node.length) {
    var realNode = node[0];
    var name = (
      // IE9 and non-IE
      realNode.localName ||
      // IE <= 8
      realNode.tagName ||
      realNode.nodeName
    );
    // on IE8, nodeName is '#document' at the top level, but we don't need that
    if (!name || name == '#document') break;
    name = name.toLowerCase();
    if (realNode.id) {
      // As soon as an id is found, there's no need to specify more.
      return name + '#' + realNode.id + (path ? '>' + path : '');
    }
    // className is not a string on SVG elements, and may carry leading or
    // trailing whitespace, which would otherwise yield empty ".." segments.
    var classes = typeof realNode.className == 'string' ?
      realNode.className.replace(/^\s+|\s+$/g, '') : '';
    if (classes) {
      name += '.' + classes.split(/\s+/).join('.');
    }
    var parent = node.parent(), siblings = parent.children(name);
    if (siblings.length > 1) name += ':eq(' + siblings.index(node) + ')';
    path = name + (path ? '>' + path : '');
    node = parent;
  }
  return path;
};

I just wanted to share my version too because it is very clear to understand. I tested this script in all common browsers and it is working like a boss.
/**
 * Builds a jQuery selector of the form "BODY DIV:eq(0) SPAN:eq(2)" for this
 * element by walking up to <body>.
 *
 * Each index is the element's position among all same-tag descendants of its
 * parent, which is how jQuery resolves `:eq()` after a descendant combinator.
 *
 * @returns {string} The selector, always starting with "BODY".
 */
jQuery.fn.getPath = function () {
  var current = $(this);
  var path = [];
  var realpath = "BODY";
  // Stop when the ancestor chain runs out as well: an element that is
  // detached, or sits outside <body>, would otherwise loop forever.
  while (current.length && current.prop("tagName") != "BODY") {
    var name = current.prop("tagName");
    var index = current.parent().find(name).index(current);
    var selector = " " + name + ":eq(" + index + ") ";
    path.push(selector);
    current = current.parent();
  }
  // Segments were collected innermost-first; emit them outermost-first.
  while (path.length != 0) {
    realpath += path.pop();
  }
  return realpath;
};

Same solution as the one from @Blixt, but compatible with multiple jQuery elements.
jQuery('.some-selector') can result in one or many DOM elements. Unfortunately, @Blixt's solution only works with a single element. My solution concatenates the paths of all of them with `,`.
If you want just handle the first element do it like this:
jQuery('.some-selector').first().getPath();
// or
jQuery('.some-selector:first').getPath();
Improved version
jQuery.fn.extend({
  /**
   * Builds a CSS selector path ("html > body > ul > li:nth-child(2)") for
   * every element in the collection.
   *
   * @returns {string} The paths of all elements joined with ','.
   */
  getPath: function () {
    var pathes = [];

    this.each(function (index, element) {
      var path, $node = jQuery(element);

      while ($node.length) {
        var realNode = $node.get(0), name = realNode.localName;
        // The document node has no localName: top of the tree reached.
        if (!name) { break; }

        name = name.toLowerCase();
        var parent = $node.parent();
        var sameTagSiblings = parent.children(name);

        if (sameTagSiblings.length > 1) {
          // :nth-child counts all siblings, not only those with the same tag.
          // Declared locally (it was an implicit global) and named so that it
          // does not shadow the callback's `index` parameter.
          var allSiblings = parent.children();
          var position = allSiblings.index(realNode) + 1;
          if (position > 0) {
            name += ':nth-child(' + position + ')';
          }
        }

        path = name + (path ? ' > ' + path : '');
        $node = parent;
      }

      pathes.push(path);
    });

    return pathes.join(',');
  }
});

If you are looking for a comprehensive, non-jQuery solution then you should try axe.utils.getSelector.

Following up on what alex wrote.
jQuery-GetPath is a great starting point but I have modified it a little to incorporate :eq(), allowing me to distinguish between multiple id-less elements.
Add this before the getPath return line:
// Fragment for insertion into jQuery-GetPath; `id` and `cur` come from the
// surrounding getPath function.
if (typeof id == 'undefined' && cur != 'body') {
  // Declared locally so that it does not leak as an implicit global.
  var allSiblings = $(this).parent().children(cur);
  // :eq() is zero-based, so the index is used as is, including for the first sibling.
  var index = allSiblings.index(this);
  cur += ':eq(' + index + ')';
}
This will return a path like "html > body > ul#hello > li.5:eq(1)"

Update: This code has changed since then. You can now find the implementation of the function in css-logic.js
Original answer:
You may also have a look at findCssSelector, which is used in Firefox developer tools to save the currently selected node upon page refreshes. It doesn't use jQuery or any library.
/**
 * Finds a CSS selector for the given element, preferring the shortest form
 * that matches exactly one element in its document: unique id, then
 * html/head/body, then class-based selectors, then a parent-relative
 * :nth-child path.
 *
 * Depends on getRootBindingParent and positionInNodeList, which are defined
 * outside this snippet.
 *
 * @param {Element} ele - Element to describe.
 * @returns {string} The selector found.
 * @throws {Error} If the element is not inside its ownerDocument.
 */
const findCssSelector = function(ele) {
ele = getRootBindingParent(ele);
let document = ele.ownerDocument;
if (!document || !document.contains(ele)) {
throw new Error("findCssSelector received element not inside document");
}
let cssEscape = ele.ownerGlobal.CSS.escape;
// document.querySelectorAll("#id") returns multiple if elements share an ID
if (ele.id &&
document.querySelectorAll("#" + cssEscape(ele.id)).length === 1) {
return "#" + cssEscape(ele.id);
}
// Inherently unique by tag name
let tagName = ele.localName;
if (tagName === "html") {
return "html";
}
if (tagName === "head") {
return "head";
}
if (tagName === "body") {
return "body";
}
// We might be able to find a unique class name
let selector, index, matches;
if (ele.classList.length > 0) {
// Each class is tried in three increasingly specific forms.
for (let i = 0; i < ele.classList.length; i++) {
// Is this className unique by itself?
selector = "." + cssEscape(ele.classList.item(i));
matches = document.querySelectorAll(selector);
if (matches.length === 1) {
return selector;
}
// Maybe it's unique with a tag name?
selector = cssEscape(tagName) + selector;
matches = document.querySelectorAll(selector);
if (matches.length === 1) {
return selector;
}
// Maybe it's unique using a tag name and nth-child
index = positionInNodeList(ele, ele.parentNode.children) + 1;
selector = selector + ":nth-child(" + index + ")";
matches = document.querySelectorAll(selector);
if (matches.length === 1) {
return selector;
}
}
}
// Not unique enough yet. As long as it's not a child of the document,
// continue recursing up until it is unique enough.
if (ele.parentNode !== document) {
index = positionInNodeList(ele, ele.parentNode.children) + 1;
selector = findCssSelector(ele.parentNode) + " > " +
cssEscape(tagName) + ":nth-child(" + index + ")";
}
// If the parent is the document itself, this is whatever the class loop last
// assigned (or undefined when the element has no classes).
return selector;
};

/**
 * Builds a selector of the form "#ancestorId .class1.class2" for this element.
 *
 * Uses the id of the nearest ancestor that has a non-empty id, and all of the
 * element's own classes. The id part is omitted when no such ancestor exists,
 * and the lower-cased tag name replaces the class part when there are no classes.
 *
 * @returns {string} The selector.
 */
$.fn.getSelector = function () {
  var $ele = $(this);
  // '[id!=""]' alone also matches ancestors with no id attribute at all, which
  // produced "#undefined"; requiring [id] as well excludes them.
  var ancestorId = $ele.parents('[id][id!=""]').first().attr('id');
  // A class attribute may hold several names: join them as ".a.b".
  var classNames = ($ele.attr('class') || '').replace(/^\s+|\s+$/g, '');
  var own = classNames ?
    '.' + classNames.split(/\s+/).join('.') :
    String($ele.prop('tagName') || '*').toLowerCase();
  return (ancestorId ? '#' + ancestorId + ' ' : '') + own;
};

Pretty printing XML with javascript

I have a string that represents a non indented XML that I would like to pretty-print. For example:
<root><node/></root>
should become:
<root>
<node/>
</root>
Syntax highlighting is not a requirement. To tackle the problem I first transform the XML to add carriage returns and white spaces and then use a pre tag to output the XML. To add new lines and white spaces I wrote the following function:
/**
 * Pretty-prints an XML string: puts each tag on its own line (CRLF line
 * endings) and indents nested elements by one space per level.
 *
 * @param {string} xml - Unindented XML markup.
 * @returns {string} The formatted markup; every line, including the last, ends with "\r\n".
 */
function formatXml(xml) {
  var output = '';
  var depth = 0;
  // Break between adjacent tags: "><" becomes ">\r\n<".
  var lines = xml.replace(/(>)(<)(\/*)/g, '$1\r\n$2$3').split('\r\n');

  jQuery.each(lines, function (lineNo, line) {
    var delta = 0;
    if (line.match( /.+<\/\w[^>]*>$/ )) {
      // Content followed by its closing tag on one line: depth unchanged.
    } else if (line.match( /^<\/\w/ )) {
      // Closing tag: outdent before printing (never below zero).
      if (depth != 0) {
        depth -= 1;
      }
    } else if (line.match( /^<\w[^>]*[^\/]>.*$/ )) {
      // Opening tag that is not self-closing: indent what follows.
      delta = 1;
    }

    var padding = '';
    while (padding.length < depth) {
      padding += ' ';
    }
    output += padding + line + '\r\n';
    depth += delta;
  });

  return output;
}
I then call the function like this:
jQuery('pre.formatted-xml').text(formatXml('<root><node1/></root>'));
This works perfectly fine for me but while I was writing the previous function I thought that there must be a better way. So my question is do you know of any better way given an XML string to pretty-print it in an html page? Any javascript frameworks and/or plugins that could do the job are welcome. My only requirement is this to be done on the client side.

This can be done using native JavaScript tools, without third-party libraries, by extending @Dimitre Novatchev's answer:
/**
 * Pretty-prints an XML string by running it through an XSLT identity
 * transform with <xsl:output indent="yes"/>.
 *
 * @param {string} sourceXml - XML markup to format.
 * @returns {string} The serialized result of the transformation.
 */
var prettifyXml = function(sourceXml)
{
  var xmlDoc = new DOMParser().parseFromString(sourceXml, 'application/xml');
  var xsltDoc = new DOMParser().parseFromString([
    // describes how we want to modify the XML - indent everything
    '<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform">',
    ' <xsl:strip-space elements="*"/>',
    ' <xsl:template match="para[content-style][not(text())]">', // change to just text() to strip space in text nodes
    ' <xsl:value-of select="normalize-space(.)"/>',
    ' </xsl:template>',
    // Identity template: "@*" selects attributes ("#*" is not valid XPath).
    ' <xsl:template match="node()|@*">',
    ' <xsl:copy><xsl:apply-templates select="node()|@*"/></xsl:copy>',
    ' </xsl:template>',
    ' <xsl:output indent="yes"/>',
    '</xsl:stylesheet>',
  ].join('\n'), 'application/xml');

  var xsltProcessor = new XSLTProcessor();
  xsltProcessor.importStylesheet(xsltDoc);
  var resultDoc = xsltProcessor.transformToDocument(xmlDoc);
  var resultXml = new XMLSerializer().serializeToString(resultDoc);
  return resultXml;
};
console.log(prettifyXml('<root><node/></root>'));
Outputs:
<root>
<node/>
</root>
JSFiddle
Note: as pointed out by @jat255, pretty printing with <xsl:output indent="yes"/> is not supported by Firefox. It only seems to work in Chrome, Opera and probably the other WebKit-based browsers.

From the text of the question I get the impression that a string result is expected, as opposed to an HTML-formatted result.
If this is so, the simplest way to achieve this is to process the XML document with the identity transformation and with an <xsl:output indent="yes"/> instruction:
<xsl:stylesheet version="1.0"
 xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
 <xsl:output omit-xml-declaration="yes" indent="yes"/>
 <!-- Identity transform: copy every node and attribute ("@*") unchanged. -->
 <xsl:template match="node()|@*">
  <xsl:copy>
   <xsl:apply-templates select="node()|@*"/>
  </xsl:copy>
 </xsl:template>
</xsl:stylesheet>
When applying this transformation on the provided XML document:
<root><node/></root>
most XSLT processors (.NET XslCompiledTransform, Saxon 6.5.4 and Saxon 9.0.0.2, AltovaXML) produce the wanted result:
<root>
<node />
</root>

Found this thread when I had a similar requirement but I simplified OP's code as follows:
/**
 * Pretty-prints an XML string, one tag per line (CRLF line endings).
 *
 * @param {string} xml - XML markup starting with '<' and ending with '>'.
 * @param {string} [tab] - Indent unit for one nesting level; defaults to a tab character.
 * @returns {string} The formatted markup, without a trailing line break.
 */
function formatXml(xml, tab) {
  var unit = tab || '\t';
  var pieces = xml.split(/>\s*</);
  var out = '';
  var prefix = '';
  for (var k = 0; k < pieces.length; k += 1) {
    var piece = pieces[k];
    // Closing tag: drop one indent unit before printing.
    if (/^\/\w/.test(piece)) {
      prefix = prefix.substring(unit.length);
    }
    out += prefix + '<' + piece + '>\r\n';
    // Opening tag that is not self-closing: add one indent unit.
    if (/^<?\w[^>]*[^\/]$/.test(piece)) {
      prefix += unit;
    }
  }
  // The split left the first '<' and the last '>' in place, so both were
  // doubled; strip the extra '<' and the trailing '>\r\n'.
  return out.substring(1, out.length - 3);
}
works for me!

Slight modification of efnx clckclcks's javascript function. I changed the formatting from spaces to tab, but most importantly I allowed text to remain on one line:
/**
 * Pretty-prints an XML string with tab indentation, keeping an element's
 * text content on the same line as its tags.
 *
 * Each line is classified as a single (self-closing) tag, a closing tag, an
 * opening tag, or other (text, doctype, comment). The indent change depends
 * on the transition from the previous line's type to the current one.
 *
 * @param {string} xml - XML markup to format.
 * @returns {string} The formatted markup; every line ends with "\n".
 */
var formatXml = function (xml) {
  var reg = /(>)\s*(<)(\/*)/g; // updated Mar 30, 2015
  var wsexp = / *(.*) +\n/g;
  var contexp = /(<.+>)(.+\n)/g;
  // 4 types of tags - single, closing, opening, other (text, doctype, comment) - 4*4 = 16 transitions
  var transitions = {
    'single->single': 0,
    'single->closing': -1,
    'single->opening': 0,
    'single->other': 0,
    'closing->single': 0,
    'closing->closing': -1,
    'closing->opening': 0,
    'closing->other': 0,
    'opening->single': 1,
    'opening->closing': 0,
    'opening->opening': 1,
    'opening->other': 1,
    'other->single': 0,
    'other->closing': -1,
    'other->opening': 0,
    'other->other': 0
  };
  // One tag per line, trailing spaces trimmed, content split from its opening tag.
  var lines = xml.replace(reg, '$1\n$2$3').replace(wsexp, '$1\n').replace(contexp, '$1\n$2').split('\n');
  var formatted = '';
  var indent = 0;
  var lastType = 'other';

  for (var i = 0; i < lines.length; i++) {
    var ln = lines[i];

    // Luca Viggiani 2017-07-03: handle optional <?xml ... ?> declaration
    if (ln.match(/\s*<\?xml/)) {
      formatted += ln + "\n";
      continue;
    }

    var single = Boolean(ln.match(/<.+\/>/)); // is this line a single tag? ex. <br />
    var closing = Boolean(ln.match(/<\/.+>/)); // is this a closing tag? ex. </a>
    var opening = Boolean(ln.match(/<[^!].*>/)); // is this even a tag (that's not <!something>)
    var type = single ? 'single' : closing ? 'closing' : opening ? 'opening' : 'other';
    var fromTo = lastType + '->' + type;
    lastType = type;

    indent += transitions[fromTo];
    var padding = '';
    for (var j = 0; j < indent; j++) {
      padding += '\t';
    }

    if (fromTo == 'opening->closing') {
      // Text and its closing tag go back onto the opening tag's line:
      // drop the previous line break and append without padding.
      formatted = formatted.slice(0, -1) + ln + '\n';
    } else {
      formatted += padding + ln + '\n';
    }
  }

  return formatted;
};
// Also expose the function on `this` (the global object in a classic script).
// Guarded because `this` is undefined in ES modules and strict-mode functions,
// where an unguarded assignment throws.
if (typeof this !== 'undefined' && this !== null) {
  this.formatXml = formatXml;
}

Personally, I use google-code-prettify with this function:
prettyPrintOne('<root><node1><root>', 'xml')

Or if you'd just like another js function to do it, I've modified Darin's (a lot):
/**
 * Pretty-prints an XML string, one tag or text run per line, indented with
 * one space per nesting level.
 *
 * Each line is classified as a single (self-closing) tag, a closing tag, an
 * opening tag, or other (text, doctype, comment). The indent change depends
 * on the transition from the previous line's type to the current one.
 *
 * @param {string} xml - XML markup to format.
 * @returns {string} The formatted markup; every line ends with "\n".
 */
var formatXml = function (xml) {
  // \s* lets whitespace (including line breaks) sit between '>' and '<'.
  var reg = /(>)\s*(<)(\/*)/g;
  var wsexp = / *(.*) +\n/g;
  var contexp = /(<.+>)(.+\n)/g;
  // 4 types of tags - single, closing, opening, other (text, doctype, comment) - 4*4 = 16 transitions
  var transitions = {
    'single->single' : 0,
    'single->closing' : -1,
    'single->opening' : 0,
    'single->other' : 0,
    'closing->single' : 0,
    'closing->closing' : -1,
    'closing->opening' : 0,
    'closing->other' : 0,
    'opening->single' : 1,
    'opening->closing' : 0,
    'opening->opening' : 1,
    'opening->other' : 1,
    'other->single' : 0,
    'other->closing' : -1,
    'other->opening' : 0,
    'other->other' : 0
  };
  // One tag per line, trailing spaces trimmed, content split from its opening tag.
  var lines = xml.replace(reg, '$1\n$2$3').replace(wsexp, '$1\n').replace(contexp, '$1\n$2').split('\n');
  var formatted = '';
  var indent = 0;
  var lastType = 'other';

  for (var i = 0; i < lines.length; i++) {
    var ln = lines[i];
    var single = Boolean(ln.match(/<.+\/>/)); // is this line a single tag? ex. <br />
    var closing = Boolean(ln.match(/<\/.+>/)); // is this a closing tag? ex. </a>
    var opening = Boolean(ln.match(/<[^!].*>/)); // is this even a tag (that's not <!something>)
    var type = single ? 'single' : closing ? 'closing' : opening ? 'opening' : 'other';
    var fromTo = lastType + '->' + type;
    lastType = type;

    indent += transitions[fromTo];
    var padding = '';
    for (var j = 0; j < indent; j++) {
      padding += ' ';
    }
    formatted += padding + ln + '\n';
  }

  return formatted;
};
// Also expose the function on `this` (the global object in a classic script).
// Guarded because `this` is undefined in ES modules and strict-mode functions,
// where an unguarded assignment throws.
if (typeof this !== 'undefined' && this !== null) {
  this.formatXml = formatXml;
}

All of the javascript functions given here won't work for an xml document having unspecified white spaces between the end tag '>' and the start tag '<'. To fix them, you just need to replace the first line in the functions
var reg = /(>)(<)(\/*)/g;
by
var reg = /(>)\s*(<)(\/*)/g;

what about creating a stub node (document.createElement('div') - or using your library equivalent), filling it with the xml string (via innerHTML) and calling simple recursive function for the root element/or the stub element in case you don't have a root. The function would call itself for all the child nodes.
You could then syntax-highlight along the way, be certain the markup is well-formed (done automatically by browser when appending via innerHTML) etc. It wouldn't be that much code and probably fast enough.

If you are looking for a JavaScript solution just take the code from the Pretty Diff tool at http://prettydiff.com/?m=beautify
You can also send files to the tool using the s parameter, such as:
http://prettydiff.com/?m=beautify&s=https://stackoverflow.com/

You can get pretty formatted xml with xml-beautify
var prettyXmlText = new XmlBeautify().beautify(xmlText,
{indent: " ",useSelfClosingElement: true});
indent:indent pattern like white spaces
useSelfClosingElement: true=>use self-closing element when empty element.
JSFiddle
Original(Before)
<?xml version="1.0" encoding="utf-8"?><example version="2.0">
<head><title>Original aTitle</title></head>
<body info="none" ></body>
</example>
Beautified(After)
<?xml version="1.0" encoding="utf-8"?>
<example version="2.0">
<head>
<title>Original aTitle</title>
</head>
<body info="none" />
</example>

For a current project I had the need to prettify and colorize XML without extra libraries. The following self contained code works quite well.
/**
 * Formats an XML string as HTML markup: one <div class="xtb"> per tag, each
 * holding <div class="xtc"> cells for the indent and for the tag itself.
 *
 * The regular expressions use lookbehind and the `s` flag, so an engine that
 * supports both is required.
 *
 * @param {string} xml - XML markup to format.
 * @param {*} colorize - When falsy, the xmt/xel/xdt/xat wrappers and their
 *   closing tag are replaced by empty strings, so no colouring markup is emitted.
 * @param {string} [indent] - Indent unit for one nesting level; a built-in default is used when falsy.
 * @returns {string} The generated HTML.
 */
function formatXml(xml,colorize,indent) {
function esc(s){return s.replace(/[-\/&<> ]/g,function(c){ // Escape special chars
return c==' '?' ':'&#'+c.charCodeAt(0)+';';});}
// Opening wrappers: sm = markup, se = element name, sd = data/value,
// sa = attribute name, tb/tc = row and cell; sz/tz are the closing tags.
var sm='<div class="xmt">',se='<div class="xel">',sd='<div class="xdt">',
sa='<div class="xat">',tb='<div class="xtb">',tc='<div class="xtc">',
ind=indent||' ',sz='</div>',tz='</div>',re='',is='',ib,ob,at,i;
if (!colorize) sm=se=sd=sa=sz='';
// Take everything between the first '<' and the last '>', then split it into tags.
xml.match(/(?<=<).*(?=>)|$/s)[0].split(/>\s*</).forEach(function(nd){
ob=('<'+nd+'>').match(/^(<[!?\/]?)(.*?)([?\/]?>)$/s); // Split outer brackets
ib=ob[2].match(/^(.*?)>(.*)<\/(.*)$/s)||['',ob[2],'']; // Split inner brackets
at=ib[1].match(/^--.*--$|=|('|").*?\1|[^\t\n\f \/>"'=]+/g)||['']; // Split attributes
if (ob[1]=='</') is=is.substring(ind.length); // Decrease indent
re+=tb+tc+esc(is)+tz+tc+sm+esc(ob[1])+sz+se+esc(at[0])+sz;
for (i=1;i<at.length;i++) re+=(at[i]=="="?sm+"="+sz+sd+esc(at[++i]):sa+' '+at[i])+sz;
re+=ib[2]?sm+esc('>')+sz+sd+esc(ib[2])+sz+sm+esc('</')+sz+se+ib[3]+sz:'';
re+=sm+esc(ob[3])+sz+tz+tz;
if (ob[1]+ob[3]+ib[2]=='<>') is+=ind; // Increase indent
});
return re;
}
See https://jsfiddle.net/dkb0La16/

Or just print out the special HTML characters?
Ex: &lt;xmlstuff&gt;&#10;&#09;&lt;node /&gt;&#10;&lt;/xmlstuff&gt;
&#09; is the horizontal tab
&#10; is the line feed

XMLSpectrum formats XML, supports attribute indentation and also does syntax-highlighting for XML and any embedded XPath expressions:
XMLSpectrum is an open source project, coded in XSLT 2.0 - so you can run this server-side with a processor such as Saxon-HE (recommended) or client-side using Saxon-CE.
XMLSpectrum is not yet optimised to run in the browser - hence the recommendation to run this server-side.

here is another function to format xml
/**
 * Formats an XML string by scanning it character by character, inserting a
 * line break and indentation around tags.
 *
 * @param {string} xml - XML markup to format.
 * @returns {string} The formatted markup.
 */
function formatXml(xml){
  var tab = " ";
  var result = "";
  var depth = 0;
  var closing = false; // true while inside a "</...>" tag

  // Starts a new line indented by `level` tab units (none when level <= 0).
  function newline(level) {
    result += "\n";
    var k = 0;
    while (k < level) {
      result += tab;
      k++;
    }
  }

  for (var pos = 0; pos < xml.length; pos++) {
    var ch = xml.charAt(pos);
    switch (ch) {
      case '<':
        // handle </
        if (xml.charAt(pos + 1) == '/') {
          closing = true;
          depth -= 1;
          newline(depth);
        }
        result += ch;
        break;
      case '>':
        result += ch;
        if (xml.charAt(pos - 1) == '/') {
          // handle />
          result += "\n";
        } else if (closing) {
          result += "\n";
          closing = false;
        } else {
          depth += 1;
          newline(depth);
        }
        break;
      default:
        result += ch;
    }
  }
  return result;
}

Xml formatting can be done by parsing the xml, adding or changing text nodes in the dom tree for indentation and then serializing the DOM back to xml.
Please check formatxml function in https://jsonbrowser.sourceforge.io/formatxml.js
You can see the function in action in https://jsonbrowser.sourceforge.io/
under the Xml tab.
Below is the simplified code.
formatxml.js adds error checking, optional removal of comments, indent as a parameter and handles non-space text between parent nodes.
// Shared browser DOM helpers: one parser and one serializer reused across calls.
const parser = new DOMParser();
const serializer = new XMLSerializer();

/**
 * Re-indents an XML string by parsing it into a DOM document, rewriting the
 * text nodes between elements via indentChildren, and serializing the
 * document back to text.
 *
 * @param {string} xml - XML markup to format.
 * @returns {string} The serialized, re-indented document.
 */
function formatXml(xml) {
  const doc = parser.parseFromString(xml, 'application/xml');
  // The root's children start one level deep; the root's closing tag sits at level zero.
  indentChildren(doc, doc.documentElement, "\n", "\n ");
  return serializer.serializeToString(doc);
}
/**
 * Places an indentation text node in front of every non-text child of `node`
 * and recurses into child elements that themselves contain element structure.
 *
 * A text node that directly precedes a non-text child is overwritten with the
 * indentation string; otherwise a new text node is inserted. After the last
 * child, the parent's own indentation is written so the closing tag lines up.
 *
 * @param {Document} xmlDoc - Document used to create new text nodes.
 * @param {Node} node - Parent whose children are re-indented in place.
 * @param {string} prevPrefix - Indentation written after the last child.
 * @param {string} prefix - Indentation written before each non-text child.
 */
function indentChildren(xmlDoc, node, prevPrefix, prefix) {
  const TEXT = 3;
  const ELEMENT = 1;
  const siblings = node.childNodes;
  let previous = null;
  let current = null;
  let index = 0;

  while (index < siblings.length) {
    current = siblings[index];
    const kind = current.nodeType;

    if (kind != TEXT) {
      if (previous !== null && previous.nodeType == TEXT) {
        // Reuse the text node that already sits in front of this child.
        previous.nodeValue = prefix;
      } else {
        node.insertBefore(xmlDoc.createTextNode(prefix), current);
        // The insertion shifted `current` one slot to the right.
        index += 1;
      }

      if (kind == ELEMENT) {
        const inner = current.childNodes;
        // A leaf is an element that is empty or holds a single non-element child.
        const isLeaf = inner.length == 0 ||
          (inner.length == 1 && inner[0].nodeType != ELEMENT);
        if (!isLeaf) {
          indentChildren(xmlDoc, current, prefix, prefix + " ");
        }
      }
    }

    previous = current;
    index += 1;
  }

  if (current === null) {
    return;
  }
  // Parent-level indentation after the last child.
  if (current.nodeType == TEXT) {
    current.nodeValue = prevPrefix;
  } else {
    node.append(xmlDoc.createTextNode(prevPrefix));
  }
}
Reference: https://www.w3schools.com/XML/dom_intro.asp

var formatXml = this.formatXml = function (xml) {
var reg = /(>)(<)(\/*)/g;
var wsexp = / *(.*) +\n/g;
var contexp = /(<.+>)(.+\n)/g;
xml = xml.replace(reg, '$1\n$2$3').replace(wsexp, '$1\n').replace(contexp, '$1\n$2');
var pad = 0;
var formatted = '';
var lines = xml.split('\n');
var indent = 0;
var lastType = 'other';

// Matches a '>' and the next '<' (plus any '/' characters after it), allowing
// whitespace in between, so adjacent tags can be split onto separate lines.
var reg = /(>)\s*(<)(\/*)/g;
xml = xml.replace(/\r|\n/g, ''); //deleting already existing whitespaces
// Put a CRLF between every pair of adjacent tags; the whitespace matched by
// \s* is not captured and is therefore dropped.
xml = xml.replace(reg, '$1\r\n$2$3');

Use the method above to pretty-print the XML, then insert the result into a div using jQuery's text() method. For example, if the id of the div is xmldiv, use:
$("#xmldiv").text(formatXml(youXmlString));

You could also use Saxon-JS client-side:
<script src="SaxonJS/SaxonJS2.js"></script>
<script>
// XML text to pretty-print.
const myXML = `<root><node/></root>`;
SaxonJS.getResource({
  // Strips the first xml:space="preserve" attribute from the text before parsing
  // (presumably so the serializer is free to indent - confirm against Saxon-JS docs).
  text: myXML.replace(`xml:space="preserve"`, ''),
  type: "xml"
}).then(doc => {
  // Serialize the parsed document with indentation and without an XML declaration.
  const output = SaxonJS.serialize(doc, {method: "xml", indent: true, "omit-xml-declaration": true});
  console.log(output);
}).catch(err => {
  // Report parse/serialize failures instead of leaving the rejection unhandled.
  console.error("Could not format XML:", err);
});
</script>
Saxon-JS Installation client-side
Saxon-JS Download page

This may involve creating nodes as objects, but you can have total control over exporting pretty formatted xml.
The following will return a string array of the lines which you can join with a new line delimiter "\n".
/**
* The child of an XML node can be raw text or another xml node.
*/
export type PossibleNode = XmlNode | string;
/**
* Base XML Node type.
*/
export interface XmlNode {
// Element name.
tag: string;
// Optional attributes as a name -> value map.
attrs?: { [key: string]: string };
// Optional child nodes: nested elements and/or raw text.
children?: PossibleNode[];
}
/**
 * Renders an XML node (and, recursively, its children) as a list of lines.
 *
 * The tag opener is emitted first, then one line per attribute. A node whose
 * only child is a string keeps that text (and the closing tag) on the last
 * emitted line; any other child list is emitted line by line. A node without
 * a `children` property ends with `/>` (or `>` when `autoClose` is false).
 *
 * @param node XML Node
 * @param autoClose Auto close the tag
 * @param indent Indentation level
 * @returns String array
 */
export function xmlNodeToString(
  node: XmlNode,
  autoClose: boolean = true,
  indent: number = 0
): string[] {
  const pad = " ".repeat(indent);
  const out: string[] = [`${pad}<${node.tag}`];

  const attrs = node.attrs;
  if (attrs) {
    for (const name in attrs) {
      out.push(`${pad} ${name}="${attrs[name]}"`);
    }
  }

  const kids = node.children;
  if (!kids) {
    // No children property at all: terminate the tag on its own line.
    out.push(autoClose ? `${pad}/>` : `${pad}>`);
    return out;
  }

  if (kids.length === 1 && typeof kids[0] === "string") {
    // Single text child stays inline with the last emitted line.
    out[out.length - 1] += ">" + kids[0];
    if (autoClose) {
      out[out.length - 1] += `</${node.tag}>`;
    }
    return out;
  }

  out.push(`${pad}>`);
  for (const kid of kids) {
    if (typeof kid === "string") {
      out.push(`${pad} ${kid}`);
      continue;
    }
    for (const line of xmlNodeToString(kid, autoClose, indent + 1)) {
      out.push(`${pad} ${line}`);
    }
  }
  if (autoClose) {
    out.push(`${pad}</${node.tag}>`);
  }
  return out;
}
Updates appreciated on the gist: https://gist.github.com/rodydavis/acd609560ab0416b60681fddabc43eee

Xml-to-json library has method formatXml(xml). I am the maintainer of the project.
var prettyXml = formatXml("<a><b/></a>");
// <a>
// <b/>
// </a>

This is my version, using StringBuilder; maybe it is useful for others.
I saw that someone else had the same piece of code.
/// <summary>
/// Pretty-prints an XML string by placing each tag on its own line, indented
/// with <paramref name="tab"/> repeated once per nesting level.
/// </summary>
/// <param name="xml">XML markup to format; surrounding whitespace is ignored.</param>
/// <param name="tab">String used for one level of indentation.</param>
/// <returns>The formatted markup without a trailing newline.</returns>
public String FormatXml(String xml, String tab)
{
    var sb = new StringBuilder();
    int indent = 0;
    // Split between adjacent tags; this strips the '>' and '<' at every split
    // point, so each piece is re-wrapped in '<' ... '>' below.
    foreach (string node in Regex.Split(xml.Trim(), @">\s*<"))
    {
        // Closing tag: step back one level, but never below zero
        // (Enumerable.Repeat throws on a negative count).
        if (Regex.IsMatch(node, @"^\/\w")) indent = Math.Max(indent - 1, 0);
        sb.AppendLine(String.Format("{0}<{1}>", string.Concat(Enumerable.Repeat(tab, indent).ToArray()), node));
        // Opening tag that is not self-closing: step in one level. The tail
        // group is optional so single-letter element names also match.
        if (Regex.IsMatch(node, @"^<?\w([^>]*[^\/])?$")) indent++;
    }
    // The first piece still had its own '<' and the last its own '>', so the
    // re-wrapping doubled them; drop the extra '<' here and the extra '>' plus
    // the final line break below.
    String result = sb.ToString().Substring(1);
    return result.Remove(result.Length - Environment.NewLine.Length - 1);
}

We Keep Coding

JavaScript is the programming language of the Web.

Javascript RegEx - Split Html-string - javascript

Related

JavaScript - Extract specific nodes along with the node start position

Javascript replace function error

Simple javascript find and replace

Getting a jQuery selector for an element

Pretty printing XML with javascript

Categories

Resources