JavaScript getting innerHtml for all text nodes in nested elements [duplicate] - javascript

<div class="title">
I am text node
<a class="edit">Edit</a>
</div>
I wish to get the "I am text node", do not wish to remove the "edit" tag, and need a cross browser solution.

var text = $(".title").contents().filter(function() {
return this.nodeType == Node.TEXT_NODE;
}).text();
This gets the contents of the selected element, and applies a filter function to it. The filter function returns only text nodes (i.e. those nodes with nodeType == Node.TEXT_NODE).

You can get the nodeValue of the first childNode using
$('.title')[0].childNodes[0].nodeValue
http://jsfiddle.net/TU4FB/

Another native JS solution that can be useful for "complex" or deeply nested elements is to use NodeIterator. Put NodeFilter.SHOW_TEXT as the second argument ("whatToShow"), and iterate over just the text node children of the element.
var root = document.querySelector('p'),
iter = document.createNodeIterator(root, NodeFilter.SHOW_TEXT),
textnode;
// print all text nodes
while (textnode = iter.nextNode()) {
console.log(textnode.textContent)
}
<p>
<br>some text<br>123
</p>
You can also use TreeWalker. The difference between the two is that NodeIterator is a simple linear iterator, while TreeWalker allows you to navigate via siblings and ancestors as well.

ES6 version that return the first #text node content
const extract = (node) => {
const text = [...node.childNodes].find(child => child.nodeType === Node.TEXT_NODE);
return text && text.textContent.trim();
}

If you mean get the value of the first text node in the element, this code will work:
var oDiv = document.getElementById("MyDiv");
var firstText = "";
for (var i = 0; i < oDiv.childNodes.length; i++) {
var curNode = oDiv.childNodes[i];
if (curNode.nodeName === "#text") {
firstText = curNode.nodeValue;
break;
}
}
You can see this in action here: http://jsfiddle.net/ZkjZJ/

Pure JavaScript: Minimalist
First off, always keep this in mind when looking for text in the DOM.
MDN - Whitespace in the DOM
This issue will make you pay attention to the structure of your XML / HTML.
In this pure JavaScript example, I account for the possibility of multiple text nodes that could be interleaved with other kinds of nodes. However, initially, I do not pass judgment on whitespace, leaving that filtering task to other code.
In this version, I pass a NodeList in from the calling / client code.
/**
* Gets strings from text nodes. Minimalist. Non-robust. Pre-test loop version.
* Generic, cross platform solution. No string filtering or conditioning.
*
* #author Anthony Rutledge
* #param nodeList The child nodes of a Node, as in node.childNodes.
* #param target A positive whole number >= 1
* #return String The text you targeted.
*/
function getText(nodeList, target)
{
var trueTarget = target - 1,
length = nodeList.length; // Because you may have many child nodes.
for (var i = 0; i < length; i++) {
if ((nodeList[i].nodeType === Node.TEXT_NODE) && (i === trueTarget)) {
return nodeList[i].nodeValue; // Done! No need to keep going.
}
}
return null;
}
Of course, by testing node.hasChildNodes() first, there would be no need to use a pre-test for loop.
/**
* Gets strings from text nodes. Minimalist. Non-robust. Post-test loop version.
* Generic, cross platform solution. No string filtering or conditioning.
*
* #author Anthony Rutledge
* #param nodeList The child nodes of a Node, as in node.childNodes.
* #param target A positive whole number >= 1
* #return String The text you targeted.
*/
function getText(nodeList, target)
{
var trueTarget = target - 1,
length = nodeList.length,
i = 0;
do {
if ((nodeList[i].nodeType === Node.TEXT_NODE) && (i === trueTarget)) {
return nodeList[i].nodeValue; // Done! No need to keep going.
}
i++;
} while (i < length);
return null;
}
Pure JavaScript: Robust
Here the function getTextById() uses two helper functions: getStringsFromChildren() and filterWhitespaceLines().
getStringsFromChildren()
/**
* Collects strings from child text nodes.
* Generic, cross platform solution. No string filtering or conditioning.
*
* #author Anthony Rutledge
* #version 7.0
* #param parentNode An instance of the Node interface, such as an Element. object.
* #return Array of strings, or null.
* #throws TypeError if the parentNode is not a Node object.
*/
function getStringsFromChildren(parentNode)
{
var strings = [],
nodeList,
length,
i = 0;
if (!parentNode instanceof Node) {
throw new TypeError("The parentNode parameter expects an instance of a Node.");
}
if (!parentNode.hasChildNodes()) {
return null; // We are done. Node may resemble <element></element>
}
nodeList = parentNode.childNodes;
length = nodeList.length;
do {
if ((nodeList[i].nodeType === Node.TEXT_NODE)) {
strings.push(nodeList[i].nodeValue);
}
i++;
} while (i < length);
if (strings.length > 0) {
return strings;
}
return null;
}
filterWhitespaceLines()
/**
* Filters an array of strings to remove whitespace lines.
* Generic, cross platform solution.
*
* #author Anthony Rutledge
* #version 6.0
* #param textArray a String associated with the id attribute of an Element.
* #return Array of strings that are not lines of whitespace, or null.
* #throws TypeError if the textArray param is not of type Array.
*/
function filterWhitespaceLines(textArray)
{
var filteredArray = [],
whitespaceLine = /(?:^\s+$)/; // Non-capturing Regular Expression.
if (!textArray instanceof Array) {
throw new TypeError("The textArray parameter expects an instance of a Array.");
}
for (var i = 0; i < textArray.length; i++) {
if (!whitespaceLine.test(textArray[i])) { // If it is not a line of whitespace.
filteredArray.push(textArray[i].trim()); // Trimming here is fine.
}
}
if (filteredArray.length > 0) {
return filteredArray ; // Leave selecting and joining strings for a specific implementation.
}
return null; // No text to return.
}
getTextById()
/**
* Gets strings from text nodes. Robust.
* Generic, cross platform solution.
*
* #author Anthony Rutledge
* #version 6.0
* #param id A String associated with the id property of an Element.
* #return Array of strings, or null.
* #throws TypeError if the id param is not of type String.
* #throws TypeError if the id param cannot be used to find a node by id.
*/
function getTextById(id)
{
var textArray = null; // The hopeful output.
var idDatatype = typeof id; // Only used in an TypeError message.
var node; // The parent node being examined.
try {
if (idDatatype !== "string") {
throw new TypeError("The id argument must be of type String! Got " + idDatatype);
}
node = document.getElementById(id);
if (node === null) {
throw new TypeError("No element found with the id: " + id);
}
textArray = getStringsFromChildren(node);
if (textArray === null) {
return null; // No text nodes found. Example: <element></element>
}
textArray = filterWhitespaceLines(textArray);
if (textArray.length > 0) {
return textArray; // Leave selecting and joining strings for a specific implementation.
}
} catch (e) {
console.log(e.message);
}
return null; // No text to return.
}
Next, the return value (Array, or null) is sent to the client code where it should be handled. Hopefully, the array should have string elements of real text, not lines of whitespace.
Empty strings ("") are not returned because you need a text node to properly indicate the presence of valid text. Returning ("") may give the false impression that a text node exists, leading someone to assume that they can alter the text by changing the value of .nodeValue. This is false, because a text node does not exist in the case of an empty string.
Example 1:
<p id="bio"></p> <!-- There is no text node here. Return null. -->
Example 2:
<p id="bio">
</p> <!-- There are at least two text nodes ("\n"), here. -->
The problem comes in when you want to make your HTML easy to read by spacing it out. Now, even though there is no human readable valid text, there are still text nodes with newline ("\n") characters in their .nodeValue properties.
Humans see examples one and two as functionally equivalent--empty elements waiting to be filled. The DOM is different than human reasoning. This is why the getStringsFromChildren() function must determine if text nodes exist and gather the .nodeValue values into an array.
for (var i = 0; i < length; i++) {
if (nodeList[i].nodeType === Node.TEXT_NODE) {
textNodes.push(nodeList[i].nodeValue);
}
}
In example two, two text nodes do exist and getStringFromChildren() will return the .nodeValue of both of them ("\n"). However, filterWhitespaceLines() uses a regular expression to filter out lines of pure whitespace characters.
Is returning null instead of newline ("\n") characters a form of lying to the client / calling code? In human terms, no. In DOM terms, yes. However, the issue here is getting text, not editing it. There is no human text to return to the calling code.
One can never know how many newline characters might appear in someone's HTML. Creating a counter that looks for the "second" newline character is unreliable. It might not exist.
Of course, further down the line, the issue of editing text in an empty <p></p> element with extra whitespace (example 2) might mean destroying (maybe, skipping) all but one text node between a paragraph's tags to ensure the element contains precisely what it is supposed to display.
Regardless, except for cases where you are doing something extraordinary, you will need a way to determine which text node's .nodeValue property has the true, human readable text that you want to edit. filterWhitespaceLines gets us half way there.
var whitespaceLine = /(?:^\s+$)/; // Non-capturing Regular Expression.
for (var i = 0; i < filteredTextArray.length; i++) {
if (!whitespaceLine.test(textArray[i])) { // If it is not a line of whitespace.
filteredTextArray.push(textArray[i].trim()); // Trimming here is fine.
}
}
At this point you may have output that looks like this:
["Dealing with text nodes is fun.", "Some people just use jQuery."]
There is no guarantee that these two strings are adjacent to each other in the DOM, so joining them with .join() might make an unnatural composite. Instead, in the code that calls getTextById(), you need to chose which string you want to work with.
Test the output.
try {
var strings = getTextById("bio");
if (strings === null) {
// Do something.
} else if (strings.length === 1) {
// Do something with strings[0]
} else { // Could be another else if
// Do something. It all depends on the context.
}
} catch (e) {
console.log(e.message);
}
One could add .trim() inside of getStringsFromChildren() to get rid of leading and trailing whitespace (or to turn a bunch of spaces into a zero length string (""), but how can you know a priori what every application may need to have happen to the text (string) once it is found? You don't, so leave that to a specific implementation, and let getStringsFromChildren() be generic.
There may be times when this level of specificity (the target and such) is not required. That is great. Use a simple solution in those cases. However, a generalized algorithm enables you to accommodate simple and complex situations.

.text() - for jquery
$('.title').clone() //clone the element
.children() //select all the children
.remove() //remove all the children
.end() //again go back to selected element
.text(); //get the text of element

This will ignore the whitespace as well so, your never got the Blank textNodes..code using core Javascript.
var oDiv = document.getElementById("MyDiv");
var firstText = "";
for (var i = 0; i < oDiv.childNodes.length; i++) {
var curNode = oDiv.childNodes[i];
whitespace = /^\s*$/;
if (curNode.nodeName === "#text" && !(whitespace.test(curNode.nodeValue))) {
firstText = curNode.nodeValue;
break;
}
}
Check it on jsfiddle : - http://jsfiddle.net/webx/ZhLep/

Simply via Vanilla JavaScript:
const el = document.querySelector('.title');
const text = el.firstChild.textContent.trim();

You can also use XPath's text() node test to get the text nodes only. For example
var target = document.querySelector('div.title');
var iter = document.evaluate('text()', target, null, XPathResult.ORDERED_NODE_ITERATOR_TYPE);
var node;
var want = '';
while (node = iter.iterateNext()) {
want += node.data;
}

There are some overcomplicated solutions here but the operation is as straightforward as using .childNodes to get children of all node types and .filter to extract e.nodeType === Node.TEXT_NODEs. Optionally, we may want to do it recursively and/or ignore "empty" text nodes (all whitespace).
These examples convert the nodes to their text content for display purposes, but this is technically a separate step from filtering.
const immediateTextNodes = el =>
[...el.childNodes].filter(e => e.nodeType === Node.TEXT_NODE);
const immediateNonEmptyTextNodes = el =>
[...el.childNodes].filter(e =>
e.nodeType === Node.TEXT_NODE && e.textContent.trim()
);
const firstImmediateTextNode = el =>
[...el.childNodes].find(e => e.nodeType === Node.TEXT_NODE);
const firstImmediateNonEmptyTextNode = el =>
[...el.childNodes].find(e =>
e.nodeType === Node.TEXT_NODE && e.textContent.trim()
);
// example usage:
const text = el => el.textContent;
const p = document.querySelector("p");
console.log(immediateTextNodes(p).map(text));
console.log(immediateNonEmptyTextNodes(p).map(text));
console.log(text(firstImmediateTextNode(p)));
console.log(text(firstImmediateNonEmptyTextNode(p)));
// if you want to trim whitespace:
console.log(immediateNonEmptyTextNodes(p).map(e => text(e).trim()));
<p>
<span>IGNORE</span>
<b>IGNORE</b>
foo
<br>
bar
</p>
Recursive alternative to a NodeIterator:
const deepTextNodes = el => [...el.childNodes].flatMap(e =>
e.nodeType === Node.TEXT_NODE ? e : deepTextNodes(e)
);
const deepNonEmptyTextNodes = el =>
[...el.childNodes].flatMap(e =>
e.nodeType === Node.TEXT_NODE && e.textContent.trim()
? e : deepNonEmptyTextNodes(e)
);
// example usage:
const text = el => el.textContent;
const p = document.querySelector("p");
console.log(deepTextNodes(p).map(text));
console.log(deepNonEmptyTextNodes(p).map(text));
<p>
foo
<span>bar</span>
baz
<span><b>quux</b></span>
</p>
Finally, feel free to join the text node array into a string if you wish using .join(""). But as with trimming and text content extraction, I'd probably not bake this into the core filtering function and leave it to the caller to handle as needed.

Related

Finding HTML tags by using `content`'s of them from a Google Chrome extension [duplicate]

How can I find DIV with certain text? For example:
<div>
SomeText, text continues.
</div>
Trying to use something like this:
var text = document.querySelector('div[SomeText*]').innerTEXT;
alert(text);
But ofcourse it will not work. How can I do it?
OP's question is about plain JavaScript and not jQuery.
Although there are plenty of answers and I like #Pawan Nogariya answer, please check this alternative out.
You can use XPATH in JavaScript. More info on the MDN article here.
The document.evaluate() method evaluates an XPATH query/expression. So you can pass XPATH expressions there, traverse into the HTML document and locate the desired element.
In XPATH you can select an element, by the text node like the following, whch gets the div that has the following text node.
//div[text()="Hello World"]
To get an element that contains some text use the following:
//div[contains(., 'Hello')]
The contains() method in XPATH takes a node as first parameter and the text to search for as second parameter.
Check this plunk here, this is an example use of XPATH in JavaScript
Here is a code snippet:
var headings = document.evaluate("//h1[contains(., 'Hello')]", document, null, XPathResult.ANY_TYPE, null );
var thisHeading = headings.iterateNext();
console.log(thisHeading); // Prints the html element in console
console.log(thisHeading.textContent); // prints the text content in console
thisHeading.innerHTML += "<br />Modified contents";
As you can see, I can grab the HTML element and modify it as I like.
You could use this pretty simple solution:
Array.from(document.querySelectorAll('div'))
.find(el => el.textContent === 'SomeText, text continues.');
The Array.from will convert the NodeList to an array (there are multiple methods to do this like the spread operator or slice)
The result now being an array allows for using the Array.find method, you can then put in any predicate. You could also check the textContent with a regex or whatever you like.
Note that Array.from and Array.find are ES2015 features. Te be compatible with older browsers like IE10 without a transpiler:
Array.prototype.slice.call(document.querySelectorAll('div'))
.filter(function (el) {
return el.textContent === 'SomeText, text continues.'
})[0];
Since you have asked it in javascript so you can have something like this
function contains(selector, text) {
var elements = document.querySelectorAll(selector);
return Array.prototype.filter.call(elements, function(element){
return RegExp(text).test(element.textContent);
});
}
And then call it like this
contains('div', 'sometext'); // find "div" that contain "sometext"
contains('div', /^sometext/); // find "div" that start with "sometext"
contains('div', /sometext$/i); // find "div" that end with "sometext", case-insensitive
This solution does the following:
Uses the ES6 spread operator to convert the NodeList of all divs to an array.
Provides output if the div contains the query string, not just if it exactly equals the query string (which happens for some of the other answers). e.g. It should provide output not just for 'SomeText' but also for 'SomeText, text continues'.
Outputs the entire div contents, not just the query string. e.g. For 'SomeText, text continues' it should output that whole string, not just 'SomeText'.
Allows for multiple divs to contain the string, not just a single div.
[...document.querySelectorAll('div')] // get all the divs in an array
.map(div => div.innerHTML) // get their contents
.filter(txt => txt.includes('SomeText')) // keep only those containing the query
.forEach(txt => console.log(txt)); // output the entire contents of those
<div>SomeText, text continues.</div>
<div>Not in this div.</div>
<div>Here is more SomeText.</div>
Coming across this in 2021, I found using XPATH too complicated (need to learn something else) for something that should be rather simple.
Came up with this:
function querySelectorIncludesText (selector, text){
return Array.from(document.querySelectorAll(selector))
.find(el => el.textContent.includes(text));
}
Usage:
querySelectorIncludesText('button', 'Send')
Note that I decided to use includes and not a strict comparison, because that's what I really needed, feel free to adapt.
You might need those polyfills if you want to support all browsers:
/**
* String.prototype.includes() polyfill
* https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/includes#Polyfill
* #see https://vanillajstoolkit.com/polyfills/stringincludes/
*/
if (!String.prototype.includes) {
String.prototype.includes = function (search, start) {
'use strict';
if (search instanceof RegExp) {
throw TypeError('first argument must not be a RegExp');
}
if (start === undefined) {
start = 0;
}
return this.indexOf(search, start) !== -1;
};
}
You best see if you have a parent element of the div you are querying. If so get the parent element and perform an element.querySelectorAll("div"). Once you get the nodeList apply a filter on it over the innerText property. Assume that a parent element of the div that we are querying has an id of container. You can normally access container directly from the id but let's do it the proper way.
var conty = document.getElementById("container"),
divs = conty.querySelectorAll("div"),
myDiv = [...divs].filter(e => e.innerText == "SomeText");
So that's it.
If you don't want to use jquery or something like that then you can try this:
function findByText(rootElement, text){
var filter = {
acceptNode: function(node){
// look for nodes that are text_nodes and include the following string.
if(node.nodeType === document.TEXT_NODE && node.nodeValue.includes(text)){
return NodeFilter.FILTER_ACCEPT;
}
return NodeFilter.FILTER_REJECT;
}
}
var nodes = [];
var walker = document.createTreeWalker(rootElement, NodeFilter.SHOW_TEXT, filter, false);
while(walker.nextNode()){
//give me the element containing the node
nodes.push(walker.currentNode.parentNode);
}
return nodes;
}
//call it like
var nodes = findByText(document.body,'SomeText');
//then do what you will with nodes[];
for(var i = 0; i < nodes.length; i++){
//do something with nodes[i]
}
Once you have the nodes in an array that contain the text you can do something with them. Like alert each one or print to console. One caveat is that this may not necessarily grab divs per se, this will grab the parent of the textnode that has the text you are looking for.
Google has this as a top result for For those who need to find a node with certain text.
By way of update, a nodelist is now iterable in modern browsers without having to convert it to an array.
The solution can use forEach like so.
var elList = document.querySelectorAll(".some .selector");
elList.forEach(function(el) {
if (el.innerHTML.indexOf("needle") !== -1) {
// Do what you like with el
// The needle is case sensitive
}
});
This worked for me to do a find/replace text inside a nodelist when a normal selector could not choose just one node so I had to filter each node one by one to check it for the needle.
Use XPath and document.evaluate(), and make sure to use text() and not . for the contains() argument, or else you will have the entire HTML, or outermost div element matched.
var headings = document.evaluate("//h1[contains(text(), 'Hello')]", document, null, XPathResult.ANY_TYPE, null );
or ignore leading and trailing whitespace
var headings = document.evaluate("//h1[contains(normalize-space(text()), 'Hello')]", document, null, XPathResult.ANY_TYPE, null );
or match all tag types (div, h1, p, etc.)
var headings = document.evaluate("//*[contains(text(), 'Hello')]", document, null, XPathResult.ANY_TYPE, null );
Then iterate
let thisHeading;
while(thisHeading = headings.iterateNext()){
// thisHeading contains matched node
}
Here's the XPath approach but with a minimum of XPath jargon.
Regular selection based on element attribute values (for comparison):
// for matching <element class="foo bar baz">...</element> by 'bar'
var things = document.querySelectorAll('[class*="bar"]');
for (var i = 0; i < things.length; i++) {
things[i].style.outline = '1px solid red';
}
XPath selection based on text within element.
// for matching <element>foo bar baz</element> by 'bar'
var things = document.evaluate('//*[contains(text(),"bar")]',document,null,XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,null);
for (var i = 0; i < things.snapshotLength; i++) {
things.snapshotItem(i).style.outline = '1px solid red';
}
And here's with case-insensitivity since text is more volatile:
// for matching <element>foo bar baz</element> by 'bar' case-insensitively
var things = document.evaluate('//*[contains(translate(text(),"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz"),"bar")]',document,null,XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,null);
for (var i = 0; i < things.snapshotLength; i++) {
things.snapshotItem(i).style.outline = '1px solid red';
}
There are lots of great solutions here already. However, to provide a more streamlined solution and one more in keeping with the idea of a querySelector behavior and syntax, I opted for a solution that extends Object with a couple prototype functions. Both of these functions use regular expressions for matching text, however, a string can be provided as a loose search parameter.
Simply implement the following functions:
// find all elements with inner text matching a given regular expression
// args:
// selector: string query selector to use for identifying elements on which we
// should check innerText
// regex: A regular expression for matching innerText; if a string is provided,
// a case-insensitive search is performed for any element containing the string.
Object.prototype.queryInnerTextAll = function(selector, regex) {
if (typeof(regex) === 'string') regex = new RegExp(regex, 'i');
const elements = [...this.querySelectorAll(selector)];
const rtn = elements.filter((e)=>{
return e.innerText.match(regex);
});
return rtn.length === 0 ? null : rtn
}
// find the first element with inner text matching a given regular expression
// args:
// selector: string query selector to use for identifying elements on which we
// should check innerText
// regex: A regular expression for matching innerText; if a string is provided,
// a case-insensitive search is performed for any element containing the string.
Object.prototype.queryInnerText = function(selector, text){
return this.queryInnerTextAll(selector, text)[0];
}
With these functions implemented, you can now make calls as follows:
document.queryInnerTextAll('div.link', 'go');
This would find all divs containing the link class with the word go in the innerText (eg. Go Left or GO down or go right or It's Good)
document.queryInnerText('div.link', 'go');
This would work exactly as the example above except it would return only the first matching element.
document.queryInnerTextAll('a', /^Next$/);
Find all links with the exact text Next (case-sensitive). This will exclude links that contain the word Next along with other text.
document.queryInnerText('a', /next/i);
Find the first link that contains the word next, regardless of case (eg. Next Page or Go to next)
e = document.querySelector('#page');
e.queryInnerText('button', /Continue/);
This performs a search within a container element for a button containing the text, Continue (case-sensitive). (eg. Continue or Continue to Next but not continue)
I had similar problem.
Function that return all element which include text from arg.
This works for me:
function getElementsByText(document, str, tag = '*') {
return [...document.querySelectorAll(tag)]
.filter(
el => (el.text && el.text.includes(str))
|| (el.children.length === 0 && el.outerText && el.outerText.includes(str)))
}
Since there are no limits to the length of text in a data attribute, use data attributes! And then you can use regular css selectors to select your element(s) like the OP wants.
for (const element of document.querySelectorAll("*")) {
element.dataset.myInnerText = element.innerText;
}
document.querySelector("*[data-my-inner-text='Different text.']").style.color="blue";
<div>SomeText, text continues.</div>
<div>Different text.</div>
Ideally you do the data attribute setting part on document load and narrow down the querySelectorAll selector a bit for performance.
I was looking for a way to do something similar using a Regex, and decided to build something of my own that I wanted to share if others are looking for a similar solution.
function getElementsByTextContent(tag, regex) {
const results = Array.from(document.querySelectorAll(tag))
.reduce((acc, el) => {
if (el.textContent && el.textContent.match(regex) !== null) {
acc.push(el);
}
return acc;
}, []);
return results;
}

Extract text from HTML while preserving block-level element newlines

Background
Most questions about extracting text from HTML (i.e., stripping the tags) use:
jQuery( htmlString ).text();
While this abstracts browser inconsistencies (such as innerText vs. textContent), the function call also ignores the semantic meaning of block-level elements (such as li).
Problem
Preserving newlines of block-level elements (i.e., the semantic intent) across various browsers entails no small effort, as Mike Wilcox describes.
A seemingly simpler solution would be to emulate pasting HTML content into a <textarea>, which strips HTML while preserving block-level element newlines. However, JavaScript-based inserts do not trigger the same HTML-to-text routines that browsers employ when users paste content into a <textarea>.
I also tried integrating Mike Wilcox's JavaScript code. The code works in Chromium, but not in Firefox.
Question
What is the simplest cross-browser way to extract text from HTML while preserving semantic newlines for block-level elements using jQuery (or vanilla JavaScript)?
Example
Consider:
Select and copy this entire question.
Open the textarea example page.
Paste the content into the textarea.
The textarea preserves the newlines for ordered lists, headings, preformatted text, and so forth. That is the result I would like to achieve.
To further clarify, given any HTML content, such as:
<h1>Header</h1>
<p>Paragraph</p>
<ul>
<li>First</li>
<li>Second</li>
</ul>
<dl>
<dt>Term</dt>
<dd>Definition</dd>
</dl>
<div>Div with <span>span</span>.<br />After the break.</div>
How would you produce:
Header
Paragraph
First
Second
Term
Definition
Div with span.
After the break.
Note: Neither indentation nor non-normalized whitespace are relevant.
Consider:
/**
* Returns the style for a node.
*
* #param n The node to check.
* #param p The property to retrieve (usually 'display').
* #link http://www.quirksmode.org/dom/getstyles.html
*/
this.getStyle = function( n, p ) {
return n.currentStyle ?
n.currentStyle[p] :
document.defaultView.getComputedStyle(n, null).getPropertyValue(p);
}
/**
* Converts HTML to text, preserving semantic newlines for block-level
* elements.
*
* #param node - The HTML node to perform text extraction.
*/
this.toText = function( node ) {
var result = '';
if( node.nodeType == document.TEXT_NODE ) {
// Replace repeated spaces, newlines, and tabs with a single space.
result = node.nodeValue.replace( /\s+/g, ' ' );
}
else {
for( var i = 0, j = node.childNodes.length; i < j; i++ ) {
result += _this.toText( node.childNodes[i] );
}
var d = _this.getStyle( node, 'display' );
if( d.match( /^block/ ) || d.match( /list/ ) || d.match( /row/ ) ||
node.tagName == 'BR' || node.tagName == 'HR' ) {
result += '\n';
}
}
return result;
}
http://jsfiddle.net/3mzrV/2/
That is to say, with an exception or two, iterate through each node and print its contents, letting the browser's computed style tell you when to insert newlines.
This seems to be (nearly) doing what you want:
function getText($node) {
return $node.contents().map(function () {
if (this.nodeName === 'BR') {
return '\n';
} else if (this.nodeType === 3) {
return this.nodeValue;
} else {
return getText($(this));
}
}).get().join('');
}
DEMO
It just recursively concatenates the values of all text nodes and replaces <br> elements with line breaks.
But there is no semantics in this, it completely relies the original HTML formatting (the leading and trailing white spaces seem to come from how jsFiddle embeds the HTML, but you can easily trim those). For example, notice how it indents the definition term.
If you really want to do this on a semantic level, you need a list of block level elements, recursively iterate over the elements and indent them accordingly. You treat different block elements differently with respect to indentation and line breaks around them. This should not be too difficult.
based on https://stackoverflow.com/a/20384452/3338098
and fixed to support TEXT1<div>TEXT2</div>=>TEXT1\nTEXT2 and allow non-DOM nodes
/**
* Returns the style for a node.
*
* #param n The node to check.
* #param p The property to retrieve (usually 'display').
* #link http://www.quirksmode.org/dom/getstyles.html
*/
function getNodeStyle( n, p ) {
return n.currentStyle ?
n.currentStyle[p] :
document.defaultView.getComputedStyle(n, null).getPropertyValue(p);
}
//IF THE NODE IS NOT ACTUALLY IN THE DOM then this won't take into account <div style="display: inline;">text</div>
//however for simple things like `contenteditable` this is sufficient, however for arbitrary html this will not work
function isNodeBlock(node) {
if (node.nodeType == document.TEXT_NODE) {return false;}
var d = getNodeStyle( node, 'display' );//this is irrelevant if the node isn't currently in the current DOM.
if (d.match( /^block/ ) || d.match( /list/ ) || d.match( /row/ ) ||
node.tagName == 'BR' || node.tagName == 'HR' ||
node.tagName == 'DIV' // div,p,... add as needed to support non-DOM nodes
) {
return true;
}
return false;
}
/**
* Converts HTML to text, preserving semantic newlines for block-level
* elements.
*
* #param node - The HTML node to perform text extraction.
*/
function htmlToText( htmlOrNode, isNode ) {
var node = htmlOrNode;
if (!isNode) {node = jQuery("<span>"+htmlOrNode+"</span>")[0];}
//TODO: inject "unsafe" HTML into current DOM while guaranteeing that it won't
// change the visible DOM so that `isNodeBlock` will work reliably
var result = '';
if( node.nodeType == document.TEXT_NODE ) {
// Replace repeated spaces, newlines, and tabs with a single space.
result = node.nodeValue.replace( /\s+/g, ' ' );
} else {
for( var i = 0, j = node.childNodes.length; i < j; i++ ) {
result += htmlToText( node.childNodes[i], true );
if (i < j-1) {
if (isNodeBlock(node.childNodes[i])) {
result += '\n';
} else if (isNodeBlock(node.childNodes[i+1]) &&
node.childNodes[i+1].tagName != 'BR' &&
node.childNodes[i+1].tagName != 'HR') {
result += '\n';
}
}
}
}
return result;
}
the main change was
if (i < j-1) {
if (isNodeBlock(node.childNodes[i])) {
result += '\n';
} else if (isNodeBlock(node.childNodes[i+1]) &&
node.childNodes[i+1].tagName != 'BR' &&
node.childNodes[i+1].tagName != 'HR') {
result += '\n';
}
}
to check neighboring blocks to determine the appropriateness of adding a newline.
I would like to suggest a little edit from the code of svidgen:
function getText(n, isInnerNode) {
var rv = '';
if (n.nodeType == 3) {
rv = n.nodeValue;
} else {
var partial = "";
var d = getComputedStyle(n).getPropertyValue('display');
if (isInnerNode && d.match(/^block/) || d.match(/list/) || n.tagName == 'BR') {
partial += "\n";
}
for (var i = 0; i < n.childNodes.length; i++) {
partial += getText(n.childNodes[i], true);
}
rv = partial;
}
return rv;
};
I just added the line break before the for loop, in this way we have a newline before the block, and also a variable to avoid the newline for the root element.
The code should be invocated:
getText(document.getElementById("divElement"))
Use element.innerText
This not return extra nodes added from contenteditable elements.
If you use element.innerHTML the text will contain additional markup, but innerText will return what you see on the element's contents.
<div id="txt" contenteditable="true"></div>
<script>
var txt=document.getElementById("txt");
var withMarkup=txt.innerHTML;
var textOnly=txt.innerText;
console.log(withMarkup);
console.log(textOnly);
</script>

How to get the text node of an element?

<div class="title">
I am text node
<a class="edit">Edit</a>
</div>
I wish to get the "I am text node", do not wish to remove the "edit" tag, and need a cross browser solution.
var text = $(".title").contents().filter(function() {
return this.nodeType == Node.TEXT_NODE;
}).text();
This gets the contents of the selected element, and applies a filter function to it. The filter function returns only text nodes (i.e. those nodes with nodeType == Node.TEXT_NODE).
You can get the nodeValue of the first childNode using
$('.title')[0].childNodes[0].nodeValue
http://jsfiddle.net/TU4FB/
Another native JS solution that can be useful for "complex" or deeply nested elements is to use NodeIterator. Put NodeFilter.SHOW_TEXT as the second argument ("whatToShow"), and iterate over just the text node children of the element.
var root = document.querySelector('p'),
iter = document.createNodeIterator(root, NodeFilter.SHOW_TEXT),
textnode;
// print all text nodes
while (textnode = iter.nextNode()) {
console.log(textnode.textContent)
}
<p>
<br>some text<br>123
</p>
You can also use TreeWalker. The difference between the two is that NodeIterator is a simple linear iterator, while TreeWalker allows you to navigate via siblings and ancestors as well.
ES6 version that return the first #text node content
const extract = (node) => {
const text = [...node.childNodes].find(child => child.nodeType === Node.TEXT_NODE);
return text && text.textContent.trim();
}
If you mean get the value of the first text node in the element, this code will work:
var oDiv = document.getElementById("MyDiv");
var firstText = "";
for (var i = 0; i < oDiv.childNodes.length; i++) {
var curNode = oDiv.childNodes[i];
if (curNode.nodeName === "#text") {
firstText = curNode.nodeValue;
break;
}
}
You can see this in action here: http://jsfiddle.net/ZkjZJ/
Pure JavaScript: Minimalist
First off, always keep this in mind when looking for text in the DOM.
MDN - Whitespace in the DOM
This issue will make you pay attention to the structure of your XML / HTML.
In this pure JavaScript example, I account for the possibility of multiple text nodes that could be interleaved with other kinds of nodes. However, initially, I do not pass judgment on whitespace, leaving that filtering task to other code.
In this version, I pass a NodeList in from the calling / client code.
/**
* Gets strings from text nodes. Minimalist. Non-robust. Pre-test loop version.
* Generic, cross platform solution. No string filtering or conditioning.
*
* #author Anthony Rutledge
* #param nodeList The child nodes of a Node, as in node.childNodes.
* #param target A positive whole number >= 1
* #return String The text you targeted.
*/
function getText(nodeList, target)
{
var trueTarget = target - 1,
length = nodeList.length; // Because you may have many child nodes.
for (var i = 0; i < length; i++) {
if ((nodeList[i].nodeType === Node.TEXT_NODE) && (i === trueTarget)) {
return nodeList[i].nodeValue; // Done! No need to keep going.
}
}
return null;
}
Of course, by testing node.hasChildNodes() first, there would be no need to use a pre-test for loop.
/**
* Gets strings from text nodes. Minimalist. Non-robust. Post-test loop version.
* Generic, cross platform solution. No string filtering or conditioning.
*
* #author Anthony Rutledge
* #param nodeList The child nodes of a Node, as in node.childNodes.
* #param target A positive whole number >= 1
* #return String The text you targeted.
*/
function getText(nodeList, target)
{
var trueTarget = target - 1,
length = nodeList.length,
i = 0;
do {
if ((nodeList[i].nodeType === Node.TEXT_NODE) && (i === trueTarget)) {
return nodeList[i].nodeValue; // Done! No need to keep going.
}
i++;
} while (i < length);
return null;
}
Pure JavaScript: Robust
Here the function getTextById() uses two helper functions: getStringsFromChildren() and filterWhitespaceLines().
getStringsFromChildren()
/**
* Collects strings from child text nodes.
* Generic, cross platform solution. No string filtering or conditioning.
*
* #author Anthony Rutledge
* #version 7.0
* #param parentNode An instance of the Node interface, such as an Element. object.
* #return Array of strings, or null.
* #throws TypeError if the parentNode is not a Node object.
*/
function getStringsFromChildren(parentNode)
{
var strings = [],
nodeList,
length,
i = 0;
if (!parentNode instanceof Node) {
throw new TypeError("The parentNode parameter expects an instance of a Node.");
}
if (!parentNode.hasChildNodes()) {
return null; // We are done. Node may resemble <element></element>
}
nodeList = parentNode.childNodes;
length = nodeList.length;
do {
if ((nodeList[i].nodeType === Node.TEXT_NODE)) {
strings.push(nodeList[i].nodeValue);
}
i++;
} while (i < length);
if (strings.length > 0) {
return strings;
}
return null;
}
filterWhitespaceLines()
/**
* Filters an array of strings to remove whitespace lines.
* Generic, cross platform solution.
*
* #author Anthony Rutledge
* #version 6.0
* #param textArray a String associated with the id attribute of an Element.
* #return Array of strings that are not lines of whitespace, or null.
* #throws TypeError if the textArray param is not of type Array.
*/
function filterWhitespaceLines(textArray)
{
var filteredArray = [],
whitespaceLine = /(?:^\s+$)/; // Non-capturing Regular Expression.
if (!textArray instanceof Array) {
throw new TypeError("The textArray parameter expects an instance of a Array.");
}
for (var i = 0; i < textArray.length; i++) {
if (!whitespaceLine.test(textArray[i])) { // If it is not a line of whitespace.
filteredArray.push(textArray[i].trim()); // Trimming here is fine.
}
}
if (filteredArray.length > 0) {
return filteredArray ; // Leave selecting and joining strings for a specific implementation.
}
return null; // No text to return.
}
getTextById()
/**
* Gets strings from text nodes. Robust.
* Generic, cross platform solution.
*
* #author Anthony Rutledge
* #version 6.0
* #param id A String associated with the id property of an Element.
* #return Array of strings, or null.
* #throws TypeError if the id param is not of type String.
* #throws TypeError if the id param cannot be used to find a node by id.
*/
function getTextById(id)
{
var textArray = null; // The hopeful output.
var idDatatype = typeof id; // Only used in an TypeError message.
var node; // The parent node being examined.
try {
if (idDatatype !== "string") {
throw new TypeError("The id argument must be of type String! Got " + idDatatype);
}
node = document.getElementById(id);
if (node === null) {
throw new TypeError("No element found with the id: " + id);
}
textArray = getStringsFromChildren(node);
if (textArray === null) {
return null; // No text nodes found. Example: <element></element>
}
textArray = filterWhitespaceLines(textArray);
if (textArray.length > 0) {
return textArray; // Leave selecting and joining strings for a specific implementation.
}
} catch (e) {
console.log(e.message);
}
return null; // No text to return.
}
Next, the return value (Array, or null) is sent to the client code where it should be handled. Hopefully, the array should have string elements of real text, not lines of whitespace.
Empty strings ("") are not returned because you need a text node to properly indicate the presence of valid text. Returning ("") may give the false impression that a text node exists, leading someone to assume that they can alter the text by changing the value of .nodeValue. This is false, because a text node does not exist in the case of an empty string.
Example 1:
<p id="bio"></p> <!-- There is no text node here. Return null. -->
Example 2:
<p id="bio">
</p> <!-- There are at least two text nodes ("\n"), here. -->
The problem comes in when you want to make your HTML easy to read by spacing it out. Now, even though there is no human readable valid text, there are still text nodes with newline ("\n") characters in their .nodeValue properties.
Humans see examples one and two as functionally equivalent--empty elements waiting to be filled. The DOM is different than human reasoning. This is why the getStringsFromChildren() function must determine if text nodes exist and gather the .nodeValue values into an array.
for (var i = 0; i < length; i++) {
if (nodeList[i].nodeType === Node.TEXT_NODE) {
textNodes.push(nodeList[i].nodeValue);
}
}
In example two, two text nodes do exist and getStringFromChildren() will return the .nodeValue of both of them ("\n"). However, filterWhitespaceLines() uses a regular expression to filter out lines of pure whitespace characters.
Is returning null instead of newline ("\n") characters a form of lying to the client / calling code? In human terms, no. In DOM terms, yes. However, the issue here is getting text, not editing it. There is no human text to return to the calling code.
One can never know how many newline characters might appear in someone's HTML. Creating a counter that looks for the "second" newline character is unreliable. It might not exist.
Of course, further down the line, the issue of editing text in an empty <p></p> element with extra whitespace (example 2) might mean destroying (maybe, skipping) all but one text node between a paragraph's tags to ensure the element contains precisely what it is supposed to display.
Regardless, except for cases where you are doing something extraordinary, you will need a way to determine which text node's .nodeValue property has the true, human readable text that you want to edit. filterWhitespaceLines gets us half way there.
var whitespaceLine = /(?:^\s+$)/; // Non-capturing Regular Expression.
for (var i = 0; i < filteredTextArray.length; i++) {
if (!whitespaceLine.test(textArray[i])) { // If it is not a line of whitespace.
filteredTextArray.push(textArray[i].trim()); // Trimming here is fine.
}
}
At this point you may have output that looks like this:
["Dealing with text nodes is fun.", "Some people just use jQuery."]
There is no guarantee that these two strings are adjacent to each other in the DOM, so joining them with .join() might make an unnatural composite. Instead, in the code that calls getTextById(), you need to chose which string you want to work with.
Test the output.
try {
var strings = getTextById("bio");
if (strings === null) {
// Do something.
} else if (strings.length === 1) {
// Do something with strings[0]
} else { // Could be another else if
// Do something. It all depends on the context.
}
} catch (e) {
console.log(e.message);
}
One could add .trim() inside of getStringsFromChildren() to get rid of leading and trailing whitespace (or to turn a bunch of spaces into a zero length string (""), but how can you know a priori what every application may need to have happen to the text (string) once it is found? You don't, so leave that to a specific implementation, and let getStringsFromChildren() be generic.
There may be times when this level of specificity (the target and such) is not required. That is great. Use a simple solution in those cases. However, a generalized algorithm enables you to accommodate simple and complex situations.
.text() - for jquery
$('.title').clone() //clone the element
.children() //select all the children
.remove() //remove all the children
.end() //again go back to selected element
.text(); //get the text of element
This will ignore the whitespace as well so, your never got the Blank textNodes..code using core Javascript.
var oDiv = document.getElementById("MyDiv");
var firstText = "";
for (var i = 0; i < oDiv.childNodes.length; i++) {
var curNode = oDiv.childNodes[i];
whitespace = /^\s*$/;
if (curNode.nodeName === "#text" && !(whitespace.test(curNode.nodeValue))) {
firstText = curNode.nodeValue;
break;
}
}
Check it on jsfiddle : - http://jsfiddle.net/webx/ZhLep/
Simply via Vanilla JavaScript:
const el = document.querySelector('.title');
const text = el.firstChild.textContent.trim();
You can also use XPath's text() node test to get the text nodes only. For example
var target = document.querySelector('div.title');
var iter = document.evaluate('text()', target, null, XPathResult.ORDERED_NODE_ITERATOR_TYPE);
var node;
var want = '';
while (node = iter.iterateNext()) {
want += node.data;
}
There are some overcomplicated solutions here but the operation is as straightforward as using .childNodes to get children of all node types and .filter to extract e.nodeType === Node.TEXT_NODEs. Optionally, we may want to do it recursively and/or ignore "empty" text nodes (all whitespace).
These examples convert the nodes to their text content for display purposes, but this is technically a separate step from filtering.
const immediateTextNodes = el =>
[...el.childNodes].filter(e => e.nodeType === Node.TEXT_NODE);
const immediateNonEmptyTextNodes = el =>
[...el.childNodes].filter(e =>
e.nodeType === Node.TEXT_NODE && e.textContent.trim()
);
const firstImmediateTextNode = el =>
[...el.childNodes].find(e => e.nodeType === Node.TEXT_NODE);
const firstImmediateNonEmptyTextNode = el =>
[...el.childNodes].find(e =>
e.nodeType === Node.TEXT_NODE && e.textContent.trim()
);
// example usage:
const text = el => el.textContent;
const p = document.querySelector("p");
console.log(immediateTextNodes(p).map(text));
console.log(immediateNonEmptyTextNodes(p).map(text));
console.log(text(firstImmediateTextNode(p)));
console.log(text(firstImmediateNonEmptyTextNode(p)));
// if you want to trim whitespace:
console.log(immediateNonEmptyTextNodes(p).map(e => text(e).trim()));
<p>
<span>IGNORE</span>
<b>IGNORE</b>
foo
<br>
bar
</p>
Recursive alternative to a NodeIterator:
const deepTextNodes = el => [...el.childNodes].flatMap(e =>
e.nodeType === Node.TEXT_NODE ? e : deepTextNodes(e)
);
const deepNonEmptyTextNodes = el =>
[...el.childNodes].flatMap(e =>
e.nodeType === Node.TEXT_NODE && e.textContent.trim()
? e : deepNonEmptyTextNodes(e)
);
// example usage:
const text = el => el.textContent;
const p = document.querySelector("p");
console.log(deepTextNodes(p).map(text));
console.log(deepNonEmptyTextNodes(p).map(text));
<p>
foo
<span>bar</span>
baz
<span><b>quux</b></span>
</p>
Finally, feel free to join the text node array into a string if you wish using .join(""). But as with trimming and text content extraction, I'd probably not bake this into the core filtering function and leave it to the caller to handle as needed.

removing all the DOM Elements with a specific className

How I can remove all DOM Elements with specific classname or element width ID's that start with a specific pattern. like (without any framework!)
id="selectbox1"
id="selectbox2"
id="selectbox3"
id="selectbox4"
Thanks
You'd have to use getElementsByTagName(*) iterate over the entire collection, check the .className property with a regex /\bYourClasName\b/ (className can have more than one class, seperated by a space) and then also check the element's .id property with another regex: /^IDStartsWithThis/ finally on any matches you would have to call element.parentNode.removeChild(element);
(On my way to work and in a rush, if you need more code I can supply it once I get there around 630 est)
Edit: here's the code:
usage: removeElemIf(idStartsWith,containsClass). you can pass null, only the id (second param is undefined), blanks (blanks are ignored, both parameters are trimmed first). Case is insensitive for both parameters.
function removeElemIf(theID, theClass) { /* class => full match, id => startswith */
checkID = !(theID === undefined || theID === null || (theID = theID.replace(/^\s*|\s*$/g, '')).length == 0);
checkClass = !(theClass === undefined || theClass === null || (theClass = theClass.replace(/^\s*|\s*$/g, '')).length == 0);
if (!(checkID || checkClass)) return;
var oBody = document.getElementsByTagName('body')[0]; // only search the body
var oElems = oBody.getElementsByTagName('*'); // get all the elements within body
for (i = oElems.length - 1; i >= 0; i--) { // loop through backwards in case of delete
el = oElems[i]; // set current element
found = false; // reset flag
if (checkID) { /* check the ID for starting with "theID", originally used indexOf but its case sensitive*/
re = new RegExp('^'+theID,'i');
if (el.id.match(re)) found = true;
}
if (!found && checkClass) { /* only test class if the id doesn't match,
save the regex in instances where the
class names are long or many*/
re = new RegExp('\\b' + theClass + '\\b', 'i');
if (el.className.match(re)) found = true;
}
if (found) el.parentNode.removeChild(el); /* if found, remove from parent */
}
}
Traverse through the dom tree and compare against the className property of each element found. Yes it's tedious, but that's how it's done. You can either traverse in a recursive fashion or an iterative one. The first is the easiest to write, but the second has much better performance.
see this SO thread:
jQuery matching pattern
and check out getElementsByTagName() function

Javascript Closure and DOM Builder

I am making a DOM builder which I have working succesfully but now I am trying to assign some shorthand functions so that div() -> e("div")
Here is my code:
//assign these objects to a namespace, defaults to window
(function(parent) {
/**
* Creates string of element
* #param tag The element to add
* #param options An object of attributes to add
* #param child ... n Child elements to nest
* #return HTML string to use with innerHTML
*/
var e = function(tag, options) {
var html = '<'+tag;
var children = Array.prototype.slice.apply(arguments, [(typeof options === "string") ? 1 : 2]);
if(options && typeof options !== "string") {
for(var option in options) {
html += ' '+option+'="'+options[option]+'"';
}
}
html += '>';
for(var child in children) {
html += children[child];
}
html += '</'+tag+'>';
return html;
}
//array of tags as shorthand for e(<tag>) THIS PART NOT WORKING
var tags = "div span strong cite em li ul ol table th tr td input form textarea".split(" "), i=0;
for(; i < tags.length; i++) {
(function(el) { //create closure to keep EL in scope
parent[el] = function() {
var args = Array.prototype.slice.call(arguments);
console.log(args);
args[0] = el; //make the first argument the shorthand tag
return e.apply(e,args);
};
})(tags[i]);
}
//assign e to parent
parent.e = e;
})(window);
What's currently happening is the args array is getting modified each time I call one of the shorthand functions and I assume what needs to happen is a closure somewhere so the args array I created does not get affected each time it is called. Here is the output of the unit tests:
div( div( span("Content")), span()) expected: <div><div><span>Content</span></div><span></span></div> result: <div><span></span></div>
div( div( span( e("b", e("b", e("b")))), span())) expected: <div><div><span><b><b><b></b></b></b></span><span></span></div></div> result: <div></div>
Though this doesn't directly answer your question,
for(var el in tags) {
is not entirely correct. tags is an array, not an object, so its properties cannot be enumerated using for (... in ...). Try
for(var el = 0; el < tags.length; el++) {
This can make a huge difference towards the interpreter's understanding of your code... and the correct execution of your algorithm.
Blonde moment, I was overwriting the first element when I meant to use args.unshift(el);
#MvanGeest - doing a for..in on an array is technically allowed. Arrays are still objects in javascript. The index of the array will be the key if iterated using a for..in loop. Obviously not the point of using an array in that case, but thought I would clarify.
#Anurag - the forEach method is not supported in IE8 (not sure about 9) so that might not be a reliable method to use until later on in the future.

Categories