Get text from HTML with appropriate whitespace

Get text from HTML with appropriate whitespace - javascript

It is easy to extract the text from HTML using the jQuery .text() method...
$("<p>This <b>That</b> Other</p>").text() == "This That Other"
But if there is no whitespace between the words/elements, then text becomes concatenated...
$("<p>This <b>That</b><br/>Other</p>").text() == "This ThatOther"
Desired: "This That Other"
$("<div><h1>Title</h1><p>Text</p></div>").text() == "TitleText"
Desired: "Title Text"
Is there any way to get all the text from the HTML (either using .text() or other methods) which would mean that the above examples would come out as desired?

You can traverse the DOM tree looking for a node with a nodeType of 3 (text node). When you find one, add it to an array. If you find a non-text node, you can pass it back into the function to keep looking.
function innerText(element) {
function getTextLoop(element) {
const texts = [];
Array.from(element.childNodes).forEach(node => {
if (node.nodeType === 3) {
texts.push(node.textContent.trim());
} else {
texts.push(...getTextLoop(node));
}
});
return texts;
}
return getTextLoop(element).join(' ');
}
/* EXAMPLES */
const div = document.createElement('div');
div.innerHTML = `<p>This <b>That</b><br/>Other</p>`;
console.log(innerText(div));
const div2 = document.createElement('div');
div2.innerHTML = `<div><h1>Title</h1><p>Text</p></div>`;
console.log(innerText(div2));

If you are just worried about br tags, you can replace them with a text node.
var elem = document.querySelector("#text")
var clone = elem.cloneNode(true)
clone.querySelectorAll("br").forEach( function (br) {
var space = document.createTextNode(' ')
br.replaceWith(space)
})
var cleanedText = clone.textContent.trim().replace(/\s+/,' ');
console.log(cleanedText)
<div id="text">
<p>This <b>That</br>Other</p>
</div>

Related

highlight the .replace() word in javascript

function handleTextNode(textNode) {
if(textNode.nodeName !== '#text'
|| textNode.parentNode.nodeName === 'SCRIPT'
|| textNode.parentNode.nodeName === 'STYLE'
) {
//Don't do anything except on text nodes, which are not children
// of or .
return;
}
var find = ["build", "add", "Fast"];
var replace = ["Develope", "add", "Fast"];
let origText = textNode.textContent;
let newHtml = origText.replace(new RegExp("(" + find.map(function(i){return i.replace(/[.?*+^$[\]\\(){}|-]/g, "\\$&")}).join("|") + ")", "g"),function(s){ return replace[ find.indexOf(s)]});
if( newHtml !== origText) {
let newSpan = document.createElement('span');
newSpan.innerHTML = newHtml;
textNode.parentNode.replaceChild(newSpan,textNode);
}
// <span title="Dyslexia is a learning disorder that involves difficulty reading due to problems identifying speech sounds. " style="background-color: yellow"></span>
}
let textNodes = [];
//Create a NodeIterator to get the text nodes in the body of the document
let nodeIter = document.createNodeIterator(document.body,NodeFilter.SHOW_TEXT);
let currentNode;
//Add the text nodes found to the list of text nodes to process.
while(currentNode = nodeIter.nextNode()) {
textNodes.push(currentNode);
}
//Process each text node
textNodes.forEach(function(el){
handleTextNode(el);
});
i want to make the replaced word highlighted. this code will run only for web pages. like i am building a adblocker with a functionality that help dyslexia Student. At the moment it only replace words correctly. can any body help?

Javascript: How to check/compare if a String contains some tag?

I'd like to use pure JS to check if some String, the textareas .innerHTML = newContent below, contains some tag (h1in my case) at the beginning (=as first child). What would be the best way to do this?
Thanks!
function submitNewSectionContent(e) {
for (var i = 0; i < sections.length; i++)
let newHeading = document.getElementById('edit-title').value;
/* edit-title is text-input*/
let newContent = document.getElementById('edit-sectionText').innerHTML;
/* edit-sectionText is textarea */
if (newContent.indexOf('<h1>') > -1 && newContent.indexOf('<h1>') < 10) { /* <h1> is at beginning so replace with newHeading */
let toberemoved = newContent.match('<h1>.*<\/h1>');
newContent = newContent.replace(toberemoved[0], '').trim();
sections[i].innerHTML = '<h1>'+newHeading+'</h1>' + sections[i].innerHTML;
} else { /* newContent has no h1 as first child, so add h1 from newHeading */
sections[i].innerHTML = '<h1>'+newHeading+'</h1>' + newContent;
}
}
}

Problem with Regular expressions is they do not really work well with HTML. So Your best bet is to convert it to a DOM fragment and do the manipulations and convert it back. Only issue with this method really is you can lose formatting. There are libraries out there that can pretty print HTML.
function updateHeadline(txt) {
const ta = document.querySelector("textarea");
const data = ta.value; // read value, not innerHTML
const temp = document.createElement('div'); // temp div to hold html
temp.innerHTML = data; // set the html to the temp element
let firstChild = temp.firstElementChild // look at the dom
if (!firstChild || firstChild.tagName!=="H1") { // see if we have an h1
firstChild = document.createElement("h1") // if not create one
temp.prepend(firstChild) // add it to the front
}
firstChild.innerHTML = txt // set the new text of the h1
ta.value = temp.innerHTML // put the content back into the textarea
}
const btn = document.querySelector("button");
btn.addEventListener("click", function (e) {
e.preventDefault()
updateHeadline(document.querySelector("#text").value)
})
textarea {
width: 300px;
height: 200px;
}
<textarea>
<p>Some other text</p>
<p>Some more text</p>
</textarea>
<input value="foo" id="text"/>
<button>Set</button>

You could use regex like so, (updated based on comment)
if( /^\s*<h1>/gi.test(stringToTest) ) {
//logic here
}
It checks if the stringToTest begins with ^ tag
See here : https://regex101.com/r/vSo4sL/1

convert to dom, and parse dom
this portion of code makes it possible to treat a chain to retrieve titles placed in the H1 tag and (on the fly) treat the string of characters.
It's easily expandable for future processing : or tag processing or other ...!
commented code
<html>
<script>
var s="<H1>Hey Title</H1>\n Hello,\n other title <H1>Green!</H1>\n Ipsum dolore sit...";
console.log(s);
console.log("-------------------------");
var partialDoc = document.createElement( 'html' );
partialDoc.innerHTML = s;
var parsed='';
var titles=[];
treatment(partialDoc);
console.log("\n-------------------------\n");
console.log("parsed",parsed);
console.log("\n-------------------------\n");
console.log("s var contains "+titles.length+ " H1 tag");
console.log("titles "+titles);
function treatment(root) {
var child = root.firstChild;
while (child) {
// child.nodeName = H1 | H2 | P etc...
// child.nodeType = 1
// catch H1
if (child.nodeName=='H1') {
// append your title,
parsed+=" [H1 FOUND content= {"+child.innerText+"} H1])";
// or
// parsed+="<H1>"+child.innerText+"<H1>";
// add your own process here
// add this title in array
// or what you want...
titles.push(child.innerText);
// next part of document
child = child.nextSibling;
continue;
}
// capture other text than H1
if (child.nodeType==3) { // Node Type Text
parsed+=child.nodeValue;
}
if (child.nodeType==1) { // Node Type ELEMENT, : sub nodes...
treatment(child);
}
// continue the rest of doc
child = child.nextSibling;
}
}
</script>
</html>

One way you could do it is: Node.firstElementChild which will avoid giving child node as #text for white-spaces and Node.nodeName
let firstChild = document.getElementById('edit-sectionText').firstElementChild;
if(firstChild.nodeName === "H1"){
firstChild.innerHTML = "Replacement Value"
}
Note & Update: The earlier api that I had suggested Node.firstChild will not prevent white-spaces which gives #text node and comments as #comment node.
2nd Way: Node.children and picking the first child out of it should have a similar result to Node.firstElementChild.
let elem = document.getElementById('edit-sectionText');
if(elem){
let firstChild = elem.children[0];
}
Update based on comments: Using Dom Parser Interface
The interface allows to parse XML or HTML source from a string based on the mime type provided for its method parseFromString(string, mimeType)
It will give the top level #document node with parsed HTML from the string where if exists <h1> or <H1> at the beginning would be the first child of body and subsequently can be tested via tagName property.
Note: Takes care of preceding HTML comments and spaces at the beginning but a caveat is doesn't check fully closed tags ex: var s = \t <h1>I am a heading <h1> here the <h1> was never closed and in the result will two fully formed headings at the body with content : I am a heading and ""
let textAreaString = document.getElementById("edit-sectionText").value;
const domParser = new DOMParser();
const parsedDoc = domParser.parseFromString(textAreaString, "text/html");
if (parsedDoc.body.firstElementChild.tagName === "H1") {
//yes it starts with <h1> or <H1>
}

How to extract text from an HTML node and maintain the correct space formatting with JavaScript?

I need to get the text value inside of a div. You can think of it like a text area, where you can do like this:
let inputArea = document.getElementById("text-area");
let text = inputArea.value;
text
// Expected result:
// Hello
//
// Hello
//
// Hello
//
// Hello
Unfortunately, I don't have a text area and I can't do that. So what here's what I did:
(1) Get the HTML node where I need to extract the text (var bodyHtml..)
(2) Convert the HTML node to string (calling extractStringFromNode()..)
(3) Extract the string out of the returned string (calling extractContentFromString()..)
My problem is that I lose the "text formatting". The let text (code above) contains the value of the inputArea with the proper formatting. I lose this when I try to reproduce it using my implementation (code below).
What is the right way to solve this?
var OriginalText;
function parseText(event) {
// get text
var bodyHtml = event.composeView.getBodyElement();
var stringBodyHtml = extractStringFromNode(bodyHtml);
console.log(stringBodyHtml)
var text = extractContentFromString(stringBodyHtml);
OriginalText = text;
console.log(text);
// now parse it
format(text);
}
// extract text from html
function extractContentFromString(s) {
var span = document.createElement('span');
span.innerHTML = s;
return span.textContent || span.innerText;
};
// from html node to string
function extractStringFromNode ( node ) {
var tmpNode = document.createElement( "div" );
tmpNode.appendChild( node.cloneNode( true ) );
var str = tmpNode.innerHTML;
tmpNode = node = null; // prevent memory leaks in IE
return str;
}
// Expected result: HelloHelloHelloHello

Use the CSS property white-space and set it to pre. This will preserve line breaks, however, you must place it in every place you want to have the text with similar spacing.
let div = document.querySelector("div");
//this div has CSS styling white-space: pre
span = document.body.appendChild( document.createElement("span") );
//this span does not
span.textContent = div.textContent;
//div will be "formatted" with line breaks. Span will not.
div {
white-space: pre;
}
<div>This
Is
A
Demo
</div>
<hr/>

Highlight Ajax Response with Javascript

I'm trying to highlight a query inside a text coming from an ajax response, before constructing HTML with it and pasting that into the DOM. Right now I'm using this code snippet:
function highlightWords(line, word, htmltag) {
var tag = htmltag || ["<b>", "</b>"];
var regex = new RegExp('(' + preg_quote(word) + ')', 'gi');
return line.replace(regex, tag[0] + "$1" + tag[1]);
}
function preg_quote(str) {
return (str + '').replace(/([\\\.\+\*\?\[\^\]\$\(\)\{\}\=\!\<\>\|\:])/g, "\\$1");
}
However, this is not capeable of highlighting different words if the query is something like sit behind. It will only highlight the complete phrase and not the single words. It also doesn't care about HTML tags and that produces unpretty results if the query is span for example...
I've found various libraries which handle highlighting way better, like https://markjs.io/ or https://www.the-art-of-web.com/javascript/search-highlight/
Those libraries though always want to highlight content which is already present in the DOM.
My search gets an ajax response, which I then turn into HTML with JS and paste the complete HTMLString into a parent container using DOM7 (which is similar to jQuery). Therfor I would prefer to highlight the text before creating the HTMLString and pasting it in the DOM.
Any ideas?

I just make the highlight in the response of ajax request. It's works for me:
$.ajax({
url : url,
type : 'POST',
success: function(response) {
// Highlight
let term = 'word';
$context = $("#selector");
$context.show().unmark();
if (term){
$context.mark(term, {
done: function() {
$context.not(":has(mark)").hide();
}
});
}
}
});

Snippet style: Warning: this uses DOM7 as per Question
Overview: Instead of appending the whole text as HTML string to your #container,
Append the portions of normal text, as text, and the highlighted elements as elements, so you can style them at will.
var text // your ajax text response
var strQuery = 'sit behind' // your query string
var queryWords = strQuery.split(' ')
var textWords = text.split(' ')
var bufferNormalWords = []
textWords.forEach(function (word) {
if (queryWords.indexOf(word) != -1) { // found
var normalWords = bufferNormalWords.splice(0, buffer.length) // empty buffer
// Your DOM7 commands
$$('#container').add('span').text(normalWords.join(' ')) // normal text
$$('#container').add('span').css('color', 'red').text(word + ' ') // why not red
}
else bufferNormalWords.push(word)
})
Do not mess up with text becoming HTMLStrings, just set text, and create the necesary elements to style them as you want with your DOM7.

If your ajax response contains html, I don't think there's an easy way to get around creating DOM elements first. Below gets the job done, even in the case where span is in the query and the ajax results contain <span>
function highlightWords(line, word, htmltag) {
var words = word.split(/\s+/);
var tag = htmltag || ["<b>", "</b>"];
var root = document.createElement("div");
root.innerHTML = line;
root = _highlightWords(words, tag, root);
return root.innerHTML;
}
// Recursively search the created DOM element
function _highlightWords(words, htmlTag, el) {
var children = [];
el.childNodes.forEach(function(el) {
if (el.nodeType != 3) { // anything other than Text Type
var highlighted = _highlightWords(words, htmlTag, el);
children.push(highlighted);
} else {
var line = _highlight(el.textContent, words, htmlTag);
var span = document.createElement("span");
span.innerHTML = line;
children.push(span);
}
});
// Clear the html of the element, so the new children can be added
el.innerHTML = "";
children.forEach(function (c) { el.appendChild(c)});
return el;
}
// Find and highlight any of the words
function _highlight(line, words, htmlTag) {
words.forEach(function(singleWord) {
if (!!singleWord) {
singleWord = htmlEscape(singleWord);
line = line.replace(singleWord, htmlTag[0] + singleWord + htmlTag[1]);
}
});
return line;
}

I think you were on the right track using a library for that.
I have been using for that a great library named mark.js.
It works without dependencies or with jQuery.
The way that you can make it work.
Make the AJAX call.
Load the string to the DOM.
Call the Mark.js API on the content you have loaded.
Here's a code snippet:
document.addEventListener('DOMContentLoaded', getText);
function getText() {
const headline = document.getElementsByTagName("h1")[0];
const p = document.getElementsByTagName("p")[0];
fetch('https://jsonplaceholder.typicode.com/posts/1').
then(response => response.json()).
then(json => {
console.log(json);
headline.innerHTML = json.title;
p.innerHTML = json.body;
addMark('aut facere');
});
}
function addMark(keyword) {
var markInstance = new Mark(document.querySelector('.context'));
var options = {
separateWordSearch: true
};
markInstance.unmark({
done: function() {
markInstance.mark(keyword, options);
},
});
}
<script src="https://cdn.jsdelivr.net/mark.js/8.6.0/mark.min.js"></script>
<div class="context">
<h1></h1>
<p></p>
</div>

dom wrapping a substring in textnode with a new span node

Suppose, I have this paragraph:
<p>this is a paragraph containing link to an image at http://lol/atme.png :)</p>
I want to replace http://lol/atme.png with an image element.
How do I do that?
Its like removing the text, but adding a image element in place of that text.
Help will be greatly appreciated.

There are two parts to this. The first is the extraction of the URLs from the text, which is a tricky issue I'm not that interested in. I would do some research before using this in production. For now, I'll use an extremely simple illustrative regex.
The second part is the code for doing the replacement within text nodes. I answered a related question the other day with some reusable code and now I'm getting to reuse it. Yay.
function createImage(matchedTextNode) {
var el = document.createElement("img");
el.src = matchedTextNode.data;
el.width = 30;
el.height = 20;
return el;
}
function surroundInElement(el, regex, surrounderCreateFunc) {
var child = el.lastChild;
while (child) {
if (child.nodeType == 1) {
surroundInElement(child, regex, createImage);
} else if (child.nodeType == 3) {
surroundMatchingText(child, regex, surrounderCreateFunc);
}
child = child.previousSibling;
}
}
function surroundMatchingText(textNode, regex, surrounderCreateFunc) {
var parent = textNode.parentNode;
var result, surroundingNode, matchedTextNode, matchLength, matchedText;
while ( textNode && (result = regex.exec(textNode.data)) ) {
matchedTextNode = textNode.splitText(result.index);
matchedText = result[0];
matchLength = matchedText.length;
textNode = (matchedTextNode.length > matchLength) ?
matchedTextNode.splitText(matchLength) : null;
surroundingNode = surrounderCreateFunc(matchedTextNode.cloneNode(true));
parent.insertBefore(surroundingNode, matchedTextNode);
parent.removeChild(matchedTextNode);
}
}
var urlRegex = /http(s?):\/\/($|[^\s]+)/;
function replaceImageUrls(el) {
surroundInElement(el, urlRegex, createImage);
}
<div id="s">One
http://www.google.co.uk/images/logos/ps_logo2.png
two
http://www.google.co.uk/images/logos/ps_logo2.png three</div>
<input type="button" onclick="replaceImageUrls(document.getElementById('s'))" value="replace">

i might misunderstand your question.
From what i understand, could use a div as a placeholder
//HTML
<p>
<div id="holder"><a>link to image</a></div></p>
//js
var h = document.getElementById("holder");
if(h)h.innerHTML = "<img.....>" //the image tag

We Keep Coding

JavaScript is the programming language of the Web.

Get text from HTML with appropriate whitespace - javascript

Related

highlight the .replace() word in javascript

Javascript: How to check/compare if a String contains some tag?

How to extract text from an HTML node and maintain the correct space formatting with JavaScript?

Highlight Ajax Response with Javascript

dom wrapping a substring in textnode with a new span node

Categories

Resources