Retrieve inner text with spacing - javascript

I want to extract text out of an arbitrary block of HTML. Naive attempt:
$('<div><p>Some</p>Inner<div>Text</div></div>').text()
This gives SomeInnerText, but I want Some Inner Text
What is a better way to extract text out of HTML, while maintaining some concept of the visual structure with which the HTML would be rendered?
In the example above, new lines between block elements would be great & spaces could be a sort of "flattened" output.

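Use a regular expression to inject a space before every tag (the pattern matches each `<`, so both opening and closing tags get a leading space), then extract the text: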
$('<div><p>Some</p>Inner<div>Text</div></div>'.replace(/</g, ' <')).text();
Fiddle: http://jsfiddle.net/mattdlockyer/uau6S/

You can insert '&nbsp;' (non-breaking space) entities into your markup:
$('<div><p>Some </p>Inner <div>Text</div></div>').text();

Well, you can extend jQuery to do that:
/**
 * jQuery plugin: returns the concatenated text of every matched element,
 * padding the text of each non-inline element with one space on either side
 * so that adjacent blocks do not run together.
 *
 * An element counts as a block when its computed `display` is anything other
 * than 'none', 'inline', 'inline-block', 'inline-flex' or 'inline-table'.
 * Child elements are processed recursively; text nodes contribute their
 * nodeValue; every other node type contributes nothing.
 *
 * @returns {string} the space-padded text of the matched set.
 */
$.fn.textRespectingBlocks = function() {
  // display values that do NOT get surrounding spaces.
  var UNPADDED_DISPLAYS = ['none', 'inline', 'inline-block', 'inline-flex', 'inline-table'];

  // Text contribution of a single child node.
  function textOfNode(node) {
    switch (node.nodeType) {
      case 1: // element: recurse through the plugin
        return $(node).textRespectingBlocks();
      case 3: // text node
        return node.nodeValue;
      default:
        return '';
    }
  }

  return this.map(function() {
    var displayValue = $(this).css('display');
    var children = this.childNodes;
    var pieces = [];
    for (var k = 0; k < children.length; k++) {
      pieces.push(textOfNode(children[k]));
    }
    var inner = pieces.join('');
    if (UNPADDED_DISPLAYS.indexOf(displayValue) !== -1) {
      return inner;
    }
    return ' ' + inner + ' ';
  }).get().join('');
};
Do a .replace(/^\s+|\s+$|\s(?=\s)/g, '') on the result, if you like.

Simply adding the spaces yourself will do the trick. However, due to the variations in the way that html is parsed by different browsers, this may result in variations of white space across browsers.
$('<div> <p>Some</p> Inner <div>Text</div></div>').text()

Related

Trouble using string.replace() Javascript with Unicode Symbols

I'm writing a javascript function to move a pointing arrow from one html element to another. "&#9664" displays an arrow in HTML.
The trouble is that while I can add an arrow to innerHTML I can't seem to remove the arrow from the current selection.
Here is the relevent portion of my code:
// The element that currently carries the arrow, or undefined before the first call.
var current;

/**
 * Moves the arrow marker to `line`: strips the literal text " &#9664" from the
 * innerHTML of the previously marked element (if any), appends " &#9664" to
 * the innerHTML of `line`, and remembers `line` as the marked element.
 *
 * NOTE(review): the removal searches for the entity source text; whether a
 * browser's innerHTML getter still contains that text after parsing is exactly
 * what the surrounding discussion is about.
 *
 * @param {{innerHTML: string}} line element that should receive the arrow.
 */
function changeArrowFunction(line) {
  var previous = current;
  if (typeof previous !== 'undefined') {
    previous.innerHTML = previous.innerHTML.replace(" &#9664", "");
  }
  line.innerHTML += " &#9664";
  current = line;
}
I tried changing around the typeof condition or removing it completely with no sign of improvement, so it seems the problem is with replace().
The problem is that innerHTML does not preserve the HTML character references you wrote; reading it back gives you the decoded character instead.
If you log/alert the value of innerHTML you could see that the character ◀ is visible there not the string &#9664, so the replace function won't be able to find the character sequence to replace it.
// The element that currently carries the arrow, or undefined before the first call.
var current;

/**
 * Moves the arrow marker to `line`: removes the first " ◀" (space + the
 * literal arrow character) from the previously marked element's innerHTML,
 * appends " &#9664" to the innerHTML of `line`, and records `line` as the
 * marked element.
 *
 * @param {{innerHTML: string}} line element that should receive the arrow.
 */
function changeArrayFunction(line) {
  var marked = current;
  if (typeof marked !== 'undefined') {
    marked.innerHTML = marked.innerHTML.replace(" ◀", "");
  }
  line.innerHTML += " &#9664";
  current = line;
}

// Counter used by test() to build the ids d-1, d-2, ...
var c = 0;

/**
 * Demo driver: advances the counter and moves the arrow to the element whose
 * id is "d-" followed by the new counter value.
 */
function test() {
  c += 1;
  var target = document.getElementById('d-' + c);
  changeArrayFunction(target);
}
<div id="d-1">adf</div>
<div id="d-2">adf</div>
<div id="d-3">adf</div>
<div id="d-4">adf</div>
<button onclick="test();">Test</button>

Searching for most performant way for string replacing with javascript

I'm programming my own autocomplete textbox control using C# and javascript on clientside. On client side i want to replace the characters in string which matching the characters the user was searching for to highlight it. For example if the user was searching for the characters 'bue' i want to replace this letters in the word 'marbuel' like so:
mar<span style="color:#81BEF7;font-weight:bold">bue</span>l
in order to give the matching part another color. This works fine if I have 100-200 items in my autocomplete, but with 500 or more it takes too much time.
The following code shows my method which does the logic for this:
/**
 * Wraps the first case-insensitive occurrence of `part` inside `text` in a
 * highlighting <span> and returns the resulting HTML string.
 *
 * The text is scanned one character at a time; characters that match the
 * next expected character of `part` are buffered in `highlightPart` until the
 * whole of `part` has been seen, at which point the span is emitted once and
 * every remaining character is copied through unchanged.
 *
 * NOTE(review): neither `text` nor the highlighted part is HTML-escaped
 * before being concatenated into the returned markup.
 *
 * @param {string} text the item text to decorate.
 * @param {string} part the search string to highlight.
 * @returns {string} `text` with the first match wrapped in a styled span.
 */
HighlightTextPart: function (text, part) {
// Index into `part` of the next character that has to match.
var currentPartIndex = 0;
var partLength = part.length;
var finalString = '';
// Characters of `text` matched so far (kept in their original case).
var highlightPart = '';
// True once all of `part` has been matched.
var bFoundPart = false;
// True once the span for the match has been written to finalString.
var bFoundPartHandled = false;
var charToAdd;
for (var i = 0; i < text.length; i++) {
var myChar = text[i];
charToAdd = null;
if (!bFoundPart) {
var myCharLower = myChar.toLowerCase();
// NOTE(review): for an empty `part`, part[0] is undefined and this call
// throws as soon as `text` has at least one character.
var charToCompare = part[currentPartIndex].toLowerCase();
if (charToCompare == myCharLower) {
highlightPart += myChar;
if (currentPartIndex == partLength - 1)
bFoundPart = true;
currentPartIndex++;
}
else {
// NOTE(review): on a mismatch the buffered partial match is discarded, not
// appended to finalString, and the current character is not re-tested
// against part[0]. E.g. text "bbue" with part "bue" yields "bue" with no
// span. A partial match still pending when the loop ends is dropped too.
currentPartIndex = 0;
highlightPart = '';
charToAdd = myChar;
}
}
else
charToAdd = myChar;
// Emit the highlighted span exactly once, right after the match completes.
if (bFoundPart && !bFoundPartHandled) {
finalString += '<span style="color:#81BEF7;font-weight:bold">' + highlightPart + '</span>';
bFoundPartHandled = true;
}
if (charToAdd != null)
finalString += charToAdd;
}
return finalString;
},
This method only highlight the first occurence of the matching part.
I use it as follows. Once the request is coming back from server i build an html UL list with the matching items by looping over each item and in each loop i call this method in order to highlight the matching part.
As I said, for up to 100 items it works pretty nicely, but it is too slow for 500 or more.
Is there any way to make it faster? Maybe by using regex or some other technique?
I also thought about using "setTimeOut" to do it in a extra function or maybe do it only for the items, which currently are visible, because only a couple of items are visible while for the others you have to scroll.
Try limiting visible list size, so you are only showing 100 items at maximum for example. From a usability standpoint, perhaps even go down to only 20 items, so it would be even faster than that. Also consider using classes - see if it improves performance. So instead of
mar<span style="color:#81BEF7;font-weight:bold">bue</span>l
You will have this:
mar<span class="highlight">bue</span>l
String replacement in JavaScript is pretty easy with String.replace():
/**
 * Wraps the first match of `part` in `s` in a highlighting <span>; the matched
 * text is passed through htmlspecialchars() before being inserted.
 *
 * @param {string} s the text to decorate.
 * @param {string|RegExp} part the pattern handed to String.prototype.replace.
 * @returns {string} `s` with the match replaced by the span markup.
 */
function linkify(s, part)
{
  var SPAN_OPEN = '<span style="color:#81BEF7;font-weight:bold">';
  var SPAN_CLOSE = '</span>';

  // Replacement callback: only the matched text is used.
  function wrapMatch(matched) {
    return SPAN_OPEN + htmlspecialchars(matched) + SPAN_CLOSE;
  }

  return s.replace(part, wrapMatch);
}
/**
 * Escapes the characters that are significant in HTML text and double-quoted
 * attribute values: & < > "
 *
 * Every occurrence is replaced (global regular expressions, not the
 * first-match-only string form of replace), and '&' is handled first so the
 * ampersands introduced by the other replacements are not escaped again.
 *
 * @param {string} txt raw text.
 * @returns {string} text safe to insert into HTML content.
 */
function htmlspecialchars(txt)
{
  return txt.replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}
console.log(linkify('marbuel', 'bue'));
I fixed this problem by using regex instead of my method posted previous. I replace the string now with the following code:
return text.replace(new RegExp('(' + part + ')', 'gi'), "<span>$1</span>");
This is pretty fast, much faster than the code above; 500 items in the autocomplete seem to be no problem. But can anybody explain why this is so much faster than my method, or than doing it with string.replace without a regex? I have no idea.
Thx!

With javascript: What is the best way to block scripting without block html markups (<b>, <p>, etc.)?

I want to safely display a text coming from the user (by blocking scripts tags), but I need to accept html markups (b, p, li, ul, etc.).
It's need to be bullet proof against cross-site scripting attack.
Thank you!
If you have a simple tag whitelist and you don't need to worry about attacks at or below the encoding level (as is the case from within browser-side JavaScript), you can do the following:
/**
 * Sanitizes an HTML fragment against a tag whitelist.
 *
 * Whitelisted tags are kept (without any attributes), every other tag and
 * every HTML comment is removed, remaining markup-significant characters are
 * escaped, and the kept tags are balanced: stray close tags are dropped and
 * unclosed elements are closed.
 *
 * @param {Object<string, boolean>} tagWhitelist own, truthy keys are the
 *     lower-case tag names to keep, e.g. { p: true, b: true, br: true }.
 * @param {*} html the untrusted markup; coerced with String().
 * @returns {string} markup containing only whitelisted, attribute-free tags.
 * @throws {Error} if a '<' survives escaping (internal sanity check).
 */
function sanitize(tagWhitelist, html) {
  // Get rid of all uses of '[' so that the only "[n]" sequences left in the
  // string are the placeholders inserted below.
  html = String(html).replace(/\[/g, '&#91;');

  // Consider all uses of '<' and replace whitelisted tags with markers like
  // [1] which are indices into a list of approved tag names.
  // Non-whitelisted tags and comments are removed outright.
  // The 'i' flag lets upper-case tags (<B>, <SCRIPT>) be recognised as tags;
  // the name is lower-cased before the whitelist lookup.
  var tags = [];
  html = html.replace(
    /<!--[\s\S]*?-->|<(\/?)([a-z]\w*)(?:[^"'>]|"[^"]*"|'[^']*')*>/gi,
    function (_, close, tagName) {
      if (tagName) {
        tagName = tagName.toLowerCase();
        // Called through Object.prototype so a whitelist without a prototype
        // (or with a shadowed hasOwnProperty) still works.
        if (Object.prototype.hasOwnProperty.call(tagWhitelist, tagName) &&
            tagWhitelist[tagName]) {
          var index = tags.length;
          tags.push('<' + (close || '') + tagName + '>');
          return '[' + index + ']';
        }
      }
      return '';
    });

  // Escape HTML special characters. Leave entities alone: '&' and '#' are
  // deliberately not escaped, otherwise the '&#91;' produced above (and any
  // numeric entity in the input) would be corrupted.
  html = html.replace(/[<>"'`\u0000]/g,
    function (c) {
      switch (c) {
        case '<': return '&lt;';
        case '>': return '&gt;';
        case '"': return '&quot;';
        case '\'': return '&#39;';
      }
      // Backtick and NUL: numeric character reference.
      return '&#' + c.charCodeAt(0) + ';';
    });
  if (html.indexOf('<') >= 0) {  // Sanity check.
    throw new Error('sanitize: unescaped "<" left after escaping');
  }

  // Throw out any close tags that don't correspond to start tags.
  // If <table> is used for formatting, embedded HTML shouldn't be able
  // to use a mismatched </table> to break page layout.
  var open = [];  // Close tags still owed, innermost last.
  for (var i = 0, n = tags.length; i < n; ++i) {
    var tag = tags[i];
    if (tag.charAt(1) === '/') {
      var idx = open.lastIndexOf(tag);
      if (idx < 0) {
        tags[i] = '';  // Drop close tag.
      } else {
        // Close everything opened since the matching start tag, innermost
        // first, then forget those elements.
        tags[i] = open.slice(idx).reverse().join('');
        open.length = idx;
      }
    } else if (!HTML5_VOID_ELEMENTS.test(tag)) {
      open.push('</' + tag.substring(1));
    }
  }

  // Now html contains no tags or less-than characters that could become
  // part of a tag via a replacement operation and tags only contains
  // approved tags.
  // Reinsert the white-listed tags.
  html = html.replace(
    /\[(\d+)\]/g, function (_, index) { return tags[index]; });

  // Close any still open tags.
  // This prevents unclosed formatting elements like <ol> and <table> from
  // breaking the layout of containing HTML.
  return html + open.reverse().join('');
}

// Matches the start tag of an element that never takes a close tag.
var HTML5_VOID_ELEMENTS = new RegExp(
  '^<(?:area|base|br|col|command|embed|hr|img|input'
  + '|keygen|link|meta|param|source|track|wbr)\\b');
which can be used like
sanitize({ p: true, b: true, i: true, br: true },
"Hello, <b>World</b>!<script>alert(1337)<\/script>");
If you need more configurability, like the ability to allow attributes on tags, see the Caja HTML sanitizer.
As others have pointed out, your server should not trust the result coming from the client so you should re-sanitize on the server before embedding the result into server-generated markup.
If you are using javascript for user input it won't be bulletproof no matter what you do.
Assuming you're writing a server-side backend, you should use the tried and true bbcode, there must be a library for it.

Javascript split result simplify

Is there a way to make this code more simplified?
<input type="text" id="tags" />
var splittext = document.getElementById('tags').value.split(' ');
if (document.getElementById('tags').value.split(' ').length < 2 || splittext[1] == '') {
alert("Two tags required.");
}
is there another way to make
splittext[1] == ''
be like
document.getElementById('tags').value.split(' ').something[1]
to avoid using the line
var splittext = document.getElementById('tags').value.split(' ')
The purpose of this is when a user inputs one tag and made a space after it, the split detects 2 values which i would like to avoid the space being counted as another tag because that would be like, uhm, cheating.
Trim first, and split on any number of white space characters:
if (document.getElementById('tags').value.trim( ).split(/\s+/).length < 2) {
alert("Two tags required.");
}
You will need to create the String.trim function if you want to support some versions of IE though... but it's a useful function to have. Put this in a utility js file, or just at the top of your js:
// Polyfill: define String.prototype.trim only when the environment does not
// already provide it as a function. The fallback removes runs of whitespace
// at the start and at the end of the string.
if (typeof String.prototype.trim !== 'function') {
  String.prototype.trim = function() {
    var edgeWhitespace = /^\s+|\s+$/g;
    return this.replace(edgeWhitespace, '');
  };
}
You should change your code to this to avoid making multiple calls to the same dom element(you are splitting the same thing twice)
var splittext = document.getElementById('tags').value.split(' ');
if (splittext.length < 2 || splittext[1] == '') {
alert("Two tags required.");
}
This is the whole point of using variables, to avoid calling the same function(with the same results) multiple times.
Something like this should do the trick:
var elem=document.getElementById('tags').value;
if(elem.indexOf(' ')>-1 && elem.split(' ').length>=2) {
alert('Worked!');
} else if(!elem || elem.indexOf(' ')<0) {
alert('Two tags required.');
}
yeah :
var g= document.getElementById('tags').value.split(/[ ]+/)
if (g.length==2) // ok.....
http://jsbin.com/ovibef/edit#javascript,html

Javascript Regular Expression [Remove Events]

does anyone know of a good regular expression to remove events from html.
For example the string:
"<h1 onmouseover="top.location='http://www.google.com">Large Text</h1>
Becomes
"<h1>Large Text</h1>
So HTML tags are preserved but events like onmouseover, onmouseout, onclick, etc. are removed.
Thanks in Advance!
How about:
data.replace(/ on\w+="[^"]*"/g, '');
Edit from the comments:
This is intended to be run on your markup as a one time thing. If you're trying to remove events dynamically during the execution of the page, that's a slightly different story. A javascript library like jQuery makes it extremely easy, though:
$('*').unbind();
Edit:
Restricting this to only within tags is a lot harder. I'm not confident it can be done with a single regex expression. However, this should get you by if no one can come up with one:
var matched;
do
{
matched = false;
data = data.replace(/(<[^>]+)( on\w+="[^"]*")+/g,
function(match, goodPart)
{
matched = true;
return goodPart;
});
} while(matched);
Edit:
I surrender at writing a single regex for this. There must be some way to check the context of a match without actually capturing the beginning of the tag in your match, but my RegEx-fu is not strong enough. This is the most elegant solution I'm going to come up with:
data = data.replace(/<[^>]+/g, function(match)
{
return match.replace(/ on\w+="[^"]*"/g, '');
});
Here's a pure JS way to do it:
/**
 * Removes inline event-handler attributes from every tag in an HTML string.
 *
 * The string is scanned once. Inside a tag (a '<' followed by a letter, '?',
 * '\', '/' or '!'), any attribute whose name starts with "on" — in any letter
 * case, preceded by a whitespace character and outside a quoted value — is
 * cut out together with its value. Text outside tags and inside quoted
 * attribute values is left untouched.
 *
 * @param {string} html markup to clean.
 * @returns {string} the markup without on* attributes.
 */
function clean(html) {
  var strip = false;     // start index of the on* attribute being removed, or false
  var lastQuote = false; // quote character of the attribute value we are inside, or false

  // Whitespace characters that can separate attributes inside a tag.
  function isSpace(ch) {
    return ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r' || ch === '\f';
  }

  // True when `ch` may directly follow '<' in something we treat as a tag.
  function isValidTagChar(ch) {
    return /[a-z?\\\/!]/i.test(ch);
  }

  // Cuts html[strip .. j) out of the string and rewinds the scan position to
  // the cut point. Relies on the function-scoped loop variable `j`.
  function stripHTML() {
    html = html.slice(0, strip) + html.slice(j);
    j = strip;
    strip = false;
  }

  for (var i = 0; i < html.length; i++) {
    if (html[i] !== '<' || !html[i + 1] || !isValidTagChar(html[i + 1])) {
      continue;
    }
    i++;
    // Start every tag with fresh state so an unterminated tag or quote cannot
    // leak into the next one.
    strip = false;
    lastQuote = false;

    // Enter element
    for (var j = i; j < html.length; j++) {
      var ch = html[j];

      // End of the tag: drop a pending attribute, then resume the outer scan.
      if (!lastQuote && ch === '>') {
        if (strip !== false) {
          stripHTML();
        }
        i = j;
        break;
      }

      // Closing quote of the current attribute value.
      if (lastQuote === ch) {
        lastQuote = false;
        continue;
      }

      // Opening quote: only recognised directly after '='.
      if (!lastQuote && html[j - 1] === '=' && (ch === "'" || ch === '"')) {
        lastQuote = ch;
      }

      // Find on statements: whitespace + "on" in any case, outside quotes.
      if (!lastQuote && j >= 2 && isSpace(html[j - 2]) &&
          (html[j - 1] === 'o' || html[j - 1] === 'O') &&
          (ch === 'n' || ch === 'N')) {
        strip = j - 2;
      }

      // Whitespace after the attribute (and outside quotes) ends it.
      if (strip !== false && !lastQuote && isSpace(ch)) {
        stripHTML();
      }
    }
  }
  return html;
}

Categories