Regex to find price in HTML - javascript

Disclaimer: I know that parsing HTML with regex is not the correct approach. I am actually just trying to parse text inside the HTML.
I am parsing several pages, and I am looking for prices. Here is what I have so far:
// Every element under <body> except <script>, so inline code is never scanned.
var all = document.body.querySelectorAll(":not(script)");
// "$", then digits/commas, then an optional two-digit decimal part.
var regex = /\$[0-9,]+(\.[0-9]{2})?/g;
for (var i = 0; i < all.length; i++) {
  var children = all[i].childNodes;
  for (var j = 0; j < children.length; j++) {
    // Only the child nodes are inspected: an element's own nodeValue is
    // always null, so reading it on all[i] itself served no purpose.
    var node_value = children[j].nodeValue;
    // match() returns null when nothing matches, so a null check is enough.
    if (node_value !== null && node_value.match(regex) !== null) {
      alert("that's a match");
    }
  }
}
This particular code can get me prices like this:
<div>This is the current price: <span class="current">$60.00</span></div>
However, there are some prices that have the following structure:
<div>This is the current price: <sup>$</sup><span>80.00</span></div>
How could I improve the algorithm in order to find those prices? Shall I look in the first for loop for <sup>symbol</sup><span>price</span> with regex?
Important: Once there is a match, I need to find out which DOM element is holding that price, i.e. the innermost element that holds the price. For example:
<div><span>$80.00</span></div>
I would need to say that is the element that is holding the price, not the div.

Try this:
// Flatten the page to plain text so a price split across tags reads as one string.
var text = document.body.textContent || document.body.innerText;
// "$", optional whitespace, digits/commas, then an optional ".dd" that may be spaced out.
var regex = /\$\s*[0-9,]+(?:\s*\.\s*\d{2})?/g;
var match = text.match(regex);
if (match !== null) {
  // Report only the first hit, with any embedded whitespace removed.
  match = match[0].replace(/\s/g, "");
  alert("Match found: " + match);
}
Using a recursive search:
/**
 * Finds the innermost element whose text contains a dollar price.
 *
 * The text of a node includes the text of all its descendants, so a price
 * split across sibling tags (e.g. <sup>$</sup><span>80.00</span>) is matched
 * on their common parent.
 *
 * @param {Element} [node] - element to search; defaults to document.body
 * @returns {Element|false} the narrowest element containing a price, or
 *   false when the node's text holds no price
 */
function findPrice(node) {
  node = node || document.body;
  var regex = /\$\s*[0-9,]+(?:\s*\.\s*\d{2})?/;
  // Fall back to "" so a node with no text at all cannot make the match throw.
  var text = node.textContent || node.innerText || "";
  if (!regex.test(text)) {
    return false;
  }
  var children = node.children || [];
  for (var i = 0; i < children.length; i++) {
    // Return what the recursion found, not the child itself; otherwise the
    // result is never more than one level below the starting node.
    var narrower = findPrice(children[i]);
    if (narrower) {
      return narrower;
    }
  }
  // No child holds the whole price, so this node is the narrowest container.
  return node;
}
// Called without an argument, so the search starts at document.body.
var result = findPrice();

If you can choose your browser, you might use XPath to pre-select your candidates. The following code finds candidate nodes. I tried it in Firefox 25. You might also want to look at "What browsers support XPath 2.0?" and http://www.yaldex.com/ajax-tutorial-4/BBL0029.html for cross-browser approaches.
<html><head><script type="text/javascript">
// Alerts the text of every <span> under <body> that looks like a price:
// either a numeric span preceded by a <sup>$</sup> sibling, or a span whose
// own text contains both "$" and a digit/separator.
function func() {
  // span containing digits preceded by superscript dollar sign
  var xpathExpr1 = "//span[translate(text(),'0123456789.,','')!=text()][preceding-sibling::sup[text()='$']]";
  // span containing digits and starting with dollar sign
  var xpathExpr2 = "//span[translate(text(),'0123456789.,','')!=text() and contains(text(),'$')]";
  var xpathExpr3 = xpathExpr1 + "|" + xpathExpr2; // union
  var contextNode = document.body;
  var namespaceResolver = function(prefix){return "";}
  var resultType = XPathResult.UNORDERED_NODE_ITERATOR_TYPE;
  // Evaluate the union; evaluating xpathExpr1 alone would never report spans
  // such as <span>$129</span>.
  var xpathResult = document.evaluate(xpathExpr3, contextNode, namespaceResolver, resultType, null);
  alert(xpathResult);
  var node;
  while ((node = xpathResult.iterateNext()) != null) {
    alert(node.textContent);
  }
}
</script></head>
<body onload="func()"> aaa
<sup>$</sup><span>80.00</span> bbb
<span>$129</span> ccc
<sup>$</sup><span>ABC</span> ddd
</body></html>

Related

How to modify data-list element or alternative

I am working on an autocomplete text input by testing for string similarity, rather than checking for perfect character matches. This way, a dropdown like a datalists would still present the user with suggestions even if they accidentally add an extra character or spell their desired input wrong.
I have a working Javascript file that can compare the string input from an HTML text input to all the strings in a JSON file that holds about 700 school names as strings. The Javascript file then formats the HTML and passes the 10 most similar strings into an unordered list(for debugging) and into a data-list (where the user will be able to pick their correct answer).
However, datalists seem to have built-in autocomplete that check for identical groups of characters and the datalists will intelligently remove suggestions if the inputted string does not exist within the suggestion.
<input
type ="text"
id="search"
list="hsDropdown"
class ="form-control form-control-lg"
placeholder="High School Name"
autocomplete="off"
autofocus = "false"
/>
<hr/>
<p id="word"></p>
<datalist id ="hsDropdown"></datalist>
<ul id ="list"></ul>
</main>
<script src="js/script.js" type ="text/javascript"></script>
<script src="js/ukkonen/index.js" type ="text/javascript"></script>
The options within the datalist in my HTML are properly populated by my script.js with the most similar strings, but I need to find a way to override the property of the datalist tag that causes results with nonperfect matches to not appear, or
I would need to find an alternative way to make a dropdown list appear from a textbox that is not limited to hard auto-correct.
You could look at the select2 jQuery plugin and the Fuzzy search issue opened there
As per requestor, he has implemented the fuzzy_match function and embedded it into the plugin as the following:
I've also a function called matcher, which looks something like:
/**
 * Matcher passed to select2: scores one option against the typed term.
 *
 * @param {{term: (string|undefined)}} term - search params; `term.term` is the typed text
 * @param {{text: string}} text - the option being tested
 * @returns {{text: Object, score: number}|false} the option with its score
 *   (score 1 when nothing has been typed), or false when fuzzy_match rejects it
 */
function matcher(term, text) {
  var query = term.term;
  if (query === undefined) {
    return { text: text, score: 1 };
  }
  var outcome = fuzzy_match(query, text.text);
  if (!outcome[0]) {
    return false;
  }
  return { text: text, score: outcome[1] };
}
I also have a sorter, which sorts the matched elements, (so matching elements come at top)
/**
 * Drops rejected (falsy) entries, orders the rest by descending score and
 * returns only their `text` values.
 *
 * @param {Array<{text: *, score: number}|false>} data - matcher results
 * @returns {Array} the `text` of every kept entry, best score first
 */
function sorter(data) {
  const ranked = data.filter(Boolean);
  ranked.sort(function (left, right) {
    return right.score - left.score;
  });
  const texts = [];
  for (const entry of ranked) {
    texts.push(entry.text);
  }
  return texts;
}
And whenever we're invoking a select2 on a element, we're passing this matcher as a matcher option, and sorter as sorter option, which looks something like:
// Initialise select2 on the element, plugging in the custom matcher and sorter.
$("#element").select2({
  placeholder: 'select a name',
  matcher: matcher,
  sorter: sorter
});
Here is the fuzzy_match function code provided:
/**
 * Scores how well `pattern` fuzzy-matches `str`.
 *
 * The characters of `pattern` are looked for in `str` in order and
 * case-insensitively; they do not have to be adjacent. Bonuses are given for
 * adjacent matches, matches right after a separator ("_" or " ") and matches
 * on a camelCase boundary; penalties are applied for unmatched characters and
 * for characters preceding the first match.
 *
 * @param {string} pattern - characters to look for, in order
 * @param {string} str - text to search in
 * @returns {[boolean, number, string]} `[matched, score, formatted]`: whether
 *   every pattern character was found, the accumulated score, and `str` with
 *   each matched character wrapped in `<b>...</b>`
 */
function fuzzy_match(pattern, str) {
  // Score consts
  var adjacency_bonus = 55; // bonus for adjacent matches
  var separator_bonus = 10; // bonus if match occurs after a separator
  var camel_bonus = 10; // bonus if match is uppercase and prev is lower
  var leading_letter_penalty = -3; // penalty applied for every letter in str before the first match
  var max_leading_letter_penalty = -9; // maximum penalty for leading letters
  var unmatched_letter_penalty = -1; // penalty for every letter that doesn't match

  // Loop variables
  var score = 0;
  var patternIdx = 0;
  var patternLength = pattern.length;
  var strIdx = 0;
  var strLength = str.length;
  var prevMatched = false;
  var prevLower = false;
  var prevSeparator = true; // true so a match on the first letter gets the separator bonus

  // Use "best" matched letter if multiple string letters match the pattern
  var bestLetter = null;
  var bestLower = null;
  var bestLetterIdx = null;
  var bestLetterScore = 0;
  var matchedIndices = [];

  // Loop over strings
  while (strIdx !== strLength) {
    var patternChar = patternIdx !== patternLength ? pattern.charAt(patternIdx) : null;
    var strChar = str.charAt(strIdx);
    var patternLower = patternChar !== null ? patternChar.toLowerCase() : null;
    var strLower = strChar.toLowerCase();
    var strUpper = strChar.toUpperCase();

    var nextMatch = patternChar && patternLower === strLower;
    var rematch = bestLetter && bestLower === strLower;
    var advanced = nextMatch && bestLetter;
    var patternRepeat = bestLetter && patternChar && bestLower === patternLower;

    // The pattern moved on (or repeats the same letter): commit the best
    // candidate found so far for the previous pattern letter.
    if (advanced || patternRepeat) {
      score += bestLetterScore;
      matchedIndices.push(bestLetterIdx);
      bestLetter = null;
      bestLower = null;
      bestLetterIdx = null;
      bestLetterScore = 0;
    }

    if (nextMatch || rematch) {
      var newScore = 0;

      // Apply penalty for each letter before the first pattern match.
      // Math.max because penalties are negative, so max is the smallest penalty.
      if (patternIdx === 0) {
        var penalty = Math.max(strIdx * leading_letter_penalty, max_leading_letter_penalty);
        score += penalty;
      }

      // Apply bonus for consecutive matches
      if (prevMatched)
        newScore += adjacency_bonus;

      // Apply bonus for matches after a separator
      if (prevSeparator)
        newScore += separator_bonus;

      // Apply bonus across camel case boundaries. A character is a letter
      // exactly when its lower- and upper-case forms differ.
      if (prevLower && strChar === strUpper && strLower !== strUpper)
        newScore += camel_bonus;

      // Update pattern index only if the next pattern letter was matched
      if (nextMatch)
        ++patternIdx;

      // Update best letter in str which may be for a "next" letter or a "rematch"
      if (newScore >= bestLetterScore) {
        // Apply penalty for the now skipped letter
        if (bestLetter !== null)
          score += unmatched_letter_penalty;
        bestLetter = strChar;
        bestLower = bestLetter.toLowerCase();
        bestLetterIdx = strIdx;
        bestLetterScore = newScore;
      }
      prevMatched = true;
    }
    else {
      // Unmatched character. The formatted string is built after the loop
      // from matchedIndices, so nothing is appended here.
      score += unmatched_letter_penalty;
      prevMatched = false;
    }

    // A character is a lower-case letter when it equals its lower-case form
    // and that form differs from its upper-case form.
    prevLower = strChar === strLower && strLower !== strUpper;
    prevSeparator = strChar === '_' || strChar === ' ';
    ++strIdx;
  }

  // Apply score for last match
  if (bestLetter) {
    score += bestLetterScore;
    matchedIndices.push(bestLetterIdx);
  }

  // Build formatted string based on matched letters
  var formattedStr = "";
  var lastIdx = 0;
  for (var i = 0; i < matchedIndices.length; ++i) {
    var idx = matchedIndices[i];
    formattedStr += str.slice(lastIdx, idx) + "<b>" + str.charAt(idx) + "</b>";
    lastIdx = idx + 1;
  }
  // Finish out formatted string after the last matched letter
  formattedStr += str.slice(lastIdx);

  var matched = patternIdx === patternLength;
  return [matched, score, formattedStr];
}

Filter table data from first character of string in jquery/javascript

I have a table with some records and a textbox. I want to filter table data based on string entered in textbox on keyup event.
Currently I am using a code block which filter the table data but it search the record in table which exist anywhere in the string.
For example:- If I enter 'ab' in textbox it filter the table record with strings contains the keyword 'ab' like abcd, babd, cdab etc.
But my requirement is when I enter the keyword 'ab' in textbox it search only those string which starts from 'ab' like abcd, abdc etc.
Here is my current code:-
/**
 * Shows only the table rows whose fourth cell contains every typed word.
 *
 * @param {{value: string}} strKey - the search input; its value is split on spaces
 * @param {string} strGV - id of the table to filter (row 0 is never touched)
 */
function Search_Gridview(strKey, strGV) {
  const tokens = strKey.value.toLowerCase().split(" ");
  const table = document.getElementById(strGV);
  for (let rowIndex = 1; rowIndex < table.rows.length; rowIndex++) {
    const row = table.rows[rowIndex];
    const cellText = row.cells[3].innerHTML.toLowerCase();
    // A row stays visible only when each token occurs somewhere in the cell.
    const containsAll = tokens.every((token) => cellText.indexOf(token) >= 0);
    row.style.display = containsAll ? '' : 'none';
  }
}
Please help guys......
You can filter with jQuery the columns that contain a string beginning with e.g. "ab" of this way:
// Keep only the table cells whose markup begins with "ab".
var re = $("#TABLE_ID td").filter(function () {
  return this.innerHTML.startsWith("ab");
});
// Afterwards, the contents of each matching cell can be read like this.
re.map(function () {
  return this.innerHTML;
});
You can use RegExp's test method.
var stringData = [
  'aaa', 'aab', 'aac',
  'aba', 'abb', 'abc'
];
var searchPrefix = 'ab';

// Escapes regex metacharacters so the prefix is matched literally; without
// this a prefix such as "a.b" would also match "axb", and "(" would throw.
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

// Build the anchored pattern once instead of once per element.
var prefixPattern = new RegExp('^' + escapeRegExp(searchPrefix));
var result = stringData.filter(function (str) {
  // true if str starts with searchPrefix
  return prefixPattern.test(str);
});
console.log(result);
JavaScript Regexp Reference
This appears the most elegant solution.
To change search behavior from "exists anywhere in the data" into "data starts with ". You only need to change one single character, on one single line of your original code and nothing more.
Change this line from this..
if (rowData.toLowerCase().indexOf(strData[j]) >= 0)
into this...
if (rowData.toLowerCase().indexOf(strData[j]) == 0)
What it does is forces the indexOf() to address zero, instead of allowing mid-string matches.
Below is the whole (already modified) code for copy and paste into a project, such as a html table filter.
/**
 * Shows only the table rows whose fourth cell starts with the typed text.
 *
 * The input is split on spaces and every piece must be a prefix of the cell
 * text, so in practice this is a single-word prefix search.
 *
 * @param {{value: string}} strKey - the search input element
 * @param {string} strGV - id of the table to filter (row 0 is never touched)
 */
function Search_Gridview(strKey, strGV) {
  var strData = strKey.value.toLowerCase().split(" ");
  var tblData = document.getElementById(strGV);
  if (!tblData) {
    // Unknown table id: nothing to filter.
    return;
  }
  for (var i = 1; i < tblData.rows.length; i++) {
    var row = tblData.rows[i];
    var cell = row.cells[3];
    if (!cell) {
      // Rows without a fourth cell (e.g. a spanning row) are left as they are.
      continue;
    }
    var rowData = cell.innerHTML.toLowerCase();
    var matchesAll = strData.every(function (prefix) {
      return rowData.startsWith(prefix);
    });
    row.style.display = matchesAll ? '' : 'none';
  }
}
Search_Gridview() = the function's name.
strKey = the search input element (its value holds the characters typed)
strGV = ID of the HTML <table></table> element

Find smallest substring containing a given set of letters in a larger string

Say you have the following string:
FJKAUNOJDCUTCRHBYDLXKEODVBWTYPTSHASQQFCPRMLDXIJMYPVOHBDUGSMBLMVUMMZYHULSUIZIMZTICQORLNTOVKVAMQTKHVRIFMNTSLYGHEHFAHWWATLYAPEXTHEPKJUGDVWUDDPRQLUZMSZOJPSIKAIHLTONYXAULECXXKWFQOIKELWOHRVRUCXIAASKHMWTMAJEWGEESLWRTQKVHRRCDYXNT
LDSUPXMQTQDFAQAPYBGXPOLOCLFQNGNKPKOBHZWHRXAWAWJKMTJSLDLNHMUGVVOPSAMRUJEYUOBPFNEHPZZCLPNZKWMTCXERPZRFKSXVEZTYCXFRHRGEITWHRRYPWSVAYBUHCERJXDCYAVICPTNBGIODLYLMEYLISEYNXNMCDPJJRCTLYNFMJZQNCLAGHUDVLYIGASGXSZYPZKLAWQUDVNTWGFFY
FFSMQWUNUPZRJMTHACFELGHDZEJWFDWVPYOZEVEJKQWHQAHOCIYWGVLPSHFESCGEUCJGYLGDWPIWIDWZZXRUFXERABQJOXZALQOCSAYBRHXQQGUDADYSORTYZQPWGMBLNAQOFODSNXSZFURUNPMZGHTAJUJROIGMRKIZHSFUSKIZJJTLGOEEPBMIXISDHOAIFNFEKKSLEXSJLSGLCYYFEQBKIZZTQQ
XBQZAPXAAIFQEIXELQEZGFEPCKFPGXULLAHXTSRXDEMKFKABUTAABSLNQBNMXNEPODPGAORYJXCHCGKECLJVRBPRLHORREEIZOBSHDSCETTTNFTSMQPQIJBLKNZDMXOTRBNMTKHHCZQQMSLOAXJQKRHDGZVGITHYGVDXRTVBJEAHYBYRYKJAVXPOKHFFMEPHAGFOOPFNKQAUGYLVPWUJUPCUGGIXGR
AMELUTEPYILBIUOCKKUUBJROQFTXMZRLXBAMHSDTEKRRIKZUFNLGTQAEUINMBPYTWXULQNIIRXHHGQDPENXAJNWXULFBNKBRINUMTRBFWBYVNKNKDFR
I'm trying to find the smallest substring containing the letters ABCDA.
I tried a regex approach.
// Lazily match A…B…C…D…A in that fixed order, then log the shortest of the matches found.
console.log(str.match(/[A].*?[B].*?[C].*?[D].*?[A]/gm).sort((a, b) => a.length - b.length)[0]);
This works, but it only find strings where ABCDA appear (in that order). Meaning it won't find substring where the letters appear in a order like this: BCDAA
I'm trying to change my regex to account for this. How would I do that without using | and type out all the different cases?
You can't.
Let's consider a special case: Assume the letters you are looking for are A, A, and B. At some point in your regexp there will certainly be a B. However, the parts to the left and to the right of the B are independent of each other, so you cannot refer from one to the other. How many As are matched in the subexpression to the right of the B depends on the number of As being already matched in the left part. This is not possible with regular expressions, so you will have to unfold all the different orders, which can be many!
Another popular example that illustrates the problem is to match opening brackets with closing brackets. It's not possible to write a regular expression asserting that in a given string a sequence of opening brackets is followed by a sequence of closing brackets of the same length. The reason for this is that to count the brackets you would need a stack machine in contrast to a finite state machine but regular expressions are limited to patterns that can be matched using FSMs.
This algorithm doesn't use a regex, but found both solutions as well.
var haystack = 'FJKAUNOJDCUTCRHBYDLXKEODVBWTYPTSHASQQFCPRMLDXIJMYPVOHBDUGSMBLMVUMMZYHULSUIZIMZTICQORLNTOVKVAMQTKHVRIFMNTSLYGHEHFAHWWATLYAPEXTHEPKJUGDVWUDDPRQLUZMSZOJPSIKAIHLTONYXAULECXXKWFQOIKELWOHRVRUCXIAASKHMWTMAJEWGEESLWRTQKVHRRCDYXNTLDSUPXMQTQDFAQAPYBGXPOLOCLFQNGNKPKOBHZWHRXAWAWJKMTJSLDLNHMUGVVOPSAMRUJEYUOBPFNEHPZZCLPNZKWMTCXERPZRFKSXVEZTYCXFRHRGEITWHRRYPWSVAYBUHCERJXDCYAVICPTNBGIODLYLMEYLISEYNXNMCDPJJRCTLYNFMJZQNCLAGHUDVLYIGASGXSZYPZKLAWQUDVNTWGFFYFFSMQWUNUPZRJMTHACFELGHDZEJWFDWVPYOZEVEJKQWHQAHOCIYWGVLPSHFESCGEUCJGYLGDWPIWIDWZZXRUFXERABQJOXZALQOCSAYBRHXQQGUDADYSORTYZQPWGMBLNAQOFODSNXSZFURUNPMZGHTAJUJROIGMRKIZHSFUSKIZJJTLGOEEPBMIXISDHOAIFNFEKKSLEXSJLSGLCYYFEQBKIZZTQQXBQZAPXAAIFQEIXELQEZGFEPCKFPGXULLAHXTSRXDEMKFKABUTAABSLNQBNMXNEPODPGAORYJXCHCGKECLJVRBPRLHORREEIZOBSHDSCETTTNFTSMQPQIJBLKNZDMXOTRBNMTKHHCZQQMSLOAXJQKRHDGZVGITHYGVDXRTVBJEAHYBYRYKJAVXPOKHFFMEPHAGFOOPFNKQAUGYLVPWUJUPCUGGIXGRAMELUTEPYILBIUOCKKUUBJROQFTXMZRLXBAMHSDTEKRRIKZUFNLGTQAEUINMBPYTWXULQNIIRXHHGQDPENXAJNWXULFBNKBRINUMTRBFWBYVNKNKDFR';
var needle = 'ABCDA'; // the order of letters doesn't matter

/**
 * Finds every shortest substring of `text` that contains all letters of
 * `wanted`, in any order and counting repeated letters.
 *
 * For each starting position the nearest required occurrences of every
 * letter are located; the span from the first to the last of them is a
 * candidate. The search then restarts just after the candidate's first letter.
 *
 * @param {string} text - the string to search in
 * @param {string} wanted - the letters that must all be present
 * @returns {string[]} all substrings of minimal length; empty when `wanted`
 *   is empty or some letter cannot be found often enough
 */
function findShortestSubstrings(text, wanted) {
  if (wanted.length === 0) {
    // Without this guard the loop below would never terminate.
    return [];
  }

  // How many times each distinct letter is required.
  var letters = {};
  wanted.split('').forEach(function (ch) {
    letters[ch] = (letters[ch] || 0) + 1;
  });
  var letterKeys = Object.keys(letters); // unique letters

  var shortestLength = text.length;
  var shortest = []; // storage for found substrings
  var startingPos = 0;

  for (;;) {
    var posStart = text.length;
    var posEnd = 0;

    for (var k = 0; k < letterKeys.length; k++) {
      var ch = letterKeys[k];
      var currentPos = startingPos;
      for (var remaining = letters[ch]; remaining > 0; remaining--) {
        currentPos = text.indexOf(ch, currentPos);
        if (currentPos < 0) {
          // A required letter no longer occurs: no further windows exist.
          return shortest;
        }
        posStart = Math.min(currentPos, posStart);
        posEnd = Math.max(currentPos, posEnd);
        currentPos++;
      }
    }

    var length = posEnd - posStart + 1;
    var candidate = text.slice(posStart, posEnd + 1);
    if (length < shortestLength) {
      shortest = [candidate];
      shortestLength = length;
    } else if (length === shortestLength) {
      shortest.push(candidate);
    }
    startingPos = posStart + 1; // starting position for next iteration
  }
}

var shortestSubstrings = findShortestSubstrings(haystack, needle);
console.log(shortestSubstrings);
Maybe not as clear as using regex could be (well, for me regex are never really clear :D ) you can use brute force (not so brute)
Create an index of "valid" points of your string (those with the letters you want) and iterate with a double loop over it getting substrings containing at least 5 of those points, checking that they are valid solutions. Maybe not the most efficient way, but easy to implement, to understand, and probably to optimize.
var haystack="UGDVWUDDPRQLUZMSZOJPSIKAIHLTONYXAULECXXKWFQOIKELWOHRVRUCXIAASKHMWTMAJEWGEESLWRTQKVHRRCDYXNTLDSUPXMQTQDFAQAPYBGXPOLOCLFQNGNKPKOBHZWHRXAWAWJKMTJSLDLNHMUGVVOPSAMRUJEYUOBPFNEHPZZCLPNZKWMTCXERPZRFKSXVEZTYCXFRHRGEITWHRRYPWSVAYBUHCERJXDCYAVICPTNBGIODLYLMEYLISEYNXNMCDPJJRCTLYNFMJZQNCLAGHUDVLYIGASGXSZYPZKLAWQUDVNTWGFFYFFSMQWUNUPZRJMTHACFELGHDZEJWFDWVPYOZEVEJKQWHQAHOCIYWGVLPSHFESCGEUCJGYLGDWPIWIDWZZXRUFXERABQJOXZALQOCSAYBRHXQQGUDADYSORTYZQPWGMBLNAQOFODSNXSZFURUNPMZGHTAJUJROIGMRKIZHSFUSKIZJJTLGOEEPBMIXISDHOAIFNFEKKSLEXSJLSGLCYYFEQBKIZZTQQXBQZAPXAAIFQEIXELQEZGFEPCKFPGXULLAHXTSRXDEMKFKABUTAABSLNQBNMXNEPODPGAORYJXCHCGKECLJVRBPRLHORREEIZOBSHDSCETTTNFTSMQPQIJBLKNZDMXOTRBNMTKHHCZQQMSLOAXJQKRHDGZVGITHYGVDXRTVBJEAHYBYRYKJAVXPOKHFFMEPHAGFOOPFNKQAUGYLVPWUJUPCUGGIXGR";
var needle = "ABCD";
var size = haystack.length;
var candidate_substring = "";
var minimal_length = size;
var solutions = [];
var points = [];

// Index every position of the haystack that holds one of the wanted letters.
haystack.split("").forEach(function (letter, position) {
  if (needle.indexOf(letter) !== -1) points.push(position);
});

// Try every pair of indexed positions as the two ends of a candidate.
var limit_i = points.length - 4;
var limit_k = points.length;
for (var start = 0; start < limit_i; start++) {
  for (var end = start; end < limit_k; end++) {
    var span = points[end] - points[start] + 1;
    // Anything longer than the best length so far cannot be a solution.
    if (span > minimal_length) continue;
    candidate_substring = haystack.substr(points[start], span);
    if (!is_valid(candidate_substring)) continue;
    solutions.push(candidate_substring);
    minimal_length = Math.min(minimal_length, candidate_substring.length);
  }
}

document.write('<p>Solution length:' + minimal_length + '<p>');
// Candidates stored before the minimum shrank are filtered out here.
solutions
  .filter(function (solution) {
    return solution.length <= minimal_length;
  })
  .forEach(function (solution) {
    document.write('<p>Solution:' + solution + '<p>');
  });
/**
 * Tells whether a candidate contains every letter of the global `needle`
 * plus a second "A".
 *
 * @param {string} candidate_substring - substring to check
 * @returns {boolean} true when all letters are present and "A" occurs at
 *   least twice
 */
function is_valid(candidate_substring){
  // Verify we've got all characters. The loop runs over the needle, not the
  // candidate; otherwise letters past the candidate's length go unchecked.
  for(var j=0;j<needle.length;j++){
    if(candidate_substring.indexOf(needle.charAt(j))<0) return false;
  }
  // ...and verify we have two "A": first and last occurrence must differ.
  if(candidate_substring.indexOf("A")==candidate_substring.lastIndexOf("A")) return false;
  return true;
}
Just had this problem in an interview as a coding assignment and came up with another solution, (it's not as optimal as the one above but maybe it's easier to understand).
/**
 * Finds the shortest substring of N that contains every character of K,
 * counting repeated characters.
 *
 * A candidate sequence is opened at every position of N holding a wanted
 * character; each later wanted character is counted against all sequences
 * that are still open, and a sequence closes at the position where its last
 * missing character arrives.
 *
 * @param {[string, string]} strArr - `[N, K]`: the text and the characters to cover
 * @returns {string|Array} the shortest window (the earliest one on ties), or
 *   an empty array when no window exists
 */
function MinWindowSubstring(strArr) {
  const N = strArr[0];
  const K = strArr[1];

  // How many times each distinct character is required.
  const letters = {};
  K.split('').forEach((character) => {
    letters[character] = (letters[character] || 0) + 1;
  });
  const letterKeys = Object.keys(letters);
  const wanted = new Set(letterKeys);

  const isComplete = (remaining) => letterKeys.every((letter) => remaining[letter] <= 0);

  const sequences = [];
  for (let i = 0; i < N.length; i++) {
    const char = N[i];
    if (!wanted.has(char)) {
      continue;
    }

    // Count this character against every sequence that is still open.
    sequences.forEach((seq) => {
      if (seq.sequenceComplete) {
        return;
      }
      seq.remaining[char] -= 1;
      seq.lastIndex = i;
      seq.sequenceComplete = isComplete(seq.remaining);
    });

    // Open a new sequence starting here. It can already be complete when K
    // is a single character, so completeness is checked right away.
    const remaining = { ...letters };
    remaining[char] -= 1;
    sequences.push({
      startPoint: i,
      lastIndex: i,
      sequenceComplete: isComplete(remaining),
      remaining,
    });
  }

  // Pick the shortest completed sequence; strict "<" keeps the earliest on ties.
  let output = [];
  let minLength = N.length;
  sequences.forEach((seq) => {
    if (!seq.sequenceComplete) {
      return;
    }
    const span = seq.lastIndex - seq.startPoint;
    if (span < minLength) {
      minLength = span;
      output = N.substring(seq.startPoint, seq.lastIndex + 1);
    }
  });
  return output;
}

JavaScript count all elements on a webpage

I want to be able to count everything on a site that contains a certain string, even if it's not displayed on the page.
I found this:
window.occurrencesFunc()
I am not sure how to use it. I did play around with it, but nothing seemed to work.
This is what I tried:
// NOTE(review): as written this does not parse. The inner `function(r)` is a
// function statement without a name, and nothing calls it or supplies `r`.
function main() {
function(r) {
var amount = window.occurrencesFunc(r, "string", false);
$('ul.reset').append('Count: ' + amount);
}
}
main();
I'd get the entire page as a string:
var markup = document.documentElement.innerHTML;
And, then I'd use the match method to match for the string occurrences in the string and count them:
// The g (global) flag makes match() return every occurrence instead of only the first.
var resultArray = markup.match(/WORRDDD/g);
// match() yields null when nothing is found, so guard before reading length.
var count = resultArray ? resultArray.length : 0;
Shorthand, would be
// `|| []` substitutes an empty array when match() finds nothing and returns null.
var count = (markup.match(/WORRDDD/g) || []).length;
How about a function like this?
/**
 * Counts the elements of the document whose text contains `string`.
 *
 * An element's text includes the text of its descendants, so every ancestor
 * of a matching element is counted as well.
 *
 * @param {string|RegExp} string - text to look for; a string is matched
 *   literally, a RegExp is applied as a pattern
 * @returns {string} the result formatted as "Count: N"
 */
function findOccurrences (string)
{
  var elements = document.getElementsByTagName ('*');
  var isPattern = string instanceof RegExp;
  var foundCount = 0;
  for (var i = 0, il = elements.length; i < il; i++) {
    var text = elements[i].textContent;
    if (!text) {
      continue;
    }
    // Passing a plain string to match() would compile it as a regular
    // expression: "." would match any character and "(" would throw.
    var found = isPattern ? text.match (string) !== null : text.indexOf (string) !== -1;
    if (found) {
      foundCount++;
    }
  }
  return 'Count: ' + foundCount;
}
Use it like this, HTML...
<div id="occurrenceCount"></div>
And the JavaScript...
// Write the "Count: N" text returned by findOccurrences into the placeholder element.
var countElement = document.getElementById ('occurrenceCount');
countElement.textContent = findOccurrences ('your string');
This will make the <div> above contain something like:
Count: 35
I'd probably walk through element nodes present in the DOM
/**
 * Counts how many of the given elements have text containing `needle`.
 *
 * @param {string} needle - text to look for
 * @param {ArrayLike<{textContent: string}>} elements - elements to inspect,
 *   e.g. a NodeList
 * @returns {number} number of elements whose textContent includes `needle`
 */
function countElementsMatching(needle, elements) {
  var containsNeedle = function (element) {
    return element.textContent.indexOf(needle) !== -1;
  };
  // Borrow Array's filter so array-likes such as NodeList work as well.
  return Array.prototype.filter.call(elements, containsNeedle).length;
}
Usage:
var total = countElementsMatching('some string', document.querySelectorAll('*'));

Javascript word-count for any given DOM element

I'm wondering if there's a way to count the words inside a div for example. Say we have a div like so:
<div id="content">
hello how are you?
</div>
Then have the JS function return an integer of 4.
Is this possible? I have done this with form elements but can't seem to do it for non-form ones.
Any ideas?
g
If you know that the DIV is only going to have text in it, you can KISS:
// Splits the raw markup on single spaces; repeated, leading or trailing spaces add empty entries to the count.
var count = document.getElementById('content').innerHTML.split(' ').length;
If the div can have HTML tags in it, you're going to have to traverse its children looking for text nodes:
/**
 * Concatenates the text found under an element, skipping comment nodes.
 *
 * @param {Node} el - node whose child nodes are walked recursively
 * @returns {string} the values of all non-element, non-comment descendants,
 *   in document order
 */
function get_text(el) {
  var COMMENT_NODE = 8;
  var ELEMENT_NODE = 1;
  // Declared locally: assigning an undeclared `ret` creates a global that
  // every recursive call resets, and is an error in strict mode.
  var ret = "";
  var length = el.childNodes.length;
  for (var i = 0; i < length; i++) {
    var node = el.childNodes[i];
    if (node.nodeType !== COMMENT_NODE) {
      // Elements are descended into; any other node contributes its value.
      ret += node.nodeType !== ELEMENT_NODE ? node.nodeValue : get_text(node);
    }
  }
  return ret;
}
// Gather the element's text, then count the pieces separated by single spaces.
var words = get_text(document.getElementById('content'));
var count = words.split(' ').length;
This is the same logic that the jQuery library uses to achieve the effect of its text() function. jQuery is a pretty awesome library that in this case is not necessary. However, if you find yourself doing a lot of DOM manipulation or AJAX then you might want to check it out.
EDIT:
As noted by Gumbo in the comments, the way we are splitting the strings above would count two consecutive spaces as a word. If you expect that sort of thing (and even if you don't) it's probably best to avoid it by splitting on a regular expression instead of on a simple space character. Keeping that in mind, instead of doing the above split, you should do something like this:
var count = words.split(/\s+/).length;
The only difference being on what we're passing to the split function.
Paolo Bergantino's second solution is incorrect for empty strings or strings that begin or end with whitespaces. Here's the fix:
// Zero for an empty or whitespace-only string. Otherwise the split on
// whitespace runs is corrected for the empty entries that leading and
// trailing whitespace produce.
var count = !s ? 0 : (s.split(/^\s+$/).length === 2 ? 0 : 2 +
s.split(/\s+/).length - s.split(/^\s+/).length - s.split(/\s+$/).length);
Explanation: If the string is empty, there are zero words; If the string has only whitespaces, there are zero words; Else, count the number of whitespace groups without the ones from the beginning and the end of the string.
string_var.match(/[^\s]+/g).length
seems like it's a better method than
string_var.split(/\s+/).length
At least it won't count "word " as 2 words -- ['word'] rather than ['word', '']. And it doesn't really require any funny add-on logic.
Or just use Countable.js to do the hard job ;)
/**
 * Collects the data of every text node below `hoo`, in document order.
 *
 * @param {Node} hoo - node whose descendants are walked; a falsy value
 *   yields an empty array
 * @returns {string[]} one entry per text node found
 */
document.deepText = function deepText(hoo) {
  var A = [];
  if (hoo) {
    hoo = hoo.firstChild;
    while (hoo != null) {
      if (hoo.nodeType == 3) {
        A[A.length] = hoo.data;
      }
      // Recurse through the function's own name: arguments.callee is
      // deprecated and throws in strict mode.
      else A = A.concat(deepText(hoo));
      hoo = hoo.nextSibling;
    }
  }
  return A;
};
I'd be fairly strict about what a word is-
/**
 * Counts the words below a node, where a word is a run of ASCII letters,
 * apostrophes and hyphens.
 *
 * @param {Node} hoo - node passed on to document.deepText
 * @returns {number} number of words found; 0 when there are none
 */
function countwords(hoo){
  var text = document.deepText(hoo).join(' ');
  var words = text.match(/[A-Za-z\'\-]+/g);
  // match() returns null rather than an empty array when nothing matches.
  return words === null ? 0 : words.length;
}
// Show the word count for the whole page body.
alert(countwords(document.body))
Or you can do this:
/**
 * Counts the words in a form field and optionally reports the result.
 *
 * A word is a run of ASCII letters and digits; everything else is treated
 * as a separator.
 *
 * @param {{value: string}} this_field - the field whose value is counted
 * @param {boolean} [show_word_count=true] - alert the word count
 * @param {boolean} [show_char_count=false] - alert the character count
 * @returns {number} the number of words in the field
 */
function CountWords (this_field, show_word_count, show_char_count) {
  if (show_word_count == null) {
    show_word_count = true;
  }
  if (show_char_count == null) {
    show_char_count = false;
  }
  var char_count = this_field.value.length;
  // The trailing space guarantees that the last word is followed by a
  // separator, so the split below yields one entry too many.
  var fullStr = this_field.value + " ";
  var initial_whitespace_rExp = /^[^A-Za-z0-9]+/gi;
  var left_trimmedStr = fullStr.replace(initial_whitespace_rExp, "");
  var non_alphanumerics_rExp = /[^A-Za-z0-9]+/gi;
  var cleanedStr = left_trimmedStr.replace(non_alphanumerics_rExp, " ");
  var splitString = cleanedStr.split(" ");
  var word_count = splitString.length - 1;
  if (fullStr.length < 2) {
    word_count = 0;
  }
  // Declared locally; these used to leak into the global scope.
  var wordOrWords = word_count == 1 ? " word" : " words";
  var charOrChars = char_count == 1 ? " character" : " characters";
  // Logical &&, not bitwise &: truthy non-boolean flags must also work.
  if (show_word_count && show_char_count) {
    alert ("Word Count:\n" + " " + word_count + wordOrWords + "\n" + " " + char_count + charOrChars);
  } else if (show_word_count) {
    alert ("Word Count: " + word_count + wordOrWords);
  } else if (show_char_count) {
    alert ("Character Count: " + char_count + charOrChars);
  }
  return word_count;
}
The get_text function in Paolo Bergantino's answer didn't work properly for me when two child nodes have no space between them. eg <h1>heading</h1><p>paragraph</p> would be returned as headingparagraph (notice lack of space between the words). So prepending a space to the nodeValue fixes this. But it introduces a space at the front of the text but I found a word count function that trims it off (plus it uses several regexps to ensure it counts words only). Word count and edited get_text functions below:
/**
 * Concatenates the text found under an element, skipping comment nodes and
 * putting a space in front of every text piece so that the text of adjacent
 * elements does not run together.
 *
 * @param {Node} el - node whose child nodes are walked recursively
 * @returns {string} the space-prefixed values of all non-element,
 *   non-comment descendants, in document order
 */
function get_text(el) {
  var COMMENT_NODE = 8;
  var ELEMENT_NODE = 1;
  // Declared locally: assigning an undeclared `ret` creates a global that
  // every recursive call resets, and is an error in strict mode.
  var ret = "";
  var length = el.childNodes.length;
  for (var i = 0; i < length; i++) {
    var node = el.childNodes[i];
    if (node.nodeType !== COMMENT_NODE) {
      ret += node.nodeType !== ELEMENT_NODE ? ' ' + node.nodeValue : get_text(node);
    }
  }
  return ret;
}
/**
 * Counts the words in a string. Punctuation is stripped first, so only runs
 * of ASCII letters and digits count as words.
 *
 * @param {string} fullStr - text to count
 * @returns {number} number of words; 0 for empty, whitespace-only or
 *   punctuation-only input
 */
function wordCount(fullStr) {
  var cleaned = fullStr
    // Turn every whitespace run (tabs and line breaks included) into one
    // space first, so it still separates words after the next step.
    .replace(/\s+/g, " ")
    .replace(/[^A-Za-z0-9 ]+/g, "")
    .trim();
  // Splitting an empty string would give one empty "word".
  if (cleaned === "") {
    return 0;
  }
  // Removing punctuation can leave several spaces in a row ("a - b").
  return cleaned.split(/ +/).length;
}
EDIT
kennebec's word counter is really good. But the one I've found includes a number as a word which is what I needed. Still, that's easy to add to kennebec's. But kennebec's text retrieval function will have the same problem.
This should account for preceding & trailing whitespaces
// trim() drops the outer whitespace before splitting on whitespace runs; empty content still yields 1 because "".split() returns [""].
const wordCount = document.querySelector('#content').innerText.trim().split(/\s+/).length;
string_var.match(/[^\s]+/g).length - 1;

Categories