Searching through international characters with Chosen.js - javascript

Has anyone found a fix for searching through international characters using chosen.js?
for example if I have a multi-select field with the following options:
- A French Château
- An English Chateau
And I search for "Cha"
Only the english will show up.

I don't know, if it is not too late for answering, but it could help someone else probably.
The point is to strip diacritics from entered string and from matched option string during comparison.
My modification is working with jquery Chosen plugin in version 1.1.0, downloaded from github.
First you have to have a function to strip diacritics (i've added it after winnow_results())
// strip czech diacritics from string
AbstractChosen.prototype.strip_diacritics= function(string) {
var
translationTable= [
// input with diacritics - add your characters if necessary
"éěÉĚřŘťŤžŽúÚůŮüÜíÍóÓáÁšŠďĎýÝčČňŇäÄĺĹľĽŕŔöÖ",
// stripped output
"eeEErRtTzZuUuUuUiIoOaAsSdDyYcCnNaAlLlLrRoO",
],
output= '';
// scan through whole string
for (var i= 0; i < string.length; i++) {
var charPosition= translationTable[0].indexOf(string[i]);
output+= charPosition == -1 ? string[i] : translationTable[1][charPosition];
}
return output;
}
Then use it in appropriate places in winnow_results() function.
Referencing chosen.jquery.js lines:
line #315
searchText = this.get_search_text();
// replace with
searchText = this.strip_diacritics(this.get_search_text());
line #339
option.search_match = this.search_string_match(option.search_text, regex);
// replace with
option.search_match = this.search_string_match(this.strip_diacritics(option.search_text), regex);
and finally line #345
startpos = option.search_text.search(zregex);
// replace with
startpos = this.strip_diacritics(option.search_text).search(zregex);
And you're done :-).
(I feel like writing some "savegame byte replacement" instruction for extra lives in old DOS game)
Whole modified file at pastebin.
Sept 29th, 2016: Modifications made in Chosen 1.6.2, tested ok.

It doesn't look as though Chosen does this by default - there's no functionality in the code for this.
The source code is here. I've posted the search function below, which is responsible for checking the characters. There's nothing in this function that deals with close characters like that, so you'd either have to write it or request that feature from the Chosen team. This is because, at the core, accented characters and non-accented characters don't have the same ASCII (or Unicode) values. You'd have to have some type of lookup table, and parse that for each character to return "fuzzy" results.
Sorry I couldn't be of more help. I'm sure if you could find a way to modify this function, you could get this working. Again, you'd need lookup tables or something for the underlying code values. Best of luck.
Edit: You may not need lookup tables - perhaps the Regex functionality has a built-in way to do this. Alternatively, you may be able to simply match on anything close to the letter you're searching for.
Chosen.prototype.winnow_results = function() {
var found, option, part, parts, regex, regexAnchor, result, result_id, results, searchText, startpos, text, zregex, _i, _j, _len, _len1, _ref;
this.no_results_clear();
results = 0;
searchText = this.search_field.val() === this.default_text ? "" : $('<div/>').text($.trim(this.search_field.val())).html();
regexAnchor = this.search_contains ? "" : "^";
regex = new RegExp(regexAnchor + searchText.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&"), 'i');
zregex = new RegExp(searchText.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&"), 'i');
_ref = this.results_data;
for (_i = 0, _len = _ref.length; _i < _len; _i++) {
option = _ref[_i];
if (!option.disabled && !option.empty) {
if (option.group) {
$('#' + option.dom_id).css('display', 'none');
} else if (!(this.is_multiple && option.selected)) {
found = false;
result_id = option.dom_id;
result = $("#" + result_id);
if (regex.test(option.html)) {
found = true;
results += 1;
} else if (this.enable_split_word_search && (option.html.indexOf(" ") >= 0 || option.html.indexOf("[") === 0)) {
parts = option.html.replace(/\[|\]/g, "").split(" ");
if (parts.length) {
for (_j = 0, _len1 = parts.length; _j < _len1; _j++) {
part = parts[_j];
if (regex.test(part)) {
found = true;
results += 1;
}
}
}
}
if (found) {
if (searchText.length) {
startpos = option.html.search(zregex);
text = option.html.substr(0, startpos + searchText.length) + '</em>' + option.html.substr(startpos + searchText.length);
text = text.substr(0, startpos) + '<em>' + text.substr(startpos);
} else {
text = option.html;
}
result.html(text);
this.result_activate(result);
if (option.group_array_index != null) {
$("#" + this.results_data[option.group_array_index].dom_id).css('display', 'list-item');
}
} else {
if (this.result_highlight && result_id === this.result_highlight.attr('id')) {
this.result_clear_highlight();
}
this.result_deactivate(result);
}
}
}
}
if (results < 1 && searchText.length) {
return this.no_results(searchText);
} else {
return this.winnow_results_set_highlight();
}
};

Related

Javascript: Split a string by comma, except inside parentheses

Given string in the form:
'"abc",ab(),c(d(),e()),f(g(),zyx),h(123)'
How can I split it to get the below array format:
abc
ab()
c(d(),e())
f(g(),zyx)
h(123)
I have tried normal javascript split, however it doesn't work as desired. Trying Regular Expression but not yet successful.
You can keep track of the parentheses, and add those expressions when the left and right parens equalize.
For example-
function splitNoParen(s){
var left= 0, right= 0, A= [],
M= s.match(/([^()]+)|([()])/g), L= M.length, next, str= '';
for(var i= 0; i<L; i++){
next= M[i];
if(next=== '(')++left;
else if(next=== ')')++right;
if(left!== 0){
str+= next;
if(left=== right){
A[A.length-1]+=str;
left= right= 0;
str= '';
}
}
else A=A.concat(next.match(/([^,]+)/g));
}
return A;
}
var s1= '"abc",ab(),c(d(),e()),f(g(),zyx),h(123)';
splitNoParen(s1).join('\n');
/* returned value: (String)
"abc"
ab()
c(d(),e())
f(g(),zyx)
h(123)
*/
This might be not the best or more refined solution, and also maybe won't fit every single possibility, but based on your example it works:
var data = '"abc",ab(),c(d(),e()),f(g(),zyx),h(123)';
// Create a preResult splitting the commas.
var preResult = data.replace(/"/g, '').split(',');
// Create an empty result.
var result = [];
for (var i = 0; i < preResult.length; i++) {
// Check on every preResult if the number of parentheses match.
// Opening ones...
var opening = preResult[i].match(/\(/g) || 0;
// Closing ones...
var closing = preResult[i].match(/\)/g) || 0;
if (opening != 0 &&
closing != 0 &&
opening.length != closing.length) {
// If the current item contains a different number of opening
// and closing parentheses, merge it with the next adding a
// comma in between.
result.push(preResult[i] + ',' + preResult[i + 1]);
i++;
} else {
// Leave it as it is.
result.push(preResult[i]);
}
}
Demo
For future reference, here's another approach to top-level splitting, using string.replace as a control flow operator:
function psplit(s) {
var depth = 0, seg = 0, rv = [];
s.replace(/[^(),]*([)]*)([(]*)(,)?/g,
function (m, cls, opn, com, off, s) {
depth += opn.length - cls.length;
var newseg = off + m.length;
if (!depth && com) {
rv.push(s.substring(seg, newseg - 1));
seg = newseg;
}
return m;
});
rv.push(s.substring(seg));
return rv;
}
console.log(psplit('abc,ab(),c(d(),e()),f(g(),zyx),h(123)'))
["abc", "ab()", "c(d(),e())", "f(g(),zyx)", "h(123)"]
Getting it to handle quotes as well would not be too complicated, but at some point you need to decide to use a real parser such as jison, and I suspect that would be the point. In any event, there's not enough detail in the question to know what the desired handling of double quotes is.
You can't use .split for this, but instead you'll have to write a small parser like this:
function splitNoParen(s){
let results = [];
let next;
let str = '';
let left = 0, right = 0;
function keepResult() {
results.push(str);
str = '';
}
for(var i = 0; i<s.length; i++) {
switch(s[i]) {
case ',':
if((left === right)) {
keepResult();
left = right = 0;
} else {
str += s[i];
}
break;
case '(':
left++;
str += s[i];
break;
case ')':
right++;
str += s[i];
break;
default:
str += s[i];
}
}
keepResult();
return results;
}
var s1= '"abc",ab(),c(d(),e()),f(g(),zyx),h(123)';
console.log(splitNoParen(s1).join('\n'));
var s2='cats,(my-foo)-bar,baz';
console.log(splitNoParen(s2).join('\n'));
Had a similar issue and existing solutions were hard to generalize. So here's another parser that's a bit more readable and easier to extend to your personal needs. It'll also work with curly braces, brackets, normal braces, and strings of any type. License is MIT.
/**
* This function takes an input string and splits it by the given token, but only if the token is not inside
* braces of any kind, or a string.
* #param {string} input The string to split.
* #param {string} split_by Must be a single character.
* #returns {string[]} An array of split parts without the split_by character.
*/
export function parse_split(input:string, split_by:string = ",") : string[]
{
// Javascript has 3 types of strings
const STRING_TYPES = ["'","`","\""] as const;
// Some symbols can be nested, like braces, and must be counted
const state = {"{":0,"[":0,"(":0};
// Some cannot be nested, like a string, and just flip a flag.
// Additionally, once the string flag has been flipped, it can only be unflipped
// by the same token.
let string_state : (typeof STRING_TYPES)[number] | undefined = undefined
// Nestable symbols come in sets, usually in pairs.
// These sets increase or decrease the state, depending on the symbol.
const pairs : Record<string,[keyof typeof state,number]> = {
"{":["{",1],
"}":["{",-1],
"[":["[",1],
"]":["[",-1],
"(":["(",1],
")":["(",-1]
}
let start = 0;
let results = [];
let length = input.length;
for(let i = 0; i < length; ++i)
{
let char = input[i];
// Backslash escapes the next character. We directly skip 2 characters by incrementing i one extra time.
if(char === "\\")
{
i++;
continue;
}
// If the symbol exists in the single/not nested state object, flip the corresponding state flag.
if(char == string_state)
{
string_state = undefined;
console.log("Closed string ", string_state);
}
// if it's not in a string, but it's a string opener, remember the string type in string_state.
else if(string_state === undefined && STRING_TYPES.includes(char as typeof STRING_TYPES[number]))
{
string_state = char as typeof STRING_TYPES[number];
}
// If it's not in a string, and if it's a paired symbol, increase or decrease the state based on our "pairs" constant.
else if(string_state === undefined && (char in pairs) )
{
let [key,value] = pairs[char];
state[key] += value;
}
// If it's our split symbol...
else if(char === split_by)
{
// ... check whether any flags are active ...
if(Object.entries(state).every(([k,v])=>v == 0) && (string_state === undefined))
{
// ... if not, then this is a valid split.
results.push(input.substring(start,i))
start = i+1;
}
}
}
// Add the last segment if the string didn't end in the split_by symbol, otherwise add an empty string
if(start < input.length)
{
results.push(input.substring(start,input.length))
}
else
results.push("");
return results;
}
With this regex, it makes the job:
const regex = /,(?![^(]*\))/g;
const str = '"abc",ab(),c(d(),e()),f(g(),zyx),h(123)';
const result = str.split(regex);
console.log(result);
Javascript
var str='"abc",ab(),c(d(),e()),f(g(),zyx),h(123)'
str.split('"').toString().split(',').filter(Boolean);
this should work

if else statement in while loop, where if else statement determines when while loop stops

var userInput = prompt("type something with or without double spacing");
var errorSpaces = [];
var maxSpaceCount = [];
var doneOnce = 0;
var done = 0;
var size;
var tempArray = [0, 0];
while (done === 0) {
if (doneOnce === 0) {
for (var i = 0; i<size; i++) {
size = userInput.length - 1;
if (userInput.substr(i, 2) == " ") {
userInput.replace(userInput.substring(i, j), userInput[i]);
errorSpaces.push(0);
}
}
doneOnce = 1;
maxSpaceCount.push(0);
} else if (doneOnce === 1 && tempArray.length != 1) {
for (var i = 0; i<size; i++) {
tempArray = [0];
size = userInput.length - 1;
if (userInput.substr(i, 2) == " ") {
userInput.replace(userInput.substring(i, j), userInput[i]);
tempArray.push(0);
}
doneOnce = 2;
}
maxSpaceCount.push(0);
} else {
done = 1;
}
}
alert("done");
This loops at the second for loop rather than finishing. I know it probably isn't the best way to do it, but how could I make the 'else if' work so that when there are no more double spaces, it will go to the final else?
I am trying to eliminate any multiple spaces by iteratively replacing double spaces with single spaces, then re-reading to replace further double (previously triple) spaces, etc.
Way , way too much code if your goal is to replace any double (or more spaces) with a single space
try regex
var userInput = prompt("type something with or without double spacing");
userInput = userInput.replace(/\s{2,}/g, ' ');
alert("done");
although not quite sure what you are trying to do with tempArray as it doesn't seem to make sense.
EDIT
There appears to be some indication that there is a requirement to count how many occurrences of 2 or more spaces, so using the below will give you the count. The reason for the || bit is because if none are found, it will return null, || [] will change the null to empty array, so the length of it will be zero. Thanks to #RobG
var countOfMultipleSpaces = (userInput.match(/\s{2,}/g) || []).length;
I'm sure it goes without saying that you have to do this before you replace them all
Is this what you want?
"type something with or without double spacing".replace(/\s{2,}/g, ' ');
//"type something with or without double spacing"

Javascript: matching a dynamic string against an array

I'm attempting to teach myself javascript. I chose something I assumed was simple, but ran into problems relatively quickly.
I'm attempting to search a string for another string given by the user.
My code so far is:
var source = "XREs2qqAQfjr6NZs6H5wkZdOES5mikexRkOPsj6grQiYNZfFoqXI4Nnc1iONKVrA";
var searchString = []; //the users input
searchString = prompt("Enter search string");
var hits = [];
var one = 0;
var two = 0;
var k = 0;
var sourceSearch = function(text) {
for(i = 0; i < source.length; i++) { //for each character in the source
if(source[i] === searchString[0]) { //if a character in source matches the first element in the users input
one = source.indexOf(i); //confused from here on
for(p = searchString.length; p > 0; p--) {
}
}
}
};
sourceSearch(searchString);
My idea was:
check to see if the first loop finds a character that matches the first character in the user input
if it matches, check to see if the next X characters after the first match the next X characters in the source string
if they all match, push them to the hits array
My problem: I have no idea how to iterate along the arrays without nesting quite a few if statements, and even then, that wouldn't be sufficient, considering I want the program to work with any input.
Any ideas would be helpful. Thanks very much in advance.
Note: There are a few un-used variables from ideas I was testing, but I couldn't make them work.
You can try:
if (source.indexOf(searchString) !== -1) {
// Match!
}
else
{
//No Match!
}
As the other answers so far point out, JavaScript strings have an indexOf function that does what you want. If you want to see how it's done "by hand", you can modify your function like this:
var sourceSearch = function(text) {
var i, j, ok; // always declare your local variables. globals are evil!
// for each start position
for(i = 0; i < source.length; i++) {
ok = true;
// check for a match
for (j = searchString.length - 1; ok && j >= 0; --j) {
ok = source[i + j] === searchString[j];
}
if (ok) {
// searchString found starting at index i in source
}
}
};
This function will find all positions in source at which searchString was found. (Of course, you could break out of the loop on the first success.) The logic is to use the outer loop to advance to each candidate start position in source and use the inner loop to test whether that position actually is the position of a match to searchString.
This is not the best algorithm for searching strings. The built-in algorithm is much faster (both because it is a better algorithm and because it is native code).
to follow your approach, you can just play with 2 indexes:
var sourceSearch = function(text) {
j = 0;
for(i = 0; i < source.length; i++) {
if(source[i] === text[j]) {
j++;
} else {
j = 0;
}
if (j == text.length) {
console.log(i - j); //this prints the starting index of the matching substring
}
}
};
These answers are all pretty good, but I'd probably opt for something like this:
var source = "XREs2qqAQfjr6NZs6H5wkZdOES5mikexRkOPsj6grQiYNZfFoqXI4Nnc1iONKVrA";
var searchString = []; //the users input
searchString = prompt("Enter search string");
var hits = source.split(searchString);
var hitsCount = hits.length - 1;
This way you have all of the data you need to figure out where each hit occurred in he source, if that's important to you.

Extract keyphrases from text (1-4 word ngrams)

What's the best way to extract keyphrases from a block of text? I'm writing a tool to do keyword extraction: something like this. I've found a few libraries for Python and Perl to extract n-grams, but I'm writing this in Node so I need a JavaScript solution. If there aren't any existing JavaScript libraries, could someone explain how to do this so I can just write it myself?
I like the idea, so I've implemented it: See below (descriptive comments are included).
Preview at: https://jsfiddle.net/WsKMx
/*#author Rob W, created on 16-17 September 2011, on request for Stackoverflow (http://stackoverflow.com/q/7085454/938089)
* Modified on 17 juli 2012, fixed IE bug by replacing [,] with [null]
* This script will calculate words. For the simplicity and efficiency,
* there's only one loop through a block of text.
* A 100% accuracy requires much more computing power, which is usually unnecessary
**/
var text = "A quick brown fox jumps over the lazy old bartender who said 'Hi!' as a response to the visitor who presumably assaulted the maid's brother, because he didn't pay his debts in time. In time in time does really mean in time. Too late is too early? Nonsense! 'Too late is too early' does not make any sense.";
var atLeast = 2; // Show results with at least .. occurrences
var numWords = 5; // Show statistics for one to .. words
var ignoreCase = true; // Case-sensitivity
var REallowedChars = /[^a-zA-Z'\-]+/g;
// RE pattern to select valid characters. Invalid characters are replaced with a whitespace
var i, j, k, textlen, len, s;
// Prepare key hash
var keys = [null]; //"keys[0] = null", a word boundary with length zero is empty
var results = [];
numWords++; //for human logic, we start counting at 1 instead of 0
for (i=1; i<=numWords; i++) {
keys.push({});
}
// Remove all irrelevant characters
text = text.replace(REallowedChars, " ").replace(/^\s+/,"").replace(/\s+$/,"");
// Create a hash
if (ignoreCase) text = text.toLowerCase();
text = text.split(/\s+/);
for (i=0, textlen=text.length; i<textlen; i++) {
s = text[i];
keys[1][s] = (keys[1][s] || 0) + 1;
for (j=2; j<=numWords; j++) {
if(i+j <= textlen) {
s += " " + text[i+j-1];
keys[j][s] = (keys[j][s] || 0) + 1;
} else break;
}
}
// Prepares results for advanced analysis
for (var k=1; k<=numWords; k++) {
results[k] = [];
var key = keys[k];
for (var i in key) {
if(key[i] >= atLeast) results[k].push({"word":i, "count":key[i]});
}
}
// Result parsing
var outputHTML = []; // Buffer data. This data is used to create a table using `.innerHTML`
var f_sortAscending = function(x,y) {return y.count - x.count;};
for (k=1; k<numWords; k++) {
results[k].sort(f_sortAscending);//sorts results
// Customize your output. For example:
var words = results[k];
if (words.length) outputHTML.push('<td colSpan="3" class="num-words-header">'+k+' word'+(k==1?"":"s")+'</td>');
for (i=0,len=words.length; i<len; i++) {
//Characters have been validated. No fear for XSS
outputHTML.push("<td>" + words[i].word + "</td><td>" +
words[i].count + "</td><td>" +
Math.round(words[i].count/textlen*10000)/100 + "%</td>");
// textlen defined at the top
// The relative occurence has a precision of 2 digits.
}
}
outputHTML = '<table id="wordAnalysis"><thead><tr>' +
'<td>Phrase</td><td>Count</td><td>Relativity</td></tr>' +
'</thead><tbody><tr>' +outputHTML.join("</tr><tr>")+
"</tr></tbody></table>";
document.getElementById("RobW-sample").innerHTML = outputHTML;
/*
CSS:
#wordAnalysis td{padding:1px 3px 1px 5px}
.num-words-header{font-weight:bold;border-top:1px solid #000}
HTML:
<div id="#RobW-sample"></div>
*/
I do not know such a library in JavaScript but the logic is
split text into array
then sort and count
alternatively
split into array
create a secondary array
traversing each item of the 1st array
check whether current item exists in secondary array
if not exists
push it as a item's key
else
increase value having a key = to item sought.
HTH
Ivo Stoykov
function ngrams(seq, n) {
to_return = []
for (let i=0; i<seq.length-(n-1); i++) {
let cur = []
for (let j=i; j<seq.length && j<=i+(n-1); j++) {
cur.push(seq[j])
}
to_return.push(cur.join(''))
}
return to_return
}
> ngrams(['a', 'b', 'c'], 2)
['ab', 'bc']

Javascript word-count for any given DOM element

I'm wondering if there's a way to count the words inside a div for example. Say we have a div like so:
<div id="content">
hello how are you?
</div>
Then have the JS function return an integer of 4.
Is this possible? I have done this with form elements but can't seem to do it for non-form ones.
Any ideas?
g
If you know that the DIV is only going to have text in it, you can KISS:
var count = document.getElementById('content').innerHTML.split(' ').length;
If the div can have HTML tags in it, you're going to have to traverse its children looking for text nodes:
function get_text(el) {
ret = "";
var length = el.childNodes.length;
for(var i = 0; i < length; i++) {
var node = el.childNodes[i];
if(node.nodeType != 8) {
ret += node.nodeType != 1 ? node.nodeValue : get_text(node);
}
}
return ret;
}
var words = get_text(document.getElementById('content'));
var count = words.split(' ').length;
This is the same logic that the jQuery library uses to achieve the effect of its text() function. jQuery is a pretty awesome library that in this case is not necessary. However, if you find yourself doing a lot of DOM manipulation or AJAX then you might want to check it out.
EDIT:
As noted by Gumbo in the comments, the way we are splitting the strings above would count two consecutive spaces as a word. If you expect that sort of thing (and even if you don't) it's probably best to avoid it by splitting on a regular expression instead of on a simple space character. Keeping that in mind, instead of doing the above split, you should do something like this:
var count = words.split(/\s+/).length;
The only difference being on what we're passing to the split function.
Paolo Bergantino's second solution is incorrect for empty strings or strings that begin or end with whitespaces. Here's the fix:
var count = !s ? 0 : (s.split(/^\s+$/).length === 2 ? 0 : 2 +
s.split(/\s+/).length - s.split(/^\s+/).length - s.split(/\s+$/).length);
Explanation: If the string is empty, there are zero words; If the string has only whitespaces, there are zero words; Else, count the number of whitespace groups without the ones from the beginning and the end of the string.
string_var.match(/[^\s]+/g).length
seems like it's a better method than
string_var.split(/\s+/).length
At least it won't count "word " as 2 words -- ['word'] rather than ['word', '']. And it doesn't really require any funny add-on logic.
Or just use Countable.js to do the hard job ;)
document.deepText= function(hoo){
var A= [];
if(hoo){
hoo= hoo.firstChild;
while(hoo!= null){
if(hoo.nodeType== 3){
A[A.length]= hoo.data;
}
else A= A.concat(arguments.callee(hoo));
hoo= hoo.nextSibling;
}
}
return A;
}
I'd be fairly strict about what a word is-
function countwords(hoo){
var text= document.deepText(hoo).join(' ');
return text.match(/[A-Za-z\'\-]+/g).length;
}
alert(countwords(document.body))
Or you can do this:
function CountWords (this_field, show_word_count, show_char_count) {
if (show_word_count == null) {
show_word_count = true;
}
if (show_char_count == null) {
show_char_count = false;
}
var char_count = this_field.value.length;
var fullStr = this_field.value + " ";
var initial_whitespace_rExp = /^[^A-Za-z0-9]+/gi;
var left_trimmedStr = fullStr.replace(initial_whitespace_rExp, "");
var non_alphanumerics_rExp = rExp = /[^A-Za-z0-9]+/gi;
var cleanedStr = left_trimmedStr.replace(non_alphanumerics_rExp, " ");
var splitString = cleanedStr.split(" ");
var word_count = splitString.length -1;
if (fullStr.length <2) {
word_count = 0;
}
if (word_count == 1) {
wordOrWords = " word";
} else {
wordOrWords = " words";
}
if (char_count == 1) {
charOrChars = " character";
} else {
charOrChars = " characters";
}
if (show_word_count & show_char_count) {
alert ("Word Count:\n" + " " + word_count + wordOrWords + "\n" + " " + char_count + charOrChars);
} else {
if (show_word_count) {
alert ("Word Count: " + word_count + wordOrWords);
} else {
if (show_char_count) {
alert ("Character Count: " + char_count + charOrChars);
}
}
}
return word_count;
}
The get_text function in Paolo Bergantino's answer didn't work properly for me when two child nodes have no space between them. eg <h1>heading</h1><p>paragraph</p> would be returned as headingparagraph (notice lack of space between the words). So prepending a space to the nodeValue fixes this. But it introduces a space at the front of the text but I found a word count function that trims it off (plus it uses several regexps to ensure it counts words only). Word count and edited get_text functions below:
function get_text(el) {
ret = "";
var length = el.childNodes.length;
for(var i = 0; i < length; i++) {
var node = el.childNodes[i];
if(node.nodeType != 8) {
ret += node.nodeType != 1 ? ' '+node.nodeValue : get_text(node);
}
}
return ret;
}
function wordCount(fullStr) {
if (fullStr.length == 0) {
return 0;
} else {
fullStr = fullStr.replace(/\r+/g, " ");
fullStr = fullStr.replace(/\n+/g, " ");
fullStr = fullStr.replace(/[^A-Za-z0-9 ]+/gi, "");
fullStr = fullStr.replace(/^\s+/, "");
fullStr = fullStr.replace(/\s+$/, "");
fullStr = fullStr.replace(/\s+/gi, " ");
var splitString = fullStr.split(" ");
return splitString.length;
}
}
EDIT
kennebec's word counter is really good. But the one I've found includes a number as a word which is what I needed. Still, that's easy to add to kennebec's. But kennebec's text retrieval function will have the same problem.
This should account for preceding & trailing whitespaces
const wordCount = document.querySelector('#content').innerText.trim().split(/\s+/).length;
string_var.match(/[^\s]+/g).length - 1;

Categories