Count number of characters present in foreign language - javascript

Is there any optimal way to implement character count for non English letters? For example, if we take the word "Mother" in English, it is a 6 letter word. But if you type the same word(மதர்) in Tamil, it is a three letter word(ம+த+ர்) but the last letter(ர்) will be considered as two characters(ர+ஂ=ர்) by the system. So is there any way to count the number of real characters?
One clue is that if we move the cursor in keyboard into the word (மதர்), it will pass through 3 letters only and not into 4 chars considering by the system, so is there any way to find the solution by using this? Any help on this would be greatly appreciated...

Update
Back from lunch =)
I'm afraid that the previous won't work this well with any foreign language
So i added another fiddle with a possible way
var UnicodeNsm = [Array 1280] //It holds all escaped Unicode Non Space Marks
function countNSMString(str) {
var chars = str.split("");
var count = 0;
for (var i = 0,ilen = chars.length;i<ilen;i++) {
if(UnicodeNsm.indexOf(escape(chars[i])) == -1) {
count++;
}
}
return count;
}
var English = "Mother";
var Tamil = "மதர்";
var Vietnamese = "mẹ"
var Hindi = "मां"
function logL (str) {
console.log(str + " has " + countNSMString(str) + " visible Characters and " + str.length + " normal Characters" ); //"மதர் has 3 visible Characters"
}
logL(English) //"Mother has 6 visible Characters and 6 normal Characters"
logL(Tamil) //"மதர் has 3 visible Characters and 4 normal Characters"
logL(Vietnamese) //"mẹ has 2 visible Characters and 3 normal Characters"
logL(Hindi) //"मां has 1 visible Characters and 3 normal Characters"
So this just checks if theres any Character in the String which is a Unicode NSM character and ignores the count for this, this should work for the Most languages, not Tamil only,
And an array with 1280 Elements shouldn't be that big of a performance issue
Here is a list with the Unicode NSM's
http://www.fileformat.info/info/unicode/category/Mn/list.htm
Here is the according JSBin
After experimenting a bit with string operations, it turns out
String.indexOf returns the same for
"ர்" and for "ர"
meaning
"ர்ரர".indexOf("ர்") == "ர்ரர".indexOf("ர" + "்") //true but
"ர்ரர".indexOf("ர") == "ர்ரர".indexOf("ர" + "ர") //false
I took this opportunity and tried something like this
//ர்
var char = "ரர்ர்ரர்்";
var char2 = "ரரர்ர்ரர்்";
var char3 = "ர்ரர்ர்ரர்்";
function countStr(str) {
var chars = str.split("");
var count = 0;
for(var i = 0, ilen = chars.length;i<ilen;i++) {
var chars2 = chars[i] + chars[i+1];
if (str.indexOf(chars[i]) == str.indexOf(chars2))
i += 1;
count++;
}
return count;
}
console.log("--");
console.log(countStr(char)); //6
console.log(countStr(char2)); //7
console.log(countStr(char3)); //7
Which seems to work for the String above, it may take some adjustments, as i don't know a thing about Encoding and stuff, but maybe its a point you can begin with
Heres the JSBin

You can ignore combining marks in the count calculation with this function:
function charCount( str ) {
var re = /[\u0300-\u036f\u1dc0-\u1dff\u20d0-\u20ff\ufe20-\ufe2f\u0b82\u0b83\u0bbe\u0bbf\u0bc0-\u0bc2\u0bc6-\u0bc8\u0bca-\u0bcd\u0bd7]/g
return str.replace( re, "").length;
}
console.log(charCount('மதர்'))// 3
//More tests on random Tamil text:
//Paint the text character by character to verify, for instance 'யெ' is a single character, not 2
console.log(charCount("மெய்யெழுத்துக்கள்")); //9
console.log(charCount("ஒவ்வொன்றுடனும்")); //8
console.log(charCount("தமிழ்")); //3
console.log(charCount("வருகின்றனர்.")); //8
console.log(charCount("எழுதப்படும்")); //7
The Tamil signs and marks are not composed into single characters with their target character in unicode, so normalization wouldn't help. I have added all the Tamil combining marks or signs manually
to the regex, but it also includes the ranges for normal combining marks, so charCount("ä") is 1 regardless of normalization form.

Related

Intl.NumberFormat() constructor with phone numbers [duplicate]

I'm looking to reformat (replace, not validate - there are many references for validating) a phone number for display in Javascript. Here's an example of some of the data:
123 4567890
(123) 456-7890
(123)456-7890
123 456 7890
123.456.7890
(blank/null)
1234567890
Is there an easy way to use a regular expression to do this? I'm looking for the best way to do this. Is there a better way?
I want to reformat the number to the following: (123) 456-7890
Assuming you want the format "(123) 456-7890":
function formatPhoneNumber(phoneNumberString) {
var cleaned = ('' + phoneNumberString).replace(/\D/g, '');
var match = cleaned.match(/^(\d{3})(\d{3})(\d{4})$/);
if (match) {
return '(' + match[1] + ') ' + match[2] + '-' + match[3];
}
return null;
}
Here's a version that allows the optional +1 international code:
function formatPhoneNumber(phoneNumberString) {
var cleaned = ('' + phoneNumberString).replace(/\D/g, '');
var match = cleaned.match(/^(1|)?(\d{3})(\d{3})(\d{4})$/);
if (match) {
var intlCode = (match[1] ? '+1 ' : '');
return [intlCode, '(', match[2], ') ', match[3], '-', match[4]].join('');
}
return null;
}
formatPhoneNumber('+12345678900') // => "+1 (234) 567-8900"
formatPhoneNumber('2345678900') // => "(234) 567-8900"
Possible solution:
function normalize(phone) {
//normalize string and remove all unnecessary characters
phone = phone.replace(/[^\d]/g, "");
//check if number length equals to 10
if (phone.length == 10) {
//reformat and return phone number
return phone.replace(/(\d{3})(\d{3})(\d{4})/, "($1) $2-$3");
}
return null;
}
var phone = '(123)4567890';
phone = normalize(phone); //(123) 456-7890
var x = '301.474.4062';
x = x.replace(/\D+/g, '')
.replace(/(\d{3})(\d{3})(\d{4})/, '($1) $2-$3');
alert(x);
This answer borrows from maerics' answer. It differs primarily in that it accepts partially entered phone numbers and formats the parts that have been entered.
phone = value.replace(/\D/g, '');
const match = phone.match(/^(\d{1,3})(\d{0,3})(\d{0,4})$/);
if (match) {
phone = `${match[1]}${match[2] ? ' ' : ''}${match[2]}${match[3] ? '-' : ''}${match[3]}`;
}
return phone
I'm using this function to format US numbers.
function formatUsPhone(phone) {
var phoneTest = new RegExp(/^((\+1)|1)? ?\(?(\d{3})\)?[ .-]?(\d{3})[ .-]?(\d{4})( ?(ext\.? ?|x)(\d*))?$/);
phone = phone.trim();
var results = phoneTest.exec(phone);
if (results !== null && results.length > 8) {
return "(" + results[3] + ") " + results[4] + "-" + results[5] + (typeof results[8] !== "undefined" ? " x" + results[8] : "");
}
else {
return phone;
}
}
It accepts almost all imaginable ways of writing a US phone number. The result is formatted to a standard form of (987) 654-3210 x123
thinking backwards
Take the last digits only (up to 10) ignoring first "1".
function formatUSNumber(entry = '') {
const match = entry
.replace(/\D+/g, '').replace(/^1/, '')
.match(/([^\d]*\d[^\d]*){1,10}$/)[0]
const part1 = match.length > 2 ? `(${match.substring(0,3)})` : match
const part2 = match.length > 3 ? ` ${match.substring(3, 6)}` : ''
const part3 = match.length > 6 ? `-${match.substring(6, 10)}` : ''
return `${part1}${part2}${part3}`
}
example input / output as you type
formatUSNumber('+1333')
// (333)
formatUSNumber('333')
// (333)
formatUSNumber('333444')
// (333) 444
formatUSNumber('3334445555')
// (333) 444-5555
2021
libphonenumber-js
Example
import parsePhoneNumber from 'libphonenumber-js'
const phoneNumber = parsePhoneNumber('+12133734253')
phoneNumber.formatInternational() === '+1 213 373 4253'
phoneNumber.formatNational() === '(213) 373-4253'
phoneNumber.getURI() === 'tel:+12133734253'
Based on David Baucum's answer - here is a version that trys to improve auto-replacement "as you type" for example in a React onChange event handler:
function formatPhoneNumber(phoneNumber) {
const cleanNum = phoneNumber.toString().replace(/\D/g, '');
const match = cleanNum.match(/^(\d{3})(\d{0,3})(\d{0,4})$/);
if (match) {
return '(' + match[1] + ') ' + (match[2] ? match[2] + "-" : "") + match[3];
}
return cleanNum;
}
//...
onChange={e => setPhoneNum(formatPhoneNumber(e.target.value))}
It will insert (###) as soon as there are 3 numbers and then it will keep following the RegEx until it looks like this (###) ###-####
I've extended David Baucum's answer to include support for extensions up to 4 digits in length. It also includes the parentheses requested in the original question. This formatting will work as you type in the field.
phone = phone.replace(/\D/g, '');
const match = phone.match(/^(\d{1,3})(\d{0,3})(\d{0,4})(\d{0,4})$/);
if (match) {
phone = `(${match[1]}${match[2] ? ') ' : ''}${match[2]}${match[3] ? '-' : ''}${match[3]}${match[4] ? ' x' : ''}${match[4]}`;
}
return phone;
Almost all of these have issues when the user tries to backspace over the delimiters, particularly from the middle of the string.
Here's a jquery solution that handles that, and also makes sure the cursor stays in the right place as you edit:
//format text input as phone number (nnn) nnn-nnnn
$('.myPhoneField').on('input', function (e){
var $phoneField = e.target;
var cursorPosition = $phoneField.selectionStart;
var numericString = $phoneField.value.replace(/\D/g, '').substring(0, 10);
// let user backspace over the '-'
if (cursorPosition === 9 && numericString.length > 6) return;
// let user backspace over the ') '
if (cursorPosition === 5 && numericString.length > 3) return;
if (cursorPosition === 4 && numericString.length > 3) return;
var match = numericString.match(/^(\d{1,3})(\d{0,3})(\d{0,4})$/);
if (match) {
var newVal = '(' + match[1];
newVal += match[2] ? ') ' + match[2] : '';
newVal += match[3] ? '-' + match[3] : '';
// to help us put the cursor back in the right place
var delta = newVal.length - Math.min($phoneField.value.length, 14);
$phoneField.value = newVal;
$phoneField.selectionEnd = cursorPosition + delta;
} else {
$phoneField.value = '';
}
})
var numbers = "(123) 456-7890".replace(/[^\d]/g, ""); //This strips all characters that aren't digits
if (numbers.length != 10) //wrong format
//handle error
var phone = "(" + numbers.substr(0, 3) + ") " + numbers.substr(3, 3) + "-" + numbers.substr(6); //Create format with substrings
Here is one that will accept both phone numbers and phone numbers with extensions.
function phoneNumber(tel) {
var toString = String(tel),
phoneNumber = toString.replace(/[^0-9]/g, ""),
countArrayStr = phoneNumber.split(""),
numberVar = countArrayStr.length,
closeStr = countArrayStr.join("");
if (numberVar == 10) {
var phone = closeStr.replace(/(\d{3})(\d{3})(\d{4})/, "$1.$2.$3"); // Change number symbols here for numbers 10 digits in length. Just change the periods to what ever is needed.
} else if (numberVar > 10) {
var howMany = closeStr.length,
subtract = (10 - howMany),
phoneBeginning = closeStr.slice(0, subtract),
phoneExtention = closeStr.slice(subtract),
disX = "x", // Change the extension symbol here
phoneBeginningReplace = phoneBeginning.replace(/(\d{3})(\d{3})(\d{4})/, "$1.$2.$3"), // Change number symbols here for numbers greater than 10 digits in length. Just change the periods and to what ever is needed.
array = [phoneBeginningReplace, disX, phoneExtention],
afterarray = array.splice(1, 0, " "),
phone = array.join("");
} else {
var phone = "invalid number US number";
}
return phone;
}
phoneNumber("1234567891"); // Your phone number here
For all international Phone numbers with country code upto 3 digits, we can change the original answer a little bit as below.
For first match instead of looking for '1' we should look for 1-3 digits.
export const formatPhoneNumber = (phoneNumberString) => {
var cleaned = ('' + phoneNumberString).replace(/\D/g, '');
var match = cleaned.match(/^(\d{1,3}|)?(\d{3})(\d{3})(\d{4})$/);
if (match) {
var intlCode = (match[1] ? `+${match[1]} ` : '');
return [intlCode, '(', match[2], ') ', match[3], '-', match[4]].join('');
}
return null;
}
console.log( formatPhoneNumber('16464765278') )//+1 (646) 476-5278
console.log( formatPhoneNumber('+2549114765278')) //+254 (911) 476-5278
console.log( formatPhoneNumber('929876543210') )//+92 (987) 654-3210
Fulfils my requirement.
For US Phone Numbers
/^\(?(\d{3})\)?[- ]?(\d{3})[- ]?(\d{4})$/
Let’s divide this regular expression in smaller fragments to make is easy to understand.
/^\(?: Means that the phone number may begin with an optional (.
(\d{3}): After the optional ( there must be 3 numeric digits. If the phone number does not have a (, it must start with 3 digits. E.g. (308 or 308.
\)?: Means that the phone number can have an optional ) after first 3 digits.
[- ]?: Next the phone number can have an optional hyphen (-) after ) if present or after first 3 digits.
(\d{3}): Then there must be 3 more numeric digits. E.g (308)-135 or 308-135 or 308135
[- ]?: After the second set of 3 digits the phone number can have another optional hyphen (-). E.g (308)-135- or 308-135- or 308135-
(\d{4})$/: Finally, the phone number must end with four digits. E.g (308)-135-7895 or 308-135-7895 or 308135-7895 or 3081357895.
Reference :
http://www.zparacha.com/phone_number_regex/
You can use this functions to check valid phone numbers and normalize them:
let formatPhone = (dirtyNumber) => {
return dirtyNumber.replace(/\D+/g, '').replace(/(\d{3})(\d{3})(\d{4})/, '($1) $2-$3');
}
let isPhone = (phone) => {
//normalize string and remove all unnecessary characters
phone = phone.replace(/\D+/g, '');
return phone.length == 10? true : false;
}
The solutions above are superior, especially if using Java, and encountering more numbers with more than 10 digits such as the international code prefix or additional extension numbers. This solution is basic (I'm a beginner in the regex world) and designed with US Phone numbers in mind and is only useful for strings with just 10 numbers with perhaps some formatting characters, or perhaps no formatting characters at all (just 10 numbers). As such I would recomend this solution only for semi-automatic applications. I Personally prefer to store numbers as just 10 numbers without formatting characters, but also want to be able to convert or clean phone numbers to the standard format normal people and apps/phones will recognize instantly at will.
I came across this post looking for something I could use with a text cleaner app that has PCRE Regex capabilities (but no java functions). I will post this here for people who could use a simple pure Regex solution that could work in a variety of text editors, cleaners, expanders, or even some clipboard managers. I personally use Sublime and TextSoap. This solution was made for Text Soap as it lives in the menu bar and provides a drop-down menu where you can trigger text manipulation actions on what is selected by the cursor or what's in the clipboard.
My approach is essentially two substitution/search and replace regexes. Each substitution search and replace involves two regexes, one for search and one for replace.
Substitution/ Search & Replace #1
The first substitution/ search & replace strips non-numeric numbers from an otherwise 10-digit number to a 10-digit string.
First Substitution/ Search Regex: \D
This search string matches all characters that is not a digit.
First Substitution/ Replace Regex: "" (nothing, not even a space)
Leave the substitute field completely blank, no white space should exist including spaces. This will result in all matched non-digit characters being deleted. You should have gone in with 10 digits + formatting characters prior this operation and come out with 10 digits sans formatting characters.
Substitution/ Search & Replace #2
The second substitution/search and replace search part of the operation captures groups for area code $1, a capture group for the second set of three numbers $2, and the last capture group for the last set of four numbers $3. The regex for the substitute portion of the operation inserts US phone number formatting in between the captured group of digits.
Second Substitution/ Search Regex: (\d{3})(\d{3})(\d{4})
Second Substitution/ Replace Regex: \($1\) $2\-$3
The backslash \ escapes the special characters (, ) , (<-whitespace), and - since we are inserting them between our captured numbers in capture groups $1, $2, & $3 for US phone number formatting purposes.
In TextSoap I created a custom cleaner that includes the two substitution operation actions, so in practice it feels identical to executing a script. I'm sure this solution could be improved but I expect complexity to go up quite a bit. An improved version of this solution is welcomed as a learning experience if anyone wants to add to this.

How to do word counts for a mixture of English and Chinese in Javascript

I want to count the number of words in a passage that contains both English and Chinese. For English, it's simple. Each word is a word. For Chinese, we count each character as a word. Therefore, 香港人 is three words here.
So for example, "I am a 香港人" should have a word count of 6.
Any idea how can I count it in Javascript/jQuery?
Thanks!
Try a regex like this:
/[\u00ff-\uffff]|\S+/g
For example, "I am a 香港人".match(/[\u00ff-\uffff]|\S+/g) gives:
["I", "am", "a", "香", "港", "人"]
Then you can just check the length of the resulting array.
The \u00ff-\uffff part of the regex is a unicode character range; you probably want to narrow this down to just the characters you want to count as words. For example, CJK Unified would be \u4e00-\u9fcc.
function countWords(str) {
var matches = str.match(/[\u00ff-\uffff]|\S+/g);
return matches ? matches.length : 0;
}
It can't be 6, because when you calculate length of a string it includes spaces too.
So,
var d = "I am a 香港人";
d.length //returns 10
d.replace(/\s+/g, "").length //returns 7, excluding spaces
FYI: Your site should be properly encoded.
I think I found what you need. "I am a 香港人" this contains a repeated twice. So
With the help of #PSL 's answer, I found a way.
var d = "I am a 香港人";
var uniqueList=d.replace(/\s+/g, '').split('').filter(function(item,i,allItems){
return i==allItems.indexOf(item);
}).join('');
console.log(uniqueList.length); //returns 6
JSFiddle
As you comments, I assume you sentence as "I am a 香 港 人" space between each word. Now I altered the code
var d = "I am a 香 港 人";
var uniqueList=d.split(' ').filter(function(item,i,allItems){
return i==allItems.indexOf(item);
});
console.log(uniqueList.length); //returns 6
JSFiddle
I have tried the script, but it will sometimes wrongly count the number of words.
For example, some people will type "香港人computing都不錯的", but the script will count it as 4 words (using the following script).
<script>
var str = "香港人computing都不錯的";
var matches = str.match(/[\u00ff-\uffff]|\S+/g);
x= matches ? matches.length : 0;
alert(x)
</script>
To fix the problem, I have changed the codes to:
<script>
var str="香港人computing都不錯的";
/// fix problem in special characters such as middle-dot, etc.
str= str.replace(/[\u007F-\u00FE]/g,' ');
/// make a duplicate first...
var str1=str;
var str2=str;
/// the following remove all chinese characters and then count the number of english characters in the string
str1=str1.replace(/[^!-~\d\s]+/gi,' ')
/// the following remove all english characters and then count the number of chinese characters in the string
str2=str2.replace(/[!-~\d\s]+/gi,'')
var matches1 = str1.match(/[\u00ff-\uffff]|\S+/g);
var matches2 = str2.match(/[\u00ff-\uffff]|\S+/g);
count1= matches1 ? matches1.length : 0;
count2= matches2 ? matches2.length : 0;
/// return the total of the mixture
var lvar1= (count1+count2);
alert(lvar1);
</script>
Now the script counts the number of words in a mixture of chinese and english correctly.... Enjoy..

Check if input contains 8 chars min, 1 digit and 1 letter minimum

Here's what I tried...
It works if I only check if the value of the input is lesser than 8, but doesn't work to check if it contains at least 1 letter and 1 digit. What am I doing wrong ? =/
$(document).ready(function() {
var jVal = {
'passWord' : function() {
$('body').append('<div id="nameInfo" class="info"></div>');
var nameInfo = $('#nameInfo');
var ele = $('#password');
var pos = ele.offset();
ra = /^[A-Za-z]+$/;
re = /^[0-9]+$/;
nameInfo.css({
top: pos.top - 3,
left: pos.left + ele.width() + 15
});
if (ele.val().length < 8 & re.test(ele.value) & ra.test(ele.value)) {
jVal.errors = true;
nameInfo.removeClass('correct').addClass('error').html('← too short').show();
ele.removeClass('normal').addClass('wrong');
}
else {
nameInfo.removeClass('error').addClass('correct').html('√').show();
ele.removeClass('wrong').addClass('normal');
}
}
}
$('#password').change(jVal.passWord);
});
ra checks if the password is made ENTIRELY of letters. re checks if the password is made ENTIRELY of numbers. They are mutually exclusive and therefore cannot both be true.
Instead, use ra = /[a-z]/i; re = /[0-9]/;.
EDIT: Also, since you're using jQuery, you should be testing on ele.val(), not ele.value.
You could use a single regex to do everything:
/^(?=.*\d.*)(?=.*[a-z].*)\w{8,}$/i
The first two pieces check for both a digit, and an a-z char in the whole string, and then the last piece ensures it's at least 8 characters. You could change the last \w to . to allow special chars if so desired.

Count number of words in string using JavaScript

I am trying to count the number of words in a given string using the following code:
var t = document.getElementById('MSO_ContentTable').textContent;
if (t == undefined) {
var total = document.getElementById('MSO_ContentTable').innerText;
} else {
var total = document.getElementById('MSO_ContentTable').textContent;
}
countTotal = cword(total);
function cword(w) {
var count = 0;
var words = w.split(" ");
for (i = 0; i < words.length; i++) {
// inner loop -- do the count
if (words[i] != "") {
count += 1;
}
}
return (count);
}
In that code I am getting data from a div tag and sending it to the cword() function for counting. Though the return value is different in IE and Firefox. Is there any change required in the regular expression? One thing that I show that both browser send same string there is a problem inside the cword() function.
[edit 2022, based on comment] Nowadays, one would not extend the native prototype this way. A way to extend the native protype without the danger of naming conflicts is to use the es20xx symbol. Here is an example of a wordcounter using that.
Old answer: you can use split and add a wordcounter to the String prototype:
if (!String.prototype.countWords) {
String.prototype.countWords = function() {
return this.length && this.split(/\s+\b/).length || 0;
};
}
console.log(`'this string has five words'.countWords() => ${
'this string has five words'.countWords()}`);
console.log(`'this string has five words ... and counting'.countWords() => ${
'this string has five words ... and counting'.countWords()}`);
console.log(`''.countWords() => ${''.countWords()}`);
I would prefer a RegEx only solution:
var str = "your long string with many words.";
var wordCount = str.match(/(\w+)/g).length;
alert(wordCount); //6
The regex is
\w+ between one and unlimited word characters
/g greedy - don't stop after the first match
The brackets create a group around every match. So the length of all matched groups should match the word count.
This is the best solution I've found:
function wordCount(str) {
var m = str.match(/[^\s]+/g)
return m ? m.length : 0;
}
This inverts whitespace selection, which is better than \w+ because it only matches the latin alphabet and _ (see http://www.ecma-international.org/ecma-262/5.1/#sec-15.10.2.6)
If you're not careful with whitespace matching you'll count empty strings, strings with leading and trailing whitespace, and all whitespace strings as matches while this solution handles strings like ' ', ' a\t\t!\r\n#$%() d ' correctly (if you define 'correct' as 0 and 4).
You can make a clever use of the replace() method although you are not replacing anything.
var str = "the very long text you have...";
var counter = 0;
// lets loop through the string and count the words
str.replace(/(\b+)/g,function (a) {
// for each word found increase the counter value by 1
counter++;
})
alert(counter);
the regex can be improved to exclude html tags for example
//Count words in a string or what appears as words :-)
function countWordsString(string){
var counter = 1;
// Change multiple spaces for one space
string=string.replace(/[\s]+/gim, ' ');
// Lets loop through the string and count the words
string.replace(/(\s+)/g, function (a) {
// For each word found increase the counter value by 1
counter++;
});
return counter;
}
var numberWords = countWordsString(string);

Javascript substr(); limit by word not char

I would like to limit the substr by words and not chars. I am thinking regular expression and spaces but don't know how to pull it off.
Scenario: Limit a paragraph of words to 200 words using javascript/jQuery.
var $postBody = $postBody.substr(' ',200);
This is great but splits words in half :) Thanks ahead of time!
function trim_words(theString, numWords) {
expString = theString.split(/\s+/,numWords);
theNewString=expString.join(" ");
return theNewString;
}
if you're satisfied with a not-quite accurate solution, you could simply keep a running count on the number of space characters within the text and assume that it is equal to the number of words.
Otherwise, I would use split() on the string with " " as the delimiter and then count the size of the array that split returns.
very quick and dirty
$("#textArea").val().split(/\s/).length
I suppose you need to consider punctuation and other non-word, non-whitespace characters as well. You want 200 words, not counting whitespace and non-letter characters.
var word_count = 0;
var in_word = false;
for (var x=0; x < text.length; x++) {
if ( ... text[x] is a letter) {
if (!in_word) word_count++;
in_word = true;
} else {
in_word = false;
}
if (!in_word && word_count >= 200) ... cut the string at "x" position
}
You should also decide whether you treat digits as a word, and whether you treat single letters as a word.

Categories