REGEX - Input must have x unique digits - javascript

I was wondering if anyone knows a way for regex to detect that there has been a minimum of x digits used.
For example if I put in a number 6 digit number of 111222 but my regex says there must be at least 3 unique numbers this would cause a valid fail.
but if I had 123456 that would pass because there is more than three unique digits used.
Something like (obviously it wont be like bellow.)
/^[0-9]{6}*3$/
non regex way
var telcstart = $('#num').val(),
telc1 = (telcstart.match(/1/g) || []).length,
telc2 = (telcstart.match(/2/g) || []).length,
telc3 = (telcstart.match(/3/g) || []).length,
telc4 = (telcstart.match(/4/g) || []).length,
telc5 = (telcstart.match(/5/g) || []).length,
telc6 = (telcstart.match(/6/g) || []).length,
telc7 = (telcstart.match(/7/g) || []).length,
telc8 = (telcstart.match(/8/g) || []).length,
telc9 = (telcstart.match(/9/g) || []).length,
telc0 = (telcstart.match(/0/g) || []).length,
totaltelc = "a:" + telc1 + " b:" + telc2 + " c:" + telc3 + " d:" + telc4 + "e:" + telc5 + " f:" + telc6 + " g:" + telc7 + " h:" + telc8 + " i:" + telc9 + " j:" + telc0,
finaltelc = (totaltelc.match(/0/g) || []).length;
if (finaltelc <= 8) {
alert('passed');
} else {
alert('failed');
}
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>
<input type="number" id="num" value="123451212111">

^(?=.*(.)(?!$|.*\1))(?=.*(?!\1)(.)(?!$|.*\2))[0-9]+$
function check(value) {
var res = document.getElementById('result');
if (/^(?=.*(.)(?!$|.*\1))(?=.*(?!\1)(.)(?!$|.*\2))[0-9]+$/.test(value)) {
res.innerText = '✓ OK';
} else {
res.innerText = '✗ No match';
}
}
<p>Enter a number with ≥3 different digits</p>
<p><input type=text onkeyup='check(this.value)'> <span id=result></span></p>
Let's split it up:
^
(?=
.*(.) # Pick any character in the string as group 1
(?!$|.*\1) # Ensure this is not the last character, but is the last unique character
)
(?=
.*(?!\1)(.) # Pick another character in the string as group 2
(?!$|.*\2) # Same condition as above
)
[0-9]+
$
The first condition ensures there is a string like ??1??a. The second condition ensures a string like ???2?b, where 1 ≠ a and 2 ≠ b and 1 ≠ 2. From this we can conclude there are at least 3 different characters.
This can be easily generalized to e.g. at least 8 different characters needed:
^
(?=.*(.)(?!$|.*\1))
(?=.*(?!\1)(.)(?!$|.*\2))
(?=.*(?!\1|\2)(.)(?!$|.*\3))
(?=.*(?!\1|\2|\3)(.)(?!$|.*\4))
(?=.*(?!\1|\2|\3|\4)(.)(?!$|.*\5))
(?=.*(?!\1|\2|\3|\4|\5)(.)(?!$|.*\6))
(?=.*(?!\1|\2|\3|\4|\5|\6)(.)(?!$|.*\7))
[0-9]+
$
Because JavaScript only support up to \9, you can't use this method to check more than 10 different characters. At this point you should really question whether regex is a right tool for this job though 😉.

You actually can do this with a regex, but it'll be very ugly and near-incomprehensible.
Looking at just one case, we can write a regex for a string that contains 0, 1, and at least one number from 2 to 9:
/^0(\d)*1(\d)*[2-9](\d)*$/
For each possible pair of numbers, we would have to write a regex like this ad combine all of them using | so that we catch all cases.
There are 10*9 = 90 pairs of distinct digits, and for each group there are 2 permutations, totalling 180 groups.
So we would have to do:
/(^0(\d)*1(\d)*[2-9](\d)*$/)|(^0(\d)*2(\d)*(1|[3-9])(\d)*$/)| ... /
Continuing for all 180 groups. That would be a gigantic regex, and would probably take a very long while to compile.
You should do this validation with code instead of running a regex.
EDIT: Apparently, JavaScript regexes have some extended features which make this doable, namely reusing the value captured by a given group. refer to #kennytm's answerr.

Based on negative lookaheads you can ensure a different digit matches three times:
(\d)\d*?((?!\1)\d)\d*?(?!\1|\2)\d+$
RegEx Demo
(?!\1) negative lookahead to ensure next match is not same as captured group #1
(?!\1|\2) negative lookahead to ensure next match is not same as captured group #1 and group #2

function uniqueDigit(str)
{
var a = 0;
for (var i=0; i<10; i++)
{
(new RegExp( i, 'g')).test(str) && a++;
}
return a
}
console.log(uniqueDigit('10122211100')) // in str
console.log(uniqueDigit(222111333888)) // in int
var a = 123456;
console.log(/^\d{6}$/.test(a) && uniqueDigit(a) > 2) // example

Related

Intl.NumberFormat() constructor with phone numbers [duplicate]

I'm looking to reformat (replace, not validate - there are many references for validating) a phone number for display in Javascript. Here's an example of some of the data:
123 4567890
(123) 456-7890
(123)456-7890
123 456 7890
123.456.7890
(blank/null)
1234567890
Is there an easy way to use a regular expression to do this? I'm looking for the best way to do this. Is there a better way?
I want to reformat the number to the following: (123) 456-7890
Assuming you want the format "(123) 456-7890":
function formatPhoneNumber(phoneNumberString) {
var cleaned = ('' + phoneNumberString).replace(/\D/g, '');
var match = cleaned.match(/^(\d{3})(\d{3})(\d{4})$/);
if (match) {
return '(' + match[1] + ') ' + match[2] + '-' + match[3];
}
return null;
}
Here's a version that allows the optional +1 international code:
function formatPhoneNumber(phoneNumberString) {
var cleaned = ('' + phoneNumberString).replace(/\D/g, '');
var match = cleaned.match(/^(1|)?(\d{3})(\d{3})(\d{4})$/);
if (match) {
var intlCode = (match[1] ? '+1 ' : '');
return [intlCode, '(', match[2], ') ', match[3], '-', match[4]].join('');
}
return null;
}
formatPhoneNumber('+12345678900') // => "+1 (234) 567-8900"
formatPhoneNumber('2345678900') // => "(234) 567-8900"
Possible solution:
function normalize(phone) {
//normalize string and remove all unnecessary characters
phone = phone.replace(/[^\d]/g, "");
//check if number length equals to 10
if (phone.length == 10) {
//reformat and return phone number
return phone.replace(/(\d{3})(\d{3})(\d{4})/, "($1) $2-$3");
}
return null;
}
var phone = '(123)4567890';
phone = normalize(phone); //(123) 456-7890
var x = '301.474.4062';
x = x.replace(/\D+/g, '')
.replace(/(\d{3})(\d{3})(\d{4})/, '($1) $2-$3');
alert(x);
This answer borrows from maerics' answer. It differs primarily in that it accepts partially entered phone numbers and formats the parts that have been entered.
phone = value.replace(/\D/g, '');
const match = phone.match(/^(\d{1,3})(\d{0,3})(\d{0,4})$/);
if (match) {
phone = `${match[1]}${match[2] ? ' ' : ''}${match[2]}${match[3] ? '-' : ''}${match[3]}`;
}
return phone
I'm using this function to format US numbers.
function formatUsPhone(phone) {
var phoneTest = new RegExp(/^((\+1)|1)? ?\(?(\d{3})\)?[ .-]?(\d{3})[ .-]?(\d{4})( ?(ext\.? ?|x)(\d*))?$/);
phone = phone.trim();
var results = phoneTest.exec(phone);
if (results !== null && results.length > 8) {
return "(" + results[3] + ") " + results[4] + "-" + results[5] + (typeof results[8] !== "undefined" ? " x" + results[8] : "");
}
else {
return phone;
}
}
It accepts almost all imaginable ways of writing a US phone number. The result is formatted to a standard form of (987) 654-3210 x123
thinking backwards
Take the last digits only (up to 10) ignoring first "1".
function formatUSNumber(entry = '') {
const match = entry
.replace(/\D+/g, '').replace(/^1/, '')
.match(/([^\d]*\d[^\d]*){1,10}$/)[0]
const part1 = match.length > 2 ? `(${match.substring(0,3)})` : match
const part2 = match.length > 3 ? ` ${match.substring(3, 6)}` : ''
const part3 = match.length > 6 ? `-${match.substring(6, 10)}` : ''
return `${part1}${part2}${part3}`
}
example input / output as you type
formatUSNumber('+1333')
// (333)
formatUSNumber('333')
// (333)
formatUSNumber('333444')
// (333) 444
formatUSNumber('3334445555')
// (333) 444-5555
2021
libphonenumber-js
Example
import parsePhoneNumber from 'libphonenumber-js'
const phoneNumber = parsePhoneNumber('+12133734253')
phoneNumber.formatInternational() === '+1 213 373 4253'
phoneNumber.formatNational() === '(213) 373-4253'
phoneNumber.getURI() === 'tel:+12133734253'
Based on David Baucum's answer - here is a version that trys to improve auto-replacement "as you type" for example in a React onChange event handler:
function formatPhoneNumber(phoneNumber) {
const cleanNum = phoneNumber.toString().replace(/\D/g, '');
const match = cleanNum.match(/^(\d{3})(\d{0,3})(\d{0,4})$/);
if (match) {
return '(' + match[1] + ') ' + (match[2] ? match[2] + "-" : "") + match[3];
}
return cleanNum;
}
//...
onChange={e => setPhoneNum(formatPhoneNumber(e.target.value))}
It will insert (###) as soon as there are 3 numbers and then it will keep following the RegEx until it looks like this (###) ###-####
I've extended David Baucum's answer to include support for extensions up to 4 digits in length. It also includes the parentheses requested in the original question. This formatting will work as you type in the field.
phone = phone.replace(/\D/g, '');
const match = phone.match(/^(\d{1,3})(\d{0,3})(\d{0,4})(\d{0,4})$/);
if (match) {
phone = `(${match[1]}${match[2] ? ') ' : ''}${match[2]}${match[3] ? '-' : ''}${match[3]}${match[4] ? ' x' : ''}${match[4]}`;
}
return phone;
Almost all of these have issues when the user tries to backspace over the delimiters, particularly from the middle of the string.
Here's a jquery solution that handles that, and also makes sure the cursor stays in the right place as you edit:
//format text input as phone number (nnn) nnn-nnnn
$('.myPhoneField').on('input', function (e){
var $phoneField = e.target;
var cursorPosition = $phoneField.selectionStart;
var numericString = $phoneField.value.replace(/\D/g, '').substring(0, 10);
// let user backspace over the '-'
if (cursorPosition === 9 && numericString.length > 6) return;
// let user backspace over the ') '
if (cursorPosition === 5 && numericString.length > 3) return;
if (cursorPosition === 4 && numericString.length > 3) return;
var match = numericString.match(/^(\d{1,3})(\d{0,3})(\d{0,4})$/);
if (match) {
var newVal = '(' + match[1];
newVal += match[2] ? ') ' + match[2] : '';
newVal += match[3] ? '-' + match[3] : '';
// to help us put the cursor back in the right place
var delta = newVal.length - Math.min($phoneField.value.length, 14);
$phoneField.value = newVal;
$phoneField.selectionEnd = cursorPosition + delta;
} else {
$phoneField.value = '';
}
})
var numbers = "(123) 456-7890".replace(/[^\d]/g, ""); //This strips all characters that aren't digits
if (numbers.length != 10) //wrong format
//handle error
var phone = "(" + numbers.substr(0, 3) + ") " + numbers.substr(3, 3) + "-" + numbers.substr(6); //Create format with substrings
Here is one that will accept both phone numbers and phone numbers with extensions.
function phoneNumber(tel) {
var toString = String(tel),
phoneNumber = toString.replace(/[^0-9]/g, ""),
countArrayStr = phoneNumber.split(""),
numberVar = countArrayStr.length,
closeStr = countArrayStr.join("");
if (numberVar == 10) {
var phone = closeStr.replace(/(\d{3})(\d{3})(\d{4})/, "$1.$2.$3"); // Change number symbols here for numbers 10 digits in length. Just change the periods to what ever is needed.
} else if (numberVar > 10) {
var howMany = closeStr.length,
subtract = (10 - howMany),
phoneBeginning = closeStr.slice(0, subtract),
phoneExtention = closeStr.slice(subtract),
disX = "x", // Change the extension symbol here
phoneBeginningReplace = phoneBeginning.replace(/(\d{3})(\d{3})(\d{4})/, "$1.$2.$3"), // Change number symbols here for numbers greater than 10 digits in length. Just change the periods and to what ever is needed.
array = [phoneBeginningReplace, disX, phoneExtention],
afterarray = array.splice(1, 0, " "),
phone = array.join("");
} else {
var phone = "invalid number US number";
}
return phone;
}
phoneNumber("1234567891"); // Your phone number here
For all international Phone numbers with country code upto 3 digits, we can change the original answer a little bit as below.
For first match instead of looking for '1' we should look for 1-3 digits.
export const formatPhoneNumber = (phoneNumberString) => {
var cleaned = ('' + phoneNumberString).replace(/\D/g, '');
var match = cleaned.match(/^(\d{1,3}|)?(\d{3})(\d{3})(\d{4})$/);
if (match) {
var intlCode = (match[1] ? `+${match[1]} ` : '');
return [intlCode, '(', match[2], ') ', match[3], '-', match[4]].join('');
}
return null;
}
console.log( formatPhoneNumber('16464765278') )//+1 (646) 476-5278
console.log( formatPhoneNumber('+2549114765278')) //+254 (911) 476-5278
console.log( formatPhoneNumber('929876543210') )//+92 (987) 654-3210
Fulfils my requirement.
For US Phone Numbers
/^\(?(\d{3})\)?[- ]?(\d{3})[- ]?(\d{4})$/
Let’s divide this regular expression in smaller fragments to make is easy to understand.
/^\(?: Means that the phone number may begin with an optional (.
(\d{3}): After the optional ( there must be 3 numeric digits. If the phone number does not have a (, it must start with 3 digits. E.g. (308 or 308.
\)?: Means that the phone number can have an optional ) after first 3 digits.
[- ]?: Next the phone number can have an optional hyphen (-) after ) if present or after first 3 digits.
(\d{3}): Then there must be 3 more numeric digits. E.g (308)-135 or 308-135 or 308135
[- ]?: After the second set of 3 digits the phone number can have another optional hyphen (-). E.g (308)-135- or 308-135- or 308135-
(\d{4})$/: Finally, the phone number must end with four digits. E.g (308)-135-7895 or 308-135-7895 or 308135-7895 or 3081357895.
Reference :
http://www.zparacha.com/phone_number_regex/
You can use this functions to check valid phone numbers and normalize them:
let formatPhone = (dirtyNumber) => {
return dirtyNumber.replace(/\D+/g, '').replace(/(\d{3})(\d{3})(\d{4})/, '($1) $2-$3');
}
let isPhone = (phone) => {
//normalize string and remove all unnecessary characters
phone = phone.replace(/\D+/g, '');
return phone.length == 10? true : false;
}
The solutions above are superior, especially if using Java, and encountering more numbers with more than 10 digits such as the international code prefix or additional extension numbers. This solution is basic (I'm a beginner in the regex world) and designed with US Phone numbers in mind and is only useful for strings with just 10 numbers with perhaps some formatting characters, or perhaps no formatting characters at all (just 10 numbers). As such I would recomend this solution only for semi-automatic applications. I Personally prefer to store numbers as just 10 numbers without formatting characters, but also want to be able to convert or clean phone numbers to the standard format normal people and apps/phones will recognize instantly at will.
I came across this post looking for something I could use with a text cleaner app that has PCRE Regex capabilities (but no java functions). I will post this here for people who could use a simple pure Regex solution that could work in a variety of text editors, cleaners, expanders, or even some clipboard managers. I personally use Sublime and TextSoap. This solution was made for Text Soap as it lives in the menu bar and provides a drop-down menu where you can trigger text manipulation actions on what is selected by the cursor or what's in the clipboard.
My approach is essentially two substitution/search and replace regexes. Each substitution search and replace involves two regexes, one for search and one for replace.
Substitution/ Search & Replace #1
The first substitution/ search & replace strips non-numeric numbers from an otherwise 10-digit number to a 10-digit string.
First Substitution/ Search Regex: \D
This search string matches all characters that is not a digit.
First Substitution/ Replace Regex: "" (nothing, not even a space)
Leave the substitute field completely blank, no white space should exist including spaces. This will result in all matched non-digit characters being deleted. You should have gone in with 10 digits + formatting characters prior this operation and come out with 10 digits sans formatting characters.
Substitution/ Search & Replace #2
The second substitution/search and replace search part of the operation captures groups for area code $1, a capture group for the second set of three numbers $2, and the last capture group for the last set of four numbers $3. The regex for the substitute portion of the operation inserts US phone number formatting in between the captured group of digits.
Second Substitution/ Search Regex: (\d{3})(\d{3})(\d{4})
Second Substitution/ Replace Regex: \($1\) $2\-$3
The backslash \ escapes the special characters (, ) , (<-whitespace), and - since we are inserting them between our captured numbers in capture groups $1, $2, & $3 for US phone number formatting purposes.
In TextSoap I created a custom cleaner that includes the two substitution operation actions, so in practice it feels identical to executing a script. I'm sure this solution could be improved but I expect complexity to go up quite a bit. An improved version of this solution is welcomed as a learning experience if anyone wants to add to this.

Check if first and last character contains given special char

I have input string
..-----''''''.......VAibhavs.sharma'..'-.'-.''-....''
I want to check if the first and last char place contains - or ' or ..
If yes then trim until we get name.
Expected output : VAibhavs.sharma
I am using like this.
while (
myString.charAt(0) == "." ||
myString.charAt(0) == "'" ||
myString.charAt(0) == "-" ||
myString.charAt(myString.length - 1) == "." ||
myString.charAt(myString.length - 1) == "'" ||
myString.charAt(myString.length - 1) == "-"
)
I know this is not correct way. How can I use regex?
I tried /^\'$. But this only checks or first char for a single special char.
You can use regular expression:
input = "..-----''''''.......VAibhavs.sharma'..'-.'-.''-....''"
output = input.replace(/^[-'\.]+/,"").replace(/[-'\.]+$/,"")
console.log(output)
[-'\.] ... -, ' or . character
+ ... one or more times
^ ... beginning of the string
$ ... end of the string
EDIT:
using match:
input = "..-----''''''.......VAibhavs.sharma'..'-.'-.''-....''"
output = input.match(/^[-'\.]+(.*?)[-'\.]+$/)[1]
console.log(output)
(...) ... (1st) group
.*? ... any chacter, zero or more times, ? means non-greedy
.match(...)[1] ... 1 means 1st group
There is already one accepted answer but still, this is how I would do.
var pattern = /\b[A-Za-z.]+\b/gm;
var str = "..-----''''''.......VAibhavs.sharma'..'-.'-.''-....''";
console.log(str.match(pattern));
// Output
// ["VAibhavs.sharma"]
\b is a zero-width word boundary. It matches positions where one side is a word character (usually a letter, digit or underscore) and the other side is not a word character (for instance, it may be the beginning of the string or a space character).

Format IPv4 string with regex

I'm writing an Angular 5 directive that validates an IPv4 input as the user types into it.
Currently it works as expected, however it's pretty verbose:
this.el.nativeElement.value = this.el.nativeElement.value
.replace(/^\D+/, '') // Remove any non-digit from position 0
.replace(/[^\d\.]+/, '') // Remove any non-digit, non dot from string
.replace(/\.{2,}/, '.') // Force sequences of dots into one single dot
.split('.').map(seq => seq.substring(0, 3)) // Limit to {1-3} digits per group
.slice(0, 4) // Limit to 4 groups of digits
.join('.') // Turn back into string and give dots back
How could I replace the split-map-slice-join sequence with another RegExp or string method in JavaScript?
Your request is a bit complicated let say fault tolerant. So given
"asdf1.2.3.aaa...4afd"
you are expected to get: "1.2.3.4".
The fist replace (.replace(/^\D+/, '')) seems to be superfluous because of the second replace. And the whole expression might be
var original = "asdf1.2.3.aaa...4afd";
var formatted = original.replace(/[^\d.]/g, "")
.replace(/^\.*(\d+)\.+(\d+)\.+(\d+)\.+(\d+)[^\d]?.*$/g, "$1.$2.$3.$4");
Note that this will give different result to non IPv4 strings. Like "1.2.3" so you may want to test this code as:
var a = "asdf1.2.3.aaa...4afd";
var a = "asdf2.3.aaa...4.5.6af.7d";
var a = "1.2.3.4";
var a = "1.2.3";
var a = "1.2";
var b = a.replace(/[^\d.]/g, "");
var c = b.replace(/^\.*(\d+)\.+(\d+)\.+(\d+)\.+(\d+)[^\d]?.*$/g, "$1.$2.$3.$4");
console.log(a + " " + b + " " + c);

How to get first 2 words?

Let data.title be ABC XYZ PQRS - www.aaa.tld.
Output needs to be like this ABC+XYZ
i've tried this:
var t = data.title.split(' ').join('+');
t = t.replace(/(([^\s]+\s\s*){1})(.*)/,"Unknown");
$("#log").text(t);
Here is one way to do it, no regex though, it only grabs the first two words and must have a space between those words.
First we split into and array, then we slice that array from the 0 index to 2(exclusive) or 1, and finally we join them with a '+':
var x = 'ABC XYZ PQRS';
var y = x.split(' ').slice(0,2).join('+');
// y = "ABC+XYZ"
Working Fiddle
Try using .match() with RegExp /([\w+]+)/g; concatenate first match, + character, second match
var matches = "ABC XYZ PQRS - www.aaa.tld".match(/([\w+]+)/g);
console.log(matches[0] + "+" + matches[1])
This is my general function for first n words. Haven't tested it extensively but it is fast even on long strings because it doesn't use a global regex or split every word. You can fine tune the regex for dealing with punctuation. I'm considering a hyphen as a delimiter but you can move that to the word portion instead if you prefer.
function regFirstWords(s, n) {
// ?: non-capturing subsequent sp+word.Change {} if you want to require n instead of allowing fewer
var a = s.match(new RegExp('[\\w\\.]+' + '(?:[\\s-]*[\\w\\.]+){0,' + (n - 1) + '}'));
return (a === undefined || a === null) ? '' : a[0];
}
To satisfy the OP's request to replace with '+'
regFirstWords('ABC XYZ PQRS - www.aaa.tld',2).replace(/\s/g,'+')

Count number of characters present in foreign language

Is there any optimal way to implement character count for non English letters? For example, if we take the word "Mother" in English, it is a 6 letter word. But if you type the same word(மதர்) in Tamil, it is a three letter word(ம+த+ர்) but the last letter(ர்) will be considered as two characters(ர+ஂ=ர்) by the system. So is there any way to count the number of real characters?
One clue is that if we move the cursor in keyboard into the word (மதர்), it will pass through 3 letters only and not into 4 chars considering by the system, so is there any way to find the solution by using this? Any help on this would be greatly appreciated...
Update
Back from lunch =)
I'm afraid that the previous won't work this well with any foreign language
So i added another fiddle with a possible way
var UnicodeNsm = [Array 1280] //It holds all escaped Unicode Non Space Marks
function countNSMString(str) {
var chars = str.split("");
var count = 0;
for (var i = 0,ilen = chars.length;i<ilen;i++) {
if(UnicodeNsm.indexOf(escape(chars[i])) == -1) {
count++;
}
}
return count;
}
var English = "Mother";
var Tamil = "மதர்";
var Vietnamese = "mẹ"
var Hindi = "मां"
function logL (str) {
console.log(str + " has " + countNSMString(str) + " visible Characters and " + str.length + " normal Characters" ); //"மதர் has 3 visible Characters"
}
logL(English) //"Mother has 6 visible Characters and 6 normal Characters"
logL(Tamil) //"மதர் has 3 visible Characters and 4 normal Characters"
logL(Vietnamese) //"mẹ has 2 visible Characters and 3 normal Characters"
logL(Hindi) //"मां has 1 visible Characters and 3 normal Characters"
So this just checks if theres any Character in the String which is a Unicode NSM character and ignores the count for this, this should work for the Most languages, not Tamil only,
And an array with 1280 Elements shouldn't be that big of a performance issue
Here is a list with the Unicode NSM's
http://www.fileformat.info/info/unicode/category/Mn/list.htm
Here is the according JSBin
After experimenting a bit with string operations, it turns out
String.indexOf returns the same for
"ர்" and for "ர"
meaning
"ர்ரர".indexOf("ர்") == "ர்ரர".indexOf("ர" + "்") //true but
"ர்ரர".indexOf("ர") == "ர்ரர".indexOf("ர" + "ர") //false
I took this opportunity and tried something like this
//ர்
var char = "ரர்ர்ரர்்";
var char2 = "ரரர்ர்ரர்்";
var char3 = "ர்ரர்ர்ரர்்";
function countStr(str) {
var chars = str.split("");
var count = 0;
for(var i = 0, ilen = chars.length;i<ilen;i++) {
var chars2 = chars[i] + chars[i+1];
if (str.indexOf(chars[i]) == str.indexOf(chars2))
i += 1;
count++;
}
return count;
}
console.log("--");
console.log(countStr(char)); //6
console.log(countStr(char2)); //7
console.log(countStr(char3)); //7
Which seems to work for the String above, it may take some adjustments, as i don't know a thing about Encoding and stuff, but maybe its a point you can begin with
Heres the JSBin
You can ignore combining marks in the count calculation with this function:
function charCount( str ) {
var re = /[\u0300-\u036f\u1dc0-\u1dff\u20d0-\u20ff\ufe20-\ufe2f\u0b82\u0b83\u0bbe\u0bbf\u0bc0-\u0bc2\u0bc6-\u0bc8\u0bca-\u0bcd\u0bd7]/g
return str.replace( re, "").length;
}
console.log(charCount('மதர்'))// 3
//More tests on random Tamil text:
//Paint the text character by character to verify, for instance 'யெ' is a single character, not 2
console.log(charCount("மெய்யெழுத்துக்கள்")); //9
console.log(charCount("ஒவ்வொன்றுடனும்")); //8
console.log(charCount("தமிழ்")); //3
console.log(charCount("வருகின்றனர்.")); //8
console.log(charCount("எழுதப்படும்")); //7
The Tamil signs and marks are not composed into single characters with their target character in unicode, so normalization wouldn't help. I have added all the Tamil combining marks or signs manually
to the regex, but it also includes the ranges for normal combining marks, so charCount("ä") is 1 regardless of normalization form.

Categories