Split string into pairs, triplets, quadruplets and on (ngrams)? - javascript

I would like to split natural text into word pairs, triplets, quadruplets, and so on.
I have figured out how to split into pairs so far. I assume I will need an additional loop to accommodate the word count
Here is the code for pairs
// Build every pair of adjacent words (2-grams) from a sample sentence.
var test = "I love you so much, but Joe said \"he doesn't\"!";
var words = test.split(" ");
// Every word except the last one starts exactly one pair.
var two_words = words.slice(0, -1).map(function (word, index) {
  return word + ' ' + words[index + 1];
});
console.log(two_words);
// Here is what I am trying
// Attempt at generalising the pair loop to n-grams of 2..split_length words.
// NOTE(review): as posted this does not produce the expected n-grams; the
// notes below mark the lines responsible.
var words = test.split(" ");
var split_words = [];
var split_length = 5;
// l is the number of words per n-gram.
for (var l = 2; l <= split_length; l++) {
// i is the start index of the n-gram; the bound keeps i + (l - 1) inside words.
for (var i = 0; i < words.length - (l - 1); i++) {
// NOTE(review): split_word is never initialised or reset (var is
// function-scoped), so the first += starts from undefined and later
// n-grams keep accumulating onto earlier ones.
var split_word;
// NOTE(review): c is an implicit global, and c <= l runs l + 1 times, not l.
for (c = 0; c <= l; c++) {
// NOTE(review): this indexes split_words (the output array) rather than
// words, and adds no separator between the concatenated pieces.
split_word += split_words[i + c];
}
split_words.push(split_word);
}
}
console.log(split_words);
Adding the expected output (an array of n-grams), something like this:
// 2grams
"I love"
"love you"
"you so"
"so much,"
"much, but"
"but Joe"
"Joe said"
"said "he"
""he doesn't"!"
//3grams
"I love you"
"love you so"
"you so much"
"so much, but"
//and on and on

This is called "n-grams" and can be done in modern JavaScript using generators like this:
/**
 * Lazily yields every run of `n` consecutive items from an iterable (n-grams).
 *
 * Each yielded window is a fresh array, so callers may keep the windows
 * (e.g. `[...ngrams(words, 3)]`) without later iterations altering them.
 *
 * @param {Iterable<*>} a - Source items, e.g. the words of a sentence.
 * @param {number} n - Window size.
 * @yields {Array<*>} The next window of `n` consecutive items; nothing is
 *   yielded when the source has fewer than `n` items.
 */
function* ngrams(a, n) {
  const buf = [];
  for (const x of a) {
    buf.push(x);
    if (buf.length === n) {
      // Hand out a copy: `buf` keeps sliding, and yielding it directly would
      // make every collected window alias the same mutated array.
      yield buf.slice();
      buf.shift();
    }
  }
}

var test = "The quick brown fox jumps over the lazy dog";
for (let g of ngrams(test.split(' '), 3)) {
  console.log(g.join(' '));
}
Another, more concise and probably faster option:
/**
 * Returns every run of `n` consecutive items of `a` (its n-grams).
 *
 * @param {Array<*>} a - Source items.
 * @param {number} n - Window size.
 * @returns {Array<Array<*>>} The windows in order; empty when `n` is less
 *   than 1 or greater than `a.length`.
 */
let ngrams = (a, n) =>
  // The result is sized explicitly because `a.slice(0, 1 - n)` selects
  // nothing for n === 1 (slice(0, 0)), which dropped all 1-grams.
  n < 1
    ? []
    : Array.from({ length: Math.max(0, a.length - n + 1) }, (_, i) => a.slice(i, i + n));

Assuming that your desired result does not include jumbled ordered combinations, you can try following
// Code goes here
// Collect the n-grams of a sentence for every size from 2 up to the full
// word count, one array of phrases per size.
var test = "I love you so much, but Joe said \"he doesn't\"!";
var arr = test.split(" ");
var words = arr.length; // number of words in the sentence
var result = [];

/**
 * Appends to the global `result` one array of n-grams per size, starting at
 * `length` and ending with the size equal to the number of words (which
 * contributes the complete sentence). Sizes larger than the word count add
 * nothing.
 *
 * @param {string[]} arr - The words of the sentence.
 * @param {number} length - The smallest n-gram size to generate.
 */
function process(arr, length) {
  for (var size = length; size <= arr.length; size++) {
    var grams = [];
    // Only start positions that leave room for `size` words are used.
    var lastStart = arr.length - size;
    for (var start = 0; start <= lastStart; start++) {
      grams.push(arr.slice(start, start + size).join(" "));
    }
    result.push(grams);
  }
}
process(arr, 2);
console.log(result);

This should do what you're looking for:
/**
 * Splits a sentence on single spaces and returns every run of `chunk`
 * consecutive words, each run joined back together with a space.
 *
 * @param {string} str - The sentence to split.
 * @param {number} chunk - Number of words per run.
 * @returns {string[]} The runs in order of appearance.
 */
function chunkIt(str, chunk) {
  var tokens = str.split(" ");
  var grams = [];
  // `first + chunk - 1` is the index of the last word in the current run.
  for (var first = 0; first + chunk - 1 < tokens.length; first++) {
    grams.push(tokens.slice(first, first + chunk).join(" "));
  }
  return grams;
}
var test = "I love you so much, but Joe said \"he doesn't\"!";
console.log(chunkIt(test, 2));
console.log(chunkIt(test, 3));
console.log(chunkIt(test, 4));

You can dramatically shorten your code by using a library like lodash:
// Cut a string into consecutive, non-overlapping pieces of two characters
// with lodash's _.chunk, then glue each piece back into a string.
var word = 'foobarbaz';
var chunks = _.chunk(word, 2).map(function (piece) {
  return piece.join('');
});
console.log(chunks); //[ 'fo', 'ob', 'ar', 'ba', 'z' ]
Then you can pass in values other than 2 to suit your needs

Related

Find duplicate phrases (not just words) in array

Let's say I have an array:
[
"I want **a dog**",
"**A dog** is here",
"Pet **a dog**",
"A **red cat**",
"**red cat** is cute"
...
]
How do I figure out what the duplicate phrases are, not just words?
For example, I'd like "a dog" and "red cat" to be returned.
Most existing posts I found are only about getting individual words, not phrases (multiple words).
You're giving us too little information. I'm assuming you're splitting by spaces. ES6 to the rescue :). Sets have O(1) lookup for when you're looking for repeated phrases.
edit: Just realized that you can cut down space complexity by a ton with some small modifications. If you want me to do that, give me a shoutout.
/**
 * Lists every contiguous run of words in a sentence, longest runs first and,
 * within one length, from left to right. Words are separated by single spaces.
 *
 * @param {string} sentence - The sentence to expand.
 * @returns {string[]} Every phrase, each joined with single spaces.
 */
const buildAllPhrases = sentence => {
  const words = sentence.split(" ");
  const phrases = [];
  for (let size = words.length; size >= 1; size--) {
    const lastStart = words.length - size;
    for (let start = 0; start <= lastStart; start++) {
      phrases.push(words.slice(start, start + size).join(" "));
    }
  }
  return phrases;
};
/**
 * Returns the phrases that are produced more than once while expanding all
 * sentences with buildAllPhrases, in the order their first repeat is seen.
 *
 * @param {string[]} sentences - Sentences to scan.
 * @returns {string[]} Each repeated phrase once.
 */
const findRepeats = sentences => {
  const seen = new Set();
  const repeated = new Set();
  sentences.forEach(sentence => {
    for (const candidate of buildAllPhrases(sentence)) {
      // The first sighting is only recorded; any later sighting is a repeat.
      const target = seen.has(candidate) ? repeated : seen;
      target.add(candidate);
    }
  });
  return Array.from(repeated);
};
const sample = [
"I want **a dog**",
"**A dog** is here",
"Pet **a dog**",
"A **red cat**",
"**red cat** is cute"
]
findRepeats(sample)
//['dog**', '**a dog**', '**a', '**red cat**', '**red', 'cat**', 'is']
This is not the final version of the JavaScript function, and it can be optimized further. A few changes may also be required, but it can serve as a starting point for your requirement.
/**
 * Finds runs of adjacent words that two strings of the array have in common.
 *
 * Strings are split on single spaces and compared case-insensitively (every
 * word is lower-cased). Phase one records which pairs of strings share at
 * least two adjacent words; phase two rebuilds the shared words of each such
 * pair into a phrase.
 *
 * @param {string[]} stringsArray - Sentences to compare with one another.
 * @returns {string[]} Distinct phrases made of lower-cased words, each word
 *   followed by a space (so every phrase ends in a space, e.g. "a dog ").
 */
function GetPhrases(stringsArray) {
  // Each sentence as an array of lower-cased words.
  var jaggedArray = [];
  // "later - earlier" index pairs of sentences sharing two adjacent words.
  var newArray = [];
  var phrases = [];

  for (var ic = 0; ic < stringsArray.length; ic++) {
    var items = (stringsArray[ic]).split(" ");
    for (var it = 0; it < items.length; it++) {
      items[it] = items[it].toLowerCase();
    }
    jaggedArray.push(items);
  }

  // Phase one: for every word, look for it in each later sentence and check
  // whether its neighbour matches there as well.
  for (var iLoop = 0; iLoop < jaggedArray.length; iLoop++) {
    for (var ik = 0; ik < jaggedArray[iLoop].length; ik++) {
      var currentWord = jaggedArray[iLoop][ik];
      for (var il = iLoop + 1; il < jaggedArray.length; il++) {
        var pairKey = il + " - " + iLoop;
        // indexOf reports the first occurrence only.
        var indexOfFind = jaggedArray[il].indexOf(currentWord);
        if (indexOfFind > 0) {
          // Found past the start: does the preceding word match too?
          var indexofPrevWord = jaggedArray[il].indexOf(jaggedArray[iLoop][ik - 1]);
          if (indexofPrevWord >= 0 && indexofPrevWord === indexOfFind - 1) {
            if (newArray.indexOf(pairKey) < 0) {
              newArray.push(pairKey);
            }
          }
          // ... or the following word?
          var indexofNextWord = jaggedArray[il].indexOf(jaggedArray[iLoop][ik + 1]);
          if (indexofNextWord >= 0 && indexofNextWord === indexOfFind + 1) {
            if (newArray.indexOf(pairKey) < 0) {
              newArray.push(pairKey);
            }
          }
        }
        // This must be a comparison: the previous `indexOfFind = 0` assigned
        // zero, which is falsy, so this branch could never run.
        else if (indexOfFind === 0) {
          // Found at the very start: only the following word can match.
          var indexofNewWord = jaggedArray[il].indexOf(jaggedArray[iLoop][ik + 1]);
          if (indexofNewWord >= 0 && indexofNewWord === indexOfFind + 1) {
            if (newArray.indexOf(pairKey) < 0) {
              newArray.push(pairKey);
            }
          }
        }
      }
    }
  }

  // Phase two: turn each recorded sentence pair into a phrase.
  for (var itl = 0; itl < newArray.length; itl++) {
    var values = newArray[itl].split(" - ");
    var firstArrayItem = jaggedArray[values[0]];
    var secondArrayItem = jaggedArray[values[1]];
    // Indexes (into firstArrayItem) of words belonging to a shared pair.
    var phraseStartPoint = [];
    // The second sentence as "w1,w2,..." so that an adjacent pair can be
    // searched for as "a,b". NOTE(review): this is a plain substring search,
    // so it can presumably also match across partial words.
    var secondAsText = secondArrayItem.toString();
    for (var iy = 0; iy < firstArrayItem.length - 1; iy++) {
      var t = iy + 1;
      if (secondAsText.indexOf(firstArrayItem[iy] + "," + firstArrayItem[t]) >= 0) {
        // Record both indexes once, so overlapping pairs do not repeat words.
        if (phraseStartPoint.indexOf(iy) < 0) {
          phraseStartPoint.push(iy);
        }
        if (phraseStartPoint.indexOf(t) < 0) {
          phraseStartPoint.push(t);
        }
      }
    }
    var str = "";
    for (var ifinalLoop = 0; ifinalLoop < phraseStartPoint.length; ifinalLoop++) {
      str = str + firstArrayItem[phraseStartPoint[ifinalLoop]] + " ";
    }
    if (phrases.indexOf(str) < 0) {
      phrases.push(str);
    }
  }
  return phrases;
}
// Demo: run GetPhrases on sample sentences and print each phrase found.
var stringsArray = [
  "I want a dog",
  "A dog is here",
  "Pet a dog is cute",
  "A red cat is here",
  "red cat is cute"
];
var result = GetPhrases(stringsArray);
// One phrase per line.
result.forEach(function (phrase) {
  console.log(phrase);
});
with regex you can detect duplicates in strings.
According to this regex:
(?:.*?)(\b\w.{3,}\b)(?:.*?)(\1) ,
it only works if you're looking for a pattern that occurs twice.
Note: you can replace 3 in {3,} with any other integer and see how the matches change.
This parameter constrains the minimum length of the string that must occur twice.

Reverse String In Place Using for loop in JavaScript

var n = "reversestrings", k=3;
want to reverse string in chunk of 'k',
Answer would be : ver sre tse nir gs;
If the last chunk is shorter than 'k', it does not need to be reversed.
I am using below code but not getting expected answer.
// Attempt to print the string reversed in chunks of k characters.
// NOTE(review): posted as not giving the expected answer; the notes below
// mark why.
var n = 'stringreverses', k = 3, str = '', s = '';
// c counts the characters collected since the last reset.
var c = 0;
for( var i=0; i<n.length; i++ ){
if( c<k ){
c++
// NOTE(review): str is never cleared, so s becomes the reverse of everything
// collected so far rather than of the current chunk only.
str += n[i];
s=str.split('').reverse().join('');
}
else{
// NOTE(review): the character n[i] of this iteration is not added to str, so
// every (k+1)-th character is dropped; a chunk still pending when the loop
// ends is never printed.
console.log("-" + s);
c=0;
}
}
First we need to split input to chunks with the same size (the last one can be smaller), next we reverse every chunk and concatenate at the end.
// Cut the input into pieces of at most k characters (the last piece may be
// shorter), reverse each piece, and concatenate the reversed pieces.
// `k` is the chunk size and is expected to be defined by the surrounding code.
var input = "123456",
  // [\s\S] also matches line terminators, which `.` would silently drop.
  // match() returns null when nothing matches (empty input), hence `|| []`.
  chunks = input.match(new RegExp('[\\s\\S]{1,' + k + '}', 'g')) || [];
var result = chunks.map(function(chunk) {
  return chunk.split('').reverse().join('');
}).join('');
Homework or not, here is a good use case to start with strings.
Here is a C approach but you have more in Javascript.
In fact you want to reverse by chunk so deal with chunk. How to create a chunk of string ? a way is to use slice https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/String/slice
var str = "abcdef";
console.log(str.slice(0,2));
So you have an easy way to slice your string into chunk.
Then you have to iterate over it, there is no good way of doing it actually there is dozen but you could do it from backward to the beginning of the string:
for( i=str.length ; i>0 ; i -= k ){
// i will go from the end of your str to
// the beginning by step of k(=3) and you can use i - k and i
// to slice your string (as we see it before)
// you have to take care of the last part that could be less than
// 3
}
then you have to format the result, the most easy way to do that is to concatenate results into a string here it is :
var strRes = "";
strRes += "res 1";
strRes += "res 2";
console.log(strRes); // should screen "res 1res 2"
As it is homework, I wont make a jsfiddle, you have here all the pieces and it's up to you to build the puzzle.
hope that help
// On DOM ready: reverse `n` in chunks of `k` characters and show the chunks,
// each preceded by a space, in an alert box. The final chunk is reversed even
// when it is shorter than k.
$(function() {
  var n = 'reversestrings', k = 3;
  var revString = "";
  // Step from chunk start to chunk start. Stopping before n.length avoids
  // appending an empty chunk (a stray trailing space) when the length is an
  // exact multiple of k, and no undeclared helper variable is needed.
  for (var i = 0; i < n.length; i += k) {
    var strChunk = n.substring(i, i + k);
    var innerStr = "";
    // Walk the chunk backwards to reverse it.
    for (var j = strChunk.length - 1; j >= 0; j--) {
      innerStr = innerStr + strChunk.charAt(j);
    }
    revString = revString + " " + innerStr;
  }
  alert(revString);
});
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.10.0/jquery.min.js"></script>
My take on this. Pure JS without even built-in functions:
/**
 * Returns `str` with its characters in reverse order, built by hand
 * (no built-in reverse).
 *
 * @param {string} str - Text to reverse.
 * @returns {string} The reversed text.
 */
function reverseSubStr(str) {
  var reversed = '';
  // Prepending each character in turn leaves the last one in front.
  for (var pos = 0; pos < str.length; pos++) {
    reversed = str[pos] + reversed;
  }
  return reversed;
}
/**
 * Reverses a string chunk by chunk: every full chunk of `k` characters is
 * reversed, a shorter final chunk is kept as is, and the chunks are joined
 * with single spaces (no trailing space).
 *
 * @param {string} str - Text to process.
 * @param {number} [k=3] - Chunk size; must be a positive integer.
 * @returns {string} The chunks separated by spaces, e.g. "ver sre tse nir gs".
 * @throws {RangeError} When `k` is not a positive integer.
 */
function reverseStr(str, k = 3) {
  if (!Number.isInteger(k) || k < 1) {
    throw new RangeError('reverseStr(): k must be a positive integer');
  }
  var newstr = '';
  for (var start = 0; start < str.length; start += k) {
    var end = start + k;
    var substr = '';
    if (end <= str.length) {
      // Full chunk: copy its characters back to front.
      for (var pos = end - 1; pos >= start; pos--) {
        substr += str[pos];
      }
    } else {
      // Short remainder: keep the original order.
      for (var rest = start; rest < str.length; rest++) {
        substr += str[rest];
      }
    }
    // Separate chunks with a space, but never end the result with one.
    if (newstr !== '') {
      newstr += ' ';
    }
    newstr += substr;
  }
  return newstr;
}
var str = 'reversestrings';
console.log(reverseStr(str)); //ver sre tse nir gs
I like #Jozef's approach, but here is mine as well for those who are not much into regex -
/**
 * Reverses a string chunk by chunk: every full chunk of `k` characters is
 * reversed, a shorter final chunk is kept unchanged, and the chunks are
 * joined with single spaces.
 *
 * Works iteratively, so long inputs cannot exhaust the call stack, and a
 * non-positive `k` is rejected instead of looping forever.
 *
 * @param {string} str - Text to process.
 * @param {number} k - Chunk size; must be a positive integer.
 * @param {string} [r=''] - Prefix placed verbatim in front of the result.
 * @returns {string} `r` followed by the space-separated chunks.
 * @throws {RangeError} When `k` is not a positive integer.
 */
function reverStrInChunk(str, k, r = '') {
  if (!Number.isInteger(k) || k < 1) {
    throw new RangeError('reverStrInChunk(): k must be a positive integer');
  }
  const pieces = [];
  for (let start = 0; start < str.length; start += k) {
    const piece = str.substring(start, start + k);
    // Only complete chunks are reversed; a short tail stays as it is.
    pieces.push(piece.length === k ? piece.split('').reverse().join('') : piece);
  }
  return r + pieces.join(' ');
}
var aStr = reverStrInChunk('reversestrings', 3); //ver sre tse nir gs
console.log(aStr);

Find the Longest Word in a String javascript

My code works well with a string like "The quick brown fox jumped over the lazy dog".
But it does not work with a string like "Google do a barrel roll".
It says the problem is "TypeError undefined is not an object(evaluating 'Astr[i].length') ".
// Intended to return the length of the longest space-separated word in str.
// NOTE(review): posted as failing with "undefined is not an object"; see the
// loop bound below.
function findLongestWord(str) {
var Astr = str.split(" ");
// t holds the longest length seen so far, seeded with the first word's length.
var t = Astr[0].length;
// Al is the number of words; it is computed but not used below.
var Al = Astr.length;
// j is the index of the longest word seen so far.
var j = 0;
// NOTE(review): the bound is t (a word length), not Al (the word count), so i
// can run past the last word - Astr[i] is then undefined and .length throws -
// or stop before every word has been examined.
for(var i =1; i < t;i++)
{
if(t < Astr[i].length)
{
t = Astr[i].length;
j = i;
}
}
str = Astr[j];
return str.length;
}
findLongestWord("Google do a barrel roll");
Here is one way of improving your function:
var str = 'Google do a barrel roll';

/**
 * Returns the length of the longest space-separated word in `str`.
 *
 * @param {string} str - Sentence whose words are separated by single spaces.
 * @returns {number} Length of the longest word.
 * @throws {Error} When splitting yields no words at all.
 */
function findLongestWord(str) {
  var tokens = str.split(' ');
  if (tokens.length === 0) {
    throw new Error('findLongestWord(): no words in str');
  }
  // Keep the first word that is strictly longer than everything before it.
  var best = tokens[0];
  tokens.forEach(function (token) {
    if (token.length > best.length) {
      best = token;
    }
  });
  return best.length;
}
findLongestWord(str);
//=> 6
You can also do something like this (which is a little easier to understand):
str.split(' ').reduce(function(longest, cur) {
return (cur.length > longest.length) ? cur : longest;
}, '');
//=> Google
You have a problem with the variables in your 'for' loop.
As you can see, you split the string into an array and take the length of the first member of the array.
So basically you get the first word's length instead of the word count.
var Astr = str.split(" ");
var t = Astr[0].length;
Here you can see that you use 't' (the first word length) as your loop bounds.
for(var i =1; i < t;i++)
Keep your code simple & readable this way it will be maintainable.
/**
 * Returns the length of the longest space-separated word in `str`.
 *
 * @param {string} str - Sentence whose words are separated by single spaces.
 * @returns {number} Length of the longest word (0 for an empty string).
 */
function findLongestWord(str) {
  let longest = 0;
  for (const word of str.split(" ")) {
    longest = Math.max(longest, word.length);
  }
  return longest;
}
findLongestWord("Google do a barrel roll");
Note that you always can use short-hand functions for that
/**
 * Returns the length of the longest space-separated word in `str`.
 *
 * @param {string} str - Sentence whose words are separated by single spaces.
 * @returns {number} Length of the longest word (0 for an empty string).
 */
function findLongestWord(str) {
  var longestWord = '';
  str.split(' ').forEach(function (word) {
    // Strictly longer only, so the earliest longest word is the one kept.
    if (word.length > longestWord.length) {
      longestWord = word;
    }
  });
  return longestWord.length;
}
findLongestWord("Google do a barrel roll");
/**
 * Returns the length of the longest space-separated word in `str` by sorting
 * the words from longest to shortest and reading the first one.
 *
 * @param {string} str - Sentence whose words are separated by single spaces.
 * @returns {number} Length of the longest word (0 for an empty string).
 */
function findLongestWord(str) {
  // Descending by length, so the longest word ends up at index 0.
  const [longest] = str.split(' ').sort((a, b) => b.length - a.length);
  return longest.length;
}
findLongestWord("Google do a barrel roll");

String with the highest frequency of recurring letters in a word

This is a challenge for coderbyte I thought I'd try to do it using a different method for solving it than loops, objects. It passed but it isn't perfect. The directions for the challenge are:
Have the function LetterCountI(str) take the str parameter being passed and return the first word with the greatest number of repeated letters. For example: "Today, is the greatest day ever!" should return greatest because it has 2 e's (and 2 t's) and it comes before ever which also has 2 e's. If there are no words with repeating letters return -1. Words will be separated by spaces.
// Coderbyte "LetterCountI" attempt: meant to return the first word with the
// most repeated letters, or -1 when no word repeats a letter.
// NOTE(review): posted as returning the wrong word for some inputs; see the
// early return inside the nested loop below.
function LetterCountI(str){
// Maps each distinct word to a placeholder, later replaced by its letter tally.
var wordsAndLetters = {};
// Highest single-character tally found so far.
var count = 0;
// Word that produced `count`.
var finalword;
str = str.split(" ");
// Register every word as a key (a word that appears twice shares one key).
for(var i = 0; i < str.length; i++){
wordsAndLetters[str[i]] = wordsAndLetters[str[i]] || 0;
}
// Returns an object mapping each character of strs to how often it occurs.
function countWordLetters(strs){
strs = strs.split("");
var lettercount = {};
for(var i = 0; i <strs.length; i++){
lettercount[strs[i]] = lettercount[strs[i]] || 0;
lettercount[strs[i]]++;
}
return lettercount;
}
for(var words in wordsAndLetters){
wordsAndLetters[words] = countWordLetters(words);
var highestLetterFrequency = wordsAndLetters[words];
// values iterates over the characters tallied for this word.
for(var values in highestLetterFrequency){
if(highestLetterFrequency[values] > count){
count = highestLetterFrequency[values];
finalword = words;
}
// NOTE(review): because this check runs inside the loops, the function
// returns as soon as any tally above 1 has been recorded, so later words are
// never compared against it.
if(count !== 1){
return finalword;
}
}
}
return -1;
}
LetterCountI("today is the greatest day ever!");
Sorry if some of the variable names are confusing I've been up for far too long trying to figure out what I did wrong. If you use the parameters at the bottom of the code it returns 'greatest' like it should however change the parameters to
LetterCountI("toddday is the greatttttest day ever!");
and it logs 'toddday' when it should log 'greatttttest'. Is my code completely wrong? I realize if the parameters were ("caatt dooog") it should log 'caatt' since there are 4 recurring letters but I'm not worried about that I just am concerned about it finding the most recurrence of one letter(but by all means if you have a solution I would like to hear it!). Any changes to the variables if needed to make this code more readable would be appreciated!
The problem with your code is the positioning of the following section of code:
if(count !== 1){
return finalword;
}
Move it from where it currently is to just before the return -1, like so:
for(var words in wordsAndLetters){
wordsAndLetters[words] = countWordLetters(words);
var highestLetterFrequency = wordsAndLetters[words];
for(var values in highestLetterFrequency){
if(highestLetterFrequency[values] > count){
count = highestLetterFrequency[values];
finalword = words;
}
}
}
if(count !== 1){
return finalword;
}
return -1;
The problem with your original code is that your were returning the first word that had repeating characters, which meant your code didn't get far enough to check if any subsequent words had more repeating characters.
Also, just for fun, here is my alternative solution.
Here you go
/**
 * Array.prototype.getUnique - returns a new array holding the first
 * occurrence of every distinct element, in original order.
 *
 * Elements are compared with Set semantics (SameValueZero) instead of being
 * used as object keys, so values such as "hasOwnProperty" or "__proto__" are
 * handled like any other element. Note that 1 and "1" count as different.
 *
 * Defined as non-enumerable so it does not show up in `for...in` loops over
 * arrays.
 *
 * @returns {Array<*>} The de-duplicated copy; the receiver is not modified.
 */
Object.defineProperty(Array.prototype, 'getUnique', {
  value: function getUnique() {
    return Array.from(new Set(this));
  },
  writable: true,
  configurable: true,
  enumerable: false,
});
/**
 * Returns the first word with the greatest number of repeated letters.
 *
 * A word's score is its length minus its number of distinct characters, so
 * "dooog" scores 2 and "Catt" scores 1. Words are separated by single spaces
 * and ties go to the earliest word.
 *
 * @param {string} str - Sentence to inspect.
 * @returns {string|number} The winning word, or -1 when no word contains a
 *   repeated character (as the challenge statement requires).
 */
function LetterCountI(str) {
  let best = -1;
  let bestRepeats = 0;
  for (const word of str.split(" ")) {
    // A Set keeps one entry per distinct character.
    const repeats = word.length - new Set(word.split("")).size;
    // Strictly greater, so an earlier word wins a tie.
    if (repeats > bestRepeats) {
      bestRepeats = repeats;
      best = word;
    }
  }
  return best;
}
console.log(LetterCountI("Catt dooog"));
console.log(LetterCountI("toddday is the greatttttest day ever!"));
Viva LinQ !!!!!
// C# / LINQ variant: for every word, record the highest number of times any
// single character occurs in it, then pick the word with the largest count.
var resultPerWord = new Dictionary<string, int>();
var S = "toddday is the greatttttest day ever!";
foreach(var s in S.Split(' '))
{
// Group the word's characters and order the groups by size, largest first.
var theArray =
from w in s
group w by w into g
orderby g.Count() descending
select new { Letter = g.Key, Occurrence = g.Count() };
// NOTE(review): Dictionary.Add presumably rejects a key that is already
// present, so a sentence repeating a word would fail here; First() likewise
// needs a non-empty word - confirm both against the .NET documentation.
resultPerWord.Add(s, theArray.First().Occurrence);
}
// "-1" stays the answer unless some word has a character occurring more than once.
var r = "-1";
if (resultPerWord.Any(x => x.Value >1))
{
r = resultPerWord.OrderByDescending(x => x.Value).First().Key;
}

Javascript string matching pattern help

i need to find few words or matching pattern using a Javascript.
this is the requirement.
i have a string like this,
Here is a quick guide for the next
time you reach for your favorite oil and some other topics
and i need to match this string against a string like this
favorite oil and some other topics can be based on something blah blah
how do i get the intersection of matching text blocks?
I already tried intersect Javascript script function, for some strings it's not working properly.
How to solve this problem? can this be done using Regex?
Please advise.
You have to find the Longest common substring.
If the strings are not very long, I recommend using Tim's approach. Otherwise, this is a Javascript implementation of the Longest common substring algorithm with dynamic programming. The runtime is O(mn) where m and n are the lengths of the 2 strings respectively.
An example usage:
// Example usage: compare two sentences that share a run of text.
var first = "Here is a quick guide for the next time you reach for your favorite oil and some other topics";
var second = "favorite oil and some other topics can be based on something blah blah";
// The output recorded on the next line is as originally posted; the text the
// two sentences actually share ends in "topics".
console.log(first.intersection(second)); // ["favorite oil and some other topic"]
This is the algorithm implementation. It returns an array of the longest common substring(s). Extended the native String class, so the intersect method is available on all strings.
/**
 * String.prototype.intersection - longest common substring(s) of this string
 * and `anotherString`, found with a dynamic-programming grid in which
 * grid[i][j] is the length of the common run ending at this[i] and
 * anotherString[j].
 *
 * Relies on the createGrid(rows, columns) helper for a zero-filled grid.
 *
 * @param {string} anotherString - The string to compare against.
 * @returns {string[]} Every longest common substring in the order found (the
 *   same text is listed once per position it ends at); empty when the
 *   strings share no character.
 */
String.prototype.intersection = function(anotherString) {
  var grid = createGrid(this.length, anotherString.length);
  var longestSoFar = 0;
  var matches = [];
  for (var i = 0; i < this.length; i++) {
    for (var j = 0; j < anotherString.length; j++) {
      if (this.charAt(i) !== anotherString.charAt(j)) {
        continue;
      }
      // Extend the run ending at the previous characters, if there is one.
      grid[i][j] = (i === 0 || j === 0) ? 1 : grid[i - 1][j - 1] + 1;
      if (grid[i][j] > longestSoFar) {
        // A longer run makes all shorter matches obsolete.
        longestSoFar = grid[i][j];
        matches = [];
      }
      if (grid[i][j] === longestSoFar) {
        // substring()'s end index is exclusive, so i + 1 is needed to keep
        // the character at i (the last character of the run).
        matches.push(this.substring(i - longestSoFar + 1, i + 1));
      }
    }
  }
  return matches;
};
Also need this helper function to create a 2d array with all elements initialize to 0.
// create a 2d array
/**
 * Creates a two-dimensional array with every cell set to 0.
 *
 * @param {number} rows - Number of rows.
 * @param {number} columns - Number of cells per row.
 * @returns {number[][]} `rows` independent arrays of `columns` zeros.
 */
function createGrid(rows, columns) {
  var grid = new Array(rows);
  var row = 0;
  while (row < rows) {
    // Each row gets its own array so that cells are never shared.
    grid[row] = new Array(columns).fill(0);
    row++;
  }
  return grid;
}
This isn't very efficient and there are much better ways to do this in general (see #Anurag's answer), but it's simple and works fine for short strings:
/**
 * Returns the longest substring that occurs in both strings.
 *
 * Brute force: every substring of the shorter string is tried, longest
 * first and left to right, until one is found in the longer string. Simple,
 * but only suitable for short inputs.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {string} The longest common substring (the leftmost one in the
 *   shorter string on ties), or "" when the strings share nothing.
 */
function stringIntersection(str1, str2) {
  // Candidates come from the shorter string; the search runs in the longer.
  var shorter = str1, longer = str2;
  if (str1.length > str2.length) {
    shorter = str2;
    longer = str1;
  }
  for (var len = shorter.length; len > 0; len--) {
    // Slide a window of exactly `len` characters across the shorter string.
    // The end index is start + len; using `len` itself as the end index
    // skipped most windows and could return a match that was too short.
    for (var start = 0; start + len <= shorter.length; start++) {
      var candidate = shorter.slice(start, start + len);
      if (longer.indexOf(candidate) > -1) {
        return candidate;
      }
    }
  }
  return "";
}
// Demo: show the text shared by two sample sentences in an alert box.
var s1 = [
  "Here is a quick guide for the next time you reach",
  " for your favorite oil and some other topics"
].join("");
var s2 = [
  "favorite oil and some other topics can be based on",
  " something blah blah"
].join("");
alert( stringIntersection(s1, s2) );
A simple polyfill of filter a string
// Character filter, installed only when String.prototype.intersection is not
// already defined: keeps the characters of this string that also occur
// anywhere in anotherString, optionally ignoring case. The characters
// returned are always taken from the original (un-lowered) string.
if (!String.prototype.intersection) {
  String.prototype.intersection = function(anotherString, caseInsensitive = false) {
    const fold = (text) => (caseInsensitive ? text.toLowerCase() : text);
    // Text used for the membership test; indexed in step with `this`.
    const own = fold(this);
    const allowed = new Set(fold(anotherString).split(""));
    let kept = "";
    this.split("").forEach((ch, idx) => {
      if (allowed.has(own[idx])) {
        kept += ch;
      }
    });
    return kept;
  };
}
"HelloWorld".intersection("HEWOLRLLODo", true)
"HelloWorld" - case insensitive
"HelloWorld".intersection("HEWOLRLLODo")
"HoWo" - case sensitive

Categories