Find duplicate phrases (not just words) in array

Find duplicate phrases (not just words) in array - javascript

Let's say I have an array:
[
"I want **a dog**",
"**A dog** is here",
"Pet **a dog**",
"A **red cat**",
"**red cat** is cute"
...
]
How do I figure out what the duplicate phrases are, not just words?
For example, I'd like "a dog" and "red cat" to be returned.
Most existing posts I found are only about getting individual words, not phrases (multiple words).

You're giving us too little information. I'm assuming you're splitting by spaces. ES6 to the rescue :). Sets have O(1) lookup for when you're looking for repeated phrases.
edit: Just realized that you can cut down space complexity by a ton with some small modifications. If you want me to do that, give me a shoutout.
/**
 * Lists every contiguous run of space-separated words in `sentence`.
 * Runs are ordered longest first and, within one length, left to right.
 *
 * @param {string} sentence - Text that is split on single spaces.
 * @returns {string[]} Each run re-joined with single spaces.
 */
const buildAllPhrases = sentence => {
  const words = sentence.split(" ");
  const phrases = [];
  let width = words.length;
  while (width > 0) {
    // Slide a window of `width` words across the sentence.
    const lastStart = words.length - width;
    for (let start = 0; start <= lastStart; start += 1) {
      phrases.push(words.slice(start, start + width).join(" "));
    }
    width -= 1;
  }
  return phrases;
};
/**
 * Reports every phrase produced by buildAllPhrases that is generated more
 * than once while walking `sentences` in order.
 *
 * @param {string[]} sentences - Sentences to scan.
 * @returns {string[]} Repeated phrases, in the order each was first seen
 *   for the second time.
 */
const findRepeats = sentences => {
  const seen = new Set();
  const repeated = new Set();
  sentences.forEach(sentence => {
    for (const candidate of buildAllPhrases(sentence)) {
      if (!seen.has(candidate)) {
        seen.add(candidate);
        continue;
      }
      repeated.add(candidate);
    }
  });
  return Array.from(repeated);
};
// Sample input from the question. The ** markers are part of the strings,
// so they also appear inside the phrases that are reported.
const sample = [
"I want **a dog**",
"**A dog** is here",
"Pet **a dog**",
"A **red cat**",
"**red cat** is cute"
]
findRepeats(sample)
// Recorded result; matching is case-sensitive and repeated single words are listed too.
//['dog**', '**a dog**', '**a', '**red cat**', '**red', 'cat**', 'is']

This is not the final version of the JavaScript function, and it can be optimized further. A few changes may also be required, but it can serve as a starting point for your requirement.
/**
 * Finds word sequences that two of the given strings have in common.
 *
 * Words are obtained by splitting on single spaces and are compared
 * case-insensitively (everything is lower-cased first). Two strings are
 * considered related when at least one pair of adjacent words occurs, in the
 * same order, in both of them.
 *
 * For every related pair of strings, the words of the later string that take
 * part in any shared adjacent pair are joined with single spaces into one
 * phrase. Note that if a pair of strings shares two separate word runs, both
 * runs end up in the same phrase.
 *
 * @param {string[]} stringsArray - Strings to compare with each other.
 * @returns {string[]} Distinct lower-cased phrases, without trailing spaces,
 *   in discovery order.
 */
function GetPhrases(stringsArray) {
    // True when `first` is immediately followed by `second` somewhere in `words`.
    // Whole words are compared, so "cat is" never matches inside "bobcat island".
    const hasAdjacentPair = (words, first, second) =>
        words.some((word, index) =>
            index + 1 < words.length && word === first && words[index + 1] === second);

    // One lower-cased word list per input string.
    const jaggedArray = stringsArray.map((sentence) =>
        sentence.split(" ").map((word) => word.toLowerCase()));

    // Phase 1: remember every (later, earlier) pair of strings that shares at
    // least one adjacent word pair. Each pair of strings is recorded once.
    const sharedPairs = [];
    jaggedArray.forEach((earlierWords, earlier) => {
        for (let ik = 0; ik + 1 < earlierWords.length; ik++) {
            for (let later = earlier + 1; later < jaggedArray.length; later++) {
                const alreadyRecorded = sharedPairs.some(
                    (pair) => pair.later === later && pair.earlier === earlier);
                if (!alreadyRecorded &&
                    hasAdjacentPair(jaggedArray[later], earlierWords[ik], earlierWords[ik + 1])) {
                    sharedPairs.push({ later, earlier });
                }
            }
        }
    });

    // Phase 2: for each related pair of strings, collect the positions in the
    // later string whose words belong to a shared adjacent pair, then build
    // the phrase from those words.
    const phrases = [];
    for (const { later, earlier } of sharedPairs) {
        const laterWords = jaggedArray[later];
        const earlierWords = jaggedArray[earlier];
        const keptIndexes = [];
        for (let iy = 0; iy + 1 < laterWords.length; iy++) {
            if (hasAdjacentPair(earlierWords, laterWords[iy], laterWords[iy + 1])) {
                // A word can close one pair and open the next; keep it once.
                if (!keptIndexes.includes(iy)) {
                    keptIndexes.push(iy);
                }
                if (!keptIndexes.includes(iy + 1)) {
                    keptIndexes.push(iy + 1);
                }
            }
        }
        // join() avoids the trailing space that string concatenation produced.
        const phrase = keptIndexes.map((index) => laterWords[index]).join(" ");
        if (!phrases.includes(phrase)) {
            phrases.push(phrase);
        }
    }
    return phrases;
}
// Example input: several sentences that share word sequences such as
// "a dog" and "red cat is".
var stringsArray = [
"I want a dog",
"A dog is here",
"Pet a dog is cute",
"A red cat is here",
"red cat is cute"
];
var result = GetPhrases(stringsArray);
// Print the phrases array, one phrase per line.
for (var iPhrase = 0; iPhrase < result.length; iPhrase++) {
console.log(result[iPhrase]);
}

with regex you can detect duplicates in strings.
According to this regex:
(?:.*?)(\b\w.{3,}\b)(?:.*?)(\1) ,
it only works if you're looking for the same pattern occurring twice.
Note: you can replace the 3 in {3,} with any other integer and see how the matches change.
This parameter constrains the minimum length of the string you're looking for twice.

Related

Longest chain of letters in word JavaScript

I need to write a program that is find longest chain of letters in a word and displays it in a console.log with their length. Example aaaAAAAdddeess - console.log( 'AAAA' ,4 ). Program must be in JavaScript and must distinguish capital letters. I`ve tried something like
const word = 'aaadddAAAwwwweee';
// Longest run of one repeated character found so far. The comparison is
// case-sensitive, so 'a' and 'A' belong to different runs.
let newWord = '';
// Run of identical characters that ends at the current position.
let currentRun = '';
for (let i = 0; i < word.length; i++) {
  // Extend the run while the character repeats; otherwise start a new one.
  currentRun = word[i] === word[i - 1] ? currentRun + word[i] : word[i];
  // Strict ">" keeps the first run when several share the maximum length.
  if (currentRun.length > newWord.length) {
    newWord = currentRun;
  }
}
console.log(newWord, newWord.length);

You can split the word into letters and compare each letter with the following ones. Collect the current sequence in an array and keep the longest sequence found so far. On each iteration, compare the length of the current sequence with the length of the longest sequence.
const word = 'aaadddAAAwwwweee';
const lettersArr = word.split('');
// Run of equal letters that starts at the position being examined.
let currentSequence = [];
// Longest run seen so far; the earliest one wins on ties.
let maxSequence = [];
lettersArr.forEach((letter, index) => {
  // Walk forward until the letter changes or the word ends.
  let end = index + 1;
  while (end < lettersArr.length && lettersArr[end] == letter) {
    end += 1;
  }
  currentSequence = lettersArr.slice(index, end);
  if (currentSequence.length > maxSequence.length) {
    maxSequence = currentSequence;
  }
});
const newWord = maxSequence.join('');
console.log(newWord, newWord.length);

Javascript - Fastest search for a word in array of string (not full match also)

I am coding search on a webpage which searchs all files, which the webpage consists of.
I iterate through every file and save all words to array of strings.
For example: var array = ["these","are","some","random","words","on","a","webpage"]
The search engine works this way: e.g. user type "s" and if any word from array contains this letter, the word is displayed as a result. In this case the results would be: "these", "some", "words"
The problem is that I have like 30 files in which I search and in each file there is on average 500 words so the search is slow.
letter search (e.g. "s") ~ 4 seconds
letter search (e.g. "se") ~ 2.1 seconds
letter search (e.g. "sea") ~ 1.9 seconds
letter search (e.g. "sear") ~ 1.7 seconds...
I iterate through the array with for-cycle and I think that is the biggest problem. So what is the fastest way to find if searched word is in array of strings and compare also not full matches?
EDIT:
On the webpage it looks like this e.g.:
Searched word: "sear"
Results:
Intro (name of page; clickable url link)
...you can search in this page... (sentence with words around the searched word)
Code explanation:
iterating through files
remove html characters and other special characters and save words from file to array of strings
compare words from file with words which user search for
save sentence with the searched word to a sentence variable
save sentence to an object (this object is later iterated in .html file and the sentences are displayed at webpage)
words typed by user which are going to be searched are in variable words
Here is my code.
// Promise for the contents of searchIndex.json; created on the first search
// and reused by every later one.
var searchIndexPromise;
var searchAppModule = angular.module("searchApp", []);
// Registers a `globalSearch` object on $rootScope that holds the query, the
// matching index entries (`results`) and the context snippets (`pages`).
searchAppModule.run(function($rootScope, $http){
var globalSearch = $rootScope.globalSearch = {
query: "",
results: [],
// Navigates to the search page and runs a search for the current query.
open: function(){
window.location.href = "#!/51_00_Search";
globalSearch.search(globalSearch.query);
},
// Looks up the space-separated words of `find` in the search index and
// collects a short snippet around every hit in the matching pages.
search: function(find) {
if(!searchIndexPromise) searchIndexPromise = $http.get("searchIndex.json").then(function(response){
return response.data;
});
console.log("searching", find);
searchIndexPromise.then(function(searchIndex){
// Unique { name, page, word } hits gathered from the index.
var temp = [];
globalSearch.results = [];
var words = find.split(' ');
// NOTE(review): `words` is an array, so this comparison relies on type
// coercion; `words.length < 1` may have been intended — confirm.
if (words < 1) {
return;
}
// presumably each key of searchIndex is an indexed word and its value is a
// collection of { name, page } entries — verify against searchIndex.json
for (var key in searchIndex) {
for (var option in searchIndex[key]) {
for(var i=0; i < words.length; i++) {
// Substring match of the lower-cased search word against the key.
if (key.includes(words[i].toLowerCase())) {
var name = searchIndex[key][option].name;
var page = searchIndex[key][option].page;
var word = words[i];
// Linear scan: `count` reaches temp.length only when no entry with the
// same name and word exists yet.
var count = 0;
for (var j = 0; j < temp.length; j++) {
if (temp[j].name == name && temp[j].word == word) {
break;
}
count++;
}
if (count == temp.length) {
temp.push({ name : name, page : page, word : word });
}
}
}
}
}
if (words.length < 2) {
globalSearch.results = temp;
}
else {
// Several search words: keep an entry when the number of hits that share
// its name and are not yet in the results equals the number of words.
for (var i = 0; i < temp.length; i++) {
var count = 0;
var compare = temp[i];
for (var j = 0; j < temp.length; j++) {
if (compare.name == temp[j].name) {
if (globalSearch.results.indexOf(temp[j]) == -1) {
count++;
}
}
}
if (count == words.length) {
globalSearch.results.push(temp[i]);
}
}
}
//sentences
// NOTE(review): require() runs on every search; the shape of pages.js
// (file name -> { src }) is inferred from the accesses below — confirm.
const pagesLoad = require("./pages.js");
globalSearch.pages = [];
for (var result in globalSearch.results) {
// Drops the first three characters of the stored page identifier.
var page = globalSearch.results[result].page.substring(3);
// `nameOfPage` is a hoisted var: when neither test below succeeds it keeps
// the value assigned in an earlier iteration (undefined on the first).
if ((page + ".html" in pagesLoad)) {
var nameOfPage = page + ".html";
}
if ((page + ".md" in pagesLoad)) {
var nameOfPage = page + ".md";
}
// Replace HTML-like tags, newlines and most punctuation with spaces, then
// split the text into words.
var regex = /(<([^>]+)>)|\n|\#|\(|\)|\*|\-|[^\w\s!?]|\n| +(?= )/ig, data = pagesLoad[nameOfPage].src.replace(regex, " ");
var string = data.split(" ");
string = string.filter(Boolean);
let lowerString = string.map((item) => {
return item.toLowerCase();
});
//this part is slowing down the search
// Every word of the page is compared with every search word; each hit
// produces one snippet.
for (var i = 0; i < lowerString.length; i++) {
for (var j = 0; j < words.length; j++) {
if (lowerString[i].includes(words[j].toLowerCase())) {
var sentence = "...";
// Snippet window: up to six words before the hit, the hit itself and up to
// five words after it; positions outside the page are skipped.
for (var k = i - 6; k < i + 6; k++) {
if (lowerString[k] == null) {
continue;
}
sentence = sentence + string[k] + " ";
}
// Remove the trailing space before closing the snippet.
sentence = sentence.slice(0, -1);
sentence += "...";
globalSearch.pages.push({page: globalSearch.results[result].page, sentence: sentence});
}
}
}
}
})
}
};
});

Split string into pairs, triplets, quadruplets and on (ngrams)?

I would like to split natural text into word pairs, triplets, quadruplets and on!
I have figured out how to split into pairs so far. I assume I will need an additional loop to accommodate the word count
Here is the code for pairs
var test = "I love you so much, but Joe said \"he doesn't\"!";
var words = test.split(" ");

// Word pairs (2-grams): every word joined with its right-hand neighbour.
var two_words = [];
for (var i = 0; i < words.length - 1; i++) {
  two_words.push(words[i] + ' ' + words[i + 1]);
}
console.log(two_words);

// General case: all n-grams for n = 2 .. split_length, shortest first.
var split_words = [];
var split_length = 5;
for (var l = 2; l <= split_length; l++) {
  // A window of l words fits as long as it ends inside the word list.
  for (var start = 0; start + l <= words.length; start++) {
    // slice() takes exactly l words from `words`; join() puts the spaces back.
    split_words.push(words.slice(start, start + l).join(' '));
  }
}
console.log(split_words);
Adding expected output...(an array of ngrams) sg like this
// 2grams
"I love"
"love you"
"you so"
"so much,"
"much, but"
"but Joe"
"Joe said"
"said "he"
""he doesn't"!"
//3grams
"I love you"
"love you so"
"you so much"
"so much, but"
//and on and on

This is called "n-grams" and can be done in modern JavaScript using generators like this:
/**
 * Lazily yields every run of `n` consecutive items of `a`.
 *
 * @param {Iterable} a - Items to window over (e.g. an array of words).
 * @param {number} n - Window size.
 * @yields {Array} A fresh array holding the next `n` consecutive items.
 */
function* ngrams(a, n) {
  const buf = [];
  for (const x of a) {
    buf.push(x);
    if (buf.length === n) {
      // Yield a copy: the buffer itself is shifted right afterwards, which
      // would otherwise alter windows the caller has already received.
      yield buf.slice();
      buf.shift();
    }
  }
}
// Example: print every run of three consecutive words of the sentence.
var test = "The quick brown fox jumps over the lazy dog";
for (let g of ngrams(test.split(' '), 3))
console.log(g.join(' '))
Another, more concise and probably faster option:
/**
 * Returns every run of `n` consecutive items of the array `a`.
 *
 * @param {Array} a - Items to window over.
 * @param {number} n - Window size; values below 1 yield no windows.
 * @returns {Array[]} One array per window, in order of appearance.
 */
let ngrams = (a, n) => {
  if (n < 1) {
    return [];
  }
  // There are a.length - n + 1 window starts. A negative end index would
  // read `slice(0, 0)` for n === 1, so the count is computed explicitly.
  const windowCount = Math.max(a.length - n + 1, 0);
  return a.slice(0, windowCount).map((_, i) => a.slice(i, i + n));
};

Assuming that your desired result does not include jumbled ordered combinations, you can try following
// Builds the n-grams of a sentence for every size from 2 words up to the
// whole sentence and collects them, one list per size, in `result`.
var test = "I love you so much, but Joe said \"he doesn't\"!";
var arr = test.split(" ");
var words = arr.length; // total number of words
var result = [];

/**
 * Appends to `result` one list of n-grams for each window size from
 * `length` up to and including the number of words in `arr`.
 */
function process(arr, length) {
  for (var size = length; size <= arr.length; size++) {
    var grams = [];
    // Only windows that fit completely inside the word list are taken.
    for (var start = 0; start + size <= arr.length; start++) {
      grams.push(arr.slice(start, start + size).join(" "));
    }
    result.push(grams);
  }
}

process(arr, 2);
console.log(result);

This should do what you're looking for:
/**
 * Returns all runs of `chunk` consecutive space-separated words of `str`.
 *
 * @param {string} str - Text that is split on single spaces.
 * @param {number} chunk - Number of words per run.
 * @returns {string[]} The runs, each joined with single spaces.
 */
function chunkIt(str, chunk) {
  const tokens = str.split(" ");
  const runs = [];
  let start = 0;
  // The last word of the window sits at start + (chunk - 1).
  while (start + (chunk - 1) < tokens.length) {
    runs.push(tokens.slice(start, start + chunk).join(" "));
    start += 1;
  }
  return runs;
}
// Example: print the 2-, 3- and 4-word runs of the sentence.
var test = "I love you so much, but Joe said \"he doesn't\"!";
console.log(chunkIt(test,2));
console.log(chunkIt(test,3));
console.log(chunkIt(test,4));

You can dramatically shorten your code by using a library like lodash:
// Requires lodash to be available as `_`.
// NOTE(review): as the logged output shows, the groups are consecutive and
// do not overlap, and they are groups of characters, not of words.
var word = 'foobarbaz';
var chunks = _.chunk(word, 2).map((chunk) => chunk.join(''));
console.log(chunks); //[ 'fo', 'ob', 'ar', 'ba', 'z' ]
Then you can pass in values other than 2 to suit your needs

JavaScript String split()

We have multiple records, and each record has an asset list. The asset list consists of multiple client names and sites (name-site, name-site_2, name2-site_3, name3-site_4).
I'm trying to split the asset list to end up with two lists, one of all of the names and one of all of the sites (I only want to show what's unique in each list). I'm splitting at the comma and then split again at the hyphen
What I've created below works, but recently I've encountered some client-site combos that have an extra hyphen, which breaks my solution (I initially split the string by comma and then split the substrings by hyphen). To add another wrinkle, the extra hyphen is not always in the same spot, depending on the category it could be the first hyphen (in the client name) or second hyphen (in the site) that needs to be ignored, fortunately, this is consistent by category (if category == "animals").
For category animals if there are two hyphens I need to ignore the first.
For category fruit if there are two hyphens I need to ignore the second.
Any ideas?
Example asset lists:
category == "animals"
if there is two hyphens, I need to split at the second.
assetList ="fish-mark, cat-jim, blue-dog-henry, red-bird-bill, green-snake-larry"
category == "fruit"
if there is two hyphens, I need to split at the first.
assetList = "lime-henry, lemon-susan, banana-bob-nelson, apple-rick-jones, pineapple-sam-smith"
Below is my code:
var assetList = "fish-mark, cat-jim, blue-dog-henry, red-bird-bill, green-snake-larry";
// Decides which hyphen separates client from site when an entry contains
// more than one: "animals" entries split at the last hyphen (the first one
// belongs to the client name); any other category, e.g. "fruit", splits at
// the first hyphen (the second one belongs to the site).
var category = "animals";

// One "client-site" entry per element. The loop below runs over the entries,
// not over the number of hyphens, so extra hyphens cannot push it past the
// end of the list.
var mainSplitObj = assetList.split(', ');
var clients = [];
var sites = [];
for (var i = 0; i < mainSplitObj.length; i++) {
  var combo = mainSplitObj[i];
  var hyphenAt = category === "animals" ? combo.lastIndexOf('-') : combo.indexOf('-');
  if (hyphenAt < 0) {
    // No separator at all: treat the whole entry as a client name.
    clients.push(combo);
    continue;
  }
  clients.push(combo.substring(0, hyphenAt));
  sites.push(combo.substring(hyphenAt + 1));
}

// Cumulative "name; name; " strings, in the same format as before.
var allClient = clients.map(function (client) { return client + "; "; }).join('');
var allSite = sites.map(function (site) { return site + "; "; }).join('');

// A Set keeps the first occurrence of each value and preserves order.
var uniqueClient = Array.from(new Set(clients));
var uniqueSite = Array.from(new Set(sites));

Here is a way to do it with indexOf and lastIndexOf. It splits on the last hyphen instead of the second one, which in your case gives you the same results.
// Split on first hyphen: everything after the first "-" stays together.
var list = ["lime-henry", "lemon-susan", "banana-bob-nelson", "apple-rick-jones", "pineapple-sam-smith"];
for (const entry of list) {
  const cut = entry.indexOf("-");
  console.log(entry.substring(0, cut), entry.substring(cut + 1, entry.length));
}
/* outputs
* lime henry
* lemon susan
* banana bob-nelson
* apple rick-jones
* pineapple sam-smith
*/
// Split on last hyphen: everything before the last "-" stays together.
var list = ["fish-mark", "cat-jim", "blue-dog-henry", "red-bird-bill", "green-snake-larry"];
for (const entry of list) {
  const cut = entry.lastIndexOf("-");
  console.log(entry.substring(0, cut), entry.substring(cut + 1, entry.length));
}
/* outputs
* fish mark
* cat jim
* blue-dog henry
* red-bird bill
* green-snake larry
*/

Javascript string matching pattern help

i need to find few words or matching pattern using a Javascript.
this is the requirement.
i have a string like this,
Here is a quick guide for the next
time you reach for your favorite oil and some other topics
and i need to match this string against a string like this
favorite oil and some other topics can be based on something blah blah
how do i get the intersection of matching text blocks?
I already tried intersect Javascript script function, for some strings it's not working properly.
How to solve this problem? can this be done using Regex?
Please advice.

You have to find the Longest common substring.
If the strings are not very long, I recommend using Tim's approach. Otherwise, this is a Javascript implementation of the Longest common substring algorithm with dynamic programming. The runtime is O(mn) where m and n are the lengths of the 2 strings respectively.
An example usage:
// Example: the two sentences share one long run of text.
var first = "Here is a quick guide for the next time you reach for your favorite oil and some other topics";
var second = "favorite oil and some other topics can be based on something blah blah";
// NOTE(review): the output originally recorded here was missing the final "s" of "topics".
console.log(first.intersection(second)); // intended result: ["favorite oil and some other topics"]
This is the algorithm implementation. It returns an array of the longest common substring(s). Extended the native String class, so the intersect method is available on all strings.
/**
 * Longest common substring(s) of this string and `anotherString`, found
 * with a table in which cell [i][j] holds the length of the common run that
 * ends at this[i] and anotherString[j].
 *
 * Relies on the createGrid(rows, columns) helper for the zero-filled table.
 *
 * @param {string} anotherString - String to compare against.
 * @returns {string[]} Every common substring of maximal length, one entry
 *   per place where it ends (so a result may be listed more than once);
 *   empty when the strings share no character.
 */
String.prototype.intersection = function(anotherString) {
  var grid = createGrid(this.length, anotherString.length);
  var longestSoFar = 0;
  var matches = [];

  for (var i = 0; i < this.length; i++) {
    for (var j = 0; j < anotherString.length; j++) {
      if (this.charAt(i) == anotherString.charAt(j)) {
        if (i == 0 || j == 0) {
          // First row/column: a run cannot extend an earlier one.
          grid[i][j] = 1;
        }
        else {
          // Extend the run that ended on the previous characters.
          grid[i][j] = grid[i-1][j-1] + 1;
        }
        if (grid[i][j] > longestSoFar) {
          // A longer run makes all shorter candidates obsolete.
          longestSoFar = grid[i][j];
          matches = [];
        }
        if (grid[i][j] == longestSoFar) {
          // The run ends AT index i, and substring() excludes its end
          // index, so the end must be i + 1 to keep the last character.
          var match = this.substring(i - longestSoFar + 1, i + 1);
          matches.push(match);
        }
      }
    }
  }
  return matches;
}
Also need this helper function to create a 2d array with all elements initialize to 0.
/**
 * Creates a two-dimensional array in which every cell starts at 0.
 *
 * @param {number} rows - Number of inner arrays.
 * @param {number} columns - Number of cells in each inner array.
 * @returns {number[][]} The zero-filled table.
 */
function createGrid(rows, columns) {
  const table = new Array(rows);
  let r = 0;
  while (r < rows) {
    const line = new Array(columns);
    let c = 0;
    while (c < columns) {
      line[c] = 0;
      c += 1;
    }
    table[r] = line;
    r += 1;
  }
  return table;
}

This isn't very efficient and there are much better ways to do this in general (see @Anurag's answer), but it's simple and works fine for short strings:
/**
 * Returns the longest substring that occurs in both strings, found by brute
 * force: candidates are taken from the shorter string, longest first, and
 * looked up in the longer one.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {string} The longest common substring (the leftmost one in the
 *   shorter string when several share that length), or "" when the strings
 *   have no character in common.
 */
function stringIntersection(str1, str2) {
  // Take candidates from the shorter string: fewer substrings to try.
  const [shorter, longer] = str1.length > str2.length ? [str2, str1] : [str1, str2];

  // Start with the whole of the shorter string and try ever shorter
  // substrings at every possible start position, so the first hit is also
  // the longest one.
  for (let length = shorter.length; length > 0; length--) {
    for (let start = 0; start + length <= shorter.length; start++) {
      const candidate = shorter.slice(start, start + length);
      if (longer.includes(candidate)) {
        return candidate;
      }
    }
  }
  return "";
}
// Example: show the text the two sentences have in common.
// alert() is a browser API; use console.log when running outside a browser.
var s1 = "Here is a quick guide for the next time you reach"
+ " for your favorite oil and some other topics";
var s2 = "favorite oil and some other topics can be based on"
+ " something blah blah";
alert( stringIntersection(s1, s2) );

A simple polyfill that filters a string, keeping only the characters that also occur in another string:
// Installed only when no `intersection` method exists on strings yet.
if (!String.prototype.intersection) {
  /**
   * Keeps the characters of this string that also occur anywhere in
   * `anotherString`, preserving their order and original case.
   *
   * @param {string} anotherString - Supplies the set of allowed characters.
   * @param {boolean} [caseInsensitive=false] - Compare lower-cased characters.
   * @returns {string} The filtered string.
   */
  String.prototype.intersection = function(anotherString, caseInsensitive = false) {
    const source = caseInsensitive ? this.toLowerCase() : this;
    const other = caseInsensitive ? anotherString.toLowerCase() : anotherString;
    const allowed = new Set(other.split(""));
    let kept = "";
    this.split("").forEach((ch, idx) => {
      // Test the (possibly lower-cased) character, but keep the original one.
      if (allowed.has(source[idx])) {
        kept += ch;
      }
    });
    return kept;
  };
}
"HelloWorld".intersection("HEWOLRLLODo", true)
"HelloWorld" - case insensitive
"HelloWorld".intersection("HEWOLRLLODo")
"HoWo" - case sensitive

We Keep Coding

JavaScript is the programming language of the Web.

Find duplicate phrases (not just words) in array - javascript

Related

Longest chain of letters in word JavaScript

Javascript - Fastest search for a word in array of string (not full match also)

Split string into pairs, triplets, quadruplets and on (ngrams)?

JavaScript String split()

Javascript string matching pattern help

Categories

Resources