Extract keyphrases from text (1-4 word ngrams) - javascript

What's the best way to extract keyphrases from a block of text? I'm writing a tool to do keyword extraction: something like this. I've found a few libraries for Python and Perl to extract n-grams, but I'm writing this in Node so I need a JavaScript solution. If there aren't any existing JavaScript libraries, could someone explain how to do this so I can just write it myself?

I like the idea, so I've implemented it: See below (descriptive comments are included).
Preview at: https://jsfiddle.net/WsKMx
/*#author Rob W, created on 16-17 September 2011, on request for Stackoverflow (http://stackoverflow.com/q/7085454/938089)
* Modified on 17 juli 2012, fixed IE bug by replacing [,] with [null]
* This script will calculate words. For the simplicity and efficiency,
* there's only one loop through a block of text.
* A 100% accuracy requires much more computing power, which is usually unnecessary
**/
var text = "A quick brown fox jumps over the lazy old bartender who said 'Hi!' as a response to the visitor who presumably assaulted the maid's brother, because he didn't pay his debts in time. In time in time does really mean in time. Too late is too early? Nonsense! 'Too late is too early' does not make any sense.";
var atLeast = 2; // Show results with at least .. occurrences
var numWords = 5; // Show statistics for one to .. words
var ignoreCase = true; // Case-sensitivity
var REallowedChars = /[^a-zA-Z'\-]+/g;
// RE pattern to select valid characters. Invalid characters are replaced with a whitespace
var i, j, k, textlen, len, s;
// Prepare key hash
var keys = [null]; //"keys[0] = null", a word boundary with length zero is empty
var results = [];
numWords++; //for human logic, we start counting at 1 instead of 0
for (i=1; i<=numWords; i++) {
keys.push({});
}
// Remove all irrelevant characters
text = text.replace(REallowedChars, " ").replace(/^\s+/,"").replace(/\s+$/,"");
// Create a hash
if (ignoreCase) text = text.toLowerCase();
text = text.split(/\s+/);
for (i=0, textlen=text.length; i<textlen; i++) {
s = text[i];
keys[1][s] = (keys[1][s] || 0) + 1;
for (j=2; j<=numWords; j++) {
if(i+j <= textlen) {
s += " " + text[i+j-1];
keys[j][s] = (keys[j][s] || 0) + 1;
} else break;
}
}
// Prepares results for advanced analysis
for (var k=1; k<=numWords; k++) {
results[k] = [];
var key = keys[k];
for (var i in key) {
if(key[i] >= atLeast) results[k].push({"word":i, "count":key[i]});
}
}
// Result parsing
var outputHTML = []; // Buffer data. This data is used to create a table using `.innerHTML`
var f_sortAscending = function(x,y) {return y.count - x.count;};
for (k=1; k<numWords; k++) {
results[k].sort(f_sortAscending);//sorts results
// Customize your output. For example:
var words = results[k];
if (words.length) outputHTML.push('<td colSpan="3" class="num-words-header">'+k+' word'+(k==1?"":"s")+'</td>');
for (i=0,len=words.length; i<len; i++) {
//Characters have been validated. No fear for XSS
outputHTML.push("<td>" + words[i].word + "</td><td>" +
words[i].count + "</td><td>" +
Math.round(words[i].count/textlen*10000)/100 + "%</td>");
// textlen defined at the top
// The relative occurence has a precision of 2 digits.
}
}
outputHTML = '<table id="wordAnalysis"><thead><tr>' +
'<td>Phrase</td><td>Count</td><td>Relativity</td></tr>' +
'</thead><tbody><tr>' +outputHTML.join("</tr><tr>")+
"</tr></tbody></table>";
document.getElementById("RobW-sample").innerHTML = outputHTML;
/*
CSS:
#wordAnalysis td{padding:1px 3px 1px 5px}
.num-words-header{font-weight:bold;border-top:1px solid #000}
HTML:
<div id="#RobW-sample"></div>
*/

I do not know such a library in JavaScript but the logic is
split text into array
then sort and count
alternatively
split into array
create a secondary array
traversing each item of the 1st array
check whether current item exists in secondary array
if not exists
push it as a item's key
else
increase value having a key = to item sought.
HTH
Ivo Stoykov

function ngrams(seq, n) {
to_return = []
for (let i=0; i<seq.length-(n-1); i++) {
let cur = []
for (let j=i; j<seq.length && j<=i+(n-1); j++) {
cur.push(seq[j])
}
to_return.push(cur.join(''))
}
return to_return
}
> ngrams(['a', 'b', 'c'], 2)
['ab', 'bc']

Related

Why is my array 'undefined'? (vanilla javascript)

I'm trying to make a simple 'bad words' filter with javascript. It's meant to listen to any submit events on the page, then iterate through all input fields of the text type, check them for bad stuff by comparing the entered text with the word list, and finally return an according console.log/alert (for now).
I have two files: word-list.js with the critical words (loads first) and filter.js which pulls an array with all words from word-list.js.
My problems is, swear_words_arr[1] is 'undefined' and I don't understand why. I've been looking around for solutions, but still I can't seem to determine the reason for this. Help is much appreciated.
// get all inputs type = text and turn html collection into array
var getInputs = document.querySelectorAll("input[type=text]")
var inputs = Array.from(getInputs);
//var swear_alert_arr -> from in word-list.js
var swear_alert_arr = new Array();
var swear_alert_count = 0;
function reset_alert_count() {
swear_alert_count = 0;
}
function validate_text() {
reset_alert_count();
inputs.forEach(function(input) {
var compare_text = input.value;
console.log(compare_text);
for (var i = 0; i < swear_words_arr.length; i++) {
for (var j = 0; j < compare_text.length; i++) {
if (
swear_words_arr[i] ==
compare_text.substring(j, j + swear_words_arr[i].length).toLowerCase()
) {
swear_alert_arr[swear_alert_count] =
compare_text.substring(
j,
j + swear_words_arr[i].length
);
swear_alert_count++;
}
}
}
var alert_text = "";
for (var k = 1; k <= swear_alert_count; k++) {
alert_text += "\n" + "(" + k + ") " + swear_alert_arr[k - 1];
if (swear_alert_count > 0) {
alert("No!");
console.log('omg no bad stuff! D:');
} else {
console.log('no bad stuff found :)');
}
}
});
}
window.onload = reset_alert_count;
window.addEventListener('submit', function() {
validate_text();
});
It doesn't look like you've declared the array you're trying to access.
But, instead of loops with nested loops and keeping track of loop counters, just get a new array that contains any bad words in the submitted array. You can do this a number of ways, but the Array.prototype.filter() method works nicely:
let badWords = ["worse", "terrible", "horrible", "bad"];
let submittedWords = ["Good", "Terrible", "Great", "Fabulous", "Bad", "OK"];
// Loop over the submitted words and return an array of all the bad words found within it
let bad = submittedWords.filter(function(word){
// Do a case-insensitive match test. Return the word from the submitted words
// if it's on the bad word list.
return badWords.indexOf(word.toLowerCase()) > -1 ? word: null;
});
console.log("Bad words found in submitted data: " + bad.join(", "));

Increment counts of sites visited including TLDs and subdomains outputted to JSON

So I was asked to create an algorithm that when given a basic input of an array of counts and sites, it will output the accumulated visits to each TLD and Subdomain represented in a JSON object that will yield data like:
1120 com
800 google.com
310 reddit.com
60 mail.yahoo.com
10 mobile.sports.yahoo.com
50 sports.yahoo.com
10 stackoverflow.com
3 org
3 wikipedia.org
2 en.wikipedia.org
2 es.wikipedia.org
1 mobile.sports
1 sports
The input is something like:
// visits = [ "800,google.com",
// "60,mail.yahoo.com",
// "10,mobile.sports.yahoo.com",
// "40,sports.yahoo.com",
// "310,reddit.com",
// "10,stackoverflow.com",
// "2,en.wikipedia.org",
// "1,es.wikipedia.org",
// "1,mobile.sports" ]
My code looks like this so far and I know its wrong, but my brain is melted at the moment and I am not sure how to proceed. I am not necessarily looking for you to write the algorithm for me, but I do want to understand logically how I could break this down.
function getDomainHits(arr){
var splitCount = [];
var splitDomains = [];
var domainCountDict = {"Domains" : [],"Count" : 0};
for (var i = 0; i < arr.length; i++){
splitCount = arr[i].split(",");
splitDomains = splitCount[1].split(".");
for (var j = 0; j < splitDomains.length; j++){
if (!domainCountDict.Domain.includes(splitDomains[j])){
domainCountDict.Domain.push(splitDomains[j]);
}
}
}
console.log(domainCountDict);
}
As you can see I stopped here because I couldn't think of the best way to split these into different key, value pairs - one being domains and the other being the counts. Also my algorithm doesn't exactly follow the requirements.
So I figured out the algorithm. Define a variable - initialize it as an Array, and a dictionary to store the processed array data.
var splitCount = [];
var domainCountDict = {};
Then you need to take the Array of strings (arr - the function parameter) and iterate through it. On each iteration you need to split the string element into another Array to further process it.
for (var i = 0; i < arr.length; i++){
splitCount = arr[i].split(",");
...
}
So for the example input data of
// visits = [ "800,google.com",
// "60,mail.yahoo.com",
// "10,mobile.sports.yahoo.com",
// "40,sports.yahoo.com",
// "310,reddit.com",
// "10,stackoverflow.com",
// "2,en.wikipedia.org",
// "1,es.wikipedia.org",
// "1,mobile.sports" ]
Iteration 0 would be split into an Array of ["800","google.com"] and assigned to Var splitCount. You would then need to access splitCount and because of the input formatting you don't need to create a for loop. I created a variable to store the current count of the site - which will always be element 0 because of the format of the input data.
I didn't bother with input sanitation here because I didn't have time to create a map function that will turn the number elements into - well... numbers. I relied on the assumption that the input data will always have a number in the 0th index - which is terrible. Don't do this.
var curCnt = 0;
if (splitCount[0]){
curCnt = splitCount[0];
}
This next chunk of logic hurt my brain a little bit because I needed to find a way to store each domain component and its count in the dict and determine if the other domains contained components that already existed and if so increment those. Lets make some more Arrays!
var domain = [];
var currentDom = [];
if (splitCount[1] != undefined && splitCount[1]){
domain = splitCount[1].split(".");
for (var j = domain.length - 1; j >= 0; j--){
...
}
}
Above you will see that created an Array to hold the domain components called domain and another called currentDom to hold the components that are being worked and have already been worked, because we want to make sure that we count com and google.com. Lets look inside of the for loop.
for (var j = domain.length - 1; j >= 0; j--){
currentDom.unshift(domain.pop());
/*console.log("current iter: " + k + "\n"
+ "currentDom: " + currentDom.join(".") + "\n"
+ "current count: " + curCnt + "\n");*/
if (currentDom.join(".") in domainCountDict){
/*console.log("currentDom2: " + currentDom.join("."));
console.log("increment existing");*/
domainCountDict[currentDom.join(".")] += parseInt(curCnt);
}
if (!(currentDom.join(".") in domainCountDict)){
/*console.log("currentDom3: " + currentDom.join("."));
console.log("increment new");*/
domainCountDict[currentDom.join(".")] = parseInt(curCnt);
//console.log(domainCountDict);
}
}
Above you will see that I am iterating backwards in this loop to work the TLD first and then the domains/subdomains. I chose to pop the last element off the end of the current array and unshift it to the beginning of the new Array, currentDom. This will effectively let me work on a portion of the entire FQDN to determine if it has been included in the dictionary.
I have a few if statements to determine if the currentDom is included in the array. I had to use Array.join() to accurately check if the string of the current domain components have been included in the dictionary. If not then the string of currentDom would be added as a key and the curCnt would be the value assigned. If so, then the value would be incremented. Because of my lazy input sanitation in the curCnt assignment I had to parse these as Int because JS dynamic types. I am sure there is a better way, but my brain hurts now.
Finally make sure that you return the created dictionary on the outside of all of these for loops.
The full algorithm is below
// Sample output (in any order/format):
// getTotalsByDomain(counts)
// 1320 com
// 900 google.com
// 410 yahoo.com
// 60 mail.yahoo.com
// 10 mobile.sports.yahoo.com
// 50 sports.yahoo.com
// 10 stackoverflow.com
// 3 org
// 3 wikipedia.org
// 2 en.wikipedia.org
// 1 es.wikipedia.org
// 1 mobile.sports
// 1 sports
let counts = [ "900,google.com",
"60,mail.yahoo.com",
"10,mobile.sports.yahoo.com",
"40,sports.yahoo.com",
"300,yahoo.com",
"10,stackoverflow.com",
"2,en.wikipedia.org",
"1,es.wikipedia.org",
"1,mobile.sports" ];
console.log(getDomainHits(counts));
function getDomainHits(arr){
var splitCount = [];
var domainCountDict = {};
for (var i = 0; i < arr.length; i++){
splitCount = arr[i].split(",");
var curCnt = 0;
if (splitCount[0]){
curCnt = splitCount[0];
}
var domain = [];
var currentDom = [];
if (splitCount[1] != undefined && splitCount[1]){
domain = splitCount[1].split(".");
for (var j = domain.length - 1; j >= 0; j--){
currentDom.unshift(domain.pop());
/*console.log("current iter: " + k + "\n"
+ "currentDom: " + currentDom.join(".") + "\n"
+ "current count: " + curCnt + "\n");*/
if (currentDom.join(".") in domainCountDict){
/*console.log("currentDom2: " + currentDom.join("."));
console.log("increment existing");*/
domainCountDict[currentDom.join(".")] += parseInt(curCnt);
}
if (!(currentDom.join(".") in domainCountDict)){
/*console.log("currentDom3: " + currentDom.join("."));
console.log("increment new");*/
domainCountDict[currentDom.join(".")] = parseInt(curCnt);
//console.log(domainCountDict);
}
}
}
}
return domainCountDict;
}

Find a number of characters/letters in a string in javascript

I want to find a number of "a" characters in a string. Ideally, I want to get an output as an array that would print out in the console something like: c - 15, b - 5, a - 4 etc.
<!DOCTYPE html>
<html>
<body>
<script>
function findStrings() {
mainString="Mazher Mahmood is a clever, canny and creative reporter who generates his own stories. It's important to place that on record because, before we delve into his use of the darker journalistic arts, there should not be any illusion about his reporting skills. "
result=(mainString.split("a").length - 1);
console.log(result);
}
</script>
</body>
</html>
You would just need to loop through each letter, making the check as you go and perhaps append it to an object:
function findStrings() {
var letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ",
ret = {};
for(i=0; i<letters.length; i++){
ret[letters[i]]=(mainString.split(letters[i]).length - 1);
}
console.log(ret);
}
findStrings();
Should you want to explicitly check for any other characters, just add them on to the end of the letters string.
JSFiddle
You can count like this:
mainString="Your Big String";
function count(str) {
var chars = {};
var astr = str.split("");
for (var i = 0, len = astr.length; i < len; i++) {
var letter = astr[i];
chars[letter] = chars.hasOwnProperty(letter) && chars[letter] + 1 || 1;
}
return chars;
}
console.log(count(mainString));
Note that, this way, you will count the occurrence of every character, including spaces, commas, etc.
Try this:
function getCharAppearences(str) {
var character,result = {};
for(var i = 0; i < str.length; i++) {
character = str.charAt(i);
result[character] = result[character] + 1 || 1;
}
return result;
}
Fiddle
If you only want to count the occurrences of letters/characters, you just have to loop through the string:
function findStrings() {
var mainString="Mazher Mahmood is a clever, canny and creative reporter who generates his own stories. It's important to place that on record because, before we delve into his use of the darker journalistic arts, there should not be any illusion about his reporting skills. "
var findings = {};
for(var i=0; i<mainString.length; i++){
if(typeof( findings[mainString[i]] ) == "undefined"){
findings[mainString[i]] = 0;
}
findings[mainString[i]]++;
}
for(var sym in findings){
console.log("The character "+sym+" has been found "+findings[sym]+" times");
}
}
findStrings();
Note:
oGeez`s answer only counts the occurrence of specific characters, while my answer counts the occurrence of all appearing characters. Your question isn't clear enough on what you actually want to achieve.
JSFiddle: http://jsfiddle.net/A4WWN/

Finding the rank of the Given string in list of all possible permutations

I am trying to find the Rank of the given string in the list of possible permutations. I tried to come up with a solution that tries to find all possible permutations, assign a rank to them and then display it.
But this drastically reduces the performance when the length of the string keeps increasing. So was wondering if someone can think of an efficient solution for this problem..
function permute(str) {
// Sort the string
var arr = [];
for (var i = 0; i < str.length; i++) arr.push(str[i]);
var sortedString = arr.sort().join(''),
// Length of the string
length = str.length,
used = [];
// Create a boolean array for length of the string
while (length--) used.push(false);
// String buffer that holds the current string
var out = '';
// Call the function
doPermute(sortedString, str, out, used, str.length, 0);
}
var count = 0;
function doPermute(inp, givenString, out, used, length, level) {
// Only if length of the string equal to current level print it
// That is permutation length is eqaul to string length
if (level == length) {
count++;
//console.log('Perm :: ' + out + ' -- ' + count);
if (out === givenString) {
var pp = 'Rank of :: ' + out + ' -- ' + count;
$('div').append('<p>' + pp + '</p>');
}
return;
}
for (var i = 0; i < length; ++i) {
// If variable used continue
if (used[i]) continue;
// Append the current char in loop
out += inp[i];
// set variable to true
used[i] = true;
// Call the function again
doPermute(inp, givenString, out, used, length, level + 1);
// Set it to false as the variable can be reused
used[i] = false;
// remove the last character of the buffer
out = out.slice(0, out.length - 1)
}
}
permute('dbcarf')
Fiddle
Sure: if input string is "cab".
What is the lowest rank that a string starting with c could get?
c
Note the strings that come before it.
abc
acb
bac
bca
So a string starting with c has minimum rank 5.This is just number of characters in input string that come lexicographically before c.(in order a,b,c,d,e,f...)So we have 2.Each word starting with a letter can have 2 words.
Next letter is "a"?
What is minimum rank that a word starting with "ca" can get?
5
Why?
"a" is the best way we can fill the second spot with the remaining letters.
And the same goes for third element "b".
So rank of "cab" is 5.
In general.(Assuming no duplicates, though this is not much harder)
var W; //input string
var C[26];
var rank = 1;
for (var i = 0; i < W.length; i++) C[W[i] - 'a']++;
for (var i = 0; i < W.length; i++) {
//How many characters which are not used, that come before current character
var count = 0;
for (var j = 0; j < 26; j++) {
if (j == (W[i] - 'a')) break;
if (C[j] > 0) count++;
}
C[W[i] - 'a'] = 0;
rank += count * fact(W.length - i - 1);
}
There is an explanation in https://en.wikipedia.org/wiki/Permutation#Numbering_permutations of how to convert a permutation on n objects to a number in the range 0..n!-1 and it goes on to say that "Converting successive natural numbers to the factorial number system produces those sequences in lexicographic order (as is the case with any mixed radix number system), and further converting them to permutations preserves the lexicographic ordering, provided the Lehmer code interpretation is used" So I would try doing this number conversion and see if it produces something related to the rank that you need, by your definition.

Javascript: matching a dynamic string against an array

I'm attempting to teach myself javascript. I chose something I assumed was simple, but ran into problems relatively quickly.
I'm attempting to search a string for another string given by the user.
My code so far is:
var source = "XREs2qqAQfjr6NZs6H5wkZdOES5mikexRkOPsj6grQiYNZfFoqXI4Nnc1iONKVrA";
var searchString = []; //the users input
searchString = prompt("Enter search string");
var hits = [];
var one = 0;
var two = 0;
var k = 0;
var sourceSearch = function(text) {
for(i = 0; i < source.length; i++) { //for each character in the source
if(source[i] === searchString[0]) { //if a character in source matches the first element in the users input
one = source.indexOf(i); //confused from here on
for(p = searchString.length; p > 0; p--) {
}
}
}
};
sourceSearch(searchString);
My idea was:
check to see if the first loop finds a character that matches the first character in the user input
if it matches, check to see if the next X characters after the first match the next X characters in the source string
if they all match, push them to the hits array
My problem: I have no idea how to iterate along the arrays without nesting quite a few if statements, and even then, that wouldn't be sufficient, considering I want the program to work with any input.
Any ideas would be helpful. Thanks very much in advance.
Note: There are a few un-used variables from ideas I was testing, but I couldn't make them work.
You can try:
if (source.indexOf(searchString) !== -1) {
// Match!
}
else
{
//No Match!
}
As the other answers so far point out, JavaScript strings have an indexOf function that does what you want. If you want to see how it's done "by hand", you can modify your function like this:
var sourceSearch = function(text) {
var i, j, ok; // always declare your local variables. globals are evil!
// for each start position
for(i = 0; i < source.length; i++) {
ok = true;
// check for a match
for (j = searchString.length - 1; ok && j >= 0; --j) {
ok = source[i + j] === searchString[j];
}
if (ok) {
// searchString found starting at index i in source
}
}
};
This function will find all positions in source at which searchString was found. (Of course, you could break out of the loop on the first success.) The logic is to use the outer loop to advance to each candidate start position in source and use the inner loop to test whether that position actually is the position of a match to searchString.
This is not the best algorithm for searching strings. The built-in algorithm is much faster (both because it is a better algorithm and because it is native code).
to follow your approach, you can just play with 2 indexes:
var sourceSearch = function(text) {
j = 0;
for(i = 0; i < source.length; i++) {
if(source[i] === text[j]) {
j++;
} else {
j = 0;
}
if (j == text.length) {
console.log(i - j); //this prints the starting index of the matching substring
}
}
};
These answers are all pretty good, but I'd probably opt for something like this:
var source = "XREs2qqAQfjr6NZs6H5wkZdOES5mikexRkOPsj6grQiYNZfFoqXI4Nnc1iONKVrA";
var searchString = []; //the users input
searchString = prompt("Enter search string");
var hits = source.split(searchString);
var hitsCount = hits.length - 1;
This way you have all of the data you need to figure out where each hit occurred in he source, if that's important to you.

Categories