I have the code below which works perfectly in creating a CSV list of word occurences in a string (massive .txt file), that looks like this:
Name;Total
THE;23562
OF;15954
AND;15318
IN;12159
TO;11879
A;11145
I;6135
WAS;6045
etc...
What I want now is word pairs of two, and maybe even three if it proves easy enough. So something like
Name;Total
OF THE;25
FROM THE;20
BY WHICH;13
OF WHICH;5
etc...
How can I modify my existing code to check for pairs instead of single words?
//chrisjopa.com/2016/04/21/counting-word-frequencies-with-javascript/
var fs = require('fs');
var file = 'INPUT.txt';
//Create Headers for the CSV File
const createCsvWriter = require('csv-writer').createObjectCsvWriter;
const csvWriter = createCsvWriter({
//Define Pathname to your choice
path: 'Data1.csv',
header: [
{id: 'name', title: 'Name'},
{id: 'total', title: 'Total'},
]
});
// read file from current directory
fs.readFile(file, 'utf8', function (err, data) {
if (err) throw err;
var wordsArray = splitByWords(data);
var wordsMap = createWordMap(wordsArray);
var finalWordsArray = sortByCount(wordsMap);
//Write CSV Output File
csvWriter
.writeRecords(finalWordsArray)
.then(()=> console.log('DONE'));
});
function splitByWords (text) {
// Removes all special characters, then white spaces,
//then converts to all capital letters, then splits the words
var noPunctuation = text.replace(/[\.,-\/#!$%\^&\*;:{}�=\-_'`’~"()#\+\?><\[\]\+]/g, '');
var noExtraSpaces = noPunctuation.replace(/\s{2,}/g," ");
var allUpperCase = noExtraSpaces.toUpperCase();
var wordsArray = allUpperCase.split(/\s+/);
return wordsArray;
}
//This is the part in the code that I feel is the place to check for word
//pairs, but I'm not sure how I'm supposed to write it.
function createWordMap (wordsArray, ) {
// create map for word counts
var wordsMap = {};
wordsArray.forEach(function (key) {
if (wordsMap.hasOwnProperty(key)) {
wordsMap[key]++;
} else {
wordsMap[key] = 1;
}
});
return wordsMap;
}
function sortByCount (wordsMap) {
// sort by count in descending order
var finalWordsArray = [];
finalWordsArray = Object.keys(wordsMap).map(function(key) {
return {
name: key,
total: wordsMap[key]
};
});
finalWordsArray.sort(function(a, b) {
return b.total - a.total;
});
return finalWordsArray;
}
From the wordsArray, create another array that puts together every pair of words. For example, from a wordsArray of
['Foo', 'Bar', 'Baz', 'Buzz']
create:
['Foo Bar', 'Bar Baz', 'Baz Buzz']
Then, you can use the exact same function you already have to count up the number of occurrences of each pair - just call createWordMap with it (and then sortByCount). For example:
const wordsArray = ['Foo', 'Bar', 'Baz', 'Buzz', 'Foo', 'Bar'];
const wordPairsArray = [];
for (let i = 1; i < wordsArray.length; i++) {
wordPairsArray.push(wordsArray[i - 1] + ' ' + wordsArray[i]);
}
const wordPairMap = createWordMap(wordPairsArray);
const wordPairCount = sortByCount(wordPairMap);
console.log(wordPairCount);
// the following is your original code:
function createWordMap(wordsArray, ) {
// create map for word counts
var wordsMap = {};
wordsArray.forEach(function(key) {
if (wordsMap.hasOwnProperty(key)) {
wordsMap[key]++;
} else {
wordsMap[key] = 1;
}
});
return wordsMap;
}
function sortByCount(wordsMap) {
// sort by count in descending order
var finalWordsArray = [];
finalWordsArray = Object.keys(wordsMap).map(function(key) {
return {
name: key,
total: wordsMap[key]
};
});
finalWordsArray.sort(function(a, b) {
return b.total - a.total;
});
return finalWordsArray;
}
To extend this to more than just pairs, just change the loop to join together a dynamic number of elements:
function combineWords(words, wordsInItem) {
const items = [];
for (let i = wordsInItem - 1; i < words.length; i++) {
const start = i - (wordsInItem - 1);
const end = i + 1;
items.push(words.slice(start, end).join(' '));
}
return items;
}
function getCount(words, wordsInItem) {
const combinedWords = combineWords(words, wordsInItem);
const map = createWordMap(combinedWords);
const count = sortByCount(map);
console.log(count);
}
getCount(['Foo', 'Bar', 'Baz', 'Buzz', 'Foo', 'Bar'], 2);
getCount(['Foo', 'Bar', 'Baz', 'Buzz', 'Foo', 'Bar', 'Baz'], 3);
// the following is your original code:
function createWordMap(wordsArray, ) {
// create map for word counts
var wordsMap = {};
wordsArray.forEach(function(key) {
if (wordsMap.hasOwnProperty(key)) {
wordsMap[key]++;
} else {
wordsMap[key] = 1;
}
});
return wordsMap;
}
function sortByCount(wordsMap) {
// sort by count in descending order
var finalWordsArray = [];
finalWordsArray = Object.keys(wordsMap).map(function(key) {
return {
name: key,
total: wordsMap[key]
};
});
finalWordsArray.sort(function(a, b) {
return b.total - a.total;
});
return finalWordsArray;
}
Related
Sample input
Array
Here I am showing a 3-dimension array but the actual number of dimensions vary and is known as n.
[
[
[1,2],
[3,4]
],
[
[5,6],
[7,8]
]
]
Separators
It has the same length (n) as the number of dimensions of the array where the i-th element represent the separator of the i-th level of the array.
[',', '_', '-']
Desired output
1-2_3-4,5-6_7-8
What I've tried
It works for a 3-dimension array but not for a 4-dimension one.
I know what is going wrong with my code but I have no idea how to fix it.
Besides, I think there are simpler and/or more efficient methods.
3-dimension (working)
const array = [[[1,2],[3,4]],[[5,6],[7,8]]];
const separators = [',', '_', '-'];
const _separators = separators.reverse();
let i;
function join(array, first = false) {
const next = Array.isArray(array[0]);
let result;
if (next) {
result = array.map(e => {
if (first) { i = 0; }
return join(e);
});
i++;
result = result.join(_separators[i]);
}
else {
result = array.join(_separators[i]);
}
return result;
}
const result = join(array, true);
console.log(result);
4-dimension (not working properly)
const array = [[[[1,2],[3,4]],[[5,6],[7,8]]],[[['A','B'],['C','D']],[['E','F'],['G','H']]]];
const separators = ['|', ',', '_', '-'];
const _separators = separators.reverse();
let i;
function join(array, first = false) {
const next = Array.isArray(array[0]);
let result;
if (next) {
result = array.map(e => {
if (first) { i = 0; }
return join(e);
});
i++;
result = result.join(_separators[i]);
}
else {
result = array.join(_separators[i]);
}
return result;
}
const result = join(array, true);
console.log(result);
// desired output: 1-2_3-4,5-6_7-8|A-B_C-D,E-F_G-H
Something like this with recursion
const join = (array, separators, depth) => {
if (depth < separators.length -1) {
return array.map(el => join(el, separators, depth + 1)).join(separators[depth]);
} else {
return array.join(separators[depth]);
}
};
{
const array = [[[1,2],[3,4]],[[5,6],[7,8]]];
const separators = [',', '_', '-'];
console.log(join(array, separators, 0));
}
{
const array = [[[[1,2],[3,4]],[[5,6],[7,8]]],[[['A','B'],['C','D']],[['E','F'],['G','H']]]];
const separators = ['|', ',', '_', '-'];
console.log(join(array, separators, 0));
}
I'm building some program in Nodejs, which will need to keep track in memory of a large number of users. Also, i will have a function that filters a user by id. The code would look something like this:
const users = [
{
id: 1,
name: 'John',
friends: [3, 6, 8]
},
{
id: 2,
name: 'Mark',
friends: [567, 23]
}
]
function getUserById(userId) {
const user = users.filter(user => user.id === userId);
return user[0];
}
The question is, whether this version is generally faster(each key is user id):
const users = {
1: {
id: 1,
name: 'John',
friends: [3, 6, 8]
},
2: {
id: 2,
name: 'Mark',
friends: [567, 23]
}
}
function getUserById(userId) {
return users[userId];
}
My intuition says that the dictionary is faster. What are the facts?
Key lookup time in objects is not guaranteed. It might also be O(n), but most engines will optimize it towards O(1) if you dynamically look up a key multiple times. Filtering an array is O(n), .find() however is twice faster on average:
return users.find(user => user.id === userId);
Now the only datastructure that guarantees O(log n) lookup are Maps:
const userMap = new Map(users.map(u => [u.id, u]));
console.log(userMap.get("test"));
If you however plan to do that in a very large scale (100k is large), I would rather move that task to a database, as it is heavily optimized for those tasks. MongoDB would be easy to adopt, Redis would be very fast, there are many others out there.
I've written a small script that can be copy pasted to the console and shows the actuall data for this question and varifies in practice the answer of Jonas Wilms.
function random_int_from_range(x, y) {
return (x + Math.floor(Math.random() * (y - x + 1)));
}
function generate_name(length_min, length_max) {
var letters = 'abcdefghijklmnopqrstuvwxyz';
var name_array = [];
for (var i = 0; i <= random_int_from_range(length_min, length_max); i ++) {
name_array.push(letters.charAt(Math.floor(Math.random() * letters.length +1)));
}
return name_array.join('')
}
function generate_friends_array(length_min, length_max, num_users) {
friends_array = [];
for (var i = 0; i < random_int_from_range(length_min, length_max); i++) {
friends_array.push(random_int_from_range(0, num_users - 1))
}
return friends_array
}
function generate_users_dict(num_users) {
var users = {};
for (var i = 0; i < num_users; i++) {
users[i] = {
'id': i,
'name': generate_name(4,6),
'friends': generate_friends_array(0, 20, num_users)
}
}
return users
}
function generate_users_list_from_dict(users_dict) {
var users_list = [];
for (var key in users_dict) {
users_list.push(users_dict[key]);
}
return users_list;
}
function get_diff_in_seconds_from_two_milisecond_values(early_value, late_value) {
return (late_value - early_value) / 1000
}
function get_user_by_id_from_dict(users_dict, user_id) {
return users_dict[user_id]
}
function get_user_by_id_from_list(users_list, user_id) {
const users = users_list.filter(user => user.id === user_id);
return users[0]
}
function get_time_for_retrieval_of_item_from_object(object, object_length) {
var function_names = ['get_user_by_id_from_dict', 'get_user_by_id_from_list'];
var random_id = random_int_from_range(0, object_length - 1);
var function_name = '';
if (Array.isArray(object)) {
function_name = function_names[1];
}
else {
function_name = function_names[0];
}
var time_before_retrieval = new Date().getTime();
window[function_name](object, random_id);
var time_after_retrieval = new Date().getTime();
return get_diff_in_seconds_from_two_milisecond_values(time_before_retrieval,
time_after_retrieval);
}
function test_retrieval_times(number_of_users, tests_num, object_type) {
var users_dict = generate_users_dict(number_of_users);
var users_list = generate_users_list_from_dict(users_dict);
var times_array = [];
var object = '';
if (object_type == 'dict') {
object = users_dict;
}
else {
object = users_list;
}
for (var i = 0; i < tests_num; i++) {
times_array.push(get_time_for_retrieval_of_item_from_object(object,
number_of_users));
}
return times_array;
}
function get_average_retrieval_time(object_type, number_of_users,
numbers_of_retrievals) {
var retrieval_times = test_retrieval_times(number_of_users, numbers_of_retrievals,
object_type);
var sum = 0;
for (var i = 0; i < retrieval_times.length; i++) {
sum += retrieval_times[i];
}
console.log('average retrieval time for ' + object_type + ': ' + sum /
numbers_of_retrievals);
}
var number_of_users = parseInt(prompt("Please enter object size", "1000000"));
var number_of_retrievals = parseInt(prompt("Please enter number of retrievals",
"100"));
get_average_retrieval_time('dict', number_of_users, number_of_retrievals);
get_average_retrieval_time('list', number_of_users, number_of_retrievals);
The results of the tests are printed to the console.
I like to create a object that looks like this for the following URL:
faq/jamie/hutber/faq.json
faq/jamie/hutber/faq_sales.json
sales/people/faq_refunds.json
{
faq: {
jamie: {
hutber:[
"faq.json",
"faq_sales.json"
]
}
},
sales: {
people: [
faq_refunds.json
]
}
}
I feel confident to be able to build we'll need some kind of recursion... which I am lacking in.
const data = {}
const list = 'faq/jamie/hutber/faq.json'.split('/').reverse();
list.forEach((cur, index) => {
if(cur.includes('.json')){
data[cur];
} else if(poo[cur]) {
data[cur] = {}
}else{
data[cur] = {}
}
});
var a = ["faq/jamie/hutber/faq.json",
"faq/jamie/hutber/faq_sales.json",
"sales/people/faq_refunds.json"]; //your URLs
var jsonObj = {}; //this json object will store your result
function urlToJson(array, index, jsonObj){ //function that implements your logic
if(index == array.length - 2){
jsonObj[ array[index] ] = jsonObj[ array[index] ] || [];
jsonObj[ array[index] ].push(array[index + 1]);
return;
}
jsonObj[ array[index] ] = jsonObj[ array[index] ] || {};
urlToJson(array, index + 1, jsonObj[ array[index] ]);
}
for(var key in a){
var array = a[key].split("/");
urlToJson(array, 0, jsonObj);
}
console.log(jsonObj);
You can do this in a loop. Note that you won't support folders that contain both folders and files in your current format.
Here's an example that loops over all paths and adds object to the tree. It's a bit ugly, but it should help you write your own function.
const paths = [
"faq/jamie/hutber/faq.json",
"faq/jamie/hutber/faq_sales.json",
"sales/people/faq_refunds.json"
];
const makeTree = (paths, tree = {}) =>
paths.reduce(
(tree, path) => {
const parts = path.split("/");
const folders = parts.slice(0, -2);
const container = parts[parts.length - 2];
const file = parts[parts.length - 1];
let loc = tree;
folders.forEach(f => {
loc[f] = loc[f] || {};
loc = loc[f];
});
loc[container] = loc[container] || [];
loc[container].push(file);
return tree;
},
tree
);
console.log(makeTree(paths));
I've got a collection in the database MongoDB called words, which stores all words. They have been extracted via queries on the backend and pushed to the front end.
This has been done in the front end:
this.annotationSub = this.annotationService
.getWordUpdateListener()
.subscribe((thewords: ComplexWord[]) => {
this.thewords = thewords;
this.thewords.map(word => {
if (word.word === this.setWord) {
this.wordIWant = word.word;
}
console.log(word);
});
The console.log(word); on top give these fields =
{word: "Lorem", annotation: "Explain Lorem"},
{word: "Aenean", annotation: "Explain Aenean"},
{word: "Hello", annotation: "Explaining Hello"}
This retrieves all the text:
this.postsService.getPosts();
this.postsSub = this.postsService
.getPostUpdateListener()
.subscribe((posts: Post[]) => {
this.posts = posts;
this.posts.map(post => {
if (post.id === this.id) {
this.postIWant = post.fileText;
}
});
});
On this.postIWant I have got all the text from the post.
Now how do I check if any words have any matches with the text in this.postIWant?
Many thanks in advance
This is the best solution.
Passing in the text to the function:
function complexWordIdentification(text) {
const complexWords = ['Hello', 'World', 'Complex Phrase'];
const results = [];
let match, regexp, result;
for (let i = 0; i < complexWords.length; i++) {
// the complex word we are checking in this iteration
const complexWord = complexWords[i];
regexp = new RegExp(complexWord, 'g');
while ((match = regexp.exec(text)) !== null) {
result = {
begin: (regexp.lastIndex - complexWords[i].length),
end: regexp.lastIndex,
text: complexWord
};
results.push(result);
}
}
return results;
}
Something like this
var joined = function(arr) {
var res = [];
for (var i in arr) {
var u = DB.getUser(arr[i].user_id, function(user) {
return user;
});
arr[i].user = u;
res = arr[i];
}
return res;
}
I need to get user variable from DB.getUser scope.
Is just inserted some comments into you code to help understand the async flow:
var joined = function(arr) {
// Timestamp: 0
var res = [];
for (var i in arr) {
// Timestamp: 1
var u = DB.getUser(arr[i].user_id, function(user) {
// Timestamp 4 ... length of arr
// user contains what you are looking for
// but this is not return to u, because we passed that a long time ago
return user;
});
// u is null or undefined, because DB.getUser returns nothing
// is a async function, you need wait for the callback
arr[i].user = u;
// Timestamp: 2 return useless arr
res = arr[i];
}
// Timestamp: 3 again, return a useless array
return res;
}
Edit:
You need to this before you pass everything to the template, e.g.:
var joined = function(arr, doneCallback) {
var res = []
var count = arr.length;
for (var i in arr) {
DB.getUser(arr[i].user_id, function(user) {
count--;
res.push(user);
if (count == 0) {
doneCallback(res);
}
})
}
}
joined(somedata, function(mydata) {
render(template, mydata)
});
Take a look at some flow control libraries. (My favorite async)
var getJoinedAndDoSomeThingWithThem = function(ids) {
var joined = [];
var i = 0;
var getAUser = function () {
DB.getUser(ids[i].user_id, function(user) {
joined.push(user);
i++;
if (i == ids.length -1) {
doSomeThingWithTheResult(joined);
return;
}
getAUser();
});
}
getAUser();
}