Need to chunk strings into 140-character chunks while respecting words - javascript

I have a very simple problem but need a solution that works. I have a node script that opens a text file, loops over each line and chunks the line if its over 140 characters but needs to respect word boundaries. This is what I have so far but the lines come out unaffected. I've also tried _.invoke(lines, function() { splitText(this); }; but this also leaves the lines unaffected. Can anyone suggest another way of doing this?
var args = process.argv.splice(2),
fs = require('fs'),
_ = require('underscore'),
splitText;
splitText = function (textSegment) {
var len = 140, curr = len, prev = 0, output = [], currReverse;
while (textSegment[curr]) {
if (textSegment[curr++] == ' ') {
output.push(textSegment.substring(prev, curr));
prev = curr;
curr += len;
} else {
currReverse = curr;
do {
if (textSegment.substring(currReverse - 1, currReverse) == ' ') {
output.push(textSegment.substring(prev, currReverse));
prev = currReverse;
curr = currReverse + len;
break;
}
currReverse--;
} while (currReverse > prev);
}
}
output.push(textSegment.substring(prev));
return output;
}
text = fs.readFileSync(args[0], 'utf-8');
lines = text.split("\n");
lines = _.filter(lines, function (line) {
return line.length >= 100;
});
lines = _.map(lines, function (line) {
return splitText(line);
});
fs.writeFile(args[0], lines.join("\n"), function (err) {
if (err) throw err;
console.log('test');
});

I don't have any experience with underscore.js, but I do know a fairly straightforward way to fix this without it:
function formatStr(text, len) {
len = len||140;
var i=0,
str, newline, breakpt,
formatted = '';
while (i+len<text.length) {
str = text.substr(i, len);
newline = str.indexOf('\n');
if (newline!=-1) {
formatted += str.substr(0,newline+1);
console.log(i,newline);
i += newline + 1;
continue;
}
breakpt = str.lastIndexOf(' ');
formatted += str.substr(0,breakpt) + '\n';
i+=breakpt+1;
}
// add last line to the end and return; credit to Charly
// for mentioning this was missing.
return formatted + text.substr(i);
}
DEMO
What this loop does is the following:
Store the next 140 characters in a var
test if there are already any newlines in this string
if so, just add that part of the string to the formatted string, and continue from there
get the last index of a space in the string variable
append the part of the string until the next space in the formatted string
finally, return the formatted string.

Kind of an old thread but here's a quick solution.
Try this regex, you can see how it works here: http://regexper.com/#%5E(%5Cr%5Cn%7C.)%7B1%2C140%7D%5Cb
str.match(/^(\r\n|.){1,140}\b/g).join('')

Related

Javascript Regex Custom Replace

How do I get the following conversion using Regex?
Content(input data structure):
a-test
b-123
c-qweq
d-gdfgd
e-312
Conversion:
1-test
2-123
3-qweq
4-gdfgd
Final-312
var index = 1;
function c_replace() {
if(index == 5) { return "Final"; }
return index++;
}
there you go :D
// i assume you have a string input that contains linebreaks due to your question format
const input = `a-test
b-123
c-qweq
d-gdfgd
e-312`.trim(); // removing whitespace in front or behind the input data.
//splitting the lines on whitespace using \s+
const output = input.split(/\s+/).map((s, i, a) => {
// this will map your pattern asd-foooasdasd
const data = s.match(/^[a-z]+-(.+)$/);
// you may want to tweak this. right now it will simply throw an error.
if (!data) throw new Error(`${s} at position ${i} is a malformed input`);
// figure out if we are in the final iteration
const final = i == a.length -1;
// the actual output data
return `${final ? "Final" : (i + 1)}-${data[1]}`;
// and of course join the array into a linebreak separated list similar to your input.
}).join("\n");
console.log(output);
Test
var index=1;
var text=`a-test
b-123
c-qweq
d-gdfgd
e-312`;
function c_replace() {
if(index == 5) { return "Final-"; }
return index++ +'-';
}
console.log(text.replace(/.-/g,c_replace));
var input = [
'a-test',
'b-123',
'c-qweq',
'd-gdfgd',
'e-312'
];
var output = input.map((e, i) => ++i + e.slice(1));
output[output.length - 1] = 'Final' + output[output.length - 1].slice(1);
console.log(output);

Splitting a string into multiple lines in javascript

I'm trying to find a way to split long strings to multiple lines so what I'm doing is insert text into an image and if it gets too long it overflows, newlines work but it wouldn't be best idea to let user add the newlines and split it in code, so if i give it a limit it checks if its over limit split to two lines or i mean a newline \n between it, however that's easy but my problem is when it comes that the second part is also over the limit then it should split it in to 3 newlines, how would you go implement that?
Examples
split("sometext", 5); // somet\next
split("Hello", 2); // he\nll\no
Very straightforward answer to your question:
function customSplit(str, maxLength){
if(str.length <= maxLength)
return str;
var reg = new RegExp(".{1," + maxLength + "}","g");
var parts = str.match(reg);
return parts.join('\n');
}
You need a function like the following:
function split(str, maxWidth) {
const newLineStr = "\n";
done = false;
res = '';
do {
found = false;
// Inserts new line at first whitespace of the line
for (i = maxWidth - 1; i >= 0; i--) {
if (testWhite(str.charAt(i))) {
res = res + [str.slice(0, i), newLineStr].join('');
str = str.slice(i + 1);
found = true;
break;
}
}
// Inserts new line at maxWidth position, the word is too long to wrap
if (!found) {
res += [str.slice(0, maxWidth), newLineStr].join('');
str = str.slice(maxWidth);
}
if (str.length < maxWidth)
done = true;
} while (!done);
return res + str;
}
function testWhite(x) {
const white = new RegExp(/^\s$/);
return white.test(x.charAt(0));
};
console.log(split("sometext", 5));
console.log(split("Hello", 2));
https://j11y.io/snippets/wordwrap-for-javascript/
Wraps using specified limit on characters. :)
What kind of interface are you building? If it's a web interface, you should style the string on the front end, instead of modifying it on the data layer.
If it's a text based interface and you really need to do this, then you can get the first n characters while there is a non-empty string, then join with '\n'. Assuming you have underscore:
function split(str, n) {
let numberOfSegments = Math.ceil(_.size(str) / n);
let segments = _.map(_.range(numberOfSegments),
segmentIndex => str.substr(segmentIndex * n, n));
return segments.join('\n');
}

How can I exclude punctuation in "pig latin" function?

I apologize if this question has been answered somewhere - please point me in the right direction if so. I have read through a bunch of solutions and have not yet cracked it!
Sooo...basically, I need to:
Move the first letter of each word to the end of it, then add "ay" to the end of the word. Leave punctuation marks untouched.
This is my code so far:
function pigIt(str) {
var newStr = str.split(" ");
var changed = newStr.map(function(input) {
return input.substring(1) + input.charAt(0) + "ay";
});
changed = changed.join(" ");
return changed;
}
console.log(pigIt('Pig latin is cool'));
As you can see, the code will work for any input that doesn't include punctuation. Great. Now I need to maybe add a Regex somewhere to exclude punctuation but I don't know where to put it! Please help!!
You could split by the word boundary /(\W+)/ while capturing separator. Transform words only. And then join back.
function pigIt(str) {
var newStr = str.split(/(\W+)/); // ['Pig', ' ', 'latin', ',- ',..]
var changed = newStr.map(function(input) {
if (!/\w/.test(input)) return input // keep non word elements as is
return input.substring(1) + input.charAt(0) + "ay";
});
return changed.join("");
}
console.log(pigIt('Pig latin,- is cool!'));
I think because you are going to want to put the punctuation back in the same place after processing, then you will probably be better of doing it all manually.
Loop the input 1 char at a time and build a 'word buffer', every time you hit a non-letter character then process the word buffer and append the non-letter character too.
function pigIt(str) {
var process = function(s) {
if (s.length < 2) {
return s;
}
return s.substring(1) + s.charAt(0) + "ay";
};
var result = '';
var buffer = '';
for (var i = 0; i < str.length; i++) {
var c = str[i];
if (c.match(/[a-zA-Z]/i)) {
buffer += c;
} else {
if (buffer.length) {
result += process(buffer);
buffer = '';
}
result += c;
}
}
result += process(buffer);
buffer = '';
return result;
}
var output = pigIt('Pig latin is cool.');
console.log(output);

Highlighting string at multiple occurrences

I'm currently implementing a substring search. From the algorithm, I get array of substrings occurence positions where each element is in the form of [startPos, endPos].
For example (in javascript array):
[[1,3], [8,10], [15,18]]
And the string to highlight is:
ACGATCGATCGGATCGAGCGATCGAGCGATCGAT
I want to highlight (in HTML using <b>) the original string, so it will highlight or bold the string from position 1 to 3, then 8 to 10, then 15 to 18, etc (0-indexed).
A<b>CGA</b>TCGA<b>TCG</b>GATC<b>GAGC</b>GATCGAGCGATCGAT
This is what I have tried (JavaScript):
function hilightAtPositions(text, posArray) {
var startPos, endPos;
var startTag = "<b>";
var endTag = "</b>";
var hilightedText = "";
for (var i = 0; i < posArray.length; i++) {
startPos = posArray[i][0];
endPos = posArray[i][1];
hilightedText = [text.slice(0, startPos), startTag, text.slice(startPos, endPos), endTag, text.slice(endPos)].join('');
}
return hilightedText;
}
But it highlights just a range from the posArray (and I know it is still incorrect yet). So, how can I highlight a string given multiple occurrences position?
Looking at this question, and following John3136's suggestion of going from tail to head, you could do:
String.prototype.splice = function( idx, rem, s ) {
return (this.slice(0,idx) + s + this.slice(idx + Math.abs(rem)));
};
function hilightAtPositions(text, posArray) {
var startPos, endPos;
posArray = posArray.sort(function(a,b){ return a[0] - b[0];});
for (var i = posArray.length-1; i >= 0; i--) {
startPos = posArray[i][0];
endPos = posArray[i][1];
text= text.splice(endPos, 0, "</b>");
text= text.splice(startPos, 0, "<b>");
}
return text;
}
Note that in your code, you are overwriting hilightedText with each iteration, losing your changes.
Try this:
var stringToHighlight = "ACGATCGATCGGATCGAGCGATCGAGCGATCGAT";
var highlightPositions = [[1,3], [8,10], [15,18]];
var lengthDelta = 0;
for (var highlight in highlightPositions) {
var start = highlightPositions[highlight][0] + lengthDelta;
var end = highlightPositions[highlight][1] + lengthDelta;
var first = stringToHighlight.substring(0, start);
var second = stringToHighlight.substring(start, end + 1);
var third = stringToHighlight.substring(end + 1);
stringToHighlight = first + "<b>" + second + "</b>" + third;
lengthDelta += ("<b></b>").length;
}
alert(stringToHighlight);
Demo: http://jsfiddle.net/kPkk3/
Assuming that you're trying to highlight search terms or something like that. Why not replace the term with the bolding?
example:
term: abc
var text = 'abcdefgabcqq';
var term = 'abc';
text.replace(term, '<b>' + term + '</b>');
This would allow you to avoid worrying about positions, assuming that you are trying to highlight a specific string.
Assuming your list of segments is ordered from lowest start to highest, try doing your array from last to first.
That way you are not changing parts of the string you haven't reached yet.
Just change the loop to:
for (var i = posArray.length-1; i >=0; i--) {
If you want to check for multiple string matches and highlight them, this code snippet works.
function highlightMatch(text, matchString) {
let textArr = text.split(' ');
let returnArr = [];
for(let i=0; i<textArr.length; i++) {
let subStrMatch = textArr[i].toLowerCase().indexOf(matchString.toLowerCase());
if(subStrMatch !== -1) {
let subStr = textArr[i].split('');
let subStrReturn = [];
for(let j=0 ;j<subStr.length; j++) {
if(j === subStrMatch) {
subStrReturn.push('<strong>' + subStr[j]);
} else if (j === subStrMatch + (matchString.length-1)){
subStrReturn.push(subStr[j] + '<strong>');
} else {
subStrReturn.push(subStr[j]);
}
}
returnArr.push(subStrReturn.join(''));
} else {
returnArr.push(textArr[i]);
}
}
return returnArr;
}
highlightMatch('Multi Test returns multiple results', 'multi');
=> (5) ['<strong>Multi<strong>', 'Test', 'returns', '<strong>multi<strong>ple', 'results']

Javascript word-count for any given DOM element

I'm wondering if there's a way to count the words inside a div for example. Say we have a div like so:
<div id="content">
hello how are you?
</div>
Then have the JS function return an integer of 4.
Is this possible? I have done this with form elements but can't seem to do it for non-form ones.
Any ideas?
g
If you know that the DIV is only going to have text in it, you can KISS:
var count = document.getElementById('content').innerHTML.split(' ').length;
If the div can have HTML tags in it, you're going to have to traverse its children looking for text nodes:
function get_text(el) {
ret = "";
var length = el.childNodes.length;
for(var i = 0; i < length; i++) {
var node = el.childNodes[i];
if(node.nodeType != 8) {
ret += node.nodeType != 1 ? node.nodeValue : get_text(node);
}
}
return ret;
}
var words = get_text(document.getElementById('content'));
var count = words.split(' ').length;
This is the same logic that the jQuery library uses to achieve the effect of its text() function. jQuery is a pretty awesome library that in this case is not necessary. However, if you find yourself doing a lot of DOM manipulation or AJAX then you might want to check it out.
EDIT:
As noted by Gumbo in the comments, the way we are splitting the strings above would count two consecutive spaces as a word. If you expect that sort of thing (and even if you don't) it's probably best to avoid it by splitting on a regular expression instead of on a simple space character. Keeping that in mind, instead of doing the above split, you should do something like this:
var count = words.split(/\s+/).length;
The only difference being on what we're passing to the split function.
Paolo Bergantino's second solution is incorrect for empty strings or strings that begin or end with whitespaces. Here's the fix:
var count = !s ? 0 : (s.split(/^\s+$/).length === 2 ? 0 : 2 +
s.split(/\s+/).length - s.split(/^\s+/).length - s.split(/\s+$/).length);
Explanation: If the string is empty, there are zero words; If the string has only whitespaces, there are zero words; Else, count the number of whitespace groups without the ones from the beginning and the end of the string.
string_var.match(/[^\s]+/g).length
seems like it's a better method than
string_var.split(/\s+/).length
At least it won't count "word " as 2 words -- ['word'] rather than ['word', '']. And it doesn't really require any funny add-on logic.
Or just use Countable.js to do the hard job ;)
document.deepText= function(hoo){
var A= [];
if(hoo){
hoo= hoo.firstChild;
while(hoo!= null){
if(hoo.nodeType== 3){
A[A.length]= hoo.data;
}
else A= A.concat(arguments.callee(hoo));
hoo= hoo.nextSibling;
}
}
return A;
}
I'd be fairly strict about what a word is-
function countwords(hoo){
var text= document.deepText(hoo).join(' ');
return text.match(/[A-Za-z\'\-]+/g).length;
}
alert(countwords(document.body))
Or you can do this:
function CountWords (this_field, show_word_count, show_char_count) {
if (show_word_count == null) {
show_word_count = true;
}
if (show_char_count == null) {
show_char_count = false;
}
var char_count = this_field.value.length;
var fullStr = this_field.value + " ";
var initial_whitespace_rExp = /^[^A-Za-z0-9]+/gi;
var left_trimmedStr = fullStr.replace(initial_whitespace_rExp, "");
var non_alphanumerics_rExp = rExp = /[^A-Za-z0-9]+/gi;
var cleanedStr = left_trimmedStr.replace(non_alphanumerics_rExp, " ");
var splitString = cleanedStr.split(" ");
var word_count = splitString.length -1;
if (fullStr.length <2) {
word_count = 0;
}
if (word_count == 1) {
wordOrWords = " word";
} else {
wordOrWords = " words";
}
if (char_count == 1) {
charOrChars = " character";
} else {
charOrChars = " characters";
}
if (show_word_count & show_char_count) {
alert ("Word Count:\n" + " " + word_count + wordOrWords + "\n" + " " + char_count + charOrChars);
} else {
if (show_word_count) {
alert ("Word Count: " + word_count + wordOrWords);
} else {
if (show_char_count) {
alert ("Character Count: " + char_count + charOrChars);
}
}
}
return word_count;
}
The get_text function in Paolo Bergantino's answer didn't work properly for me when two child nodes have no space between them. eg <h1>heading</h1><p>paragraph</p> would be returned as headingparagraph (notice lack of space between the words). So prepending a space to the nodeValue fixes this. But it introduces a space at the front of the text but I found a word count function that trims it off (plus it uses several regexps to ensure it counts words only). Word count and edited get_text functions below:
function get_text(el) {
ret = "";
var length = el.childNodes.length;
for(var i = 0; i < length; i++) {
var node = el.childNodes[i];
if(node.nodeType != 8) {
ret += node.nodeType != 1 ? ' '+node.nodeValue : get_text(node);
}
}
return ret;
}
function wordCount(fullStr) {
if (fullStr.length == 0) {
return 0;
} else {
fullStr = fullStr.replace(/\r+/g, " ");
fullStr = fullStr.replace(/\n+/g, " ");
fullStr = fullStr.replace(/[^A-Za-z0-9 ]+/gi, "");
fullStr = fullStr.replace(/^\s+/, "");
fullStr = fullStr.replace(/\s+$/, "");
fullStr = fullStr.replace(/\s+/gi, " ");
var splitString = fullStr.split(" ");
return splitString.length;
}
}
EDIT
kennebec's word counter is really good. But the one I've found includes a number as a word which is what I needed. Still, that's easy to add to kennebec's. But kennebec's text retrieval function will have the same problem.
This should account for preceding & trailing whitespaces
const wordCount = document.querySelector('#content').innerText.trim().split(/\s+/).length;
string_var.match(/[^\s]+/g).length - 1;

Categories