Simple syntax highlighting using javascript - javascript

Currently I have an editable div and I want to add very basic syntax highlighting. Essentially I want text between * to turn a different color and text in quotes to turn a different color. For example:
input: "hello" *world*
output: <span class='a'>"hello"</span> <span class='b'>*world*</span>
I'm using Rangy.js library to save and restore the caret position so there's no issues there. However I'm really struggling to turn the input into the output. The big problem I have is ignoring any " and * that are already highlighted.
If anyone could point me in the direction of a basic algorithm or regular expression or something it would be much appreciated.

/**
 * Wraps "quoted" runs in <span class="a"> and *starred* runs in
 * <span class="b">, copying simple HTML tags (<...>) through untouched.
 *
 * A delimiter that is never closed (a lone ", * or <) is copied through as
 * plain text. Previously indexOf() returned -1 in that case, which reset the
 * scan position and made the loop run forever.
 *
 * @param {string} text - The raw text (may already contain simple tags).
 * @returns {string} The text with highlight spans inserted.
 */
function highlight(text) {
  const result = [];
  let i = 0;
  while (i < text.length) {
    const ch = text[i];
    const spanClass = ch === '"' ? 'a' : ch === '*' ? 'b' : null;
    const isTagStart = ch === '<';

    // Only delimiters need a matching closer; everything else is plain text.
    let stop = -1;
    if (spanClass !== null) {
      stop = text.indexOf(ch, i + 1);
    } else if (isTagStart) {
      stop = text.indexOf('>', i + 1);
    }

    if (stop === -1) {
      // Ordinary character, or a delimiter with no closer: emit it verbatim.
      result.push(ch);
      i += 1;
      continue;
    }

    const chunk = text.substring(i, stop + 1);
    if (isTagStart) {
      // Skip simple HTML tags.
      result.push(chunk);
    } else {
      result.push('<span class="' + spanClass + '">', chunk, '</span>');
    }
    i = stop + 1;
  }
  return result.join('');
}
Example:
>>> highlight('foo *bar"baz"qux* "foobar" qux')
"foo <span class="b">*bar"baz"qux*</span> <span class="a">"foobar"</span> qux"
Or with regular expressions:
/**
 * Regex-based highlighter: "quoted" runs get class "a", *starred* runs get
 * class "b", and simple HTML tags are matched only so they can be skipped.
 *
 * @param {string} text - The raw text.
 * @returns {string} The text with highlight spans inserted.
 */
function highlight2(text) {
  const classByDelimiter = { '"': 'a', '*': 'b' };
  return text.replace(/([*"]).*?\1|<[^<>]*>/g, (whole, delimiter) => {
    // The tag alternative has no capture group, so `delimiter` is undefined
    // there and the tag is returned unchanged.
    if (delimiter === undefined) {
      return whole;
    }
    return '<span class="' + classByDelimiter[delimiter] + '">' + whole + '</span>';
  });
}
The regular expression ([*"]).*?\1 contains the following:
[*"] matches * or ". (They don't need to be escaped inside [ ]).
( ) captures the matched string into capture-group 1.
.*? matches anything up until the first...
\1 matches the same string as was captured into capture-group 1.
| is "Or". It tries to match the left side, and if that fails, it tries to match the right side.
<[^<>]*> matches simple html-tags. It will not be able to handle attributes with literal < or > in them: <a href="info.php?tag=<i>"> (that is bad HTML anyway, but some browsers will accept it.)
In the case when it matches an HTML tag, the ch parameter will be undefined, and the else-branch will be picked.
If you want to add more characters, just put them inside the [ ], and add an if-statement to handle them. You can use any character except -, \ and ] without escaping them. To add those characters, you need to put another \ in front of them.

Your basic algorithm is
/**
 * Tokenizes the input and wraps "quoted" tokens in <span class='a'> and
 * *starred* tokens in <span class='b'>, HTML-escaping all token content.
 *
 * @param {string} myInput - Plain (non-HTML) text to highlight.
 * @returns {string} HTML markup; the empty string for empty input.
 */
function highlight(myInput) {
  // Split the string into tokens.
  // "[^"]*"        matches a minimal run surrounded by quotes
  // \*[^*]*\*      matches a minimal run surrounded by asterisks
  // ["*][^"*]*$    matches an unmatched quote or asterisk and the tail of the string
  // [^"*]+         matches a maximal un-styled run
  // match() returns null when nothing matches (e.g. empty input), so fall
  // back to an empty token list instead of crashing on `.length`.
  const tokens = myInput.match(/"[^"]*"|\*[^*]*\*|["*][^"*]*$|[^"*]+/g) || [];

  // Walk over the list of tokens and turn them into styled HTML.
  const htmlOut = [];
  for (const token of tokens) {
    // Choose a style from the token's first character.
    const first = token.charAt(0);
    const className = first === '"' ? 'a' : first === '*' ? 'b' : null;

    // HTML-escape the token content. "&" must be escaped first so the
    // ampersand of "&lt;" is not escaped a second time.
    const escaped = token.replace(/&/g, '&amp;').replace(/</g, '&lt;');

    // Surround in a span if we have a style.
    if (className) {
      htmlOut.push("<span class='", className, "'>", escaped, '</span>');
    } else {
      htmlOut.push(escaped);
    }
  }

  // Join the output tokens.
  return htmlOut.join('');
}
alert(highlight('"hello" *world*'));

Related

javascript indexof regex A-Za-z0-9 always returns false

I have created a JS fiddle https://jsfiddle.net/95r110s9/#&togetherjs=Emdw6ORNpc
HTML
<input id="landlordstreetaddress2" class="landlordinputs" onfocusout="validateinputentries()" />
JS
// Question's original (non-working) attempt, kept verbatim.
// NOTE(review): the `function` keyword is missing, so this is not valid
// JavaScript as written.
validateinputentries(){
landlordstreetaddress2 = document.getElementById('landlordstreetaddress2').value;
// This is a plain string, not a RegExp: ranges such as a-z are never
// expanded, and "\s" inside a string literal is just the letter "s".
goodcharacters = "/^[a-zA-Z0-9#.,;:'\s]+$/gi";
for (var i = 0; i < landlordstreetaddress2.length; i++){
// indexOf() does a literal search, so only characters that literally appear
// in the string above (a, z, A, Z, 0, 9, g, i, s, punctuation) are "found".
if (goodcharacters.indexOf(landlordstreetaddress2.charAt(i)) != -1){
console.log('Character is valid');
}
}
}
Its pulling the value from an input and running an indexOf regex expression with A-Z a-z and 0-9 with a few additional characters as well.
The problem is that it works with the entry of BCDEFG...etc and 12345...etc, but when I type "A" or "Z" or "0" or "1", it returns incorrectly.
I need it to return the same with 0123456789, ABCDEF...XYZ and abcdef...xyz
I should point out that the below does work as intended:
// Blacklist check from the question: every character of the first-name field
// is looked up in a literal list of forbidden characters.
var badcharacters = "*|,\":<>[]`\';#?=+/\\";
badcharacter = false;
//firstname
for (var i = 0; i < landlordfirstname.value.length; i++){
// A literal indexOf() lookup is correct here because badcharacters really is
// a list of individual characters, not a regex pattern.
if (badcharacters.indexOf(landlordfirstname.value.charAt(i)) != -1){
badcharacter = true;
break;
}
// Rejects a leading space. This test does not depend on i, so it yields the
// same answer on every iteration.
if(landlordfirstname.value.charAt(0) == " "){
badcharacter = true;
break;
}
}
String.prototype.indexOf()
The indexOf() method returns the index within the calling String object of the first occurrence of the specified value, starting the search at fromIndex. Returns -1 if the value is not found.
So, you're trying to search this value "/^[a-zA-Z0-9#.,;:'\s]+$/gi" which "never" will be found in the entered string.
You actually want to test that regexp against the entered value.
/^[a-zA-Z0-9#.,;:'\s]+$/gi.test(landlordstreetaddress2)
/**
 * Reads the #landlordstreetaddress2 input and logs whether its whole value
 * consists only of the allowed address characters.
 */
function validateinputentries() {
  const typed = document.getElementById('landlordstreetaddress2').value;
  // Anchored with ^...$ so the entire value must be made of allowed characters.
  const isClean = /^[a-zA-Z0-9#.,;:'\s]+$/gi.test(typed);
  console.log(isClean ? 'Characters are valid' : 'Characters are invalid');
}
<input id="landlordstreetaddress2" class="landlordinputs" onfocusout="validateinputentries()" />
You're trying to combine two different methods of testing a string -- one way is with a regex; the other way is by checking each character against a list of allowed characters. What you've wound up with is checking each character against a list of what would have been a regex, if you hadn't declared it as a string.
Those methods conflict with each other; you need to pick one or the other.
Check each character:
This is closest to what you were attempting. You can't use character ranges here (like a-zA-Z) as you would in a regex; you have to spell out each allowed character individually:
/**
 * Whitelist validation: logs every character of the #landlordstreetaddress2
 * value that is not in the explicit list of allowed characters.
 */
var validateinputentries = function() {
  // Every permitted character is spelled out; ranges like a-z only exist in regexes.
  const permitted = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789#.,;:' ";
  const typed = document.getElementById('landlordstreetaddress2').value;
  let rejected = false;
  typed.split('').forEach((ch) => {
    if (!permitted.includes(ch)) {
      rejected = true;
      console.log("not allowed: ", ch);
    }
  });
  if (rejected) {
    // Show validation error here
  }
};
<input id="landlordstreetaddress2" class="landlordinputs" onfocusout="validateinputentries()" />
Regular Expressions
The regex version is much simpler, because the regular expression is doing most of the work. You don't need to step through the string, just test the whole string against the regex and see what comes out. In this case you're looking to see if the input contains any characters that aren't allowed, so you want to use the character exception rule: [^abc] will match any character that is not a, b, or c. You don't want to anchor the match to the beginning or the end of the string, as you were doing with the initial ^ and the trailing $; and you can leave out the + because you don't care if there are sequential bad characters, you just care if they exist at all.
/**
 * Regex validation: collects every character of the #landlordstreetaddress2
 * value that falls outside the allowed set and logs them, if any.
 */
var validateinputentries = function() {
  const { value: typed } = document.getElementById('landlordstreetaddress2');
  // Negated character class: matches anything that is NOT an allowed character.
  // With the "g" flag match() returns all offenders; without it only the first.
  // (Equivalent literal form: /[^a-zA-Z0-9#.,;:'\s]/g)
  const disallowed = new RegExp("[^a-zA-Z0-9#.,;:'\\s]", "g");
  const offenders = typed.match(disallowed);
  if (offenders === null) {
    return;
  }
  console.log("Not allowed: ", offenders);
};
<input id="landlordstreetaddress2" class="landlordinputs" onfocusout="validateinputentries()" />

Regex match character set without previous duplication

What is the best way to create a dynamic regex to match a distinct set of characters (characters and their order provided during runtime).
character set: abcd
character format: ??j? (each question mark represents a character from the character set)
Example
abjd = match
bdja = match
dbja = match
ab = no match
aajd = no match
abjdd = no match
abj = no match
I have created a regex builder (in js) as follows:
// characters are the character set
// wordFormat is the character format
// replace(str, search, replacement) replaces search in str with replacement
var chars = "[" + characters + "]{1}";
var afterSpecialConversion = replace(wordFormat, "?", chars);
var myRegex = new RegExp("^" + afterSpecialConversion + "$", "gi");
Unfortunately this does not achieve the result as it does not consider duplicate items. I thought about using matching groups to avoid duplicates however I don't know how to negate the already existing character group from the remainder of the set.
Also given character set aabcd now a can exist twice. Any suggestions?
Your regex-builder approach is correct (though a bit of a maintainability mess, so document it carefully), but not quite sophisticated enough. What you need to do is use lookaheads.
I've provided an example regex on Regex101 for the demo in your question.
The more general principle is to replace each set of n question marks with a pattern that matches this:
(?:([<chars>])(?!.*\<m>)){<n>}
Where <chars> is the character set you want to use, m is the index of the set of question marks (starting from 1 - more on this in a moment), and <n> is the number of question marks in the group. This yields regex-builder code that looks like this:
/**
 * Builds a regex for a word format in which every "?" stands for one
 * character from `chars` and no character may be used twice.
 *
 * Each run of n question marks becomes (?:([chars])(?!.*\m)){n}, where m is
 * the index of that run's capture group: the negative lookahead rejects a
 * captured character that occurs again later in the string.
 *
 * Fixes over the previous version:
 *  - `new Regexp` (ReferenceError) is now `new RegExp`;
 *  - literal characters not directly preceded by "?" are no longer dropped;
 *  - literal characters are regex-escaped, so "." or "+" match themselves;
 *  - the "g" flag is gone: the pattern is anchored, and "g" made `.test()`
 *    stateful via lastIndex, so a reused regex alternated true/false.
 *
 * @param {string} pattern - Word format, e.g. "??j?".
 * @param {string} chars - Allowed characters, e.g. "abcd". "\", "]" and "^"
 *   are escaped; "-" is passed through, so "a-d" still acts as a range.
 * @returns {RegExp} Anchored, case-insensitive regex for the format.
 */
function getRe(pattern, chars) {
  const charClass = '[' + chars.replace(/[\\\]^]/g, '\\$&') + ']';
  let source = '^';
  let groupIndex = 0; // capture-group number of the most recent "?" run
  let runLength = 0; // length of the "?" run currently being read

  // Emits the pending run of question marks, if there is one.
  const flushRun = () => {
    if (runLength === 0) {
      return;
    }
    groupIndex += 1;
    source += '(?:(' + charClass + ')(?!.*\\' + groupIndex + ')){' + runLength + '}';
    runLength = 0;
  };

  for (const ch of pattern) {
    if (ch === '?') {
      runLength += 1;
    } else {
      flushRun();
      source += ch.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    }
  }
  // The pattern may end with a run of question marks.
  flushRun();

  return new RegExp(source + '$', 'i');
}
Code demo on Repl.it
Obviously, this function definition is very verbose, to demonstrate the principles involved. Feel free to golf it (but remember to watch out for fencepost issues like I've described in the comments).
Additionally, be sure to sanitize the inputs. This is an example and will break if someone, for instance, puts in ] in chars.

javascript regex to select quoted string but not escape quotes

Original string:
some text "some \"string\"right here "
Want to get:
"some \"string\"right here"
I am using the following regex:
/\"(.*?)\"/g
Parsing the string correctly with a parser
With a JavaScript regex, it is impossible to start matching at the correct double quote. You will either match an escaped one, or you will fail to match the correct double quote after a literal \ before a quote. Thus, the safest way is to use a parser. Here is a sample one:
// Hand-written scanner that extracts double-quoted substrings while honouring
// backslash escapes both inside and outside the quotes.
var s = "some text \\\"extras\" some \\\"string \\\" right\" here \"";
// For comparison: a plain regex cannot tell that the first quote is escaped.
console.log("Incorrect (with regex): ", s.match(/"([^"\\]*(?:\\.[^"\\]*)*)"/g));
var res = [];        // completed quoted substrings, delimiters included
var tmp = "";        // the quoted substring currently being built
var in_quotes = false; // true between an opening and a closing unescaped quote
var in_entity = false; // true right after a backslash: the next char is escaped
for (var i=0; i<s.length; i++) {
if (s[i] === '\\' && in_entity === false) { // backslash starts an escape sequence
in_entity = true;
if (in_quotes === true) {
tmp += s[i];
}
} else if (in_entity === true) { // escaped char: never a delimiter; kept only inside quotes
in_entity = false;
if (in_quotes === true) {
tmp += s[i];
}
} else if (s[i] === '"' && in_quotes === false) { // unescaped quote: start a new match
in_quotes = true;
tmp += s[i];
} else if (s[i] === '"' && in_quotes === true) { // unescaped quote: close the match and store it
tmp += s[i];
res.push(tmp);
tmp = "";
in_quotes = false;
} else if (in_quotes === true) { // ordinary char inside quotes: append to the match
tmp += s[i];
}
}
// A quote that is opened but never closed is left in tmp and not reported.
console.log("Correct results: ", res);
Not-so-safe regex approach
It is not possible to match the string you need with lazy dot matching pattern since it will stop before the first ". If you know your string will never have an escaped quote before a quoted substring, and if you are sure there are no literal \ before double quotes (and these conditions are very strict to use the regex safely), you can use
/"([^"\\]*(?:\\.[^"\\]*)*)"/g
See the regex demo
" - match a quote
([^"\\]*(?:\\.[^"\\]*)*) - 0 or more sequences of
[^"\\]* - 0+ non-\ and non"s
(?:\\.[^"\\]*)* - zero or more sequences of
\\. - any escaped symbol
[^"\\]* - 0+ non-\ and non"s
" - trailing quote
JS demo:
// Demo: collect the contents (capture group 1) of every double-quoted
// substring, where a backslash escapes the character that follows it.
var re = /"([^"\\]*(?:\\.[^"\\]*)*)"/g;
var str = `some text "some \\"string\\"right here " some text "another \\"string\\"right here "`;
var res = [];
// `m` is declared explicitly: assigning to an undeclared name creates an
// implicit global and throws a ReferenceError in strict mode / ES modules.
var m;
while ((m = re.exec(str)) !== null) {
  res.push(m[1]);
}
document.body.innerHTML = "<pre>" + JSON.stringify(res, null, 4) + "</pre>"; // Just for demo
console.log(res); // or another result demo
Safe regex approach
Complementing #WiktorStribiżew's answer, there is a technique to start matching at the correct double quote using regex. It consists of matching both quoted and unquoted text in the form:
/"(quoted)"|unquoted/g
As you can see, the quoted text is matched by a group, so we'll only consider text backreferenced by match[1].
Regex
/"([^"\\]*(?:\\.[^"\\]*)*)"|[^"\\]*(?:\\.[^"\\]*)*/g
Code
// Matches either a quoted string (contents in group 1) or a stretch of
// unquoted text. Consuming the unquoted text too means an escaped quote
// outside a string can never be mistaken for an opening delimiter.
var regex = /"([^"\\]*(?:\\.[^"\\]*)*)"|[^"\\]*(?:\\.[^"\\]*)*/g;
var s = "some text \\\"extras\" some \\\"string \\\" right\" here \"";
var res = [];
var match = regex.exec(s);
while (match !== null) {
  // The unquoted alternative can match the empty string; step past it so the
  // same position is not matched forever.
  if (regex.lastIndex === match.index) {
    regex.lastIndex += 1;
  }
  // Only the quoted alternative fills group 1.
  if (match[1] != null) {
    res.push(match[1]);
  }
  match = regex.exec(s);
}
console.log("Correct results (regex technique): ",res);
You can use this regex :
/[^\\](\".*?[^\\]\")/g
[^\\] matches any character other than \, so \" will not be caught as the start or end of your match.
In order to match from quote to quote while ignoring any simple escaped quotes (\"):
(:?[^\\]|^)(\"(:?.*?[^\\]){0,1}\")
Meaning (:? start of grouping with no extraction [^\\] match one char that is not a backslash | match the previous char or ^ which is beginning of string. ( start of extraction grouping \" find quotes (that follow non slash or start of string), (:?.*?[^\\] match shortest substring ending with none slash, ){0,1} zero times or one - that actually means one time or an empty substring, that is followed by \" a quote mark.
Edit:
Wiktor Stribiżew Correctly pointed out that some more cases with regex terms in the string will fail in my initial answer. for example \\" that should be matched similar to " in your case. To avoid this specific issue you can use
(:?[^\\]|^)((:?\\\\)*\"(:?.*?[^\\]){0,1}(:?\\\\)*\")
But for actual regex compatibility you will need to refer to Wiktor's answer.

How to separate the values of a line of .csv file which contains commas in data? [duplicate]

I have the following type of string
var string = "'string, duppi, du', 23, lala"
I want to split the string into an array on each comma, but only the commas outside the single quotation marks.
I can't figure out the right regular expression for the split...
string.split(/,/)
will give me
["'string", " duppi", " du'", " 23", " lala"]
but the result should be:
["string, duppi, du", "23", "lala"]
Is there a cross-browser solution?
Disclaimer
2014-12-01 Update: The answer below works only for one very specific format of CSV. As correctly pointed out by DG in the comments, this solution does NOT fit the RFC 4180 definition of CSV and it also does NOT fit MS Excel format. This solution simply demonstrates how one can parse one (non-standard) CSV line of input which contains a mix of string types, where the strings may contain escaped quotes and commas.
A non-standard CSV solution
As austincheney correctly points out, you really need to parse the string from start to finish if you wish to properly handle quoted strings that may contain escaped characters. Also, the OP does not clearly define what a "CSV string" really is. First we must define what constitutes a valid CSV string and its individual values.
Given: "CSV String" Definition
For the purpose of this discussion, a "CSV string" consists of zero or more values, where multiple values are separated by a comma. Each value may consist of:
A double quoted string. (may contain unescaped single quotes.)
A single quoted string. (may contain unescaped double quotes.)
A non-quoted string. (may NOT contain quotes, commas or backslashes.)
An empty value. (An all whitespace value is considered empty.)
Rules/Notes:
Quoted values may contain commas.
Quoted values may contain escaped-anything, e.g. 'that\'s cool'.
Values containing quotes, commas, or backslashes must be quoted.
Values containing leading or trailing whitespace must be quoted.
The backslash is removed from all: \' in single quoted values.
The backslash is removed from all: \" in double quoted values.
Non-quoted strings are trimmed of any leading and trailing spaces.
The comma separator may have adjacent whitespace (which is ignored).
Find:
A JavaScript function which converts a valid CSV string (as defined above) into an array of string values.
Solution:
The regular expressions used by this solution are complex. And (IMHO) all non-trivial regexes should be presented in free-spacing mode with lots of comments and indentation. Unfortunately, JavaScript does not allow free-spacing mode. Thus, the regular expressions implemented by this solution are first presented in native regex syntax (expressed using Python's handy: r'''...''' raw-multi-line-string syntax).
First, here is a regular expression which validates that a CSV string meets the above requirements:
Regex to validate a "CSV string":
re_valid = r"""
# Validate a CSV string having single, double or un-quoted values.
^ # Anchor to start of string.
\s* # Allow whitespace before value.
(?: # Group for value alternatives.
'[^'\\]*(?:\\[\S\s][^'\\]*)*' # Either Single quoted string,
| "[^"\\]*(?:\\[\S\s][^"\\]*)*" # or Double quoted string,
| [^,'"\s\\]*(?:\s+[^,'"\s\\]+)* # or Non-comma, non-quote stuff.
) # End group of value alternatives.
\s* # Allow whitespace after value.
(?: # Zero or more additional values
, # Values separated by a comma.
\s* # Allow whitespace before value.
(?: # Group for value alternatives.
'[^'\\]*(?:\\[\S\s][^'\\]*)*' # Either Single quoted string,
| "[^"\\]*(?:\\[\S\s][^"\\]*)*" # or Double quoted string,
| [^,'"\s\\]*(?:\s+[^,'"\s\\]+)* # or Non-comma, non-quote stuff.
) # End group of value alternatives.
\s* # Allow whitespace after value.
)* # Zero or more additional values
$ # Anchor to end of string.
"""
If a string matches the above regex, then that string is a valid CSV string (according to the rules previously stated) and may be parsed using the following regex. The following regex is then used to match one value from the CSV string. It is applied repeatedly until no more matches are found (and all values have been parsed).
Regex to parse one value from valid CSV string:
re_value = r"""
# Match one value in valid CSV string.
(?!\s*$) # Don't match empty last value.
\s* # Strip whitespace before value.
(?: # Group for value alternatives.
'([^'\\]*(?:\\[\S\s][^'\\]*)*)' # Either $1: Single quoted string,
| "([^"\\]*(?:\\[\S\s][^"\\]*)*)" # or $2: Double quoted string,
| ([^,'"\s\\]*(?:\s+[^,'"\s\\]+)*) # or $3: Non-comma, non-quote stuff.
) # End group of value alternatives.
\s* # Strip whitespace after value.
(?:,|$) # Field ends on comma or EOS.
"""
Note that there is one special case value that this regex does not match - the very last value when that value is empty. This special "empty last value" case is tested for and handled by the js function which follows.
JavaScript function to parse CSV string:
/**
 * Parses one line of the non-standard CSV dialect described above
 * (single-quoted, double-quoted or bare values; backslash escapes).
 *
 * @param {string} text - The CSV line.
 * @returns {?string[]} The values, or null if the line is not well formed.
 */
function CSVtoArray(text) {
  const re_valid = /^\s*(?:'[^'\\]*(?:\\[\S\s][^'\\]*)*'|"[^"\\]*(?:\\[\S\s][^"\\]*)*"|[^,'"\s\\]*(?:\s+[^,'"\s\\]+)*)\s*(?:,\s*(?:'[^'\\]*(?:\\[\S\s][^'\\]*)*'|"[^"\\]*(?:\\[\S\s][^"\\]*)*"|[^,'"\s\\]*(?:\s+[^,'"\s\\]+)*)\s*)*$/;
  const re_value = /(?!\s*$)\s*(?:'([^'\\]*(?:\\[\S\s][^'\\]*)*)'|"([^"\\]*(?:\\[\S\s][^"\\]*)*)"|([^,'"\s\\]*(?:\s+[^,'"\s\\]+)*))\s*(?:,|$)/g;

  // Reject anything that is not a well-formed CSV string.
  if (re_valid.test(text) === false) {
    return null;
  }

  const values = [];
  // Exactly one of the three groups is set for each value.
  for (const [, singleQuoted, doubleQuoted, bare] of text.matchAll(re_value)) {
    if (singleQuoted !== undefined) {
      // Remove the backslash from \' in single-quoted values.
      values.push(singleQuoted.replace(/\\'/g, "'"));
    } else if (doubleQuoted !== undefined) {
      // Remove the backslash from \" in double-quoted values.
      values.push(doubleQuoted.replace(/\\"/g, '"'));
    } else if (bare !== undefined) {
      values.push(bare);
    }
  }

  // re_value never matches an empty last value, so add it by hand.
  if (/,\s*$/.test(text)) {
    values.push('');
  }
  return values;
}
Example input and output:
In the following examples, curly braces are used to delimit the {result strings}. (This is to help visualize leading/trailing spaces and zero-length strings.)
// Test 1: Test string from original question.
var test = "'string, duppi, du', 23, lala";
var a = CSVtoArray(test);
/* Array has 3 elements:
a[0] = {string, duppi, du}
a[1] = {23}
a[2] = {lala} */
// Test 2: Empty CSV string.
var test = "";
var a = CSVtoArray(test);
/* Array has 0 elements: */
// Test 3: CSV string with two empty values.
var test = ",";
var a = CSVtoArray(test);
/* Array has 2 elements:
a[0] = {}
a[1] = {} */
// Test 4: Double quoted CSV string having single quoted values.
// The backslash is doubled so that it actually reaches the CSV text: a bare
// \' inside a JavaScript string literal is just ', which would end the
// quoted value early and make the whole line invalid.
var test = "'one','two with escaped \\' single quote', 'three, with, commas'";
var a = CSVtoArray(test);
/* Array has 3 elements:
a[0] = {one}
a[1] = {two with escaped ' single quote}
a[2] = {three, with, commas} */
// Test 5: Single quoted CSV string having double quoted values.
// Same as above: \\" puts a real backslash in front of the quote.
var test = '"one","two with escaped \\" double quote", "three, with, commas"';
var a = CSVtoArray(test);
/* Array has 3 elements:
a[0] = {one}
a[1] = {two with escaped " double quote}
a[2] = {three, with, commas} */
// Test 6: CSV string with whitespace in and around empty and non-empty values.
var test = " one , 'two' , , ' four' ,, 'six ', ' seven ' , ";
var a = CSVtoArray(test);
/* Array has 8 elements:
a[0] = {one}
a[1] = {two}
a[2] = {}
a[3] = { four}
a[4] = {}
a[5] = {six }
a[6] = { seven }
a[7] = {} */
Additional notes:
This solution requires that the CSV string be "valid". For example, unquoted values may not contain backslashes or quotes, e.g. the following CSV string is NOT valid:
var invalid1 = "one, that's me!, escaped \, comma"
This is not really a limitation because any sub-string may be represented as either a single or double quoted value. Note also that this solution represents only one possible definition for: "Comma Separated Values".
Edit: 2014-05-19: Added disclaimer.
Edit: 2014-12-01: Moved disclaimer to top.
RFC 4180 solution
This does not solve the string in the question since its format is not conforming with RFC 4180; the acceptable encoding is escaping double quote with double quote. The solution below works correctly with CSV files d/l from google spreadsheets.
UPDATE (3/2017)
Parsing single line would be wrong. According to RFC 4180 fields may contain CRLF which will cause any line reader to break the CSV file. Here is an updated version that parses CSV string:
'use strict';
/**
 * Parses RFC 4180 style CSV text (fields may be double-quoted; quotes inside
 * a quoted field are doubled; quoted fields may contain commas and newlines).
 *
 * @param {string} text - The complete CSV text.
 * @returns {string[][]} One array of field strings per record.
 */
function csvToArray(text) {
  const rows = [['']];
  let current = rows[0];
  let column = 0;
  let outsideQuotes = true;
  let previous = ''; // last character seen; '' right after a field/record break

  for (const ch of text) {
    let remembered = ch;
    if (ch === '"') {
      // A quote directly after a closing quote is an escaped (doubled) quote.
      if (outsideQuotes && previous === '"') {
        current[column] += '"';
      }
      outsideQuotes = !outsideQuotes;
    } else if (outsideQuotes && ch === ',') {
      column += 1;
      current[column] = '';
      remembered = '';
    } else if (outsideQuotes && ch === '\n') {
      // The "\r" of a CRLF was already appended to the field; drop it.
      if (previous === '\r') {
        current[column] = current[column].slice(0, -1);
      }
      current = [''];
      rows.push(current);
      column = 0;
      remembered = '';
    } else {
      current[column] += ch;
    }
    previous = remembered;
  }
  return rows;
}
let test = '"one","two with escaped """" double quotes""","three, with, commas",four with no quotes,"five with CRLF\r\n"\r\n"2nd line one","two with escaped """" double quotes""","three, with, commas",four with no quotes,"five with CRLF\r\n"';
console.log(csvToArray(test));
OLD ANSWER
(Single line solution)
/**
 * Splits a single CSV line whose fields may be double-quoted, with quotes
 * inside a quoted field written as two quotes.
 *
 * The string is walked by numeric index. The previous `for...in` loop also
 * visited every enumerable property added to String.prototype (for example a
 * `splitCSV` extension) and appended that function's source to the output.
 *
 * @param {string} text - One CSV line.
 * @returns {string[]} The field values.
 */
function CSVtoArray(text) {
  const fields = [''];
  let fieldIndex = 0;
  let previous = ''; // '' marks "start of a field"
  let outsideQuotes = true;

  for (let pos = 0; pos < text.length; pos += 1) {
    let ch = text[pos];
    if (ch === '"') {
      outsideQuotes = !outsideQuotes;
      if (previous === '"') {
        // Second quote of a doubled pair: emit one literal quote. The '-'
        // placeholder stops a third quote from being read as another pair.
        fields[fieldIndex] += '"';
        ch = '-';
      } else if (previous === '') {
        // Opening quote of a field: remember a placeholder so the next quote
        // is not mistaken for an escaped one.
        ch = '-';
      }
    } else if (outsideQuotes && ch === ',') {
      fieldIndex += 1;
      fields[fieldIndex] = '';
      ch = '';
    } else {
      fields[fieldIndex] += ch;
    }
    previous = ch;
  }
  return fields;
}
let test = '"one","two with escaped """" double quotes""","three, with, commas",four with no quotes,five for fun';
console.log(CSVtoArray(test));
And for the fun, here is how you create CSV from the array:
/**
 * Serializes one row as a CSV line: every cell is wrapped in double quotes
 * and embedded quotes are doubled.
 *
 * The input array is no longer modified, and non-string cells are converted
 * with String() (null/undefined become an empty cell) instead of throwing.
 *
 * @param {Array} row - The cell values.
 * @returns {string} The CSV line; an empty row yields '""'.
 */
function arrayToCSV(row) {
  const cells = Array.from(row, (cell) =>
    (cell == null ? '' : String(cell)).replace(/"/g, '""')
  );
  return '"' + cells.join('","') + '"';
}
let row = [
"one",
"two with escaped \" double quote",
"three, with, commas",
"four with no quotes (now has)",
"five for fun"
];
let text = arrayToCSV(row);
console.log(text);
I liked FakeRainBrigand's answer, however it contains a few problems: It can not handle whitespace between a quote and a comma, and does not support 2 consecutive commas. I tried editing his answer but my edit got rejected by reviewers that apparently did not understand my code. Here is my version of FakeRainBrigand's code.
There is also a fiddle: http://jsfiddle.net/xTezm/46/
/**
 * Splits this string on commas that are outside double quotes. Whitespace
 * around fields is trimmed and consecutive commas yield empty fields; the
 * surrounding quotes of a quoted field are kept.
 *
 * An input with no fields at all (the empty string) now returns [] instead
 * of throwing, because match() yields null in that case.
 *
 * @returns {string[]} The fields.
 */
String.prototype.splitCSV = function() {
  const text = String(this);
  const pieces = text.match(/(\s*"[^"]+"\s*|\s*[^,]+|,)(?=,|$)/g) || [];
  const fields = pieces.map((piece) => {
    const trimmed = piece.trim();
    // A bare comma stands for the empty field in front of the next separator.
    return trimmed === ',' ? '' : trimmed;
  });
  // The regex cannot see an empty field before a leading comma.
  if (text.charAt(0) === ',') {
    fields.unshift('');
  }
  return fields;
};
var string = ',"string, duppi, du" , 23 ,,, "string, duppi, du",dup,"", , lala';
var parsed = string.splitCSV();
alert(parsed.join('|'));
I had a very specific use case where I wanted to copy cells from Google Sheets into my web app. Cells could include double-quotes and new-line characters. Using copy and paste, the cells are delimited by a tab characters, and cells with odd data are double quoted. I tried this main solution, the linked article using regexp, and Jquery-CSV, and CSVToArray. http://papaparse.com/ Is the only one that worked out of the box. Copy and paste is seamless with Google Sheets with default auto-detect options.
PEG(.js) grammar that handles RFC 4180 examples at http://en.wikipedia.org/wiki/Comma-separated_values:
start
= [\n\r]* first:line rest:([\n\r]+ data:line { return data; })* [\n\r]* { rest.unshift(first); return rest; }
line
= first:field rest:("," text:field { return text; })*
& { return !!first || rest.length; } // ignore blank lines
{ rest.unshift(first); return rest; }
field
= '"' text:char* '"' { return text.join(''); }
/ text:[^\n\r,]* { return text.join(''); }
char
= '"' '"' { return '"'; }
/ [^"]
Test at http://jsfiddle.net/knvzk/10 or https://pegjs.org/online.
Download the generated parser at https://gist.github.com/3362830.
People seemed to be against RegEx for this. Why?
(\s*'[^']+'|\s*[^,]+)(?=,|$)
Here's the code. I also made a fiddle.
/**
 * Splits this string on commas that are outside single quotes. Leading
 * whitespace and the quotes themselves are kept in the returned pieces.
 *
 * @param {string} [sep] - Unused; kept so existing calls stay valid.
 * @returns {?string[]} The matched pieces, or null if nothing matched.
 */
String.prototype.splitCSV = function(sep) {
  var regex = /(\s*'[^']+'|\s*[^,]+)(?=,|$)/g;
  // Return the result directly: `return matches = ...` assigned an undeclared
  // variable, which leaks a global and throws in strict mode.
  return this.match(regex);
};
var string = "'string, duppi, du', 23, 'string, duppi, du', lala";
console.log( string.splitCSV() );
.as-console-wrapper { max-height: 100% !important; top: 0; }
Adding one more to the list, because I find all of the above not quite "KISS" enough.
This one uses regex to find either commas or newlines while skipping over quoted items. Hopefully this is something noobies can read through on their own. The splitFinder regexp has three things it does (split by a |):
, - finds commas
\r?\n - finds new lines, (potentially with carriage return if the exporter was nice)
"(\\"|[^"])*?" - skips anything surrounded by quotes, because commas and newlines don't matter in there. If there is an escaped quote \\" in the quoted item, it will get captured before an end quote can be found.
// Finds commas, line breaks, or whole quoted items (so that separators
// inside quotes are skipped over rather than split on).
const splitFinder = /,|\r?\n|"(\\"|[^"])*?"/g;

/**
 * Parses CSV text into rows of cells. Quoted cells may contain commas and
 * newlines; the surrounding quotes are removed from the output.
 *
 * @param {string} parseMe - The CSV text.
 * @returns {string[][]} One array of cell strings per row.
 */
function csvTo2dArray(parseMe) {
  let currentRow = [];
  const rowsOut = [currentRow];
  // The regex is shared and stateful, so rewind it before every parse.
  let lastIndex = splitFinder.lastIndex = 0;

  // Adds the text from lastIndex up to (not including) endIndex as a cell.
  // endIndex is compared against undefined explicitly: `endIndex || length`
  // treated a separator at index 0 as "no index given" and pushed the whole
  // input as the first cell.
  const pushCell = (endIndex) => {
    const end = endIndex === undefined ? parseMe.length : endIndex;
    const addMe = parseMe.substring(lastIndex, end);
    // remove quotes around the item
    currentRow.push(addMe.replace(/^"|"$/g, ""));
    lastIndex = splitFinder.lastIndex;
  };

  let regexResp;
  // for each regexp match (either comma, newline, or quoted item)
  while ((regexResp = splitFinder.exec(parseMe)) !== null) {
    const split = regexResp[0];
    // if it's not a quote capture, add an item to the current row
    // (quote captures will be pushed by the newline or comma following)
    if (split.startsWith(`"`) === false) {
      const splitStartIndex = splitFinder.lastIndex - split.length;
      pushCell(splitStartIndex);
      // then start a new row if newline
      const isNewLine = /^\r?\n$/.test(split);
      if (isNewLine) { rowsOut.push(currentRow = []); }
    }
  }
  // make sure to add the trailing text (no commas or newlines after)
  pushCell();
  return rowsOut;
}
const rawCsv = `a,b,c\n"test\r\n","comma, test","\r\n",",",\nsecond,row,ends,with,empty\n"quote\"test"`
const rows = csvTo2dArray(rawCsv);
console.log(rows);
No regexp, readable, and according to https://en.wikipedia.org/wiki/Comma-separated_values#Basic_rules:
/**
 * Parses CSV text without regular expressions. A cell that starts with a
 * double quote is a quoted cell: inside it "" stands for one quote, and
 * commas and line breaks are ordinary characters.
 *
 * @param str The CSV text.
 * @returns One array of cell strings per line.
 */
function csv2arr(str: string) {
  const rows = [[""]];
  let insideQuotes = false;
  let pos = 0;

  // Appends text to the last cell of the last row.
  const appendToCell = (text: string) => {
    const row = rows[rows.length - 1];
    row[row.length - 1] += text;
  };

  while (pos < str.length) {
    const ch = str[pos];
    const following = str[pos + 1];
    pos += 1;

    if (insideQuotes) {
      if (ch !== '"') {
        appendToCell(ch);
      } else if (following === '"') {
        // Doubled quote: emit one quote and skip its twin.
        appendToCell('"');
        pos += 1;
      } else {
        insideQuotes = false;
      }
      continue;
    }

    const row = rows[rows.length - 1];
    if (ch === '"' && row[row.length - 1].length === 0) {
      // A quote only opens a quoted cell when the cell is still empty.
      insideQuotes = true;
    } else if (ch === ",") {
      row.push("");
    } else if (ch === "\r" || ch === "\n") {
      // CRLF counts as a single line break.
      if (ch === "\r" && following === "\n") {
        pos += 1;
      }
      rows.push([""]);
    } else {
      appendToCell(ch);
    }
  }
  return rows;
}
If you can have your quote delimiter be double quotes, then this is a duplicate of Example JavaScript code to parse CSV data.
You can either translate all single-quotes to double-quotes first:
string = string.replace( /'/g, '"' );
...or you can edit the regex in that question to recognize single-quotes instead of double-quotes:
// Quoted fields.
"(?:'([^']*(?:''[^']*)*)'|" +
However, this assumes certain markup that is not clear from your question. Please clarify what all the various possibilities of markup can be, per my comment on your question.
I've used regex a number of times, but I always have to relearn it each time, which is frustrating :-)
So Here's a non-regex solution:
/**
 * Splits one CSV row into values without regular expressions.
 *
 * A value that starts with quoteChar runs up to the next
 * quoteChar + delimiter pair. For the last value of the row, which has no
 * trailing delimiter, the closing quote at the end of the row is now
 * recognised, so 'a,"b"' yields "b" rather than 'b"'.
 *
 * Limitation: doubled quotes inside a quoted value are not unescaped.
 *
 * @param {string} row - The CSV row.
 * @param {string} [delimiter=','] - Field separator.
 * @param {string} [quoteChar='"'] - Quote character (a single character).
 * @returns {string[]} The values.
 */
function csvRowToArray(row, delimiter = ',', quoteChar = '"') {
  const values = [];
  const rowLength = row.length;
  let start = 0;

  while (start <= rowLength) {
    const quoted = row.charAt(start) === quoteChar;
    let end;
    if (quoted) {
      start += 1; // skip the opening quote
      end = row.indexOf(quoteChar + delimiter, start);
      if (end < 0 && rowLength - 1 >= start && row.endsWith(quoteChar)) {
        // Last value of the row: its closing quote is the final character.
        end = rowLength - 1;
      }
    } else {
      end = row.indexOf(delimiter, start);
    }
    if (end < 0) {
      end = rowLength; // no terminator found: take the rest of the row
    }
    values.push(row.substring(start, end));
    // Step over the delimiter, plus the closing quote of a quoted value.
    start = end + delimiter.length + (quoted ? 1 : 0);
  }
  return values;
}
How it works:
Pass in the csv string in row.
While the start position of the next value is within the row, do the following:
If this value has been quoted, set nEnd to the closing quote.
Else if value has NOT been quoted, set nEnd to the next delimiter.
Add the value to an array.
Set nStart to nEnd plus the length of the delimiter (plus one more for the closing quote if the value was quoted).
Sometimes it's good to write your own small function, rather than use a library. Your own code is going to perform well and use only a small footprint. In addition, you can easily tweak it to suit your own needs.
Regular expressions to the rescue! These few lines of code properly handle quoted fields with embedded commas, quotes, and newlines based on the RFC 4180 standard.
/**
 * Parse CSV text into a grid (array of rows, each an array of cell strings).
 *
 * Quoted fields may contain the field separator, doubled quotes ("") and line
 * breaks. Those are temporarily swapped for control characters so the text can
 * then be split on plain line breaks and separators, and are swapped back per cell.
 *
 * @param {string} data - The CSV text. All "\r" characters are removed and
 *   trailing line breaks are dropped before parsing.
 * @param {string} [fieldSep=','] - Field separator.
 * @param {string} [newLine='\n'] - Text substituted for a line break that
 *   occurs inside a quoted field.
 * @returns {string[][]} One array of cells per line.
 */
function parseCsv(data, fieldSep, newLine) {
  const sep = fieldSep || ',';
  const lineBreak = newLine || '\n';
  // Placeholders for line breaks, quotes and separators found inside quoted fields.
  const nSep = '\x1D';
  const qSep = '\x1E';
  const cSep = '\x1F';
  // Escape the separator so characters such as "|", "^" or "." are taken literally.
  const sepPattern = sep.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // A quoted field: preceded by start/separator/line break, followed by
  // end/separator/line break, and not ending on an unpaired quote.
  const fieldRe = new RegExp(
    '(?<=(^|' + sepPattern + '|\\n))"(|[\\s\\S]+?(?<![^"]"))"(?=($|' + sepPattern + '|\\n))',
    'g'
  );
  // Literal replace-all that never interprets "$" sequences in the replacement.
  const swap = (text, from, to) => text.split(from).join(to);

  const masked = data
    .replace(/\r/g, '')
    .replace(/\n+$/, '')
    // Mask the actual separator (not a hard-coded comma) inside quoted content.
    .replace(fieldRe, (match, p1, p2) => swap(swap(swap(p2, '\n', nSep), '""', qSep), sep, cSep));

  return masked
    .split(/\n/)
    .map((line) =>
      line
        .split(sep)
        .map((cell) => swap(swap(swap(cell, nSep, lineBreak), qSep, '"'), cSep, sep))
    );
}
// Example input: a plain header row, then a row whose quoted fields contain
// doubled quotes, a comma, and a line break respectively.
const csv = 'A1,B1,C1\n"A ""2""","B, 2","C\n2"';
const separator = ','; // field separator, default: ','
const newline = ' <br /> '; // newline representation in case a field contains newlines, default: '\n'
var grid = parseCsv(csv, separator, newline);
// expected: [ [ 'A1', 'B1', 'C1' ], [ 'A "2"', 'B, 2', 'C <br /> 2' ] ]
Unless stated elsewhere, you don't need a finite state machine. The regular expression handles RFC 4180 properly thanks to positive lookbehind, negative lookbehind, and positive lookahead.
Clone/download code at https://github.com/peterthoeny/parse-csv-js
I have also faced the same type of problem when I had to parse a CSV file.
The file contains a column address which contains the ',' .
After parsing that CSV file to JSON, I get mismatched mapping of the keys while converting it into a JSON file.
I used Node.js for parsing the file and libraries like baby parse and csvtojson.
Example of file -
address,pincode
foo,baar , 123456
While I was parsing directly without using baby parse in JSON, I was getting:
[{
address: 'foo',
pincode: 'baar',
'field3': '123456'
}]
So I wrote code which removes the comma(,) with any other delimiter
with every field:
/**
 * Re-serialise a CSV string so that commas INSIDE field values are replaced by
 * another delimiter ('|'), leaving commas only as field separators.
 *
 * The input is parsed into rows with babyparse; every comma within a parsed
 * field is replaced (e.g. the field 'foo, bar' becomes 'foo| bar'); the fields
 * of each row are then joined with ',' and each row is terminated with '\n'.
 *
 * @param {string} csvString - CSV text to rewrite.
 * @returns {string} The rewritten CSV text, one '\n'-terminated line per parsed row.
 */
const removeComma = function(csvString){
  const delimiter = '|';
  const Baby = require('babyparse');
  // Array of rows, each row an array of field strings.
  const arrRow = Baby.parse(csvString).data;
  return arrRow
    // Replace the commas inside each field.
    .map((singleRow) => singleRow.map((singleField) => singleField.split(',').join(delimiter)))
    // join(',') puts commas only BETWEEN fields; appending ',' after every
    // field left a trailing comma on each line, i.e. an extra empty column.
    .map((singleRow) => singleRow.join(',') + '\n')
    .join('');
}
The function returned can be passed into the csvtojson library and thus the result can be used.
const csv = require('csvtojson')
let csvString = "address, pincode\\nfoo, bar, 123456\\n"
let jsonArray = []
// Declared with const: the bare assignment created an implicit global,
// which is a ReferenceError in strict mode and in ES modules.
const modifiedCsvString = removeComma(csvString)
// Parse the rewritten CSV; each parsed record is collected into jsonArray.
csv()
.fromString(modifiedCsvString)
.on('json', json => jsonArray.push(json))
.on('end', () => {
/* do any thing with the json Array */
})
Now you can get the output like:
[{
address: 'foo, bar',
pincode: 123456
}]
My answer presumes your input is a reflection of code/content from web sources where single and double quote characters are fully interchangeable provided they occur as a non-escaped matching set.
You cannot use regex for this. You actually have to write a micro parser to analyze the string you wish to split. I will, for the sake of this answer, call the quoted parts of your strings as sub-strings. You need to specifically walk across the string. Consider the following case:
var a = "some sample string with \"double quotes\" and 'single quotes' and some craziness like this: \\\" or \\'",
b = "sample of code from JavaScript with a regex containing a comma /\,/ that should probably be ignored.";
In this case you have absolutely no idea where a sub-string starts or ends by simply analyzing the input for a character pattern. Instead you have to write logic to make decisions on whether a quote character is used as a quote character, is itself unquoted, and that the quote character is not following an escape.
I am not going to write that level of complexity of code for you, but you can look at something I recently wrote that has the pattern you need. This code has nothing to do with commas, but is otherwise a valid enough micro-parser for you to follow in writing your own code. Look into the asifix function of the following application:
https://github.com/austincheney/Pretty-Diff/blob/master/fulljsmin.js
To complement this answer
If you need to parse quotes escaped with another quote, example:
"some ""value"" that is on xlsx file",123
You can use
/**
 * Split one line of comma-separated text into its values.
 *
 * Recognised value forms, tried in this order:
 *   1. single-quoted, with backslash escapes (\' becomes ');
 *   2. double-quoted, with backslash escapes (\" becomes ");
 *   3. double-quoted, with doubled-quote escapes ("" becomes ");
 *   4. unquoted text.
 * A trailing comma contributes one final empty value.
 *
 * @param {string} text - The line to split.
 * @returns {string[]} The extracted values.
 */
function parse(text) {
  const csvExp = /(?!\s*$)\s*(?:'([^'\\]*(?:\\[\S\s][^'\\]*)*)'|"([^"\\]*(?:\\[\S\s][^"\\]*)*)"|"([^""]*(?:"[\S\s][^""]*)*)"|([^,'"\s\\]*(?:\s+[^,'"\s\\]+)*))\s*(?:,|$)/g;
  const values = [];

  // Exactly one of the four capture groups participates in each match.
  for (const [, singleQuoted, backslashQuoted, doubledQuoted, bare] of text.matchAll(csvExp)) {
    if (singleQuoted !== undefined) {
      values.push(singleQuoted.replace(/\\'/g, "'"));
    } else if (backslashQuoted !== undefined) {
      values.push(backslashQuoted.replace(/\\"/g, '"'));
    } else if (doubledQuoted !== undefined) {
      values.push(doubledQuoted.replace(/""/g, '"'));
    } else if (bare !== undefined) {
      values.push(bare);
    }
  }

  // The pattern never matches at the very end, so account for "a,b," here.
  const endsWithComma = /,\s*$/.test(text);
  if (endsWithComma) {
    values.push('');
  }
  return values;
}
While reading the CSV file into a string, it contains null values in between strings, so try it with \0 line by line. It works for me.
stringLine = stringLine.replace(/\0/g, "" );
Try this one.
/**
 * Parse CSV text into an array of rows (arrays of strings).
 *
 * Two-character quote sequences (backslash + quote, or a doubled quote) are
 * first swapped for `_r#<index>` placeholders so they cannot be mistaken for
 * field delimiters; the text is then tokenised and the placeholders restored.
 *
 * @param {string} csv - The CSV text.
 * @returns {string[][]} The non-empty rows.
 */
function parseCSV(csv) {
// Two-character sequences taken out of the text, indexed by placeholder number.
let quotes = [];
// A token is: a section delimited by a matching ', " or ` (group 1 = delimiter,
// group 2 = contents), or a run of characters other than tab/comma/CR/LF
// (group 3), or a single CR or LF (group 4).
let token = /(?:(['"`])([\s\S]*?)\1)|([^\t,\r\n]+)\3?|([\r\n])/gm;
// Matches an optional backslash, a quote character, and optionally the same
// quote again. Only matches of length exactly 2 are replaced by a placeholder;
// matches of any other length are left untouched.
let text = csv.replace(/\\?(['"`])\1?/gm, s => s.length != 2 ? s : `_r#${quotes.push(s) - 1}`);
return [...text.matchAll(token)]
// NOTE(review): if groups 2-4 were all empty/undefined the .replace below
// would throw; confirm whether that is reachable.
.map(t => (t[2] || t[3] || t[4])
// A token that is nothing but a placeholder becomes the empty string.
.replace(/^_r#\d+$/, "")
// Any other placeholder is replaced by the second character of the stored sequence.
.replace(/_r#\d+/g, q => quotes[q.replace(/\D+/, '')][1]))
// A lone CR or LF token starts a new row; every other token is appended to the last row.
.reduce((a, b) => /^[\r\n]$/g.test(b)
? a.push([]) && a
: a[a.length - 1].push(b) && a, [[]])
// Drop rows that ended up with no cells.
.filter(d => d.length);
}
Use the npm library csv-string to parse the strings instead of split: https://www.npmjs.com/package/csv-string
This will handle the comma in quotes and empty entries
This one is based on niry's answer but for semicolon:
'use strict';
/**
 * Parse delimiter-separated text into an array of rows, one character at a time.
 *
 * Double quotes toggle "quoted" mode; inside quotes the delimiter and line
 * breaks are ordinary characters. A quote that directly follows a closing
 * quote is an escaped quote ("") and is emitted as a single ".
 * Rows end at "\n" outside quotes; a "\r" immediately before it is dropped.
 *
 * @param {string} text - The text to parse.
 * @param {string} [delimiter=';'] - Single-character field delimiter.
 * @returns {string[][]} The rows; always contains at least one row with one field.
 */
function csvToArray(text, delimiter = ';') {
  const rows = [['']];
  let row = rows[0];
  let fieldIndex = 0;
  let previous = '';
  let outsideQuotes = true;

  for (const char of text) {
    let current = char;
    if (current === '"') {
      // Re-opening right after a close means the pair was an escaped quote.
      if (outsideQuotes && previous === '"') {
        row[fieldIndex] += '"';
      }
      outsideQuotes = !outsideQuotes;
    } else if (current === delimiter && outsideQuotes) {
      fieldIndex += 1;
      row[fieldIndex] = '';
      current = ''; // forget the delimiter so it is never taken for a quote/CR
    } else if (current === '\n' && outsideQuotes) {
      if (previous === '\r') {
        // CRLF line ending: remove the CR that was already appended.
        row[fieldIndex] = row[fieldIndex].slice(0, -1);
      }
      row = [''];
      rows.push(row);
      fieldIndex = 0;
      current = '';
    } else {
      row[fieldIndex] += current;
    }
    previous = current;
  }
  return rows;
}
// Sample input: two CRLF-separated rows of semicolon-delimited fields that
// exercise doubled quotes, semicolons inside quotes, an unquoted field and a
// quoted field containing a CRLF.
let test = '"one";"two with escaped """" double quotes""";"three; with; commas";four with no quotes;"five with CRLF\r\n"\r\n"2nd line one";"two with escaped """" double quotes""";"three, with; commas and semicolons";four with no quotes;"five with CRLF\r\n"';
console.log(csvToArray(test));
Aside from the excellent and complete answer from ridgerunner, I thought of a very simple workaround for when your backend runs PHP.
Add this PHP file to your domain's backend (say: csv.php)
<?php
session_start(); // Optional
// The response body is JSON, so declare it as JSON. The charset is a parameter
// of Content-Type; a bare "charset=UTF-8" line is not a valid header.
header("Content-Type: application/json; charset=UTF-8");
// Split the posted CSV into lines (using "\n" as the delimiter), then parse
// each line into fields with str_getcsv's default settings:
echo json_encode(array_map('str_getcsv', str_getcsv($_POST["csv"], "\n")));
?>
Now add this function to your JavaScript toolkit (should be revised a bit to make crossbrowser I believe).
/**
 * POST the CSV text to the server-side parser and log the reply.
 *
 * The request is asynchronous; once it completes with HTTP status 200 the raw
 * response text and its JSON-decoded form are written to the console.
 * Nothing is returned to the caller.
 *
 * @param {string} csv - CSV text, sent URL-encoded as the "csv" form field.
 */
function csvToArray(csv) {
  const request = new XMLHttpRequest();

  request.addEventListener("readystatechange", function handleStateChange() {
    // Wait for the finished state (4) with a successful status.
    if (this.readyState != 4 || this.status != 200) {
      return;
    }
    const body = this.responseText;
    console.log(body);
    console.log(JSON.parse(body));
  });

  request.open("POST", "path/to/csv.php", true);
  request.setRequestHeader("Content-type", "application/x-www-form-urlencoded; charset=utf-8");
  request.send("csv=" + encodeURIComponent(csv));
}
It will cost you one Ajax call, but at least you won't duplicate code nor include any external library.
Ref: http://php.net/manual/en/function.str-getcsv.php
You can use papaparse.js like the example below:
<!DOCTYPE html>
<html lang="en">
<head>
<title>CSV</title>
</head>
<body>
<input type="file" id="files" multiple="">
<button onclick="csvGetter()">CSV Getter</button>
<h3>The Result will be in the Console.</h3>
<script src="papaparse.min.js"></script>
<script>
// Parse the first file chosen in the #files input with Papa Parse and log the
// parsed rows to the console once parsing completes.
function csvGetter() {
  const selected = document.getElementById('files').files[0];
  const logRows = (results) => {
    console.log(results.data);
  };
  Papa.parse(selected, { complete: logRows });
}
</script>
</body>
</html>
Don't forget to include papaparse.js in the same folder.
According to this blog post, this function should do it:
/**
 * Split a string on `sep`, keeping separators that occur inside
 * single-quoted fields.
 *
 * The string is first split naively, then the pieces are scanned from last to
 * first. A piece that ends with a quote but does not start with one is glued
 * back onto its predecessor; a piece that both starts and ends with a quote is
 * a complete quoted field, so its quotes (and the whitespace outside them) are
 * stripped and doubled quotes ('') are reduced to one. Unquoted fields are
 * returned unchanged, including any surrounding whitespace.
 *
 * NOTE: this extends the built-in String prototype; the name is kept only so
 * existing callers of "…".splitCSV() continue to work.
 *
 * @param {string} [sep=","] - Field separator.
 * @returns {string[]} The fields.
 */
String.prototype.splitCSV = function(sep) {
  const separator = sep || ",";
  let fields = this.split(separator);
  for (let x = fields.length - 1; x >= 0; x--) {
    // Ignore whitespace outside the quotes when looking for them. Testing the
    // trimmed text fixes the earlier check, which indexed with the untrimmed
    // length and so never saw a closing quote followed by spaces.
    const core = fields[x].trim();
    if (!core.endsWith("'")) {
      continue; // not the end of a quoted field: leave the piece as it is
    }
    if (core.length > 1 && core.startsWith("'")) {
      // Complete quoted field: drop the quotes, unescape doubled quotes.
      fields[x] = fields[x].replace(/^\s*'|'\s*$/g, '').replace(/''/g, "'");
    } else if (x) {
      // Tail of a quoted field: merge with the previous piece, which is then
      // examined on the next iteration.
      fields.splice(x - 1, 2, [fields[x - 1], fields[x]].join(separator));
    } else {
      // Reached the first piece without finding an opening quote: undo the
      // merging by splitting it again.
      fields = fields.shift().split(separator).concat(fields);
    }
  }
  return fields;
};
You would call it like so:
// Example: the quoted first field keeps its commas, so this alerts
// "string, duppi, du| 23| lala" (the unquoted fields keep their leading space).
var string = "'string, duppi, du', 23, lala";
var parsed = string.splitCSV();
alert(parsed.join("|"));
This jsfiddle kind of works, but it looks like some of the elements have spaces before them.

Regular Expressions - Matching IRC-like parameters?

I am looking to create a IRC-like command format:
/commandname parameter1 "parameter 2" "parameter \"3\"" parameter"4 parameter\"5
Which would (ideally) give me a list of parameters:
parameter1
parameter 2
parameter "3"
parameter"4
parameter\"5
Now from what I have read, this isn't at all trivial and might as well be done in some other method.
Thoughts?
Below is C# code that does the job I need:
/// <summary>
/// Splits an IRC-style command line into tokens (the command name followed by
/// its parameters).
/// </summary>
/// <remarks>
/// Surrounding whitespace and leading '/' characters are removed first.
/// Tokens are separated by spaces. A token that starts with a double quote
/// (other than the very first token) runs to the next unescaped double quote
/// and may contain spaces; inside it, \" yields a literal quote. A quote in the
/// middle of an unquoted token is an ordinary character. A quoted token that
/// is never closed is not returned.
/// </remarks>
/// <param name="command">The raw command line.</param>
/// <returns>The tokens in order of appearance.</returns>
public List<string> ParseIrcCommand(string command)
{
    command = command.Trim();
    command = command.TrimStart(new char[] { '/' });
    // Sentinel space so the final unquoted token is flushed by the loop.
    command += ' ';
    List<string> Tokens = new List<string>();
    bool inQuotes = false;
    bool inToken = true;
    string currentToken = "";
    for (int i = 0; i < command.Length; i++)
    {
        char currentChar = command[i];
        char nextChar = (i + 1 >= command.Length ? ' ' : command[i + 1]);
        // A space ends an unquoted token.
        if (!inQuotes && inToken && currentChar == ' ')
        {
            Tokens.Add(currentToken);
            currentToken = "";
            inToken = false;
            continue;
        }
        // A quote ends a quoted token.
        if (inQuotes && inToken && currentChar == '"')
        {
            Tokens.Add(currentToken);
            currentToken = "";
            inQuotes = false;
            inToken = false;
            // Consume the separator that follows the closing quote.
            if (nextChar == ' ') i++;
            continue;
        }
        // \" inside a quoted token is a literal quote.
        if (inQuotes && inToken && currentChar == '\\' && nextChar == '"')
        {
            i++;
            currentToken += nextChar;
            continue;
        }
        if (!inToken)
        {
            // Skip additional separator spaces; previously they fell through
            // and were prepended to the next token.
            if (currentChar == ' ') continue;
            inToken = true;
            if (currentChar == '"')
            {
                inQuotes = true;
                continue;
            }
        }
        currentToken += currentChar;
    }
    return Tokens;
}
You have shown your code - that's good, but it seems that you haven't thought about whether it is reasonable to parse the command like that:
Firstly, your code will allow new line character inside the command name and parameters. It would be reasonable if you assume that new line character can never be there.
Secondly, \ also needs to be escaped like ", since there will be no way to specify a single \ at the end of a parameter without causing any confusion.
Thirdly, it is a bit weird to have the command name parsed the same way as parameters - command names are usually predetermined and fixed, so there is no need to allow for flexible ways to specify it.
I cannot think of one-line solution in JavaScript that is general. JavaScript regex lacks \G, which asserts the last match boundary. So my solution will have to make do with beginning of string assertion ^ and chomping off the string as a token is matched.
(There is not much code here, mostly comments)
/**
 * Tokenise an IRC-style command line.
 *
 * Leading whitespace and the "/" prefix are removed, then tokens are peeled
 * off the front of the remaining text one at a time (JavaScript has no \G
 * anchor, so the pattern is anchored with ^ and the consumed text is cut away
 * after every match). Two token forms are recognised, each optionally
 * preceded by spaces:
 *   - unquoted: one or more non-space characters not starting with a quote
 *     (the (?!") look-ahead stops a quoted token with a missing closing quote
 *     from being re-read as an unquoted one);
 *   - quoted: "..." containing anything except bare quotes/backslashes, plus
 *     the escapes \" and \\, which are unescaped in the result.
 *
 * @param {string} str - The raw command line.
 * @returns {string[]|null} The tokens, or null when something other than
 *   spaces is left over (i.e. a quoted token was opened but never closed).
 */
function parseCommand(str) {
  const tokenAtStart = /^ *(?:(?!")([^ ]+)|"((?:[^\\"]|\\[\\"])*)")/;
  const tokens = [];
  let rest = str.replace(/^\s*\//, "");

  for (let hit = rest.match(tokenAtStart); hit !== null; hit = rest.match(tokenAtStart)) {
    const [consumed, bare, quoted] = hit;
    // Quoted tokens need their escape sequences converted back.
    tokens.push(bare === undefined ? quoted.replace(/\\([\\"])/g, '$1') : bare);
    rest = rest.slice(consumed.length);
  }

  // Only trailing spaces may remain after a successful parse.
  return /^ *$/.test(rest) ? tokens : null;
}
I created a simple regex that matches the command line you wrote.
/\w+\s((("([^\\"]*\\")*[^\\"]*")|[^ ]+)(\b|\s+))+$
/\w+\s finds the first part of your command
(((
"([^\\"]*\\")* finds any string starting with " that doesn't contain \" followed by a \" one or more times (thus allowing "something\", "some\"thing\" and so on)
[^\\"]*" followed by a list of characters not containing \ or " and at last a "
)|[^ ]+ this is an alternative: finds any nonspace character sequence
)
(\b|\s+) all followed by a space or a word boundary
)+$ one or more times, one per command, until the end of the string
I'm afraid that this can fail sometimes, but I posted this to show that sometimes the arguments have a structure based on repetition, for example see "something\"something\"something\"end" where the repeated structure is something\", and you can use this idea to build your regex

Categories