I am trying to add a "variable declaration" feature to the language I am parsing.
PEG.js source:
// Entry rule. It has no action, so its result is the array of its parts:
// [begin, line, [one result per pl], end].
start
=begin line (pl)+ end
// One statement: either `set <identifier> to <integer>` or `print(<identifier>)`.
pl
// This action only reassigns the local label `left` and returns nothing, so
// nothing is stored anywhere and the rule's result is undefined.
=varx" " left:identifier" "to" "middle:integer line { left=middle;}
/
// `middle` holds the identifier's text, so alert() is given the variable
// name; the action returns nothing, so the result is undefined as well.
print"(" middle:identifier ")" line {alert(middle);}
line
="\n"
begin
="start"
print
="print"
// `if`, `equals` and `gth` are not referenced by any other rule in this grammar.
if
="if"
equals
="equals"
gth
="greater than"
// One or more ASCII letters; yields the matched text.
identifier
=[a-zA-Z]+ {return text();}
to
="to"
varx
="set"
end
="end"
// One or more decimal digits; yields a number. parseInt already returns a
// number, so the Number() wrapper does not change the value.
integer "integer"
= digits:[0-9]+ { return Number(parseInt(digits.join(""), 10)); }
My custom input source:
start
set a to 5
print(a)
end
What output I got:
[
"start",
"
",
[
undefined,
undefined
],
"end"
]
And in the alert I get only the variable name `a`, not its value...
try this
// Program: "start", a newline, one or more set/print sections, then "end".
// Yields the printed values, one per line, wrapped in a leading and a
// trailing "\n".
all
  = "start" nl _ sections:putThenPrint+ _ "end"
    {
      // A print only looks up variables assigned in its own section.
      const printed = [];
      for (const section of sections) {
        for (const name of section.toPrint) {
          printed.push(section.values[name]);
        }
      }
      return "\n" + printed.join("\n") + "\n";
    }

// One section: a run of assignments followed by a run of prints.
putThenPrint
  = _ assigned:multiPutN _ requested:multiPrintN _
    {
      return { values: assigned, toPrint: requested };
    }

// One or more print statements; yields the variable names in source order.
multiPrintN
  = _ names:printN+ _
    {
      return names.slice();
    }

// One or more assignments; yields an object mapping name -> value
// (a later assignment to the same name wins).
multiPutN
  = _ pairs:putN+ _
    {
      const table = {};
      for (const pair of pairs) {
        table[pair[0]] = pair[1];
      }
      return table;
    }

// `set <name> to <integer>` followed by at least one newline; yields [name, value].
putN
  = _ "set " _ name:varName _ "to " _ value:n _ nl+ { return [name, value]; }

// `print(<name>)` followed by at least one newline; yields the name.
printN
  = _ "print(" _ name:varName _ ")" _ nl+ { return name; }

varName
  = [a-zA-Z]+ { return text(); }

// text() includes any leading blanks matched by `_`; parseInt skips them.
n "integer number"
  = _ [0-9]+ { return parseInt(text(), 10); }

nl "new line"
  = [\n]

_ "whitespace"
  = [ \t]*
This grammar does not support "if" and some of the other things you are trying to do in your grammar, but it should give you a starting point.
You need to describe what you want in more detail, with more examples and their expected output.
Related
Why can we output only once in PEG.js?
Does anyone know another way to produce multiple outputs?
I am using a stack and a function; code below:
/**
 * Calls every entry of `stack`, from the last index down to index 0, and
 * then returns `result`.
 *
 * Both `stack` and `result` come from the enclosing scope; this function
 * neither declares nor resets them.
 *
 * @returns {*} the current value of the outer `result` variable
 */
function evalStack() {
  let index = stack.length;
  while (index-- > 0) {
    stack[index]();
  }
  return result;
}
My custom input:
start
A=4;
A
B=8;
B
Result I expected:
4
8
Result I got:
4
Please help me
try this dirty solution:
// Program: one or more sections, each a run of assignments (`A=4;`) followed
// by a run of prints (a bare variable name on its own line).
// Yields the printed values, one per line, wrapped in a leading and a
// trailing "\n".
all
  = _ sections:putThenPrint+ _
    {
      // A print only looks up variables assigned in its own section.
      const printed = [];
      for (const { values, toPrint } of sections) {
        for (const name of toPrint) {
          printed.push(values[name]);
        }
      }
      return "\n" + printed.join("\n") + "\n";
    }

// One section: a run of assignments followed by a run of prints.
putThenPrint
  = _ mn:multiPutN _ pn:multiPrintN _
    {
      return { values: mn, toPrint: pn };
    }

// One or more print statements; yields the variable names in source order.
multiPrintN
  = _ names:printN+ _
    {
      return names.slice();
    }

// One or more assignments; yields an object mapping name -> value
// (a later assignment to the same name wins).
multiPutN
  = _ pairs:putN+ _
    {
      const table = {};
      for (const [name, value] of pairs) {
        table[name] = value;
      }
      return table;
    }

// `<name> = <integer> ;` then end of line; yields [name, value].
putN
  = _ vn:varName _ "=" _ nn:n _ ";" _ eol { return [vn, nn]; }

// A bare variable name, then end of line; yields the name.
printN
  = _ name:varName _ eol { return name; }

varName
  = [a-zA-Z]+ { return text(); }

// text() includes any leading blanks matched by `_`; parseInt skips them.
n "integer number"
  = _ [0-9]+ { return parseInt(text(), 10); }

// A statement ends at one or more newlines, or at the end of the input, so
// the last line no longer needs a trailing newline to parse.
eol "end of line"
  = nl+
  / !.

nl "new line"
  = [\n]

// Matches spaces and tabs only (never newlines), hence the display name.
_ "whitespace"
  = [ \t]*
In the grammar above, a print only yields the value of a variable assigned in the same section, so once a section's variables have been printed you cannot print them again in a later section. If you change the JavaScript inside the "all" rule, you can instead scan the whole input first and then print every variable, if that is what you want; but then values can be printed before they are assigned. As I mentioned, this is just a dirty solution that needs to be optimised and cleaned up.
As part of my parser I want to add arithmetic and boolean expressions. I wanted to take default PEG.js example at https://pegjs.org/online but the problem is that that parser is recursive and you can't write two or more lines:
For instance this is valid JavaScript:
2 * (3 + 4)
2 * (3 + 4)
2 * (3 + 4)
+ 10
As you can see, there are 3 expressions, and the end of a line does not by itself terminate an expression. But with PEG.js the terminator has to be encoded explicitly in the grammar so that an expression can end.
How would you write a grammar that accepts any number of expressions like this, each one terminating before the next one begins?
You could add a Start rule like the following for multiple expressions.
// A sequence of expressions separated by "\n"; yields the list of results.
// Because `_` also matches newlines, a line that begins with an operator
// continues the expression on the previous line.
Start
  = first:Expression rest:("\n" Expression)* {
      var values = [first];
      for (var i = 0; i < rest.length; i++) {
        values.push(rest[i][1]); // each entry is ["\n", value]
      }
      return values;
    }

// Left-associative addition and subtraction.
Expression
  = head:Term tail:(_ ("+" / "-") _ Term)* {
      var total = head;
      for (var i = 0; i < tail.length; i++) {
        // each entry is [_, operator, _, operand]
        var operator = tail[i][1];
        var operand = tail[i][3];
        total = operator === "+" ? total + operand : total - operand;
      }
      return total;
    }

// Left-associative multiplication and division.
Term
  = head:Factor tail:(_ ("*" / "/") _ Factor)* {
      var product = head;
      for (var i = 0; i < tail.length; i++) {
        var operator = tail[i][1];
        var operand = tail[i][3];
        product = operator === "*" ? product * operand : product / operand;
      }
      return product;
    }

// A parenthesised expression or an integer literal.
Factor
  = "(" _ inner:Expression _ ")" { return inner; }
  / Integer

// text() includes any leading whitespace matched by `_`; parseInt skips it.
Integer "integer"
  = _ [0-9]+ { return parseInt(text(), 10); }

_ "whitespace"
  = [ \t\n\r]*
I have a task to calculate characters to highlight in the text based on a query.
Let's say the given text is "London, United Kingdom" and query is "lond". Then the result should be [[0, 4]].
I have a simple implementation which works fine for this case:
// ...
// Collects one [start, end) highlight range per query word that is found in
// `text`; `result` is the accumulated list of ranges.
.reduce((result, word) => {
const wordLen = word.length;
// Anchor the pattern at a word boundary only when the query word starts with
// a word character.
const prefix = wordCharacterRegex.test(word[0]) ? "\\b" : "";
const regex = new RegExp(prefix + escapeRegexCharacters(word), "i");
const index = text.search(regex);
if (index > -1) {
// The end of the range is derived from the query word's length, not from
// the length of the text that actually matched.
result.push([index, index + wordLen]);
// Overwrite the matched span with spaces so later words are searched
// against text in which these characters are already consumed.
text =
text.slice(0, index) +
new Array(wordLen + 1).join(" ") +
text.slice(index + wordLen);
}
return result;
}, [])
// ...
but then if text is "EC2V 6DB, London, United Kingdom" and the query is "ec2v6db" it doesn't work because the regular expression will be /\bec2v6db/i.
So, how can I change my code and fix the problem?
First of all, the word boundary you add if the first char is a word char should probably be consistent with regards to non-word chars: if you add \b before word chars, add \B before non-word chars to get the same behavior.
const prefix = wordCharacterRegex.test(word[0]) ? "\\b" : "\\B";
Then, it is not clear what your escapeRegexCharacters method looks like but it is there where you may insert \s* between each char of the keyword:
/**
 * Turns a keyword into regex source that matches the keyword's characters
 * in order, allowing any amount of whitespace between neighbouring
 * characters.
 *
 * Regex metacharacters are backslash-escaped; "\s*" is placed between
 * characters only, never after the last one.
 *
 * @param {string} s - the keyword to convert
 * @returns {string} regex source, e.g. "a.b" -> "a\s*\.\s*b"
 */
function escapeRegexCharacters(s) {
  const SPECIAL = /[-\/\\^$*+?.()|[\]{}]/;
  return s
    .split("")
    .map((unit) => (SPECIAL.test(unit) ? "\\" + unit : unit))
    .join("\\s*");
}
Here is a demo:
// Keyword typed without the space that appears in the text; the plain query
// "lond" matches as well.
let word = "ec2v6db";
let text = "EC2V 6DB, London, United Kingdom";
const wordCharacterRegex = /\w/;

// Escapes regex metacharacters in `s` and allows optional whitespace between
// each pair of neighbouring characters.
function escapeRegexCharacters(s) {
  const escaped = [];
  for (let i = 0; i < s.length; i += 1) {
    const unit = s[i];
    escaped.push(/[-\/\\^$*+?.()|[\]{}]/.test(unit) ? `\\${unit}` : unit);
  }
  return escaped.join("\\s*");
}

// \b before a leading word character, \B before a leading non-word character.
const startsWithWordChar = wordCharacterRegex.test(word[0]);
const prefix = startsWithWordChar ? "\\b" : "\\B";
const regex = new RegExp(prefix + escapeRegexCharacters(word), "i");

// Replacing the matched text with spaces
console.log(text.replace(regex, (hit) => " ".repeat(hit.length)));
// => "        , London, United Kingdom"

// Adding tags around the match
console.log(text.replace(regex, "<highlight>$&</highlight>"));

// Getting the indices: [start, end) of the text that actually matched
let match = regex.exec(text);
if (match !== null) {
  const start = match.index;
  console.log([start, start + match[0].length]);
}
I have a text file that contains digits drawn in a seven-segment style, like this:
 _  _  _  _  _  _  _  _  _  (line 1)
  | _| _||_||_ |_   ||_||_| (line 2)
  ||_  _|  | _||_|  ||_| _| (line 3)
                            (line 4)
    _  _  _  _  _  _     _  (line 5)
|_||_|| ||_||_   |  |  ||_  (line 6)
  | _||_||_||_|  |  |  | _| (line 7)
                            (line 8)
Invoice number format:
Each invoice number is constructed of 9 digits [0..9]
Invoice number is written using _ and | characters.
Invoice number input takes 4 lines.
The first 3 lines contain 27 characters.
The fourth line is blank.
The output should be -
723956789
490867715
I read the text file using fs module of node js like this and break every digital character in to three parts basically -
/**
 * Reads "./input1.txt" and splits each of its first three lines into
 * space-separated tokens, appending them to `inputA`, `inputB` and `inputC`
 * (arrays owned by the enclosing scope), logging progress along the way.
 *
 * Changes from the earlier version:
 *  - a read error is reported instead of being ignored;
 *  - the token scan stops at the end of the data, so a last line without a
 *    trailing "\n" no longer loops forever;
 *  - the scan, previously written out three times, lives in one helper.
 */
var fun = function(){
  // Appends the space-separated tokens of the line that begins at `start` to
  // `tokens`, calling `onToken(token)` first when a callback is given.
  // Returns the index of the terminating "\n", or data.length when the data
  // ends before a newline is found.
  function readLineTokens(data, start, tokens, onToken) {
    var i = start;
    while (i < data.length && data[i] !== "\n") {
      if (data[i] === " ") {
        i++;
        continue;
      }
      var str = "";
      // The bounds check is required: past the end data[i] is undefined, which
      // is neither " " nor "\n".
      while (i < data.length && data[i] !== " " && data[i] !== "\n") {
        str = str + data[i];
        i++;
      }
      if (onToken) {
        onToken(str);
      }
      tokens.push(str);
    }
    return i;
  }

  fs.readFile("./input1.txt", 'utf8', function(err, data) {
    if (err) {
      console.error(err);
      return;
    }
    console.log(data.length);

    var i = readLineTokens(data, 0, inputA);
    i++; // step over the newline that ended line 1
    console.log(i,inputA);

    i = readLineTokens(data, i, inputB);
    i++; // step over the newline that ended line 2
    console.log(i,inputB);

    readLineTokens(data, i, inputC, function(str) {
      console.log(str);
    });

    console.log(inputA);
    console.log(inputB);
    console.log(inputC);
  })
}
But I am not able to read the file properly. Can anyone help me read it and get the desired output?
Here are two functions in ES6 which do the job:
// Every digit is drawn in a 3x3 cell. Each entry lists the digit followed by
// the cell's top, middle and bottom rows (exactly 3 characters each).
const DIGIT_GLYPHS = [
  [0, " _ ", "| |", "|_|"],
  [1, "   ", "  |", "  |"],
  [2, " _ ", " _|", "|_ "],
  [3, " _ ", " _|", " _|"],
  [4, "   ", "|_|", "  |"],
  [5, " _ ", "|_ ", " _|"],
  [6, " _ ", "|_ ", "|_|"],
  [7, " _ ", "  |", "  |"],
  [8, " _ ", "|_|", "|_|"],
  [9, " _ ", "|_|", "  |"],
  [9, " _ ", "|_|", " _|"], // alternative 9
];

// 9-character pattern (the three rows concatenated) -> digit.
const DIGIT_BY_PATTERN = new Map(
  DIGIT_GLYPHS.map(([digit, ...rows]) => [rows.join(""), digit])
);

/**
 * Translates one 3x3 glyph into its digit.
 *
 * @param {string} pattern - the glyph's three rows concatenated (9 characters)
 * @returns {number|undefined} the digit, or undefined for an unknown pattern
 */
function getDigit(pattern) {
  return DIGIT_BY_PATTERN.get(pattern);
}

/**
 * Translates three text rows of glyphs into the number they depict.
 *
 * @param {string[]} lines - the top, middle and bottom row; only the first
 *   three entries are used
 * @returns {number} the decoded number (being a number, leading zeros are
 *   not preserved)
 * @throws {RangeError} when a glyph is not a known digit pattern; previously
 *   such glyphs were silently dropped, yielding a shorter, wrong number
 */
function getNumber(lines) {
  // Chop each line into pieces of 3 chars:
  const [top, middle, bottom] = lines.map((line) => line.match(/.../g));
  // Combine the pieces of each digit-pattern together and translate each
  // pattern of 3x3=9 characters to a digit:
  const digits = top.map((piece, i) => {
    const pattern = piece + middle[i] + bottom[i];
    const digit = getDigit(pattern);
    if (digit === undefined) {
      throw new RangeError(
        `Unrecognized digit pattern at position ${i}: ${JSON.stringify(pattern)}`
      );
    }
    return digit;
  });
  // Join digits together into one number
  return Number(digits.join(''));
}

// Sample data: every glyph row is exactly 27 characters (9 glyphs x 3).
const data =
    ' _  _  _  _  _  _  _  _  _ \n'
  + '  | _| _||_||_ |_   ||_||_|\n'
  + '  ||_  _|  | _||_|  ||_| _|\n'
  + '\n'
  + '    _  _  _  _  _  _     _ \n'
  + '|_||_|| ||_||_   |  |  ||_ \n'
  + '  | _||_||_||_|  |  |  | _|';
const lines = data.split('\n');
var a = getNumber(lines.slice(0, 3));
var b = getNumber(lines.slice(4));
console.log(a);
console.log(b);
As Timo stated in the comments, it would be far preferable to ask whomever produced the text file to export their data in a sane format.
If this is not possible, here is one way to translate the numbers into something usable, though I have taken the liberty of changing the format of your first 9 to match the two other nines in your sample input.
If the 9s can be in two different formats, I would simply use a Map to catch both formats.
// Width of one glyph and number of glyphs per invoice number.
const GLYPH_WIDTH = 3;
const GLYPHS_PER_NUMBER = 9;
const ROW_WIDTH = GLYPH_WIDTH * GLYPHS_PER_NUMBER;

// Note the first 9 in the first number has been changed.
// Every glyph row is 27 characters wide; some rows end in a space.
let input = `
 _  _  _  _  _  _  _  _  _ 
  | _| _||_||_ |_   ||_||_|
  ||_  _| _| _||_|  ||_| _|
    _  _  _  _  _  _     _ 
|_||_|| ||_||_   |  |  ||_ 
  | _||_||_||_|  |  |  | _|
`;

// Strip empty lines. Caveat: a glyph row made only of spaces must keep at
// least one space, otherwise it is dropped here as well.
let lines = input.split('\n').filter(Boolean);

// Format: line1 + line2 + line3 (9 characters); the index is the digit.
let translator = [
  ' _ | ||_|', //0
  '     |  |', //1
  ' _  _||_ ', //2
  ' _  _| _|', //3
  '   |_|  |', //4
  ' _ |_  _|', //5
  ' _ |_ |_|', //6
  ' _   |  |', //7
  ' _ |_||_|', //8
  ' _ |_| _|', //9, or ' _ |_|  |' if the other 9 format was correct
]

let nums = [];

// Each "number" is 3 lines long.
for (let i = 0; i < lines.length; i += 3)
{
  // Pad to full width so rows whose trailing spaces were stripped still
  // split into complete glyphs.
  const rows = lines.slice(i, i + 3).map((line) => line.padEnd(ROW_WIDTH));
  let num = 0;

  // There are 9 numbers per line, 3 characters per number per line
  for (let n = 0; n < ROW_WIDTH; n += GLYPH_WIDTH)
  {
    const s = rows.map((row) => row.slice(n, n + GLYPH_WIDTH)).join('');
    const digit = translator.indexOf(s);
    // indexOf yields -1 for an unknown glyph; fail loudly rather than fold
    // -1 into the number.
    if (digit === -1)
    {
      throw new RangeError(`Unrecognized digit pattern ${JSON.stringify(s)} in number ${i / 3 + 1}`);
    }
    num = num * 10 + digit;
  }

  nums.push(num);
}

console.log(nums[0] === 723956789, nums[0]);
console.log(nums[1] === 490867715, nums[1]);
I have essentially the same question as PEG for Python style indentation, but I'd like to get a little more direction regarding this answer.
The answer successfully generates an array of strings that are each line of input with 'INDENT' and 'DEDENT' between lines. It seems like he's pretty much used PEG.js to tokenize, but no real parsing is happening.
So how can I extend his example to do some actual parsing?
As an example, how can I change this grammar:
start = obj

// A node is a single lowercase letter, optionally followed by a braced list
// of child nodes. A leaf yields its letter; a node with braces yields
// { letter: [children...] }.
obj = name:id block:(indent obj* outdent)?
  {
    if (!block) {
      return name;
    }
    // block is [indent, children, outdent]; keep only the children.
    return { [name]: block[1] };
  }

id = [a-z]
indent = '{'
outdent = '}'
to use indentation instead of braces to delineate blocks, and still get the same output?
(Use http://pegjs.majda.cz/online to test that grammar with the following input: a{bcd{zyx{}}})
Parser:
// do not use result cache, nor line and column tracking
// `indent` is the current indentation string; `indentStack` holds the
// enclosing levels. Both are mutated by INDENT and DEDENT while parsing.
{ var indentStack = [], indent = ""; }

start
  = INDENT? l:line
    { return l; }

// One line at the current indentation, optionally followed by a more deeply
// indented block of child lines. Yields the line's text for a leaf, or
// { text: [children...] } when a block follows.
line
  = SAMEDENT line:(!EOL c:. { return c; })+ EOL?
    children:( INDENT c:line* DEDENT { return c; })?
    {
      // `line` is an array of characters; join it before using it as a key,
      // otherwise a multi-character name becomes "a,b,c".
      var key = line.join("");
      if (!children) { return key; }
      var o = {};
      o[key] = children;
      return o;
    }

EOL
  = "\r\n" / "\n" / "\r"

// Succeeds only when the leading blanks equal the current indentation.
SAMEDENT
  = i:[ \t]* &{ return i.join("") === indent; }

// Lookahead only: when the upcoming line is indented more deeply, push the
// current level and adopt the new one.
INDENT
  = &(i:[ \t]+ &{ return i.length > indent.length; }
      { indentStack.push(indent); indent = i.join(""); pos = offset; })

// Restores the enclosing indentation level.
DEDENT
  = { indent = indentStack.pop(); }
Input:
a
  b
  c
  d
    z
    y
    x
Output:
{
"a": [
"b",
"c",
{
"d": [
"z",
"y",
"x"
]
}
]
}
It cannot parse an empty object (last x), however, it should be easy to solve. Trick here is the SAMEDENT rule, it succeeds when indentation level hasn't changed. INDENT and DEDENT change current indentation level without changing position in text pos = offset.
Update 2021
Here is a working example which runs in the online playground of Peggy.js. Peggy.js is a fork of PEG.js under active development. PEG.js was discontinued by David Majda.
The example shows, how the INDENT, SAMEDENT and DEDENT rules are parsed, and how to use parsing locations. Check the console log.
It uses these syntaxes, which may not be known from other parser generators:
(top of file)
{{...}} (Global initializer) — Run ... on parser generation.
{...} (Per-parse initializer) — Run ... on parser instantiation.
(in-file)
X {...} (action) — Do ... when X succeeds. Variables from the initializers are available. If ... returns something, it will replace what X returns.
$X — Return the raw text parsed with X, instead of the result of X.
... @X ... (pluck operator) — Replace the result of ... X ... with the result of X.
X &{...} (predicate) β "and ... also needs to be true for X to succeed".
X = &(...) β If ... succeeds, X succeeds. ... consumes no input.
See the docs for more information.
// Global initializer.
{{
  console.clear()
  console.log('Parser generated')
}}

// Per-parse initializer: indentation state shared by INDENT, SAMEDENT and
// DEDENT, plus a helper that logs where a construct was found.
{
  let indentstack = []
  let indent = ''

  function found (what) {
    let loc = location()
    console.log(`[${loc.start.line}:${loc.start.column} - ${loc.end.line}:${loc.end.column}] found ${what}`)
  }

  console.log('Parser instantiated')
}

// `@` is the pluck operator: the rule yields only the plucked element.
DOCUMENT = NEWLINES? @THINGS NEWLINES? _

THINGS = ( SAMEDENT @( OBJECT / LINE ) )*

// `key:` followed by either an indented block or an inline value.
OBJECT = key:KEY childs:(BLOCK / INLINE) {
  found(`object "${key}"`)
  let o = {}
  o[key] = childs
  return o
}

KEY = @$( [^ \t\r\n:]+ ) _ ':' _

BLOCK = NEWLINES INDENT @THINGS DEDENT

INLINE = line:LINE { return [line] }

LINE = text:$( (!EOL .)+ ) NEWLINES? {
  found(`line "${text}"`)
  return text
}

// Lookahead only: when the upcoming line is indented more deeply, push the
// current level and adopt the new one.
INDENT = &(
  spaces:$( [ \t]+ ) &{
    return spaces.length > indent.length
  } {
    indentstack.push(indent)
    indent = spaces
  }
) {
  found('indent')
}

// Succeeds only when the leading blanks equal the current indentation.
SAMEDENT = spaces:$( [ \t]* ) &{
  return spaces === indent
} {
  found('samedent')
}

/* Because of this rule, results cache must be disabled */
DEDENT = &{
  indent = indentstack.pop()
  return true
} {
  found('dedent')
}

_ = [ \t]*
EOL = '\r\n' / '\n' / '\r'
NEWLINES = (_ EOL)+

/* Test with this input
H:
  a
  b
  c
  G:
    d
    e
    f
*/
Old Answer
Here is a fix for @Jakub Kulhan's grammar which works in PEG.js v 0.10.0. The last line needs to be changed to = &{ indent = indentStack.pop(); return true;} because PEG.js no longer allows standalone actions ({...}) in a grammar. This line is now a predicate (&{...}) which always succeeds (return true;).
I also removed the pos = offset; because it gives the error "offset is not defined". Probably Jakub was referring to some global variable available in older versions of PEG.js. PEG.js now provides the location() function, which returns an object that contains the offset and other information.
// do not use result cache, nor line and column tracking
// `indent` is the current indentation string; `indentStack` holds the
// enclosing levels. Both are mutated by INDENT and DEDENT while parsing.
{ var indentStack = [], indent = ""; }

start
  = INDENT? l:line
    { return l; }

// One line at the current indentation, optionally followed by a more deeply
// indented block of child lines. Yields the line's text for a leaf, or
// { text: [children...] } when a block follows.
line
  = SAMEDENT line:(!EOL c:. { return c; })+ EOL?
    children:( INDENT c:line* DEDENT { return c; })?
    {
      // `line` is an array of characters; join it before using it as a key,
      // otherwise a multi-character name becomes "a,b,c".
      var key = line.join("");
      if (!children) { return key; }
      var o = {};
      o[key] = children;
      return o;
    }

EOL
  = "\r\n" / "\n" / "\r"

// Succeeds only when the leading blanks equal the current indentation.
SAMEDENT
  = i:[ \t]* &{ return i.join("") === indent; }

// Lookahead only: when the upcoming line is indented more deeply, push the
// current level and adopt the new one.
INDENT
  = &(i:[ \t]+ &{ return i.length > indent.length; }
      { indentStack.push(indent); indent = i.join(""); })

// Always-true predicate used for its side effect: restores the enclosing
// indentation level.
DEDENT
  = &{ indent = indentStack.pop(); return true;}
Starting with v 0.11.0, PEG.js also supports the value plucking operator, @, which would allow this grammar to be written even more simply, but as it is currently not available in the online parser I will refrain from adding it to this example.