
refactor: relaxation of grammar & fault-tolerant parser
The lexer has become difficult to manage.
The definition and priority of tokens were conflicting.

For example, I defined these two tokens:

- A "name" token is a sequence of <a-z+><,><a-z+> (e.g. Doe,John).
- A "value" token is a sequence of any character.

Because the "name" token is defined before the "value" token,
parsing failed whenever a tag expecting a "value" token contained
a sequence that matched the "name" pattern, e.g.:

AB  - foo,bar
      ^
      failure: expected "value" token but saw a "name" token instead
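The ordering problem can be sketched in a few lines of vanilla JS (a hypothetical illustration, not the actual moo lexer): rules are tried top to bottom, so the earlier "name" rule shadows the later "value" rule regardless of what the grammar expects at that position.

```javascript
// Ordered token rules: first match wins.
const rules = [
  { type: 'name',  re: /^[a-z]+,[a-z]+/ },  // e.g. Doe,John
  { type: 'value', re: /^.+/ },             // any character sequence
];

function nextToken(input) {
  for (const { type, re } of rules) {
    const m = input.match(re);
    if (m) return { type, text: m[0] };
  }
  return null;
}

// "foo,bar" is lexed as a "name" token even when the grammar
// expected a "value" token at this position.
console.log(nextToken('foo,bar')); // { type: 'name', text: 'foo,bar' }
console.log(nextToken('foo bar')); // { type: 'value', text: 'foo bar' }
```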

In addition, some vendors seem to produce RIS files that, to the
best of my knowledge, aren't compatible with the RIS specification,
which is admittedly sometimes confusing.

For example I have seen this kind of RIS file:

TY  - JOUR
KW  - foo
ER  -

If your editor doesn't show whitespace characters, the error is
invisible: the ER tag contains an extra space:

ER<SPACE><SPACE>-<SPACE><SPACE>

Whereas the RIS spec says that it should be:

ER<SPACE><SPACE>-<SPACE>

In this particular case it seemed unnecessary to have the parser fail.
The error would definitely not be obvious to the user, and it is
easy enough for the parser to just ignore that extra space.

I have therefore decided that the grammar and the lexer will
simply facilitate the parsing of RIS files and act less as validating agents.

BREAKING CHANGE:

The parser will only fail if the RIS file doesn't follow the basic specification:

<TAG><SPACE><SPACE>-<SPACE><CONTENT>
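As a sketch, that basic shape boils down to a single regex check (an illustration of the rule above, not the library's actual lexer):

```javascript
// <TAG><SPACE><SPACE>-<SPACE><CONTENT>
// Tag: an uppercase letter followed by an uppercase letter or digit.
const TAG_LINE = /^([A-Z][A-Z0-9])  - (.*)$/;

console.log(TAG_LINE.test('RP  - FOOBAR')); // true: content is taken as is
console.log(TAG_LINE.test('ER  -  '));      // true: the extra trailing space is tolerated
console.log(TAG_LINE.test('RP - FOOBAR'));  // false: single space before the hyphen
```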

For example, the parser used to fail on this:

RP  - FOOBAR

The expected value for the "RP" tag is one of:

- IN FILE
- NOT IN FILE
- ON REQUEST (mm/dd/yyyy)

Now the parser will simply take the content as is and will make best-effort attempts
to make sense of the data.
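For the "RP" tag, the best-effort behaviour could look like this sketch (modelled on the change in index.js below; the `parseRP` helper name is hypothetical):

```javascript
// Pair a list of keys with a list of values into an object.
const zip = (keys, values) =>
  keys.reduce((o, k, i) => (o[k] = values[i], o), {});

// Best-effort interpretation of an RP value: recognise the two
// literal statuses, otherwise assume "ON REQUEST" and pull the
// first mm/dd/yyyy date out of the content.
const parseRP = (value) =>
  (value === 'IN FILE' || value === 'NOT IN FILE')
    ? { status: value }
    : { status: 'ON REQUEST',
        date: zip(['month', 'day', 'year'],
                  value.match(/\d{2}\/\d{2}\/\d{4}/)[0].split('/')) };

console.log(parseRP('NOT IN FILE'));
// { status: 'NOT IN FILE' }
console.log(parseRP('ON REQUEST (06/30/2020)'));
// { status: 'ON REQUEST', date: { month: '06', day: '30', year: '2020' } }
```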

In a nutshell: do not use the parser as a validation tool anymore.
customcommander committed Jul 5, 2020
1 parent c5eded6 commit 3b8a16f
Showing 10 changed files with 158 additions and 120 deletions.
3 changes: 3 additions & 0 deletions Makefile
@@ -3,6 +3,9 @@ burrito-test: /tmp/ris.burrito-test
sample: grammar.js sample.ris
cat sample.ris | yarn -s nearley-test -q grammar.js | tee out.txt

samples/%.json: samples/%.ris
node -p -e 'const fs = require("fs"); const parse = require("./index.js"); JSON.stringify(parse(fs.readFileSync("$^","utf-8")));' | jq --sort-keys '.' > $@

parse: grammar.js
node -p -e 'const fs = require("fs"); const parse = require("./index.js"); console.log(parse(fs.readFileSync("./sample.ris","utf-8")));'

24 changes: 8 additions & 16 deletions grammar.js
@@ -18,22 +18,14 @@ function id(x) { return x[0]; }
{"name": "reference$ebnf$1", "symbols": []},
{"name": "reference$ebnf$1", "symbols": ["reference$ebnf$1", "entry"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}},
{"name": "reference", "symbols": ["start", "reference$ebnf$1", "end"], "postprocess": ([type, entries]) => [type, entries]},
{"name": "start", "symbols": [(lexer.has("type") ? {type: "type"} : type), (lexer.has("sep") ? {type: "sep"} : sep), (lexer.has("type_value") ? {type: "type_value"} : type_value), (lexer.has("newline") ? {type: "newline"} : newline)], "postprocess": ([{value: key},,{value}]) => ({key, value})},
{"name": "end", "symbols": [(lexer.has("end") ? {type: "end"} : end), (lexer.has("sep") ? {type: "sep"} : sep), "_"], "postprocess": () => null},
{"name": "entry$subexpression$1", "symbols": ["entry_std"]},
{"name": "entry$subexpression$1", "symbols": ["entry_name"]},
{"name": "entry$subexpression$1", "symbols": ["entry_date"]},
{"name": "entry$subexpression$1", "symbols": ["entry_pubyear"]},
{"name": "entry$subexpression$1", "symbols": ["entry_reprint"]},
{"name": "entry", "symbols": ["entry$subexpression$1"], "postprocess": ([[entry]]) => entry},
{"name": "entry_std$ebnf$1", "symbols": ["std_value"]},
{"name": "entry_std$ebnf$1", "symbols": ["entry_std$ebnf$1", "std_value"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}},
{"name": "entry_std", "symbols": [(lexer.has("std") ? {type: "std"} : std), (lexer.has("sep") ? {type: "sep"} : sep), "entry_std$ebnf$1"], "postprocess": ([{value: key},,value]) => ({key, value: value.join(' ')})},
{"name": "std_value", "symbols": [(lexer.has("std_value") ? {type: "std_value"} : std_value), (lexer.has("newline") ? {type: "newline"} : newline)], "postprocess": ([{value}]) => value},
{"name": "entry_name", "symbols": [(lexer.has("name") ? {type: "name"} : name), (lexer.has("sep") ? {type: "sep"} : sep), (lexer.has("name_value") ? {type: "name_value"} : name_value), (lexer.has("newline") ? {type: "newline"} : newline)], "postprocess": ([{value: key},,{value}]) => ({key, value})},
{"name": "entry_date", "symbols": [(lexer.has("date") ? {type: "date"} : date), (lexer.has("sep") ? {type: "sep"} : sep), (lexer.has("date_value") ? {type: "date_value"} : date_value), (lexer.has("newline") ? {type: "newline"} : newline)], "postprocess": ([{value: key},,{value}]) => ({key, value})},
{"name": "entry_pubyear", "symbols": [(lexer.has("pubyear") ? {type: "pubyear"} : pubyear), (lexer.has("sep") ? {type: "sep"} : sep), (lexer.has("pubyear_value") ? {type: "pubyear_value"} : pubyear_value), (lexer.has("newline") ? {type: "newline"} : newline)], "postprocess": ([{value: key},,{value}]) => ({key, value})},
{"name": "entry_reprint", "symbols": [(lexer.has("reprint") ? {type: "reprint"} : reprint), (lexer.has("sep") ? {type: "sep"} : sep), (lexer.has("reprint_value") ? {type: "reprint_value"} : reprint_value), (lexer.has("newline") ? {type: "newline"} : newline)], "postprocess": ([{value: key},,{value}]) => ({key, value})}
{"name": "start", "symbols": [(lexer.has("type") ? {type: "type"} : type), (lexer.has("sep") ? {type: "sep"} : sep), (lexer.has("value") ? {type: "value"} : value), (lexer.has("newline") ? {type: "newline"} : newline)], "postprocess": ([{value: key},,{value}]) => ({key, value})},
{"name": "end$ebnf$1", "symbols": []},
{"name": "end$ebnf$1", "symbols": ["end$ebnf$1", (lexer.has("value") ? {type: "value"} : value)], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}},
{"name": "end", "symbols": [(lexer.has("end") ? {type: "end"} : end), (lexer.has("sep") ? {type: "sep"} : sep), "end$ebnf$1", "_"], "postprocess": () => null},
{"name": "entry$ebnf$1", "symbols": ["value"]},
{"name": "entry$ebnf$1", "symbols": ["entry$ebnf$1", "value"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}},
{"name": "entry", "symbols": [(lexer.has("tag") ? {type: "tag"} : tag), (lexer.has("sep") ? {type: "sep"} : sep), "entry$ebnf$1"], "postprocess": ([{value: key},,value]) => ({key, value: value.join(' ')})},
{"name": "value", "symbols": [(lexer.has("value") ? {type: "value"} : value), (lexer.has("newline") ? {type: "newline"} : newline)], "postprocess": ([{value}]) => value}
]
, ParserStart: "ris"
}
29 changes: 5 additions & 24 deletions grammar.ne
@@ -13,37 +13,18 @@ reference ->
{% ([type, entries]) => [type, entries] %}

start ->
%type %sep %type_value %newline
%type %sep %value %newline
{% ([{value: key},,{value}]) => ({key, value}) %}

end ->
%end %sep _
%end %sep %value:* _
{% () => null %}

entry ->
(entry_std | entry_name | entry_date | entry_pubyear | entry_reprint)
{% ([[entry]]) => entry %}

entry_std ->
%std %sep std_value:+
%tag %sep value:+
{% ([{value: key},,value]) => ({key, value: value.join(' ')}) %}

std_value ->
%std_value %newline
value ->
%value %newline
{% ([{value}]) => value %}

entry_name ->
%name %sep %name_value %newline
{% ([{value: key},,{value}]) => ({key, value}) %}

entry_date ->
%date %sep %date_value %newline
{% ([{value: key},,{value}]) => ({key, value}) %}

entry_pubyear ->
%pubyear %sep %pubyear_value %newline
{% ([{value: key},,{value}]) => ({key, value}) %}

entry_reprint ->
%reprint %sep %reprint_value %newline
{% ([{value: key},,{value}]) => ({key, value}) %}
38 changes: 27 additions & 11 deletions index.js
@@ -5,6 +5,11 @@
const nearley = require('nearley');
const grammar = require('./grammar.js');

const zip =
(keys, values) =>
keys.reduce((o, k, i) =>
(o[k] = values[i], o), {});

const append =
(acc, {key, value}) =>
( acc[key] = (acc[key] || []).concat(value)
@@ -15,10 +20,20 @@ const add =
( acc[key] = value
, acc );

// Convert US date to date object
// e.g. "06/30/2020" -> {year: "2020", month: "06", day: "30"}
const from_mdy =
str =>
zip(['month', 'day', 'year'], str.split('/'));

const name_obj =
([last_name = '', first_name = '', suffix = '']) =>
({last_name, first_name, suffix});

const name_add =
(acc, {key, value: [last_name, first_name, suffix = '']}) =>
(acc, {key, value}) =>
( acc[key] = acc[key] || []
, acc[key].push({last_name, first_name, suffix})
, acc[key].push(name_obj(value.split(',').map(s => s.trim())))
, acc );

const defaults =
@@ -52,8 +67,11 @@ const OPS =
, CP: add
, CT: add
, CY: add
, DA: (acc, {value: [year, month, day, info]}) =>
( acc.DA = {year, month, day, info}
, DA: (acc, {value}) =>
( acc.DA =
( /(?:\d{4})?\/(?:(?:\d\d)?\/){2}(?:[A-Za-z \-]+)?/.test(value)
? zip(['year', 'month', 'day', 'info'], value.split('/'))
: value )
, acc )
, DB: add
, DO: add
@@ -88,13 +106,11 @@ const OPS =
, PY: add
, RI: add
, RN: add
, RP: (acc, {value: {status, date}}) =>
( acc.RP = status !== 'ON REQUEST'
? {status}
: {status, date: { year: date[2]
, month: date[0]
, day: date[1]
}}
, RP: (acc, {value}) =>
( acc.RP = (value === 'IN FILE' || value === 'NOT IN FILE')
? { status: value }
: { status: 'ON REQUEST'
, date: from_mdy(value.match(/\d{2}\/\d{2}\/\d{4}/)[0]) }
, acc )
, SE: add
, SN: add
72 changes: 4 additions & 68 deletions lexer.js
@@ -8,76 +8,12 @@ const lexer =
moo.compile
( { newline: {match: /\n/, lineBreaks: true}
, sep: " - "
/*
These tokens define the start of a new record entry:
TY - JOUR
KW - foo
ER -
So they should be followed by ` - ` (SPACE SPACE HYPHEN SPACE)
to be considered as such.
We use a positive lookahead to make sure that
they aren't accidentally detected when parsing tag content.
For example:
KW - TY foo bar
In this case `TY` shouldn't be interpreted as a token but
as part of the content of the `KW` tag.
>>>
*/
, type: /TY(?= - )/
, end: /ER(?= - )/
, std: /(?:AB|AD|AN|AV|BT|C1|C2|C3|C4|C5|C6|C7|C8|CA|CN|CP|CT|CY|DB|DO|DP|ED|EP|ET|ID|IS|J1|J2|JA|JF|JO|KW|L1|L2|L3|L4|LA|LB|LK|M1|M2|M3|N1|N2|NV|OP|PB|PP|RI|RN|SE|SN|SP|ST|T1|T2|T3|TA|TI|TT|U1|U2|U3|U4|U5|UR|VL|VO|Y1|Y2)(?= - )/
, name: /(?:A1|A2|A3|A4|AU)(?= - )/
, date: /DA(?= - )/
, pubyear: /PY(?= - )/
, reprint: /RP(?= - )/
/* <<< */
, type_value:
[ "ABST" , "ADVS" , "AGGR"
, "ANCIENT", "ART" , "BILL"
, "BLOG" , "BOOK" , "CASE"
, "CHAP" , "CHART" , "CLSWK"
, "COMP" , "CONF" , "CPAPER"
, "CTLG" , "DATA" , "DBASE"
, "DICT" , "EBOOK" , "ECHAP"
, "EDBOOK" , "EJOUR" , "ELEC"
, "ENCYC" , "EQUA" , "FIGURE"
, "GEN" , "GOVDOC", "GRNT"
, "HEAR" , "ICOMM" , "INPR"
, "JFULL" , "JOUR" , "LEGAL"
, "MANSCPT", "MAP" , "MGZN"
, "MPCT" , "MULTI" , "MUSIC"
, "NEWS" , "PAMP" , "PAT"
, "PCOMM" , "RPRT" , "SER"
, "SLIDE" , "SOUND" , "STAND"
, "STAT" , "THES" , "UNBILL"
, "UNPD" , "VIDEO"
]
, reprint_value:
{ match: /(?:IN FILE|NOT IN FILE|ON REQUEST \(\d{2}\/\d{2}\/\d{4}\))/
, value: m => m.startsWith('ON REQUEST')
? { status: 'ON REQUEST', date: m.match(/(\d{2})\/(\d{2})\/(\d{4})/).slice(1) }
: { status: m }
}
, date_value:
{ match: /(?:\d{4})?\/(?:(?:\d\d)?\/){2}(?:[A-Za-z \-]+)?/
, value: m => m.split('/')
}
, name_value:
{ match: /[a-zA-Z \-]+,[a-zA-Z \-\.]+(?:,[a-zA-Z\.]+)*/
, value: name => name.split(',').map(part => part.trim())
}
, pubyear_value : /\d{4}/
, std_value:
{ match: /[a-zA-Z0-9 \-\t:\/\.;]+/
, value: m => m.trim()
}
, tag: /[A-Z][A-Z0-9](?= - )/
, value: { match: /.+/
, value: m => m.trim()
}
}
);

9 changes: 9 additions & 0 deletions ris-parser.feature
@@ -277,3 +277,12 @@ Scenario: Default values
"""
When I parse the file
Then I will find a reference where 'RP' is set to '{"status": "NOT IN FILE"}'

Scenario Outline: Samples
Given I have this file <ris>
When I parse the file
Then I will get a list of references as seen in file <response>

Examples:
| ris | response |
| 01.ris | 01.json |
2 changes: 1 addition & 1 deletion sample.ris
@@ -21,7 +21,7 @@ ED - this is a ED value
DA - ///
KW - foo
KW - bar
baz
baz
UR - http://example.com; http://example.org;
http://example.net
PY - 2014
51 changes: 51 additions & 0 deletions samples/01.json
@@ -0,0 +1,51 @@
[
{
"A1": [
{
"first_name": "Jacek",
"last_name": "Borysow",
"suffix": ""
},
{
"first_name": "Lothar",
"last_name": "Frommhold",
"suffix": ""
},
{
"first_name": "George",
"last_name": "Birnbaum",
"suffix": ""
}
],
"DO": "10.1086/166112",
"JO": "The Astrophysical Journal",
"KW": [
"Absorption Spectra",
"Helium",
"Hydrogen",
"Planetary Atmospheres",
"Planetary Radiation",
"Cool Stars",
"Far Infrared Radiation",
"Molecular Collisions",
"Molecular Rotation",
"Atomic and Molecular Physics",
"LABORATORY SPECTRA",
"MOLECULAR PROCESSES",
"PLANETS: SPECTRA"
],
"N2": "The zeroth, first, and second spectral moments of the rototranslational collision-induced absorption (RT CIA) spectra of hydrogen-helium mixtures are calculated from the fundamental theory, for temperatures from 40 to 3000 K. With the help of simple analytical functions of three parameters and the information given, the RT CIA spectra of H2-He pairs can be generated on computers of small capacity, with rms deviations from exact quantum profiles of not more than a few percent. Such representations of the CIA spectra are of interest for work related to the atmospheres of the outer planets and cool stars. The theoretical spectra are in close agreement with existing laboratory measurements at various temperatures from about 77 to 3000 K.",
"RP": {
"status": "NOT IN FILE"
},
"SN": "0004-637X",
"SP": "509",
"T1": "Collision-induced Rototranslational Absorption Spectra of H 2-He Pairs at Temperatures from 40 to 3000 K",
"TY": "JOUR",
"UR": [
"https://ui.adsabs.harvard.edu/abs/1988ApJ...326..509B"
],
"VL": "326",
"Y1": "1988/03/1"
}
]
39 changes: 39 additions & 0 deletions samples/01.ris
@@ -0,0 +1,39 @@
TY - JOUR
T1 - Collision-induced Rototranslational Absorption Spectra of H 2-He Pairs at Temperatures from 40 to 3000 K
A1 - Borysow, Jacek
A1 - Frommhold, Lothar
A1 - Birnbaum, George
JO - The Astrophysical Journal
VL - 326
Y1 - 1988/03/1
SP - 509
KW - Absorption Spectra
KW - Helium
KW - Hydrogen
KW - Planetary Atmospheres
KW - Planetary Radiation
KW - Cool Stars
KW - Far Infrared Radiation
KW - Molecular Collisions
KW - Molecular Rotation
KW - Atomic and Molecular Physics
KW - LABORATORY SPECTRA
KW - MOLECULAR PROCESSES
KW - PLANETS: SPECTRA
UR - https://ui.adsabs.harvard.edu/abs/1988ApJ...326..509B
N2 - The zeroth, first, and second spectral moments of the rototranslational
collision-induced absorption (RT CIA) spectra of hydrogen-helium
mixtures are calculated from the fundamental theory, for temperatures
from 40 to 3000 K. With the help of simple analytical functions of three
parameters and the information given, the RT CIA spectra of H2-He pairs
can be generated on computers of small capacity, with rms deviations
from exact quantum profiles of not more than a few percent. Such
representations of the CIA spectra are of interest for work related to
the atmospheres of the outer planets and cool stars. The theoretical
spectra are in close agreement with existing laboratory measurements at
various temperatures from about 77 to 3000 K.
DO - 10.1086/166112
SN - 0004-637X
ER -


11 changes: 11 additions & 0 deletions steps.js
@@ -1,3 +1,5 @@
const fs = require('fs');
const path = require('path');
const assert = require('assert').strict;
const { defineStep } = require('cucumber');
const sut = require('./');
@@ -6,6 +8,10 @@ defineStep('I have this RIS file', function (file) {
this.file = file;
});

defineStep('I have this file {word}', function (file) {
this.file = fs.readFileSync(path.join(__dirname, 'samples', file), 'utf-8');
});

defineStep('I parse the file', function () {
this.list = sut(this.file);
});
@@ -14,6 +20,11 @@ defineStep('I will get a list of {int} reference(s)', function (count) {
assert.equal(this.list.length, count);
});

defineStep('I will get a list of references as seen in file {word}', function (file) {
const json = require(`./samples/${file}`)
assert.deepStrictEqual(this.list, json);
});

defineStep('I will find a reference where {string} is set to {string}', function (field, value) {
let expected_value;
try {
