lexer: Remove match-at dependency, use RegExp (#1447)

* lexer: Remove `match-at` dependency, use RegExp
* chore(package): update flow-bin to version 0.75.0
* Fix flow error
* Remove unused flow libs
* Minor fix
* Throw an error when `RegExp.exec` jumps
2025-10-11 22:18:41 +00:00 · 2018-06-28 03:13:27 +09:00
parent 12dcb05209
commit 518379aed5
10 changed files with 37 additions and 46 deletions
--- a/.flowconfig
+++ b/.flowconfig
@@ -4,8 +4,6 @@
 [include]
 [libs]
 flow-typed
 ./node_modules/stylelint/decls
 [lints]
--- a/flow-typed/match-at.js
+++ b/flow-typed/match-at.js
@@ -1,3 +0,0 @@
 declare module 'match-at' {
    declare module.exports: (re: RegExp, str: string, pos: number) => (Array<string>|null);
 }
--- a/flow-typed/object-assign.js
+++ b/flow-typed/object-assign.js
@@ -1,6 +0,0 @@
 declare module 'object-assign' {
    declare module.exports:
        <T>(target: {[string]: T}, ...sources: Array<{[string]: T}>)
            => {[string]: T};
 }
--- a/package-lock.json
+++ b/package-lock.json
@@ -4212,9 +4212,9 @@
      "dev": true
    },
    "flow-bin": {
-      "version": "0.74.0",
+      "version": "0.75.0",
-      "resolved": "https://registry.npmjs.org/flow-bin/-/flow-bin-0.74.0.tgz",
+      "resolved": "https://registry.npmjs.org/flow-bin/-/flow-bin-0.75.0.tgz",
-      "integrity": "sha512-tIN9J5qg71S4UbofCu80tve8a+p7Hj7ytwUtu79cLg9KJVVTNnVVJXKgCghVzaZT1Rvl9SMHVPlDs9uYhPHEGQ==",
+      "integrity": "sha1-uW0e6Z07RGoyJr5mtAEyJM6d8mA=",
      "dev": true
    },
    "flush-write-stream": {
@@ -7430,11 +7430,6 @@
      "integrity": "sha512-NcWuJFHDA8V3wkDgR/j4+gZx+YQwstPgfQDV8ndUeWWzta3dnDTBxpVzqS9lkmJAuV5YX35lmyojl6HO5JXAgw==",
      "dev": true
    },
    "match-at": {
      "version": "0.1.1",
      "resolved": "https://registry.npmjs.org/match-at/-/match-at-0.1.1.tgz",
      "integrity": "sha512-h4Yd392z9mST+dzc+yjuybOGFNOZjmXIPKWjxBd1Bb23r4SmDOsk2NYCU2BMUBGbSpZqwVsZYNq26QS3xfaT3Q=="
    },
    "math-expression-evaluator": {
      "version": "1.2.17",
      "resolved": "https://registry.npmjs.org/math-expression-evaluator/-/math-expression-evaluator-1.2.17.tgz",
--- a/package.json
+++ b/package.json
@@ -30,7 +30,7 @@
    "eslint": "^5.0.0",
    "eslint-plugin-flowtype": "^2.40.1",
    "file-loader": "^1.1.11",
-    "flow-bin": "^0.74.0",
+    "flow-bin": "^0.75.0",
    "husky": "^1.0.0-rc.8",
    "jest": "^23.0.1",
    "jest-serializer-html": "^5.0.0",
@@ -80,7 +80,6 @@
    "dist:dist": "rimraf dist/ && cp -r build/katex/ dist/"
  },
  "dependencies": {
    "match-at": "^0.1.1",
    "nomnom": "^1.8.1"
  },
  "husky": {
--- a/src/Lexer.js
+++ b/src/Lexer.js
@@ -12,7 +12,6 @@
 * kinds.
 */
 import matchAt from "match-at";
 import ParseError from "./ParseError";
 import SourceLocation from "./SourceLocation";
 import {LexerInterface, Token} from "./Token";
@@ -44,8 +43,7 @@ const controlWordWhitespaceRegex = new RegExp(
 const combiningDiacriticalMarkString = "[\u0300-\u036f]";
 export const combiningDiacriticalMarksEndRegex =
    new RegExp(`${combiningDiacriticalMarkString}+$`);
-const tokenRegex = new RegExp(
+const tokenRegexString = `(${spaceRegexString}+)|` +  // whitespace
    `(${spaceRegexString}+)|` +                       // whitespace
    `(${commentRegexString}` +                        // comments
    "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +  // single codepoint
    `${combiningDiacriticalMarkString}*` +            // ...plus accents
@@ -54,11 +52,8 @@ const tokenRegex = new RegExp(
    "|\\\\verb\\*([^]).*?\\3" +                       // \verb*
    "|\\\\verb([^*a-zA-Z]).*?\\4" +                   // \verb unstarred
    `|${controlWordWhitespaceRegexString}` +          // \macroName + spaces
-    `|${controlSymbolRegexString}` +                  // \\, \', etc.
+    `|${controlSymbolRegexString})`;                  // \\, \', etc.
    ")"
 );
 // tokenRegex has no ^ marker, as required by matchAt.
 // These regexs are for matching results from tokenRegex,
 // so they do have ^ markers.
 export const controlWordRegex = new RegExp(`^${controlWordRegexString}`);
@@ -67,12 +62,12 @@ const commentRegex = new RegExp(`^${commentRegexString}`);
 /** Main Lexer class */
 export default class Lexer implements LexerInterface {
    input: string;
-    pos: number;
+    tokenRegex: RegExp;
    constructor(input: string) {
        // Separate accents from characters
        this.input = input;
-        this.pos = 0;
+        this.tokenRegex = new RegExp(tokenRegexString, 'g');
    }
    /**
@@ -80,20 +75,17 @@ export default class Lexer implements LexerInterface {
     */
    lex(): Token {
        const input = this.input;
-        const pos = this.pos;
+        const pos = this.tokenRegex.lastIndex;
        if (pos === input.length) {
            return new Token("EOF", new SourceLocation(this, pos, pos));
        }
-        const match = matchAt(tokenRegex, input, pos);
+        const match = this.tokenRegex.exec(input);
-        if (match === null) {
+        if (match === null || match.index !== pos) {
            throw new ParseError(
                `Unexpected character: '${input[pos]}'`,
                new Token(input[pos], new SourceLocation(this, pos, pos + 1)));
        }
        let text = match[2] || " ";
        const start = this.pos;
        this.pos += match[0].length;
        const end = this.pos;
        // Trim any trailing whitespace from control word match
        const controlMatch = text.match(controlWordWhitespaceRegex);
@@ -104,7 +96,8 @@ export default class Lexer implements LexerInterface {
        if (commentRegex.test(text)) {
            return this.lex();
        } else {
-            return new Token(text, new SourceLocation(this, start, end));
+            return new Token(text, new SourceLocation(this, pos,
                this.tokenRegex.lastIndex));
        }
    }
 }
--- a/src/Token.js
+++ b/src/Token.js
@@ -5,7 +5,7 @@ import SourceLocation from "./SourceLocation";
 * Interface required to break circular dependency between Token, Lexer, and
 * ParseError.
 */
-export interface LexerInterface {input: string, pos: number}
+export interface LexerInterface {input: string, tokenRegex: RegExp}
 /**
 * The resulting token returned from `lex`.
@@ -43,4 +43,3 @@ export class Token {
        return new Token(text, SourceLocation.range(this, endToken));
    }
 }
--- a/test/snapshots/katex-spec.js.snap
+++ b/test/snapshots/katex-spec.js.snap
@@ -27,7 +27,9 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = `
                        "end": 37,
                        "lexer": {
                          "input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}",
-                          "pos": 56
+                          "tokenRegex": {
                            "lastIndex": 56
                          }
                        },
                        "start": 36
                      },
@@ -56,7 +58,9 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = `
                        "end": 39,
                        "lexer": {
                          "input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}",
-                          "pos": 56
+                          "tokenRegex": {
                            "lastIndex": 56
                          }
                        },
                        "start": 38
                      },
@@ -87,7 +91,9 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = `
                        "end": 42,
                        "lexer": {
                          "input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}",
-                          "pos": 56
+                          "tokenRegex": {
                            "lastIndex": 56
                          }
                        },
                        "start": 41
                      },
@@ -116,7 +122,9 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = `
                        "end": 44,
                        "lexer": {
                          "input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}",
-                          "pos": 56
+                          "tokenRegex": {
                            "lastIndex": 56
                          }
                        },
                        "start": 43
                      },
--- a/test/errors-spec.js
+++ b/test/errors-spec.js
@@ -285,9 +285,9 @@ describe("Lexer:", function() {
    describe("#_innerLex", function() {
        it("rejects lone surrogate char", function() {
-            expect("\udcba").toFailWithParseError(
+            expect("\udcba ").toFailWithParseError(
                   "Unexpected character: '\udcba' at position 1:" +
-                    " \udcba\u0332");
+                    " \udcba\u0332 ");
        });
        it("rejects lone backslash at end of input", function() {
            expect("\\").toFailWithParseError(
--- a/test/setup.js
+++ b/test/setup.js
@@ -23,9 +23,17 @@ const typeFirstCompare = (a, b) => {
    }
 };
 const regExpReplacer = (key, value) => {
    return value instanceof RegExp ? {lastIndex: value.lastIndex} : value;
 };
 const serializer = {
    print(val) {
-        return stringify(val, {cmp: typeFirstCompare, space: '  '});
+        return stringify(val, {
            cmp: typeFirstCompare,
            space: '  ',
            replacer: regExpReplacer,
        });
    },
    test(val) {
        // Leave strings (e.g. XML) to other serializers