lexer: Remove match-at dependency, use RegExp (#1447)

* lexer: Remove `match-at` dependency, use RegExp

* chore(package): update flow-bin to version 0.75.0

* Fix flow error

* Remove unused flow libs

* Minor fix

* Throw an error when `RegExp.exec` jumps
This commit is contained in:
ylemkimon
2018-06-28 03:13:27 +09:00
committed by GitHub
parent 12dcb05209
commit 518379aed5
10 changed files with 37 additions and 46 deletions

View File

@@ -12,7 +12,6 @@
* kinds.
*/
import matchAt from "match-at";
import ParseError from "./ParseError";
import SourceLocation from "./SourceLocation";
import {LexerInterface, Token} from "./Token";
@@ -44,8 +43,7 @@ const controlWordWhitespaceRegex = new RegExp(
const combiningDiacriticalMarkString = "[\u0300-\u036f]";
export const combiningDiacriticalMarksEndRegex =
new RegExp(`${combiningDiacriticalMarkString}+$`);
const tokenRegex = new RegExp(
`(${spaceRegexString}+)|` + // whitespace
const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
`(${commentRegexString}` + // comments
"|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
`${combiningDiacriticalMarkString}*` + // ...plus accents
@@ -54,11 +52,8 @@ const tokenRegex = new RegExp(
"|\\\\verb\\*([^]).*?\\3" + // \verb*
"|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred
`|${controlWordWhitespaceRegexString}` + // \macroName + spaces
`|${controlSymbolRegexString}` + // \\, \', etc.
")"
);
`|${controlSymbolRegexString})`; // \\, \', etc.
// tokenRegex has no ^ marker, as required by matchAt.
// These regexes are for matching results from tokenRegex,
// so they do have ^ markers.
export const controlWordRegex = new RegExp(`^${controlWordRegexString}`);
@@ -67,12 +62,12 @@ const commentRegex = new RegExp(`^${commentRegexString}`);
/** Main Lexer class */
export default class Lexer implements LexerInterface {
input: string;
pos: number;
tokenRegex: RegExp;
constructor(input: string) {
// Separate accents from characters
this.input = input;
this.pos = 0;
this.tokenRegex = new RegExp(tokenRegexString, 'g');
}
/**
@@ -80,20 +75,17 @@ export default class Lexer implements LexerInterface {
*/
lex(): Token {
const input = this.input;
const pos = this.pos;
const pos = this.tokenRegex.lastIndex;
if (pos === input.length) {
return new Token("EOF", new SourceLocation(this, pos, pos));
}
const match = matchAt(tokenRegex, input, pos);
if (match === null) {
const match = this.tokenRegex.exec(input);
if (match === null || match.index !== pos) {
throw new ParseError(
`Unexpected character: '${input[pos]}'`,
new Token(input[pos], new SourceLocation(this, pos, pos + 1)));
}
let text = match[2] || " ";
const start = this.pos;
this.pos += match[0].length;
const end = this.pos;
// Trim any trailing whitespace from control word match
const controlMatch = text.match(controlWordWhitespaceRegex);
@@ -104,7 +96,8 @@ export default class Lexer implements LexerInterface {
if (commentRegex.test(text)) {
return this.lex();
} else {
return new Token(text, new SourceLocation(this, start, end));
return new Token(text, new SourceLocation(this, pos,
this.tokenRegex.lastIndex));
}
}
}

View File

@@ -5,7 +5,7 @@ import SourceLocation from "./SourceLocation";
* Interface required to break circular dependency between Token, Lexer, and
* ParseError.
*/
export interface LexerInterface {input: string, pos: number}
export interface LexerInterface {input: string, tokenRegex: RegExp}
/**
* The resulting token returned from `lex`.
@@ -43,4 +43,3 @@ export class Token {
return new Token(text, SourceLocation.range(this, endToken));
}
}