Add catcode to Lexer, move comment parsing back to Lexer (#1789)

* Remove redundant consumeSpaces()
  - Spaces after a command sequence are ignored in the Lexer
  - parseExpression consumes spaces in math mode
* Add catcode to Lexer, move comment parsing back to Lexer
  - Fix parsing a comment before a sup/subscript argument
  - Fix parsing a comment before an expression
  - Fix parsing a comment before or between \hline
  - Fix parsing a comment in a macro definition
  - Fix parsing a comment that includes a command sequence
* Update Lexer.js
* Update Parser.js
* catcode -> catcodes
2025-10-09 04:58:40 +00:00 · 2018-11-25 08:42:14 +09:00
parent ec6a2b4f36
commit 3dfd17d9b4
8 changed files with 62 additions and 55 deletions
--- a/src/Lexer.js
+++ b/src/Lexer.js
@@ -17,6 +17,7 @@ import SourceLocation from "./SourceLocation";
 import {Token} from "./Token";

 import type {LexerInterface} from "./Token";
+import type Settings from "./Settings";

 /* The following tokenRegex
 * - matches typical whitespace (but not NBSP etc.) using its first group
@@ -53,19 +54,26 @@ const tokenRegexString = `(${spaceRegexString}+)|` +  // whitespace
    `|${controlWordWhitespaceRegexString}` +          // \macroName + spaces
    `|${controlSymbolRegexString})`;                  // \\, \', etc.

-// These regexs are for matching results from tokenRegex,
-// so they do have ^ markers.
-export const controlWordRegex = new RegExp(`^${controlWordRegexString}`);
-
 /** Main Lexer class */
 export default class Lexer implements LexerInterface {
    input: string;
+    settings: Settings;
    tokenRegex: RegExp;
+    // category codes, only supports comment characters (14) for now
+    catcodes: {[string]: number};

-    constructor(input: string) {
+    constructor(input: string, settings: Settings) {
        // Separate accents from characters
        this.input = input;
+        this.settings = settings;
        this.tokenRegex = new RegExp(tokenRegexString, 'g');
+        this.catcodes = {
+            "%": 14, // comment character
+        };
+    }
+
+    setCatcode(char: string, code: number) {
+        this.catcodes[char] = code;
    }

    /**
@@ -85,6 +93,19 @@ export default class Lexer implements LexerInterface {
        }
        let text = match[2] || " ";

+        if (this.catcodes[text] === 14) { // comment character
+            const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex);
+            if (nlIndex === -1) {
+                this.tokenRegex.lastIndex = input.length; // EOF
+                this.settings.reportNonstrict("commentAtEnd",
+                    "% comment has no terminating newline; LaTeX would " +
+                    "fail because of commenting the end of math mode (e.g. $)");
+            } else {
+                this.tokenRegex.lastIndex = nlIndex + 1;
+            }
+            return this.lex();
+        }
+
        // Trim any trailing whitespace from control word match
        const controlMatch = text.match(controlWordWhitespaceRegex);
        if (controlMatch) {
--- a/src/MacroExpander.js
+++ b/src/MacroExpander.js
@@ -50,7 +50,7 @@ export default class MacroExpander implements MacroContextInterface {
     * (with existing macros etc.).
     */
    feed(input: string) {
-        this.lexer = new Lexer(input);
+        this.lexer = new Lexer(input, this.settings);
    }

    /**
@@ -314,7 +314,7 @@ export default class MacroExpander implements MacroContextInterface {
                    ++numArgs;
                }
            }
-            const bodyLexer = new Lexer(expansion);
+            const bodyLexer = new Lexer(expansion, this.settings);
            const tokens = [];
            let tok = bodyLexer.lex();
            while (tok.text !== "EOF") {
@@ -343,4 +343,3 @@ export default class MacroExpander implements MacroContextInterface {
            implicitCommands.hasOwnProperty(name);
    }
 }
-
--- a/src/Parser.js
+++ b/src/Parser.js
@@ -363,8 +363,6 @@ export default class Parser {
                }
                // Put everything into an ordgroup as the superscript
                superscript = {type: "ordgroup", mode: this.mode, body: primes};
-            } else if (lex.text === "%") {
-                this.consumeComment();
            } else {
                // If it wasn't ^, _, or ', stop parsing super/subscripts
                break;
@@ -414,6 +412,11 @@ export default class Parser {
                "Can't use function '" + func + "' in math mode", token);
        }

+        // hyperref package sets the catcode of % as an active character
+        if (funcData.argTypes && funcData.argTypes[0] === "url") {
+            this.gullet.lexer.setCatcode("%", 13);
+        }
+
        // Consume the command token after possibly switching to the
        // mode specified by the function (for instant mode switching),
        // and then immediately switch back.
@@ -555,27 +558,6 @@ export default class Parser {
        }
    }

-    consumeComment() {
-        // the newline character is normalized in Lexer, check original source
-        while (this.nextToken.text !== "EOF" && this.nextToken.loc &&
-                this.nextToken.loc.getSource().indexOf("\n") === -1) {
-            this.consume();
-        }
-        if (this.nextToken.text === "EOF") {
-            this.settings.reportNonstrict("commentAtEnd",
-                "% comment has no terminating newline; LaTeX would " +
-                "fail because of commenting the end of math mode (e.g. $)");
-        }
-        if (this.mode === "math") {
-            this.consumeSpaces(); // ignore spaces in math mode
-        } else if (this.nextToken.loc) { // text mode
-            const source = this.nextToken.loc.getSource();
-            if (source.indexOf("\n") === source.length - 1) {
-                this.consumeSpaces(); // if no space after the first newline
-            }
-        }
-    }
-
    /**
     * Parses a group, essentially returning the string formed by the
     * brace-enclosed tokens plus some position information.
@@ -594,6 +576,7 @@ export default class Parser {
            } else if (raw && nextToken.text !== "EOF" &&
                    /[^{}[\]]/.test(nextToken.text)) {
                // allow a single character in raw string group
+                this.gullet.lexer.setCatcode("%", 14); // reset the catcode of %
                this.consume();
                return nextToken;
            }
@@ -611,12 +594,6 @@ export default class Parser {
                    throw new ParseError(
                        "Unexpected end of input in " + modeName,
                        firstToken.range(lastToken, str));
-                case "%":
-                    if (!raw) { // allow % in raw string group
-                        this.consumeComment();
-                        continue;
-                    }
-                    break;
                case groupBegin:
                    nested++;
                    break;
@@ -629,6 +606,7 @@ export default class Parser {
            this.consume();
        }
        this.mode = outerMode;
+        this.gullet.lexer.setCatcode("%", 14); // reset the catcode of %
        this.expect(groupEnd);
        return firstToken.range(lastToken, str);
    }
@@ -647,12 +625,8 @@ export default class Parser {
        const firstToken = this.nextToken;
        let lastToken = firstToken;
        let str = "";
-        while (this.nextToken.text !== "EOF" && (regex.test(
-                str + this.nextToken.text) || this.nextToken.text === "%")) {
-            if (this.nextToken.text === "%") {
-                this.consumeComment();
-                continue;
-            }
+        while (this.nextToken.text !== "EOF" &&
+                regex.test(str + this.nextToken.text)) {
            lastToken = this.nextToken;
            str += lastToken.text;
            this.consume();
@@ -914,9 +888,6 @@ export default class Parser {
                body: arg,
                star,
            };
-        } else if (text === "%") {
-            this.consumeComment();
-            return this.parseSymbol();
        }
        // At this point, we should have a symbol, possibly with accents.
        // First expand any accented base symbol according to unicodeSymbols.
--- a/src/SourceLocation.js
+++ b/src/SourceLocation.js
@@ -17,10 +17,6 @@ export default class SourceLocation {
        this.end = end;
    }

-    getSource(): string {
-        return this.lexer.input.slice(this.start, this.end);
-    }
-
    /**
     * Merges two `SourceLocation`s from location providers, given they are
     * provided in order of appearance.
--- a/src/functions/font.js
+++ b/src/functions/font.js
@@ -99,7 +99,6 @@ defineFunction({
    },
    handler: ({parser, funcName, breakOnTokenText}, args) => {
        const {mode} = parser;
-        parser.consumeSpaces();
        const body = parser.parseExpression(true, breakOnTokenText);
        const style = `math${funcName.slice(1)}`;

--- a/src/functions/sizing.js
+++ b/src/functions/sizing.js
@@ -61,7 +61,6 @@ defineFunction({
        allowedInText: true,
    },
    handler: ({breakOnTokenText, funcName, parser}, args) => {
-        parser.consumeSpaces();
        const body = parser.parseExpression(false, breakOnTokenText);

        return {
--- a/src/functions/styling.js
+++ b/src/functions/styling.js
@@ -25,7 +25,6 @@ defineFunction({
    },
    handler({breakOnTokenText, funcName, parser}, args) {
        // parse out the implicit body
-        parser.consumeSpaces();
        const body = parser.parseExpression(true, breakOnTokenText);

        // TODO: Refactor to avoid duplicating styleMap in multiple places (e.g.
--- a/test/katex-spec.js
+++ b/test/katex-spec.js
@@ -1627,6 +1627,8 @@ describe("A comment parser", function() {

    it("should parse comments between subscript and superscript", () => {
        expect("x_3 %comment\n^2").toParseLike`x_3^2`;
+        expect("x^ %comment\n{2}").toParseLike`x^{2}`;
+        expect("x^ %comment\n\\frac{1}{2}").toParseLike`x^\frac{1}{2}`;
    });

    it("should parse comments in size and color groups", () => {
@@ -1635,6 +1637,24 @@ describe("A comment parser", function() {
        expect("\\color{#f00%red\n}").toParse();
    });

+    it("should parse comments before an expression", () => {
+        expect("%comment\n{2}").toParseLike`{2}`;
+    });
+
+    it("should parse comments before and between \\hline", () => {
+        expect("\\begin{matrix}a&b\\\\ %hline\n" +
+            "\\hline %hline\n" +
+            "\\hline c&d\\end{matrix}").toParse();
+    });
+
+    it("should parse comments in the macro definition", () => {
+        expect("\\def\\foo{1 %}\n2}\n\\foo").toParseLike`12`;
+    });
+
+    it("should not expand nor ignore spaces after a command sequence in a comment", () => {
+        expect("\\def\\foo{1\n2}\nx %\\foo\n").toParseLike`x`;
+    });
+
    it("should not parse a comment without newline in strict mode", () => {
        expect`x%y`.not.toParse(strictSettings);
        expect`x%y`.toParse(nonstrictSettings);
@@ -2586,9 +2606,8 @@ describe("href and url commands", function() {

    it("should allow single-character URLs", () => {
        expect`\href%end`.toParseLike("\\href{%}end");
-        expect`\href %end`.toParseLike("\\href{%}end");
        expect("\\url%end").toParseLike("\\url{%}end");
-        expect("\\url %end").toParseLike("\\url{%}end");
+        expect("\\url%%end\n").toParseLike("\\url{%}");
        expect("\\url end").toParseLike("\\url{e}nd");
        expect("\\url%end").toParseLike("\\url {%}end");
    });
@@ -2630,6 +2649,10 @@ describe("href and url commands", function() {
        expect(parsed2.href).toBe(url);
    });

+    it("should allow comments after URLs", function() {
+        expect("\\url{http://example.com/}%comment\n").toBuild();
+    });
+
    it("should be marked up correctly", function() {
        const markup = katex.renderToString(r`\href{http://example.com/}{example here}`);
        expect(markup).toContain("<a href=\"http://example.com/\">");