Add raw string group, move comment parsing to Parser, change URL group parser (#1711)

* Add raw string group
* Move comment parsing to Parser
* Use raw string group in URL group parser
* Update types.js
* Add multi-level nested url test
Committed by: Kevin Barabash
Parent: ba8e224b8d
Commit: 3907545e2c
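The heart of the change is the new "raw" argument group: the parser reads a brace-delimited argument verbatim, counting nested braces and letting characters such as % and \ pass through untouched, which is what lets \url and \href accept multi-level nested braces. Below is a minimal standalone sketch of that scan (illustrative only, not KaTeX's actual code; readRawGroup is a hypothetical helper):

// Reads a brace-delimited raw group starting at `start`, allowing nested
// braces, and returns the enclosed text verbatim ('%' and '\' included).
function readRawGroup(input, start) {
    if (input[start] !== "{") {
        throw new Error("Expected '{' at position " + start);
    }
    let nested = 0;
    let str = "";
    for (let i = start + 1; i < input.length; i++) {
        const ch = input[i];
        if (ch === "{") {
            nested++;
        } else if (ch === "}") {
            if (nested === 0) {
                return str; // matching close brace: group is complete
            }
            nested--;
        }
        str += ch;
    }
    throw new Error("Unexpected end of input in raw group");
}

// readRawGroup("\\url{http://example.org/{{}t{oo}}}", 4)
//     === "http://example.org/{{}t{oo}}"

Parser.parseStringGroup in the diff below implements the same idea with a `nested` counter guarded by a `raw` flag.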
src/Lexer.js (31 lines changed)
@@ -17,11 +17,9 @@ import SourceLocation from "./SourceLocation";
 import {Token} from "./Token";
 
 import type {LexerInterface} from "./Token";
-import type Settings from "./Settings";
 
 /* The following tokenRegex
  * - matches typical whitespace (but not NBSP etc.) using its first group
- * - matches comments (must have trailing newlines)
  * - does not match any control character \x00-\x1f except whitespace
  * - does not match a bare backslash
  * - matches any ASCII character except those just mentioned
@@ -36,7 +34,6 @@ import type Settings from "./Settings";
  * still reject the input.
  */
 const spaceRegexString = "[ \r\n\t]";
-const commentRegexString = "%[^\n]*(?:\n|$)";
 const controlWordRegexString = "\\\\[a-zA-Z@]+";
 const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";
 const controlWordWhitespaceRegexString =
@@ -46,37 +43,28 @@ const controlWordWhitespaceRegex = new RegExp(
 const combiningDiacriticalMarkString = "[\u0300-\u036f]";
 export const combiningDiacriticalMarksEndRegex =
     new RegExp(`${combiningDiacriticalMarkString}+$`);
-const urlFunctionRegexString = "(\\\\href|\\\\url)" +
-    `(?:${spaceRegexString}*\\{((?:[^{}\\\\]|\\\\[^]|{[^{}]*})*)\\}` +
-    `|${spaceRegexString}+([^{}])` +
-    `|${spaceRegexString}*([^{}a-zA-Z]))`;
 const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
-    `(${commentRegexString}` + // comments
-    "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
+    "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
    `${combiningDiacriticalMarkString}*` + // ...plus accents
     "|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair
     `${combiningDiacriticalMarkString}*` + // ...plus accents
     "|\\\\verb\\*([^]).*?\\3" + // \verb*
     "|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred
-    `|${urlFunctionRegexString}` + // URL arguments
     `|${controlWordWhitespaceRegexString}` + // \macroName + spaces
     `|${controlSymbolRegexString})`; // \\, \', etc.
 
 // These regexs are for matching results from tokenRegex,
 // so they do have ^ markers.
 export const controlWordRegex = new RegExp(`^${controlWordRegexString}`);
-export const urlFunctionRegex = new RegExp(`^${urlFunctionRegexString}`);
 
 /** Main Lexer class */
 export default class Lexer implements LexerInterface {
     input: string;
-    settings: Settings;
     tokenRegex: RegExp;
 
-    constructor(input: string, settings: Settings) {
+    constructor(input: string) {
         // Separate accents from characters
         this.input = input;
-        this.settings = settings;
         this.tokenRegex = new RegExp(tokenRegexString, 'g');
     }
 
@@ -100,19 +88,10 @@ export default class Lexer implements LexerInterface {
         // Trim any trailing whitespace from control word match
         const controlMatch = text.match(controlWordWhitespaceRegex);
         if (controlMatch) {
-            text = controlMatch[1] + text.slice(controlMatch[0].length);
+            text = controlMatch[1];
         }
 
-        if (text[0] === "%") {
-            if (text[text.length - 1] !== "\n") {
-                this.settings.reportNonstrict("commentAtEnd",
-                    "% comment has no terminating newline; LaTeX would " +
-                    "fail because of commenting the end of math mode (e.g. $)");
-            }
-            return this.lex();
-        } else {
-            return new Token(text, new SourceLocation(this, pos,
-                this.tokenRegex.lastIndex));
-        }
+        return new Token(text, new SourceLocation(this, pos,
+            this.tokenRegex.lastIndex));
     }
 }
src/MacroExpander.js

@@ -50,7 +50,7 @@ export default class MacroExpander implements MacroContextInterface {
      * (with existing macros etc.).
      */
    feed(input: string) {
-        this.lexer = new Lexer(input, this.settings);
+        this.lexer = new Lexer(input);
    }
 
    /**
@@ -314,7 +314,7 @@ export default class MacroExpander implements MacroContextInterface {
                    ++numArgs;
                }
            }
-            const bodyLexer = new Lexer(expansion, this.settings);
+            const bodyLexer = new Lexer(expansion);
            const tokens = [];
            let tok = bodyLexer.lex();
            while (tok.text !== "EOF") {
src/Parser.js (166 lines changed)
@@ -11,7 +11,7 @@ import unicodeSymbols from "./unicodeSymbols";
 import utils from "./utils";
 import {assertNodeType, checkNodeType} from "./parseNode";
 import ParseError from "./ParseError";
-import {combiningDiacriticalMarksEndRegex, urlFunctionRegex} from "./Lexer";
+import {combiningDiacriticalMarksEndRegex} from "./Lexer";
 import Settings from "./Settings";
 import SourceLocation from "./SourceLocation";
 import {Token} from "./Token";
@@ -405,6 +405,8 @@ export default class Parser {
                }
                // Put everything into an ordgroup as the superscript
                superscript = {type: "ordgroup", mode: this.mode, body: primes};
+            } else if (lex.text === "%") {
+                this.consumeComment();
            } else {
                // If it wasn't ^, _, or ', stop parsing super/subscripts
                break;
@@ -658,9 +660,15 @@
            return this.parseSizeGroup(optional);
        }
        if (type === "url") {
-            throw new ParseError(
-                "Internal bug: 'url' arguments should be handled by Lexer",
-                this.nextToken);
+            return this.parseUrlGroup(optional);
+        }
+        if (type === "raw") {
+            const token = this.parseStringGroup("raw", optional, true);
+            return token ? newArgument({
+                type: "raw",
+                mode: this.mode,
+                string: token.text,
+            }, token) : null;
        }
 
        // By the time we get here, type is one of "text" or "math".
@@ -674,6 +682,27 @@
        }
    }
 
+    consumeComment() {
+        // the newline character is normalized in Lexer, check original source
+        while (this.nextToken.text !== "EOF" && this.nextToken.loc &&
+                this.nextToken.loc.getSource().indexOf("\n") === -1) {
+            this.consume();
+        }
+        if (this.nextToken.text === "EOF") {
+            this.settings.reportNonstrict("commentAtEnd",
+                "% comment has no terminating newline; LaTeX would " +
+                "fail because of commenting the end of math mode (e.g. $)");
+        }
+        if (this.mode === "math") {
+            this.consumeSpaces(); // ignore spaces in math mode
+        } else if (this.nextToken.loc) { // text mode
+            const source = this.nextToken.loc.getSource();
+            if (source.indexOf("\n") === source.length - 1) {
+                this.consumeSpaces(); // if no space after the first newline
+            }
+        }
+    }
+
    /**
     * Parses a group, essentially returning the string formed by the
     * brace-enclosed tokens plus some position information.
@@ -681,28 +710,53 @@
    parseStringGroup(
        modeName: ArgType, // Used to describe the mode in error messages.
        optional: boolean,
+        raw?: boolean,
    ): ?Token {
-        if (optional && this.nextToken.text !== "[") {
-            return null;
+        const groupBegin = optional ? "[" : "{";
+        const groupEnd = optional ? "]" : "}";
+        const nextToken = this.nextToken;
+        if (nextToken.text !== groupBegin) {
+            if (optional) {
+                return null;
+            } else if (raw && nextToken.text !== "EOF" &&
+                    /[^{}[\]]/.test(nextToken.text)) {
+                // allow a single character in raw string group
+                this.consume();
+                return nextToken;
+            }
        }
        const outerMode = this.mode;
        this.mode = "text";
-        this.expect(optional ? "[" : "{");
+        this.expect(groupBegin);
        let str = "";
        const firstToken = this.nextToken;
+        let nested = 0; // allow nested braces in raw string group
        let lastToken = firstToken;
-        while (this.nextToken.text !== (optional ? "]" : "}")) {
-            if (this.nextToken.text === "EOF") {
-                throw new ParseError(
-                    "Unexpected end of input in " + modeName,
-                    firstToken.range(this.nextToken, str));
+        while ((raw && nested > 0) || this.nextToken.text !== groupEnd) {
+            switch (this.nextToken.text) {
+                case "EOF":
+                    throw new ParseError(
+                        "Unexpected end of input in " + modeName,
+                        firstToken.range(lastToken, str));
+                case "%":
+                    if (!raw) { // allow % in raw string group
+                        this.consumeComment();
+                        continue;
+                    }
+                    break;
+                case groupBegin:
+                    nested++;
+                    break;
+                case groupEnd:
+                    nested--;
+                    break;
            }
            lastToken = this.nextToken;
            str += lastToken.text;
            this.consume();
        }
        this.mode = outerMode;
-        this.expect(optional ? "]" : "}");
+        this.expect(groupEnd);
        return firstToken.range(lastToken, str);
    }
 
@@ -720,8 +774,12 @@
        const firstToken = this.nextToken;
        let lastToken = firstToken;
        let str = "";
-        while (this.nextToken.text !== "EOF"
-                && regex.test(str + this.nextToken.text)) {
+        while (this.nextToken.text !== "EOF" && (regex.test(
+                str + this.nextToken.text) || this.nextToken.text === "%")) {
+            if (this.nextToken.text === "%") {
+                this.consumeComment();
+                continue;
+            }
            lastToken = this.nextToken;
            str += lastToken.text;
            this.consume();
@@ -802,6 +860,34 @@
        }, res);
    }
 
+    /**
+     * Parses an URL, checking escaped letters and allowed protocols.
+     */
+    parseUrlGroup(optional: boolean): ?ParsedArg {
+        const res = this.parseStringGroup("url", optional, true); // get raw string
+        if (!res) {
+            return null;
+        }
+        // hyperref package allows backslashes alone in href, but doesn't
+        // generate valid links in such cases; we interpret this as
+        // "undefined" behaviour, and keep them as-is. Some browser will
+        // replace backslashes with forward slashes.
+        const url = res.text.replace(/\\([#$%&~_^{}])/g, '$1');
+        let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
+        protocol = (protocol != null ? protocol[1] : "_relative");
+        const allowed = this.settings.allowedProtocols;
+        if (!utils.contains(allowed, "*") &&
+            !utils.contains(allowed, protocol)) {
+            throw new ParseError(
+                `Forbidden protocol '${protocol}'`, res);
+        }
+        return newArgument({
+            type: "url",
+            mode: this.mode,
+            url,
+        }, res);
+    }
+
    /**
     * If `optional` is false or absent, this parses an ordinary group,
     * which is either a single nucleus (like "x") or an expression
@@ -913,53 +999,6 @@ export default class Parser {
            // The token will be consumed later in parseGivenFunction
            // (after possibly switching modes).
            return newFunction(nucleus);
-        } else if (/^\\(href|url)[^a-zA-Z]/.test(text)) {
-            const match = text.match(urlFunctionRegex);
-            if (!match) {
-                throw new ParseError(
-                    `Internal error: invalid URL token '${text}'`, nucleus);
-            }
-            const funcName = match[1];
-            // match[2] is the only one that can be an empty string,
-            // so it must be at the end of the following or chain:
-            const rawUrl = match[4] || match[3] || match[2];
-            // hyperref package allows backslashes alone in href, but doesn't
-            // generate valid links in such cases; we interpret this as
-            // "undefined" behaviour, and keep them as-is. Some browser will
-            // replace backslashes with forward slashes.
-            const url = rawUrl.replace(/\\([#$%&~_^{}])/g, '$1');
-            let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
-            protocol = (protocol != null ? protocol[1] : "_relative");
-            const allowed = this.settings.allowedProtocols;
-            if (!utils.contains(allowed, "*") &&
-                !utils.contains(allowed, protocol)) {
-                throw new ParseError(
-                    `Forbidden protocol '${protocol}' in ${funcName}`, nucleus);
-            }
-            const urlArg = {
-                type: "url",
-                mode: this.mode,
-                url,
-            };
-            this.consume();
-            if (funcName === "\\href") { // two arguments
-                this.consumeSpaces(); // ignore spaces between arguments
-                let description = this.parseGroupOfType("original", false);
-                if (description == null) {
-                    throw new ParseError(`${funcName} missing second argument`,
-                        nucleus);
-                }
-                if (description.type === "fn") {
-                    description = this.parseGivenFunction(description);
-                } else { // arg.type === "arg"
-                    description = description.result;
-                }
-                return newArgument(this.callFunction(
-                    funcName, [urlArg, description], []), nucleus);
-            } else { // one argument (\url)
-                return newArgument(this.callFunction(
-                    funcName, [urlArg], []), nucleus);
-            }
        } else if (/^\\verb[^a-zA-Z]/.test(text)) {
            this.consume();
            let arg = text.slice(5);
@@ -980,6 +1019,9 @@
                body: arg,
                star,
            }, nucleus);
+        } else if (text === "%") {
+            this.consumeComment();
+            return this.parseSymbol();
        }
        // At this point, we should have a symbol, possibly with accents.
        // First expand any accented base symbol according to unicodeSymbols.
src/SourceLocation.js

@@ -17,6 +17,10 @@ export default class SourceLocation {
        this.end = end;
    }
 
+    getSource(): string {
+        return this.lexer.input.slice(this.start, this.end);
+    }
+
    /**
     * Merges two `SourceLocation`s from location providers, given they are
     * provided in order of appearance.
src/parseNode.js

@@ -80,6 +80,12 @@ type ParseNodeTypes = {
        loc?: ?SourceLocation,
        body: AnyParseNode[],
    |},
+    "raw": {|
+        type: "raw",
+        mode: Mode,
+        loc?: ?SourceLocation,
+        string: string,
+    |},
    "size": {|
        type: "size",
        mode: Mode,
src/types.js

@@ -12,13 +12,15 @@ export type Mode = "math" | "text";
 // - "color": An html color, like "#abc" or "blue"
 // - "url": An url string, in which "\" will be ignored
 //            if it precedes [#$%&~_^\{}]
+// - "raw": A string, allowing single character, percent sign,
+//            and nested braces
 // - "original": The same type as the environment that the
 //               function being parsed is in (e.g. used for the
 //               bodies of functions like \textcolor where the
 //               first argument is special and the second
 //               argument is parsed normally)
 // - Mode: Node group parsed in given mode.
-export type ArgType = "color" | "size" | "url" | "original" | Mode;
+export type ArgType = "color" | "size" | "url" | "raw" | "original" | Mode;
 
 // LaTeX display style.
 export type StyleStr = "text" | "display" | "script" | "scriptscript";
test/katex-spec.js

@@ -1597,6 +1597,16 @@ describe("A comment parser", function() {
        expect("% comment 1\n% comment 2\n").toParse();
    });
 
+    it("should parse comments between subscript and superscript", () => {
+        expect("x_3 %comment\n^2").toParseLike`x_3^2`;
+    });
+
+    it("should parse comments in size and color groups", () => {
+        expect("\\kern{1 %kern\nem}").toParse();
+        expect("\\kern1 %kern\nem").toParse();
+        expect("\\color{#f00%red\n}").toParse();
+    });
+
    it("should not parse a comment without newline in strict mode", () => {
        expect`x%y`.not.toParse(strictSettings);
        expect`x%y`.toParse(nonstrictSettings);
@@ -2527,12 +2537,6 @@ describe("href and url commands", function() {
        expect("\\url%end").toParseLike("\\url {%}end");
    });
 
-    it("should detect missing second argument in \\href", () => {
-        expect`\href{http://example.com/}`.not.toParse();
-        expect`\href%`.not.toParse();
-        expect`\href %`.not.toParse();
-    });
-
    it("should allow spaces single-character URLs", () => {
        expect`\href %end`.toParseLike("\\href{%}end");
        expect("\\url %end").toParseLike("\\url{%}end");
@@ -2547,7 +2551,7 @@ describe("href and url commands", function() {
    });
 
    it("should allow balanced braces in url", function() {
-        const url = "http://example.org/{too}";
+        const url = "http://example.org/{{}t{oo}}";
        const parsed1 = getParsed(`\\href{${url}}{\\alpha}`)[0];
        expect(parsed1.href).toBe(url);
        const parsed2 = getParsed(`\\url{${url}}`)[0];
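A usage sketch of what the change permits, based on the tests above (assumes a KaTeX build containing this commit; depending on the KaTeX version, \url and \href may additionally require trust-related settings):

const katex = require("katex");

// A % comment between subscript and superscript now parses like "x_3^2".
katex.renderToString("x_3 %comment\n^2");

// Nested braces survive inside \url; the resulting href keeps them verbatim.
katex.renderToString("\\url{http://example.org/{{}t{oo}}}");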