Add raw string group, move comment parsing to Parser, change URL group parser (#1711)

* Add raw string group
* Move comment parsing to Parser
* Use raw string group in URL group parser
* Update types.js
* Add multi-level nested url test
2025-10-07 04:08:43 +00:00 · 2018-10-13 10:21:57 +09:00
parent ba8e224b8d
commit 3907545e2c
7 changed files with 135 additions and 98 deletions
--- a/src/Lexer.js
+++ b/src/Lexer.js
@@ -17,11 +17,9 @@ import SourceLocation from "./SourceLocation";
 import {Token} from "./Token";

 import type {LexerInterface} from "./Token";
-import type Settings from "./Settings";

 /* The following tokenRegex
 * - matches typical whitespace (but not NBSP etc.) using its first group
- * - matches comments (must have trailing newlines)
 * - does not match any control character \x00-\x1f except whitespace
 * - does not match a bare backslash
 * - matches any ASCII character except those just mentioned
@@ -36,7 +34,6 @@ import type Settings from "./Settings";
 * still reject the input.
 */
 const spaceRegexString = "[ \r\n\t]";
-const commentRegexString = "%[^\n]*(?:\n|$)";
 const controlWordRegexString = "\\\\[a-zA-Z@]+";
 const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";
 const controlWordWhitespaceRegexString =
@@ -46,37 +43,28 @@ const controlWordWhitespaceRegex = new RegExp(
 const combiningDiacriticalMarkString = "[\u0300-\u036f]";
 export const combiningDiacriticalMarksEndRegex =
    new RegExp(`${combiningDiacriticalMarkString}+$`);
-const urlFunctionRegexString = "(\\\\href|\\\\url)" +
-    `(?:${spaceRegexString}*\\{((?:[^{}\\\\]|\\\\[^]|{[^{}]*})*)\\}` +
-    `|${spaceRegexString}+([^{}])` +
-    `|${spaceRegexString}*([^{}a-zA-Z]))`;
 const tokenRegexString = `(${spaceRegexString}+)|` +  // whitespace
-    `(${commentRegexString}` +                        // comments
-    "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +  // single codepoint
+    "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +  // single codepoint
    `${combiningDiacriticalMarkString}*` +            // ...plus accents
    "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +               // surrogate pair
    `${combiningDiacriticalMarkString}*` +            // ...plus accents
    "|\\\\verb\\*([^]).*?\\3" +                       // \verb*
    "|\\\\verb([^*a-zA-Z]).*?\\4" +                   // \verb unstarred
-    `|${urlFunctionRegexString}` +                    // URL arguments
    `|${controlWordWhitespaceRegexString}` +          // \macroName + spaces
    `|${controlSymbolRegexString})`;                  // \\, \', etc.

 // These regexs are for matching results from tokenRegex,
 // so they do have ^ markers.
 export const controlWordRegex = new RegExp(`^${controlWordRegexString}`);
-export const urlFunctionRegex = new RegExp(`^${urlFunctionRegexString}`);

 /** Main Lexer class */
 export default class Lexer implements LexerInterface {
    input: string;
-    settings: Settings;
    tokenRegex: RegExp;

-    constructor(input: string, settings: Settings) {
+    constructor(input: string) {
        // Separate accents from characters
        this.input = input;
-        this.settings = settings;
        this.tokenRegex = new RegExp(tokenRegexString, 'g');
    }

@@ -100,19 +88,10 @@ export default class Lexer implements LexerInterface {
        // Trim any trailing whitespace from control word match
        const controlMatch = text.match(controlWordWhitespaceRegex);
        if (controlMatch) {
-            text = controlMatch[1] + text.slice(controlMatch[0].length);
+            text = controlMatch[1];
        }

-        if (text[0] === "%") {
-            if (text[text.length - 1] !== "\n") {
-                this.settings.reportNonstrict("commentAtEnd",
-                    "% comment has no terminating newline; LaTeX would " +
-                    "fail because of commenting the end of math mode (e.g. $)");
-            }
-            return this.lex();
-        } else {
        return new Token(text, new SourceLocation(this, pos,
            this.tokenRegex.lastIndex));
    }
 }
-}
--- a/src/MacroExpander.js
+++ b/src/MacroExpander.js
@@ -50,7 +50,7 @@ export default class MacroExpander implements MacroContextInterface {
     * (with existing macros etc.).
     */
    feed(input: string) {
-        this.lexer = new Lexer(input, this.settings);
+        this.lexer = new Lexer(input);
    }

    /**
@@ -314,7 +314,7 @@ export default class MacroExpander implements MacroContextInterface {
                    ++numArgs;
                }
            }
-            const bodyLexer = new Lexer(expansion, this.settings);
+            const bodyLexer = new Lexer(expansion);
            const tokens = [];
            let tok = bodyLexer.lex();
            while (tok.text !== "EOF") {
--- a/src/Parser.js
+++ b/src/Parser.js
@@ -11,7 +11,7 @@ import unicodeSymbols from "./unicodeSymbols";
 import utils from "./utils";
 import {assertNodeType, checkNodeType} from "./parseNode";
 import ParseError from "./ParseError";
-import {combiningDiacriticalMarksEndRegex, urlFunctionRegex} from "./Lexer";
+import {combiningDiacriticalMarksEndRegex} from "./Lexer";
 import Settings from "./Settings";
 import SourceLocation from "./SourceLocation";
 import {Token} from "./Token";
@@ -405,6 +405,8 @@ export default class Parser {
                }
                // Put everything into an ordgroup as the superscript
                superscript = {type: "ordgroup", mode: this.mode, body: primes};
+            } else if (lex.text === "%") {
+                this.consumeComment();
            } else {
                // If it wasn't ^, _, or ', stop parsing super/subscripts
                break;
@@ -658,9 +660,15 @@ export default class Parser {
            return this.parseSizeGroup(optional);
        }
        if (type === "url") {
-            throw new ParseError(
-                "Internal bug: 'url' arguments should be handled by Lexer",
-                this.nextToken);
+            return this.parseUrlGroup(optional);
+        }
+        if (type === "raw") {
+            const token = this.parseStringGroup("raw", optional, true);
+            return token ? newArgument({
+                type: "raw",
+                mode: this.mode,
+                string: token.text,
+            }, token) : null;
        }

        // By the time we get here, type is one of "text" or "math".
@@ -674,6 +682,27 @@ export default class Parser {
        }
    }

+    consumeComment() {
+        // The Lexer normalizes newline characters, so check the original source.
+        while (this.nextToken.text !== "EOF" && this.nextToken.loc &&
+                this.nextToken.loc.getSource().indexOf("\n") === -1) {
+            this.consume();
+        }
+        if (this.nextToken.text === "EOF") {
+            this.settings.reportNonstrict("commentAtEnd",
+                "% comment has no terminating newline; LaTeX would " +
+                "fail because of commenting the end of math mode (e.g. $)");
+        }
+        if (this.mode === "math") {
+            this.consumeSpaces(); // ignore spaces in math mode
+        } else if (this.nextToken.loc) { // text mode
+            const source = this.nextToken.loc.getSource();
+            if (source.indexOf("\n") === source.length - 1) {
+                this.consumeSpaces(); // if no space after the first newline
+            }
+        }
+    }
+
    /**
     * Parses a group, essentially returning the string formed by the
     * brace-enclosed tokens plus some position information.
@@ -681,28 +710,53 @@ export default class Parser {
    parseStringGroup(
        modeName: ArgType,  // Used to describe the mode in error messages.
        optional: boolean,
+        raw?: boolean,
    ): ?Token {
-        if (optional && this.nextToken.text !== "[") {
+        const groupBegin = optional ? "[" : "{";
+        const groupEnd = optional ? "]" : "}";
+        const nextToken = this.nextToken;
+        if (nextToken.text !== groupBegin) {
+            if (optional) {
                return null;
+            } else if (raw && nextToken.text !== "EOF" &&
+                    /[^{}[\]]/.test(nextToken.text)) {
+                // allow a single character in raw string group
+                this.consume();
+                return nextToken;
+            }
        }
        const outerMode = this.mode;
        this.mode = "text";
-        this.expect(optional ? "[" : "{");
+        this.expect(groupBegin);
        let str = "";
        const firstToken = this.nextToken;
+        let nested = 0; // allow nested braces in raw string group
        let lastToken = firstToken;
-        while (this.nextToken.text !== (optional ? "]" : "}")) {
-            if (this.nextToken.text === "EOF") {
+        while ((raw && nested > 0) || this.nextToken.text !== groupEnd) {
+            switch (this.nextToken.text) {
+                case "EOF":
                    throw new ParseError(
                        "Unexpected end of input in " + modeName,
-                    firstToken.range(this.nextToken, str));
+                        firstToken.range(lastToken, str));
+                case "%":
+                    if (!raw) { // allow % in raw string group
+                        this.consumeComment();
+                        continue;
+                    }
+                    break;
+                case groupBegin:
+                    nested++;
+                    break;
+                case groupEnd:
+                    nested--;
+                    break;
            }
            lastToken = this.nextToken;
            str += lastToken.text;
            this.consume();
        }
        this.mode = outerMode;
-        this.expect(optional ? "]" : "}");
+        this.expect(groupEnd);
        return firstToken.range(lastToken, str);
    }

@@ -720,8 +774,12 @@ export default class Parser {
        const firstToken = this.nextToken;
        let lastToken = firstToken;
        let str = "";
-        while (this.nextToken.text !== "EOF"
-            && regex.test(str + this.nextToken.text)) {
+        while (this.nextToken.text !== "EOF" && (regex.test(
+                str + this.nextToken.text) || this.nextToken.text === "%")) {
+            if (this.nextToken.text === "%") {
+                this.consumeComment();
+                continue;
+            }
            lastToken = this.nextToken;
            str += lastToken.text;
            this.consume();
@@ -802,6 +860,34 @@ export default class Parser {
        }, res);
    }

+    /**
+     * Parses a URL, checking escaped letters and allowed protocols.
+     */
+    parseUrlGroup(optional: boolean): ?ParsedArg {
+        const res = this.parseStringGroup("url", optional, true); // get raw string
+        if (!res) {
+            return null;
+        }
+        // hyperref package allows backslashes alone in href, but doesn't
+        // generate valid links in such cases; we interpret this as
+        // "undefined" behaviour, and keep them as-is. Some browsers will
+        // replace backslashes with forward slashes.
+        const url = res.text.replace(/\\([#$%&~_^{}])/g, '$1');
+        let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
+        protocol = (protocol != null ? protocol[1] : "_relative");
+        const allowed = this.settings.allowedProtocols;
+        if (!utils.contains(allowed,  "*") &&
+            !utils.contains(allowed, protocol)) {
+            throw new ParseError(
+                `Forbidden protocol '${protocol}'`, res);
+        }
+        return newArgument({
+            type: "url",
+            mode: this.mode,
+            url,
+        }, res);
+    }
+
    /**
     * If `optional` is false or absent, this parses an ordinary group,
     * which is either a single nucleus (like "x") or an expression
@@ -913,53 +999,6 @@ export default class Parser {
            // The token will be consumed later in parseGivenFunction
            // (after possibly switching modes).
            return newFunction(nucleus);
-        } else if (/^\\(href|url)[^a-zA-Z]/.test(text)) {
-            const match = text.match(urlFunctionRegex);
-            if (!match) {
-                throw new ParseError(
-                    `Internal error: invalid URL token '${text}'`, nucleus);
-            }
-            const funcName = match[1];
-            // match[2] is the only one that can be an empty string,
-            // so it must be at the end of the following or chain:
-            const rawUrl = match[4] || match[3] || match[2];
-            // hyperref package allows backslashes alone in href, but doesn't
-            // generate valid links in such cases; we interpret this as
-            // "undefined" behaviour, and keep them as-is. Some browser will
-            // replace backslashes with forward slashes.
-            const url = rawUrl.replace(/\\([#$%&~_^{}])/g, '$1');
-            let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
-            protocol = (protocol != null ? protocol[1] : "_relative");
-            const allowed = this.settings.allowedProtocols;
-            if (!utils.contains(allowed,  "*") &&
-                !utils.contains(allowed, protocol)) {
-                throw new ParseError(
-                    `Forbidden protocol '${protocol}' in ${funcName}`, nucleus);
-            }
-            const urlArg = {
-                type: "url",
-                mode: this.mode,
-                url,
-            };
-            this.consume();
-            if (funcName === "\\href") {  // two arguments
-                this.consumeSpaces();  // ignore spaces between arguments
-                let description = this.parseGroupOfType("original", false);
-                if (description == null) {
-                    throw new ParseError(`${funcName} missing second argument`,
-                        nucleus);
-                }
-                if (description.type === "fn") {
-                    description = this.parseGivenFunction(description);
-                } else { // arg.type === "arg"
-                    description = description.result;
-                }
-                return newArgument(this.callFunction(
-                    funcName, [urlArg, description], []), nucleus);
-            } else {  // one argument (\url)
-                return newArgument(this.callFunction(
-                    funcName, [urlArg], []), nucleus);
-            }
        } else if (/^\\verb[^a-zA-Z]/.test(text)) {
            this.consume();
            let arg = text.slice(5);
@@ -980,6 +1019,9 @@ export default class Parser {
                body: arg,
                star,
            }, nucleus);
+        } else if (text === "%") {
+            this.consumeComment();
+            return this.parseSymbol();
        }
        // At this point, we should have a symbol, possibly with accents.
        // First expand any accented base symbol according to unicodeSymbols.
--- a/src/SourceLocation.js
+++ b/src/SourceLocation.js
@@ -17,6 +17,10 @@ export default class SourceLocation {
        this.end = end;
    }

+    getSource(): string {
+        return this.lexer.input.slice(this.start, this.end);
+    }
+
    /**
     * Merges two `SourceLocation`s from location providers, given they are
     * provided in order of appearance.
--- a/src/parseNode.js
+++ b/src/parseNode.js
@@ -80,6 +80,12 @@ type ParseNodeTypes = {
        loc?: ?SourceLocation,
        body: AnyParseNode[],
    |},
+    "raw": {|
+        type: "raw",
+        mode: Mode,
+        loc?: ?SourceLocation,
+        string: string,
+    |},
    "size": {|
        type: "size",
        mode: Mode,
--- a/src/types.js
+++ b/src/types.js
@@ -12,13 +12,15 @@ export type Mode = "math" | "text";
 //   - "color": An html color, like "#abc" or "blue"
 //   - "url": An url string, in which "\" will be ignored
 //   -        if it precedes [#$%&~_^\{}]
+//   - "raw": A string that may be a single unbraced character and
+//            may contain percent signs and nested braces
 //   - "original": The same type as the environment that the
 //                 function being parsed is in (e.g. used for the
 //                 bodies of functions like \textcolor where the
 //                 first argument is special and the second
 //                 argument is parsed normally)
 //   - Mode: Node group parsed in given mode.
-export type ArgType = "color" | "size" | "url" | "original" | Mode;
+export type ArgType = "color" | "size" | "url" | "raw" | "original" | Mode;

 // LaTeX display style.
 export type StyleStr = "text" | "display" | "script" | "scriptscript";
--- a/test/katex-spec.js
+++ b/test/katex-spec.js
@@ -1597,6 +1597,16 @@ describe("A comment parser", function() {
        expect("% comment 1\n% comment 2\n").toParse();
    });

+    it("should parse comments between subscript and superscript", () => {
+        expect("x_3 %comment\n^2").toParseLike`x_3^2`;
+    });
+
+    it("should parse comments in size and color groups", () => {
+        expect("\\kern{1 %kern\nem}").toParse();
+        expect("\\kern1 %kern\nem").toParse();
+        expect("\\color{#f00%red\n}").toParse();
+    });
+
    it("should not parse a comment without newline in strict mode", () => {
        expect`x%y`.not.toParse(strictSettings);
        expect`x%y`.toParse(nonstrictSettings);
@@ -2527,12 +2537,6 @@ describe("href and url commands", function() {
        expect("\\url%end").toParseLike("\\url {%}end");
    });

-    it("should detect missing second argument in \\href", () => {
-        expect`\href{http://example.com/}`.not.toParse();
-        expect`\href%`.not.toParse();
-        expect`\href %`.not.toParse();
-    });
-
    it("should allow spaces single-character URLs", () => {
        expect`\href %end`.toParseLike("\\href{%}end");
        expect("\\url %end").toParseLike("\\url{%}end");
@@ -2547,7 +2551,7 @@ describe("href and url commands", function() {
    });

    it("should allow balanced braces in url", function() {
-        const url = "http://example.org/{too}";
+        const url = "http://example.org/{{}t{oo}}";
        const parsed1 = getParsed(`\\href{${url}}{\\alpha}`)[0];
        expect(parsed1.href).toBe(url);
        const parsed2 = getParsed(`\\url{${url}}`)[0];