Comments without terminating newlines, \href fixes, \url support (#1529)

* Comments without terminating newlines in nonstrict mode

Fix #1506 by allowing single-line comments (`%` without a terminating newline)
in nonstrict mode.  `Lexer` and `MacroExpander` now store the `Settings`
object, so the `Lexer` can complain about a missing newline according to the
`strict` setting.  I filtered the `Settings` object out of the snapshot tests
with a slightly different `replacer`.

* Reimplement \href like \verb, add \url

Major restructuring to lex URL arguments differently, e.g. to support
`\href%{hello}` and `\href{http://foo.com/#test%}{hello}`.  The new URL
parsing code is simpler, but involves a special case in `parseSymbol`
like `\verb`.

Also add support for `\url` while we're here.

* Cleanup

* Fix flow errors and improve error messages

* Add \url to documentation

* Improve doc formatting
This commit is contained in:
Erik Demaine
2018-07-31 14:13:30 -04:00
committed by GitHub
parent b73e43832b
commit 2202aa774f
9 changed files with 181 additions and 115 deletions

View File

@@ -18,6 +18,9 @@ You can provide an object of options as the last argument to [`katex.render` and
incorrect (especially in terms of vertical heights).
- `"unicodeTextInMathMode"`: Use of Unicode text characters in math mode.
- `"mathVsTextUnits"`: Mismatch of math vs. text commands and units/mode.
- `"commentAtEnd"`: Use of `%` comment without a terminating newline.
LaTeX would thereby comment out the end of math mode (e.g. `$`),
causing an error.
A second category of `errorCode`s never throw errors, but their strictness
affects the behavior of KaTeX:
- `"newLineInDisplayMode"`: Use of `\\` or `\newline` in display mode

View File

@@ -86,7 +86,10 @@ The `{array}` environment does not yet support `\cline` or `\multicolumn`.
## HTML
$\href{https://khan.github.io/KaTeX/}{KaTeX}$ `\href{https://khan.github.io/KaTeX/}{KaTeX}`
|||
|:----------------|:-------------------|
| $\href{https://khan.github.io/KaTeX/}{KaTeX}$ | `\href{https://khan.github.io/KaTeX/}{KaTeX}` |
| $\url{https://khan.github.io/KaTeX/}$ | `\url{https://khan.github.io/KaTeX/}` |
## Letters and Unicode

View File

@@ -15,6 +15,7 @@
import ParseError from "./ParseError";
import SourceLocation from "./SourceLocation";
import {LexerInterface, Token} from "./Token";
import type Settings from "./Settings";
/* The following tokenRegex
* - matches typical whitespace (but not NBSP etc.) using its first group
@@ -33,7 +34,7 @@ import {LexerInterface, Token} from "./Token";
* still reject the input.
*/
const spaceRegexString = "[ \r\n\t]";
const commentRegexString = "%[^\n]*[\n]";
const commentRegexString = "%[^\n]*(?:\n|$)";
const controlWordRegexString = "\\\\[a-zA-Z@]+";
const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";
const controlWordWhitespaceRegexString =
@@ -43,6 +44,10 @@ const controlWordWhitespaceRegex = new RegExp(
const combiningDiacriticalMarkString = "[\u0300-\u036f]";
export const combiningDiacriticalMarksEndRegex =
new RegExp(`${combiningDiacriticalMarkString}+$`);
const urlFunctionRegexString = "(\\\\href|\\\\url)" +
`(?:${spaceRegexString}*\\{((?:[^{}\\\\]|\\\\[^]|{[^{}]*})*)\\}` +
`|${spaceRegexString}+([^{}])` +
`|${spaceRegexString}*([^{}a-zA-Z]))`;
const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
`(${commentRegexString}` + // comments
"|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
@@ -51,22 +56,25 @@ const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
`${combiningDiacriticalMarkString}*` + // ...plus accents
"|\\\\verb\\*([^]).*?\\3" + // \verb*
"|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred
`|${urlFunctionRegexString}` + // URL arguments
`|${controlWordWhitespaceRegexString}` + // \macroName + spaces
`|${controlSymbolRegexString})`; // \\, \', etc.
// These regexs are for matching results from tokenRegex,
// so they do have ^ markers.
export const controlWordRegex = new RegExp(`^${controlWordRegexString}`);
const commentRegex = new RegExp(`^${commentRegexString}`);
export const urlFunctionRegex = new RegExp(`^${urlFunctionRegexString}`);
/** Main Lexer class */
export default class Lexer implements LexerInterface {
input: string;
settings: Settings;
tokenRegex: RegExp;
constructor(input: string) {
constructor(input: string, settings: Settings) {
// Separate accents from characters
this.input = input;
this.settings = settings;
this.tokenRegex = new RegExp(tokenRegexString, 'g');
}
@@ -90,10 +98,15 @@ export default class Lexer implements LexerInterface {
// Trim any trailing whitespace from control word match
const controlMatch = text.match(controlWordWhitespaceRegex);
if (controlMatch) {
text = controlMatch[1];
text = controlMatch[1] + text.slice(controlMatch[0].length);
}
if (commentRegex.test(text)) {
if (text[0] === "%") {
if (text[text.length - 1] !== "\n") {
this.settings.reportNonstrict("commentAtEnd",
"% comment has no terminating newline; LaTeX would " +
"fail because of commenting the end of math mode (e.g. $)");
}
return this.lex();
} else {
return new Token(text, new SourceLocation(this, pos,

View File

@@ -28,17 +28,19 @@ export const implicitCommands = {
};
export default class MacroExpander implements MacroContextInterface {
maxExpand: number;
settings: Settings;
expansionCount: number;
lexer: Lexer;
macros: Namespace<MacroDefinition>;
stack: Token[];
mode: Mode;
constructor(input: string, settings: Settings, mode: Mode) {
this.settings = settings;
this.expansionCount = 0;
this.feed(input);
// Make new global namespace
this.macros = new Namespace(builtinMacros, settings.macros);
this.maxExpand = settings.maxExpand;
this.mode = mode;
this.stack = []; // contains tokens in REVERSE order
}
@@ -48,7 +50,7 @@ export default class MacroExpander implements MacroContextInterface {
* (with existing macros etc.).
*/
feed(input: string) {
this.lexer = new Lexer(input);
this.lexer = new Lexer(input, this.settings);
}
/**
@@ -188,13 +190,11 @@ export default class MacroExpander implements MacroContextInterface {
this.pushToken(topToken);
return topToken;
}
if (this.maxExpand !== Infinity) {
this.maxExpand--;
if (this.maxExpand < 0) {
this.expansionCount++;
if (this.expansionCount > this.settings.maxExpand) {
throw new ParseError("Too many expansions: infinite loop or " +
"need to increase maxExpand setting");
}
}
let tokens = expansion.tokens;
if (expansion.numArgs) {
const args = this.consumeArgs(expansion.numArgs);
@@ -314,7 +314,7 @@ export default class MacroExpander implements MacroContextInterface {
++numArgs;
}
}
const bodyLexer = new Lexer(expansion);
const bodyLexer = new Lexer(expansion, this.settings);
const tokens = [];
let tok = bodyLexer.lex();
while (tok.text !== "EOF") {

View File

@@ -11,7 +11,7 @@ import unicodeSymbols from "./unicodeSymbols";
import utils from "./utils";
import ParseNode, {assertNodeType, checkNodeType} from "./ParseNode";
import ParseError from "./ParseError";
import {combiningDiacriticalMarksEndRegex} from "./Lexer.js";
import {combiningDiacriticalMarksEndRegex, urlFunctionRegex} from "./Lexer.js";
import Settings from "./Settings";
import {Token} from "./Token";
import type {AnyParseNode} from "./ParseNode";
@@ -28,7 +28,7 @@ import type {EnvSpec} from "./defineEnvironment";
*
* The main functions (the `.parse...` ones) take a position in the current
* parse string to parse tokens from. The lexer (found in Lexer.js, stored at
* this.lexer) also supports pulling out tokens at arbitrary places. When
* this.gullet.lexer) also supports pulling out tokens at arbitrary places. When
* individual tokens are needed at a position, the lexer is called to pull out a
* token, which is then used.
*
@@ -660,7 +660,9 @@ export default class Parser {
return this.parseSizeGroup(optional);
}
if (type === "url") {
return this.parseUrlGroup(optional);
throw new ParseError(
"Internal bug: 'url' arguments should be handled by Lexer",
this.nextToken);
}
// By the time we get here, type is one of "text" or "math".
@@ -706,51 +708,6 @@ export default class Parser {
return firstToken.range(lastToken, str);
}
/**
* Parses a group, essentially returning the string formed by the
* brace-enclosed tokens plus some position information, possibly
* with nested braces.
*/
parseStringGroupWithBalancedBraces(
modeName: ArgType, // Used to describe the mode in error messages.
optional: boolean,
): ?Token {
if (optional && this.nextToken.text !== "[") {
return null;
}
const outerMode = this.mode;
this.mode = "text";
this.expect(optional ? "[" : "{");
let str = "";
let nest = 0;
const firstToken = this.nextToken;
let lastToken = firstToken;
while (nest > 0 || this.nextToken.text !== (optional ? "]" : "}")) {
if (this.nextToken.text === "EOF") {
throw new ParseError(
"Unexpected end of input in " + modeName,
firstToken.range(this.nextToken, str));
}
lastToken = this.nextToken;
str += lastToken.text;
if (lastToken.text === "{") {
nest += 1;
} else if (lastToken.text === "}") {
if (nest <= 0) {
throw new ParseError(
"Unbalanced brace of input in " + modeName,
firstToken.range(this.nextToken, str));
} else {
nest -= 1;
}
}
this.consume();
}
this.mode = outerMode;
this.expect(optional ? "]" : "}");
return firstToken.range(lastToken, str);
}
/**
* Parses a regex-delimited group: the largest sequence of tokens
* whose concatenated strings match `regex`. Returns the string
@@ -795,32 +752,6 @@ export default class Parser {
return newArgument(new ParseNode("color-token", match[0], this.mode), res);
}
/**
* Parses a url string.
*/
parseUrlGroup(optional: boolean): ?ParsedArg {
const res = this.parseStringGroupWithBalancedBraces("url", optional);
if (!res) {
return null;
}
const raw = res.text;
// hyperref package allows backslashes alone in href, but doesn't generate
// valid links in such cases; we interpret this as "undefiend" behaviour,
// and keep them as-is. Some browser will replace backslashes with
// forward slashes.
const url = raw.replace(/\\([#$%&~_^{}])/g, '$1');
const protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
const allowed = this.settings.allowedProtocols;
if (!utils.contains(allowed, "*") && !utils.contains(allowed,
protocol != null ? protocol[1] : "_relative")) {
throw new ParseError('Not allowed \\href protocol', res);
}
return newArgument(new ParseNode("url", {
type: "url",
value: url,
}, this.mode), res);
}
/**
* Parses a size specification, consisting of magnitude and unit.
*/
@@ -957,6 +888,52 @@ export default class Parser {
// The token will be consumed later in parseGivenFunction
// (after possibly switching modes).
return newFunction(nucleus);
} else if (/^\\(href|url)[^a-zA-Z]/.test(text)) {
const match = text.match(urlFunctionRegex);
if (!match) {
throw new ParseError(
`Internal error: invalid URL token '${text}'`, nucleus);
}
const funcName = match[1];
// match[2] is the only one that can be an empty string,
// so it must be at the end of the following or chain:
const rawUrl = match[4] || match[3] || match[2];
// hyperref package allows backslashes alone in href, but doesn't
// generate valid links in such cases; we interpret this as
// "undefined" behaviour, and keep them as-is. Some browser will
// replace backslashes with forward slashes.
const url = rawUrl.replace(/\\([#$%&~_^{}])/g, '$1');
let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
protocol = (protocol != null ? protocol[1] : "_relative");
const allowed = this.settings.allowedProtocols;
if (!utils.contains(allowed, "*") &&
!utils.contains(allowed, protocol)) {
throw new ParseError(
`Forbidden protocol '${protocol}' in ${funcName}`, nucleus);
}
const urlArg = new ParseNode("url", {
type: "url",
value: url,
}, this.mode);
this.consume();
if (funcName === "\\href") { // two arguments
this.consumeSpaces(); // ignore spaces between arguments
let description = this.parseGroupOfType("original", false);
if (description == null) {
throw new ParseError(`${funcName} missing second argument`,
nucleus);
}
if (description.type === "fn") {
description = this.parseGivenFunction(description);
} else { // arg.type === "arg"
description = description.result;
}
return newArgument(this.callFunction(
funcName, [urlArg, description], []), nucleus);
} else { // one argument (\url)
return newArgument(this.callFunction(
funcName, [urlArg], []), nucleus);
}
} else if (/^\\verb[^a-zA-Z]/.test(text)) {
this.consume();
let arg = text.slice(5);

View File

@@ -14,6 +14,7 @@ defineFunction({
props: {
numArgs: 2,
argTypes: ["url", "original"],
allowedInText: true,
},
handler: ({parser}, args) => {
const body = args[1];
@@ -41,3 +42,34 @@ defineFunction({
return math;
},
});
defineFunction({
type: "href",
names: ["\\url"],
props: {
numArgs: 1,
argTypes: ["url"],
allowedInText: true,
},
handler: ({parser}, args) => {
const href = assertNodeType(args[0], "url").value.value;
const chars = [];
for (let i = 0; i < href.length; i++) {
let c = href[i];
if (c === "~") {
c = "\\textasciitilde";
}
chars.push(new ParseNode("textord", c, "text"));
}
const body = new ParseNode("text", {
type: "text",
font: "\\texttt",
body: chars,
}, parser.mode);
return new ParseNode("href", {
type: "href",
href: href,
body: ordargument(body),
}, parser.mode);
},
});

View File

@@ -27,9 +27,7 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = `
"end": 37,
"lexer": {
"input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}",
"tokenRegex": {
"lastIndex": 56
}
},
"start": 36
},
@@ -58,9 +56,7 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = `
"end": 39,
"lexer": {
"input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}",
"tokenRegex": {
"lastIndex": 56
}
},
"start": 38
},
@@ -91,9 +87,7 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = `
"end": 42,
"lexer": {
"input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}",
"tokenRegex": {
"lastIndex": 56
}
},
"start": 41
},
@@ -122,9 +116,7 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = `
"end": 44,
"lexer": {
"input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}",
"tokenRegex": {
"lastIndex": 56
}
},
"start": 43
},

View File

@@ -1563,8 +1563,9 @@ describe("A comment parser", function() {
expect("% comment 1\n% comment 2\n").toParse();
});
it("should not parse a comment that isn't followed by a newline", () => {
expect`x%y`.not.toParse();
it("should not parse a comment without newline in strict mode", () => {
expect`x%y`.not.toParse(strictSettings);
expect`x%y`.toParse(nonstrictSettings);
});
it("should not produce or consume space", () => {
@@ -2451,33 +2452,69 @@ describe("operatorname support", function() {
});
});
describe("An href command", function() {
describe("href and url commands", function() {
// We can't use raw strings for \url because \u is for Unicode escapes.
it("should parse its input", function() {
expect`\href{http://example.com/}{example here}`.toParse();
expect`\href{http://example.com/}{example here}`.toBuild();
expect("\\url{http://example.com/}").toBuild();
});
it("should allow empty URLs", function() {
expect`\href{}{example here}`.toBuild();
expect("\\url{}").toBuild();
});
it("should allow single-character URLs", () => {
expect`\href%end`.toParseLike("\\href{%}end");
expect`\href %end`.toParseLike("\\href{%}end");
expect("\\url%end").toParseLike("\\url{%}end");
expect("\\url %end").toParseLike("\\url{%}end");
expect("\\url end").toParseLike("\\url{e}nd");
expect("\\url%end").toParseLike("\\url {%}end");
});
it("should detect missing second argument in \\href", () => {
expect`\href{http://example.com/}`.not.toParse();
expect`\href%`.not.toParse();
expect`\href %`.not.toParse();
});
it("should allow spaces single-character URLs", () => {
expect`\href %end`.toParseLike("\\href{%}end");
expect("\\url %end").toParseLike("\\url{%}end");
});
it("should allow letters [#$%&~_^] without escaping", function() {
const url = "http://example.org/~bar/#top?foo=$foo&bar=ba^r_boo%20baz";
const hash = getParsed(`\\href{${url}}{\\alpha}`)[0];
expect(hash.value.href).toBe(url);
const parsed1 = getParsed(`\\href{${url}}{\\alpha}`)[0];
expect(parsed1.value.href).toBe(url);
const parsed2 = getParsed(`\\url{${url}}`)[0];
expect(parsed2.value.href).toBe(url);
});
it("should allow balanced braces in url", function() {
const url = "http://example.org/{too}";
const hash = getParsed(`\\href{${url}}{\\alpha}`)[0];
expect(hash.value.href).toBe(url);
const parsed1 = getParsed(`\\href{${url}}{\\alpha}`)[0];
expect(parsed1.value.href).toBe(url);
const parsed2 = getParsed(`\\url{${url}}`)[0];
expect(parsed2.value.href).toBe(url);
});
it("should not allow unbalanced brace(s) in url", function() {
expect`\href{http://example.com/{a}{bar}`.not.toParse();
expect`\href{http://example.com/}a}{bar}`.not.toParse();
expect`\\url{http://example.com/{a}`.not.toParse();
expect`\\url{http://example.com/}a}`.not.toParse();
});
it("should allow escape for letters [#$%&~_^{}]", function() {
const url = "http://example.org/~bar/#top?foo=$}foo{&bar=bar^r_boo%20baz";
const input = url.replace(/([#$%&~_^{}])/g, '\\$1');
const ae = getParsed(`\\href{${input}}{\\alpha}`)[0];
expect(ae.value.href).toBe(url);
const parsed1 = getParsed(`\\href{${input}}{\\alpha}`)[0];
expect(parsed1.value.href).toBe(url);
const parsed2 = getParsed(`\\url{${input}}`)[0];
expect(parsed2.value.href).toBe(url);
});
it("should be marked up correctly", function() {

View File

@@ -1,6 +1,7 @@
/* global expect: false */
import stringify from 'json-stable-stringify';
import Lexer from "../src/Lexer";
import ParseError from "../src/ParseError";
import {
Mode, ConsoleWarning,
@@ -19,8 +20,16 @@ const typeFirstCompare = (a, b) => {
}
};
const regExpReplacer = (key, value) => {
return value instanceof RegExp ? {lastIndex: value.lastIndex} : value;
const replacer = (key, value) => {
if (value instanceof Lexer) {
return {
input: value.input,
// omit value.settings
lastIndex: value.tokenRegex.lastIndex,
};
} else {
return value;
}
};
const serializer = {
@@ -28,7 +37,7 @@ const serializer = {
return stringify(val, {
cmp: typeFirstCompare,
space: ' ',
replacer: regExpReplacer,
replacer: replacer,
});
},
test(val) {