Fix space handling (#912)

Fixes several issues with space handling (fix #910): 1. "Control symbols" (as they're called in the TeXbook), such as `\\`, should not have spaces eaten after them (only "control words" such as `\foo` should). 2. In math mode, spaces should be consumed at the parser level, not the gullet level. This enables `\\ [x]` to parse differently from `\\[x]`. 3. Eat spaces between arguments, so `\frac x y` still works. (This used to work only because math mode ate all spaces. The analog in text mode wouldn't have worked.) Also eat spaces in initial arguments in math mode, and before ^ and _ in atoms.
2025-10-05 03:08:40 +00:00 · 2017-10-10 10:09:37 -04:00
parent 49f95e61eb
commit 3280652bd6
5 changed files with 105 additions and 18 deletions
--- a/src/Lexer.js
+++ b/src/Lexer.js
@@ -34,6 +34,8 @@ import {LexerInterface, Token} from "./Token";
 * still reject the input.
 */
 const commentRegexString = "%[^\n]*[\n]";
+const controlWordRegexString = "\\\\[a-zA-Z@]+";
+const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";
 const tokenRegex = new RegExp(
    "([ \r\n\t]+)|" +                                 // whitespace
    `(${commentRegexString}|` +                       // comments
@@ -41,11 +43,16 @@ const tokenRegex = new RegExp(
    "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +               // surrogate pair
    "|\\\\verb\\*([^]).*?\\3" +                       // \verb*
    "|\\\\verb([^*a-zA-Z]).*?\\4" +                   // \verb unstarred
-    "|\\\\(?:[a-zA-Z@]+|[^\uD800-\uDFFF])" +          // function name
+    `|${controlWordRegexString}` +                    // \macroName
+    `|${controlSymbolRegexString}` +                  // \\, \', etc.
    ")"
 );

-const commentRegex = new RegExp(commentRegexString);
+// tokenRegex has no ^ marker, as required by matchAt.
+// These regexes are for matching results from tokenRegex,
+// so they do have ^ markers.
+export const controlWordRegex = new RegExp(`^${controlWordRegexString}`);
+const commentRegex = new RegExp(`^${commentRegexString}`);

 /** Main Lexer class */
 export default class Lexer implements LexerInterface {
--- a/src/MacroExpander.js
+++ b/src/MacroExpander.js
@@ -4,7 +4,7 @@
 * until only non-macro tokens remain.
 */

-import Lexer from "./Lexer";
+import Lexer, {controlWordRegex} from "./Lexer";
 import {Token} from "./Token";
 import builtinMacros from "./macros";
 import ParseError from "./ParseError";
@@ -82,8 +82,8 @@ export default class MacroExpander implements MacroContextInterface {
        const topToken = this.popToken();
        const name = topToken.text;
        const isMacro = (name.charAt(0) === "\\");
-        if (isMacro) {
-            // Consume all spaces after \macro
+        if (isMacro && controlWordRegex.test(name)) {
+            // Consume all spaces after \macro (but not \\, \', etc.)
            this.consumeSpaces();
        }
        if (!(isMacro && this.macros.hasOwnProperty(name))) {
--- a/src/Parser.js
+++ b/src/Parser.js
@@ -140,9 +140,13 @@ export default class Parser {
     * and fetches the one after that as the new look ahead.
     */
    consume() {
-        this.nextToken = this.gullet.get(this.mode === "math");
+        this.nextToken = this.gullet.get(false);
    }

+    /**
+     * Switches between "text" and "math" modes, reconsuming nextToken
+     * in case it would be read differently in the new mode.
+     */
    switchMode(newMode) {
        this.gullet.unget(this.nextToken);
        this.mode = newMode;
@@ -193,6 +197,10 @@ export default class Parser {
        // Keep adding atoms to the body until we can't parse any more atoms (either
        // we reached the end, a }, or a \right)
        while (true) {
+            // Ignore spaces in math mode
+            if (this.mode === "math") {
+                this.consumeSpaces();
+            }
            const lex = this.nextToken;
            if (Parser.endOfExpression.indexOf(lex.text) !== -1) {
                break;
@@ -283,6 +291,7 @@ export default class Parser {
        const symbolToken = this.nextToken;
        const symbol = symbolToken.text;
        this.consume();
+        this.consumeSpaces(); // ignore spaces before sup/subscript argument
        const group = this.parseGroup();

        if (!group) {
@@ -367,6 +376,9 @@ export default class Parser {
        let superscript;
        let subscript;
        while (true) {
+            // Guaranteed in math mode, so eat any spaces first.
+            this.consumeSpaces();
+
            // Lex the first token
            const lex = this.nextToken;

@@ -676,9 +688,25 @@ export default class Parser {
        const optArgs = [];

        for (let i = 0; i < totalArgs; i++) {
-            const nextToken = this.nextToken;
            const argType = funcData.argTypes && funcData.argTypes[i];
            const isOptional = i < funcData.numOptionalArgs;
+            // Ignore spaces between arguments.  As the TeXbook says:
+            // "After you have said ‘\def\row#1#2{...}’, you are allowed to
+            //  put spaces between the arguments (e.g., ‘\row x n’), because
+            //  TeX doesn’t use single spaces as undelimited arguments."
+            if (i > 0 && !isOptional) {
+                this.consumeSpaces();
+            }
+            // Also consume leading spaces in math mode, as parseSymbol
+            // won't know what to do with them.  This can only happen with
+            // macros, e.g. \frac\foo\foo where \foo expands to a space symbol.
+            // In LaTeX, the \foo's get treated as (blank) arguments.
+            // In KaTeX, for now, both spaces will get consumed.
+            // TODO(edemaine)
+            if (i === 0 && !isOptional && this.mode === "math") {
+                this.consumeSpaces();
+            }
+            const nextToken = this.nextToken;
            let arg = argType ?
                this.parseGroupOfType(argType, isOptional) :
                this.parseGroup(isOptional);
@@ -735,14 +763,9 @@ export default class Parser {
            return this.parseSizeGroup(optional);
        }

-        this.switchMode(innerMode);
-        if (innerMode === "text") {
-            // text mode is special because it should ignore the whitespace before
-            // it
-            this.consumeSpaces();
-        }
        // By the time we get here, innerMode is one of "text" or "math".
        // We switch the mode of the parser, recurse, then restore the old mode.
+        this.switchMode(innerMode);
        const res = this.parseGroup(optional);
        this.switchMode(outerMode);
        return res;
--- a/src/functions.js
+++ b/src/functions.js
@@ -178,6 +178,7 @@ defineFunction(["\\kern", "\\mkern"], {
 // A KaTeX logo
 defineFunction(["\\KaTeX"], {
    numArgs: 0,
+    allowedInText: true,
 }, function(context) {
    return {
        type: "katex",
--- a/test/katex-spec.js
+++ b/test/katex-spec.js
@@ -210,9 +210,11 @@ describe("A parser", function() {
    });

    it("should ignore whitespace", function() {
-        const parseA = stripPositions(getParsed("    x    y    "));
-        const parseB = stripPositions(getParsed("xy"));
-        expect(parseA).toEqual(parseB);
+        expect("    x    y    ").toParseLike("xy");
+    });
+
+    it("should ignore whitespace in atom", function() {
+        expect("    x   ^ y    ").toParseLike("x^y");
    });
 });

@@ -2397,6 +2399,16 @@ describe("An aligned environment", function() {
            .toParse();
    });

+    it("should allow cells in brackets", function() {
+        expect("\\begin{aligned}[a]&[b]\\\\ [c]&[d]\\end{aligned}")
+            .toParse();
+    });
+
+    it("should forbid cells in brackets without space", function() {
+        expect("\\begin{aligned}[a]&[b]\\\\[c]&[d]\\end{aligned}")
+            .toNotParse();
+    });
+
 });

 describe("A parser that does not throw on unsupported commands", function() {
@@ -2441,7 +2453,7 @@ describe("A parser that does not throw on unsupported commands", function() {
    });
 });

-describe("The symbol table integraty", function() {
+describe("The symbol table integrity", function() {
    it("should treat certain symbols as synonyms", function() {
        expect(getBuilt("<")).toEqual(getBuilt("\\lt"));
        expect(getBuilt(">")).toEqual(getBuilt("\\gt"));
@@ -2475,10 +2487,30 @@ describe("A macro expander", function() {
        compareParseTree("\\foo", "x", {"\\foo": " x"});
    });

-    it("should consume spaces after macro", function() {
+    it("should consume spaces after control-word macro", function() {
        compareParseTree("\\text{\\foo }", "\\text{x}", {"\\foo": "x"});
    });

+    it("should consume spaces after macro with \\relax", function() {
+        compareParseTree("\\text{\\foo }", "\\text{}", {"\\foo": "\\relax"});
+    });
+
+    it("should consume spaces after \\relax", function() {
+        compareParseTree("\\text{\\relax }", "\\text{}");
+    });
+
+    it("should consume spaces after control-word function", function() {
+        compareParseTree("\\text{\\KaTeX }", "\\text{\\KaTeX}");
+    });
+
+    it("should preserve spaces after control-symbol macro", function() {
+        compareParseTree("\\text{\\% y}", "\\text{x y}", {"\\%": "x"});
+    });
+
+    it("should preserve spaces after control-symbol function", function() {
+        expect("\\text{\\' }").toParse();
+    });
+
    it("should consume spaces between arguments", function() {
        compareParseTree("\\text{\\foo 1 2}", "\\text{12end}", {"\\foo": "#1#2end"});
        compareParseTree("\\text{\\foo {1} {2}}", "\\text{12end}", {"\\foo": "#1#2end"});
@@ -2519,6 +2551,20 @@ describe("A macro expander", function() {
        });
    });

+    it("should allow for space second argument (text version)", function() {
+        compareParseTree("\\text{\\foo\\bar\\bar}", "\\text{( , )}", {
+            "\\foo": "(#1,#2)",
+            "\\bar": " ",
+        });
+    });
+
+    it("should allow for space second argument (math version)", function() {
+        compareParseTree("\\foo\\bar\\bar", "(,)", {
+            "\\foo": "(#1,#2)",
+            "\\bar": " ",
+        });
+    });
+
    it("should allow for empty macro argument", function() {
        compareParseTree("\\foo\\bar", "()", {
            "\\foo": "(#1)",
@@ -2526,6 +2572,16 @@ describe("A macro expander", function() {
        });
    });

+    // TODO: The following is not currently possible to get working, given that
+    // functions and macros are dealt with separately.
+/*
+    it("should allow for space function arguments", function() {
+        compareParseTree("\\frac\\bar\\bar", "\\frac{}{}", {
+            "\\bar": " ",
+        });
+    });
+*/
+
    it("should expand the \\overset macro as expected", function() {
        expect("\\overset?=").toParseLike("\\mathop{=}\\limits^{?}");
        expect("\\overset{x=y}{\\sqrt{ab}}")