From c85250d14e7dcace95eca76a66973d10d1b6ee9f Mon Sep 17 00:00:00 2001
From: Erik Demaine <edemaine@mit.edu>
Date: Wed, 5 May 2021 21:54:41 -0400
Subject: [PATCH] fix: Correctly parse \ followed by whitespace (#2877)

* fix: Correctly parse \ followed by whitespace

LaTeX parses `\` followed by whitespace including up to one newline
as equivalent to `\ `.  (With multiple newlines, you get paragraph
breaks.)

Fix #2860.

* Improve comments

* Avoid second RegExp match in control words

* Document capturing groups

Co-authored-by: Ron Kok <ronkok@comcast.net>
---
 src/Lexer.js       | 30 ++++++++++++++++--------------
 test/katex-spec.js |  7 ++++++-
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/src/Lexer.js b/src/Lexer.js
index 7cf96edd..849eee63 100644
--- a/src/Lexer.js
+++ b/src/Lexer.js
@@ -28,8 +28,16 @@ import type Settings from "./Settings";
  * - does not match bare surrogate code units
  * - matches any BMP character except for those just described
  * - matches any valid Unicode surrogate pair
- * - matches a backslash followed by one or more letters
- * - matches a backslash followed by any BMP character, including newline
+ * - matches a backslash followed by whitespace with at most one newline
+ * - matches a backslash, one or more letters or @, then optional whitespace
+ * - matches a backslash followed by any BMP character
+ * Capturing groups:
+ *   [1] regular whitespace
+ *   [2] whitespace following a backslash (the backslash is not captured)
+ *   [3] anything else, which may include:
+ *     [4] left character of \verb*
+ *     [5] left character of \verb
+ *     [6] backslash followed by word, excluding any trailing whitespace
  * Just because the Lexer matches something doesn't mean it's valid input:
  * If there is no matching function or symbol definition, the Parser will
  * still reject the input.
@@ -38,19 +46,19 @@ const spaceRegexString = "[ \r\n\t]";
 const controlWordRegexString = "\\\\[a-zA-Z@]+";
 const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";
 const controlWordWhitespaceRegexString =
-    `${controlWordRegexString}${spaceRegexString}*`;
-const controlWordWhitespaceRegex = new RegExp(
-    `^(${controlWordRegexString})${spaceRegexString}*$`);
+    `(${controlWordRegexString})${spaceRegexString}*`;
+const controlSpaceRegexString = "\\\\(\n|[ \r\t]+\n?)[ \r\t]*";
 const combiningDiacriticalMarkString = "[\u0300-\u036f]";
 export const combiningDiacriticalMarksEndRegex: RegExp =
     new RegExp(`${combiningDiacriticalMarkString}+$`);
 const tokenRegexString = `(${spaceRegexString}+)|` +  // whitespace
+    `${controlSpaceRegexString}|` +                   // \whitespace
     "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +  // single codepoint
     `${combiningDiacriticalMarkString}*` +            // ...plus accents
     "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +               // surrogate pair
     `${combiningDiacriticalMarkString}*` +            // ...plus accents
-    "|\\\\verb\\*([^]).*?\\3" +                       // \verb*
-    "|\\\\verb([^*a-zA-Z]).*?\\4" +                   // \verb unstarred
+    "|\\\\verb\\*([^]).*?\\4" +                       // \verb*
+    "|\\\\verb([^*a-zA-Z]).*?\\5" +                   // \verb unstarred
     "|\\\\operatorname\\*" +                          // \operatorname*
     `|${controlWordWhitespaceRegexString}` +          // \macroName + spaces
     `|${controlSymbolRegexString})`;                  // \\, \', etc.
@@ -94,7 +102,7 @@ export default class Lexer implements LexerInterface {
                 `Unexpected character: '${input[pos]}'`,
                 new Token(input[pos], new SourceLocation(this, pos, pos + 1)));
         }
-        let text = match[2] || " ";
+        const text = match[6] || match[3] || (match[2] ? "\\ " : " ");
 
         if (this.catcodes[text] === 14) { // comment character
             const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex);
@@ -109,12 +117,6 @@ export default class Lexer implements LexerInterface {
             return this.lex();
         }
 
-        // Trim any trailing whitespace from control word match
-        const controlMatch = text.match(controlWordWhitespaceRegex);
-        if (controlMatch) {
-            text = controlMatch[1];
-        }
-
         return new Token(text, new SourceLocation(this, pos,
             this.tokenRegex.lastIndex));
     }
diff --git a/test/katex-spec.js b/test/katex-spec.js
index 421e977f..51979469 100644
--- a/test/katex-spec.js
+++ b/test/katex-spec.js
@@ -678,7 +678,7 @@ describe("A text parser", function() {
     const noBraceTextExpression = r`\text x`;
     const nestedTextExpression =
         r`\text{a {b} \blue{c} \textcolor{#fff}{x} \llap{x}}`;
-    const spaceTextExpression = r`\text{  a \ }`;
+    const spaceTextExpression = r`\text{  a \  }`;
     const leadingSpaceTextExpression = r`\text {moo}`;
     const badTextExpression = r`\text{a b%}`;
     const badFunctionExpression = r`\text{\sqrt{x}}`;
@@ -722,12 +722,17 @@ describe("A text parser", function() {
         const parse = getParsed(spaceTextExpression)[0];
         const group = parse.body;
 
+        expect(group.length).toEqual(4);
         expect(group[0].type).toEqual("spacing");
         expect(group[1].type).toEqual("textord");
         expect(group[2].type).toEqual("spacing");
         expect(group[3].type).toEqual("spacing");
     });
 
+    it("should handle backslash followed by newline", () => {
+        expect("\\text{\\ \t\r \n \t\r  }").toParseLike("\\text{\\ }");
+    });
+
     it("should accept math mode tokens after its argument", function() {
         expect(mathTokenAfterText).toParse();
     });