From c85250d14e7dcace95eca76a66973d10d1b6ee9f Mon Sep 17 00:00:00 2001 From: Erik Demaine Date: Wed, 5 May 2021 21:54:41 -0400 Subject: [PATCH] fix: Correctly parse \ followed by whitespace (#2877) * fix: Correctly parse \ followed by whitespace LaTeX parses `\` followed by whitespace including up to one newline as equivalent to `\ `. (With multiple newlines, you get paragraph breaks.) Fix #2860. * Improve comments * Avoid second RegExp match in control words * Document capturing groups Co-authored-by: Ron Kok --- src/Lexer.js | 30 ++++++++++++++++-------------- test/katex-spec.js | 7 ++++++- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/Lexer.js b/src/Lexer.js index 7cf96edd..849eee63 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -28,8 +28,16 @@ import type Settings from "./Settings"; * - does not match bare surrogate code units * - matches any BMP character except for those just described * - matches any valid Unicode surrogate pair - * - matches a backslash followed by one or more letters - * - matches a backslash followed by any BMP character, including newline + * - matches a backslash followed by one or more whitespace characters + * - matches a backslash followed by one or more letters then whitespace + * - matches a backslash followed by any BMP character + * Capturing groups: + * [1] regular whitespace + * [2] backslash followed by whitespace + * [3] anything else, which may include: + * [4] left character of \verb* + * [5] left character of \verb + * [6] backslash followed by word, excluding any trailing whitespace * Just because the Lexer matches something doesn't mean it's valid input: * If there is no matching function or symbol definition, the Parser will * still reject the input. @@ -38,19 +46,19 @@ const spaceRegexString = "[ \r\n\t]"; const controlWordRegexString = "\\\\[a-zA-Z@]+"; const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]"; const controlWordWhitespaceRegexString = - `${controlWordRegexString}${spaceRegexString}*`; -const controlWordWhitespaceRegex = new RegExp( - `^(${controlWordRegexString})${spaceRegexString}*$`); + `(${controlWordRegexString})${spaceRegexString}*`; +const controlSpaceRegexString = "\\\\(\n|[ \r\t]+\n?)[ \r\t]*"; const combiningDiacriticalMarkString = "[\u0300-\u036f]"; export const combiningDiacriticalMarksEndRegex: RegExp = new RegExp(`${combiningDiacriticalMarkString}+$`); const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace + `${controlSpaceRegexString}|` + // \whitespace "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint `${combiningDiacriticalMarkString}*` + // ...plus accents "|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair `${combiningDiacriticalMarkString}*` + // ...plus accents - "|\\\\verb\\*([^]).*?\\3" + // \verb* - "|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred + "|\\\\verb\\*([^]).*?\\4" + // \verb* + "|\\\\verb([^*a-zA-Z]).*?\\5" + // \verb unstarred "|\\\\operatorname\\*" + // \operatorname* `|${controlWordWhitespaceRegexString}` + // \macroName + spaces `|${controlSymbolRegexString})`; // \\, \', etc. @@ -94,7 +102,7 @@ export default class Lexer implements LexerInterface { `Unexpected character: '${input[pos]}'`, new Token(input[pos], new SourceLocation(this, pos, pos + 1))); } - let text = match[2] || " "; + const text = match[6] || match[3] || (match[2] ? "\\ " : " "); if (this.catcodes[text] === 14) { // comment character const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex); @@ -109,12 +117,6 @@ export default class Lexer implements LexerInterface { return this.lex(); } - // Trim any trailing whitespace from control word match - const controlMatch = text.match(controlWordWhitespaceRegex); - if (controlMatch) { - text = controlMatch[1]; - } - return new Token(text, new SourceLocation(this, pos, this.tokenRegex.lastIndex)); } diff --git a/test/katex-spec.js b/test/katex-spec.js index 421e977f..51979469 100644 --- a/test/katex-spec.js +++ b/test/katex-spec.js @@ -678,7 +678,7 @@ describe("A text parser", function() { const noBraceTextExpression = r`\text x`; const nestedTextExpression = r`\text{a {b} \blue{c} \textcolor{#fff}{x} \llap{x}}`; - const spaceTextExpression = r`\text{ a \ }`; + const spaceTextExpression = r`\text{ a \ }`; const leadingSpaceTextExpression = r`\text {moo}`; const badTextExpression = r`\text{a b%}`; const badFunctionExpression = r`\text{\sqrt{x}}`; @@ -722,12 +722,17 @@ describe("A text parser", function() { const parse = getParsed(spaceTextExpression)[0]; const group = parse.body; + expect(group.length).toEqual(4); expect(group[0].type).toEqual("spacing"); expect(group[1].type).toEqual("textord"); expect(group[2].type).toEqual("spacing"); expect(group[3].type).toEqual("spacing"); }); + it("should handle backslash followed by newline", () => { + expect("\\text{\\ \t\r \n \t\r }").toParseLike("\\text{\\ }"); + }); + it("should accept math mode tokens after its argument", function() { expect(mathTokenAfterText).toParse(); });