fix: Correctly parse \ followed by whitespace (#2877)

* fix: Correctly parse \ followed by whitespace

LaTeX treats `\` followed by whitespace (including at most one newline)
as equivalent to `\ `.  (With more than one newline, you get a paragraph
break instead.)

Fix #2860.

* Improve comments

* Avoid second RegExp match in control words

* Document capturing groups

Co-authored-by: Ron Kok <ronkok@comcast.net>
This commit is contained in:
Erik Demaine
2021-05-05 21:54:41 -04:00
committed by GitHub
parent e3b54c73ea
commit c85250d14e
2 changed files with 22 additions and 15 deletions

View File

@@ -28,8 +28,16 @@ import type Settings from "./Settings";
* - does not match bare surrogate code units * - does not match bare surrogate code units
* - matches any BMP character except for those just described * - matches any BMP character except for those just described
* - matches any valid Unicode surrogate pair * - matches any valid Unicode surrogate pair
* - matches a backslash followed by one or more letters * - matches a backslash followed by one or more whitespace characters
* - matches a backslash followed by any BMP character, including newline * - matches a backslash followed by one or more letters then whitespace
* - matches a backslash followed by any BMP character
* Capturing groups:
* [1] regular whitespace
* [2] whitespace following a backslash (the backslash itself is not captured)
* [3] anything else, which may include:
* [4] left character of \verb*
* [5] left character of \verb
* [6] backslash followed by word, excluding any trailing whitespace
* Just because the Lexer matches something doesn't mean it's valid input: * Just because the Lexer matches something doesn't mean it's valid input:
* If there is no matching function or symbol definition, the Parser will * If there is no matching function or symbol definition, the Parser will
* still reject the input. * still reject the input.
@@ -38,19 +46,19 @@ const spaceRegexString = "[ \r\n\t]";
const controlWordRegexString = "\\\\[a-zA-Z@]+"; const controlWordRegexString = "\\\\[a-zA-Z@]+";
const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]"; const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";
const controlWordWhitespaceRegexString = const controlWordWhitespaceRegexString =
`${controlWordRegexString}${spaceRegexString}*`; `(${controlWordRegexString})${spaceRegexString}*`;
const controlWordWhitespaceRegex = new RegExp( const controlSpaceRegexString = "\\\\(\n|[ \r\t]+\n?)[ \r\t]*";
`^(${controlWordRegexString})${spaceRegexString}*$`);
const combiningDiacriticalMarkString = "[\u0300-\u036f]"; const combiningDiacriticalMarkString = "[\u0300-\u036f]";
export const combiningDiacriticalMarksEndRegex: RegExp = export const combiningDiacriticalMarksEndRegex: RegExp =
new RegExp(`${combiningDiacriticalMarkString}+$`); new RegExp(`${combiningDiacriticalMarkString}+$`);
const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
`${controlSpaceRegexString}|` + // \whitespace
"([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
`${combiningDiacriticalMarkString}*` + // ...plus accents `${combiningDiacriticalMarkString}*` + // ...plus accents
"|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair "|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair
`${combiningDiacriticalMarkString}*` + // ...plus accents `${combiningDiacriticalMarkString}*` + // ...plus accents
"|\\\\verb\\*([^]).*?\\3" + // \verb* "|\\\\verb\\*([^]).*?\\4" + // \verb*
"|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred "|\\\\verb([^*a-zA-Z]).*?\\5" + // \verb unstarred
"|\\\\operatorname\\*" + // \operatorname* "|\\\\operatorname\\*" + // \operatorname*
`|${controlWordWhitespaceRegexString}` + // \macroName + spaces `|${controlWordWhitespaceRegexString}` + // \macroName + spaces
`|${controlSymbolRegexString})`; // \\, \', etc. `|${controlSymbolRegexString})`; // \\, \', etc.
@@ -94,7 +102,7 @@ export default class Lexer implements LexerInterface {
`Unexpected character: '${input[pos]}'`, `Unexpected character: '${input[pos]}'`,
new Token(input[pos], new SourceLocation(this, pos, pos + 1))); new Token(input[pos], new SourceLocation(this, pos, pos + 1)));
} }
let text = match[2] || " "; const text = match[6] || match[3] || (match[2] ? "\\ " : " ");
if (this.catcodes[text] === 14) { // comment character if (this.catcodes[text] === 14) { // comment character
const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex); const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex);
@@ -109,12 +117,6 @@ export default class Lexer implements LexerInterface {
return this.lex(); return this.lex();
} }
// Trim any trailing whitespace from control word match
const controlMatch = text.match(controlWordWhitespaceRegex);
if (controlMatch) {
text = controlMatch[1];
}
return new Token(text, new SourceLocation(this, pos, return new Token(text, new SourceLocation(this, pos,
this.tokenRegex.lastIndex)); this.tokenRegex.lastIndex));
} }

View File

@@ -678,7 +678,7 @@ describe("A text parser", function() {
const noBraceTextExpression = r`\text x`; const noBraceTextExpression = r`\text x`;
const nestedTextExpression = const nestedTextExpression =
r`\text{a {b} \blue{c} \textcolor{#fff}{x} \llap{x}}`; r`\text{a {b} \blue{c} \textcolor{#fff}{x} \llap{x}}`;
const spaceTextExpression = r`\text{ a \ }`; const spaceTextExpression = r`\text{ a \ }`;
const leadingSpaceTextExpression = r`\text {moo}`; const leadingSpaceTextExpression = r`\text {moo}`;
const badTextExpression = r`\text{a b%}`; const badTextExpression = r`\text{a b%}`;
const badFunctionExpression = r`\text{\sqrt{x}}`; const badFunctionExpression = r`\text{\sqrt{x}}`;
@@ -722,12 +722,17 @@ describe("A text parser", function() {
const parse = getParsed(spaceTextExpression)[0]; const parse = getParsed(spaceTextExpression)[0];
const group = parse.body; const group = parse.body;
expect(group.length).toEqual(4);
expect(group[0].type).toEqual("spacing"); expect(group[0].type).toEqual("spacing");
expect(group[1].type).toEqual("textord"); expect(group[1].type).toEqual("textord");
expect(group[2].type).toEqual("spacing"); expect(group[2].type).toEqual("spacing");
expect(group[3].type).toEqual("spacing"); expect(group[3].type).toEqual("spacing");
}); });
it("should handle backslash followed by newline", () => {
expect("\\text{\\ \t\r \n \t\r }").toParseLike("\\text{\\ }");
});
it("should accept math mode tokens after its argument", function() { it("should accept math mode tokens after its argument", function() {
expect(mathTokenAfterText).toParse(); expect(mathTokenAfterText).toParse();
}); });