mirror of
https://github.com/Smaug123/KaTeX
synced 2025-10-06 19:58:40 +00:00
fix: Correctly parse \ followed by whitespace (#2877)
* fix: Correctly parse \ followed by whitespace

  LaTeX parses `\` followed by whitespace including up to one newline as equivalent to `\ `. (With multiple newlines, you get paragraph breaks.)

  Fix #2860.

* Improve comments
* Avoid second RegExp match in control words
* Document capturing groups

Co-authored-by: Ron Kok <ronkok@comcast.net>
This commit is contained in:
30
src/Lexer.js
30
src/Lexer.js
@@ -28,8 +28,16 @@ import type Settings from "./Settings";
|
|||||||
* - does not match bare surrogate code units
|
* - does not match bare surrogate code units
|
||||||
* - matches any BMP character except for those just described
|
* - matches any BMP character except for those just described
|
||||||
* - matches any valid Unicode surrogate pair
|
* - matches any valid Unicode surrogate pair
|
||||||
* - matches a backslash followed by one or more letters
|
* - matches a backslash followed by one or more whitespace characters
|
||||||
* - matches a backslash followed by any BMP character, including newline
|
* - matches a backslash followed by one or more letters then whitespace
|
||||||
|
* - matches a backslash followed by any BMP character
|
||||||
|
* Capturing groups:
|
||||||
|
* [1] regular whitespace
|
||||||
|
* [2] backslash followed by whitespace
|
||||||
|
* [3] anything else, which may include:
|
||||||
|
* [4] left character of \verb*
|
||||||
|
* [5] left character of \verb
|
||||||
|
* [6] backslash followed by word, excluding any trailing whitespace
|
||||||
* Just because the Lexer matches something doesn't mean it's valid input:
|
* Just because the Lexer matches something doesn't mean it's valid input:
|
||||||
* If there is no matching function or symbol definition, the Parser will
|
* If there is no matching function or symbol definition, the Parser will
|
||||||
* still reject the input.
|
* still reject the input.
|
||||||
@@ -38,19 +46,19 @@ const spaceRegexString = "[ \r\n\t]";
|
|||||||
const controlWordRegexString = "\\\\[a-zA-Z@]+";
|
const controlWordRegexString = "\\\\[a-zA-Z@]+";
|
||||||
const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";
|
const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";
|
||||||
const controlWordWhitespaceRegexString =
|
const controlWordWhitespaceRegexString =
|
||||||
`${controlWordRegexString}${spaceRegexString}*`;
|
`(${controlWordRegexString})${spaceRegexString}*`;
|
||||||
const controlWordWhitespaceRegex = new RegExp(
|
const controlSpaceRegexString = "\\\\(\n|[ \r\t]+\n?)[ \r\t]*";
|
||||||
`^(${controlWordRegexString})${spaceRegexString}*$`);
|
|
||||||
const combiningDiacriticalMarkString = "[\u0300-\u036f]";
|
const combiningDiacriticalMarkString = "[\u0300-\u036f]";
|
||||||
export const combiningDiacriticalMarksEndRegex: RegExp =
|
export const combiningDiacriticalMarksEndRegex: RegExp =
|
||||||
new RegExp(`${combiningDiacriticalMarkString}+$`);
|
new RegExp(`${combiningDiacriticalMarkString}+$`);
|
||||||
const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
|
const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
|
||||||
|
`${controlSpaceRegexString}|` + // \whitespace
|
||||||
"([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
|
"([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
|
||||||
`${combiningDiacriticalMarkString}*` + // ...plus accents
|
`${combiningDiacriticalMarkString}*` + // ...plus accents
|
||||||
"|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair
|
"|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair
|
||||||
`${combiningDiacriticalMarkString}*` + // ...plus accents
|
`${combiningDiacriticalMarkString}*` + // ...plus accents
|
||||||
"|\\\\verb\\*([^]).*?\\3" + // \verb*
|
"|\\\\verb\\*([^]).*?\\4" + // \verb*
|
||||||
"|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred
|
"|\\\\verb([^*a-zA-Z]).*?\\5" + // \verb unstarred
|
||||||
"|\\\\operatorname\\*" + // \operatorname*
|
"|\\\\operatorname\\*" + // \operatorname*
|
||||||
`|${controlWordWhitespaceRegexString}` + // \macroName + spaces
|
`|${controlWordWhitespaceRegexString}` + // \macroName + spaces
|
||||||
`|${controlSymbolRegexString})`; // \\, \', etc.
|
`|${controlSymbolRegexString})`; // \\, \', etc.
|
||||||
@@ -94,7 +102,7 @@ export default class Lexer implements LexerInterface {
|
|||||||
`Unexpected character: '${input[pos]}'`,
|
`Unexpected character: '${input[pos]}'`,
|
||||||
new Token(input[pos], new SourceLocation(this, pos, pos + 1)));
|
new Token(input[pos], new SourceLocation(this, pos, pos + 1)));
|
||||||
}
|
}
|
||||||
let text = match[2] || " ";
|
const text = match[6] || match[3] || (match[2] ? "\\ " : " ");
|
||||||
|
|
||||||
if (this.catcodes[text] === 14) { // comment character
|
if (this.catcodes[text] === 14) { // comment character
|
||||||
const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex);
|
const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex);
|
||||||
@@ -109,12 +117,6 @@ export default class Lexer implements LexerInterface {
|
|||||||
return this.lex();
|
return this.lex();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Trim any trailing whitespace from control word match
|
|
||||||
const controlMatch = text.match(controlWordWhitespaceRegex);
|
|
||||||
if (controlMatch) {
|
|
||||||
text = controlMatch[1];
|
|
||||||
}
|
|
||||||
|
|
||||||
return new Token(text, new SourceLocation(this, pos,
|
return new Token(text, new SourceLocation(this, pos,
|
||||||
this.tokenRegex.lastIndex));
|
this.tokenRegex.lastIndex));
|
||||||
}
|
}
|
||||||
|
@@ -678,7 +678,7 @@ describe("A text parser", function() {
|
|||||||
const noBraceTextExpression = r`\text x`;
|
const noBraceTextExpression = r`\text x`;
|
||||||
const nestedTextExpression =
|
const nestedTextExpression =
|
||||||
r`\text{a {b} \blue{c} \textcolor{#fff}{x} \llap{x}}`;
|
r`\text{a {b} \blue{c} \textcolor{#fff}{x} \llap{x}}`;
|
||||||
const spaceTextExpression = r`\text{ a \ }`;
|
const spaceTextExpression = r`\text{ a \ }`;
|
||||||
const leadingSpaceTextExpression = r`\text {moo}`;
|
const leadingSpaceTextExpression = r`\text {moo}`;
|
||||||
const badTextExpression = r`\text{a b%}`;
|
const badTextExpression = r`\text{a b%}`;
|
||||||
const badFunctionExpression = r`\text{\sqrt{x}}`;
|
const badFunctionExpression = r`\text{\sqrt{x}}`;
|
||||||
@@ -722,12 +722,17 @@ describe("A text parser", function() {
|
|||||||
const parse = getParsed(spaceTextExpression)[0];
|
const parse = getParsed(spaceTextExpression)[0];
|
||||||
const group = parse.body;
|
const group = parse.body;
|
||||||
|
|
||||||
|
expect(group.length).toEqual(4);
|
||||||
expect(group[0].type).toEqual("spacing");
|
expect(group[0].type).toEqual("spacing");
|
||||||
expect(group[1].type).toEqual("textord");
|
expect(group[1].type).toEqual("textord");
|
||||||
expect(group[2].type).toEqual("spacing");
|
expect(group[2].type).toEqual("spacing");
|
||||||
expect(group[3].type).toEqual("spacing");
|
expect(group[3].type).toEqual("spacing");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("should handle backslash followed by newline", () => {
|
||||||
|
expect("\\text{\\ \t\r \n \t\r }").toParseLike("\\text{\\ }");
|
||||||
|
});
|
||||||
|
|
||||||
it("should accept math mode tokens after its argument", function() {
|
it("should accept math mode tokens after its argument", function() {
|
||||||
expect(mathTokenAfterText).toParse();
|
expect(mathTokenAfterText).toParse();
|
||||||
});
|
});
|
||||||
|
Reference in New Issue
Block a user