mirror of
https://github.com/Smaug123/KaTeX
synced 2025-10-09 04:58:40 +00:00
Add catcode to Lexer, move comment parsing back to Lexer (#1789)
* Remove redundant consumeSpaces()
  - Spaces after command sequence are ignored in Lexer
  - parseExpression consumes spaces in the math mode
* Add catcode to Lexer, move comment parsing back to Lexer
  - Fix parsing a comment before a sup/subscript argument
  - Fix parsing a comment before an expression
  - Fix parsing a comment before or between \hline
  - Fix parsing a comment in the macro definition
  - Fix parsing a comment including a command sequence
* Update Lexer.js
* Update Parser.js
* catcode -> catcodes
This commit is contained in:
committed by
Kevin Barabash
parent
ec6a2b4f36
commit
3dfd17d9b4
31
src/Lexer.js
31
src/Lexer.js
@@ -17,6 +17,7 @@ import SourceLocation from "./SourceLocation";
|
||||
import {Token} from "./Token";
|
||||
|
||||
import type {LexerInterface} from "./Token";
|
||||
import type Settings from "./Settings";
|
||||
|
||||
/* The following tokenRegex
|
||||
* - matches typical whitespace (but not NBSP etc.) using its first group
|
||||
@@ -53,19 +54,26 @@ const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
|
||||
`|${controlWordWhitespaceRegexString}` + // \macroName + spaces
|
||||
`|${controlSymbolRegexString})`; // \\, \', etc.
|
||||
|
||||
// These regexes are for matching results from tokenRegex,
|
||||
// so they do have ^ markers.
|
||||
export const controlWordRegex = new RegExp(`^${controlWordRegexString}`);
|
||||
|
||||
/** Main Lexer class */
|
||||
export default class Lexer implements LexerInterface {
|
||||
input: string;
|
||||
settings: Settings;
|
||||
tokenRegex: RegExp;
|
||||
// category codes, only supports comment characters (14) for now
|
||||
catcodes: {[string]: number};
|
||||
|
||||
constructor(input: string) {
|
||||
constructor(input: string, settings: Settings) {
|
||||
// Separate accents from characters
|
||||
this.input = input;
|
||||
this.settings = settings;
|
||||
this.tokenRegex = new RegExp(tokenRegexString, 'g');
|
||||
this.catcodes = {
|
||||
"%": 14, // comment character
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Record `code` as the category code of `char` in `this.catcodes`,
 * replacing any earlier entry for that character. The lexer treats a
 * token whose catcode is 14 as the start of a comment.
 */
setCatcode(char: string, code: number) {
|
||||
this.catcodes[char] = code;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -85,6 +93,19 @@ export default class Lexer implements LexerInterface {
|
||||
}
|
||||
let text = match[2] || " ";
|
||||
|
||||
if (this.catcodes[text] === 14) { // comment character
|
||||
const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex);
|
||||
if (nlIndex === -1) {
|
||||
this.tokenRegex.lastIndex = input.length; // EOF
|
||||
this.settings.reportNonstrict("commentAtEnd",
|
||||
"% comment has no terminating newline; LaTeX would " +
|
||||
"fail because of commenting the end of math mode (e.g. $)");
|
||||
} else {
|
||||
this.tokenRegex.lastIndex = nlIndex + 1;
|
||||
}
|
||||
return this.lex();
|
||||
}
|
||||
|
||||
// Trim any trailing whitespace from control word match
|
||||
const controlMatch = text.match(controlWordWhitespaceRegex);
|
||||
if (controlMatch) {
|
||||
|
@@ -50,7 +50,7 @@ export default class MacroExpander implements MacroContextInterface {
|
||||
* (with existing macros etc.).
|
||||
*/
|
||||
feed(input: string) {
|
||||
this.lexer = new Lexer(input);
|
||||
this.lexer = new Lexer(input, this.settings);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -314,7 +314,7 @@ export default class MacroExpander implements MacroContextInterface {
|
||||
++numArgs;
|
||||
}
|
||||
}
|
||||
const bodyLexer = new Lexer(expansion);
|
||||
const bodyLexer = new Lexer(expansion, this.settings);
|
||||
const tokens = [];
|
||||
let tok = bodyLexer.lex();
|
||||
while (tok.text !== "EOF") {
|
||||
@@ -343,4 +343,3 @@ export default class MacroExpander implements MacroContextInterface {
|
||||
implicitCommands.hasOwnProperty(name);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -363,8 +363,6 @@ export default class Parser {
|
||||
}
|
||||
// Put everything into an ordgroup as the superscript
|
||||
superscript = {type: "ordgroup", mode: this.mode, body: primes};
|
||||
} else if (lex.text === "%") {
|
||||
this.consumeComment();
|
||||
} else {
|
||||
// If it wasn't ^, _, or ', stop parsing super/subscripts
|
||||
break;
|
||||
@@ -414,6 +412,11 @@ export default class Parser {
|
||||
"Can't use function '" + func + "' in math mode", token);
|
||||
}
|
||||
|
||||
// hyperref package sets the catcode of % as an active character
|
||||
if (funcData.argTypes && funcData.argTypes[0] === "url") {
|
||||
this.gullet.lexer.setCatcode("%", 13);
|
||||
}
|
||||
|
||||
// Consume the command token after possibly switching to the
|
||||
// mode specified by the function (for instant mode switching),
|
||||
// and then immediately switch back.
|
||||
@@ -555,27 +558,6 @@ export default class Parser {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Skip the remainder of a comment at the parser level.
 *
 * Tokens are consumed until the next token is EOF, has no `loc`, or has
 * source text containing a newline. Stopping at EOF is reported through
 * `settings.reportNonstrict("commentAtEnd", ...)`. Afterwards spaces are
 * consumed unconditionally in math mode; otherwise they are consumed only
 * when the first newline in the next token's source text is also its last
 * character.
 */
consumeComment() {
|
||||
// the newline character is normalized in Lexer, check original source
|
||||
while (this.nextToken.text !== "EOF" && this.nextToken.loc &&
|
||||
this.nextToken.loc.getSource().indexOf("\n") === -1) {
|
||||
this.consume();
|
||||
}
|
||||
if (this.nextToken.text === "EOF") {
|
||||
this.settings.reportNonstrict("commentAtEnd",
|
||||
"% comment has no terminating newline; LaTeX would " +
|
||||
"fail because of commenting the end of math mode (e.g. $)");
|
||||
}
|
||||
if (this.mode === "math") {
|
||||
this.consumeSpaces(); // ignore spaces in math mode
|
||||
|
||||
} else if (this.nextToken.loc) { // text mode
|
||||
const source = this.nextToken.loc.getSource();
|
||||
if (source.indexOf("\n") === source.length - 1) {
|
||||
this.consumeSpaces(); // if no space after the first newline
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a group, essentially returning the string formed by the
|
||||
* brace-enclosed tokens plus some position information.
|
||||
@@ -594,6 +576,7 @@ export default class Parser {
|
||||
} else if (raw && nextToken.text !== "EOF" &&
|
||||
/[^{}[\]]/.test(nextToken.text)) {
|
||||
// allow a single character in raw string group
|
||||
this.gullet.lexer.setCatcode("%", 14); // reset the catcode of %
|
||||
this.consume();
|
||||
return nextToken;
|
||||
}
|
||||
@@ -611,12 +594,6 @@ export default class Parser {
|
||||
throw new ParseError(
|
||||
"Unexpected end of input in " + modeName,
|
||||
firstToken.range(lastToken, str));
|
||||
case "%":
|
||||
if (!raw) { // allow % in raw string group
|
||||
this.consumeComment();
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case groupBegin:
|
||||
nested++;
|
||||
break;
|
||||
@@ -629,6 +606,7 @@ export default class Parser {
|
||||
this.consume();
|
||||
}
|
||||
this.mode = outerMode;
|
||||
this.gullet.lexer.setCatcode("%", 14); // reset the catcode of %
|
||||
this.expect(groupEnd);
|
||||
return firstToken.range(lastToken, str);
|
||||
}
|
||||
@@ -647,12 +625,8 @@ export default class Parser {
|
||||
const firstToken = this.nextToken;
|
||||
let lastToken = firstToken;
|
||||
let str = "";
|
||||
while (this.nextToken.text !== "EOF" && (regex.test(
|
||||
str + this.nextToken.text) || this.nextToken.text === "%")) {
|
||||
if (this.nextToken.text === "%") {
|
||||
this.consumeComment();
|
||||
continue;
|
||||
}
|
||||
while (this.nextToken.text !== "EOF" &&
|
||||
regex.test(str + this.nextToken.text)) {
|
||||
lastToken = this.nextToken;
|
||||
str += lastToken.text;
|
||||
this.consume();
|
||||
@@ -914,9 +888,6 @@ export default class Parser {
|
||||
body: arg,
|
||||
star,
|
||||
};
|
||||
} else if (text === "%") {
|
||||
this.consumeComment();
|
||||
return this.parseSymbol();
|
||||
}
|
||||
// At this point, we should have a symbol, possibly with accents.
|
||||
// First expand any accented base symbol according to unicodeSymbols.
|
||||
|
@@ -17,10 +17,6 @@ export default class SourceLocation {
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
/**
 * Return the portion of the lexer's input string that this location
 * covers, i.e. `input.slice(start, end)` (`end` is exclusive).
 */
getSource(): string {
|
||||
return this.lexer.input.slice(this.start, this.end);
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges two `SourceLocation`s from location providers, given they are
|
||||
* provided in order of appearance.
|
||||
|
@@ -99,7 +99,6 @@ defineFunction({
|
||||
},
|
||||
handler: ({parser, funcName, breakOnTokenText}, args) => {
|
||||
const {mode} = parser;
|
||||
parser.consumeSpaces();
|
||||
const body = parser.parseExpression(true, breakOnTokenText);
|
||||
const style = `math${funcName.slice(1)}`;
|
||||
|
||||
|
@@ -61,7 +61,6 @@ defineFunction({
|
||||
allowedInText: true,
|
||||
},
|
||||
handler: ({breakOnTokenText, funcName, parser}, args) => {
|
||||
parser.consumeSpaces();
|
||||
const body = parser.parseExpression(false, breakOnTokenText);
|
||||
|
||||
return {
|
||||
|
@@ -25,7 +25,6 @@ defineFunction({
|
||||
},
|
||||
handler({breakOnTokenText, funcName, parser}, args) {
|
||||
// parse out the implicit body
|
||||
parser.consumeSpaces();
|
||||
const body = parser.parseExpression(true, breakOnTokenText);
|
||||
|
||||
// TODO: Refactor to avoid duplicating styleMap in multiple places (e.g.
|
||||
|
@@ -1627,6 +1627,8 @@ describe("A comment parser", function() {
|
||||
|
||||
it("should parse comments between subscript and superscript", () => {
|
||||
expect("x_3 %comment\n^2").toParseLike`x_3^2`;
|
||||
expect("x^ %comment\n{2}").toParseLike`x^{2}`;
|
||||
expect("x^ %comment\n\\frac{1}{2}").toParseLike`x^\frac{1}{2}`;
|
||||
});
|
||||
|
||||
it("should parse comments in size and color groups", () => {
|
||||
@@ -1635,6 +1637,24 @@ describe("A comment parser", function() {
|
||||
expect("\\color{#f00%red\n}").toParse();
|
||||
});
|
||||
|
||||
it("should parse comments before an expression", () => {
|
||||
expect("%comment\n{2}").toParseLike`{2}`;
|
||||
});
|
||||
|
||||
it("should parse comments before and between \\hline", () => {
|
||||
expect("\\begin{matrix}a&b\\\\ %hline\n" +
|
||||
"\\hline %hline\n" +
|
||||
"\\hline c&d\\end{matrix}").toParse();
|
||||
});
|
||||
|
||||
it("should parse comments in the macro definition", () => {
|
||||
expect("\\def\\foo{1 %}\n2}\n\\foo").toParseLike`12`;
|
||||
});
|
||||
|
||||
it("should not expand nor ignore spaces after a command sequence in a comment", () => {
|
||||
expect("\\def\\foo{1\n2}\nx %\\foo\n").toParseLike`x`;
|
||||
});
|
||||
|
||||
it("should not parse a comment without newline in strict mode", () => {
|
||||
expect`x%y`.not.toParse(strictSettings);
|
||||
expect`x%y`.toParse(nonstrictSettings);
|
||||
@@ -2586,9 +2606,8 @@ describe("href and url commands", function() {
|
||||
|
||||
it("should allow single-character URLs", () => {
|
||||
expect`\href%end`.toParseLike("\\href{%}end");
|
||||
expect`\href %end`.toParseLike("\\href{%}end");
|
||||
expect("\\url%end").toParseLike("\\url{%}end");
|
||||
expect("\\url %end").toParseLike("\\url{%}end");
|
||||
expect("\\url%%end\n").toParseLike("\\url{%}");
|
||||
expect("\\url end").toParseLike("\\url{e}nd");
|
||||
expect("\\url%end").toParseLike("\\url {%}end");
|
||||
});
|
||||
@@ -2630,6 +2649,10 @@ describe("href and url commands", function() {
|
||||
expect(parsed2.href).toBe(url);
|
||||
});
|
||||
|
||||
it("should allow comments after URLs", function() {
|
||||
expect("\\url{http://example.com/}%comment\n").toBuild();
|
||||
});
|
||||
|
||||
it("should be marked up correctly", function() {
|
||||
const markup = katex.renderToString(r`\href{http://example.com/}{example here}`);
|
||||
expect(markup).toContain("<a href=\"http://example.com/\">");
|
||||
|
Reference in New Issue
Block a user