Add catcode to Lexer, move comment parsing back to Lexer (#1789)

* Remove redundant consumeSpaces()

- Spaces after a command sequence are already ignored in the Lexer
- parseExpression consumes spaces in math mode

* Add catcode to Lexer, move comment parsing back to Lexer

- Fix parsing a comment before a sup/subscript argument
- Fix parsing a comment before an expression
- Fix parsing a comment before or between \hline
- Fix parsing a comment in a macro definition
- Fix parsing a comment including a command sequence

* Update Lexer.js

* Update Parser.js

* catcode -> catcodes
This commit is contained in:
ylemkimon
2018-11-25 08:42:14 +09:00
committed by Kevin Barabash
parent ec6a2b4f36
commit 3dfd17d9b4
8 changed files with 62 additions and 55 deletions

View File

@@ -17,6 +17,7 @@ import SourceLocation from "./SourceLocation";
import {Token} from "./Token";
import type {LexerInterface} from "./Token";
import type Settings from "./Settings";
/* The following tokenRegex
* - matches typical whitespace (but not NBSP etc.) using its first group
@@ -53,19 +54,26 @@ const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
`|${controlWordWhitespaceRegexString}` + // \macroName + spaces
`|${controlSymbolRegexString})`; // \\, \', etc.
// These regexes are for matching results from tokenRegex,
// so they do have ^ markers.
export const controlWordRegex = new RegExp(`^${controlWordRegexString}`);
/** Main Lexer class */
export default class Lexer implements LexerInterface {
input: string;
settings: Settings;
tokenRegex: RegExp;
// category codes, only supports comment characters (14) for now
catcodes: {[string]: number};
constructor(input: string) {
constructor(input: string, settings: Settings) {
// Separate accents from characters
this.input = input;
this.settings = settings;
this.tokenRegex = new RegExp(tokenRegexString, 'g');
this.catcodes = {
"%": 14, // comment character
};
}
/**
 * Sets the category code of `char` in this lexer's `catcodes` table,
 * overwriting any previous entry for that character.
 *
 * The table starts out with "%" mapped to 14 (comment character), and the
 * lexer only skips a comment when the matched token's catcode is exactly
 * 14. Assigning any other code to "%" therefore makes it lex as an
 * ordinary token until it is set back to 14.
 *
 * @param char the token text whose category code is being changed
 * @param code the new category code (only 14 is given special treatment
 *     by the lexer code visible here)
 */
setCatcode(char: string, code: number) {
this.catcodes[char] = code;
}
/**
@@ -85,6 +93,19 @@ export default class Lexer implements LexerInterface {
}
let text = match[2] || " ";
if (this.catcodes[text] === 14) { // comment character
const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex);
if (nlIndex === -1) {
this.tokenRegex.lastIndex = input.length; // EOF
this.settings.reportNonstrict("commentAtEnd",
"% comment has no terminating newline; LaTeX would " +
"fail because of commenting the end of math mode (e.g. $)");
} else {
this.tokenRegex.lastIndex = nlIndex + 1;
}
return this.lex();
}
// Trim any trailing whitespace from control word match
const controlMatch = text.match(controlWordWhitespaceRegex);
if (controlMatch) {

View File

@@ -50,7 +50,7 @@ export default class MacroExpander implements MacroContextInterface {
* (with existing macros etc.).
*/
feed(input: string) {
this.lexer = new Lexer(input);
this.lexer = new Lexer(input, this.settings);
}
/**
@@ -314,7 +314,7 @@ export default class MacroExpander implements MacroContextInterface {
++numArgs;
}
}
const bodyLexer = new Lexer(expansion);
const bodyLexer = new Lexer(expansion, this.settings);
const tokens = [];
let tok = bodyLexer.lex();
while (tok.text !== "EOF") {
@@ -343,4 +343,3 @@ export default class MacroExpander implements MacroContextInterface {
implicitCommands.hasOwnProperty(name);
}
}

View File

@@ -363,8 +363,6 @@ export default class Parser {
}
// Put everything into an ordgroup as the superscript
superscript = {type: "ordgroup", mode: this.mode, body: primes};
} else if (lex.text === "%") {
this.consumeComment();
} else {
// If it wasn't ^, _, or ', stop parsing super/subscripts
break;
@@ -414,6 +412,11 @@ export default class Parser {
"Can't use function '" + func + "' in math mode", token);
}
// hyperref package sets the catcode of % as an active character
if (funcData.argTypes && funcData.argTypes[0] === "url") {
this.gullet.lexer.setCatcode("%", 13);
}
// Consume the command token after possibly switching to the
// mode specified by the function (for instant mode switching),
// and then immediately switch back.
@@ -555,27 +558,6 @@ export default class Parser {
}
}
/**
 * Skips the remainder of a "%" comment at the parser level.
 *
 * Tokens are consumed until the next token is EOF, has no source location,
 * or has source text containing a newline. If the comment ran to EOF, the
 * nonstrict "commentAtEnd" condition is reported through the settings.
 * Afterwards, following whitespace is consumed depending on the current
 * mode (see the inline notes below).
 */
consumeComment() {
// the newline character is normalized in Lexer, check original source
// Stop at the first token whose original source text contains "\n";
// that token itself is NOT consumed by this loop.
while (this.nextToken.text !== "EOF" && this.nextToken.loc &&
this.nextToken.loc.getSource().indexOf("\n") === -1) {
this.consume();
}
// Reaching EOF means the comment had no terminating newline.
if (this.nextToken.text === "EOF") {
this.settings.reportNonstrict("commentAtEnd",
"% comment has no terminating newline; LaTeX would " +
"fail because of commenting the end of math mode (e.g. $)");
}
if (this.mode === "math") {
this.consumeSpaces(); // ignore spaces in math mode
} else if (this.nextToken.loc) { // text mode
const source = this.nextToken.loc.getSource();
// Only swallow the whitespace when the first newline is also the last
// character of the token's source, i.e. nothing follows that newline
// inside this token.
// NOTE(review): presumably this keeps a blank line or indentation after
// the comment visible in text mode; confirm against consumeSpaces().
if (source.indexOf("\n") === source.length - 1) {
this.consumeSpaces(); // if no space after the first newline
}
}
}
/**
* Parses a group, essentially returning the string formed by the
* brace-enclosed tokens plus some position information.
@@ -594,6 +576,7 @@ export default class Parser {
} else if (raw && nextToken.text !== "EOF" &&
/[^{}[\]]/.test(nextToken.text)) {
// allow a single character in raw string group
this.gullet.lexer.setCatcode("%", 14); // reset the catcode of %
this.consume();
return nextToken;
}
@@ -611,12 +594,6 @@ export default class Parser {
throw new ParseError(
"Unexpected end of input in " + modeName,
firstToken.range(lastToken, str));
case "%":
if (!raw) { // allow % in raw string group
this.consumeComment();
continue;
}
break;
case groupBegin:
nested++;
break;
@@ -629,6 +606,7 @@ export default class Parser {
this.consume();
}
this.mode = outerMode;
this.gullet.lexer.setCatcode("%", 14); // reset the catcode of %
this.expect(groupEnd);
return firstToken.range(lastToken, str);
}
@@ -647,12 +625,8 @@ export default class Parser {
const firstToken = this.nextToken;
let lastToken = firstToken;
let str = "";
while (this.nextToken.text !== "EOF" && (regex.test(
str + this.nextToken.text) || this.nextToken.text === "%")) {
if (this.nextToken.text === "%") {
this.consumeComment();
continue;
}
while (this.nextToken.text !== "EOF" &&
regex.test(str + this.nextToken.text)) {
lastToken = this.nextToken;
str += lastToken.text;
this.consume();
@@ -914,9 +888,6 @@ export default class Parser {
body: arg,
star,
};
} else if (text === "%") {
this.consumeComment();
return this.parseSymbol();
}
// At this point, we should have a symbol, possibly with accents.
// First expand any accented base symbol according to unicodeSymbols.

View File

@@ -17,10 +17,6 @@ export default class SourceLocation {
this.end = end;
}
/**
 * Returns the portion of the lexer's input string covered by this
 * location: the characters from index `start` (inclusive) up to index
 * `end` (exclusive).
 */
getSource(): string {
return this.lexer.input.slice(this.start, this.end);
}
/**
* Merges two `SourceLocation`s from location providers, given they are
* provided in order of appearance.

View File

@@ -99,7 +99,6 @@ defineFunction({
},
handler: ({parser, funcName, breakOnTokenText}, args) => {
const {mode} = parser;
parser.consumeSpaces();
const body = parser.parseExpression(true, breakOnTokenText);
const style = `math${funcName.slice(1)}`;

View File

@@ -61,7 +61,6 @@ defineFunction({
allowedInText: true,
},
handler: ({breakOnTokenText, funcName, parser}, args) => {
parser.consumeSpaces();
const body = parser.parseExpression(false, breakOnTokenText);
return {

View File

@@ -25,7 +25,6 @@ defineFunction({
},
handler({breakOnTokenText, funcName, parser}, args) {
// parse out the implicit body
parser.consumeSpaces();
const body = parser.parseExpression(true, breakOnTokenText);
// TODO: Refactor to avoid duplicating styleMap in multiple places (e.g.

View File

@@ -1627,6 +1627,8 @@ describe("A comment parser", function() {
it("should parse comments between subscript and superscript", () => {
expect("x_3 %comment\n^2").toParseLike`x_3^2`;
expect("x^ %comment\n{2}").toParseLike`x^{2}`;
expect("x^ %comment\n\\frac{1}{2}").toParseLike`x^\frac{1}{2}`;
});
it("should parse comments in size and color groups", () => {
@@ -1635,6 +1637,24 @@ describe("A comment parser", function() {
expect("\\color{#f00%red\n}").toParse();
});
it("should parse comments before an expression", () => {
expect("%comment\n{2}").toParseLike`{2}`;
});
it("should parse comments before and between \\hline", () => {
expect("\\begin{matrix}a&b\\\\ %hline\n" +
"\\hline %hline\n" +
"\\hline c&d\\end{matrix}").toParse();
});
it("should parse comments in the macro definition", () => {
expect("\\def\\foo{1 %}\n2}\n\\foo").toParseLike`12`;
});
it("should not expand nor ignore spaces after a command sequence in a comment", () => {
expect("\\def\\foo{1\n2}\nx %\\foo\n").toParseLike`x`;
});
it("should not parse a comment without newline in strict mode", () => {
expect`x%y`.not.toParse(strictSettings);
expect`x%y`.toParse(nonstrictSettings);
@@ -2586,9 +2606,8 @@ describe("href and url commands", function() {
it("should allow single-character URLs", () => {
expect`\href%end`.toParseLike("\\href{%}end");
expect`\href %end`.toParseLike("\\href{%}end");
expect("\\url%end").toParseLike("\\url{%}end");
expect("\\url %end").toParseLike("\\url{%}end");
expect("\\url%%end\n").toParseLike("\\url{%}");
expect("\\url end").toParseLike("\\url{e}nd");
expect("\\url%end").toParseLike("\\url {%}end");
});
@@ -2630,6 +2649,10 @@ describe("href and url commands", function() {
expect(parsed2.href).toBe(url);
});
it("should allow comments after URLs", function() {
expect("\\url{http://example.com/}%comment\n").toBuild();
});
it("should be marked up correctly", function() {
const markup = katex.renderToString(r`\href{http://example.com/}{example here}`);
expect(markup).toContain("<a href=\"http://example.com/\">");