diff --git a/src/Lexer.js b/src/Lexer.js index ae3d6c47..af74cd62 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -17,6 +17,7 @@ import SourceLocation from "./SourceLocation"; import {Token} from "./Token"; import type {LexerInterface} from "./Token"; +import type Settings from "./Settings"; /* The following tokenRegex * - matches typical whitespace (but not NBSP etc.) using its first group @@ -53,19 +54,26 @@ const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace `|${controlWordWhitespaceRegexString}` + // \macroName + spaces `|${controlSymbolRegexString})`; // \\, \', etc. -// These regexs are for matching results from tokenRegex, -// so they do have ^ markers. -export const controlWordRegex = new RegExp(`^${controlWordRegexString}`); - /** Main Lexer class */ export default class Lexer implements LexerInterface { input: string; + settings: Settings; tokenRegex: RegExp; + // category codes, only supports comment characters (14) for now + catcodes: {[string]: number}; - constructor(input: string) { + constructor(input: string, settings: Settings) { // Separate accents from characters this.input = input; + this.settings = settings; this.tokenRegex = new RegExp(tokenRegexString, 'g'); + this.catcodes = { + "%": 14, // comment character + }; + } + + setCatcode(char: string, code: number) { + this.catcodes[char] = code; } /** @@ -85,6 +93,19 @@ export default class Lexer implements LexerInterface { } let text = match[2] || " "; + if (this.catcodes[text] === 14) { // comment character + const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex); + if (nlIndex === -1) { + this.tokenRegex.lastIndex = input.length; // EOF + this.settings.reportNonstrict("commentAtEnd", + "% comment has no terminating newline; LaTeX would " + + "fail because of commenting the end of math mode (e.g. $)"); + } else { + this.tokenRegex.lastIndex = nlIndex + 1; + } + return this.lex(); + } + // Trim any trailing whitespace from control word match const controlMatch = text.match(controlWordWhitespaceRegex); if (controlMatch) { diff --git a/src/MacroExpander.js b/src/MacroExpander.js index f45fac7b..2b8b23f5 100644 --- a/src/MacroExpander.js +++ b/src/MacroExpander.js @@ -50,7 +50,7 @@ export default class MacroExpander implements MacroContextInterface { * (with existing macros etc.). */ feed(input: string) { - this.lexer = new Lexer(input); + this.lexer = new Lexer(input, this.settings); } /** @@ -314,7 +314,7 @@ export default class MacroExpander implements MacroContextInterface { ++numArgs; } } - const bodyLexer = new Lexer(expansion); + const bodyLexer = new Lexer(expansion, this.settings); const tokens = []; let tok = bodyLexer.lex(); while (tok.text !== "EOF") { @@ -343,4 +343,3 @@ export default class MacroExpander implements MacroContextInterface { implicitCommands.hasOwnProperty(name); } } - diff --git a/src/Parser.js b/src/Parser.js index 2b42d386..9043b217 100644 --- a/src/Parser.js +++ b/src/Parser.js @@ -363,8 +363,6 @@ export default class Parser { } // Put everything into an ordgroup as the superscript superscript = {type: "ordgroup", mode: this.mode, body: primes}; - } else if (lex.text === "%") { - this.consumeComment(); } else { // If it wasn't ^, _, or ', stop parsing super/subscripts break; @@ -414,6 +412,11 @@ export default class Parser { "Can't use function '" + func + "' in math mode", token); } + // hyperref package sets the catcode of % as an active character + if (funcData.argTypes && funcData.argTypes[0] === "url") { + this.gullet.lexer.setCatcode("%", 13); + } + // Consume the command token after possibly switching to the // mode specified by the function (for instant mode switching), // and then immediately switch back. @@ -555,27 +558,6 @@ export default class Parser { } } - consumeComment() { - // the newline character is normalized in Lexer, check original source - while (this.nextToken.text !== "EOF" && this.nextToken.loc && - this.nextToken.loc.getSource().indexOf("\n") === -1) { - this.consume(); - } - if (this.nextToken.text === "EOF") { - this.settings.reportNonstrict("commentAtEnd", - "% comment has no terminating newline; LaTeX would " + - "fail because of commenting the end of math mode (e.g. $)"); - } - if (this.mode === "math") { - this.consumeSpaces(); // ignore spaces in math mode - } else if (this.nextToken.loc) { // text mode - const source = this.nextToken.loc.getSource(); - if (source.indexOf("\n") === source.length - 1) { - this.consumeSpaces(); // if no space after the first newline - } - } - } - /** * Parses a group, essentially returning the string formed by the * brace-enclosed tokens plus some position information. @@ -594,6 +576,7 @@ export default class Parser { } else if (raw && nextToken.text !== "EOF" && /[^{}[\]]/.test(nextToken.text)) { // allow a single character in raw string group + this.gullet.lexer.setCatcode("%", 14); // reset the catcode of % this.consume(); return nextToken; } @@ -611,12 +594,6 @@ export default class Parser { throw new ParseError( "Unexpected end of input in " + modeName, firstToken.range(lastToken, str)); - case "%": - if (!raw) { // allow % in raw string group - this.consumeComment(); - continue; - } - break; case groupBegin: nested++; break; @@ -629,6 +606,7 @@ export default class Parser { this.consume(); } this.mode = outerMode; + this.gullet.lexer.setCatcode("%", 14); // reset the catcode of % this.expect(groupEnd); return firstToken.range(lastToken, str); } @@ -647,12 +625,8 @@ export default class Parser { const firstToken = this.nextToken; let lastToken = firstToken; let str = ""; - while (this.nextToken.text !== "EOF" && (regex.test( - str + this.nextToken.text) || this.nextToken.text === "%")) { - if (this.nextToken.text === "%") { - this.consumeComment(); - continue; - } + while (this.nextToken.text !== "EOF" && + regex.test(str + this.nextToken.text)) { lastToken = this.nextToken; str += lastToken.text; this.consume(); @@ -914,9 +888,6 @@ export default class Parser { body: arg, star, }; - } else if (text === "%") { - this.consumeComment(); - return this.parseSymbol(); } // At this point, we should have a symbol, possibly with accents. // First expand any accented base symbol according to unicodeSymbols. diff --git a/src/SourceLocation.js b/src/SourceLocation.js index bf7e5636..6fb74b6d 100644 --- a/src/SourceLocation.js +++ b/src/SourceLocation.js @@ -17,10 +17,6 @@ export default class SourceLocation { this.end = end; } - getSource(): string { - return this.lexer.input.slice(this.start, this.end); - } - /** * Merges two `SourceLocation`s from location providers, given they are * provided in order of appearance. diff --git a/src/functions/font.js b/src/functions/font.js index 9acd6a7e..f6f0743a 100644 --- a/src/functions/font.js +++ b/src/functions/font.js @@ -99,7 +99,6 @@ defineFunction({ }, handler: ({parser, funcName, breakOnTokenText}, args) => { const {mode} = parser; - parser.consumeSpaces(); const body = parser.parseExpression(true, breakOnTokenText); const style = `math${funcName.slice(1)}`; diff --git a/src/functions/sizing.js b/src/functions/sizing.js index 3847c66a..3bbb20ac 100644 --- a/src/functions/sizing.js +++ b/src/functions/sizing.js @@ -61,7 +61,6 @@ defineFunction({ allowedInText: true, }, handler: ({breakOnTokenText, funcName, parser}, args) => { - parser.consumeSpaces(); const body = parser.parseExpression(false, breakOnTokenText); return { diff --git a/src/functions/styling.js b/src/functions/styling.js index 52425fe7..f443a454 100644 --- a/src/functions/styling.js +++ b/src/functions/styling.js @@ -25,7 +25,6 @@ defineFunction({ }, handler({breakOnTokenText, funcName, parser}, args) { // parse out the implicit body - parser.consumeSpaces(); const body = parser.parseExpression(true, breakOnTokenText); // TODO: Refactor to avoid duplicating styleMap in multiple places (e.g. diff --git a/test/katex-spec.js b/test/katex-spec.js index b5804e42..bfd45919 100644 --- a/test/katex-spec.js +++ b/test/katex-spec.js @@ -1627,6 +1627,8 @@ describe("A comment parser", function() { it("should parse comments between subscript and superscript", () => { expect("x_3 %comment\n^2").toParseLike`x_3^2`; + expect("x^ %comment\n{2}").toParseLike`x^{2}`; + expect("x^ %comment\n\\frac{1}{2}").toParseLike`x^\frac{1}{2}`; }); it("should parse comments in size and color groups", () => { @@ -1635,6 +1637,24 @@ describe("A comment parser", function() { expect("\\color{#f00%red\n}").toParse(); }); + it("should parse comments before an expression", () => { + expect("%comment\n{2}").toParseLike`{2}`; + }); + + it("should parse comments before and between \\hline", () => { + expect("\\begin{matrix}a&b\\\\ %hline\n" + + "\\hline %hline\n" + + "\\hline c&d\\end{matrix}").toParse(); + }); + + it("should parse comments in the macro definition", () => { + expect("\\def\\foo{1 %}\n2}\n\\foo").toParseLike`12`; + }); + + it("should not expand nor ignore spaces after a command sequence in a comment", () => { + expect("\\def\\foo{1\n2}\nx %\\foo\n").toParseLike`x`; + }); + it("should not parse a comment without newline in strict mode", () => { expect`x%y`.not.toParse(strictSettings); expect`x%y`.toParse(nonstrictSettings); @@ -2586,9 +2606,8 @@ describe("href and url commands", function() { it("should allow single-character URLs", () => { expect`\href%end`.toParseLike("\\href{%}end"); - expect`\href %end`.toParseLike("\\href{%}end"); expect("\\url%end").toParseLike("\\url{%}end"); - expect("\\url %end").toParseLike("\\url{%}end"); + expect("\\url%%end\n").toParseLike("\\url{%}"); expect("\\url end").toParseLike("\\url{e}nd"); expect("\\url%end").toParseLike("\\url {%}end"); }); @@ -2630,6 +2649,10 @@ describe("href and url commands", function() { expect(parsed2.href).toBe(url); }); + it("should allow comments after URLs", function() { + expect("\\url{http://example.com/}%comment\n").toBuild(); + }); + it("should be marked up correctly", function() { const markup = katex.renderToString(r`\href{http://example.com/}{example here}`); expect(markup).toContain("");