From 3907545e2c709e96404ddbd8329dc1a39c1ac3b1 Mon Sep 17 00:00:00 2001 From: ylemkimon Date: Sat, 13 Oct 2018 10:21:57 +0900 Subject: [PATCH] Add `raw` string group, move comment parsing to Parser, change URL group parser (#1711) * Add raw string group * Move comment parsing to Parser * Use raw string group in URL group parser * Update types.js * Add multi-level nested url test --- src/Lexer.js | 31 ++------ src/MacroExpander.js | 4 +- src/Parser.js | 166 ++++++++++++++++++++++++++---------------- src/SourceLocation.js | 4 + src/parseNode.js | 6 ++ src/types.js | 4 +- test/katex-spec.js | 18 +++-- 7 files changed, 135 insertions(+), 98 deletions(-) diff --git a/src/Lexer.js b/src/Lexer.js index b7aba3e2..ae3d6c47 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -17,11 +17,9 @@ import SourceLocation from "./SourceLocation"; import {Token} from "./Token"; import type {LexerInterface} from "./Token"; -import type Settings from "./Settings"; /* The following tokenRegex * - matches typical whitespace (but not NBSP etc.) using its first group - * - matches comments (must have trailing newlines) * - does not match any control character \x00-\x1f except whitespace * - does not match a bare backslash * - matches any ASCII character except those just mentioned @@ -36,7 +34,6 @@ import type Settings from "./Settings"; * still reject the input. */ const spaceRegexString = "[ \r\n\t]"; -const commentRegexString = "%[^\n]*(?:\n|$)"; const controlWordRegexString = "\\\\[a-zA-Z@]+"; const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]"; const controlWordWhitespaceRegexString = @@ -46,37 +43,28 @@ const controlWordWhitespaceRegex = new RegExp( const combiningDiacriticalMarkString = "[\u0300-\u036f]"; export const combiningDiacriticalMarksEndRegex = new RegExp(`${combiningDiacriticalMarkString}+$`); -const urlFunctionRegexString = "(\\\\href|\\\\url)" + - `(?:${spaceRegexString}*\\{((?:[^{}\\\\]|\\\\[^]|{[^{}]*})*)\\}` + - `|${spaceRegexString}+([^{}])` + - `|${spaceRegexString}*([^{}a-zA-Z]))`; const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace - `(${commentRegexString}` + // comments - "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint + "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint `${combiningDiacriticalMarkString}*` + // ...plus accents "|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair `${combiningDiacriticalMarkString}*` + // ...plus accents "|\\\\verb\\*([^]).*?\\3" + // \verb* "|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred - `|${urlFunctionRegexString}` + // URL arguments `|${controlWordWhitespaceRegexString}` + // \macroName + spaces `|${controlSymbolRegexString})`; // \\, \', etc. // These regexs are for matching results from tokenRegex, // so they do have ^ markers. 
export const controlWordRegex = new RegExp(`^${controlWordRegexString}`); -export const urlFunctionRegex = new RegExp(`^${urlFunctionRegexString}`); /** Main Lexer class */ export default class Lexer implements LexerInterface { input: string; - settings: Settings; tokenRegex: RegExp; - constructor(input: string, settings: Settings) { + constructor(input: string) { // Separate accents from characters this.input = input; - this.settings = settings; this.tokenRegex = new RegExp(tokenRegexString, 'g'); } @@ -100,19 +88,10 @@ export default class Lexer implements LexerInterface { // Trim any trailing whitespace from control word match const controlMatch = text.match(controlWordWhitespaceRegex); if (controlMatch) { - text = controlMatch[1] + text.slice(controlMatch[0].length); + text = controlMatch[1]; } - if (text[0] === "%") { - if (text[text.length - 1] !== "\n") { - this.settings.reportNonstrict("commentAtEnd", - "% comment has no terminating newline; LaTeX would " + - "fail because of commenting the end of math mode (e.g. $)"); - } - return this.lex(); - } else { - return new Token(text, new SourceLocation(this, pos, - this.tokenRegex.lastIndex)); - } + return new Token(text, new SourceLocation(this, pos, + this.tokenRegex.lastIndex)); } } diff --git a/src/MacroExpander.js b/src/MacroExpander.js index 05034bab..f45fac7b 100644 --- a/src/MacroExpander.js +++ b/src/MacroExpander.js @@ -50,7 +50,7 @@ export default class MacroExpander implements MacroContextInterface { * (with existing macros etc.). */ feed(input: string) { - this.lexer = new Lexer(input, this.settings); + this.lexer = new Lexer(input); } /** @@ -314,7 +314,7 @@ export default class MacroExpander implements MacroContextInterface { ++numArgs; } } - const bodyLexer = new Lexer(expansion, this.settings); + const bodyLexer = new Lexer(expansion); const tokens = []; let tok = bodyLexer.lex(); while (tok.text !== "EOF") { diff --git a/src/Parser.js b/src/Parser.js index 9d581164..7499e56c 100644 --- a/src/Parser.js +++ b/src/Parser.js @@ -11,7 +11,7 @@ import unicodeSymbols from "./unicodeSymbols"; import utils from "./utils"; import {assertNodeType, checkNodeType} from "./parseNode"; import ParseError from "./ParseError"; -import {combiningDiacriticalMarksEndRegex, urlFunctionRegex} from "./Lexer"; +import {combiningDiacriticalMarksEndRegex} from "./Lexer"; import Settings from "./Settings"; import SourceLocation from "./SourceLocation"; import {Token} from "./Token"; @@ -405,6 +405,8 @@ export default class Parser { } // Put everything into an ordgroup as the superscript superscript = {type: "ordgroup", mode: this.mode, body: primes}; + } else if (lex.text === "%") { + this.consumeComment(); } else { // If it wasn't ^, _, or ', stop parsing super/subscripts break; @@ -658,9 +660,15 @@ export default class Parser { return this.parseSizeGroup(optional); } if (type === "url") { - throw new ParseError( - "Internal bug: 'url' arguments should be handled by Lexer", - this.nextToken); + return this.parseUrlGroup(optional); + } + if (type === "raw") { + const token = this.parseStringGroup("raw", optional, true); + return token ? newArgument({ + type: "raw", + mode: this.mode, + string: token.text, + }, token) : null; } // By the time we get here, type is one of "text" or "math". 
@@ -674,6 +682,27 @@ export default class Parser { } } + consumeComment() { + // the newline character is normalized in Lexer, check original source + while (this.nextToken.text !== "EOF" && this.nextToken.loc && + this.nextToken.loc.getSource().indexOf("\n") === -1) { + this.consume(); + } + if (this.nextToken.text === "EOF") { + this.settings.reportNonstrict("commentAtEnd", + "% comment has no terminating newline; LaTeX would " + + "fail because of commenting the end of math mode (e.g. $)"); + } + if (this.mode === "math") { + this.consumeSpaces(); // ignore spaces in math mode + } else if (this.nextToken.loc) { // text mode + const source = this.nextToken.loc.getSource(); + if (source.indexOf("\n") === source.length - 1) { + this.consumeSpaces(); // if no space after the first newline + } + } + } + /** * Parses a group, essentially returning the string formed by the * brace-enclosed tokens plus some position information. @@ -681,28 +710,53 @@ export default class Parser { parseStringGroup( modeName: ArgType, // Used to describe the mode in error messages. optional: boolean, + raw?: boolean, ): ?Token { - if (optional && this.nextToken.text !== "[") { - return null; + const groupBegin = optional ? "[" : "{"; + const groupEnd = optional ? "]" : "}"; + const nextToken = this.nextToken; + if (nextToken.text !== groupBegin) { + if (optional) { + return null; + } else if (raw && nextToken.text !== "EOF" && + /[^{}[\]]/.test(nextToken.text)) { + // allow a single character in raw string group + this.consume(); + return nextToken; + } } const outerMode = this.mode; this.mode = "text"; - this.expect(optional ? "[" : "{"); + this.expect(groupBegin); let str = ""; const firstToken = this.nextToken; + let nested = 0; // allow nested braces in raw string group let lastToken = firstToken; - while (this.nextToken.text !== (optional ? "]" : "}")) { - if (this.nextToken.text === "EOF") { - throw new ParseError( - "Unexpected end of input in " + modeName, - firstToken.range(this.nextToken, str)); + while ((raw && nested > 0) || this.nextToken.text !== groupEnd) { + switch (this.nextToken.text) { + case "EOF": + throw new ParseError( + "Unexpected end of input in " + modeName, + firstToken.range(lastToken, str)); + case "%": + if (!raw) { // allow % in raw string group + this.consumeComment(); + continue; + } + break; + case groupBegin: + nested++; + break; + case groupEnd: + nested--; + break; } lastToken = this.nextToken; str += lastToken.text; this.consume(); } this.mode = outerMode; - this.expect(optional ? "]" : "}"); + this.expect(groupEnd); return firstToken.range(lastToken, str); } @@ -720,8 +774,12 @@ export default class Parser { const firstToken = this.nextToken; let lastToken = firstToken; let str = ""; - while (this.nextToken.text !== "EOF" - && regex.test(str + this.nextToken.text)) { + while (this.nextToken.text !== "EOF" && (regex.test( + str + this.nextToken.text) || this.nextToken.text === "%")) { + if (this.nextToken.text === "%") { + this.consumeComment(); + continue; + } lastToken = this.nextToken; str += lastToken.text; this.consume(); @@ -802,6 +860,34 @@ export default class Parser { }, res); } + /** + * Parses an URL, checking escaped letters and allowed protocols. 
+     */
+    parseUrlGroup(optional: boolean): ?ParsedArg {
+        const res = this.parseStringGroup("url", optional, true); // get raw string
+        if (!res) {
+            return null;
+        }
+        // hyperref package allows backslashes alone in href, but doesn't
+        // generate valid links in such cases; we interpret this as
+        // "undefined" behaviour, and keep them as-is. Some browser will
+        // replace backslashes with forward slashes.
+        const url = res.text.replace(/\\([#$%&~_^{}])/g, '$1');
+        let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
+        protocol = (protocol != null ? protocol[1] : "_relative");
+        const allowed = this.settings.allowedProtocols;
+        if (!utils.contains(allowed, "*") &&
+            !utils.contains(allowed, protocol)) {
+            throw new ParseError(
+                `Forbidden protocol '${protocol}'`, res);
+        }
+        return newArgument({
+            type: "url",
+            mode: this.mode,
+            url,
+        }, res);
+    }
+
     /**
      * If `optional` is false or absent, this parses an ordinary group,
      * which is either a single nucleus (like "x") or an expression
@@ -913,53 +999,6 @@ export default class Parser {
             // The token will be consumed later in parseGivenFunction
             // (after possibly switching modes).
             return newFunction(nucleus);
-        } else if (/^\\(href|url)[^a-zA-Z]/.test(text)) {
-            const match = text.match(urlFunctionRegex);
-            if (!match) {
-                throw new ParseError(
-                    `Internal error: invalid URL token '${text}'`, nucleus);
-            }
-            const funcName = match[1];
-            // match[2] is the only one that can be an empty string,
-            // so it must be at the end of the following or chain:
-            const rawUrl = match[4] || match[3] || match[2];
-            // hyperref package allows backslashes alone in href, but doesn't
-            // generate valid links in such cases; we interpret this as
-            // "undefined" behaviour, and keep them as-is. Some browser will
-            // replace backslashes with forward slashes.
-            const url = rawUrl.replace(/\\([#$%&~_^{}])/g, '$1');
-            let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
-            protocol = (protocol != null ? protocol[1] : "_relative");
-            const allowed = this.settings.allowedProtocols;
-            if (!utils.contains(allowed, "*") &&
-                !utils.contains(allowed, protocol)) {
-                throw new ParseError(
-                    `Forbidden protocol '${protocol}' in ${funcName}`, nucleus);
-            }
-            const urlArg = {
-                type: "url",
-                mode: this.mode,
-                url,
-            };
-            this.consume();
-            if (funcName === "\\href") { // two arguments
-                this.consumeSpaces(); // ignore spaces between arguments
-                let description = this.parseGroupOfType("original", false);
-                if (description == null) {
-                    throw new ParseError(`${funcName} missing second argument`,
-                        nucleus);
-                }
-                if (description.type === "fn") {
-                    description = this.parseGivenFunction(description);
-                } else { // arg.type === "arg"
-                    description = description.result;
-                }
-                return newArgument(this.callFunction(
-                    funcName, [urlArg, description], []), nucleus);
-            } else { // one argument (\url)
-                return newArgument(this.callFunction(
-                    funcName, [urlArg], []), nucleus);
-            }
         } else if (/^\\verb[^a-zA-Z]/.test(text)) {
             this.consume();
             let arg = text.slice(5);
@@ -980,6 +1019,9 @@
                 body: arg,
                 star,
             }, nucleus);
+        } else if (text === "%") {
+            this.consumeComment();
+            return this.parseSymbol();
         }
         // At this point, we should have a symbol, possibly with accents.
         // First expand any accented base symbol according to unicodeSymbols.
diff --git a/src/SourceLocation.js b/src/SourceLocation.js index 6fb74b6d..bf7e5636 100644 --- a/src/SourceLocation.js +++ b/src/SourceLocation.js @@ -17,6 +17,10 @@ export default class SourceLocation { this.end = end; } + getSource(): string { + return this.lexer.input.slice(this.start, this.end); + } + /** * Merges two `SourceLocation`s from location providers, given they are * provided in order of appearance. diff --git a/src/parseNode.js b/src/parseNode.js index d49b4419..7924729d 100644 --- a/src/parseNode.js +++ b/src/parseNode.js @@ -80,6 +80,12 @@ type ParseNodeTypes = { loc?: ?SourceLocation, body: AnyParseNode[], |}, + "raw": {| + type: "raw", + mode: Mode, + loc?: ?SourceLocation, + string: string, + |}, "size": {| type: "size", mode: Mode, diff --git a/src/types.js b/src/types.js index 94baaabf..b9bd48e8 100644 --- a/src/types.js +++ b/src/types.js @@ -12,13 +12,15 @@ export type Mode = "math" | "text"; // - "color": An html color, like "#abc" or "blue" // - "url": An url string, in which "\" will be ignored // - if it precedes [#$%&~_^\{}] +// - "raw": A string, allowing single character, percent sign, +// and nested braces // - "original": The same type as the environment that the // function being parsed is in (e.g. used for the // bodies of functions like \textcolor where the // first argument is special and the second // argument is parsed normally) // - Mode: Node group parsed in given mode. -export type ArgType = "color" | "size" | "url" | "original" | Mode; +export type ArgType = "color" | "size" | "url" | "raw" | "original" | Mode; // LaTeX display style. export type StyleStr = "text" | "display" | "script" | "scriptscript"; diff --git a/test/katex-spec.js b/test/katex-spec.js index 5515b990..d2b0753f 100644 --- a/test/katex-spec.js +++ b/test/katex-spec.js @@ -1597,6 +1597,16 @@ describe("A comment parser", function() { expect("% comment 1\n% comment 2\n").toParse(); }); + it("should parse comments between subscript and superscript", () => { + expect("x_3 %comment\n^2").toParseLike`x_3^2`; + }); + + it("should parse comments in size and color groups", () => { + expect("\\kern{1 %kern\nem}").toParse(); + expect("\\kern1 %kern\nem").toParse(); + expect("\\color{#f00%red\n}").toParse(); + }); + it("should not parse a comment without newline in strict mode", () => { expect`x%y`.not.toParse(strictSettings); expect`x%y`.toParse(nonstrictSettings); @@ -2527,12 +2537,6 @@ describe("href and url commands", function() { expect("\\url%end").toParseLike("\\url {%}end"); }); - it("should detect missing second argument in \\href", () => { - expect`\href{http://example.com/}`.not.toParse(); - expect`\href%`.not.toParse(); - expect`\href %`.not.toParse(); - }); - it("should allow spaces single-character URLs", () => { expect`\href %end`.toParseLike("\\href{%}end"); expect("\\url %end").toParseLike("\\url{%}end"); @@ -2547,7 +2551,7 @@ describe("href and url commands", function() { }); it("should allow balanced braces in url", function() { - const url = "http://example.org/{too}"; + const url = "http://example.org/{{}t{oo}}"; const parsed1 = getParsed(`\\href{${url}}{\\alpha}`)[0]; expect(parsed1.href).toBe(url); const parsed2 = getParsed(`\\url{${url}}`)[0];