diff --git a/docs/options.md b/docs/options.md index 3e8c9ca9..33b061f5 100644 --- a/docs/options.md +++ b/docs/options.md @@ -18,6 +18,9 @@ You can provide an object of options as the last argument to [`katex.render` and incorrect (especially in terms of vertical heights). - `"unicodeTextInMathMode"`: Use of Unicode text characters in math mode. - `"mathVsTextUnits"`: Mismatch of math vs. text commands and units/mode. + - `"commentAtEnd"`: Use of `%` comment without a terminating newline. + LaTeX would thereby comment out the end of math mode (e.g. `$`), + causing an error. A second category of `errorCode`s never throw errors, but their strictness affects the behavior of KaTeX: - `"newLineInDisplayMode"`: Use of `\\` or `\newline` in display mode diff --git a/docs/supported.md b/docs/supported.md index 14408ad9..7bc52d13 100644 --- a/docs/supported.md +++ b/docs/supported.md @@ -86,7 +86,10 @@ The `{array}` environment does not yet support `\cline` or `\multicolumn`. ## HTML -$\href{https://khan.github.io/KaTeX/}{KaTeX}$ `\href{https://khan.github.io/KaTeX/}{KaTeX}` +||| +|:----------------|:-------------------| +| $\href{https://khan.github.io/KaTeX/}{KaTeX}$ | `\href{https://khan.github.io/KaTeX/}{KaTeX}` | +| $\url{https://khan.github.io/KaTeX/}$ | `\url{https://khan.github.io/KaTeX/}` | ## Letters and Unicode diff --git a/src/Lexer.js b/src/Lexer.js index 87e1504d..54611f10 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -15,6 +15,7 @@ import ParseError from "./ParseError"; import SourceLocation from "./SourceLocation"; import {LexerInterface, Token} from "./Token"; +import type Settings from "./Settings"; /* The following tokenRegex * - matches typical whitespace (but not NBSP etc.) using its first group @@ -33,7 +34,7 @@ import {LexerInterface, Token} from "./Token"; * still reject the input. 
*/ const spaceRegexString = "[ \r\n\t]"; -const commentRegexString = "%[^\n]*[\n]"; +const commentRegexString = "%[^\n]*(?:\n|$)"; const controlWordRegexString = "\\\\[a-zA-Z@]+"; const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]"; const controlWordWhitespaceRegexString = @@ -43,6 +44,10 @@ const controlWordWhitespaceRegex = new RegExp( const combiningDiacriticalMarkString = "[\u0300-\u036f]"; export const combiningDiacriticalMarksEndRegex = new RegExp(`${combiningDiacriticalMarkString}+$`); +const urlFunctionRegexString = "(\\\\href|\\\\url)" + + `(?:${spaceRegexString}*\\{((?:[^{}\\\\]|\\\\[^]|{[^{}]*})*)\\}` + + `|${spaceRegexString}+([^{}])` + + `|${spaceRegexString}*([^{}a-zA-Z]))`; const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace `(${commentRegexString}` + // comments "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint @@ -51,22 +56,25 @@ const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace `${combiningDiacriticalMarkString}*` + // ...plus accents "|\\\\verb\\*([^]).*?\\3" + // \verb* "|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred + `|${urlFunctionRegexString}` + // URL arguments `|${controlWordWhitespaceRegexString}` + // \macroName + spaces `|${controlSymbolRegexString})`; // \\, \', etc. // These regexs are for matching results from tokenRegex, // so they do have ^ markers. 
export const controlWordRegex = new RegExp(`^${controlWordRegexString}`); -const commentRegex = new RegExp(`^${commentRegexString}`); +export const urlFunctionRegex = new RegExp(`^${urlFunctionRegexString}`); /** Main Lexer class */ export default class Lexer implements LexerInterface { input: string; + settings: Settings; tokenRegex: RegExp; - constructor(input: string) { + constructor(input: string, settings: Settings) { // Separate accents from characters this.input = input; + this.settings = settings; this.tokenRegex = new RegExp(tokenRegexString, 'g'); } @@ -90,10 +98,15 @@ export default class Lexer implements LexerInterface { // Trim any trailing whitespace from control word match const controlMatch = text.match(controlWordWhitespaceRegex); if (controlMatch) { - text = controlMatch[1]; + text = controlMatch[1] + text.slice(controlMatch[0].length); } - if (commentRegex.test(text)) { + if (text[0] === "%") { + if (text[text.length - 1] !== "\n") { + this.settings.reportNonstrict("commentAtEnd", + "% comment has no terminating newline; LaTeX would " + + "fail because of commenting the end of math mode (e.g. 
$)"); + } return this.lex(); } else { return new Token(text, new SourceLocation(this, pos, diff --git a/src/MacroExpander.js b/src/MacroExpander.js index acf2fbc1..05034bab 100644 --- a/src/MacroExpander.js +++ b/src/MacroExpander.js @@ -28,17 +28,19 @@ export const implicitCommands = { }; export default class MacroExpander implements MacroContextInterface { - maxExpand: number; + settings: Settings; + expansionCount: number; lexer: Lexer; macros: Namespace; stack: Token[]; mode: Mode; constructor(input: string, settings: Settings, mode: Mode) { + this.settings = settings; + this.expansionCount = 0; this.feed(input); // Make new global namespace this.macros = new Namespace(builtinMacros, settings.macros); - this.maxExpand = settings.maxExpand; this.mode = mode; this.stack = []; // contains tokens in REVERSE order } @@ -48,7 +50,7 @@ export default class MacroExpander implements MacroContextInterface { * (with existing macros etc.). */ feed(input: string) { - this.lexer = new Lexer(input); + this.lexer = new Lexer(input, this.settings); } /** @@ -188,12 +190,10 @@ export default class MacroExpander implements MacroContextInterface { this.pushToken(topToken); return topToken; } - if (this.maxExpand !== Infinity) { - this.maxExpand--; - if (this.maxExpand < 0) { - throw new ParseError("Too many expansions: infinite loop or " + - "need to increase maxExpand setting"); - } + this.expansionCount++; + if (this.expansionCount > this.settings.maxExpand) { + throw new ParseError("Too many expansions: infinite loop or " + + "need to increase maxExpand setting"); } let tokens = expansion.tokens; if (expansion.numArgs) { @@ -314,7 +314,7 @@ export default class MacroExpander implements MacroContextInterface { ++numArgs; } } - const bodyLexer = new Lexer(expansion); + const bodyLexer = new Lexer(expansion, this.settings); const tokens = []; let tok = bodyLexer.lex(); while (tok.text !== "EOF") { diff --git a/src/Parser.js b/src/Parser.js index 1dc96a4c..4e6552d6 100644 --- 
a/src/Parser.js +++ b/src/Parser.js @@ -11,7 +11,7 @@ import unicodeSymbols from "./unicodeSymbols"; import utils from "./utils"; import ParseNode, {assertNodeType, checkNodeType} from "./ParseNode"; import ParseError from "./ParseError"; -import {combiningDiacriticalMarksEndRegex} from "./Lexer.js"; +import {combiningDiacriticalMarksEndRegex, urlFunctionRegex} from "./Lexer.js"; import Settings from "./Settings"; import {Token} from "./Token"; import type {AnyParseNode} from "./ParseNode"; @@ -28,7 +28,7 @@ import type {EnvSpec} from "./defineEnvironment"; * * The main functions (the `.parse...` ones) take a position in the current * parse string to parse tokens from. The lexer (found in Lexer.js, stored at - * this.lexer) also supports pulling out tokens at arbitrary places. When + * this.gullet.lexer) also supports pulling out tokens at arbitrary places. When * individual tokens are needed at a position, the lexer is called to pull out a * token, which is then used. * @@ -660,7 +660,9 @@ export default class Parser { return this.parseSizeGroup(optional); } if (type === "url") { - return this.parseUrlGroup(optional); + throw new ParseError( + "Internal bug: 'url' arguments should be handled by Lexer", + this.nextToken); } // By the time we get here, type is one of "text" or "math". @@ -706,51 +708,6 @@ export default class Parser { return firstToken.range(lastToken, str); } - /** - * Parses a group, essentially returning the string formed by the - * brace-enclosed tokens plus some position information, possibly - * with nested braces. - */ - parseStringGroupWithBalancedBraces( - modeName: ArgType, // Used to describe the mode in error messages. - optional: boolean, - ): ?Token { - if (optional && this.nextToken.text !== "[") { - return null; - } - const outerMode = this.mode; - this.mode = "text"; - this.expect(optional ? 
"[" : "{"); - let str = ""; - let nest = 0; - const firstToken = this.nextToken; - let lastToken = firstToken; - while (nest > 0 || this.nextToken.text !== (optional ? "]" : "}")) { - if (this.nextToken.text === "EOF") { - throw new ParseError( - "Unexpected end of input in " + modeName, - firstToken.range(this.nextToken, str)); - } - lastToken = this.nextToken; - str += lastToken.text; - if (lastToken.text === "{") { - nest += 1; - } else if (lastToken.text === "}") { - if (nest <= 0) { - throw new ParseError( - "Unbalanced brace of input in " + modeName, - firstToken.range(this.nextToken, str)); - } else { - nest -= 1; - } - } - this.consume(); - } - this.mode = outerMode; - this.expect(optional ? "]" : "}"); - return firstToken.range(lastToken, str); - } - /** * Parses a regex-delimited group: the largest sequence of tokens * whose concatenated strings match `regex`. Returns the string * formed by the matched tokens. */ @@ -795,32 +752,6 @@ export default class Parser { return newArgument(new ParseNode("color-token", match[0], this.mode), res); } - /** - * Parses a url string. - */ - parseUrlGroup(optional: boolean): ?ParsedArg { - const res = this.parseStringGroupWithBalancedBraces("url", optional); - if (!res) { - return null; - } - const raw = res.text; - // hyperref package allows backslashes alone in href, but doesn't generate - // valid links in such cases; we interpret this as "undefiend" behaviour, - // and keep them as-is. Some browser will replace backslashes with - // forward slashes. - const url = raw.replace(/\\([#$%&~_^{}])/g, '$1'); - const protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url); - const allowed = this.settings.allowedProtocols; - if (!utils.contains(allowed, "*") && !utils.contains(allowed, - protocol != null ? 
protocol[1] : "_relative")) { - throw new ParseError('Not allowed \\href protocol', res); - } - return newArgument(new ParseNode("url", { - type: "url", - value: url, - }, this.mode), res); - } - /** * Parses a size specification, consisting of magnitude and unit. */ @@ -957,6 +888,52 @@ export default class Parser { // The token will be consumed later in parseGivenFunction // (after possibly switching modes). return newFunction(nucleus); + } else if (/^\\(href|url)[^a-zA-Z]/.test(text)) { + const match = text.match(urlFunctionRegex); + if (!match) { + throw new ParseError( + `Internal error: invalid URL token '${text}'`, nucleus); + } + const funcName = match[1]; + // match[2] is the only one that can be an empty string, + // so it must be at the end of the following or chain: + const rawUrl = match[4] || match[3] || match[2]; + // hyperref package allows backslashes alone in href, but doesn't + // generate valid links in such cases; we interpret this as + // "undefined" behaviour, and keep them as-is. Some browsers will + // replace backslashes with forward slashes. + const url = rawUrl.replace(/\\([#$%&~_^{}])/g, '$1'); + let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url); + protocol = (protocol != null ? 
protocol[1] : "_relative"); + const allowed = this.settings.allowedProtocols; + if (!utils.contains(allowed, "*") && + !utils.contains(allowed, protocol)) { + throw new ParseError( + `Forbidden protocol '${protocol}' in ${funcName}`, nucleus); + } + const urlArg = new ParseNode("url", { + type: "url", + value: url, + }, this.mode); + this.consume(); + if (funcName === "\\href") { // two arguments + this.consumeSpaces(); // ignore spaces between arguments + let description = this.parseGroupOfType("original", false); + if (description == null) { + throw new ParseError(`${funcName} missing second argument`, + nucleus); + } + if (description.type === "fn") { + description = this.parseGivenFunction(description); + } else { // arg.type === "arg" + description = description.result; + } + return newArgument(this.callFunction( + funcName, [urlArg, description], []), nucleus); + } else { // one argument (\url) + return newArgument(this.callFunction( + funcName, [urlArg], []), nucleus); + } } else if (/^\\verb[^a-zA-Z]/.test(text)) { this.consume(); let arg = text.slice(5); diff --git a/src/functions/href.js b/src/functions/href.js index 1d4e2995..23c096e6 100644 --- a/src/functions/href.js +++ b/src/functions/href.js @@ -14,6 +14,7 @@ defineFunction({ props: { numArgs: 2, argTypes: ["url", "original"], + allowedInText: true, }, handler: ({parser}, args) => { const body = args[1]; @@ -41,3 +42,34 @@ defineFunction({ return math; }, }); + +defineFunction({ + type: "href", + names: ["\\url"], + props: { + numArgs: 1, + argTypes: ["url"], + allowedInText: true, + }, + handler: ({parser}, args) => { + const href = assertNodeType(args[0], "url").value.value; + const chars = []; + for (let i = 0; i < href.length; i++) { + let c = href[i]; + if (c === "~") { + c = "\\textasciitilde"; + } + chars.push(new ParseNode("textord", c, "text")); + } + const body = new ParseNode("text", { + type: "text", + font: "\\texttt", + body: chars, + }, parser.mode); + return new ParseNode("href", { + 
type: "href", + href: href, + body: ordargument(body), + }, parser.mode); + }, +}); diff --git a/test/__snapshots__/katex-spec.js.snap b/test/__snapshots__/katex-spec.js.snap index ab74c677..da6eaa44 100755 --- a/test/__snapshots__/katex-spec.js.snap +++ b/test/__snapshots__/katex-spec.js.snap @@ -27,9 +27,7 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = ` "end": 37, "lexer": { "input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}", - "tokenRegex": { - "lastIndex": 56 - } + "lastIndex": 56 }, "start": 36 }, @@ -58,9 +56,7 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = ` "end": 39, "lexer": { "input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}", - "tokenRegex": { - "lastIndex": 56 - } + "lastIndex": 56 }, "start": 38 }, @@ -91,9 +87,7 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = ` "end": 42, "lexer": { "input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}", - "tokenRegex": { - "lastIndex": 56 - } + "lastIndex": 56 }, "start": 41 }, @@ -122,9 +116,7 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = ` "end": 44, "lexer": { "input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}", - "tokenRegex": { - "lastIndex": 56 - } + "lastIndex": 56 }, "start": 43 }, diff --git a/test/katex-spec.js b/test/katex-spec.js index bb0b57fe..92653fd8 100644 --- a/test/katex-spec.js +++ b/test/katex-spec.js @@ -1563,8 +1563,9 @@ describe("A comment parser", function() { expect("% comment 1\n% comment 2\n").toParse(); }); - it("should not parse a comment that isn't followed by a newline", () => { - expect`x%y`.not.toParse(); + it("should not parse a comment without newline in strict mode", () => { + expect`x%y`.not.toParse(strictSettings); + expect`x%y`.toParse(nonstrictSettings); }); it("should not produce or consume space", () => { @@ -2451,33 +2452,69 @@ describe("operatorname support", function() { 
}); }); -describe("An href command", function() { +describe("href and url commands", function() { + // We can't use raw strings for \url because \u is for Unicode escapes. + it("should parse its input", function() { - expect`\href{http://example.com/}{example here}`.toParse(); + expect`\href{http://example.com/}{example here}`.toBuild(); + expect("\\url{http://example.com/}").toBuild(); + }); + + it("should allow empty URLs", function() { + expect`\href{}{example here}`.toBuild(); + expect("\\url{}").toBuild(); + }); + + it("should allow single-character URLs", () => { + expect`\href%end`.toParseLike("\\href{%}end"); + expect`\href %end`.toParseLike("\\href{%}end"); + expect("\\url%end").toParseLike("\\url{%}end"); + expect("\\url %end").toParseLike("\\url{%}end"); + expect("\\url end").toParseLike("\\url{e}nd"); + expect("\\url%end").toParseLike("\\url {%}end"); + }); + + it("should detect missing second argument in \\href", () => { + expect`\href{http://example.com/}`.not.toParse(); + expect`\href%`.not.toParse(); + expect`\href %`.not.toParse(); + }); + + it("should allow spaces in single-character URLs", () => { + expect`\href %end`.toParseLike("\\href{%}end"); + expect("\\url %end").toParseLike("\\url{%}end"); }); it("should allow letters [#$%&~_^] without escaping", function() { const url = "http://example.org/~bar/#top?foo=$foo&bar=ba^r_boo%20baz"; - const hash = getParsed(`\\href{${url}}{\\alpha}`)[0]; - expect(hash.value.href).toBe(url); + const parsed1 = getParsed(`\\href{${url}}{\\alpha}`)[0]; + expect(parsed1.value.href).toBe(url); + const parsed2 = getParsed(`\\url{${url}}`)[0]; + expect(parsed2.value.href).toBe(url); }); it("should allow balanced braces in url", function() { const url = "http://example.org/{too}"; - const hash = getParsed(`\\href{${url}}{\\alpha}`)[0]; - expect(hash.value.href).toBe(url); + const parsed1 = getParsed(`\\href{${url}}{\\alpha}`)[0]; + expect(parsed1.value.href).toBe(url); + const parsed2 = getParsed(`\\url{${url}}`)[0]; + 
expect(parsed2.value.href).toBe(url); }); it("should not allow unbalanced brace(s) in url", function() { expect`\href{http://example.com/{a}{bar}`.not.toParse(); expect`\href{http://example.com/}a}{bar}`.not.toParse(); + expect`\\url{http://example.com/{a}`.not.toParse(); + expect`\\url{http://example.com/}a}`.not.toParse(); }); it("should allow escape for letters [#$%&~_^{}]", function() { const url = "http://example.org/~bar/#top?foo=$}foo{&bar=bar^r_boo%20baz"; const input = url.replace(/([#$%&~_^{}])/g, '\\$1'); - const ae = getParsed(`\\href{${input}}{\\alpha}`)[0]; - expect(ae.value.href).toBe(url); + const parsed1 = getParsed(`\\href{${input}}{\\alpha}`)[0]; + expect(parsed1.value.href).toBe(url); + const parsed2 = getParsed(`\\url{${input}}`)[0]; + expect(parsed2.value.href).toBe(url); }); it("should be marked up correctly", function() { diff --git a/test/setup.js b/test/setup.js index 29a35b35..83884b2e 100644 --- a/test/setup.js +++ b/test/setup.js @@ -1,6 +1,7 @@ /* global expect: false */ import stringify from 'json-stable-stringify'; +import Lexer from "../src/Lexer"; import ParseError from "../src/ParseError"; import { Mode, ConsoleWarning, @@ -19,8 +20,16 @@ const typeFirstCompare = (a, b) => { } }; -const regExpReplacer = (key, value) => { - return value instanceof RegExp ? {lastIndex: value.lastIndex} : value; +const replacer = (key, value) => { + if (value instanceof Lexer) { + return { + input: value.input, + // omit value.settings + lastIndex: value.tokenRegex.lastIndex, + }; + } else { + return value; + } }; const serializer = { @@ -28,7 +37,7 @@ const serializer = { return stringify(val, { cmp: typeFirstCompare, space: ' ', - replacer: regExpReplacer, + replacer: replacer, }); }, test(val) {