Comments without terminating newlines, \href fixes, \url support (#1529)

* Comments without terminating newlines in nonstrict mode

Fix #1506 by allowing single-line comments (`%` without a terminating newline)
in nonstrict mode.  `Lexer` and `MacroExpander` now store the `Settings`
object, so the `Lexer` can complain about a missing newline according to the
`strict` setting.  I filtered the `Settings` object out of the snapshot tests
with a slightly different `replacer`.

* Reimplement \href like \verb, add \url

Major restructuring to lex URL arguments differently, e.g. to support
`\href%{hello}` and `\href{http://foo.com/#test%}{hello}`.  The new URL
parsing code is simpler, but involves a special case in `parseSymbol`
like `\verb`.

Also add support for `\url` while we're here.

* Cleanup

* Fix flow errors and improve error messages

* Add \url to documentation

* Improve doc formatting
This commit is contained in:
Erik Demaine
2018-07-31 14:13:30 -04:00
committed by GitHub
parent b73e43832b
commit 2202aa774f
9 changed files with 181 additions and 115 deletions

View File

@@ -18,6 +18,9 @@ You can provide an object of options as the last argument to [`katex.render` and
incorrect (especially in terms of vertical heights).
- `"unicodeTextInMathMode"`: Use of Unicode text characters in math mode.
- `"mathVsTextUnits"`: Mismatch of math vs. text commands and units/mode.
- `"commentAtEnd"`: Use of `%` comment without a terminating newline.
LaTeX would thereby comment out the end of math mode (e.g. `$`),
causing an error.
A second category of `errorCode`s never throw errors, but their strictness
affects the behavior of KaTeX:
- `"newLineInDisplayMode"`: Use of `\\` or `\newline` in display mode

View File

@@ -86,7 +86,10 @@ The `{array}` environment does not yet support `\cline` or `\multicolumn`.
## HTML
$\href{https://khan.github.io/KaTeX/}{KaTeX}$ `\href{https://khan.github.io/KaTeX/}{KaTeX}`
|||
|:----------------|:-------------------|
| $\href{https://khan.github.io/KaTeX/}{KaTeX}$ | `\href{https://khan.github.io/KaTeX/}{KaTeX}` |
| $\url{https://khan.github.io/KaTeX/}$ | `\url{https://khan.github.io/KaTeX/}` |
## Letters and Unicode

View File

@@ -15,6 +15,7 @@
import ParseError from "./ParseError";
import SourceLocation from "./SourceLocation";
import {LexerInterface, Token} from "./Token";
import type Settings from "./Settings";
/* The following tokenRegex
* - matches typical whitespace (but not NBSP etc.) using its first group
@@ -33,7 +34,7 @@ import {LexerInterface, Token} from "./Token";
* still reject the input.
*/
const spaceRegexString = "[ \r\n\t]";
const commentRegexString = "%[^\n]*[\n]";
const commentRegexString = "%[^\n]*(?:\n|$)";
const controlWordRegexString = "\\\\[a-zA-Z@]+";
const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";
const controlWordWhitespaceRegexString =
@@ -43,6 +44,10 @@ const controlWordWhitespaceRegex = new RegExp(
const combiningDiacriticalMarkString = "[\u0300-\u036f]";
export const combiningDiacriticalMarksEndRegex =
new RegExp(`${combiningDiacriticalMarkString}+$`);
const urlFunctionRegexString = "(\\\\href|\\\\url)" +
`(?:${spaceRegexString}*\\{((?:[^{}\\\\]|\\\\[^]|{[^{}]*})*)\\}` +
`|${spaceRegexString}+([^{}])` +
`|${spaceRegexString}*([^{}a-zA-Z]))`;
const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
`(${commentRegexString}` + // comments
"|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
@@ -51,22 +56,25 @@ const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
`${combiningDiacriticalMarkString}*` + // ...plus accents
"|\\\\verb\\*([^]).*?\\3" + // \verb*
"|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred
`|${urlFunctionRegexString}` + // URL arguments
`|${controlWordWhitespaceRegexString}` + // \macroName + spaces
`|${controlSymbolRegexString})`; // \\, \', etc.
// These regexs are for matching results from tokenRegex,
// so they do have ^ markers.
export const controlWordRegex = new RegExp(`^${controlWordRegexString}`);
const commentRegex = new RegExp(`^${commentRegexString}`);
export const urlFunctionRegex = new RegExp(`^${urlFunctionRegexString}`);
/** Main Lexer class */
export default class Lexer implements LexerInterface {
input: string;
settings: Settings;
tokenRegex: RegExp;
constructor(input: string) {
constructor(input: string, settings: Settings) {
// Separate accents from characters
this.input = input;
this.settings = settings;
this.tokenRegex = new RegExp(tokenRegexString, 'g');
}
@@ -90,10 +98,15 @@ export default class Lexer implements LexerInterface {
// Trim any trailing whitespace from control word match
const controlMatch = text.match(controlWordWhitespaceRegex);
if (controlMatch) {
text = controlMatch[1];
text = controlMatch[1] + text.slice(controlMatch[0].length);
}
if (commentRegex.test(text)) {
if (text[0] === "%") {
if (text[text.length - 1] !== "\n") {
this.settings.reportNonstrict("commentAtEnd",
"% comment has no terminating newline; LaTeX would " +
"fail because of commenting the end of math mode (e.g. $)");
}
return this.lex();
} else {
return new Token(text, new SourceLocation(this, pos,

View File

@@ -28,17 +28,19 @@ export const implicitCommands = {
};
export default class MacroExpander implements MacroContextInterface {
maxExpand: number;
settings: Settings;
expansionCount: number;
lexer: Lexer;
macros: Namespace<MacroDefinition>;
stack: Token[];
mode: Mode;
constructor(input: string, settings: Settings, mode: Mode) {
this.settings = settings;
this.expansionCount = 0;
this.feed(input);
// Make new global namespace
this.macros = new Namespace(builtinMacros, settings.macros);
this.maxExpand = settings.maxExpand;
this.mode = mode;
this.stack = []; // contains tokens in REVERSE order
}
@@ -48,7 +50,7 @@ export default class MacroExpander implements MacroContextInterface {
* (with existing macros etc.).
*/
feed(input: string) {
this.lexer = new Lexer(input);
this.lexer = new Lexer(input, this.settings);
}
/**
@@ -188,13 +190,11 @@ export default class MacroExpander implements MacroContextInterface {
this.pushToken(topToken);
return topToken;
}
if (this.maxExpand !== Infinity) {
this.maxExpand--;
if (this.maxExpand < 0) {
this.expansionCount++;
if (this.expansionCount > this.settings.maxExpand) {
throw new ParseError("Too many expansions: infinite loop or " +
"need to increase maxExpand setting");
}
}
let tokens = expansion.tokens;
if (expansion.numArgs) {
const args = this.consumeArgs(expansion.numArgs);
@@ -314,7 +314,7 @@ export default class MacroExpander implements MacroContextInterface {
++numArgs;
}
}
const bodyLexer = new Lexer(expansion);
const bodyLexer = new Lexer(expansion, this.settings);
const tokens = [];
let tok = bodyLexer.lex();
while (tok.text !== "EOF") {

View File

@@ -11,7 +11,7 @@ import unicodeSymbols from "./unicodeSymbols";
import utils from "./utils";
import ParseNode, {assertNodeType, checkNodeType} from "./ParseNode";
import ParseError from "./ParseError";
import {combiningDiacriticalMarksEndRegex} from "./Lexer.js";
import {combiningDiacriticalMarksEndRegex, urlFunctionRegex} from "./Lexer.js";
import Settings from "./Settings";
import {Token} from "./Token";
import type {AnyParseNode} from "./ParseNode";
@@ -28,7 +28,7 @@ import type {EnvSpec} from "./defineEnvironment";
*
* The main functions (the `.parse...` ones) take a position in the current
* parse string to parse tokens from. The lexer (found in Lexer.js, stored at
* this.lexer) also supports pulling out tokens at arbitrary places. When
* this.gullet.lexer) also supports pulling out tokens at arbitrary places. When
* individual tokens are needed at a position, the lexer is called to pull out a
* token, which is then used.
*
@@ -660,7 +660,9 @@ export default class Parser {
return this.parseSizeGroup(optional);
}
if (type === "url") {
return this.parseUrlGroup(optional);
throw new ParseError(
"Internal bug: 'url' arguments should be handled by Lexer",
this.nextToken);
}
// By the time we get here, type is one of "text" or "math".
@@ -706,51 +708,6 @@ export default class Parser {
return firstToken.range(lastToken, str);
}
/**
* Parses a group, essentially returning the string formed by the
* brace-enclosed tokens plus some position information, possibly
* with nested braces.
*/
parseStringGroupWithBalancedBraces(
modeName: ArgType, // Used to describe the mode in error messages.
optional: boolean,
): ?Token {
if (optional && this.nextToken.text !== "[") {
return null;
}
const outerMode = this.mode;
this.mode = "text";
this.expect(optional ? "[" : "{");
let str = "";
let nest = 0;
const firstToken = this.nextToken;
let lastToken = firstToken;
while (nest > 0 || this.nextToken.text !== (optional ? "]" : "}")) {
if (this.nextToken.text === "EOF") {
throw new ParseError(
"Unexpected end of input in " + modeName,
firstToken.range(this.nextToken, str));
}
lastToken = this.nextToken;
str += lastToken.text;
if (lastToken.text === "{") {
nest += 1;
} else if (lastToken.text === "}") {
if (nest <= 0) {
throw new ParseError(
"Unbalanced brace of input in " + modeName,
firstToken.range(this.nextToken, str));
} else {
nest -= 1;
}
}
this.consume();
}
this.mode = outerMode;
this.expect(optional ? "]" : "}");
return firstToken.range(lastToken, str);
}
/**
* Parses a regex-delimited group: the largest sequence of tokens
* whose concatenated strings match `regex`. Returns the string
@@ -795,32 +752,6 @@ export default class Parser {
return newArgument(new ParseNode("color-token", match[0], this.mode), res);
}
/**
* Parses a url string.
*/
parseUrlGroup(optional: boolean): ?ParsedArg {
const res = this.parseStringGroupWithBalancedBraces("url", optional);
if (!res) {
return null;
}
const raw = res.text;
// hyperref package allows backslashes alone in href, but doesn't generate
// valid links in such cases; we interpret this as "undefiend" behaviour,
// and keep them as-is. Some browser will replace backslashes with
// forward slashes.
const url = raw.replace(/\\([#$%&~_^{}])/g, '$1');
const protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
const allowed = this.settings.allowedProtocols;
if (!utils.contains(allowed, "*") && !utils.contains(allowed,
protocol != null ? protocol[1] : "_relative")) {
throw new ParseError('Not allowed \\href protocol', res);
}
return newArgument(new ParseNode("url", {
type: "url",
value: url,
}, this.mode), res);
}
/**
* Parses a size specification, consisting of magnitude and unit.
*/
@@ -957,6 +888,52 @@ export default class Parser {
// The token will be consumed later in parseGivenFunction
// (after possibly switching modes).
return newFunction(nucleus);
} else if (/^\\(href|url)[^a-zA-Z]/.test(text)) {
const match = text.match(urlFunctionRegex);
if (!match) {
throw new ParseError(
`Internal error: invalid URL token '${text}'`, nucleus);
}
const funcName = match[1];
// match[2] is the only one that can be an empty string,
// so it must be at the end of the following or chain:
const rawUrl = match[4] || match[3] || match[2];
// hyperref package allows backslashes alone in href, but doesn't
// generate valid links in such cases; we interpret this as
// "undefined" behaviour, and keep them as-is. Some browser will
// replace backslashes with forward slashes.
const url = rawUrl.replace(/\\([#$%&~_^{}])/g, '$1');
let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
protocol = (protocol != null ? protocol[1] : "_relative");
const allowed = this.settings.allowedProtocols;
if (!utils.contains(allowed, "*") &&
!utils.contains(allowed, protocol)) {
throw new ParseError(
`Forbidden protocol '${protocol}' in ${funcName}`, nucleus);
}
const urlArg = new ParseNode("url", {
type: "url",
value: url,
}, this.mode);
this.consume();
if (funcName === "\\href") { // two arguments
this.consumeSpaces(); // ignore spaces between arguments
let description = this.parseGroupOfType("original", false);
if (description == null) {
throw new ParseError(`${funcName} missing second argument`,
nucleus);
}
if (description.type === "fn") {
description = this.parseGivenFunction(description);
} else { // arg.type === "arg"
description = description.result;
}
return newArgument(this.callFunction(
funcName, [urlArg, description], []), nucleus);
} else { // one argument (\url)
return newArgument(this.callFunction(
funcName, [urlArg], []), nucleus);
}
} else if (/^\\verb[^a-zA-Z]/.test(text)) {
this.consume();
let arg = text.slice(5);

View File

@@ -14,6 +14,7 @@ defineFunction({
props: {
numArgs: 2,
argTypes: ["url", "original"],
allowedInText: true,
},
handler: ({parser}, args) => {
const body = args[1];
@@ -41,3 +42,34 @@ defineFunction({
return math;
},
});
defineFunction({
type: "href",
names: ["\\url"],
props: {
numArgs: 1,
argTypes: ["url"],
allowedInText: true,
},
handler: ({parser}, args) => {
const href = assertNodeType(args[0], "url").value.value;
const chars = [];
for (let i = 0; i < href.length; i++) {
let c = href[i];
if (c === "~") {
c = "\\textasciitilde";
}
chars.push(new ParseNode("textord", c, "text"));
}
const body = new ParseNode("text", {
type: "text",
font: "\\texttt",
body: chars,
}, parser.mode);
return new ParseNode("href", {
type: "href",
href: href,
body: ordargument(body),
}, parser.mode);
},
});

View File

@@ -27,9 +27,7 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = `
"end": 37,
"lexer": {
"input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}",
"tokenRegex": {
"lastIndex": 56
}
},
"start": 36
},
@@ -58,9 +56,7 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = `
"end": 39,
"lexer": {
"input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}",
"tokenRegex": {
"lastIndex": 56
}
},
"start": 38
},
@@ -91,9 +87,7 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = `
"end": 42,
"lexer": {
"input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}",
"tokenRegex": {
"lastIndex": 56
}
},
"start": 41
},
@@ -122,9 +116,7 @@ exports[`A begin/end parser should grab \\arraystretch 1`] = `
"end": 44,
"lexer": {
"input": "\\\\def\\\\arraystretch{1.5}\\\\begin{matrix}a&b\\\\\\\\c&d\\\\end{matrix}",
"tokenRegex": {
"lastIndex": 56
}
},
"start": 43
},

View File

@@ -1563,8 +1563,9 @@ describe("A comment parser", function() {
expect("% comment 1\n% comment 2\n").toParse();
});
it("should not parse a comment that isn't followed by a newline", () => {
expect`x%y`.not.toParse();
it("should not parse a comment without newline in strict mode", () => {
expect`x%y`.not.toParse(strictSettings);
expect`x%y`.toParse(nonstrictSettings);
});
it("should not produce or consume space", () => {
@@ -2451,33 +2452,69 @@ describe("operatorname support", function() {
});
});
describe("An href command", function() {
describe("href and url commands", function() {
// We can't use raw strings for \url because \u is for Unicode escapes.
it("should parse its input", function() {
expect`\href{http://example.com/}{example here}`.toParse();
expect`\href{http://example.com/}{example here}`.toBuild();
expect("\\url{http://example.com/}").toBuild();
});
it("should allow empty URLs", function() {
expect`\href{}{example here}`.toBuild();
expect("\\url{}").toBuild();
});
it("should allow single-character URLs", () => {
expect`\href%end`.toParseLike("\\href{%}end");
expect`\href %end`.toParseLike("\\href{%}end");
expect("\\url%end").toParseLike("\\url{%}end");
expect("\\url %end").toParseLike("\\url{%}end");
expect("\\url end").toParseLike("\\url{e}nd");
expect("\\url%end").toParseLike("\\url {%}end");
});
it("should detect missing second argument in \\href", () => {
expect`\href{http://example.com/}`.not.toParse();
expect`\href%`.not.toParse();
expect`\href %`.not.toParse();
});
it("should allow spaces single-character URLs", () => {
expect`\href %end`.toParseLike("\\href{%}end");
expect("\\url %end").toParseLike("\\url{%}end");
});
it("should allow letters [#$%&~_^] without escaping", function() {
const url = "http://example.org/~bar/#top?foo=$foo&bar=ba^r_boo%20baz";
const hash = getParsed(`\\href{${url}}{\\alpha}`)[0];
expect(hash.value.href).toBe(url);
const parsed1 = getParsed(`\\href{${url}}{\\alpha}`)[0];
expect(parsed1.value.href).toBe(url);
const parsed2 = getParsed(`\\url{${url}}`)[0];
expect(parsed2.value.href).toBe(url);
});
it("should allow balanced braces in url", function() {
const url = "http://example.org/{too}";
const hash = getParsed(`\\href{${url}}{\\alpha}`)[0];
expect(hash.value.href).toBe(url);
const parsed1 = getParsed(`\\href{${url}}{\\alpha}`)[0];
expect(parsed1.value.href).toBe(url);
const parsed2 = getParsed(`\\url{${url}}`)[0];
expect(parsed2.value.href).toBe(url);
});
it("should not allow unbalanced brace(s) in url", function() {
expect`\href{http://example.com/{a}{bar}`.not.toParse();
expect`\href{http://example.com/}a}{bar}`.not.toParse();
expect`\\url{http://example.com/{a}`.not.toParse();
expect`\\url{http://example.com/}a}`.not.toParse();
});
it("should allow escape for letters [#$%&~_^{}]", function() {
const url = "http://example.org/~bar/#top?foo=$}foo{&bar=bar^r_boo%20baz";
const input = url.replace(/([#$%&~_^{}])/g, '\\$1');
const ae = getParsed(`\\href{${input}}{\\alpha}`)[0];
expect(ae.value.href).toBe(url);
const parsed1 = getParsed(`\\href{${input}}{\\alpha}`)[0];
expect(parsed1.value.href).toBe(url);
const parsed2 = getParsed(`\\url{${input}}`)[0];
expect(parsed2.value.href).toBe(url);
});
it("should be marked up correctly", function() {

View File

@@ -1,6 +1,7 @@
/* global expect: false */
import stringify from 'json-stable-stringify';
import Lexer from "../src/Lexer";
import ParseError from "../src/ParseError";
import {
Mode, ConsoleWarning,
@@ -19,8 +20,16 @@ const typeFirstCompare = (a, b) => {
}
};
const regExpReplacer = (key, value) => {
return value instanceof RegExp ? {lastIndex: value.lastIndex} : value;
const replacer = (key, value) => {
if (value instanceof Lexer) {
return {
input: value.input,
// omit value.settings
lastIndex: value.tokenRegex.lastIndex,
};
} else {
return value;
}
};
const serializer = {
@@ -28,7 +37,7 @@ const serializer = {
return stringify(val, {
cmp: typeFirstCompare,
space: ' ',
replacer: regExpReplacer,
replacer: replacer,
});
},
test(val) {