mirror of
https://github.com/Smaug123/KaTeX
synced 2025-10-07 04:08:43 +00:00
Add raw string group, move comment parsing to Parser, change URL group parser (#1711)
* Add raw string group
* Move comment parsing to Parser
* Use raw string group in URL group parser
* Update types.js
* Add multi-level nested url test
This commit is contained in:
committed by
Kevin Barabash
parent
ba8e224b8d
commit
3907545e2c
27
src/Lexer.js
27
src/Lexer.js
@@ -17,11 +17,9 @@ import SourceLocation from "./SourceLocation";
|
||||
import {Token} from "./Token";
|
||||
|
||||
import type {LexerInterface} from "./Token";
|
||||
import type Settings from "./Settings";
|
||||
|
||||
/* The following tokenRegex
|
||||
* - matches typical whitespace (but not NBSP etc.) using its first group
|
||||
* - matches comments (must have trailing newlines)
|
||||
* - does not match any control character \x00-\x1f except whitespace
|
||||
* - does not match a bare backslash
|
||||
* - matches any ASCII character except those just mentioned
|
||||
@@ -36,7 +34,6 @@ import type Settings from "./Settings";
|
||||
* still reject the input.
|
||||
*/
|
||||
const spaceRegexString = "[ \r\n\t]";
|
||||
const commentRegexString = "%[^\n]*(?:\n|$)";
|
||||
const controlWordRegexString = "\\\\[a-zA-Z@]+";
|
||||
const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";
|
||||
const controlWordWhitespaceRegexString =
|
||||
@@ -46,37 +43,28 @@ const controlWordWhitespaceRegex = new RegExp(
|
||||
const combiningDiacriticalMarkString = "[\u0300-\u036f]";
|
||||
export const combiningDiacriticalMarksEndRegex =
|
||||
new RegExp(`${combiningDiacriticalMarkString}+$`);
|
||||
const urlFunctionRegexString = "(\\\\href|\\\\url)" +
|
||||
`(?:${spaceRegexString}*\\{((?:[^{}\\\\]|\\\\[^]|{[^{}]*})*)\\}` +
|
||||
`|${spaceRegexString}+([^{}])` +
|
||||
`|${spaceRegexString}*([^{}a-zA-Z]))`;
|
||||
const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
|
||||
`(${commentRegexString}` + // comments
|
||||
"|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
|
||||
"([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
|
||||
`${combiningDiacriticalMarkString}*` + // ...plus accents
|
||||
"|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair
|
||||
`${combiningDiacriticalMarkString}*` + // ...plus accents
|
||||
"|\\\\verb\\*([^]).*?\\3" + // \verb*
|
||||
"|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred
|
||||
`|${urlFunctionRegexString}` + // URL arguments
|
||||
`|${controlWordWhitespaceRegexString}` + // \macroName + spaces
|
||||
`|${controlSymbolRegexString})`; // \\, \', etc.
|
||||
|
||||
// These regexs are for matching results from tokenRegex,
|
||||
// so they do have ^ markers.
|
||||
export const controlWordRegex = new RegExp(`^${controlWordRegexString}`);
|
||||
export const urlFunctionRegex = new RegExp(`^${urlFunctionRegexString}`);
|
||||
|
||||
/** Main Lexer class */
|
||||
export default class Lexer implements LexerInterface {
|
||||
input: string;
|
||||
settings: Settings;
|
||||
tokenRegex: RegExp;
|
||||
|
||||
constructor(input: string, settings: Settings) {
|
||||
constructor(input: string) {
|
||||
// Separate accents from characters
|
||||
this.input = input;
|
||||
this.settings = settings;
|
||||
this.tokenRegex = new RegExp(tokenRegexString, 'g');
|
||||
}
|
||||
|
||||
@@ -100,19 +88,10 @@ export default class Lexer implements LexerInterface {
|
||||
// Trim any trailing whitespace from control word match
|
||||
const controlMatch = text.match(controlWordWhitespaceRegex);
|
||||
if (controlMatch) {
|
||||
text = controlMatch[1] + text.slice(controlMatch[0].length);
|
||||
text = controlMatch[1];
|
||||
}
|
||||
|
||||
if (text[0] === "%") {
|
||||
if (text[text.length - 1] !== "\n") {
|
||||
this.settings.reportNonstrict("commentAtEnd",
|
||||
"% comment has no terminating newline; LaTeX would " +
|
||||
"fail because of commenting the end of math mode (e.g. $)");
|
||||
}
|
||||
return this.lex();
|
||||
} else {
|
||||
return new Token(text, new SourceLocation(this, pos,
|
||||
this.tokenRegex.lastIndex));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -50,7 +50,7 @@ export default class MacroExpander implements MacroContextInterface {
|
||||
* (with existing macros etc.).
|
||||
*/
|
||||
feed(input: string) {
|
||||
this.lexer = new Lexer(input, this.settings);
|
||||
this.lexer = new Lexer(input);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -314,7 +314,7 @@ export default class MacroExpander implements MacroContextInterface {
|
||||
++numArgs;
|
||||
}
|
||||
}
|
||||
const bodyLexer = new Lexer(expansion, this.settings);
|
||||
const bodyLexer = new Lexer(expansion);
|
||||
const tokens = [];
|
||||
let tok = bodyLexer.lex();
|
||||
while (tok.text !== "EOF") {
|
||||
|
160
src/Parser.js
160
src/Parser.js
@@ -11,7 +11,7 @@ import unicodeSymbols from "./unicodeSymbols";
|
||||
import utils from "./utils";
|
||||
import {assertNodeType, checkNodeType} from "./parseNode";
|
||||
import ParseError from "./ParseError";
|
||||
import {combiningDiacriticalMarksEndRegex, urlFunctionRegex} from "./Lexer";
|
||||
import {combiningDiacriticalMarksEndRegex} from "./Lexer";
|
||||
import Settings from "./Settings";
|
||||
import SourceLocation from "./SourceLocation";
|
||||
import {Token} from "./Token";
|
||||
@@ -405,6 +405,8 @@ export default class Parser {
|
||||
}
|
||||
// Put everything into an ordgroup as the superscript
|
||||
superscript = {type: "ordgroup", mode: this.mode, body: primes};
|
||||
} else if (lex.text === "%") {
|
||||
this.consumeComment();
|
||||
} else {
|
||||
// If it wasn't ^, _, or ', stop parsing super/subscripts
|
||||
break;
|
||||
@@ -658,9 +660,15 @@ export default class Parser {
|
||||
return this.parseSizeGroup(optional);
|
||||
}
|
||||
if (type === "url") {
|
||||
throw new ParseError(
|
||||
"Internal bug: 'url' arguments should be handled by Lexer",
|
||||
this.nextToken);
|
||||
return this.parseUrlGroup(optional);
|
||||
}
|
||||
if (type === "raw") {
|
||||
const token = this.parseStringGroup("raw", optional, true);
|
||||
return token ? newArgument({
|
||||
type: "raw",
|
||||
mode: this.mode,
|
||||
string: token.text,
|
||||
}, token) : null;
|
||||
}
|
||||
|
||||
// By the time we get here, type is one of "text" or "math".
|
||||
@@ -674,6 +682,27 @@ export default class Parser {
|
||||
}
|
||||
}
|
||||
|
||||
consumeComment() {
|
||||
// the newline character is normalized in Lexer, check original source
|
||||
while (this.nextToken.text !== "EOF" && this.nextToken.loc &&
|
||||
this.nextToken.loc.getSource().indexOf("\n") === -1) {
|
||||
this.consume();
|
||||
}
|
||||
if (this.nextToken.text === "EOF") {
|
||||
this.settings.reportNonstrict("commentAtEnd",
|
||||
"% comment has no terminating newline; LaTeX would " +
|
||||
"fail because of commenting the end of math mode (e.g. $)");
|
||||
}
|
||||
if (this.mode === "math") {
|
||||
this.consumeSpaces(); // ignore spaces in math mode
|
||||
} else if (this.nextToken.loc) { // text mode
|
||||
const source = this.nextToken.loc.getSource();
|
||||
if (source.indexOf("\n") === source.length - 1) {
|
||||
this.consumeSpaces(); // if no space after the first newline
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a group, essentially returning the string formed by the
|
||||
* brace-enclosed tokens plus some position information.
|
||||
@@ -681,28 +710,53 @@ export default class Parser {
|
||||
parseStringGroup(
|
||||
modeName: ArgType, // Used to describe the mode in error messages.
|
||||
optional: boolean,
|
||||
raw?: boolean,
|
||||
): ?Token {
|
||||
if (optional && this.nextToken.text !== "[") {
|
||||
const groupBegin = optional ? "[" : "{";
|
||||
const groupEnd = optional ? "]" : "}";
|
||||
const nextToken = this.nextToken;
|
||||
if (nextToken.text !== groupBegin) {
|
||||
if (optional) {
|
||||
return null;
|
||||
} else if (raw && nextToken.text !== "EOF" &&
|
||||
/[^{}[\]]/.test(nextToken.text)) {
|
||||
// allow a single character in raw string group
|
||||
this.consume();
|
||||
return nextToken;
|
||||
}
|
||||
}
|
||||
const outerMode = this.mode;
|
||||
this.mode = "text";
|
||||
this.expect(optional ? "[" : "{");
|
||||
this.expect(groupBegin);
|
||||
let str = "";
|
||||
const firstToken = this.nextToken;
|
||||
let nested = 0; // allow nested braces in raw string group
|
||||
let lastToken = firstToken;
|
||||
while (this.nextToken.text !== (optional ? "]" : "}")) {
|
||||
if (this.nextToken.text === "EOF") {
|
||||
while ((raw && nested > 0) || this.nextToken.text !== groupEnd) {
|
||||
switch (this.nextToken.text) {
|
||||
case "EOF":
|
||||
throw new ParseError(
|
||||
"Unexpected end of input in " + modeName,
|
||||
firstToken.range(this.nextToken, str));
|
||||
firstToken.range(lastToken, str));
|
||||
case "%":
|
||||
if (!raw) { // allow % in raw string group
|
||||
this.consumeComment();
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case groupBegin:
|
||||
nested++;
|
||||
break;
|
||||
case groupEnd:
|
||||
nested--;
|
||||
break;
|
||||
}
|
||||
lastToken = this.nextToken;
|
||||
str += lastToken.text;
|
||||
this.consume();
|
||||
}
|
||||
this.mode = outerMode;
|
||||
this.expect(optional ? "]" : "}");
|
||||
this.expect(groupEnd);
|
||||
return firstToken.range(lastToken, str);
|
||||
}
|
||||
|
||||
@@ -720,8 +774,12 @@ export default class Parser {
|
||||
const firstToken = this.nextToken;
|
||||
let lastToken = firstToken;
|
||||
let str = "";
|
||||
while (this.nextToken.text !== "EOF"
|
||||
&& regex.test(str + this.nextToken.text)) {
|
||||
while (this.nextToken.text !== "EOF" && (regex.test(
|
||||
str + this.nextToken.text) || this.nextToken.text === "%")) {
|
||||
if (this.nextToken.text === "%") {
|
||||
this.consumeComment();
|
||||
continue;
|
||||
}
|
||||
lastToken = this.nextToken;
|
||||
str += lastToken.text;
|
||||
this.consume();
|
||||
@@ -802,6 +860,34 @@ export default class Parser {
|
||||
}, res);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses an URL, checking escaped letters and allowed protocols.
|
||||
*/
|
||||
parseUrlGroup(optional: boolean): ?ParsedArg {
|
||||
const res = this.parseStringGroup("url", optional, true); // get raw string
|
||||
if (!res) {
|
||||
return null;
|
||||
}
|
||||
// hyperref package allows backslashes alone in href, but doesn't
|
||||
// generate valid links in such cases; we interpret this as
|
||||
// "undefined" behaviour, and keep them as-is. Some browser will
|
||||
// replace backslashes with forward slashes.
|
||||
const url = res.text.replace(/\\([#$%&~_^{}])/g, '$1');
|
||||
let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
|
||||
protocol = (protocol != null ? protocol[1] : "_relative");
|
||||
const allowed = this.settings.allowedProtocols;
|
||||
if (!utils.contains(allowed, "*") &&
|
||||
!utils.contains(allowed, protocol)) {
|
||||
throw new ParseError(
|
||||
`Forbidden protocol '${protocol}'`, res);
|
||||
}
|
||||
return newArgument({
|
||||
type: "url",
|
||||
mode: this.mode,
|
||||
url,
|
||||
}, res);
|
||||
}
|
||||
|
||||
/**
|
||||
* If `optional` is false or absent, this parses an ordinary group,
|
||||
* which is either a single nucleus (like "x") or an expression
|
||||
@@ -913,53 +999,6 @@ export default class Parser {
|
||||
// The token will be consumed later in parseGivenFunction
|
||||
// (after possibly switching modes).
|
||||
return newFunction(nucleus);
|
||||
} else if (/^\\(href|url)[^a-zA-Z]/.test(text)) {
|
||||
const match = text.match(urlFunctionRegex);
|
||||
if (!match) {
|
||||
throw new ParseError(
|
||||
`Internal error: invalid URL token '${text}'`, nucleus);
|
||||
}
|
||||
const funcName = match[1];
|
||||
// match[2] is the only one that can be an empty string,
|
||||
// so it must be at the end of the following or chain:
|
||||
const rawUrl = match[4] || match[3] || match[2];
|
||||
// hyperref package allows backslashes alone in href, but doesn't
|
||||
// generate valid links in such cases; we interpret this as
|
||||
// "undefined" behaviour, and keep them as-is. Some browser will
|
||||
// replace backslashes with forward slashes.
|
||||
const url = rawUrl.replace(/\\([#$%&~_^{}])/g, '$1');
|
||||
let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
|
||||
protocol = (protocol != null ? protocol[1] : "_relative");
|
||||
const allowed = this.settings.allowedProtocols;
|
||||
if (!utils.contains(allowed, "*") &&
|
||||
!utils.contains(allowed, protocol)) {
|
||||
throw new ParseError(
|
||||
`Forbidden protocol '${protocol}' in ${funcName}`, nucleus);
|
||||
}
|
||||
const urlArg = {
|
||||
type: "url",
|
||||
mode: this.mode,
|
||||
url,
|
||||
};
|
||||
this.consume();
|
||||
if (funcName === "\\href") { // two arguments
|
||||
this.consumeSpaces(); // ignore spaces between arguments
|
||||
let description = this.parseGroupOfType("original", false);
|
||||
if (description == null) {
|
||||
throw new ParseError(`${funcName} missing second argument`,
|
||||
nucleus);
|
||||
}
|
||||
if (description.type === "fn") {
|
||||
description = this.parseGivenFunction(description);
|
||||
} else { // arg.type === "arg"
|
||||
description = description.result;
|
||||
}
|
||||
return newArgument(this.callFunction(
|
||||
funcName, [urlArg, description], []), nucleus);
|
||||
} else { // one argument (\url)
|
||||
return newArgument(this.callFunction(
|
||||
funcName, [urlArg], []), nucleus);
|
||||
}
|
||||
} else if (/^\\verb[^a-zA-Z]/.test(text)) {
|
||||
this.consume();
|
||||
let arg = text.slice(5);
|
||||
@@ -980,6 +1019,9 @@ export default class Parser {
|
||||
body: arg,
|
||||
star,
|
||||
}, nucleus);
|
||||
} else if (text === "%") {
|
||||
this.consumeComment();
|
||||
return this.parseSymbol();
|
||||
}
|
||||
// At this point, we should have a symbol, possibly with accents.
|
||||
// First expand any accented base symbol according to unicodeSymbols.
|
||||
|
@@ -17,6 +17,10 @@ export default class SourceLocation {
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
getSource(): string {
|
||||
return this.lexer.input.slice(this.start, this.end);
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges two `SourceLocation`s from location providers, given they are
|
||||
* provided in order of appearance.
|
||||
|
@@ -80,6 +80,12 @@ type ParseNodeTypes = {
|
||||
loc?: ?SourceLocation,
|
||||
body: AnyParseNode[],
|
||||
|},
|
||||
"raw": {|
|
||||
type: "raw",
|
||||
mode: Mode,
|
||||
loc?: ?SourceLocation,
|
||||
string: string,
|
||||
|},
|
||||
"size": {|
|
||||
type: "size",
|
||||
mode: Mode,
|
||||
|
@@ -12,13 +12,15 @@ export type Mode = "math" | "text";
|
||||
// - "color": An html color, like "#abc" or "blue"
|
||||
// - "url": An url string, in which "\" will be ignored
|
||||
// - if it precedes [#$%&~_^\{}]
|
||||
// - "raw": A string, allowing single character, percent sign,
|
||||
// and nested braces
|
||||
// - "original": The same type as the environment that the
|
||||
// function being parsed is in (e.g. used for the
|
||||
// bodies of functions like \textcolor where the
|
||||
// first argument is special and the second
|
||||
// argument is parsed normally)
|
||||
// - Mode: Node group parsed in given mode.
|
||||
export type ArgType = "color" | "size" | "url" | "original" | Mode;
|
||||
export type ArgType = "color" | "size" | "url" | "raw" | "original" | Mode;
|
||||
|
||||
// LaTeX display style.
|
||||
export type StyleStr = "text" | "display" | "script" | "scriptscript";
|
||||
|
@@ -1597,6 +1597,16 @@ describe("A comment parser", function() {
|
||||
expect("% comment 1\n% comment 2\n").toParse();
|
||||
});
|
||||
|
||||
it("should parse comments between subscript and superscript", () => {
|
||||
expect("x_3 %comment\n^2").toParseLike`x_3^2`;
|
||||
});
|
||||
|
||||
it("should parse comments in size and color groups", () => {
|
||||
expect("\\kern{1 %kern\nem}").toParse();
|
||||
expect("\\kern1 %kern\nem").toParse();
|
||||
expect("\\color{#f00%red\n}").toParse();
|
||||
});
|
||||
|
||||
it("should not parse a comment without newline in strict mode", () => {
|
||||
expect`x%y`.not.toParse(strictSettings);
|
||||
expect`x%y`.toParse(nonstrictSettings);
|
||||
@@ -2527,12 +2537,6 @@ describe("href and url commands", function() {
|
||||
expect("\\url%end").toParseLike("\\url {%}end");
|
||||
});
|
||||
|
||||
it("should detect missing second argument in \\href", () => {
|
||||
expect`\href{http://example.com/}`.not.toParse();
|
||||
expect`\href%`.not.toParse();
|
||||
expect`\href %`.not.toParse();
|
||||
});
|
||||
|
||||
it("should allow spaces single-character URLs", () => {
|
||||
expect`\href %end`.toParseLike("\\href{%}end");
|
||||
expect("\\url %end").toParseLike("\\url{%}end");
|
||||
@@ -2547,7 +2551,7 @@ describe("href and url commands", function() {
|
||||
});
|
||||
|
||||
it("should allow balanced braces in url", function() {
|
||||
const url = "http://example.org/{too}";
|
||||
const url = "http://example.org/{{}t{oo}}";
|
||||
const parsed1 = getParsed(`\\href{${url}}{\\alpha}`)[0];
|
||||
expect(parsed1.href).toBe(url);
|
||||
const parsed2 = getParsed(`\\url{${url}}`)[0];
|
||||
|
Reference in New Issue
Block a user