Refactor Parser (#1723)

* Move unsupported command (undefined control sequence) error to parseSymbol

* Change parseGivenFunction and parseFunction to parse only functions

* Move \begin handling to environment.js

* Remove ParsedFunc/Arg, move logic into parseGroup

* Fix flow error

* Remove parseFunction, rename parseGivenFunction to parseFunction

* Minor fixes

* Remove previously resolved TODO

* Minor fixes

* Update flow typing
This commit is contained in:
ylemkimon
2018-10-14 00:58:13 +09:00
committed by GitHub
parent 94d433805a
commit 0bc31f1822
5 changed files with 174 additions and 289 deletions

View File

@@ -1,21 +1,20 @@
// @flow
/* eslint no-constant-condition:0 */
import functions from "./functions";
import environments from "./environments";
import MacroExpander from "./MacroExpander";
import MacroExpander, {implicitCommands} from "./MacroExpander";
import symbols, {ATOMS, extraLatin} from "./symbols";
import {validUnit} from "./units";
import {supportedCodepoint} from "./unicodeScripts";
import unicodeAccents from "./unicodeAccents";
import unicodeSymbols from "./unicodeSymbols";
import utils from "./utils";
import {assertNodeType, checkNodeType} from "./parseNode";
import {checkNodeType} from "./parseNode";
import ParseError from "./ParseError";
import {combiningDiacriticalMarksEndRegex} from "./Lexer";
import Settings from "./Settings";
import SourceLocation from "./SourceLocation";
import {Token} from "./Token";
import type {AnyParseNode, SymbolParseNode} from "./parseNode";
import type {ParseNode, AnyParseNode, SymbolParseNode} from "./parseNode";
import type {Atom, Group} from "./symbols";
import type {Mode, ArgType, BreakToken} from "./types";
import type {FunctionContext, FunctionSpec} from "./defineFunction";
@@ -48,33 +47,9 @@ import type {EnvSpec} from "./defineEnvironment";
* There are also extra `.handle...` functions, which pull out some reused
* functionality into self-contained functions.
*
* The earlier functions return ParseNodes.
* The later functions (which are called deeper in the parse) sometimes return
* ParsedFuncOrArg, which contain a ParseNode as well as some data about
* whether the parsed object is a function which is missing some arguments, or a
* standalone object which can be used as an argument to another function.
* The functions return ParseNodes.
*/
type ParsedFunc = {|
type: "fn",
result: string, // Function name defined via defineFunction (e.g. "\\frac").
token: Token,
|};
type ParsedArg = {|
type: "arg",
result: AnyParseNode,
token: Token,
|};
type ParsedFuncOrArg = ParsedFunc | ParsedArg;
function newArgument(result: AnyParseNode, token: Token): ParsedArg {
return {type: "arg", result, token};
}
function newFunction(token: Token): ParsedFunc {
return {type: "fn", result: token.text, token};
}
export default class Parser {
mode: Mode;
gullet: MacroExpander;
@@ -190,12 +165,6 @@ export default class Parser {
}
const atom = this.parseAtom(breakOnTokenText);
if (!atom) {
if (!this.settings.throwOnError && lex.text[0] === "\\") {
const errorNode = this.handleUnsupportedCmd();
body.push(errorNode);
continue;
}
break;
}
body.push(atom);
@@ -275,33 +244,16 @@ export default class Parser {
const symbol = symbolToken.text;
this.consume();
this.consumeSpaces(); // ignore spaces before sup/subscript argument
const group = this.parseGroup();
const group = this.parseGroup(name, false, Parser.SUPSUB_GREEDINESS);
if (!group) {
if (!this.settings.throwOnError && this.nextToken.text[0] === "\\") {
return this.handleUnsupportedCmd();
} else {
throw new ParseError(
"Expected group after '" + symbol + "'",
symbolToken
);
}
}
if (group.type === "fn") {
// ^ and _ have a greediness, so handle interactions with functions'
// greediness
const funcGreediness = functions[group.result].greediness;
if (funcGreediness > Parser.SUPSUB_GREEDINESS) {
return this.parseGivenFunction(group);
} else {
throw new ParseError(
"Got function '" + group.result + "' with no arguments " +
"as " + name, symbolToken);
}
} else {
return group.result;
}
return group;
}
/**
@@ -339,7 +291,7 @@ export default class Parser {
parseAtom(breakOnTokenText?: BreakToken): ?AnyParseNode {
// The body of an atom is an implicit group, so that things like
// \left(x\right)^2 work correctly.
const base = this.parseImplicitGroup(breakOnTokenText);
const base = this.parseGroup("atom", false, null, breakOnTokenText);
// In text mode, we don't have superscripts or subscripts
if (this.mode === "text") {
@@ -430,96 +382,30 @@ export default class Parser {
}
}
/**
* Parses an implicit group, which is a group that starts at the end of a
* specified, and ends right before a higher explicit group ends, or at EOL. It
* is used for functions that appear to affect the current style, like \Large or
* \textrm, where instead of keeping a style we just pretend that there is an
* implicit grouping after it until the end of the group. E.g.
* small text {\Large large text} small text again
*/
parseImplicitGroup(breakOnTokenText?: BreakToken): ?AnyParseNode {
const start = this.parseSymbol();
if (start == null) {
// If we didn't get anything we handle, fall back to parseFunction
return this.parseFunction();
} else if (start.type === "arg") {
// Defer to parseGivenFunction if it's not a function we handle
return this.parseGivenFunction(start);
}
const func = start.result;
if (func === "\\begin") {
// begin...end is similar to left...right
const begin =
assertNodeType(this.parseGivenFunction(start), "environment");
const envName = begin.name;
if (!environments.hasOwnProperty(envName)) {
throw new ParseError(
"No such environment: " + envName, begin.nameGroup);
}
// Build the environment object. Arguments and other information will
// be made available to the begin and end methods using properties.
const env = environments[envName];
const {args, optArgs} =
this.parseArguments("\\begin{" + envName + "}", env);
const context = {
mode: this.mode,
envName: envName,
parser: this,
};
const result = env.handler(context, args, optArgs);
this.expect("\\end", false);
const endNameToken = this.nextToken;
let end = this.parseFunction();
if (!end) {
throw new ParseError("failed to parse function after \\end");
}
end = assertNodeType(end, "environment");
if (end.name !== envName) {
throw new ParseError(
`Mismatch: \\begin{${envName}} matched by \\end{${end.name}}`,
endNameToken);
}
return result;
} else {
// Defer to parseGivenFunction if it's not a function we handle
return this.parseGivenFunction(start, breakOnTokenText);
}
}
/**
* Parses an entire function, including its base and all of its arguments.
* It also handles the case where the parsed node is not a function.
*/
parseFunction(): ?AnyParseNode {
const baseGroup = this.parseGroup();
return baseGroup ? this.parseGivenFunction(baseGroup) : null;
}
/**
* Same as parseFunction(), except that the base is provided, guaranteeing a
* non-nullable result.
*/
parseGivenFunction(
baseGroup: ParsedFuncOrArg,
parseFunction(
breakOnTokenText?: BreakToken,
): AnyParseNode {
if (baseGroup.type === "fn") {
const func = baseGroup.result;
name?: string, // For error reporting.
greediness?: ?number,
): ?AnyParseNode {
const token = this.nextToken;
const func = token.text;
const funcData = functions[func];
if (this.mode === "text" && !funcData.allowedInText) {
if (!funcData) {
return null;
}
if (greediness != null && funcData.greediness <= greediness) {
throw new ParseError(
"Can't use function '" + func + "' in text mode",
baseGroup.token);
} else if (this.mode === "math" &&
funcData.allowedInMath === false) {
"Got function '" + func + "' with no arguments" +
(name ? " as " + name : ""), token);
} else if (this.mode === "text" && !funcData.allowedInText) {
throw new ParseError(
"Can't use function '" + func + "' in math mode",
baseGroup.token);
"Can't use function '" + func + "' in text mode", token);
} else if (this.mode === "math" && funcData.allowedInMath === false) {
throw new ParseError(
"Can't use function '" + func + "' in math mode", token);
}
// Consume the command token after possibly switching to the
@@ -534,12 +420,7 @@ export default class Parser {
this.consume();
}
const {args, optArgs} = this.parseArguments(func, funcData);
const token = baseGroup.token;
return this.callFunction(
func, args, optArgs, token, breakOnTokenText);
} else {
return baseGroup.result;
}
return this.callFunction(func, args, optArgs, token, breakOnTokenText);
}
/**
@@ -605,37 +486,17 @@ export default class Parser {
this.consumeSpaces();
}
const nextToken = this.nextToken;
let arg = argType ?
this.parseGroupOfType(argType, isOptional) :
this.parseGroup(isOptional);
const arg = this.parseGroupOfType("argument to '" + func + "'",
argType, isOptional, baseGreediness);
if (!arg) {
if (isOptional) {
optArgs.push(null);
continue;
}
if (!this.settings.throwOnError &&
this.nextToken.text[0] === "\\") {
arg = newArgument(this.handleUnsupportedCmd(), nextToken);
} else {
throw new ParseError(
"Expected group after '" + func + "'", nextToken);
}
}
let argNode: AnyParseNode;
if (arg.type === "fn") {
const argGreediness =
functions[arg.result].greediness;
if (argGreediness > baseGreediness) {
argNode = this.parseGivenFunction(arg);
} else {
throw new ParseError(
"Got function '" + arg.result + "' as " +
"argument to '" + func + "'", nextToken);
}
} else {
argNode = arg.result;
}
(isOptional ? optArgs : args).push(argNode);
(isOptional ? optArgs : args).push(arg);
}
return {args, optArgs};
@@ -645,35 +506,29 @@ export default class Parser {
* Parses a group when the mode is changing.
*/
parseGroupOfType(
type: ArgType, // Used to describe the mode in error messages.
name: string,
type: ?ArgType,
optional: boolean,
): ?ParsedFuncOrArg {
// Handle `original` argTypes
if (type === "original") {
type = this.mode;
}
if (type === "color") {
greediness: ?number,
): ?AnyParseNode {
switch (type) {
case "color":
return this.parseColorGroup(optional);
}
if (type === "size") {
case "size":
return this.parseSizeGroup(optional);
}
if (type === "url") {
case "url":
return this.parseUrlGroup(optional);
case "math":
case "text":
return this.parseGroup(name, optional, greediness, undefined, type);
case "original":
case null:
case undefined:
return this.parseGroup(name, optional, greediness);
default:
throw new ParseError(
"Unknown group type as " + name, this.nextToken);
}
if (type === "raw") {
const token = this.parseStringGroup("raw", optional, true);
return token ? newArgument({
type: "raw",
mode: this.mode,
string: token.text,
}, token) : null;
}
// By the time we get here, type is one of "text" or "math".
// Specify this as mode to parseGroup.
return this.parseGroup(optional, type);
}
consumeSpaces() {
@@ -796,7 +651,7 @@ export default class Parser {
/**
* Parses a color description.
*/
parseColorGroup(optional: boolean): ?ParsedArg {
parseColorGroup(optional: boolean): ?ParseNode<"color-token"> {
const res = this.parseStringGroup("color", optional);
if (!res) {
return null;
@@ -812,17 +667,17 @@ export default class Parser {
// Predefined color names are all missed by this RegEx pattern.
color = "#" + color;
}
return newArgument({
return {
type: "color-token",
mode: this.mode,
color,
}, res);
};
}
/**
* Parses a size specification, consisting of magnitude and unit.
*/
parseSizeGroup(optional: boolean): ?ParsedArg {
parseSizeGroup(optional: boolean): ?ParseNode<"size"> {
let res;
let isBlank = false;
if (!optional && this.nextToken.text !== "{") {
@@ -852,18 +707,18 @@ export default class Parser {
if (!validUnit(data)) {
throw new ParseError("Invalid unit: '" + data.unit + "'", res);
}
return newArgument({
return {
type: "size",
mode: this.mode,
value: data,
isBlank,
}, res);
};
}
/**
* Parses a URL, checking escaped letters and allowed protocols.
*/
parseUrlGroup(optional: boolean): ?ParsedArg {
parseUrlGroup(optional: boolean): ?ParseNode<"url"> {
const res = this.parseStringGroup("url", optional, true); // get raw string
if (!res) {
return null;
@@ -881,32 +736,43 @@ export default class Parser {
throw new ParseError(
`Forbidden protocol '${protocol}'`, res);
}
return newArgument({
return {
type: "url",
mode: this.mode,
url,
}, res);
};
}
/**
* If `optional` is false or absent, this parses an ordinary group,
* which is either a single nucleus (like "x") or an expression
* in braces (like "{x+y}").
* in braces (like "{x+y}") or an implicit group, a group that starts
* at the current position, and ends right before a higher explicit
* group ends, or at EOF.
* If `optional` is true, it parses either a bracket-delimited expression
* (like "[x+y]") or returns null to indicate the absence of a
* bracket-enclosed group.
* If `mode` is present, switches to that mode while parsing the group,
* and switches back after.
*/
parseGroup(optional?: boolean, mode?: Mode): ?ParsedFuncOrArg {
parseGroup(
name: string, // For error reporting.
optional?: boolean,
greediness?: ?number,
breakOnTokenText?: BreakToken,
mode?: Mode,
): ?AnyParseNode {
const outerMode = this.mode;
const firstToken = this.nextToken;
// Try to parse an open brace
if (this.nextToken.text === (optional ? "[" : "{")) {
// Switch to specified mode before we expand symbol after brace
const text = firstToken.text;
// Switch to specified mode
if (mode) {
this.switchMode(mode);
}
let result;
// Try to parse an open brace
if (text === (optional ? "[" : "{")) {
// Start a new group namespace
this.gullet.beginGroup();
// If we get a brace, parse an expression
@@ -921,24 +787,36 @@ export default class Parser {
this.gullet.endGroup();
// Make sure we get a close brace
this.expect(optional ? "]" : "}");
return newArgument({
return {
type: "ordgroup",
mode: this.mode,
loc: SourceLocation.range(firstToken, lastToken),
body: expression,
}, firstToken.range(lastToken, firstToken.text));
};
} else if (optional) {
// Return nothing for an optional group
result = null;
} else {
// Otherwise, just return a nucleus, or nothing for an optional group
if (mode) {
this.switchMode(mode);
// If there exists a function with this name, parse the function.
// Otherwise, just return a nucleus
result = this.parseFunction(breakOnTokenText, name, greediness) ||
this.parseSymbol();
if (result == null && text[0] === "\\" &&
!implicitCommands.hasOwnProperty(text)) {
if (this.settings.throwOnError) {
throw new ParseError(
"Undefined control sequence: " + text, firstToken);
}
const result = optional ? null : this.parseSymbol();
result = this.handleUnsupportedCmd();
}
}
// Switch mode back
if (mode) {
this.switchMode(outerMode);
}
return result;
}
}
/**
* Form ligature-like combinations of characters for text mode.
@@ -986,20 +864,14 @@ export default class Parser {
}
/**
* Parse a single symbol out of the string. Here, we handle both the functions
* we have defined, as well as the single character symbols
* Parse a single symbol out of the string. Here, we handle single character
* symbols and special functions like verbatim
*/
parseSymbol(): ?ParsedFuncOrArg {
parseSymbol(): ?AnyParseNode {
const nucleus = this.nextToken;
let text = nucleus.text;
if (functions[text]) {
// If there exists a function with this name, we return the
// function and say that it is a function.
// The token will be consumed later in parseGivenFunction
// (after possibly switching modes).
return newFunction(nucleus);
} else if (/^\\verb[^a-zA-Z]/.test(text)) {
if (/^\\verb[^a-zA-Z]/.test(text)) {
this.consume();
let arg = text.slice(5);
const star = (arg.charAt(0) === "*");
@@ -1013,12 +885,12 @@ export default class Parser {
please report what input caused this bug`);
}
arg = arg.slice(1, -1); // remove first and last char
return newArgument({
return {
type: "verb",
mode: "text",
body: arg,
star,
}, nucleus);
};
} else if (text === "%") {
this.consumeComment();
return this.parseSymbol();
@@ -1123,6 +995,6 @@ export default class Parser {
};
}
}
return newArgument(symbol, nucleus);
return symbol;
}
}

View File

@@ -114,11 +114,8 @@ function parseArray(
}
break;
} else if (next === "\\cr") {
const cr = parser.parseFunction();
if (!cr) {
throw new ParseError(`Failed to parse function after ${next}`);
}
rowGaps.push(assertNodeType(cr, "cr").size);
const cr = assertNodeType(parser.parseFunction(), "cr");
rowGaps.push(cr.size);
// check for \hline(s) following the row separator
hLinesBeforeRow.push(getHLines(parser));

View File

@@ -170,16 +170,13 @@ defineFunction({
--parser.leftrightDepth;
// Check the next token
parser.expect("\\right", false);
const right = parser.parseFunction();
if (!right) {
throw new ParseError('failed to parse function after \\right');
}
const right = assertNodeType(parser.parseFunction(), "leftright-right");
return {
type: "leftright",
mode: parser.mode,
body,
left: delim.text,
right: assertNodeType(right, "leftright-right").delim,
right: right.delim,
};
},
htmlBuilder: (group, options) => {

View File

@@ -2,9 +2,11 @@
import defineFunction from "../defineFunction";
import ParseError from "../ParseError";
import {assertNodeType} from "../parseNode";
import environments from "../environments";
// Environment delimiters. HTML/MathML rendering is defined in the corresponding
// defineEnvironment definitions.
// $FlowFixMe, "environment" handler returns an environment ParseNode
defineFunction({
type: "environment",
names: ["\\begin", "\\end"],
@@ -12,19 +14,48 @@ defineFunction({
numArgs: 1,
argTypes: ["text"],
},
handler({parser}, args) {
handler({parser, funcName}, args) {
const nameGroup = args[0];
if (nameGroup.type !== "ordgroup") {
throw new ParseError("Invalid environment name", nameGroup);
}
let name = "";
let envName = "";
for (let i = 0; i < nameGroup.body.length; ++i) {
name += assertNodeType(nameGroup.body[i], "textord").text;
envName += assertNodeType(nameGroup.body[i], "textord").text;
}
if (funcName === "\\begin") {
// begin...end is similar to left...right
if (!environments.hasOwnProperty(envName)) {
throw new ParseError(
"No such environment: " + envName, nameGroup);
}
// Build the environment object. Arguments and other information will
// be made available to the begin and end methods using properties.
const env = environments[envName];
const {args, optArgs} =
parser.parseArguments("\\begin{" + envName + "}", env);
const context = {
mode: parser.mode,
envName,
parser,
};
const result = env.handler(context, args, optArgs);
parser.expect("\\end", false);
const endNameToken = parser.nextToken;
const end = assertNodeType(parser.parseFunction(), "environment");
if (end.name !== envName) {
throw new ParseError(
`Mismatch: \\begin{${envName}} matched by \\end{${end.name}}`,
endNameToken);
}
return result;
}
return {
type: "environment",
mode: parser.mode,
name,
name: envName,
nameGroup,
};
},

View File

@@ -29,7 +29,7 @@ describe("Parser:", function() {
it("rejects \\sqrt as argument to ^", function() {
expect`1^\sqrt{2}`.toFailWithParseError(
"Got function '\\sqrt' with no arguments as superscript" +
" at position 2: 1^̲\\sqrt{2}");
" at position 3: 1^\\̲s̲q̲r̲t̲{2}");
});
});
@@ -106,28 +106,17 @@ describe("Parser:", function() {
" at position 10: 1^{2\\sqrt}̲");
});
it("complains about functions as arguments to others", function() {
// TODO: The position looks pretty wrong here
expect`\sqrt\over2`.toFailWithParseError(
"Got function '\\over' as argument to '\\sqrt'" +
" at position 6: \\sqrt\\̲o̲v̲e̲r̲2");
"Got function '\\over' with no arguments as argument to" +
" '\\sqrt' at position 6: \\sqrt\\̲o̲v̲e̲r̲2");
});
});
describe("#parseArguments", function() {
it("complains about missing argument at end of input", function() {
expect`2\sqrt`.toFailWithParseError(
"Expected group after '\\sqrt' at end of input: 2\\sqrt");
});
it("complains about missing argument at end of group", function() {
expect`1^{2\sqrt}`.toFailWithParseError(
"Expected group after '\\sqrt'" +
" at position 10: 1^{2\\sqrt}̲");
});
it("complains about functions as arguments to others", function() {
// TODO: The position looks pretty wrong here
expect`\sqrt\over2`.toFailWithParseError(
"Got function '\\over' as argument to '\\sqrt'" +
" at position 6: \\sqrt\\̲o̲v̲e̲r̲2");
describe("#parseGroup", function() {
it("complains about undefined control sequence", function() {
expect`\xyz`.toFailWithParseError(
"Undefined control sequence: \\xyz" +
" at position 1: \\̲x̲y̲z̲");
});
});
@@ -248,7 +237,6 @@ describe("environments.js:", function() {
describe("array environment", function() {
it("rejects unknown column types", function() {
// TODO: The error position here looks strange
expect`\begin{array}{cba}\end{array}`.toFailWithParseError(
"Unknown column alignment: b at position 16:" +
" \\begin{array}{cb̲a}\\end{array}");