Files
KaTeX/src/Parser.js
Erik Demaine 3800dc49c1 [breaking] trust setting to indicate whether input text is trusted (#1794)
* trust option to indicate whether input text is trusted

* Revamp into trust contexts beyond just command

* Document new trust function style

* Fix screenshot testing

* Use trust setting in \url and \href

* Check `isTrusted` in `\url` and `\href` (so now disabled by default)
* Automatically compute `protocol` from `url` in `isTrusted`, so it
  doesn't need to be passed into every context.

* Document untrusted features in support list/table

* Existing tests trust by default

* remove allowedProtocols and fix flow errors

* remove 'allowedProtocols' from documentation

* add a comment about a flow error, rename urlToProtocol to protocolFromUrl

* add tests that use the function version of the trust option

* default trust to false in MathML tests

* fix test title, remove 'trust: false' from test settings since it's the default
2019-07-08 21:57:23 -04:00

989 lines
35 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// @flow
/* eslint no-constant-condition:0 */
import functions from "./functions";
import MacroExpander, {implicitCommands} from "./MacroExpander";
import symbols, {ATOMS, extraLatin} from "./symbols";
import {validUnit} from "./units";
import {supportedCodepoint} from "./unicodeScripts";
import unicodeAccents from "./unicodeAccents";
import unicodeSymbols from "./unicodeSymbols";
import {checkNodeType} from "./parseNode";
import ParseError from "./ParseError";
import {combiningDiacriticalMarksEndRegex} from "./Lexer";
import Settings from "./Settings";
import SourceLocation from "./SourceLocation";
import {Token} from "./Token";
import type {ParseNode, AnyParseNode, SymbolParseNode, UnsupportedCmdParseNode}
from "./parseNode";
import type {Atom, Group} from "./symbols";
import type {Mode, ArgType, BreakToken} from "./types";
import type {FunctionContext, FunctionSpec} from "./defineFunction";
import type {EnvSpec} from "./defineEnvironment";
/**
* This file contains the parser used to parse out a TeX expression from the
* input. Since TeX isn't context-free, standard parsers don't work particularly
* well.
*
* The strategy of this parser is as such:
*
* The main functions (the `.parse...` ones) take a position in the current
* parse string to parse tokens from. The lexer (found in Lexer.js, stored at
* this.gullet.lexer) also supports pulling out tokens at arbitrary places. When
* individual tokens are needed at a position, the lexer is called to pull out a
* token, which is then used.
*
* The parser has a property called "mode" indicating the mode that
* the parser is currently in. Currently it has to be one of "math" or
* "text", which denotes whether the current environment is a math-y
* one or a text-y one (e.g. inside \text). Currently, this serves to
* limit the functions which can be used in text mode.
*
* The main functions then return an object which contains the useful data that
* was parsed at its given point, and a new position at the end of the parsed
* data. The main functions can call each other and continue the parsing by
* using the returned position as a new starting point.
*
* There are also extra `.handle...` functions, which pull out some reused
* functionality into self-contained functions.
*
* The functions return ParseNodes.
*/
export default class Parser {
// Current parsing mode ("math" or "text"). Set to "math" by the constructor.
mode: Mode;
// Macro expander that supplies tokens; the lexer is reachable through its
// `lexer` property.
gullet: MacroExpander;
// The settings object passed to the constructor.
settings: Settings;
// Depth counter initialised to 0 by the constructor (used for \middle errors).
leftrightDepth: number;
// One-token look-ahead. Not assigned by the constructor; it is first filled
// when parse() calls consume().
nextToken: Token;
constructor(input: string, settings: Settings) {
// Start in math mode
this.mode = "math";
// Create a new macro expander (gullet) and (indirectly via that) also a
// new lexer (mouth) for this parser (stomach, in the language of TeX)
this.gullet = new MacroExpander(input, settings, this.mode);
// Store the settings for use in parsing
this.settings = settings;
// Count leftright depth (for \middle errors)
this.leftrightDepth = 0;
}
/**
* Checks a result to make sure it has the right type, and throws an
* appropriate error otherwise.
*/
expect(text: string, consume?: boolean = true) {
if (this.nextToken.text !== text) {
throw new ParseError(
"Expected '" + text + "', got '" + this.nextToken.text + "'",
this.nextToken
);
}
if (consume) {
this.consume();
}
}
/**
* Considers the current look ahead token as consumed,
* and fetches the one after that as the new look ahead.
* The replacement token is obtained from `this.gullet.expandNextToken()`
* and stored in `this.nextToken`.
*/
consume() {
this.nextToken = this.gullet.expandNextToken();
}
/**
* Switches between "text" and "math" modes.
* Updates both the parser's own `mode` and, via `gullet.switchMode`, the
* macro expander, so the two stay in step.
*/
switchMode(newMode: Mode) {
this.mode = newMode;
this.gullet.switchMode(newMode);
}
/**
* Main parsing function, which parses an entire input.
*/
parse(): AnyParseNode[] {
// Create a group namespace for the math expression.
// (LaTeX creates a new group for every $...$, $$...$$, \[...\].)
this.gullet.beginGroup();
// Use old \color behavior (same as LaTeX's \textcolor) if requested.
// We do this within the group for the math expression, so it doesn't
// pollute settings.macros.
if (this.settings.colorIsTextColor) {
this.gullet.macros.set("\\color", "\\textcolor");
}
// Try to parse the input
this.consume();
const parse = this.parseExpression(false);
// If we succeeded, make sure there's an EOF at the end
this.expect("EOF", false);
// End the group namespace for the expression
this.gullet.endGroup();
return parse;
}
static endOfExpression = ["}", "\\endgroup", "\\end", "\\right", "&"];
static endOfGroup = {
"[": "]",
"{": "}",
"\\begingroup": "\\endgroup",
}
/**
* Parses an "expression", which is a list of atoms.
*
* `breakOnInfix`: Should the parsing stop when we hit infix nodes? This
* happens when functions have higher precendence han infix
* nodes in implicit parses.
*
* `breakOnTokenText`: The text of the token that the expression should end
* with, or `null` if something else should end the
* expression.
*/
parseExpression(
breakOnInfix: boolean,
breakOnTokenText?: BreakToken,
): AnyParseNode[] {
const body = [];
// Keep adding atoms to the body until we can't parse any more atoms (either
// we reached the end, a }, or a \right)
while (true) {
// Ignore spaces in math mode
if (this.mode === "math") {
this.consumeSpaces();
}
const lex = this.nextToken;
if (Parser.endOfExpression.indexOf(lex.text) !== -1) {
break;
}
if (breakOnTokenText && lex.text === breakOnTokenText) {
break;
}
if (breakOnInfix && functions[lex.text] && functions[lex.text].infix) {
break;
}
const atom = this.parseAtom(breakOnTokenText);
if (!atom) {
break;
}
body.push(atom);
}
if (this.mode === "text") {
this.formLigatures(body);
}
return this.handleInfixNodes(body);
}
/**
* Rewrites infix operators such as \over with corresponding commands such
* as \frac.
*
* There can only be one infix operator per group. If there's more than one
* then the expression is ambiguous. This can be resolved by adding {}.
*/
handleInfixNodes(body: AnyParseNode[]): AnyParseNode[] {
let overIndex = -1;
let funcName;
for (let i = 0; i < body.length; i++) {
const node = checkNodeType(body[i], "infix");
if (node) {
if (overIndex !== -1) {
throw new ParseError(
"only one infix operator per group",
node.token);
}
overIndex = i;
funcName = node.replaceWith;
}
}
if (overIndex !== -1 && funcName) {
let numerNode;
let denomNode;
const numerBody = body.slice(0, overIndex);
const denomBody = body.slice(overIndex + 1);
if (numerBody.length === 1 && numerBody[0].type === "ordgroup") {
numerNode = numerBody[0];
} else {
numerNode = {type: "ordgroup", mode: this.mode, body: numerBody};
}
if (denomBody.length === 1 && denomBody[0].type === "ordgroup") {
denomNode = denomBody[0];
} else {
denomNode = {type: "ordgroup", mode: this.mode, body: denomBody};
}
let node;
if (funcName === "\\\\abovefrac") {
node = this.callFunction(funcName,
[numerNode, body[overIndex], denomNode], []);
} else {
node = this.callFunction(funcName, [numerNode, denomNode], []);
}
return [node];
} else {
return body;
}
}
// The greediness of a superscript or subscript. handleSupSubscript passes
// this value as the `greediness` argument of parseGroup.
static SUPSUB_GREEDINESS = 1;
/**
 * Parses the argument of a subscript or superscript marker with nice
 * errors. The look-ahead token is taken to be the marker itself; it is
 * consumed, following spaces are skipped, and the group after it is
 * returned. Throws a ParseError at the marker when no group follows.
 */
handleSupSubscript(
    name: string,   // For error reporting.
): AnyParseNode {
    const marker = this.nextToken;
    const markerText = marker.text;
    this.consume();
    // Spaces between the marker and its argument are ignored.
    this.consumeSpaces();
    const argument = this.parseGroup(name, false, Parser.SUPSUB_GREEDINESS);
    if (argument) {
        return argument;
    }
    throw new ParseError(
        "Expected group after '" + markerText + "'",
        marker
    );
}
/**
* Converts the textual input of an unsupported command into a text node
* contained within a color node whose color is determined by errorColor
*/
formatUnsupportedCmd(text: string): UnsupportedCmdParseNode {
const textordArray = [];
for (let i = 0; i < text.length; i++) {
textordArray.push({type: "textord", mode: "text", text: text[i]});
}
const textNode = {
type: "text",
mode: this.mode,
body: textordArray,
};
const colorNode = {
type: "color",
mode: this.mode,
color: this.settings.errorColor,
body: [textNode],
};
return colorNode;
}
/**
* Parses a group with optional super/subscripts.
*/
parseAtom(breakOnTokenText?: BreakToken): ?AnyParseNode {
// The body of an atom is an implicit group, so that things like
// \left(x\right)^2 work correctly.
const base = this.parseGroup("atom", false, null, breakOnTokenText);
// In text mode, we don't have superscripts or subscripts
if (this.mode === "text") {
return base;
}
// Note that base may be empty (i.e. null) at this point.
let superscript;
let subscript;
while (true) {
// Guaranteed in math mode, so eat any spaces first.
this.consumeSpaces();
// Lex the first token
const lex = this.nextToken;
if (lex.text === "\\limits" || lex.text === "\\nolimits") {
// We got a limit control
const opNode = checkNodeType(base, "op");
if (opNode) {
const limits = lex.text === "\\limits";
opNode.limits = limits;
opNode.alwaysHandleSupSub = true;
} else {
throw new ParseError(
"Limit controls must follow a math operator",
lex);
}
this.consume();
} else if (lex.text === "^") {
// We got a superscript start
if (superscript) {
throw new ParseError("Double superscript", lex);
}
superscript = this.handleSupSubscript("superscript");
} else if (lex.text === "_") {
// We got a subscript start
if (subscript) {
throw new ParseError("Double subscript", lex);
}
subscript = this.handleSupSubscript("subscript");
} else if (lex.text === "'") {
// We got a prime
if (superscript) {
throw new ParseError("Double superscript", lex);
}
const prime = {type: "textord", mode: this.mode, text: "\\prime"};
// Many primes can be grouped together, so we handle this here
const primes = [prime];
this.consume();
// Keep lexing tokens until we get something that's not a prime
while (this.nextToken.text === "'") {
// For each one, add another prime to the list
primes.push(prime);
this.consume();
}
// If there's a superscript following the primes, combine that
// superscript in with the primes.
if (this.nextToken.text === "^") {
primes.push(this.handleSupSubscript("superscript"));
}
// Put everything into an ordgroup as the superscript
superscript = {type: "ordgroup", mode: this.mode, body: primes};
} else {
// If it wasn't ^, _, or ', stop parsing super/subscripts
break;
}
}
// Base must be set if superscript or subscript are set per logic above,
// but need to check here for type check to pass.
if (superscript || subscript) {
// If we got either a superscript or subscript, create a supsub
return {
type: "supsub",
mode: this.mode,
base: base,
sup: superscript,
sub: subscript,
};
} else {
// Otherwise return the original body
return base;
}
}
/**
* Parses an entire function, including its base and all of its arguments.
*/
parseFunction(
breakOnTokenText?: BreakToken,
name?: string, // For error reporting.
greediness?: ?number,
): ?AnyParseNode {
const token = this.nextToken;
const func = token.text;
const funcData = functions[func];
if (!funcData) {
return null;
}
if (greediness != null && funcData.greediness <= greediness) {
throw new ParseError(
"Got function '" + func + "' with no arguments" +
(name ? " as " + name : ""), token);
} else if (this.mode === "text" && !funcData.allowedInText) {
throw new ParseError(
"Can't use function '" + func + "' in text mode", token);
} else if (this.mode === "math" && funcData.allowedInMath === false) {
throw new ParseError(
"Can't use function '" + func + "' in math mode", token);
}
// hyperref package sets the catcode of % as an active character
if (funcData.argTypes && funcData.argTypes[0] === "url") {
this.gullet.lexer.setCatcode("%", 13);
}
// Consume the command token after possibly switching to the
// mode specified by the function (for instant mode switching),
// and then immediately switch back.
if (funcData.consumeMode) {
const oldMode = this.mode;
this.switchMode(funcData.consumeMode);
this.consume();
this.switchMode(oldMode);
} else {
this.consume();
}
const {args, optArgs} = this.parseArguments(func, funcData);
return this.callFunction(func, args, optArgs, token, breakOnTokenText);
}
/**
* Call a function handler with a suitable context and arguments.
*/
callFunction(
name: string,
args: AnyParseNode[],
optArgs: (?AnyParseNode)[],
token?: Token,
breakOnTokenText?: BreakToken,
): AnyParseNode {
const context: FunctionContext = {
funcName: name,
parser: this,
token,
breakOnTokenText,
};
const func = functions[name];
if (func && func.handler) {
return func.handler(context, args, optArgs);
} else {
throw new ParseError(`No function handler for ${name}`);
}
}
/**
* Parses the arguments of a function or environment
*/
parseArguments(
func: string, // Should look like "\name" or "\begin{name}".
funcData: FunctionSpec<*> | EnvSpec<*>,
): {
args: AnyParseNode[],
optArgs: (?AnyParseNode)[],
} {
const totalArgs = funcData.numArgs + funcData.numOptionalArgs;
if (totalArgs === 0) {
return {args: [], optArgs: []};
}
const baseGreediness = funcData.greediness;
const args = [];
const optArgs = [];
for (let i = 0; i < totalArgs; i++) {
const argType = funcData.argTypes && funcData.argTypes[i];
const isOptional = i < funcData.numOptionalArgs;
// Ignore spaces between arguments. As the TeXbook says:
// "After you have said \def\row#1#2{...}, you are allowed to
// put spaces between the arguments (e.g., \row x n), because
// TeX doesnt use single spaces as undelimited arguments."
if (i > 0 && !isOptional) {
this.consumeSpaces();
}
// Also consume leading spaces in math mode, as parseSymbol
// won't know what to do with them. This can only happen with
// macros, e.g. \frac\foo\foo where \foo expands to a space symbol.
// In LaTeX, the \foo's get treated as (blank) arguments).
// In KaTeX, for now, both spaces will get consumed.
// TODO(edemaine)
if (i === 0 && !isOptional && this.mode === "math") {
this.consumeSpaces();
}
const nextToken = this.nextToken;
const arg = this.parseGroupOfType("argument to '" + func + "'",
argType, isOptional, baseGreediness);
if (!arg) {
if (isOptional) {
optArgs.push(null);
continue;
}
throw new ParseError(
"Expected group after '" + func + "'", nextToken);
}
(isOptional ? optArgs : args).push(arg);
}
return {args, optArgs};
}
/**
* Parses a group when the mode is changing.
*/
parseGroupOfType(
name: string,
type: ?ArgType,
optional: boolean,
greediness: ?number,
): ?AnyParseNode {
switch (type) {
case "color":
return this.parseColorGroup(optional);
case "size":
return this.parseSizeGroup(optional);
case "url":
return this.parseUrlGroup(optional);
case "math":
case "text":
return this.parseGroup(name, optional, greediness, undefined, type);
case "raw": {
if (optional && this.nextToken.text === "{") {
return null;
}
const token = this.parseStringGroup("raw", optional, true);
if (token) {
return {
type: "raw",
mode: "text",
string: token.text,
};
} else {
throw new ParseError("Expected raw group", this.nextToken);
}
}
case "original":
case null:
case undefined:
return this.parseGroup(name, optional, greediness);
default:
throw new ParseError(
"Unknown group type as " + name, this.nextToken);
}
}
/**
* Skips over space tokens: keeps consuming for as long as the look-ahead
* token's text is a single space.
*/
consumeSpaces() {
while (this.nextToken.text === " ") {
this.consume();
}
}
/**
* Parses a group, essentially returning the string formed by the
* brace-enclosed tokens plus some position information.
*
* The group is delimited by [ ] when `optional` is true and by { }
* otherwise. Returns null when an optional group is absent. When `raw` is
* set, a mandatory group may also be a single undelimited token, and
* delimiters nested inside the group are balanced rather than ending it.
* Throws a ParseError on a missing opening delimiter (via `expect`) or when
* EOF is reached before the group is closed.
*/
parseStringGroup(
modeName: ArgType, // Used to describe the mode in error messages.
optional: boolean,
raw?: boolean,
): ?Token {
const groupBegin = optional ? "[" : "{";
const groupEnd = optional ? "]" : "}";
const nextToken = this.nextToken;
if (nextToken.text !== groupBegin) {
if (optional) {
return null;
} else if (raw && nextToken.text !== "EOF" &&
/[^{}[\]]/.test(nextToken.text)) {
// allow a single token in raw string group, provided its text contains
// something other than braces and brackets
this.gullet.lexer.setCatcode("%", 14); // reset the catcode of %
this.consume();
return nextToken;
}
// Otherwise fall through: expect(groupBegin) below reports the error.
}
const outerMode = this.mode;
// Only the parser's own mode is changed here; switchMode (which also
// informs the gullet) is deliberately or incidentally not used.
// NOTE(review): confirm the gullet is meant to stay in the outer mode.
this.mode = "text";
this.expect(groupBegin);
let str = "";
const firstToken = this.nextToken;
let nested = 0; // allow nested braces in raw string group
let lastToken = firstToken;
// Without `raw` the loop ends at the first closing delimiter; with `raw`
// it only ends at a closing delimiter seen while `nested` is zero.
while ((raw && nested > 0) || this.nextToken.text !== groupEnd) {
switch (this.nextToken.text) {
case "EOF":
throw new ParseError(
"Unexpected end of input in " + modeName,
firstToken.range(lastToken, str));
case groupBegin:
nested++;
break;
case groupEnd:
nested--;
break;
}
// Every token inside the group, delimiters included, is appended verbatim.
lastToken = this.nextToken;
str += lastToken.text;
this.consume();
}
this.mode = outerMode;
this.gullet.lexer.setCatcode("%", 14); // reset the catcode of %
this.expect(groupEnd);
// For an empty group, firstToken and lastToken are both the closing
// delimiter and str is "".
return firstToken.range(lastToken, str);
}
/**
* Parses a regex-delimited group: the largest sequence of tokens
* whose concatenated strings match `regex`. Returns the string
* formed by the tokens plus some position information.
*/
parseRegexGroup(
regex: RegExp,
modeName: string, // Used to describe the mode in error messages.
): Token {
const outerMode = this.mode;
this.mode = "text";
const firstToken = this.nextToken;
let lastToken = firstToken;
let str = "";
while (this.nextToken.text !== "EOF" &&
regex.test(str + this.nextToken.text)) {
lastToken = this.nextToken;
str += lastToken.text;
this.consume();
}
if (str === "") {
throw new ParseError(
"Invalid " + modeName + ": '" + firstToken.text + "'",
firstToken);
}
this.mode = outerMode;
return firstToken.range(lastToken, str);
}
/**
* Parses a color description.
*/
parseColorGroup(optional: boolean): ?ParseNode<"color-token"> {
const res = this.parseStringGroup("color", optional);
if (!res) {
return null;
}
const match = (/^(#[a-f0-9]{3}|#?[a-f0-9]{6}|[a-z]+)$/i).exec(res.text);
if (!match) {
throw new ParseError("Invalid color: '" + res.text + "'", res);
}
let color = match[0];
if (/^[0-9a-f]{6}$/i.test(color)) {
// We allow a 6-digit HTML color spec without a leading "#".
// This follows the xcolor package's HTML color model.
// Predefined color names are all missed by this RegEx pattern.
color = "#" + color;
}
return {
type: "color-token",
mode: this.mode,
color,
};
}
/**
* Parses a size specification, consisting of magnitude and unit.
*/
parseSizeGroup(optional: boolean): ?ParseNode<"size"> {
let res;
let isBlank = false;
if (!optional && this.nextToken.text !== "{") {
res = this.parseRegexGroup(
/^[-+]? *(?:$|\d+|\d+\.\d*|\.\d*) *[a-z]{0,2} *$/, "size");
} else {
res = this.parseStringGroup("size", optional);
}
if (!res) {
return null;
}
if (!optional && res.text.length === 0) {
// Because we've tested for what is !optional, this block won't
// affect \kern, \hspace, etc. It will capture the mandatory arguments
// to \genfrac and \above.
res.text = "0pt"; // Enable \above{}
isBlank = true; // This is here specifically for \genfrac
}
const match = (/([-+]?) *(\d+(?:\.\d*)?|\.\d+) *([a-z]{2})/).exec(res.text);
if (!match) {
throw new ParseError("Invalid size: '" + res.text + "'", res);
}
const data = {
number: +(match[1] + match[2]), // sign + magnitude, cast to number
unit: match[3],
};
if (!validUnit(data)) {
throw new ParseError("Invalid unit: '" + data.unit + "'", res);
}
return {
type: "size",
mode: this.mode,
value: data,
isBlank,
};
}
/**
* Parses an URL, checking escaped letters and allowed protocols.
*/
parseUrlGroup(optional: boolean): ?ParseNode<"url"> {
const res = this.parseStringGroup("url", optional, true); // get raw string
if (!res) {
return null;
}
// hyperref package allows backslashes alone in href, but doesn't
// generate valid links in such cases; we interpret this as
// "undefined" behaviour, and keep them as-is. Some browser will
// replace backslashes with forward slashes.
const url = res.text.replace(/\\([#$%&~_^{}])/g, '$1');
return {
type: "url",
mode: this.mode,
url,
};
}
/**
* If `optional` is false or absent, this parses an ordinary group,
* which is either a single nucleus (like "x") or an expression
* in braces (like "{x+y}") or an implicit group, a group that starts
* at the current position, and ends right before a higher explicit
* group ends, or at EOF.
* If `optional` is true, it parses either a bracket-delimited expression
* (like "[x+y]") or returns null to indicate the absence of a
* bracket-enclosed group.
* If `mode` is present, switches to that mode while parsing the group,
* and switches back after.
*
* An undefined control sequence either raises a ParseError (when
* `settings.throwOnError` is set) or is turned into an error-colored node
* by formatUnsupportedCmd.
*/
parseGroup(
name: string, // For error reporting.
optional?: boolean,
greediness?: ?number,
breakOnTokenText?: BreakToken,
mode?: Mode,
): ?AnyParseNode {
const outerMode = this.mode;
const firstToken = this.nextToken;
const text = firstToken.text;
// Switch to specified mode
if (mode) {
this.switchMode(mode);
}
let groupEnd;
let result;
// Try to parse an open brace or \begingroup
// (The conditional binds loosest: an optional group must open with "[",
// a mandatory one with "{" or \begingroup.)
if (optional ? text === "[" : text === "{" || text === "\\begingroup") {
groupEnd = Parser.endOfGroup[text];
// Start a new group namespace
this.gullet.beginGroup();
// If we get a brace, parse an expression
this.consume();
const expression = this.parseExpression(false, groupEnd);
// The look-ahead is now whatever stopped the expression; the closing
// delimiter itself is only checked at the end of this method.
const lastToken = this.nextToken;
// End group namespace before consuming symbol after close brace
this.gullet.endGroup();
result = {
type: "ordgroup",
// Read before the mode is switched back, so this is the inner mode.
mode: this.mode,
loc: SourceLocation.range(firstToken, lastToken),
body: expression,
// A group formed by \begingroup...\endgroup is a semi-simple group
// which doesn't affect spacing in math mode, i.e., is transparent.
// https://tex.stackexchange.com/questions/1930/when-should-one-
// use-begingroup-instead-of-bgroup
semisimple: text === "\\begingroup" || undefined,
};
} else if (optional) {
// Return nothing for an optional group
result = null;
} else {
// If there exists a function with this name, parse the function.
// Otherwise, just return a nucleus
result = this.parseFunction(breakOnTokenText, name, greediness) ||
this.parseSymbol();
// Neither a function nor a symbol: a control sequence that is not an
// implicit command is reported as undefined.
if (result == null && text[0] === "\\" &&
!implicitCommands.hasOwnProperty(text)) {
if (this.settings.throwOnError) {
throw new ParseError(
"Undefined control sequence: " + text, firstToken);
}
result = this.formatUnsupportedCmd(text);
this.consume();
}
}
// Switch mode back
if (mode) {
this.switchMode(outerMode);
}
// Make sure we got a close brace
// (checked after the mode is restored, so the token following the closing
// delimiter is fetched in the outer mode)
if (groupEnd) {
this.expect(groupEnd);
}
return result;
}
/**
* Form ligature-like combinations of characters for text mode.
* This includes inputs like "--", "---", "``" and "''".
* The result will simply replace multiple textord nodes with a single
* character in each value by a single textord node having multiple
* characters in its value. The representation is still ASCII source.
* The group will be modified in place.
*/
formLigatures(group: AnyParseNode[]) {
// `n` is the index of the last element; it shrinks with every splice so
// that `group[i + 1]` always exists inside the loop.
let n = group.length - 1;
for (let i = 0; i < n; ++i) {
const a = group[i];
// $FlowFixMe: Not every node type has a `text` property.
const v = a.text;
if (v === "-" && group[i + 1].text === "-") {
// Prefer the three-character form when a third "-" follows.
if (i + 1 < n && group[i + 2].text === "-") {
group.splice(i, 3, {
type: "textord",
mode: "text",
loc: SourceLocation.range(a, group[i + 2]),
text: "---",
});
n -= 2;
} else {
group.splice(i, 2, {
type: "textord",
mode: "text",
loc: SourceLocation.range(a, group[i + 1]),
text: "--",
});
n -= 1;
}
}
// `v` still holds the text read before any splice above, so this branch
// cannot fire right after a dash ligature was formed.
if ((v === "'" || v === "`") && group[i + 1].text === v) {
group.splice(i, 2, {
type: "textord",
mode: "text",
loc: SourceLocation.range(a, group[i + 1]),
text: v + v,
});
n -= 1;
}
}
}
/**
* Parse a single symbol out of the string. Here, we handle single character
* symbols and special functions like verbatim
*/
parseSymbol(): ?AnyParseNode {
const nucleus = this.nextToken;
let text = nucleus.text;
if (/^\\verb[^a-zA-Z]/.test(text)) {
this.consume();
let arg = text.slice(5);
const star = (arg.charAt(0) === "*");
if (star) {
arg = arg.slice(1);
}
// Lexer's tokenRegex is constructed to always have matching
// first/last characters.
if (arg.length < 2 || arg.charAt(0) !== arg.slice(-1)) {
throw new ParseError(`\\verb assertion failed --
please report what input caused this bug`);
}
arg = arg.slice(1, -1); // remove first and last char
return {
type: "verb",
mode: "text",
body: arg,
star,
};
}
// At this point, we should have a symbol, possibly with accents.
// First expand any accented base symbol according to unicodeSymbols.
if (unicodeSymbols.hasOwnProperty(text[0]) &&
!symbols[this.mode][text[0]]) {
// This behavior is not strict (XeTeX-compatible) in math mode.
if (this.settings.strict && this.mode === "math") {
this.settings.reportNonstrict("unicodeTextInMathMode",
`Accented Unicode text character "${text[0]}" used in ` +
`math mode`, nucleus);
}
text = unicodeSymbols[text[0]] + text.substr(1);
}
// Strip off any combining characters
const match = combiningDiacriticalMarksEndRegex.exec(text);
if (match) {
text = text.substring(0, match.index);
if (text === 'i') {
text = '\u0131'; // dotless i, in math and text mode
} else if (text === 'j') {
text = '\u0237'; // dotless j, in math and text mode
}
}
// Recognize base symbol
let symbol: AnyParseNode;
if (symbols[this.mode][text]) {
if (this.settings.strict && this.mode === 'math' &&
extraLatin.indexOf(text) >= 0) {
this.settings.reportNonstrict("unicodeTextInMathMode",
`Latin-1/Unicode text character "${text[0]}" used in ` +
`math mode`, nucleus);
}
const group: Group = symbols[this.mode][text].group;
const loc = SourceLocation.range(nucleus);
let s: SymbolParseNode;
if (ATOMS.hasOwnProperty(group)) {
// $FlowFixMe
const family: Atom = group;
s = {
type: "atom",
mode: this.mode,
family,
loc,
text,
};
} else {
// $FlowFixMe
s = {
type: group,
mode: this.mode,
loc,
text,
};
}
symbol = s;
} else if (text.charCodeAt(0) >= 0x80) { // no symbol for e.g. ^
if (this.settings.strict) {
if (!supportedCodepoint(text.charCodeAt(0))) {
this.settings.reportNonstrict("unknownSymbol",
`Unrecognized Unicode character "${text[0]}"` +
` (${text.charCodeAt(0)})`, nucleus);
} else if (this.mode === "math") {
this.settings.reportNonstrict("unicodeTextInMathMode",
`Unicode text character "${text[0]}" used in math mode`,
nucleus);
}
}
symbol = {
type: "textord",
mode: this.mode,
loc: SourceLocation.range(nucleus),
text,
};
} else {
return null; // EOF, ^, _, {, }, etc.
}
this.consume();
// Transform combining characters into accents
if (match) {
for (let i = 0; i < match[0].length; i++) {
const accent: string = match[0][i];
if (!unicodeAccents[accent]) {
throw new ParseError(`Unknown accent ' ${accent}'`, nucleus);
}
const command = unicodeAccents[accent][this.mode];
if (!command) {
throw new ParseError(
`Accent ${accent} unsupported in ${this.mode} mode`,
nucleus);
}
symbol = {
type: "accent",
mode: this.mode,
loc: SourceLocation.range(nucleus),
label: command,
isStretchy: false,
isShifty: true,
base: symbol,
};
}
}
return symbol;
}
}