Files
KaTeX/src/Lexer.js
renovate[bot] 16a34c3b35 chore(deps): update dependency flow-bin to v0.134.0 [skip netlify] (#2533)
* chore(deps): update dependency flow-bin to v0.134.0 [skip netlify]

* Fix flow errors

Co-authored-by: Renovate Bot <bot@renovateapp.com>
Co-authored-by: Young Min Kin <mail@ylem.kim>
2020-09-26 12:19:42 +09:00

120 lines
4.7 KiB
JavaScript

// @flow
/**
* The Lexer class handles tokenizing the input in various ways. Since our
* parser expects us to be able to backtrack, the lexer allows lexing from any
* given starting point.
*
* Its main exposed function is the `lex` function, which takes a position to
* lex from and a type of token to lex. It defers to the appropriate `_innerLex`
* function.
*
* The various `_innerLex` functions perform the actual lexing of different
* kinds.
*/
import ParseError from "./ParseError";
import SourceLocation from "./SourceLocation";
import {Token} from "./Token";
import type {LexerInterface} from "./Token";
import type Settings from "./Settings";
/* The following tokenRegex
* - matches typical whitespace (but not NBSP etc.) using its first group
* - does not match any control character \x00-\x1f except whitespace
* - does not match a bare backslash
* - matches any ASCII character except those just mentioned
* - does not match the BMP private use area \uE000-\uF8FF
* - does not match bare surrogate code units
* - matches any BMP character except for those just described
* - matches any valid Unicode surrogate pair
* - matches a backslash followed by one or more letters
* - matches a backslash followed by any BMP character, including newline
* Just because the Lexer matches something doesn't mean it's valid input:
* If there is no matching function or symbol definition, the Parser will
* still reject the input.
*/
const spaceRegexString = "[ \r\n\t]";
const controlWordRegexString = "\\\\[a-zA-Z@]+";
const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";
const controlWordWhitespaceRegexString =
`${controlWordRegexString}${spaceRegexString}*`;
const controlWordWhitespaceRegex = new RegExp(
`^(${controlWordRegexString})${spaceRegexString}*$`);
const combiningDiacriticalMarkString = "[\u0300-\u036f]";
export const combiningDiacriticalMarksEndRegex: RegExp =
new RegExp(`${combiningDiacriticalMarkString}+$`);
const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
"([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
`${combiningDiacriticalMarkString}*` + // ...plus accents
"|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair
`${combiningDiacriticalMarkString}*` + // ...plus accents
"|\\\\verb\\*([^]).*?\\3" + // \verb*
"|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred
"|\\\\operatorname\\*" + // \operatorname*
`|${controlWordWhitespaceRegexString}` + // \macroName + spaces
`|${controlSymbolRegexString})`; // \\, \', etc.
/** Main Lexer class */
export default class Lexer implements LexerInterface {
input: string;
settings: Settings;
tokenRegex: RegExp;
// category codes, only supports comment characters (14) for now
catcodes: {[string]: number};
constructor(input: string, settings: Settings) {
// Separate accents from characters
this.input = input;
this.settings = settings;
this.tokenRegex = new RegExp(tokenRegexString, 'g');
this.catcodes = {
"%": 14, // comment character
};
}
setCatcode(char: string, code: number) {
this.catcodes[char] = code;
}
/**
* This function lexes a single token.
*/
lex(): Token {
const input = this.input;
const pos = this.tokenRegex.lastIndex;
if (pos === input.length) {
return new Token("EOF", new SourceLocation(this, pos, pos));
}
const match = this.tokenRegex.exec(input);
if (match === null || match.index !== pos) {
throw new ParseError(
`Unexpected character: '${input[pos]}'`,
new Token(input[pos], new SourceLocation(this, pos, pos + 1)));
}
let text = match[2] || " ";
if (this.catcodes[text] === 14) { // comment character
const nlIndex = input.indexOf('\n', this.tokenRegex.lastIndex);
if (nlIndex === -1) {
this.tokenRegex.lastIndex = input.length; // EOF
this.settings.reportNonstrict("commentAtEnd",
"% comment has no terminating newline; LaTeX would " +
"fail because of commenting the end of math mode (e.g. $)");
} else {
this.tokenRegex.lastIndex = nlIndex + 1;
}
return this.lex();
}
// Trim any trailing whitespace from control word match
const controlMatch = text.match(controlWordWhitespaceRegex);
if (controlMatch) {
text = controlMatch[1];
}
return new Token(text, new SourceLocation(this, pos,
this.tokenRegex.lastIndex));
}
}