KaTeX/src/Lexer.js
Erik Demaine 484d44ee70 Unicode accents (#992)
* Unicode accents

* Lexer now looks for combining diacritical marks and adds them to the same character
* Parser's `parseSymbol` now recognizes both combined and uncombined forms of Unicode accents, and builds accent objects just like the accent functions (see the sketch after the commit log)
* Added CJK support to math mode (not just text mode)

* Add invalid combining character test

* Add MathML test

* Add weak support for other Latin-1 characters

This maintains backwards compatibility, but it uses the wrong font.
There's a TODO to fix this later.

Also refactor symbol code to use for..of

* Update Unicode screenshot

* Remove dot from accented i and j (in math mode)

Also add dotless Unicode characters to support some accented i's and j's

* Fix \imath, \jmath, \pounds, and more tests

* Switch from for..of to .split().forEach()

Save around 800 bytes in minified code

* Fix split

* normalize() detection

* Convert back to vanilla for loops

* Fix merge

* Move normalize dependency to unicodeMake.js

* Make unicodeSymbols into a lookup table instead of macros

This is important for multi-accented characters.

* Add comments about when to run

* Move symbols definition into unicodeMake/Symbols.js

* Remove CJK support in text mode

* Add missing semicolon

* Refactor unicodeAccents to its own file

* Dotless i/j support in text mode

* Remove excess character mappings

* Fix Åå in math mode (still via Times)

* Update to support #1030

* Add accented Greek letter support (for supported Greek symbols)

* Update screenshot

* Remove Æ, æ, Ø, ø, and ß from math mode test
2017-12-28 23:32:45 -07:00
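
A minimal sketch of the user-visible effect described in the commit message above (illustrative only; the `katex` entry point and the sample inputs are assumptions, not part of this commit):

import katex from "katex";

// Precomposed U+00E9 ("é") and the decomposed pair "e" + U+0301 (combining
// acute) should now both be accepted in math mode: the lexer keeps the
// combining mark attached to its base character, and parseSymbol accepts
// either form, so neither input throws a ParseError.
katex.renderToString("\u00e9");   // precomposed "é"
katex.renderToString("e\u0301");  // decomposed "e" + combining acute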


// @flow
/**
 * The Lexer class handles tokenizing the input in various ways. Since our
 * parser expects us to be able to backtrack, the lexer allows lexing from any
 * given starting point.
 *
 * Its main exposed function is the `lex` function, which lexes the next token
 * starting at the current position and advances past it. The actual token
 * matching is done by the `tokenRegex` defined below.
 */
import matchAt from "match-at";
import ParseError from "./ParseError";
import SourceLocation from "./SourceLocation";
import {LexerInterface, Token} from "./Token";

/* The following tokenRegex
 * - matches typical whitespace (but not NBSP etc.) using its first group
 * - matches comments (must have trailing newlines)
 * - does not match any control character \x00-\x1f except whitespace
 * - does not match a bare backslash
 * - matches any ASCII character except those just mentioned
 * - does not match the BMP private use area \uE000-\uF8FF
 * - does not match bare surrogate code units
 * - matches any BMP character except for those just described
 * - matches any valid Unicode surrogate pair
 * - matches a backslash followed by one or more letters
 * - matches a backslash followed by any BMP character, including newline
 * Just because the Lexer matches something doesn't mean it's valid input:
 * If there is no matching function or symbol definition, the Parser will
 * still reject the input.
 */
const commentRegexString = "%[^\n]*[\n]";
const controlWordRegexString = "\\\\[a-zA-Z@]+";
const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";
const combiningDiacriticalMarkString = "[\u0300-\u036f]";
export const combiningDiacriticalMarksEndRegex =
    new RegExp(`${combiningDiacriticalMarkString}+$`);
const tokenRegex = new RegExp(
    "([ \r\n\t]+)|" +                                 // whitespace
    `(${commentRegexString}` +                        // comments
    "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" +  // single codepoint
    `${combiningDiacriticalMarkString}*` +            // ...plus accents
    "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +               // surrogate pair
    `${combiningDiacriticalMarkString}*` +            // ...plus accents
    "|\\\\verb\\*([^]).*?\\3" +                       // \verb*
    "|\\\\verb([^*a-zA-Z]).*?\\4" +                   // \verb unstarred
    `|${controlWordRegexString}` +                    // \macroName
    `|${controlSymbolRegexString}` +                  // \\, \', etc.
    ")"
);
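// Note (editorial, not part of the original source): the backreferences \3 and
// \4 above make a \verb or \verb* group end at the same delimiter character
// that opened it, e.g. \verb|...| or \verb!...!.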
// tokenRegex has no ^ marker, as required by matchAt.
// These regexs are for matching results from tokenRegex,
// so they do have ^ markers.
export const controlWordRegex = new RegExp(`^${controlWordRegexString}`);
const commentRegex = new RegExp(`^${commentRegexString}`);
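// Illustrative note (editorial, not part of the original source): tokenRegex
// groups whitespace in match[1] and everything else in match[2]. For example,
// matchAt(tokenRegex, "e\u0301x", 0) matches "e\u0301" in both match[0] and
// match[2]: the combining acute accent is consumed together with the
// preceding "e" by the "...plus accents" parts of the pattern.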

/** Main Lexer class */
export default class Lexer implements LexerInterface {
    input: string;
    pos: number;

    constructor(input: string) {
        // Separate accents from characters
        this.input = input;
        this.pos = 0;
    }

    /**
     * This function lexes a single token.
     */
    lex(): Token {
        const input = this.input;
        const pos = this.pos;
        if (pos === input.length) {
            return new Token("EOF", new SourceLocation(this, pos, pos));
        }
        const match = matchAt(tokenRegex, input, pos);
        if (match === null) {
            throw new ParseError(
                `Unexpected character: '${input[pos]}'`,
                new Token(input[pos], new SourceLocation(this, pos, pos + 1)));
        }
        const text = match[2] || " ";
        const start = this.pos;
        this.pos += match[0].length;
        const end = this.pos;

        if (commentRegex.test(text)) {
            return this.lex();
        } else {
            return new Token(text, new SourceLocation(this, start, end));
        }
    }
}
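
// Usage sketch (editorial, illustrative only; not part of this file): tokens
// are pulled one at a time until "EOF", a run of whitespace comes back as a
// single space token, and an accented character such as "e" + U+0301 stays
// together as one token.
//
//     const lexer = new Lexer("\\alpha e\u0301");
//     lexer.lex().text;  // "\alpha"
//     lexer.lex().text;  // " "        (whitespace token)
//     lexer.lex().text;  // "e\u0301"  (base character plus combining accent)
//     lexer.lex().text;  // "EOF"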