diff --git a/Makefile b/Makefile index 601bda1f..1d6d2782 100644 --- a/Makefile +++ b/Makefile @@ -123,6 +123,10 @@ PYTHON=$(shell python2 --version >/dev/null 2>&1 && echo python2 || echo python) metrics: cd metrics && $(PERL) ./mapping.pl | $(PYTHON) ./extract_tfms.py | $(PYTHON) ./extract_ttfs.py | $(PYTHON) ./format_json.py --width > ../src/fontMetricsData.js +unicode: + cd src && $(NODE) unicodeMake.js >unicodeSymbols.js +src/unicodeSymbols.js: unicode + clean: rm -rf build/* $(NIS) diff --git a/src/Lexer.js b/src/Lexer.js index 937ad9fd..a53b68c0 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -36,11 +36,16 @@ import {LexerInterface, Token} from "./Token"; const commentRegexString = "%[^\n]*[\n]"; const controlWordRegexString = "\\\\[a-zA-Z@]+"; const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]"; +const combiningDiacriticalMarkString = "[\u0300-\u036f]"; +export const combiningDiacriticalMarksEndRegex = + new RegExp(`${combiningDiacriticalMarkString}+$`); const tokenRegex = new RegExp( "([ \r\n\t]+)|" + // whitespace - `(${commentRegexString}|` + // comments - "[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint + `(${commentRegexString}` + // comments + "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint + `${combiningDiacriticalMarkString}*` + // ...plus accents "|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair + `${combiningDiacriticalMarkString}*` + // ...plus accents "|\\\\verb\\*([^]).*?\\3" + // \verb* "|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred `|${controlWordRegexString}` + // \macroName @@ -60,6 +65,7 @@ export default class Lexer implements LexerInterface { pos: number; constructor(input: string) { + // Separate accents from characters this.input = input; this.pos = 0; } @@ -76,7 +82,7 @@ export default class Lexer implements LexerInterface { const match = matchAt(tokenRegex, input, pos); if (match === null) { throw new ParseError( - "Unexpected character: '" + input[pos] + "'", + `Unexpected character: '${input[pos]}'`, new Token(input[pos], new SourceLocation(this, pos, pos + 1))); } const text = match[2] || " "; diff --git a/src/Parser.js b/src/Parser.js index 124cb318..71b81418 100644 --- a/src/Parser.js +++ b/src/Parser.js @@ -1,4 +1,5 @@ /* eslint no-constant-condition:0 */ +/* eslint no-console:0 */ import functions from "./functions"; import environments from "./environments"; import MacroExpander from "./MacroExpander"; @@ -6,8 +7,11 @@ import symbols from "./symbols"; import utils from "./utils"; import { validUnit } from "./units"; import { cjkRegex } from "./unicodeRegexes"; +import unicodeAccents from "./unicodeAccents"; +import unicodeSymbols from "./unicodeSymbols"; import ParseNode from "./ParseNode"; import ParseError from "./ParseError"; +import { combiningDiacriticalMarksEndRegex } from "./Lexer.js"; /** * This file contains the parser used to parse out a TeX expression from the @@ -1042,30 +1046,16 @@ export default class Parser { */ parseSymbol() { const nucleus = this.nextToken; + let text = nucleus.text; - if (functions[nucleus.text]) { + if (functions[text]) { this.consume(); // If there exists a function with this name, we return the function and // say that it is a function. return newFunction(nucleus); - } else if (symbols[this.mode][nucleus.text]) { + } else if (/^\\verb[^a-zA-Z]/.test(text)) { this.consume(); - // Otherwise if this is a no-argument function, find the type it - // corresponds to in the symbols map - return newArgument( - new ParseNode(symbols[this.mode][nucleus.text].group, - nucleus.text, this.mode, nucleus), - nucleus); - } else if (this.mode === "text" && cjkRegex.test(nucleus.text)) { - this.consume(); - return newArgument( - new ParseNode("textord", nucleus.text, this.mode, nucleus), - nucleus); - } else if (nucleus.text === "$") { - return newDollar(nucleus); - } else if (/^\\verb[^a-zA-Z]/.test(nucleus.text)) { - this.consume(); - let arg = nucleus.text.slice(5); + let arg = text.slice(5); const star = (arg.charAt(0) === "*"); if (star) { arg = arg.slice(1); @@ -1082,8 +1072,58 @@ export default class Parser { body: arg, star: star, }, "text"), nucleus); - } else { - return null; + } else if (text === "$") { + return newDollar(nucleus); } + // At this point, we should have a symbol, possibly with accents. + // First expand any accented base symbol according to unicodeSymbols. + if (unicodeSymbols.hasOwnProperty(text[0]) && + !symbols[this.mode][text[0]]) { + text = unicodeSymbols[text[0]] + text.substr(1); + } + // Strip off any combining characters + const match = combiningDiacriticalMarksEndRegex.exec(text); + if (match) { + text = text.substring(0, match.index); + if (text === 'i') { + text = '\u0131'; // dotless i, in math and text mode + } else if (text === 'j') { + text = '\u0237'; // dotless j, in math and text mode + } + } + // Recognize base symbol + let symbol = null; + if (symbols[this.mode][text]) { + symbol = new ParseNode(symbols[this.mode][text].group, + text, this.mode, nucleus); + } else if (this.mode === "text" && cjkRegex.test(text)) { + symbol = new ParseNode("textord", text, this.mode, nucleus); + } else { + return null; // EOF, ^, _, {, }, etc. + } + this.consume(); + // Transform combining characters into accents + if (match) { + for (let i = 0; i < match[0].length; i++) { + const accent = match[0][i]; + if (!unicodeAccents[accent]) { + throw new ParseError(`Unknown accent ' ${accent}'`, nucleus); + } + const command = unicodeAccents[accent][this.mode]; + if (!command) { + throw new ParseError( + `Accent ${accent} unsupported in ${this.mode} mode`, + nucleus); + } + symbol = new ParseNode("accent", { + type: "accent", + label: command, + isStretchy: false, + isShifty: true, + base: symbol, + }, this.mode, nucleus); + } + } + return newArgument(symbol, nucleus); } } diff --git a/src/buildCommon.js b/src/buildCommon.js index 35633679..226d261e 100644 --- a/src/buildCommon.js +++ b/src/buildCommon.js @@ -19,9 +19,9 @@ import type {DomChildNode, CombinableDomNode} from "./domTree"; // The following have to be loaded from Main-Italic font, using class mainit const mainitLetters = [ - "\\imath", // dotless i - "\\jmath", // dotless j - "\\pounds", // pounds symbol + "\\imath", "ı", // dotless i + "\\jmath", "ȷ", // dotless j + "\\pounds", "\\mathsterling", "\\textsterling", "£", // pounds symbol ]; /** diff --git a/src/fontMetrics.js b/src/fontMetrics.js index aaf8a715..ba33d79d 100644 --- a/src/fontMetrics.js +++ b/src/fontMetrics.js @@ -97,63 +97,14 @@ import metricMap from "./fontMetricsData"; // TODO(kevinb) allow union of multiple glyph metrics for better accuracy. const extraCharacterMap = { // Latin-1 - 'À': 'A', - 'Á': 'A', - 'Â': 'A', - 'Ã': 'A', - 'Ä': 'A', 'Å': 'A', 'Ç': 'C', - 'È': 'E', - 'É': 'E', - 'Ê': 'E', - 'Ë': 'E', - 'Ì': 'I', - 'Í': 'I', - 'Î': 'I', - 'Ï': 'I', 'Ð': 'D', - 'Ñ': 'N', - 'Ò': 'O', - 'Ó': 'O', - 'Ô': 'O', - 'Õ': 'O', - 'Ö': 'O', - 'Ù': 'U', - 'Ú': 'U', - 'Û': 'U', - 'Ü': 'U', - 'Ý': 'Y', 'Þ': 'o', - 'à': 'a', - 'á': 'a', - 'â': 'a', - 'ã': 'a', - 'ä': 'a', 'å': 'a', 'ç': 'c', - 'è': 'e', - 'é': 'e', - 'ê': 'e', - 'ë': 'e', - 'ì': 'i', - 'í': 'i', - 'î': 'i', - 'ï': 'i', 'ð': 'd', - 'ñ': 'n', - 'ò': 'o', - 'ó': 'o', - 'ô': 'o', - 'õ': 'o', - 'ö': 'o', - 'ù': 'u', - 'ú': 'u', - 'û': 'u', - 'ü': 'u', - 'ý': 'y', 'þ': 'o', - 'ÿ': 'y', // Cyrillic 'А': 'A', diff --git a/src/macros.js b/src/macros.js index 34efa138..b59a88df 100644 --- a/src/macros.js +++ b/src/macros.js @@ -45,7 +45,7 @@ const builtinMacros: MacroMap = {}; export default builtinMacros; // This function might one day accept an additional argument and do more things. -function defineMacro(name: string, body: MacroDefinition) { +export function defineMacro(name: string, body: MacroDefinition) { builtinMacros[name] = body; } diff --git a/src/symbols.js b/src/symbols.js index 142f3c0e..4083167b 100644 --- a/src/symbols.js +++ b/src/symbols.js @@ -644,10 +644,10 @@ defineSymbol(math, main, accent, "\u02c7", "\\check"); defineSymbol(math, main, accent, "\u005e", "\\hat"); defineSymbol(math, main, accent, "\u20d7", "\\vec"); defineSymbol(math, main, accent, "\u02d9", "\\dot"); -defineSymbol(math, main, mathord, "\u0131", "\\imath"); -defineSymbol(math, main, mathord, "\u0237", "\\jmath"); -defineSymbol(text, main, textord, "\u0131", "\\i"); -defineSymbol(text, main, textord, "\u0237", "\\j"); +defineSymbol(math, main, mathord, "\u0131", "\\imath", true); +defineSymbol(math, main, mathord, "\u0237", "\\jmath", true); +defineSymbol(text, main, textord, "\u0131", "\\i", true); +defineSymbol(text, main, textord, "\u0237", "\\j", true); defineSymbol(text, main, textord, "\u00df", "\\ss", true); defineSymbol(text, main, textord, "\u00e6", "\\ae", true); defineSymbol(text, main, textord, "\u00e6", "\\ae", true); @@ -687,7 +687,7 @@ defineSymbol(text, main, textord, "\u00b0", "\\degree"); defineSymbol(math, main, mathord, "\u00a3", "\\pounds"); defineSymbol(math, main, mathord, "\u00a3", "\\mathsterling", true); defineSymbol(text, main, mathord, "\u00a3", "\\pounds"); -defineSymbol(text, main, mathord, "\u00a3", "\\textsterling"); +defineSymbol(text, main, mathord, "\u00a3", "\\textsterling", true); defineSymbol(math, ams, textord, "\u2720", "\\maltese"); defineSymbol(text, ams, textord, "\u2720", "\\maltese"); @@ -719,23 +719,20 @@ for (let i = 0; i < letters.length; i++) { defineSymbol(text, main, textord, ch, ch); } -// Latin-1 letters -for (let i = 0x00C0; i <= 0x00D6; i++) { - const ch = String.fromCharCode(i); +// We add these Latin-1 letters as symbols for backwards-compatibility, +// but they are not actually in the font, nor are they supported by the +// Unicode accent mechanism, so they fall back to Times font and look ugly. +// TODO(edemaine): Fix this. +const extraLatin = "ÇÐÞçðþ"; +for (let i = 0; i < extraLatin.length; i++) { + const ch = extraLatin.charAt(i); defineSymbol(math, main, mathord, ch, ch); defineSymbol(text, main, textord, ch, ch); } - -for (let i = 0x00D8; i <= 0x00F6; i++) { - const ch = String.fromCharCode(i); +const extraLatinMath = "Åå"; +for (let i = 0; i < extraLatinMath.length; i++) { + const ch = extraLatinMath.charAt(i); defineSymbol(math, main, mathord, ch, ch); - defineSymbol(text, main, textord, ch, ch); -} - -for (let i = 0x00F8; i <= 0x00FF; i++) { - const ch = String.fromCharCode(i); - defineSymbol(math, main, mathord, ch, ch); - defineSymbol(text, main, textord, ch, ch); } // Cyrillic diff --git a/src/unicodeAccents.js b/src/unicodeAccents.js new file mode 100644 index 00000000..4d217dc6 --- /dev/null +++ b/src/unicodeAccents.js @@ -0,0 +1,18 @@ +// Mapping of Unicode accent characters to their LaTeX equivalent in text and +// math mode (when they exist). + +// NOTE: This module needs to be written with Node-style modules (not +// ES6 modules) so that unicodeMake.js (a Node application) can import it. +module.exports = { + '\u0301': {text: "\\'", math: '\\acute'}, + '\u0300': {text: '\\`', math: '\\grave'}, + '\u0308': {text: '\\"', math: '\\ddot'}, + '\u0303': {text: '\\~', math: '\\tilde'}, + '\u0304': {text: '\\=', math: '\\bar'}, + '\u0306': {text: '\\u', math: '\\breve'}, + '\u030c': {text: '\\v', math: '\\check'}, + '\u0302': {text: '\\^', math: '\\hat'}, + '\u0307': {text: '\\.', math: '\\dot'}, + '\u030a': {text: '\\r'}, + '\u030b': {text: '\\H'}, +}; diff --git a/src/unicodeMake.js b/src/unicodeMake.js new file mode 100644 index 00000000..f6833ba5 --- /dev/null +++ b/src/unicodeMake.js @@ -0,0 +1,59 @@ +/* eslint no-console:0 */ + +// This is an internal Node tool, not part of the KaTeX distribution, +// whose purpose is to generate unicodeSymbols.js in this directory. +// In this way, only this tool, and not the distribution/browser, +// needs String's normalize function. +// +// This tool should be run (via `node unicodeMake.js` or `make unicode`) +// whenever KaTeX adds support for new accents, and whenever +// the Unicode spec adds new symbols that should be supported. + +const accents = require('./unicodeAccents'); + +console.log("// This file is GENERATED by unicodeMake.js. DO NOT MODIFY."); +console.log(""); + +const encode = function(string) { + let output = '"'; + for (let i = 0; i < string.length; i++) { + let hex = string.charCodeAt(i).toString(16); + while (hex.length < 4) { + hex = `0${hex}`; + } + output += `\\u${hex}`; + } + output = `${output}"`; + return output; +}; + +console.log("export default {"); + +const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + + "αβγδεϵζηθϑικλμνξοπϖρϱςστυφϕχψωΓΔΘΛΞΠΣΥΦΨΩ"; +for (const letter of letters) { + for (const accent of Object.getOwnPropertyNames(accents)) { + const combined = letter + accent; + const normalized = combined.normalize('NFC'); + if (normalized.length === 1) { + console.log( + ` ${encode(normalized)}: ${encode(combined)},` + + ` // ${normalized} = ${accents[accent].text}{${letter}}`); + } + for (const accent2 of Object.getOwnPropertyNames(accents)) { + if (accent === accent2) { + continue; + } + const combined2 = combined + accent2; + const normalized2 = combined2.normalize('NFC'); + if (normalized2.length === 1) { + console.log( + ` ${encode(normalized2)}: ${encode(combined2)},` + + ` // ${normalized2} = ${accents[accent].text}` + + `${accents[accent2].text}{${letter}}`); + } + } + } +} + +console.log("};"); diff --git a/src/unicodeSymbols.js b/src/unicodeSymbols.js new file mode 100644 index 00000000..3b100479 --- /dev/null +++ b/src/unicodeSymbols.js @@ -0,0 +1,322 @@ +// This file is GENERATED by unicodeMake.js. DO NOT MODIFY. + +export default { + "\u00e1": "\u0061\u0301", // á = \'{a} + "\u00e0": "\u0061\u0300", // à = \`{a} + "\u00e4": "\u0061\u0308", // ä = \"{a} + "\u01df": "\u0061\u0308\u0304", // ǟ = \"\={a} + "\u00e3": "\u0061\u0303", // ã = \~{a} + "\u0101": "\u0061\u0304", // ā = \={a} + "\u0103": "\u0061\u0306", // ă = \u{a} + "\u1eaf": "\u0061\u0306\u0301", // ắ = \u\'{a} + "\u1eb1": "\u0061\u0306\u0300", // ằ = \u\`{a} + "\u1eb5": "\u0061\u0306\u0303", // ẵ = \u\~{a} + "\u01ce": "\u0061\u030c", // ǎ = \v{a} + "\u00e2": "\u0061\u0302", // â = \^{a} + "\u1ea5": "\u0061\u0302\u0301", // ấ = \^\'{a} + "\u1ea7": "\u0061\u0302\u0300", // ầ = \^\`{a} + "\u1eab": "\u0061\u0302\u0303", // ẫ = \^\~{a} + "\u0227": "\u0061\u0307", // ȧ = \.{a} + "\u01e1": "\u0061\u0307\u0304", // ǡ = \.\={a} + "\u00e5": "\u0061\u030a", // å = \r{a} + "\u01fb": "\u0061\u030a\u0301", // ǻ = \r\'{a} + "\u1e03": "\u0062\u0307", // ḃ = \.{b} + "\u0107": "\u0063\u0301", // ć = \'{c} + "\u010d": "\u0063\u030c", // č = \v{c} + "\u0109": "\u0063\u0302", // ĉ = \^{c} + "\u010b": "\u0063\u0307", // ċ = \.{c} + "\u010f": "\u0064\u030c", // ď = \v{d} + "\u1e0b": "\u0064\u0307", // ḋ = \.{d} + "\u00e9": "\u0065\u0301", // é = \'{e} + "\u00e8": "\u0065\u0300", // è = \`{e} + "\u00eb": "\u0065\u0308", // ë = \"{e} + "\u1ebd": "\u0065\u0303", // ẽ = \~{e} + "\u0113": "\u0065\u0304", // ē = \={e} + "\u1e17": "\u0065\u0304\u0301", // ḗ = \=\'{e} + "\u1e15": "\u0065\u0304\u0300", // ḕ = \=\`{e} + "\u0115": "\u0065\u0306", // ĕ = \u{e} + "\u011b": "\u0065\u030c", // ě = \v{e} + "\u00ea": "\u0065\u0302", // ê = \^{e} + "\u1ebf": "\u0065\u0302\u0301", // ế = \^\'{e} + "\u1ec1": "\u0065\u0302\u0300", // ề = \^\`{e} + "\u1ec5": "\u0065\u0302\u0303", // ễ = \^\~{e} + "\u0117": "\u0065\u0307", // ė = \.{e} + "\u1e1f": "\u0066\u0307", // ḟ = \.{f} + "\u01f5": "\u0067\u0301", // ǵ = \'{g} + "\u1e21": "\u0067\u0304", // ḡ = \={g} + "\u011f": "\u0067\u0306", // ğ = \u{g} + "\u01e7": "\u0067\u030c", // ǧ = \v{g} + "\u011d": "\u0067\u0302", // ĝ = \^{g} + "\u0121": "\u0067\u0307", // ġ = \.{g} + "\u1e27": "\u0068\u0308", // ḧ = \"{h} + "\u021f": "\u0068\u030c", // ȟ = \v{h} + "\u0125": "\u0068\u0302", // ĥ = \^{h} + "\u1e23": "\u0068\u0307", // ḣ = \.{h} + "\u00ed": "\u0069\u0301", // í = \'{i} + "\u00ec": "\u0069\u0300", // ì = \`{i} + "\u00ef": "\u0069\u0308", // ï = \"{i} + "\u1e2f": "\u0069\u0308\u0301", // ḯ = \"\'{i} + "\u0129": "\u0069\u0303", // ĩ = \~{i} + "\u012b": "\u0069\u0304", // ī = \={i} + "\u012d": "\u0069\u0306", // ĭ = \u{i} + "\u01d0": "\u0069\u030c", // ǐ = \v{i} + "\u00ee": "\u0069\u0302", // î = \^{i} + "\u01f0": "\u006a\u030c", // ǰ = \v{j} + "\u0135": "\u006a\u0302", // ĵ = \^{j} + "\u1e31": "\u006b\u0301", // ḱ = \'{k} + "\u01e9": "\u006b\u030c", // ǩ = \v{k} + "\u013a": "\u006c\u0301", // ĺ = \'{l} + "\u013e": "\u006c\u030c", // ľ = \v{l} + "\u1e3f": "\u006d\u0301", // ḿ = \'{m} + "\u1e41": "\u006d\u0307", // ṁ = \.{m} + "\u0144": "\u006e\u0301", // ń = \'{n} + "\u01f9": "\u006e\u0300", // ǹ = \`{n} + "\u00f1": "\u006e\u0303", // ñ = \~{n} + "\u0148": "\u006e\u030c", // ň = \v{n} + "\u1e45": "\u006e\u0307", // ṅ = \.{n} + "\u00f3": "\u006f\u0301", // ó = \'{o} + "\u00f2": "\u006f\u0300", // ò = \`{o} + "\u00f6": "\u006f\u0308", // ö = \"{o} + "\u022b": "\u006f\u0308\u0304", // ȫ = \"\={o} + "\u00f5": "\u006f\u0303", // õ = \~{o} + "\u1e4d": "\u006f\u0303\u0301", // ṍ = \~\'{o} + "\u1e4f": "\u006f\u0303\u0308", // ṏ = \~\"{o} + "\u022d": "\u006f\u0303\u0304", // ȭ = \~\={o} + "\u014d": "\u006f\u0304", // ō = \={o} + "\u1e53": "\u006f\u0304\u0301", // ṓ = \=\'{o} + "\u1e51": "\u006f\u0304\u0300", // ṑ = \=\`{o} + "\u014f": "\u006f\u0306", // ŏ = \u{o} + "\u01d2": "\u006f\u030c", // ǒ = \v{o} + "\u00f4": "\u006f\u0302", // ô = \^{o} + "\u1ed1": "\u006f\u0302\u0301", // ố = \^\'{o} + "\u1ed3": "\u006f\u0302\u0300", // ồ = \^\`{o} + "\u1ed7": "\u006f\u0302\u0303", // ỗ = \^\~{o} + "\u022f": "\u006f\u0307", // ȯ = \.{o} + "\u0231": "\u006f\u0307\u0304", // ȱ = \.\={o} + "\u0151": "\u006f\u030b", // ő = \H{o} + "\u1e55": "\u0070\u0301", // ṕ = \'{p} + "\u1e57": "\u0070\u0307", // ṗ = \.{p} + "\u0155": "\u0072\u0301", // ŕ = \'{r} + "\u0159": "\u0072\u030c", // ř = \v{r} + "\u1e59": "\u0072\u0307", // ṙ = \.{r} + "\u015b": "\u0073\u0301", // ś = \'{s} + "\u1e65": "\u0073\u0301\u0307", // ṥ = \'\.{s} + "\u0161": "\u0073\u030c", // š = \v{s} + "\u1e67": "\u0073\u030c\u0307", // ṧ = \v\.{s} + "\u015d": "\u0073\u0302", // ŝ = \^{s} + "\u1e61": "\u0073\u0307", // ṡ = \.{s} + "\u1e97": "\u0074\u0308", // ẗ = \"{t} + "\u0165": "\u0074\u030c", // ť = \v{t} + "\u1e6b": "\u0074\u0307", // ṫ = \.{t} + "\u00fa": "\u0075\u0301", // ú = \'{u} + "\u00f9": "\u0075\u0300", // ù = \`{u} + "\u00fc": "\u0075\u0308", // ü = \"{u} + "\u01d8": "\u0075\u0308\u0301", // ǘ = \"\'{u} + "\u01dc": "\u0075\u0308\u0300", // ǜ = \"\`{u} + "\u01d6": "\u0075\u0308\u0304", // ǖ = \"\={u} + "\u01da": "\u0075\u0308\u030c", // ǚ = \"\v{u} + "\u0169": "\u0075\u0303", // ũ = \~{u} + "\u1e79": "\u0075\u0303\u0301", // ṹ = \~\'{u} + "\u016b": "\u0075\u0304", // ū = \={u} + "\u1e7b": "\u0075\u0304\u0308", // ṻ = \=\"{u} + "\u016d": "\u0075\u0306", // ŭ = \u{u} + "\u01d4": "\u0075\u030c", // ǔ = \v{u} + "\u00fb": "\u0075\u0302", // û = \^{u} + "\u016f": "\u0075\u030a", // ů = \r{u} + "\u0171": "\u0075\u030b", // ű = \H{u} + "\u1e7d": "\u0076\u0303", // ṽ = \~{v} + "\u1e83": "\u0077\u0301", // ẃ = \'{w} + "\u1e81": "\u0077\u0300", // ẁ = \`{w} + "\u1e85": "\u0077\u0308", // ẅ = \"{w} + "\u0175": "\u0077\u0302", // ŵ = \^{w} + "\u1e87": "\u0077\u0307", // ẇ = \.{w} + "\u1e98": "\u0077\u030a", // ẘ = \r{w} + "\u1e8d": "\u0078\u0308", // ẍ = \"{x} + "\u1e8b": "\u0078\u0307", // ẋ = \.{x} + "\u00fd": "\u0079\u0301", // ý = \'{y} + "\u1ef3": "\u0079\u0300", // ỳ = \`{y} + "\u00ff": "\u0079\u0308", // ÿ = \"{y} + "\u1ef9": "\u0079\u0303", // ỹ = \~{y} + "\u0233": "\u0079\u0304", // ȳ = \={y} + "\u0177": "\u0079\u0302", // ŷ = \^{y} + "\u1e8f": "\u0079\u0307", // ẏ = \.{y} + "\u1e99": "\u0079\u030a", // ẙ = \r{y} + "\u017a": "\u007a\u0301", // ź = \'{z} + "\u017e": "\u007a\u030c", // ž = \v{z} + "\u1e91": "\u007a\u0302", // ẑ = \^{z} + "\u017c": "\u007a\u0307", // ż = \.{z} + "\u00c1": "\u0041\u0301", // Á = \'{A} + "\u00c0": "\u0041\u0300", // À = \`{A} + "\u00c4": "\u0041\u0308", // Ä = \"{A} + "\u01de": "\u0041\u0308\u0304", // Ǟ = \"\={A} + "\u00c3": "\u0041\u0303", // à = \~{A} + "\u0100": "\u0041\u0304", // Ā = \={A} + "\u0102": "\u0041\u0306", // Ă = \u{A} + "\u1eae": "\u0041\u0306\u0301", // Ắ = \u\'{A} + "\u1eb0": "\u0041\u0306\u0300", // Ằ = \u\`{A} + "\u1eb4": "\u0041\u0306\u0303", // Ẵ = \u\~{A} + "\u01cd": "\u0041\u030c", // Ǎ = \v{A} + "\u00c2": "\u0041\u0302", //  = \^{A} + "\u1ea4": "\u0041\u0302\u0301", // Ấ = \^\'{A} + "\u1ea6": "\u0041\u0302\u0300", // Ầ = \^\`{A} + "\u1eaa": "\u0041\u0302\u0303", // Ẫ = \^\~{A} + "\u0226": "\u0041\u0307", // Ȧ = \.{A} + "\u01e0": "\u0041\u0307\u0304", // Ǡ = \.\={A} + "\u00c5": "\u0041\u030a", // Å = \r{A} + "\u01fa": "\u0041\u030a\u0301", // Ǻ = \r\'{A} + "\u1e02": "\u0042\u0307", // Ḃ = \.{B} + "\u0106": "\u0043\u0301", // Ć = \'{C} + "\u010c": "\u0043\u030c", // Č = \v{C} + "\u0108": "\u0043\u0302", // Ĉ = \^{C} + "\u010a": "\u0043\u0307", // Ċ = \.{C} + "\u010e": "\u0044\u030c", // Ď = \v{D} + "\u1e0a": "\u0044\u0307", // Ḋ = \.{D} + "\u00c9": "\u0045\u0301", // É = \'{E} + "\u00c8": "\u0045\u0300", // È = \`{E} + "\u00cb": "\u0045\u0308", // Ë = \"{E} + "\u1ebc": "\u0045\u0303", // Ẽ = \~{E} + "\u0112": "\u0045\u0304", // Ē = \={E} + "\u1e16": "\u0045\u0304\u0301", // Ḗ = \=\'{E} + "\u1e14": "\u0045\u0304\u0300", // Ḕ = \=\`{E} + "\u0114": "\u0045\u0306", // Ĕ = \u{E} + "\u011a": "\u0045\u030c", // Ě = \v{E} + "\u00ca": "\u0045\u0302", // Ê = \^{E} + "\u1ebe": "\u0045\u0302\u0301", // Ế = \^\'{E} + "\u1ec0": "\u0045\u0302\u0300", // Ề = \^\`{E} + "\u1ec4": "\u0045\u0302\u0303", // Ễ = \^\~{E} + "\u0116": "\u0045\u0307", // Ė = \.{E} + "\u1e1e": "\u0046\u0307", // Ḟ = \.{F} + "\u01f4": "\u0047\u0301", // Ǵ = \'{G} + "\u1e20": "\u0047\u0304", // Ḡ = \={G} + "\u011e": "\u0047\u0306", // Ğ = \u{G} + "\u01e6": "\u0047\u030c", // Ǧ = \v{G} + "\u011c": "\u0047\u0302", // Ĝ = \^{G} + "\u0120": "\u0047\u0307", // Ġ = \.{G} + "\u1e26": "\u0048\u0308", // Ḧ = \"{H} + "\u021e": "\u0048\u030c", // Ȟ = \v{H} + "\u0124": "\u0048\u0302", // Ĥ = \^{H} + "\u1e22": "\u0048\u0307", // Ḣ = \.{H} + "\u00cd": "\u0049\u0301", // Í = \'{I} + "\u00cc": "\u0049\u0300", // Ì = \`{I} + "\u00cf": "\u0049\u0308", // Ï = \"{I} + "\u1e2e": "\u0049\u0308\u0301", // Ḯ = \"\'{I} + "\u0128": "\u0049\u0303", // Ĩ = \~{I} + "\u012a": "\u0049\u0304", // Ī = \={I} + "\u012c": "\u0049\u0306", // Ĭ = \u{I} + "\u01cf": "\u0049\u030c", // Ǐ = \v{I} + "\u00ce": "\u0049\u0302", // Î = \^{I} + "\u0130": "\u0049\u0307", // İ = \.{I} + "\u0134": "\u004a\u0302", // Ĵ = \^{J} + "\u1e30": "\u004b\u0301", // Ḱ = \'{K} + "\u01e8": "\u004b\u030c", // Ǩ = \v{K} + "\u0139": "\u004c\u0301", // Ĺ = \'{L} + "\u013d": "\u004c\u030c", // Ľ = \v{L} + "\u1e3e": "\u004d\u0301", // Ḿ = \'{M} + "\u1e40": "\u004d\u0307", // Ṁ = \.{M} + "\u0143": "\u004e\u0301", // Ń = \'{N} + "\u01f8": "\u004e\u0300", // Ǹ = \`{N} + "\u00d1": "\u004e\u0303", // Ñ = \~{N} + "\u0147": "\u004e\u030c", // Ň = \v{N} + "\u1e44": "\u004e\u0307", // Ṅ = \.{N} + "\u00d3": "\u004f\u0301", // Ó = \'{O} + "\u00d2": "\u004f\u0300", // Ò = \`{O} + "\u00d6": "\u004f\u0308", // Ö = \"{O} + "\u022a": "\u004f\u0308\u0304", // Ȫ = \"\={O} + "\u00d5": "\u004f\u0303", // Õ = \~{O} + "\u1e4c": "\u004f\u0303\u0301", // Ṍ = \~\'{O} + "\u1e4e": "\u004f\u0303\u0308", // Ṏ = \~\"{O} + "\u022c": "\u004f\u0303\u0304", // Ȭ = \~\={O} + "\u014c": "\u004f\u0304", // Ō = \={O} + "\u1e52": "\u004f\u0304\u0301", // Ṓ = \=\'{O} + "\u1e50": "\u004f\u0304\u0300", // Ṑ = \=\`{O} + "\u014e": "\u004f\u0306", // Ŏ = \u{O} + "\u01d1": "\u004f\u030c", // Ǒ = \v{O} + "\u00d4": "\u004f\u0302", // Ô = \^{O} + "\u1ed0": "\u004f\u0302\u0301", // Ố = \^\'{O} + "\u1ed2": "\u004f\u0302\u0300", // Ồ = \^\`{O} + "\u1ed6": "\u004f\u0302\u0303", // Ỗ = \^\~{O} + "\u022e": "\u004f\u0307", // Ȯ = \.{O} + "\u0230": "\u004f\u0307\u0304", // Ȱ = \.\={O} + "\u0150": "\u004f\u030b", // Ő = \H{O} + "\u1e54": "\u0050\u0301", // Ṕ = \'{P} + "\u1e56": "\u0050\u0307", // Ṗ = \.{P} + "\u0154": "\u0052\u0301", // Ŕ = \'{R} + "\u0158": "\u0052\u030c", // Ř = \v{R} + "\u1e58": "\u0052\u0307", // Ṙ = \.{R} + "\u015a": "\u0053\u0301", // Ś = \'{S} + "\u1e64": "\u0053\u0301\u0307", // Ṥ = \'\.{S} + "\u0160": "\u0053\u030c", // Š = \v{S} + "\u1e66": "\u0053\u030c\u0307", // Ṧ = \v\.{S} + "\u015c": "\u0053\u0302", // Ŝ = \^{S} + "\u1e60": "\u0053\u0307", // Ṡ = \.{S} + "\u0164": "\u0054\u030c", // Ť = \v{T} + "\u1e6a": "\u0054\u0307", // Ṫ = \.{T} + "\u00da": "\u0055\u0301", // Ú = \'{U} + "\u00d9": "\u0055\u0300", // Ù = \`{U} + "\u00dc": "\u0055\u0308", // Ü = \"{U} + "\u01d7": "\u0055\u0308\u0301", // Ǘ = \"\'{U} + "\u01db": "\u0055\u0308\u0300", // Ǜ = \"\`{U} + "\u01d5": "\u0055\u0308\u0304", // Ǖ = \"\={U} + "\u01d9": "\u0055\u0308\u030c", // Ǚ = \"\v{U} + "\u0168": "\u0055\u0303", // Ũ = \~{U} + "\u1e78": "\u0055\u0303\u0301", // Ṹ = \~\'{U} + "\u016a": "\u0055\u0304", // Ū = \={U} + "\u1e7a": "\u0055\u0304\u0308", // Ṻ = \=\"{U} + "\u016c": "\u0055\u0306", // Ŭ = \u{U} + "\u01d3": "\u0055\u030c", // Ǔ = \v{U} + "\u00db": "\u0055\u0302", // Û = \^{U} + "\u016e": "\u0055\u030a", // Ů = \r{U} + "\u0170": "\u0055\u030b", // Ű = \H{U} + "\u1e7c": "\u0056\u0303", // Ṽ = \~{V} + "\u1e82": "\u0057\u0301", // Ẃ = \'{W} + "\u1e80": "\u0057\u0300", // Ẁ = \`{W} + "\u1e84": "\u0057\u0308", // Ẅ = \"{W} + "\u0174": "\u0057\u0302", // Ŵ = \^{W} + "\u1e86": "\u0057\u0307", // Ẇ = \.{W} + "\u1e8c": "\u0058\u0308", // Ẍ = \"{X} + "\u1e8a": "\u0058\u0307", // Ẋ = \.{X} + "\u00dd": "\u0059\u0301", // Ý = \'{Y} + "\u1ef2": "\u0059\u0300", // Ỳ = \`{Y} + "\u0178": "\u0059\u0308", // Ÿ = \"{Y} + "\u1ef8": "\u0059\u0303", // Ỹ = \~{Y} + "\u0232": "\u0059\u0304", // Ȳ = \={Y} + "\u0176": "\u0059\u0302", // Ŷ = \^{Y} + "\u1e8e": "\u0059\u0307", // Ẏ = \.{Y} + "\u0179": "\u005a\u0301", // Ź = \'{Z} + "\u017d": "\u005a\u030c", // Ž = \v{Z} + "\u1e90": "\u005a\u0302", // Ẑ = \^{Z} + "\u017b": "\u005a\u0307", // Ż = \.{Z} + "\u03ac": "\u03b1\u0301", // ά = \'{α} + "\u1f70": "\u03b1\u0300", // ὰ = \`{α} + "\u1fb1": "\u03b1\u0304", // ᾱ = \={α} + "\u1fb0": "\u03b1\u0306", // ᾰ = \u{α} + "\u03ad": "\u03b5\u0301", // έ = \'{ε} + "\u1f72": "\u03b5\u0300", // ὲ = \`{ε} + "\u03ae": "\u03b7\u0301", // ή = \'{η} + "\u1f74": "\u03b7\u0300", // ὴ = \`{η} + "\u03af": "\u03b9\u0301", // ί = \'{ι} + "\u1f76": "\u03b9\u0300", // ὶ = \`{ι} + "\u03ca": "\u03b9\u0308", // ϊ = \"{ι} + "\u0390": "\u03b9\u0308\u0301", // ΐ = \"\'{ι} + "\u1fd2": "\u03b9\u0308\u0300", // ῒ = \"\`{ι} + "\u1fd1": "\u03b9\u0304", // ῑ = \={ι} + "\u1fd0": "\u03b9\u0306", // ῐ = \u{ι} + "\u03cc": "\u03bf\u0301", // ό = \'{ο} + "\u1f78": "\u03bf\u0300", // ὸ = \`{ο} + "\u03cd": "\u03c5\u0301", // ύ = \'{υ} + "\u1f7a": "\u03c5\u0300", // ὺ = \`{υ} + "\u03cb": "\u03c5\u0308", // ϋ = \"{υ} + "\u03b0": "\u03c5\u0308\u0301", // ΰ = \"\'{υ} + "\u1fe2": "\u03c5\u0308\u0300", // ῢ = \"\`{υ} + "\u1fe1": "\u03c5\u0304", // ῡ = \={υ} + "\u1fe0": "\u03c5\u0306", // ῠ = \u{υ} + "\u03ce": "\u03c9\u0301", // ώ = \'{ω} + "\u1f7c": "\u03c9\u0300", // ὼ = \`{ω} + "\u038e": "\u03a5\u0301", // Ύ = \'{Υ} + "\u1fea": "\u03a5\u0300", // Ὺ = \`{Υ} + "\u03ab": "\u03a5\u0308", // Ϋ = \"{Υ} + "\u1fe9": "\u03a5\u0304", // Ῡ = \={Υ} + "\u1fe8": "\u03a5\u0306", // Ῠ = \u{Υ} + "\u038f": "\u03a9\u0301", // Ώ = \'{Ω} + "\u1ffa": "\u03a9\u0300", // Ὼ = \`{Ω} +}; diff --git a/test/__snapshots__/mathml-spec.js.snap b/test/__snapshots__/mathml-spec.js.snap index 227b3f0f..59e4a447 100644 --- a/test/__snapshots__/mathml-spec.js.snap +++ b/test/__snapshots__/mathml-spec.js.snap @@ -1,5 +1,62 @@ // Jest Snapshot v1, https://goo.gl/fbAQLP +exports[`A MathML builder accents turn into in MathML 1`] = ` + + + + + + + u + + + ¨ + + + + b + + + e + + + r + + + f + + + i + + + a + + + n + + + c + + + + e + + + ´ + + + + e + + + + über fiancée + + + + +`; + exports[`A MathML builder should generate nodes for \\phantom 1`] = ` diff --git a/test/errors-spec.js b/test/errors-spec.js index 415db00b..97e859ca 100644 --- a/test/errors-spec.js +++ b/test/errors-spec.js @@ -375,3 +375,10 @@ describe("Lexer:", function() { }); }); + +describe("Unicode accents", function() { + it("should return error for invalid combining characters", function() { + expect("A\u0328").toFailWithParseError( + "Unknown accent ' ̨' at position 1: Ą̲̲"); + }); +}); diff --git a/test/katex-spec.js b/test/katex-spec.js index 71a245cf..f9ed9b5d 100644 --- a/test/katex-spec.js +++ b/test/katex-spec.js @@ -2757,15 +2757,64 @@ describe("A parser taking String objects", function() { }); }); +describe("Unicode accents", function() { + it("should parse Latin-1 letters in math mode", function() { + // TODO(edemaine): Unsupported Latin-1 letters in math: ÅåÇÐÞçðþ + expect("ÀÁÂÃÄÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäèéêëìíîïñòóôõöùúûüýÿ") + .toParseLike( + "\\grave A\\acute A\\hat A\\tilde A\\ddot A" + + "\\grave E\\acute E\\hat E\\ddot E" + + "\\grave I\\acute I\\hat I\\ddot I" + + "\\tilde N" + + "\\grave O\\acute O\\hat O\\tilde O\\ddot O" + + "\\grave U\\acute U\\hat U\\ddot U" + + "\\acute Y" + + "\\grave a\\acute a\\hat a\\tilde a\\ddot a" + + "\\grave e\\acute e\\hat e\\ddot e" + + "\\grave ı\\acute ı\\hat ı\\ddot ı" + + "\\tilde n" + + "\\grave o\\acute o\\hat o\\tilde o\\ddot o" + + "\\grave u\\acute u\\hat u\\ddot u" + + "\\acute y\\ddot y"); + }); + + it("should parse Latin-1 letters in text mode", function() { + // TODO(edemaine): Unsupported Latin-1 letters in text: ÇÐÞçðþ + expect("\\text{ÀÁÂÃÄÅÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåèéêëìíîïñòóôõöùúûüýÿ}") + .toParseLike( + "\\text{\\`A\\'A\\^A\\~A\\\"A\\r A" + + "\\`E\\'E\\^E\\\"E" + + "\\`I\\'I\\^I\\\"I" + + "\\~N" + + "\\`O\\'O\\^O\\~O\\\"O" + + "\\`U\\'U\\^U\\\"U" + + "\\'Y" + + "\\`a\\'a\\^a\\~a\\\"a\\r a" + + "\\`e\\'e\\^e\\\"e" + + "\\`ı\\'ı\\^ı\\\"ı" + + "\\~n" + + "\\`o\\'o\\^o\\~o\\\"o" + + "\\`u\\'u\\^u\\\"u" + + "\\'y\\\"y}"); + }); + + it("should parse combining characters", function() { + expect("A\u0301C\u0301").toParseLike("Á\\acute C"); + expect("\\text{A\u0301C\u0301}").toParseLike("\\text{Á\\'C}"); + }); + + it("should parse multi-accented characters", function() { + expect("ấā́ắ\\text{ấā́ắ}").toParse(); + // Doesn't parse quite the same as + // "\\text{\\'{\\^a}\\'{\\=a}\\'{\\u a}}" because of the ordgroups. + }); + + it("should parse accented i's and j's", function() { + expect("íȷ́").toParseLike("\\acute ı\\acute ȷ"); + }); +}); + describe("Unicode", function() { - it("should parse all lower case Greek letters", function() { - expect("αβγδεϵζηθϑικλμνξοπϖρϱςστυφϕχψω").toParse(); - }); - - it("should parse 'ΓΔΘΞΠΣΦΨΩ'", function() { - expect("ΓΔΘΞΠΣΦΨΩ").toParse(); - }); - it("should parse negated relations", function() { expect("∉∤∦≁≆≠≨≩≮≯≰≱⊀⊁⊈⊉⊊⊋⊬⊭⊮⊯⋠⋡⋦⋧⋨⋩⋬⋭⪇⪈⪉⪊⪵⪶⪹⪺⫋⫌").toParse(); }); diff --git a/test/mathml-spec.js b/test/mathml-spec.js index ed926078..3430edab 100644 --- a/test/mathml-spec.js +++ b/test/mathml-spec.js @@ -93,4 +93,8 @@ describe("A MathML builder", function() { expect(getMathML(`\\boldsymbol{Ax2k\\omega\\Omega\\imath+}`)) .toMatchSnapshot(); }); + + it('accents turn into in MathML', function() { + expect(getMathML("über fiancée")).toMatchSnapshot(); + }); }); diff --git a/test/screenshotter/images/Unicode-chrome.png b/test/screenshotter/images/Unicode-chrome.png index 7389eafc..229b33d3 100644 Binary files a/test/screenshotter/images/Unicode-chrome.png and b/test/screenshotter/images/Unicode-chrome.png differ diff --git a/test/screenshotter/images/Unicode-firefox.png b/test/screenshotter/images/Unicode-firefox.png index 31cbe113..6b1b8aa7 100644 Binary files a/test/screenshotter/images/Unicode-firefox.png and b/test/screenshotter/images/Unicode-firefox.png differ diff --git a/test/unicode-spec.js b/test/unicode-spec.js index 62a4d97f..ff68d395 100644 --- a/test/unicode-spec.js +++ b/test/unicode-spec.js @@ -67,11 +67,21 @@ describe("unicode", function() { }); it("should parse Latin-1 inside \\text{}", function() { - expect('\\text{ÀàÇçÉéÏïÖöÛû}').toParse(); + expect('\\text{ÀÁÂÃÄÅÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåèéêëìíîïñòóôõöùúûüýÿ' + + 'ÆÇÐØÞßæçðøþ}').toParse(); }); it("should parse Latin-1 outside \\text{}", function() { - expect('ÀàÇçÉéÏïÖöÛû').toParse(); + expect('ÀÁÂÃÄÅÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåèéêëìíîïñòóôõöùúûüýÿ' + + 'ÇÐÞçðþ').toParse(); + }); + + it("should parse all lower case Greek letters", function() { + expect("αβγδεϵζηθϑικλμνξοπϖρϱςστυφϕχψω").toParse(); + }); + + it("should parse math upper case Greek letters", function() { + expect("ΓΔΘΛΞΠΣΥΦΨΩ").toParse(); }); it("should parse Cyrillic inside \\text{}", function() {