From d8fc35e6a97f8e561c723b93ad275cf5a7f3094a Mon Sep 17 00:00:00 2001 From: Ron Kok Date: Fri, 20 May 2022 06:50:34 -0700 Subject: [PATCH] feat: Support Unicode (sub|super)script characters (#3633) * feat: Support Unicode (sub|super)script characters * Acquire tokens via repeated fetch() * Match more Unicode (sub|super)script characters * Update docs with new characters * Add Greek characters to RegEx * Pick up review comments Co-authored-by: Erik Demaine --- docs/supported.md | 3 ++ src/Parser.js | 24 +++++++++ src/unicodeSupOrSub.js | 108 +++++++++++++++++++++++++++++++++++++++++ test/katex-spec.js | 4 ++ 4 files changed, 139 insertions(+) create mode 100644 src/unicodeSupOrSub.js diff --git a/docs/supported.md b/docs/supported.md index ddf8a421..ae73abd3 100644 --- a/docs/supported.md +++ b/docs/supported.md @@ -190,6 +190,9 @@ $\allowbreak α β γ δ ϵ ζ η θ ι κ λ μ ν ξ o π \allowbreak ρ σ τ Direct Input: $∂ ∇ ℑ Ⅎ ℵ ℶ ℷ ℸ ⅁ ℏ ð − ∗$ ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖÙÚÛÜÝÞßàáâãäåçèéêëìíîïðñòóôöùúûüýþÿ +₊₋₌₍₎₀₁₂₃₄₅₆₇₈₉ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓᵦᵧᵨᵩᵪ⁺⁻⁼⁽⁾⁰¹²³⁴⁵⁶⁷⁸⁹ᵃᵇᶜᵈᵉᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘʷˣʸᶻᵛᵝᵞᵟᵠᵡ + +Math-mode Unicode (sub|super)script characters will render as if you had written regular characters in a subscript or superscript. For instance, `A²⁺³` will render the same as `A^{2+3}`.
diff --git a/src/Parser.js b/src/Parser.js index cb35758c..43de4f92 100644 --- a/src/Parser.js +++ b/src/Parser.js @@ -9,6 +9,7 @@ import ParseError from "./ParseError"; import {combiningDiacriticalMarksEndRegex} from "./Lexer"; import Settings from "./Settings"; import SourceLocation from "./SourceLocation"; +import {uSubsAndSups, unicodeSubRegEx} from "./unicodeSupOrSub"; import {Token} from "./Token"; // Pre-evaluate both modules as unicodeSymbols require String.normalize() @@ -399,6 +400,29 @@ export default class Parser { } // Put everything into an ordgroup as the superscript superscript = {type: "ordgroup", mode: this.mode, body: primes}; + } else if (uSubsAndSups[lex.text]) { + // A Unicode subscript or superscript character. + // We treat these similarly to the unicode-math package. + // So we render a string of Unicode (sub|super)scripts the + // same as a (sub|super)script of regular characters. + let str = uSubsAndSups[lex.text]; + const isSub = unicodeSubRegEx.test(lex.text); + this.consume(); + // Continue fetching tokens to fill out the string. + while (true) { + const token = this.fetch().text; + if (!(uSubsAndSups[token])) { break; } + if (unicodeSubRegEx.test(token) !== isSub) { break; } + this.consume(); + str += uSubsAndSups[token]; + } + // Now create a (sub|super)script. + const body = (new Parser(str, this.settings)).parse(); + if (isSub) { + subscript = {type: "ordgroup", mode: "math", body}; + } else { + superscript = {type: "ordgroup", mode: "math", body}; + } } else { // If it wasn't ^, _, or ', stop parsing super/subscripts break; diff --git a/src/unicodeSupOrSub.js b/src/unicodeSupOrSub.js new file mode 100644 index 00000000..5d5a4a45 --- /dev/null +++ b/src/unicodeSupOrSub.js @@ -0,0 +1,108 @@ +// Helpers for Parser.js handling of Unicode (sub|super)script characters. 
+// Matches text whose first character is one of these Unicode subscripts.
+export const unicodeSubRegEx = /^[₊₋₌₍₎₀₁₂₃₄₅₆₇₈₉ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓᵦᵧᵨᵩᵪ]/;
+// Unicode (sub|super)script character -> the regular character it stands for.
+export const uSubsAndSups = Object.freeze({ // frozen: a read-only lookup table
+    '₊': '+', // subscript symbols and digits
+    '₋': '-',
+    '₌': '=',
+    '₍': '(',
+    '₎': ')',
+    '₀': '0',
+    '₁': '1',
+    '₂': '2',
+    '₃': '3',
+    '₄': '4',
+    '₅': '5',
+    '₆': '6',
+    '₇': '7',
+    '₈': '8',
+    '₉': '9',
+    '\u2090': 'a', // subscript small Latin letters (same order as the regex)
+    '\u2091': 'e',
+    '\u2095': 'h',
+    '\u1D62': 'i',
+    '\u2C7C': 'j',
+    '\u2096': 'k',
+    '\u2097': 'l',
+    '\u2098': 'm',
+    '\u2099': 'n',
+    '\u2092': 'o',
+    '\u209A': 'p',
+    '\u1D63': 'r',
+    '\u209B': 's',
+    '\u209C': 't',
+    '\u1D64': 'u',
+    '\u1D65': 'v',
+    '\u2093': 'x',
+    '\u1D66': 'β', // subscript Greek letters
+    '\u1D67': 'γ',
+    '\u1D68': 'ρ',
+    '\u1D69': '\u03d5', // maps to U+03D5, written as an escape
+    '\u1D6A': 'χ',
+    '⁺': '+', // superscript symbols and digits
+    '⁻': '-',
+    '⁼': '=',
+    '⁽': '(',
+    '⁾': ')',
+    '⁰': '0',
+    '¹': '1',
+    '²': '2',
+    '³': '3',
+    '⁴': '4',
+    '⁵': '5',
+    '⁶': '6',
+    '⁷': '7',
+    '⁸': '8',
+    '⁹': '9',
+    '\u1D2C': 'A', // superscript capital Latin letters (no C, F, Q, S, X-Z)
+    '\u1D2E': 'B',
+    '\u1D30': 'D',
+    '\u1D31': 'E',
+    '\u1D33': 'G',
+    '\u1D34': 'H',
+    '\u1D35': 'I',
+    '\u1D36': 'J',
+    '\u1D37': 'K',
+    '\u1D38': 'L',
+    '\u1D39': 'M',
+    '\u1D3A': 'N',
+    '\u1D3C': 'O',
+    '\u1D3E': 'P',
+    '\u1D3F': 'R',
+    '\u1D40': 'T',
+    '\u1D41': 'U',
+    '\u2C7D': 'V',
+    '\u1D42': 'W',
+    '\u1D43': 'a', // superscript small Latin letters (no q)
+    '\u1D47': 'b',
+    '\u1D9C': 'c',
+    '\u1D48': 'd',
+    '\u1D49': 'e',
+    '\u1DA0': 'f',
+    '\u1D4D': 'g',
+    '\u02B0': 'h',
+    '\u2071': 'i',
+    '\u02B2': 'j',
+    '\u1D4F': 'k',
+    '\u02E1': 'l',
+    '\u1D50': 'm',
+    '\u207F': 'n',
+    '\u1D52': 'o',
+    '\u1D56': 'p',
+    '\u02B3': 'r',
+    '\u02E2': 's',
+    '\u1D57': 't',
+    '\u1D58': 'u',
+    '\u1D5B': 'v',
+    '\u02B7': 'w',
+    '\u02E3': 'x',
+    '\u02B8': 'y',
+    '\u1DBB': 'z',
+    '\u1D5D': 'β', // superscript Greek letters
+    '\u1D5E': 'γ',
+    '\u1D5F': 'δ',
+    '\u1D60': '\u03d5', // maps to U+03D5, same target as the subscript entry
+    '\u1D61': 'χ',
+    '\u1DBF': 'θ',
+});
diff --git a/test/katex-spec.js b/test/katex-spec.js
index 6b69df82..2b40ebe2 100644
--- a/test/katex-spec.js
+++ b/test/katex-spec.js
@@ -275,6 +275,10 @@ describe("A subscript and superscript parser", function() {
         expect`x_{x^x}`.toParse();
         expect`x_{x_x}`.toParse();
     });
+
+ it("should work with Unicode (sub|super)script characters", function() { + expect`A² + B²⁺³ + ¹²C + E₂³ + F₂₊₃`.toParseLike("A^{2} + B^{2+3} + ^{12}C + E_{2}^{3} + F_{2+3}"); + }); }); describe("A subscript and superscript tree-builder", function() {