feat: Support Unicode (sub|super)script characters (#3633)

* feat: Support Unicode (sub|super)script characters * Acquire tokens via repeated fetch() * Match more Unicode (sub|super)script characters * Update docs with new characters * Add Greek characters to RegEx * Pick up review comments Co-authored-by: Erik Demaine <edemaine@mit.edu>
2025-10-05 19:28:39 +00:00 · 2022-05-20 06:50:34 -07:00
parent c31256f838
commit d8fc35e6a9
4 changed files with 139 additions and 0 deletions
--- a/docs/supported.md
+++ b/docs/supported.md
@@ -190,6 +190,9 @@ $\allowbreak α β γ δ ϵ ζ η θ ι κ λ μ ν ξ o π \allowbreak ρ σ τ
 Direct Input: $∂ ∇ ℑ Ⅎ ℵ ℶ ℷ ℸ ⅁ ℏ ð − ∗$
 ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖÙÚÛÜÝÞßàáâãäåçèéêëìíîïðñòóôöùúûüýþÿ
 ₊₋₌₍₎₀₁₂₃₄₅₆₇₈₉ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓᵦᵧᵨᵩᵪ⁺⁻⁼⁽⁾⁰¹²³⁴⁵⁶⁷⁸⁹ᵃᵇᶜᵈᵉᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘʷˣʸᶻᵛᵝᵞᵟᵠᵡ
 Math-mode Unicode (sub|super)script characters will render as if you had written regular characters in a subscript or superscript. For instance, `A²⁺³` will render the same as `A^{2+3}`.
 </div>
 <div class="katex-cards" id="math-alpha">
--- a/src/Parser.js
+++ b/src/Parser.js
@@ -9,6 +9,7 @@ import ParseError from "./ParseError";
 import {combiningDiacriticalMarksEndRegex} from "./Lexer";
 import Settings from "./Settings";
 import SourceLocation from "./SourceLocation";
 import {uSubsAndSups, unicodeSubRegEx} from "./unicodeSupOrSub";
 import {Token} from "./Token";
 // Pre-evaluate both modules as unicodeSymbols require String.normalize()
@@ -399,6 +400,29 @@ export default class Parser {
                }
                // Put everything into an ordgroup as the superscript
                superscript = {type: "ordgroup", mode: this.mode, body: primes};
            } else if (uSubsAndSups[lex.text]) {
                // A Unicode subscript or superscript character.
                // We treat these similarly to the unicode-math package.
                // So we render a string of Unicode (sub|super)scripts the
                // same as a (sub|super)script of regular characters.
                let str = uSubsAndSups[lex.text];
                const isSub = unicodeSubRegEx.test(lex.text);
                this.consume();
                // Continue fetching tokens to fill out the string.
                while (true) {
                    const token = this.fetch().text;
                    if (!(uSubsAndSups[token])) { break; }
                    if (unicodeSubRegEx.test(token) !== isSub) { break; }
                    this.consume();
                    str += uSubsAndSups[token];
                }
                // Now create a (sub|super)script.
                const body = (new Parser(str, this.settings)).parse();
                if (isSub) {
                    subscript = {type: "ordgroup", mode: "math", body};
                } else {
                    superscript = {type: "ordgroup", mode: "math", body};
                }
            } else {
                // If it wasn't ^, _, or ', stop parsing super/subscripts
                break;
--- a/src/unicodeSupOrSub.js
+++ b/src/unicodeSupOrSub.js
@@ -0,0 +1,108 @@
 // Helpers for Parser.js handling of Unicode (sub|super)script characters.

 /**
  * Matches a string whose first character is one of the Unicode *subscript*
  * characters listed in `uSubsAndSups`. Every key of `uSubsAndSups` that does
  * not match this expression is a superscript character, so the parser uses
  * this test to decide which kind of script a table entry produces.
  *
  * The expression has no `g`/`y` flag, so `.test()` keeps no state between
  * calls.
  */
 export const unicodeSubRegEx = /^[₊₋₌₍₎₀₁₂₃₄₅₆₇₈₉ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓᵦᵧᵨᵩᵪ]/;

 /**
  * Lookup table from a Unicode subscript or superscript character to the
  * regular character it stands for (e.g. '₂' and '²' both map to '2').
  *
  * The table is frozen and has a null prototype. Parser.js tests membership
  * with a plain truthiness lookup (`uSubsAndSups[text]`); without the null
  * prototype, inherited names such as "constructor" or "toString" would
  * resolve to functions on Object.prototype and be mistaken for entries.
  */
 export const uSubsAndSups = Object.freeze({
    // No inherited properties: only the keys listed below ever resolve.
    __proto__: null,
    '₊': '+',
    '₋': '-',
    '₌': '=',
    '₍': '(',
    '₎': ')',
    '₀': '0',
    '₁': '1',
    '₂': '2',
    '₃': '3',
    '₄': '4',
    '₅': '5',
    '₆': '6',
    '₇': '7',
    '₈': '8',
    '₉': '9',
    '\u2090': 'a',
    '\u2091': 'e',
    '\u2095': 'h',
    '\u1D62': 'i',
    '\u2C7C': 'j',
    '\u2096': 'k',
    '\u2097': 'l',
    '\u2098': 'm',
    '\u2099': 'n',
    '\u2092': 'o',
    '\u209A': 'p',
    '\u1D63': 'r',
    '\u209B': 's',
    '\u209C': 't',
    '\u1D64': 'u',
    '\u1D65': 'v',
    '\u2093': 'x',
    '\u1D66': 'β',
    '\u1D67': 'γ',
    '\u1D68': 'ρ',
    '\u1D69': '\u03d5',
    '\u1D6A': 'χ',
    '⁺': '+',
    '⁻': '-',
    '⁼': '=',
    '⁽': '(',
    '⁾': ')',
    '⁰': '0',
    '¹': '1',
    '²': '2',
    '³': '3',
    '⁴': '4',
    '⁵': '5',
    '⁶': '6',
    '⁷': '7',
    '⁸': '8',
    '⁹': '9',
    '\u1D2C': 'A',
    '\u1D2E': 'B',
    '\u1D30': 'D',
    '\u1D31': 'E',
    '\u1D33': 'G',
    '\u1D34': 'H',
    '\u1D35': 'I',
    '\u1D36': 'J',
    '\u1D37': 'K',
    '\u1D38': 'L',
    '\u1D39': 'M',
    '\u1D3A': 'N',
    '\u1D3C': 'O',
    '\u1D3E': 'P',
    '\u1D3F': 'R',
    '\u1D40': 'T',
    '\u1D41': 'U',
    '\u2C7D': 'V',
    '\u1D42': 'W',
    '\u1D43': 'a',
    '\u1D47': 'b',
    '\u1D9C': 'c',
    '\u1D48': 'd',
    '\u1D49': 'e',
    '\u1DA0': 'f',
    '\u1D4D': 'g',
    '\u02B0': 'h',
    '\u2071': 'i',
    '\u02B2': 'j',
    '\u1D4F': 'k',
    '\u02E1': 'l',
    '\u1D50': 'm',
    '\u207F': 'n',
    '\u1D52': 'o',
    '\u1D56': 'p',
    '\u02B3': 'r',
    '\u02E2': 's',
    '\u1D57': 't',
    '\u1D58': 'u',
    '\u1D5B': 'v',
    '\u02B7': 'w',
    '\u02E3': 'x',
    '\u02B8': 'y',
    '\u1DBB': 'z',
    '\u1D5D': 'β',
    '\u1D5E': 'γ',
    '\u1D5F': 'δ',
    '\u1D60': '\u03d5',
    '\u1D61': 'χ',
    '\u1DBF': 'θ',
 });
--- a/test/katex-spec.js
+++ b/test/katex-spec.js
@@ -275,6 +275,10 @@ describe("A subscript and superscript parser", function() {
        expect`x_{x^x}`.toParse();
        expect`x_{x_x}`.toParse();
    });
    it("should work with Unicode (sub|super)script characters", function() {
        // Each Unicode term on the left is expected to parse like the
        // explicit ^{...} / _{...} term in the same position on the right:
        // a run of adjacent script characters of one kind (²⁺³, ₂₊₃) forms
        // a single group, a leading run (¹²C) is a superscript with no
        // base, and a subscript run followed by a superscript run (E₂³)
        // yields both scripts on the same base.
        expect`A² + B²⁺³ + ¹²C + E₂³ + F₂₊₃`.toParseLike("A^{2} + B^{2+3} + ^{12}C + E_{2}^{3} + F_{2+3}");
    });
 });
 describe("A subscript and superscript tree-builder", function() {