Unicode characters in math render in text mode (#2040)

* Unicode characters in math render in text mode. Improve #2031 by rendering all supported Unicode text characters (via supportedCodepoints) in text mode, mimicking wrapping them in `\text`, thereby using the metrics of the letter M as usual. * Add tests * Improve documentation * Implement review comments
2025-10-10 05:28:41 +00:00 · 2019-08-08 20:33:09 -04:00
parent 3b80e0123b
commit 3b6ed48f5b
4 changed files with 44 additions and 16 deletions
--- a/docs/supported.md
+++ b/docs/supported.md
@@ -180,9 +180,14 @@ Direct Input: $∂ ∇ ℑ Ⅎ ℵ ℶ ℷ ℸ ⅁ ℏ ð$

 **Unicode**

-The letters listed above will render in any KaTeX rendering mode.
+The letters listed above will render properly in any KaTeX rendering mode.

-If the KaTeX rendering mode is set to `strict: false` or `strict:"warn"` (default), then KaTeX will accept all Unicode letters. The letters not listed above will be rendered from system fonts, not KaTeX-supplied fonts, so their typography may clash. They may also cause small vertical alignment issues. KaTeX has detailed metrics for glyphs in Latin, Greek, and Cyrillic, but other glyphs are treated as if they are each as tall as the letter M.
+In addition, Brahmic, Georgian, Chinese, Japanese, and Korean glyphs are always accepted in text mode. However, these glyphs will be rendered from system fonts (not KaTeX-supplied fonts), so their typography may clash.
+You can define rules for the CSS classes `.latin-fallback`, `.cyrillic-fallback`, `.brahmic-fallback`, `.georgian-fallback`, `.cjk-fallback`, and `.hangul-fallback` to supply fallback fonts for these languages.
+Use of these glyphs may cause small vertical alignment issues: KaTeX has detailed metrics for the symbols listed above and for most Latin, Greek, and Cyrillic letters, but other accepted glyphs are treated as if they are each as tall as the letter M in the current KaTeX font.
+
+If the KaTeX rendering mode is set to `strict: false` or `strict: "warn"` (default), then KaTeX will accept all Unicode letters in both text and math mode.
+All unrecognized characters will be treated as if they appeared in text mode, and will be subject to the same issues: rendering from system fonts and possibly incorrect vertical alignment.

 For Persian composite characters, a user-supplied [plug-in](https://github.com/HosseinAgha/persian-katex-plugin) is under development.

--- a/src/Parser.js
+++ b/src/Parser.js
@@ -971,9 +971,16 @@ export default class Parser {
                        nucleus);
                }
            }
+            // All nonmathematical Unicode characters are rendered as if they
+            // are in text mode (wrapped in \text) because that's what it
+            // takes to render them in LaTeX.  Setting `mode: this.mode` is
+            // another natural choice (the user requested math mode), but
+            // this makes it more difficult for getCharacterMetrics() to
+            // distinguish between Unicode characters without metrics and
+            // those for which we want to simulate the letter M.
            symbol = {
                type: "textord",
-                mode: this.mode,
+                mode: "text",
                loc: SourceLocation.range(nucleus),
                text,
            };
--- a/src/buildCommon.js
+++ b/src/buildCommon.js
@@ -80,9 +80,8 @@ const makeSymbol = function(
            metrics.width, classes);
    } else {
        // TODO(emily): Figure out a good way to only print this in development
-        typeof console !== "undefined" && console.warn(
-            "No character metrics for '" + value + "' in style '" +
-                fontName + "'");
+        typeof console !== "undefined" && console.warn("No character metrics " +
+            `for '${value}' in style '${fontName}' and mode '${mode}'`);
        symbolNode = new SymbolNode(value, 0, 0, 0, 0, 0, classes);
    }

--- a/test/unicode-spec.js
+++ b/test/unicode-spec.js
@@ -9,7 +9,7 @@ import {strictSettings, nonstrictSettings} from "./helpers";
 describe("unicode", function() {
    it("should parse Latin-1 inside \\text{}", function() {
        expect`\text{ÀÁÂÃÄÅÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåèéêëìíîïñòóôõöùúûüýÿÆÇÐØÞßæçðøþ}`
-            .toParse();
+            .toBuild();
    });

    it("should not parse Latin-1 outside \\text{} with strict", function() {
@@ -21,19 +21,23 @@ describe("unicode", function() {

    it("should parse Latin-1 outside \\text{}", function() {
        expect`ÀÁÂÃÄÅÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåèéêëìíîïñòóôõöùúûüýÿÇÐÞçðþ`
-            .toParse(nonstrictSettings);
+            .toBuild(nonstrictSettings);
    });

    it("should parse all lower case Greek letters", function() {
-        expect`αβγδεϵζηθϑικλμνξοπϖρϱςστυφϕχψω`.toParse();
+        expect`αβγδεϵζηθϑικλμνξοπϖρϱςστυφϕχψω`.toBuild();
    });

    it("should parse math upper case Greek letters", function() {
-        expect`ΓΔΘΛΞΠΣΥΦΨΩ`.toParse();
+        expect`ΓΔΘΛΞΠΣΥΦΨΩ`.toBuild();
    });

    it("should parse Cyrillic inside \\text{}", function() {
-        expect`\text{БГДЖЗЙЛФЦШЫЮЯ}`.toParse();
+        expect`\text{БГДЖЗЙЛФЦШЫЮЯ}`.toBuild();
+    });
+
+    it("should parse Cyrillic outside \\text{}", function() {
+        expect`БГДЖЗЙЛФЦШЫЮЯ`.toBuild(nonstrictSettings);
    });

    it("should not parse Cyrillic outside \\text{} with strict", function() {
@@ -41,8 +45,13 @@ describe("unicode", function() {
    });

    it("should parse CJK inside \\text{}", function() {
-        expect`\text{私はバナナです}`.toParse();
-        expect`\text{여보세요}`.toParse();
+        expect`\text{私はバナナです}`.toBuild();
+        expect`\text{여보세요}`.toBuild();
+    });
+
+    it("should parse CJK outside \\text{}", function() {
+        expect`私はバナナです`.toBuild(nonstrictSettings);
+        expect`여보세요`.toBuild(nonstrictSettings);
    });

    it("should not parse CJK outside \\text{} with strict", function() {
@@ -51,7 +60,11 @@ describe("unicode", function() {
    });

    it("should parse Devangari inside \\text{}", function() {
-        expect`\text{नमस्ते}`.toParse();
+        expect`\text{नमस्ते}`.toBuild();
+    });
+
+    it("should parse Devangari outside \\text{}", function() {
+        expect`नमस्ते`.toBuild(nonstrictSettings);
    });

    it("should not parse Devangari outside \\text{} with strict", function() {
@@ -59,7 +72,11 @@ describe("unicode", function() {
    });

    it("should parse Georgian inside \\text{}", function() {
-        expect`\text{გამარჯობა}`.toParse();
+        expect`\text{გამარჯობა}`.toBuild();
+    });
+
+    it("should parse Georgian outside \\text{}", function() {
+        expect`გამარჯობა`.toBuild(nonstrictSettings);
    });

    it("should not parse Georgian outside \\text{} with strict", function() {
@@ -67,7 +84,7 @@ describe("unicode", function() {
    });

    it("should parse extended Latin characters inside \\text{}", function() {
-        expect`\text{ěščřžůřťďňőİı}`.toParse();
+        expect`\text{ěščřžůřťďňőİı}`.toBuild();
    });

    it("should not parse extended Latin outside \\text{} with strict", function() {