Support more scripts in \text{} environments. (#1076)

* Support more scripts in \text{} environments.

  This diff is a follow-up to PR #1060, which added support for Indic scripts. In order to support Czech, Turkish, and Hungarian text (at least) inside \text{} environments, we need to recognize the Latin Extended-A and Latin Extended-B Unicode blocks. The patch also adds support for Georgian, and enhances support for Cyrillic by defining the entire Cyrillic Unicode block instead of defining symbols for a subset of Cyrillic letters as we did previously.

* Only return fontMetrics for supported Unicode scripts in text mode.

  The Unicode scripts listed in unicodeScripts.js are supported in text mode, but getCharacterMetrics() was returning fake metrics for them even in math mode. This caused bad handling of \boldsymbol\imath.

* Use Mode from types.js.
2025-10-05 03:08:40 +00:00 · 2018-01-21 21:53:17 -08:00
parent 5f32b71c85
commit 853e2a4fb7
7 changed files with 90 additions and 45 deletions
--- a/test/unicode-spec.js
+++ b/test/unicode-spec.js
@@ -110,38 +110,63 @@ describe("unicode", function() {
    it("should not parse Devangari outside \\text{}", function() {
        expect('नमस्ते').toNotParse();
    });
+
+    it("should parse Georgian inside \\text{}", function() {
+        expect('\\text{გამარჯობა}').toParse();
+    });
+
+    it("should not parse Georgian outside \\text{}", function() {
+        expect('გამარჯობა').toNotParse();
+    });
+
+    it("should parse extended Latin characters inside \\text{}", function() {
+        expect('\\text{ěščřžůřťďňőİı}').toParse();
+    });
+
+    it("should not parse extended Latin outside \\text{}", function() {
+        expect('ěščřžůřťďňőİı').toNotParse();
+    });
+
 });

 describe("unicodeScripts", () => {
-    const cjkRE = /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60]/;
-    const hangulRE = /[\uAC00-\uD7AF]/;
-    const brahmicRE = /[\u0900-\u109F]/;
-    const allRE =
-        /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60\uAC00-\uD7AF\u0900-\u109F]/;
+    const scriptRegExps = {
+        latin: /[\u0100-\u024f\u0300-\u036f]/,
+        cyrillic: /[\u0400-\u04ff]/,
+        brahmic: /[\u0900-\u109F]/,
+        georgian: /[\u10a0-\u10ff]/,
+        cjk: /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60]/,
+        hangul: /[\uAC00-\uD7AF]/,
+    };
+
+    const scriptNames = Object.keys(scriptRegExps);
+
+    const allRegExp = new RegExp(
+        Object.values(scriptRegExps).map(re => re.source).join('|')
+    );

    it("supportedCodepoint() should return the correct values", () => {
        for (let codepoint = 0; codepoint <= 0xffff; codepoint++) {
            expect(supportedCodepoint(codepoint)).toBe(
-                allRE.test(String.fromCharCode(codepoint))
+                allRegExp.test(String.fromCharCode(codepoint))
            );
        }
    });

    it("scriptFromCodepoint() should return correct values", () => {
-        for (let codepoint = 0; codepoint <= 0xffff; codepoint++) {
+        outer: for (let codepoint = 0; codepoint <= 0xffff; codepoint++) {
            const character = String.fromCharCode(codepoint);
            const script = scriptFromCodepoint(codepoint);

-            if (cjkRE.test(character)) {
-                expect(script).toEqual('cjk');
-            } else if (hangulRE.test(character)) {
-                expect(script).toEqual('hangul');
-            } else if (brahmicRE.test(character)) {
-                expect(script).toEqual('brahmic');
-            } else {
-                expect(script).toBe(null);
-                expect(supportedCodepoint(codepoint)).toBe(false);
+            for (const scriptName of scriptNames) {
+                if (scriptRegExps[scriptName].test(character)) {
+                    expect(script).toEqual(scriptName);
+                    continue outer;
+                }
            }
+
+            expect(script).toBe(null);
+            expect(supportedCodepoint(codepoint)).toBe(false);
        }
    });
 });