Support more scripts in \text{} environments. (#1076)

* Support more scripts in \text{} environments. This diff is a follow-up to PR #1060, which added support for Indic scripts. In order to support Czech, Turkish and Hungarian text (at least) inside \text{} environments, we need to recognize the Latin Extended-A and Latin Extended-B Unicode blocks. The patch also adds support for Georgian, and enhances support for Cyrillic by defining the entire Cyrillic Unicode block instead of defining symbols for a subset of Cyrillic letters as we did previously.
* Only return fontMetrics for supported Unicode scripts in text mode. The Unicode scripts listed in unicodeScripts.js are supported in text mode, but getCharacterMetrics() was returning fake metrics for them even in math mode. This caused bad handling of \boldsymbol\imath.
* Use Mode from types.js.
2025-10-05 03:08:40 +00:00 · 2018-01-21 21:53:17 -08:00
parent 5f32b71c85
commit 853e2a4fb7
7 changed files with 90 additions and 45 deletions
--- a/src/buildCommon.js
+++ b/src/buildCommon.js
@@ -40,7 +40,7 @@ const lookupSymbol = function(
    }
    return {
        value: value,
-        metrics: fontMetrics.getCharacterMetrics(value, fontFamily),
+        metrics: fontMetrics.getCharacterMetrics(value, fontFamily, mode),
    };
 };

--- a/src/buildMathML.js
+++ b/src/buildMathML.js
@@ -53,7 +53,7 @@ const getVariant = function(group, options) {
    }

    const fontName = buildCommon.fontMap[font].fontName;
-    if (fontMetrics.getCharacterMetrics(value, fontName)) {
+    if (fontMetrics.getCharacterMetrics(value, fontName, mode)) {
        return buildCommon.fontMap[font].variant;
    }

--- a/src/delimiter.js
+++ b/src/delimiter.js
@@ -33,13 +33,13 @@ import utils from "./utils";
 * Get the metrics for a given symbol and font, after transformation (i.e.
 * after following replacement from symbols.js)
 */
-const getMetrics = function(symbol, font) {
+const getMetrics = function(symbol, font, mode) {
    if (symbols.math[symbol] && symbols.math[symbol].replace) {
        return fontMetrics.getCharacterMetrics(
-            symbols.math[symbol].replace, font);
+            symbols.math[symbol].replace, font, mode);
    } else {
        return fontMetrics.getCharacterMetrics(
-            symbol, font);
+            symbol, font, mode);
    }
 };

@@ -240,16 +240,16 @@ const makeStackedDelim = function(delim, heightTotal, center, options, mode,
    }

    // Get the metrics of the four sections
-    const topMetrics = getMetrics(top, font);
+    const topMetrics = getMetrics(top, font, mode);
    const topHeightTotal = topMetrics.height + topMetrics.depth;
-    const repeatMetrics = getMetrics(repeat, font);
+    const repeatMetrics = getMetrics(repeat, font, mode);
    const repeatHeightTotal = repeatMetrics.height + repeatMetrics.depth;
-    const bottomMetrics = getMetrics(bottom, font);
+    const bottomMetrics = getMetrics(bottom, font, mode);
    const bottomHeightTotal = bottomMetrics.height + bottomMetrics.depth;
    let middleHeightTotal = 0;
    let middleFactor = 1;
    if (middle !== null) {
-        const middleMetrics = getMetrics(middle, font);
+        const middleMetrics = getMetrics(middle, font, mode);
        middleHeightTotal = middleMetrics.height + middleMetrics.depth;
        middleFactor = 2; // repeat symmetrically above and below middle
    }
@@ -522,7 +522,7 @@ const traverseSequence = function(delim, height, sequence, options) {
            break;
        }

-        const metrics = getMetrics(delim, delimTypeToFont(sequence[i]));
+        const metrics = getMetrics(delim, delimTypeToFont(sequence[i]), "math");
        let heightDepth = metrics.height + metrics.depth;

        // Small delimiters are scaled down versions of the same font, so we
--- a/src/fontMetrics.js
+++ b/src/fontMetrics.js
@@ -1,6 +1,8 @@
 // @flow
 import { supportedCodepoint } from "./unicodeScripts";

+import type { Mode } from "./types";
+
 /**
 * This file contains metrics regarding fonts and individual symbols. The sigma
 * and xi variables, as well as the metricMap map contain data extracted from
@@ -191,6 +193,7 @@ export type CharacterMetrics = {
 const getCharacterMetrics = function(
    character: string,
    font: string,
+    mode: Mode,
 ): ?CharacterMetrics {
    if (!metricMap[font]) {
        throw new Error(`Font metrics not found for font: ${font}.`);
@@ -201,10 +204,12 @@ const getCharacterMetrics = function(
    }
    let metrics = metricMap[font][ch];

-    if (!metrics) {
+    if (!metrics && mode === 'text') {
        // We don't typically have font metrics for Asian scripts.
+        // But since we support them in text mode, we need to return
+        // some sort of metrics.
        // So if the character is in a script we support but we
-        // dont have metrics for it, just use the metrics for
+        // don't have metrics for it, just use the metrics for
        // the Latin capital letter M. This is close enough because
        // we (currently) only care about the height of the glyph
        // not its width.
--- a/src/symbols.js
+++ b/src/symbols.js
@@ -735,12 +735,6 @@ for (let i = 0; i < extraLatinMath.length; i++) {
    defineSymbol(math, main, mathord, ch, ch);
 }

-// Cyrillic
-for (let i = 0x0410; i <= 0x044F; i++) {
-    const ch = String.fromCharCode(i);
-    defineSymbol(text, main, textord, ch, ch);
-}
-
 // Unicode versions of existing characters
 defineSymbol(text, main, textord, "\u2013", "–");
 defineSymbol(text, main, textord, "\u2014", "—");
--- a/src/unicodeScripts.js
+++ b/src/unicodeScripts.js
@@ -18,24 +18,25 @@ type Script = {
 };

 /**
- * Unicode block data for the families of scripts we support.
+ * Unicode block data for the families of scripts we support in \text{}.
+ * Scripts only need to appear here if they do not have font metrics.
 */
 const scriptData: Array<Script> = [
    {
-        // Chinese and Japanese.
-        // The "k" in cjk is for Korean, but we've separated Korean out
-        name: "cjk",
+        // Latin characters beyond the Latin-1 characters we have metrics for.
+        // Needed for Czech, Hungarian and Turkish text, for example.
+        name: 'latin',
        blocks: [
-            [0x3000, 0x30FF], // CJK symbols and punctuation, Hiragana, Katakana
-            [0x4E00, 0x9FAF], // CJK ideograms
-            [0xFF00, 0xFF60], // Fullwidth punctuation
-            // TODO: add halfwidth Katakana and Romanji glyphs
+            [0x0100, 0x024f],  // Latin Extended-A and Latin Extended-B
+            [0x0300, 0x036f],  // Combining Diacritical marks
        ],
    },
    {
-        // Korean
-        name: 'hangul',
-        blocks: [[0xAC00, 0xD7AF]],
+        // The Cyrillic script used by Russian and related languages.
+        // A Cyrillic subset used to be supported as explicitly defined
+        // symbols in symbols.js
+        name: 'cyrillic',
+        blocks: [[0x0400, 0x04ff]],
    },
    {
        // The Brahmic scripts of South and Southeast Asia
@@ -56,6 +57,26 @@ const scriptData: Array<Script> = [
        name: 'brahmic',
        blocks: [[0x0900, 0x109F]],
    },
+    {
+        name: 'georgian',
+        blocks: [[0x10A0, 0x10ff]],
+    },
+    {
+        // Chinese and Japanese.
+        // The "k" in cjk is for Korean, but we've separated Korean out
+        name: "cjk",
+        blocks: [
+            [0x3000, 0x30FF], // CJK symbols and punctuation, Hiragana, Katakana
+            [0x4E00, 0x9FAF], // CJK ideograms
+            [0xFF00, 0xFF60], // Fullwidth punctuation
+            // TODO: add halfwidth Katakana and Romaji glyphs
+        ],
+    },
+    {
+        // Korean
+        name: 'hangul',
+        blocks: [[0xAC00, 0xD7AF]],
+    },
 ];

 /**
--- a/test/unicode-spec.js
+++ b/test/unicode-spec.js
@@ -110,38 +110,63 @@ describe("unicode", function() {
    it("should not parse Devangari outside \\text{}", function() {
        expect('नमस्ते').toNotParse();
    });
+
+    it("should parse Georgian inside \\text{}", function() {
+        expect('\\text{გამარჯობა}').toParse();
+    });
+
+    it("should not parse Georgian outside \\text{}", function() {
+        expect('გამარჯობა').toNotParse();
+    });
+
+    it("should parse extended Latin characters inside \\text{}", function() {
+        expect('\\text{ěščřžůřťďňőİı}').toParse();
+    });
+
+    it("should not parse extended Latin outside \\text{}", function() {
+        expect('ěščřžůřťďňőİı').toNotParse();
+    });
+
 });

 describe("unicodeScripts", () => {
-    const cjkRE = /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60]/;
-    const hangulRE = /[\uAC00-\uD7AF]/;
-    const brahmicRE = /[\u0900-\u109F]/;
-    const allRE =
-        /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60\uAC00-\uD7AF\u0900-\u109F]/;
+    const scriptRegExps = {
+        latin: /[\u0100-\u024f\u0300-\u036f]/,
+        cyrillic: /[\u0400-\u04ff]/,
+        brahmic: /[\u0900-\u109F]/,
+        georgian: /[\u10a0-\u10ff]/,
+        cjk: /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60]/,
+        hangul: /[\uAC00-\uD7AF]/,
+    };
+
+    const scriptNames = Object.keys(scriptRegExps);
+
+    const allRegExp = new RegExp(
+        Object.values(scriptRegExps).map(re => re.source).join('|')
+    );

    it("supportedCodepoint() should return the correct values", () => {
        for (let codepoint = 0; codepoint <= 0xffff; codepoint++) {
            expect(supportedCodepoint(codepoint)).toBe(
-                allRE.test(String.fromCharCode(codepoint))
+                allRegExp.test(String.fromCharCode(codepoint))
            );
        }
    });

    it("scriptFromCodepoint() should return correct values", () => {
-        for (let codepoint = 0; codepoint <= 0xffff; codepoint++) {
+        outer: for (let codepoint = 0; codepoint <= 0xffff; codepoint++) {
            const character = String.fromCharCode(codepoint);
            const script = scriptFromCodepoint(codepoint);

-            if (cjkRE.test(character)) {
-                expect(script).toEqual('cjk');
-            } else if (hangulRE.test(character)) {
-                expect(script).toEqual('hangul');
-            } else if (brahmicRE.test(character)) {
-                expect(script).toEqual('brahmic');
-            } else {
-                expect(script).toBe(null);
-                expect(supportedCodepoint(codepoint)).toBe(false);
+            for (const scriptName of scriptNames) {
+                if (scriptRegExps[scriptName].test(character)) {
+                    expect(script).toEqual(scriptName);
+                    continue outer;
+                }
            }
+
+            expect(script).toBe(null);
+            expect(supportedCodepoint(codepoint)).toBe(false);
        }
    });
 });