Add basic support for Indic scripts in addition to CJK. (#1060)

This patch just makes KaTeX recognize Unicode codepoints in the range \u0900-\u109f so that those South and Southeast Asian scripts do not get automatically rejected. The patch also generalizes the way that Unicode blocks are handled to make it easier to add support for new scripts in the future. src/unicodeRegexes.js is replaced with the new file src/unicodeScripts.js.
2025-10-05 03:08:40 +00:00 · 2018-01-12 16:14:31 -08:00
parent d6609f7319
commit 7fe6af2a82
6 changed files with 170 additions and 33 deletions
--- a/src/Parser.js
+++ b/src/Parser.js
@@ -7,7 +7,7 @@ import MacroExpander from "./MacroExpander";
 import symbols from "./symbols";
 import utils from "./utils";
 import { validUnit } from "./units";
-import { cjkRegex } from "./unicodeRegexes";
+import { supportedCodepoint } from "./unicodeScripts";
 import unicodeAccents from "./unicodeAccents";
 import unicodeSymbols from "./unicodeSymbols";
 import ParseNode from "./ParseNode";
@@ -1072,7 +1072,8 @@ export default class Parser {
        if (symbols[this.mode][text]) {
            symbol = new ParseNode(symbols[this.mode][text].group,
                            text, this.mode, nucleus);
-        } else if (this.mode === "text" && cjkRegex.test(text)) {
+        } else if (this.mode === "text" &&
+                   supportedCodepoint(text.charCodeAt(0))) {
            symbol = new ParseNode("textord", text, this.mode, nucleus);
        } else {
            return null;  // EOF, ^, _, {, }, etc.
--- a/src/domTree.js
+++ b/src/domTree.js
@@ -8,7 +8,7 @@
 *
 * Similar functions for working with MathML nodes exist in mathMLTree.js.
 */
-import {cjkRegex, hangulRegex} from "./unicodeRegexes";
+import { scriptFromCodepoint } from "./unicodeScripts";
 import utils from "./utils";
 import svgGeometry from "./svgGeometry";
 import type Options from "./Options";
@@ -408,19 +408,16 @@ class symbolNode implements CombinableDomNode {
        this.style = style || {};
        this.maxFontSize = 0;

-        // Mark CJK characters with specific classes so that we can specify which
-        // fonts to use.  This allows us to render these characters with a serif
-        // font in situations where the browser would either default to a sans serif
-        // or render a placeholder character.
-        if (cjkRegex.test(this.value)) {
-            // I couldn't find any fonts that contained Hangul as well as all of
-            // the other characters we wanted to test there for it gets its own
-            // CSS class.
-            if (hangulRegex.test(this.value)) {
-                this.classes.push('hangul_fallback');
-            } else {
-                this.classes.push('cjk_fallback');
-            }
+        // Mark text from non-Latin scripts with specific classes so that we
+        // can specify which fonts to use.  This allows us to render these
+        // characters with a serif font in situations where the browser would
+        // either default to a sans serif or render a placeholder character.
+        // We use CSS class names like cjk_fallback, hangul_fallback and
+        // brahmic_fallback. See ./unicodeScripts.js for the set of possible
+        // script names
+        const script = scriptFromCodepoint(this.value.charCodeAt(0));
+        if (script) {
+            this.classes.push(script + "_fallback");
        }

        if (/[îïíì]/.test(this.value)) {    // add ī when we add Extended Latin
--- a/src/fontMetrics.js
+++ b/src/fontMetrics.js
@@ -1,5 +1,5 @@
 // @flow
-import { cjkRegex } from "./unicodeRegexes";
+import { supportedCodepoint } from "./unicodeScripts";

 /**
 * This file contains metrics regarding fonts and individual symbols. The sigma
@@ -198,10 +198,21 @@ const getCharacterMetrics = function(
    let ch = character.charCodeAt(0);
    if (character[0] in extraCharacterMap) {
        ch = extraCharacterMap[character[0]].charCodeAt(0);
-    } else if (cjkRegex.test(character[0])) {
-        ch = 'M'.charCodeAt(0);
    }
-    const metrics = metricMap[font]['' + ch];
+    let metrics = metricMap[font][ch];
+
+    if (!metrics) {
+        // We don't typically have font metrics for Asian scripts.
+        // So if the character is in a script we support but we
+        // don't have metrics for it, just use the metrics for
+        // the Latin capital letter M. This is close enough because
+        // we (currently) only care about the height of the glyph,
+        // not its width.
+        if (supportedCodepoint(ch)) {
+            metrics = metricMap[font][77]; // 77 is the charcode for 'M'
+        }
+    }
+
    if (metrics) {
        return {
            depth: metrics[0],
--- a/src/unicodeRegexes.js
+++ b/src/unicodeRegexes.js
@@ -1,13 +0,0 @@
-// @flow
-export const hangulRegex = /[\uAC00-\uD7AF]/;
-
-// This regex combines
-// - CJK symbols and punctuation: [\u3000-\u303F]
-// - Hiragana: [\u3040-\u309F]
-// - Katakana: [\u30A0-\u30FF]
-// - CJK ideograms: [\u4E00-\u9FAF]
-// - Hangul syllables: [\uAC00-\uD7AF]
-// - Fullwidth punctuation: [\uFF00-\uFF60]
-// Notably missing are halfwidth Katakana and Romanji glyphs.
-export const cjkRegex =
-    /[\u3000-\u30FF\u4E00-\u9FAF\uAC00-\uD7AF\uFF00-\uFF60]/;
--- a/src/unicodeScripts.js
+++ b/src/unicodeScripts.js
@@ -0,0 +1,98 @@
+// @flow
+
+/*
+ * This file defines the Unicode scripts and script families that we
+ * support. To add new scripts or families, just add a new entry to the
+ * scriptData array below. Adding scripts to the scriptData array allows
+ * characters from that script to appear in \text{} environments.
+ */
+
+/**
+ * Each script or script family has a name and an array of blocks.
+ * Each block is an array of two numbers which specify the start and
+ * end points (inclusive) of a block of Unicode codepoints.
+ */
+type Script = {
+    name: string;
+    blocks: Array<Array<number>>;
+};
+
+/**
+ * Unicode block data for the families of scripts we support.
+ */
+const scriptData: Array<Script> = [
+    {
+        // Chinese and Japanese.
+        // The "k" in cjk is for Korean, but we've separated Korean out
+        name: "cjk",
+        blocks: [
+            [0x3000, 0x30FF], // CJK symbols and punctuation, Hiragana, Katakana
+            [0x4E00, 0x9FAF], // CJK ideograms
+            [0xFF00, 0xFF60], // Fullwidth punctuation
+            // TODO: add halfwidth Katakana and Romaji glyphs
+        ],
+    },
+    {
+        // Korean
+        name: 'hangul',
+        blocks: [[0xAC00, 0xD7AF]],
+    },
+    {
+        // The Brahmic scripts of South and Southeast Asia
+        // Devanagari (0900–097F)
+        // Bengali (0980–09FF)
+        // Gurmukhi (0A00–0A7F)
+        // Gujarati (0A80–0AFF)
+        // Oriya (0B00–0B7F)
+        // Tamil (0B80–0BFF)
+        // Telugu (0C00–0C7F)
+        // Kannada (0C80–0CFF)
+        // Malayalam (0D00–0D7F)
+        // Sinhala (0D80–0DFF)
+        // Thai (0E00–0E7F)
+        // Lao (0E80–0EFF)
+        // Tibetan (0F00–0FFF)
+        // Myanmar (1000–109F)
+        name: 'brahmic',
+        blocks: [[0x0900, 0x109F]],
+    },
+];
+
+/**
+ * Given a codepoint, return the name of the script or script family
+ * it is from, or null if it is not part of a known block
+ */
+export function scriptFromCodepoint(codepoint: number): ?string {
+    for (const script of scriptData) {
+        for (const block of script.blocks) {
+            if (codepoint >= block[0] && codepoint <= block[1]) {
+                return script.name;
+            }
+        }
+    }
+    return null;
+}
+
+/**
+ * A flattened version of all the supported blocks in a single array.
+ * This is an optimization to make supportedCodepoint() fast.
+ */
+const allBlocks: Array<number> = [];
+scriptData.forEach(s => s.blocks.forEach(b => allBlocks.push(...b)));
+
+/**
+ * Given a codepoint, return true if it falls within one of the
+ * scripts or script families defined above and false otherwise.
+ *
+ * Microbenchmarks show that this is faster than
+ * /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60\uAC00-\uD7AF\u0900-\u109F]/.test()
+ * in Firefox, Chrome and Node.
+ */
+export function supportedCodepoint(codepoint: number): boolean {
+    for (let i = 0; i < allBlocks.length; i += 2) {
+        if (codepoint >= allBlocks[i] && codepoint <= allBlocks[i + 1]) {
+            return true;
+        }
+    }
+    return false;
+}
--- a/test/unicode-spec.js
+++ b/test/unicode-spec.js
@@ -6,6 +6,7 @@
 import ParseError from "../src/ParseError";
 import parseTree from "../src/parseTree";
 import Settings from "../src/Settings";
+import {scriptFromCodepoint, supportedCodepoint} from "../src/unicodeScripts";

 const defaultSettings = new Settings({});

@@ -101,4 +102,46 @@ describe("unicode", function() {
        expect('私はバナナです。').toNotParse();
        expect('여보세요').toNotParse();
    });
+
+    it("should parse Devangari inside \\text{}", function() {
+        expect('\\text{नमस्ते}').toParse();
+    });
+
+    it("should not parse Devangari outside \\text{}", function() {
+        expect('नमस्ते').toNotParse();
+    });
+});
+
+describe("unicodeScripts", () => {
+    const cjkRE = /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60]/;
+    const hangulRE = /[\uAC00-\uD7AF]/;
+    const brahmicRE = /[\u0900-\u109F]/;
+    const allRE =
+        /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60\uAC00-\uD7AF\u0900-\u109F]/;
+
+    it("supportedCodepoint() should return the correct values", () => {
+        for (let codepoint = 0; codepoint <= 0xffff; codepoint++) {
+            expect(supportedCodepoint(codepoint)).toBe(
+                allRE.test(String.fromCharCode(codepoint))
+            );
+        }
+    });
+
+    it("scriptFromCodepoint() should return correct values", () => {
+        for (let codepoint = 0; codepoint <= 0xffff; codepoint++) {
+            const character = String.fromCharCode(codepoint);
+            const script = scriptFromCodepoint(codepoint);
+
+            if (cjkRE.test(character)) {
+                expect(script).toEqual('cjk');
+            } else if (hangulRE.test(character)) {
+                expect(script).toEqual('hangul');
+            } else if (brahmicRE.test(character)) {
+                expect(script).toEqual('brahmic');
+            } else {
+                expect(script).toBe(null);
+                expect(supportedCodepoint(codepoint)).toBe(false);
+            }
+        }
+    });
 });