From 7fe6af2a82b508b0e7468043c19744644b59593c Mon Sep 17 00:00:00 2001
From: David Flanagan <davidflanagan@khanacademy.org>
Date: Fri, 12 Jan 2018 16:14:31 -0800
Subject: [PATCH] Add basic support for Indic scripts in addition to CJK.
 (#1060)

This patch just makes KaTeX recognize Unicode codepoints in the
range \u0900-\u109f so that those South and Southeast Asian scripts
do not get automatically rejected inside \text{}.

The patch also generalizes the way that Unicode blocks are handled
to make it easier to add support for new scripts in the future.
src/unicodeRegexes.js is replaced with the new file src/unicodeScripts.js
---
 src/Parser.js         |  5 ++-
 src/domTree.js        | 25 +++++------
 src/fontMetrics.js    | 19 +++++++--
 src/unicodeRegexes.js | 13 ------
 src/unicodeScripts.js | 98 +++++++++++++++++++++++++++++++++++++++++++
 test/unicode-spec.js  | 43 +++++++++++++++++++
 6 files changed, 170 insertions(+), 33 deletions(-)
 delete mode 100644 src/unicodeRegexes.js
 create mode 100644 src/unicodeScripts.js

diff --git a/src/Parser.js b/src/Parser.js
index d9da5c5b..7649128d 100644
--- a/src/Parser.js
+++ b/src/Parser.js
@@ -7,7 +7,7 @@ import MacroExpander from "./MacroExpander";
 import symbols from "./symbols";
 import utils from "./utils";
 import { validUnit } from "./units";
-import { cjkRegex } from "./unicodeRegexes";
+import { supportedCodepoint } from "./unicodeScripts";
 import unicodeAccents from "./unicodeAccents";
 import unicodeSymbols from "./unicodeSymbols";
 import ParseNode from "./ParseNode";
@@ -1072,7 +1072,8 @@ export default class Parser {
         if (symbols[this.mode][text]) {
             symbol = new ParseNode(symbols[this.mode][text].group,
                             text, this.mode, nucleus);
-        } else if (this.mode === "text" && cjkRegex.test(text)) {
+        } else if (this.mode === "text" &&
+                   supportedCodepoint(text.charCodeAt(0))) {
             symbol = new ParseNode("textord", text, this.mode, nucleus);
         } else {
             return null;  // EOF, ^, _, {, }, etc.
diff --git a/src/domTree.js b/src/domTree.js
index 9384a9be..7ff92113 100644
--- a/src/domTree.js
+++ b/src/domTree.js
@@ -8,7 +8,7 @@
  *
  * Similar functions for working with MathML nodes exist in mathMLTree.js.
  */
-import {cjkRegex, hangulRegex} from "./unicodeRegexes";
+import { scriptFromCodepoint } from "./unicodeScripts";
 import utils from "./utils";
 import svgGeometry from "./svgGeometry";
 import type Options from "./Options";
@@ -408,19 +408,16 @@ class symbolNode implements CombinableDomNode {
         this.style = style || {};
         this.maxFontSize = 0;
 
-        // Mark CJK characters with specific classes so that we can specify which
-        // fonts to use.  This allows us to render these characters with a serif
-        // font in situations where the browser would either default to a sans serif
-        // or render a placeholder character.
-        if (cjkRegex.test(this.value)) {
-            // I couldn't find any fonts that contained Hangul as well as all of
-            // the other characters we wanted to test there for it gets its own
-            // CSS class.
-            if (hangulRegex.test(this.value)) {
-                this.classes.push('hangul_fallback');
-            } else {
-                this.classes.push('cjk_fallback');
-            }
+        // Mark text from non-Latin scripts with specific classes so that we
+        // can specify which fonts to use.  This allows us to render these
+        // characters with a serif font in situations where the browser would
+        // either default to a sans serif or render a placeholder character.
+        // We use CSS class names like cjk_fallback, hangul_fallback and
+        // brahmic_fallback. See ./unicodeScripts.js for the set of possible
+        // script names.
+        const script = scriptFromCodepoint(this.value.charCodeAt(0));
+        if (script) {
+            this.classes.push(script + "_fallback");
         }
 
         if (/[îïíì]/.test(this.value)) {    // add ī when we add Extended Latin
diff --git a/src/fontMetrics.js b/src/fontMetrics.js
index ba33d79d..5c72a676 100644
--- a/src/fontMetrics.js
+++ b/src/fontMetrics.js
@@ -1,5 +1,5 @@
 // @flow
-import { cjkRegex } from "./unicodeRegexes";
+import { supportedCodepoint } from "./unicodeScripts";
 
 /**
  * This file contains metrics regarding fonts and individual symbols. The sigma
@@ -198,10 +198,21 @@ const getCharacterMetrics = function(
     let ch = character.charCodeAt(0);
     if (character[0] in extraCharacterMap) {
         ch = extraCharacterMap[character[0]].charCodeAt(0);
-    } else if (cjkRegex.test(character[0])) {
-        ch = 'M'.charCodeAt(0);
     }
-    const metrics = metricMap[font]['' + ch];
+    let metrics = metricMap[font][ch];
+
+    if (!metrics) {
+        // We don't typically have font metrics for Asian scripts.
+        // So if the character is in a script we support but we
+        // don't have metrics for it, just use the metrics for
+        // the Latin capital letter M. This is close enough because
+        // we (currently) only care about the height of the glyph,
+        // not its width.
+        if (supportedCodepoint(ch)) {
+            metrics = metricMap[font][77]; // 77 is the charcode for 'M'
+        }
+    }
+
     if (metrics) {
         return {
             depth: metrics[0],
diff --git a/src/unicodeRegexes.js b/src/unicodeRegexes.js
deleted file mode 100644
index a7ae9b47..00000000
--- a/src/unicodeRegexes.js
+++ /dev/null
@@ -1,13 +0,0 @@
-// @flow
-export const hangulRegex = /[\uAC00-\uD7AF]/;
-
-// This regex combines
-// - CJK symbols and punctuation: [\u3000-\u303F]
-// - Hiragana: [\u3040-\u309F]
-// - Katakana: [\u30A0-\u30FF]
-// - CJK ideograms: [\u4E00-\u9FAF]
-// - Hangul syllables: [\uAC00-\uD7AF]
-// - Fullwidth punctuation: [\uFF00-\uFF60]
-// Notably missing are halfwidth Katakana and Romanji glyphs.
-export const cjkRegex =
-    /[\u3000-\u30FF\u4E00-\u9FAF\uAC00-\uD7AF\uFF00-\uFF60]/;
diff --git a/src/unicodeScripts.js b/src/unicodeScripts.js
new file mode 100644
index 00000000..9b5c287d
--- /dev/null
+++ b/src/unicodeScripts.js
@@ -0,0 +1,98 @@
+// @flow
+
+/*
+ * This file defines the Unicode scripts and script families that we
+ * support. To add new scripts or families, just add a new entry to the
+ * scriptData array below. Adding scripts to the scriptData array allows
+ * characters from that script to appear in \text{} environments.
+ */
+
+/**
+ * Each script or script family has a name and an array of blocks.
+ * Each block is an array of two numbers which specify the start and
+ * end points (inclusive) of a block of Unicode codepoints.
+ */
+type Script = {
+    name: string;
+    blocks: Array<Array<number>>;
+};
+
+/**
+ * Unicode block data for the families of scripts we support.
+ */
+const scriptData: Array<Script> = [
+    {
+        // Chinese and Japanese.
+        // The "k" in cjk is for Korean, but we've separated Korean out
+        name: "cjk",
+        blocks: [
+            [0x3000, 0x30FF], // CJK symbols and punctuation, Hiragana, Katakana
+            [0x4E00, 0x9FAF], // CJK ideograms
+            [0xFF00, 0xFF60], // Fullwidth punctuation
+            // TODO: add halfwidth Katakana and Romaji glyphs
+        ],
+    },
+    {
+        // Korean
+        name: 'hangul',
+        blocks: [[0xAC00, 0xD7AF]],
+    },
+    {
+        // The Brahmic scripts of South and Southeast Asia
+        // Devanagari (0900–097F)
+        // Bengali (0980–09FF)
+        // Gurmukhi (0A00–0A7F)
+        // Gujarati (0A80–0AFF)
+        // Oriya (0B00–0B7F)
+        // Tamil (0B80–0BFF)
+        // Telugu (0C00–0C7F)
+        // Kannada (0C80–0CFF)
+        // Malayalam (0D00–0D7F)
+        // Sinhala (0D80–0DFF)
+        // Thai (0E00–0E7F)
+        // Lao (0E80–0EFF)
+        // Tibetan (0F00–0FFF)
+        // Myanmar (1000–109F)
+        name: 'brahmic',
+        blocks: [[0x0900, 0x109F]],
+    },
+];
+
+/**
+ * Given a codepoint, return the name of the script or script family
+ * it is from, or null if it is not part of a known block.
+ */
+export function scriptFromCodepoint(codepoint: number): ?string {
+    for (const script of scriptData) {
+        for (const block of script.blocks) {
+            if (codepoint >= block[0] && codepoint <= block[1]) {
+                return script.name;
+            }
+        }
+    }
+    return null;
+}
+
+/**
+ * A flattened version of all the supported blocks in a single array.
+ * This is an optimization to make supportedCodepoint() fast.
+ */
+const allBlocks: Array<number> = [];
+scriptData.forEach(s => s.blocks.forEach(b => allBlocks.push(...b)));
+
+/**
+ * Given a codepoint, return true if it falls within one of the
+ * scripts or script families defined above and false otherwise.
+ *
+ * Microbenchmarks show that this is faster than
+ * /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60\uAC00-\uD7AF\u0900-\u109F]/.test()
+ * in Firefox, Chrome and Node.
+ */
+export function supportedCodepoint(codepoint: number): boolean {
+    for (let i = 0; i < allBlocks.length; i += 2) {
+        if (codepoint >= allBlocks[i] && codepoint <= allBlocks[i + 1]) {
+            return true;
+        }
+    }
+    return false;
+}
diff --git a/test/unicode-spec.js b/test/unicode-spec.js
index ff68d395..c1132c0e 100644
--- a/test/unicode-spec.js
+++ b/test/unicode-spec.js
@@ -6,6 +6,7 @@
 import ParseError from "../src/ParseError";
 import parseTree from "../src/parseTree";
 import Settings from "../src/Settings";
+import {scriptFromCodepoint, supportedCodepoint} from "../src/unicodeScripts";
 
 const defaultSettings = new Settings({});
 
@@ -101,4 +102,46 @@ describe("unicode", function() {
         expect('私はバナナです。').toNotParse();
         expect('여보세요').toNotParse();
     });
+
+    it("should parse Devangari inside \\text{}", function() {
+        expect('\\text{नमस्ते}').toParse();
+    });
+
+    it("should not parse Devangari outside \\text{}", function() {
+        expect('नमस्ते').toNotParse();
+    });
+});
+
+describe("unicodeScripts", () => {
+    const cjkRE = /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60]/;
+    const hangulRE = /[\uAC00-\uD7AF]/;
+    const brahmicRE = /[\u0900-\u109F]/;
+    const allRE =
+        /[\u3000-\u30FF\u4E00-\u9FAF\uFF00-\uFF60\uAC00-\uD7AF\u0900-\u109F]/;
+
+    it("supportedCodepoint() should return the correct values", () => {
+        for (let codepoint = 0; codepoint <= 0xffff; codepoint++) {
+            expect(supportedCodepoint(codepoint)).toBe(
+                allRE.test(String.fromCharCode(codepoint))
+            );
+        }
+    });
+
+    it("scriptFromCodepoint() should return correct values", () => {
+        for (let codepoint = 0; codepoint <= 0xffff; codepoint++) {
+            const character = String.fromCharCode(codepoint);
+            const script = scriptFromCodepoint(codepoint);
+
+            if (cjkRE.test(character)) {
+                expect(script).toEqual('cjk');
+            } else if (hangulRE.test(character)) {
+                expect(script).toEqual('hangul');
+            } else if (brahmicRE.test(character)) {
+                expect(script).toEqual('brahmic');
+            } else {
+                expect(script).toBe(null);
+                expect(supportedCodepoint(codepoint)).toBe(false);
+            }
+        }
+    });
 });