From 7fe6af2a82b508b0e7468043c19744644b59593c Mon Sep 17 00:00:00 2001 From: David Flanagan Date: Fri, 12 Jan 2018 16:14:31 -0800 Subject: [PATCH] Add basic support for Indic scripts in addition to CJK. (#1060) This patch just makes KaTeX recognize Unicode codepoints in the range \u0900-\u109f so that those South and Southeast Asian scripts do not get automatically rejected. The patch also generalizes the way that Unicode blocks are handled to make it easier to add support for new scripts in the future. src/unicodeRegexes.js is replaced with the new file src/unicodeScripts.js --- src/Parser.js | 5 ++- src/domTree.js | 25 +++++------ src/fontMetrics.js | 19 +++++++-- src/unicodeRegexes.js | 13 ------ src/unicodeScripts.js | 98 +++++++++++++++++++++++++++++++++++++++++++ test/unicode-spec.js | 43 +++++++++++++++++++ 6 files changed, 170 insertions(+), 33 deletions(-) delete mode 100644 src/unicodeRegexes.js create mode 100644 src/unicodeScripts.js diff --git a/src/Parser.js b/src/Parser.js index d9da5c5b..7649128d 100644 --- a/src/Parser.js +++ b/src/Parser.js @@ -7,7 +7,7 @@ import MacroExpander from "./MacroExpander"; import symbols from "./symbols"; import utils from "./utils"; import { validUnit } from "./units"; -import { cjkRegex } from "./unicodeRegexes"; +import { supportedCodepoint } from "./unicodeScripts"; import unicodeAccents from "./unicodeAccents"; import unicodeSymbols from "./unicodeSymbols"; import ParseNode from "./ParseNode"; @@ -1072,7 +1072,8 @@ export default class Parser { if (symbols[this.mode][text]) { symbol = new ParseNode(symbols[this.mode][text].group, text, this.mode, nucleus); - } else if (this.mode === "text" && cjkRegex.test(text)) { + } else if (this.mode === "text" && + supportedCodepoint(text.charCodeAt(0))) { symbol = new ParseNode("textord", text, this.mode, nucleus); } else { return null; // EOF, ^, _, {, }, etc. 
diff --git a/src/domTree.js b/src/domTree.js index 9384a9be..7ff92113 100644 --- a/src/domTree.js +++ b/src/domTree.js @@ -8,7 +8,7 @@ * * Similar functions for working with MathML nodes exist in mathMLTree.js. */ -import {cjkRegex, hangulRegex} from "./unicodeRegexes"; +import { scriptFromCodepoint } from "./unicodeScripts"; import utils from "./utils"; import svgGeometry from "./svgGeometry"; import type Options from "./Options"; @@ -408,19 +408,16 @@ class symbolNode implements CombinableDomNode { this.style = style || {}; this.maxFontSize = 0; - // Mark CJK characters with specific classes so that we can specify which - // fonts to use. This allows us to render these characters with a serif - // font in situations where the browser would either default to a sans serif - // or render a placeholder character. - if (cjkRegex.test(this.value)) { - // I couldn't find any fonts that contained Hangul as well as all of - // the other characters we wanted to test there for it gets its own - // CSS class. - if (hangulRegex.test(this.value)) { - this.classes.push('hangul_fallback'); - } else { - this.classes.push('cjk_fallback'); - } + // Mark text from non-Latin scripts with specific classes so that we + // can specify which fonts to use. This allows us to render these + // characters with a serif font in situations where the browser would + // either default to a sans serif or render a placeholder character. + // We use CSS class names like cjk_fallback, hangul_fallback and + // brahmic_fallback. 
See ./unicodeScripts.js for the set of possible + // script names + const script = scriptFromCodepoint(this.value.charCodeAt(0)); + if (script) { + this.classes.push(script + "_fallback"); } if (/[îïíì]/.test(this.value)) { // add ī when we add Extended Latin diff --git a/src/fontMetrics.js b/src/fontMetrics.js index ba33d79d..5c72a676 100644 --- a/src/fontMetrics.js +++ b/src/fontMetrics.js @@ -1,5 +1,5 @@ // @flow -import { cjkRegex } from "./unicodeRegexes"; +import { supportedCodepoint } from "./unicodeScripts"; /** * This file contains metrics regarding fonts and individual symbols. The sigma @@ -198,10 +198,21 @@ const getCharacterMetrics = function( let ch = character.charCodeAt(0); if (character[0] in extraCharacterMap) { ch = extraCharacterMap[character[0]].charCodeAt(0); - } else if (cjkRegex.test(character[0])) { - ch = 'M'.charCodeAt(0); } - const metrics = metricMap[font]['' + ch]; + let metrics = metricMap[font][ch]; + + if (!metrics) { + // We don't typically have font metrics for Asian scripts. + // So if the character is in a script we support but we + // don't have metrics for it, just use the metrics for + // the Latin capital letter M. This is close enough because + // we (currently) only care about the height of the glyph + // not its width. + if (supportedCodepoint(ch)) { + metrics = metricMap[font][77]; // 77 is the charcode for 'M' + } + } + if (metrics) { return { depth: metrics[0], diff --git a/src/unicodeRegexes.js b/src/unicodeRegexes.js deleted file mode 100644 index a7ae9b47..00000000 --- a/src/unicodeRegexes.js +++ /dev/null @@ -1,13 +0,0 @@ -// @flow -export const hangulRegex = /[\uAC00-\uD7AF]/; - -// This regex combines -// - CJK symbols and punctuation: [\u3000-\u303F] -// - Hiragana: [\u3040-\u309F] -// - Katakana: [\u30A0-\u30FF] -// - CJK ideograms: [\u4E00-\u9FAF] -// - Hangul syllables: [\uAC00-\uD7AF] -// - Fullwidth punctuation: [\uFF00-\uFF60] -// Notably missing are halfwidth Katakana and Romanji glyphs. 
-export const cjkRegex = - /[\u3000-\u30FF\u4E00-\u9FAF\uAC00-\uD7AF\uFF00-\uFF60]/; diff --git a/src/unicodeScripts.js b/src/unicodeScripts.js new file mode 100644 index 00000000..9b5c287d --- /dev/null +++ b/src/unicodeScripts.js @@ -0,0 +1,98 @@ +// @flow + +/* + * This file defines the Unicode scripts and script families that we + * support. To add new scripts or families, just add a new entry to the + * scriptData array below. Adding scripts to the scriptData array allows + * characters from that script to appear in \text{} environments. + */ + +/** + * Each script or script family has a name and an array of blocks. + * Each block is an array of two numbers which specify the start and + * end points (inclusive) of a block of Unicode codepoints. + */ +type Script = { + name: string; + blocks: Array<Array<number>>; +}; + +/** + * Unicode block data for the families of scripts we support. + */ +const scriptData: Array