Unicode accents (#992)

* Unicode accents * Lexer now looks for combining diacritical marks and adds them to the same character * Parser's `parseSymbol` now recognizes both combined and uncombined forms of Unicode accents, and builds accent objects just like the accent functions * Added CJK support to math mode (not just text mode) * Add invalid combining character test * Add MathML test * Add weak support for other Latin-1 characters This maintains backwards compatibility, but it uses the wrong font. There's a TODO to fix this later. Also refactor symbol code to use for..of * Update Unicode screenshot * Remove dot from accented i and j (in math mode) Also add dotless Unicode characters to support some accented i's and j's * Fix \imath, \jmath, \pounds, and more tests * Switch from for..of to .split().forEach() Save around 800 bytes in minified code * Fix split * normalize() detection * Convert back to vanilla for loops * Fix merge * Move normalize dependency to unicodeMake.js * Make unicodeSymbols into a lookup table instead of macros This is important for multi-accented characters. * Add comments about when to run * Move symbols definition into unicodeMake/Symbols.js * Remove CJK support in text mode * Add missing semicolon * Refactor unicodeAccents to its own file * Dotless i/j support in text mode * Remove excess character mappings * Fix Åå in math mode (still via Times) * Update to support #1030 * Add accented Greek letter support (for supported Greek symbols) * Update screenshot * remove Æ, æ, Ø, ø, and ß from math mode test
2025-10-05 03:08:40 +00:00 · 2017-12-28 22:32:45 -08:00
parent d822f04b9b
commit 484d44ee70
17 changed files with 628 additions and 104 deletions
--- a/test/snapshots/mathml-spec.js.snap
+++ b/test/snapshots/mathml-spec.js.snap
@@ -1,5 +1,62 @@
 // Jest Snapshot v1, https://goo.gl/fbAQLP

+exports[`A MathML builder accents turn into <mover accent="true"> in MathML 1`] = `
+
+<math>
+  <semantics>
+    <mrow>
+      <mover accent="true">
+        <mi>
+          u
+        </mi>
+        <mo>
+          ¨
+        </mo>
+      </mover>
+      <mi>
+        b
+      </mi>
+      <mi>
+        e
+      </mi>
+      <mi>
+        r
+      </mi>
+      <mi>
+        f
+      </mi>
+      <mi>
+        i
+      </mi>
+      <mi>
+        a
+      </mi>
+      <mi>
+        n
+      </mi>
+      <mi>
+        c
+      </mi>
+      <mover accent="true">
+        <mi>
+          e
+        </mi>
+        <mo>
+          ´
+        </mo>
+      </mover>
+      <mi>
+        e
+      </mi>
+    </mrow>
+    <annotation encoding="application/x-tex">
+      über fiancée
+    </annotation>
+  </semantics>
+</math>
+
+`;
+
 exports[`A MathML builder should generate <mphantom> nodes for \\phantom 1`] = `

 <math>
--- a/test/errors-spec.js
+++ b/test/errors-spec.js
@@ -375,3 +375,10 @@ describe("Lexer:", function() {
    });

 });
+
+describe("Unicode accents", function() {
+    it("should return error for invalid combining characters", function() {
+        expect("A\u0328").toFailWithParseError(
+            "Unknown accent ' ̨' at position 1: Ą̲̲");
+    });
+});
--- a/test/katex-spec.js
+++ b/test/katex-spec.js
@@ -2757,15 +2757,64 @@ describe("A parser taking String objects", function() {
    });
 });

+describe("Unicode accents", function() {
+    it("should parse Latin-1 letters in math mode", function() {
+        // TODO(edemaine): Unsupported Latin-1 letters in math: ÅåÇÐÞçðþ
+        expect("ÀÁÂÃÄÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäèéêëìíîïñòóôõöùúûüýÿ")
+        .toParseLike(
+            "\\grave A\\acute A\\hat A\\tilde A\\ddot A" +
+            "\\grave E\\acute E\\hat E\\ddot E" +
+            "\\grave I\\acute I\\hat I\\ddot I" +
+            "\\tilde N" +
+            "\\grave O\\acute O\\hat O\\tilde O\\ddot O" +
+            "\\grave U\\acute U\\hat U\\ddot U" +
+            "\\acute Y" +
+            "\\grave a\\acute a\\hat a\\tilde a\\ddot a" +
+            "\\grave e\\acute e\\hat e\\ddot e" +
+            "\\grave ı\\acute ı\\hat ı\\ddot ı" +
+            "\\tilde n" +
+            "\\grave o\\acute o\\hat o\\tilde o\\ddot o" +
+            "\\grave u\\acute u\\hat u\\ddot u" +
+            "\\acute y\\ddot y");
+    });
+
+    it("should parse Latin-1 letters in text mode", function() {
+        // TODO(edemaine): Unsupported Latin-1 letters in text: ÇÐÞçðþ
+        expect("\\text{ÀÁÂÃÄÅÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåèéêëìíîïñòóôõöùúûüýÿ}")
+        .toParseLike(
+            "\\text{\\`A\\'A\\^A\\~A\\\"A\\r A" +
+            "\\`E\\'E\\^E\\\"E" +
+            "\\`I\\'I\\^I\\\"I" +
+            "\\~N" +
+            "\\`O\\'O\\^O\\~O\\\"O" +
+            "\\`U\\'U\\^U\\\"U" +
+            "\\'Y" +
+            "\\`a\\'a\\^a\\~a\\\"a\\r a" +
+            "\\`e\\'e\\^e\\\"e" +
+            "\\`ı\\'ı\\^ı\\\"ı" +
+            "\\~n" +
+            "\\`o\\'o\\^o\\~o\\\"o" +
+            "\\`u\\'u\\^u\\\"u" +
+            "\\'y\\\"y}");
+    });
+
+    it("should parse combining characters", function() {
+        expect("A\u0301C\u0301").toParseLike("Á\\acute C");
+        expect("\\text{A\u0301C\u0301}").toParseLike("\\text{Á\\'C}");
+    });
+
+    it("should parse multi-accented characters", function() {
+        expect("ấā́ắ\\text{ấā́ắ}").toParse();
+        // Doesn't parse quite the same as
+        // "\\text{\\'{\\^a}\\'{\\=a}\\'{\\u a}}" because of the ordgroups.
+    });
+
+    it("should parse accented i's and j's", function() {
+        expect("íȷ́").toParseLike("\\acute ı\\acute ȷ");
+    });
+});
+
 describe("Unicode", function() {
-    it("should parse all lower case Greek letters", function() {
-        expect("αβγδεϵζηθϑικλμνξοπϖρϱςστυφϕχψω").toParse();
-    });
-
-    it("should parse 'ΓΔΘΞΠΣΦΨΩ'", function() {
-        expect("ΓΔΘΞΠΣΦΨΩ").toParse();
-    });
-
    it("should parse negated relations", function() {
        expect("∉∤∦≁≆≠≨≩≮≯≰≱⊀⊁⊈⊉⊊⊋⊬⊭⊮⊯⋠⋡⋦⋧⋨⋩⋬⋭⪇⪈⪉⪊⪵⪶⪹⪺⫋⫌").toParse();
    });
--- a/test/mathml-spec.js
+++ b/test/mathml-spec.js
@@ -93,4 +93,8 @@ describe("A MathML builder", function() {
        expect(getMathML(`\\boldsymbol{Ax2k\\omega\\Omega\\imath+}`))
            .toMatchSnapshot();
    });
+
+    it('accents turn into <mover accent="true"> in MathML', function() {
+        expect(getMathML("über fiancée")).toMatchSnapshot();
+    });
 });
--- a/test/screenshotter/images/Unicode-chrome.png
+++ b/test/screenshotter/images/Unicode-chrome.png
--- a/test/screenshotter/images/Unicode-firefox.png
+++ b/test/screenshotter/images/Unicode-firefox.png
--- a/test/unicode-spec.js
+++ b/test/unicode-spec.js
@@ -67,11 +67,21 @@ describe("unicode", function() {
    });

    it("should parse Latin-1 inside \\text{}", function() {
-        expect('\\text{ÀàÇçÉéÏïÖöÛû}').toParse();
+        expect('\\text{ÀÁÂÃÄÅÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåèéêëìíîïñòóôõöùúûüýÿ' +
+            'ÆÇÐØÞßæçðøþ}').toParse();
    });

    it("should parse Latin-1 outside \\text{}", function() {
-        expect('ÀàÇçÉéÏïÖöÛû').toParse();
+        expect('ÀÁÂÃÄÅÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåèéêëìíîïñòóôõöùúûüýÿ' +
+            'ÇÐÞçðþ').toParse();
+    });
+
+    it("should parse all lower case Greek letters", function() {
+        expect("αβγδεϵζηθϑικλμνξοπϖρϱςστυφϕχψω").toParse();
+    });
+
+    it("should parse math upper case Greek letters", function() {
+        expect("ΓΔΘΛΞΠΣΥΦΨΩ").toParse();
    });

    it("should parse Cyrillic inside \\text{}", function() {