Unicode accents (#992)

* Unicode accents

* Lexer now looks for combining diacritical marks and adds them to the same character
* Parser's `parseSymbol` now recognizes both combined and uncombined forms of Unicode accents, and builds accent objects just like the accent functions
* Added CJK support to math mode (not just text mode)

* Add invalid combining character test

* Add MathML test

* Add weak support for other Latin-1 characters

This maintains backwards compatibility, but it uses the wrong font.
There's a TODO to fix this later.

Also refactor symbol code to use for..of

* Update Unicode screenshot

* Remove dot from accented i and j (in math mode)

Also add dotless Unicode characters to support some accented i's and j's

* Fix \imath, \jmath, \pounds, and more tests

* Switch from for..of to .split().forEach()

Save around 800 bytes in minified code

* Fix split

* normalize() detection

* Convert back to vanilla for loops

* Fix merge

* Move normalize dependency to unicodeMake.js

* Make unicodeSymbols into a lookup table instead of macros

This is important for multi-accented characters.

* Add comments about when to run

* Move symbols definition into unicodeMake/Symbols.js

* Remove CJK support in text mode

* Add missing semicolon

* Refactor unicodeAccents to its own file

* Dotless i/j support in text mode

* Remove excess character mappings

* Fix Åå in math mode (still via Times)

* Update to support #1030

* Add accented Greek letter support (for supported Greek symbols)

* Update screenshot

* remove Æ, æ, Ø, ø, and ß from math mode test
This commit is contained in:
Erik Demaine
2017-12-28 22:32:45 -08:00
committed by Kevin Barabash
parent d822f04b9b
commit 484d44ee70
17 changed files with 628 additions and 104 deletions

View File

@@ -1,5 +1,62 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP
// Recorded MathML for the input "über fiancée" (see the annotation below):
// the accented letters ü and é are stored as <mover accent="true"> elements
// holding the base letter in <mi> and the accent mark in <mo>; every other
// letter is a plain <mi>.
exports[`A MathML builder accents turn into <mover accent="true"> in MathML 1`] = `
<math>
<semantics>
<mrow>
<mover accent="true">
<mi>
u
</mi>
<mo>
¨
</mo>
</mover>
<mi>
b
</mi>
<mi>
e
</mi>
<mi>
r
</mi>
<mi>
f
</mi>
<mi>
i
</mi>
<mi>
a
</mi>
<mi>
n
</mi>
<mi>
c
</mi>
<mover accent="true">
<mi>
e
</mi>
<mo>
´
</mo>
</mover>
<mi>
e
</mi>
</mrow>
<annotation encoding="application/x-tex">
über fiancée
</annotation>
</semantics>
</math>
`;
exports[`A MathML builder should generate <mphantom> nodes for \\phantom 1`] = `
<math>

View File

@@ -375,3 +375,10 @@ describe("Lexer:", function() {
});
});
// Combining marks that have no known accent mapping must be rejected.
describe("Unicode accents", function() {
// "A" followed by U+0328 (combining ogonek) is expected to fail with an
// "Unknown accent" parse error reported at position 1.
it("should return error for invalid combining characters", function() {
expect("A\u0328").toFailWithParseError(
"Unknown accent ' ̨' at position 1: Ą̲̲");
});
});

View File

@@ -2757,15 +2757,64 @@ describe("A parser taking String objects", function() {
});
});
// Parser tests for accented Unicode input: each accented character is
// compared against the equivalent explicit accent command applied to its
// base letter.
describe("Unicode accents", function() {
// Math mode: precomposed Latin-1 letters must parse like the math accent
// commands (\grave, \acute, \hat, \tilde, \ddot). Accented i's are compared
// against the dotless ı.
it("should parse Latin-1 letters in math mode", function() {
// TODO(edemaine): Unsupported Latin-1 letters in math: ÅåÇÐÞçðþ
expect("ÀÁÂÃÄÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäèéêëìíîïñòóôõöùúûüýÿ")
.toParseLike(
"\\grave A\\acute A\\hat A\\tilde A\\ddot A" +
"\\grave E\\acute E\\hat E\\ddot E" +
"\\grave I\\acute I\\hat I\\ddot I" +
"\\tilde N" +
"\\grave O\\acute O\\hat O\\tilde O\\ddot O" +
"\\grave U\\acute U\\hat U\\ddot U" +
"\\acute Y" +
"\\grave a\\acute a\\hat a\\tilde a\\ddot a" +
"\\grave e\\acute e\\hat e\\ddot e" +
"\\grave ı\\acute ı\\hat ı\\ddot ı" +
"\\tilde n" +
"\\grave o\\acute o\\hat o\\tilde o\\ddot o" +
"\\grave u\\acute u\\hat u\\ddot u" +
"\\acute y\\ddot y");
});
// Text mode: the same letters, plus Å and å (via \r), must parse like the
// text accent commands (\`, \', \^, \~, \", \r) inside \text{}.
it("should parse Latin-1 letters in text mode", function() {
// TODO(edemaine): Unsupported Latin-1 letters in text: ÇÐÞçðþ
expect("\\text{ÀÁÂÃÄÅÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåèéêëìíîïñòóôõöùúûüýÿ}")
.toParseLike(
"\\text{\\`A\\'A\\^A\\~A\\\"A\\r A" +
"\\`E\\'E\\^E\\\"E" +
"\\`I\\'I\\^I\\\"I" +
"\\~N" +
"\\`O\\'O\\^O\\~O\\\"O" +
"\\`U\\'U\\^U\\\"U" +
"\\'Y" +
"\\`a\\'a\\^a\\~a\\\"a\\r a" +
"\\`e\\'e\\^e\\\"e" +
"\\`ı\\'ı\\^ı\\\"ı" +
"\\~n" +
"\\`o\\'o\\^o\\~o\\\"o" +
"\\`u\\'u\\^u\\\"u" +
"\\'y\\\"y}");
});
// A base letter followed by U+0301 (combining acute) must parse like the
// precomposed letter or the explicit accent command, in both modes.
it("should parse combining characters", function() {
expect("A\u0301C\u0301").toParseLike("Á\\acute C");
expect("\\text{A\u0301C\u0301}").toParseLike("\\text{Á\\'C}");
});
// Characters carrying more than one accent are only checked for successful
// parsing; the comment inside explains why no toParseLike comparison is made.
it("should parse multi-accented characters", function() {
expect("ấā́ắ\\text{ấā́ắ}").toParse();
// Doesn't parse quite the same as
// "\\text{\\'{\\^a}\\'{\\=a}\\'{\\u a}}" because of the ordgroups.
});
// Accented i and j must parse like the accent applied to the dotless ı and ȷ.
it("should parse accented i's and j's", function() {
expect("íȷ́").toParseLike("\\acute ı\\acute ȷ");
});
});
describe("Unicode", function() {
// Only asserts that the string of lower case Greek letters (including the
// variant forms ϵ, ϑ, ϖ, ϱ, ς, ϕ) parses without error.
it("should parse all lower case Greek letters", function() {
expect("αβγδεϵζηθϑικλμνξοπϖρϱςστυφϕχψω").toParse();
});
// Only asserts that these upper case Greek letters parse without error.
it("should parse 'ΓΔΘΞΠΣΦΨΩ'", function() {
expect("ΓΔΘΞΠΣΦΨΩ").toParse();
});
// Only asserts that the listed negated relation symbols parse without error.
it("should parse negated relations", function() {
expect("∉∤∦≁≆≠≨≩≮≯≰≱⊀⊁⊈⊉⊊⊋⊬⊭⊮⊯⋠⋡⋦⋧⋨⋩⋬⋭⪇⪈⪉⪊⪵⪶⪹⪺⫋⫌").toParse();
});

View File

@@ -93,4 +93,8 @@ describe("A MathML builder", function() {
expect(getMathML(`\\boldsymbol{Ax2k\\omega\\Omega\\imath+}`))
.toMatchSnapshot();
});
// Snapshot test: the MathML generated for text containing the accented
// letters ü and é is compared against the stored snapshot.
it('accents turn into <mover accent="true"> in MathML', function() {
expect(getMathML("über fiancée")).toMatchSnapshot();
});
});

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

After

Width:  |  Height:  |  Size: 24 KiB

View File

@@ -67,11 +67,21 @@ describe("unicode", function() {
});
// Latin-1 letters must parse in text mode: first a short sample, then the
// full accented set followed by ÆÇÐØÞßæçðøþ. Only successful parsing is
// asserted.
it("should parse Latin-1 inside \\text{}", function() {
expect('\\text{ÀàÇçÉéÏïÖöÛû}').toParse();
expect('\\text{ÀÁÂÃÄÅÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåèéêëìíîïñòóôõöùúûüýÿ' +
'ÆÇÐØÞßæçðøþ}').toParse();
});
// Latin-1 letters must also parse in math mode (outside \text{}): first a
// short sample, then the full accented set followed by ÇÐÞçðþ. Æ, æ, Ø, ø,
// and ß are intentionally absent from the math-mode list. Only successful
// parsing is asserted.
it("should parse Latin-1 outside \\text{}", function() {
expect('ÀàÇçÉéÏïÖöÛû').toParse();
expect('ÀÁÂÃÄÅÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåèéêëìíîïñòóôõöùúûüýÿ' +
'ÇÐÞçðþ').toParse();
});
// Only asserts that the string of lower case Greek letters (including the
// variant forms ϵ, ϑ, ϖ, ϱ, ς, ϕ) parses without error.
it("should parse all lower case Greek letters", function() {
expect("αβγδεϵζηθϑικλμνξοπϖρϱςστυφϕχψω").toParse();
});
// Only asserts that these upper case Greek letters parse without error.
it("should parse math upper case Greek letters", function() {
expect("ΓΔΘΛΞΠΣΥΦΨΩ").toParse();
});
it("should parse Cyrillic inside \\text{}", function() {