\char character escaping and nicer MathML via \html@mathml (#1454)

* \html@mathml

Fix #1452

* Add missing file

* Implement \char (via internal \@char)

* Remove excess <mstyle> wrapper on \mathbin etc.

* Fix tests

* Add Unicode support for \copyright and \textregistered

Testing that this doesn't lead to an infinite loop thanks to \char` escaping.

* Add tests

* Use assertNodeType

* Switch from regex to lookup table, and no parseInt
This commit is contained in:
Erik Demaine
2018-07-14 08:36:23 -04:00
committed by GitHub
parent 33ef4bdc63
commit b7277049a4
10 changed files with 259 additions and 61 deletions

View File

@@ -89,6 +89,7 @@ export type AnyParseNode =
ParseNode<"genfrac"> |
ParseNode<"horizBrace"> |
ParseNode<"href"> |
ParseNode<"htmlmathml"> |
ParseNode<"infix"> |
ParseNode<"kern"> |
ParseNode<"lap"> |
@@ -258,6 +259,11 @@ export type ParseNodeTypes = {
href: string,
body: AnyParseNode[],
|},
"htmlmathml": {|
type: "htmlmathml",
html: AnyParseNode[],
mathml: AnyParseNode[],
|},
"infix": {|
type: "infix",
replaceWith: string,

View File

@@ -1,6 +1,5 @@
// @flow
/* eslint no-constant-condition:0 */
/* eslint no-console:0 */
import functions from "./functions";
import environments from "./environments";
import MacroExpander from "./MacroExpander";
@@ -1016,7 +1015,8 @@ export default class Parser {
if (this.settings.strict) {
if (!supportedCodepoint(text.charCodeAt(0))) {
this.settings.reportNonstrict("unknownSymbol",
`Unrecognized Unicode character "${text[0]}"`, nucleus);
`Unrecognized Unicode character "${text[0]}"` +
` (${text.charCodeAt(0)})`, nucleus);
} else if (this.mode === "math") {
this.settings.reportNonstrict("unicodeTextInMathMode",
`Unicode text character "${text[0]}" used in math mode`,

View File

@@ -10,6 +10,7 @@ export default functions;
import "./functions/accent";
import "./functions/accentunder";
import "./functions/arrow";
import "./functions/char";
import "./functions/color";
import "./functions/cr";
import "./functions/delimsizing";
@@ -19,6 +20,7 @@ import "./functions/font";
import "./functions/genfrac";
import "./functions/horizBrace";
import "./functions/href";
import "./functions/htmlmathml";
import "./functions/kern";
import "./functions/lap";
import "./functions/math";

30
src/functions/char.js Normal file
View File

@@ -0,0 +1,30 @@
// @flow
import defineFunction from "../defineFunction";
import ParseError from "../ParseError";
import ParseNode, {assertNodeType} from "../ParseNode";
// \@char is an internal function that takes a grouped decimal argument like
// {123} and converts it into the symbol with code point 123. It is used by
// the *macro* \char defined in macros.js.
//
// Throws a ParseError when the argument is not a number or lies outside the
// Unicode range 0..0x10FFFF.
defineFunction({
    type: "textord",
    names: ["\\@char"],
    props: {
        numArgs: 1,
        allowedInText: true,
    },
    handler({parser}, args) {
        const arg = assertNodeType(args[0], "ordgroup");
        const group = arg.value;
        // Every digit arrives as its own textord node; glue them together.
        let number = "";
        for (let i = 0; i < group.length; i++) {
            const node = assertNodeType(group[i], "textord");
            number += node.value;
        }
        // The argument is decimal by contract, so pin the radix instead of
        // letting parseInt guess from a prefix such as "0x".
        const code = parseInt(number, 10);
        if (isNaN(code)) {
            throw new ParseError(`\\@char has non-numeric argument ${number}`);
        }
        // String.fromCharCode would silently wrap these modulo 0x10000.
        if (code < 0 || code > 0x10ffff) {
            throw new ParseError(`\\@char with invalid code point ${number}`);
        }
        let text;
        if (code <= 0xffff) {
            text = String.fromCharCode(code);
        } else {
            // Astral code point: encode as a UTF-16 surrogate pair by hand
            // (avoids depending on String.fromCodePoint).
            const offset = code - 0x10000;
            text = String.fromCharCode(
                0xd800 + (offset >> 10),
                0xdc00 + (offset & 0x3ff)
            );
        }
        return new ParseNode("textord", text, parser.mode);
    },
});

View File

@@ -0,0 +1,34 @@
// @flow
import defineFunction, {ordargument} from "../defineFunction";
import buildCommon from "../buildCommon";
import ParseNode from "../ParseNode";
import * as html from "../buildHTML";
import * as mml from "../buildMathML";
// \html@mathml{<html>}{<mathml>} is an internal function taking two
// arguments: the first is rendered by the HTML builder, the second by the
// MathML builder.
defineFunction({
    type: "htmlmathml",
    names: ["\\html@mathml"],
    props: {
        numArgs: 2,
        allowedInText: true,
    },
    // Store both alternatives on the parse node, each normalized to a list
    // of nodes via ordargument.
    handler: ({parser}, args) => {
        return new ParseNode("htmlmathml", {
            type: "htmlmathml",
            html: ordargument(args[0]),
            mathml: ordargument(args[1]),
        }, parser.mode);
    },
    // Builds only the HTML alternative.
    htmlBuilder: (group, options) => {
        const elements = html.buildExpression(
            group.value.html,
            options,
            false
        );
        // makeFragment is called as a plain function, not with `new`.
        // NOTE(review): assumes it is a factory that returns the fragment.
        return buildCommon.makeFragment(elements);
    },
    // Builds only the MathML alternative.
    mathmlBuilder: (group, options) => {
        return mml.buildExpressionRow(group.value.mathml, options);
    },
});

View File

@@ -1,7 +1,7 @@
// @flow
import defineFunction, {ordargument} from "../defineFunction";
import buildCommon from "../buildCommon";
import mathMLTree from "../mathMLTree";
import domTree from "../domTree";
import ParseNode from "../ParseNode";
import * as html from "../buildHTML";
@@ -16,7 +16,7 @@ function htmlBuilder(group, options) {
function mathmlBuilder(group, options) {
const inner = mml.buildExpression(group.value.value, options);
return new mathMLTree.MathNode("mstyle", inner);
return new domTree.documentFragment(inner);
}
// Math class commands except \mathop

View File

@@ -31,6 +31,11 @@ export interface MacroContextInterface {
*/
future(): Token;
/**
* Remove and return the next unexpanded token.
*/
popToken(): Token;
/**
* Expand the next token only once (if possible), and return the resulting
* top token on the stack (without removing anything from the stack).
@@ -131,6 +136,61 @@ defineMacro("\\TextOrMath", function(context) {
}
});
// Maps a single digit character to its numeric value, for parsing numbers
// in any base up to 16. Hex letters are accepted in either case; any other
// character is absent from the table.
const digitToNumber = {};
"0123456789abcdef".split("").forEach((digit, value) => {
    digitToNumber[digit] = value;
    digitToNumber[digit.toUpperCase()] = value;
});
// The TeX primitive \char yields a literal character (catcode 12). The
// accepted spellings are (see The TeXBook, p. 43):
//   \char123  -- decimal code
//   \char'123 -- octal code
//   \char"123 -- hexadecimal code
//   \char`x   -- the code of a character that can be typed directly
//   \char`\x  -- the code of a character that must be escaped (e.g. %)
// Whatever the spelling, the macro expands to \@char{<decimal code>}, an
// internal function that produces the symbol itself.
defineMacro("\\char", function(context) {
    const first = context.popToken();

    // Backtick forms: the code comes from the character itself.
    if (first.text === "`") {
        const quoted = context.popToken().text;
        if (quoted[0] === "\\") {
            // Escaped form: skip the backslash of the control sequence.
            return `\\@char{${quoted.charCodeAt(1)}}`;
        }
        if (quoted === "EOF") {
            throw new ParseError("\\char` missing argument");
        }
        return `\\@char{${quoted.charCodeAt(0)}}`;
    }

    // Numeric forms: an optional prefix picks the base, default decimal.
    let base = 10;
    let token = first;
    if (first.text === "'") {
        base = 8;
        token = context.popToken();
    } else if (first.text === '"') {
        base = 16;
        token = context.popToken();
    }

    // At least one valid digit is required.
    let number = digitToNumber[token.text];
    if (number == null || number >= base) {
        throw new ParseError(`Invalid base-${base} digit ${token.text}`);
    }

    // Consume further digits for as long as the lookahead is valid in this
    // base; the first non-digit token is left on the stack.
    let digit = digitToNumber[context.future().text];
    while (digit != null && digit < base) {
        number = number * base + digit;
        context.popToken();
        digit = digitToNumber[context.future().text];
    }
    return `\\@char{${number}}`;
});
// Basic support for macro definitions:
// \def\macro{expansion}
// \def\macro#1{expansion}
@@ -251,15 +311,17 @@ defineMacro("\\rq", "'");
defineMacro("\\aa", "\\r a");
defineMacro("\\AA", "\\r A");
// Copyright (C) and registered (R) symbols. Use raw symbol in MathML.
// \DeclareTextCommandDefault{\textcopyright}{\textcircled{c}}
// \DeclareTextCommandDefault{\textregistered}{\textcircled{%
// \check@mathfonts\fontsize\sf@size\z@\math@fontsfalse\selectfont R}}
// \DeclareRobustCommand{\copyright}{%
// \ifmmode{\nfss@text{\textcopyright}}\else\textcopyright\fi}
defineMacro("\\textcopyright", "\\textcircled{c}");
defineMacro("\\textcopyright", "\\html@mathml{\\textcircled{c}}{\\char`©}");
defineMacro("\\copyright",
"\\TextOrMath{\\textcopyright}{\\text{\\textcopyright}}");
defineMacro("\\textregistered", "\\textcircled{\\scriptsize R}");
defineMacro("\\textregistered",
"\\html@mathml{\\textcircled{\\scriptsize R}}{\\char`®}");
// Unicode double-struck letters
defineMacro("\u2102", "\\mathbb{C}");
@@ -300,21 +362,32 @@ defineMacro("\\clap", "\\mathclap{\\textrm{#1}}");
// \DeclareRobustCommand
// \notin{\mathrel{\m@th\mathpalette\c@ncel\in}}
// \def\c@ncel#1#2{\m@th\ooalign{$\hfil#1\mkern1mu/\hfil$\crcr$#1#2$}}
defineMacro("\\neq", "\\not=");
defineMacro("\\neq", "\\html@mathml{\\not=}{\\mathrel{\\char`≠}}");
defineMacro("\\ne", "\\neq");
defineMacro("\u2260", "\\neq");
defineMacro("\\notin", "\\mathrel{{\\in}\\mathllap{/\\mskip1mu}}");
defineMacro("\\notin", "\\html@mathml{\\mathrel{{\\in}\\mathllap{/\\mskip1mu}}}"
+ "{\\mathrel{\\char`∉}}");
defineMacro("\u2209", "\\notin");
// Unicode stacked relations
defineMacro("\u2258",
"\\mathrel{=\\kern{-1em}\\raisebox{0.4em}{$\\scriptsize\\frown$}}");
defineMacro("\u2259", "\\stackrel{\\tiny\\wedge}{=}");
defineMacro("\u225A", "\\stackrel{\\tiny\\vee}{=}");
defineMacro("\u225B", "\\stackrel{\\scriptsize\\star}{=}");
defineMacro("\u225D", "\\stackrel{\\tiny\\mathrm{def}}{=}");
defineMacro("\u225E", "\\stackrel{\\tiny\\mathrm{m}}{=}");
defineMacro("\u225F", "\\stackrel{\\tiny?}{=}");
defineMacro("\u2258", "\\html@mathml{" +
"\\mathrel{=\\kern{-1em}\\raisebox{0.4em}{$\\scriptsize\\frown$}}" +
"}{\\mathrel{\\char`\u2258}}");
// U+2259 (ESTIMATES): the MathML branch must emit U+2259 itself, not the
// neighbouring U+2258 (CORRESPONDS TO).
defineMacro("\u2259",
    "\\html@mathml{\\stackrel{\\tiny\\wedge}{=}}{\\mathrel{\\char`\u2259}}");
defineMacro("\u225A",
"\\html@mathml{\\stackrel{\\tiny\\vee}{=}}{\\mathrel{\\char`\u225A}}");
defineMacro("\u225B",
"\\html@mathml{\\stackrel{\\scriptsize\\star}{=}}" +
"{\\mathrel{\\char`\u225B}}");
defineMacro("\u225D",
"\\html@mathml{\\stackrel{\\tiny\\mathrm{def}}{=}}" +
"{\\mathrel{\\char`\u225D}}");
defineMacro("\u225E",
"\\html@mathml{\\stackrel{\\tiny\\mathrm{m}}{=}}" +
"{\\mathrel{\\char`\u225E}}");
defineMacro("\u225F",
"\\html@mathml{\\stackrel{\\tiny?}{=}}{\\mathrel{\\char`\u225F}}");
// Misc Unicode
defineMacro("\u27C2", "\\perp");
@@ -324,6 +397,9 @@ defineMacro("\u231C", "\\ulcorner");
defineMacro("\u231D", "\\urcorner");
defineMacro("\u231E", "\\llcorner");
defineMacro("\u231F", "\\lrcorner");
defineMacro("\u00A9", "\\copyright");
defineMacro("\u00AE", "\\textregistered");
defineMacro("\uFE0F", "\\textregistered");
//////////////////////////////////////////////////////////////////////
// LaTeX_2ε
@@ -593,7 +669,9 @@ defineMacro("\\\\", "\\newline");
// TODO: Doesn't normally work in math mode because \@ fails. KaTeX doesn't
// support \@ yet, so that's omitted, and we add \text so that the result
// doesn't look funny in math mode.
defineMacro("\\TeX", "\\textrm{T\\kern-.1667em\\raisebox{-.5ex}{E}\\kern-.125emX}");
defineMacro("\\TeX", "\\textrm{\\html@mathml{" +
"T\\kern-.1667em\\raisebox{-.5ex}{E}\\kern-.125emX" +
"}{TeX}}");
// \DeclareRobustCommand{\LaTeX}{L\kern-.36em%
// {\sbox\z@ T%
@@ -611,14 +689,14 @@ defineMacro("\\TeX", "\\textrm{T\\kern-.1667em\\raisebox{-.5ex}{E}\\kern-.125emX
// which is size3, which has a scale factor of 0.7 (see Options.js).
const latexRaiseA = fontMetricsData['Main-Regular']["T".charCodeAt(0)][1] -
0.7 * fontMetricsData['Main-Regular']["A".charCodeAt(0)][1] + "em";
defineMacro("\\LaTeX",
`\\textrm{L\\kern-.36em\\raisebox{${latexRaiseA}}{\\scriptsize A}` +
"\\kern-.15em\\TeX}");
defineMacro("\\LaTeX", "\\textrm{\\html@mathml{" +
`L\\kern-.36em\\raisebox{${latexRaiseA}}{\\scriptsize A}` +
"\\kern-.15em\\TeX}{LaTeX}}");
// New KaTeX logo based on tweaking LaTeX logo
defineMacro("\\KaTeX",
`\\textrm{K\\kern-.17em\\raisebox{${latexRaiseA}}{\\scriptsize A}` +
"\\kern-.15em\\TeX}");
defineMacro("\\KaTeX", "\\textrm{\\html@mathml{" +
`K\\kern-.17em\\raisebox{${latexRaiseA}}{\\scriptsize A}` +
"\\kern-.15em\\TeX}{KaTeX}}");
// \DeclareRobustCommand\hspace{\@ifstar\@hspacer\@hspace}
// \def\@hspace#1{\hskip #1\relax}
@@ -705,9 +783,6 @@ defineMacro("\\approxcoloncolon",
"\\mathrel{\\approx\\mathrel{\\mkern-1.2mu}\\dblcolon}");
// Present in newtxmath, pxfonts and txfonts
// TODO: The unicode character U+220C ∌ should be added to the font, and this
// macro turned into a proper defineSymbol in symbols.js. That way, the
// MathML result will be much cleaner.
defineMacro("\\notni", "\\not\\ni");
defineMacro("\\notni", "\\html@mathml{\\not\\ni}{\\mathrel{\\char`\u220C}}");
defineMacro("\\limsup", "\\DOTSB\\mathop{\\operatorname{lim\\,sup}}\\limits");
defineMacro("\\liminf", "\\DOTSB\\mathop{\\operatorname{lim\\,inf}}\\limits");

View File

@@ -1,5 +1,34 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP
exports[`A MathML builder \\html@mathml makes clean symbols 1`] = `
<math>
<semantics>
<mrow>
<mtext>
©
</mtext>
<mi mathvariant="normal">
</mi>
<mi mathvariant="normal">
</mi>
<mi mathvariant="normal">
</mi>
<mtext>
KaTeX
</mtext>
</mrow>
<annotation encoding="application/x-tex">
\\copyright\\neq\\notin≘\\KaTeX
</annotation>
</semantics>
</math>
`;
exports[`A MathML builder \\text fonts become mathvariant 1`] = `
<math>
@@ -382,34 +411,32 @@ exports[`A MathML builder should render boldsymbol with the correct mathvariants
<math>
<semantics>
<mrow>
<mstyle>
<mrow>
<mi mathvariant="bold-italic">
A
</mi>
<mi mathvariant="bold-italic">
x
</mi>
<mn mathvariant="bold-italic">
2
</mn>
<mi mathvariant="bold-italic">
k
</mi>
<mi mathvariant="bold-italic">
ω
</mi>
<mi mathvariant="bold-italic">
Ω
</mi>
<mi mathvariant="bold-italic">
ı
</mi>
<mo mathvariant="bold-italic">
+
</mo>
</mrow>
</mstyle>
<mrow>
<mi mathvariant="bold-italic">
A
</mi>
<mi mathvariant="bold-italic">
x
</mi>
<mn mathvariant="bold-italic">
2
</mn>
<mi mathvariant="bold-italic">
k
</mi>
<mi mathvariant="bold-italic">
ω
</mi>
<mi mathvariant="bold-italic">
Ω
</mi>
<mi mathvariant="bold-italic">
ı
</mi>
<mo mathvariant="bold-italic">
+
</mo>
</mrow>
</mrow>
<annotation encoding="application/x-tex">
\\boldsymbol{Ax2k\\omega\\Omega\\imath+}

View File

@@ -93,7 +93,10 @@ describe("A rel parser", function() {
expect(parse).toBeTruthy();
for (let i = 0; i < parse.length; i++) {
const group = parse[i];
let group = parse[i];
if (group.type === "htmlmathml") {
group = group.value.html[0];
}
expect(group.type).toEqual("rel");
}
});
@@ -1503,12 +1506,12 @@ describe("A font parser", function() {
expect(nestedParse.value.font).toEqual("mathbb");
expect(nestedParse.value.type).toEqual("font");
expect(nestedParse.value.body.value.length).toEqual(4);
const bbBody = nestedParse.value.body.value;
expect(bbBody.length).toEqual(3);
expect(bbBody[0].type).toEqual("mathord");
expect(bbBody[3].type).toEqual("font");
expect(bbBody[3].value.font).toEqual("mathrm");
expect(bbBody[3].value.type).toEqual("font");
expect(bbBody[2].type).toEqual("font");
expect(bbBody[2].value.font).toEqual("mathrm");
expect(bbBody[2].value.type).toEqual("font");
});
it("should work with \\textcolor", function() {
@@ -2802,6 +2805,20 @@ describe("A macro expander", function() {
{"\\mode": "\\TextOrMath{t}{m}"});
});
it("\\char produces literal characters", () => {
    // Every spelling of the same code must parse to the same result.
    expect("\\char`a").toParseLike("\\char`\\a");
    expect("\\char`\\%").toParseLike("\\char37");
    expect("\\char`\\%").toParseLike("\\char'45");
    expect("\\char`\\%").toParseLike('\\char"25');
    // Missing argument after \char or after a base/quote prefix.
    expect("\\char").toNotParse();
    expect("\\char`").toNotParse();
    expect("\\char'").toNotParse();
    expect('\\char"').toNotParse();
    // Digits that are invalid in the selected base.
    expect("\\char'a").toNotParse();
    expect('\\char"g').toNotParse();
});
// TODO(edemaine): This doesn't work yet. Parses like `\text text`,
// which doesn't treat all four letters as an argument.
//it("\\TextOrMath should work in a macro passed to \\text", function() {
@@ -3070,7 +3087,9 @@ describe("Unicode", function() {
});
it("should parse symbols", function() {
expect("£¥ðℂℍℑℓℕ℘ℙℚℜℝℤℲℵℶℷℸ⅁∀∁∂∃∇∞∠∡∢♠♡♢♣♭♮♯✓°¬‼⋮\u00b7").toParse(strictSettings);
expect("ð").toParse(); // warns about lacking character metrics
expect("£¥ℂℍℑℓℕ℘ℙℚℜℝℤℲℵℶℷℸ⅁∀∁∂∃∇∞∠∡∢♠♡♢♣♭♮♯✓°¬‼⋮\u00B7\u00A9").toBuild(strictSettings);
expect("\\text{£¥\u00A9\u00AE\uFE0F}").toBuild(strictSettings);
});
it("should build Greek capital letters", function() {

View File

@@ -128,4 +128,9 @@ describe("A MathML builder", function() {
"\\texttt{tt\\textit{italic\\textbf{bold italic}}\\textbf{bold}}}"))
.toMatchSnapshot();
});
it("\\html@mathml makes clean symbols", () => {
    // Macros built on \html@mathml; the snapshot records their MathML.
    const tex = "\\copyright\\neq\\notin\u2258\\KaTeX";
    const markup = getMathML(tex);
    expect(markup).toMatchSnapshot();
});
});