From b7277049a4f9cf1b6aaeb4b73d2f1340bf1b7d37 Mon Sep 17 00:00:00 2001 From: Erik Demaine Date: Sat, 14 Jul 2018 08:36:23 -0400 Subject: [PATCH] \char character escaping and nicer MathML via \html@mathml (#1454) * \html@mathml Fix #1452 * Add missing file * Implement \char (via internal \@char) * Remove excess wrapper on \mathbin etc. * Fix tests * Add Unicode support for \copyright and \textregistered Testing that this doesn't lead to an infinite loop thanks to \char` escaping. * Add tests * Use assertNodeType * Switch from regex to lookup table, and no parseInt --- src/ParseNode.js | 6 ++ src/Parser.js | 4 +- src/functions.js | 2 + src/functions/char.js | 30 ++++++ src/functions/htmlmathml.js | 34 +++++++ src/functions/mclass.js | 4 +- src/macros.js | 121 ++++++++++++++++++++----- test/__snapshots__/mathml-spec.js.snap | 83 +++++++++++------ test/katex-spec.js | 31 +++++-- test/mathml-spec.js | 5 + 10 files changed, 259 insertions(+), 61 deletions(-) create mode 100644 src/functions/char.js create mode 100644 src/functions/htmlmathml.js diff --git a/src/ParseNode.js b/src/ParseNode.js index 8d71cf23..4778c1c1 100644 --- a/src/ParseNode.js +++ b/src/ParseNode.js @@ -89,6 +89,7 @@ export type AnyParseNode = ParseNode<"genfrac"> | ParseNode<"horizBrace"> | ParseNode<"href"> | + ParseNode<"htmlmathml"> | ParseNode<"infix"> | ParseNode<"kern"> | ParseNode<"lap"> | @@ -258,6 +259,11 @@ export type ParseNodeTypes = { href: string, body: AnyParseNode[], |}, + "htmlmathml": {| + type: "htmlmathml", + html: AnyParseNode[], + mathml: AnyParseNode[], + |}, "infix": {| type: "infix", replaceWith: string, diff --git a/src/Parser.js b/src/Parser.js index 6905ffd9..1dc96a4c 100644 --- a/src/Parser.js +++ b/src/Parser.js @@ -1,6 +1,5 @@ // @flow /* eslint no-constant-condition:0 */ -/* eslint no-console:0 */ import functions from "./functions"; import environments from "./environments"; import MacroExpander from "./MacroExpander"; @@ -1016,7 +1015,8 @@ export default class Parser 
{ if (this.settings.strict) { if (!supportedCodepoint(text.charCodeAt(0))) { this.settings.reportNonstrict("unknownSymbol", - `Unrecognized Unicode character "${text[0]}"`, nucleus); + `Unrecognized Unicode character "${text[0]}"` + + ` (${text.charCodeAt(0)})`, nucleus); } else if (this.mode === "math") { this.settings.reportNonstrict("unicodeTextInMathMode", `Unicode text character "${text[0]}" used in math mode`, diff --git a/src/functions.js b/src/functions.js index 0583238e..07af852b 100644 --- a/src/functions.js +++ b/src/functions.js @@ -10,6 +10,7 @@ export default functions; import "./functions/accent"; import "./functions/accentunder"; import "./functions/arrow"; +import "./functions/char"; import "./functions/color"; import "./functions/cr"; import "./functions/delimsizing"; @@ -19,6 +20,7 @@ import "./functions/font"; import "./functions/genfrac"; import "./functions/horizBrace"; import "./functions/href"; +import "./functions/htmlmathml"; import "./functions/kern"; import "./functions/lap"; import "./functions/math"; diff --git a/src/functions/char.js b/src/functions/char.js new file mode 100644 index 00000000..b91f0ee8 --- /dev/null +++ b/src/functions/char.js @@ -0,0 +1,30 @@ +// @flow +import defineFunction from "../defineFunction"; +import ParseError from "../ParseError"; +import ParseNode, {assertNodeType} from "../ParseNode"; + +// \@char is an internal function that takes a grouped decimal argument like +// {123} and converts into symbol with code 123. It is used by the *macro* +// \char defined in macros.js. 
+defineFunction({ + type: "textord", + names: ["\\@char"], + props: { + numArgs: 1, + allowedInText: true, + }, + handler({parser}, args) { + const arg = assertNodeType(args[0], "ordgroup"); + const group = arg.value; + let number = ""; + for (let i = 0; i < group.length; i++) { + const node = assertNodeType(group[i], "textord"); + number += node.value; + } + const code = parseInt(number, 10); + if (isNaN(code)) { + throw new ParseError(`\\@char has non-numeric argument ${number}`); + } + return new ParseNode("textord", String.fromCharCode(code), parser.mode); + }, +}); diff --git a/src/functions/htmlmathml.js b/src/functions/htmlmathml.js new file mode 100644 index 00000000..cd65b134 --- /dev/null +++ b/src/functions/htmlmathml.js @@ -0,0 +1,34 @@ +// @flow +import defineFunction, {ordargument} from "../defineFunction"; +import buildCommon from "../buildCommon"; +import ParseNode from "../ParseNode"; + +import * as html from "../buildHTML"; +import * as mml from "../buildMathML"; + +defineFunction({ + type: "htmlmathml", + names: ["\\html@mathml"], + props: { + numArgs: 2, + allowedInText: true, + }, + handler: ({parser}, args) => { + return new ParseNode("htmlmathml", { + type: "htmlmathml", + html: ordargument(args[0]), + mathml: ordargument(args[1]), + }, parser.mode); + }, + htmlBuilder: (group, options) => { + const elements = html.buildExpression( + group.value.html, + options, + false + ); + return new buildCommon.makeFragment(elements); + }, + mathmlBuilder: (group, options) => { + return mml.buildExpressionRow(group.value.mathml, options); + }, +}); diff --git a/src/functions/mclass.js b/src/functions/mclass.js index 9c6bd299..0aeb62ea 100644 --- a/src/functions/mclass.js +++ b/src/functions/mclass.js @@ -1,7 +1,7 @@ // @flow import defineFunction, {ordargument} from "../defineFunction"; import buildCommon from "../buildCommon"; -import mathMLTree from "../mathMLTree"; +import domTree from "../domTree"; import ParseNode from "../ParseNode"; import * as html 
from "../buildHTML"; @@ -16,7 +16,7 @@ function htmlBuilder(group, options) { function mathmlBuilder(group, options) { const inner = mml.buildExpression(group.value.value, options); - return new mathMLTree.MathNode("mstyle", inner); + return new domTree.documentFragment(inner); } // Math class commands except \mathop diff --git a/src/macros.js b/src/macros.js index 9c663967..8efcdd76 100644 --- a/src/macros.js +++ b/src/macros.js @@ -31,6 +31,11 @@ export interface MacroContextInterface { */ future(): Token; + /** + * Remove and return the next unexpanded token. + */ + popToken(): Token; + /** * Expand the next token only once (if possible), and return the resulting * top token on the stack (without removing anything from the stack). @@ -131,6 +136,61 @@ defineMacro("\\TextOrMath", function(context) { } }); +// Lookup table for parsing numbers in base 8 through 16 +const digitToNumber = { + "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, + "9": 9, "a": 10, "A": 10, "b": 11, "B": 11, "c": 12, "C": 12, + "d": 13, "D": 13, "e": 14, "E": 14, "f": 15, "F": 15, +}; + +// TeX \char makes a literal character (catcode 12) using the following forms: +// (see The TeXBook, p. 43) +// \char123 -- decimal +// \char'123 -- octal +// \char"123 -- hex +// \char`x -- character that can be written (i.e. isn't active) +// \char`\x -- character that cannot be written (e.g. %) +// These all refer to characters from the font, so we turn them into special +// calls to a function \@char dealt with in the Parser. 
+defineMacro("\\char", function(context) { + let token = context.popToken(); + let base; + let number = ''; + if (token.text === "'") { + base = 8; + token = context.popToken(); + } else if (token.text === '"') { + base = 16; + token = context.popToken(); + } else if (token.text === "`") { + token = context.popToken(); + if (token.text[0] === "\\") { + number = token.text.charCodeAt(1); + } else if (token.text === "EOF") { + throw new ParseError("\\char` missing argument"); + } else { + number = token.text.charCodeAt(0); + } + } else { + base = 10; + } + if (base) { + // Parse a number in the given base, starting with first `token`. + number = digitToNumber[token.text]; + if (number == null || number >= base) { + throw new ParseError(`Invalid base-${base} digit ${token.text}`); + } + let digit; + while ((digit = digitToNumber[context.future().text]) != null && + digit < base) { + number *= base; + number += digit; + context.popToken(); + } + } + return `\\@char{${number}}`; +}); + // Basic support for macro definitions: // \def\macro{expansion} // \def\macro#1{expansion} @@ -251,15 +311,17 @@ defineMacro("\\rq", "'"); defineMacro("\\aa", "\\r a"); defineMacro("\\AA", "\\r A"); +// Copyright (C) and registered (R) symbols. Use raw symbol in MathML. 
// \DeclareTextCommandDefault{\textcopyright}{\textcircled{c}} // \DeclareTextCommandDefault{\textregistered}{\textcircled{% // \check@mathfonts\fontsize\sf@size\z@\math@fontsfalse\selectfont R}} // \DeclareRobustCommand{\copyright}{% // \ifmmode{\nfss@text{\textcopyright}}\else\textcopyright\fi} -defineMacro("\\textcopyright", "\\textcircled{c}"); +defineMacro("\\textcopyright", "\\html@mathml{\\textcircled{c}}{\\char`©}"); defineMacro("\\copyright", "\\TextOrMath{\\textcopyright}{\\text{\\textcopyright}}"); -defineMacro("\\textregistered", "\\textcircled{\\scriptsize R}"); +defineMacro("\\textregistered", + "\\html@mathml{\\textcircled{\\scriptsize R}}{\\char`®}"); // Unicode double-struck letters defineMacro("\u2102", "\\mathbb{C}"); @@ -300,21 +362,32 @@ defineMacro("\\clap", "\\mathclap{\\textrm{#1}}"); // \DeclareRobustCommand // \notin{\mathrel{\m@th\mathpalette\c@ncel\in}} // \def\c@ncel#1#2{\m@th\ooalign{$\hfil#1\mkern1mu/\hfil$\crcr$#1#2$}} -defineMacro("\\neq", "\\not="); +defineMacro("\\neq", "\\html@mathml{\\not=}{\\mathrel{\\char`≠}}"); defineMacro("\\ne", "\\neq"); defineMacro("\u2260", "\\neq"); -defineMacro("\\notin", "\\mathrel{{\\in}\\mathllap{/\\mskip1mu}}"); +defineMacro("\\notin", "\\html@mathml{\\mathrel{{\\in}\\mathllap{/\\mskip1mu}}}" + + "{\\mathrel{\\char`∉}}"); defineMacro("\u2209", "\\notin"); // Unicode stacked relations -defineMacro("\u2258", - "\\mathrel{=\\kern{-1em}\\raisebox{0.4em}{$\\scriptsize\\frown$}}"); -defineMacro("\u2259", "\\stackrel{\\tiny\\wedge}{=}"); -defineMacro("\u225A", "\\stackrel{\\tiny\\vee}{=}"); -defineMacro("\u225B", "\\stackrel{\\scriptsize\\star}{=}"); -defineMacro("\u225D", "\\stackrel{\\tiny\\mathrm{def}}{=}"); -defineMacro("\u225E", "\\stackrel{\\tiny\\mathrm{m}}{=}"); -defineMacro("\u225F", "\\stackrel{\\tiny?}{=}"); +defineMacro("\u2258", "\\html@mathml{" + + "\\mathrel{=\\kern{-1em}\\raisebox{0.4em}{$\\scriptsize\\frown$}}" + + "}{\\mathrel{\\char`\u2258}}"); +defineMacro("\u2259", + 
"\\html@mathml{\\stackrel{\\tiny\\wedge}{=}}{\\mathrel{\\char`\u2259}}"); +defineMacro("\u225A", + "\\html@mathml{\\stackrel{\\tiny\\vee}{=}}{\\mathrel{\\char`\u225A}}"); +defineMacro("\u225B", + "\\html@mathml{\\stackrel{\\scriptsize\\star}{=}}" + + "{\\mathrel{\\char`\u225B}}"); +defineMacro("\u225D", + "\\html@mathml{\\stackrel{\\tiny\\mathrm{def}}{=}}" + + "{\\mathrel{\\char`\u225D}}"); +defineMacro("\u225E", + "\\html@mathml{\\stackrel{\\tiny\\mathrm{m}}{=}}" + + "{\\mathrel{\\char`\u225E}}"); +defineMacro("\u225F", + "\\html@mathml{\\stackrel{\\tiny?}{=}}{\\mathrel{\\char`\u225F}}"); // Misc Unicode defineMacro("\u27C2", "\\perp"); @@ -324,6 +397,9 @@ defineMacro("\u231C", "\\ulcorner"); defineMacro("\u231D", "\\urcorner"); defineMacro("\u231E", "\\llcorner"); defineMacro("\u231F", "\\lrcorner"); +defineMacro("\u00A9", "\\copyright"); +defineMacro("\u00AE", "\\textregistered"); +defineMacro("\uFE0F", "\\textregistered"); ////////////////////////////////////////////////////////////////////// // LaTeX_2ε @@ -593,7 +669,9 @@ defineMacro("\\\\", "\\newline"); // TODO: Doesn't normally work in math mode because \@ fails. KaTeX doesn't // support \@ yet, so that's omitted, and we add \text so that the result // doesn't look funny in math mode. -defineMacro("\\TeX", "\\textrm{T\\kern-.1667em\\raisebox{-.5ex}{E}\\kern-.125emX}"); +defineMacro("\\TeX", "\\textrm{\\html@mathml{" + + "T\\kern-.1667em\\raisebox{-.5ex}{E}\\kern-.125emX" + + "}{TeX}}"); // \DeclareRobustCommand{\LaTeX}{L\kern-.36em% // {\sbox\z@ T% @@ -611,14 +689,14 @@ defineMacro("\\TeX", "\\textrm{T\\kern-.1667em\\raisebox{-.5ex}{E}\\kern-.125emX // which is size3, which has a scale factor of 0.7 (see Options.js). 
const latexRaiseA = fontMetricsData['Main-Regular']["T".charCodeAt(0)][1] - 0.7 * fontMetricsData['Main-Regular']["A".charCodeAt(0)][1] + "em"; -defineMacro("\\LaTeX", - `\\textrm{L\\kern-.36em\\raisebox{${latexRaiseA}}{\\scriptsize A}` + - "\\kern-.15em\\TeX}"); +defineMacro("\\LaTeX", "\\textrm{\\html@mathml{" + + `L\\kern-.36em\\raisebox{${latexRaiseA}}{\\scriptsize A}` + + "\\kern-.15em\\TeX}{LaTeX}}"); // New KaTeX logo based on tweaking LaTeX logo -defineMacro("\\KaTeX", - `\\textrm{K\\kern-.17em\\raisebox{${latexRaiseA}}{\\scriptsize A}` + - "\\kern-.15em\\TeX}"); +defineMacro("\\KaTeX", "\\textrm{\\html@mathml{" + + `K\\kern-.17em\\raisebox{${latexRaiseA}}{\\scriptsize A}` + + "\\kern-.15em\\TeX}{KaTeX}}"); // \DeclareRobustCommand\hspace{\@ifstar\@hspacer\@hspace} // \def\@hspace#1{\hskip #1\relax} @@ -705,9 +783,6 @@ defineMacro("\\approxcoloncolon", "\\mathrel{\\approx\\mathrel{\\mkern-1.2mu}\\dblcolon}"); // Present in newtxmath, pxfonts and txfonts -// TODO: The unicode character U+220C ∌ should be added to the font, and this -// macro turned into a propper defineSymbol in symbols.js. That way, the -// MathML result will be much cleaner. 
-defineMacro("\\notni", "\\not\\ni"); +defineMacro("\\notni", "\\html@mathml{\\not\\ni}{\\mathrel{\\char`\u220C}}"); defineMacro("\\limsup", "\\DOTSB\\mathop{\\operatorname{lim\\,sup}}\\limits"); defineMacro("\\liminf", "\\DOTSB\\mathop{\\operatorname{lim\\,inf}}\\limits"); diff --git a/test/__snapshots__/mathml-spec.js.snap b/test/__snapshots__/mathml-spec.js.snap index 1e9f8df4..25b923ba 100644 --- a/test/__snapshots__/mathml-spec.js.snap +++ b/test/__snapshots__/mathml-spec.js.snap @@ -1,5 +1,34 @@ // Jest Snapshot v1, https://goo.gl/fbAQLP +exports[`A MathML builder \\html@mathml makes clean symbols 1`] = ` + + + + + + © + + + ≠ + + + ∉ + + + ≘ + + + KaTeX + + + + \\copyright\\neq\\notin≘\\KaTeX + + + + +`; + exports[`A MathML builder \\text fonts become mathvariant 1`] = ` @@ -382,34 +411,32 @@ exports[`A MathML builder should render boldsymbol with the correct mathvariants - - - - A - - - x - - - 2 - - - k - - - ω - - - Ω - - - ı - - - + - - - + + + A + + + x + + + 2 + + + k + + + ω + + + Ω + + + ı + + + + + + \\boldsymbol{Ax2k\\omega\\Omega\\imath+} diff --git a/test/katex-spec.js b/test/katex-spec.js index ff2f287f..7689a08d 100644 --- a/test/katex-spec.js +++ b/test/katex-spec.js @@ -93,7 +93,10 @@ describe("A rel parser", function() { expect(parse).toBeTruthy(); for (let i = 0; i < parse.length; i++) { - const group = parse[i]; + let group = parse[i]; + if (group.type === "htmlmathml") { + group = group.value.html[0]; + } expect(group.type).toEqual("rel"); } }); @@ -1503,12 +1506,12 @@ describe("A font parser", function() { expect(nestedParse.value.font).toEqual("mathbb"); expect(nestedParse.value.type).toEqual("font"); - expect(nestedParse.value.body.value.length).toEqual(4); const bbBody = nestedParse.value.body.value; + expect(bbBody.length).toEqual(3); expect(bbBody[0].type).toEqual("mathord"); - expect(bbBody[3].type).toEqual("font"); - expect(bbBody[3].value.font).toEqual("mathrm"); - expect(bbBody[3].value.type).toEqual("font"); + 
expect(bbBody[2].type).toEqual("font"); + expect(bbBody[2].value.font).toEqual("mathrm"); + expect(bbBody[2].value.type).toEqual("font"); }); it("should work with \\textcolor", function() { @@ -2802,6 +2805,20 @@ describe("A macro expander", function() { {"\\mode": "\\TextOrMath{t}{m}"}); }); + it("\\char produces literal characters", () => { + expect("\\char`a").toParseLike("\\char`\\a"); + expect("\\char`\\%").toParseLike("\\char37"); + expect("\\char`\\%").toParseLike("\\char'45"); + expect("\\char`\\%").toParseLike('\\char"25'); + expect("\\char").toNotParse(); + expect("\\char`").toNotParse(); + expect("\\char'").toNotParse(); + expect('\\char"').toNotParse(); + expect("\\char'a").toNotParse(); + expect('\\char"g').toNotParse(); + expect("\\char'8").toNotParse(); + }); + // TODO(edemaine): This doesn't work yet. Parses like `\text text`, // which doesn't treat all four letters as an argument. //it("\\TextOrMath should work in a macro passed to \\text", function() { @@ -3070,7 +3087,9 @@ describe("Unicode", function() { }); it("should parse symbols", function() { - expect("£¥ðℂℍℑℓℕ℘ℙℚℜℝℤℲℵℶℷℸ⅁∀∁∂∃∇∞∠∡∢♠♡♢♣♭♮♯✓°¬‼⋮\u00b7").toParse(strictSettings); + expect("ð").toParse(); // warns about lacking character metrics + expect("£¥ℂℍℑℓℕ℘ℙℚℜℝℤℲℵℶℷℸ⅁∀∁∂∃∇∞∠∡∢♠♡♢♣♭♮♯✓°¬‼⋮\u00B7\u00A9").toBuild(strictSettings); + expect("\\text{£¥\u00A9\u00AE\uFE0F}").toBuild(strictSettings); }); it("should build Greek capital letters", function() { diff --git a/test/mathml-spec.js b/test/mathml-spec.js index cfd89ca2..f862feba 100644 --- a/test/mathml-spec.js +++ b/test/mathml-spec.js @@ -128,4 +128,9 @@ describe("A MathML builder", function() { "\\texttt{tt\\textit{italic\\textbf{bold italic}}\\textbf{bold}}}")) .toMatchSnapshot(); }); + + it('\\html@mathml makes clean symbols', () => { + expect(getMathML("\\copyright\\neq\\notin\u2258\\KaTeX")) + .toMatchSnapshot(); + }); });