Rewrote lexer, avoiding some mode-specific distinctions

There are two main motivations for this commit. One is Unicode input, which requires Unicode characters to get past the lexer; see the discussion in #261. The second is preparation for #266, where we'd deal with one token of look-ahead but might, in some cases, be lexing that token in an unknown mode. The unit test shipped with this commit addresses the latter concern, since it checks that a math-mode-only token may immediately follow a text-mode content group. In this new implementation, all the various things that could get matched have been collected into a single regular expression. The hope is that this will be beneficial for performance and keep the code simpler. The code was written with Unicode input in mind, including non-BMP code points. The role of the lexer as a gatekeeper, keeping out invalid TeX syntax, has been abandoned. That role is still fulfilled by the symbols and functions tables, though, since any input which is neither a symbol nor a command is still considered invalid input, even though it lexes successfully.
2025-10-07 12:18:39 +00:00 · 2015-07-07 14:15:58 +02:00
parent 95e2f1c8d7
commit d423bec089
2 changed files with 45 additions and 72 deletions
--- a/test/katex-spec.js
+++ b/test/katex-spec.js
@@ -665,6 +665,7 @@ describe("A text parser", function() {
    var leadingSpaceTextExpression = "\\text {moo}";
    var badTextExpression = "\\text{a b%}";
    var badFunctionExpression = "\\text{\\sqrt{x}}";
+    var mathTokenAfterText = "\\text{sin}^2";

    it("should not fail", function() {
        expect(textExpression).toParse();
@@ -710,6 +711,10 @@ describe("A text parser", function() {
        expect(group[3].type).toMatch("spacing");
    });

+    it("should accept math mode tokens after its argument", function() {
+        expect(mathTokenAfterText).toParse();
+    });
+
    it("should ignore a space before the text group", function() {
        var parse = getParsed(leadingSpaceTextExpression)[0];
        // [m, o, o]