From 3907545e2c709e96404ddbd8329dc1a39c1ac3b1 Mon Sep 17 00:00:00 2001 From: ylemkimon Date: Sat, 13 Oct 2018 10:21:57 +0900 Subject: [PATCH] Add `raw` string group, move comment parsing to Parser, change URL group parser (#1711) * Add raw string group * Move comment parsing to Parser * Use raw string group in URL group parser * Update types.js * Add multi-level nested url test --- src/Lexer.js | 31 ++------ src/MacroExpander.js | 4 +- src/Parser.js | 166 ++++++++++++++++++++++++++---------------- src/SourceLocation.js | 4 + src/parseNode.js | 6 ++ src/types.js | 4 +- test/katex-spec.js | 18 +++-- 7 files changed, 135 insertions(+), 98 deletions(-) diff --git a/src/Lexer.js b/src/Lexer.js index b7aba3e2..ae3d6c47 100644 --- a/src/Lexer.js +++ b/src/Lexer.js @@ -17,11 +17,9 @@ import SourceLocation from "./SourceLocation"; import {Token} from "./Token"; import type {LexerInterface} from "./Token"; -import type Settings from "./Settings"; /* The following tokenRegex * - matches typical whitespace (but not NBSP etc.) using its first group - * - matches comments (must have trailing newlines) * - does not match any control character \x00-\x1f except whitespace * - does not match a bare backslash * - matches any ASCII character except those just mentioned @@ -36,7 +34,6 @@ import type Settings from "./Settings"; * still reject the input. */ const spaceRegexString = "[ \r\n\t]"; -const commentRegexString = "%[^\n]*(?:\n|$)"; const controlWordRegexString = "\\\\[a-zA-Z@]+"; const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]"; const controlWordWhitespaceRegexString = @@ -46,37 +43,28 @@ const controlWordWhitespaceRegex = new RegExp( const combiningDiacriticalMarkString = "[\u0300-\u036f]"; export const combiningDiacriticalMarksEndRegex = new RegExp(`${combiningDiacriticalMarkString}+$`); -const urlFunctionRegexString = "(\\\\href|\\\\url)" + - `(?:${spaceRegexString}*\\{((?:[^{}\\\\]|\\\\[^]|{[^{}]*})*)\\}` + - `|${spaceRegexString}+([^{}])` + - `|${spaceRegexString}*([^{}a-zA-Z]))`; const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace - `(${commentRegexString}` + // comments - "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint + "([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint `${combiningDiacriticalMarkString}*` + // ...plus accents "|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair `${combiningDiacriticalMarkString}*` + // ...plus accents "|\\\\verb\\*([^]).*?\\3" + // \verb* "|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred - `|${urlFunctionRegexString}` + // URL arguments `|${controlWordWhitespaceRegexString}` + // \macroName + spaces `|${controlSymbolRegexString})`; // \\, \', etc. // These regexs are for matching results from tokenRegex, // so they do have ^ markers. 
export const controlWordRegex = new RegExp(`^${controlWordRegexString}`); -export const urlFunctionRegex = new RegExp(`^${urlFunctionRegexString}`); /** Main Lexer class */ export default class Lexer implements LexerInterface { input: string; - settings: Settings; tokenRegex: RegExp; - constructor(input: string, settings: Settings) { + constructor(input: string) { // Separate accents from characters this.input = input; - this.settings = settings; this.tokenRegex = new RegExp(tokenRegexString, 'g'); } @@ -100,19 +88,10 @@ export default class Lexer implements LexerInterface { // Trim any trailing whitespace from control word match const controlMatch = text.match(controlWordWhitespaceRegex); if (controlMatch) { - text = controlMatch[1] + text.slice(controlMatch[0].length); + text = controlMatch[1]; } - if (text[0] === "%") { - if (text[text.length - 1] !== "\n") { - this.settings.reportNonstrict("commentAtEnd", - "% comment has no terminating newline; LaTeX would " + - "fail because of commenting the end of math mode (e.g. $)"); - } - return this.lex(); - } else { - return new Token(text, new SourceLocation(this, pos, - this.tokenRegex.lastIndex)); - } + return new Token(text, new SourceLocation(this, pos, + this.tokenRegex.lastIndex)); } } diff --git a/src/MacroExpander.js b/src/MacroExpander.js index 05034bab..f45fac7b 100644 --- a/src/MacroExpander.js +++ b/src/MacroExpander.js @@ -50,7 +50,7 @@ export default class MacroExpander implements MacroContextInterface { * (with existing macros etc.). */ feed(input: string) { - this.lexer = new Lexer(input, this.settings); + this.lexer = new Lexer(input); } /** @@ -314,7 +314,7 @@ export default class MacroExpander implements MacroContextInterface { ++numArgs; } } - const bodyLexer = new Lexer(expansion, this.settings); + const bodyLexer = new Lexer(expansion); const tokens = []; let tok = bodyLexer.lex(); while (tok.text !== "EOF") { diff --git a/src/Parser.js b/src/Parser.js index 9d581164..7499e56c 100644 --- a/src/Parser.js +++ b/src/Parser.js @@ -11,7 +11,7 @@ import unicodeSymbols from "./unicodeSymbols"; import utils from "./utils"; import {assertNodeType, checkNodeType} from "./parseNode"; import ParseError from "./ParseError"; -import {combiningDiacriticalMarksEndRegex, urlFunctionRegex} from "./Lexer"; +import {combiningDiacriticalMarksEndRegex} from "./Lexer"; import Settings from "./Settings"; import SourceLocation from "./SourceLocation"; import {Token} from "./Token"; @@ -405,6 +405,8 @@ export default class Parser { } // Put everything into an ordgroup as the superscript superscript = {type: "ordgroup", mode: this.mode, body: primes}; + } else if (lex.text === "%") { + this.consumeComment(); } else { // If it wasn't ^, _, or ', stop parsing super/subscripts break; @@ -658,9 +660,15 @@ export default class Parser { return this.parseSizeGroup(optional); } if (type === "url") { - throw new ParseError( - "Internal bug: 'url' arguments should be handled by Lexer", - this.nextToken); + return this.parseUrlGroup(optional); + } + if (type === "raw") { + const token = this.parseStringGroup("raw", optional, true); + return token ? newArgument({ + type: "raw", + mode: this.mode, + string: token.text, + }, token) : null; } // By the time we get here, type is one of "text" or "math". 
@@ -674,6 +682,27 @@ export default class Parser { } } + consumeComment() { + // the newline character is normalized in Lexer, check original source + while (this.nextToken.text !== "EOF" && this.nextToken.loc && + this.nextToken.loc.getSource().indexOf("\n") === -1) { + this.consume(); + } + if (this.nextToken.text === "EOF") { + this.settings.reportNonstrict("commentAtEnd", + "% comment has no terminating newline; LaTeX would " + + "fail because of commenting the end of math mode (e.g. $)"); + } + if (this.mode === "math") { + this.consumeSpaces(); // ignore spaces in math mode + } else if (this.nextToken.loc) { // text mode + const source = this.nextToken.loc.getSource(); + if (source.indexOf("\n") === source.length - 1) { + this.consumeSpaces(); // if no space after the first newline + } + } + } + /** * Parses a group, essentially returning the string formed by the * brace-enclosed tokens plus some position information. @@ -681,28 +710,53 @@ export default class Parser { parseStringGroup( modeName: ArgType, // Used to describe the mode in error messages. optional: boolean, + raw?: boolean, ): ?Token { - if (optional && this.nextToken.text !== "[") { - return null; + const groupBegin = optional ? "[" : "{"; + const groupEnd = optional ? "]" : "}"; + const nextToken = this.nextToken; + if (nextToken.text !== groupBegin) { + if (optional) { + return null; + } else if (raw && nextToken.text !== "EOF" && + /[^{}[\]]/.test(nextToken.text)) { + // allow a single character in raw string group + this.consume(); + return nextToken; + } } const outerMode = this.mode; this.mode = "text"; - this.expect(optional ? "[" : "{"); + this.expect(groupBegin); let str = ""; const firstToken = this.nextToken; + let nested = 0; // allow nested braces in raw string group let lastToken = firstToken; - while (this.nextToken.text !== (optional ? "]" : "}")) { - if (this.nextToken.text === "EOF") { - throw new ParseError( - "Unexpected end of input in " + modeName, - firstToken.range(this.nextToken, str)); + while ((raw && nested > 0) || this.nextToken.text !== groupEnd) { + switch (this.nextToken.text) { + case "EOF": + throw new ParseError( + "Unexpected end of input in " + modeName, + firstToken.range(lastToken, str)); + case "%": + if (!raw) { // allow % in raw string group + this.consumeComment(); + continue; + } + break; + case groupBegin: + nested++; + break; + case groupEnd: + nested--; + break; } lastToken = this.nextToken; str += lastToken.text; this.consume(); } this.mode = outerMode; - this.expect(optional ? "]" : "}"); + this.expect(groupEnd); return firstToken.range(lastToken, str); } @@ -720,8 +774,12 @@ export default class Parser { const firstToken = this.nextToken; let lastToken = firstToken; let str = ""; - while (this.nextToken.text !== "EOF" - && regex.test(str + this.nextToken.text)) { + while (this.nextToken.text !== "EOF" && (regex.test( + str + this.nextToken.text) || this.nextToken.text === "%")) { + if (this.nextToken.text === "%") { + this.consumeComment(); + continue; + } lastToken = this.nextToken; str += lastToken.text; this.consume(); @@ -802,6 +860,34 @@ export default class Parser { }, res); } + /** + * Parses an URL, checking escaped letters and allowed protocols. 
+     */
+    parseUrlGroup(optional: boolean): ?ParsedArg {
+        const res = this.parseStringGroup("url", optional, true); // get raw string
+        if (!res) {
+            return null;
+        }
+        // hyperref package allows backslashes alone in href, but doesn't
+        // generate valid links in such cases; we interpret this as
+        // "undefined" behaviour, and keep them as-is. Some browser will
+        // replace backslashes with forward slashes.
+        const url = res.text.replace(/\\([#$%&~_^{}])/g, '$1');
+        let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
+        protocol = (protocol != null ? protocol[1] : "_relative");
+        const allowed = this.settings.allowedProtocols;
+        if (!utils.contains(allowed, "*") &&
+            !utils.contains(allowed, protocol)) {
+            throw new ParseError(
+                `Forbidden protocol '${protocol}'`, res);
+        }
+        return newArgument({
+            type: "url",
+            mode: this.mode,
+            url,
+        }, res);
+    }
+
     /**
      * If `optional` is false or absent, this parses an ordinary group,
      * which is either a single nucleus (like "x") or an expression
@@ -913,53 +999,6 @@ export default class Parser {
             // The token will be consumed later in parseGivenFunction
             // (after possibly switching modes).
             return newFunction(nucleus);
-        } else if (/^\\(href|url)[^a-zA-Z]/.test(text)) {
-            const match = text.match(urlFunctionRegex);
-            if (!match) {
-                throw new ParseError(
-                    `Internal error: invalid URL token '${text}'`, nucleus);
-            }
-            const funcName = match[1];
-            // match[2] is the only one that can be an empty string,
-            // so it must be at the end of the following or chain:
-            const rawUrl = match[4] || match[3] || match[2];
-            // hyperref package allows backslashes alone in href, but doesn't
-            // generate valid links in such cases; we interpret this as
-            // "undefined" behaviour, and keep them as-is. Some browser will
-            // replace backslashes with forward slashes.
-            const url = rawUrl.replace(/\\([#$%&~_^{}])/g, '$1');
-            let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
-            protocol = (protocol != null ? protocol[1] : "_relative");
-            const allowed = this.settings.allowedProtocols;
-            if (!utils.contains(allowed, "*") &&
-                !utils.contains(allowed, protocol)) {
-                throw new ParseError(
-                    `Forbidden protocol '${protocol}' in ${funcName}`, nucleus);
-            }
-            const urlArg = {
-                type: "url",
-                mode: this.mode,
-                url,
-            };
-            this.consume();
-            if (funcName === "\\href") { // two arguments
-                this.consumeSpaces(); // ignore spaces between arguments
-                let description = this.parseGroupOfType("original", false);
-                if (description == null) {
-                    throw new ParseError(`${funcName} missing second argument`,
-                        nucleus);
-                }
-                if (description.type === "fn") {
-                    description = this.parseGivenFunction(description);
-                } else { // arg.type === "arg"
-                    description = description.result;
-                }
-                return newArgument(this.callFunction(
-                    funcName, [urlArg, description], []), nucleus);
-            } else { // one argument (\url)
-                return newArgument(this.callFunction(
-                    funcName, [urlArg], []), nucleus);
-            }
         } else if (/^\\verb[^a-zA-Z]/.test(text)) {
             this.consume();
             let arg = text.slice(5);
@@ -980,6 +1019,9 @@
                 body: arg,
                 star,
             }, nucleus);
+        } else if (text === "%") {
+            this.consumeComment();
+            return this.parseSymbol();
         }
         // At this point, we should have a symbol, possibly with accents.
         // First expand any accented base symbol according to unicodeSymbols.
diff --git a/src/SourceLocation.js b/src/SourceLocation.js index 6fb74b6d..bf7e5636 100644 --- a/src/SourceLocation.js +++ b/src/SourceLocation.js @@ -17,6 +17,10 @@ export default class SourceLocation { this.end = end; } + getSource(): string { + return this.lexer.input.slice(this.start, this.end); + } + /** * Merges two `SourceLocation`s from location providers, given they are * provided in order of appearance. diff --git a/src/parseNode.js b/src/parseNode.js index d49b4419..7924729d 100644 --- a/src/parseNode.js +++ b/src/parseNode.js @@ -80,6 +80,12 @@ type ParseNodeTypes = { loc?: ?SourceLocation, body: AnyParseNode[], |}, + "raw": {| + type: "raw", + mode: Mode, + loc?: ?SourceLocation, + string: string, + |}, "size": {| type: "size", mode: Mode, diff --git a/src/types.js b/src/types.js index 94baaabf..b9bd48e8 100644 --- a/src/types.js +++ b/src/types.js @@ -12,13 +12,15 @@ export type Mode = "math" | "text"; // - "color": An html color, like "#abc" or "blue" // - "url": An url string, in which "\" will be ignored // - if it precedes [#$%&~_^\{}] +// - "raw": A string, allowing single character, percent sign, +// and nested braces // - "original": The same type as the environment that the // function being parsed is in (e.g. used for the // bodies of functions like \textcolor where the // first argument is special and the second // argument is parsed normally) // - Mode: Node group parsed in given mode. -export type ArgType = "color" | "size" | "url" | "original" | Mode; +export type ArgType = "color" | "size" | "url" | "raw" | "original" | Mode; // LaTeX display style. export type StyleStr = "text" | "display" | "script" | "scriptscript"; diff --git a/test/katex-spec.js b/test/katex-spec.js index 5515b990..d2b0753f 100644 --- a/test/katex-spec.js +++ b/test/katex-spec.js @@ -1597,6 +1597,16 @@ describe("A comment parser", function() { expect("% comment 1\n% comment 2\n").toParse(); }); + it("should parse comments between subscript and superscript", () => { + expect("x_3 %comment\n^2").toParseLike`x_3^2`; + }); + + it("should parse comments in size and color groups", () => { + expect("\\kern{1 %kern\nem}").toParse(); + expect("\\kern1 %kern\nem").toParse(); + expect("\\color{#f00%red\n}").toParse(); + }); + it("should not parse a comment without newline in strict mode", () => { expect`x%y`.not.toParse(strictSettings); expect`x%y`.toParse(nonstrictSettings); @@ -2527,12 +2537,6 @@ describe("href and url commands", function() { expect("\\url%end").toParseLike("\\url {%}end"); }); - it("should detect missing second argument in \\href", () => { - expect`\href{http://example.com/}`.not.toParse(); - expect`\href%`.not.toParse(); - expect`\href %`.not.toParse(); - }); - it("should allow spaces single-character URLs", () => { expect`\href %end`.toParseLike("\\href{%}end"); expect("\\url %end").toParseLike("\\url{%}end"); @@ -2547,7 +2551,7 @@ describe("href and url commands", function() { }); it("should allow balanced braces in url", function() { - const url = "http://example.org/{too}"; + const url = "http://example.org/{{}t{oo}}"; const parsed1 = getParsed(`\\href{${url}}{\\alpha}`)[0]; expect(parsed1.href).toBe(url); const parsed2 = getParsed(`\\url{${url}}`)[0];