Add raw string group, move comment parsing to Parser, change URL group parser (#1711)

* Add raw string group

* Move comment parsing to Parser

* Use raw string group in URL group parser

* Update types.js

* Add multi-level nested url test
This commit is contained in:
ylemkimon
2018-10-13 10:21:57 +09:00
committed by Kevin Barabash
parent ba8e224b8d
commit 3907545e2c
7 changed files with 135 additions and 98 deletions

View File

@@ -17,11 +17,9 @@ import SourceLocation from "./SourceLocation";
import {Token} from "./Token";
import type {LexerInterface} from "./Token";
import type Settings from "./Settings";
/* The following tokenRegex
* - matches typical whitespace (but not NBSP etc.) using its first group
* - matches comments (must have trailing newlines)
* - does not match any control character \x00-\x1f except whitespace
* - does not match a bare backslash
* - matches any ASCII character except those just mentioned
@@ -36,7 +34,6 @@ import type Settings from "./Settings";
* still reject the input.
*/
const spaceRegexString = "[ \r\n\t]";
const commentRegexString = "%[^\n]*(?:\n|$)";
const controlWordRegexString = "\\\\[a-zA-Z@]+";
const controlSymbolRegexString = "\\\\[^\uD800-\uDFFF]";
const controlWordWhitespaceRegexString =
@@ -46,37 +43,28 @@ const controlWordWhitespaceRegex = new RegExp(
const combiningDiacriticalMarkString = "[\u0300-\u036f]";
export const combiningDiacriticalMarksEndRegex =
new RegExp(`${combiningDiacriticalMarkString}+$`);
const urlFunctionRegexString = "(\\\\href|\\\\url)" +
`(?:${spaceRegexString}*\\{((?:[^{}\\\\]|\\\\[^]|{[^{}]*})*)\\}` +
`|${spaceRegexString}+([^{}])` +
`|${spaceRegexString}*([^{}a-zA-Z]))`;
const tokenRegexString = `(${spaceRegexString}+)|` + // whitespace
`(${commentRegexString}` + // comments
"|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
"([!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
`${combiningDiacriticalMarkString}*` + // ...plus accents
"|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair
`${combiningDiacriticalMarkString}*` + // ...plus accents
"|\\\\verb\\*([^]).*?\\3" + // \verb*
"|\\\\verb([^*a-zA-Z]).*?\\4" + // \verb unstarred
`|${urlFunctionRegexString}` + // URL arguments
`|${controlWordWhitespaceRegexString}` + // \macroName + spaces
`|${controlSymbolRegexString})`; // \\, \', etc.
// These regexes are for matching results from tokenRegex,
// so they do have ^ markers.
export const controlWordRegex = new RegExp(`^${controlWordRegexString}`);
export const urlFunctionRegex = new RegExp(`^${urlFunctionRegexString}`);
/** Main Lexer class */
export default class Lexer implements LexerInterface {
input: string;
settings: Settings;
tokenRegex: RegExp;
constructor(input: string, settings: Settings) {
constructor(input: string) {
// Separate accents from characters
this.input = input;
this.settings = settings;
this.tokenRegex = new RegExp(tokenRegexString, 'g');
}
@@ -100,19 +88,10 @@ export default class Lexer implements LexerInterface {
// Trim any trailing whitespace from control word match
const controlMatch = text.match(controlWordWhitespaceRegex);
if (controlMatch) {
text = controlMatch[1] + text.slice(controlMatch[0].length);
text = controlMatch[1];
}
if (text[0] === "%") {
if (text[text.length - 1] !== "\n") {
this.settings.reportNonstrict("commentAtEnd",
"% comment has no terminating newline; LaTeX would " +
"fail because of commenting the end of math mode (e.g. $)");
}
return this.lex();
} else {
return new Token(text, new SourceLocation(this, pos,
this.tokenRegex.lastIndex));
}
}
}

View File

@@ -50,7 +50,7 @@ export default class MacroExpander implements MacroContextInterface {
* (with existing macros etc.).
*/
feed(input: string) {
this.lexer = new Lexer(input, this.settings);
this.lexer = new Lexer(input);
}
/**
@@ -314,7 +314,7 @@ export default class MacroExpander implements MacroContextInterface {
++numArgs;
}
}
const bodyLexer = new Lexer(expansion, this.settings);
const bodyLexer = new Lexer(expansion);
const tokens = [];
let tok = bodyLexer.lex();
while (tok.text !== "EOF") {

View File

@@ -11,7 +11,7 @@ import unicodeSymbols from "./unicodeSymbols";
import utils from "./utils";
import {assertNodeType, checkNodeType} from "./parseNode";
import ParseError from "./ParseError";
import {combiningDiacriticalMarksEndRegex, urlFunctionRegex} from "./Lexer";
import {combiningDiacriticalMarksEndRegex} from "./Lexer";
import Settings from "./Settings";
import SourceLocation from "./SourceLocation";
import {Token} from "./Token";
@@ -405,6 +405,8 @@ export default class Parser {
}
// Put everything into an ordgroup as the superscript
superscript = {type: "ordgroup", mode: this.mode, body: primes};
} else if (lex.text === "%") {
this.consumeComment();
} else {
// If it wasn't ^, _, or ', stop parsing super/subscripts
break;
@@ -658,9 +660,15 @@ export default class Parser {
return this.parseSizeGroup(optional);
}
if (type === "url") {
throw new ParseError(
"Internal bug: 'url' arguments should be handled by Lexer",
this.nextToken);
return this.parseUrlGroup(optional);
}
if (type === "raw") {
const token = this.parseStringGroup("raw", optional, true);
return token ? newArgument({
type: "raw",
mode: this.mode,
string: token.text,
}, token) : null;
}
// By the time we get here, type is one of "text" or "math".
@@ -674,6 +682,27 @@ export default class Parser {
}
}
consumeComment() {
// the newline character is normalized in Lexer, check original source
while (this.nextToken.text !== "EOF" && this.nextToken.loc &&
this.nextToken.loc.getSource().indexOf("\n") === -1) {
this.consume();
}
if (this.nextToken.text === "EOF") {
this.settings.reportNonstrict("commentAtEnd",
"% comment has no terminating newline; LaTeX would " +
"fail because of commenting the end of math mode (e.g. $)");
}
if (this.mode === "math") {
this.consumeSpaces(); // ignore spaces in math mode
} else if (this.nextToken.loc) { // text mode
const source = this.nextToken.loc.getSource();
if (source.indexOf("\n") === source.length - 1) {
this.consumeSpaces(); // if no space after the first newline
}
}
}
/**
* Parses a group, essentially returning the string formed by the
* brace-enclosed tokens plus some position information.
@@ -681,28 +710,53 @@ export default class Parser {
parseStringGroup(
modeName: ArgType, // Used to describe the mode in error messages.
optional: boolean,
raw?: boolean,
): ?Token {
if (optional && this.nextToken.text !== "[") {
const groupBegin = optional ? "[" : "{";
const groupEnd = optional ? "]" : "}";
const nextToken = this.nextToken;
if (nextToken.text !== groupBegin) {
if (optional) {
return null;
} else if (raw && nextToken.text !== "EOF" &&
/[^{}[\]]/.test(nextToken.text)) {
// allow a single character in raw string group
this.consume();
return nextToken;
}
}
const outerMode = this.mode;
this.mode = "text";
this.expect(optional ? "[" : "{");
this.expect(groupBegin);
let str = "";
const firstToken = this.nextToken;
let nested = 0; // allow nested braces in raw string group
let lastToken = firstToken;
while (this.nextToken.text !== (optional ? "]" : "}")) {
if (this.nextToken.text === "EOF") {
while ((raw && nested > 0) || this.nextToken.text !== groupEnd) {
switch (this.nextToken.text) {
case "EOF":
throw new ParseError(
"Unexpected end of input in " + modeName,
firstToken.range(this.nextToken, str));
firstToken.range(lastToken, str));
case "%":
if (!raw) { // allow % in raw string group
this.consumeComment();
continue;
}
break;
case groupBegin:
nested++;
break;
case groupEnd:
nested--;
break;
}
lastToken = this.nextToken;
str += lastToken.text;
this.consume();
}
this.mode = outerMode;
this.expect(optional ? "]" : "}");
this.expect(groupEnd);
return firstToken.range(lastToken, str);
}
@@ -720,8 +774,12 @@ export default class Parser {
const firstToken = this.nextToken;
let lastToken = firstToken;
let str = "";
while (this.nextToken.text !== "EOF"
&& regex.test(str + this.nextToken.text)) {
while (this.nextToken.text !== "EOF" && (regex.test(
str + this.nextToken.text) || this.nextToken.text === "%")) {
if (this.nextToken.text === "%") {
this.consumeComment();
continue;
}
lastToken = this.nextToken;
str += lastToken.text;
this.consume();
@@ -802,6 +860,34 @@ export default class Parser {
}, res);
}
/**
* Parses an URL, checking escaped letters and allowed protocols.
*/
parseUrlGroup(optional: boolean): ?ParsedArg {
const res = this.parseStringGroup("url", optional, true); // get raw string
if (!res) {
return null;
}
// hyperref package allows backslashes alone in href, but doesn't
// generate valid links in such cases; we interpret this as
// "undefined" behaviour, and keep them as-is. Some browser will
// replace backslashes with forward slashes.
const url = res.text.replace(/\\([#$%&~_^{}])/g, '$1');
let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
protocol = (protocol != null ? protocol[1] : "_relative");
const allowed = this.settings.allowedProtocols;
if (!utils.contains(allowed, "*") &&
!utils.contains(allowed, protocol)) {
throw new ParseError(
`Forbidden protocol '${protocol}'`, res);
}
return newArgument({
type: "url",
mode: this.mode,
url,
}, res);
}
/**
* If `optional` is false or absent, this parses an ordinary group,
* which is either a single nucleus (like "x") or an expression
@@ -913,53 +999,6 @@ export default class Parser {
// The token will be consumed later in parseGivenFunction
// (after possibly switching modes).
return newFunction(nucleus);
} else if (/^\\(href|url)[^a-zA-Z]/.test(text)) {
const match = text.match(urlFunctionRegex);
if (!match) {
throw new ParseError(
`Internal error: invalid URL token '${text}'`, nucleus);
}
const funcName = match[1];
// match[2] is the only one that can be an empty string,
// so it must be at the end of the following or chain:
const rawUrl = match[4] || match[3] || match[2];
// hyperref package allows backslashes alone in href, but doesn't
// generate valid links in such cases; we interpret this as
// "undefined" behaviour, and keep them as-is. Some browser will
// replace backslashes with forward slashes.
const url = rawUrl.replace(/\\([#$%&~_^{}])/g, '$1');
let protocol = /^\s*([^\\/#]*?)(?::|&#0*58|&#x0*3a)/i.exec(url);
protocol = (protocol != null ? protocol[1] : "_relative");
const allowed = this.settings.allowedProtocols;
if (!utils.contains(allowed, "*") &&
!utils.contains(allowed, protocol)) {
throw new ParseError(
`Forbidden protocol '${protocol}' in ${funcName}`, nucleus);
}
const urlArg = {
type: "url",
mode: this.mode,
url,
};
this.consume();
if (funcName === "\\href") { // two arguments
this.consumeSpaces(); // ignore spaces between arguments
let description = this.parseGroupOfType("original", false);
if (description == null) {
throw new ParseError(`${funcName} missing second argument`,
nucleus);
}
if (description.type === "fn") {
description = this.parseGivenFunction(description);
} else { // arg.type === "arg"
description = description.result;
}
return newArgument(this.callFunction(
funcName, [urlArg, description], []), nucleus);
} else { // one argument (\url)
return newArgument(this.callFunction(
funcName, [urlArg], []), nucleus);
}
} else if (/^\\verb[^a-zA-Z]/.test(text)) {
this.consume();
let arg = text.slice(5);
@@ -980,6 +1019,9 @@ export default class Parser {
body: arg,
star,
}, nucleus);
} else if (text === "%") {
this.consumeComment();
return this.parseSymbol();
}
// At this point, we should have a symbol, possibly with accents.
// First expand any accented base symbol according to unicodeSymbols.

View File

@@ -17,6 +17,10 @@ export default class SourceLocation {
this.end = end;
}
getSource(): string {
return this.lexer.input.slice(this.start, this.end);
}
/**
* Merges two `SourceLocation`s from location providers, given they are
* provided in order of appearance.

View File

@@ -80,6 +80,12 @@ type ParseNodeTypes = {
loc?: ?SourceLocation,
body: AnyParseNode[],
|},
"raw": {|
type: "raw",
mode: Mode,
loc?: ?SourceLocation,
string: string,
|},
"size": {|
type: "size",
mode: Mode,

View File

@@ -12,13 +12,15 @@ export type Mode = "math" | "text";
// - "color": An html color, like "#abc" or "blue"
// - "url": A URL string, in which "\" will be ignored
// - if it precedes [#$%&~_^\{}]
// - "raw": A string, allowing a single character, a percent sign,
// and nested braces
// - "original": The same type as the environment that the
// function being parsed is in (e.g. used for the
// bodies of functions like \textcolor where the
// first argument is special and the second
// argument is parsed normally)
// - Mode: Node group parsed in given mode.
export type ArgType = "color" | "size" | "url" | "original" | Mode;
export type ArgType = "color" | "size" | "url" | "raw" | "original" | Mode;
// LaTeX display style.
export type StyleStr = "text" | "display" | "script" | "scriptscript";

View File

@@ -1597,6 +1597,16 @@ describe("A comment parser", function() {
expect("% comment 1\n% comment 2\n").toParse();
});
it("should parse comments between subscript and superscript", () => {
expect("x_3 %comment\n^2").toParseLike`x_3^2`;
});
it("should parse comments in size and color groups", () => {
expect("\\kern{1 %kern\nem}").toParse();
expect("\\kern1 %kern\nem").toParse();
expect("\\color{#f00%red\n}").toParse();
});
it("should not parse a comment without newline in strict mode", () => {
expect`x%y`.not.toParse(strictSettings);
expect`x%y`.toParse(nonstrictSettings);
@@ -2527,12 +2537,6 @@ describe("href and url commands", function() {
expect("\\url%end").toParseLike("\\url {%}end");
});
it("should detect missing second argument in \\href", () => {
expect`\href{http://example.com/}`.not.toParse();
expect`\href%`.not.toParse();
expect`\href %`.not.toParse();
});
it("should allow spaces single-character URLs", () => {
expect`\href %end`.toParseLike("\\href{%}end");
expect("\\url %end").toParseLike("\\url{%}end");
@@ -2547,7 +2551,7 @@ describe("href and url commands", function() {
});
it("should allow balanced braces in url", function() {
const url = "http://example.org/{too}";
const url = "http://example.org/{{}t{oo}}";
const parsed1 = getParsed(`\\href{${url}}{\\alpha}`)[0];
expect(parsed1.href).toBe(url);
const parsed2 = getParsed(`\\url{${url}}`)[0];