mirror of
https://github.com/Smaug123/WoofWare.PrattParser
synced 2025-10-05 09:28:41 +00:00
310 lines
13 KiB
Forth
310 lines
13 KiB
Forth
namespace PrattParser
|
|
|
|
/// Specification of how to parse things which act like brackets: that is, they start with a token,
|
|
/// then consume some stuff, then there's another token to mark the end.
|
|
///
|
|
/// Optionally you can specify that the bracket-like token consumes something at the end too:
|
|
/// for example, `if...then...` does not have a trailing "end-if".
|
|
/// The trailing clause will consume as much as it can, so e.g. `if foo then bar!` would parse as
|
|
/// `if foo then (bar!)`.
|
|
///
|
|
/// Optionally you can specify a single construct with multiple delimiters:
|
|
/// for example, `if...then...else...` consumes three expressions.
|
|
type BracketLikeParser<'tokenTag, 'expr> =
|
|
{
|
|
/// Whether to consume input after the final token, e.g. like `if...then...else...` consumes,
|
|
/// whereas `(...)` does not.
|
|
ConsumeAfterFinalToken : bool
|
|
/// The successive list of delimiters after the initial delimiter that "opens the brackets".
|
|
/// For example, this might be `[then]`, or `[then ; else]`, or `[')']`.
|
|
BoundaryTokens : 'tokenTag list
|
|
/// How to build an expression given that you've got all the constituent chunks that came
|
|
/// between the delimiters.
|
|
///
|
|
/// We guarantee that the input list will have (as many elements as BoundaryTokens)+1
|
|
/// if ConsumeAfterFinalToken is true, or as many elements as BoundaryTokens
|
|
/// if ConsumeAfterFinalToken is false.
|
|
Construct : 'expr list -> 'expr
|
|
}
|
|
|
|
/// An entity which knows how to parse a stream of 'tokens into an 'expr.
|
|
type Parser<'tokenTag, 'token, 'expr> when 'tokenTag : comparison =
|
|
private
|
|
{
|
|
GetTag : 'token -> 'tokenTag
|
|
UnaryPrefix : Map<'tokenTag, (unit * int) * ('expr -> 'expr)>
|
|
UnaryPostfix : Map<'tokenTag, (int * unit) * ('expr -> 'expr)>
|
|
Infix : Map<'tokenTag, (int * int) * ('expr -> 'expr -> 'expr)>
|
|
Atom : string -> 'token -> 'expr option
|
|
BracketLike : Map<'tokenTag, BracketLikeParser<'tokenTag, 'expr> list>
|
|
}
|
|
|
|
/// Module for constructing and executing Parsers.
|
|
[<RequireQualifiedAccess>]
|
|
module Parser =
|
|
/// The basic parser with the minimum possible information.
|
|
/// You specify how to take a token and get a tag from it,
|
|
/// and you specify how to convert atoms (such as constant ints, or variables) into expressions.
|
|
///
|
|
/// The atom-parsing function is given the entire source string, as well as the 'token
|
|
/// of which we are asking "is this an atom, and if so, how shall it be represented in the AST?".
|
|
let make<'tokenTag, 'token, 'expr when 'tokenTag : comparison>
|
|
(getTag : 'token -> 'tokenTag)
|
|
(atoms : string -> 'token -> 'expr option)
|
|
: Parser<'tokenTag, 'token, 'expr>
|
|
=
|
|
{
|
|
GetTag = getTag
|
|
UnaryPrefix = Map.empty
|
|
UnaryPostfix = Map.empty
|
|
Infix = Map.empty
|
|
Atom = atoms
|
|
BracketLike = Map.empty
|
|
}
|
|
|
|
/// Add a prefix operator to this parser.
|
|
/// The precedence is an int, where higher numbers bind more tightly.
|
|
/// (Following [matklad](https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html), we
|
|
/// express this as `unit * int` to make it clear that it's binding on the right.)
|
|
let withUnaryPrefix<'tokenTag, 'token, 'expr when 'tokenTag : comparison>
|
|
(tokenType : 'tokenTag)
|
|
(precedence : unit * int)
|
|
(construct : 'expr -> 'expr)
|
|
(parser : Parser<'tokenTag, 'token, 'expr>)
|
|
: Parser<'tokenTag, 'token, 'expr>
|
|
=
|
|
{ parser with
|
|
UnaryPrefix =
|
|
parser.UnaryPrefix
|
|
|> Map.change
|
|
tokenType
|
|
(fun existing ->
|
|
match existing with
|
|
| None -> Some (precedence, construct)
|
|
| Some _ -> failwithf "Duplicate unary prefix parser added for token %+A" tokenType
|
|
)
|
|
}
|
|
|
|
/// Add a postfix operator to this parser.
|
|
/// The precedence is an int, where higher numbers bind more tightly.
|
|
/// (Following [matklad](https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html), we
|
|
/// express this as `int * unit` to make it clear that it's binding on the left.)
|
|
let withUnaryPostfix<'tokenTag, 'token, 'expr when 'tokenTag : comparison>
|
|
(tokenType : 'tokenTag)
|
|
(precedence : int * unit)
|
|
(construct : 'expr -> 'expr)
|
|
(parser : Parser<'tokenTag, 'token, 'expr>)
|
|
: Parser<'tokenTag, 'token, 'expr>
|
|
=
|
|
{ parser with
|
|
UnaryPostfix =
|
|
parser.UnaryPostfix
|
|
|> Map.change
|
|
tokenType
|
|
(fun existing ->
|
|
match existing with
|
|
| None -> Some (precedence, construct)
|
|
| Some _ -> failwithf "Duplicate unary postfix parser added for token %+A" tokenType
|
|
)
|
|
}
|
|
|
|
/// Add a binary infix operator to this parser.
|
|
/// The precedence is a pair of ints, where higher numbers bind more tightly.
|
|
///
|
|
/// For example, to make an operator associate on the left, you would give it
|
|
/// tighter (higher-precedence) binding on the right, whereupon parsing would proceed as follows:
|
|
///
|
|
/// 1 + 2 + 3 := 1 +2 +3
|
|
///
|
|
/// after which the only possible bracketing that doesn't split up a tightly-bound operator is:
|
|
///
|
|
/// (1 + 2) + 3
|
|
///
|
|
/// This situation could be specified with a precedence of (n, n + 1), for example.
|
|
let withInfix<'tokenTag, 'token, 'expr when 'tokenTag : comparison>
|
|
(tokenType : 'tokenTag)
|
|
(precedence : int * int)
|
|
(construct : 'expr -> 'expr -> 'expr)
|
|
(parser : Parser<'tokenTag, 'token, 'expr>)
|
|
: Parser<'tokenTag, 'token, 'expr>
|
|
=
|
|
{ parser with
|
|
Infix =
|
|
parser.Infix
|
|
|> Map.change
|
|
tokenType
|
|
(fun existing ->
|
|
match existing with
|
|
| None -> Some (precedence, construct)
|
|
| Some _ -> failwithf "Duplicate infix parser added for token %+A" tokenType
|
|
)
|
|
}
|
|
|
|
/// Add a bracket-like parser to the parser, introduced by a given delimiter.
|
|
/// See the docs for BracketLikeParser.
|
|
let withBracketLike<'tokenTag, 'token, 'expr when 'tokenTag : comparison>
|
|
(tokenType : 'tokenTag)
|
|
(toAdd : BracketLikeParser<'tokenTag, 'expr>)
|
|
(parser : Parser<'tokenTag, 'token, 'expr>)
|
|
: Parser<'tokenTag, 'token, 'expr>
|
|
=
|
|
{ parser with
|
|
BracketLike =
|
|
parser.BracketLike
|
|
|> Map.change
|
|
tokenType
|
|
(fun existing ->
|
|
match existing with
|
|
| None -> Some [ toAdd ]
|
|
| Some existing -> Some (toAdd :: existing)
|
|
)
|
|
}
|
|
|
|
let rec private parseBracketLike
|
|
(parser : Parser<'tokenTag, 'token, 'expr>)
|
|
(inputString : string)
|
|
(subParsers : BracketLikeParser<'tokenTag, 'expr> list)
|
|
(exprsSoFar : 'expr list)
|
|
(tokens : 'token list)
|
|
: ('expr * 'token list) list
|
|
=
|
|
let subParsersEnded, subParsersContinuing =
|
|
subParsers |> List.partition _.BoundaryTokens.IsEmpty
|
|
|
|
let fromSubParsersContinuing =
|
|
// If there are any subparsers expecting further boundary tokens,
|
|
// then we should get the next contents so that we can provide it to them.
|
|
// But if there aren't any such subparsers, we don't have to (and indeed
|
|
// it would be incorrect to, because the token stream might validly have ended).
|
|
if subParsersContinuing.IsEmpty then
|
|
[]
|
|
else
|
|
|
|
let contents, rest = parseInner parser inputString tokens 0
|
|
|
|
match rest with
|
|
| [] ->
|
|
// No valid parses down this path: we've run out of tokens despite all
|
|
// bracket-like parsers expecting another boundary token
|
|
[]
|
|
| next :: rest ->
|
|
|
|
// Which bracket-like parsers are now ruled out by the next bracket-like token?
|
|
let subParsersContinuing =
|
|
subParsersContinuing
|
|
|> List.choose (fun subParser ->
|
|
match subParser.BoundaryTokens with
|
|
| [] -> failwith "logic error, this was ruled out earlier"
|
|
| head :: boundary ->
|
|
if head = parser.GetTag next then
|
|
Some
|
|
{ subParser with
|
|
BoundaryTokens = boundary
|
|
}
|
|
else
|
|
None
|
|
)
|
|
|
|
// And proceed with the ones which are still valid.
|
|
parseBracketLike parser inputString subParsersContinuing (contents :: exprsSoFar) rest
|
|
|
|
// We'll only consider bracket-like parsers which have already consumed all they want to consume
|
|
// if no other parser wanted to consume more. (That is, `if-then-else` is preferred to `if-then`
|
|
// as long as `if-then-else` succeeded, but if `if-then-else` failed, we'll fall back to `if-then`.)
|
|
if fromSubParsersContinuing.IsEmpty then
|
|
subParsersEnded
|
|
|> List.map (fun subParser ->
|
|
if subParser.ConsumeAfterFinalToken then
|
|
let contents, rest = parseInner parser inputString tokens 0
|
|
subParser.Construct (List.rev (contents :: exprsSoFar)), rest
|
|
else
|
|
subParser.Construct (List.rev exprsSoFar), tokens
|
|
)
|
|
else
|
|
fromSubParsersContinuing
|
|
|
|
/// The input string is only required so that the tokens have something to slice into.
|
|
and private parseInner
|
|
(parser : Parser<'tokenTag, 'token, 'expr>)
|
|
(inputString : string)
|
|
(tokens : 'token list)
|
|
(minBinding : int)
|
|
: 'expr * 'token list
|
|
=
|
|
match tokens with
|
|
| [] -> failwith "cannot parse an empty list of tokens"
|
|
| firstToken :: rest ->
|
|
|
|
let lhs, rest =
|
|
match parser.Atom inputString firstToken with
|
|
| Some token ->
|
|
printfn "Parsed an atom: %+A" token
|
|
token, rest
|
|
| None ->
|
|
|
|
match parser.BracketLike.TryGetValue (parser.GetTag firstToken) with
|
|
| true, parse ->
|
|
// This is an ambiguous parse if multiple parsers genuinely matched.
|
|
// (We already filter to the longest possible matching parser.)
|
|
match parseBracketLike parser inputString parse [] rest with
|
|
| [] -> failwithf "Failed to parse any bracket-like parsers for %+A" firstToken
|
|
| [ x ] -> x
|
|
| _ ->
|
|
failwithf
|
|
"Ambiguous parse for bracket-like construct. You should restrict the grammar. %+A"
|
|
firstToken
|
|
| false, _ ->
|
|
|
|
match parser.UnaryPrefix.TryGetValue (parser.GetTag firstToken) with
|
|
| true, (((), precedence), assemble) ->
|
|
printfn "Parsing a prefix op: %+A" firstToken
|
|
let rhs, rest = parseInner parser inputString rest precedence
|
|
printfn "Returning to parse of prefix op: %+A, remaining tokens: %+A" firstToken rest
|
|
assemble rhs, rest
|
|
| false, _ -> failwithf "didn't get an atom or prefix, got: %+A" firstToken
|
|
|
|
let rec go (lhs : 'expr) (tokens : 'token list) : 'expr * 'token list =
|
|
match tokens with
|
|
| [] -> lhs, []
|
|
| op :: rest ->
|
|
|
|
match parser.UnaryPostfix.TryGetValue (parser.GetTag op) with
|
|
| true, ((precedence, ()), construct) ->
|
|
if precedence < minBinding then
|
|
printfn "Hit a postfix op which does not bind: %+A" op
|
|
lhs, rest
|
|
else
|
|
printfn "Hit a postfix op which binds: %+A" op
|
|
go (construct lhs) rest
|
|
| false, _ ->
|
|
|
|
match parser.Infix.TryGetValue (parser.GetTag op) with
|
|
| true, ((leftBinding, rightBinding), construct) ->
|
|
if leftBinding < minBinding then
|
|
printfn "Hit an infix op which does not bind on the left: %+A" op
|
|
lhs, op :: rest
|
|
else
|
|
|
|
printfn "Hit an infix op which binds on the left: %+A" op
|
|
|
|
let rhs, remainingTokens = parseInner parser inputString rest rightBinding
|
|
|
|
go (construct lhs rhs) remainingTokens
|
|
| false, _ ->
|
|
// TODO: This could be function application!
|
|
lhs, op :: rest
|
|
|
|
go lhs rest
|
|
|
|
/// Execute the given parser against a string which was tokenised in the given way.
|
|
/// We give you the string so that you may have your tokens slice into it.
|
|
///
|
|
/// Returns the parsed expression, and any leftover tokens that may be trailing.
|
|
let execute<'tokenTag, 'token, 'expr when 'tokenTag : comparison>
|
|
(parser : Parser<'tokenTag, 'token, 'expr>)
|
|
(inputString : string)
|
|
(tokens : 'token list)
|
|
: 'expr * 'token list
|
|
=
|
|
parseInner parser inputString tokens 0
|