(* This file is a recursive descent parser for a simple let-expression language. However, it does not yet handle binary expressions. Uncommenting the appropriate line below (in the definition of function exp_p) will mean that the stack will overflow whenever we attempt to parse... *) (**************************************) (* Some utilies *) (**************************************) (* explode a string into a list of characters *) let explode(s:string) : char list = let rec loop i cs = if i < 0 then cs else loop (i - 1) ((String.get s i)::cs) in loop (String.length s - 1) [] (* collapse a list of characters into a string *) let implode(cs:char list) : string = let ss = List.map (fun c -> String.make 1 c) cs in String.concat "" ss (* just a wrapper for consing a pair of values *) let cons(x,y) = x::y (* Generalizes in two ways over our earlier regexps: First, we work over an arbitrary token type (not just characters). Second, we see that the combinators seem to work for at least some classes of recursive grammars -- notably, those without left recursion. Also introduce a left-biased alt, which allows us to implement greedy matching (i.e., longest match). *) module Parse = struct (* Here the type variable 'tok represents the input type to the grammar. For instance, a lexer will have char as the input type, whereas a parser will have token as the input type. Given a list of 'tok, we return a 'a and a list of the remaining 'tok (i.e., the 'tok that weren't consumed to produce the 'a). Note that we return *all* possible parses, i.e., return a list of ('a * ('tok list)). *) type ('tok,'a) grammar = 'tok list -> ('a * ('tok list)) list (* Given a grammar and list of tokens of the appropriate input type, runs the grammar, and filters out all of the results that don't use all of the tokens. *) let parse (r:('tok,'a) grammar) (tokens : 'tok list) : 'a list = let results = r tokens in let uses_all = List.filter (fun p -> snd p = []) results in List.map fst uses_all (* The following definitions are exactly the same as our development with regexps, except they are generalized for the input type. *) let ch(c:char) : (char,char) grammar = function | c'::rest -> if c = c' then [(c,rest)] else [] | _ -> [] let eps : ('tok,unit) grammar = fun (s:'tok list) -> [((), s)] let void : ('tok,'a) grammar = fun s -> [] let (%) (r:('tok,'a) grammar) (f:'a -> 'b) : ('tok,'b) grammar = fun s -> List.map (function (v,s') -> (f v,s')) (r s) let (++)(r1:('tok,'a) grammar) (r2:('tok,'a) grammar) : ('tok,'a) grammar = fun s -> (r1 s) @ (r2 s) let ($)(r1: ('tok,'a) grammar) (r2: ('tok,'b) grammar) : ('tok, 'a * 'b) grammar = fun s -> List.fold_right (function (v1,s1) -> fun res -> (List.fold_right (function (v2,s2) -> fun res -> ((v1,v2),s2)::res) (r2 s1) res)) (r1 s) [] let rec star(r:('tok,'a) grammar) : ('tok, 'a list) grammar = fun s -> (((r $ (star r)) % cons) ++ (eps % (fun _ -> []))) s let plus(r: ('tok,'a) grammar) : ('tok,'a list) grammar = (r $ (star r)) % cons (* This is similar to "++" except that it prefers to match the first grammar and only if that fails does it try to match against the second grammar. So it's a left-biased alternative. *) let (>>)(r1:('tok,'a) grammar) (r2:('tok,'a) grammar) : ('tok,'a) grammar = fun ts -> match r1 ts with | [] -> r2 ts | s -> s (* This is similar to start but uses the left-biased alternative to prefer a longer match over a shorter one. *) let rec greedy_star (r:('tok,'a) grammar) : ('tok, 'a list) grammar = fun s -> (((r $ (greedy_star r)) % cons) >> (eps % (fun _ -> []))) s let plus(r: ('tok,'a) grammar) : ('tok,'a list) grammar = (r $ (star r)) % cons (* And greedy plus also goes for a longest match. *) let greedy_plus(r: ('tok,'a) grammar) : ('tok,'a list) grammar = (r $ (greedy_star r)) % cons let (%%) (r:('tok,'a) grammar) (v:'b) : ('tok,'b) grammar = r % (fun _ -> v) let opt(r:('tok,'a) grammar) : ('tok,'a option) grammar = (r % (fun x -> Some x)) ++ (eps %% None);; let alts (rs: ('tok,'a) grammar list) : ('tok,'a) grammar = List.fold_right (++) rs void let cats (rs: ('tok,'a) grammar list) : ('tok,'a list) grammar = List.fold_right (fun r1 r2 -> (r1 $ r2) % cons) rs (eps % (fun _ -> [])) let digit : (char,char) grammar = alts (List.map (fun i -> ch (char_of_int (i + (int_of_char '0')))) [0;1;2;3;4;5;6;7;8;9]) (* We now define naturals using greedy plus to get the longest integer we can out of it. This avoids ambiguity later on in our definition of tokens. *) let natural : (char,int) grammar = (greedy_plus digit) % (List.fold_left (fun a c -> a*10 + (int_of_char c) - (int_of_char '0')) 0) let integer : (char,int) grammar = natural ++ (((ch '-') $ natural) % (fun (_,n) -> -n)) let rec gen(i:int)(stop:int) : int list = if i > stop then [] else i::(gen (i+1) stop) let lc_alpha : (char,char) grammar = let chars = List.map char_of_int (gen (int_of_char 'a') (int_of_char 'z')) in alts (List.map ch chars) let uc_alpha : (char,char) grammar = let chars = List.map char_of_int (gen (int_of_char 'A') (int_of_char 'Z')) in alts (List.map ch chars) let identifier : (char,string) grammar = (lc_alpha $ (greedy_star (alts [lc_alpha; uc_alpha; ch '_'; digit]))) % (fun (c,s) -> implode (c::s)) type token = INT of int | ID of string | LET | IN | PLUS | TIMES | MINUS | DIV | LPAREN | RPAREN | EQ ;; let keywords = [ ("let",LET) ; ("in",IN) ] (* here are the token grammars for our little ML language *) let token_grammars = [ integer % (fun i -> INT i) ; identifier % (fun s -> try List.assoc s keywords with Not_found -> ID s) ; (ch '+') %% PLUS ; (ch '*') %% TIMES ; (ch '-') %% MINUS ; (ch '/') %% DIV ; (ch '(') %% LPAREN ; (ch ')') %% RPAREN ; (ch '=') %% EQ ; ];; (* so we can define a grammar to match any legal token *) let token = alts token_grammars ;; (* white space *) let ws = alts [ch ' ' ; ch '\n' ; ch '\r' ; ch '\t'] (* one or more white space *) let wsp = greedy_plus ws (* zero or more white spaces *) let wsz = greedy_star ws (* document -- zero or more tokens separated by one or more white spaces. This is actually not quite right. For instance, we need white space between identifiers, but we don't need whitespace between other kinds of tokens. Helps us avoid more complicated combinators that ensure longest match... *) let doc : (char,token list) grammar = ((wsz $ (star ((token $ wsz) % fst))) % snd) (* Define lex in terms of our parse function above. *) let lex(s:string) : token list = match parse doc (explode s) with | ts::[] -> ts | [] -> raise (Failure "lexing failure") | _ -> raise (Failure "ambiguous") (* Here, we define a parser taking us from tokens to the abstract syntax for a little fragment of OCaml. *) type var = string type binop = Plus | Minus | Times | Div type exp = | Int of int | Binop of exp * binop * exp | Var of var | Let of var * exp * exp (* Sort of a generalization of the "ch" constructor. Here, we accept any input token for which the function pred returns SOMEthing. The output is the SOMEthing returned. If pred returns None on that input token, or there's no token, we reject. *) let satisfies (pred:'a -> 'b option) : ('a,'b) grammar = fun ts -> match ts with | [] -> [] | t::ts -> (match pred t with | None -> [] | Some v -> [(v,ts)]) (* Accepts any (INT i) token and returns i *) let int_p : (token,exp) grammar = satisfies (function (INT i) -> Some i | _ -> None) % (fun i -> Int i) (* Accepts any (ID x) token. *) let var_p : (token,string) grammar = satisfies (function (ID x) -> Some x | _ -> None) (* Accepts any token equal to t and returns unit. *) let tok (t:token) : (token,unit) grammar = satisfies (function t' -> if t = t' then Some () else None) let mkbinop b ((e1,_),e2) = Binop (e1,b,e2) (* Note that this is a recursive grammar. But note also that we must explicitly make it a function so that Ocaml is happy with the "let rec". *) let rec exp_p : (token,exp) grammar = fun s -> alts [ (* integers *) int_p ; (* variables *) (var_p % (fun x -> Var x)) ; (* let x = exp in exp *) (tok LET $ var_p $ tok EQ $ exp_p $ tok IN $ exp_p) % (function (((((_,x),_),e1),_),e2) -> Let(x,e1,e2)) ; (* '(' exp ')' *) (tok LPAREN $ exp_p $ tok RPAREN) % (function ((_,e),_) -> e) (* Unfortunately, adding in this next line won't work due to the left recursion. We must refactor the grammar to avoid left recursion for our combinators. *) ; (* exp '+' exp *) (* (exp_p $ tok PLUS $ exp_p) % mkbinop Plus ; *) ] s let exp_parse (s:string) = let ts = lex s in exp_p ts end