diff --git a/.ocamlformat b/.ocamlformat index ff63c583..0de0912e 100644 --- a/.ocamlformat +++ b/.ocamlformat @@ -1,4 +1,4 @@ -version=0.19.0 +#version=0.19.0 profile = conventional break-separators = after space-around-lists = false diff --git a/src/lib/sedlexing.ml b/src/lib/sedlexing.ml index adc415c7..116772b2 100644 --- a/src/lib/sedlexing.ml +++ b/src/lib/sedlexing.ml @@ -36,6 +36,7 @@ type lexbuf = { mutable marked_bol : int; mutable marked_line : int; mutable marked_val : int; + mutable marked_path : int list; mutable filename : string; mutable finished : bool; } @@ -58,6 +59,7 @@ let empty_lexbuf = marked_bol = 0; marked_line = 0; marked_val = 0; + marked_path = []; filename = ""; finished = false; } @@ -154,23 +156,24 @@ let __private__next_int lexbuf : int = Uchar.to_int ret end -let mark lexbuf i = +let mark lexbuf i path = lexbuf.marked_pos <- lexbuf.pos; lexbuf.marked_bol <- lexbuf.curr_bol; lexbuf.marked_line <- lexbuf.curr_line; - lexbuf.marked_val <- i + lexbuf.marked_val <- i; + lexbuf.marked_path <- path let start lexbuf = lexbuf.start_pos <- lexbuf.pos; lexbuf.start_bol <- lexbuf.curr_bol; lexbuf.start_line <- lexbuf.curr_line; - mark lexbuf (-1) + mark lexbuf (-1) [] let backtrack lexbuf = lexbuf.pos <- lexbuf.marked_pos; lexbuf.curr_bol <- lexbuf.marked_bol; lexbuf.curr_line <- lexbuf.marked_line; - lexbuf.marked_val + (lexbuf.marked_val, lexbuf.marked_path) let rollback lexbuf = lexbuf.pos <- lexbuf.start_pos; @@ -189,6 +192,7 @@ let lexeme lexbuf = Array.sub lexbuf.buf lexbuf.start_pos (lexbuf.pos - lexbuf.start_pos) let lexeme_char lexbuf pos = lexbuf.buf.(lexbuf.start_pos + pos) +let lexeme_code lexbuf pos = Uchar.to_int lexbuf.buf.(lexbuf.start_pos + pos) let lexing_positions lexbuf = let start_p = diff --git a/src/lib/sedlexing.mli b/src/lib/sedlexing.mli index 9df2f17f..104afda5 100644 --- a/src/lib/sedlexing.mli +++ b/src/lib/sedlexing.mli @@ -114,6 +114,10 @@ val lexeme : lexbuf -> Uchar.t array the matched string. *) val lexeme_char : lexbuf -> int -> Uchar.t +(** [Sedlexing.lexeme_code lexbuf pos] returns code point number [pos] in + the matched string. *) +val lexeme_code : lexbuf -> int -> int + (** [Sedlexing.sub_lexeme lexbuf pos len] returns a substring of the string matched by the regular expression as an array of Unicode code point. *) val sub_lexeme : lexbuf -> int -> int -> Uchar.t array @@ -151,19 +155,19 @@ val next : lexbuf -> Uchar.t option lexer buffer and increments to current position. If the input stream is exhausted, the function returns -1. If a ['\n'] is encountered, the tracked line number is incremented. - + This is a private API, it should not be used by code using this module's API and can be removed at any time. *) val __private__next_int : lexbuf -> int -(** [mark lexbuf i] stores the integer [i] in the internal - slot. The backtrack position is set to the current position. *) -val mark : lexbuf -> int -> unit +(** [mark lexbuf i path] stores the integer [i] and the list [path] in the + internal slot. The backtrack position is set to the current position. *) +val mark : lexbuf -> int -> int list -> unit -(** [backtrack lexbuf] returns the value stored in the +(** [backtrack lexbuf] returns the value and path stored in the internal slot of the buffer, and performs backtracking (the current position is set to the value of the backtrack position). *) -val backtrack : lexbuf -> int +val backtrack : lexbuf -> int * int list (** [with_tokenizer tokenizer lexbuf] given a lexer and a lexbuf, returns a generator of tokens annotated with positions. diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index 172ad71c..6824669d 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -104,6 +104,14 @@ module StringMap = Map.Make (struct let compare = compare end) +(* Lexeme aliases *) + +module StrLocSet = Set.Make (struct + type t = string loc + + let compare a b = compare a.txt b.txt +end) + let builtin_regexps = List.fold_left (fun acc (n, c) -> StringMap.add n (Sedlex.chars c) acc) @@ -189,16 +197,52 @@ let best_final final = !fin let state_fun state = Printf.sprintf "__sedlex_state_%i" state +let trace_fun i = Printf.sprintf "__sedlex_trace_%i" i + +let set_trace_info auto offsets = + let new_auto = + Array.map + (fun (trans, final) -> + let enable_trace = + Array.mem true + (Array.map2 (fun f o -> f && Option.is_some o) final offsets) + in + (trans, final, enable_trace)) + auto + in + Array.fold_left + (fun auto _ -> + Array.map + (fun ((trans, final, enable_trace) as x) -> + if enable_trace then x + else + ( trans, + final, + Array.exists + (fun (_, j) -> + let _, _, enable_trace = auto.(j) in + enable_trace) + trans )) + auto) + new_auto new_auto let call_state lexbuf auto state = - let trans, final = auto.(state) in + let loc = default_loc in + let trans, final, enable_trace = auto.(state) in if Array.length trans = 0 then ( match best_final final with - | Some i -> eint ~loc:default_loc i + | Some i when enable_trace -> + [%expr [%e eint ~loc i], [%e eint ~loc state] :: __sedlex_path] + | Some i -> [%expr [%e eint ~loc i], []] | None -> assert false) - else appfun (state_fun state) [evar ~loc:default_loc lexbuf] + else begin + if enable_trace then + appfun (state_fun state) + [evar ~loc lexbuf; [%expr [%e eint ~loc state] :: __sedlex_path]] + else appfun (state_fun state) [evar ~loc lexbuf] + end -let gen_state lexbuf auto i (trans, final) = +let gen_state lexbuf auto i (trans, final, enable_trace) = let loc = default_loc in let partition = Array.map fst trans in let cases = @@ -225,17 +269,23 @@ let gen_state lexbuf auto i (trans, final) = value_binding ~loc ~pat:(pvar ~loc (state_fun i)) ~expr: - (pexp_function ~loc - [case ~lhs:(pvar ~loc lexbuf) ~guard:None ~rhs:body]); + (if enable_trace then + [%expr fun [%p pvar ~loc lexbuf] __sedlex_path -> [%e body]] + else [%expr fun [%p pvar ~loc lexbuf] -> [%e body]]); ] in match best_final final with | None -> ret (body ()) | Some _ when Array.length trans = 0 -> [] + | Some i when enable_trace -> + ret + [%expr + Sedlexing.mark [%e evar ~loc lexbuf] [%e eint ~loc i] __sedlex_path; + [%e body ()]] | Some i -> ret [%expr - Sedlexing.mark [%e evar ~loc lexbuf] [%e eint ~loc i]; + Sedlexing.mark [%e evar ~loc lexbuf] [%e eint ~loc i] []; [%e body ()]] let gen_recflag auto = @@ -243,34 +293,240 @@ let gen_recflag auto = in states with no further transitions. *) try Array.iter - (fun (trans_i, _) -> + (fun (trans_i, _, _) -> Array.iter (fun (_, j) -> - let trans_j, _ = auto.(j) in + let trans_j, _, _ = auto.(j) in if Array.length trans_j > 0 then raise Exit) trans_i) auto; Nonrecursive with Exit -> Recursive +let gen_offsets traces i = function + | (_, []), _ -> None + | (_, aliases), _ -> + let n = List.length aliases in + let _, trans, finals = traces.(i) in + let cases = + List.map (fun ({ actions; _ } : Sedlex.trans_case) -> actions) trans + @ List.map (fun ({ actions; _ } : Sedlex.final_case) -> actions) finals + in + let action2cases = Hashtbl.create n in + List.iteri + (fun i actions -> + List.iter + (fun action -> + try + let offsets = Hashtbl.find action2cases action in + Hashtbl.replace action2cases action (i :: offsets) + with Not_found -> Hashtbl.add action2cases action [i]) + actions) + cases; + let counter = ref 0 in + let alias2offset = Hashtbl.create n in + let cases2offset = Hashtbl.create 31 in + Hashtbl.iter + (fun action offsets -> + try + let i = Hashtbl.find cases2offset offsets in + Hashtbl.add alias2offset action i + with Not_found -> + Hashtbl.add cases2offset offsets !counter; + Hashtbl.add alias2offset action !counter; + incr counter) + action2cases; + Some (!counter, alias2offset) + +let gen_cset ~loc x char_set = + let interval a b = + [%expr + [%e eint ~loc a] <= [%e evar ~loc x] + && [%e evar ~loc x] <= [%e eint ~loc b]] + in + match char_set with + | (a, b) :: l -> + List.fold_left + (fun acc (a, b) -> [%expr [%e acc] || [%e interval a b]]) + (interval a b) l + | [] -> assert false + +let gen_trace lexbuf traces i = function + | None -> [] + | Some (offsets_num, action_offsets) -> + let loc = default_loc in + let initial, trans, finals = traces.(i) in + let offset_array = + value_binding ~loc + ~pat:[%pat? __sedlex_offsets] + ~expr:(pexp_array ~loc (List.init offsets_num (fun _ -> [%expr -1]))) + in + let find_offset_idx action = Hashtbl.find action_offsets action in + let aux_fun = + let gen_action e offset_idx = + [%expr + __sedlex_offsets.([%e eint ~loc offset_idx]) <- __sedlex_pos; + [%e e]] + in + let unreachable_case = + case ~lhs:[%pat? _] ~guard:None ~rhs:[%expr assert false] + in + let trans_cases = + let dup_case = Hashtbl.create (List.length trans) in + List.iter + (fun { Sedlex.curr_state; curr_node; prev_state; _ } -> + let key = (curr_state, curr_node, prev_state) in + if Hashtbl.mem dup_case key then + Hashtbl.replace dup_case key false + else Hashtbl.add dup_case key true) + trans; + List.map + (fun { + Sedlex.curr_state; + curr_node; + prev_state; + prev_node; + char_set; + actions; + } -> + let lhs = + ppat_tuple ~loc + [ + pint ~loc curr_state; + pint ~loc curr_node; + pint ~loc prev_state; + ] + in + let guard = + if Hashtbl.find dup_case (curr_state, curr_node, prev_state) + then None + else Some (gen_cset ~loc "__sedlex_code" char_set) + in + let rhs = + let call_rest = + [%expr + __sedlex_aux (__sedlex_pos - 1) [%e eint ~loc prev_state] + [%e eint ~loc prev_node] __sedlex_rest] + in + List.fold_left gen_action call_rest + (List.map find_offset_idx actions |> List.sort_uniq compare) + in + case ~lhs ~guard ~rhs) + trans + in + let final_cases = + List.map + (fun { Sedlex.curr_node; actions } -> + let lhs = pint ~loc curr_node in + let rhs = + List.fold_left gen_action [%expr ()] + (List.map find_offset_idx actions |> List.sort_uniq compare) + in + case ~lhs ~guard:None ~rhs) + finals + in + value_binding ~loc + ~pat:[%pat? __sedlex_aux] + ~expr: + [%expr + fun __sedlex_pos __sedlex_curr_state __sedlex_curr_node -> + function + | [] -> + [%e + pexp_match ~loc [%expr __sedlex_curr_node] + (final_cases @ [unreachable_case])] + | __sedlex_prev_state :: __sedlex_rest -> + let __sedlex_code = + Sedlexing.lexeme_code [%e evar ~loc lexbuf] + (__sedlex_pos - 1) + in + [%e + pexp_match ~loc + [%expr + __sedlex_curr_state, + __sedlex_curr_node, + __sedlex_prev_state] + (trans_cases @ [unreachable_case])]] + in + [ + value_binding ~loc + ~pat:(pvar ~loc (trace_fun i)) + ~expr: + [%expr + fun [%p pvar ~loc lexbuf] __sedlex_path -> + [%e + pexp_let ~loc Nonrecursive [offset_array] + @@ pexp_let ~loc Recursive [aux_fun] + @@ [%expr + (match __sedlex_path with + | __sedlex_curr_state :: __sedlex_rest -> + __sedlex_aux + (Sedlexing.lexeme_length [%e evar ~loc lexbuf]) + __sedlex_curr_state [%e eint ~loc initial] + __sedlex_rest + | _ -> assert false); + __sedlex_offsets]]]; + ] + +let gen_aliases lexbuf i e aliases = function + | None -> e + | Some (_, action_offsets) -> + let loc = default_loc in + pexp_let ~loc Nonrecursive + [ + value_binding ~loc + ~pat:[%pat? __sedlex_offsets] + ~expr: + (appfun (trace_fun i) [evar ~loc lexbuf; [%expr __sedlex_path]]); + ] + @@ pexp_let ~loc Nonrecursive + (List.map + (fun { txt = alias; loc } -> + let start = Hashtbl.find action_offsets (Sedlex.Start alias) in + let stop = Hashtbl.find action_offsets (Stop alias) in + value_binding ~loc ~pat:(pvar ~loc alias) + ~expr: + [%expr + __sedlex_offsets.([%e eint ~loc start]), + __sedlex_offsets.([%e eint ~loc stop]) + - __sedlex_offsets.([%e eint ~loc start])]) + aliases) + @@ e + let gen_definition lexbuf l error = let loc = default_loc in - let brs = Array.of_list l in - let auto = Sedlex.compile (Array.map fst brs) in + let brs = + Array.of_list + (List.map (fun ((r, s), e) -> ((r, StrLocSet.elements s), e)) l) + in + let auto, traces = Sedlex.compile (Array.map (fun ((r, _), _) -> r) brs) in + let offsets = Array.mapi (gen_offsets traces) brs in + let auto = set_trace_info auto offsets in let cases = Array.to_list (Array.mapi - (fun i (_, e) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:e) + (fun i ((_, aliases), e) -> + case ~lhs:(pint ~loc i) ~guard:None + ~rhs:(gen_aliases lexbuf i e aliases offsets.(i))) brs) in let states = Array.mapi (gen_state lexbuf auto) auto in let states = List.flatten (Array.to_list states) in - pexp_let ~loc (gen_recflag auto) states - (pexp_sequence ~loc - [%expr Sedlexing.start [%e evar ~loc lexbuf]] - (pexp_match ~loc - (appfun (state_fun 0) [evar ~loc lexbuf]) - (cases @ [case ~lhs:(ppat_any ~loc) ~guard:None ~rhs:error]))) + let traces = Array.mapi (gen_trace lexbuf traces) offsets in + let traces = List.flatten (Array.to_list traces) in + pexp_let ~loc (gen_recflag auto) (states @ traces) + @@ [%expr + Sedlexing.start [%e evar ~loc lexbuf]; + let __sedlex_result, __sedlex_path = + [%e + let _, _, enable_trace = auto.(0) in + if enable_trace then + appfun (state_fun 0) [evar ~loc lexbuf; [%expr [0]]] + else appfun (state_fun 0) [evar ~loc lexbuf]] + in + [%e + pexp_match ~loc [%expr __sedlex_result] + (cases @ [case ~lhs:(ppat_any ~loc) ~guard:None ~rhs:error])]] (* Lexer specification parser *) @@ -296,13 +552,13 @@ let rec repeat r = function | 0, m -> Sedlex.alt Sedlex.eps (Sedlex.seq r (repeat r (0, m - 1))) | n, m -> Sedlex.seq r (repeat r (n - 1, m - 1)) -let regexp_of_pattern env = +let regexp_of_pattern allow_alias env = let rec char_pair_op func name p tuple = (* Construct something like Sub(a,b) *) match tuple with | Some { ppat_desc = Ppat_tuple [p0; p1] } -> begin - match func (aux p0) (aux p1) with - | Some r -> r + match func (fst @@ aux false p0) (fst @@ aux false p1) with + | Some r -> (r, StrLocSet.empty) | None -> err p.ppat_loc @@ "the " ^ name ^ " operator can only applied to single-character length \ @@ -311,16 +567,37 @@ let regexp_of_pattern env = | _ -> err p.ppat_loc @@ "the " ^ name ^ " operator requires two arguments, like " ^ name ^ "(a,b)" - and aux p = + and aux allow_alias p = + let loc = p.ppat_loc in (* interpret one pattern node *) match p.ppat_desc with - | Ppat_or (p1, p2) -> Sedlex.alt (aux p1) (aux p2) + | Ppat_or (p1, p2) -> + let r1, s1 = aux allow_alias p1 in + let r2, s2 = aux allow_alias p2 in + if not (StrLocSet.equal s1 s2) then begin + let x = + try StrLocSet.choose (StrLocSet.diff s1 s2) + with Not_found -> StrLocSet.choose (StrLocSet.diff s2 s1) + in + err loc @@ "variable " ^ x.txt + ^ " must occur on both sides of this | pattern" + end; + (Sedlex.alt r1 r2, s1) | Ppat_tuple (p :: pl) -> - List.fold_left (fun r p -> Sedlex.seq r (aux p)) (aux p) pl + List.fold_left + (fun (r1, s1) p -> + let r2, s2 = aux allow_alias p in + if not (StrLocSet.disjoint s1 s2) then begin + let x = StrLocSet.choose (StrLocSet.inter s1 s2) in + err loc @@ "variable " ^ x.txt + ^ " is bound several times in this matching" + end; + (Sedlex.seq r1 r2, StrLocSet.union s1 s2)) + (aux allow_alias p) pl | Ppat_construct ({ txt = Lident "Star" }, Some (_, p)) -> - Sedlex.rep (aux p) + (Sedlex.rep (fst @@ aux false p), StrLocSet.empty) | Ppat_construct ({ txt = Lident "Plus" }, Some (_, p)) -> - Sedlex.plus (aux p) + (Sedlex.plus (fst @@ aux false p), StrLocSet.empty) | Ppat_construct ( { txt = Lident "Rep" }, Some @@ -340,7 +617,8 @@ let regexp_of_pattern env = | Pconst_integer (i1, _), Pconst_integer (i2, _) -> let i1 = int_of_string i1 in let i2 = int_of_string i2 in - if 0 <= i1 && i1 <= i2 then repeat (aux p0) (i1, i2) + if 0 <= i1 && i1 <= i2 then + (repeat (fst @@ aux false p0) (i1, i2), StrLocSet.empty) else err p.ppat_loc "Invalid range for Rep operator" | _ -> err p.ppat_loc "Rep must take an integer constant or interval" @@ -348,12 +626,12 @@ let regexp_of_pattern env = | Ppat_construct ({ txt = Lident "Rep" }, _) -> err p.ppat_loc "the Rep operator takes 2 arguments" | Ppat_construct ({ txt = Lident "Opt" }, Some (_, p)) -> - Sedlex.alt Sedlex.eps (aux p) + (Sedlex.alt Sedlex.eps (fst @@ aux false p), StrLocSet.empty) | Ppat_construct ({ txt = Lident "Compl" }, arg) -> begin match arg with | Some (_, p0) -> begin - match Sedlex.compl (aux p0) with - | Some r -> r + match Sedlex.compl (fst @@ aux false p0) with + | Some r -> (r, StrLocSet.empty) | None -> err p.ppat_loc "the Compl operator can only applied to a \ @@ -379,36 +657,48 @@ let regexp_of_pattern env = for i = 0 to String.length s - 1 do c := Cset.union !c (Cset.singleton (Char.code s.[i])) done; - Sedlex.chars !c + (Sedlex.chars !c, StrLocSet.empty) | _ -> err p.ppat_loc "the Chars operator requires a string argument") | Ppat_interval (i_start, i_end) -> begin match (i_start, i_end) with | Pconst_char c1, Pconst_char c2 -> - Sedlex.chars (Cset.interval (Char.code c1) (Char.code c2)) + ( Sedlex.chars (Cset.interval (Char.code c1) (Char.code c2)), + StrLocSet.empty ) | Pconst_integer (i1, _), Pconst_integer (i2, _) -> - Sedlex.chars - (Cset.interval - (codepoint (int_of_string i1)) - (codepoint (int_of_string i2))) + ( Sedlex.chars + (Cset.interval + (codepoint (int_of_string i1)) + (codepoint (int_of_string i2))), + StrLocSet.empty ) | _ -> err p.ppat_loc "this pattern is not a valid interval regexp" end | Ppat_constant const -> begin match const with - | Pconst_string (s, _, _) -> regexp_for_string s - | Pconst_char c -> regexp_for_char c + | Pconst_string (s, _, _) -> (regexp_for_string s, StrLocSet.empty) + | Pconst_char c -> (regexp_for_char c, StrLocSet.empty) | Pconst_integer (i, _) -> - Sedlex.chars (Cset.singleton (codepoint (int_of_string i))) + ( Sedlex.chars (Cset.singleton (codepoint (int_of_string i))), + StrLocSet.empty ) | _ -> err p.ppat_loc "this pattern is not a valid regexp" end | Ppat_var { txt = x } -> begin - try StringMap.find x env + try (StringMap.find x env, StrLocSet.empty) with Not_found -> err p.ppat_loc (Printf.sprintf "unbound regexp %s" x) end + | Ppat_alias (_, { loc }) when not allow_alias -> + err loc @@ "alias is not allowed inside constructors" + | Ppat_alias (p, ({ txt = x } as x_loc)) -> + let r, s = aux allow_alias p in + if StrLocSet.mem x_loc s then begin + err loc @@ "variable " ^ x + ^ " is bound several times in this matching" + end; + (Sedlex.alias r x, StrLocSet.add x_loc s) | _ -> err p.ppat_loc "this pattern is not a valid regexp" in - aux + aux allow_alias let previous = ref [] let regexps = ref [] @@ -420,7 +710,7 @@ let mapper = val env = builtin_regexps method define_regexp name p = - {} + {} method! expression e = match e with @@ -446,7 +736,7 @@ let mapper = List.map (function | { pc_lhs = p; pc_rhs = e; pc_guard = None } -> - (regexp_of_pattern env p, super#expression e) + (regexp_of_pattern true env p, super#expression e) | { pc_guard = Some e } -> err e.pexp_loc "'when' guards are not supported") cases diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml index 9a5f53ed..42061891 100644 --- a/src/syntax/sedlex.ml +++ b/src/syntax/sedlex.ml @@ -6,10 +6,13 @@ module Cset = Sedlex_cset (* NFA *) +type action = Start of string | Stop of string + type node = { id : int; mutable eps : node list; mutable trans : (Cset.t * node) list; + mutable action : action option; } (* Compilation regexp -> NFA *) @@ -20,7 +23,7 @@ let cur_id = ref 0 let new_node () = incr cur_id; - { id = !cur_id; eps = []; trans = [] } + { id = !cur_id; eps = []; trans = []; action = None } let seq r1 r2 succ = r1 (r2 succ) @@ -72,6 +75,15 @@ let pair_op f r0 r1 = let subtract = pair_op Cset.difference let intersection = pair_op Cset.intersection +let alias r alias succ = + let n = new_node () in + let s = new_node () in + s.eps <- [succ]; + s.action <- Some (Stop alias); + n.action <- Some (Start alias); + n.eps <- [r s]; + n + let compile_re re = let final = new_node () in (re final, final) @@ -117,6 +129,86 @@ let transition (state : state) = Array.sort (fun (c1, _) (c2, _) -> compare c1 c2) t; t +(* Restore NFA path from DFA path *) + +type trans_case = { + curr_state : int; + curr_node : int; + prev_state : int; + prev_node : int; + char_set : Sedlex_cset.t; + actions : action list; +} + +type final_case = { curr_node : int; actions : action list } + +let compile_traces states (start, final) = + let append_action actions = function + | None -> actions + | Some action -> action :: actions + in + let relevant_nodes = Hashtbl.create 31 in + let rec aux node = + if not (Hashtbl.mem relevant_nodes node.id) then begin + Hashtbl.add relevant_nodes node.id (); + List.iter aux node.eps; + List.iter (fun (_, n) -> aux n) node.trans + end + in + aux start; + let is_relevant node = Hashtbl.mem relevant_nodes node in + let first_node = final.id in + let trans_cases = + let cases = Hashtbl.create 31 in + Hashtbl.iter + (fun from_state j -> + Hashtbl.iter + (fun to_state i -> + List.iter + (fun from_node -> + let node_j = from_node.id in + if is_relevant node_j then ( + let rec dfs cset actions to_node = + let node_i = to_node.id in + if is_relevant node_i then + if not (Hashtbl.mem cases (i, node_i, j, cset)) then begin + let actions = append_action actions to_node.action in + if to_node.trans <> [] || to_node == final then + Hashtbl.add cases (i, node_i, j, cset) + { + curr_state = i; + curr_node = node_i; + prev_state = j; + prev_node = node_j; + char_set = cset; + actions; + }; + List.iter (dfs cset actions) to_node.eps + end + in + List.iter + (fun (cset, to_node) -> + if List.mem to_node to_state then dfs cset [] to_node) + from_node.trans)) + from_state) + states) + states; + Hashtbl.to_seq_values cases |> List.of_seq + in + let final_cases = + let rec dfs actions cases node = + let actions = append_action actions node.action in + let cases = + if node.trans <> [] || node == final then + { curr_node = node.id; actions } :: cases + else cases + in + List.fold_left (dfs actions) cases node.eps + in + dfs [] [] start + in + (first_node, trans_cases, final_cases) + let compile rs = let rs = Array.map compile_re rs in let counter = ref 0 in @@ -138,4 +230,5 @@ let compile rs = Array.iter (fun (i, _) -> init := add_node !init i) rs; let i = aux !init in assert (i = 0); - Array.init !counter (Hashtbl.find states_def) + ( Array.init !counter (Hashtbl.find states_def), + Array.map (compile_traces states) rs ) diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli index d809010e..8591b377 100644 --- a/src/syntax/sedlex.mli +++ b/src/syntax/sedlex.mli @@ -11,6 +11,7 @@ val rep : regexp -> regexp val plus : regexp -> regexp val eps : regexp val compl : regexp -> regexp option +val alias : regexp -> string -> regexp (* If the argument is a single [chars] regexp, returns a regexp which matches the complement set. Otherwise returns [None]. *) @@ -22,4 +23,20 @@ val intersection : regexp -> regexp -> regexp option (* If each argument is a single [chars] regexp, returns a regexp which matches the intersection set. Otherwise returns [None]. *) -val compile : regexp array -> ((Sedlex_cset.t * int) array * bool array) array +type action = Start of string | Stop of string + +type trans_case = { + curr_state : int; + curr_node : int; + prev_state : int; + prev_node : int; + char_set : Sedlex_cset.t; + actions : action list; +} + +type final_case = { curr_node : int; actions : action list } + +val compile : + regexp array -> + ((Sedlex_cset.t * int) array * bool array) array + * (int * trans_case list * final_case list) array diff --git a/test/dune b/test/dune new file mode 100644 index 00000000..612192c0 --- /dev/null +++ b/test/dune @@ -0,0 +1,7 @@ +(library + (name test) + (modules number_lexer misc) + (inline_tests) + (libraries sedlex sedlex_ppx ppx_expect) + (preprocess + (pps sedlex.ppx ppx_expect))) diff --git a/test/misc.ml b/test/misc.ml new file mode 100644 index 00000000..fca91181 --- /dev/null +++ b/test/misc.ml @@ -0,0 +1,96 @@ +let rec token buf = + let lex buf = Sedlexing.Latin1.lexeme buf in + let sub (a, b) = Sedlexing.Latin1.sub_lexeme buf a b in + match%sedlex buf with + | (((((("a" as a), "b") as b), "c") as c), "d") as d -> + Printf.printf "1. %s: %s %s %s %s\n" (lex buf) (sub a) (sub b) (sub c) + (sub d); + token buf + | ("d", (("c", (("b", ("a" as a)) as b)) as c)) as d -> + Printf.printf "2. %s: %s %s %s %s\n" (lex buf) (sub a) (sub b) (sub c) + (sub d); + token buf + | (('a' as a), ('b' as b as d) | ('b' as b), ('a' as a as d)) as c -> + Printf.printf "3. %s: %s %s %s %s\n" (lex buf) (sub a) (sub b) (sub c) + (sub d); + token buf + | (Plus "ab" as a), (Plus "ab" as b) -> + Printf.printf "4. %s: %s %s\n" (lex buf) (sub a) (sub b); + token buf + | 'c', (Star "dc" as s), 'd' -> + Printf.printf "5. %s: %s\n" (lex buf) (sub s); + token buf + | (Star "dcdc" as a), ("dcdc" as b) -> + Printf.printf "6. %s: %s %s\n" (lex buf) (sub a) (sub b); + token buf + | "dc" as s -> + Printf.printf "7. %s: %s\n" (lex buf) (sub s); + token buf + | ("abc" | "def") as s -> + Printf.printf "8. %s: %s\n" (lex buf) (sub s); + token buf + | (Plus "a" as a), (Plus "b" as b), "c" -> + Printf.printf "9. %s: %s %s\n" (lex buf) (sub a) (sub b); + token buf + | (Star "a" as a), (Plus "b" as b), "e" + | (Plus "a" as b), (Plus "b" as a), "d" -> + Printf.printf "10. %s: %s %s\n" (lex buf) (sub a) (sub b); + token buf + | ( ((Plus "d" as a) | (Plus "e" as a)), + "f", + ((Plus "d" as b) | (Plus "e" as b)) ) -> + Printf.printf "11. %s: %s %s\n" (lex buf) (sub a) (sub b); + token buf + (* {Others} *) + | Plus xml_blank -> token buf + | 128 .. 255 -> print_endline "Non ASCII" + | eof -> print_endline "EOF" + | _ -> failwith "Unexpected character" + +let%expect_test _ = + let lexbuf = + Sedlexing.Latin1.from_string + {| + abcd + dcba + ab + ba + ababababab + cdcdcdcdcd + dcdc + dcdcdcdcdc + abc + def + aaaabbbc + aaaabbbbd + aaaabbbbe + dfe + efd + ddddfdddd + eeeefeeee + |} + in + token lexbuf; + + [%expect + {| + 1. abcd: a ab abc abcd + 2. dcba: a ba cba dcba + 3. ab: a b ab b + 3. ba: a b ba a + 4. ababababab: ab abababab + 5. cdcdcdcdcd: dcdcdcdc + 6. dcdc: dcdc + 6. dcdcdcdc: dcdc dcdc + 7. dc: dc + 8. abc: abc + 8. def: def + 9. aaaabbbc: aaaa bbb + 10. aaaabbbbd: bbbb aaaa + 10. aaaabbbbe: aaaa bbbb + 11. dfe: d e + 11. efd: e d + 11. ddddfdddd: dddd dddd + 11. eeeefeeee: eeee eeee + EOF + |}] diff --git a/test/number_lexer.ml b/test/number_lexer.ml new file mode 100644 index 00000000..7eec8055 --- /dev/null +++ b/test/number_lexer.ml @@ -0,0 +1,121 @@ +let digit_2 = [%sedlex.regexp? '0' .. '1'] +let digit_8 = [%sedlex.regexp? '0' .. '7'] +let digit = [%sedlex.regexp? '0' .. '9'] +let digit_16 = [%sedlex.regexp? digit | 'A' .. 'F' | 'a' .. 'f'] +let prefix_2 = [%sedlex.regexp? "0b"] +let prefix_8 = [%sedlex.regexp? "0o"] +let prefix_16 = [%sedlex.regexp? "0x"] +let sign = [%sedlex.regexp? "" | '+' | '-'] +let sign_op = [%sedlex.regexp? '+' | '-'] +let num_2 = [%sedlex.regexp? Plus digit_2] +let num_8 = [%sedlex.regexp? Plus digit_8] +let num_10 = [%sedlex.regexp? Plus digit] +let num_16 = [%sedlex.regexp? Plus digit_16] + +let rec token buf = + let sub (a, b) = Sedlexing.Latin1.sub_lexeme buf a b in + match%sedlex buf with + (* {Integers} *) + | (sign as s), prefix_2, (num_2 as n) -> + Printf.printf "Bin %s%s\n" (sub s) (sub n); + token buf + | (sign as s), prefix_8, (num_8 as n) -> + Printf.printf "Oct %s%s\n" (sub s) (sub n); + token buf + | (sign as s), (num_10 as n) -> + Printf.printf "Dec %s%s\n" (sub s) (sub n); + token buf + | (sign as s), prefix_16, (num_16 as n) -> + Printf.printf "Hex %s%s\n" (sub s) (sub n); + token buf + (* {Fractions} *) + | (sign as s), prefix_2, (num_2 as n), '/', (num_2 as d) -> + Printf.printf "Bin %s%s/%s\n" (sub s) (sub n) (sub d); + token buf + | (sign as s), prefix_8, (num_8 as n), '/', (num_8 as d) -> + Printf.printf "Oct %s%s/%s\n" (sub s) (sub n) (sub d); + token buf + | (sign as s), (num_10 as n), '/', (num_10 as d) -> + Printf.printf "Dec %s%s/%s\n" (sub s) (sub n) (sub d); + token buf + | (sign as s), prefix_16, (num_16 as n), '/', (num_16 as d) -> + Printf.printf "Hex %s%s/%s\n" (sub s) (sub n) (sub d); + token buf + (* {Complex Numbers} *) + | (sign as s), prefix_2, (num_2 as r), (sign_op as o), (num_2 as i), 'i' -> + Printf.printf "Bin %s%s%s%si\n" (sub s) (sub r) (sub o) (sub i); + token buf + | (sign as s), prefix_8, (num_8 as r), (sign_op as o), (num_8 as i), 'i' -> + Printf.printf "Oct %s%s%s%si\n" (sub s) (sub r) (sub o) (sub i); + token buf + | (sign as s), (num_10 as r), (sign_op as o), (num_10 as i), 'i' -> + Printf.printf "Dec %s%s%s%si\n" (sub s) (sub r) (sub o) (sub i); + token buf + | (sign as s), prefix_16, (num_16 as r), (sign_op as o), (num_16 as i), 'i' + -> + Printf.printf "Hex %s%s%s%si\n" (sub s) (sub r) (sub o) (sub i); + token buf + (* {Others} *) + | Plus xml_blank -> token buf + | 128 .. 255 -> print_endline "Non ASCII" + | eof -> print_endline "EOF" + | _ -> failwith "Unexpected character" + +let%expect_test _ = + let lexbuf = + Sedlexing.Latin1.from_string + {| + 123 + +123 + -123 + 0b01010101 + -0b11110000 + +0b11111111 + 0o12345670 + +0o76543210 + -0o17263540 + 0x123abcdef + -0x456DEFabc + +0x789ABCdef + 123/456 + -456/789 + +987/654 + 0o777/100 + +0o200/666 + 1+1i + 1-1i + 0x1f+2ei + +0x1f-2ei + 0b10+11i + -0b10-11i + |} + in + token lexbuf; + + [%expect + {| + Dec 123 + Dec +123 + Dec -123 + Bin 01010101 + Bin -11110000 + Bin +11111111 + Oct 12345670 + Oct +76543210 + Oct -17263540 + Hex 123abcdef + Hex -456DEFabc + Hex +789ABCdef + Dec 123/456 + Dec -456/789 + Dec +987/654 + Oct 777/100 + Oct +200/666 + Dec 1+1i + Dec 1-1i + Hex 1f+2ei + Hex +1f-2ei + Bin 10+11i + Bin -10-11i + EOF + |}]