From 6a60d42e8ed0803b4a638f7bacbe502b1bb67886 Mon Sep 17 00:00:00 2001 From: Valentin Gatien-Baron Date: Sun, 20 Dec 2020 22:05:03 -0500 Subject: [PATCH 1/2] add Group.get_opt --- lib/core.mli | 3 +++ lib/group.ml | 5 +++++ lib/group.mli | 3 +++ lib_test/fort_unit/fort_unit.ml | 2 ++ lib_test/test_re.ml | 8 ++++++++ 5 files changed, 21 insertions(+) diff --git a/lib/core.mli b/lib/core.mli index ae4e6dfe..e804442e 100644 --- a/lib/core.mli +++ b/lib/core.mli @@ -36,6 +36,9 @@ module Group : sig val get : t -> int -> string (** Raise [Not_found] if the group did not match *) + val get_opt : t -> int -> string option + (** Similar to {!get}, but returns an option instead of using an exception. *) + val offset : t -> int -> int * int (** Raise [Not_found] if the group did not match *) diff --git a/lib/group.ml b/lib/group.ml index dc43a019..fa1e606a 100644 --- a/lib/group.ml +++ b/lib/group.ml @@ -30,6 +30,11 @@ let test t i = let idx = t.marks.(2 * i) in idx <> -1 +let get_opt t i = + if test t i + then Some (get t i) + else None + let dummy_offset = (-1, -1) let all_offset t = diff --git a/lib/group.mli b/lib/group.mli index 7ff2ccbb..3671d762 100644 --- a/lib/group.mli +++ b/lib/group.mli @@ -27,6 +27,9 @@ type t = val get : t -> int -> string (** Raise [Not_found] if the group did not match *) +val get_opt : t -> int -> string option +(** Similar to {!get}, but returns an option instead of using an exception. 
*) + val offset : t -> int -> int * int (** Raise [Not_found] if the group did not match *) diff --git a/lib_test/fort_unit/fort_unit.ml b/lib_test/fort_unit/fort_unit.ml index faeb7cd1..711f60ff 100644 --- a/lib_test/fort_unit/fort_unit.ml +++ b/lib_test/fort_unit/fort_unit.ml @@ -49,6 +49,8 @@ let expect_eq_bool ?msg f x g y = expect_equal_app ?msg ~printer:string_of_bool f x g y let expect_eq_str ?msg f x g y = expect_equal_app ?msg ~printer:str_printer f x g y +let expect_eq_str_opt ?msg f x g y = + expect_equal_app ?msg ~printer:(opt_printer str_printer) f x g y let expect_eq_ofs ?msg f x g y = expect_equal_app ?msg ~printer:ofs_printer f x g y let expect_eq_arr_str ?msg f x g y = diff --git a/lib_test/test_re.ml b/lib_test/test_re.ml index 05b90842..7891fd5a 100644 --- a/lib_test/test_re.ml +++ b/lib_test/test_re.ml @@ -50,6 +50,14 @@ let _ = expect_eq_str not_found () (Group.get m) 4; ); + expect_pass "Group.get_opt" (fun () -> + expect_eq_str_opt id (Some "ab") (Group.get_opt m) 0; + expect_eq_str_opt id (Some "a") (Group.get_opt m) 1; + expect_eq_str_opt id None (Group.get_opt m) 2; + expect_eq_str_opt id (Some "b") (Group.get_opt m) 3; + expect_eq_str_opt id None (Group.get_opt m) 4; + ); + expect_pass "Group.offset" (fun () -> expect_eq_ofs id (0,2) (Group.offset m) 0; expect_eq_ofs id (0,1) (Group.offset m) 1; From 5689b435d2afc837376f9091c99f1ebd0d625b74 Mon Sep 17 00:00:00 2001 From: Valentin Gatien-Baron Date: Sun, 20 Dec 2020 22:46:16 -0500 Subject: [PATCH 2/2] Add user-facing documentation for many things --- lib/core.mli | 163 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 125 insertions(+), 38 deletions(-) diff --git a/lib/core.mli b/lib/core.mli index e804442e..ea5efffc 100644 --- a/lib/core.mli +++ b/lib/core.mli @@ -20,7 +20,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *) -(** Module [Re]: regular expressions commons *) +(** Module [Re]: code for creating and using regular expressions, 
+ independently of regular expression syntax. *) type t (** Regular expression *) @@ -31,7 +32,9 @@ type re (** Manipulate matching groups. *) module Group : sig type t - (** Information about groups in a match. *) + (** Information about groups in a match. As is conventional, every + match implicitly has a group 0 that covers the whole match, and + explicit groups are numbered from 1. *) val get : t -> int -> string (** Raise [Not_found] if the group did not match *) @@ -72,35 +75,51 @@ val compile : t -> re used to match strings, e.g. with {!exec}. *) val exec : - ?pos:int -> (* Default: 0 *) - ?len:int -> (* Default: -1 (until end of string) *) + ?pos:int -> (** Default: 0 *) + ?len:int -> (** Default: -1 (until end of string) *) re -> string -> Group.t -(** [exec re str] matches [str] against the compiled expression [re], +(** [exec re str] searches [str] for a match of the compiled expression [re], and returns the matched groups if any. + + More specifically, when a match exists, [exec] returns a match that + starts at the earliest position possible. If multiple such matches are + possible, the one specified by the match semantics described below is + returned. + @param pos optional beginning of the string (default 0) @param len length of the substring of [str] that can be matched (default [-1], - meaning to the end of the string + meaning to the end of the string) @raise Not_found if the regular expression can't be found in [str] + + Note that [exec re str ~pos ~len] is not equivalent to [exec re + (String.sub str pos len)]. This transformation changes the meaning + of some constructs ({!bos}, {!eos}, {!whole_string} and {!leol}), and + zero-width assertions like {!bow} or {!eow} look at characters before + [pos] and after [pos + len]. 
*) val exec_opt : - ?pos:int -> (* Default: 0 *) - ?len:int -> (* Default: -1 (until end of string) *) + ?pos:int -> (** Default: 0 *) + ?len:int -> (** Default: -1 (until end of string) *) re -> string -> Group.t option (** Similar to {!exec}, but returns an option instead of using an exception. *) val execp : - ?pos:int -> (* Default: 0 *) - ?len:int -> (* Default: -1 (until end of string) *) + ?pos:int -> (** Default: 0 *) + ?len:int -> (** Default: -1 (until end of string) *) re -> string -> bool (** Similar to {!exec}, but returns [true] if the expression matches, - and [false] if it doesn't *) + and [false] if it doesn't. This function is more efficient than + calling {!exec} or {!exec_opt} and ignoring the returned group. + *) val exec_partial : - ?pos:int -> (* Default: 0 *) - ?len:int -> (* Default: -1 (until end of string) *) + ?pos:int -> (** Default: 0 *) + ?len:int -> (** Default: -1 (until end of string) *) re -> string -> [ `Full | `Partial | `Mismatch ] -(** More detailed version of {!exec_p} *) +(** More detailed version of {!execp}. [`Full] is equivalent to [true], + while [`Mismatch] and [`Partial] are equivalent to [false], but [`Partial] + indicates the input string could be extended to create a match. *) (** Marks *) module Mark : sig @@ -193,7 +212,7 @@ val replace : ?len:int -> ?all:bool -> (** Default: true. Otherwise only replace first occurrence *) re -> (** matched groups *) - f:(Group.t -> string) -> (* how to replace *) + f:(Group.t -> string) -> (** how to replace *) string -> (** string to replace in *) string (** [replace ~all re ~f s] iterates on [s], and replaces every occurrence @@ -220,7 +239,12 @@ val char : char -> t (** {2 Basic operations on regular expressions} *) val alt : t list -> t -(** Alternative *) +(** Alternative. + + [alt []] is equivalent to {!empty}. + + By default, the leftmost match is preferred (see match semantics below). 
+*) val seq : t list -> t (** Sequence *) @@ -246,7 +270,10 @@ val repn : t -> int -> int option -> t val opt : t -> t (** 0 or 1 matches *) -(** {2 String, line, word} *) +(** {2 String, line, word} + + We define a word as a sequence of latin1 letters, digits and underscore. +*) val bol : t (** Beginning of line *) @@ -261,19 +288,35 @@ val eow : t (** End of word *) val bos : t -(** Beginning of string *) +(** Beginning of string. This differs from {!start} because it matches + the beginning of the input string even when using [~pos] arguments: + + {[ + let b = execp (compile (seq [ bos; str "a" ])) "aa" ~pos:1 in + assert (not b) + ]} +*) val eos : t -(** End of string *) +(** End of string. This is different from {!stop} in the way described + in {!bos}. *) val leol : t (** Last end of line or end of string *) val start : t -(** Initial position *) +(** Initial position. This differs from {!bos} because it takes into + account the [~pos] arguments: + + {[ + let b = execp (compile (seq [ start; str "a" ])) "aa" ~pos:1 in + assert b + ]} +*) val stop : t -(** Final position *) +(** Final position. This is different from {!eos} in the way described + in {!start}. *) val word : t -> t (** Word *) @@ -282,38 +325,77 @@ val not_boundary : t (** Not at a word boundary *) val whole_string : t -> t -(** Only matches the whole string *) +(** Only matches the whole string, i.e. [fun t -> seq [ bos; t; eos ]]. *) + +(** {2 Match semantics} + + A regular expression frequently matches a string in multiple ways. For + instance [exec (compile (opt (str "a"))) "ab"] can match "" or "a". Match + semantics can be modified with the functions below, allowing one to choose + which of these is preferable. -(** {2 Match semantics} *) + By default, the leftmost branch of alternations is preferred, and repetitions + are greedy. + + Note that the existence of matches cannot be changed by specifying match + semantics. 
[seq [ bos; str "a"; non_greedy (opt (str "b")); eos ]] will + match when applied to "ab". However if [seq [ bos; str "a"; non_greedy (opt + (str "b")) ]] is applied to "ab", it will match "a" rather than "ab". + + Also note that multiple match semantics can conflict. In this case, the one + executed earlier takes precedence. For instance, any match of [shortest (seq + [ bos; group (rep (str "a")); group (rep (str "a")); eos ])] will always have + an empty first group. Conversely, if we use [longest] instead of [shortest], + the second group will always be empty. +*) val longest : t -> t -(** Longest match *) +(** Longest match semantics. That is, matches will match as many bytes as + possible. If multiple choices match the maximum number of bytes, the one + respecting the inner match semantics is preferred. *) val shortest : t -> t -(** Shortest match *) +(** Same as {!longest}, but matching the least number of bytes. *) val first : t -> t -(** First match *) - -(** {2 Repeated match modifiers} *) +(** First match semantics for alternations (not repetitions). That is, matches + will prefer the leftmost branch of the alternation that matches the text. *) val greedy : t -> t -(** Greedy *) +(** Greedy matches for repetitions ({!opt}, {!rep}, {!rep1}, {!repn}): they will + match as many times as possible. *) val non_greedy : t -> t -(** Non-greedy *) +(** Non-greedy matches for repetitions ({!opt}, {!rep}, {!rep1}, {!repn}): they + will match as few times as possible. *) (** {2 Groups (or submatches)} *) val group : t -> t -(** Delimit a group *) +(** Delimit a group. The group is considered as matching if it is used at least + once (it may be used multiple times if it is nested inside {!rep} for + instance). If it is used multiple times, the last match is what gets + captured. 
*) val no_group : t -> t (** Remove all groups *) val nest : t -> t -(** when matching against [nest e], only the group matching in the - last match of e will be considered as matching *) +(** When matching against [nest e], only the group matching in the + last match of e will be considered as matching. + + For instance: + {[ + let re = compile (rep1 (nest (alt [ group (str "a"); str "b" ]))) in + let group = Re.exec re "ab" in + assert (Group.get_opt group 1 = None); + + (* same thing but without [nest] *) + let re = compile (rep1 (alt [ group (str "a"); str "b" ])) in + let group = Re.exec re "ab" in + assert (Group.get_opt group 1 = Some "a"); + ]} +*) @@ -363,10 +445,12 @@ val xdigit : t (** {2 Case modifiers} *) val case : t -> t -(** Case sensitive matching *) +(** Case sensitive matching. Note that this works on latin1, not ascii and not + utf8. *) val no_case : t -> t -(** Case insensitive matching *) +(** Case insensitive matching. Note that this works on latin1, not ascii and not + utf8. *) (****) @@ -404,11 +488,14 @@ module View : sig val view : outer -> t end with type outer := t -(** {2 Experimental functions}. *) +(** {2 Experimental functions} *) val witness : t -> string -(** [witness r] generates a string [s] such that [execp (compile r) s] is - true *) +(** [witness r] generates a string [s] such that [execp (compile r) s] is true. + + Be warned that this function is buggy because it ignores zero-width + assertions like beginning of words. As a result it can generate incorrect + results. *) (** {2 Deprecated functions} *)