Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More documentation #180

Merged
merged 2 commits into from
Jan 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 128 additions & 38 deletions lib/core.mli
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*)

(** Module [Re]: regular expressions commons *)
(** Module [Re]: code for creating and using regular expressions,
independently of regular expression syntax. *)

type t
(** Regular expression *)
Expand All @@ -31,11 +32,16 @@ type re
(** Manipulate matching groups. *)
module Group : sig
type t
(** Information about groups in a match. *)
(** Information about groups in a match. As is conventional, every
match implicitly has a group 0 that covers the whole match, and
explicit groups are numbered from 1. *)

val get : t -> int -> string
(** Raise [Not_found] if the group did not match *)

val get_opt : t -> int -> string option
(** Similar to {!get}, but returns an option instead of using an exception. *)

val offset : t -> int -> int * int
(** Raise [Not_found] if the group did not match *)

Expand Down Expand Up @@ -69,35 +75,51 @@ val compile : t -> re
used to match strings, e.g. with {!exec}. *)

val exec :
?pos:int -> (* Default: 0 *)
?len:int -> (* Default: -1 (until end of string) *)
?pos:int -> (** Default: 0 *)
?len:int -> (** Default: -1 (until end of string) *)
re -> string -> Group.t
(** [exec re str] matches [str] against the compiled expression [re],
(** [exec re str] searches [str] for a match of the compiled expression [re],
and returns the matched groups if any.

More specifically, when a match exists, [exec] returns a match that
starts at the earliest position possible. If multiple such matches are
possible, the one specified by the match semantics described below is
returned.

@param pos optional beginning of the string (default 0)
@param len length of the substring of [str] that can be matched (default [-1],
meaning to the end of the string
meaning to the end of the string)
@raise Not_found if the regular expression can't be found in [str]

Note that [exec re str ~pos ~len] is not equivalent to [exec re
(String.sub str pos len)]. This transformation changes the meaning
of some constructs ({!bos}, {!eos}, {!whole_string} and {!leol}), and
zero-width assertions like {!bow} or {!eow} look at characters before
[pos] and after [pos + len].
*)

val exec_opt :
?pos:int -> (* Default: 0 *)
?len:int -> (* Default: -1 (until end of string) *)
?pos:int -> (** Default: 0 *)
?len:int -> (** Default: -1 (until end of string) *)
re -> string -> Group.t option
(** Similar to {!exec}, but returns an option instead of using an exception. *)

val execp :
?pos:int -> (* Default: 0 *)
?len:int -> (* Default: -1 (until end of string) *)
?pos:int -> (** Default: 0 *)
?len:int -> (** Default: -1 (until end of string) *)
re -> string -> bool
(** Similar to {!exec}, but returns [true] if the expression matches,
and [false] if it doesn't *)
and [false] if it doesn't. This function is more efficient than
calling {!exec} or {!exec_opt} and ignoring the returned group.
*)

val exec_partial :
?pos:int -> (* Default: 0 *)
?len:int -> (* Default: -1 (until end of string) *)
?pos:int -> (** Default: 0 *)
?len:int -> (** Default: -1 (until end of string) *)
re -> string -> [ `Full | `Partial | `Mismatch ]
(** More detailed version of {!exec_p} *)
(** More detailed version of {!execp}. [`Full] is equivalent to [true],
while [`Mismatch] and [`Partial] are equivalent to [false], but [`Partial]
indicates the input string could be extended to create a match. *)

(** Marks *)
module Mark : sig
Expand Down Expand Up @@ -190,7 +212,7 @@ val replace :
?len:int ->
?all:bool -> (** Default: true. Otherwise only replace first occurrence *)
re -> (** matched groups *)
f:(Group.t -> string) -> (* how to replace *)
f:(Group.t -> string) -> (** how to replace *)
string -> (** string to replace in *)
string
(** [replace ~all re ~f s] iterates on [s], and replaces every occurrence
Expand All @@ -217,7 +239,12 @@ val char : char -> t
(** {2 Basic operations on regular expressions} *)

val alt : t list -> t
(** Alternative *)
(** Alternative.

[alt []] is equivalent to {!empty}.

By default, the leftmost match is preferred (see match semantics below).
*)

val seq : t list -> t
(** Sequence *)
Expand All @@ -243,7 +270,10 @@ val repn : t -> int -> int option -> t
val opt : t -> t
(** 0 or 1 matches *)

(** {2 String, line, word} *)
(** {2 String, line, word}

We define a word as a sequence of latin1 letters, digits and underscore.
*)

val bol : t
(** Beginning of line *)
Expand All @@ -258,19 +288,35 @@ val eow : t
(** End of word *)

val bos : t
(** Beginning of string *)
(** Beginning of string. This differs from {!start} because it matches
the beginning of the input string even when using [~pos] arguments:

{[
let b = execp (compile (seq [ bos; str "a" ])) "aa" ~pos:1 in
assert (not b)
]}
*)

val eos : t
(** End of string *)
(** End of string. This is different from {!stop} in the way described
in {!bos}. *)

val leol : t
(** Last end of line or end of string *)

val start : t
(** Initial position *)
(** Initial position. This differs from {!bos} because it takes into
account the [~pos] arguments:

{[
let b = execp (compile (seq [ start; str "a" ])) "aa" ~pos:1 in
assert b
]}
*)

val stop : t
(** Final position *)
(** Final position. This is different from {!eos} in the way described
in {!start}. *)

val word : t -> t
(** Word *)
Expand All @@ -279,38 +325,77 @@ val not_boundary : t
(** Not at a word boundary *)

val whole_string : t -> t
(** Only matches the whole string *)
(** Only matches the whole string, i.e. [fun t -> seq [ bos; t; eos ]]. *)

(** {2 Match semantics}

A regular expression frequently matches a string in multiple ways. For
instance [exec (compile (opt (str "a"))) "ab"] can match "" or "a". Match
semantics can be modified with the functions below, allowing one to choose
which of these is preferable.

(** {2 Match semantics} *)
By default, the leftmost branch of alternations is preferred, and repetitions
are greedy.

Note that the existence of matches cannot be changed by specifying match
semantics. [seq [ bos; str "a"; non_greedy (opt (str "b")); eos ]] will
match when applied to "ab". However if [seq [ bos; str "a"; non_greedy (opt
(str "b")) ]] is applied to "ab", it will match "a" rather than "ab".

Also note that multiple match semantics can conflict. In this case, the one
executed earlier takes precedence. For instance, any match of [shortest (seq
[ bos; group (rep (str "a")); group (rep (str "a")); eos ])] will always have
an empty first group. Conversely, if we use [longest] instead of [shortest],
the second group will always be empty.
*)

val longest : t -> t
(** Longest match *)
(** Longest match semantics. That is, matches will match as many bytes as
possible. If multiple choices match the maximum amount of bytes, the one
respecting the inner match semantics is preferred. *)

val shortest : t -> t
(** Shortest match *)
(** Same as {!longest}, but matching the least number of bytes. *)

val first : t -> t
(** First match *)

(** {2 Repeated match modifiers} *)
(** First match semantics for alternations (not repetitions). That is, matches
will prefer the leftmost branch of the alternation that matches the text. *)

val greedy : t -> t
(** Greedy *)
(** Greedy matches for repetitions ({!opt}, {!rep}, {!rep1}, {!repn}): they will
match as many times as possible. *)

val non_greedy : t -> t
(** Non-greedy *)
(** Non-greedy matches for repetitions ({!opt}, {!rep}, {!rep1}, {!repn}): they
will match as few times as possible. *)

(** {2 Groups (or submatches)} *)

val group : t -> t
(** Delimit a group *)
(** Delimit a group. The group is considered as matching if it is used at least
once (it may be used multiple times if it is nested inside {!rep} for
instance). If it is used multiple times, the last match is what gets
captured. *)

val no_group : t -> t
(** Remove all groups *)

val nest : t -> t
(** when matching against [nest e], only the group matching in the
last match of e will be considered as matching *)
(** When matching against [nest e], only the group matching in the
last match of e will be considered as matching.

For instance:
{[
let re = compile (rep1 (nest (alt [ group (str "a"); str "b" ]))) in
let group = Re.exec re "ab" in
assert (Group.get_opt group 1 = None);

(* same thing but without [nest] *)
let re = compile (rep1 (alt [ group (str "a"); str "b" ])) in
let group = Re.exec re "ab" in
assert (Group.get_opt group 1 = Some "a");
]}
*)



Expand Down Expand Up @@ -360,10 +445,12 @@ val xdigit : t
(** {2 Case modifiers} *)

val case : t -> t
(** Case sensitive matching *)
(** Case sensitive matching. Note that this works on latin1, not ascii and not
utf8. *)

val no_case : t -> t
(** Case insensitive matching *)
(** Case insensitive matching. Note that this works on latin1, not ascii and not
utf8. *)

(****)

Expand Down Expand Up @@ -401,11 +488,14 @@ module View : sig
val view : outer -> t
end with type outer := t

(** {2 Experimental functions}. *)
(** {2 Experimental functions} *)

val witness : t -> string
(** [witness r] generates a string [s] such that [execp (compile r) s] is
true *)
(** [witness r] generates a string [s] such that [execp (compile r) s] is true.

Be warned that this function is buggy because it ignores zero-width
assertions like beginning of words. As a result it can generate incorrect
results. *)

(** {2 Deprecated functions} *)

Expand Down
5 changes: 5 additions & 0 deletions lib/group.ml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ let test t i =
let idx = t.marks.(2 * i) in
idx <> -1

(* [get_opt t i] is the option-returning counterpart of [get]: it yields
   [Some (get t i)] when [test t i] holds and [None] otherwise. *)
let get_opt t i =
  match test t i with
  | true -> Some (get t i)
  | false -> None

let dummy_offset = (-1, -1)

let all_offset t =
Expand Down
3 changes: 3 additions & 0 deletions lib/group.mli
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ type t =
val get : t -> int -> string
(** Raise [Not_found] if the group did not match *)

val get_opt : t -> int -> string option
(** Similar to {!get}, but returns an option instead of using an exception. *)

val offset : t -> int -> int * int
(** Raise [Not_found] if the group did not match *)

Expand Down
2 changes: 2 additions & 0 deletions lib_test/fort_unit/fort_unit.ml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ let expect_eq_bool ?msg f x g y =
expect_equal_app ?msg ~printer:string_of_bool f x g y
let expect_eq_str ?msg f x g y =
expect_equal_app ?msg ~printer:str_printer f x g y
(* Same as [expect_eq_str], but for [string option] values: the printer is
   [str_printer] lifted through [opt_printer]. *)
let expect_eq_str_opt ?msg f x g y =
  let printer = opt_printer str_printer in
  expect_equal_app ?msg ~printer f x g y
let expect_eq_ofs ?msg f x g y =
expect_equal_app ?msg ~printer:ofs_printer f x g y
let expect_eq_arr_str ?msg f x g y =
Expand Down
8 changes: 8 additions & 0 deletions lib_test/test_re.ml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@ let _ =
expect_eq_str not_found () (Group.get m) 4;
);

expect_pass "Group.get_opt" (fun () ->
expect_eq_str_opt id (Some "ab") (Group.get_opt m) 0;
expect_eq_str_opt id (Some "a") (Group.get_opt m) 1;
expect_eq_str_opt id None (Group.get_opt m) 2;
expect_eq_str_opt id (Some "b") (Group.get_opt m) 3;
expect_eq_str_opt id None (Group.get_opt m) 4;
);

expect_pass "Group.offset" (fun () ->
expect_eq_ofs id (0,2) (Group.offset m) 0;
expect_eq_ofs id (0,1) (Group.offset m) 1;
Expand Down