Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

automatically fill the id attributes of the headings #267

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions bin/main.ml
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ let with_open_out fn f =
close_out_noerr oc;
raise e

let process ic oc =
let process ?auto_identifiers ic oc =
let md = Omd.of_channel ic in
output_string oc (Omd.to_html md)
output_string oc (Omd.to_html ?auto_identifiers md)

let print_version () =
let version =
Expand All @@ -33,11 +33,15 @@ let print_version () =

let input = ref []
let output = ref ""
let auto_identifiers = ref None

let spec =
[ ( "-o"
, Arg.Set_string output
, " file.html Specify the output file (default is stdout)." )
; ( "--auto-identifiers"
, Arg.Bool (fun x -> auto_identifiers := Some x)
, " Should identifiers be automatically assigned to headings." )
shonfeder marked this conversation as resolved.
Show resolved Hide resolved
; ( "--version"
, Arg.Unit print_version
, " Display the version of the currently installed omd." )
Expand All @@ -54,10 +58,13 @@ let main () =
let with_output f =
if !output = "" then f stdout else with_open_out !output f
in
let auto_identifiers = !auto_identifiers in
with_output @@ fun oc ->
if !input = [] then process stdin oc
if !input = [] then process ?auto_identifiers stdin oc
else
let f filename = with_open_in filename @@ fun ic -> process ic oc in
let f filename =
with_open_in filename @@ fun ic -> process ?auto_identifiers ic oc
in
List.(iter f (rev !input))

let () =
Expand Down
113 changes: 109 additions & 4 deletions src/html.ml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,78 @@ let escape_uri s =
s;
Buffer.contents b

let trim_start_while p s =
let start = ref true in
let b = Buffer.create (String.length s) in
Uutf.String.fold_utf_8
(fun () _ -> function
| `Malformed _ -> Buffer.add_string b s
| `Uchar u when p u && !start -> ()
| `Uchar u when !start ->
start := false;
Uutf.Buffer.add_utf_8 b u
| `Uchar u -> Uutf.Buffer.add_utf_8 b u)
()
s;
Buffer.contents b
tatchi marked this conversation as resolved.
Show resolved Hide resolved

let underscore = Uchar.of_char '_'
let hyphen = Uchar.of_char '-'
let period = Uchar.of_char '.'
let is_white_space = Uucp.White.is_white_space
let is_alphabetic = Uucp.Alpha.is_alphabetic
let is_hex_digit = Uucp.Num.is_hex_digit

module Identifiers : sig
type t

val empty : t

val touch : string -> t -> int * t
(** Bump the frequency count for the given string.
It returns the previous count (before bumping) *)
end = struct
module SMap = Map.Make (String)

type t = int SMap.t

let empty = SMap.empty
let count s t = match SMap.find_opt s t with None -> 0 | Some x -> x
let incr s t = SMap.add s (count s t + 1) t

let touch s t =
let count = count s t in
(count, incr s t)
end

(* Based on pandoc algorithm to derive id's.
See: https://pandoc.org/MANUAL.html#extension-auto_identifiers *)
let slugify s =
let s = trim_start_while (fun c -> not (is_alphabetic c)) s in
let length = String.length s in
let b = Buffer.create length in
let last_is_ws = ref false in
let add_to_buffer u =
if !last_is_ws = true then begin
Uutf.Buffer.add_utf_8 b (Uchar.of_char '-');
last_is_ws := false
end;
Uutf.Buffer.add_utf_8 b u
in
let fold () _ = function
| `Malformed _ -> add_to_buffer Uutf.u_rep
| `Uchar u when is_white_space u && not !last_is_ws -> last_is_ws := true
| `Uchar u when is_white_space u && !last_is_ws -> ()
| `Uchar u ->
(if is_alphabetic u || is_hex_digit u then
match Uucp.Case.Map.to_lower u with
| `Self -> add_to_buffer u
| `Uchars us -> List.iter add_to_buffer us);
if u = underscore || u = hyphen || u = period then add_to_buffer u
in
Uutf.String.fold_utf_8 fold () s;
Buffer.contents b
tatchi marked this conversation as resolved.
Show resolved Hide resolved

let to_plain_text t =
let buf = Buffer.create 1024 in
let rec go : _ inline -> unit = function
Expand Down Expand Up @@ -128,9 +200,13 @@ and inline = function
| Image (attr, { label; destination; title }) ->
img label destination title attr

let rec block = function
let rec block ~auto_identifiers = function
Copy link
Collaborator

@shonfeder shonfeder Aug 2, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it means that we'll need to keep a state somewhere with all the ids we've already generated. Not sure how to best implement that and if that's something we want. Would love to have your input on that :)

Perhaps we can have the block function take a record for this argument that can carry some configuration/state? For the identifier numerical suffix, it could be a int StringMap.t. Of course, we could also use a mutable hash map for this, but would be nice to avoid if feasible. WDYT?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion, I added an int StringMap.t to keep track of the numerical suffix. Let me know what you think :)

| Blockquote (attr, q) ->
elt Block "blockquote" attr (Some (concat nl (concat_map block q)))
elt
Block
"blockquote"
attr
(Some (concat nl (concat_map (block ~auto_identifiers) q)))
| Paragraph (attr, md) -> elt Block "p" attr (Some (inline md))
| List (attr, ty, sp, bl) ->
let name = match ty with Ordered _ -> "ol" | Bullet _ -> "ul" in
Expand All @@ -143,7 +219,7 @@ let rec block = function
let block' t =
match (t, sp) with
| Paragraph (_, t), Tight -> concat (inline t) nl
| _ -> block t
| _ -> block ~auto_identifiers t
in
let nl = if sp = Tight then Null else nl in
elt Block "li" [] (Some (concat nl (concat_map block' t)))
Expand Down Expand Up @@ -178,7 +254,36 @@ let rec block = function
in
elt Block "dl" attr (Some (concat_map f l))

let of_doc doc = concat_map block doc
let of_doc ?(auto_identifiers = true) doc =
let identifiers = Identifiers.empty in
tatchi marked this conversation as resolved.
Show resolved Hide resolved
let f identifiers = function
| Heading (attr, level, text) ->
let attr, identifiers =
if (not auto_identifiers) || List.mem_assoc "id" attr then
(attr, identifiers)
else
let id = slugify (to_plain_text text) in
(* Default identifier if empty. It matches what pandoc does. *)
let id = if id = "" then "section" else id in
let count, identifiers = Identifiers.touch id identifiers in
let id =
if count = 0 then id else Printf.sprintf "%s-%i" id count
in
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like this might better belong in the slugify function. WDYT?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have any strong opinion on that one. I made the changes in 820f310 (#267)

(("id", id) :: attr, identifiers)
in
(Heading (attr, level, text), identifiers)
| _ as c -> (c, identifiers)
in
let html, _ =
List.fold_left
(fun (accu, ids) x ->
let x', ids = f ids x in
let el = concat accu (block ~auto_identifiers x') in
(el, ids))
(Null, identifiers)
doc
in
html

let to_string t =
let buf = Buffer.create 1024 in
Expand Down
2 changes: 1 addition & 1 deletion src/html.mli
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ type t =
| Null
| Concat of t * t

val of_doc : attributes block list -> t
val of_doc : ?auto_identifiers:bool -> attributes block list -> t
val to_string : t -> string
5 changes: 4 additions & 1 deletion src/omd.ml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@ let html_bl ?(attrs = []) s = Html_block (attrs, s)
let dl ?(attrs = []) l = Definition_list (attrs, l)
let of_channel ic = parse_inlines (Pre.of_channel ic)
let of_string s = parse_inlines (Pre.of_string s)
let to_html doc = Html.to_string (Html.of_doc doc)

let to_html ?auto_identifiers doc =
Html.to_string (Html.of_doc ?auto_identifiers doc)
tatchi marked this conversation as resolved.
Show resolved Hide resolved

let to_sexp ast = Format.asprintf "@[%a@]@." Sexp.print (Sexp.create ast)
let headers = Toc.headers
let toc = Toc.toc
2 changes: 1 addition & 1 deletion src/omd.mli
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ val html_bl : ?attrs:attributes -> string -> attributes block
val dl : ?attrs:attributes -> attributes def_elt list -> attributes block
val of_channel : in_channel -> doc
val of_string : string -> doc
val to_html : doc -> string
val to_html : ?auto_identifiers:bool -> doc -> string
val to_sexp : doc -> string

val headers :
Expand Down
32 changes: 32 additions & 0 deletions tests/blackbox/heading-id.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
$ omd << "MD"
> ### This is an Header Without Id
> ### 1 2 Header that starts with 2 numbers
> ### Header with an id {#header-id}
> ### Maître d'hÔtel 😬
> ### 👋👋 ÔHey! 👋👋
> ### 👋👋 ÔHey! 👋👋
> ### *Dogs*?--in *my* house?
> ### [HTML], [S5], or [RTF]?
> ### 3. Applications
> ### hello.world
> ### -hello-
> ### with multiple spaces
> ### with&nbsp;&nbsp;&nbsp;spaces
> ### 33
> ###
> MD
<h3 id="this-is-an-header-without-id">This is an Header Without Id</h3>
<h3 id="header-that-starts-with-2-numbers">1 2 Header that starts with 2 numbers</h3>
<h3 id="header-id">Header with an id</h3>
<h3 id="maître-dhôtel">Maître d'hÔtel 😬</h3>
<h3 id="ôhey">👋👋 ÔHey! 👋👋</h3>
<h3 id="ôhey-1">👋👋 ÔHey! 👋👋</h3>
<h3 id="dogs--in-my-house"><em>Dogs</em>?--in <em>my</em> house?</h3>
<h3 id="html-s5-or-rtf">[HTML], [S5], or [RTF]?</h3>
<h3 id="applications">3. Applications</h3>
<h3 id="hello.world">hello.world</h3>
<h3 id="hello-">-hello-</h3>
<h3 id="with-multiple-spaces">with multiple spaces</h3>
<h3 id="with-spaces">with   spaces</h3>
<h3 id="section">33</h3>
<h3 id="section-1"></h3>
tatchi marked this conversation as resolved.
Show resolved Hide resolved
3 changes: 2 additions & 1 deletion tests/omd.ml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ let with_open_in fn f =

let () =
with_open_in Sys.argv.(1) @@ fun ic ->
print_string (normalize_html (Omd.to_html (Omd.of_channel ic)))
print_string
(normalize_html (Omd.to_html ~auto_identifiers:false (Omd.of_channel ic)))