From 44e3a31314fc9f09b03482d5572600284c6f248e Mon Sep 17 00:00:00 2001 From: LTLA Date: Tue, 2 Jan 2024 23:06:28 -0800 Subject: [PATCH] Programmatically generate version-specific specification documents. This is arguably easier to read if we want to understand any given version of the spec, as we don't have to mentally ignore the parts of the spec related to other versions. We use knitr to generate one document per version, only keeping the clauses relevant to that version via conditional chunks. --- .github/workflows/doxygenate.yaml | 26 ++++ docs/Doxyfile | 3 +- docs/specifications/.gitignore | 1 + docs/specifications/build.R | 14 ++ docs/specifications/{hdf5.md => hdf5.Rmd} | 166 +++++++++++++++------- docs/specifications/{json.md => json.Rmd} | 84 ++++++++--- docs/specifications/misc.md | 12 +- 7 files changed, 221 insertions(+), 85 deletions(-) create mode 100644 docs/specifications/.gitignore create mode 100644 docs/specifications/build.R rename docs/specifications/{hdf5.md => hdf5.Rmd} (52%) rename docs/specifications/{json.md => json.Rmd} (57%) diff --git a/.github/workflows/doxygenate.yaml b/.github/workflows/doxygenate.yaml index 5464fff..20b578b 100644 --- a/.github/workflows/doxygenate.yaml +++ b/.github/workflows/doxygenate.yaml @@ -6,8 +6,28 @@ on: name: Build documentation jobs: + build-spec: + runs-on: ubuntu-latest + container: bioconductor/bioconductor_docker:devel + + steps: + - name: Checkout repo + uses: actions/checkout@v3 + + - name: Compile markdown + run: | + cd docs/specifications + R -f build.R + + - name: Upload markdown + uses: actions/upload-artifact@v3 + with: + name: built-spec + path: docs/specifications/compiled + docs: runs-on: ubuntu-latest + needs: build-spec steps: - uses: actions/checkout@v3 @@ -16,6 +36,12 @@ jobs: with: args: -O docs/doxygen-awesome.css https://raw.githubusercontent.com/jothepro/doxygen-awesome-css/main/doxygen-awesome.css + - name: Download markdown + uses: actions/download-artifact@v3 + with: + name: 
built-spec + path: docs/specifications/compiled + - name: Doxygen Action uses: mattnotmitt/doxygen-action@v1 with: diff --git a/docs/Doxyfile b/docs/Doxyfile index 0b9613e..3461186 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -794,7 +794,8 @@ INPUT = ../include/uzuki2/parse_json.hpp \ ../include/uzuki2/parse_hdf5.hpp \ ../include/uzuki2/interfaces.hpp \ ../include/uzuki2/uzuki2.hpp \ - ../README.md + ../README.md \ + specifications/compiled # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/docs/specifications/.gitignore b/docs/specifications/.gitignore new file mode 100644 index 0000000..724bbe1 --- /dev/null +++ b/docs/specifications/.gitignore @@ -0,0 +1 @@ +compiled/ diff --git a/docs/specifications/build.R b/docs/specifications/build.R new file mode 100644 index 0000000..8761536 --- /dev/null +++ b/docs/specifications/build.R @@ -0,0 +1,14 @@ +library(knitr) +dir.create("compiled", showWarnings=FALSE) + +for (v in c("1.0", "1.1", "1.2", "1.3")) { + .version <- package_version(v) + knitr::knit("hdf5.Rmd", output=file.path("compiled", paste0("hdf5-", v, ".md"))) +} + +for (v in c("1.0", "1.1", "1.2")) { + .version <- package_version(v) + knitr::knit("json.Rmd", output=file.path("compiled", paste0("json-", v, ".md"))) +} + +file.copy("misc.md", file.path("compiled", "misc.md"), overwrite=TRUE) diff --git a/docs/specifications/hdf5.md b/docs/specifications/hdf5.Rmd similarity index 52% rename from docs/specifications/hdf5.md rename to docs/specifications/hdf5.Rmd index 8b52aad..33dbad8 100644 --- a/docs/specifications/hdf5.md +++ b/docs/specifications/hdf5.Rmd @@ -1,18 +1,51 @@ -# HDF5 Specification +```{r, results="hide", echo=FALSE} +knitr::opts_chunk$set(error=FALSE) +if (!exists(".version")) { + .version <- package_version("1.3") +} +``` -## General comments +```{r, results="asis", echo=FALSE} +cat("# HDF5 Specification (", as.character(.version), ")",
sep="") +``` -We use `**/` to represent a variable name of the group representing any of the supported R objects. -It is assumed that `**/` will be replaced by the actual name of the group in implementations, -as defined by users (for the top-level group) or by the specification (e.g., as a nested child of a list). +## Comments -All objects should be nested inside an R list. +### General + +Every R object is represented by a HDF5 group. +In the descriptions below, we use `**/` as a placeholder for the name of the group. + +All R objects should be nested inside an R list. +In other words, the top-level HDF5 group should represent an R list. The top-level group may have a `uzuki_version` attribute, describing the version of the **uzuki2** specification that it uses. This should be a scalar string dataset of the form `X.Y` for non-negative integers `X` and `Y`. -The latest version of this specification is **1.3**; if not provided, it is assumed to be **1.0**. +The latest version of this specification is **1.3**; if not provided, it is assumed to be **1.0** for back-compatibility purposes. + +```{r, echo=FALSE, results="asis"} +if (.version >= package_version("1.3")) { + cat("### Datatypes + +The HDF5 datatype specification used by each R object is based on the [HDF5 policy draft (v0.1.0)](https://github.com/ArtifactDB/Bioc-HDF5-policy/tree/v0.1.0). +This aims to provide readers with a guaranteed type for faithfully representing the data in memory. +The draft also describes the use of placeholders to represent missing values within HDF5 datasets.") +} +``` + +### Names + +Some R objects may have a `**/names` dataset in their HDF5 group. +If `**/names` is supplied, the contents should always be non-missing, so any `missing-value-placeholder` will not be respected. +Each name is allowed to be any string, including an empty string. 
-## Lists +It is technically permitted to provide duplicate names in `**/names`, consistent with how R itself supports duplicate names in its lists and vectors. +However, this is not recommended as other frameworks may wish to use representations that assume unique names, e.g., using Python dictionaries to represent named lists. +By providing unique names, users can improve interoperability with native data structures in other frameworks. + +## Object types + +### Lists An R list is represented as a HDF5 group (`**/`) with the following attributes: @@ -24,15 +57,19 @@ One subgroup should be present for each integer in `[0, N)`, given a list of len Each list element may be any of the objects described in this specification, including further nested lists. If the list is named, there will additionally be a 1-dimensional `**/names` string dataset of length equal to the number of elements in `**/data`. -See also the [comments on names](misc.md#comments-on-names). -## Atomic vectors +### Atomic vectors An atomic vector is represented as a HDF5 group (`**/`) with the following attributes: - `uzuki_object`, a scalar string dataset containing the value `"vector"`. -- `uzuki_type`, a scalar string dataset containing one of `"integer"`, `"boolean"`, `"number"` or `"string"`. - - **(for version 1.0)** this may also be `"date"` or `"date-time"`. +```{r, echo=FALSE, results="asis"} +if (.version == package_version("1.0")) { + cat('- `uzuki_type`, a scalar string dataset containing one of `"integer"`, `"boolean"`, `"number"`, `"string"`, `"date"` or `"date-time"`.') +} else { + cat('- `uzuki_type`, a scalar string dataset containing one of `"integer"`, `"boolean"`, `"number"` or `"string"`.') +} +``` The group should contain an 1-dimensional dataset at `**/data`. Vectors of length 1 may also be represented as a scalar dataset. 
@@ -41,72 +78,93 @@ The allowed HDF5 datatype depends on `uzuki_type`: - `"integer"`, `"boolean"`: any type of `H5T_INTEGER` that can be represented by a 32-bit signed integer. Note that the converse is not required, i.e., the storage type does not need to be 32-bit if no such values are present in the dataset. -- **(for version < 1.3)** `"number"`: any type of `H5T_FLOAT` that can be represented by a double-precision float. -- **(for version >= 1.3)** `"number"`: any type of `H5T_FLOAT` or `H5T_INTEGER` that can be represented exactly by a double-precision (64-bit) float. - This implies a limit of 32 bits for any integer datatype. - See also the [HDF5 policy draft (v0.1.0)](https://github.com/ArtifactDB/Bioc-HDF5-policy/tree/v0.1.0) for more details. +```{r, echo=FALSE, results="asis"} +if (.version < package_version("1.3")) { + cat('- `"number"`: any type of `H5T_FLOAT` that can be represented by a double-precision float.') +} else { + cat('- `"number"`: any type of `H5T_FLOAT` or `H5T_INTEGER` that can be represented exactly by a double-precision (64-bit) float.') +} +``` - `"string"`: any type of `H5T_STRING` that can be represented by a UTF-8 encoded string. -- **(for version 1.0)** `"date"`: any type of `H5T_STRING` where the srings are in the `YYYY-MM-DD` format, or are equal to a missing placeholder value. -- **(for version 1.0)** `"date-time"`: any type of `H5T_STRING` where the srings are Internet Date/Time format, or are equal to a missing placeholder value. - -For `boolean` type, values in `**/data` should be one of 0 (false) or 1 (true). - -**(for versions >= 1.1)** -For the `string` type, the group may optionally contain the `**/format` dataset. +```{r, echo=FALSE, results="asis"} +if (.version == package_version("1.0")) { + cat('- `"date"`: any type of `H5T_STRING` where the strings are in the `YYYY-MM-DD` format, or are equal to a missing placeholder value.
+- `"date-time"`: any type of `H5T_STRING` where the strings are in the Internet Date/Time format, or are equal to a missing placeholder value.') +} +``` + +For `boolean` type, values in `**/data` should be one of 0 (false) or non-zero (true). + +```{r, echo=FALSE, results="asis"} +if (.version >= package_version("1.1")) { + cat('For the `string` type, the group may optionally contain the `**/format` dataset. This should be a scalar string dataset that specifies constraints to the format of the values in `**/data`: - `"date"`: strings should be `YYYY-MM-DD` dates or the placeholder value. -- `"date-time"`: strings should be in the Internet Date/Time format ([RFC 3339, Section 5.6](https://www.rfc-editor.org/rfc/rfc3339#section-5.6)) or the placeholder value. +- `"date-time"`: strings should be in the Internet Date/Time format ([RFC 3339, Section 5.6](https://www.rfc-editor.org/rfc/rfc3339#section-5.6)) or the placeholder value.') +} +``` The atomic vector's group may also contain `**/names`, a 1-dimensional string dataset of length equal to that of `**/data`. If `**/data` is a scalar, `**/names` should have length 1. -See also the [comments on names](misc.md#comments-on-names). ### Representing missing values -**(for version >= 1.1)** -Each `**/data` dataset may optionally contain a `missing-value-placeholder` attribute. +```{r, echo=FALSE, results="asis"} +if (.version >= package_version("1.1")) { + cat('Each `**/data` dataset may optionally contain a `missing-value-placeholder` attribute. If present, this should be a scalar dataset that specifies the placeholder for missing values. Any value of `**/data` that is equal to this placeholder should be treated as missing. -If no such attribute is present, it can be assumed that there are no missing values.
+If no such attribute is present, it can be assumed that there are no missing values.\n\n') +} -**(for version >= 1.2)** -The data type of the placeholder attribute should be exactly the same as that of `**/data`, so as to avoid unexpected results upon casting. +if (.version >= package_version("1.2")) { + cat('The data type of the placeholder attribute should be exactly the same as that of `**/data`, so as to avoid unexpected results upon casting. The only exception is when `**/data` is a string, in which case the placeholder type may be of any string type; -it is expected that any comparison between the placeholder and strings in `**/data` will be performed bytewise in the same manner as `strcmp`. +it is expected that any comparison between the placeholder and strings in `**/data` will be performed bytewise in the same manner as `strcmp`.\n\n') +} -**(for version == 1.1)** -The data type of the placeholder attribute should have the same data type class as `**/data`. +if (.version == package_version("1.1")) { + cat('The data type of the placeholder attribute should have the same data type class as `**/data`.\n\n') +} -**(for version >= 1.3)** -Floating-point missingness should be identified using the equality operator when both the placeholder and data values are loaded into memory as IEEE754-compliant `double`s. +if (.version >= package_version("1.3")) { + cat('Floating-point missingness should be identified using the equality operator when both the placeholder and data values are loaded into memory as IEEE754-compliant `double`s. No casting should be performed to a lower-precision type, as this may cause a non-missing value to become equal to the placeholder. -If the placeholder is NaN, all NaNs in the dataset should be considered missing, regardless of the exact bit representation in the NaN payload. -See the [HDF5 policy draft (v0.1.0)](https://github.com/ArtifactDB/Bioc-HDF5-policy/tree/v0.1.0) for more details.
+If the placeholder is NaN, all NaNs in the dataset should be considered missing, regardless of the exact bit representation in the NaN payload.') +} -**(for version >= 1.1, < 1.3)** -Floating-point missingness may be encoded in the payload of an NaN, which distinguishes it from a non-missing "not-a-number" value. -Comparisons on NaN placeholders should be performed in a bytewise manner (e.g., with `memcmp`) to ensure that the payload is taken into account. +if (.version >= package_version("1.1") && .version < package_version("1.3")) { + cat('Floating-point missingness may be encoded in the payload of an NaN, which distinguishes it from a non-missing "not-a-number" value. +Comparisons on NaN placeholders should be performed in a bytewise manner (e.g., with `memcmp`) to ensure that the payload is taken into account.') +} -**(for version 1.0)** -Integer or boolean values of -2147483648 are treated as missing. +if (.version == package_version("1.0")) { + cat("Integer or boolean values of -2147483648 are treated as missing. Missing floats are represented by [R's NA representation](https://github.com/wch/r-source/blob/869e0f734dc4971c420cf417f5e0d18c0974a5af/src/main/arithmetic.c#L90-L98). For strings, each `**/data` dataset may contain a `missing-value-placeholder` attribute. If present, this should be a scalar string dataset that specifies the placeholder for missing values. Any value of `**/data` that is equal to this placeholder should be treated as missing. -If no such attribute is present, it can be assumed that there are no missing values. +If no such attribute is present, it can be assumed that there are no missing values.") +} +``` -## Factors +### Factors A factor is represented as a HDF5 group (`**/`) with the following attributes: - `uzuki_object`, a scalar string dataset containing the value `"vector"`. -- `uzuki_type`, a scalar string dataset containing `"factor"`. - - **(for version 1.0)** `uzuki_type` could also be set to `"ordered"`. 
- This is the same as `uzuki_type` of `"factor"` with the `**/ordered` dataset set to a truthy value. +```{r, echo=FALSE, results="asis"} +if (.version == package_version("1.0")) { + cat('- `uzuki_type`, a scalar string dataset containing `"factor"` or `"ordered"`.') +} else { + cat('- `uzuki_type`, a scalar string dataset containing `"factor"`.') +} +``` The group should contain an 1-dimensional dataset at `**/data`, containing 0-based indices into the levels. This should be type of `H5T_INTEGER` that can be represented by a 32-bit signed integer. +(Admittedly, this should have been an unsigned integer, but we started with a signed integer and we'll just keep it so for back-compatibility.) Missing values are represented as described above for atomic vectors. The group should also contain `**/levels`, a 1-dimensional string dataset that contains the levels for the indices in `**/data`. @@ -118,16 +176,20 @@ beyond that count, the levels cannot be indexed by elements of `**/data`. The group may also contain `**/names`, a 1-dimensional string dataset of length equal to `data`. See also the [comments on names](misc.md#comments-on-names). -**(for version >= 1.1)** The group may optionally contain `**/ordered`, a scalar integer dataset. -This should be interpreted as a boolean where a non-zero value specifies that we should assume that the levels are ordered. +```{r, echo=FALSE, results="asis"} +if (.version >= package_version("1.1")) { + cat('The group may optionally contain `**/ordered`, a scalar integer dataset. +This should be interpreted as a boolean where a non-zero value specifies that we should assume that the levels are ordered.') +} +``` -## Nothing +### Nothing A "nothing" (a.k.a., "null", "none") value is represented as a HDF5 group with the following attributes: - `uzuki_object`, a scalar string dataset containing the value `"nothing"`.
-## External object +### External object Each external object is represented as a HDF5 group (`**/`) with the following attributes: @@ -136,5 +198,5 @@ Each external object is represented as a HDF5 group (`**/`) with the following a This should contain an `**/index` scalar dataset, containing an index that identifies this external object uniquely within the entire list. `**/index` should start at zero and be incremented whenever an external object is encountered. -By indexing this external metadata, we can restore the object in its appropriate location in the list. +By indexing some external metadata with the value of `**/index`, we can restore the external object in its appropriate location in the R list. The exact mechanism by which this restoration occurs is implementation-defined. diff --git a/docs/specifications/json.md b/docs/specifications/json.Rmd similarity index 57% rename from docs/specifications/json.md rename to docs/specifications/json.Rmd index b30ee3d..b8e6353 100644 --- a/docs/specifications/json.md +++ b/docs/specifications/json.Rmd @@ -1,14 +1,37 @@ -# JSON Specification +```{r, results="hide", echo=FALSE} +knitr::opts_chunk$set(error=FALSE) +if (!exists(".version")) { + .version <- package_version("1.2") +} +``` -## General comments +```{r, results="asis", echo=FALSE} +cat("# JSON Specification (", as.character(.version), ")", sep="") +``` + +## Comments + +### General All R objects are represented by JSON objects with a `type` property. -Every R object should be nested inside an R list. +Every R object should be nested inside an R list, i.e., the top-level JSON object should represent an R list. The top-level object may have a `version` property that contains the **uzuki2** specification version as a `"X.Y"` string for non-negative integers `X` and `Y`. The latest version of this specification is **1.2**; if missing, the version can be assumed to be **1.0**. -## Lists +### Names + +Some R objects may have a `names` property in the JSON object. 
+If `names` is supplied, its contents should always be non-missing. +Each name is allowed to be any string, including an empty string. + +It is technically permitted to provide duplicate names in `names`, consistent with how R itself supports duplicate names in its lists and vectors. +However, this is not recommended as other frameworks may wish to use representations that assume unique names, e.g., using Python dictionaries to represent named lists. +By providing unique names, users can improve interoperability with native data structures in other frameworks. + +## Object types + +### Lists An R list is represented as a JSON object with the following properties: @@ -16,9 +39,8 @@ An R list is represented as a JSON object with the following properties: - `values`, an array of JSON objects corresponding to nested R objects. Each JSON object may follow any of the formats described in this specification. - (optional) `"names"`, an array of length equal to `values`, containing the names of the list elements. - See also the [comments on names](misc.md#comments-on-names). -## Atomic vectors +### Atomic vectors An atomic vector is represented as a JSON object with the following properties: @@ -29,7 +51,6 @@ An atomic vector is represented as a JSON object with the following properties: This may also be a scalar of the same type as the array contents. - (optional) `"names"`, an array of length equal to `values`, containing the names of the list elements. If `values` is a scalar, `names` should have length 1. - See also the [comments on names](misc.md#comments-on-names). The contents of `values` is subject to some constraints: @@ -38,50 +59,71 @@ The contents of `values` is subject to some constraints: IEEE special values can be represented by strings, i.e., `NaN`, `Inf`, `-Inf`. - `"integer"`: values should be JSON numbers that can be represented by a 32-bit signed integer. Missing values may be represented by `null`. 
- - **(for version 1.0)** missing integers could also be represented by the special value -2147483648. +```{r, echo=FALSE, results="asis"} +if (.version == package_version("1.0")) { + cat(" Missing integers may also be represented by the special value -2147483648.") +} +``` - `"boolean"`: values should be JSON booleans or `null` (for missing values). - `string`: values should be JSON strings. `null` is also allowed and represents a missing value. -**(for version >= 1.1)** -For `type` of `"string"`, the object may optionally have a `format` property that constrains the `values`: +```{r, echo=FALSE, results="asis"} +if (.version >= package_version("1.1")) { + cat('For `type` of `"string"`, the object may optionally have a `format` property that constrains the `values`: - `"date"`: values should be JSON strings following a `YYYY-MM-DD` format. `null` is also allowed and represents a missing value. - `"date-time"`: values should be JSON strings following the Internet Date/Time format. - `null` is also allowed and represents a missing value. + `null` is also allowed and represents a missing value.') +} +``` Vectors of length 1 may also be represented as scalars of the appropriate type. While R makes no distinction between scalars and length-1 vectors, this may be useful for other frameworks where this difference is relevant. -## Factors +### Factors A factor is represented as a JSON object with the following properties: -- `type`, set to `"factor"`. - - **(for version 1.0)** `type` can also be set to `"ordered"` for ordered levels. +```{r, echo=FALSE, results="asis"} +if (.version == package_version("1.0")) { + cat('- `type`, set to `"factor"` or `"ordered"`.') +} else { + cat('- `type`, set to `"factor"`.') +} +``` - `values`, an array of 0-based integer indices for the factor. These should be non-negative JSON numbers that can fit into a 32-bit signed integer. They should also be less than the length of `levels`. Missing values are represented by `null`.
- - **(for version 1.0)** missing values could also be represented by the special value -2147483648. +```{r, echo=FALSE, results="asis"} +if (.version == package_version("1.0")) { + cat(" Missing integers may also be represented by the special value -2147483648.") +} +``` - `levels`, an array of unique strings containing the levels for the indices in `values`. - (optional) `"names"`, an array of length equal to `values`, containing the names of the list elements. - See also the [comments on names](misc.md#comments-on-names). -- **(for version >= 1.1)** (optional) `ordered`, a boolean indicating whether to assume that the levels are ordered. - If absent, levels are assumed to be non-ordered. +```{r, echo=FALSE, results="asis"} +if (.version >= package_version("1.1")) { + cat("- (optional) `ordered`, a boolean indicating whether to assume that the levels are ordered. + If absent, levels are assumed to be non-ordered.") +} +``` -## Nothing +### Nothing A "nothing" (a.k.a., "null", "none") value is represented as a JSON object with the following properties: - `type`, set to `"nothing"`. -## External object +### External object Each external object is represented as a JSON object with the following properties: - `type`, set to `"index"`. - `index`, a non-negative JSON number that can fit into a 32-bit signed integer. This identifies this external object uniquely within the entire list. - See the equivalent in the HDF5 specification for more details. + +By indexing some external metadata with the value of `index`, we can restore the external object in its appropriate location in the R list. +The exact mechanism by which this restoration occurs is implementation-defined. 
diff --git a/docs/specifications/misc.md b/docs/specifications/misc.md index 2599d3d..ee1a218 100644 --- a/docs/specifications/misc.md +++ b/docs/specifications/misc.md @@ -1,14 +1,4 @@ -# Comments on names - -Both HDF5 and JSON support naming of the vector elements, typically via the `names` group/property. -If `names` are supplied, their contents should always be non-missing (e.g., not `null` in JSON, no `missing-value-placeholder` in HDF5). -Each name is allowed to be any string, including an empty string. - -It is technically permitted to provide duplicate names in `names`, consistent with how R itself supports duplicate names in its lists and vectors. -However, this is not recommended as other frameworks may wish to use representations that assume unique names, e.g., using Python dictionaries to represent named lists. -By providing unique names, users can improve interoperability with native data structures in other frameworks. - -# Comparison to version 1 +# Comparison to original **uzuki2** involves some major changes from the original [**uzuki**](https://github.com/LTLA/uzuki) library. Most obviously, we added support for HDF5 alongside the JSON format.