added first piece of proper docs #156

Open
wants to merge 4 commits into
base: master
24 changes: 24 additions & 0 deletions .github/workflows/Documentation.yml
@@ -0,0 +1,24 @@
name: Documentation

on:
push:
branches:
- master
tags: '*'
pull_request:

jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: julia-actions/setup-julia@latest
with:
version: '1.6'
- name: Install dependencies
run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
- name: Build and deploy
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token
DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key
run: julia --project=docs/ docs/make.jl
1 change: 1 addition & 0 deletions .gitignore
@@ -4,3 +4,4 @@
parquet-compatibility/
julia-parquet-compatibility/
.vscode/settings.json
/docs/build
2 changes: 2 additions & 0 deletions README.md
@@ -4,6 +4,8 @@
[![Build status](https://ci.appveyor.com/api/projects/status/gx8pvdiiery74r9l/branch/master?svg=true)](https://ci.appveyor.com/project/tanmaykm/parquet-jl-cufdj/branch/master)
[![Coverage Status](https://coveralls.io/repos/github/JuliaIO/Parquet.jl/badge.svg?branch=master)](https://coveralls.io/github/JuliaIO/Parquet.jl?branch=master)

**DOCUMENTATION:** [![](https://img.shields.io/badge/docs-latest-blue.svg)](https://JuliaIO.github.io/Parquet.jl/)

## Reader

A [parquet file](https://en.wikipedia.org/wiki/Apache_Parquet) or dataset can be loaded using the `read_parquet` function. A parquet dataset is a directory with multiple parquet files, each of which is a partition belonging to the dataset.
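A minimal sketch of loading a file (the path is a placeholder, and the `Tables.columntable` step assumes Tables.jl is available; any Tables.jl sink such as a `DataFrame` would work the same way):

```julia
using Parquet
import Tables

# Load a single parquet file, or a dataset directory containing partitions.
tbl = read_parquet("data.parquet")  # placeholder path

# The result is Tables.jl-compatible, so any Tables.jl sink can consume it.
cols = Tables.columntable(tbl)
for (name, col) in pairs(cols)
    println(name, ": ", length(col), " values")
end
```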
23 changes: 23 additions & 0 deletions docs/make.jl
@@ -0,0 +1,23 @@
using Parquet
using Documenter

makedocs(;
modules = [Parquet],
authors = "JuliaIO and contributors",
repo = "https://github.com/JuliaIO/Parquet.jl/blob/{commit}{path}#L{line}",
sitename = "Parquet.jl",
format = Documenter.HTML(;
prettyurls = get(ENV, "CI", "false") == "true",
canonical = "https://JuliaIO.github.io/Parquet.jl",
assets = String[],
),
pages = [
"Home" => "index.md",
"API" => "api.md",
]
)

deploydocs(;
repo="github.com/JuliaIO/Parquet.jl",
)

29 changes: 29 additions & 0 deletions docs/src/api.md
@@ -0,0 +1,29 @@
```@meta
CurrentModule = Parquet
```

# API
```@index
Pages = ["api.md"]
```

## Basic Usage
```@docs
Parquet.File
Parquet.Table
Parquet.Dataset
read_parquet
write_parquet
```

## Low-level Usage
```@docs
Page
PageLRU
TablePartition
TablePartitions
ColCursor
BatchedColumnsCursor
DatasetPartitions
Schema
```
15 changes: 15 additions & 0 deletions docs/src/index.md
@@ -0,0 +1,15 @@
```@meta
CurrentModule = Parquet
```

!!! note

Docs currently under construction!


# Parquet.jl
Parquet is a tabular, columnar storage format that supports nested data and optional
compression.

The parquet format specification can be found
[here](https://github.com/apache/parquet-format).
21 changes: 15 additions & 6 deletions src/cursor.jl
@@ -2,10 +2,13 @@
# layer 3 access
# read data as records which are named tuple representations of the schema

##
# Column cursor iterates through all values of the column, including null values.
# Each iteration returns the value (as a Union{T,Nothing}), definition level, and repetition level for each value.
# Row can be deduced from repetition level.
"""
ColCursor{T}

A cursor that iterates through all values of a column, including null values.
Each iteration returns the value (as a `Union{T,Nothing}`), the definition level,
and the repetition level. The row can be deduced from the repetition level.
"""
mutable struct ColCursor{T}
par::Parquet.File
colname::Vector{String} # column name (full path in schema)
@@ -226,13 +229,16 @@ mutable struct BatchedColumnsCursor{T}
end

"""
BatchedColumnsCursor

Create a cursor to iterate over batches of column values. Each iteration returns a named tuple mapping column names to a batch of column values. Files with nested schemas cannot be read with this cursor.

## Constructors
```julia
BatchedColumnsCursor(par::Parquet.File; kwargs...)
```

Cursor options:
## Arguments
- `rows`: the row range to iterate through, all rows by default.
- `batchsize`: maximum number of rows to read in each batch (default: row count of first row group).
- `reusebuffer`: boolean to indicate whether to reuse the buffers with every iteration; if each iteration processes the batch and does not need to refer to the same data buffer again, then setting this to `true` reduces GC pressure and can help significantly while processing large files.
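A hedged sketch of how this cursor might be used (the file path and the per-batch processing are illustrative assumptions, not part of the package):

```julia
using Parquet

pf = Parquet.File("data.parquet")   # placeholder path
cursor = BatchedColumnsCursor(pf; batchsize=10_000, reusebuffer=true)

for batch in cursor
    # `batch` is a named tuple of column vectors; with reusebuffer=true the
    # buffers are recycled, so finish processing `batch` before the next iteration.
    for (colname, values) in pairs(batch)
        println(colname, " => ", length(values), " values in this batch")
    end
end
close(pf)
```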
@@ -363,13 +369,16 @@ mutable struct RecordCursor{T}
end

"""
RecordCursor

Create a cursor to iterate over records. In parallel mode, multiple remote cursors can be created and iterated over in parallel.

## Constructor
```julia
RecordCursor(par::Parquet.File; kwargs...)
```

Cursor options:
## Arguments
- `rows`: the row range to iterate through, all rows by default.
- `colnames`: the column names to retrieve; all by default.
"""
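A hedged usage sketch (placeholder path; the printed records depend entirely on the file's schema):

```julia
using Parquet

pf = Parquet.File("data.parquet")   # placeholder path
rc = RecordCursor(pf; rows=1:100)   # restrict iteration to the first 100 rows

for rec in rc
    # each record is a named tuple shaped like the file's schema
    @show rec
end
close(pf)
```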
4 changes: 3 additions & 1 deletion src/dataset.jl
@@ -137,6 +137,8 @@ function dataset_schema(path::String)
end

"""
DatasetPartitions

Iterator to iterate over partitions of a parquet dataset, returned by the `Tables.partitions(dataset)` method.
Each partition is a Parquet.Table.
"""
@@ -211,4 +213,4 @@ function Tables.getcolumn(d::Dataset, i::Int)
loaded(d) || load(d)
getfield(d, :columns)[i]
end
Tables.partitions(d::Dataset) = DatasetPartitions(d, getfield(d, :filter))
Tables.partitions(d::Dataset) = DatasetPartitions(d, getfield(d, :filter))
16 changes: 13 additions & 3 deletions src/reader.jl
@@ -4,7 +4,15 @@ const SZ_PAR_MAGIC = length(PAR_MAGIC)
const SZ_FOOTER = 4
const SZ_VALID_PAR = 2*SZ_PAR_MAGIC + SZ_FOOTER

# page is the unit of compression
"""
Page

Data structure representing a parquet "page".

Column chunks are divided into pages, which are the individual units of compression
and encoding. A single column chunk may contain multiple page types.
"""
mutable struct Page
colchunk::ColumnChunk
hdr::PageHeader
@@ -14,8 +22,10 @@ mutable struct Page
end

"""
Keeps a cache of pages read from a file.
Pages are kept as weak refs, so that they can be collected when there's memory pressure.
PageLRU

Keeps a cache of pages read from a file. Pages are kept as weak refs, so that
they can be collected when there's memory pressure.
"""
struct PageLRU
refs::Dict{Tuple{ColumnChunk,Int64},WeakRef}
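The weak-reference caching idea behind `PageLRU` can be sketched generically. This is an illustrative pattern only, not the package's actual implementation; `WeakCache` and `getcached!` are hypothetical names:

```julia
# A cache whose entries stay alive only while some other reference holds them,
# so the GC is free to reclaim cached values under memory pressure.
struct WeakCache{K,V}
    refs::Dict{K,WeakRef}
end
WeakCache{K,V}() where {K,V} = WeakCache{K,V}(Dict{K,WeakRef}())

function getcached!(make, cache::WeakCache{K,V}, key::K) where {K,V}
    r = get(cache.refs, key, nothing)
    if r !== nothing && r.value !== nothing
        # cache hit: the value has not been garbage-collected yet
        return r.value::V
    end
    # cache miss (or the weak ref was collected): rebuild and re-cache
    v = make()::V
    cache.refs[key] = WeakRef(v)
    return v
end
```

A caller would do something like `page = getcached!(() -> read_page(io, chunk), cache, (chunk, offset))`, where the reader function and key shape are assumptions for illustration.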
12 changes: 9 additions & 3 deletions src/schema.jl
@@ -4,6 +4,11 @@
const TLogicalTypeMap = Dict{Union{Int32,Vector{String}},Tuple{DataType,Function}}

# schema and helper methods
"""
Schema

Parquet table schema.
"""
mutable struct Schema
schema::Vector{SchemaElement}
map_logical_types::TLogicalTypeMap
@@ -43,7 +48,8 @@ mutable struct Schema
end
end
end
new(elems, map_logical_types, name_lookup, Dict{Vector{String},Union{DataType,Union}}(), Dict{Vector{String},Union{DataType,Union}}())
new(elems, map_logical_types, name_lookup, Dict{Vector{String},Union{DataType,Union}}(),
Dict{Vector{String},Union{DataType,Union}}())
end
end

@@ -189,7 +195,7 @@ num_children(schelem::SchemaElement) = hasproperty(schelem, :num_children) ? sch
function max_repetition_level(sch::Schema, schname::T) where {T <: AbstractVector{String}}
lev = isrepeated(sch, schname) ? 1 : 0
istoplevel(schname) ? lev : (lev + max_repetition_level(sch, parentname(schname)))
end
end

function max_definition_level(sch::Schema, schname::T) where {T <: AbstractVector{String}}
lev = isrequired(sch, schname) ? 0 : 1
@@ -220,4 +226,4 @@ function map_logical_decimal(precision::Int32, scale::Int32; use_float::Bool=fal
# use Decimal
return (Decimal, (bytes)->logical_decimal_scaled(bytes, T, scale))
end
end
end
4 changes: 4 additions & 0 deletions src/simple_reader.jl
@@ -82,6 +82,8 @@ function column_generator(table::Table, colidx::Int, len::Int)
end

"""
TablePartition

Represents one partition of the parquet file. Typically this is a row group, but it can be any other unit, as specified when opening the table.
"""
@@ -91,6 +93,8 @@ struct TablePartition <: Tables.AbstractColumns
end

"""
TablePartitions

Iterator over the partitions of a parquet file, returned by the `Tables.partitions(table)` method.
Each partition is typically a row group, but it can be any other unit, as specified when opening the table.
"""