Skip to content

Commit 76d0718

Browse files
xiaodaigh authored and tanmaykm committed
Parquet Writer #66
1 parent 29308e3 commit 76d0718

File tree

7 files changed

+716
-8
lines changed

7 files changed

+716
-8
lines changed

Project.toml

+14-3
Original file line numberDiff line numberDiff line change
@@ -2,27 +2,38 @@ name = "Parquet"
22
uuid = "626c502c-15b0-58ad-a749-f091afb673ae"
33
keywords = ["parquet", "julia", "columnar-storage"]
44
license = "MIT"
5-
desc = "Julia implementation of parquet columnar file format reader"
6-
version = "0.5.2"
5+
desc = "Julia implementation of parquet columnar file format reader and writer"
6+
version = "0.5.3"
77

88
[deps]
9+
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
910
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
1011
CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
12+
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
1113
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
14+
LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf"
1215
MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
16+
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
1317
Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9"
18+
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
1419
Thrift = "8d9c9c80-f77e-5080-9541-c6f69d204e22"
1520

1621
[compat]
22+
CategoricalArrays = "0.6,0.7,0.8"
1723
CodecZlib = "0.5,0.6,0.7"
1824
CodecZstd = "0.6,0.7"
25+
DataAPI = "1"
26+
LittleEndianBase128 = "0.3"
1927
MemPool = "0.2"
28+
Missings = "0.3,0.4"
2029
Snappy = "0.3"
30+
Tables = "1"
2131
Thrift = "0.6,0.7"
2232
julia = "1"
2333

2434
[extras]
35+
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
2536
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
2637

2738
[targets]
28-
test = ["Test"]
39+
test = ["Test", "Random"]

README.md

+30
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
[![Build status](https://ci.appveyor.com/api/projects/status/gx8pvdiiery74r9l/branch/master?svg=true)](https://ci.appveyor.com/project/tanmaykm/parquet-jl-cufdj/branch/master)
55
[![Coverage Status](https://coveralls.io/repos/github/JuliaIO/Parquet.jl/badge.svg?branch=master)](https://coveralls.io/github/JuliaIO/Parquet.jl?branch=master)
66

7+
## Reader
8+
79
Load a [parquet file](https://en.wikipedia.org/wiki/Apache_Parquet). Only metadata is read initially; data is loaded in chunks on demand. (Note: [ParquetFiles.jl](https://github.com/queryverse/ParquetFiles.jl) also provides load support for Parquet files under the FileIO.jl package.)
810

911
`ParFile` represents a Parquet file at `path` open for reading. Options to map logical types can be provided via `map_logical_types`.
@@ -132,3 +134,31 @@ The reader will interpret logical types based on the `map_logical_types` provide
132134
- `logical_string(v)`: Applicable for strings that are `BYTE_ARRAY` values. Without this, they are represented in a `Vector{UInt8}` type. With this they are converted to `String` types.
133135

134136
Variants of these methods or custom methods can also be applied by the caller.
137+
138+
## Writer
139+
140+
You can write any Tables.jl column-accessible table whose columns have one of these element types, or the union of one of them with `Missing`: `Int32`, `Int64`, `String`, `Bool`, `Float32`, `Float64`.
141+
142+
However, `CategoricalArray` columns are not yet supported, and neither are columns of these types: `Int96`, `Int128`, `Date`, and `DateTime`.
143+
144+
### Writer Example
145+
146+
```julia
147+
tbl = (
148+
int32 = Int32.(1:1000),
149+
int64 = Int64.(1:1000),
150+
float32 = Float32.(1:1000),
151+
float64 = Float64.(1:1000),
152+
bool = rand(Bool, 1000),
153+
string = [randstring(8) for i in 1:1000],
154+
int32m = rand([missing, 1:100...], 1000),
155+
int64m = rand([missing, 1:100...], 1000),
156+
float32m = rand([missing, Float32.(1:100)...], 1000),
157+
float64m = rand([missing, Float64.(1:100)...], 1000),
158+
boolm = rand([missing, true, false], 1000),
159+
stringm = rand([missing, "abc", "def", "ghi"], 1000)
160+
)
161+
162+
file = tempname()*".parquet"
163+
write_parquet(file, tbl)
164+
```

src/Parquet.jl

+8
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,20 @@ using CodecZstd
77
using MemPool
88
using Dates
99

10+
if VERSION < v"1.3"
11+
using Missings: nonmissingtype
12+
end
13+
14+
const PARQUET_JL_VERSION = v"0.5.3"
15+
1016
import Base: show, open, close, values, eltype, length
1117
import Thrift: isfilled
1218

1319
export is_par_file, ParFile, show, nrows, ncols, rowgroups, columns, pages, bytes, values, colname, colnames
1420
export schema
1521
export logical_timestamp, logical_string
1622
export RecordCursor, BatchedColumnsCursor
23+
export write_parquet
1724

1825
# package code goes here
1926
include("PAR2/PAR2.jl")
@@ -23,5 +30,6 @@ include("schema.jl")
2330
include("reader.jl")
2431
include("cursor.jl")
2532
include("show.jl")
33+
include("writer.jl")
2634

2735
end # module

src/reader.jl

+5-5
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,11 @@ end
322322
function read_levels_and_nmissing(io, defn_enc::Int32, repn_enc::Int32, num_values::Int32, par::ParFile, page::Page, defn_levels::Vector{Int32}, repn_levels::Vector{Int32}, defn_offset::Int=0, repn_offset::Int=0)
323323
cname = colname(page.colchunk)
324324

325+
#@debug("before reading repn levels bytesavailable in page: $(bytesavailable(io))")
326+
# read repetition levels. skipped if all columns are at 1st level
327+
max_repn_level = max_repetition_level(par.schema, cname)
328+
((length(cname) > 1) && (max_repn_level > 0)) && read_levels(io, max_repn_level, repn_enc, num_values, repn_levels, repn_offset)
329+
325330
#@debug("before reading defn levels bytesavailable in page: $(bytesavailable(io))")
326331
# read definition levels. skipped if column is required
327332
nmissing = Int32(0)
@@ -332,11 +337,6 @@ function read_levels_and_nmissing(io, defn_enc::Int32, repn_enc::Int32, num_valu
332337
end
333338
end
334339

335-
#@debug("before reading repn levels bytesavailable in page: $(bytesavailable(io))")
336-
# read repetition levels. skipped if all columns are at 1st level
337-
max_repn_level = max_repetition_level(par.schema, cname)
338-
((length(cname) > 1) && (max_repn_level > 0)) && read_levels(io, max_repn_level, repn_enc, num_values, repn_levels, repn_offset)
339-
340340
nmissing
341341
end
342342

0 commit comments

Comments
 (0)