From a73b231cce8a31f26443f87a4c1d242f21c1d13e Mon Sep 17 00:00:00 2001 From: KristofferC Date: Mon, 17 Jun 2024 12:41:43 +0200 Subject: [PATCH 1/3] remove DataFree stuff #1 --- README.md | 23 ---------- benchmarks/benchmarkdatafreetree.jl | 52 ---------------------- src/NearestNeighbors.jl | 1 - src/datafreetree.jl | 68 ----------------------------- test/runtests.jl | 1 - test/test_datafreetree.jl | 29 ------------ 6 files changed, 174 deletions(-) delete mode 100644 benchmarks/benchmarkdatafreetree.jl delete mode 100644 src/datafreetree.jl delete mode 100644 test/test_datafreetree.jl diff --git a/README.md b/README.md index ca57196..4b88cd6 100644 --- a/README.md +++ b/README.md @@ -149,26 +149,3 @@ idxs = inrange(balltree, point, r, true) neighborscount = inrangecount(balltree, point, r, true) # counts points without allocating index arrays ``` - -## Using On-Disk Data Sets - -By default, trees store a copy of the `data` provided during construction. For data sets larger than available memory, `DataFreeTree` can be used to strip a tree of its data field and re-link it later. - -Example with a large on-disk data set: - -```julia -using Mmap -ndim = 2 -ndata = 10_000_000_000 -data = Mmap.mmap(datafilename, Matrix{Float32}, (ndim, ndata)) -data[:] = rand(Float32, ndim, ndata) # create example data -dftree = DataFreeTree(KDTree, data) -``` - -`dftree` stores the indexing data structures. To perform look-ups, re-link the tree to the data: - -```julia -tree = injectdata(dftree, data) # yields a KDTree -knn(tree, data[:,1], 3) # perform operations as usual -``` - diff --git a/benchmarks/benchmarkdatafreetree.jl b/benchmarks/benchmarkdatafreetree.jl deleted file mode 100644 index 89e70bd..0000000 --- a/benchmarks/benchmarkdatafreetree.jl +++ /dev/null @@ -1,52 +0,0 @@ -using NearestNeighbors -using Benchmarks -using Mmap - -runtimes = [] -runtimesreordered = [] - -function create_tree(n, reorder=false) - filename = tempname() - d = 10 - data = Mmap.mmap(filename, Matrix{Float32}, (d, n)) - data[:] = rand(Float32, d, n) - if reorder - reorderbuffer = Mmap.mmap(filename, Matrix{Float32}, (d, n)) - t = injectdata(DataFreeTree(KDTree, data, reorderbuffer = reorderbuffer), reorderbuffer) - else - t = injectdata(DataFreeTree(KDTree, data), data) - end - - return t, data, filename -end - -function knnbench(tree, data, n, N) - ind = rand(1:n, N) - knn(tree, data[:,ind], 3)[2] -end - -function bench() - runtimes = [] - runtimesreordered = [] - ns = [10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000] - for n in ns - t, data, filename = create_tree(n) - tr, datar, filenamer = create_tree(n, true) - - bm = @benchmark knnbench(t, data, n, 1000) - push!(runtimes, mean(bm.samples.elapsed_times) / 1e9) - - bmr = @benchmark knnbench(tr, datar, n, 1000) - push!(runtimesreordered, mean(bmr.samples.elapsed_times) / 1e9) - - rm(filename) - rm(filenamer) - end - - println("Speedups through reordering:") - for i in 1:length(ns) - println("$(ns[i]): $(runtimes[i] ./ runtimesreordered[i])") - end - return -end -bench() diff --git a/src/NearestNeighbors.jl b/src/NearestNeighbors.jl index 8fb08f9..01f8ec1 100644 --- a/src/NearestNeighbors.jl +++ b/src/NearestNeighbors.jl @@ -50,7 +50,6 @@ get_T(::T) where {T} = Float64 include("evaluation.jl") include("tree_data.jl") -include("datafreetree.jl") include("knn.jl") include("inrange.jl") include("hyperspheres.jl") diff --git a/src/datafreetree.jl b/src/datafreetree.jl deleted file mode 100644 index b0b3977..0000000 --- a/src/datafreetree.jl +++ /dev/null @@ -1,68 +0,0 @@ -# A DataFreeTree wraps a descendant of NNTree -# which does not contain a copy of the data -struct DataFreeTree{N <: NNTree} - size::Tuple{Int,Int} - hash::UInt64 - tree::N -end - -function get_points_dim(data) - if eltype(data) <: AbstractVector - ndim = eltype(eltype(data)) - npoints = length(data) - elseif typeof(data) <: AbstractMatrix - ndim = size(data, 1) - npoints = size(data, 2) - else - error("Unknown input data format") - end - return ndim, npoints -end - -""" - DataFreeTree(treetype, data[, reorderbufffer = similar(data), kwargs...]) -> datafreetree - -Creates a `DataFreeTree` which wraps a `KDTree` or `BallTree`. Keywords arguments are passed -to their respective constructors. - -The `KDTree` or `BallTree` will be stored without a reference to the underlaying data. `injectdata` -has to be used to re-link them to a data array before use. -""" -function DataFreeTree(::Type{T}, data, args...; reorderbuffer = data[:, 1:0], kargs...) where {T <: NNTree} - tree = T(data, args...; storedata = false, reorderbuffer = reorderbuffer, kargs...) - ndim, npoints = get_points_dim(data) - DataFreeTree((ndim, npoints), hash(tree.reordered ? reorderbuffer : data), tree) -end - -""" - injectdata(datafreetree, data) -> tree - -Returns the `KDTree`/`BallTree` wrapped by `datafreetree`, set up to use `data` for the points data. -""" -function injectdata(datafreetree::DataFreeTree, data::AbstractMatrix{T}) where {T} - dim = size(data, 1) - npoints = size(data, 2) - if isbitstype(T) - new_data = copy_svec(T, data, Val(dim)) - else - new_data = SVector{dim,T}[SVector{dim,T}(data[:, i]) for i in 1:npoints] - end - new_hash = hash(data) - injectdata(datafreetree, new_data, new_hash) -end - -function injectdata(datafreetree::DataFreeTree, data::AbstractVector{V}, new_hash::UInt64=0) where {V <: AbstractVector} - if new_hash == 0 - new_hash = hash(data) - end - if length(V) != datafreetree.size[1] || length(data) != datafreetree.size[2] - throw(DimensionMismatch("NearestNeighbors:injectdata: The size of 'data' $(length(data)) × $(length(V)) does not match the data array used to construct the tree $(datafreetree.size).")) - end - if new_hash != datafreetree.hash - throw(ArgumentError("NearestNeighbors:injectdata: The hash of 'data' does not match the hash of the data array used to construct the tree.")) - end - - typ = typeof(datafreetree.tree) - fields = map(x -> getfield(datafreetree.tree, x), fieldnames(typeof(datafreetree.tree)))[2:end] - typ(data, fields...) -end diff --git a/test/runtests.jl b/test/runtests.jl index a257562..cfa40ab 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -30,7 +30,6 @@ const trees_with_brute = [BruteTree; trees] include("test_knn.jl") include("test_inrange.jl") include("test_monkey.jl") -include("test_datafreetree.jl") @testset "periodic euclidean" begin pred = PeriodicEuclidean([Inf, 2.5]) diff --git a/test/test_datafreetree.jl b/test/test_datafreetree.jl deleted file mode 100644 index f6070af..0000000 --- a/test/test_datafreetree.jl +++ /dev/null @@ -1,29 +0,0 @@ -using Mmap - -@testset "datafreetree" begin - function test(data, data2, data3) - t = DataFreeTree(KDTree, data) - @test_throws ArgumentError injectdata(t, data2) - @test_throws DimensionMismatch injectdata(t, data3) - for typ in [KDTree, BallTree] - dfilename = tempname() - d = 2 - n = 100 - mktemp() do _, io - data = Mmap.mmap(io, Matrix{Float32}, (d, n)) - data[:] = rand(Float32, d, n) - t = injectdata(DataFreeTree(typ, data), data) - tr = typ(data) - for i = 1:n - @test knn(t, data[:,i], 3) == knn(tr, data[:,i], 3) - end - finalize(data) - end - end - end - data = rand(2,100) - data2 = rand(2,100) - data3 = rand(3,100) - test(data, data2, data3) - test(view(data, :, :), view(data2, :, :), view(data3, :, :)) -end From cc0d427f49b5c47e91b824741c00b2b31859e496 Mon Sep 17 00:00:00 2001 From: KristofferC Date: Mon, 17 Jun 2024 12:41:53 +0200 Subject: [PATCH 2/3] remove `storedata` kwarg --- src/ball_tree.jl | 17 +++++------------ src/brute_tree.jl | 8 ++++---- src/kd_tree.jl | 8 ++------ 3 files changed, 11 insertions(+), 22 deletions(-) diff --git a/src/ball_tree.jl b/src/ball_tree.jl index 5115f9e..abbbe5f 100644 --- a/src/ball_tree.jl +++ b/src/ball_tree.jl @@ -29,15 +29,11 @@ end Creates a `BallTree` from the data using the given `metric` and `leafsize`. """ function BallTree(data::AbstractVector{V}, - metric::M = Euclidean(); + metric::Metric = Euclidean(); leafsize::Int = 10, reorder::Bool = true, - storedata::Bool = true, - reorderbuffer::Vector{V} = Vector{V}()) where {V <: AbstractArray, M <: Metric} - reorder = !isempty(reorderbuffer) || (storedata ? reorder : false) - + reorderbuffer::Vector{V} = Vector{V}()) where {V <: AbstractArray} tree_data = TreeData(data, leafsize) - n_d = length(V) n_p = length(data) array_buffs = ArrayBuffers(Val{length(V)}, get_T(eltype(V))) @@ -78,25 +74,22 @@ function BallTree(data::AbstractVector{V}, indices = indices_reordered end - BallTree(storedata ? data : similar(data, 0), hyper_spheres, indices, metric, tree_data, reorder) + BallTree(data, hyper_spheres, indices, metric, tree_data, reorder) end function BallTree(data::AbstractVecOrMat{T}, - metric::M = Euclidean(); + metric::Metric = Euclidean(); leafsize::Int = 10, - storedata::Bool = true, reorder::Bool = true, reorderbuffer::Matrix{T} = Matrix{T}(undef, 0, 0)) where {T <: AbstractFloat, M <: Metric} dim = size(data, 1) - npoints = size(data, 2) points = copy_svec(T, data, Val(dim)) if isempty(reorderbuffer) reorderbuffer_points = Vector{SVector{dim,T}}() else reorderbuffer_points = copy_svec(T, reorderbuffer, Val(dim)) end - BallTree(points, metric, leafsize = leafsize, storedata = storedata, reorder = reorder, - reorderbuffer = reorderbuffer_points) + BallTree(points, metric; leafsize, reorder, reorderbuffer = reorderbuffer_points) end # Recursive function to build the tree. diff --git a/src/brute_tree.jl b/src/brute_tree.jl index c6a58f8..4fb5d64 100644 --- a/src/brute_tree.jl +++ b/src/brute_tree.jl @@ -10,7 +10,7 @@ end Creates a `BruteTree` from the data using the given `metric`. """ function BruteTree(data::AbstractVector{V}, metric::PreMetric = Euclidean(); - reorder::Bool=false, leafsize::Int=0, storedata::Bool=true) where {V <: AbstractVector} + reorder::Bool=false, leafsize::Int=0) where {V <: AbstractVector} if metric isa Distances.UnionMetrics p = parameters(metric) if p !== nothing && length(p) != length(V) @@ -19,14 +19,14 @@ function BruteTree(data::AbstractVector{V}, metric::PreMetric = Euclidean(); end end - BruteTree(storedata ? data : Vector{V}(), metric, reorder) + BruteTree(data, metric, reorder) end function BruteTree(data::AbstractVecOrMat{T}, metric::PreMetric = Euclidean(); - reorder::Bool=false, leafsize::Int=0, storedata::Bool=true) where {T} + reorder::Bool=false, leafsize::Int=0) where {T} dim = size(data, 1) BruteTree(copy_svec(T, data, Val(dim)), - metric, reorder = reorder, leafsize = leafsize, storedata = storedata) + metric; reorder, leafsize) end function _knn(tree::BruteTree{V}, diff --git a/src/kd_tree.jl b/src/kd_tree.jl index ddbef8f..70deff5 100644 --- a/src/kd_tree.jl +++ b/src/kd_tree.jl @@ -19,11 +19,8 @@ The `metric` must be a `MinkowskiMetric`. function KDTree(data::AbstractVector{V}, metric::M = Euclidean(); leafsize::Int = 10, - storedata::Bool = true, reorder::Bool = true, reorderbuffer::Vector{V} = Vector{V}()) where {V <: AbstractArray, M <: MinkowskiMetric} - reorder = !isempty(reorderbuffer) || (storedata ? reorder : false) - tree_data = TreeData(data, leafsize) n_p = length(data) @@ -71,13 +68,12 @@ function KDTree(data::AbstractVector{V}, end end - KDTree(storedata ? data : similar(data, 0), hyper_rec, indices, metric, split_vals, split_dims, tree_data, reorder) + KDTree(data, hyper_rec, indices, metric, split_vals, split_dims, tree_data, reorder) end function KDTree(data::AbstractVecOrMat{T}, metric::M = Euclidean(); leafsize::Int = 10, - storedata::Bool = true, reorder::Bool = true, reorderbuffer::Matrix{T} = Matrix{T}(undef, 0, 0)) where {T <: AbstractFloat, M <: MinkowskiMetric} dim = size(data, 1) @@ -87,7 +83,7 @@ end else reorderbuffer_points = copy_svec(T, reorderbuffer, Val(dim)) end - KDTree(points, metric, leafsize = leafsize, storedata = storedata, reorder = reorder, + KDTree(points, metric; leafsize, reorder, reorderbuffer = reorderbuffer_points) end From 763f3a30f1662cce67228a2ed9c5257b9a7f8ccc Mon Sep 17 00:00:00 2001 From: KristofferC Date: Mon, 17 Jun 2024 12:44:57 +0200 Subject: [PATCH 3/3] remove `reorderbuffer` stuff --- src/ball_tree.jl | 28 ++++++++-------------------- src/kd_tree.jl | 33 ++++++++++----------------------- 2 files changed, 18 insertions(+), 43 deletions(-) diff --git a/src/ball_tree.jl b/src/ball_tree.jl index abbbe5f..8271741 100644 --- a/src/ball_tree.jl +++ b/src/ball_tree.jl @@ -31,8 +31,7 @@ Creates a `BallTree` from the data using the given `metric` and `leafsize`. function BallTree(data::AbstractVector{V}, metric::Metric = Euclidean(); leafsize::Int = 10, - reorder::Bool = true, - reorderbuffer::Vector{V} = Vector{V}()) where {V <: AbstractArray} + reorder::Bool = true) where {V <: AbstractArray} tree_data = TreeData(data, leafsize) n_p = length(data) @@ -42,17 +41,12 @@ function BallTree(data::AbstractVector{V}, # Bottom up creation of hyper spheres so need spheres even for leafs) hyper_spheres = Vector{HyperSphere{length(V),eltype(V)}}(undef, tree_data.n_internal_nodes + tree_data.n_leafs) + indices_reordered = Vector{Int}() + data_reordered = Vector{V}() + if reorder - indices_reordered = Vector{Int}(undef, n_p) - if isempty(reorderbuffer) - data_reordered = Vector{V}(undef, n_p) - else - data_reordered = reorderbuffer - end - else - # Dummy variables - indices_reordered = Vector{Int}() - data_reordered = Vector{V}() + resize!(indices_reordered, n_p) + resize!(data_reordered, n_p) end if metric isa Distances.UnionMetrics @@ -80,16 +74,10 @@ end function BallTree(data::AbstractVecOrMat{T}, metric::Metric = Euclidean(); leafsize::Int = 10, - reorder::Bool = true, - reorderbuffer::Matrix{T} = Matrix{T}(undef, 0, 0)) where {T <: AbstractFloat, M <: Metric} + reorder::Bool = true) where {T <: AbstractFloat} dim = size(data, 1) points = copy_svec(T, data, Val(dim)) - if isempty(reorderbuffer) - reorderbuffer_points = Vector{SVector{dim,T}}() - else - reorderbuffer_points = copy_svec(T, reorderbuffer, Val(dim)) - end - BallTree(points, metric; leafsize, reorder, reorderbuffer = reorderbuffer_points) + BallTree(points, metric; leafsize, reorder) end # Recursive function to build the tree. diff --git a/src/kd_tree.jl b/src/kd_tree.jl index 70deff5..1c8515f 100644 --- a/src/kd_tree.jl +++ b/src/kd_tree.jl @@ -17,10 +17,9 @@ Creates a `KDTree` from the data using the given `metric` and `leafsize`. The `metric` must be a `MinkowskiMetric`. """ function KDTree(data::AbstractVector{V}, - metric::M = Euclidean(); + metric::MinkowskiMetric = Euclidean(); leafsize::Int = 10, - reorder::Bool = true, - reorderbuffer::Vector{V} = Vector{V}()) where {V <: AbstractArray, M <: MinkowskiMetric} + reorder::Bool = true) where {V <: AbstractArray} tree_data = TreeData(data, leafsize) n_p = length(data) @@ -28,17 +27,12 @@ function KDTree(data::AbstractVector{V}, split_vals = Vector{eltype(V)}(undef, tree_data.n_internal_nodes) split_dims = Vector{UInt16}(undef, tree_data.n_internal_nodes) + indices_reordered = Vector{Int}() + data_reordered = Vector{V}() + if reorder - indices_reordered = Vector{Int}(undef, n_p) - if isempty(reorderbuffer) - data_reordered = Vector{V}(undef, n_p) - else - data_reordered = reorderbuffer - end - else - # Dummy variables - indices_reordered = Vector{Int}() - data_reordered = Vector{V}() + resize!(indices_reordered, n_p) + resize!(data_reordered, n_p) end if metric isa Distances.UnionMetrics @@ -72,19 +66,12 @@ function KDTree(data::AbstractVector{V}, end function KDTree(data::AbstractVecOrMat{T}, - metric::M = Euclidean(); + metric::MinkowskiMetric = Euclidean(); leafsize::Int = 10, - reorder::Bool = true, - reorderbuffer::Matrix{T} = Matrix{T}(undef, 0, 0)) where {T <: AbstractFloat, M <: MinkowskiMetric} + reorder::Bool = true) where {T <: AbstractFloat} dim = size(data, 1) points = copy_svec(T, data, Val(dim)) - if isempty(reorderbuffer) - reorderbuffer_points = Vector{SVector{dim,T}}() - else - reorderbuffer_points = copy_svec(T, reorderbuffer, Val(dim)) - end - KDTree(points, metric; leafsize, reorder, - reorderbuffer = reorderbuffer_points) + KDTree(points, metric; leafsize, reorder) end function build_KDTree(index::Int,