diff --git a/flatnav/Index.h b/flatnav/Index.h
index 2d59b73..522f44a 100644
--- a/flatnav/Index.h
+++ b/flatnav/Index.h
@@ -25,6 +25,33 @@
 
 namespace flatnav {
 
+/**
+ * Plain bundle of the sizing and bookkeeping state an index needs.
+ * NOTE(review): no builder methods exist yet and nothing visible uses it.
+ */
+
+struct IndexParameterConfig {
+
+  IndexParameterConfig() = default;
+  size_t _M{0}; // zero-initialized: a defaulted ctor would leave it garbage
+  // size of one data point (does not support variable-size data, strings)
+  size_t _data_size_bytes{0};
+  // Node consists of: ([data] [M links] [data label]). This layout was chosen
+  // after benchmarking - it's slightly more cache-efficient than others.
+  size_t _node_size_bytes{0};
+  size_t _max_node_count{0}; // Determines size of internal pre-allocated memory
+  size_t _cur_num_nodes{0};
+  std::mutex _cur_num_nodes_global_lock;
+  std::condition_variable _cur_num_nodes_global_cv;
+  std::atomic<bool> _current_node_inserted{false}; // brace-init: atomics can't be copied
+
+  // Remembers which nodes we've visited, to avoid re-computing distances.
+  // Might be a caching problem in beamSearch - needs to be profiled.
+  ShardedExplicitSet *_sharded_visited_nodes{nullptr}; // never deleted by this struct
+
+  uint32_t _num_threads{0};
+};
+
 // dist_t: A distance function implementing DistanceInterface.
 // label_t: A fixed-width data type for the label (meta-data) of each point.
 template <typename dist_t, typename label_t> class Index {
@@ -69,7 +96,8 @@ template <typename dist_t, typename label_t> class Index {
 
   template <typename Archive> void serialize(Archive &archive) {
     archive(_M, _data_size_bytes, _node_size_bytes, _max_node_count,
-            _cur_num_nodes, *_distance, _visited_nodes);
+            _cur_num_nodes, *_distance, _visited_nodes,
+            *_sharded_visited_nodes);
 
     // Serialize the allocated memory for the index & query.
     archive(
@@ -90,7 +118,7 @@ template <typename dist_t, typename label_t> class Index {
         int max_edges_per_node)
       : _M(max_edges_per_node), _max_node_count(dataset_size),
         _cur_num_nodes(0), _distance(dist), _visited_nodes(dataset_size + 1),
-        _num_threads(0),
+        _num_threads(std::thread::hardware_concurrency()),
         _sharded_visited_nodes(new ShardedVisitedSet(
             /* total_size = */ dataset_size + 1,
             /* num_shards = */ std::thread::hardware_concurrency())) {
@@ -98,7 +126,6 @@ template <typename dist_t, typename label_t> class Index {
     _data_size_bytes = _distance->dataSize();
     _node_size_bytes =
         _data_size_bytes + (sizeof(node_id_t) * _M) + sizeof(label_t);
-
     size_t index_memory_size = _node_size_bytes * _max_node_count;
 
     _index_memory = new char[index_memory_size];
@@ -108,8 +135,8 @@ template <typename dist_t, typename label_t> class Index {
   // this class.
   Index() = default;
 
-  ~Index() { 
-    delete[] _index_memory; 
+  ~Index() {
+    delete[] _index_memory;
     delete _sharded_visited_nodes;
   }
 
@@ -137,9 +164,9 @@ template <typename dist_t, typename label_t> class Index {
 
     // search graph for neighbors of new node, connect to them
     if (new_node_id > 0) {
-      PriorityQueue neighbors =
-          beamSearch(/* query = */ data, /* entry_node = */ entry_node,
-                     /* buffer_size = */ ef_construction);
+      PriorityQueue neighbors = concurrentBeamSearch(
+          /* query = */ data, /* entry_node = */ entry_node,
+          /* buffer_size = */ ef_construction);
       selectNeighbors(/* neighbors = */ neighbors);
       connectNeighbors(neighbors, new_node_id);
     }
@@ -162,20 +189,25 @@ template <typename dist_t, typename label_t> class Index {
     std::vector<std::thread> thread_pool(thread_count);
     uint32_t batch_size = labels.size() / thread_count;
 
+    std::cout << "Starting parallel add"
+              << "\n"
+              << std::flush;
+
     for (uint32_t thread_id = 0; thread_id < thread_count; thread_id++) {
-      void *current_batch =
-          (float *)data + (thread_id * batch_size * _data_size_bytes);
-      thread_pool[thread_id] =
-          std::thread(&addParallelBatch, current_batch, batch_size,
-                      std::ref(labels), ef_construction, num_initializations);
+      uint32_t label_start = thread_id * batch_size; // first vector of the slice
+      void *current_batch = // step in floats: a byte count overshoots on float*
+          (float *)data + (size_t)label_start * _distance->dimension();
+      thread_pool[thread_id] = std::thread(
+          &Index::addParallelBatch, this, current_batch, batch_size,
+          label_start, std::ref(labels), ef_construction, num_initializations);
     }
-    // Do the actual work
+
     for (uint32_t thread_id = 0; thread_id < thread_count; thread_id++) {
       thread_pool[thread_id].join();
     }
   }
 
-  void addParallelBatch(void *batch, uint32_t batch_size,
+  void addParallelBatch(void *batch, uint32_t batch_size, uint32_t label_start,
                         const std::vector<label_t> &labels, int ef_construction,
                         int num_initializations = 100) {
 
@@ -183,57 +215,69 @@ template <typename dist_t, typename label_t> class Index {
       throw std::invalid_argument(
           "num_initializations must be greater than 0.");
     }
-
-    for (uint32_t vec_index = 0; vec_index < batch_size; vec_index++) {
-      void *vector = (float *)batch + (vec_index * _data_size_bytes);
-      label_t label = labels[vec_index];
-
-      // Lock from now on until we've inserted the new node into the index.
-      // This prevents multiple threads from trying to insert the same node.
-      // Use the condition variable to stop other threads from busy-waiting,
-      // which would be a waste of CPU cycles.
+    uint32_t vec_dimension = _distance->dimension();
+    {
       std::unique_lock<std::mutex> lock(_cur_num_nodes_global_lock);
-      _cur_num_nodes_global_cv.wait(lock,
-                                    [this] { return !_current_node_inserted; });
 
-      if (_cur_num_nodes >= _max_node_count) {
-        throw std::runtime_error("Maximum number of nodes reached. Consider "
-                                 "increasing the `max_node_count` parameter to "
-                                 "create a larger index.");
-      }
+      for (uint32_t vec_index = 0; vec_index < batch_size; vec_index++) {
+        // `batch` already starts at this thread's slice; only labels are global.
+        void *vector = (float *)batch + (vec_index * vec_dimension);
+        label_t label = labels[label_start + vec_index];
+
+        // Lock from now on until we've inserted the new node into the index.
+        // This prevents multiple threads from trying to insert the same node.
+        // Use the condition variable to stop other threads from busy-waiting,
+        // which would be a waste of CPU cycles.
+
+        // _cur_num_nodes_global_cv.wait(lock,
+        //                               [this] { return
+        //                               !_current_node_inserted; });
+
+        if (_cur_num_nodes >= _max_node_count) {
+          throw std::runtime_error(
+              "Maximum number of nodes reached. Consider "
+              "increasing the `max_node_count` parameter to "
+              "create a larger index.");
+        }
 
-      uint32_t step_size = _cur_num_nodes / num_initializations;
-      if (step_size <= 0) {
-        step_size = 1;
-      }
-      float min_dist = std::numeric_limits<float>::max();
-      node_id_t entry_node = 0;
-      for (node_id_t node = 0; node < _cur_num_nodes; node += step_size) {
-        float dist =
-            _distance->distance(/* x = */ vector, /* y = */ getNodeData(node),
-                                /* asymmetric = */ true);
-        if (dist < min_dist) {
-          min_dist = dist;
-          entry_node = node;
+        uint32_t step_size = _cur_num_nodes / num_initializations;
+        if (step_size <= 0) {
+          step_size = 1;
         }
-      }
-      node_id_t new_node_id;
-      allocateNode(vector, label, new_node_id);
-
-      // Mark the node as inserted, notify other threads, and reset the flag.
-      _current_node_inserted = true;
-      _cur_num_nodes_global_cv.notify_all();
-      _current_node_inserted = false;
-
-      lock.unlock();
-
-      // search graph for neighbors of new no de, connect to them
-      if (new_node_id > 0) {
-        PriorityQueue neighbors =
-            beamSearch(/* query = */ vector, /* entry_node = */ entry_node,
-                       /* buffer_size = */ ef_construction);
-        selectNeighbors(/* neighbors = */ neighbors);
-        connectNeighbors(neighbors, new_node_id);
+        float min_dist = std::numeric_limits<float>::max();
+        node_id_t entry_node = 0;
+        for (node_id_t node = 0; node < _cur_num_nodes; node += step_size) {
+          float dist = _distance->distance(/* x = */ vector, /* y = */
+                                           getNodeData(node),
+                                           /* asymmetric = */ true);
+          if (dist < min_dist) {
+            min_dist = dist;
+            entry_node = node;
+          }
+        }
+        node_id_t new_node_id;
+        allocateNode(vector, label, new_node_id);
+
+        // Mark the node as inserted, notify other threads, and reset the flag.
+        // _current_node_inserted = true;
+        // _cur_num_nodes_global_cv.notify_all();
+        // _current_node_inserted = false;
+
+        // lock.unlock();
+
+        // search graph for neighbors of new node, connect to them
+        if (new_node_id > 0) {
+          PriorityQueue neighbors = concurrentBeamSearch(
+              /* query = */ vector, /* entry_node = */ entry_node,
+              /* buffer_size = */ ef_construction);
+          selectNeighbors(/* neighbors = */ neighbors);
+          connectNeighbors(neighbors, new_node_id);
+        }
+        // _current_node_inserted = true;
+        // _cur_num_nodes_global_cv.notify_all();
+        // _current_node_inserted = false;
+
+        // lock.unlock();
       }
     }
   }
@@ -246,24 +290,21 @@ template <typename dist_t, typename label_t> class Index {
    * @param num_initializations The number of random initializations to use.
    */
   std::vector<dist_label_t> search(const void *query, const int K,
-                                   int ef_search,
-                                   int num_initializations = 100) {
-
+                                           int ef_search,
+                                           int num_initializations = 100) {
     node_id_t entry_node = initializeSearch(query, num_initializations);
-    PriorityQueue neighbors = beamSearch(/* query = */ query,
-                                         /* entry_node = */ entry_node,
-                                         /* buffer_size = */ ef_search);
-    std::vector<dist_label_t> results;
-
+    PriorityQueue neighbors =
+        concurrentBeamSearch(/* query = */ query,
+                             /* entry_node = */ entry_node,
+                             /* buffer_size = */ ef_search);
     while (neighbors.size() > K) {
       neighbors.pop();
     }
-
-    auto size = neighbors.size();
-    results.reserve(size);
+    std::vector<dist_label_t> results;
+    results.reserve(K);
     while (neighbors.size() > 0) {
-      results.push_back(std::make_pair(neighbors.top().first,
-                                       *getNodeLabel(neighbors.top().second)));
+      results.emplace_back(neighbors.top().first,
+                           *getNodeLabel(neighbors.top().second));
       neighbors.pop();
     }
     std::sort(results.begin(), results.end(),
@@ -318,11 +359,14 @@ template <typename dist_t, typename label_t> class Index {
     std::shared_ptr<DistanceInterface<dist_t>> dist =
         std::make_shared<dist_t>();
 
+    ShardedVisitedSet *sharded_visited_nodes = new ShardedVisitedSet();
+
     // 1. Deserialize metadata
     archive(index->_M, index->_data_size_bytes, index->_node_size_bytes,
             index->_max_node_count, index->_cur_num_nodes, *dist,
-            index->_visited_nodes);
+            index->_visited_nodes, *sharded_visited_nodes);
     index->_distance = dist;
+    index->_sharded_visited_nodes = sharded_visited_nodes;
 
     // 3. Allocate memory using deserialized metadata
     index->_index_memory =
@@ -374,6 +418,9 @@ template <typename dist_t, typename label_t> class Index {
 private:
   char *getNodeData(const node_id_t &n) const {
+    if (_index_memory == nullptr) { // check the base: base + offset hides a null base
+      throw std::runtime_error("getNodeData: pointer to node data is null.");
+    }
     char *location = _index_memory + (n * _node_size_bytes);
     return location;
   }
 
@@ -402,8 +449,9 @@ template <typename dist_t, typename label_t> class Index {
     }
     new_node_id = _cur_num_nodes;
 
-    _distance->transformData(/* destination = */ getNodeData(new_node_id),
-                             /* src = */ data);
+    _distance->transformData(
+        /* destination = */ (void *)getNodeData(new_node_id),
+        /* src = */ data);
     *(getNodeLabel(_cur_num_nodes)) = label;
 
     node_id_t *links = getNodeLinks(_cur_num_nodes);
@@ -474,19 +522,21 @@ template <typename dist_t, typename label_t> class Index {
       candidates.pop();
       node_id_t *d_node_links = getNodeLinks(d_node.second);
       for (int i = 0; i < _M; i++) {
-        if (!_visited_nodes[d_node_links[i]]) {
+        node_id_t neighbor_node_id = d_node_links[i];
+        bool neighbor_is_visited = _visited_nodes[neighbor_node_id];
+        if (!neighbor_is_visited) {
           // If we haven't visited the node yet.
-          _visited_nodes.insert(d_node_links[i]);
+          _visited_nodes.insert(neighbor_node_id);
 
           dist = _distance->distance(/* x = */ query,
-                                     /* y = */ getNodeData(d_node_links[i]),
+                                     /* y = */ getNodeData(neighbor_node_id),
                                      /* asymmetric = */ true);
 
           // Include the node in the buffer if buffer isn't full or
           // if the node is closer than a node already in the buffer.
           if (neighbors.size() < buffer_size || dist < max_dist) {
-            candidates.emplace(-dist, d_node_links[i]);
-            neighbors.emplace(dist, d_node_links[i]);
+            candidates.emplace(-dist, neighbor_node_id);
+            neighbors.emplace(dist, neighbor_node_id);
             if (neighbors.size() > buffer_size) {
               neighbors.pop();
             }
@@ -500,13 +550,17 @@ template <typename dist_t, typename label_t> class Index {
     return neighbors;
   }
 
-  void concurrentBeamSearch(const void *query, const node_id_t entry_node,
-                            const int buffer_size) {
+  PriorityQueue concurrentBeamSearch(const void *query,
+                                     const node_id_t entry_node,
+                                     const int buffer_size) {
     PriorityQueue neighbors;
     PriorityQueue candidates;
 
-    // Maybe this is supposed to be clearAll()?
-    _sharded_visited_nodes->clear(entry_node);
+    _sharded_visited_nodes->clearAll();
+
+    if (!_sharded_visited_nodes->allShardsHaveSameMark()) {
+      throw std::runtime_error("All shards must have the same mark.");
+    }
 
     float dist =
         _distance->distance(/* x = */ query, /* y = */ getNodeData(entry_node),
@@ -517,24 +571,27 @@ template <typename dist_t, typename label_t> class Index {
     neighbors.emplace(dist, entry_node);
     _sharded_visited_nodes->insert(entry_node);
 
-    while(!candidates.empty()) {
-      dist_node_it d_node = candidates.top();
+    while (!candidates.empty()) {
+      dist_node_t d_node = candidates.top();
       if ((-d_node.first) > max_dist) {
         break;
       }
       candidates.pop();
       node_id_t *d_node_links = getNodeLinks(d_node.second);
       for (int i = 0; i < _M; i++) {
-        if (!_sharded_visited_nodes[d_node_links[i]]) {
-          _sharded_visited_nodes->insert(d_node_links[i]);
+        node_id_t neighbor_node_id = d_node_links[i];
+        bool neighbor_is_visited = _sharded_visited_nodes->operator[](
+            /* node_id = */ neighbor_node_id);
+        if (!neighbor_is_visited) {
+          _sharded_visited_nodes->insert(/* node_id = */ neighbor_node_id);
 
           dist = _distance->distance(/* x = */ query,
-                                     /* y = */ getNodeData(d_node_links[i]),
+                                     /* y = */ getNodeData(neighbor_node_id),
                                      /* asymmetric = */ true);
 
           if (neighbors.size() < buffer_size || dist < max_dist) {
-            candidates.emplace(-dist, d_node_links[i]);
-            neighbors.emplace(dist, d_node_links[i]);
+            candidates.emplace(-dist, neighbor_node_id);
+            neighbors.emplace(dist, neighbor_node_id);
             if (neighbors.size() > buffer_size) {
               neighbors.pop();
             }
@@ -693,7 +750,6 @@ template <typename dist_t, typename label_t> class Index {
     node_id_t entry_node = 0;
 
     for (node_id_t node = 0; node < _cur_num_nodes; node += step_size) {
-
       float dist =
           _distance->distance(/* x = */ query, /* y = */ getNodeData(node),
                               /* asymmetric = */ true);
diff --git a/flatnav/distances/InnerProductDistance.h b/flatnav/distances/InnerProductDistance.h
index e00a771..4b3d4e5 100644
--- a/flatnav/distances/InnerProductDistance.h
+++ b/flatnav/distances/InnerProductDistance.h
@@ -33,7 +33,7 @@ class InnerProductDistance : public DistanceInterface<InnerProductDistance> {
     setDistanceFunction();
   }
 
-  ~InnerProductDistance() = default;
+  // ~InnerProductDistance() = default;
 
   float distanceImpl(const void *x, const void *y,
                      bool asymmetric = false) const {
diff --git a/flatnav/distances/SquaredL2Distance.h b/flatnav/distances/SquaredL2Distance.h
index fc754c5..d500825 100644
--- a/flatnav/distances/SquaredL2Distance.h
+++ b/flatnav/distances/SquaredL2Distance.h
@@ -31,7 +31,7 @@ class SquaredL2Distance : public DistanceInterface<SquaredL2Distance> {
     setDistanceFunction();
   }
 
-  ~SquaredL2Distance() = default;
+  // ~SquaredL2Distance() = default;
 
   float distanceImpl(const void *x, const void *y,
                      bool asymmetric = false) const {
diff --git a/flatnav/util/ExplicitSet.h b/flatnav/util/ExplicitSet.h
index 2dda668..273dc6e 100644
--- a/flatnav/util/ExplicitSet.h
+++ b/flatnav/util/ExplicitSet.h
@@ -8,6 +8,7 @@
 #include <cereal/types/memory.hpp>
 #include <cstring>
 #include <iostream>
+#include <memory>
 #include <mutex>
 #include <stdint.h>
 #include <vector>
@@ -48,6 +49,8 @@ class ExplicitSet {
 #endif
   }
 
+  inline uint32_t getMark() const { return _mark; }
+
   inline void insert(const uint32_t num) { set(num); }
 
   inline void set(const uint32_t num) { _table[num] = _mark; }
@@ -100,15 +103,34 @@ class ExplicitSet {
 
 class ShardedExplicitSet {
   uint32_t _shard_size;
-  std::vector<ExplicitSet *> _shards;
-  std::vector<std::mutex> _shard_mutexes;
+  std::vector<std::unique_ptr<ExplicitSet>> _shards;
+
+  // Mutexes for each shard
+  // We are using a std::unique_ptr here because std::mutex is neither
+  // copy-constructible nor move-constructible and std::vector requires
+  // its elements to be copy-constructible or move-constructible.
+  std::vector<std::unique_ptr<std::mutex>> _shard_mutexes;
 
 public:
+  // Constructor for cereal. Do not call except for serialization.
+  ShardedExplicitSet() = default;
+
+  /**
+   * @brief Construct a new Sharded Explicit Set object
+   *
+   * @param total_size Number of ids to track. Every id in [0, total_size)
+   * is guaranteed to land in an existing shard.
+   * @param num_shards Number of shards, each one an ExplicitSet. 0 (e.g. an
+   * unknown std::thread::hardware_concurrency()) is treated as a single shard.
+   * Each shard holds total_size / num_shards + 1 slots: plain floor division
+   * would map the trailing ids to the non-existent shard index num_shards.
+   */
   ShardedExplicitSet(uint32_t total_size, uint32_t num_shards)
-      : _shard_size(total_size / num_shards), _shards(num_shards),
-        _shard_mutexes(num_shards) {
-    for (uint32_t i = 0; i < num_shards; i++) {
-      _shards[i] = new ExplicitSet(_shard_size);
+      : _shard_size(total_size / (num_shards ? num_shards : 1) + 1),
+        _shards(num_shards ? num_shards : 1), _shard_mutexes(_shards.size()) {
+    for (uint32_t i = 0; i < _shards.size(); i++) {
+      _shards[i] = std::make_unique<ExplicitSet>(_shard_size);
+      _shard_mutexes[i] = std::make_unique<std::mutex>();
     }
   }
 
@@ -116,7 +138,7 @@ class ShardedExplicitSet {
     uint32_t shard_id = node_id / _shard_size;
 
     {
-      std::lock_guard<std::mutex> lock(_shard_mutexes[shard_id]);
+      std::lock_guard<std::mutex> lock(*(_shard_mutexes[shard_id]));
       uint32_t index_in_shard = node_id % _shard_size;
       _shards[shard_id]->insert(index_in_shard);
     }
@@ -124,14 +146,15 @@ class ShardedExplicitSet {
 
   inline bool operator[](uint32_t node_id) {
     uint32_t shard_id = node_id / _shard_size;
-    std::lock_guard<std::mutex> lock(_shard_mutexes[shard_id]);
+
+    std::lock_guard<std::mutex> lock(*(_shard_mutexes[shard_id]));
     uint32_t index_in_shard = node_id % _shard_size;
-    return (*_shards[shard_id])[index_in_shard];
+    return _shards[shard_id]->operator[](index_in_shard);
   }
 
   inline void clear(uint32_t node_id) {
     uint32_t shard_id = node_id / _shard_size;
-    std::lock_guard<std::mutex> lock(_shard_mutexes[shard_id]);
+    std::lock_guard<std::mutex> lock(*(_shard_mutexes[shard_id]));
     _shards[shard_id]->clear();
   }
 
@@ -140,19 +163,41 @@ class ShardedExplicitSet {
     std::vector<std::unique_lock<std::mutex>> locks;
     locks.reserve(_shard_mutexes.size());
 
-    for (auto& mutex : _shard_mutexes) {
-      locks.emplace_back(mutex);
+    for (auto &mutex : _shard_mutexes) {
+      locks.emplace_back(*mutex);
     }
 
     // Step 2: Clear all shards
-    for (auto& shard : _shards) {
+    for (auto &shard : _shards) {
       shard->clear();
     }
   }
 
-  ~ShardedExplicitSet() {
-    for (uint32_t i = 0; i < _shards.size(); i++) {
-      delete _shards[i];
+  // True when every shard carries the same mark (trivially so when empty).
+  inline bool allShardsHaveSameMark() {
+    for (uint32_t i = 1; i < _shards.size(); i++) {
+      std::lock_guard<std::mutex> prev_lock(*(_shard_mutexes[i - 1]));
+      std::lock_guard<std::mutex> cur_lock(*(_shard_mutexes[i])); // ascending, like clearAll
+      if (_shards[i - 1]->getMark() != _shards[i]->getMark()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  ~ShardedExplicitSet() = default;
+
+private:
+  friend class cereal::access;
+  // cereal hook: archives the shard size and the shards themselves.
+  template <typename Archive> void serialize(Archive &archive) {
+    archive(_shard_size, _shards);
+
+    if (Archive::is_loading::value) {
+      // Mutexes are not archived; give every loaded shard a fresh one.
+      _shard_mutexes.resize(_shards.size());
+      for (auto &shard_mutex : _shard_mutexes) {
+        shard_mutex = std::make_unique<std::mutex>();
     }
   }
 };
diff --git a/tools/cereal_tests.cpp b/tools/cereal_tests.cpp
index 1ac4ab3..60d9135 100644
--- a/tools/cereal_tests.cpp
+++ b/tools/cereal_tests.cpp
@@ -9,58 +9,58 @@ using flatnav::DistanceInterface;
 using flatnav::Index;
 using flatnav::SquaredL2Distance;
 
-void serializeIndex(
-    float *data,
-    std::unique_ptr<DistanceInterface<SquaredL2Distance>> &&distance, int N,
-    int M, int dim, int ef_construction, const std::string &save_file) {
-  std::unique_ptr<Index<SquaredL2Distance, int>> index =
-      std::make_unique<Index<SquaredL2Distance, int>>(
-          /* dist = */ std::move(distance), /* dataset_size = */ N,
-          /* max_edges = */ M);
+// void serializeIndex(
+//     float *data,
+//     std::unique_ptr<DistanceInterface<SquaredL2Distance>> &&distance, int N,
+//     int M, int dim, int ef_construction, const std::string &save_file) {
+//   std::unique_ptr<Index<SquaredL2Distance, int>> index =
+//       std::make_unique<Index<SquaredL2Distance, int>>(
+//           /* dist = */ std::move(distance), /* dataset_size = */ N,
+//           /* max_edges = */ M);
 
-  float *element = new float[dim];
-  for (int label = 0; label < N; label++) {
-    float *element = data + (dim * label);
-    index->add(/* data = */ (void *)element, /* label = */ label,
-               /* ef_construction = */ ef_construction);
-    if (label % 100000 == 0) {
-      std::clog << "." << std::flush;
-    }
-  }
+//   float *element = new float[dim];
+//   for (int label = 0; label < N; label++) {
+//     float *element = data + (dim * label);
+//     index->add(/* data = */ (void *)element, /* label = */ label,
+//                /* ef_construction = */ ef_construction);
+//     if (label % 100000 == 0) {
+//       std::clog << "." << std::flush;
+//     }
+//   }
 
-  std::clog << "\nSaving index to " << save_file << std::endl;
-  index->saveIndex(/* filename = */ save_file);
+//   std::clog << "\nSaving index to " << save_file << std::endl;
+//   index->saveIndex(/* filename = */ save_file);
 
-  std::clog << "Loading index " << std::endl;
+//   std::clog << "Loading index " << std::endl;
 
-  auto new_index =
-      Index<SquaredL2Distance, int>::loadIndex(/* filename = */ save_file);
+//   auto new_index =
+//       Index<SquaredL2Distance, int>::loadIndex(/* filename = */ save_file);
 
-  assert(new_index->maxEdgesPerNode() == M);
-  assert(new_index->dataSizeBytes() == distance->dataSize() + (32 * M) + 32);
-  assert(new_index->maxNodeCount() == N);
+//   assert(new_index->maxEdgesPerNode() == M);
+//   assert(new_index->dataSizeBytes() == distance->dataSize() + (32 * M) + 32);
+//   assert(new_index->maxNodeCount() == N);
 
-  uint64_t total_index_size =
-      new_index->nodeSizeBytes() * new_index->maxNodeCount();
+//   uint64_t total_index_size =
+//       new_index->nodeSizeBytes() * new_index->maxNodeCount();
 
-  for (uint64_t i = 0; i < total_index_size; i++) {
-    assert(index->indexMemory()[i] == new_index->indexMemory()[i] * 2);
-  }
-}
+//   for (uint64_t i = 0; i < total_index_size; i++) {
+//     assert(index->indexMemory()[i] == new_index->indexMemory()[i] * 2);
+//   }
+// }
 
-int main(int argc, char **argv) {
-  if (argc < 2) {
-    return -1;
-  }
+// int main(int argc, char **argv) {
+//   if (argc < 2) {
+//     return -1;
+//   }
 
-  cnpy::NpyArray datafile = cnpy::npy_load(argv[1]);
-  int M = 16;
-  int ef_construction = 100;
-  int dim = 784;
-  int N = 60000;
-  float *data = datafile.data<float>();
-  auto distance = std::make_unique<SquaredL2Distance>(dim);
-  std::string save_file = "mnist.index";
-  serializeIndex(data, std::move(distance), N, M, dim, ef_construction,
-                 save_file);
-}
\ No newline at end of file
+//   cnpy::NpyArray datafile = cnpy::npy_load(argv[1]);
+//   int M = 16;
+//   int ef_construction = 100;
+//   int dim = 784;
+//   int N = 60000;
+//   float *data = datafile.data<float>();
+//   auto distance = std::make_unique<SquaredL2Distance>(dim);
+//   std::string save_file = "mnist.index";
+//   serializeIndex(data, std::move(distance), N, M, dim, ef_construction,
+//                  save_file);
+// }
\ No newline at end of file
diff --git a/tools/construct_npy.cpp b/tools/construct_npy.cpp
index e0ecf55..435ccdf 100644
--- a/tools/construct_npy.cpp
+++ b/tools/construct_npy.cpp
@@ -9,6 +9,7 @@
 #include <fstream>
 #include <iostream>
 #include <memory>
+#include <numeric>
 #include <omp.h>
 #include <optional>
 #include <quantization/ProductQuantization.h>
@@ -30,19 +31,32 @@ void buildIndex(float *data,
                 int M, int dim, int ef_construction,
                 const std::string &save_file) {
 
+  std::cout << "Building index" << std::endl;
+
   auto index = new Index<dist_t, int>(
       /* dist = */ std::move(distance), /* dataset_size = */ N,
       /* max_edges = */ M);
 
+  std::cout << "Index initialized" << std::endl;
+
   auto start = std::chrono::high_resolution_clock::now();
 
-  for (int label = 0; label < N; label++) {
-    float *element = data + (dim * label);
-    index->add(/* data = */ (void *)element, /* label = */ label,
-               /* ef_construction */ ef_construction);
-    if (label % 10000 == 0)
-      std::clog << "." << std::flush;
-  }
+  // for (int label = 0; label < N; label++) {
+  //   float *element = data + (dim * label);
+  //   index->add(/* data = */ (void *)element, /* label = */ label,
+  //              /* ef_construction */ ef_construction);
+  //   if (label % 10000 == 0)
+  //     std::clog << "." << std::flush;
+  // }
+
+  // Invoke addParallel() to add vectors in parallel.
+  std::cout << "Creating a vector of labels" << std::endl;
+  std::vector<int> labels(N);
+  std::iota(labels.begin(), labels.end(), 0);
+  std::cout << "Adding vectors in parallel" << std::endl;
+  index->addParallel(/* data = */ (void *)data, /* labels = */ labels,
+                     /* ef_construction */ ef_construction);
+  std::cout << "Done adding vectors in parallel" << std::endl;
   std::clog << std::endl;
 
   auto stop = std::chrono::high_resolution_clock ::now();
diff --git a/tools/query_npy.cpp b/tools/query_npy.cpp
index fad19f8..29da978 100644
--- a/tools/query_npy.cpp
+++ b/tools/query_npy.cpp
@@ -50,6 +50,8 @@ void run(float *queries, int *gtruth, const std::string &index_filename,
       float *q = queries + dim * i;
       int *g = gtruth + num_gtruth * i;
 
+      std::cout << "[INFO] Query " << i << std::endl;
+
       std::vector<std::pair<float, int>> result =
           index->search(q, K, ef_search);