Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[submodule "husky"]
path = husky
url = https://github.com/husky-team/husky
[submodule "gqr"]
path = gqr
url = https://github.com/lijinf2/gqr.git
1 change: 1 addition & 0 deletions apps/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
add_subdirectory(plsh)
add_subdirectory(e2lsh)
add_subdirectory(e2lsh_ir)
add_subdirectory(hammingrank)
# add_subdirectory(srs)
# add_subdirectory(iterative-coslsh)
Expand Down
7 changes: 7 additions & 0 deletions apps/e2lsh_ir/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Build script for the e2lsh_ir application.
# Make headers under the project root and the husky directory visible to this target.
include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR}/husky)

### applications
# `losha` is the list of libraries linked into the app: husky, losha-lib,
# and whatever ${HUSKY_EXTERNAL_LIB} expands to.
set(losha husky losha-lib ${HUSKY_EXTERNAL_LIB})
# Single-source executable built from e2lsh_ir.cpp.
add_executable(e2lsh_ir e2lsh_ir.cpp )
target_link_libraries(e2lsh_ir ${losha})
63 changes: 63 additions & 0 deletions apps/e2lsh_ir/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Play with E2LSH using small datasets

Two small datasets are provided to help you get familiar with the framework.

One dataset is in .txt format and the other is in binary format. The items and queries are the same in both formats.

You can set `itemPath` and `queryPath` in `losha/conf/e2lsh.conf`. By default, they point to `losha/apps/e2lsh/data/item_ss.txt` and `losha/apps/e2lsh/data/query_ss.txt`.


## TXT format (default input setting in e2lsh.cpp)

Item file: `losha/apps/e2lsh/data/item_ss.txt`

Query file: `losha/apps/e2lsh/data/query_ss.txt`


### Using this dataset:

Change `losha/conf/e2lsh.conf`, set the `itemPath` and `queryPath` correctly. Set `outputPath` correctly (which should be on HDFS).

Change in `./e2lsh.cpp`:

- Using `#include "small.hpp"`

- Comment out `#include "small_binary.hpp"`

- Using `auto& lineInputFormat ...`

- Comment out `auto& binaryInputFormat ...`

- Using `loshaengine ... setItemSMALL, lineInputFormat);`

- Comment out `loshaengine ... setItemSmallBinary, binaryInputFormat);`

In `losha/build/` directory, run `make e2lsh`.


## Binary format

Item file: `losha/apps/e2lsh/data/item_ss.bin`

Query file: `losha/apps/e2lsh/data/query_ss.bin`


### Using this dataset:

Change `losha/conf/e2lsh.conf`, set the `itemPath` and `queryPath` correctly. Set `outputPath` correctly (which should be on HDFS).

Change in `./e2lsh.cpp`:

- Comment out `#include "small.hpp"`

- Using `#include "small_binary.hpp"`

- Comment out `auto& lineInputFormat ...`

- Using `auto& binaryInputFormat ...`

- Comment out `loshaengine ... setItemSMALL, lineInputFormat);`

- Using `loshaengine ... setItemSmallBinary, binaryInputFormat);`

In `losha/build/` directory, run `make e2lsh`.
2 changes: 2 additions & 0 deletions apps/e2lsh_ir/data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
wb_item
wb_query
15 changes: 15 additions & 0 deletions apps/e2lsh_ir/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Notices

`wb_item.cpp` and `wb_query.cpp` generate the binary item file and the binary query file, respectively (written as `item_ssb.bin` and `query_ssb.bin`).

The data in the .bin and .txt files are the same, except that the .bin files do not contain any spaces or `\n` characters.

## In .txt

In each line (DV: value in that dimension):

ID DV DV DV DV DV

## In .bin

Each chunk contains 6 integers: the first is the ID, followed by the 5 DVs.
Binary file added apps/e2lsh_ir/data/item_ss.bin
Binary file not shown.
7 changes: 7 additions & 0 deletions apps/e2lsh_ir/data/item_ss.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
2 0 0 0 0 0
3 0 0 0 0 1
4 0 0 0 1 0
5 0 0 1 0 0
6 0 1 0 0 0
7 1 0 0 0 0
8 1 1 1 1 1
Binary file added apps/e2lsh_ir/data/query_ss.bin
Binary file not shown.
2 changes: 2 additions & 0 deletions apps/e2lsh_ir/data/query_ss.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
0 0 0 0 0 0
1 0 0 0 0 1
32 changes: 32 additions & 0 deletions apps/e2lsh_ir/data/wb_item.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#include <fstream>
#include <iostream>

/**
 * Generates the small binary item dataset.
 *
 * Writes seven records (IDs 2..8) to "item_ssb.bin" in the current directory
 * and echoes each record to stdout as text. Every record is six raw `int`
 * values in host byte order: the ID followed by five dimension values.
 *
 * The values mirror item_ss.txt:
 *   - ID 2 is the zero vector,
 *   - IDs 3..7 each have a single 1, at dimension index (7 - ID),
 *   - ID 8 is the all-ones vector.
 *
 * @return 0 on success, 1 if the output file cannot be opened or written.
 */
int main() {
    std::ofstream outfile("item_ssb.bin", std::ofstream::binary);
    if (!outfile) {
        std::cerr << "Failed to open item_ssb.bin for writing\n";
        return 1;
    }

    const int zero = 0;
    const int one = 1;
    const int dimension = 5;

    for (int id = 2; id < 9; ++id) {
        outfile.write(reinterpret_cast<const char*>(&id), sizeof(id));
        std::cout << id << " ";

        for (int j = 0; j < dimension; ++j) {
            // item_ss.txt places the single 1 of item 3 in the last dimension
            // and moves it one position to the left for each following ID,
            // i.e. at j == 7 - id. Item 8 has a 1 in every dimension.
            const bool isSet = (id == 8) || (id + j == 7);
            const int& value = isSet ? one : zero;

            outfile.write(reinterpret_cast<const char*>(&value), sizeof(value));
            std::cout << value << " ";
        }
        std::cout << "\n";
    }

    // Surface write errors (e.g. disk full) instead of silently truncating.
    outfile.flush();
    if (!outfile) {
        std::cerr << "Failed to write item_ssb.bin\n";
        return 1;
    }
    return 0;
}
29 changes: 29 additions & 0 deletions apps/e2lsh_ir/data/wb_query.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#include <fstream>
#include <iostream>

/**
 * Generates the small binary query dataset.
 *
 * Writes two records (IDs 0 and 1) to "query_ssb.bin" in the current
 * directory and echoes each record to stdout as text. Every record is six raw
 * `int` values in host byte order: the ID followed by five dimension values.
 *
 * The values mirror query_ss.txt:
 *   - query 0 is the zero vector,
 *   - query 1 has a single 1 in the last dimension.
 *
 * @return 0 on success, 1 if the output file cannot be opened or written.
 */
int main() {
    std::ofstream outfile("query_ssb.bin", std::ofstream::binary);
    if (!outfile) {
        std::cerr << "Failed to open query_ssb.bin for writing\n";
        return 1;
    }

    const int zero = 0;
    const int one = 1;
    const int dimension = 5;

    for (int id = 0; id < 2; ++id) {
        outfile.write(reinterpret_cast<const char*>(&id), sizeof(id));
        std::cout << id << " ";

        for (int j = 0; j < dimension; ++j) {
            // query_ss.txt: "1 0 0 0 0 1" -> only the last dimension of
            // query 1 is set; everything else is zero.
            const bool isSet = (id == 1) && (j == dimension - 1);
            const int& value = isSet ? one : zero;

            outfile.write(reinterpret_cast<const char*>(&value), sizeof(value));
            std::cout << value << " ";
        }
        std::cout << "\n";
    }

    // Surface write errors (e.g. disk full) instead of silently truncating.
    outfile.flush();
    if (!outfile) {
        std::cerr << "Failed to write query_ssb.bin\n";
        return 1;
    }
    return 0;
}
96 changes: 96 additions & 0 deletions apps/e2lsh_ir/e2lsh_ir.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/*
* Copyright 2016 Husky Team
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cmath>
#include <limits>
#include <vector>

#include "core/engine.hpp"
#include "io/input/inputformat_store.hpp"

#include "lshcore/lshengine.hpp"
#include "lshcore/e2lshfactory.hpp"

#include "sift_ir.hpp"

// Pull the husky::losha names used below (E2LSHFactory, loshaengine, ...) into scope.
using namespace husky::losha;
// File-scope factory handed to loshaengine by lsh(). ItemIdType and
// ItemElementType are not declared in this file — presumably they come from
// sift_ir.hpp (TODO confirm).
E2LSHFactory<ItemIdType, ItemElementType> factory;
// Ensures factory.initialize() in lsh() runs only once, however many times lsh() is invoked.
std::once_flag factory_flag;

/**
 * Job body passed to husky::run_job.
 *
 * Steps:
 *   1. Reads the "band", "row", "dimension" and "W" parameters from the husky
 *      context and initializes the file-scope factory exactly once.
 *   2. Runs loshaengine over a chunked binary input (chunk size
 *      BytesPerVector) using setItemSIFT1B as the item parser.
 *   3. Logs the duration of both phases; only the caller whose global thread
 *      id is 0 writes log lines.
 */
void lsh() {
    using Clock = std::chrono::steady_clock;
    using Millis = std::chrono::duration<double, std::milli>;

    // True only for the caller that is responsible for logging.
    auto isReporter = []() { return husky::Context::get_global_tid() == 0; };

    const auto jobStart = Clock::now();

    // ---- initialization ----
    int bandCount = std::stoi(husky::Context::get_param("band"));
    int rowCount = std::stoi(husky::Context::get_param("row"));
    int dim = std::stoi(husky::Context::get_param("dimension"));
    int wParam = std::stoi(husky::Context::get_param("W"));
    std::call_once(factory_flag, [&]() {
        factory.initialize(bandCount, rowCount, dim, wParam);
    });

    const auto initDone = Clock::now();
    Millis initElapsed = initDone - jobStart;
    if (isReporter()) {
        husky::LOG_I << "Job init finishes in "
                     << std::to_string(initElapsed.count() / 1000.0)
                     << " seconds" << std::endl;
    }

    // Alternative inputs, kept for reference (see the README):
    //
    // Small Dataset (txt file)
    // auto& lineInputFormat = husky::io::InputFormatStore::create_line_inputformat();
    // loshaengine<Query1B, Bucket1B, Item1B, QueryMsg, AnswerMsg>(
    //     factory, setItemSMALL, lineInputFormat);
    //
    // Small Dataset (binary file)
    // auto& binaryInputFormat =
    //     husky::io::InputFormatStore::create_chunk_inputformat(BytesPerVector);
    // loshaengine<Query1B, Bucket1B, Item1B, QueryMsg, AnswerMsg>(
    //     factory, setItemSmallBinary, binaryInputFormat);

    // ---- IR: query phase over fixed-size binary chunks ----
    auto& binaryInputFormat =
        husky::io::InputFormatStore::create_chunk_inputformat(BytesPerVector);
    loshaengine<Query1B, Bucket1B, Item1B, QueryMsg, AnswerMsg>(
        factory, setItemSIFT1B, binaryInputFormat);

    const auto queryDone = Clock::now();
    Millis queryElapsed = queryDone - initDone;
    if (isReporter()) {
        husky::LOG_I << "Job query finishes in "
                     << std::to_string( queryElapsed.count() / 1000.0)
                     << " seconds" << std::endl;
        husky::LOG_I << "E2LSH finish" << std::endl;
    }
}

/**
 * Program entry point.
 *
 * Declares the configuration keys passed to husky::init_with_args and, when
 * initialization succeeds, runs lsh() as a husky job.
 *
 * @return 0 if the job was run, 1 if husky::init_with_args reported failure.
 */
int main(int argc, char ** argv) {
    husky::LOG_I << "E2LSH program starts" << std::endl;

    // Parameter names handed to husky::init_with_args.
    std::vector<std::string> args{
        "hdfs_namenode",
        "hdfs_namenode_port",
        "band",
        "row",
        "dimension",
        "queryPath",   // the inputQueryPath
        "itemPath",    // the inputItemPath
        "outputPath",  // the outputPath
        "W",
    };

    if (!husky::init_with_args(argc, argv, args)) {
        return 1;
    }

    husky::run_job(lsh);
    return 0;
}

107 changes: 107 additions & 0 deletions apps/e2lsh_ir/e2lsh_ir.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Copyright 2016 Husky Team
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once
#include <set>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

#include "core/engine.hpp"
#include "io/hdfs_manager.hpp"

#include "lshcore/lshbucket.hpp"
#include "lshcore/lshquery.hpp"
#include "lshcore/lshitem.hpp"
using namespace husky::losha;

/**
 * Query object for E2LSH.
 *
 * Its query() step stores the query's own id as the outgoing message and
 * sends it to every bucket the factory computes for the query vector.
 */
template<
    typename ItemIdType,
    typename ItemElementType,
    typename QueryMsg,
    typename AnswerMsg
>
class E2LSHQuery : public LSHQuery<ItemIdType, ItemElementType, QueryMsg, AnswerMsg> {
public:
    // The base must be spelled with its full template argument list here.
    explicit E2LSHQuery(const typename E2LSHQuery::KeyT& id)
        : LSHQuery<ItemIdType, ItemElementType, QueryMsg, AnswerMsg>(id) {}

    /**
     * @param fty    factory used to compute the buckets of this query.
     * @param inMsg  incoming answer messages; not used by this implementation.
     */
    void query(
        LSHFactory<ItemIdType, ItemElementType>& fty,
        const vector<AnswerMsg>& inMsg) override {
        // The message carried to the buckets is simply this query's id.
        this->queryMsg = this->getItemId();

        auto&& targetBuckets = fty.calItemBuckets(this->getQuery());
        for (auto& bucketId : targetBuckets) {
            this->sendToBucket(bucketId);
        }
    }
};

/**
 * Item object for E2LSH.
 *
 * Its answer() step evaluates each distinct incoming query id once, computes
 * the distance between that query's vector and this item's vector, and emits
 * a "<queryId> <itemId> <distance>\n" line for every distance below 50.
 */
template<
    typename ItemIdType,
    typename ItemElementType,
    typename QueryMsg,
    typename AnswerMsg
>
class E2LSHItem : public LSHItem<ItemIdType, ItemElementType, QueryMsg, AnswerMsg> {
public:
    // The base must be spelled with its full template argument list here.
    explicit E2LSHItem(const typename E2LSHItem::KeyT& id)
        : LSHItem<ItemIdType, ItemElementType, QueryMsg, AnswerMsg>(id) {}

    /**
     * @param factory  supplies the query vectors and the distance function.
     * @param inMsgs   query ids received by this item; may contain duplicates.
     *
     * Output goes to the log when the "outputPath" parameter equals
     * "localhost", otherwise it is written through husky::io::HDFS::Write.
     */
    virtual void answer(
        LSHFactory<ItemIdType, ItemElementType>& factory,
        const vector<QueryMsg>& inMsgs) {
        std::unordered_set<QueryMsg> seenQueries;

        for (auto& queryId : inMsgs) {
            // insert() reports whether the id is new; skip ids already handled.
            if (!seenQueries.insert(queryId).second) {
                continue;
            }

            // get broadcasted value
            auto& queryVector = factory.getQueryVector(queryId);
            float distance = factory.calDist(queryVector, this->getItemVector());

            // Only report results with distance below 50.
            if (!(distance < 50)) {
                continue;
            }

            std::string result =
                std::to_string(queryId) + " " +
                std::to_string(this->getItemId()) + " " +
                std::to_string(distance) + "\n";

            if (husky::Context::get_param("outputPath") == "localhost") {
                husky::LOG_I << "OUTPUT:" << result;
            } else {
                husky::io::HDFS::Write(
                    husky::Context::get_param("hdfs_namenode"),
                    husky::Context::get_param("hdfs_namenode_port"),
                    result,
                    husky::Context::get_param("outputPath"),
                    husky::Context::get_global_tid());
            }
        }
    }
};

/**
 * Bucket object for E2LSH.
 *
 * Adds nothing to LSHBucket beyond forwarding the bucket id to the base
 * constructor.
 */
template<
    typename ItemIdType,
    typename ItemElementType,
    typename QueryMsg,
    typename AnswerMsg
>
class E2LSHBucket
    : public LSHBucket<ItemIdType, ItemElementType, QueryMsg, AnswerMsg> {
public:
    // The base must be spelled with its full template argument list here.
    explicit E2LSHBucket(const typename E2LSHBucket::KeyT& bId)
        : LSHBucket<ItemIdType, ItemElementType, QueryMsg, AnswerMsg>(bId) {}
};
Loading