From f57fddef45281f1ee3101ef0a98de25690a25c75 Mon Sep 17 00:00:00 2001
From: Lars Vilhuber
Date: Wed, 20 Feb 2019 17:12:38 -0500
Subject: [PATCH] Clarified the output file from the C compiler. Added a setup
 script (should be requirements.txt).

---
 README.md             | 10 +++--
 config_restaurant.txt | 92 +++++++++++++++++++++----------------------
 run_script.sh         | 10 ++---
 setup.sh              | 18 +++++++++
 4 files changed, 76 insertions(+), 54 deletions(-)
 create mode 100644 setup.sh

diff --git a/README.md b/README.md
index 652553f..f6aefe7 100644
--- a/README.md
+++ b/README.md
@@ -19,10 +19,14 @@ This package is written in C++ and Python. We require at least g++ version 5 and
 
 3. Prerequisites
 
+Software:
+
++ C++ compiler
++ Python 2.7
+
 
 The following packages are needed in Python for the code to run:
 ```
-C++, Python 2, ngram, sklearn, numpy, scipy, matlib
+ngram, sklearn, numpy, scipy, matlib
 ```
 
 Remark: In order to install using pip, one will need to run the following commands if errors arise from the terminal due to recent changes with SSH in pip (Linux and MacOS)
@@ -36,7 +40,7 @@ pip2 install numpy scipy matplotlib
 ```
 
 cd C++Codes
-g++ -std=c++11 *.cpp -fopenmp (on Windows and Linux)
+g++ -o minhash -std=c++11 *.cpp -fopenmp (on Windows and Linux)
 g++ *.cpp -fopenmp (on MacOS)
 ```
 
@@ -63,7 +67,7 @@ Use the C++ Package folder in this repository. This is a fast minhash package wh
 1. Update the Config file for minhash and run the program (Remember to change the outputfile name option to Restaurant_pair.csv or the particular name of your data set.) The second and third arguments are K and L respectively.
 
 ```
-./a.out Config.txt 1 10
+./C++Codes/minhash config_restaurant.txt 1 10
 ```
 
 The output is `Restaurant_pair.csv` where the output is candidate record pairs:
diff --git a/config_restaurant.txt b/config_restaurant.txt
index 283d919..319539f 100755
--- a/config_restaurant.txt
+++ b/config_restaurant.txt
@@ -1,47 +1,47 @@
 ##############################################################
-# Config file for Restaurant dataset
-# AnyLine containing `#' with be treated as comment.
-# The Typical Format is Variablename = Value
-###############################################################
-
-# Choose K and L Wisely, try with higher values of K and go down. Typically in range 1-5 for small pruning,
-#for rigorous pruning use larger values of K
-
-K=1
-
-# More L increases the recall but also reports more pair. Less sensitive than K
-
-L=4
-
-# ngrams length
-
-shingles=2
-
-# Thresholds, only reports if found in at least this many buckets (cancels random noise). If you are missing pairs #decrease this
-Thresh=3
-
-
-#Give the input CSV file. First line will be ignored (assumed to be header). Every line will be treated as a #record.
-#The line number of record will be its ID. That is the fist line after header is treated as record with ID 1 etc.
-
-Input=data/restaurant.csv
-#Output File: this will contain a pair of record IDs in each line indicating a possible match.
-
-Output=restaurant_pair.csv
-##############################################################################
-#These are advanced parameters depending on memory
-##############################################################################
-# No of Cells in each bucket. Decrease if goes out of memory.
-
-BucketSize=32
-
-# No of buckets in each tables is 2^{this number}. Too small will never finish. Decrease if goes out of memoryy. #Larger is better. Must be < 27
-
-RangePow=20
-
-# Increase if MinHashing Takes a lot of Time. Must be power of 2.
-
-MinHashChunkSize=32
-
-# Processes these many records in parallel, larger is faster. Decrease if goes out of memory
-Chunk=500000
+# Config file for Restaurant dataset
+# Any line containing `#' will be treated as a comment.
+# The Typical Format is Variablename = Value
+###############################################################
+
+# Choose K and L wisely; try higher values of K and go down. Typically in range 1-5 for light pruning,
+#for rigorous pruning use larger values of K
+
+K=1
+
+# More L increases the recall but also reports more pairs. Less sensitive than K
+
+L=4
+
+# ngrams length
+
+shingles=2
+
+# Thresholds, only reports if found in at least this many buckets (cancels random noise). If you are missing pairs #decrease this
+Thresh=3
+
+
+#Give the input CSV file. First line will be ignored (assumed to be header). Every line will be treated as a #record.
+#The line number of a record will be its ID. That is, the first line after the header is treated as the record with ID 1, etc.
+
+Input=data/Restaurant.csv
+#Output File: this will contain a pair of record IDs in each line indicating a possible match.
+
+Output=Restaurant_pair.csv
+##############################################################################
+#These are advanced parameters depending on memory
+##############################################################################
+# No of Cells in each bucket. Decrease if it goes out of memory.
+
+BucketSize=32
+
+# No of buckets in each table is 2^{this number}. Too small will never finish. Decrease if it goes out of memory. #Larger is better. Must be < 27
+
+RangePow=20
+
+# Increase if MinHashing takes a lot of time. Must be a power of 2.
+
+MinHashChunkSize=32
+
+# Processes this many records in parallel; larger is faster. Decrease if it goes out of memory
+Chunk=500000
diff --git a/run_script.sh b/run_script.sh
index 962ecf2..1a39c2e 100644
--- a/run_script.sh
+++ b/run_script.sh
@@ -3,12 +3,12 @@
 
 #!/bin/bash
 
-g++-7 -std=c++11 C++Codes/*.cpp -o output -fopenmp
+g++-7 -std=c++11 C++Codes/*.cpp -o minhash -fopenmp
 
 For Restaurant
 for ((i=6;i<=25;i+=6)) ;
 do for ((j=1;j<=10; j++));
-do ./output config_restaurant.txt 1 $i; python pipeline.py --flag 0 --id $i --trainsize 0.3 --input restaurant_pair.csv --goldstan data/restaurant.csv --output log-restaurant ;
+do ./minhash config_restaurant.txt 1 $i; python pipeline.py --flag 0 --id $i --trainsize 0.3 --input restaurant_pair.csv --goldstan data/restaurant.csv --output log-restaurant ;
 done
 done
 
@@ -17,7 +17,7 @@ g++-7 -std=c++11 C++Codes/*.cpp -o output -fopenmp
 
 #For CD
 # for ((i=6;i<=20;i+=4)) ;
 # do for ((j=1;j<=3; j++));
-# do ./output config_cd.txt 1 $i; python pipeline.py --flag 0 --id $i --trainsize 0.5 --input cd_pair.csv --goldstan data/cd.csv --delimiter ';' --output log-cd ;
+# do ./minhash config_cd.txt 1 $i; python pipeline.py --flag 0 --id $i --trainsize 0.5 --input cd_pair.csv --goldstan data/cd.csv --delimiter ';' --output log-cd ;
 # done
 # done
@@ -26,7 +26,7 @@ g++-7 -std=c++11 C++Codes/*.cpp -o output -fopenmp
 
 #For Voter
 # for ((i=25;i<=40;i+=5)) ;
 # do for ((j=1;j<=10; j++));
-# do ./output config_voter.txt 4 $i; python pipeline.py --flag 0 --id $i --trainsize 0.1 --input voter_pair.csv --goldstan data/voter.csv --delimiter ',' --c 0.0001 --output log-voter ;
+# do ./minhash config_voter.txt 4 $i; python pipeline.py --flag 0 --id $i --trainsize 0.1 --input voter_pair.csv --goldstan data/voter.csv --delimiter ',' --c 0.0001 --output log-voter ;
 # done
 # done
@@ -36,7 +36,7 @@ g++-7 -std=c++11 C++Codes/*.cpp -o output -fopenmp
 
 # python preprocess.py
 #for ((i=1;i<=10;i++)) ;
-# do ./output config_syria.txt 15 10; python pipeline_for_syria.py --input syria_pair.csv --output log-syria --rawdata data/syria.csv --goldstandpair data/syria_train.csv;
+# do ./minhash config_syria.txt 15 10; python pipeline_for_syria.py --input syria_pair.csv --output log-syria --rawdata data/syria.csv --goldstandpair data/syria_train.csv;
 #done
 
 #python count.py --input log-syria
diff --git a/setup.sh b/setup.sh
new file mode 100644
index 0000000..a806e71
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,18 @@
+# Setup script
+# Assumes presence of Anaconda
+
+# Create an environment
+conda create --name LSH python=2.7
+source activate LSH
+
+# Install packages from Anaconda
+conda install numpy
+conda install scipy
+
+# Install packages using pip
+pip install --pre subprocess32
+pip install ngram
+pip install sklearn
+pip install matlib
+
+# This fails due to a dependency failure: matlib.h
\ No newline at end of file
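
A note on the setup.sh added above: as the commit subject says, the same environment would ideally be captured in a requirements.txt. A minimal sketch of that variant follows; it is illustrative only and not part of the patch. The package names are copied from setup.sh (including matlib, whose pip install setup.sh already flags as failing), and the file name requirements.txt is an assumption.

```bash
# Hypothetical requirements.txt spelling of the pip section of setup.sh
# (illustrative only; nothing below is introduced by the patch itself).
cat > requirements.txt <<'EOF'
subprocess32
ngram
sklearn
matlib
EOF

# Environment creation and the conda-installed packages stay as in setup.sh.
conda create --name LSH python=2.7
source activate LSH
conda install numpy
conda install scipy

# --pre mirrors the "pip install --pre subprocess32" line in setup.sh;
# the matlib install may still fail (setup.sh reports a missing matlib.h).
pip install --pre -r requirements.txt
```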
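Read together, the changes above make the compile and run steps consistent: the C++ sources now build to a binary named minhash, which takes a config file plus K and L as its arguments and writes candidate pairs that pipeline.py then scores. A condensed sketch of one such pass (K=1, L=10 on the restaurant data), assuming it is run from the repository root exactly as run_script.sh does:

```bash
#!/bin/bash
# One pass of the workflow from the patched README / run_script.sh
# (a single K/L setting instead of the full loops; paths as in the patch).

# Compile the C++ sources into a binary named minhash (previously a.out / output).
g++ -std=c++11 C++Codes/*.cpp -o minhash -fopenmp

# Run minhash on the restaurant config; the second and third arguments are K and L.
./minhash config_restaurant.txt 1 10

# Score the candidate pairs written to restaurant_pair.csv against the gold standard.
python pipeline.py --flag 0 --id 10 --trainsize 0.3 \
  --input restaurant_pair.csv --goldstan data/restaurant.csv \
  --output log-restaurant
```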