diff --git a/LargeVis_run.py b/LargeVis_run.py old mode 100644 new mode 100755 index 4029e09..cbdb05d --- a/LargeVis_run.py +++ b/LargeVis_run.py @@ -1,28 +1,30 @@ -import LargeVis -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument('-fea', default = 1, type = int, help = 'whether to visualize high-dimensional feature vectors or networks') -parser.add_argument('-input', default = '', help = 'input file') -parser.add_argument('-output', default = '', help = 'output file') -parser.add_argument('-outdim', default = -1, type = int, help = 'output dimensionality') -parser.add_argument('-threads', default = -1, type = int, help = 'number of training threads') -parser.add_argument('-samples', default = -1, type = int, help = 'number of training mini-batches') -parser.add_argument('-prop', default = -1, type = int, help = 'number of propagations') -parser.add_argument('-alpha', default = -1, type = float, help = 'learning rate') -parser.add_argument('-trees', default = -1, type = int, help = 'number of rp-trees') -parser.add_argument('-neg', default = -1, type = int, help = 'number of negative samples') -parser.add_argument('-neigh', default = -1, type = int, help = 'number of neighbors in the NN-graph') -parser.add_argument('-gamma', default = -1, type = float, help = 'weight assigned to negative edges') -parser.add_argument('-perp', default = -1, type = float, help = 'perplexity for the NN-grapn') - -args = parser.parse_args() - -if args.fea == 1: - LargeVis.loadfile(args.input) -else: - LargeVis.loadgraph(args.input) - -Y = LargeVis.run(args.outdim, args.threads, args.samples, args.prop, args.alpha, args.trees, args.neg, args.neigh, args.gamma, args.perp) - -LargeVis.save(args.output) +#!/usr/bin/env python + +import LargeVis +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('--fea', default=1, type=int, help='whether to visualize high-dimensional feature vectors or networks') +parser.add_argument('--input', default='', 
help='input file', required=True) +parser.add_argument('--output', default='', help='output file', required=True) +parser.add_argument('--outdim', default=-1, type=int, help='output dimensionality') +parser.add_argument('--threads', default=-1, type=int, help='number of training threads') +parser.add_argument('--samples', default=-1, type=int, help='number of training mini-batches') +parser.add_argument('--prop', default=-1, type=int, help='number of propagations') +parser.add_argument('--alpha', default=-1, type=float, help='learning rate') +parser.add_argument('--trees', default=-1, type=int, help='number of rp-trees') +parser.add_argument('--neg', default=-1, type=int, help='number of negative samples') +parser.add_argument('--neigh', default=-1, type=int, help='number of neighbors in the NN-graph') +parser.add_argument('--gamma', default=-1, type=float, help='weight assigned to negative edges') +parser.add_argument('--perp', default=-1, type=float, help='perplexity for the NN-grapn') + +args = parser.parse_args() + +if args.fea == 1: + LargeVis.loadfile(args.input) +else: + LargeVis.loadgraph(args.input) + +Y = LargeVis.run(args.outdim, args.threads, args.samples, args.prop, args.alpha, args.trees, args.neg, args.neigh, args.gamma, args.perp) + +LargeVis.save(args.output) diff --git a/Linux/LargeVis.cpp b/Linux/LargeVis.cpp index 022e78e..9a47818 100644 --- a/Linux/LargeVis.cpp +++ b/Linux/LargeVis.cpp @@ -8,7 +8,7 @@ LargeVis::LargeVis() knn_vec = old_knn_vec = NULL; annoy_index = NULL; head = alias = NULL; - neg_table = NULL; + neg_table = NULL; } const gsl_rng_type *LargeVis::gsl_T = NULL; @@ -26,8 +26,8 @@ void LargeVis::clean_model() vis = prob = NULL; knn_vec = old_knn_vec = NULL; annoy_index = NULL; - neg_table = NULL; - alias = NULL; + neg_table = NULL; + alias = NULL; edge_count_actual = 0; neg_size = 1e8; @@ -56,14 +56,23 @@ void LargeVis::load_from_file(char *infile) printf("\nFile not found!\n"); return; } - printf("Reading input file %s ......", 
infile); fflush(stdout); - fscanf(fin, "%lld%lld", &n_vertices, &n_dim); + printf("Reading input file %s ......", infile); fflush(stdout); + if (fscanf(fin, "%lld%lld", &n_vertices, &n_dim) != 2) { + printf("Could not read dimensions\n"); + fclose(fin); + exit(1); + } vec = new real[n_vertices * n_dim]; for (long long i = 0; i < n_vertices; ++i) { for (long long j = 0; j < n_dim; ++j) { - fscanf(fin, "%f", &vec[i * n_dim + j]); + if (fscanf(fin, "%f", &vec[i * n_dim + j]) != 1) + { + fclose(fin); + printf("Could not read line %lld\n", i + 1); + exit(1); + } } } fclose(fin); @@ -80,7 +89,17 @@ void LargeVis::load_from_data(real *data, long long n_vert, long long n_di) printf("Total vertices : %lld\tDimension : %lld\n", n_vertices, n_dim); } -void LargeVis::load_from_graph(char *infile) +bool load_edge_from_graph(FILE *fin, char *w1, char *w2, real *weight, bool use_default_weight) { + if (use_default_weight) + { + (*weight) = 1; + return fscanf(fin, "%s%s", w1, w2) == 2; + } + else + return fscanf(fin, "%s%s%f", w1, w2, weight) == 3; +} + +void LargeVis::load_from_graph(char *infile, bool use_default_weight) { clean_data(); char *w1 = new char[1000]; @@ -96,7 +115,7 @@ void LargeVis::load_from_graph(char *infile) return; } printf("Reading input file %s ......%c", infile, 13); - while (fscanf(fin, "%s%s%f", w1, w2, &weight) == 3) + while (load_edge_from_graph(fin, w1, w2, &weight, use_default_weight)) { if (!dict.count(w1)) { dict[w1] = n_vertices++; names.push_back(w1); } if (!dict.count(w2)) { dict[w2] = n_vertices++; names.push_back(w2); } @@ -162,7 +181,7 @@ long long LargeVis::get_out_dim() void LargeVis::normalize() { - printf("Normalizing ......"); fflush(stdout); + printf("Normalizing ......"); fflush(stdout); real *mean = new real[n_dim]; for (long long i = 0; i < n_dim; ++i) mean[i] = 0; for (long long i = 0, ll = 0; i < n_vertices; ++i, ll += n_dim) @@ -281,7 +300,7 @@ void *LargeVis::annoy_thread_caller(void *arg) void LargeVis::run_annoy() { - 
printf("Running ANNOY ......"); fflush(stdout); + printf("Running ANNOY ......"); fflush(stdout); annoy_index = new AnnoyIndex(n_dim); for (long long i = 0; i < n_vertices; ++i) annoy_index->add_item(i, &vec[i * n_dim]); @@ -293,7 +312,7 @@ void LargeVis::run_annoy() for (int j = 0; j < n_threads; ++j) pthread_create(&pt[j], NULL, LargeVis::annoy_thread_caller, new arg_struct(this, j)); for (int j = 0; j < n_threads; ++j) pthread_join(pt[j], NULL); delete[] pt; - delete annoy_index; annoy_index = NULL; + delete annoy_index; annoy_index = NULL; printf(" Done.\n"); } @@ -348,7 +367,7 @@ void LargeVis::run_propagation() { for (int i = 0; i < n_propagations; ++i) { - printf("Running propagation %d/%d%c", i + 1, n_propagations, 13); + printf("Running propagation %d/%lld%c", i + 1, n_propagations, 13); fflush(stdout); old_knn_vec = knn_vec; knn_vec = new std::vector[n_vertices]; @@ -375,7 +394,7 @@ void LargeVis::compute_similarity_thread(int id) for (iter = 0; iter < 200; ++iter) { H = 0; - sum_weight = FLT_MIN; + sum_weight = FLT_MIN; for (p = head[x]; p >= 0; p = next[p]) { sum_weight += tmp = exp(-beta * edge_weight[p]); @@ -392,8 +411,8 @@ void LargeVis::compute_similarity_thread(int id) hi_beta = beta; if (lo_beta < 0) beta /= 2; else beta = (lo_beta + beta) / 2; } - if(beta > FLT_MAX) beta = FLT_MAX; - } + if(beta > FLT_MAX) beta = FLT_MAX; + } for (p = head[x], sum_weight = FLT_MIN; p >= 0; p = next[p]) { sum_weight += edge_weight[p] = exp(-beta * edge_weight[p]); @@ -440,7 +459,7 @@ void *LargeVis::search_reverse_thread_caller(void *arg) void LargeVis::compute_similarity() { - printf("Computing similarities ......"); fflush(stdout); + printf("Computing similarities ......"); fflush(stdout); n_edge = 0; head = new long long[n_vertices]; long long i, x, y, p, q; @@ -458,8 +477,8 @@ void LargeVis::compute_similarity() head[x] = n_edge++; } } - delete[] vec; vec = NULL; - delete[] knn_vec; knn_vec = NULL; + delete[] vec; vec = NULL; + delete[] knn_vec; knn_vec = 
NULL; pthread_t *pt = new pthread_t[n_threads]; for (int j = 0; j < n_threads; ++j) pthread_create(&pt[j], NULL, LargeVis::compute_similarity_thread_caller, new arg_struct(this, j)); for (int j = 0; j < n_threads; ++j) pthread_join(pt[j], NULL); @@ -515,7 +534,7 @@ void LargeVis::test_accuracy() ++hit_case; } } - delete heap; + delete heap; printf("Test knn accuracy : %.2f%%\n", hit_case * 100.0 / (test_case * n_neighbors)); } @@ -542,7 +561,7 @@ void LargeVis::init_neg_table() { long long x, p, i; neg_size = 1e8; - reverse.clear(); vector (reverse).swap(reverse); + reverse.clear(); vector (reverse).swap(reverse); real sum_weights = 0, dd, *weights = new real[n_vertices]; for (i = 0; i < n_vertices; ++i) weights[i] = 0; for (x = 0; x < n_vertices; ++x) @@ -553,8 +572,8 @@ void LargeVis::init_neg_table() } sum_weights += weights[x] = pow(weights[x], 0.75); } - next.clear(); vector (next).swap(next); - delete[] head; head = NULL; + next.clear(); vector (next).swap(next); + delete[] head; head = NULL; neg_table = new int[neg_size]; dd = weights[0]; for (i = x = 0; i < neg_size; ++i) diff --git a/Linux/LargeVis.h b/Linux/LargeVis.h index 6914ac5..20c2ba9 100644 --- a/Linux/LargeVis.h +++ b/Linux/LargeVis.h @@ -67,7 +67,7 @@ class LargeVis{ public: LargeVis(); void load_from_file(char *infile); - void load_from_graph(char *infile); + void load_from_graph(char *infile, bool use_default_weight = false); void load_from_data(real *data, long long n_vert, long long n_di); void save(char *outfile); void run(long long out_d = -1, long long n_thre = -1, long long n_samp = -1, long long n_prop = -1, real alph = -1, long long n_tree = -1, long long n_nega = -1, long long n_neig = -1, real gamm = -1, real perp = -1); diff --git a/Linux/LargeVismodule.cpp b/Linux/LargeVismodule.cpp index bbc50d9..23ccf16 100644 --- a/Linux/LargeVismodule.cpp +++ b/Linux/LargeVismodule.cpp @@ -1,5 +1,19 @@ #include "Python.h" #include "LargeVis.h" +#include "numpy/arrayobject.h" + +struct 
module_state { + PyObject *error; +}; + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) +#else +#define GETSTATE(m) (&_state) +static struct module_state _state; +#endif + real *out_vec; LargeVis model; @@ -94,7 +108,11 @@ static PyObject *LoadFromList(PyObject *self, PyObject *args) } for (long long j = 0; j < n_dim; ++j) { +#ifdef IS_PY3K + real x = atof(PyBytes_AS_STRING(PyObject_Bytes(PyList_GetItem(vec, j)))); +#else real x = atof(PyString_AsString(PyObject_Str(PyList_GetItem(vec, j)))); +#endif data[ll + j] = x; } } @@ -103,6 +121,51 @@ static PyObject *LoadFromList(PyObject *self, PyObject *args) return Py_None; } +static PyObject *LoadFromArray(PyObject *self, PyObject *args) +{ + PyArrayObject *input; + long long n_vertices; + long long n_dim; + + //printf("Starting LoadFromArray\n"); + + if (!PyArg_ParseTuple(args, "O", &input)) return NULL; + + if (NULL == input) return NULL; + + //printf("Got input object parsed as array\n"); + + // Verify we have a 2D array of doubles + if ((PyArray_NDIM(input) != 2) || (!PyArray_ISFLOAT(input))) return NULL; + + n_vertices = PyArray_DIM(input, 0); + n_dim = PyArray_DIM(input, 1); + + //printf("Read array data as shape (%i, %i)\n", n_vertices, n_dim); + + //real *data = new real[n_vertices * n_dim]; + + //printf("Allocated new data array\n", n_vertices, n_dim); + + real *indata = (real *) PyArray_DATA(input); + + // printf("Got pointer to input data\n"); + + // for (long long i = 0; i < n_vertices; ++i) { + // printf("Processing row %i\n", i); + // for (long long j = 0; j < n_dim; ++j) { + // // data[i * n_dim + j] = (real) *((real *) PyArray_GETPTR2(input, i, j)); + // printf("processing col %i\n", j); + // data[i * n_dim + j] = indata[i * n_dim + j]; + // } + // } + + //printf("Completed reading in data from numpy array\n"); + + model.load_from_data(indata, n_vertices, n_dim); + return Py_None; +} + static PyObject *SaveToFile(PyObject *self, PyObject 
*args) { if (!PyArg_ParseTuple(args, "s", &filename)) @@ -114,18 +177,69 @@ static PyObject *SaveToFile(PyObject *self, PyObject *args) return Py_None; } -static PyMethodDef PyExtMethods[] = + +static PyMethodDef LargeVis_methods[] = { { "run", Run, METH_VARARGS, "(All arguments are optional.\nrun(output dimension, threads number, training samples, propagations number, learning rate, rp-trees number, negative samples number, neighbors number, gamma, perplexity)\nFire up LargeVis." }, { "loadfile", LoadFromFile, METH_VARARGS, "loadfile(str filename)\nLoad high-dimensional feature vectors from file." }, { "loadgraph", LoadFromGraph, METH_VARARGS, "loadfile(str filename)\nLoad graph from file." }, { "loaddata", LoadFromList, METH_VARARGS, "loaddata(X)\nLoad data from list." }, + { "loadarray", LoadFromArray, METH_VARARGS, "loadarray(X)\nLoad data from a numpy array."}, { "save", SaveToFile, METH_VARARGS, "save(str filename)\nSave data to file." }, { NULL, NULL, 0, NULL } }; -PyMODINIT_FUNC initLargeVis() +#if PY_MAJOR_VERSION >= 3 + +static int LargeVis_traverse(PyObject *m, visitproc visit, void *arg) { + Py_VISIT(GETSTATE(m)->error); + return 0; +} + +static int LargeVis_clear(PyObject *m) { + Py_CLEAR(GETSTATE(m)->error); + return 0; +} +static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + "LargeVis", + NULL, + sizeof(struct module_state), + LargeVis_methods, + NULL, + LargeVis_traverse, + LargeVis_clear, + NULL +}; + +#define INITERROR return NULL + +PyMODINIT_FUNC +PyInit_LargeVis(void) + +#else +#define INITERROR return + +PyMODINIT_FUNC +initLargeVis(void) +#endif { - printf("LargeVis successfully imported!\n"); - Py_InitModule("LargeVis", PyExtMethods); +#if PY_MAJOR_VERSION >= 3 + PyObject *module = PyModule_Create(&moduledef); +#else + PyObject *module = Py_InitModule("LargeVis", LargeVis_methods); +#endif + if (module == NULL) + INITERROR; + struct module_state *st = GETSTATE(module); + + st->error = PyErr_NewException("LargeVis.Error", NULL, 
NULL); + if (st->error == NULL) { + Py_DECREF(module); + INITERROR; + } + +#if PY_MAJOR_VERSION >= 3 + return module; +#endif } diff --git a/Linux/irun.sh b/Linux/irun.sh old mode 100644 new mode 100755 diff --git a/Linux/main.cpp b/Linux/main.cpp index 22abea8..991e7a9 100644 --- a/Linux/main.cpp +++ b/Linux/main.cpp @@ -6,11 +6,12 @@ char infile[1000], outfile[1000]; long long if_embed = 1, out_dim = -1, n_samples = -1, n_threads = -1, n_negative = -1, n_neighbors = -1, n_trees = -1, n_propagation = -1; real alpha = -1, n_gamma = -1, perplexity = -1; +bool use_default_weight = false; int ArgPos(char *str, int argc, char **argv) { int a; for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { - if (a == argc - 1) { + if (a == argc - 1 && strcmp(str, "--default-weight")) { printf("Argument missing for %s\n", str); exit(1); } @@ -22,42 +23,46 @@ int ArgPos(char *str, int argc, char **argv) { int main(int argc, char **argv) { long long i; - if (argc < 3) - { - printf("-fea: specify whether the input file is high-dimensional feature vectors (1) or networks (0). Default is 1.\n"); - printf("-input: Input file of feature vectors or networks\n"); - printf("-output: Output file of low-dimensional representations.\n"); - printf("-threads: Number of threads. Default is 8.\n"); - printf("-outdim: The lower dimensionality LargesVis learns for visualization (usually 2 or 3). Default is 2.\n"); - printf("-samples: Number of edge samples for graph layout (in millions). Default is set to data size / 100 (million).\n"); - printf("-prop: Number of times for neighbor propagations in the state of K-NNG construction, usually less than 3. Default is 3.\n"); - printf("-alpha: Initial learning rate. Default is 1.0.\n"); - printf("-trees: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases.\n"); - printf("-neg: Number of negative samples used for negative sampling. 
Default is 5.\n"); - printf("-neigh: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150.\n"); - printf("-gamma: The weights assigned to negative edges. Default is 7.\n"); - printf("-perp: The perplexity used for deciding edge weights in K-NNG. Default is 50.\n"); - return 0; - } - if ((i = ArgPos((char *)"-fea", argc, argv)) > 0) if_embed = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-input", argc, argv)) > 0) strcpy(infile, argv[i + 1]); - if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(outfile, argv[i + 1]); - if ((i = ArgPos((char *)"-outdim", argc, argv)) > 0) out_dim = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-samples", argc, argv)) > 0) n_samples = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) n_threads = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-neg", argc, argv)) > 0) n_negative = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-neigh", argc, argv)) > 0) n_neighbors = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-trees", argc, argv)) > 0) n_trees = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-prop", argc, argv)) > 0) n_propagation = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); - if ((i = ArgPos((char *)"-gamma", argc, argv)) > 0) n_gamma = atof(argv[i + 1]); - if ((i = ArgPos((char *)"-perp", argc, argv)) > 0) perplexity = atof(argv[i + 1]); + if ((i = ArgPos((char *)"--fea", argc, argv)) > 0) if_embed = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--input", argc, argv)) > 0) strcpy(infile, argv[i + 1]); + if ((i = ArgPos((char *)"--output", argc, argv)) > 0) strcpy(outfile, argv[i + 1]); + if ((i = ArgPos((char *)"--outdim", argc, argv)) > 0) out_dim = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--samples", argc, argv)) > 0) n_samples = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--threads", argc, argv)) > 0) n_threads = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--neg", argc, argv)) > 0) 
n_negative = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--neigh", argc, argv)) > 0) n_neighbors = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--trees", argc, argv)) > 0) n_trees = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--prop", argc, argv)) > 0) n_propagation = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); + if ((i = ArgPos((char *)"--gamma", argc, argv)) > 0) n_gamma = atof(argv[i + 1]); + if ((i = ArgPos((char *)"--perp", argc, argv)) > 0) perplexity = atof(argv[i + 1]); + if ((i = ArgPos((char *)"--default-weight", argc, argv)) > 0) use_default_weight = true; + + if (argc < 3 || strlen(infile) == 0 || strlen(outfile) == 0) + { + printf("--fea: specify whether the input file is high-dimensional feature vectors (1) or networks (0). Default is 1.\n"); + printf("--input: Input file of feature vectors or networks\n"); + printf("--output: Output file of low-dimensional representations.\n"); + printf("--threads: Number of threads. Default is 8.\n"); + printf("--outdim: The lower dimensionality LargesVis learns for visualization (usually 2 or 3). Default is 2.\n"); + printf("--samples: Number of edge samples for graph layout (in millions). Default is set to data size / 100 (million).\n"); + printf("--prop: Number of times for neighbor propagations in the state of K-NNG construction, usually less than 3. Default is 3.\n"); + printf("--alpha: Initial learning rate. Default is 1.0.\n"); + printf("--trees: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases.\n"); + printf("--neg: Number of negative samples used for negative sampling. Default is 5.\n"); + printf("--neigh: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150.\n"); + printf("--gamma: The weights assigned to negative edges. Default is 7.\n"); + printf("--perp: The perplexity used for deciding edge weights in K-NNG. 
Default is 50.\n"); + printf("--default-weight: Use 1 as weight of edges instead of reading weight from edge list.\n"); + return 2; + } LargeVis model; - if (if_embed) - model.load_from_file(infile); - else - model.load_from_graph(infile); + if (if_embed) + model.load_from_file(infile); + else + model.load_from_graph(infile, use_default_weight); + model.run(out_dim, n_threads, n_samples, n_propagation, alpha, n_trees, n_negative, n_neighbors, n_gamma, perplexity); model.save(outfile); diff --git a/Linux/makefile b/Linux/makefile new file mode 100644 index 0000000..43ffa40 --- /dev/null +++ b/Linux/makefile @@ -0,0 +1,14 @@ +all: LargeVis + +LargeVis: LargeVis.o main.o + g++ LargeVis.o main.o -o LargeVis -lm -pthread -lgsl -lgslcblas -Ofast -march=native -ffast-math + +LargeVis.o: LargeVis.cpp LargeVis.h ANNOY/* + g++ LargeVis.cpp -c -Ofast + +main.o: main.cpp LargeVis.h ANNOY/* + g++ main.cpp -c -Ofast + +.PHONY: clean +clean: + rm -f LargeVis *.o \ No newline at end of file diff --git a/Linux/setup.py b/Linux/setup.py index 029c8f5..3d20921 100644 --- a/Linux/setup.py +++ b/Linux/setup.py @@ -1,9 +1,10 @@ from distutils.core import setup, Extension +import numpy as np LargeVis = Extension('LargeVis', sources = ['LargeVis.cpp', 'LargeVismodule.cpp'], depends=['LargeVis.h'], - include_dirs = ['/usr/local/include'], + include_dirs = ['/usr/local/include', np.get_include()], library_dirs = ['/usr/local/lib'], libraries=['gsl', 'gslcblas'], extra_compile_args=['-lm -pthread -lgsl -lgslcblas -Ofast -march=native -ffast-math']) diff --git a/README.md b/README.md index a503542..91560b3 100644 --- a/README.md +++ b/README.md @@ -1,38 +1,44 @@ -#LargeVis +# LargeVis + This is the *official* implementation of the **LargeVis** model by the original authors, which is used to visualize large-scale and high-dimensional data [(Tang, Liu, Zhang and Mei)](https://arxiv.org/abs/1602.00370). It now supports visualizing both high-dimensional feature vectors and networks. 
The package also contains a very efficient algorithm for constructing K-nearest neighbor graph (K-NNG). Contact person: Jian Tang, tangjianpku@gmail.com. This work is done when the author is in Microsoft Research Asia. -##Install +## Install + Both C++ source codes and Python wrapper are provided on Linux, OS X and Windows. To install the package, external packages are required, including [GSL (GNU Scientific Library)](http://www.gnu.org/software/gsl/) on Linux and OS X or [BOOST](http://www.boost.org/) on Windows for generating random numbers. -####Linux +#### Linux + Compile the source files via: -``` +```bash g++ LargeVis.cpp main.cpp -o LargeVis -lm -pthread -lgsl -lgslcblas -Ofast -march=native -ffast-math ``` To install the Python wrapper, modify ```setup.py``` to make sure that the GSL path is correctly set and then run ```sudo python setup.py install```. -####OS X +#### OS X + Install gsl using [Homebrew](http://brew.sh/): -``` +```bash brew install gsl ``` Modify line 347 of ```annoylib.h``` to change ```lseek64``` to ```lseek```. Then compile the source files (in the Linux folder) via: -``` +```bash g++ LargeVis.cpp main.cpp -o LargeVis -lm -pthread -lgsl -lgslcblas -Ofast -march=native -ffast-math -L/usr/local/lib -I/usr/local/include ``` To install the Python wrapper, run ```sudo python setup.py install```. -####Windows +#### Windows + To compile the source files, use Microsoft Visual Studio, where you need to set the BOOST path. To install the Python wrapper, modify ```setup.py``` to make sure that the BOOST path is correctly set and then run ```python setup.py install```. -##Usage +## Usage + LargeVis is suitable for visualizing both high-dimensional feature vectors and networks. 
For high-dimensional feature vectors, the format of input file should be as follows: the first line specifies the number of feature vectors and the dimensionality (500 vectors with 10 dimensions in the following example), and each of the next 500 lines describes one feature vector with 10 float numbers. ``` 500 10 @@ -55,44 +61,47 @@ For networks, each line of the input file is a DIRECTED edge. For each undirecte 495 498 1.5 ``` For C++ executable file, -``` -./LargeVis -input -output +```bash +./LargeVis --input INPUT --output OUTPUT ``` or for Python, -``` -python LargeVis_run.py -input -output +```bash +python LargeVis_run.py --input INPUT --output OUTPUT ``` -* `-input`: Input file of feature vectors or networks (see the Example folders for input format). -* `-output`: Output file of low-dimensional representations. +* `--input`: Input file of feature vectors or networks (see the Example folders for input format). +* `--output`: Output file of low-dimensional representations. Besides the two parameters, other optional parameters include: -* `-fea`: specify whether the input file is high-dimensional feature vectors (1) or networks (0). Default is 1. -* `-threads`: Number of threads. Default is 8. -* `-outdim`: The lower dimensionality LargesVis learns for visualization (usually 2 or 3). Default is 2. -* `-samples`: Number of edge samples for graph layout (in millions). Default is set to ```data size / 100``` (million). -* `-prop`: Number of times for neighbor propagations in the state of K-NNG construction, usually less than 3. Default is 3. -* `-alpha`: Initial learning rate. Default is 1.0. -* `-trees`: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases unless you are dealing with very large datasets (e.g. data size over 5 million), and less trees are suitable for smaller datasets. Default is set according to the data size. -* `-neg`: Number of negative samples used for negative sampling. Default is 5. 
-* `-neigh`: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150. -* `-gamma`: The weights assigned to negative edges. Default is 7. -* `-perp`: The perplexity used for deciding edge weights in K-NNG. Default is 50. - -##Examples +* `--fea`: Specify whether the input file contains high-dimensional feature vectors (1) or a network (0). Default is 1. +* `--threads`: Number of threads. Default is 8. +* `--outdim`: The lower dimensionality LargeVis learns for visualization (usually 2 or 3). Default is 2. +* `--samples`: Number of edge samples for graph layout (in millions). Default is set to ```data size / 100``` (million). +* `--prop`: Number of neighbor propagation rounds in the K-NNG construction stage, usually less than 3. Default is 3. +* `--alpha`: Initial learning rate. Default is 1.0. +* `--trees`: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases unless you are dealing with very large datasets (e.g. data size over 5 million), and fewer trees are suitable for smaller datasets. Default is set according to the data size. +* `--neg`: Number of negative samples used for negative sampling. Default is 5. +* `--neigh`: Number of neighbors (K) in K-NNG, which is usually set to three times the perplexity. Default is 150. +* `--gamma`: The weight assigned to negative edges. Default is 7. +* `--perp`: The perplexity used for deciding edge weights in K-NNG. Default is 50. + +## Examples + We provide some examples including MNIST(high-dimensional feature vectors) and CondMat(networks) in the ```Examples/``` folder.
For example, to visualize the MNIST dataset, -``` -python LargeVis_run.py -input mnist_vec784D.txt -output mnist_vec2D.txt -threads 16 -python plot.py -input mnist_vec2D.txt -label mnist_label.txt -output mnist_vec2D_plot +```bash +python LargeVis_run.py --input mnist_vec784D.txt --output mnist_vec2D.txt --threads 16 +python plot.py --input mnist_vec2D.txt --label mnist_label.txt --output mnist_vec2D_plot ``` ![plot of mnist](Examples/MNIST/mnist_plot.png) Please cite the following paper if you use LargeVis to visualize your data. -##Citation -``` + +## Citation + +```bibtex @inproceedings{tang2016visualizing, title={Visualizing Large-scale and High-dimensional Data}, author={Tang, Jian and Liu, Jingzhou and Zhang, Ming and Mei, Qiaozhu}, @@ -102,5 +111,7 @@ Please cite the following paper if you use LargeVis to visualize your data. organization={International World Wide Web Conferences Steering Committee} } ``` -##Acknowledgement + +## Acknowledgement + Some methods of this package are from a previous work of the LargeVis authors, [LINE (Large-scale Information Network Embedding)](https://github.com/tangjianpku/LINE). diff --git a/Windows/main.cpp b/Windows/main.cpp index 22abea8..b0413b6 100644 --- a/Windows/main.cpp +++ b/Windows/main.cpp @@ -24,34 +24,34 @@ int main(int argc, char **argv) long long i; if (argc < 3) { - printf("-fea: specify whether the input file is high-dimensional feature vectors (1) or networks (0). Default is 1.\n"); - printf("-input: Input file of feature vectors or networks\n"); - printf("-output: Output file of low-dimensional representations.\n"); - printf("-threads: Number of threads. Default is 8.\n"); - printf("-outdim: The lower dimensionality LargesVis learns for visualization (usually 2 or 3). Default is 2.\n"); - printf("-samples: Number of edge samples for graph layout (in millions). 
Default is set to data size / 100 (million).\n"); - printf("-prop: Number of times for neighbor propagations in the state of K-NNG construction, usually less than 3. Default is 3.\n"); - printf("-alpha: Initial learning rate. Default is 1.0.\n"); - printf("-trees: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases.\n"); - printf("-neg: Number of negative samples used for negative sampling. Default is 5.\n"); - printf("-neigh: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150.\n"); - printf("-gamma: The weights assigned to negative edges. Default is 7.\n"); - printf("-perp: The perplexity used for deciding edge weights in K-NNG. Default is 50.\n"); + printf("--fea: specify whether the input file is high-dimensional feature vectors (1) or networks (0). Default is 1.\n"); + printf("--input: Input file of feature vectors or networks\n"); + printf("--output: Output file of low-dimensional representations.\n"); + printf("--threads: Number of threads. Default is 8.\n"); + printf("--outdim: The lower dimensionality LargesVis learns for visualization (usually 2 or 3). Default is 2.\n"); + printf("--samples: Number of edge samples for graph layout (in millions). Default is set to data size / 100 (million).\n"); + printf("--prop: Number of times for neighbor propagations in the state of K-NNG construction, usually less than 3. Default is 3.\n"); + printf("--alpha: Initial learning rate. Default is 1.0.\n"); + printf("--trees: Number of random-projection trees used for constructing K-NNG. 50 is sufficient for most cases.\n"); + printf("--neg: Number of negative samples used for negative sampling. Default is 5.\n"); + printf("--neigh: Number of neighbors (K) in K-NNG, which is usually set as three times of perplexity. Default is 150.\n"); + printf("--gamma: The weights assigned to negative edges. 
Default is 7.\n"); + printf("--perp: The perplexity used for deciding edge weights in K-NNG. Default is 50.\n"); return 0; } - if ((i = ArgPos((char *)"-fea", argc, argv)) > 0) if_embed = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-input", argc, argv)) > 0) strcpy(infile, argv[i + 1]); - if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(outfile, argv[i + 1]); - if ((i = ArgPos((char *)"-outdim", argc, argv)) > 0) out_dim = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-samples", argc, argv)) > 0) n_samples = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) n_threads = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-neg", argc, argv)) > 0) n_negative = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-neigh", argc, argv)) > 0) n_neighbors = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-trees", argc, argv)) > 0) n_trees = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-prop", argc, argv)) > 0) n_propagation = atoi(argv[i + 1]); - if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); - if ((i = ArgPos((char *)"-gamma", argc, argv)) > 0) n_gamma = atof(argv[i + 1]); - if ((i = ArgPos((char *)"-perp", argc, argv)) > 0) perplexity = atof(argv[i + 1]); + if ((i = ArgPos((char *)"--fea", argc, argv)) > 0) if_embed = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--input", argc, argv)) > 0) strcpy(infile, argv[i + 1]); + if ((i = ArgPos((char *)"--output", argc, argv)) > 0) strcpy(outfile, argv[i + 1]); + if ((i = ArgPos((char *)"--outdim", argc, argv)) > 0) out_dim = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--samples", argc, argv)) > 0) n_samples = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--threads", argc, argv)) > 0) n_threads = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--neg", argc, argv)) > 0) n_negative = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--neigh", argc, argv)) > 0) n_neighbors = atoi(argv[i + 1]); + if ((i = ArgPos((char *)"--trees", argc, argv)) > 0) n_trees = atoi(argv[i + 1]); + if 
#!/usr/bin/env python3
"""Scatter-plot 2-D points read from a text file and save the figure.

Input file: the first line is a header and is ignored; every following line
is whitespace-separated, with the node id in the first field and the x/y
coordinates in the last two fields.

Optional inputs:
  * clusters file -- one ``<node> <cluster>`` pair per line; points are
    coloured per cluster (nodes without a cluster share the "" group).
  * labels file   -- one node id per line; each listed node that has a
    position is annotated with its id.
"""

import argparse
from typing import Dict, Iterable, List, Tuple

import numpy as np

Position = Tuple[float, float]


def parse_args(argv=None):
    """Parse the command line (``argv=None`` means ``sys.argv[1:]``)."""
    parser = argparse.ArgumentParser()
    # Required options need no default: argparse refuses to run without them.
    parser.add_argument("--input", "-i", help="input file", required=True)
    parser.add_argument("--output", "-o", help="output file", required=True)
    parser.add_argument("--clusters", "-c", default="", help="clusters file")
    parser.add_argument("--labels", "-l", default="", help="labels to annotate file")
    parser.add_argument("--range", "-r", type=float, help="axis range")
    parser.add_argument("--no-axis", "-n", help="hide axis", action="store_true")
    parser.add_argument("--legend", "-s", help="show legend", action="store_true")
    return parser.parse_args(argv)


def parse_clusters(lines: Iterable[str]) -> Dict[str, str]:
    """Return a ``node -> cluster`` mapping from ``<node> <cluster>`` lines.

    Blank lines are skipped. A later line for the same node overrides an
    earlier one.

    Raises:
        ValueError: if a non-blank line does not hold exactly two fields.
    """
    clusters = {}
    for lineno, line in enumerate(lines, start=1):
        fields = line.split()
        if not fields:
            continue
        if len(fields) != 2:
            raise ValueError(
                "clusters line {}: expected '<node> <cluster>', got {!r}".format(
                    lineno, line.strip()
                )
            )
        node, cluster = fields
        clusters[node] = cluster
    return clusters


def parse_labels(lines: Iterable[str]) -> List[str]:
    """Return the stripped, non-empty lines as the node ids to annotate."""
    stripped = (line.strip() for line in lines)
    return [label for label in stripped if label]


def parse_positions(
    lines: Iterable[str], clusters: Dict[str, str]
) -> Tuple[Dict[str, Position], Dict[str, List[Position]]]:
    """Read point coordinates and group them by cluster.

    The first line is treated as a header and ignored. In every other
    non-blank line the first field is the node id and the last two fields
    are the x and y coordinates.

    Args:
        lines: the lines of the input file, header included.
        clusters: ``node -> cluster`` mapping; nodes that are missing from it
            are grouped under the empty-string cluster.

    Returns:
        ``(positions, positions_by_cluster)`` where ``positions`` maps a node
        id to its ``(x, y)`` and ``positions_by_cluster`` maps a cluster name
        to the list of its points in file order.

    Raises:
        ValueError: if a line has fewer than two fields or a coordinate is
            not a valid float.
    """
    positions = {}
    positions_by_cluster = {}
    rows = iter(lines)
    next(rows, None)  # ignore the header line (no error on an empty file)
    for lineno, line in enumerate(rows, start=2):
        fields = line.split()
        if not fields:
            continue
        if len(fields) < 2:
            raise ValueError(
                "input line {}: expected at least two fields, got {!r}".format(
                    lineno, line.strip()
                )
            )
        node = fields[0]
        pos = (float(fields[-2]), float(fields[-1]))
        positions[node] = pos
        positions_by_cluster.setdefault(clusters.get(node, ""), []).append(pos)
    return positions, positions_by_cluster


def render(positions, positions_by_cluster, labels, args):
    """Draw the clusters and annotations, then save to ``args.output``."""
    # Imported here so that --help and argument errors do not pay for (or
    # depend on) loading matplotlib.
    from matplotlib import rcParams, pyplot as plt

    # See the matplotlib docs for "svg.fonttype"; "none" is expected to keep
    # text as text instead of converting it to paths in SVG output.
    rcParams["svg.fonttype"] = "none"

    cluster_names = sorted(positions_by_cluster)
    colors = plt.cm.tab10(np.linspace(0, 1, len(cluster_names)))
    for color, cluster in zip(colors, cluster_names):
        # Every cluster list holds at least one point, so zip(*...) is safe.
        xs, ys = zip(*positions_by_cluster[cluster])
        plt.plot(xs, ys, ".", color=color, markersize=1, label=cluster)

    for node in labels:
        if node not in positions:
            continue  # labels without a plotted point are silently skipped
        x, y = positions[node]
        plt.annotate(
            node,
            xy=(x, y),
            xytext=(x - 3, y - 3),
            arrowprops=dict(arrowstyle="-"),
            fontsize="xx-small",
        )

    # A missing or zero range leaves the axis limits to matplotlib.
    if args.range:
        axis_limit = abs(args.range)
        plt.xlim(-axis_limit, axis_limit)
        plt.ylim(-axis_limit, axis_limit)

    if args.no_axis:
        plt.axis("off")

    if args.legend:
        plt.legend()

    plt.gca().set_aspect("equal", adjustable="box")
    plt.savefig(args.output, dpi=500)


def main(argv=None):
    """Script entry point: parse arguments, load the files, draw the plot."""
    args = parse_args(argv)

    clusters = {}
    if args.clusters:
        with open(args.clusters, encoding="utf-8") as f:
            clusters = parse_clusters(f)

    labels = []
    if args.labels:
        with open(args.labels, encoding="utf-8") as f:
            labels = parse_labels(f)

    with open(args.input, encoding="utf-8") as f:
        positions, positions_by_cluster = parse_positions(f, clusters)

    render(positions, positions_by_cluster, labels, args)


if __name__ == "__main__":
    main()