diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bde9397 --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +# compiled +_tagger_swig.so +libtagger.so +cleandict +environments +organisms +species +tagcorpus +*.o + +# swig +tagger_swig.py +tagger_swig_wrap.cxx + +# lib +libtagger.a + +# output +out + +# other +logs +# test_script.sh diff --git a/README.md b/README.md index db62b48..48dcc5a 100644 --- a/README.md +++ b/README.md @@ -155,11 +155,17 @@ NB: additionally, an output file for the scored pairs output must also be specif By default, pairs are given a score of 1 if they occur in the same document, a score of 2 if they occur in the same paragraph, and a score of 0.2 if they occur in the same sentence. The parameter a in the following formula controls the weight of the normalization factor (actually, 1-a is the exponent on the normalization factor, but let's not be too pedantic). +c_ij (co-occurrence) = (sum(delta_s(i,j) * w_s + delta_p(i,j) * w_p + delta_d(i,j) * w_d)) * w_c + +Where s=sentence, p=paragraph, d=document and c=corpus. w_x stands for x-weight. delta_s(i,j) evaluates to 1 if terms i and j occur in the same sentence and 0 otherwise and similarly for paragraphs and documents. + +Note that corpus weights (w_c) are read from an optional input file. This file should contain two columns: the first column is the PubMed ID and the second column is its corpus weight (if a document/PubMed ID is not assigned a weight, then the default weight is 1.0). + +score = c_ij^a * ( c_ij * c_.. / c_i. * c_.j )^(1-a) Where . is shorthand for "count over all entities of the given type". -These values can be set with the --document-weight, --paragraph-weight, --sentence-weight and --normalization-factor command line options respectively. +These values can be set with the --corpus-weights, --document-weight, --paragraph-weight, --sentence-weight and --normalization-factor command line options respectively. 
#### Deprecated: Organism #### @@ -252,8 +258,8 @@ If --out-segments is specified, the sentence segmentation will be written to the Example: specify stopwords and pairs, and output pairs output to a file called output-pairs. ~~~~ gzip -cd `ls -1r /home/purple1/databases/Medline/*.tsv.gz` | tagcorpus --entities=entities \\ ---names=names --stopwords=all_global.tsv --type-pairs=typepairs --threads=16 \\ ---out-pairs=output-pairs --types=types --out-segments=all_segments.tsv > output-mentions +--names=names --stopwords=all_global.tsv --corpus-weights=all_weights.tsv --type-pairs=typepairs \\ +--threads=16 --out-pairs=output-pairs --types=types --out-segments=all_segments.tsv > output-mentions ~~~~ ## TODO ## diff --git a/data/test_Wc.tsv b/data/test_Wc.tsv new file mode 100644 index 0000000..313a315 --- /dev/null +++ b/data/test_Wc.tsv @@ -0,0 +1,2 @@ +23909892 1.0 +25921289 3.0 diff --git a/document.h b/document.h index 8d8a667..c19fc60 100644 --- a/document.h +++ b/document.h @@ -12,6 +12,8 @@ #include #include +typedef double SCORE; + using namespace std; struct Segment @@ -30,11 +32,12 @@ class Document int key; char* name; char* text; + SCORE weight; public: Document(); Document(const Document& other); - Document(int key, const char* text); + Document(int key, const char* text, SCORE weight); virtual ~Document(); public: @@ -89,6 +92,7 @@ class TsvDocumentReader : public IDocumentReader, protected InputFile, protected { protected: unordered_set seen; + unordered_map weights; public: TsvDocumentReader(FILE* file); @@ -96,29 +100,33 @@ class TsvDocumentReader : public IDocumentReader, protected InputFile, protected ~TsvDocumentReader(); public: + void load_weights(InputFile file); + void load_weights(FILE* file); + void load_weights(const char* filename); Document* read_document(); }; -//////////////////////////////////////////////////////////////////////////////// - Document::Document() { this->key = 0; this->name = NULL; this->text = NULL; + this->weight = 1.0; } 
Document::Document(const Document& other) { this->key = other.key; + this->weight = other.weight; int length = strlen(other.text); this->text = new char[length+1]; memcpy(this->text, other.text, length+1); } -Document::Document(int key, const char* text) +Document::Document(int key, const char* text, SCORE weight = 1.0) { this->key = key; + this->weight = weight; int length = strlen(text); this->text = new char[length+1]; memcpy(this->text, text, length+1); @@ -363,6 +371,26 @@ TsvDocumentReader::~TsvDocumentReader() { } +void TsvDocumentReader::load_weights(InputFile file) { + while (true) { + vector fields = file.get_fields(); + int size = fields.size(); + if (size == 0) break; + if (size >= 2 && *fields[1] != '\0' && *fields[1] != '\t') { + weights[atoi(fields[0])] = atof(fields[1]); + } + for (vector::iterator it = fields.begin(); it != fields.end(); it++) delete *it; + } +} + +void TsvDocumentReader::load_weights(FILE* file) { + load_weights(InputFile(file)); +} + +void TsvDocumentReader::load_weights(const char* filename) { + load_weights(InputFile(filename)); +} + Document* TsvDocumentReader::read_document() { TsvDocument* document = new TsvDocument(); @@ -426,6 +454,11 @@ Document* TsvDocumentReader::read_document() else { valid = false; } + // Map weight. 
+ unordered_map::iterator it = weights.find(document->key); + if (it != weights.end()) { + document->weight = it->second; + } } else if (line) { free(line); diff --git a/makefile b/makefile index 5746b98..fa6b026 100644 --- a/makefile +++ b/makefile @@ -38,10 +38,10 @@ libtagger.a: tagger.o ar -rfs -o $@ $< tagcorpus: tagcorpus.cxx acronyms.h document.h file.h hash.h mutex.h thread.h match_handlers.h base_handlers.h meta_handlers.h print_handlers.h score_handlers.h segment_handlers.h batch_tagger.h threaded_batch_tagger.h tagger.h tagger_core.h tagger_types.h tightvector.h tokens.h - $(CC) $(CFLAGS) -lboost_regex -pthread -o $@ $< -lm + $(CC) $(CFLAGS) -pthread -o $@ $< -lboost_regex -lm cleandict: cleandict.cxx acronyms.h file.h hash.h tagger.h tagger_core.h tagger_types.h - $(CC) $(CFLAGS) -lboost_regex -pthread -o $@ $< -lm + $(CC) $(CFLAGS) -pthread -o $@ $< -lboost_regex -lm %: %.cxx acronyms.h document.h file.h hash.h mutex.h match_handlers.h base_handlers.h batch_tagger.h tagger.h tagger_core.h tagger_types.h tightvector.h tokens.h - $(CC) $(CFLAGS) -lboost_regex -pthread -o $@ $< -lm + $(CC) $(CFLAGS) -pthread -o $@ $< -lboost_regex -lm diff --git a/score_handlers.h b/score_handlers.h index ac1190d..8d1592b 100644 --- a/score_handlers.h +++ b/score_handlers.h @@ -176,9 +176,9 @@ void ScoreDocumentHandler::on_document_begin(Document& document) void ScoreDocumentHandler::on_document_end(Document& document) { ScoreBatchHandler* score_batch_handler = (ScoreBatchHandler*) this->batch_handler; - this->commit_pairs(this->document_pair_set, score_batch_handler->document_weight); - this->commit_pairs(this->paragraph_pair_set, score_batch_handler->paragraph_weight); - this->commit_pairs(this->sentence_pair_set, score_batch_handler->sentence_weight); + this->commit_pairs(this->document_pair_set, score_batch_handler->document_weight*(document.weight)); + this->commit_pairs(this->paragraph_pair_set, score_batch_handler->paragraph_weight*(document.weight)); + 
this->commit_pairs(this->sentence_pair_set, score_batch_handler->sentence_weight*(document.weight)); if (this->pair_score_map.size() >= 10000) { score_batch_handler->lock(); score_batch_handler->pair_score_map += this->pair_score_map; diff --git a/tagcorpus.cxx b/tagcorpus.cxx index f16f6fd..5842a35 100644 --- a/tagcorpus.cxx +++ b/tagcorpus.cxx @@ -15,7 +15,7 @@ extern "C" } #define MAXFILENAMELEN 256 -#define VERSION "1.1" +#define VERSION "1.2" using namespace std; @@ -47,6 +47,7 @@ int main (int argc, char *argv[]) char localstopwords[MAXFILENAMELEN] = ""; bool autodetect = false; bool tokenize_characters = false; + char corpus_weights[MAXFILENAMELEN] = ""; float document_weight = 1; float paragraph_weight = 2; float sentence_weight = 0.2; @@ -70,6 +71,7 @@ int main (int argc, char *argv[]) {"local-stopwords", optional_argument, 0, 'l'}, {"autodetect", no_argument, 0, 'u'}, {"tokenize-characters", no_argument, 0, 'z'}, + {"corpus-weights", optional_argument, 0, 'w'}, {"document-weight", optional_argument, 0, 'd'}, {"paragraph-weight", optional_argument, 0, 'r'}, {"sentence-weight", optional_argument, 0, 'c'}, @@ -85,7 +87,7 @@ int main (int argc, char *argv[]) int option_index = 0; - c = getopt_long (argc, argv, "y:e:n:i:g:p:s:l:u:d:r:c:f:t:m:a:h:", long_options, &option_index); + c = getopt_long (argc, argv, "y:e:n:i:g:p:s:l:u:w:d:r:c:f:t:m:a:h:", long_options, &option_index); /* Detect the end of the options. 
*/ if (c == -1) @@ -107,6 +109,7 @@ int main (int argc, char *argv[]) printf("\t--local-stopwords=filename\n"); printf("\t--autodetect Turn autodetect on\n"); printf("\t--tokenize-characters Turn single-character tokenization on\n"); + printf("\t--corpus-weights=filename\tIf not specify then all weights default 1.0\n"); printf("\t--document-weight=%1.2f\n", document_weight); printf("\t--paragraph-weight=%1.2f\n", paragraph_weight); printf("\t--sentence-weight=%1.2f\n", sentence_weight); @@ -175,7 +178,13 @@ int main (int argc, char *argv[]) case 'z': tokenize_characters = true; - + + case 'w': + if (optarg) { + strncpy(corpus_weights, optarg, min(MAXFILENAMELEN, int(sizeof(corpus_weights)))); + } + break; + case 'd': if (optarg) { document_weight = atof(optarg); @@ -259,6 +268,10 @@ int main (int argc, char *argv[]) document_reader = new TsvDocumentReader(stdin); } + if (validate_opt(corpus_weights)) { + document_reader->load_weights(corpus_weights); + } + if (validate_opt(out_matches)) { batch_handler.push_back(new PrintBatchHandler(out_matches)); } diff --git a/test_script.sh b/test_script.sh new file mode 100644 index 0000000..f2735fb --- /dev/null +++ b/test_script.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# the first para is the name of output +printf "Define the outputs name: [prefix name]\n" +printf "default: (without name) >>> " +read -r try_num +#try_num=$1 + +# the second para is whether to use Wc or not (yes or no) +printf "Decide whether to use the Wc or not: [yes|no]\n" +printf "default:no >>> " +read -r use_Wc +#use_Wc=$2 + +input="$(pwd)/data" +output="$(pwd)/out" + +if [ -d $output ]; then + echo "dict out exsists" +else + mkdir out + echo "dict out is created" +fi + +if [ $use_Wc == 'y' ] || [ $use_Wc == 'Y' ] || [ $use_Wc == 'yes' ] || [ $use_Wc == 'Yes' ]; then + ### with corpus weights + $(pwd)/tagcorpus --types=$input/test_types.tsv --entities=$input/test_entities.tsv --names=$input/test_names.tsv --documents=$input/test_documents.tsv 
--out-matches=$output/${try_num}_test_matches.tsv --out-pairs=$output/${try_num}_test_pairs.tsv --out-segments=$output/${try_num}_test_segments.tsv --corpus-weights=$input/test_Wc.tsv +else + ### without corpus weights + $(pwd)/tagcorpus --types=$input/test_types.tsv --entities=$input/test_entities.tsv --names=$input/test_names.tsv --documents=$input/test_documents.tsv --out-matches=$output/${try_num}_test_matches.tsv --out-pairs=$output/${try_num}_test_pairs.tsv --out-segments=$output/${try_num}_test_segments.tsv +fi +