Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# compiled
_tagger_swig.so
libtagger.so
cleandict
environments
organisms
species
tagcorpus
*.o

# swig
tagger_swig.py
tagger_swig_wrap.cxx

# lib
libtagger.a

# output
out

# other
logs
# test_script.sh
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,17 @@ NB: additionally, an output file for the scored pairs output must also be specif

By default, pairs are given a score of 1 if they occur in the same document, a score of 2 if they occur in the same paragraph, and a score of 0.2 if they occur in the same sentence. The parameter a in the following formula controls the weight of the normalization factor (actually, 1-a is the exponent on the normalization factor, but let's not be too pedantic).

c_ij (co-occurrence) = sum over documents of ( w_c * (delta_s(i,j) * w_s + delta_p(i,j) * w_p + delta_d(i,j) * w_d) )

Where s=sentence, p=paragraph, d=document and c=corpus. w_x stands for x-weight. delta_s(i,j) evaluates to 1 if term i and j occur in the same sentence and 0 otherwise and similarly for paragraphs and documents.

Note that corpus weights (w_c) are read from an optional input file. This file should contain 2 columns: the first column is the PubMed ID and the second column is its corpus weight (if a document/PubMed ID is not assigned a weight, its weight defaults to 1.0).

score = c_ij^a * ( c_ij * c_.. / c_i. * c_.j )^(1-a)

Where . is shorthand for "count over all entities of the given type".

These values can be set with the --document-weight, --paragraph-weight, --sentence-weight and --normalization-factor command line options respectively.
These values can be set with the --corpus-weights, --document-weight, --paragraph-weight, --sentence-weight and --normalization-factor command line options respectively.


#### Deprecated: Organism ####
Expand Down Expand Up @@ -252,8 +258,8 @@ If --out-segments is specified, the sentence segmentation will be written to the
Example: specify stopwords and pairs, and output pairs output to a file called output-pairs.
~~~~
gzip -cd `ls -1r /home/purple1/databases/Medline/*.tsv.gz` | tagcorpus --entities=entities \\
--names=names --stopwords=all_global.tsv --type-pairs=typepairs --threads=16 \\
--out-pairs=output-pairs --types=types --out-segments=all_segments.tsv > output-mentions
--names=names --stopwords=all_global.tsv --corpus-weights=all_weights.tsv --type-pairs=typepairs \\
--threads=16 --out-pairs=output-pairs --types=types --out-segments=all_segments.tsv > output-mentions
~~~~

## TODO ##
Expand Down
2 changes: 2 additions & 0 deletions data/test_Wc.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
23909892 1.0
25921289 3.0
41 changes: 37 additions & 4 deletions document.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include <unordered_set>
#include <unordered_map>

typedef double SCORE;

using namespace std;

struct Segment
Expand All @@ -30,11 +32,12 @@ class Document
int key;
char* name;
char* text;
SCORE weight;

public:
Document();
Document(const Document& other);
Document(int key, const char* text);
Document(int key, const char* text, SCORE weight);
virtual ~Document();

public:
Expand Down Expand Up @@ -89,36 +92,41 @@ class TsvDocumentReader : public IDocumentReader, protected InputFile, protected
{
protected:
unordered_set<int> seen;
unordered_map<int, SCORE> weights;

public:
TsvDocumentReader(FILE* file);
TsvDocumentReader(const char* filename);
~TsvDocumentReader();

public:
void load_weights(InputFile file);
void load_weights(FILE* file);
void load_weights(const char* filename);
Document* read_document();
};

////////////////////////////////////////////////////////////////////////////////

// Default constructor: no key, no owned buffers, neutral corpus weight.
Document::Document() : key(0), name(NULL), text(NULL), weight(1.0)
{
}

Document::Document(const Document& other)
{
	// Copy the key and corpus weight, then deep-copy the owned text buffer.
	this->key = other.key;
	this->weight = other.weight;
	// NOTE(review): the original left `name` uninitialized in the copy
	// constructor; initialize it like the default constructor does so the
	// pointer is always well-defined. (Deep-copying other.name would change
	// ownership semantics we cannot verify from here.)
	this->name = NULL;
	if (other.text != NULL) {
		int length = strlen(other.text);
		this->text = new char[length+1];
		memcpy(this->text, other.text, length+1);
	}
	else {
		// A default-constructed source has text == NULL; calling
		// strlen(NULL) on it would be undefined behavior (crash).
		this->text = NULL;
	}
}

Document::Document(int key, const char* text)
Document::Document(int key, const char* text, SCORE weight = 1.0)
{
this->key = key;
this->weight = weight;
int length = strlen(text);
this->text = new char[length+1];
memcpy(this->text, text, length+1);
Expand Down Expand Up @@ -363,6 +371,26 @@ TsvDocumentReader::~TsvDocumentReader()
{
}

// Read per-document corpus weights from a 2-column TSV stream:
// column 0 = document key (PubMed ID), column 1 = weight.
void TsvDocumentReader::load_weights(InputFile file)
{
	for (;;) {
		vector<char*> fields = file.get_fields();
		// An empty field vector signals end of input.
		if (fields.empty()) {
			break;
		}
		// Only lines whose second column is non-empty carry a weight.
		if (fields.size() >= 2 && fields[1][0] != '\0' && fields[1][0] != '\t') {
			weights[atoi(fields[0])] = atof(fields[1]);
		}
		// get_fields() hands ownership of each field buffer to the caller.
		for (size_t i = 0; i < fields.size(); ++i) {
			delete fields[i];
		}
	}
}

// Convenience overload: wrap an already-open stream and load weights from it.
void TsvDocumentReader::load_weights(FILE* file)
{
	load_weights(InputFile(file));
}

// Convenience overload: open the named file and load weights from it.
void TsvDocumentReader::load_weights(const char* filename)
{
	load_weights(InputFile(filename));
}

Document* TsvDocumentReader::read_document()
{
TsvDocument* document = new TsvDocument();
Expand Down Expand Up @@ -426,6 +454,11 @@ Document* TsvDocumentReader::read_document()
else {
valid = false;
}
// Map weight.
unordered_map<int, SCORE>::iterator it = weights.find(document->key);
if (it != weights.end()) {
document->weight = it->second;
}
}
else if (line) {
free(line);
Expand Down
6 changes: 3 additions & 3 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ libtagger.a: tagger.o
ar -rfs -o $@ $<

tagcorpus: tagcorpus.cxx acronyms.h document.h file.h hash.h mutex.h thread.h match_handlers.h base_handlers.h meta_handlers.h print_handlers.h score_handlers.h segment_handlers.h batch_tagger.h threaded_batch_tagger.h tagger.h tagger_core.h tagger_types.h tightvector.h tokens.h
$(CC) $(CFLAGS) -lboost_regex -pthread -o $@ $< -lm
$(CC) $(CFLAGS) -pthread -o $@ $< -lboost_regex -lm

cleandict: cleandict.cxx acronyms.h file.h hash.h tagger.h tagger_core.h tagger_types.h
$(CC) $(CFLAGS) -lboost_regex -pthread -o $@ $< -lm
$(CC) $(CFLAGS) -pthread -o $@ $< -lboost_regex -lm

%: %.cxx acronyms.h document.h file.h hash.h mutex.h match_handlers.h base_handlers.h batch_tagger.h tagger.h tagger_core.h tagger_types.h tightvector.h tokens.h
$(CC) $(CFLAGS) -lboost_regex -pthread -o $@ $< -lm
$(CC) $(CFLAGS) -pthread -o $@ $< -lboost_regex -lm
6 changes: 3 additions & 3 deletions score_handlers.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,9 @@ void ScoreDocumentHandler::on_document_begin(Document& document)
void ScoreDocumentHandler::on_document_end(Document& document)
{
ScoreBatchHandler* score_batch_handler = (ScoreBatchHandler*) this->batch_handler;
this->commit_pairs(this->document_pair_set, score_batch_handler->document_weight);
this->commit_pairs(this->paragraph_pair_set, score_batch_handler->paragraph_weight);
this->commit_pairs(this->sentence_pair_set, score_batch_handler->sentence_weight);
this->commit_pairs(this->document_pair_set, score_batch_handler->document_weight*(document.weight));
this->commit_pairs(this->paragraph_pair_set, score_batch_handler->paragraph_weight*(document.weight));
this->commit_pairs(this->sentence_pair_set, score_batch_handler->sentence_weight*(document.weight));
if (this->pair_score_map.size() >= 10000) {
score_batch_handler->lock();
score_batch_handler->pair_score_map += this->pair_score_map;
Expand Down
19 changes: 16 additions & 3 deletions tagcorpus.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ extern "C"
}

#define MAXFILENAMELEN 256
#define VERSION "1.1"
#define VERSION "1.2"

using namespace std;

Expand Down Expand Up @@ -47,6 +47,7 @@ int main (int argc, char *argv[])
char localstopwords[MAXFILENAMELEN] = "";
bool autodetect = false;
bool tokenize_characters = false;
char corpus_weights[MAXFILENAMELEN] = "";
float document_weight = 1;
float paragraph_weight = 2;
float sentence_weight = 0.2;
Expand All @@ -70,6 +71,7 @@ int main (int argc, char *argv[])
{"local-stopwords", optional_argument, 0, 'l'},
{"autodetect", no_argument, 0, 'u'},
{"tokenize-characters", no_argument, 0, 'z'},
{"corpus-weights", optional_argument, 0, 'w'},
{"document-weight", optional_argument, 0, 'd'},
{"paragraph-weight", optional_argument, 0, 'r'},
{"sentence-weight", optional_argument, 0, 'c'},
Expand All @@ -85,7 +87,7 @@ int main (int argc, char *argv[])

int option_index = 0;

c = getopt_long (argc, argv, "y:e:n:i:g:p:s:l:u:d:r:c:f:t:m:a:h:", long_options, &option_index);
c = getopt_long (argc, argv, "y:e:n:i:g:p:s:l:u:w:d:r:c:f:t:m:a:h:", long_options, &option_index);

/* Detect the end of the options. */
if (c == -1)
Expand All @@ -107,6 +109,7 @@ int main (int argc, char *argv[])
printf("\t--local-stopwords=filename\n");
printf("\t--autodetect Turn autodetect on\n");
printf("\t--tokenize-characters Turn single-character tokenization on\n");
printf("\t--corpus-weights=filename\tIf not specify then all weights default 1.0\n");
printf("\t--document-weight=%1.2f\n", document_weight);
printf("\t--paragraph-weight=%1.2f\n", paragraph_weight);
printf("\t--sentence-weight=%1.2f\n", sentence_weight);
Expand Down Expand Up @@ -175,7 +178,13 @@ int main (int argc, char *argv[])

case 'z':
tokenize_characters = true;


case 'w':
if (optarg) {
strncpy(corpus_weights, optarg, min(MAXFILENAMELEN, int(sizeof(corpus_weights))));
}
break;

case 'd':
if (optarg) {
document_weight = atof(optarg);
Expand Down Expand Up @@ -259,6 +268,10 @@ int main (int argc, char *argv[])
document_reader = new TsvDocumentReader(stdin);
}

if (validate_opt(corpus_weights)) {
document_reader->load_weights(corpus_weights);
}

if (validate_opt(out_matches)) {
batch_handler.push_back(new PrintBatchHandler(out_matches));
}
Expand Down
32 changes: 32 additions & 0 deletions test_script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash

# Interactive smoke test for tagcorpus: asks for an output-name prefix and
# whether to apply corpus weights (Wc), then runs tagcorpus on the test data.

# First parameter: prefix for the output file names.
printf "Define the outputs name: [prefix name]\n"
printf "default: (without name) >>> "
read -r try_num
#try_num=$1

# Second parameter: whether to use corpus weights (yes or no).
printf "Decide whether to use the Wc or not: [yes|no]\n"
printf "default:no >>> "
read -r use_Wc
#use_Wc=$2

input="$(pwd)/data"
output="$(pwd)/out"

# Create the output directory if it does not exist yet.
# Quoting "$output" keeps the test working even with spaces in the path.
if [ -d "$output" ]; then
    echo "dict out exists"
else
    mkdir -p "$output"
    echo "dict out is created"
fi

# case handles an empty answer gracefully (the original unquoted
# [ $use_Wc == 'y' ] tests errored out when the user just pressed Enter).
case "$use_Wc" in
    y|Y|yes|Yes)
        ### with corpus weights
        "$(pwd)/tagcorpus" --types="$input/test_types.tsv" --entities="$input/test_entities.tsv" --names="$input/test_names.tsv" --documents="$input/test_documents.tsv" --out-matches="$output/${try_num}_test_matches.tsv" --out-pairs="$output/${try_num}_test_pairs.tsv" --out-segments="$output/${try_num}_test_segments.tsv" --corpus-weights="$input/test_Wc.tsv"
        ;;
    *)
        ### without corpus weights
        "$(pwd)/tagcorpus" --types="$input/test_types.tsv" --entities="$input/test_entities.tsv" --names="$input/test_names.tsv" --documents="$input/test_documents.tsv" --out-matches="$output/${try_num}_test_matches.tsv" --out-pairs="$output/${try_num}_test_pairs.tsv" --out-segments="$output/${try_num}_test_segments.tsv"
        ;;
esac