Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# compiled
_tagger_swig.so
libtagger.so
cleandict
environments
organisms
species
tagcorpus
*.o

# swig
tagger_swig.py
tagger_swig_wrap.cxx

# lib
libtagger.a

# output
out

# other
logs
# test_script.sh
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,17 @@ NB: additionally, an output file for the scored pairs output must also be specif

By default, pairs are given a score of 1 if they occur in the same document, a score of 2 if they occur in the same paragraph, and a score of 0.2 if they occur in the same sentence. The parameter a in the following formula controls the weight of the normalization factor (actually, 1-a is the exponent on the normalization factor, but let's not be too pedantic).

c_ij (co-occurrence) = sum over documents of ( w_c * (delta_s(i,j) * w_s + delta_p(i,j) * w_p + delta_d(i,j) * w_d) )

Where s=sentence, p=paragraph, d=document and c=corpus. w_x stands for x-weight. delta_s(i,j) evaluates to 1 if term i and j occur in the same sentence and 0 otherwise and similarly for paragraphs and documents.

Note that corpus weights (w_c) are read from an optional input file. This file should contain 2 columns: the first column is the PubMed ID and the second column is its corpus weight (if a document/PubMed ID is not assigned a weight, its weight defaults to 1.0).

score = c_ij^a * ( c_ij * c_.. / c_i. * c_.j )^(1-a)

Where . is shorthand for "count over all entities of the given type".

These values can be set with the --document-weight, --paragraph-weight, --sentence-weight and --normalization-factor command line options respectively.
These values can be set with the --corpus-weights, --document-weight, --paragraph-weight, --sentence-weight and --normalization-factor command line options respectively.


#### Deprecated: Organism ####
Expand Down Expand Up @@ -252,8 +258,8 @@ If --out-segments is specified, the sentence segmentation will be written to the
Example: specify stopwords and pairs, and output pairs output to a file called output-pairs.
~~~~
gzip -cd `ls -1r /home/purple1/databases/Medline/*.tsv.gz` | tagcorpus --entities=entities \\
--names=names --stopwords=all_global.tsv --type-pairs=typepairs --threads=16 \\
--out-pairs=output-pairs --types=types --out-segments=all_segments.tsv > output-mentions
--names=names --stopwords=all_global.tsv --corpus-weights=all_weights.tsv --type-pairs=typepairs \\
--threads=16 --out-pairs=output-pairs --types=types --out-segments=all_segments.tsv > output-mentions
~~~~

## TODO ##
Expand Down
2 changes: 2 additions & 0 deletions data/test_Wc.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
23909892 1.0
25921289 3.0
41 changes: 37 additions & 4 deletions document.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include <unordered_set>
#include <unordered_map>

typedef double SCORE;

using namespace std;

struct Segment
Expand All @@ -30,11 +32,12 @@ class Document
int key;
char* name;
char* text;
SCORE weight;

public:
Document();
Document(const Document& other);
Document(int key, const char* text);
Document(int key, const char* text, SCORE weight);
virtual ~Document();

public:
Expand Down Expand Up @@ -89,36 +92,41 @@ class TsvDocumentReader : public IDocumentReader, protected InputFile, protected
{
protected:
unordered_set<int> seen;
unordered_map<int, SCORE> weights;

public:
TsvDocumentReader(FILE* file);
TsvDocumentReader(const char* filename);
~TsvDocumentReader();

public:
void load_weights(InputFile file);
void load_weights(FILE* file);
void load_weights(const char* filename);
Document* read_document();
};

////////////////////////////////////////////////////////////////////////////////

// Default constructor: no key, no owned buffers, neutral corpus weight.
Document::Document() : key(0), name(NULL), text(NULL), weight(1.0)
{
}

Document::Document(const Document& other)
{
	// Copy the key and corpus weight, then deep-copy the owned text buffer.
	this->key = other.key;
	this->weight = other.weight;
	// NOTE(review): the original left `name` uninitialized in the copy
	// constructor; initialize it like the default constructor does so the
	// pointer is always well-defined. (Deep-copying other.name would change
	// ownership semantics we cannot verify from here.)
	this->name = NULL;
	if (other.text != NULL) {
		int length = strlen(other.text);
		this->text = new char[length+1];
		memcpy(this->text, other.text, length+1);
	}
	else {
		// A default-constructed source has text == NULL; calling
		// strlen(NULL) on it would be undefined behavior (crash).
		this->text = NULL;
	}
}

Document::Document(int key, const char* text)
Document::Document(int key, const char* text, SCORE weight = 1.0)
{
this->key = key;
this->weight = weight;
int length = strlen(text);
this->text = new char[length+1];
memcpy(this->text, text, length+1);
Expand Down Expand Up @@ -363,6 +371,26 @@ TsvDocumentReader::~TsvDocumentReader()
{
}

// Read per-document corpus weights from a 2-column TSV stream:
// column 0 = document key (PubMed ID), column 1 = weight.
void TsvDocumentReader::load_weights(InputFile file)
{
	for (;;) {
		vector<char*> fields = file.get_fields();
		// An empty field vector signals end of input.
		if (fields.empty()) {
			break;
		}
		// Only lines whose second column is non-empty carry a weight.
		if (fields.size() >= 2 && fields[1][0] != '\0' && fields[1][0] != '\t') {
			weights[atoi(fields[0])] = atof(fields[1]);
		}
		// get_fields() hands ownership of each field buffer to the caller.
		for (size_t i = 0; i < fields.size(); ++i) {
			delete fields[i];
		}
	}
}

// Convenience overload: wrap an already-open stream and load weights from it.
void TsvDocumentReader::load_weights(FILE* file)
{
	load_weights(InputFile(file));
}

// Convenience overload: open the named file and load weights from it.
void TsvDocumentReader::load_weights(const char* filename)
{
	load_weights(InputFile(filename));
}

Document* TsvDocumentReader::read_document()
{
TsvDocument* document = new TsvDocument();
Expand Down Expand Up @@ -426,6 +454,11 @@ Document* TsvDocumentReader::read_document()
else {
valid = false;
}
// Map weight.
unordered_map<int, SCORE>::iterator it = weights.find(document->key);
if (it != weights.end()) {
document->weight = it->second;
}
}
else if (line) {
free(line);
Expand Down
6 changes: 3 additions & 3 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ libtagger.a: tagger.o
ar -rfs -o $@ $<

tagcorpus: tagcorpus.cxx acronyms.h document.h file.h hash.h mutex.h thread.h match_handlers.h base_handlers.h meta_handlers.h print_handlers.h score_handlers.h segment_handlers.h batch_tagger.h threaded_batch_tagger.h tagger.h tagger_core.h tagger_types.h tightvector.h tokens.h
$(CC) $(CFLAGS) -lboost_regex -pthread -o $@ $< -lm
$(CC) $(CFLAGS) -pthread -o $@ $< -lboost_regex -lm

cleandict: cleandict.cxx acronyms.h file.h hash.h tagger.h tagger_core.h tagger_types.h
$(CC) $(CFLAGS) -lboost_regex -pthread -o $@ $< -lm
$(CC) $(CFLAGS) -pthread -o $@ $< -lboost_regex -lm

%: %.cxx acronyms.h document.h file.h hash.h mutex.h match_handlers.h base_handlers.h batch_tagger.h tagger.h tagger_core.h tagger_types.h tightvector.h tokens.h
$(CC) $(CFLAGS) -lboost_regex -pthread -o $@ $< -lm
$(CC) $(CFLAGS) -pthread -o $@ $< -lboost_regex -lm
6 changes: 3 additions & 3 deletions score_handlers.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,9 @@ void ScoreDocumentHandler::on_document_begin(Document& document)
void ScoreDocumentHandler::on_document_end(Document& document)
{
ScoreBatchHandler* score_batch_handler = (ScoreBatchHandler*) this->batch_handler;
this->commit_pairs(this->document_pair_set, score_batch_handler->document_weight);
this->commit_pairs(this->paragraph_pair_set, score_batch_handler->paragraph_weight);
this->commit_pairs(this->sentence_pair_set, score_batch_handler->sentence_weight);
this->commit_pairs(this->document_pair_set, score_batch_handler->document_weight*(document.weight));
this->commit_pairs(this->paragraph_pair_set, score_batch_handler->paragraph_weight*(document.weight));
this->commit_pairs(this->sentence_pair_set, score_batch_handler->sentence_weight*(document.weight));
if (this->pair_score_map.size() >= 10000) {
score_batch_handler->lock();
score_batch_handler->pair_score_map += this->pair_score_map;
Expand Down
19 changes: 16 additions & 3 deletions tagcorpus.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ extern "C"
}

#define MAXFILENAMELEN 256
#define VERSION "1.1"
#define VERSION "1.2"

using namespace std;

Expand Down Expand Up @@ -47,6 +47,7 @@ int main (int argc, char *argv[])
char localstopwords[MAXFILENAMELEN] = "";
bool autodetect = false;
bool tokenize_characters = false;
char corpus_weights[MAXFILENAMELEN] = "";
float document_weight = 1;
float paragraph_weight = 2;
float sentence_weight = 0.2;
Expand All @@ -70,6 +71,7 @@ int main (int argc, char *argv[])
{"local-stopwords", optional_argument, 0, 'l'},
{"autodetect", no_argument, 0, 'u'},
{"tokenize-characters", no_argument, 0, 'z'},
{"corpus-weights", optional_argument, 0, 'w'},
{"document-weight", optional_argument, 0, 'd'},
{"paragraph-weight", optional_argument, 0, 'r'},
{"sentence-weight", optional_argument, 0, 'c'},
Expand All @@ -85,7 +87,7 @@ int main (int argc, char *argv[])

int option_index = 0;

c = getopt_long (argc, argv, "y:e:n:i:g:p:s:l:u:d:r:c:f:t:m:a:h:", long_options, &option_index);
c = getopt_long (argc, argv, "y:e:n:i:g:p:s:l:u:w:d:r:c:f:t:m:a:h:", long_options, &option_index);

/* Detect the end of the options. */
if (c == -1)
Expand All @@ -107,6 +109,7 @@ int main (int argc, char *argv[])
printf("\t--local-stopwords=filename\n");
printf("\t--autodetect Turn autodetect on\n");
printf("\t--tokenize-characters Turn single-character tokenization on\n");
printf("\t--corpus-weights=filename\tIf not specify then all weights default 1.0\n");
printf("\t--document-weight=%1.2f\n", document_weight);
printf("\t--paragraph-weight=%1.2f\n", paragraph_weight);
printf("\t--sentence-weight=%1.2f\n", sentence_weight);
Expand Down Expand Up @@ -175,7 +178,13 @@ int main (int argc, char *argv[])

case 'z':
tokenize_characters = true;


case 'w':
if (optarg) {
strncpy(corpus_weights, optarg, min(MAXFILENAMELEN, int(sizeof(corpus_weights))));
}
break;

case 'd':
if (optarg) {
document_weight = atof(optarg);
Expand Down Expand Up @@ -259,6 +268,10 @@ int main (int argc, char *argv[])
document_reader = new TsvDocumentReader(stdin);
}

if (validate_opt(corpus_weights)) {
document_reader->load_weights(corpus_weights);
}

if (validate_opt(out_matches)) {
batch_handler.push_back(new PrintBatchHandler(out_matches));
}
Expand Down
32 changes: 32 additions & 0 deletions test_script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash

# Interactive smoke test for tagcorpus: asks for an output-name prefix and
# whether to apply corpus weights (Wc), then runs tagcorpus on the test data.

# First parameter: prefix for the output file names.
printf "Define the outputs name: [prefix name]\n"
printf "default: (without name) >>> "
read -r try_num
#try_num=$1

# Second parameter: whether to use corpus weights (yes or no).
printf "Decide whether to use the Wc or not: [yes|no]\n"
printf "default:no >>> "
read -r use_Wc
#use_Wc=$2

input="$(pwd)/data"
output="$(pwd)/out"

# Create the output directory if it does not exist yet.
# Quoting "$output" keeps the test working even with spaces in the path.
if [ -d "$output" ]; then
    echo "dict out exists"
else
    mkdir -p "$output"
    echo "dict out is created"
fi

# case handles an empty answer gracefully (the original unquoted
# [ $use_Wc == 'y' ] tests errored out when the user just pressed Enter).
case "$use_Wc" in
    y|Y|yes|Yes)
        ### with corpus weights
        "$(pwd)/tagcorpus" --types="$input/test_types.tsv" --entities="$input/test_entities.tsv" --names="$input/test_names.tsv" --documents="$input/test_documents.tsv" --out-matches="$output/${try_num}_test_matches.tsv" --out-pairs="$output/${try_num}_test_pairs.tsv" --out-segments="$output/${try_num}_test_segments.tsv" --corpus-weights="$input/test_Wc.tsv"
        ;;
    *)
        ### without corpus weights
        "$(pwd)/tagcorpus" --types="$input/test_types.tsv" --entities="$input/test_entities.tsv" --names="$input/test_names.tsv" --documents="$input/test_documents.tsv" --out-matches="$output/${try_num}_test_matches.tsv" --out-pairs="$output/${try_num}_test_pairs.tsv" --out-segments="$output/${try_num}_test_segments.tsv"
        ;;
esac