Skip to content

Commit 07731f1

Browse files
author
Vladimir Dobrovolskii
committed
added prediction script
1 parent 81e66c2 commit 07731f1

File tree

3 files changed

+120
-3
lines changed

3 files changed

+120
-3
lines changed

README.md

+39-3
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,13 @@
33
This is a repository with the code to reproduce the experiments described in the paper of the same name, which was accepted to EMNLP 2021. The paper is available [here](https://aclanthology.org/2021.emnlp-main.605/).
44

55
### Table of contents
6-
1. [Preparation](#preparation)
7-
2. [Training](#training)
8-
3. [Evaluation](#evaluation)
6+
- [Word-Level Coreference Resolution](#word-level-coreference-resolution)
7+
- [Table of contents](#table-of-contents)
8+
- [Preparation](#preparation)
9+
- [Training](#training)
10+
- [Evaluation](#evaluation)
11+
- [Prediction](#prediction)
12+
- [Citation](#citation)
913

1014
### Preparation
1115

@@ -67,6 +71,38 @@ Make sure that you have successfully completed all steps of the [Preparation](#p
6771

6872
python calculate_conll.py roberta test 20
6973

74+
### Prediction
75+
76+
To predict coreference relations on an arbitrary text, you will need to prepare the data in the jsonlines format (one json-formatted document per line).
77+
The following fields are required:
78+
79+
{
80+
"document_id": "tc_mydoc_001",
81+
"cased_words": ["Hi", "!", "Bye", "."],
82+
"sent_id": [0, 0, 1, 1]
83+
}
84+
85+
You can optionally provide the speaker data:
86+
87+
{
88+
"speaker": ["Tom", "Tom", "#2", "#2"]
89+
}
90+
91+
`document_id` can be any string that starts with a two-letter genre identifier. The genres recognized are the following:
92+
* bc: broadcast conversation
93+
* bn: broadcast news
94+
* mz: magazine genre (Sinorama magazine)
95+
* nw: newswire genre
96+
* pt: pivot text (The Bible)
97+
* tc: telephone conversation (CallHome corpus)
98+
* wb: web data
99+
100+
Then run:
101+
102+
python predict.py roberta input.jsonlines output.jsonlines
103+
104+
This will utilize the latest weights available in the data directory for the chosen configuration. To load other weights, use the `--weights` argument.
105+
70106
### Citation
71107
@inproceedings{dobrovolskii-2021-word,
72108
title = "Word-Level Coreference Resolution",

predict.py

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import argparse
2+
3+
import jsonlines
4+
import torch
5+
from tqdm import tqdm
6+
7+
from coref import CorefModel
8+
from coref.tokenizer_customization import *
9+
10+
11+
def build_doc(doc: dict, model: CorefModel) -> dict:
    """Prepare a raw jsonlines document in place for prediction.

    Tokenizes every cased word into subword tokens (honoring per-tokenizer
    overrides in TOKENIZER_MAPS and filters in TOKENIZER_FILTERS), records
    the word-to-subword span mapping, and fills in the auxiliary fields the
    model expects: a placeholder speaker list when none is given, plus empty
    head2span / cluster containers.  Returns the same dict for convenience.
    """
    bert_name = model.config.bert_model
    keep_subword = TOKENIZER_FILTERS.get(bert_name, lambda _: True)
    overrides = TOKENIZER_MAPS.get(bert_name, {})

    spans = []    # (start, end) subword index pair for each word
    pieces = []   # flat list of subword tokens over the whole document
    owners = []   # owning word index for each subword token
    for idx, word in enumerate(doc["cased_words"]):
        raw = (overrides[word] if word in overrides
               else model.tokenizer.tokenize(word))
        toks = [t for t in raw if keep_subword(t)]
        spans.append((len(pieces), len(pieces) + len(toks)))
        pieces.extend(toks)
        owners.extend([idx] * len(toks))

    doc["word2subword"] = spans
    doc["subwords"] = pieces
    doc["word_id"] = owners

    doc["head2span"] = []
    if "speaker" not in doc:
        # Default placeholder speaker for every word.
        doc["speaker"] = ["_"] * len(doc["cased_words"])
    doc["word_clusters"] = []
    doc["span_clusters"] = []

    return doc
38+
39+
40+
if __name__ == "__main__":
    # Command-line entry point: predict coreference clusters for every
    # jsonlines document in input_file and write the augmented documents
    # to output_file.
    argparser = argparse.ArgumentParser()
    argparser.add_argument("experiment")
    argparser.add_argument("input_file")
    argparser.add_argument("output_file")
    argparser.add_argument("--config-file", default="config.toml")
    argparser.add_argument("--batch-size", type=int,
                           help="Adjust to override the config value if you're"
                                " experiencing out-of-memory issues")
    argparser.add_argument("--weights",
                           help="Path to file with weights to load."
                                " If not supplied, the latest"
                                " weights of the experiment will be loaded;"
                                " if there aren't any, an error is raised.")
    args = argparser.parse_args()

    model = CorefModel(args.config_file, args.experiment)

    # Override the scoring batch size only when the flag was actually
    # supplied ("is not None" rather than truthiness, so an explicit
    # --batch-size 0 is not silently ignored).
    if args.batch_size is not None:
        model.config.a_scoring_batch_size = args.batch_size

    model.load_weights(path=args.weights, map_location="cpu",
                       ignore={"bert_optimizer", "general_optimizer",
                               "bert_scheduler", "general_scheduler"})
    model.training = False  # disable training-mode behavior for inference

    with jsonlines.open(args.input_file, mode="r") as input_data:
        docs = [build_doc(doc, model) for doc in input_data]

    # No gradients are needed for prediction.
    with torch.no_grad():
        for doc in tqdm(docs, unit="docs"):
            result = model.run(doc)
            doc["span_clusters"] = result.span_clusters
            doc["word_clusters"] = result.word_clusters

            # Drop the preprocessing-only fields so the output contains
            # just the original document plus the predicted clusters.
            for key in ("word2subword", "subwords", "word_id", "head2span"):
                del doc[key]

    with jsonlines.open(args.output_file, mode="w") as output_data:
        output_data.write_all(docs)

sample_input.jsonlines

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"document_id": "tc_sample_input_001", "cased_words": ["Hi", ",", "my", "name", "is", "Tom", ".", "I", "am", "five", "."], "sent_id": [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], "speaker": ["Tom", "Tom", "Tom", "Tom", "Tom", "Tom", "Tom", "Tom", "Tom", "Tom", "Tom"]}
2+
{"document_id": "pt_sample_input_001", "cased_words": ["Because", "Joseph", "her", "husband", "was", "faithful", "to", "the", "law,", "and", "yet", "did", "not", "want", "to", "expose", "her", "to", "public", "disgrace,", "he", "had", "in", "mind", "to", "divorce", "her", "quietly", "."], "sent_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

0 commit comments

Comments
 (0)