10 changes: 10 additions & 0 deletions README.md
@@ -1,3 +1,13 @@
# PairFact source code for ACL submission

To run:

    pip3 install -r requirements.txt
    python3 eval.py

The results will be in the `results/` folder.


# An evaluation framework for text-to-text generation tasks

EvalBase allows easy testing of new text-to-text generation metrics. A computer program or a machine learning model that can generate text from text is called a **system**. Such text-to-text generation tasks include __summarization__, __translation__, and even __question answering__. Correspondingly, the systems for such tasks are called **summarizers**, **translators**, and **answerers**.
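
As a rough, illustrative sketch (not part of this repository), a metric pluggable into EvalBase is simply a callable that takes a list of predictions and a list of references and returns a dictionary of per-document score lists, as the `MetricComputeFunc` protocol in `dar_type.py` specifies. The function name and scoring rule below are hypothetical:

```python
import typing

# Hypothetical toy metric following the MetricComputeFunc protocol:
# lists of candidate and reference texts in, per-document score lists out.
def length_ratio_metric(predictions: typing.List[str], references: typing.List[str]) -> typing.Dict[str, typing.List[float]]:
    scores = [min(len(p), len(r)) / max(len(r), 1) for p, r in zip(predictions, references)]
    return {"P": scores, "R": scores, "F": scores}
```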
83 changes: 83 additions & 0 deletions bertscore_sentence/eval.py
@@ -0,0 +1,83 @@
import numpy as np
import torch
from tqdm import tqdm
import functools
import dar_type
from text_preprocess import list_segmentation
import warnings



def cos_sim_mat_f(cand_segments: dar_type.TextSegments, ref_segments: dar_type.TextSegments, embedder: dar_type.Embedder) -> np.ndarray:
    def bert_encode(piece_segments: dar_type.TextSegments):
        # Encode each sentence separately and stack into a (n_sentences, emb_dim) array.
        sent_emb_list = list()
        for sent in piece_segments:
            with torch.no_grad():
                sent_emb_list.append(embedder.encode(sent, convert_to_numpy=True))
        return np.stack(sent_emb_list, axis=0)

    # def bert_encode_multiprocess(piece_segments: dar_type.TextSegments):
    #     pool = embedder.start_multi_process_pool()
    #     sent_emb = embedder.encode_multi_process(sentences=piece_segments, pool=pool, batch_size=8)
    #     embedder.stop_multi_process_pool(pool)
    #     return sent_emb

    ref_sent_emb = bert_encode(ref_segments)
    cand_sent_emb = bert_encode(cand_segments)
    # Cosine similarity: inner products divided by the outer product of the embedding norms.
    numerators = np.inner(ref_sent_emb, cand_sent_emb)
    ref_sent_emb_norms = np.linalg.norm(ref_sent_emb, axis=1)
    cand_sent_emb_norms = np.linalg.norm(cand_sent_emb, axis=1)
    denominators = np.outer(ref_sent_emb_norms, cand_sent_emb_norms)
    sim_mat = np.divide(numerators, denominators)
    return sim_mat


def score_np(predictions: dar_type.TextList, references: dar_type.TextList, sim_mat_f: dar_type.SimilarityMatrixFunc) -> np.ndarray:
    cands, refs = list_segmentation(predictions), list_segmentation(references)
    all_scores = np.empty((len(cands), 3))

    # for index in tqdm(range(len(cands)), desc="bertscore-sentence {}".format(sim_mat_f.__name__), leave=False):  # all pieces, len(cands) == len(refs)
    for index in range(len(cands)):
        sim_mat = sim_mat_f(cand_segments=cands[index], ref_segments=refs[index])

        def sum_max(is_r: bool) -> float:
            # Rows of sim_mat correspond to reference sentences, columns to candidate sentences.
            if is_r:
                return np.sum(np.max(sim_mat, axis=1))
            else:
                return np.sum(np.max(sim_mat, axis=0))  # equals np.sum(np.max(sim_mat.T, axis=1))

        has_empty = False
        if len(refs[index]) == 0:
            warnings.warn("empty ref str", dar_type.DocWarning)
            has_empty = True
        if len(cands[index]) == 0:
            warnings.warn("empty cand str", dar_type.DocWarning)
            has_empty = True
        if has_empty:
            warnings.warn("detail: [ref] {}; [cand] {}".format(refs[index], cands[index]), dar_type.DocWarning)

        if not has_empty:
            R = (1 / len(refs[index])) * sum_max(True)
            P = (1 / len(cands[index])) * sum_max(False)
            F = 2 * ((P * R) / (P + R))  # harmonic mean of P and R
            all_scores[index, :] = np.array([P, R, F])
        else:
            all_scores[index, :] = np.zeros((3,))

    return all_scores


def compute(predictions: dar_type.TextList, references: dar_type.TextList, sim_mat_f: dar_type.SimilarityMatrixFunc) -> dar_type.MetricScoreDict:
    cands, refs = predictions, references  # simple renaming
    score_arr = score_np(predictions=cands, references=refs, sim_mat_f=sim_mat_f)
    return {
        "P": score_arr[:, 0].tolist(),
        "R": score_arr[:, 1].tolist(),
        "F": score_arr[:, 2].tolist()
    }


def compute_cos(predictions: dar_type.TextList, references: dar_type.TextList, embedder: dar_type.Embedder) -> dar_type.MetricScoreDict:
    cos_sim_mat_f_with_embedder: dar_type.SimilarityMatrixFunc = functools.partial(cos_sim_mat_f, embedder=embedder)
    cos_sim_mat_f_with_embedder.__name__ = " ".join(["cos", embedder.__name__])
    return compute(predictions=predictions, references=references, sim_mat_f=cos_sim_mat_f_with_embedder)
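
For context only (not part of this diff): a minimal usage sketch of `compute_cos`, mirroring how `env.py` wires it up with a SentenceTransformer embedder. The example texts are made up.

```python
# Sketch of invoking compute_cos (cf. env.py); inputs are invented.
import sentence_transformers
import dar_type
import bertscore_sentence.eval as bertscore_sentence

embedder: dar_type.Embedder = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
embedder.__name__ = "all-mpnet-base-v2"

scores = bertscore_sentence.compute_cos(
    predictions=["The cat sat on the mat. It purred."],
    references=["A cat was sitting on a mat and purring."],
    embedder=embedder,
)
print(scores["P"], scores["R"], scores["F"])  # per-document precision/recall/F1 lists
```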
60 changes: 60 additions & 0 deletions dar_type.py
@@ -0,0 +1,60 @@
import typing
import numpy as np
import sentence_transformers
import transformers
from datasets.arrow_dataset import Dataset


MetricScoreList = typing.List[float]
MetricScoreDict = typing.Dict[str, MetricScoreList]
TextList = typing.List[str]
TextListSegments = typing.List[typing.List[str]]
TextSegments = typing.List[str]


class Embedder(sentence_transformers.SentenceTransformer):
    __name__: str


class SimilarityMatrixFunc(typing.Protocol):
    __name__: str

    def __call__(self, cand_segments: TextSegments, ref_segments: TextSegments) -> np.ndarray:
        ...


Pipeline = transformers.Pipeline


class PipelinesList():
    __name__: str
    pipelines: typing.List[Pipeline]


PipelinesDict = typing.Dict[str, PipelinesList]


class MetricComputeFunc(typing.Protocol):
    def __call__(self, predictions: TextList, references: TextList) -> MetricScoreDict:
        ...


class DocWarning(Warning):
    """
    Warning raised when the document is malformed,
    e.g., an empty document, which is meaningless to metrics.
    """


class MNLICategory(typing.TypedDict):
    label: str
    score: float


MNLICategories = typing.List[MNLICategory]


MNLISimilarityExpression = typing.Callable[[MNLICategories], float]


SummaryLengthExpression = typing.Callable[[typing.Optional[Dataset]], typing.Optional[int]]
69 changes: 69 additions & 0 deletions env.py
@@ -0,0 +1,69 @@
# Group 2: bertscore-sentence (cos, mnli)

# from env_root import *

### HARDWARE ###

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# fix: GPU OOM (TF exhausts GPU memory, crashing PyTorch)
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

### DATASETS ###

import sys
# sys.path.append("/home/turx/EvalBase")
# sys.path.append("../evalbase")
import evalbase
for ds_name in evalbase.datasets:
    evalbase.datasets[ds_name]["approaches"] = ["new"]
# from evalbase import newsroom, realsumm, summeval

### GLOBAL VARS ###

import torch

path = os.path.dirname(os.path.abspath(__file__))
n_gpu = torch.cuda.device_count()

### LIBRARY VARS ###

import datasets
datasets.disable_progress_bar()

### METRICS ###

corr_metrics = ["pearsonr", "kendalltau", "spearmanr"]


### MODELS ###

import dar_type
import sentence_transformers

from mnli.classifiers import mnli_classifiers
sent_embedder_mpnet: dar_type.Embedder = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
sent_embedder_mpnet.__name__ = "all-mpnet-base-v2"
sent_embedder_roberta: dar_type.Embedder = sentence_transformers.SentenceTransformer("all-roberta-large-v1")
sent_embedder_roberta.__name__ = "all-roberta-large-v1"

### METRICS ###

import functools
import bertscore_sentence.eval as bertscore_sentence
import mnli.eval
import mnli.sim_expr

metrics = {
    "bertscore-sentence-cos-mpnet": functools.partial(bertscore_sentence.compute_cos, embedder=sent_embedder_mpnet),
    "bertscore-sentence-cos-roberta": functools.partial(bertscore_sentence.compute_cos, embedder=sent_embedder_roberta),
}

for mnli_name in ["roberta", "bart", "deberta"]:
    for mnli_expr in [mnli.sim_expr.not_neutral, mnli.sim_expr.entail_only, mnli.sim_expr.entail_contradict]:
        metrics["bertscore-sentence-mnli-{}-{}".format(mnli_name, mnli_expr.__name__)] = functools.partial(mnli.eval.bertscore_sentence_compute, classifiers=mnli_classifiers[mnli_name], expr=mnli_expr)
28 changes: 28 additions & 0 deletions eval.py
@@ -0,0 +1,28 @@
import os
import warnings
import pandas
import sys
import datetime

path = os.path.dirname(os.path.abspath(__file__))
result_path = os.path.join(path, "results")


def create_result_path():
    if not os.path.exists(result_path):
        os.makedirs(result_path)

import factcc

def eval():
    create_result_path()
    print("==== env.factcc.qags_main(), size: 235")
    factcc.qags_main()
    print("==== env.factcc.frank_main(): size: 1250")
    factcc.frank_main()
    print("==== env.factcc.factCC_main(): size: large")
    factcc.factCC_main()


if __name__ == '__main__':
    eval()
7 changes: 7 additions & 0 deletions eval_utils.py
@@ -242,6 +242,13 @@ def write_results(
    if detail_df is None:
        detail_df = simple_df

    # Create the parent directories of the result files if they do not exist yet.
    parent_dir = os.path.dirname(simple_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)
    parent_dir = os.path.dirname(detail_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    with pandas.option_context('display.max_rows', None,
                               'display.max_columns', None,
                               'display.precision', 3,
1 change: 1 addition & 0 deletions evalbase.py
@@ -2,6 +2,7 @@
import newsroom
import realsumm
import summeval
import factcc


path = os.path.dirname(os.path.abspath(__file__))