10 changes: 10 additions & 0 deletions README.md
@@ -1,3 +1,13 @@
# PairFact source code for ACL submission

To run:

    pip3 install -r requirements.txt
    python3 eval.py

The results will be in the `results/` folder.


# An evaluation framework for text-to-text generation tasks

EvalBase allows easy testing of new text-to-text generation metrics. A computer program or a machine learning model that can generate text from text is called a **system**. Such text-to-text generation tasks include __summarization__, __translation__, and even __question answering__. Correspondingly, the systems for such tasks are called **summarizers**, **translators**, and **answerers**.
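
As a rough, illustrative sketch (not part of this repository), a metric pluggable into EvalBase is simply a callable that takes a list of predictions and a list of references and returns a dictionary of per-document score lists, as the `MetricComputeFunc` protocol in `dar_type.py` specifies. The function name and scoring rule below are hypothetical:

```python
import typing

# Hypothetical toy metric following the MetricComputeFunc protocol:
# lists of candidate and reference texts in, per-document score lists out.
def length_ratio_metric(predictions: typing.List[str], references: typing.List[str]) -> typing.Dict[str, typing.List[float]]:
    scores = [min(len(p), len(r)) / max(len(r), 1) for p, r in zip(predictions, references)]
    return {"P": scores, "R": scores, "F": scores}
```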
83 changes: 83 additions & 0 deletions bertscore_sentence/eval.py
@@ -0,0 +1,83 @@
import numpy as np
import torch
from tqdm import tqdm
import functools
import dar_type
from text_preprocess import list_segmentation
import warnings



def cos_sim_mat_f(cand_segments: dar_type.TextSegments, ref_segments: dar_type.TextSegments, embedder: dar_type.Embedder) -> np.ndarray:
    def bert_encode(piece_segments: dar_type.TextSegments):
        # Encode each sentence separately and stack into a (n_sentences, emb_dim) array.
        sent_emb_list = list()
        for sent in piece_segments:
            with torch.no_grad():
                sent_emb_list.append(embedder.encode(sent, convert_to_numpy=True))
        return np.stack(sent_emb_list, axis=0)

    # def bert_encode_multiprocess(piece_segments: dar_type.TextSegments):
    #     pool = embedder.start_multi_process_pool()
    #     sent_emb = embedder.encode_multi_process(sentences=piece_segments, pool=pool, batch_size=8)
    #     embedder.stop_multi_process_pool(pool)
    #     return sent_emb

    ref_sent_emb = bert_encode(ref_segments)
    cand_sent_emb = bert_encode(cand_segments)
    # Cosine similarity: inner products divided by the outer product of the embedding norms.
    numerators = np.inner(ref_sent_emb, cand_sent_emb)
    ref_sent_emb_norms = np.linalg.norm(ref_sent_emb, axis=1)
    cand_sent_emb_norms = np.linalg.norm(cand_sent_emb, axis=1)
    denominators = np.outer(ref_sent_emb_norms, cand_sent_emb_norms)
    sim_mat = np.divide(numerators, denominators)
    return sim_mat


def score_np(predictions: dar_type.TextList, references: dar_type.TextList, sim_mat_f: dar_type.SimilarityMatrixFunc) -> np.ndarray:
    cands, refs = list_segmentation(predictions), list_segmentation(references)
    all_scores = np.empty((len(cands), 3))

    # for index in tqdm(range(len(cands)), desc="bertscore-sentence {}".format(sim_mat_f.__name__), leave=False):  # all pieces, len(cands) == len(refs)
    for index in range(len(cands)):
        sim_mat = sim_mat_f(cand_segments=cands[index], ref_segments=refs[index])

        def sum_max(is_r: bool) -> float:
            # Rows of sim_mat correspond to reference sentences, columns to candidate sentences.
            if is_r:
                return np.sum(np.max(sim_mat, axis=1))
            else:
                return np.sum(np.max(sim_mat, axis=0))  # equals np.sum(np.max(sim_mat.T, axis=1))

        has_empty = False
        if len(refs[index]) == 0:
            warnings.warn("empty ref str", dar_type.DocWarning)
            has_empty = True
        if len(cands[index]) == 0:
            warnings.warn("empty cand str", dar_type.DocWarning)
            has_empty = True
        if has_empty:
            warnings.warn("detail: [ref] {}; [cand] {}".format(refs[index], cands[index]), dar_type.DocWarning)

        if not has_empty:
            R = (1 / len(refs[index])) * sum_max(True)
            P = (1 / len(cands[index])) * sum_max(False)
            F = 2 * ((P * R) / (P + R))  # harmonic mean of P and R
            all_scores[index, :] = np.array([P, R, F])
        else:
            all_scores[index, :] = np.zeros((3,))

    return all_scores


def compute(predictions: dar_type.TextList, references: dar_type.TextList, sim_mat_f: dar_type.SimilarityMatrixFunc) -> dar_type.MetricScoreDict:
    cands, refs = predictions, references  # simple renaming
    score_arr = score_np(predictions=cands, references=refs, sim_mat_f=sim_mat_f)
    return {
        "P": score_arr[:, 0].tolist(),
        "R": score_arr[:, 1].tolist(),
        "F": score_arr[:, 2].tolist()
    }


def compute_cos(predictions: dar_type.TextList, references: dar_type.TextList, embedder: dar_type.Embedder) -> dar_type.MetricScoreDict:
    cos_sim_mat_f_with_embedder: dar_type.SimilarityMatrixFunc = functools.partial(cos_sim_mat_f, embedder=embedder)
    cos_sim_mat_f_with_embedder.__name__ = " ".join(["cos", embedder.__name__])
    return compute(predictions=predictions, references=references, sim_mat_f=cos_sim_mat_f_with_embedder)
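
For context only (not part of this diff): a minimal usage sketch of `compute_cos`, mirroring how `env.py` wires it up with a SentenceTransformer embedder. The example texts are made up.

```python
# Sketch of invoking compute_cos (cf. env.py); inputs are invented.
import sentence_transformers
import dar_type
import bertscore_sentence.eval as bertscore_sentence

embedder: dar_type.Embedder = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
embedder.__name__ = "all-mpnet-base-v2"

scores = bertscore_sentence.compute_cos(
    predictions=["The cat sat on the mat. It purred."],
    references=["A cat was sitting on a mat and purring."],
    embedder=embedder,
)
print(scores["P"], scores["R"], scores["F"])  # per-document precision/recall/F1 lists
```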
60 changes: 60 additions & 0 deletions dar_type.py
@@ -0,0 +1,60 @@
import typing
import numpy as np
import sentence_transformers
import transformers
from datasets.arrow_dataset import Dataset


MetricScoreList = typing.List[float]
MetricScoreDict = typing.Dict[str, MetricScoreList]
TextList = typing.List[str]
TextListSegments = typing.List[typing.List[str]]
TextSegments = typing.List[str]


class Embedder(sentence_transformers.SentenceTransformer):
    __name__: str


class SimilarityMatrixFunc(typing.Protocol):
    __name__: str

    def __call__(self, cand_segments: TextSegments, ref_segments: TextSegments) -> np.ndarray:
        ...


Pipeline = transformers.Pipeline


class PipelinesList():
    __name__: str
    pipelines: typing.List[Pipeline]


PipelinesDict = typing.Dict[str, PipelinesList]


class MetricComputeFunc(typing.Protocol):
    def __call__(self, predictions: TextList, references: TextList) -> MetricScoreDict:
        ...


class DocWarning(Warning):
    """
    Warning raised when the document is malformed,
    e.g., an empty document, which is meaningless to metrics.
    """


class MNLICategory(typing.TypedDict):
    label: str
    score: float


MNLICategories = typing.List[MNLICategory]


MNLISimilarityExpression = typing.Callable[[MNLICategories], float]


SummaryLengthExpression = typing.Callable[[typing.Optional[Dataset]], typing.Optional[int]]
69 changes: 69 additions & 0 deletions env.py
@@ -0,0 +1,69 @@
# Group 2: bertscore-sentence (cos, mnli)

# from env_root import *

### HARDWARE ###

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# fix: GPU OOM (TF exhausts GPU memory, crashing PyTorch)
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

### DATASETS ###

import sys
# sys.path.append("/home/turx/EvalBase")
# sys.path.append("../evalbase")
import evalbase
for ds_name in evalbase.datasets:
    evalbase.datasets[ds_name]["approaches"] = ["new"]
# from evalbase import newsroom, realsumm, summeval

### GLOBAL VARS ###

import torch

path = os.path.dirname(os.path.abspath(__file__))
n_gpu = torch.cuda.device_count()

### LIBRARY VARS ###

import datasets
datasets.disable_progress_bar()

### METRICS ###

corr_metrics = ["pearsonr", "kendalltau", "spearmanr"]


### MODELS ###

import dar_type
import sentence_transformers

from mnli.classifiers import mnli_classifiers
sent_embedder_mpnet: dar_type.Embedder = sentence_transformers.SentenceTransformer("all-mpnet-base-v2")
sent_embedder_mpnet.__name__ = "all-mpnet-base-v2"
sent_embedder_roberta: dar_type.Embedder = sentence_transformers.SentenceTransformer("all-roberta-large-v1")
sent_embedder_roberta.__name__ = "all-roberta-large-v1"

### METRICS ###

import functools
import bertscore_sentence.eval as bertscore_sentence
import mnli.eval
import mnli.sim_expr

metrics = {
    "bertscore-sentence-cos-mpnet": functools.partial(bertscore_sentence.compute_cos, embedder=sent_embedder_mpnet),
    "bertscore-sentence-cos-roberta": functools.partial(bertscore_sentence.compute_cos, embedder=sent_embedder_roberta),
}

for mnli_name in ["roberta", "bart", "deberta"]:
    for mnli_expr in [mnli.sim_expr.not_neutral, mnli.sim_expr.entail_only, mnli.sim_expr.entail_contradict]:
        metrics["bertscore-sentence-mnli-{}-{}".format(mnli_name, mnli_expr.__name__)] = functools.partial(mnli.eval.bertscore_sentence_compute, classifiers=mnli_classifiers[mnli_name], expr=mnli_expr)
28 changes: 28 additions & 0 deletions eval.py
@@ -0,0 +1,28 @@
import os
import warnings
import pandas
import sys
import datetime

path = os.path.dirname(os.path.abspath(__file__))
result_path = os.path.join(path, "results")


def create_result_path():
    if not os.path.exists(result_path):
        os.makedirs(result_path)

import factcc

def eval():
    create_result_path()
    print("==== env.factcc.qags_main(), size: 235")
    factcc.qags_main()
    print("==== env.factcc.frank_main(): size: 1250")
    factcc.frank_main()
    print("==== env.factcc.factCC_main(): size: large")
    factcc.factCC_main()


if __name__ == '__main__':
    eval()
7 changes: 7 additions & 0 deletions eval_utils.py
@@ -242,6 +242,13 @@ def write_results(
    if detail_df is None:
        detail_df = simple_df

    # Create the parent directories of the result files if they do not exist yet.
    parent_dir = os.path.dirname(simple_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)
    parent_dir = os.path.dirname(detail_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    with pandas.option_context('display.max_rows', None,
                               'display.max_columns', None,
                               'display.precision', 3,
1 change: 1 addition & 0 deletions evalbase.py
@@ -2,6 +2,7 @@
import newsroom
import realsumm
import summeval
import factcc


path = os.path.dirname(os.path.abspath(__file__))