DachengLi1 · Ying1123 · Jun 26, 2023 · Jun 26, 2023 · Jun 27, 2023 · Jun 27, 2023
diff --git a/evaluation/utils.py b/evaluation/utils.py
@@ -0,0 +1,263 @@
+import json
+import numpy as np
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from matplotlib import pyplot as plt
+
+import openai
+import tiktoken
+import time
+import os
+import argparse
+import yaml
+
+def load_model(path, dtype=torch.bfloat16, device="cuda", num_gpus=1):
+    """Load a causal language model and its (slow) tokenizer from ``path``.
+
+    Args:
+        path: Model name or directory handed to ``from_pretrained``.
+        dtype: Torch dtype used when ``device`` is ``"cuda"``. CPU loading
+            always uses ``torch.float32``.
+        device: Either ``"cpu"`` or ``"cuda"``.
+        num_gpus: Number of GPUs to shard over. Any value other than 1
+            enables ``device_map="auto"`` with a per-GPU memory cap.
+
+    Returns:
+        A ``(model, tokenizer)`` tuple.
+
+    Raises:
+        ValueError: If ``device`` is neither ``"cpu"`` nor ``"cuda"``.
+    """
+    if device == "cpu":
+        kwargs = {"torch_dtype": torch.float32}
+    elif device == "cuda":
+        # Honor the caller's dtype; it used to be silently replaced by bfloat16.
+        kwargs = {"torch_dtype": dtype}
+        if num_gpus != 1:
+            # NOTE: device_map="sequential" is the alternative to consider
+            # when the GPUs do not all have the same amount of VRAM.
+            kwargs["device_map"] = "auto"
+            # Hard-coded for 40 GiB A100s; only 85% of each card is offered
+            # to the loader.
+            available_gpu_memory = [40] * num_gpus
+            kwargs["max_memory"] = {
+                i: str(int(available_gpu_memory[i] * 0.85)) + "GiB"
+                for i in range(num_gpus)
+            }
+    else:
+        # Previously this fell through to a NameError on ``kwargs``.
+        raise ValueError(
+            f"Unsupported device {device!r}; expected 'cpu' or 'cuda'."
+        )
+    model = AutoModelForCausalLM.from_pretrained(path, **kwargs)
+    if device == "cuda" and num_gpus == 1:
+        # Without a device_map the weights are loaded on the CPU, so the
+        # single-GPU case has to be moved to the GPU explicitly.
+        model.to(device)
+    tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False)
+    return model, tokenizer
+
+def load_testcases(test_file):
+    """Read test cases from a JSON Lines file.
+
+    Each non-blank line of ``test_file`` is parsed as one JSON document.
+    Blank lines (for example a trailing newline at the end of the file) are
+    skipped instead of raising ``json.JSONDecodeError``.
+
+    Args:
+        test_file: Path to the ``.jsonl`` file.
+
+    Returns:
+        A list with one parsed object per non-blank line, in file order.
+    """
+    with open(test_file, "r", encoding="utf-8") as json_file:
+        return [json.loads(line) for line in json_file if line.strip()]
+
+def test(test_case, model, tokenizer, return_summary=True):
+    """Generate the model's answer for one test case.
+
+    Args:
+        test_case: Mapping with the keys ``"prompt"``, ``"prompt_length"`` and
+            ``"topics"``.
+        model: Object exposing ``generate`` and ``device``.
+        tokenizer: Callable tokenizer that also exposes ``batch_decode``.
+        return_summary: When true, also return a one-line summary string.
+
+    Returns:
+        ``outputs`` (the list returned by ``batch_decode`` for the generated
+        continuation), or ``(outputs, summary)`` when ``return_summary`` is
+        true.
+    """
+    prompt = test_case["prompt"]
+    prompt_length = test_case["prompt_length"]
+    topics = test_case["topics"]
+    encoded = tokenizer(prompt, return_tensors="pt")
+    input_ids = encoded.input_ids.to(model.device)
+    generated = model.generate(input_ids, max_new_tokens=100, use_cache=True)[0]
+    # Strip the prompt using the number of tokens actually fed to the model.
+    # The recorded ``prompt_length`` may come from a different tokenizer, and
+    # slicing by it could cut into the answer or leave prompt tokens behind.
+    generated = generated[input_ids.shape[-1]:]
+    outputs = tokenizer.batch_decode([generated], skip_special_tokens=True)
+    if not return_summary:
+        return outputs
+    # The summary keeps reporting the length recorded in the test case.
+    summary = f"Label: {topics[0]}, Predict: {outputs}, --- INFO --- Topics: {topics}, Length: {prompt_length}"
+    return outputs, summary
+
+def attention_span(model, tokenizer, test_case, num_gen_steps=1, raw_attn=False):
+    """Measure how far back the last prompt token attends.
+
+    For every layer and head, the attention row of the last query position is
+    weighted by a distance that counts down from the number of key positions
+    (first key) to 1 (last key) and summed.
+
+    Args:
+        model: Model whose forward pass accepts ``output_attentions=True``.
+        tokenizer: Callable tokenizer returning an object with ``input_ids``.
+        test_case: Mapping holding the text under the key ``"prompt"``.
+        num_gen_steps: Must be 1; other values are not supported yet.
+        raw_attn: When true, the stacked attention rows are returned as well.
+
+    Returns:
+        ``(span, span_avg_layer, span_avg_all)`` and, when ``raw_attn`` is
+        true, ``attn_mat`` as a fourth element. ``span`` has one row per
+        layer; ``span_avg_layer`` averages each row and ``span_avg_all``
+        averages everything.
+    """
+    assert num_gen_steps == 1, "Only support span for a single generation step now."
+    encoded = tokenizer(test_case["prompt"], return_tensors="pt")
+
+    # Inference only: skip autograd bookkeeping for the attention maps.
+    with torch.no_grad():
+        output = model(encoded.input_ids.to(model.device), output_attentions=True)
+    num_layer = len(output.attentions)
+
+    # With a sharded model the per-layer attentions can live on different
+    # devices, so gather them on the device of the first layer before stacking.
+    # Assumes each attention tensor is (batch, heads, query, key) - TODO confirm.
+    target_device = output.attentions[0].device
+    attn_mat = torch.cat(
+        [a[0, :, -1, :].to(target_device) for a in output.attentions]
+    )
+    attn_len = attn_mat.shape[-1]
+    # Build the distances on the same device as the attentions instead of a
+    # hard-coded .cuda(), so CPU runs work too.
+    dist = torch.arange(attn_len, 0, step=-1, device=attn_mat.device)
+
+    span = torch.sum(dist * attn_mat, dim=1)
+    span = span.reshape(num_layer, -1)
+
+    span_avg_layer = torch.mean(span, dim=1)
+    span_avg_all = torch.mean(span)
+
+    if raw_attn:
+        return span, span_avg_layer, span_avg_all, attn_mat
+    return span, span_avg_layer, span_avg_all
+
+def visualize_attn(attn_mat, save_path):
+    """Placeholder for attention visualisation; not implemented yet.
+
+    Both arguments are currently ignored and nothing is written.
+    """
+    return None
+
+
+# Some of the code below is adapted from Auto-GPT.
+def let_gpt_check_response(topics, response, model_name):
+    """Ask an OpenAI chat model to score how similar ``response`` is to ``topics``.
+
+    Args:
+        topics: The expected topics; interpolated into the prompt as-is.
+        response: The answer to be judged; interpolated into the prompt as-is.
+        model_name: Model name forwarded to ``retrieve_from_openai``.
+
+    Returns:
+        The first run of digits found in the model's reply, as a string.
+
+    Raises:
+        ValueError: If the reply contains no digits (for example when the
+            request failed and only an error text came back).
+    """
+    import re
+
+    prompt = "Compare the topics in two lists and determine the similarity of the topics on a " + \
+        "scale of 1 to 100, where 1 indicates very low similarity and 100 indicates " + \
+        "very high similarity. The similarity score will be proportional to the " + \
+        "number of different topics in the lists.\n\n"
+    prompt += f"List 1: {topics} \n"
+    prompt += f"List 2: {response} \n"
+    prompt += "Question: What is the similarity score between the topics of List 1 and List 2? " + \
+        "The score should be proportional to the number of different topics in the lists. \n"
+    prompt += "Answer:"
+
+    # retrieve_from_openai returns (token_size, text) on success but a longer
+    # tuple ending in an error text on failure; the text is always last, so
+    # index from the end rather than unpacking a fixed number of values.
+    response_line = retrieve_from_openai(prompt, model_name)[-1]
+
+    match = re.search(r"\d+", response_line)
+    if match is None:
+        raise ValueError(
+            f"No similarity score found in model reply: {response_line!r}"
+        )
+    return match.group()
+
+
+# def ask_gpt_for_similarity_score(topic, response, model_name):
+
+def token_counter(model_name, prompt):
+    """Count the tokens of ``prompt`` with a tiktoken encoding and print it.
+
+    Args:
+        model_name: Model name. When it contains ``"gpt"`` the encoding
+            registered for that model is used; otherwise a GPT encoding is
+            used as a stand-in.
+        prompt: Text to tokenize.
+
+    Returns:
+        The number of tokens in ``prompt``.
+    """
+    if "gpt" in model_name:
+        encoding = tiktoken.encoding_for_model(model_name)
+        note = ""
+    else:
+        # encoding_for_model() raises KeyError for model names tiktoken does
+        # not know, so the "default" path must request an encoding by name.
+        encoding = tiktoken.get_encoding("cl100k_base")
+        note = " by using gpt tokenizer as default"
+
+    token_size = len(encoding.encode(prompt))
+    print(f"Number of tokens: {token_size}{note}")
+
+    return token_size
+
+
+def retrieve_from_openai(prompt, model_name, num_retries=10):
+    """Send ``prompt`` to the OpenAI chat completion API with retries.
+
+    The API key is read from the ``OPENAI_API_KEY`` environment variable.
+    Rate-limit and API errors are retried with exponential backoff
+    (1 s, 2 s, 4 s, ...).
+
+    Args:
+        prompt: User message to send.
+        model_name: Name of the chat model.
+        num_retries: Maximum number of attempts.
+
+    Returns:
+        ``(token_size, response_line)`` on success. If every attempt was
+        rate limited, the legacy failure value
+        ``(token_size, -1, "Rate limit")`` is returned instead.
+
+    Raises:
+        KeyError: If ``OPENAI_API_KEY`` is not set.
+        openai.error.APIError: If the last attempt fails with an API error.
+    """
+    uses_gpt_name = "gpt" in model_name
+    token_size = token_counter(model_name, prompt)
+    if uses_gpt_name:
+        print(f"Number of tokens: {token_size}")
+    else:
+        print(f"Number of tokens: {token_size} by using gpt tokenizer as default")
+
+    openai.api_key = os.environ["OPENAI_API_KEY"]
+    if not uses_gpt_name:
+        print("Using openai key as default key")
+
+    # The ``num_retries`` argument is honored here; it used to be overwritten
+    # with a hard-coded 10.
+    completion = None
+    for attempt in range(num_retries):
+        is_last_attempt = attempt == num_retries - 1
+        try:
+            completion = openai.ChatCompletion.create(
+                model=model_name,
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": f"{prompt}"},
+                ],
+                temperature=0,
+            )
+            break
+        except openai.error.RateLimitError:
+            print("Got rate limit...")
+        except openai.error.APIError:
+            # Every API error (502 or otherwise) is retried; only the final
+            # attempt propagates it to the caller.
+            if is_last_attempt:
+                raise
+
+        # Back off before the next attempt, but do not sleep (up to several
+        # minutes) after the final one when nothing is left to retry.
+        if not is_last_attempt:
+            time.sleep(2 ** attempt)
+
+    if completion is None:
+        print(f"Failed to get response after {num_retries} retries")
+        # NOTE(review): this 3-tuple differs from the 2-tuple returned on
+        # success; kept unchanged for backward compatibility.
+        return token_size, -1, "Rate limit"
+
+    response_line = completion.choices[0].message["content"]
+
+    return token_size, response_line
+
+def retrieve_cmd_args():
+    """Parse the command line and load the YAML config it points to.
+
+    The program takes a single positional argument, ``yaml_path``. The parsed
+    configuration is echoed to stdout before being returned.
+
+    Returns:
+        The object produced by loading the YAML file.
+    """
+    parser = argparse.ArgumentParser(
+        prog='lrt_eval',
+        description='lrt_eval'
+    )
+    parser.add_argument('yaml_path')
+    args = parser.parse_args()
+
+    # CLoader only exists when PyYAML was built with libyaml; fall back to the
+    # pure-Python loader instead of failing with AttributeError.
+    loader = getattr(yaml, "CLoader", yaml.Loader)
+    # NOTE(security): this loader can construct arbitrary Python objects.
+    # Only use it on trusted config files; consider yaml.safe_load otherwise.
+    with open(args.yaml_path, "r") as f:  # closes the handle (was leaked)
+        cfgs = yaml.load(f, Loader=loader)
+    print(yaml.dump(cfgs))
+
+    return cfgs
+
+class Conv:
+    """One recorded conversation about a single topic.
+
+    Plain value holder exposing ``topic``, ``length`` and ``content`` exactly
+    as they were passed in.
+    """
+
+    def __init__(self, topic, length, content):
+        self.topic, self.length, self.content = topic, length, content
+
+class Prompt:
+    """The prompt used for testing, composed of multiple conversations.
+
+    Conversations are appended with ``add_conv``; ``assemble_prompt`` then
+    builds the full prompt text and the list of topics being asked about.
+    """
+
+    # Ordinal words used to refer to a conversation by position (index 0 is
+    # "first"). At most len(_ORDER_WORDS) positions can be addressed.
+    _ORDER_WORDS = (
+        "first", "second", "third", "fourth", "fifth", "sixth", "seventh",
+        "eighth", "ninth", "tenth", "eleventh", "twelfth", "thirteenth",
+        "fourteenth", "fifteenth", "sixteenth", "seventeenth", "eighteenth",
+        "nineteenth", "twentieth", "twenty-first", "twenty-second",
+        "twenty-third", "twenty-fourth", "twenty-fifth", "twenty-sixth",
+        "twenty-seventh", "twenty-eighth", "twenty-ninth", "thirtieth",
+    )
+
+    def __init__(self, model_name, id, question_dist):
+        """Create an empty prompt.
+
+        Args:
+            model_name: Model name passed to ``token_counter``.
+            id: Identifier stored on the instance.
+            question_dist: Step between the positions of the conversations
+                asked about (positions 0, question_dist, 2 * question_dist, ...).
+        """
+        self.model_name = model_name
+        self.id = id
+        self.conv_list = []
+        self.topic_list = []
+        self.length_list = []
+        self.length = -1  # set by assemble_prompt()
+        self.prompt = None  # set by assemble_prompt()
+        self.question_dist = question_dist
+
+    def add_conv(self, conv):
+        """Append ``conv`` and record its ``topic`` and ``length``."""
+        self.conv_list.append(conv)
+        self.topic_list.append(conv.topic)
+        self.length_list.append(conv.length)
+
+    def assemble_prompt(self):
+        """Build the prompt text and pick the topics to ask about.
+
+        Stores the text in ``self.prompt`` and its token count (from
+        ``token_counter``) in ``self.length``.
+
+        Returns:
+            ``(prompt, picked_topics)``.
+
+        Raises:
+            ValueError: If ``question_dist`` is smaller than 1 (this used to
+                loop forever for 0).
+            IndexError: If no conversation was added, or a picked position
+                has no ordinal word.
+        """
+        if self.question_dist < 1:
+            raise ValueError(
+                f"question_dist must be >= 1, got {self.question_dist!r}"
+            )
+        if not self.conv_list:
+            raise IndexError("cannot assemble a prompt without conversations")
+
+        # Positions asked about: 0, question_dist, 2 * question_dist, ...
+        picked_positions = range(0, len(self.conv_list), self.question_dist)
+        if picked_positions[-1] >= len(self._ORDER_WORDS):
+            raise IndexError(
+                f"position {picked_positions[-1] + 1} has no ordinal word; "
+                f"at most {len(self._ORDER_WORDS)} positions are supported"
+            )
+
+        record_prompt = "Below is a record of our previous conversation " + \
+            f"on {len(self.topic_list)} different topics. You are the ASSISTANT, and " + \
+            "I am the USER. At the beginning of each topic, the USER will say " + \
+            "'I would like to discuss the topic of <TOPIC>'. Memorize each " + \
+            "<TOPIC>. At the end of the record, I will ask you to retrieve the " + \
+            "first topic. Now the record start. "
+        record_prompt += "".join(conv.content for conv in self.conv_list)
+
+        question_idx = ", ".join(self._ORDER_WORDS[pos] for pos in picked_positions)
+        picked_topics = [self.topic_list[pos] for pos in picked_positions]
+
+        self.prompt = "A chat between a curious user and an artificial intelligence " + \
+            "assistant. The assistant gives helpful, detailed, and polite " + \
+            f"answers to the user's questions. USER: {record_prompt} Now " + \
+            "the record ends. What is the " + question_idx + " topic(s) we discussed? Only give " + \
+            "me the topic name(s) in the format of [<topic>, <topic>, ...]. Do not summarize yourself. Do not mention topic order. ASSISTANT:"
+
+        self.length = token_counter(self.model_name, self.prompt)
+
+        return self.prompt, picked_topics
+
+
diff --git a/evaluation/zeroscrolls/eval.py b/evaluation/zeroscrolls/eval.py
@@ -0,0 +1,107 @@
+import argparse
+import json
+import os
+import time
+from tqdm import tqdm
+
+import torch
+import numpy as np
+
+from datasets import load_dataset
+from rouge_score import rouge_scorer
+
+from evaluation.utils import load_model
+
+def fix_prompt(prompt):
+    """Append a question/answer footer that repeats the prompt's first paragraph.
+
+    The first paragraph is everything before the first blank line
+    (``"\n\n"``); the whole prompt is used when there is none.
+    """
+    first_paragraph = prompt.split("\n\n", 1)[0]
+    return f"{prompt}\n\nQuestions:\n{first_paragraph}\n\nAnswer:\n"
+
+if __name__ == "__main__":
+    # Script entry point: run a (Zero)SCROLLS task through a LongChat model,
+    # report the average ROUGE-1 F1 and write the predictions as CSV lines.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-name-or-path", type=str,
+            default="/home/haozhang/LongChat/data/dacheng-data/longchat_7b_16K",
+            help="model path")
+    parser.add_argument("--ratio", type=int, default=8,
+            help="target sequence length / original sequence length")
+    parser.add_argument("--flash", action='store_true', help="whether to use flash attention to save memory, but slower")
+    parser.add_argument("--dataset-version", type=str, default="tau/zero_scrolls")
+    parser.add_argument("--dataset", type=str, default="qasper")
+    parser.add_argument("--seq-len", type=int, default=15000)
+    parser.add_argument("--num-gpus", type=int, default=4,
+            help="number of GPUs to shard the model over")
+    args = parser.parse_args()
+
+    SEQ_LEN = args.seq_len
+
+    # load model (the monkey patches are applied before the model is built)
+    from longchat.train.monkey_patch.llama_interpolate_monkey_patch import replace_llama_with_interpolate
+    replace_llama_with_interpolate(args.ratio)
+
+    if args.flash:
+        from longchat.train.monkey_patch.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
+        replace_llama_attn_with_flash_attn()
+
+    path = args.model_name_or_path
+    model, tokenizer = load_model(path, num_gpus=args.num_gpus)
+    print("model loaded.")
+
+    # create output dir
+    # normpath drops a trailing slash, which used to yield an empty name.
+    name = os.path.basename(os.path.normpath(path))
+    output_dir = os.path.join("predictions", name)
+    os.makedirs(output_dir, exist_ok=True)
+    output_file = os.path.join(output_dir, f"{name}_{args.dataset}_{SEQ_LEN}.raw")
+    print(f"output file: {output_file}")
+
+    # load dataset
+    data = load_dataset(args.dataset_version, args.dataset)
+    test_cases = data["validation"]
+    if args.dataset_version == "tau/scrolls":
+        # Fixed-seed sample of 200 validation examples.
+        test_cases = test_cases.shuffle(seed=1123).select(range(200))
+
+    # inference
+    print("start inference ...")
+    tic = time.time()
+    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
+    f1 = 0
+    predicts = ["Task,ID,Prediction"]
+    for x in tqdm(test_cases):
+        prompt = x["input"]
+        if args.dataset_version == "tau/scrolls":
+            prompt = fix_prompt(prompt)
+        input_ids = tokenizer(prompt).input_ids
+        # Over-long prompts are truncated from the left: the tail is kept.
+        new_len = min(SEQ_LEN, len(input_ids))
+        input_ids = torch.tensor([input_ids[-new_len:]]).to(model.device)
+
+        output_ids = model.generate(input_ids, max_new_tokens=64, use_cache=False)[0][new_len:]
+        prediction = tokenizer.batch_decode([output_ids], skip_special_tokens=True)[0]
+        # Double embedded quotes so the quoted CSV field stays well-formed.
+        escaped = prediction.replace('"', '""')
+        predicts.append(f'{args.dataset},{x["id"]},"{escaped}"')
+
+        # RougeScorer.score takes (target, prediction). The F-measure is
+        # symmetric, so the reported number is unchanged by this ordering.
+        score = scorer.score(x["output"], prediction)
+        f1 += score["rouge1"].fmeasure
+
+    num_cases = len(test_cases)
+    if num_cases:  # avoid ZeroDivisionError on an empty split
+        f1 /= num_cases
+    print(f"avg f1 over {num_cases} test cases: {f1:.3f}")
+    print(f"total inference time: {time.time() - tic:.0f} s")
+
+    with open(output_file, "w", encoding="utf-8") as f:
+        for line in predicts:
+            f.write(line + "\n")
+