# classification.py
import argparse
import json
import os
import pickle
import numpy as np
import openai
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, train_test_split
from helper_functions import (
encode_sentences_openai,
encode_sentences_sentencetransformer,
find_sentences_openai,
find_sentences_sentencetransformer,
initialize_model,
record_response,
select_random_sentences,
)
from prompt_template_conclusion import create_prompt_conclusion
from prompt_template_premise import create_prompt_premise
# The OpenAI API key is read from an environment variable set on the system.
openai.api_key = os.getenv("OPENAI_API_KEY_MICHAEL")
def main():
parser = argparse.ArgumentParser(description="""
Developer: Abdullah Al Zubaer
Email: [email protected]
Institution: University of Passau
Accompanying code for the paper titled: "Performance analysis of large language models in the domain of legal argument mining"
Please cite the paper if you use this code for your research.
And cite the following paper for the original ECHR corpus:
"ECHR: Legal Corpus for Argument Mining". More information of how to cite in the ./data/original/README.md file.
""", formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-m','--model_name',
type=str, required=True,
choices=['gpt-3.5-turbo', 'gpt-4'],
help='''Model for text generation.
Choose from:
- gpt-3.5-turbo: GPT-3.5 Turbo model.
- gpt-4: GPT-4 model.
''')
parser.add_argument('-e', '--embedding_model',
type=str, required=True,
choices=['multi-qa-mpnet-base-dot-v1', 'text-embedding-ada-002', 'none'],
help='''Model identifier for the sentence transformer or OpenAI embedding model.
Choose from:
- multi-qa-mpnet-base-dot-v1: A sentence transformer model.
- text-embedding-ada-002: An OpenAI embedding model.
- none: No embedding model (required when --mode is 'random' or the prompt is zero-shot).
''')
parser.add_argument('-mode', '--mode',
type=str, required=True,
choices=['similar', 'dissimilar', 'random', 'none'],
help='''Mode for selecting sentences to add as few-shot examples in the prompt.
Choose from:
- similar: Sentences semantically similar to the input.
- dissimilar: Sentences semantically dissimilar to the input.
- random: Randomly selected sentences.
- none: No example selection (required for zero-shot prompts).
''')
parser.add_argument('-pt','--prompt_type',
type=str, required=True,
choices=['zeroshot_with_instruction', 'twoshot_with_instruction',
'fourshot_with_instruction', 'eightshot_with_instruction'],
help='''Type of prompt to use for the model.
Choose from:
- zeroshot_with_instruction: No examples are given, only instructions.
- twoshot_with_instruction: Two examples are given with instructions.
- fourshot_with_instruction: Four examples are given with instructions.
- eightshot_with_instruction: Eight examples are given with instructions.
''')
parser.add_argument('-c', '--component_to_classify',
type=str, required=True,
choices=['premise', 'conclusion'],
help='''The component to classify in the text.
Choose from:
- premise: Classify sentences as premise or non-premise.
- conclusion: Classify sentences as conclusion or non-conclusion.
''')
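# Example invocation (a sketch; this combination satisfies the configuration checks below):
#   python classification.py -m gpt-4 -e text-embedding-ada-002 -mode similar -pt fourshot_with_instruction -c premise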
args = parser.parse_args()
# Accessing arguments
which_embedding_model = args.embedding_model
current_model_name = args.model_name
current_prompt_type = args.prompt_type
mode = args.mode
component_to_classify = args.component_to_classify
# Echo the chosen configuration
print(f"Using embedding model: {which_embedding_model}")
print(f"Text generation model: {current_model_name}")
print(f"Prompt type: {current_prompt_type}")
print(f"Mode: {mode}")
print(f"Component to classify: {component_to_classify}")
# Assertion checks for not allowed configurations
assert not (which_embedding_model != 'none' and mode == 'random'), \
"Invalid configuration: --embedding_model must be 'none' when --mode is 'random'."
assert not (which_embedding_model != 'none' and current_prompt_type == 'zeroshot_with_instruction'), \
"Invalid configuration: --embedding_model must be 'none' when --prompt_type is 'zeroshot_with_instruction'."
assert not (which_embedding_model == 'none' and mode in ['similar', 'dissimilar']), \
"Invalid configuration: --embedding_model must be specified when --mode is 'similar' or 'dissimilar'."
assert not (current_prompt_type == 'zeroshot_with_instruction' and mode != 'none'), \
"Invalid configuration: --mode must be 'none' when --prompt_type is 'zeroshot_with_instruction'."
assert not (which_embedding_model == 'none' and current_prompt_type in ['twoshot_with_instruction', 'fourshot_with_instruction', 'eightshot_with_instruction'] and mode == 'none'), \
"Invalid configuration: --embedding_model cannot be 'none' when using shot prompt types with mode 'none'."
assert not (which_embedding_model != 'none' and mode == 'none' and current_prompt_type != 'zeroshot_with_instruction'), \
"Invalid configuration: --mode cannot be 'none' when --embedding_model is specified and --prompt_type is not 'zeroshot_with_instruction'."
assert not (which_embedding_model == 'none' and current_prompt_type in ['twoshot_with_instruction', 'fourshot_with_instruction', 'eightshot_with_instruction'] and mode in ['similar', 'dissimilar']), \
"Invalid configuration: --embedding_model cannot be 'none' when using shot prompt types with modes 'similar' or 'dissimilar'."
# Main code starts here
# Prompt types and corresponding settings
'''
The sequence of True/False flags must match exactly the sequence expected in the
prompt_template_{conclusion/premise}.py file. The flags are passed to create_prompt_{conclusion/premise}.
'''
prompt_types = {
"zeroshot_with_instruction": {"top_k": 0, "flags": [True, False, False, False]},
"twoshot_with_instruction": {"top_k": 2, "flags": [False, True, False, False]},
"fourshot_with_instruction": {"top_k": 4, "flags": [False, False, True, False]},
"eightshot_with_instruction": {"top_k": 8, "flags": [False, False, False, True]}
}
# Make sure current_prompt_type is valid
assert current_prompt_type in prompt_types, f"Invalid prompt type: {current_prompt_type}"
# Set top_k and flags based on current_prompt_type
top_k = prompt_types[current_prompt_type]["top_k"]
zeroshot_with_instruction, twoshot_with_instruction, fourshot_with_instruction, eightshot_with_instruction = prompt_types[current_prompt_type]["flags"]
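# For example, with current_prompt_type == "fourshot_with_instruction", top_k is 4 and only
# fourshot_with_instruction is True; the other three flags are False.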
# File paths for each component type
component_paths = {
"conclusion": './data/experiment_data/df_decomposed_conclusion.pickle',
"premise": './data/experiment_data/df_decomposed_premise.pickle'
}
# Make sure component_to_classify is valid
assert component_to_classify in component_paths, f"Invalid component to classify: {component_to_classify}"
# Get path based on component_to_classify
path_to_df_decomposed = component_paths[component_to_classify]
# Identifier used for saving files with the correct names, based on
# which classification task is being run, i.e. premise or conclusion.
conclusion_or_premise = path_to_df_decomposed.split('_')[-1].split('.')[0]
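# e.g. './data/experiment_data/df_decomposed_premise.pickle' -> 'premise'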
with open(path_to_df_decomposed, 'rb') as f:
df_conclusion_or_premise = pickle.load(f)
# df_conclusion_or_premise = df_conclusion_or_premise[:200].copy() # for testing purposes reduce the size of the dataset
df_conclusion_or_premise['article'] = df_conclusion_or_premise['article'].astype('category')
kf = KFold(n_splits=5, shuffle=False)
fold = 0
# Initialize a dictionary to hold data for each fold
data_dict = {}
# Initializing a dictionary for storing the cross-validation result
cv_dictionary = dict()
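# The 5-fold split is performed over the unique articles, so all sentences from a given article
# end up in the same split (train, validation, or test).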
for train_index, test_index in kf.split(df_conclusion_or_premise['article'].unique()):
fold += 1
print(f"========= FOLD: {fold} starting ==============")
train_articles = df_conclusion_or_premise['article'].unique()[train_index]
test_articles = df_conclusion_or_premise['article'].unique()[test_index]
# Split the train_articles further into training and validation articles
train_articles, validation_articles = train_test_split(train_articles, test_size=0.2, random_state=42)
# Select the corresponding rows for each set
train_df = df_conclusion_or_premise[df_conclusion_or_premise['article'].isin(train_articles)]
validation_df = df_conclusion_or_premise[df_conclusion_or_premise['article'].isin(validation_articles)]
test_df = df_conclusion_or_premise[df_conclusion_or_premise['article'].isin(test_articles)]
# Store the dataframes in the dictionary.
# Not strictly needed for the pipeline (the validation split is only stored here), but kept for reference.
data_dict[f'fold_{fold}_for_{conclusion_or_premise}'] = {
'train': train_df,
'validation': validation_df,
'test': test_df,
}
# Folder that will contain the embeddings for this configuration.
base_embeddings_folder = f"./embeddings/llm_{current_model_name}/em_{which_embedding_model}/mode_{mode}/nshots_{current_prompt_type}/c_{component_to_classify}"
# Ensure the base folder exists
if which_embedding_model != "none" and mode not in ["random", "none"] and current_prompt_type != "zeroshot_with_instruction":
if not os.path.exists(base_embeddings_folder):
os.makedirs(base_embeddings_folder)
if which_embedding_model != "none":
# An embedding model is specified, so encode the training sentences with it.
embeddings_file = f"{base_embeddings_folder}/fold_{fold}_embeddings.npy"
print("Encoding in progress...")
'''
One caveat is that the embeddings are recomputed every time this code is run. This is not optimized:
the dataset does not change, so computing them once would be sufficient. To stay consistent with the
code used for the experiments in the paper, it is kept this way. The overhead is small, since OpenAI
embeddings are cheap to compute and sentence-transformers embeddings are free.
'''
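# A possible optimization, not used for the paper runs: re-encoding could be skipped when the embeddings
# file already exists on disk, assuming the find_sentences_* helpers read the embeddings back from
# `embeddings_file` rather than from anything attached to train_df, e.g.:
#   if not os.path.exists(embeddings_file):
#       ...  # encode and save as below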
if which_embedding_model == "multi-qa-mpnet-base-dot-v1":
# Initialize the model using the which_embedding_model
initialize_model(which_embedding_model)
train_df = encode_sentences_sentencetransformer(df=train_df,
column_name='argument',
embeddings_file=embeddings_file)
elif which_embedding_model == "text-embedding-ada-002":
train_df = encode_sentences_openai(df=train_df,
column_name='argument',
embeddings_file=embeddings_file,
which_embedding_model=which_embedding_model)
print("Encoding finished!")
else:
embeddings_file = "none" # this is not needed actually as we are not using this file anywhere.
for index, row in test_df.iterrows():
test_text = row['argument']
test_text_ground_truth = row['argument_type']
if mode == 'random':
nshot_sentences = select_random_sentences(df=train_df,
top_k=top_k) # top_k is not strictly needed for random selection; it is reused as the number of examples to draw.
elif mode == 'similar' or mode == 'dissimilar':
# mode is 'similar' or 'dissimilar': use the embedding model to select the examples.
if which_embedding_model == "multi-qa-mpnet-base-dot-v1":
nshot_sentences = find_sentences_sentencetransformer(query=test_text,
df=train_df,
top_k=top_k,
mode=mode,
embeddings_file=embeddings_file)
elif which_embedding_model == "text-embedding-ada-002":
nshot_sentences = find_sentences_openai(query=test_text,
df=train_df,
top_k=top_k,
mode=mode,
embeddings_file=embeddings_file,
which_embedding_model=which_embedding_model)
if current_prompt_type == "zeroshot_with_instruction":
nshot_sentences = () # the prompt templates expect an empty tuple when no few-shot examples are given.
if component_to_classify == "conclusion":
prompt_created = create_prompt_conclusion(test_text,
*nshot_sentences,
zeroshot_with_instruction=zeroshot_with_instruction,
twoshot_with_instruction=twoshot_with_instruction,
fourshot_with_instruction=fourshot_with_instruction,
eightshot_with_instruction=eightshot_with_instruction)
elif component_to_classify == "premise":
prompt_created = create_prompt_premise(test_text,
*nshot_sentences,
zeroshot_with_instruction=zeroshot_with_instruction,
twoshot_with_instruction=twoshot_with_instruction,
fourshot_with_instruction=fourshot_with_instruction,
eightshot_with_instruction=eightshot_with_instruction)
if component_to_classify == "conclusion":
message=[
{"role": "system", "content": "You must reply with either 'conclusion' or 'non-conclusion'."},
{"role": "user", "content": prompt_created}
]
elif component_to_classify == "premise":
message=[
{"role": "system", "content": "You must reply with either 'premise' or 'non-premise'."},
{"role": "user", "content": prompt_created}
]
# Folder for the per-fold CSV files containing the model's predictions.
base_csv_folder = f"./model_classification/llm_{current_model_name}/em_{which_embedding_model}/mode_{mode}/nshots_{current_prompt_type}/c_{component_to_classify}"
if not os.path.exists(base_csv_folder):
os.makedirs(base_csv_folder)
# Call the record_response function
record_response(model_name = current_model_name,
messages = message,
csv_filename = f"{base_csv_folder}/echr_test_fold_{fold}_classification.csv",
prompt_type = current_prompt_type,
temperature = 0,
top_p = 1,
test_text = test_text,
groundtruth_label_for_test_text = test_text_ground_truth)
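# record_response (defined in helper_functions.py) sends `message` to the chat model with deterministic settings
# (temperature 0, top_p 1) and is assumed to append one row per test sentence to the fold's CSV, including the
# 'Ground Truth' and 'Only Model Output' columns read back below.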
# Extract the ground-truth labels from the CSV file saved for the current fold.
y_true = pd.read_csv(f"{base_csv_folder}/echr_test_fold_{fold}_classification.csv", encoding="utf-8")['Ground Truth']
# Extract the model output from the same CSV file.
y_pred = pd.read_csv(f"{base_csv_folder}/echr_test_fold_{fold}_classification.csv", encoding="utf-8")['Only Model Output'].str.lower()
# Lowercase the model output for comparison (to avoid case-sensitivity issues, given that GPT models are non-deterministic).
# This was not necessary when the paper was written, since everything was already lowercase, but while refactoring the code
# for publication the model output was occasionally not lowercase. This does not change the results, but comparing in lowercase is safer.
# Generate classification report
report = classification_report(y_true, y_pred, digits=3)
report_dictionary = classification_report(y_true, y_pred, output_dict=True)
# Saving each classification report for the current fold.
# cv_dictionary[f'fold_{fold}'] = report_dictionary
cv_dictionary[f'llm_{current_model_name}_em_{which_embedding_model}_mode_{mode}_nshots_{current_prompt_type}_c_{component_to_classify}_fold_{fold}'] = report_dictionary
print(report)
print("\n")
print(f"========= Fold: {fold} Finished ==============\n")
# Folder for the aggregated 5-fold evaluation results.
base_result_folder = f"./results/llm_{current_model_name}/em_{which_embedding_model}/mode_{mode}/nshots_{current_prompt_type}/c_{component_to_classify}/complete_evalution_5_fold"
if not os.path.exists(base_result_folder):
os.makedirs(base_result_folder)
# saving the complete report for each run
with open(f"{base_result_folder}/evaluation_5_fold.json", "w") as f:
json.dump(cv_dictionary, f, indent=4)
# >>> saving the summarized report for each run (mean and standard deviation) for 5 fold cross validation >>>
with open(f"{base_result_folder}/evaluation_5_fold.json", "r") as f:
data = json.load(f)
metrics = ["precision", "recall", "f1-score"]
metric_results = {}
categories = [component_to_classify, f"non-{component_to_classify}", "macro avg", "weighted avg"]
for category in categories:
metric_results[category] = {}
for metric in metrics:
metric_list = []
for i in range(1, len(data) + 1):
metric_list.append(data[f'llm_{current_model_name}_em_{which_embedding_model}_mode_{mode}_nshots_{current_prompt_type}_c_{component_to_classify}_fold_{i}'][category][metric])
# metric_list.append(data[f'fold_{i}'][category][metric])
mean_metric = np.mean(metric_list)
stdev_metric = np.std(metric_list)
metric_results[category][metric] = {
"mean": mean_metric,
"standard_deviation": stdev_metric
}
# Calculate average accuracy and standard deviation
accuracy_list = [data[f'llm_{current_model_name}_em_{which_embedding_model}_mode_{mode}_nshots_{current_prompt_type}_c_{component_to_classify}_fold_{i}']['accuracy'] for i in range(1, len(data) + 1)]
mean_accuracy = np.mean(accuracy_list)
stdev_accuracy = np.std(accuracy_list)
metric_results["accuracy"] = {
"mean": mean_accuracy,
"standard_deviation": stdev_accuracy
}
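# Sketch of the resulting structure (for the premise task):
#   {"premise": {"precision": {"mean": ..., "standard_deviation": ...}, ...},
#    "non-premise": {...}, "macro avg": {...}, "weighted avg": {...},
#    "accuracy": {"mean": ..., "standard_deviation": ...}}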
# Write the result to the output JSON file
output_file_path = f"{base_result_folder}/evaluation_5_fold_summarized.json"
with open(output_file_path, "w") as output_file:
json.dump(metric_results, output_file, indent=4)
print("Summarized metric results have been saved to", output_file_path)
# <<< saving the summarized report for each run (mean and standard deviation) for 5 fold cross validation <<<
if __name__ == "__main__":
exit(main())