-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelper_functions.py
743 lines (562 loc) · 29.3 KB
/
helper_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
import ast
import json
import os
import re
import warnings
from datetime import datetime
import numpy as np
import openai
import pandas as pd
from pandas.errors import PerformanceWarning
from tenacity import ( # , before_sleep_log # for exponential backoff
retry,
stop_after_attempt,
wait_random_exponential,
)
from tqdm import tqdm
# Silence pandas' PerformanceWarning. simplefilter() installs a process-wide
# filter, so this takes effect for everything that runs after this import.
warnings.simplefilter(action='ignore', category=PerformanceWarning)
# Set before pingouin is imported on the next line; presumably this disables a
# "package is outdated" check performed at import time -- TODO confirm.
os.environ['OUTDATED_IGNORE'] = '1'
import pingouin as pg
# Take the OpenAI API key from the environment. os.getenv returns None when
# OPENAI_API_KEY is not set, so no key is configured in that case.
openai.api_key = os.getenv("OPENAI_API_KEY")
def save_dataframe(df, filename):
    """
    Write a DataFrame to disk twice: as a CSV file and as an Excel workbook.

    Both files share the same base name; ``.csv`` and ``.xlsx`` are appended
    to ``filename``. The DataFrame index is not written to either file.

    Args:
        df (pandas.DataFrame): The DataFrame to save.
        filename (str): Base path without extension (relative paths resolve
            against the current working directory).

    Returns:
        None
    """
    # CSV first, then Excel (written through the openpyxl engine).
    df.to_csv(f"{filename}.csv", index=False)
    df.to_excel(f"{filename}.xlsx", index=False, engine='openpyxl')
def drop_columns_from(df, start_column):
    """
    Drop all columns from ``start_column`` to the end of the DataFrame (inclusive).

    Columns are removed by position, not by label, so a column to the left of
    ``start_column`` is kept even when it shares its name with one of the
    removed columns. If ``start_column`` itself occurs more than once, the
    first occurrence is used as the cut point.

    Parameters:
        df (pd.DataFrame): The DataFrame from which to drop columns. It is not
            modified.
        start_column (str): The column name from which to start dropping.

    Returns:
        pd.DataFrame: A new DataFrame holding only the columns located before
        ``start_column``.

    Raises:
        KeyError: If ``start_column`` is not a column of ``df``.
    """
    position = df.columns.get_loc(start_column)

    # get_loc returns an int for a unique label, but a slice or a boolean mask
    # when the label is duplicated; reduce all three to the first position.
    if isinstance(position, slice):
        start_index = position.start or 0
    elif isinstance(position, np.ndarray):
        start_index = int(position.argmax())
    else:
        start_index = position

    # Positional slicing keeps same-named columns that sit before the cut.
    return df.iloc[:, :start_index].copy()
def read_excel_sheets(file_path, sheets=None, return_type='single'):
    """
    Reads specified sheets from an Excel file using pandas.

    NOTE: this module defines ``read_excel_sheets`` a second time further
    down; that later definition is the one bound to the name after import.

    :param file_path: str, path to the Excel file.
    :param sheets: str, int, list or None, names or indices of the sheets to
                   read. It is passed to ``pd.read_excel`` as ``sheet_name``,
                   so ``None`` (the default) reads every sheet.
    :param return_type: str, 'single' to return a single DataFrame,
                        'dict' to return a dictionary of DataFrames.
    :return: DataFrame or dict of DataFrames depending on return_type and
             sheets, or None when the file could not be read.
    :raises ValueError: if more than one sheet was read but a 'single'
                        DataFrame was requested.
    """
    try:
        data = pd.read_excel(file_path, sheet_name=sheets)
    except Exception as e:
        # Deliberate best-effort: report the problem and signal it with None.
        print(f"Failed to read the file: {e}")
        return None

    if isinstance(data, dict):
        if return_type == 'single':
            # sheets=None or a one-element list still produces a dict; when it
            # holds exactly one sheet there is no ambiguity, so unwrap it
            # instead of reporting "multiple sheets".
            if len(data) == 1:
                return next(iter(data.values()))
            raise ValueError("Multiple sheets found but 'single' DataFrame requested. Specify correct 'return_type'.")
        return data

    if return_type == 'dict':
        # Only one sheet was read; wrap it so the caller still gets a mapping.
        return {sheets: data}
    return data
def extract_points(response):
    """
    Extract the points awarded to each student answer from a model response.

    Every occurrence of ``Studierendenantwort_<n>: <points>`` in ``response``
    is collected, no matter how the entries are separated (real newlines,
    literal ``\\n`` sequences, commas, ...). If an answer number occurs more
    than once, the last occurrence wins.

    Parameters:
        response (str): The raw text produced by the model.

    Returns:
        list[int]: The points ordered by answer number, i.e. element ``i``
        belongs to ``Studierendenantwort_<i+1>``. Empty if nothing matched.

    Raises:
        KeyError: If the answer numbers found are not exactly 1..N. The list
            is positional, so a gap would misalign every later entry.
    """
    points = {}
    # Scan the whole text rather than one match per line, so several entries
    # sharing a line are all picked up.
    for match in re.finditer(r'Studierendenantwort_(\d+): (\d+)', response):
        points[int(match.group(1))] = int(match.group(2))

    expected_numbers = range(1, len(points) + 1)
    missing = [number for number in expected_numbers if number not in points]
    if missing:
        raise KeyError(
            f"Studierendenantwort numbers are not contiguous from 1: "
            f"missing {missing}, found {sorted(points)}"
        )
    return [points[number] for number in expected_numbers]
'''
Functionality:
Extract the rank from the model's output (which is saved in the csv file)
and create a new column containing (in the same df) the ranks in a list.
'''
def extract_rankings(response):
    """
    Extract the rank assigned to each student answer from a model response.

    Every occurrence of ``Studierendenantwort_<n>: Rang <rank>`` in
    ``response`` is collected, no matter how the entries are separated (real
    newlines, literal ``\\n`` sequences, commas, ...). If an answer number
    occurs more than once, the last occurrence wins.

    Parameters:
        response (str): The raw text produced by the model.

    Returns:
        list[int]: The ranks ordered by answer number, i.e. element ``i``
        belongs to ``Studierendenantwort_<i+1>``. Empty if nothing matched.

    Raises:
        KeyError: If the answer numbers found are not exactly 1..N. The list
            is positional, so a gap would misalign every later entry.
    """
    rankings = {}
    # Scan the whole text rather than one match per line, so several entries
    # sharing a line are all picked up.
    for match in re.finditer(r'Studierendenantwort_(\d+): Rang (\d+)', response):
        rankings[int(match.group(1))] = int(match.group(2))

    expected_numbers = range(1, len(rankings) + 1)
    missing = [number for number in expected_numbers if number not in rankings]
    if missing:
        raise KeyError(
            f"Studierendenantwort numbers are not contiguous from 1: "
            f"missing {missing}, found {sorted(rankings)}"
        )
    return [rankings[number] for number in expected_numbers]
@retry(wait=wait_random_exponential(min=1, max=180), stop=stop_after_attempt(20))
def chat_with_openai(model_name, messages, temperature, top_p):
    """
    Request a chat completion from the given OpenAI model.

    The call is wrapped by the ``retry`` decorator, configured with a random
    exponential wait (min=1, max=180) and a stop after 20 attempts. Every
    exception is logged to stdout and re-raised so that the decorator can
    decide whether to try again.

    Args:
        model_name (str): The name of the model to be used.
        messages (List[Dict[str, str]]): Messages to send; each one is a
            dictionary with 'role' and 'content'.
        temperature (float): Sampling temperature forwarded to the API.
        top_p (float): The 'top_p' value forwarded to the API.

    Returns:
        The chat completion response object returned by
        ``openai.ChatCompletion.create``.

    Raises:
        Exception: Whatever the API call raised; nothing is swallowed here.
    """
    try:
        return openai.ChatCompletion.create(
            model=model_name,
            messages=messages,
            temperature=temperature,
            top_p=top_p,
        )
    except openai.error.RateLimitError:
        print("Rate limit exceeded. Retrying...")
        raise  # retrying is handled by the decorator, so just propagate
    except Exception as exc:
        print(f"This Exception: {exc}\nRetrying...")
        raise  # retrying is handled by the decorator, so just propagate
def process_dataframe_with_openai_api(df, metadata_df, model_name, temperature, top_p, assessment_prompt_version, assessment_type):
    """
    Send every prompt in one column of `df` to the OpenAI API and record the results.

    The prompt column is chosen from `assessment_prompt_version` and `assessment_type`
    (`prompt_<version>_rank_assessment` or `prompt_<version>_point_assessment`). For each
    prompt a single user message is sent through `chat_with_openai`; the parsed
    ranks/points are written into `df` and the full interaction into `metadata_df`.
    Both DataFrames are modified in place and also returned.

    Parameters:
    - df (pd.DataFrame): A DataFrame containing the prompts to be sent to the OpenAI API.
    - metadata_df (pd.DataFrame): A DataFrame to store the metadata of the API interaction.
    - model_name (str): The name of the OpenAI model to be used for generating responses.
    - temperature (float): The temperature parameter forwarded to the API.
    - top_p (float): The top_p parameter forwarded to the API.
    - assessment_prompt_version (str): Version identifier for the prompts, used in naming the columns.
    - assessment_type (str): Type of assessment ('rank' or 'point') to determine the processing method.

    Returns:
    - tuple: `(df, metadata_df)`:
        - `df` with a new column `<prefix><model_name>_model_ranks` or
          `<prefix><model_name>_model_points` holding a JSON-encoded list per row.
        - `metadata_df` with the columns `<prefix><model_name>_complete_input_to_model`,
          `_complete_response`, `_model_output`, `_hyperparameter`, `_date_and_time`,
          `_model_used` and `_model_ranks` / `_model_points`.

    Raises:
        ValueError: if `assessment_type` is neither 'rank' nor 'point'.
        (The check for already existing result columns is currently disabled; it
        only survives as the inert string literal inside the function body.)

    Exception Handling:
    - Any exception raised while calling the API or processing its answer is printed;
      the metadata columns of that row are then set to None and the result cell in
      `df` is set to the JSON string "[]".
    """
    # Determine input_column and prefix based on assessment_type.
    # input_column: name of the column in `df` containing the prompts to be sent to the API.
    # prefix: string prepended to every new column name, so that results of different
    #         prompt versions / assessment types can be told apart.
    # result_column: only referenced by the disabled overwrite check further down.
    if assessment_type == 'rank':
        input_column = f'prompt_{assessment_prompt_version}_rank_assessment'
        prefix = f'prompt_{assessment_prompt_version}_rank_assessment_' # A date such as dd_mm_yyyy_ can be appended here, e.g. prompt_{assessment_prompt_version}_rank_assessment_22_JUN_2039_
        result_column = f"{prefix}{model_name}_model_ranks"
    elif assessment_type == 'point':
        input_column = f'prompt_{assessment_prompt_version}_point_assessment'
        prefix = f'prompt_{assessment_prompt_version}_point_assessment_' # A date such as dd_mm_yyyy_ can be appended here, e.g. prompt_{assessment_prompt_version}_rank_assessment_22_JUN_2039_
        result_column = f"{prefix}{model_name}_model_points"
    else:
        raise ValueError("Invalid assessment type. Choose 'rank' or 'point'.")
    # The following string literal is an expression statement with no effect: it keeps
    # the former "column already exists" check around as disabled code.
    ''' # it does not make sense to check if the column already exist in the metadata_df as the metadata_df is empty at the first place.
# >>> check if overwriting existing column >>>
# Columns to be added to metadata_df
metadata_columns = [f"{prefix}{model_name}_{suffix}" for suffix in [
"complete_input_to_model", "complete_response", "model_output",
"hyperparameter", "date_and_time", "model_used", "model_ranks" if assessment_type == 'rank' else "model_points"
]]
# Check if any of the new columns already exist in metadata_df or df
existing_metadata_cols = set(metadata_df.columns).intersection(metadata_columns)
if existing_metadata_cols or result_column in df.columns:
raise ValueError(f"Columns {existing_metadata_cols} or Column {result_column} already exist in metadata_df or df.")
# <<< check if overwriting existing column <<<
'''
    # NOTE(review): idx is the 0-based position produced by enumerate, while .loc is
    # label-based. This assumes `df` carries a default 0..n-1 index -- confirm with callers.
    for idx, prompt in enumerate(tqdm(df[input_column], desc=f"Processing {assessment_type} prompt version {assessment_prompt_version}")):
        # Each prompt is sent as a single user message.
        messages = [{"role": "user", "content": prompt}]
        # Taken before the request is made, so it marks the start of the call.
        timestamp = datetime.now()
        try:
            response = chat_with_openai(model_name=model_name, messages=messages, temperature=temperature, top_p=top_p)
            if isinstance(response, dict) and 'choices' in response and len(response['choices']) > 0:
                model_output = response['choices'][0]['message']['content'] # Only the text of the first choice
                response_str = json.dumps(response) # Complete response as a JSON string
            else:
                # No usable choice in the response: keep the raw response, mark the output as missing.
                model_output = None
                response_str = json.dumps(response) # Complete response as a JSON string
            # Add the metadata for this row; columns are created on first assignment.
            # NOTE(review): `messages` is stored as a Python list, not as a string; how .loc
            # handles a list-like value (especially on an empty metadata_df) depends on the
            # pandas version -- confirm before changing the order or type of these writes.
            metadata_df.loc[idx, f"{prefix}{model_name}_complete_input_to_model"] = messages
            metadata_df.loc[idx, f"{prefix}{model_name}_complete_response"] = response_str
            metadata_df.loc[idx, f"{prefix}{model_name}_model_output"] = model_output
            metadata_df.loc[idx, f"{prefix}{model_name}_hyperparameter"] = json.dumps({"temperature": temperature, "top_p": top_p})
            metadata_df.loc[idx, f"{prefix}{model_name}_date_and_time"] = timestamp.strftime("%Y-%m-%d %H:%M:%S")
            metadata_df.loc[idx, f"{prefix}{model_name}_model_used"] = model_name
            if assessment_type=='rank':
                # Parse the ranks and store them JSON-encoded in both DataFrames;
                # a missing/empty model output yields an empty list.
                only_ranks = extract_rankings(model_output) if model_output else []
                df.loc[idx, f"{prefix}{model_name}_model_ranks"] = json.dumps(only_ranks) # adds the model ranks to the df passed into this function
                metadata_df.loc[idx, f"{prefix}{model_name}_model_ranks"] = json.dumps(only_ranks)
            elif assessment_type=='point':
                # Parse the points and store them JSON-encoded in both DataFrames;
                # a missing/empty model output yields an empty list.
                only_points = extract_points(model_output) if model_output else []
                df.loc[idx, f"{prefix}{model_name}_model_points"] = json.dumps(only_points) # adds the model points to the df passed into this function
                metadata_df.loc[idx, f"{prefix}{model_name}_model_points"] = json.dumps(only_points)
        except Exception as e:
            print(f"Error for prompt {prompt}: {e}")
            # Set the corresponding columns to None. This also runs when the failure
            # happened after the metadata above was written (e.g. while parsing), so
            # the values already stored for this row are overwritten.
            for col in ['complete_input_to_model', 'complete_response', 'model_output', 'hyperparameter', 'date_and_time', 'model_used']:
                metadata_df.loc[idx, f"{prefix}{model_name}_{col}"] = None
            if assessment_type == 'rank':
                metadata_df.loc[idx, f"{prefix}{model_name}_model_ranks"] = None
                df.loc[idx, f"{prefix}{model_name}_model_ranks"] = json.dumps([])
            elif assessment_type=='point':
                metadata_df.loc[idx, f"{prefix}{model_name}_model_points"]=None
                df.loc[idx, f"{prefix}{model_name}_model_points"]=json.dumps([])
    return df, metadata_df
def extract_and_clean(data):
    """
    Return the part of ``data`` that precedes the first 'gpt', without trailing underscores.

    If 'gpt' does not occur, the whole string is used.

    :param data: str, the input string to process.
    :return: str, the cleaned up string.
    """
    # Everything before the first 'gpt' (the full string when it is absent).
    head, _separator, _tail = data.partition("gpt")
    return head.rstrip("_")
def calculate_statistics_and_save(df, identifier, raters, location_to_save):
    """
    Function used in the point assessment evaluation.

    Computes Cronbach's alpha with its 95% confidence interval for every
    leave-one-out subset of ``raters`` (as produced by ``create_sublists``),
    collects the values under ``identifier`` and writes the resulting
    dictionary to a JSON file via ``save_dict_as_json``.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data for calculation.
    - identifier (str): Root key of the results dictionary.
    - raters (list of str): Column names in ``df`` representing the raters.
    - location_to_save (str): Path of the JSON file to write.

    Returns:
    - dict: ``{identifier: {"<rater>_<rater>...": {"cronbachs_alpha": ...,
      "confidence_interval": [...]}}}`` with one entry per subset. No
      correlation matrix is computed or returned.

    Example usage:
        the_dict = calculate_statistics_and_save(df=my_dataframe,
                                                 identifier="my_identifier",
                                                 raters=["rater1", "rater2", "rater3"],
                                                 location_to_save="results/my_identifier.json")
    """
    per_subset = {}
    for subset in create_sublists(raters):
        alpha, interval = pg.cronbach_alpha(data=df[subset], ci=0.95)
        # The entry is keyed by the rater names of the subset joined with "_".
        per_subset["_".join(subset)] = {
            "cronbachs_alpha": alpha,
            "confidence_interval": interval.tolist(),
        }

    results_dict = {identifier: per_subset}
    save_dict_as_json(d=results_dict, filename=location_to_save)
    return results_dict
def split_list_columns(df, columns_to_split):
    """
    Function used in the point assessment evaluation.

    Split columns containing lists into separate columns, one per list
    element. For a column ``c`` the new columns are named ``c_studansw_1``,
    ``c_studansw_2``, ... up to the longest list in that column; shorter rows
    are padded with missing values. The original column is kept and ``df`` is
    modified in place.

    Cells that are not list-like (for example None/NaN) are treated as empty
    lists, so they produce missing values instead of an error. A column whose
    cells are all empty gets no new columns. Names in ``columns_to_split``
    that are not columns of ``df`` are reported on stdout and skipped.

    Parameters:
        df (pd.DataFrame): The DataFrame to be processed.
        columns_to_split (list): A list of column names containing lists to be split.

    Returns:
        None
    """
    for column in columns_to_split:
        if column not in df.columns:
            print(f"Column '{column}' not found in DataFrame.")
            continue

        # Normalise every cell to a real list; a missing value would otherwise
        # turn the maximum length into a float/NaN and break range().
        rows = [
            list(cell) if pd.api.types.is_list_like(cell) else []
            for cell in df[column]
        ]
        max_length = max((len(row) for row in rows), default=0)
        if max_length == 0:
            continue

        column_names = [f"{column}_studansw_{i+1}" for i in range(max_length)]
        # Expand the lists into new columns; ragged rows are padded.
        df[column_names] = pd.DataFrame(rows, index=df.index, columns=column_names)
def convert_string_lists_to_int_lists(df, columns_to_convert):
    """
    Function used in the point assessment evaluation.

    Convert columns whose cells are string representations of lists (for
    example "[1, 2, 3]") into columns of real Python lists. ``df`` is
    modified in place.

    Per cell:
      * a value that already is a list is kept unchanged, whatever it contains;
      * a string is parsed with ``ast.literal_eval``; a parsed list is used
        as is and a parsed tuple is converted to a list;
      * everything else (unparsable strings, non-list literals, missing
        values) becomes an empty list.

    Names in ``columns_to_convert`` that are not columns of ``df`` are
    reported on stdout and skipped.

    Parameters:
        df (pd.DataFrame): The DataFrame to be processed.
        columns_to_convert (list): A list of column names to be converted.

    Returns:
        None
    """
    def convert_string_to_list(item):
        # Never discard data that is already a list.
        if isinstance(item, list):
            return item
        if not isinstance(item, str):
            return []
        try:
            parsed = ast.literal_eval(item)
        except (ValueError, SyntaxError, TypeError):
            # The string is not a valid Python literal.
            return []
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
        # A valid literal, but not a list representation (e.g. "5").
        return []

    for column in columns_to_convert:
        if column not in df.columns:
            print(f"Column '{column}' not found in DataFrame.")
            continue

        already_int_lists = all(
            isinstance(row, list) and all(isinstance(x, int) for x in row)
            for row in df[column]
        )
        if already_int_lists:
            print(f"Column '{column}' is already a list of integers.")
        else:
            df[column] = df[column].apply(convert_string_to_list)
def print_in_box(message: str) -> None:
    """
    Print ``message`` inside an ASCII box, followed by a footer that shows the
    current working directory and the current timestamp.

    The message and the footer are separated by an empty line and a dashed
    rule that is as wide as the longest line of the message.

    Parameters:
        message (str): The message to be printed in the box (may span several lines).
    """
    # Footer: where and when the box was printed.
    working_dir = os.getcwd()
    stamp = datetime.now().strftime('%d-%b-%Y %H:%M:%S')
    footer = f"Current directory as of {stamp}:\n{working_dir}"

    # Dashed rule sized to the widest line of the message.
    rule = "-" * max(len(part) for part in message.split('\n'))

    box_lines = f"{message}\n\n{rule}\n{footer}".split('\n')
    width = max(map(len, box_lines))

    # The same border line is used for the top and the bottom.
    border = f"+{'-' * (width + 2)}+"
    print(border)
    for text in box_lines:
        # Pad every line to the common width so the right edge lines up.
        print(f"| {text.ljust(width)} |")
    print(border)
def save_dict_as_json(d, filename):
    """
    Saves a dictionary as a JSON file, but only if the file does not already exist.

    Missing parent directories are created. A ``filename`` without any
    directory part is written to the current working directory.

    Parameters:
        d (dict): The dictionary to save.
        filename (str): The path and name of the file to save the dictionary to.

    Raises:
        FileExistsError: If a file with the specified name already exists.
    """
    if os.path.exists(filename):
        raise FileExistsError(f"File '{filename}' already exists.")

    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when the path actually has a directory part.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Mode "x" refuses to overwrite, which also covers a file that appears
    # between the existence check above and this open().
    with open(filename, "x", encoding="utf-8") as file:
        json.dump(d, file, indent=4)
def save_results_as_nested_json_kendall_w(mean_values, std_deviation_values, root_key, filename):
    """
    Used in IRR for ranking in the notebook 'evaluation-rank-assessment.ipynb'.

    Bundle Kendall's W means and standard deviations under a single root key
    and write the nested dictionary to a JSON file via ``save_dict_as_json``.

    :param mean_values: The mean values to store; must provide ``to_dict()``.
    :param std_deviation_values: The standard deviations to store; must provide ``to_dict()``.
    :param root_key: The root key for the nested dictionary.
    :param filename: The filename where the JSON should be saved.
    """
    payload = {
        "kendall_w_mean_values": mean_values.to_dict(),
        "kendall_w_standard_deviations": std_deviation_values.to_dict(),
    }
    save_dict_as_json(d={root_key: payload}, filename=filename)
def read_excel_sheets(file_path, sheets=None, return_type="single"):
    """
    Reads specified sheets from an Excel file using pandas.

    NOTE: an earlier definition of ``read_excel_sheets`` exists in this
    module; this later one replaces it when the module is imported.

    :param file_path: str, path to the Excel file.
    :param sheets: str, int, list or None, names or indices of the sheets to
                   read. It is passed to ``pd.read_excel`` as ``sheet_name``,
                   so ``None`` (the default) reads every sheet.
    :param return_type: str, 'single' to return a single DataFrame,
                        'dict' to return a dictionary of DataFrames.
    :return: DataFrame or dict of DataFrames depending on return_type and
             sheets, or None when the file could not be read.
    :raises ValueError: if more than one sheet was read but a 'single'
                        DataFrame was requested.
    """
    try:
        data = pd.read_excel(file_path, sheet_name=sheets)
    except Exception as e:
        # Deliberate best-effort: report the problem and signal it with None.
        print(f"Failed to read the file: {e}")
        return None

    if isinstance(data, dict):
        if return_type == "single":
            # sheets=None or a one-element list still produces a dict; when it
            # holds exactly one sheet there is no ambiguity, so unwrap it
            # instead of reporting "multiple sheets".
            if len(data) == 1:
                return next(iter(data.values()))
            raise ValueError(
                "Multiple sheets found but 'single' DataFrame requested. Specify correct 'return_type'."
            )
        return data

    if return_type == "dict":
        # Only one sheet was read; wrap it so the caller still gets a mapping.
        return {sheets: data}
    return data
def create_sublists(raters):
    """
    Create leave-one-out sublists: each sublist contains all elements except one.

    The i-th sublist omits the element at position i, so the result is also
    well defined when ``raters`` contains equal elements. The input is not
    modified.

    Parameters:
        raters (list or tuple): A sequence of elements. An empty sequence
            yields an empty result.

    Returns:
        list: A list of lists, each missing one element of ``raters`` and
        otherwise in the original order.

    Example:
        >>> create_sublists(['A', 'B', 'C', 'D'])
        [['B', 'C', 'D'], ['A', 'C', 'D'], ['A', 'B', 'D'], ['A', 'B', 'C']]
    """
    items = list(raters)
    # Slice around each position instead of remove(), which would delete the
    # first equal element rather than the one at the current position.
    return [items[:i] + items[i + 1:] for i, _ in enumerate(items)]
def calculate_kendall_w_between_n_raters(df, raters, prefix):
    """
    Add a column with Kendall's W, computed row by row across the given raters.

    Each rater column holds, per row, that rater's list of ratings (either a
    real list or its string representation). For every row the lists of all
    raters are stacked into a 2D array (one row per rater) and passed to
    ``kendall_w``. The results are stored in a new column named
    ``prefix + "_".join(sorted(raters))``.

    ``df`` is modified in place: the new column is added and rater cells
    given as strings are replaced by the parsed lists. The same object is
    also returned for convenience. An empty DataFrame simply receives an
    empty new column.

    Parameters:
        df (pd.DataFrame): The DataFrame to be modified. Contains ratings from multiple raters.
        raters (list): A list of column names in df, representing different raters.
        prefix (str): A prefix string for naming the new column.

    Returns:
        pd.DataFrame: ``df`` itself, with the new Kendall's W column added.

    Raises:
        ValueError: If fewer than two raters are given or if the generated
            column name already exists in the DataFrame.
    """
    if len(raters) < 2:
        raise ValueError("At least two raters are required to calculate Kendall's W.")

    # Sort the rater names so the column name does not depend on their order.
    new_column_name = prefix + "_".join(sorted(raters))
    if new_column_name in df.columns:
        raise ValueError(f"Column '{new_column_name}' already exists in the DataFrame.")

    def _as_list(value):
        # Cells may hold real lists or their string form; parse only the latter.
        return value if isinstance(value, list) else ast.literal_eval(value)

    # Convert cell by cell rather than judging a column by its first row,
    # which fails on an empty frame and misses mixed columns.
    for rater in raters:
        df[rater] = df[rater].apply(_as_list)

    # One ratings matrix per DataFrame row: rows are raters, columns are items.
    kendalls_w = [
        kendall_w(np.array(row_ratings))
        for row_ratings in zip(*(df[rater] for rater in raters))
    ]
    df[new_column_name] = kendalls_w
    return df
# Define the Kendall's W function
def kendall_w(expt_ratings):
    """
    Calculate Kendall's W (coefficient of concordance) for a ratings matrix.

    reference: https://stackoverflow.com/a/48916127/12946268

    With m raters (rows) and n items (columns), the value is computed as
    ``W = 12 * S / (m**2 * (n**3 - n))``, where ``S`` is the sum of squared
    deviations of the per-item rating sums from their mean. No correction
    for tied ranks is applied.

    Parameters:
        expt_ratings (numpy.ndarray or nested sequence): 2D ratings, rows are
            raters and columns are items.

    Returns:
        float: Kendall's W. NaN is returned when the coefficient is undefined
        because the denominator is zero (fewer than two items, or no raters).

    Raises:
        ValueError: If the input ratings matrix is not 2-dimensional.

    Example:
        >>> import numpy as np
        >>> ratings = np.array([
        ...     [1, 5, 2, 3, 4],  # rater 1
        ...     [2, 4, 1, 5, 3],  # rater 2
        ...     [2, 5, 1, 4, 3],  # rater 3
        ... ])
        >>> round(float(kendall_w(ratings)), 4)
        0.8444
    """
    # Accept plain nested lists as well as arrays.
    ratings = np.asarray(expt_ratings)
    if ratings.ndim != 2:
        raise ValueError("Ratings matrix must be 2-dimensional")

    m, n = ratings.shape  # raters, items rated
    denom = m**2 * (n**3 - n)
    if denom == 0:
        # Undefined (0/0); return NaN explicitly instead of dividing by zero.
        return np.nan

    rating_sums = np.sum(ratings, axis=0)
    # n * population variance == sum of squared deviations from the mean.
    S = n * np.var(rating_sums)
    return 12 * S / denom