This repository has been archived by the owner on Mar 19, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
193 lines (150 loc) · 6.1 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import pandas as pd
import openai
import json
import os
import streamlit as st
from cluster_skills import cluster_skills
import warnings
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
load_dotenv()
# Read the API key from an environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")
def skills_to_json_list(*skills):
"""
Convert skills into a JSON-encoded list.
Args:
*skills: Variable arguments of skill strings.
Returns:
str: A JSON-encoded string representing the list of skills.
"""
return json.dumps(list(skills))
def call_openai_api(skills_text):
"""
Make an API call to OpenAI to process the skills text.
Args:
skills_text (str): Text containing skills.
Returns:
list: Extracted skills from the text.
"""
functions = [
{
"name": "skills_to_json_list",
"description": "Convert a variable number of skills into a JSON-encoded list.",
"parameters": {
"type": "object",
"properties": {
"skills": {
"type": "array",
"description": "A variable number of skill strings.",
"items": {
"type": "string"
}
}
},
"required": ["skills"]
}
}
]
try:
response = openai.ChatCompletion.create(
model='gpt-3.5-turbo',
messages=[{'role': 'user', 'content': skills_text}],
functions=functions,
function_call={"name": "skills_to_json_list"}
)
arguments = json.loads(response["choices"][0]["message"]["function_call"]["arguments"])
return arguments["skills"]
except Exception as e:
print(f"Error calling OpenAI API: {e}")
return []
def extract_skills_from_csv(csv_path):
# Read the CSV
df = pd.read_csv(csv_path)
# df = df.head(6) # For demonstration, take only the first 5 rows
print(f"Initial DataFrame:\n{df}\n") # Debugging print
raw_skills = set()
individual_skills = {}
skills_taxonomy = set()
for _, row in df.iterrows():
skills = call_openai_api(row['Skill Sets'])
print(f"Skills for {row['Name']}: {skills}") # Debugging print
raw_skills.update(skills)
individual_skills[row['Name']] = skills
# Update individual_skills.csv in real-time
individual_skills_df = pd.DataFrame({
'Name': individual_skills.keys(),
'Skills': [', '.join(skills) for skills in individual_skills.values()]
})
individual_skills_df.to_csv("individual_skills.csv", index=False)
# Cluster skills after gathering all of them
skills_list = list(raw_skills)
print(f"Total unique skills gathered: {len(skills_list)}")
clusters = cluster_skills(skills_list)
skill_to_cluster_label = {skill: cluster_name for cluster_name, skills in clusters for skill in skills}
# Update the skills_taxonomy
skills_taxonomy = set(skill_to_cluster_label.values())
# Write to skills_taxonomy.txt
with open("skills_taxonomy.txt", "w") as f:
for skill in skills_taxonomy:
f.write(skill + "\n")
# Update individual_skills with the clustered skills names
for name, skills in individual_skills.items():
clustered_skills = [skill_to_cluster_label.get(skill, skill) for skill in skills]
individual_skills[name] = list(set(clustered_skills)) # Remove duplicates
return skills_taxonomy, individual_skills
def extract_skills_from_csv_in_realtime(uploaded_file):
# Read the entire CSV
df = pd.read_csv(uploaded_file)
raw_skills = set()
# Create a placeholder dataframe and placeholder for the table
current_df = pd.DataFrame(columns=['Name', 'Skills'])
table_placeholder = st.empty()
# df = df.head(6) # For demonstration, take only the first 5 rows
# Initialize progress bar and status text
progress_bar = st.progress(0)
status_text = st.empty()
total_rows = len(df)
# Process row-by-row
for index, row in df.iterrows():
skills = call_openai_api(row['Skill Sets'])
raw_skills.update(skills)
# Cluster skills immediately
skills_list = list(raw_skills)
clusters = cluster_skills(skills_list)
skill_to_cluster_label = {skill: cluster_name for cluster_name, skills in clusters for skill in skills}
clustered_skills = [skill_to_cluster_label.get(skill, skill) for skill in skills]
skills_str = ', '.join(set(clustered_skills))
# Add the processed data to current_df and display the table
current_df.loc[index] = [row['Name'], skills_str]
table_placeholder.table(current_df)
# Update progress bar and status text
progress = (index + 1) / total_rows
progress_bar.progress(progress)
status_text.text(f"Processing row {index + 1} of {total_rows}...")
# Update skills_taxonomy to only consist of the cluster names
skills_taxonomy = set(skill_to_cluster_label.values())
# Clear status text
status_text.text(f"Processed {total_rows} rows!")
return skills_taxonomy, current_df
def main():
csv_path = "skills.csv" # Replace with the path to your CSV
skills_taxonomy, individual_skills = extract_skills_from_csv(csv_path)
print("Skills Taxonomy:")
for skill in skills_taxonomy:
print(skill)
print("\nIndividual Skills:")
for name, skills in individual_skills.items():
print(f"{name}: {', '.join(skills)}")
# Save skills_taxonomy in a .txt file
with open("skills_taxonomy.txt", "w") as f:
for skill in skills_taxonomy:
f.write(skill + "\n")
# Save individual_skills in a .csv file
individual_skills_df = pd.DataFrame({
'Name': individual_skills.keys(),
'Skills': [', '.join(skills) for skills in individual_skills.values()]
})
individual_skills_df.to_csv("individual_skills.csv", index=False)
if __name__ == "__main__":
main()