Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new gpt cv conversion functionality #19

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def size_checker(y: int, text):
y = size_checker(y, text=exp["technologies"])
y -= 25
print(y)
if y < 0:
if y < 50:
return True
else:
return False
Expand Down Expand Up @@ -280,11 +280,6 @@ def generate_pptx_from_pdf(


def yaml_checker(yaml):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pydantic model 🥸

"""
This function validates the parsed CV yaml: it asserts that the
mandatory fields (e.g. 'first_name') are present before the yaml is used.
"""

assert (
"first_name" in yaml
), "'first_name' field is missing in yaml. this is a mandatory field."
Expand Down
23 changes: 23 additions & 0 deletions gpt/gpt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Driver script: OCR a CV PDF and convert it to the project's YAML format.

Single-file conversion runs under the ``__main__`` guard; the old
commented-out batch loop is replaced by ``convert_directory`` below,
fixing its bare ``except`` and the ``now`` variable that could be
referenced before assignment when a conversion failed early.
"""
import os
from datetime import datetime

from gpt_functions import pdf_reader, cv_text_to_yaml


def convert_directory(input_directory: str, output_dir: str = "./gpt/yaml_output") -> None:
    """Convert every ``.pdf`` in *input_directory* to YAML, logging failures.

    One bad CV must not stop the batch, so each file is converted inside
    its own try/except; the failure is printed with a timestamp taken at
    log time (so it can never be unbound).
    """
    for filename in os.listdir(input_directory):
        if not filename.lower().endswith(".pdf"):
            continue  # skip non-PDF files instead of crashing on them
        path = os.path.join(input_directory, filename)
        try:
            print(f"starting {filename}")
            text = pdf_reader(path)
            cv_text_to_yaml(
                cv_text=text,
                output_dir=output_dir,
                output_file_name=filename[: -len(".pdf")],
            )
            print(f"{filename} completed at {datetime.now()}")
        except Exception as exc:  # best-effort batch: log and continue
            print(f"{filename} FAILED at {datetime.now()}: {exc}")


if __name__ == "__main__":
    # Original single-file behavior: OCR one test CV and emit its YAML.
    cv_text = pdf_reader("gpt/test_files/john_smith_cv.pdf")
    output_file_name = "john_smith_new_xebia_data_cv"

    cv_text_to_yaml(
        cv_text=cv_text, output_dir="./gpt/yaml_output", output_file_name=output_file_name
    )
219 changes: 219 additions & 0 deletions gpt/gpt_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
import openai
import fitz
import pytesseract
import cv2
import tempfile
import os
from dotenv import load_dotenv
from questions import questions
import time

# Azure OpenAI configuration, read from the environment (.env via python-dotenv).
load_dotenv()
deployment_name = os.getenv("DEPLOYMENT-NAME")  # name of the deployed chat model
openai.api_type = "azure"
openai.api_version = "2023-03-15-preview"  # pinned preview API version
openai.api_base = os.getenv("ENDPOINT")  # Your Azure OpenAI resource's endpoint value.
openai.api_key = os.getenv("API-KEY")


def pdf_reader(pdf_path: str = None) -> str:
    """OCR every page of a PDF and return the concatenated text.

    Each page is rendered to a 300-dpi JPEG with PyMuPDF, then run
    through Tesseract OCR. The output begins with a newline and pages
    are joined by newlines, matching the original accumulation format.

    Args:
        pdf_path: path to the PDF file to read.

    Returns:
        The OCR'd text of all pages ("" plus "\\n<page text>" per page).

    Raises:
        ValueError: if pdf_path is not given (the original silently
            opened an empty in-memory document and returned "").
    """
    if pdf_path is None:
        raise ValueError("pdf_path is required")
    doc = fitz.open(pdf_path)
    # Leading "" reproduces the original output's leading newline.
    page_texts = [""]
    try:
        for page in doc.pages():
            with tempfile.TemporaryDirectory() as tmpdir:
                image_path = os.path.join(tmpdir, "image.jpg")
                # Render at 300 dpi for OCR quality, save to a temp JPEG.
                page.get_pixmap(dpi=300).save(image_path)
                page_texts.append(pytesseract.image_to_string(cv2.imread(image_path)))
    finally:
        # Close the document even if rendering/OCR raises
        # (the original leaked the handle on error).
        doc.close()
    print(f"extracted text from {pdf_path}" + "\n")
    return "\n".join(page_texts)


def gpt_communicator(text: str = None, question: str = None, verbose=True) -> str:
    """Ask the Azure OpenAI chat model one *question* about the CV *text*.

    Args:
        text: the raw CV text the question is about.
        question: a single natural-language question (see questions.py).
        verbose: when True, print the question and the model's answer.

    Returns:
        The model's answer as a string.
    """
    response = openai.ChatCompletion.create(
        engine=deployment_name,  # The deployment name you chose when you deployed the ChatGPT or GPT-4 model.
        messages=[
            {
                "role": "user",
                "content": f"""
            \"{text}\"

            Based on the CV above:
            {question}
            """,
            }
        ],
    )

    answer = response["choices"][0]["message"]["content"]
    if verbose:
        print(question)
        print(f"===> {answer}")
        print("----------------\n")
    # NOTE(review): unconditional 3s pause between calls — presumably
    # rate-limit throttling for the Azure endpoint; confirm.
    time.sleep(3)
    return answer


def cv_text_to_yaml(
    cv_text: str = None,
    output_dir: str = "../gpt/yaml_output",
    output_file_name: str = None,
) -> str:
    """Interrogate GPT field-by-field about a CV and emit the project YAML.

    Each field of the target YAML schema (name, role, contact details,
    education, roles, certifications, competences, experience) is obtained
    by a separate gpt_communicator() call, then appended to one growing
    YAML string. Optional fields answered "None" by the model are emitted
    with an empty value. The result is written to
    ``{output_dir}/{output_file_name}.yml`` and also returned.

    Args:
        cv_text: raw CV text, typically from pdf_reader().
        output_dir: directory the .yml file is written into (must exist).
        output_file_name: file name without the .yml extension.

    Returns:
        The full YAML document as a string.
    """
    yaml_text = ""
    text = cv_text

    # --- header / contact fields ------------------------------------
    first_name = gpt_communicator(text=text, question=questions["first_name"]).strip()
    yaml_text += "\n" + f'first_name: "{first_name}"'

    last_name = gpt_communicator(text, questions["last_name"]).strip()
    yaml_text += "\n" + f'last_name: "{last_name}"'

    role = gpt_communicator(text, questions["role"]).strip()
    yaml_text += "\n" + f'role: "{role}"'

    # The prompts instruct the model to answer the literal string "None"
    # for missing fields; such fields are emitted with an empty value.
    email_address = gpt_communicator(text, questions["email_address"]).strip()
    if email_address == "None":
        yaml_text += "\n" + f"email: "
    else:
        yaml_text += "\n" + f'email: "{email_address}"'

    phone_number = gpt_communicator(text, questions["phone_number"]).strip()
    if phone_number == "None":
        yaml_text += "\n" + f"phone: "
    else:
        yaml_text += "\n" + f'phone: "{phone_number}"'

    linkedin = gpt_communicator(text, questions["linkedin"]).strip()
    if linkedin == "None":
        yaml_text += "\n" + f"linkedin: "
    else:
        yaml_text += "\n" + f'linkedin: "{linkedin}"'

    github = gpt_communicator(text, questions["github"]).strip()
    if github == "None":
        yaml_text += "\n" + f"github: "
    else:
        yaml_text += "\n" + f'github: "{github}"'

    # website = gpt_communicator(text, questions['website']).strip()
    # if website == "None":
    #     yaml_text += '\n' + f'website: '
    # else:
    #     yaml_text += '\n' + f'website: "{website}"'

    # --- free-text sections -----------------------------------------
    # NOTE(review): .replace(" ", " ") is a no-op as written — it was
    # probably meant to collapse double spaces ("  " -> " "); confirm
    # against the original file (whitespace may have been mangled).
    about_me = (
        gpt_communicator(text, questions["about_me"])
        .replace("\n", " ")
        .replace(" ", " ")
        .strip()
    )
    yaml_text += "\n" + f'about_me: "{about_me}"'

    # --- education: one nested entry per degree ---------------------
    education_degrees = gpt_communicator(text, questions["education_degrees"]).strip()
    yaml_text += "\n" + f"education:"
    for degree in education_degrees.split(","):
        degree = degree.strip()
        education_year = gpt_communicator(
            text, questions["education_year"].format(degree=degree)
        ).strip()
        education_school = gpt_communicator(
            text, questions["education_school"].format(degree=degree)
        ).strip()
        yaml_text += "\n" + f' - degree: "{degree}"'
        if education_school == "None":
            yaml_text += "\n" + f" institution: "
        else:
            yaml_text += "\n" + f' institution: "{education_school}"'
        if education_year == "None":
            yaml_text += "\n" + f" year: "
        else:
            yaml_text += "\n" + f' year: "{education_year}"'

    # NOTE(review): same no-op-looking .replace(" ", " ") as about_me.
    biography = (
        gpt_communicator(text, questions["biography"])
        .replace("\n", " ")
        .replace(" ", " ")
        .strip()
    )
    yaml_text += "\n" + f'biography: "{biography}"'

    # --- roles: title + generated description per role --------------
    roles = gpt_communicator(text, questions["roles"]).strip()
    yaml_text += "\n" + f"roles:"
    for role in roles.split(","):
        role = role.strip()
        role_description = gpt_communicator(
            text, questions["role_description"].format(role=role)
        ).strip()
        yaml_text += "\n" + f' - title: "{role}"'
        yaml_text += "\n" + f' description: "{role_description}"'

    # --- certifications (section header emitted even when empty) ----
    certifications = gpt_communicator(text, questions["certifications"]).strip()
    yaml_text += "\n" + f"certifications:"
    if certifications != "None":
        for certification in certifications.split(","):
            certification = certification.strip()
            yaml_text += "\n" + f' - title: "{certification}"'

    # --- competences: one entry per competence group ----------------
    competences_titles = gpt_communicator(text, questions["competences_titles"]).strip()
    yaml_text += "\n" + f"competences:"
    for competences_title in competences_titles.split(","):
        competences_title = competences_title.strip()
        competences = gpt_communicator(
            text, questions["competences"].format(competences_title=competences_title)
        ).strip()
        yaml_text += "\n" + f' - title: "{competences_title}"'
        yaml_text += "\n" + f' description: "{competences}"'

    # --- experience: role/start/end/description/tech per company ----
    companies = gpt_communicator(text, questions["companies"]).strip()
    yaml_text += "\n" + f"experience:"
    for company in companies.split(","):
        company = company.strip()
        company_role = gpt_communicator(
            text, questions["company_role"].format(company=company)
        ).strip()
        company_start = gpt_communicator(
            text, questions["company_start"].format(company=company)
        ).strip()
        company_end = gpt_communicator(
            text, questions["company_end"].format(company=company)
        ).strip()
        # "¢", "* ", "+ ", "« " are OCR artifacts for bullet points
        # (see the company_work prompt in questions.py) — normalized to "•".
        company_work = (
            gpt_communicator(text, questions["company_work"].format(company=company))
            .strip()
            .replace("\n", "\n ")
            .replace("¢", "•")
            .replace("* ", "• ")
            .replace("+ ", "• ")
            .replace("« ", "• ")
        )
        company_technologies = gpt_communicator(
            text, questions["company_technologies"].format(company=company)
        ).strip()

        yaml_text += "\n" + f' - title: "{company_role}"'
        yaml_text += "\n" + f' company: "{company}"'
        if company_start == "None":
            yaml_text += "\n" + f" start: "
        else:
            yaml_text += "\n" + f' start: "{company_start}"'
        if company_end == "None":
            yaml_text += "\n" + f" end: "
        else:
            yaml_text += "\n" + f' end: "{company_end}"'
        yaml_text += "\n" + f' description: "{company_work}"'
        if company_technologies == "None":
            yaml_text += "\n" + f" technologies: "
        else:
            yaml_text += "\n" + f' technologies: "{company_technologies}"'
        yaml_text += "\n" + f" visible: true"

    # Persist the document; output_dir must already exist.
    with open(f"{output_dir}/{output_file_name}.yml", "w") as text_file:
        text_file.write(yaml_text)

    return yaml_text
7 changes: 7 additions & 0 deletions gpt/gpt_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
openai
python-dotenv
pymupdf
pypdf
opencv-python
pytesseract
pdf2image
ppt2pdf
25 changes: 25 additions & 0 deletions gpt/questions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Prompt templates sent (one at a time) to the chat model by
# cv_text_to_yaml(). Fields that may be absent instruct the model to
# answer the literal string "None", which the caller checks for.
# Entries containing {placeholders} are filled with str.format() before use.
questions = {
    # --- identity & headline ---------------------------------------
    "first_name": """What's this persons first name? Give me only the name.""",
    "last_name": """What's this persons last name? Give me only the name.""",
    "role": """What's this persons main role? Don't tell me "Data Analyst". Your answer must be one of these titles: "Analytics Engineer", "Machine Learning Engineer", "Data Scientist", "Data Engineer", "Analytics Translator". Give me only the title.""",
    # --- free-text sections ----------------------------------------
    "about_me": """What does this persons "About Me" section say? If you cannot find an "About Me" section, generate an appropriate one based on this persons CV. Keep it short and informal, no more than 75 words. Give me only the text.""",
    # --- education (expects {degree} placeholder in the follow-ups) --
    "education_degrees": """What are this persons education degrees? Remove commas in degree names if any. Give me only the degree names, seperated by commas. Order them from newest to oldest. If not provided, answer "None". """,
    "education_year": """When did they study {degree}? Give me your answer in a format like "2000 - 2004". If not provided, answer "None".""",
    "education_school": """What is the name of the school they studied {degree}? Give me only the school name. If not provided, answer "None".""",
    "biography": """What does this persons "Biography" section says? If you cannot find a "Biography" section, generate an appropriate one based on this persons CV. Keep it short, no more than 100 words. Give me only the text.""",
    # --- roles & certifications ------------------------------------
    "roles": """What roles are listed in the Roles section of the CV? Give me no more than 4 role titles. If not provided in the CV, give me 3 appropriate role titles. Don't include company names or the work they do, give only generic work titles like Data Engineer, Data Analyst, Data Scientist etc. Give me the role titles, seperated by commas.""",
    "role_description": """What is the description of their {role} role in the CV? If they did not provide this in the CV, write an appropriate one, about 20 words long. Keep it generic, not specific for a company they worked for. Give me only the text.""",
    "certifications": """What certifications does this person have? Give me only the certifications, seperated by commas. If not provided, answer "None".""",
    # --- competences ({competences_title} placeholder) --------------
    "competences_titles": """What are the competences titles listed in the Competences section of the CV? Such as "Programming, Tech, Languages". Don't give me specific competences like SQL, Python etc, give me more generic titles like Programming, Cloud, Languages. Don't include "Education" and "Certification" in these titles. Give me only the titles, seperated by commas. If competences titles are not explicitly listed in the CV, select applicable ones from "Programming, Data Stack, Visualization, Cloud, Tech, Languages" """,
    "competences": """What are this persons competences that would fall under {competences_title}? Give me only the competences, seperated by commas.""",
    # --- work experience ({company} placeholder) --------------------
    "companies": """What companies did this person work for? Remove commas in company names if any. Give me only the company names, seperated by commas. Order them from newest to oldest.""",
    "company_role": """What was this persons job title at {company}? Don't include what they do, or company name etc. Give me a generic job title like Data Engineer, Analytics Engineer, Data Scientist, Machine Learning Engineer etc. If the job title is not explicitly provided in the CV, try to come up with an appropriate one. Give me only the job title.""",
    "company_start": """When did this person started working at {company}? Give me only the year and the month, in "2000 June" format. Don't write anything else! If not provided, answer "None".""",
    "company_end": """When did this person finished working at {company}? Give me only the year and the month, in "2000 June" format. Don't write anything else! If it says present, answer "Present". If nothing about end date is provided, answer "None".""",
    "company_work": """What did this person do at {company}? Give me the text block exactly as it is written in the CV, don't add anything yourself, remove the technologies that are mentioned in the end if they are mentioned. Don't include company name, role title or year/month they worked there if they are mentioned in the text. If there are stange characters in the original text, like "¢, *, +, «", replace them with "•" if you think they are meant to be bulletpoints.""",
    "company_technologies": """What technologies did this person use at {company}? These should be provided in the CV, if not, write related technologies yourself. Give me only the names of technologies, seperated by commas. If you cannot find related technologies, answer "None".""",
    # --- contact details -------------------------------------------
    "email_address": """What's this persons email address? Give me only the address. If not provided, answer "None".""",
    "phone_number": """What's this persons phone number? Give me only the number. If not provided, answer "None".""",
    "linkedin": """What's this persons linkedin address? Give me only the address, cut everything before "linkedin" in the address. If not provided, answer "None".""",
    "github": """What's this persons github address? Give me only the address, cut everything before "github" in the address. If not provided, answer "None".""",
}
Binary file added gpt/test_files/john_smith_cv.pdf
Binary file not shown.
52 changes: 52 additions & 0 deletions gpt/yaml_output/john_smith_new_xebia_data_cv.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@

first_name: "John"
last_name: "Smith"
role: "Data Engineer"
email: "[email protected]"
phone: "3868683442"
linkedin:
github:
website:
about_me: "Hi there! I'm a dedicated Data Engineer with over 5 years of experience working with large datasets and building robust databases. I'm passionate about utilizing my skills in SQL, Java, and Python to create game-changing insights for businesses. Besides work, you can find me cycling, songwriting, and running. Excited to see what new challenges lie ahead!"
education:
- degree: "BS Computer Science"
institution: "Texas University"
year:
biography: "John Smith is a dedicated Data Engineer with over 5 years of experience working with large datasets. He has a proven track record of building robust databases and implementing natural language processing tools to support data scientists. John is highly skilled in SQL, Java, Apache Spark, Hadoop, and Python, and is fluent in English and German. He holds a Bachelor's degree in Computer Science from Texas University, with a dual concentration in Machine Learning and a Business Foundations Certificate. His passion for delivering game-changing insights extends to his hobbies, including cycling, songwriting, and running."
roles:
- title: "Data Engineer"
description: "Dedicated Data Engineer with 5+ years' experience in designing, implementing, and maintaining scalable databases and ETL processes for large datasets, seeking to deliver game-changing insights."
- title: "SQL server database developer"
description: "Develop and optimize SQL server database systems to maximize performance and efficiency while ensuring data accuracy and integrity for clientele."
certifications:
- title: "CCA Cloudera Certified Associate"
competences:
- title: "SKILLS"
description: "SQL, Java, Apache Spark, Hadoop, Python, Coding"
- title: "LANGUAGES"
description: "English, German"
- title: "HOBBIES"
description: "Cycling, Songwriting, Running"
experience:
- title: "Data Engineer"
company: "FNB Nong Phai"
start: "2018 January"
end: "2018 December"
description: "Responsible for scaling machine learning models and making these models fit within banking environments. Implemented natural language processing tools to ensure machine-readable databases were ready for the team of data scientists."
technologies: "SQL, Spark, MongoDB, sci-kit-learn, Tensorflow, and Keras"
visible: true
- title: "Data Engineer"
company: "ABSA Washington"
start: "2015 January"
end: "2017 December"
description: "Responsible for developing database triggers, packages, functions, and stored procedures using PL/SQL and maintain the scripts for various data feeds across multiple regional and international offices of the company

• Co-develop a SQL server database system to maximize performance benefits for clientele.

• Assisted senior-level Data Scientists in the design of ETL processes, including SSIS packages.

• Developed coherent Logical Data Models that helped guide important client business decisions.

• Collaborate and coordinate with development teams to deploy data quality solutions and create and maintain standard operating procedure documentation."
technologies: "PL/SQL, SQL server, ETL processes, SSIS packages"
visible: true
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
python-dotenv
black
pyyaml
reportlab<4.0.0
Expand Down
2 changes: 1 addition & 1 deletion streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
## Footnote
st.caption(
"""
⚠️Note: Please save your `YAML input` locally, refreshing this page will reset the input to the default values and the loss of any data you have entered.
⚠️Note: Please save your `YAML input` locally, refreshing this page will reset the input to the default values and any data you have entered will be lost.

Developed By: Erkan Celen
"""
Expand Down