"""Entry point: OCR a CV PDF and convert it to a Xebia-style YAML CV via GPT."""
# NOTE(review): reconstructed from a whitespace-mangled diff.  The same patch
# also tweaked functions.py (page-break check `if y < 0:` -> `if y < 50:` in
# the size-checking helper, and removed a docstring from yaml_checker that
# wrongly described page-size checking) and added python-dotenv to
# requirements.txt.

import os
from datetime import datetime

from gpt_functions import cv_text_to_yaml, pdf_reader

cv_text = pdf_reader("gpt/test_files/john_smith_cv.pdf")
output_file_name = "john_smith_new_xebia_data_cv"

cv_text_to_yaml(
    cv_text=cv_text, output_dir="./gpt/yaml_output", output_file_name=output_file_name
)


def batch_convert(input_directory: str = "./gpt/pdf") -> None:
    """Convert every PDF in *input_directory* to YAML (one .yml per .pdf).

    This replaces the commented-out batch loop from the original, fixing
    three defects it had: a bare ``except:``, ``now`` referenced before
    assignment when a conversion failed early, and missing ``os``/
    ``datetime`` imports.
    """
    for filename in os.listdir(input_directory):
        path = os.path.join(input_directory, filename)
        try:
            print(f"starting {filename}")
            text = pdf_reader(path)
            cv_text_to_yaml(
                cv_text=text,
                output_dir="./gpt/yaml_output",
                output_file_name=filename.replace(".pdf", ""),
            )
            print(f"{filename} completed at {datetime.now()}")
        except Exception as exc:  # keep going: one bad PDF must not stop the batch
            print(f"{filename} FAILED at {datetime.now()}: {exc}")
"""OCR a CV PDF with Tesseract and interrogate it field-by-field via Azure OpenAI."""
import os
import tempfile
import time

import cv2
import fitz
import openai
import pytesseract
from dotenv import load_dotenv
from questions import questions

load_dotenv()
# NOTE(review): env-var names with hyphens are legal but unusual; kept as-is
# to match the deployed .env files.
deployment_name = os.getenv("DEPLOYMENT-NAME")
openai.api_type = "azure"
openai.api_version = "2023-03-15-preview"
openai.api_base = os.getenv("ENDPOINT")  # Azure OpenAI resource's endpoint value.
openai.api_key = os.getenv("API-KEY")


def pdf_reader(pdf_path: str = None) -> str:
    """OCR every page of *pdf_path* and return the concatenated text.

    Each page is rendered to a 300-dpi JPEG in a temporary directory and run
    through Tesseract; pymupdf's own text extraction is bypassed on purpose
    so that scanned (image-only) CVs work too.
    """
    doc = fitz.open(pdf_path)
    page_texts = []
    try:
        for page in doc.pages():
            with tempfile.TemporaryDirectory() as tmpdir:
                pix = page.get_pixmap(dpi=300)
                path = os.path.join(tmpdir, "image.jpg")
                pix.save(path)
                img = cv2.imread(path)
                page_texts.append(pytesseract.image_to_string(img))
    finally:
        doc.close()  # fix: the original leaked the document if OCR raised
    print(f"extracted text from {pdf_path}" + "\n")
    # Original built the text with repeated `text = text + "\n" + img_text`
    # (quadratic); join produces the identical leading-"\n" result in O(n).
    return "\n".join([""] + page_texts)


def gpt_communicator(text: str = None, question: str = None, verbose=True):
    """Ask *question* about *text* (a CV) and return the model's answer.

    Sleeps 3 s after every call as crude rate limiting against the Azure
    endpoint.
    """
    response = openai.ChatCompletion.create(
        engine=deployment_name,  # deployment name chosen when the model was deployed
        messages=[
            {
                "role": "user",
                "content": f"""
                \"{text}\"

                Based on the CV above:
                {question}
                """,
            }
        ],
    )
    answer = response["choices"][0]["message"]["content"]
    if verbose:
        print(question)
        print(f"===> {answer}")
        print("----------------\n")
    time.sleep(3)
    return answer


def cv_text_to_yaml(
    cv_text: str = None,
    output_dir: str = "../gpt/yaml_output",
    output_file_name: str = None,
) -> str:
    """Ask GPT one question per CV field and write the answers as a YAML CV.

    Writes ``{output_dir}/{output_file_name}.yml`` and returns the YAML text.

    NOTE(review): values are interpolated into YAML by hand, so a double
    quote inside a GPT answer produces invalid YAML — consider building a
    dict and using ``yaml.safe_dump`` instead.
    NOTE(review): nested-field indentation was reconstructed from a
    whitespace-mangled source (2 spaces for ``- item``, 4 for sub-keys) —
    verify against the CV template's expectations.
    """
    text = cv_text

    def ask(key, **fmt):
        # Ask the canned question for *key* (formatted with **fmt), stripped.
        return gpt_communicator(text, questions[key].format(**fmt)).strip()

    def prose(key):
        # Long free-text answers: collapse newlines and double spaces.
        return ask(key).replace("\n", " ").replace("  ", " ").strip()

    def field(key, value, optional=True):
        # One `key: "value"` YAML line.  When *optional*, a GPT answer of
        # "None" means the CV lacks the field and the value is left empty.
        if optional and value == "None":
            return "\n" + f"{key}: "
        return "\n" + f'{key}: "{value}"'

    yaml_text = ""
    yaml_text += field("first_name", ask("first_name"), optional=False)
    yaml_text += field("last_name", ask("last_name"), optional=False)
    yaml_text += field("role", ask("role"), optional=False)
    yaml_text += field("email", ask("email_address"))
    yaml_text += field("phone", ask("phone_number"))
    yaml_text += field("linkedin", ask("linkedin"))
    yaml_text += field("github", ask("github"))
    # website extraction was disabled in the original and stays disabled.
    yaml_text += field("about_me", prose("about_me"), optional=False)

    yaml_text += "\n" + "education:"
    degrees = ask("education_degrees")
    # fix: the original looped over "None" and emitted `- degree: "None"`;
    # guard like the certifications section already did.
    if degrees != "None":
        for degree in degrees.split(","):
            degree = degree.strip()
            yaml_text += field("  - degree", degree, optional=False)
            yaml_text += field("    institution", ask("education_school", degree=degree))
            yaml_text += field("    year", ask("education_year", degree=degree))

    yaml_text += field("biography", prose("biography"), optional=False)

    yaml_text += "\n" + "roles:"
    for role in ask("roles").split(","):
        role = role.strip()
        yaml_text += field("  - title", role, optional=False)
        yaml_text += field(
            "    description", ask("role_description", role=role), optional=False
        )

    yaml_text += "\n" + "certifications:"
    certifications = ask("certifications")
    if certifications != "None":
        for certification in certifications.split(","):
            yaml_text += field("  - title", certification.strip(), optional=False)

    yaml_text += "\n" + "competences:"
    for title in ask("competences_titles").split(","):
        title = title.strip()
        yaml_text += field("  - title", title, optional=False)
        yaml_text += field(
            "    description",
            ask("competences", competences_title=title),
            optional=False,
        )

    yaml_text += "\n" + "experience:"
    for company in ask("companies").split(","):
        company = company.strip()
        work = (
            ask("company_work", company=company)
            # indent continuation lines under `description:`; normalise the
            # OCR artefacts the prompt asks GPT to turn into bullets.
            .replace("\n", "\n      ")
            .replace("¢", "•")
            .replace("* ", "• ")
            .replace("+ ", "• ")
            .replace("« ", "• ")
        )
        yaml_text += field("  - title", ask("company_role", company=company), optional=False)
        yaml_text += field("    company", company, optional=False)
        yaml_text += field("    start", ask("company_start", company=company))
        yaml_text += field("    end", ask("company_end", company=company))
        yaml_text += field("    description", work, optional=False)
        yaml_text += field("    technologies", ask("company_technologies", company=company))
        yaml_text += "\n" + "    visible: true"

    os.makedirs(output_dir, exist_ok=True)  # fix: output dir may not exist yet
    with open(f"{output_dir}/{output_file_name}.yml", "w") as text_file:
        text_file.write(yaml_text)

    return yaml_text
# Prompt templates sent to GPT, one per CV field.  Keys match the fields
# emitted by cv_text_to_yaml; {placeholders} are filled via str.format.
# Typo fixes vs. the original prompts: "seperated" -> "separated",
# "persons" -> "person's", "stange" -> "strange", plus grammar
# ("section says?" -> "say?", "did this person started/finished working"
# -> "start/finish working").
questions = {
    "first_name": """What's this person's first name? Give me only the name.""",
    "last_name": """What's this person's last name? Give me only the name.""",
    "role": """What's this person's main role? Don't tell me "Data Analyst". Your answer must be one of these titles: "Analytics Engineer", "Machine Learning Engineer", "Data Scientist", "Data Engineer", "Analytics Translator". Give me only the title.""",
    "about_me": """What does this person's "About Me" section say? If you cannot find an "About Me" section, generate an appropriate one based on this person's CV. Keep it short and informal, no more than 75 words. Give me only the text.""",
    "education_degrees": """What are this person's education degrees? Remove commas in degree names if any. Give me only the degree names, separated by commas. Order them from newest to oldest. If not provided, answer "None". """,
    "education_year": """When did they study {degree}? Give me your answer in a format like "2000 - 2004". If not provided, answer "None".""",
    "education_school": """What is the name of the school they studied {degree}? Give me only the school name. If not provided, answer "None".""",
    "biography": """What does this person's "Biography" section say? If you cannot find a "Biography" section, generate an appropriate one based on this person's CV. Keep it short, no more than 100 words. Give me only the text.""",
    "roles": """What roles are listed in the Roles section of the CV? Give me no more than 4 role titles. If not provided in the CV, give me 3 appropriate role titles. Don't include company names or the work they do, give only generic work titles like Data Engineer, Data Analyst, Data Scientist etc. Give me the role titles, separated by commas.""",
    "role_description": """What is the description of their {role} role in the CV? If they did not provide this in the CV, write an appropriate one, about 20 words long. Keep it generic, not specific for a company they worked for. Give me only the text.""",
    "certifications": """What certifications does this person have? Give me only the certifications, separated by commas. If not provided, answer "None".""",
    "competences_titles": """What are the competences titles listed in the Competences section of the CV? Such as "Programming, Tech, Languages". Don't give me specific competences like SQL, Python etc, give me more generic titles like Programming, Cloud, Languages. Don't include "Education" and "Certification" in these titles. Give me only the titles, separated by commas. If competences titles are not explicitly listed in the CV, select applicable ones from "Programming, Data Stack, Visualization, Cloud, Tech, Languages" """,
    "competences": """What are this person's competences that would fall under {competences_title}? Give me only the competences, separated by commas.""",
    "companies": """What companies did this person work for? Remove commas in company names if any. Give me only the company names, separated by commas. Order them from newest to oldest.""",
    "company_role": """What was this person's job title at {company}? Don't include what they do, or company name etc. Give me a generic job title like Data Engineer, Analytics Engineer, Data Scientist, Machine Learning Engineer etc. If the job title is not explicitly provided in the CV, try to come up with an appropriate one. Give me only the job title.""",
    "company_start": """When did this person start working at {company}? Give me only the year and the month, in "2000 June" format. Don't write anything else! If not provided, answer "None".""",
    "company_end": """When did this person finish working at {company}? Give me only the year and the month, in "2000 June" format. Don't write anything else! If it says present, answer "Present". If nothing about end date is provided, answer "None".""",
    "company_work": """What did this person do at {company}? Give me the text block exactly as it is written in the CV, don't add anything yourself, remove the technologies that are mentioned in the end if they are mentioned. Don't include company name, role title or year/month they worked there if they are mentioned in the text. If there are strange characters in the original text, like "¢, *, +, «", replace them with "•" if you think they are meant to be bulletpoints.""",
    "company_technologies": """What technologies did this person use at {company}? These should be provided in the CV, if not, write related technologies yourself. Give me only the names of technologies, separated by commas. If you cannot find related technologies, answer "None".""",
    "email_address": """What's this person's email address? Give me only the address. If not provided, answer "None".""",
    "phone_number": """What's this person's phone number? Give me only the number. If not provided, answer "None".""",
    "linkedin": """What's this person's linkedin address? Give me only the address, cut everything before "linkedin" in the address. If not provided, answer "None".""",
    "github": """What's this person's github address? Give me only the address, cut everything before "github" in the address. If not provided, answer "None".""",
}
+ - title: "SQL server database developer" + description: "Develop and optimize SQL server database systems to maximize performance and efficiency while ensuring data accuracy and integrity for clientele." +certifications: + - title: "CCA Cloudera Certified Associate" +competences: + - title: "SKILLS" + description: "SQL, Java, Apache Spark, Hadoop, Python, Coding" + - title: "LANGUAGES" + description: "English, German" + - title: "HOBBIES" + description: "Cycling, Songwriting, Running" +experience: + - title: "Data Engineer" + company: "FNB Nong Phai" + start: "2018 January" + end: "2018 December" + description: "Responsible for scaling machine learning models and making these models fit within banking environments. Implemented natural language processing tools to ensure machine-readable databases were ready for the team of data scientists." + technologies: "SQL, Spark, MongoDB, sci-kit-learn, Tensorflow, and Keras" + visible: true + - title: "Data Engineer" + company: "ABSA Washington" + start: "2015 January" + end: "2017 December" + description: "Responsible for developing database triggers, packages, functions, and stored procedures using PL/SQL and maintain the scripts for various data feeds across multiple regional and international offices of the company + + • Co-develop a SQL server database system to maximize performance benefits for clientele. + + • Assisted senior-level Data Scientists in the design of ETL processes, including SSIS packages. + + • Developed coherent Logical Data Models that helped guide important client business decisions. + + • Collaborate and coordinate with development teams to deploy data quality solutions and create and maintain standard operating procedure documentation." 
+ technologies: "PL/SQL, SQL server, ETL processes, SSIS packages" + visible: true diff --git a/requirements.txt b/requirements.txt index 282baf9..3431b68 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +python-dotenv black pyyaml reportlab<4.0.0 diff --git a/streamlit_app.py b/streamlit_app.py index 2aa06dc..0b24501 100644 --- a/streamlit_app.py +++ b/streamlit_app.py @@ -59,7 +59,7 @@ ## Footnote st.caption( """ -⚠️Note: Please save your `YAML input` locally, refreshing this page will reset the input to the default values and the loss of any data you have entered. +⚠️Note: Please save your `YAML input` locally, refreshing this page will reset the input to the default values and any data you have entered will be lost. Developed By: Erkan Celen """