diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..5ce3dbe --- /dev/null +++ b/.coveragerc @@ -0,0 +1,17 @@ +[run]
+# Exclude test files and package __init__.py files from the coverage report
+# (an inline comment inside a multi-line value would become part of the pattern)
+omit =
+    */tests/*
+    */test_*.py
+    */__init__.py
+
+# Include source files only from certain directories
+source =
+    easy_letters
+    notebooks
+
+# Set parallel to true if you run tests in parallel
+parallel = True
+
+# Set to True to also measure branch coverage
+branch = False
diff --git a/.editorconfig b/.editorconfig
index 09c3767..a13ed49 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -3,27 +3,29 @@
 # Top-most EditorConfig file
 root = true

-# Python specific settings, complying with PEP 8 style guide
+# Global settings (applicable to all files unless overridden)
+[*]
+charset = utf-8 # Default character encoding
+end_of_line = lf # Use LF for line endings (Unix-style)
+indent_style = space # Use spaces for indentation
+indent_size = 4 # Default indentation size
+insert_final_newline = true # Make sure files end with a newline
+trim_trailing_whitespace = true # Remove trailing whitespace
+
+# Python specific settings, complying with PEP 8 style guide, except for the line length
 [*.py]
-indent_size = 4
-max_line_length = 80
+max_line_length = 100

 # Markdown files
 [*.md]
-trim_trailing_whitespace = false
-
-# Bash scripts
-[*.sh]
-indent_size = 2
-
-# SQL files
-[*.sql]
-indent_size = 2
+trim_trailing_whitespace = false # Don't remove trailing whitespace in Markdown files
+max_line_length = 120

 # YAML files
-[*.yml]
+[*.{yaml,yml}]
 indent_size = 2

-# JSON files
-[*.json]
+# Shell scripts
+[*.sh]
 indent_size = 2
+indent_style = tab
diff --git a/.gitattributes b/.gitattributes index fe04a12..5b08caa 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,8 +1,73 @@ -# Adding the following lines to the .gitattributes file will tell Git to treat the files as binary data.
-*.text filter=lfs diff=lfs merge=lfs -text
-*.png filter=lfs diff=lfs merge=lfs -text
+# Common document and text file formats
+*.docx filter=lfs diff=lfs merge=lfs -text
+*.doc filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.djvu filter=lfs diff=lfs merge=lfs -text
+*.eps filter=lfs diff=lfs merge=lfs -text
+*.odt filter=lfs diff=lfs merge=lfs -text
+*.rtf filter=lfs diff=lfs merge=lfs -text
+*.ps filter=lfs diff=lfs merge=lfs -text
+*.xls filter=lfs diff=lfs merge=lfs -text
+*.xlsx filter=lfs diff=lfs merge=lfs -text
+*.ppt filter=lfs diff=lfs merge=lfs -text
+*.pptx filter=lfs diff=lfs merge=lfs -text
+
+# Common image formats
 *.jpg filter=lfs diff=lfs merge=lfs -text
 *.jpeg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
 *.gif filter=lfs diff=lfs merge=lfs -text
-*.csv filter=lfs diff=lfs merge=lfs -text
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+*.tif filter=lfs diff=lfs merge=lfs -text
+*.svgz filter=lfs diff=lfs merge=lfs -text
+
+# Common compressed file formats
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+
+# Common file formats in machine learning projects
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.tfrecord filter=lfs diff=lfs merge=lfs -text
+*.hdf5 filter=lfs diff=lfs merge=lfs -text
+*.keras filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+
+# Common audio and video formats
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.avi filter=lfs diff=lfs merge=lfs -text
+*.mov filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mkv filter=lfs diff=lfs merge=lfs -text
+*.webm filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.ogv filter=lfs diff=lfs merge=lfs -text
+
+# Common data transfer formats
+#*.csv filter=lfs diff=lfs merge=lfs -text
+#*.tsv filter=lfs diff=lfs merge=lfs -text
+#*.json filter=lfs diff=lfs merge=lfs -text
+#*.xml filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
+*.feather filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.avro filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.orc filter=lfs diff=lfs merge=lfs -text
+
+# Exclude files from language stats (GitHub Linguist)
+*.ipynb linguist-vendored
diff --git a/.github/workflows/publish_to_pypi.yml b/.github/workflows/publish.yml
similarity index 78%
rename from .github/workflows/publish_to_pypi.yml
rename to .github/workflows/publish.yml
index 5018e68..aa66bb8 100644
--- a/.github/workflows/publish_to_pypi.yml
+++ b/.github/workflows/publish.yml
@@ -2,10 +2,13 @@ name: Publish to PyPI

 on:
   workflow_dispatch: # Enable manual runs
+  push:
+    tags:
+      - 'v*' # Run on version tags

 jobs:
-  # Run tests before publishing
+  # Run the tests before publishing to PyPI
   call_tests:
     uses:
./.github/workflows/tests.yml @@ -30,10 +32,6 @@ jobs: run: | poetry install - # - name: Update Version - # run: | - # poetry version patch # Use 'minor' or 'major' for minor or major version bumps - - name: Build and Publish Package run: | poetry config pypi-token.pypi ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7bc6653..9a5f70a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,9 +1,6 @@ -name: Tests +name: Run Tests on: - # push: - # branches: - # - main workflow_dispatch: # Enable manual runs workflow_call: # Make this workflow available to be called by other workflows @@ -13,7 +10,7 @@ jobs: strategy: matrix: - python-version: [ "3.10", "3.11", "3.12" ] + python-version: [ "3.10", "3.11", "3.12", "3.13" ] steps: - name: Checkout Repository @@ -30,18 +27,18 @@ jobs: - name: Install Dependencies run: | - poetry install --with dev + poetry install --with dev --no-root - - name: Run Tests + - name: Run Tests with Coverage shell: bash env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | - poetry run pytest tests/ --cov --doctest-modules --junitxml=junit/test-results-${{ matrix.python-version }}.xml + poetry run pytest continue-on-error: false - - name: Upload Test Results - uses: actions/upload-artifact@v4 + - name: Upload Coverage Reports to Codecov + uses: codecov/codecov-action@v5 with: - name: pytest-results-${{ matrix.python-version }} - path: junit/test-results-${{ matrix.python-version }}.xml + token: ${{ secrets.CODECOV_TOKEN }} + diff --git a/LICENSE b/LICENSE index feb81f7..a2559ba 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 Hassan Abedi +Copyright (c) 2025 Hassan Abedi Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 15e0a6d..7997573 100644 --- a/README.md +++ b/README.md @@ -1,77 +1,77 @@ -# Easy Letters +## Easy Letters -[![PyPI version](https://badge.fury.io/py/easy-letters.svg)](https://badge.fury.io/py/easy-letters) -[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) -[![python version](https://img.shields.io/badge/Python-%3E=3.10-blue)](https://github.com/habedi/easy-letters) -[![pip downloads](https://img.shields.io/pypi/dm/easy-letters.svg)](https://pypi.org/project/easy-letters/) [![Tests](https://github.com/habedi/easy-letters/actions/workflows/tests.yml/badge.svg)](https://github.com/habedi/easy-letters/actions/workflows/tests.yml) +[![codecov](https://codecov.io/gh/habedi/easy-letters/graph/badge.svg?token=E47OPB2HVA)](https://codecov.io/gh/habedi/easy-letters) [![CodeFactor](https://www.codefactor.io/repository/github/habedi/easy-letters/badge)](https://www.codefactor.io/repository/github/habedi/easy-letters) +[![python version](https://img.shields.io/badge/Python-%3E=3.10-blue)](https://github.com/habedi/easy-letters) +[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/habedi/easy-letters/blob/main/LICENSE) +[![PyPI version](https://badge.fury.io/py/easy-letters.svg)](https://badge.fury.io/py/easy-letters) +[![pip downloads](https://img.shields.io/pypi/dm/easy-letters.svg)](https://pypi.org/project/easy-letters/) -Easy Letters is a Python package that helps job seekers write application letters. A simple retrieval -augmented generation (RAG) pipeline is used to generate the letters. 
The user can then edit the draft letter to suit
-their needs.
+Easy Letters is a Python library that provides the basic building blocks for creating a naive [retrieval augmented
+generation (or RAG)](https://arxiv.org/abs/2312.10997) pipeline to generate application letter drafts.
+The main idea is to speed up the process of writing application letters by not starting from scratch.
+Instead, an applicant can generate a draft letter to use as a starting point and customize it as needed into
+the final letter.

-See the `notebooks/README.md` file for how easy letters works.
+The diagram below shows the high-level workflow of how Easy Letters can be used to generate draft application
+letters.

-## 🔧 Installation
+![Easy Letters Workflow](assets/workflow.svg)

-You can install Easy Letters using pip:
+### Installation

 ```bash
 pip install easy-letters
 ```

-## 🚀 Getting Started
-
-### API Key Setup
+#### Installing from Source

-At the moment, Easy Letters gets the API key for supported services from the environment variables.
-So you need to set the following environment variables to be able to use Easy Letters:
-
-- `OPENAI_API_KEY`: The OpenAI API key (required)
+You can also install Easy Letters from the source code in this repository.
+The main benefit of this approach is that it makes it easier to run the sample notebooks and to modify the code
+as you wish.

-### Sample Notebooks
+Clone the repository, navigate into its directory, and install the dependencies
+using [Poetry](https://python-poetry.org/):

-You can find Jupyter notebooks with example code in the `notebooks` directory.
-The notebooks demonstrate how to use Easy Letters to generate application letter drafts.
+```bash
+git clone https://github.com/habedi/easy-letters.git && cd easy-letters

-### Supported Models
+# Install the dependencies using Poetry
+poetry install --with dev
+```

-Easy Letters currently supports the following models:
+### Getting Started

-| Model                            | Type            |
-|----------------------------------|-----------------|
-| GPT-3.5 Turbo                    | Text Generation |
-| GPT-4 Turbo                      | Text Generation |
-| GPT-4o                           | Text Generation |
-| GPT-4o Mini                      | Text Generation |
-| Text Embedding 3 (Small Variant) | Text Embedding  |
-| Text Embedding 3 (Large Variant) | Text Embedding  |
+#### API Key Setup

-### Installing from Source
+Easy Letters reads the API keys for supported services (like [OpenAI](https://platform.openai.com/)) from
+environment variables.
+So you need to set the following environment variable before you can use Easy Letters:

-You can also install Easy Letters from the source code in this repository. The main benefit of this approach is that
-you might find it easier to run the sample notebooks and modify the code as you wish this way.
+- `OPENAI_API_KEY`: The [OpenAI API key](https://platform.openai.com/docs/api-reference/authentication) (required)

-After cloning this repository, you can navigate to the `easy-letters` directory and install the
-dependencies using [Poetry](https://python-poetry.org/):
+#### Sample Notebooks

-```bash
-git clone https://github.com/habedi/easy-letters.git && cd easy-letters
+You can find Jupyter notebooks with example code in the [notebooks](notebooks/) directory.
+The notebooks demonstrate how to use Easy Letters to generate application letter drafts; a condensed sketch of
+the same flow follows below.
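+
+#### Quick Usage Sketch
+
+The snippet below is a minimal, hypothetical sketch of the flow the notebooks walk through: embed a few previous
+letters, index them in the in-memory vector store, retrieve the ones most similar to a job ad, and ask a language
+model for a draft. It assumes the `OPENAI_API_KEY` environment variable is set; the letter and job ad strings are
+placeholders, so adapt them to your own data.
+
+```python
+import os
+
+from easy_letters import EmbeddingModels, LanguageModels, OpenAIConnector, Ranker
+
+connector = OpenAIConnector(os.environ["OPENAI_API_KEY"])
+ranker = Ranker()
+
+# Embed a few previous application letters and index them in the vector store
+letters = ["Dear Hiring Manager, ...", "I am writing to apply for ..."]
+embeddings = connector.embed(documents=letters, model=EmbeddingModels.OPENAPI_EMS)
+ranker.make_collection({"text": letters, "embedding": embeddings})
+
+# Embed the job ad and retrieve the most similar letters
+job_ad = "We are looking for a machine learning engineer ..."
+ad_embedding = connector.embed(documents=[job_ad], model=EmbeddingModels.OPENAPI_EMS)[0]
+similar = ranker.find_similar(ad_embedding, top_k=3, min_similarity=0.1)
+
+# Build a prompt from the retrieved letters and generate a draft
+examples = "\n\n".join(hit["payload"]["text"] for hit in similar)
+prompt = f"Job ad:\n{job_ad}\n\nExample letters:\n{examples}\n\nWrite a new draft letter."
+print(connector.chat(prompt=prompt, model=LanguageModels.OPENAI_GPT4OMINI))
+```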
-# Assuming you have Poetry installed on your system
-poetry install --with dev
-```
+### Supported Models

-### Running the Unit Tests with Coverage
+Easy Letters currently supports the following models:

-You can run the unit tests with coverage using the following command:
+| Model                            | Type            | Company |
+|----------------------------------|-----------------|---------|
+| GPT-3.5 Turbo                    | Text Generation | OpenAI  |
+| GPT-4 Turbo                      | Text Generation | OpenAI  |
+| GPT-4o                           | Text Generation | OpenAI  |
+| GPT-4o Mini                      | Text Generation | OpenAI  |
+| Text Embedding 3 (Small Variant) | Text Embedding  | OpenAI  |
+| Text Embedding 3 (Large Variant) | Text Embedding  | OpenAI  |

-```bash
-poetry run pytest tests/ --cov=easy_letters
-```
+> [!NOTE]
+> At the moment, Easy Letters only supports text generation and text embedding models from OpenAI.

-## 📝 TODO
+### License

-- [ ] Add support for Anthropic models and API
-- [ ] Add support for locally served models via Ollama
+Easy Letters is available under the MIT license ([LICENSE](LICENSE)).
diff --git a/assets/make_figures.sh b/assets/make_figures.sh
new file mode 100644
index 0000000..9f679a2
--- /dev/null
+++ b/assets/make_figures.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# You need to have Graphviz installed to run this script
+# On Debian-based systems, you can install it using: sudo apt-get install graphviz
+
+# Make figures from .dot files (file names are quoted so paths with spaces work)
+for f in *.dot; do
+  dot -Tsvg "$f" -o "${f%.dot}.svg"
+done
diff --git a/notebooks/assets/workflow.dot b/assets/workflow.dot
similarity index 100%
rename from notebooks/assets/workflow.dot
rename to assets/workflow.dot
diff --git a/assets/workflow.svg b/assets/workflow.svg
new file mode 100644
index 0000000..5a3d987
--- /dev/null
+++ b/assets/workflow.svg
@@ -0,0 +1,232 @@
+[SVG markup not reproduced: a Graphviz-generated workflow diagram with three clusters ("User Inputs", "Processing", and "Output"). The User provides a Job Description, Existing Application Letters, and Extra Instructions; the job description and letters are processed by an Embedding Model and stored in a Vector Database; retrieved letters and the extra instructions form a Custom Prompt that augments the Language Model, which generates a Letter Draft that the User reviews.]
diff --git a/easy_letters/connectors.py b/easy_letters/connectors.py
index bd26a5b..ba86824 100644
--- a/easy_letters/connectors.py
+++ b/easy_letters/connectors.py
@@ -6,77 +6,72 @@

 class LanguageModels:
-    """List of supported language models."""
-    OPENAI_GPT35TURBO = 'gpt-3.5-turbo'
-    OPENAI_GPT4TURBO = 'gpt-4-turbo'
-    OPENAI_GPT4O = 'gpt-4o'
-    OPENAI_GPT4OMINI = 'gpt-4o-mini'
+    """
+    Supported language models.
+    """
+    OPENAI_GPT35TURBO = "gpt-3.5-turbo"
+    OPENAI_GPT4TURBO = "gpt-4-turbo"
+    OPENAI_GPT4O = "gpt-4o"
+    OPENAI_GPT4OMINI = "gpt-4o-mini"


 class EmbeddingModels:
-    """List of supported text embedding models."""
-    OPENAPI_EMS = 'text-embedding-3-small'
-    OPENAPI_EML = 'text-embedding-3-large'
+    """
+    Supported text embedding models.
+ """ + OPENAPI_EMS = "text-embedding-3-small" + OPENAPI_EML = "text-embedding-3-large" class OpenAIConnector: """ - Connector class to interact with OpenAI API for embeddings and - chat completions. + Connector to interact with the OpenAI API for embeddings and chat completions. Attributes: - client (openai.Client): The OpenAI client used for API interactions. + client (openai.Client): The client used for API calls. """ def __init__(self, api_key: str, **kwargs): """ - Initialize the OpenAIConnector with an API key and optional parameters. + Initialize the connector with an API key and extra options. Args: - api_key (str): The API key for authenticating with the OpenAI API. - **kwargs: Additional keyword arguments to pass to the OpenAI client. + api_key (str): Your OpenAI API key. + **kwargs: Additional parameters for the OpenAI client. """ self.client = openai.Client(api_key=api_key, **kwargs) - def embed(self, documents: List[str], model: str) -> List[ - ndarray[Any, dtype[Any]]]: + def embed(self, documents: List[str], model: str) -> List[ndarray[Any, dtype[Any]]]: """ - Generate embeddings for a list of documents using a specified model. + Generate embeddings for a list of documents using a given model. Args: - documents (List[str]): A list of documents to embed. + documents (List[str]): Documents to embed. model (str): The model to use for generating embeddings. Returns: - List[ndarray[Any, dtype[Any]]]: A list of numpy arrays containing - the embeddings. + List[ndarray[Any, dtype[Any]]]: A list of numpy arrays with the embeddings. """ - embeddings = self.client.embeddings.create(input=documents, - model=model) + embeddings = self.client.embeddings.create(input=documents, model=model) return [np.array(d.embedding) for d in embeddings.data] - def chat(self, prompt: str, model: str, temperature: float = 0.0, - max_tokens: int = 512) -> str: + def chat(self, prompt: str, model: str, temperature: float = 0.0, max_tokens: int = 512) -> str: """ - Generate a chat completion for a given prompt using a specified model. + Generate a chat response for a given prompt using a specified model. Args: - prompt (str): The input prompt for the chat model. - model (str): The model to use for generating the chat completion. - temperature (float, optional): The sampling temperature. - Defaults to 0.0. - max_tokens (int, optional): The maximum number of tokens for the - model to generate. Defaults to 512. + prompt (str): The prompt text. + model (str): The model to use. + temperature (float, optional): Sampling temperature (defaults to 0.0). + max_tokens (int, optional): Maximum tokens to generate (defaults to 512). Returns: - str: The generated chat response. + str: The chat response text. """ completion = self.client.chat.completions.create( model=model, - messages=[ - {"role": "user", "content": prompt}, - ], + messages=[{"role": "user", "content": prompt}], max_tokens=max_tokens, - temperature=temperature + temperature=temperature, ) return completion.choices[0].message.content diff --git a/easy_letters/similarity_search.py b/easy_letters/similarity_search.py index 92966ae..450ded9 100644 --- a/easy_letters/similarity_search.py +++ b/easy_letters/similarity_search.py @@ -17,56 +17,71 @@ def __init__(self): """ self.client = QdrantClient(":memory:") - def make_collection(self, documents_with_embeddings, - collection_name="letters"): + def make_collection(self, documents_with_embeddings, collection_name="letters"): """ - Create a collection with the given documents and embeddings in the db. 
+ Create a collection with the given documents and embeddings in the database. Args: documents_with_embeddings (dict): A dictionary containing 'text' - and 'embedding' keys with lists of documents and their - corresponding embeddings. + and 'embedding' keys with lists of documents and their + corresponding embeddings. collection_name (str): The name of the collection to create. - Defaults to "letters". + Defaults to "letters". Returns: None """ - documents = documents_with_embeddings['text'] - embeddings = documents_with_embeddings['embedding'] + documents = documents_with_embeddings["text"] + embeddings = documents_with_embeddings["embedding"] - points = [PointStruct(id=idx, vector=e, payload={'text': d}) - for idx, (d, e) in enumerate(zip(documents, embeddings))] + points = [ + PointStruct(id=idx, vector=e, payload={"text": d}) + for idx, (d, e) in enumerate(zip(documents, embeddings)) + ] print( f"Creating collection {collection_name} with {len(points)} " - f"points of size {embeddings[0].shape[0]}") - self.client.create_collection(collection_name=collection_name, - vectors_config=VectorParams( - size=embeddings[0].shape[0], - distance=Distance.COSINE)) + f"points of size {embeddings[0].shape[0]}" + ) + self.client.create_collection( + collection_name=collection_name, + vectors_config=VectorParams( + size=embeddings[0].shape[0], distance=Distance.COSINE + ), + ) self.client.upsert(collection_name, points) - def find_similar(self, embedding, collection_name="letters", top_k=5, - min_similarity=0.1): + def find_similar( + self, embedding, collection_name="letters", top_k=5, min_similarity=0.1 + ): """ - Find similar documents in the given collection for the given embedding - and return top k results. + Find similar documents in the given collection for the provided embedding. + + This method queries the Qdrant collection for documents that are most similar + to the provided embedding vector, returning the top k results with a similarity + score above the specified threshold. The response is converted into a list of + dictionaries for easier indexing. Args: - embedding (list or numpy.ndarray): The embedding vector to search - for similar documents. - collection_name (str): The name of the collection to search in. - Defaults to "letters". - top_k (int): The number of top similar documents to return. - Defaults to 5. - min_similarity (float): The minimum similarity score threshold. - Defaults to 0.1. + embedding (list or numpy.ndarray): The embedding vector to search for similar documents. + collection_name (str): The name of the collection to search in. Defaults to "letters". + top_k (int): The maximum number of similar documents to return. Defaults to 5. + min_similarity (float): The minimum similarity score threshold. Defaults to 0.1. Returns: - list: A list of search results with similar documents. + list: A list of dictionaries, each containing: + - "id": The document's identifier. + - "payload": A dictionary with the document content (e.g., {"text": "..."}) + - "score": The similarity score. 
""" - return self.client.search(collection_name=collection_name, - query_vector=embedding, - limit=top_k, score_threshold=min_similarity) + response = self.client.query_points( + collection_name=collection_name, + query=embedding, + limit=top_k, + score_threshold=min_similarity, + ) + return [ + {"id": point.id, "payload": point.payload, "score": point.score} + for point in response.points + ] diff --git a/notebooks/README.md b/notebooks/README.md index b9197da..15b9768 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -1,5 +1,3 @@ -# 📝 Overview +# Notebooks -The diagram below shows the high-level workflow of how Easy Letters generates application letter drafts. - -![Easy Letters Workflow](assets/workflow.png) +This directory contains example Jupyter notebooks that show how to use Easy Letters to make draft application letters. diff --git a/notebooks/assets/workflow.png b/notebooks/assets/workflow.png deleted file mode 100644 index a8b66b9..0000000 --- a/notebooks/assets/workflow.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8145c236ea1f263b225ecfa04da20067bbfa3b11b3214a0b7dd5bfdfa5a6629d -size 75834 diff --git a/notebooks/demo_openai.ipynb b/notebooks/demo_openai.ipynb index bd7057f..b5594cc 100644 --- a/notebooks/demo_openai.ipynb +++ b/notebooks/demo_openai.ipynb @@ -1,8 +1,9 @@ { "cells": [ { - "metadata": {}, "cell_type": "markdown", + "id": "e49b37128a0e12ae", + "metadata": {}, "source": [ "# Workflow Summary\n", "\n", @@ -11,14 +12,13 @@ "3. Find the most relevant (similar) application letters to the job ad\n", "4. Create a custom prompt for the langauge model which includes the job ad and the most relevant application letters, plus some extra information\n", "5. Generate a draft application letter by sending the prompt to the model and save it to a file for further use" - ], - "id": "e49b37128a0e12ae" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Importing Libraries", - "id": "8bce22186ea367e7" + "id": "8bce22186ea367e7", + "metadata": {}, + "source": "# Importing Libraries" }, { "cell_type": "code", @@ -26,50 +26,59 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.133906Z", - "start_time": "2024-07-19T15:18:18.324252Z" + "end_time": "2025-03-01T16:03:30.854723Z", + "start_time": "2025-03-01T16:03:30.072154Z" } }, "source": [ "import io\n", "import os\n", - "import tiktoken\n", + "from pathlib import Path\n", "\n", "import pandas as pd\n", + "import tiktoken\n", + "from IPython.display import display, Markdown\n", "\n", - "from easy_letters import OpenAIConnector, Ranker\n", "from easy_letters import LanguageModels, EmbeddingModels\n", - "\n", - "from IPython.display import display, Markdown\n", - "from pathlib import Path" + "from easy_letters import OpenAIConnector, Ranker" ], "outputs": [], "execution_count": 1 }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Constants and Global Settings", - "id": "dbe6834d7e71a750" + "id": "dbe6834d7e71a750", + "metadata": {}, + "source": "# Constants and Global Settings" }, { + "cell_type": "code", + "id": "de023f5b4bcc6a27", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.138520Z", - "start_time": "2024-07-19T15:18:20.135465Z" + "end_time": "2025-03-01T16:03:30.859295Z", + "start_time": "2025-03-01T16:03:30.857632Z" } }, - "cell_type": "code", - "source": "pd.set_option('display.float_format', lambda x: f'{x:.2f}')", - "id": "de023f5b4bcc6a27", + "source": [ + 
"pd.set_option(\"display.float_format\", lambda x: f\"{x:.2f}\")" + ], "outputs": [], "execution_count": 2 }, { "cell_type": "code", + "id": "5c47615b36a77ea7", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2025-03-01T16:03:30.934161Z", + "start_time": "2025-03-01T16:03:30.932265Z" + } + }, "source": [ "DATA_DIR = Path(\"../tests/test_data\")\n", - "LETTERS_DIR = DATA_DIR/ \"sample_letters\"\n", + "LETTERS_DIR = DATA_DIR / \"sample_letters\"\n", "SAMPLE_JOB_AD = DATA_DIR / \"sample_ads/description_6.text\"\n", "\n", "OUTPUT_DIR = Path(\"./output\")\n", @@ -77,90 +86,85 @@ "\n", "LLM = LanguageModels.OPENAI_GPT4OMINI" ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.219764Z", - "start_time": "2024-07-19T15:18:20.140001Z" - } - }, - "id": "5c47615b36a77ea7", "outputs": [], "execution_count": 3 }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Helper Functions", - "id": "2b971057bcc85cf3" + "id": "2b971057bcc85cf3", + "metadata": {}, + "source": "# Helper Functions" }, { + "cell_type": "code", + "id": "8f441a7372df51b6", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.247749Z", - "start_time": "2024-07-19T15:18:20.227464Z" + "end_time": "2025-03-01T16:03:30.975702Z", + "start_time": "2025-03-01T16:03:30.972929Z" } }, - "cell_type": "code", "source": [ "class DocumentLoader:\n", " \"\"\"A class to load documents from files.\"\"\"\n", - " \n", + "\n", " @staticmethod\n", " def _read_txt(path: Path) -> str:\n", - " with io.open(path, 'r', encoding='utf-8') as f:\n", + " with io.open(path, \"r\", encoding=\"utf-8\") as f:\n", " return f.read()\n", "\n", - " def bulk_load_documents(self, path: Path, ext='.txt') -> pd.DataFrame:\n", + " def bulk_load_documents(self, path: Path, ext=\".txt\") -> pd.DataFrame:\n", " \"\"\"Load all the documents in a directory with a specific extension into a DataFrame.\"\"\"\n", " documents = []\n", " documents_ids = []\n", - " \n", - " ext = '.' 
+ ext.lower().lstrip('.')\n", - " for file in path.glob(f'*{ext}'):\n", - " if ext in ('.txt', '.text'):\n", + "\n", + " ext = \".\" + ext.lower().lstrip(\".\")\n", + " for file in path.glob(f\"*{ext}\"):\n", + " if ext in (\".txt\", \".text\"):\n", " documents.append(self._read_txt(file))\n", " else:\n", - " raise ValueError(f'Unsupported file format: {ext.strip(\".\").capitalize()}')\n", + " raise ValueError(\n", + " f\"Unsupported file format: {ext.strip('.').capitalize()}\"\n", + " )\n", " documents_ids.append(file.stem)\n", - " return pd.DataFrame({'id': documents_ids, 'text': documents})\n", + " return pd.DataFrame({\"id\": documents_ids, \"text\": documents})\n", "\n", - " def load_document(self, path: Path, ext='.txt') -> str:\n", + " def load_document(self, path: Path, ext=\".txt\") -> str:\n", " \"\"\"Load a single document from a file.\"\"\"\n", - " if ext == '.txt':\n", + " if ext == \".txt\":\n", " return self._read_txt(path)\n", " else:\n", - " raise ValueError(f'Unsupported file format: {ext}')" + " raise ValueError(f\"Unsupported file format: {ext}\")" ], - "id": "8f441a7372df51b6", "outputs": [], "execution_count": 4 }, { + "cell_type": "code", + "id": "f95d49699943e99b", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.256452Z", - "start_time": "2024-07-19T15:18:20.251635Z" + "end_time": "2025-03-01T16:03:31.020763Z", + "start_time": "2025-03-01T16:03:31.018818Z" } }, - "cell_type": "code", "source": [ "def pprint(text: str):\n", " \"\"\"Pretty print the text as markdown in Jupyter Notebook's output.\"\"\"\n", " display(Markdown(text))" ], - "id": "f95d49699943e99b", "outputs": [], "execution_count": 5 }, { + "cell_type": "code", + "id": "eb103567891fa92", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.263207Z", - "start_time": "2024-07-19T15:18:20.258780Z" + "end_time": "2025-03-01T16:03:31.065485Z", + "start_time": "2025-03-01T16:03:31.063169Z" } }, - "cell_type": "code", "source": [ "def gen_num_tokens(text: str, encoding: str = \"r50k_base\") -> int:\n", " \"\"\"Calculate the number of tokens in a string for a specific model.\n", @@ -174,54 +178,55 @@ " encoding = tiktoken.get_encoding(encoding)\n", " return len(encoding.encode(text))" ], - "id": "eb103567891fa92", "outputs": [], "execution_count": 6 }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Loading the Application Letters", - "id": "9744b2654a8917ca" + "id": "9744b2654a8917ca", + "metadata": {}, + "source": "# Loading the Application Letters" }, { "cell_type": "code", - "source": [ - "doc_loader = DocumentLoader()\n", - "application_letters_df = doc_loader.bulk_load_documents(LETTERS_DIR, ext=\"text\")" - ], + "id": "d4373fdb670dddb4", "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.293820Z", - "start_time": "2024-07-19T15:18:20.264962Z" + "end_time": "2025-03-01T16:03:31.118478Z", + "start_time": "2025-03-01T16:03:31.108213Z" } }, - "id": "d4373fdb670dddb4", + "source": [ + "doc_loader = DocumentLoader()\n", + "application_letters_df = doc_loader.bulk_load_documents(LETTERS_DIR, ext=\"text\")" + ], "outputs": [], "execution_count": 7 }, { "cell_type": "code", - "source": "application_letters_df.head()", + "id": "afca07705eed8007", "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.305847Z", - "start_time": "2024-07-19T15:18:20.295217Z" + "end_time": "2025-03-01T16:03:31.160056Z", + "start_time": "2025-03-01T16:03:31.152759Z" } }, - "id": "afca07705eed8007", + "source": [ + 
"application_letters_df.head()" + ], "outputs": [ { "data": { "text/plain": [ " id text\n", - "0 sample_letter_708 Having worked in the tech industry for 7 years...\n", - "1 sample_letter_311 Dear Hiring Manager,\\n\\nI am writing to expres...\n", - "2 sample_letter_253 I am writing to apply for the Generative AI En...\n", - "3 sample_letter_611 Dear Hiring Manager,\\n\\nI am writing to expres...\n", - "4 sample_letter_394 Dear Hiring Manager,\\n\\nI am writing to expres..." + "0 sample_letter_507 I am writing to apply for the System Administr...\n", + "1 sample_letter_649 I am excited to apply for the AI Engineer posi...\n", + "2 sample_letter_480 Dear Hiring Manager,\\n\\nI am writing to expres...\n", + "3 sample_letter_605 Dear Hiring Manager,\\n\\nI am writing to expres...\n", + "4 sample_letter_746 I am writing to express my interest in the Dat..." ], "text/html": [ "
\n", @@ -249,28 +254,28 @@ " \n", " \n", " 0\n", - " sample_letter_708\n", - " Having worked in the tech industry for 7 years...\n", + " sample_letter_507\n", + " I am writing to apply for the System Administr...\n", " \n", " \n", " 1\n", - " sample_letter_311\n", - " Dear Hiring Manager,\\n\\nI am writing to expres...\n", + " sample_letter_649\n", + " I am excited to apply for the AI Engineer posi...\n", " \n", " \n", " 2\n", - " sample_letter_253\n", - " I am writing to apply for the Generative AI En...\n", + " sample_letter_480\n", + " Dear Hiring Manager,\\n\\nI am writing to expres...\n", " \n", " \n", " 3\n", - " sample_letter_611\n", + " sample_letter_605\n", " Dear Hiring Manager,\\n\\nI am writing to expres...\n", " \n", " \n", " 4\n", - " sample_letter_394\n", - " Dear Hiring Manager,\\n\\nI am writing to expres...\n", + " sample_letter_746\n", + " I am writing to express my interest in the Dat...\n", " \n", " \n", "\n", @@ -285,95 +290,100 @@ "execution_count": 8 }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Initializing a Connector to the OpenAI API", - "id": "f13c02d1366be5df" + "id": "f13c02d1366be5df", + "metadata": {}, + "source": "# Initializing a Connector to the OpenAI API" }, { "cell_type": "code", + "id": "c7e7e469ae9840ea", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2025-03-01T16:03:31.285820Z", + "start_time": "2025-03-01T16:03:31.284192Z" + } + }, "source": [ "openai_key = os.getenv(\"OPENAI_API_KEY\")\n", "\n", "# Check if the API key is set and available\n", "assert openai_key != \"\", \"Please set the OPENAI_API_KEY environment variable\"" ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.310220Z", - "start_time": "2024-07-19T15:18:20.307153Z" - } - }, - "id": "c7e7e469ae9840ea", "outputs": [], "execution_count": 9 }, { "cell_type": "code", - "source": "connector = OpenAIConnector(openai_key)", + "id": "47b55c370e9b849a", "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.355977Z", - "start_time": "2024-07-19T15:18:20.314041Z" + "end_time": "2025-03-01T16:03:31.354081Z", + "start_time": "2025-03-01T16:03:31.325298Z" } }, - "id": "47b55c370e9b849a", + "source": [ + "connector = OpenAIConnector(openai_key)" + ], "outputs": [], "execution_count": 10 }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Embedding the Letters", - "id": "64a31f966f4bce00" + "id": "64a31f966f4bce00", + "metadata": {}, + "source": "# Embedding the Letters" }, { "cell_type": "code", - "source": [ - "letters_with_embeddings_df = application_letters_df.copy()\n", - "letters_with_embeddings_df['embedding'] = connector.embed(documents=application_letters_df['text'], \n", - " model=EmbeddingModels.OPENAPI_EMS)" - ], + "id": "68a6d6e054a998b0", "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-07-19T15:18:25.397806Z", - "start_time": "2024-07-19T15:18:20.357020Z" + "end_time": "2025-03-01T16:03:35.887699Z", + "start_time": "2025-03-01T16:03:31.376295Z" } }, - "id": "68a6d6e054a998b0", + "source": [ + "letters_with_embeddings_df = application_letters_df.copy()\n", + "letters_with_embeddings_df[\"embedding\"] = connector.embed(\n", + " documents=application_letters_df[\"text\"], model=EmbeddingModels.OPENAPI_EMS\n", + ")" + ], "outputs": [], "execution_count": 11 }, { + "cell_type": "code", + "id": "8d4478a411acf706", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:25.408392Z", - "start_time": "2024-07-19T15:18:25.398908Z" + 
"end_time": "2025-03-01T16:03:35.898332Z", + "start_time": "2025-03-01T16:03:35.893536Z" } }, - "cell_type": "code", - "source": "letters_with_embeddings_df.head()", - "id": "8d4478a411acf706", + "source": [ + "letters_with_embeddings_df.head()" + ], "outputs": [ { "data": { "text/plain": [ " id text \\\n", - "0 sample_letter_708 Having worked in the tech industry for 7 years... \n", - "1 sample_letter_311 Dear Hiring Manager,\\n\\nI am writing to expres... \n", - "2 sample_letter_253 I am writing to apply for the Generative AI En... \n", - "3 sample_letter_611 Dear Hiring Manager,\\n\\nI am writing to expres... \n", - "4 sample_letter_394 Dear Hiring Manager,\\n\\nI am writing to expres... \n", + "0 sample_letter_507 I am writing to apply for the System Administr... \n", + "1 sample_letter_649 I am excited to apply for the AI Engineer posi... \n", + "2 sample_letter_480 Dear Hiring Manager,\\n\\nI am writing to expres... \n", + "3 sample_letter_605 Dear Hiring Manager,\\n\\nI am writing to expres... \n", + "4 sample_letter_746 I am writing to express my interest in the Dat... \n", "\n", " embedding \n", - "0 [-0.015994248911738396, 0.020503530278801918, ... \n", - "1 [0.035636693239212036, -0.001379814581014216, ... \n", - "2 [0.017323147505521774, -0.044040385633707047, ... \n", - "3 [0.049760978668928146, -0.015271708369255066, ... \n", - "4 [0.034355904906988144, 0.02534237876534462, 0.... " + "0 [-0.017879672348499298, 0.01556997001171112, 0... \n", + "1 [-0.016682444140315056, -0.0227852500975132, -... \n", + "2 [0.04394077882170677, 0.005209765397012234, 0.... \n", + "3 [0.04028189927339554, 0.020218798890709877, 0.... \n", + "4 [0.016521798446774483, -0.006224688142538071, ... " ], "text/html": [ "
\n", @@ -402,33 +412,33 @@ " \n", " \n", " 0\n", - " sample_letter_708\n", - " Having worked in the tech industry for 7 years...\n", - " [-0.015994248911738396, 0.020503530278801918, ...\n", + " sample_letter_507\n", + " I am writing to apply for the System Administr...\n", + " [-0.017879672348499298, 0.01556997001171112, 0...\n", " \n", " \n", " 1\n", - " sample_letter_311\n", - " Dear Hiring Manager,\\n\\nI am writing to expres...\n", - " [0.035636693239212036, -0.001379814581014216, ...\n", + " sample_letter_649\n", + " I am excited to apply for the AI Engineer posi...\n", + " [-0.016682444140315056, -0.0227852500975132, -...\n", " \n", " \n", " 2\n", - " sample_letter_253\n", - " I am writing to apply for the Generative AI En...\n", - " [0.017323147505521774, -0.044040385633707047, ...\n", + " sample_letter_480\n", + " Dear Hiring Manager,\\n\\nI am writing to expres...\n", + " [0.04394077882170677, 0.005209765397012234, 0....\n", " \n", " \n", " 3\n", - " sample_letter_611\n", + " sample_letter_605\n", " Dear Hiring Manager,\\n\\nI am writing to expres...\n", - " [0.049760978668928146, -0.015271708369255066, ...\n", + " [0.04028189927339554, 0.020218798890709877, 0....\n", " \n", " \n", " 4\n", - " sample_letter_394\n", - " Dear Hiring Manager,\\n\\nI am writing to expres...\n", - " [0.034355904906988144, 0.02534237876534462, 0....\n", + " sample_letter_746\n", + " I am writing to express my interest in the Dat...\n", + " [0.016521798446774483, -0.006224688142538071, ...\n", " \n", " \n", "\n", @@ -443,34 +453,38 @@ "execution_count": 12 }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Finding the Most Relevant Letters to the Job Ad", - "id": "c7e02957fc102026" + "id": "c7e02957fc102026", + "metadata": {}, + "source": "# Finding the Most Relevant Letters to the Job Ad" }, { + "cell_type": "code", + "id": "ce90f301a75a57d", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:25.489564Z", - "start_time": "2024-07-19T15:18:25.409503Z" + "end_time": "2025-03-01T16:03:35.957725Z", + "start_time": "2025-03-01T16:03:35.953966Z" } }, - "cell_type": "code", - "source": "ranker = Ranker()", - "id": "ce90f301a75a57d", + "source": [ + "ranker = Ranker()" + ], "outputs": [], "execution_count": 13 }, { + "cell_type": "code", + "id": "20a8939323bfaf40", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.193297Z", - "start_time": "2024-07-19T15:18:25.495872Z" + "end_time": "2025-03-01T16:03:36.786030Z", + "start_time": "2025-03-01T16:03:35.995579Z" } }, - "cell_type": "code", - "source": "ranker.make_collection(documents_with_embeddings=letters_with_embeddings_df)", - "id": "20a8939323bfaf40", + "source": [ + "ranker.make_collection(documents_with_embeddings=letters_with_embeddings_df)" + ], "outputs": [ { "name": "stdout", @@ -483,19 +497,19 @@ "execution_count": 14 }, { + "cell_type": "code", + "id": "f1ecce59c3f826e0", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.198446Z", - "start_time": "2024-07-19T15:18:27.194346Z" + "end_time": "2025-03-01T16:03:36.800549Z", + "start_time": "2025-03-01T16:03:36.798838Z" } }, - "cell_type": "code", "source": [ "sample_job_ad = doc_loader.load_document(path=SAMPLE_JOB_AD)\n", "\n", "pprint(sample_job_ad)" ], - "id": "f1ecce59c3f826e0", "outputs": [ { "data": { @@ -511,37 +525,40 @@ "execution_count": 15 }, { + "cell_type": "code", + "id": "2d61db3285bf245c", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.516960Z", - "start_time": "2024-07-19T15:18:27.199817Z" + "end_time": 
"2025-03-01T16:03:37.297177Z", + "start_time": "2025-03-01T16:03:36.840933Z" } }, - "cell_type": "code", "source": [ - "sample_job_ad_embedded = connector.embed(documents=[sample_job_ad], \n", - " model=EmbeddingModels.OPENAPI_EMS)" + "sample_job_ad_embedded = connector.embed(\n", + " documents=[sample_job_ad], model=EmbeddingModels.OPENAPI_EMS\n", + ")" ], - "id": "2d61db3285bf245c", "outputs": [], "execution_count": 16 }, { + "cell_type": "code", + "id": "f40e0f2ddb7a7504", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.522163Z", - "start_time": "2024-07-19T15:18:27.518012Z" + "end_time": "2025-03-01T16:03:37.301674Z", + "start_time": "2025-03-01T16:03:37.299478Z" } }, - "cell_type": "code", - "source": "sample_job_ad_embedded", - "id": "f40e0f2ddb7a7504", + "source": [ + "sample_job_ad_embedded" + ], "outputs": [ { "data": { "text/plain": [ "[array([-0.03499759, -0.01236026, 0.00740111, ..., 0.00064774,\n", - " 0.02555379, -0.00236529])]" + " 0.02555379, -0.00236529], shape=(1536,))]" ] }, "execution_count": 17, @@ -552,26 +569,34 @@ "execution_count": 17 }, { + "cell_type": "code", + "id": "1ab0c0edd9e564bf", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.608446Z", - "start_time": "2024-07-19T15:18:27.523471Z" + "end_time": "2025-03-01T16:03:37.355254Z", + "start_time": "2025-03-01T16:03:37.345435Z" } }, - "cell_type": "code", "source": [ - "most_relevant_letters = ranker.find_similar(sample_job_ad_embedded[0], top_k=3, min_similarity=0.1)\n", + "most_relevant_letters = ranker.find_similar(\n", + " sample_job_ad_embedded[0], top_k=3, min_similarity=0.1\n", + ")\n", "\n", "most_relevant_letters" ], - "id": "1ab0c0edd9e564bf", "outputs": [ { "data": { "text/plain": [ - "[ScoredPoint(id=478, version=0, score=0.36129672473127655, payload={'text': 'I am applying for the System Administrator position at Adobe. My extensive experience with Red Hat, CentOS, and Puppet, as well as my proficiency in Ruby, makes me a strong candidate for this role. I am excited about the opportunity to bring my unique skill set to your team.'}, vector=None, shard_key=None, order_value=None),\n", - " ScoredPoint(id=644, version=0, score=0.35730961122204624, payload={'text': 'I am very interested in the Senior Support Engineer position at Innovative Tech. I have over 4 years of experience in IT support, and I am confident that I can bring a high level of expertise to your team. In my current role at ABC Inc, I have been responsible for managing a team of support specialists and dealing with complex technical issues. I am highly skilled in troubleshooting, networking, and both Linux and Windows systems. I hold a BSc in Information Technology and I am always eager to learn and improve my skills. I believe that my experience and dedication would make me a valuable addition to your company. Thank you for considering my application.'}, vector=None, shard_key=None, order_value=None),\n", - " ScoredPoint(id=159, version=0, score=0.35427863518996816, payload={'text': \"Dear Hiring Manager,\\n\\nI am writing to express my interest in the Intermediate Applied Scientist position at ABC Environmental Solutions. With my strong background in environmental science and experience in applied consulting, I believe I am well-suited for this role.\\n\\nIn my current position as an Applied Scientist at DEF Environmental Solutions, I have successfully conducted environmental impact assessments and provided regulatory support. 
I have also gained proficiency in data analysis using tools such as Microsoft Office Suite and Arc GIS. Additionally, my fieldwork experience has allowed me to develop strong problem-solving skills and the ability to thrive under new challenges.\\n\\nI hold a Bachelor's degree in Environmental Science and have completed the Canadian Certified Electrofishing Course, demonstrating my commitment to continuous learning and professional development. Furthermore, my Level 1 First Aid Certificate in BC showcases my dedication to safety in the field.\\n\\nI am confident that my technical expertise, leadership abilities, and strong communication skills make me a valuable asset to your team. I am excited about the opportunity to contribute to ABC Environmental Solutions and make a positive impact on environmental conservation.\\n\\nThank you for considering my application. I look forward to the opportunity to discuss how my qualifications align with the requirements of the Intermediate Applied Scientist position.\\n\\nSincerely,\\nJohn Smith\"}, vector=None, shard_key=None, order_value=None)]" + "[{'id': 302,\n", + " 'payload': {'text': 'I am applying for the System Administrator position at Adobe. My extensive experience with Red Hat, CentOS, and Puppet, as well as my proficiency in Ruby, makes me a strong candidate for this role. I am excited about the opportunity to bring my unique skill set to your team.'},\n", + " 'score': 0.36129672473127655},\n", + " {'id': 640,\n", + " 'payload': {'text': 'I am very interested in the Senior Support Engineer position at Innovative Tech. I have over 4 years of experience in IT support, and I am confident that I can bring a high level of expertise to your team. In my current role at ABC Inc, I have been responsible for managing a team of support specialists and dealing with complex technical issues. I am highly skilled in troubleshooting, networking, and both Linux and Windows systems. I hold a BSc in Information Technology and I am always eager to learn and improve my skills. I believe that my experience and dedication would make me a valuable addition to your company. Thank you for considering my application.'},\n", + " 'score': 0.35730961122204624},\n", + " {'id': 545,\n", + " 'payload': {'text': \"Dear Hiring Manager,\\n\\nI am writing to express my interest in the Intermediate Applied Scientist position at ABC Environmental Solutions. With my strong background in environmental science and experience in applied consulting, I believe I am well-suited for this role.\\n\\nIn my current position as an Applied Scientist at DEF Environmental Solutions, I have successfully conducted environmental impact assessments and provided regulatory support. I have also gained proficiency in data analysis using tools such as Microsoft Office Suite and Arc GIS. Additionally, my fieldwork experience has allowed me to develop strong problem-solving skills and the ability to thrive under new challenges.\\n\\nI hold a Bachelor's degree in Environmental Science and have completed the Canadian Certified Electrofishing Course, demonstrating my commitment to continuous learning and professional development. Furthermore, my Level 1 First Aid Certificate in BC showcases my dedication to safety in the field.\\n\\nI am confident that my technical expertise, leadership abilities, and strong communication skills make me a valuable asset to your team. 
I am excited about the opportunity to contribute to ABC Environmental Solutions and make a positive impact on environmental conservation.\\n\\nThank you for considering my application. I look forward to the opportunity to discuss how my qualifications align with the requirements of the Intermediate Applied Scientist position.\\n\\nSincerely,\\nJohn Smith\"},\n", + " 'score': 0.35427863518996816}]" ] }, "execution_count": 18, @@ -582,41 +607,41 @@ "execution_count": 18 }, { + "cell_type": "code", + "id": "72f666ce5b32c3c2", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.617384Z", - "start_time": "2024-07-19T15:18:27.609781Z" + "end_time": "2025-03-01T16:03:37.404063Z", + "start_time": "2025-03-01T16:03:37.398438Z" } }, - "cell_type": "code", "source": [ "for letter in most_relevant_letters:\n", - " print(\"=\"*80)\n", - " print(f\"Letter ID: {letter.id}\")\n", - " print(f\"Similarity: {letter.score:.2f}\")\n", - " print(f\"Letter Text:\\n{letter.payload['text']}\")\n", + " print(\"=\" * 80)\n", + " print(f\"Letter ID: {letter['id']}\")\n", + " print(f\"Similarity: {letter['score']:.2f}\")\n", + " print(f\"Letter Text:\\n{letter['payload']['text']}\")\n", " print()" ], - "id": "72f666ce5b32c3c2", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "================================================================================\n", - "Letter ID: 478\n", + "Letter ID: 302\n", "Similarity: 0.36\n", "Letter Text:\n", "I am applying for the System Administrator position at Adobe. My extensive experience with Red Hat, CentOS, and Puppet, as well as my proficiency in Ruby, makes me a strong candidate for this role. I am excited about the opportunity to bring my unique skill set to your team.\n", "\n", "================================================================================\n", - "Letter ID: 644\n", + "Letter ID: 640\n", "Similarity: 0.36\n", "Letter Text:\n", "I am very interested in the Senior Support Engineer position at Innovative Tech. I have over 4 years of experience in IT support, and I am confident that I can bring a high level of expertise to your team. In my current role at ABC Inc, I have been responsible for managing a team of support specialists and dealing with complex technical issues. I am highly skilled in troubleshooting, networking, and both Linux and Windows systems. I hold a BSc in Information Technology and I am always eager to learn and improve my skills. I believe that my experience and dedication would make me a valuable addition to your company. 
Thank you for considering my application.\n",
     "\n",
     "================================================================================\n",
     "Letter ID: 545\n",
     "Similarity: 0.35\n",
     "Letter Text:\n",
     "Dear Hiring Manager,\n",
     "\n",
     "I am writing to express my interest in the Intermediate Applied Scientist position at ABC Environmental Solutions. With my strong background in environmental science and experience in applied consulting, I believe I am well-suited for this role.\n",
     "\n",
     "In my current position as an Applied Scientist at DEF Environmental Solutions, I have successfully conducted environmental impact assessments and provided regulatory support. I have also gained proficiency in data analysis using tools such as Microsoft Office Suite and Arc GIS. Additionally, my fieldwork experience has allowed me to develop strong problem-solving skills and the ability to thrive under new challenges.\n",
     "\n",
     "I hold a Bachelor's degree in Environmental Science and have completed the Canadian Certified Electrofishing Course, demonstrating my commitment to continuous learning and professional development. Furthermore, my Level 1 First Aid Certificate in BC showcases my dedication to safety in the field.\n",
     "\n",
     "I am confident that my technical expertise, leadership abilities, and strong communication skills make me a valuable asset to your team. I am excited about the opportunity to contribute to ABC Environmental Solutions and make a positive impact on environmental conservation.\n",
     "\n",
     "Thank you for considering my application. I look forward to the opportunity to discuss how my qualifications align with the requirements of the Intermediate Applied Scientist position.\n",
     "\n",
     "Sincerely,\n",
     "John Smith\n",
     "\n"
    ]
   }
  ],
  "execution_count": 19
 },
 {
-  "metadata": {},
  "cell_type": "markdown",
-  "source": "# Gluing Everything Together",
-  "id": "6b7ec625af7b6ac5"
+  "id": "6b7ec625af7b6ac5",
+  "metadata": {},
+  "source": "# Gluing Everything Together"
 },
 {
+  "cell_type": "code",
+  "id": "469615267e34f4b7",
  "metadata": {
   "ExecuteTime": {
-   "end_time": "2024-07-19T15:18:27.630258Z",
-   "start_time": "2024-07-19T15:18:27.621249Z"
+   "end_time": "2025-03-01T16:03:37.446492Z",
+   "start_time": "2025-03-01T16:03:37.443381Z"
   }
  },
-  "cell_type": "code",
  "source": [
   "prompt_part_1 = \"I'm applying for a job with this description:\\n\\n\"\n",
   "prompt_part_2 = \"#START OF JOB AD\\n\\n\" + sample_job_ad + \"\\n\\n#END OF JOB AD\"\n",
   "prompt_part_3 = \"\\n\\nI need to submit an application letter with my CV. Here are a few examples of my previous application letters:\\n\\n\"\n",
-  "prompt_part_4 = \"\\n\\n\".join([('#START OF EXAMPLE APPLICATION LETTER\\n\\n'+\n",
-  "                              t.payload['text'] + '\\n\\n#END OF EXAMPLE APPLICATION LETTER')\n",
-  "                             for t in most_relevant_letters])\n",
+  "prompt_part_4 = \"\\n\\n\".join(\n",
+  "    [\n",
+  "        (\n",
+  "            \"#START OF EXAMPLE APPLICATION LETTER\\n\\n\"\n",
+  "            + t[\"payload\"][\"text\"]\n",
+  "            + \"\\n\\n#END OF EXAMPLE APPLICATION LETTER\"\n",
+  "        )\n",
+  "        for t in most_relevant_letters\n",
+  "    ]\n",
+  ")\n",
   "# Extra information for the prompt\n",
-  "prompt_part_5 = (\"\\n\\nWrite a new application letter that is tailored to the job description above. \"\n",
-  "                 \"Be concise and to the point. The letter should be no longer than 500 words. \"\n",
-  "                 \"The letter should be written in English and be easy to read.\\n\\n\")\n",
+  "prompt_part_5 = (\n",
+  "    \"\\n\\nWrite a new application letter that is tailored to the job description above. \"\n",
+  "    \"Be concise and to the point. The letter should be no longer than 500 words. 
\"\n", + " \"The letter should be written in English and be easy to read.\\n\\n\"\n", + ")\n", "\n", "prompt = prompt_part_1 + prompt_part_2 + prompt_part_3 + prompt_part_4 + prompt_part_5\n", "pprint(prompt)" ], - "id": "469615267e34f4b7", "outputs": [ { "data": { @@ -684,18 +718,18 @@ "execution_count": 20 }, { + "cell_type": "code", + "id": "777666bb5f08fab", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.881913Z", - "start_time": "2024-07-19T15:18:27.631532Z" + "end_time": "2025-03-01T16:03:39.042845Z", + "start_time": "2025-03-01T16:03:37.490333Z" } }, - "cell_type": "code", "source": [ - "num_tokens = gen_num_tokens(text=prompt, encoding='r50k_base')\n", + "num_tokens = gen_num_tokens(text=prompt, encoding=\"r50k_base\")\n", "print(f\"Number of tokens sent to the API: {num_tokens}\")" ], - "id": "777666bb5f08fab", "outputs": [ { "name": "stdout", @@ -708,40 +742,39 @@ "execution_count": 21 }, { + "cell_type": "code", + "id": "fdcaa76ede6692a7", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:33.990875Z", - "start_time": "2024-07-19T15:18:27.883215Z" + "end_time": "2025-03-01T16:03:44.773958Z", + "start_time": "2025-03-01T16:03:39.049094Z" } }, - "cell_type": "code", "source": [ - "draft_letter = connector.chat(prompt=prompt, \n", - " model=LLM,\n", - " temperature=0.1,\n", - " max_tokens=512)" + "draft_letter = connector.chat(prompt=prompt, model=LLM, temperature=0.1, max_tokens=512)" ], - "id": "fdcaa76ede6692a7", "outputs": [], "execution_count": 22 }, { + "cell_type": "code", + "id": "7fdc9e1c780ff8fc", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:34.001718Z", - "start_time": "2024-07-19T15:18:33.993466Z" + "end_time": "2025-03-01T16:03:44.788880Z", + "start_time": "2025-03-01T16:03:44.786983Z" } }, - "cell_type": "code", - "source": "pprint(draft_letter)", - "id": "7fdc9e1c780ff8fc", + "source": [ + "pprint(draft_letter)" + ], "outputs": [ { "data": { "text/plain": [ "" ], - "text/markdown": "[Your Name] \n[Your Address] \n[City, Zip Code] \n[Your Email] \n[Your Phone Number] \n[Date] \n\nHiring Manager \nOSI Assistance Foundation - Armenian Branch Office \n1 Pushkin Str., apt. 2 \nYerevan, Armenia \n\nDear Hiring Manager,\n\nI am writing to express my interest in the Chief Accountant/Finance Assistant position at the OSI Assistance Foundation, as advertised. With a university degree in finance and over a year of experience working in an international organization, I am confident in my ability to contribute effectively to your team.\n\nIn my previous role at [Your Previous Company], I was responsible for managing financial transactions, including grant payments and administrative expenses. This experience has equipped me with a solid understanding of International Accounting Standards (IAS) and Armenian taxation laws, as well as the reporting requirements necessary for compliance. I have developed strong organizational skills that enable me to handle multiple tasks efficiently while maintaining attention to detail.\n\nI am proficient in MS Excel and MS Access, which I have used extensively for data analysis and financial reporting. My ability to quickly learn new software and adapt to changing environments has been a key factor in my success. I pride myself on my discretion and ability to handle confidential information with the utmost professionalism.\n\nFluency in English, Armenian, and Russian allows me to communicate effectively with diverse stakeholders, enhancing collaboration and understanding within the team. 
I am self-motivated and committed to setting and achieving goals, which I believe aligns well with the values of the OSI Assistance Foundation.\n\nI am excited about the opportunity to bring my skills and experience to your organization and contribute to the important work you do. Thank you for considering my application. I look forward to the possibility of discussing how I can support the financial operations of the OSI Assistance Foundation.\n\nSincerely,\n\n[Your Name]" + "text/markdown": "[Your Name] \n[Your Address] \n[City, State, Zip] \n[Your Email] \n[Your Phone Number] \n[Date] \n\nHiring Manager \nOSI Assistance Foundation - Armenian Branch Office \n1 Pushkin Str., apt. 2 \nYerevan, Armenia \n\nDear Hiring Manager,\n\nI am writing to express my interest in the Chief Accountant/Finance Assistant position at the OSI Assistance Foundation, as advertised. With a university degree in finance and over a year of experience in an international organization, I am confident in my ability to contribute effectively to your team.\n\nIn my previous role at [Your Previous Company], I was responsible for managing financial transactions, including grant payments and administrative expenses. This experience has equipped me with a solid understanding of International Accounting Standards (IAS) and Armenian taxation laws, as well as the reporting requirements necessary for compliance. My strong organizational skills have allowed me to maintain accurate financial records while ensuring timely reporting and adherence to deadlines.\n\nI am proficient in MS Excel and MS Access, which I have used extensively for data analysis and financial reporting. My ability to quickly learn new software and adapt to changing environments has been a key asset in my previous positions. I pride myself on my discretion and ability to handle confidential information with the utmost professionalism.\n\nFluency in English, Armenian, and Russian enables me to communicate effectively with diverse stakeholders, fostering collaboration and understanding within the team. I am self-motivated and goal-oriented, always striving to improve my skills and contribute positively to my workplace.\n\nI am excited about the opportunity to join the OSI Assistance Foundation and support its mission through effective financial management. Thank you for considering my application. 
I look forward to the possibility of discussing how my qualifications align with the needs of your organization.\n\nSincerely,\n\n[Your Name]" }, "metadata": {}, "output_type": "display_data" @@ -750,33 +783,35 @@ "execution_count": 23 }, { + "cell_type": "code", + "id": "72e655639ecf67ec", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:34.016275Z", - "start_time": "2024-07-19T15:18:34.004639Z" + "end_time": "2025-03-01T16:03:44.827391Z", + "start_time": "2025-03-01T16:03:44.825356Z" } }, - "cell_type": "code", "source": [ "# Save the draft letter to a file\n", "output_file = OUTPUT_DIR / f\"draft-letter-using-{LLM}.txt\"\n", - "with io.open(output_file, 'w', encoding='utf-8') as f:\n", + "with io.open(output_file, \"w\", encoding=\"utf-8\") as f:\n", " f.write(draft_letter)" ], - "id": "72e655639ecf67ec", "outputs": [], "execution_count": 24 }, { + "cell_type": "code", + "id": "58c6a81261120d3a", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:34.023149Z", - "start_time": "2024-07-19T15:18:34.018419Z" + "end_time": "2025-03-01T16:03:44.872718Z", + "start_time": "2025-03-01T16:03:44.870933Z" } }, - "cell_type": "code", - "source": "print(\"Done! The draft letter is saved to the file: \", output_file)", - "id": "58c6a81261120d3a", + "source": [ + "print(\"Done! The draft letter is saved to the file: \", output_file)" + ], "outputs": [ { "name": "stdout", diff --git a/pyproject.toml b/pyproject.toml index 9911a83..3f74a2e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,12 @@ [tool.poetry] name = "easy-letters" -version = "0.1.8" -description = "A Python package for generating draft application letters using generative AI" +version = "0.1.9" +description = "A Python library for generating draft application letters using generative AI" authors = ["Hassan Abedi "] +maintainers = ["Hassan Abedi "] readme = "README.md" packages = [{ include = "easy_letters", from = "." 
}] -include = ["README.md", "LICENSE"] +include = ["README.md"] license = "MIT" repository = "https://github.com/habedi/easy-letters" @@ -15,20 +16,95 @@ openai = "^1.16.1" qdrant-client = "^1.8.2" [tool.poetry.group.dev.dependencies] -jupyter = "^1.0.0" -pytest = "^8.2.2" -black = ">=24.4.2,<26.0.0" -pytest-cov = ">=5,<7" poetry-dynamic-versioning = "^1.4.0" -tiktoken = ">=0.7,<0.10" -pandas = "^2.2.2" +pytest = "^8.2.2" pytest-mock = "^3.14.0" +pytest-cov = "^6.0.0" +pandas = "^2.2.2" +jupyter = "^1.0.0" +tiktoken = "^0.9.0" +ruff = "^0.9.9" +mypy = "^1.15.0" +icecream = "^2.1.4" +numpy = "^2.2.3" +notebook = "^7.3.2" + +#[build-system] +#requires = ["poetry-core"] +#build-backend = "poetry.core.masonry.api" + +#[build-system] +#requires = ["pdm-backend"] +#build-backend = "pdm.backend" [build-system] -requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.pytest.ini_options] +pythonpath = [".", "easy_letters", "notebooks"] +addopts = "tests/ --cov --doctest-modules --cov-report=xml -s" + +[tool.mypy] +python_version = "3.10" +ignore_missing_imports = true +disallow_untyped_calls = true +strict_optional = true +warn_redundant_casts = true [tool.poetry-dynamic-versioning] enable = true vcs = "git" versioning = "semver" # Semantic Versioning + +# Ruff configuration +[tool.ruff] +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "venv" +] +line-length = 100 +indent-width = 4 +src = ["src", "tests", "bin", "notebooks"] +target-version = "py310" + +[tool.ruff.lint] +select = ["ANN", "D", "E", "F", "I"] +ignore = [ + # Ignore missing docstrings + "D100", "D101", "D102", "D103", "D104", "D105", "D106", "D107", +] +fixable = ["ALL"] +unfixable = [] +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = [] diff --git a/tests/__init__.py b/tests/__init__.py index 8ac1ecd..7c3cd66 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,4 +1,4 @@ import os from pathlib import Path -TEST_DATA_DIR = Path(os.path.join(os.path.dirname(__file__), 'test_data')) +TEST_DATA_DIR = Path(os.path.join(os.path.dirname(__file__), "test_data")) diff --git a/tests/shared.py b/tests/shared.py new file mode 100644 index 0000000..1275834 --- /dev/null +++ b/tests/shared.py @@ -0,0 +1,39 @@ +import numpy as np + +from easy_letters import Ranker + +# Sample documents and their embeddings for testing +documents_with_embeddings = { + "text": ["Document 1", "Document 2"], + "embedding": [np.array([0.1, 0.2, 0.3]), np.array([0.4, 0.5, 0.6])], +} + +# Sample embedding to search for similar documents +embedding_to_search = np.array([0.1, 0.2, 0.3]) + +# The expected response (score is Cosine similarity) +search_response = [ + {"id": 0, "score": 1.0, "payload": {"text": "Document 1"}}, + {"id": 1, "score": 0.9746, "payload": {"text": "Document 2"}}, +] + + +def test_make_collection(): + """ + Test the make_collection method of the Ranker class. + This test checks if the collection is created successfully with the correct + parameters. 
+ """ + # Arrange + ranker = Ranker() + collection_name = "test_collection" + + # Act + ranker.make_collection(documents_with_embeddings, collection_name) + + # Assert + coll = ranker.client.get_collection(collection_name) + assert coll is not None + assert coll.points_count == 2 + assert coll.config.params.vectors.size == 3 + assert coll.config.params.vectors.distance == "Cosine" diff --git a/tests/test_connectors.py b/tests/test_connectors.py index d7abcdd..b1f4123 100644 --- a/tests/test_connectors.py +++ b/tests/test_connectors.py @@ -1,42 +1,8 @@ import numpy as np +from icecream import ic from easy_letters import Ranker - -# Sample documents and their embeddings for testing -documents_with_embeddings = { - 'text': ["Document 1", "Document 2"], - 'embedding': [np.array([0.1, 0.2, 0.3]), np.array([0.4, 0.5, 0.6])] -} - -# Sample embedding to search for similar documents -embedding_to_search = np.array([0.1, 0.2, 0.3]) - -# The expected response (score is Cosine similarity) -search_response = [ - {"id": 0, "score": 1.0, "payload": {"text": "Document 1"}}, - {"id": 1, "score": 0.9746, "payload": {"text": "Document 2"}} -] - - -def test_make_collection(): - """ - Test the make_collection method of the Ranker class. - This test checks if the collection is created successfully with the correct - parameters. - """ - # Arrange - ranker = Ranker() - collection_name = "test_collection" - - # Act - ranker.make_collection(documents_with_embeddings, collection_name) - - # Assert - coll = ranker.client.get_collection(collection_name) - assert coll is not None - assert coll.points_count == 2 - assert coll.config.params.vectors.size == 3 - assert coll.config.params.vectors.distance == "Cosine" +from tests.shared import documents_with_embeddings, embedding_to_search, search_response def test_find_similar(): @@ -54,6 +20,9 @@ def test_find_similar(): response = ranker.find_similar(embedding_to_search, collection_name, 2) # Assert - assert response[1].id == search_response[1]["id"] - assert np.isclose(response[1].score, search_response[1]["score"], atol=1e-4) - assert response[1].payload == search_response[1]["payload"] + ic(response) + ic(search_response) + ic(response[1]) + assert response[1]['id'] == search_response[1]["id"] + assert np.isclose(response[1]['score'], search_response[1]["score"], atol=1e-4) + assert response[1]['payload'] == search_response[1]["payload"] diff --git a/tests/test_data/README.md b/tests/test_data/README.md index 2cd50dd..4823a72 100644 --- a/tests/test_data/README.md +++ b/tests/test_data/README.md @@ -1,6 +1,6 @@ -# Data for Testing +# Datasets for Tests -The files in the `sample_application_letters` directory are +The files in the [sample_letters](sample_letters/) directory are from [here](https://huggingface.co/datasets/ShashiVish/cover-letter-dataset). -The files in the `sample_job_ads` directory are from [here](https://www.kaggle.com/datasets/madhab/jobposts). +The files in the [sample_ads](sample_ads/) directory are from [here](https://www.kaggle.com/datasets/madhab/jobposts). 
diff --git a/tests/test_similarity_search.py b/tests/test_similarity_search.py
index 5dfa5ab..b1f4123 100644
--- a/tests/test_similarity_search.py
+++ b/tests/test_similarity_search.py
@@ -1,42 +1,8 @@
 import numpy as np
+from icecream import ic

 from easy_letters import Ranker
-
-# Sample documents and their embeddings for testing
-documents_with_embeddings = {
-    'text': ["Document 1", "Document 2"],
-    'embedding': [np.array([0.1, 0.2, 0.3]), np.array([0.4, 0.5, 0.6])]
-}
-
-# Sample embedding to search for similar documents
-embedding_to_search = np.array([0.1, 0.2, 0.3])
-
-# The expected response (score is Cosine similarity)
-search_response = [
-    {"id": 0, "score": 1.0, "payload": {"text": "Document 1"}},
-    {"id": 1, "score": 0.9746, "payload": {"text": "Document 2"}}
-]
-
-
-def test_make_collection():
-    """
-    Test the make_collection method of the Ranker class.
-    This test checks if the collection is created successfully with the correct
-    parameters.
-    """
-    # Arrange
-    ranker = Ranker()
-    collection_name = "test_collection"
-
-    # Act
-    ranker.make_collection(documents_with_embeddings, collection_name)
-
-    # Assert
-    coll = ranker.client.get_collection(collection_name)
-    assert coll is not None
-    assert coll.points_count == 2
-    assert coll.config.params.vectors.size == 3
-    assert coll.config.params.vectors.distance == "Cosine"
+from tests.shared import documents_with_embeddings, embedding_to_search, search_response


 def test_find_similar():
@@ -54,6 +20,9 @@ def test_find_similar():
     response = ranker.find_similar(embedding_to_search, collection_name, 2)

     # Assert
-    assert response[1].id == search_response[1]["id"]
-    assert np.isclose(response[1].score, search_response[1]["score"], atol=1e-4)
-    assert response[1].payload == search_response[1]["payload"]
+    ic(response)
+    ic(search_response)
+    ic(response[1])
+    assert response[1]["id"] == search_response[1]["id"]
+    assert np.isclose(response[1]["score"], search_response[1]["score"], atol=1e-4)
+    assert response[1]["payload"] == search_response[1]["payload"]
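For orientation, the flow both test modules exercise looks roughly like this; a minimal sketch that assumes, as the updated assertions imply, that `Ranker()` takes no arguments and that `find_similar` returns dict-like hits keyed by "id", "score", and "payload":

    from easy_letters import Ranker
    from tests.shared import documents_with_embeddings, embedding_to_search

    ranker = Ranker()
    ranker.make_collection(documents_with_embeddings, "demo_collection")

    # Ask for the two nearest documents, exactly as test_find_similar does
    hits = ranker.find_similar(embedding_to_search, "demo_collection", 2)
    for hit in hits:
        print(hit["id"], round(hit["score"], 4), hit["payload"]["text"])
    # Expected, per search_response: 0 1.0 Document 1, then 1 0.9746 Document 2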