From 9e70db45ed9d766cd21c2dd54ec13e8227570336 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 19:17:40 +0000 Subject: [PATCH 1/8] Update black requirement from ^24.4.2 to >=24.4.2,<26.0.0 Updates the requirements on [black](https://github.com/psf/black) to permit the latest version. - [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/24.4.2...25.1.0) --- updated-dependencies: - dependency-name: black dependency-type: direct:development ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d36dcd1..fb68053 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ qdrant-client = "^1.8.2" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" pytest = "^8.2.2" -black = "^24.4.2" +black = ">=24.4.2,<26.0.0" pytest-cov = ">=5,<7" poetry-dynamic-versioning = "^1.4.0" tiktoken = ">=0.7,<0.9" From cfbc5044d42c69682f0c309f90a819d6abdb3028 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Feb 2025 19:14:03 +0000 Subject: [PATCH 2/8] Update tiktoken requirement from >=0.7,<0.9 to >=0.7,<0.10 Updates the requirements on [tiktoken](https://github.com/openai/tiktoken) to permit the latest version. - [Release notes](https://github.com/openai/tiktoken/releases) - [Changelog](https://github.com/openai/tiktoken/blob/main/CHANGELOG.md) - [Commits](https://github.com/openai/tiktoken/compare/0.7.0...0.9.0) --- updated-dependencies: - dependency-name: tiktoken dependency-type: direct:development ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fb68053..9911a83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ pytest = "^8.2.2" black = ">=24.4.2,<26.0.0" pytest-cov = ">=5,<7" poetry-dynamic-versioning = "^1.4.0" -tiktoken = ">=0.7,<0.9" +tiktoken = ">=0.7,<0.10" pandas = "^2.2.2" pytest-mock = "^3.14.0" From 561abe82011357c0cfd3f69311babf97235f8b6f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Oct 2024 19:30:09 +0000 Subject: [PATCH 3/8] Update tiktoken requirement from ^0.7.0 to >=0.7,<0.9 Updates the requirements on [tiktoken](https://github.com/openai/tiktoken) to permit the latest version. - [Release notes](https://github.com/openai/tiktoken/releases) - [Changelog](https://github.com/openai/tiktoken/blob/main/CHANGELOG.md) - [Commits](https://github.com/openai/tiktoken/compare/0.7.0...0.8.0) --- updated-dependencies: - dependency-name: tiktoken dependency-type: direct:development ... 
Signed-off-by: dependabot[bot] --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9911a83..5f6d092 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,10 +17,10 @@ qdrant-client = "^1.8.2" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" pytest = "^8.2.2" -black = ">=24.4.2,<26.0.0" -pytest-cov = ">=5,<7" +black = "^24.4.2" +pytest-cov = "^5.0.0" poetry-dynamic-versioning = "^1.4.0" -tiktoken = ">=0.7,<0.10" +tiktoken = ">=0.7,<0.9" pandas = "^2.2.2" pytest-mock = "^3.14.0" From b36a2d412d2702c24cc2927a35330be1ab71a93a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 4 Nov 2024 19:43:37 +0000 Subject: [PATCH 4/8] Update pytest-cov requirement from ^5.0.0 to >=5,<7 Updates the requirements on [pytest-cov](https://github.com/pytest-dev/pytest-cov) to permit the latest version. - [Changelog](https://github.com/pytest-dev/pytest-cov/blob/master/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest-cov/compare/v5.0.0...v6.0.0) --- updated-dependencies: - dependency-name: pytest-cov dependency-type: direct:development ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5f6d092..d36dcd1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ qdrant-client = "^1.8.2" jupyter = "^1.0.0" pytest = "^8.2.2" black = "^24.4.2" -pytest-cov = "^5.0.0" +pytest-cov = ">=5,<7" poetry-dynamic-versioning = "^1.4.0" tiktoken = ">=0.7,<0.9" pandas = "^2.2.2" From 535320e732939663d139eaec56145ea15c2afb96 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 19:17:40 +0000 Subject: [PATCH 5/8] Update black requirement from ^24.4.2 to >=24.4.2,<26.0.0 Updates the requirements on [black](https://github.com/psf/black) to permit the latest version. - [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/24.4.2...25.1.0) --- updated-dependencies: - dependency-name: black dependency-type: direct:development ... Signed-off-by: dependabot[bot] --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d36dcd1..fb68053 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ qdrant-client = "^1.8.2" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" pytest = "^8.2.2" -black = "^24.4.2" +black = ">=24.4.2,<26.0.0" pytest-cov = ">=5,<7" poetry-dynamic-versioning = "^1.4.0" tiktoken = ">=0.7,<0.9" From 89005a23438a5d18bf1be0ca627c7466f957d290 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Feb 2025 19:14:03 +0000 Subject: [PATCH 6/8] Update tiktoken requirement from >=0.7,<0.9 to >=0.7,<0.10 Updates the requirements on [tiktoken](https://github.com/openai/tiktoken) to permit the latest version. - [Release notes](https://github.com/openai/tiktoken/releases) - [Changelog](https://github.com/openai/tiktoken/blob/main/CHANGELOG.md) - [Commits](https://github.com/openai/tiktoken/compare/0.7.0...0.9.0) --- updated-dependencies: - dependency-name: tiktoken dependency-type: direct:development ... 
Signed-off-by: dependabot[bot]
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index fb68053..9911a83 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ pytest = "^8.2.2"
 black = ">=24.4.2,<26.0.0"
 pytest-cov = ">=5,<7"
 poetry-dynamic-versioning = "^1.4.0"
-tiktoken = ">=0.7,<0.9"
+tiktoken = ">=0.7,<0.10"
 pandas = "^2.2.2"
 pytest-mock = "^3.14.0"

From d77f999a1084f1d0292f086dbe394f5daa0e8984 Mon Sep 17 00:00:00 2001
From: Hassan Abedi
Date: Sat, 15 Feb 2025 05:32:07 +0100
Subject: [PATCH 7/8] The first revision

---
 .editorconfig                             |  4 +-
 .gitattributes                            |  3 ++
 .../{publish_to_pypi.yml => publish.yml}  |  8 ++--
 .github/workflows/tests.yml               | 15 +++----
 README.md                                 | 41 ++++++++++---------
 easy_letters/similarity_search.py         |  6 +--
 notebooks/demo_openai.ipynb               | 35 ++++++++--------
 pyproject.toml                            | 32 ++++++++++-----
 tests/shared.py                           | 39 ++++++++++++++++++
 tests/test_connectors.py                  | 39 ++----------------
 tests/test_similarity_search.py           | 39 ++----------------
 11 files changed, 121 insertions(+), 140 deletions(-)
 rename .github/workflows/{publish_to_pypi.yml => publish.yml} (78%)
 create mode 100644 tests/shared.py

diff --git a/.editorconfig b/.editorconfig
index 09c3767..dbb7086 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -6,7 +6,7 @@ root = true
 # Python specific settings, complying with PEP 8 style guide
 [*.py]
 indent_size = 4
-max_line_length = 80
+max_line_length = 100

 # Markdown files
 [*.md]
@@ -21,7 +21,7 @@ indent_size = 2
 indent_size = 2

 # YAML files
-[*.yml]
+[*.{yaml,yml}]
 indent_size = 2

 # JSON files

diff --git a/.gitattributes b/.gitattributes
index fe04a12..ba88579 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -6,3 +6,6 @@
 *.gif filter=lfs diff=lfs merge=lfs -text
 *.csv filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
+
+# Exclude files from language stats (GitHub Linguist)
+*.ipynb linguist-vendored

diff --git a/.github/workflows/publish_to_pypi.yml b/.github/workflows/publish.yml
similarity index 78%
rename from .github/workflows/publish_to_pypi.yml
rename to .github/workflows/publish.yml
index 5018e68..aa66bb8 100644
--- a/.github/workflows/publish_to_pypi.yml
+++ b/.github/workflows/publish.yml
@@ -2,10 +2,13 @@ name: Publish to PyPI

 on:
   workflow_dispatch: # Enable manual runs
+  push:
+    tags:
+      - 'v*' # Run on version tags

 jobs:

-  # Run tests before publishing
+  # Run the tests before publishing to PyPI
   call_tests:
     uses: ./.github/workflows/tests.yml

@@ -30,10 +33,6 @@ jobs:
         run: |
           poetry install

-      # - name: Update Version
-      #   run: |
-      #     poetry version patch # Use 'minor' or 'major' for minor or major version bumps
-
       - name: Build and Publish Package
         run: |
           poetry config pypi-token.pypi ${{ secrets.PYPI_API_TOKEN }}

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 7bc6653..1552493 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -1,9 +1,6 @@
 name: Tests

 on:
-  # push:
-  #   branches:
-  #     - main
   workflow_dispatch: # Enable manual runs
   workflow_call: # Make this workflow available to be called by other workflows

@@ -13,7 +10,7 @@ jobs:
     strategy:
       matrix:
         python-version: [ "3.10", "3.11", "3.12", "3.13" ]

     steps:
       - name: Checkout Repository
@@ -37,11 +34,11 @@ jobs:
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
         run: |
-          poetry run pytest tests/ --cov --doctest-modules --junitxml=junit/test-results-${{ matrix.python-version }}.xml
+          poetry run pytest tests/ --cov --doctest-modules --cov-report=xml
       continue-on-error: false

-      - name: Upload Test Results
-        uses: actions/upload-artifact@v4
+      - name: Upload Coverage Reports to Codecov
+        uses: codecov/codecov-action@v5
        with:
-          name: pytest-results-${{ matrix.python-version }}
-          path: junit/test-results-${{ matrix.python-version }}.xml
+          token: ${{ secrets.CODECOV_TOKEN }}
+

diff --git a/README.md b/README.md
index 15e0a6d..4368c2d 100644
--- a/README.md
+++ b/README.md
@@ -1,19 +1,20 @@
-# Easy Letters
+## Easy Letters

-[![PyPI version](https://badge.fury.io/py/easy-letters.svg)](https://badge.fury.io/py/easy-letters)
-[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-[![python version](https://img.shields.io/badge/Python-%3E=3.10-blue)](https://github.com/habedi/easy-letters)
-[![pip downloads](https://img.shields.io/pypi/dm/easy-letters.svg)](https://pypi.org/project/easy-letters/)
 [![Tests](https://github.com/habedi/easy-letters/actions/workflows/tests.yml/badge.svg)](https://github.com/habedi/easy-letters/actions/workflows/tests.yml)
 [![CodeFactor](https://www.codefactor.io/repository/github/habedi/easy-letters/badge)](https://www.codefactor.io/repository/github/habedi/easy-letters)
+[![python version](https://img.shields.io/badge/Python-%3E=3.10-blue)](https://github.com/habedi/easy-letters)
+[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+[![PyPI version](https://badge.fury.io/py/easy-letters.svg)](https://badge.fury.io/py/easy-letters)
+[![pip downloads](https://img.shields.io/pypi/dm/easy-letters.svg)](https://pypi.org/project/easy-letters/)

-Easy Letters is a Python package that helps job seekers write application letters. A simple retrieval
-augmented generation (RAG) pipeline is used to generate the letters. The user can then edit the draft letter to suit
-their needs.
+Easy Letters is a Python library that can help job seekers write application letters.
+Currently, it provides the basic building blocks for creating a simple retrieval augmented generation (RAG) pipeline
+to generate application letter drafts.
+The user can then edit the draft letter to suit their needs.

-See the `notebooks/README.md` file for how easy letters works.
+See the [notebooks/README.md](notebooks/README.md) file for how it works.

-## 🔧 Installation
+### 🔧 Installation

 You can install Easy Letters using pip:

 ```bash
 pip install easy-letters
 ```

-## 🚀 Getting Started
+### 🚀 Getting Started

-### API Key Setup
+#### API Key Setup

-At the moment, Easy Letters gets the API key for supported services from the environment variables.
+Easy Letters gets the API key for supported services (like OpenAI) from the environment variables.
 So you need to set the following environment variables to be able to use Easy Letters:

 - `OPENAI_API_KEY`: The OpenAI API key (required)

-### Sample Notebooks
+#### Sample Notebooks

-You can find Jupyter notebooks with example code in the `notebooks` directory.
+You can find Jupyter notebooks with example code in the [notebooks](notebooks/) directory.
 The notebooks demonstrate how to use Easy Letters to generate application letter drafts.
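+
+#### Quick Usage Sketch
+
+A minimal sketch of the connector API (the input texts and parameter values below are just
+placeholders; see the sample notebooks for a complete, end-to-end pipeline):
+
+```python
+import os
+
+from easy_letters import OpenAIConnector, LanguageModels, EmbeddingModels
+
+# The API key is read from the environment, as described above
+connector = OpenAIConnector(api_key=os.environ["OPENAI_API_KEY"])
+
+# Embed a document; the result is one numpy array per input document
+vectors = connector.embed(documents=["An old application letter ..."],
+                          model=EmbeddingModels.OPENAPI_EMS)
+print(vectors[0].shape)
+
+# Ask a chat model for a short draft
+draft = connector.chat(prompt="Write a short application letter for ...",
+                       model=LanguageModels.OPENAI_GPT4OMINI,
+                       temperature=0.1,
+                       max_tokens=512)
+print(draft)
+```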
-### Supported Models +#### Supported Models Easy Letters currently supports the following models: @@ -48,12 +49,12 @@ Easy Letters currently supports the following models: | Text Embedding 3 (Small Variant) | Text Embedding | | Text Embedding 3 (Large Variant) | Text Embedding | -### Installing from Source +#### Installing from Source You can also install Easy Letters from the source code in this repository. The main benefit of this approach is that you might find it easier to run the sample notebooks and modify the code as you wish this way. -After cloning this repository, you can navigate to the `easy-letters` directory and install the +After cloning this repository, you can navigate to the directory where you cloned the repository and install the dependencies using [Poetry](https://python-poetry.org/): ```bash @@ -63,7 +64,7 @@ git clone https://github.com/habedi/easy-letters.git && cd easy-letters poetry install --with dev ``` -### Running the Unit Tests with Coverage +#### Running Tests with Coverage You can run the unit tests with coverage using the following command: @@ -71,7 +72,7 @@ You can run the unit tests with coverage using the following command: poetry run pytest tests/ --cov=easy_letters ``` -## 📝 TODO +### 📝 TODO - [ ] Add support for Anthropic models and API - [ ] Add support for locally served models via Ollama diff --git a/easy_letters/similarity_search.py b/easy_letters/similarity_search.py index 92966ae..7408c94 100644 --- a/easy_letters/similarity_search.py +++ b/easy_letters/similarity_search.py @@ -67,6 +67,6 @@ def find_similar(self, embedding, collection_name="letters", top_k=5, Returns: list: A list of search results with similar documents. """ - return self.client.search(collection_name=collection_name, - query_vector=embedding, - limit=top_k, score_threshold=min_similarity) + return self.client.query_points(collection_name=collection_name, + query=embedding, + limit=top_k, score_threshold=min_similarity) diff --git a/notebooks/demo_openai.ipynb b/notebooks/demo_openai.ipynb index bd7057f..758e5b4 100644 --- a/notebooks/demo_openai.ipynb +++ b/notebooks/demo_openai.ipynb @@ -33,15 +33,14 @@ "source": [ "import io\n", "import os\n", - "import tiktoken\n", + "from pathlib import Path\n", "\n", "import pandas as pd\n", + "import tiktoken\n", + "from IPython.display import display, Markdown\n", "\n", - "from easy_letters import OpenAIConnector, Ranker\n", "from easy_letters import LanguageModels, EmbeddingModels\n", - "\n", - "from IPython.display import display, Markdown\n", - "from pathlib import Path" + "from easy_letters import OpenAIConnector, Ranker" ], "outputs": [], "execution_count": 1 @@ -69,7 +68,7 @@ "cell_type": "code", "source": [ "DATA_DIR = Path(\"../tests/test_data\")\n", - "LETTERS_DIR = DATA_DIR/ \"sample_letters\"\n", + "LETTERS_DIR = DATA_DIR / \"sample_letters\"\n", "SAMPLE_JOB_AD = DATA_DIR / \"sample_ads/description_6.text\"\n", "\n", "OUTPUT_DIR = Path(\"./output\")\n", @@ -105,7 +104,7 @@ "source": [ "class DocumentLoader:\n", " \"\"\"A class to load documents from files.\"\"\"\n", - " \n", + "\n", " @staticmethod\n", " def _read_txt(path: Path) -> str:\n", " with io.open(path, 'r', encoding='utf-8') as f:\n", @@ -115,7 +114,7 @@ " \"\"\"Load all the documents in a directory with a specific extension into a DataFrame.\"\"\"\n", " documents = []\n", " documents_ids = []\n", - " \n", + "\n", " ext = '.' 
+ ext.lower().lstrip('.')\n", " for file in path.glob(f'*{ext}'):\n", " if ext in ('.txt', '.text'):\n", @@ -333,7 +332,7 @@ "cell_type": "code", "source": [ "letters_with_embeddings_df = application_letters_df.copy()\n", - "letters_with_embeddings_df['embedding'] = connector.embed(documents=application_letters_df['text'], \n", + "letters_with_embeddings_df['embedding'] = connector.embed(documents=application_letters_df['text'],\n", " model=EmbeddingModels.OPENAPI_EMS)" ], "metadata": { @@ -519,7 +518,7 @@ }, "cell_type": "code", "source": [ - "sample_job_ad_embedded = connector.embed(documents=[sample_job_ad], \n", + "sample_job_ad_embedded = connector.embed(documents=[sample_job_ad],\n", " model=EmbeddingModels.OPENAPI_EMS)" ], "id": "2d61db3285bf245c", @@ -560,7 +559,7 @@ }, "cell_type": "code", "source": [ - "most_relevant_letters = ranker.find_similar(sample_job_ad_embedded[0], top_k=3, min_similarity=0.1)\n", + "most_relevant_letters = ranker.find_similar(sample_job_ad_embedded[0], top_k=3, min_similarity=0.1)\n", "\n", "most_relevant_letters" ], @@ -591,7 +590,7 @@ "cell_type": "code", "source": [ "for letter in most_relevant_letters:\n", - " print(\"=\"*80)\n", + " print(\"=\" * 80)\n", " print(f\"Letter ID: {letter.id}\")\n", " print(f\"Similarity: {letter.score:.2f}\")\n", " print(f\"Letter Text:\\n{letter.payload['text']}\")\n", @@ -657,9 +656,9 @@ "prompt_part_1 = \"I'm applying for a job with this description:\\n\\n\"\n", "prompt_part_2 = \"#START OF JOB AD\\n\\n\" + sample_job_ad + \"\\n\\n#END OF JOB AD\"\n", "prompt_part_3 = \"\\n\\nI need to submit a application letter with my CV. Here is a few examples of my previous application letters:\\n\\n\"\n", - "prompt_part_4 = \"\\n\\n\".join([('#START OF EXAMPLE APPLICATION LETTER\\n\\n'+\n", + "prompt_part_4 = \"\\n\\n\".join([('#START OF EXAMPLE APPLICATION LETTER\\n\\n' +\n", " t.payload['text'] + '\\n\\n#END OF EXAMPLE APPLICATION LETTER')\n", - " for t in most_relevant_letters])\n", + " for t in most_relevant_letters])\n", "# Extra information for the prompt\n", "prompt_part_5 = (\"\\n\\nWrite a new application letter that is tailored to the job description above. \"\n", " \"Be concise and to the point. The letter should be no longer than 500 words. \"\n", @@ -716,10 +715,10 @@ }, "cell_type": "code", "source": [ - "draft_letter = connector.chat(prompt=prompt, \n", - " model=LLM,\n", - " temperature=0.1,\n", - " max_tokens=512)" + "draft_letter = connector.chat(prompt=prompt,\n", + " model=LLM,\n", + " temperature=0.1,\n", + " max_tokens=512)" ], "id": "fdcaa76ede6692a7", "outputs": [], diff --git a/pyproject.toml b/pyproject.toml index 9911a83..5ffbc3d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,12 @@ [tool.poetry] name = "easy-letters" version = "0.1.8" -description = "A Python package for generating draft application letters using generative AI" +description = "A Python library for generating draft application letters using generative AI" authors = ["Hassan Abedi "] +maintainers = ["Hassan Abedi "] readme = "README.md" packages = [{ include = "easy_letters", from = "." 
}] -include = ["README.md", "LICENSE"] +include = ["README.md"] license = "MIT" repository = "https://github.com/habedi/easy-letters" @@ -13,22 +14,31 @@ repository = "https://github.com/habedi/easy-letters" python = "^3.10" openai = "^1.16.1" qdrant-client = "^1.8.2" +numpy = "^2.2.3" [tool.poetry.group.dev.dependencies] -jupyter = "^1.0.0" +poetry-dynamic-versioning = "^1.4.0" pytest = "^8.2.2" -black = ">=24.4.2,<26.0.0" +pytest-mock = "^3.14.0" pytest-cov = ">=5,<7" -poetry-dynamic-versioning = "^1.4.0" -tiktoken = ">=0.7,<0.10" pandas = "^2.2.2" -pytest-mock = "^3.14.0" - -[build-system] -requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" +jupyter = "^1.0.0" +tiktoken = ">=0.7,<0.10" +ruff = "^0.9.9" [tool.poetry-dynamic-versioning] enable = true vcs = "git" versioning = "semver" # Semantic Versioning + +#[build-system] +#requires = ["poetry-core"] +#build-backend = "poetry.core.masonry.api" + +#[build-system] +#requires = ["pdm-backend"] +#build-backend = "pdm.backend" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" diff --git a/tests/shared.py b/tests/shared.py new file mode 100644 index 0000000..02459c6 --- /dev/null +++ b/tests/shared.py @@ -0,0 +1,39 @@ +import numpy as np + +from easy_letters import Ranker + +# Sample documents and their embeddings for testing +documents_with_embeddings = { + 'text': ["Document 1", "Document 2"], + 'embedding': [np.array([0.1, 0.2, 0.3]), np.array([0.4, 0.5, 0.6])] +} + +# Sample embedding to search for similar documents +embedding_to_search = np.array([0.1, 0.2, 0.3]) + +# The expected response (score is Cosine similarity) +search_response = [ + {"id": 0, "score": 1.0, "payload": {"text": "Document 1"}}, + {"id": 1, "score": 0.9746, "payload": {"text": "Document 2"}} +] + + +def test_make_collection(): + """ + Test the make_collection method of the Ranker class. + This test checks if the collection is created successfully with the correct + parameters. + """ + # Arrange + ranker = Ranker() + collection_name = "test_collection" + + # Act + ranker.make_collection(documents_with_embeddings, collection_name) + + # Assert + coll = ranker.client.get_collection(collection_name) + assert coll is not None + assert coll.points_count == 2 + assert coll.config.params.vectors.size == 3 + assert coll.config.params.vectors.distance == "Cosine" diff --git a/tests/test_connectors.py b/tests/test_connectors.py index d7abcdd..1bbab4d 100644 --- a/tests/test_connectors.py +++ b/tests/test_connectors.py @@ -1,42 +1,8 @@ import numpy as np from easy_letters import Ranker - -# Sample documents and their embeddings for testing -documents_with_embeddings = { - 'text': ["Document 1", "Document 2"], - 'embedding': [np.array([0.1, 0.2, 0.3]), np.array([0.4, 0.5, 0.6])] -} - -# Sample embedding to search for similar documents -embedding_to_search = np.array([0.1, 0.2, 0.3]) - -# The expected response (score is Cosine similarity) -search_response = [ - {"id": 0, "score": 1.0, "payload": {"text": "Document 1"}}, - {"id": 1, "score": 0.9746, "payload": {"text": "Document 2"}} -] - - -def test_make_collection(): - """ - Test the make_collection method of the Ranker class. - This test checks if the collection is created successfully with the correct - parameters. 
- """ - # Arrange - ranker = Ranker() - collection_name = "test_collection" - - # Act - ranker.make_collection(documents_with_embeddings, collection_name) - - # Assert - coll = ranker.client.get_collection(collection_name) - assert coll is not None - assert coll.points_count == 2 - assert coll.config.params.vectors.size == 3 - assert coll.config.params.vectors.distance == "Cosine" +from tests.shared import (documents_with_embeddings, embedding_to_search, + search_response) def test_find_similar(): @@ -54,6 +20,7 @@ def test_find_similar(): response = ranker.find_similar(embedding_to_search, collection_name, 2) # Assert + print(type(response)) assert response[1].id == search_response[1]["id"] assert np.isclose(response[1].score, search_response[1]["score"], atol=1e-4) assert response[1].payload == search_response[1]["payload"] diff --git a/tests/test_similarity_search.py b/tests/test_similarity_search.py index 5dfa5ab..9c41db0 100644 --- a/tests/test_similarity_search.py +++ b/tests/test_similarity_search.py @@ -1,42 +1,8 @@ import numpy as np from easy_letters import Ranker - -# Sample documents and their embeddings for testing -documents_with_embeddings = { - 'text': ["Document 1", "Document 2"], - 'embedding': [np.array([0.1, 0.2, 0.3]), np.array([0.4, 0.5, 0.6])] -} - -# Sample embedding to search for similar documents -embedding_to_search = np.array([0.1, 0.2, 0.3]) - -# The expected response (score is Cosine similarity) -search_response = [ - {"id": 0, "score": 1.0, "payload": {"text": "Document 1"}}, - {"id": 1, "score": 0.9746, "payload": {"text": "Document 2"}} -] - - -def test_make_collection(): - """ - Test the make_collection method of the Ranker class. - This test checks if the collection is created successfully with the correct - parameters. 
- """ - # Arrange - ranker = Ranker() - collection_name = "test_collection" - - # Act - ranker.make_collection(documents_with_embeddings, collection_name) - - # Assert - coll = ranker.client.get_collection(collection_name) - assert coll is not None - assert coll.points_count == 2 - assert coll.config.params.vectors.size == 3 - assert coll.config.params.vectors.distance == "Cosine" +from tests.shared import (documents_with_embeddings, embedding_to_search, + search_response) def test_find_similar(): @@ -54,6 +20,7 @@ def test_find_similar(): response = ranker.find_similar(embedding_to_search, collection_name, 2) # Assert + print("m", response) assert response[1].id == search_response[1]["id"] assert np.isclose(response[1].score, search_response[1]["score"], atol=1e-4) assert response[1].payload == search_response[1]["payload"] From 2b2e08a98231753333bc21afcca4baa0b793dd6e Mon Sep 17 00:00:00 2001 From: Hassan Abedi Date: Sat, 1 Mar 2025 14:14:01 +0100 Subject: [PATCH 8/8] WIP --- .coveragerc | 17 + .editorconfig | 28 +- .gitattributes | 70 +++- .github/workflows/tests.yml | 8 +- LICENSE | 2 +- README.md | 89 ++-- assets/make_figures.sh | 9 + {notebooks/assets => assets}/workflow.dot | 0 assets/workflow.svg | 232 +++++++++++ easy_letters/connectors.py | 67 ++- easy_letters/similarity_search.py | 77 ++-- notebooks/README.md | 6 +- notebooks/assets/workflow.png | 3 - notebooks/demo_openai.ipynb | 478 ++++++++++++---------- pyproject.toml | 84 +++- tests/__init__.py | 2 +- tests/shared.py | 6 +- tests/test_connectors.py | 14 +- tests/test_data/README.md | 6 +- tests/test_similarity_search.py | 14 +- 20 files changed, 822 insertions(+), 390 deletions(-) create mode 100644 .coveragerc create mode 100644 assets/make_figures.sh rename {notebooks/assets => assets}/workflow.dot (100%) create mode 100644 assets/workflow.svg delete mode 100644 notebooks/assets/workflow.png diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..5ce3dbe --- /dev/null +++ b/.coveragerc @@ -0,0 +1,17 @@ +[run] +# Exclude test files and specific init files from the coverage report +omit = + */tests/* + */test_*.py + */__init__.py # Good idea to exclude __init__.py files from the coverage report + +# Include source files only from certain directories +source = + easy_letters + notebooks + +# Set parallel to true if you run tests in parallel +parallel = True + +# Enable branch coverage if set to True +branch = False diff --git a/.editorconfig b/.editorconfig index dbb7086..a13ed49 100644 --- a/.editorconfig +++ b/.editorconfig @@ -3,27 +3,29 @@ # Top-most EditorConfig file root = true -# Python specific settings, complying with PEP 8 style guide +# Global settings (applicable to all files unless overridden) +[*] +charset = utf-8 # Default character encoding +end_of_line = lf # Use LF for line endings (Unix-style) +indent_style = space # Use spaces for indentation +indent_size = 4 # Default indentation size +insert_final_newline = true # Make sure files end with a newline +trim_trailing_whitespace = true # Remove trailing whitespace + +# Python specific settings, complying with PEP 8 style guide, except for the line length [*.py] -indent_size = 4 max_line_length = 100 # Markdown files [*.md] -trim_trailing_whitespace = false - -# Bash scripts -[*.sh] -indent_size = 2 - -# SQL files -[*.sql] -indent_size = 2 +trim_trailing_whitespace = false # Don't remove trailing whitespace in Markdown files +max_line_length = 120 # YAML files [*.{yaml,yml}] indent_size = 2 -# JSON files -[*.json] +# Shell scripts +[*.sh] 
indent_size = 2 +indent_style = tab \ No newline at end of file diff --git a/.gitattributes b/.gitattributes index ba88579..5b08caa 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,11 +1,73 @@ -# Adding the following lines to the .gitattributes file will tell Git to treat the files as binary data. -*.text filter=lfs diff=lfs merge=lfs -text -*.png filter=lfs diff=lfs merge=lfs -text +# Common document and text file formats +*.docx filter=lfs diff=lfs merge=lfs -text +*.doc filter=lfs diff=lfs merge=lfs -text +*.pdf filter=lfs diff=lfs merge=lfs -text +*.djvu filter=lfs diff=lfs merge=lfs -text +*.eps filter=lfs diff=lfs merge=lfs -text +*.odt filter=lfs diff=lfs merge=lfs -text +*.rtf filter=lfs diff=lfs merge=lfs -text +*.ps filter=lfs diff=lfs merge=lfs -text +*.xls filter=lfs diff=lfs merge=lfs -text +*.xlsx filter=lfs diff=lfs merge=lfs -text +*.ppt filter=lfs diff=lfs merge=lfs -text +*.pptx filter=lfs diff=lfs merge=lfs -text + +# Common image formats *.jpg filter=lfs diff=lfs merge=lfs -text *.jpeg filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text *.gif filter=lfs diff=lfs merge=lfs -text -*.csv filter=lfs diff=lfs merge=lfs -text +*.bmp filter=lfs diff=lfs merge=lfs -text +*.tiff filter=lfs diff=lfs merge=lfs -text +*.tif filter=lfs diff=lfs merge=lfs -text +*.svgz filter=lfs diff=lfs merge=lfs -text + +# Common compressed file formats +*.zip filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.7z filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text + +# Common file formats in machine learning projects +*.bin filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.tfrecord filter=lfs diff=lfs merge=lfs -text +*.hdf5 filter=lfs diff=lfs merge=lfs -text +*.keras filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text + +# Common audio and video formats +*.mp3 filter=lfs diff=lfs merge=lfs -text +*.mp4 filter=lfs diff=lfs merge=lfs -text +*.wav filter=lfs diff=lfs merge=lfs -text +*.avi filter=lfs diff=lfs merge=lfs -text +*.mov filter=lfs diff=lfs merge=lfs -text +*.flac filter=lfs diff=lfs merge=lfs -text +*.mkv filter=lfs diff=lfs merge=lfs -text +*.webm filter=lfs diff=lfs merge=lfs -text +*.ogg filter=lfs diff=lfs merge=lfs -text +*.ogv filter=lfs diff=lfs merge=lfs -text + +# Common data transfer formats +#*.csv filter=lfs diff=lfs merge=lfs -text +#*.tsv filter=lfs diff=lfs merge=lfs -text +#*.json filter=lfs diff=lfs merge=lfs -text +#*.xml filter=lfs diff=lfs merge=lfs -text *.parquet filter=lfs diff=lfs merge=lfs -text +*.feather filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.avro filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.orc filter=lfs diff=lfs merge=lfs -text # Exclude files from language stats (GitHub Linguist) *.ipynb linguist-vendored diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1552493..9a5f70a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,4 @@ -name: Tests +name: Run Tests on: workflow_dispatch: # 
Enable manual runs @@ -27,14 +27,14 @@ jobs: - name: Install Dependencies run: | - poetry install --with dev + poetry install --with dev --no-root - - name: Run Tests + - name: Run Tests with Coverage shell: bash env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | - poetry run pytest tests/ --cov --doctest-modules --cov-report=xml + poetry run pytest continue-on-error: false - name: Upload Coverage Reports to Codecov diff --git a/LICENSE b/LICENSE index feb81f7..a2559ba 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 Hassan Abedi +Copyright (c) 2025 Hassan Abedi Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 4368c2d..7997573 100644 --- a/README.md +++ b/README.md @@ -1,78 +1,77 @@ ## Easy Letters [![Tests](https://github.com/habedi/easy-letters/actions/workflows/tests.yml/badge.svg)](https://github.com/habedi/easy-letters/actions/workflows/tests.yml) +[![codecov](https://codecov.io/gh/habedi/easy-letters/graph/badge.svg?token=E47OPB2HVA)](https://codecov.io/gh/habedi/easy-letters) [![CodeFactor](https://www.codefactor.io/repository/github/habedi/easy-letters/badge)](https://www.codefactor.io/repository/github/habedi/easy-letters) [![python version](https://img.shields.io/badge/Python-%3E=3.10-blue)](https://github.com/habedi/easy-letters) -[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) +[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/habedi/easy-letters/blob/main/LICENSE) [![PyPI version](https://badge.fury.io/py/easy-letters.svg)](https://badge.fury.io/py/easy-letters) [![pip downloads](https://img.shields.io/pypi/dm/easy-letters.svg)](https://pypi.org/project/easy-letters/) -Easy Letters is a Python library that can help job seekers write application letters. -Currently, it proves the basic blocks for creating a simple retrieval augmented generation (RAG) pipeline -to generate application letter drafts. -The user can then edit the draft letter to suit their needs. +Easy Letters is a Python library that provides the basic building blocks for creating a naive [retrieval augmented +generation (or RAG)](https://arxiv.org/abs/2312.10997) pipeline to generate application letter drafts. +The main idea is to speed up the process of writing application letters by not starting from scratch. +Instead, an applicant could generate a draft letter that can be used as a starting point and customized as needed +to make the final letter. -See the [notebooks/README.md](notebooks/README.md) file for how it works. +The diagram below shows the high-level workflow of how Easy Letters can be used to generate draft application +letters. -### 🔧 Installation +![Easy Letters Workflow](assets/workflow.svg) -You can install Easy Letters using pip: +### Installation ```bash pip install easy-letters ``` -### 🚀 Getting Started +#### Installing from Source + +You can also install Easy Letters from the source code in this repository. +The main benefit of this approach is that you might find it easier to run the sample notebooks and modify the code as +you wish this way. 
+ +After cloning this repository, you can navigate to the directory where you cloned the repository and install the +dependencies using [Poetry](https://python-poetry.org/): + +```bash +git clone https://github.com/habedi/easy-letters.git && cd easy-letters + +# Install the dependencies using Poetry +poetry install --with dev +``` + +### Getting Started #### API Key Setup -Easy Letters gets the API key for supported services (like OpenAI) from the environment variables. +Easy Letters gets the API key for supported services (like [OpenAI](https://platform.openai.com/)) from the environment +variables. So you need to set the following environment variables to be able to use Easy Letters: -- `OPENAI_API_KEY`: The OpenAI API key (required) +- `OPENAI_API_KEY`: The [OpenAI API key](https://platform.openai.com/docs/api-reference/authentication) (required) #### Sample Notebooks You can find Jupyter notebooks with example code in the [notebooks](notebooks/) directory. The notebooks demonstrate how to use Easy Letters to generate application letter drafts. -#### Supported Models +### Supported Models Easy Letters currently supports the following models: -| Model | Type | -|----------------------------------|-----------------| -| GPT-3.5 Turbo | Text Generation | -| GPT-4 Turbo | Text Generation | -| GPT-4o | Text Generation | -| GPT-4o Mini | Text Generation | -| Text Embedding 3 (Small Variant) | Text Embedding | -| Text Embedding 3 (Large Variant) | Text Embedding | - -#### Installing from Source - -You can also install Easy Letters from the source code in this repository. The main benefit of this approach is that -you might find it easier to run the sample notebooks and modify the code as you wish this way. - -After cloning this repository, you can navigate to the directory where you cloned the repository and install the -dependencies using [Poetry](https://python-poetry.org/): - -```bash -git clone https://github.com/habedi/easy-letters.git && cd easy-letters - -# Assuming you have Poetry installed on your system -poetry install --with dev -``` - -#### Running Tests with Coverage +| Model | Type | Company | +|----------------------------------|-----------------|---------| +| GPT-3.5 Turbo | Text Generation | OpenAI | +| GPT-4 Turbo | Text Generation | OpenAI | +| GPT-4o | Text Generation | OpenAI | +| GPT-4o Mini | Text Generation | OpenAI | +| Text Embedding 3 (Small Variant) | Text Embedding | OpenAI | +| Text Embedding 3 (Large Variant) | Text Embedding | OpenAI | -You can run the unit tests with coverage using the following command: - -```bash -poetry run pytest tests/ --cov=easy_letters -``` +> [!NOTE] +> At the moment, Easy Letters only supports text generation and text embedding models from OpenAI. -### 📝 TODO +### License -- [ ] Add support for Anthropic models and API -- [ ] Add support for locally served models via Ollama +Easy Letters is available under the MIT license ([LICENSE](LICENSE)). 
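+
+### Quick Usage Sketch
+
+A minimal, illustrative sketch of the pipeline in the diagram above (the letters, the job ad, and
+the parameter values are placeholders; the notebooks contain a complete, runnable version):
+
+```python
+import os
+
+from easy_letters import OpenAIConnector, Ranker, LanguageModels, EmbeddingModels
+
+connector = OpenAIConnector(api_key=os.environ["OPENAI_API_KEY"])
+
+# Embed the existing letters and store them in the in-memory vector database
+letters = ["First old application letter ...", "Second old application letter ..."]
+ranker = Ranker()
+ranker.make_collection(
+    {"text": letters,
+     "embedding": connector.embed(documents=letters, model=EmbeddingModels.OPENAPI_EMS)},
+    collection_name="letters",
+)
+
+# Embed the job ad and retrieve the most similar letters
+job_ad = "A job ad for a software engineer position ..."
+ad_vector = connector.embed(documents=[job_ad], model=EmbeddingModels.OPENAPI_EMS)[0]
+matches = ranker.find_similar(ad_vector, collection_name="letters",
+                              top_k=3, min_similarity=0.1)
+
+# Build a prompt from the retrieved letters and ask for a tailored draft
+examples = "\n\n".join(m["payload"]["text"] for m in matches)
+prompt = (f"I'm applying for a job with this description:\n{job_ad}\n\n"
+          f"Here are a few examples of my previous letters:\n{examples}\n\n"
+          "Write a new application letter tailored to the job description.")
+print(connector.chat(prompt=prompt, model=LanguageModels.OPENAI_GPT4OMINI, max_tokens=512))
+```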
diff --git a/assets/make_figures.sh b/assets/make_figures.sh
new file mode 100644
index 0000000..9f679a2
--- /dev/null
+++ b/assets/make_figures.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# You need to have Graphviz installed to run this script
+# On Debian-based systems, you can install it using: sudo apt-get install graphviz
+
+# Make figures from .dot files
+for f in *.dot; do
+  dot -Tsvg $f -o ${f%.dot}.svg
+done

diff --git a/notebooks/assets/workflow.dot b/assets/workflow.dot
similarity index 100%
rename from notebooks/assets/workflow.dot
rename to assets/workflow.dot

diff --git a/assets/workflow.svg b/assets/workflow.svg
new file mode 100644
index 0000000..5a3d987
--- /dev/null
+++ b/assets/workflow.svg
@@ -0,0 +1,232 @@
+[232 lines of SVG markup omitted; the tags were lost in extraction. Recoverable content: a
+Graphviz-rendered workflow diagram with clusters "User Inputs", "Processing", and "Output";
+nodes User, Job Description, Existing Application Letters, Extra Instructions, Embedding Model,
+Vector Database, Language Model, Custom Prompt, and Letter Draft; edges labeled "provides",
+"processed by", "stores in", "augments", "retrieves from", "forms", "generates", and "reviews".]

diff --git a/easy_letters/connectors.py b/easy_letters/connectors.py
index bd26a5b..ba86824 100644
--- a/easy_letters/connectors.py
+++ b/easy_letters/connectors.py
@@ -6,77 +6,72 @@


 class LanguageModels:
-    """List of supported language models."""
-    OPENAI_GPT35TURBO = 'gpt-3.5-turbo'
-    OPENAI_GPT4TURBO = 'gpt-4-turbo'
-    OPENAI_GPT4O = 'gpt-4o'
-    OPENAI_GPT4OMINI = 'gpt-4o-mini'
+    """
+    Supported language models.
+    """
+    OPENAI_GPT35TURBO = "gpt-3.5-turbo"
+    OPENAI_GPT4TURBO = "gpt-4-turbo"
+    OPENAI_GPT4O = "gpt-4o"
+    OPENAI_GPT4OMINI = "gpt-4o-mini"


 class EmbeddingModels:
-    """List of supported text embedding models."""
-    OPENAPI_EMS = 'text-embedding-3-small'
-    OPENAPI_EML = 'text-embedding-3-large'
+    """
+    Supported text embedding models.
+    """
+    OPENAPI_EMS = "text-embedding-3-small"
+    OPENAPI_EML = "text-embedding-3-large"


 class OpenAIConnector:
     """
-    Connector class to interact with OpenAI API for embeddings and
-    chat completions.
+    Connector to interact with the OpenAI API for embeddings and chat completions.

     Attributes:
-        client (openai.Client): The OpenAI client used for API interactions.
+        client (openai.Client): The client used for API calls.
     """

     def __init__(self, api_key: str, **kwargs):
         """
-        Initialize the OpenAIConnector with an API key and optional parameters.
+        Initialize the connector with an API key and extra options.

         Args:
-            api_key (str): The API key for authenticating with the OpenAI API.
-            **kwargs: Additional keyword arguments to pass to the OpenAI client.
+            api_key (str): Your OpenAI API key.
+            **kwargs: Additional parameters for the OpenAI client.
         """
         self.client = openai.Client(api_key=api_key, **kwargs)

-    def embed(self, documents: List[str], model: str) -> List[
-        ndarray[Any, dtype[Any]]]:
+    def embed(self, documents: List[str], model: str) -> List[ndarray[Any, dtype[Any]]]:
         """
-        Generate embeddings for a list of documents using a specified model.
+ Generate embeddings for a list of documents using a given model. Args: - documents (List[str]): A list of documents to embed. + documents (List[str]): Documents to embed. model (str): The model to use for generating embeddings. Returns: - List[ndarray[Any, dtype[Any]]]: A list of numpy arrays containing - the embeddings. + List[ndarray[Any, dtype[Any]]]: A list of numpy arrays with the embeddings. """ - embeddings = self.client.embeddings.create(input=documents, - model=model) + embeddings = self.client.embeddings.create(input=documents, model=model) return [np.array(d.embedding) for d in embeddings.data] - def chat(self, prompt: str, model: str, temperature: float = 0.0, - max_tokens: int = 512) -> str: + def chat(self, prompt: str, model: str, temperature: float = 0.0, max_tokens: int = 512) -> str: """ - Generate a chat completion for a given prompt using a specified model. + Generate a chat response for a given prompt using a specified model. Args: - prompt (str): The input prompt for the chat model. - model (str): The model to use for generating the chat completion. - temperature (float, optional): The sampling temperature. - Defaults to 0.0. - max_tokens (int, optional): The maximum number of tokens for the - model to generate. Defaults to 512. + prompt (str): The prompt text. + model (str): The model to use. + temperature (float, optional): Sampling temperature (defaults to 0.0). + max_tokens (int, optional): Maximum tokens to generate (defaults to 512). Returns: - str: The generated chat response. + str: The chat response text. """ completion = self.client.chat.completions.create( model=model, - messages=[ - {"role": "user", "content": prompt}, - ], + messages=[{"role": "user", "content": prompt}], max_tokens=max_tokens, - temperature=temperature + temperature=temperature, ) return completion.choices[0].message.content diff --git a/easy_letters/similarity_search.py b/easy_letters/similarity_search.py index 7408c94..450ded9 100644 --- a/easy_letters/similarity_search.py +++ b/easy_letters/similarity_search.py @@ -17,56 +17,71 @@ def __init__(self): """ self.client = QdrantClient(":memory:") - def make_collection(self, documents_with_embeddings, - collection_name="letters"): + def make_collection(self, documents_with_embeddings, collection_name="letters"): """ - Create a collection with the given documents and embeddings in the db. + Create a collection with the given documents and embeddings in the database. Args: documents_with_embeddings (dict): A dictionary containing 'text' - and 'embedding' keys with lists of documents and their - corresponding embeddings. + and 'embedding' keys with lists of documents and their + corresponding embeddings. collection_name (str): The name of the collection to create. - Defaults to "letters". + Defaults to "letters". 
Returns: None """ - documents = documents_with_embeddings['text'] - embeddings = documents_with_embeddings['embedding'] + documents = documents_with_embeddings["text"] + embeddings = documents_with_embeddings["embedding"] - points = [PointStruct(id=idx, vector=e, payload={'text': d}) - for idx, (d, e) in enumerate(zip(documents, embeddings))] + points = [ + PointStruct(id=idx, vector=e, payload={"text": d}) + for idx, (d, e) in enumerate(zip(documents, embeddings)) + ] print( f"Creating collection {collection_name} with {len(points)} " - f"points of size {embeddings[0].shape[0]}") - self.client.create_collection(collection_name=collection_name, - vectors_config=VectorParams( - size=embeddings[0].shape[0], - distance=Distance.COSINE)) + f"points of size {embeddings[0].shape[0]}" + ) + self.client.create_collection( + collection_name=collection_name, + vectors_config=VectorParams( + size=embeddings[0].shape[0], distance=Distance.COSINE + ), + ) self.client.upsert(collection_name, points) - def find_similar(self, embedding, collection_name="letters", top_k=5, - min_similarity=0.1): + def find_similar( + self, embedding, collection_name="letters", top_k=5, min_similarity=0.1 + ): """ - Find similar documents in the given collection for the given embedding - and return top k results. + Find similar documents in the given collection for the provided embedding. + + This method queries the Qdrant collection for documents that are most similar + to the provided embedding vector, returning the top k results with a similarity + score above the specified threshold. The response is converted into a list of + dictionaries for easier indexing. Args: - embedding (list or numpy.ndarray): The embedding vector to search - for similar documents. - collection_name (str): The name of the collection to search in. - Defaults to "letters". - top_k (int): The number of top similar documents to return. - Defaults to 5. - min_similarity (float): The minimum similarity score threshold. - Defaults to 0.1. + embedding (list or numpy.ndarray): The embedding vector to search for similar documents. + collection_name (str): The name of the collection to search in. Defaults to "letters". + top_k (int): The maximum number of similar documents to return. Defaults to 5. + min_similarity (float): The minimum similarity score threshold. Defaults to 0.1. Returns: - list: A list of search results with similar documents. + list: A list of dictionaries, each containing: + - "id": The document's identifier. + - "payload": A dictionary with the document content (e.g., {"text": "..."}) + - "score": The similarity score. """ - return self.client.query_points(collection_name=collection_name, - query=embedding, - limit=top_k, score_threshold=min_similarity) + response = self.client.query_points( + collection_name=collection_name, + query=embedding, + limit=top_k, + score_threshold=min_similarity, + ) + return [ + {"id": point.id, "payload": point.payload, "score": point.score} + for point in response.points + ] diff --git a/notebooks/README.md b/notebooks/README.md index b9197da..15b9768 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -1,5 +1,3 @@ -# 📝 Overview +# Notebooks -The diagram below shows the high-level workflow of how Easy Letters generates application letter drafts. - -![Easy Letters Workflow](assets/workflow.png) +This directory contains example Jupyter notebooks that show how to use Easy Letters to make draft application letters. 
diff --git a/notebooks/assets/workflow.png b/notebooks/assets/workflow.png deleted file mode 100644 index a8b66b9..0000000 --- a/notebooks/assets/workflow.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8145c236ea1f263b225ecfa04da20067bbfa3b11b3214a0b7dd5bfdfa5a6629d -size 75834 diff --git a/notebooks/demo_openai.ipynb b/notebooks/demo_openai.ipynb index 758e5b4..b5594cc 100644 --- a/notebooks/demo_openai.ipynb +++ b/notebooks/demo_openai.ipynb @@ -1,8 +1,9 @@ { "cells": [ { - "metadata": {}, "cell_type": "markdown", + "id": "e49b37128a0e12ae", + "metadata": {}, "source": [ "# Workflow Summary\n", "\n", @@ -11,14 +12,13 @@ "3. Find the most relevant (similar) application letters to the job ad\n", "4. Create a custom prompt for the langauge model which includes the job ad and the most relevant application letters, plus some extra information\n", "5. Generate a draft application letter by sending the prompt to the model and save it to a file for further use" - ], - "id": "e49b37128a0e12ae" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Importing Libraries", - "id": "8bce22186ea367e7" + "id": "8bce22186ea367e7", + "metadata": {}, + "source": "# Importing Libraries" }, { "cell_type": "code", @@ -26,8 +26,8 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.133906Z", - "start_time": "2024-07-19T15:18:18.324252Z" + "end_time": "2025-03-01T16:03:30.854723Z", + "start_time": "2025-03-01T16:03:30.072154Z" } }, "source": [ @@ -46,26 +46,36 @@ "execution_count": 1 }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Constants and Global Settings", - "id": "dbe6834d7e71a750" + "id": "dbe6834d7e71a750", + "metadata": {}, + "source": "# Constants and Global Settings" }, { + "cell_type": "code", + "id": "de023f5b4bcc6a27", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.138520Z", - "start_time": "2024-07-19T15:18:20.135465Z" + "end_time": "2025-03-01T16:03:30.859295Z", + "start_time": "2025-03-01T16:03:30.857632Z" } }, - "cell_type": "code", - "source": "pd.set_option('display.float_format', lambda x: f'{x:.2f}')", - "id": "de023f5b4bcc6a27", + "source": [ + "pd.set_option(\"display.float_format\", lambda x: f\"{x:.2f}\")" + ], "outputs": [], "execution_count": 2 }, { "cell_type": "code", + "id": "5c47615b36a77ea7", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2025-03-01T16:03:30.934161Z", + "start_time": "2025-03-01T16:03:30.932265Z" + } + }, "source": [ "DATA_DIR = Path(\"../tests/test_data\")\n", "LETTERS_DIR = DATA_DIR / \"sample_letters\"\n", @@ -76,90 +86,85 @@ "\n", "LLM = LanguageModels.OPENAI_GPT4OMINI" ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.219764Z", - "start_time": "2024-07-19T15:18:20.140001Z" - } - }, - "id": "5c47615b36a77ea7", "outputs": [], "execution_count": 3 }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Helper Functions", - "id": "2b971057bcc85cf3" + "id": "2b971057bcc85cf3", + "metadata": {}, + "source": "# Helper Functions" }, { + "cell_type": "code", + "id": "8f441a7372df51b6", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.247749Z", - "start_time": "2024-07-19T15:18:20.227464Z" + "end_time": "2025-03-01T16:03:30.975702Z", + "start_time": "2025-03-01T16:03:30.972929Z" } }, - "cell_type": "code", "source": [ "class DocumentLoader:\n", " \"\"\"A class to load documents from files.\"\"\"\n", "\n", " @staticmethod\n", " def _read_txt(path: Path) 
-> str:\n", - " with io.open(path, 'r', encoding='utf-8') as f:\n", + " with io.open(path, \"r\", encoding=\"utf-8\") as f:\n", " return f.read()\n", "\n", - " def bulk_load_documents(self, path: Path, ext='.txt') -> pd.DataFrame:\n", + " def bulk_load_documents(self, path: Path, ext=\".txt\") -> pd.DataFrame:\n", " \"\"\"Load all the documents in a directory with a specific extension into a DataFrame.\"\"\"\n", " documents = []\n", " documents_ids = []\n", "\n", - " ext = '.' + ext.lower().lstrip('.')\n", - " for file in path.glob(f'*{ext}'):\n", - " if ext in ('.txt', '.text'):\n", + " ext = \".\" + ext.lower().lstrip(\".\")\n", + " for file in path.glob(f\"*{ext}\"):\n", + " if ext in (\".txt\", \".text\"):\n", " documents.append(self._read_txt(file))\n", " else:\n", - " raise ValueError(f'Unsupported file format: {ext.strip(\".\").capitalize()}')\n", + " raise ValueError(\n", + " f\"Unsupported file format: {ext.strip('.').capitalize()}\"\n", + " )\n", " documents_ids.append(file.stem)\n", - " return pd.DataFrame({'id': documents_ids, 'text': documents})\n", + " return pd.DataFrame({\"id\": documents_ids, \"text\": documents})\n", "\n", - " def load_document(self, path: Path, ext='.txt') -> str:\n", + " def load_document(self, path: Path, ext=\".txt\") -> str:\n", " \"\"\"Load a single document from a file.\"\"\"\n", - " if ext == '.txt':\n", + " if ext == \".txt\":\n", " return self._read_txt(path)\n", " else:\n", - " raise ValueError(f'Unsupported file format: {ext}')" + " raise ValueError(f\"Unsupported file format: {ext}\")" ], - "id": "8f441a7372df51b6", "outputs": [], "execution_count": 4 }, { + "cell_type": "code", + "id": "f95d49699943e99b", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.256452Z", - "start_time": "2024-07-19T15:18:20.251635Z" + "end_time": "2025-03-01T16:03:31.020763Z", + "start_time": "2025-03-01T16:03:31.018818Z" } }, - "cell_type": "code", "source": [ "def pprint(text: str):\n", " \"\"\"Pretty print the text as markdown in Jupyter Notebook's output.\"\"\"\n", " display(Markdown(text))" ], - "id": "f95d49699943e99b", "outputs": [], "execution_count": 5 }, { + "cell_type": "code", + "id": "eb103567891fa92", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.263207Z", - "start_time": "2024-07-19T15:18:20.258780Z" + "end_time": "2025-03-01T16:03:31.065485Z", + "start_time": "2025-03-01T16:03:31.063169Z" } }, - "cell_type": "code", "source": [ "def gen_num_tokens(text: str, encoding: str = \"r50k_base\") -> int:\n", " \"\"\"Calculate the number of tokens in a string for a specific model.\n", @@ -173,54 +178,55 @@ " encoding = tiktoken.get_encoding(encoding)\n", " return len(encoding.encode(text))" ], - "id": "eb103567891fa92", "outputs": [], "execution_count": 6 }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Loading the Application Letters", - "id": "9744b2654a8917ca" + "id": "9744b2654a8917ca", + "metadata": {}, + "source": "# Loading the Application Letters" }, { "cell_type": "code", - "source": [ - "doc_loader = DocumentLoader()\n", - "application_letters_df = doc_loader.bulk_load_documents(LETTERS_DIR, ext=\"text\")" - ], + "id": "d4373fdb670dddb4", "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.293820Z", - "start_time": "2024-07-19T15:18:20.264962Z" + "end_time": "2025-03-01T16:03:31.118478Z", + "start_time": "2025-03-01T16:03:31.108213Z" } }, - "id": "d4373fdb670dddb4", + "source": [ + "doc_loader = DocumentLoader()\n", + "application_letters_df = 
doc_loader.bulk_load_documents(LETTERS_DIR, ext=\"text\")" + ], "outputs": [], "execution_count": 7 }, { "cell_type": "code", - "source": "application_letters_df.head()", + "id": "afca07705eed8007", "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.305847Z", - "start_time": "2024-07-19T15:18:20.295217Z" + "end_time": "2025-03-01T16:03:31.160056Z", + "start_time": "2025-03-01T16:03:31.152759Z" } }, - "id": "afca07705eed8007", + "source": [ + "application_letters_df.head()" + ], "outputs": [ { "data": { "text/plain": [ " id text\n", - "0 sample_letter_708 Having worked in the tech industry for 7 years...\n", - "1 sample_letter_311 Dear Hiring Manager,\\n\\nI am writing to expres...\n", - "2 sample_letter_253 I am writing to apply for the Generative AI En...\n", - "3 sample_letter_611 Dear Hiring Manager,\\n\\nI am writing to expres...\n", - "4 sample_letter_394 Dear Hiring Manager,\\n\\nI am writing to expres..." + "0 sample_letter_507 I am writing to apply for the System Administr...\n", + "1 sample_letter_649 I am excited to apply for the AI Engineer posi...\n", + "2 sample_letter_480 Dear Hiring Manager,\\n\\nI am writing to expres...\n", + "3 sample_letter_605 Dear Hiring Manager,\\n\\nI am writing to expres...\n", + "4 sample_letter_746 I am writing to express my interest in the Dat..." ], "text/html": [ "
\n", @@ -248,28 +254,28 @@ " \n", " \n", " 0\n", - " sample_letter_708\n", - " Having worked in the tech industry for 7 years...\n", + " sample_letter_507\n", + " I am writing to apply for the System Administr...\n", " \n", " \n", " 1\n", - " sample_letter_311\n", - " Dear Hiring Manager,\\n\\nI am writing to expres...\n", + " sample_letter_649\n", + " I am excited to apply for the AI Engineer posi...\n", " \n", " \n", " 2\n", - " sample_letter_253\n", - " I am writing to apply for the Generative AI En...\n", + " sample_letter_480\n", + " Dear Hiring Manager,\\n\\nI am writing to expres...\n", " \n", " \n", " 3\n", - " sample_letter_611\n", + " sample_letter_605\n", " Dear Hiring Manager,\\n\\nI am writing to expres...\n", " \n", " \n", " 4\n", - " sample_letter_394\n", - " Dear Hiring Manager,\\n\\nI am writing to expres...\n", + " sample_letter_746\n", + " I am writing to express my interest in the Dat...\n", " \n", " \n", "\n", @@ -284,95 +290,100 @@ "execution_count": 8 }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Initializing a Connector to the OpenAI API", - "id": "f13c02d1366be5df" + "id": "f13c02d1366be5df", + "metadata": {}, + "source": "# Initializing a Connector to the OpenAI API" }, { "cell_type": "code", + "id": "c7e7e469ae9840ea", + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2025-03-01T16:03:31.285820Z", + "start_time": "2025-03-01T16:03:31.284192Z" + } + }, "source": [ "openai_key = os.getenv(\"OPENAI_API_KEY\")\n", "\n", "# Check if the API key is set and available\n", "assert openai_key != \"\", \"Please set the OPENAI_API_KEY environment variable\"" ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.310220Z", - "start_time": "2024-07-19T15:18:20.307153Z" - } - }, - "id": "c7e7e469ae9840ea", "outputs": [], "execution_count": 9 }, { "cell_type": "code", - "source": "connector = OpenAIConnector(openai_key)", + "id": "47b55c370e9b849a", "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-07-19T15:18:20.355977Z", - "start_time": "2024-07-19T15:18:20.314041Z" + "end_time": "2025-03-01T16:03:31.354081Z", + "start_time": "2025-03-01T16:03:31.325298Z" } }, - "id": "47b55c370e9b849a", + "source": [ + "connector = OpenAIConnector(openai_key)" + ], "outputs": [], "execution_count": 10 }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Embedding the Letters", - "id": "64a31f966f4bce00" + "id": "64a31f966f4bce00", + "metadata": {}, + "source": "# Embedding the Letters" }, { "cell_type": "code", - "source": [ - "letters_with_embeddings_df = application_letters_df.copy()\n", - "letters_with_embeddings_df['embedding'] = connector.embed(documents=application_letters_df['text'],\n", - " model=EmbeddingModels.OPENAPI_EMS)" - ], + "id": "68a6d6e054a998b0", "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-07-19T15:18:25.397806Z", - "start_time": "2024-07-19T15:18:20.357020Z" + "end_time": "2025-03-01T16:03:35.887699Z", + "start_time": "2025-03-01T16:03:31.376295Z" } }, - "id": "68a6d6e054a998b0", + "source": [ + "letters_with_embeddings_df = application_letters_df.copy()\n", + "letters_with_embeddings_df[\"embedding\"] = connector.embed(\n", + " documents=application_letters_df[\"text\"], model=EmbeddingModels.OPENAPI_EMS\n", + ")" + ], "outputs": [], "execution_count": 11 }, { + "cell_type": "code", + "id": "8d4478a411acf706", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:25.408392Z", - "start_time": "2024-07-19T15:18:25.398908Z" + 
"end_time": "2025-03-01T16:03:35.898332Z", + "start_time": "2025-03-01T16:03:35.893536Z" } }, - "cell_type": "code", - "source": "letters_with_embeddings_df.head()", - "id": "8d4478a411acf706", + "source": [ + "letters_with_embeddings_df.head()" + ], "outputs": [ { "data": { "text/plain": [ " id text \\\n", - "0 sample_letter_708 Having worked in the tech industry for 7 years... \n", - "1 sample_letter_311 Dear Hiring Manager,\\n\\nI am writing to expres... \n", - "2 sample_letter_253 I am writing to apply for the Generative AI En... \n", - "3 sample_letter_611 Dear Hiring Manager,\\n\\nI am writing to expres... \n", - "4 sample_letter_394 Dear Hiring Manager,\\n\\nI am writing to expres... \n", + "0 sample_letter_507 I am writing to apply for the System Administr... \n", + "1 sample_letter_649 I am excited to apply for the AI Engineer posi... \n", + "2 sample_letter_480 Dear Hiring Manager,\\n\\nI am writing to expres... \n", + "3 sample_letter_605 Dear Hiring Manager,\\n\\nI am writing to expres... \n", + "4 sample_letter_746 I am writing to express my interest in the Dat... \n", "\n", " embedding \n", - "0 [-0.015994248911738396, 0.020503530278801918, ... \n", - "1 [0.035636693239212036, -0.001379814581014216, ... \n", - "2 [0.017323147505521774, -0.044040385633707047, ... \n", - "3 [0.049760978668928146, -0.015271708369255066, ... \n", - "4 [0.034355904906988144, 0.02534237876534462, 0.... " + "0 [-0.017879672348499298, 0.01556997001171112, 0... \n", + "1 [-0.016682444140315056, -0.0227852500975132, -... \n", + "2 [0.04394077882170677, 0.005209765397012234, 0.... \n", + "3 [0.04028189927339554, 0.020218798890709877, 0.... \n", + "4 [0.016521798446774483, -0.006224688142538071, ... " ], "text/html": [ "
\n", @@ -401,33 +412,33 @@ " \n", " \n", " 0\n", - " sample_letter_708\n", - " Having worked in the tech industry for 7 years...\n", - " [-0.015994248911738396, 0.020503530278801918, ...\n", + " sample_letter_507\n", + " I am writing to apply for the System Administr...\n", + " [-0.017879672348499298, 0.01556997001171112, 0...\n", " \n", " \n", " 1\n", - " sample_letter_311\n", - " Dear Hiring Manager,\\n\\nI am writing to expres...\n", - " [0.035636693239212036, -0.001379814581014216, ...\n", + " sample_letter_649\n", + " I am excited to apply for the AI Engineer posi...\n", + " [-0.016682444140315056, -0.0227852500975132, -...\n", " \n", " \n", " 2\n", - " sample_letter_253\n", - " I am writing to apply for the Generative AI En...\n", - " [0.017323147505521774, -0.044040385633707047, ...\n", + " sample_letter_480\n", + " Dear Hiring Manager,\\n\\nI am writing to expres...\n", + " [0.04394077882170677, 0.005209765397012234, 0....\n", " \n", " \n", " 3\n", - " sample_letter_611\n", + " sample_letter_605\n", " Dear Hiring Manager,\\n\\nI am writing to expres...\n", - " [0.049760978668928146, -0.015271708369255066, ...\n", + " [0.04028189927339554, 0.020218798890709877, 0....\n", " \n", " \n", " 4\n", - " sample_letter_394\n", - " Dear Hiring Manager,\\n\\nI am writing to expres...\n", - " [0.034355904906988144, 0.02534237876534462, 0....\n", + " sample_letter_746\n", + " I am writing to express my interest in the Dat...\n", + " [0.016521798446774483, -0.006224688142538071, ...\n", " \n", " \n", "\n", @@ -442,34 +453,38 @@ "execution_count": 12 }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Finding the Most Relevant Letters to the Job Ad", - "id": "c7e02957fc102026" + "id": "c7e02957fc102026", + "metadata": {}, + "source": "# Finding the Most Relevant Letters to the Job Ad" }, { + "cell_type": "code", + "id": "ce90f301a75a57d", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:25.489564Z", - "start_time": "2024-07-19T15:18:25.409503Z" + "end_time": "2025-03-01T16:03:35.957725Z", + "start_time": "2025-03-01T16:03:35.953966Z" } }, - "cell_type": "code", - "source": "ranker = Ranker()", - "id": "ce90f301a75a57d", + "source": [ + "ranker = Ranker()" + ], "outputs": [], "execution_count": 13 }, { + "cell_type": "code", + "id": "20a8939323bfaf40", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.193297Z", - "start_time": "2024-07-19T15:18:25.495872Z" + "end_time": "2025-03-01T16:03:36.786030Z", + "start_time": "2025-03-01T16:03:35.995579Z" } }, - "cell_type": "code", - "source": "ranker.make_collection(documents_with_embeddings=letters_with_embeddings_df)", - "id": "20a8939323bfaf40", + "source": [ + "ranker.make_collection(documents_with_embeddings=letters_with_embeddings_df)" + ], "outputs": [ { "name": "stdout", @@ -482,19 +497,19 @@ "execution_count": 14 }, { + "cell_type": "code", + "id": "f1ecce59c3f826e0", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.198446Z", - "start_time": "2024-07-19T15:18:27.194346Z" + "end_time": "2025-03-01T16:03:36.800549Z", + "start_time": "2025-03-01T16:03:36.798838Z" } }, - "cell_type": "code", "source": [ "sample_job_ad = doc_loader.load_document(path=SAMPLE_JOB_AD)\n", "\n", "pprint(sample_job_ad)" ], - "id": "f1ecce59c3f826e0", "outputs": [ { "data": { @@ -510,37 +525,40 @@ "execution_count": 15 }, { + "cell_type": "code", + "id": "2d61db3285bf245c", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.516960Z", - "start_time": "2024-07-19T15:18:27.199817Z" + "end_time": 
"2025-03-01T16:03:37.297177Z", + "start_time": "2025-03-01T16:03:36.840933Z" } }, - "cell_type": "code", "source": [ - "sample_job_ad_embedded = connector.embed(documents=[sample_job_ad],\n", - " model=EmbeddingModels.OPENAPI_EMS)" + "sample_job_ad_embedded = connector.embed(\n", + " documents=[sample_job_ad], model=EmbeddingModels.OPENAPI_EMS\n", + ")" ], - "id": "2d61db3285bf245c", "outputs": [], "execution_count": 16 }, { + "cell_type": "code", + "id": "f40e0f2ddb7a7504", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.522163Z", - "start_time": "2024-07-19T15:18:27.518012Z" + "end_time": "2025-03-01T16:03:37.301674Z", + "start_time": "2025-03-01T16:03:37.299478Z" } }, - "cell_type": "code", - "source": "sample_job_ad_embedded", - "id": "f40e0f2ddb7a7504", + "source": [ + "sample_job_ad_embedded" + ], "outputs": [ { "data": { "text/plain": [ "[array([-0.03499759, -0.01236026, 0.00740111, ..., 0.00064774,\n", - " 0.02555379, -0.00236529])]" + " 0.02555379, -0.00236529], shape=(1536,))]" ] }, "execution_count": 17, @@ -551,26 +569,34 @@ "execution_count": 17 }, { + "cell_type": "code", + "id": "1ab0c0edd9e564bf", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.608446Z", - "start_time": "2024-07-19T15:18:27.523471Z" + "end_time": "2025-03-01T16:03:37.355254Z", + "start_time": "2025-03-01T16:03:37.345435Z" } }, - "cell_type": "code", "source": [ - "most_relevant_letters = ranker.find_similar(sample_job_ad_embedded[0], top_k=3, min_similarity=0.1)\n", + "most_relevant_letters = ranker.find_similar(\n", + " sample_job_ad_embedded[0], top_k=3, min_similarity=0.1\n", + ")\n", "\n", "most_relevant_letters" ], - "id": "1ab0c0edd9e564bf", "outputs": [ { "data": { "text/plain": [ - "[ScoredPoint(id=478, version=0, score=0.36129672473127655, payload={'text': 'I am applying for the System Administrator position at Adobe. My extensive experience with Red Hat, CentOS, and Puppet, as well as my proficiency in Ruby, makes me a strong candidate for this role. I am excited about the opportunity to bring my unique skill set to your team.'}, vector=None, shard_key=None, order_value=None),\n", - " ScoredPoint(id=644, version=0, score=0.35730961122204624, payload={'text': 'I am very interested in the Senior Support Engineer position at Innovative Tech. I have over 4 years of experience in IT support, and I am confident that I can bring a high level of expertise to your team. In my current role at ABC Inc, I have been responsible for managing a team of support specialists and dealing with complex technical issues. I am highly skilled in troubleshooting, networking, and both Linux and Windows systems. I hold a BSc in Information Technology and I am always eager to learn and improve my skills. I believe that my experience and dedication would make me a valuable addition to your company. Thank you for considering my application.'}, vector=None, shard_key=None, order_value=None),\n", - " ScoredPoint(id=159, version=0, score=0.35427863518996816, payload={'text': \"Dear Hiring Manager,\\n\\nI am writing to express my interest in the Intermediate Applied Scientist position at ABC Environmental Solutions. With my strong background in environmental science and experience in applied consulting, I believe I am well-suited for this role.\\n\\nIn my current position as an Applied Scientist at DEF Environmental Solutions, I have successfully conducted environmental impact assessments and provided regulatory support. 
I have also gained proficiency in data analysis using tools such as Microsoft Office Suite and Arc GIS. Additionally, my fieldwork experience has allowed me to develop strong problem-solving skills and the ability to thrive under new challenges.\\n\\nI hold a Bachelor's degree in Environmental Science and have completed the Canadian Certified Electrofishing Course, demonstrating my commitment to continuous learning and professional development. Furthermore, my Level 1 First Aid Certificate in BC showcases my dedication to safety in the field.\\n\\nI am confident that my technical expertise, leadership abilities, and strong communication skills make me a valuable asset to your team. I am excited about the opportunity to contribute to ABC Environmental Solutions and make a positive impact on environmental conservation.\\n\\nThank you for considering my application. I look forward to the opportunity to discuss how my qualifications align with the requirements of the Intermediate Applied Scientist position.\\n\\nSincerely,\\nJohn Smith\"}, vector=None, shard_key=None, order_value=None)]" + "[{'id': 302,\n", + " 'payload': {'text': 'I am applying for the System Administrator position at Adobe. My extensive experience with Red Hat, CentOS, and Puppet, as well as my proficiency in Ruby, makes me a strong candidate for this role. I am excited about the opportunity to bring my unique skill set to your team.'},\n", + " 'score': 0.36129672473127655},\n", + " {'id': 640,\n", + " 'payload': {'text': 'I am very interested in the Senior Support Engineer position at Innovative Tech. I have over 4 years of experience in IT support, and I am confident that I can bring a high level of expertise to your team. In my current role at ABC Inc, I have been responsible for managing a team of support specialists and dealing with complex technical issues. I am highly skilled in troubleshooting, networking, and both Linux and Windows systems. I hold a BSc in Information Technology and I am always eager to learn and improve my skills. I believe that my experience and dedication would make me a valuable addition to your company. Thank you for considering my application.'},\n", + " 'score': 0.35730961122204624},\n", + " {'id': 545,\n", + " 'payload': {'text': \"Dear Hiring Manager,\\n\\nI am writing to express my interest in the Intermediate Applied Scientist position at ABC Environmental Solutions. With my strong background in environmental science and experience in applied consulting, I believe I am well-suited for this role.\\n\\nIn my current position as an Applied Scientist at DEF Environmental Solutions, I have successfully conducted environmental impact assessments and provided regulatory support. I have also gained proficiency in data analysis using tools such as Microsoft Office Suite and Arc GIS. Additionally, my fieldwork experience has allowed me to develop strong problem-solving skills and the ability to thrive under new challenges.\\n\\nI hold a Bachelor's degree in Environmental Science and have completed the Canadian Certified Electrofishing Course, demonstrating my commitment to continuous learning and professional development. Furthermore, my Level 1 First Aid Certificate in BC showcases my dedication to safety in the field.\\n\\nI am confident that my technical expertise, leadership abilities, and strong communication skills make me a valuable asset to your team. 
I am excited about the opportunity to contribute to ABC Environmental Solutions and make a positive impact on environmental conservation.\\n\\nThank you for considering my application. I look forward to the opportunity to discuss how my qualifications align with the requirements of the Intermediate Applied Scientist position.\\n\\nSincerely,\\nJohn Smith\"},\n", + " 'score': 0.35427863518996816}]" ] }, "execution_count": 18, @@ -581,41 +607,41 @@ "execution_count": 18 }, { + "cell_type": "code", + "id": "72f666ce5b32c3c2", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.617384Z", - "start_time": "2024-07-19T15:18:27.609781Z" + "end_time": "2025-03-01T16:03:37.404063Z", + "start_time": "2025-03-01T16:03:37.398438Z" } }, - "cell_type": "code", "source": [ "for letter in most_relevant_letters:\n", " print(\"=\" * 80)\n", - " print(f\"Letter ID: {letter.id}\")\n", - " print(f\"Similarity: {letter.score:.2f}\")\n", - " print(f\"Letter Text:\\n{letter.payload['text']}\")\n", + " print(f\"Letter ID: {letter['id']}\")\n", + " print(f\"Similarity: {letter['score']:.2f}\")\n", + " print(f\"Letter Text:\\n{letter['payload']['text']}\")\n", " print()" ], - "id": "72f666ce5b32c3c2", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "================================================================================\n", - "Letter ID: 478\n", + "Letter ID: 302\n", "Similarity: 0.36\n", "Letter Text:\n", "I am applying for the System Administrator position at Adobe. My extensive experience with Red Hat, CentOS, and Puppet, as well as my proficiency in Ruby, makes me a strong candidate for this role. I am excited about the opportunity to bring my unique skill set to your team.\n", "\n", "================================================================================\n", - "Letter ID: 644\n", + "Letter ID: 640\n", "Similarity: 0.36\n", "Letter Text:\n", "I am very interested in the Senior Support Engineer position at Innovative Tech. I have over 4 years of experience in IT support, and I am confident that I can bring a high level of expertise to your team. In my current role at ABC Inc, I have been responsible for managing a team of support specialists and dealing with complex technical issues. I am highly skilled in troubleshooting, networking, and both Linux and Windows systems. I hold a BSc in Information Technology and I am always eager to learn and improve my skills. I believe that my experience and dedication would make me a valuable addition to your company. 
Thank you for considering my application.\n", "\n", "================================================================================\n", - "Letter ID: 159\n", + "Letter ID: 545\n", "Similarity: 0.35\n", "Letter Text:\n", "Dear Hiring Manager,\n", @@ -639,35 +665,44 @@ "execution_count": 19 }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Gluing Everything Together", - "id": "6b7ec625af7b6ac5" + "id": "6b7ec625af7b6ac5", + "metadata": {}, + "source": "# Gluing Everything Together" }, { + "cell_type": "code", + "id": "469615267e34f4b7", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.630258Z", - "start_time": "2024-07-19T15:18:27.621249Z" + "end_time": "2025-03-01T16:03:37.446492Z", + "start_time": "2025-03-01T16:03:37.443381Z" } }, - "cell_type": "code", "source": [ "prompt_part_1 = \"I'm applying for a job with this description:\\n\\n\"\n", "prompt_part_2 = \"#START OF JOB AD\\n\\n\" + sample_job_ad + \"\\n\\n#END OF JOB AD\"\n", "prompt_part_3 = \"\\n\\nI need to submit a application letter with my CV. Here is a few examples of my previous application letters:\\n\\n\"\n", - "prompt_part_4 = \"\\n\\n\".join([('#START OF EXAMPLE APPLICATION LETTER\\n\\n' +\n", - " t.payload['text'] + '\\n\\n#END OF EXAMPLE APPLICATION LETTER')\n", - " for t in most_relevant_letters])\n", + "prompt_part_4 = \"\\n\\n\".join(\n", + " [\n", + " (\n", + " \"#START OF EXAMPLE APPLICATION LETTER\\n\\n\"\n", + " + t['payload'][\"text\"]\n", + " + \"\\n\\n#END OF EXAMPLE APPLICATION LETTER\"\n", + " )\n", + " for t in most_relevant_letters\n", + " ]\n", + ")\n", "# Extra information for the prompt\n", - "prompt_part_5 = (\"\\n\\nWrite a new application letter that is tailored to the job description above. \"\n", - " \"Be concise and to the point. The letter should be no longer than 500 words. \"\n", - " \"The letter should be written in English and be easy to read.\\n\\n\")\n", + "prompt_part_5 = (\n", + " \"\\n\\nWrite a new application letter that is tailored to the job description above. \"\n", + " \"Be concise and to the point. The letter should be no longer than 500 words. 
\"\n", + " \"The letter should be written in English and be easy to read.\\n\\n\"\n", + ")\n", "\n", "prompt = prompt_part_1 + prompt_part_2 + prompt_part_3 + prompt_part_4 + prompt_part_5\n", "pprint(prompt)" ], - "id": "469615267e34f4b7", "outputs": [ { "data": { @@ -683,18 +718,18 @@ "execution_count": 20 }, { + "cell_type": "code", + "id": "777666bb5f08fab", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:27.881913Z", - "start_time": "2024-07-19T15:18:27.631532Z" + "end_time": "2025-03-01T16:03:39.042845Z", + "start_time": "2025-03-01T16:03:37.490333Z" } }, - "cell_type": "code", "source": [ - "num_tokens = gen_num_tokens(text=prompt, encoding='r50k_base')\n", + "num_tokens = gen_num_tokens(text=prompt, encoding=\"r50k_base\")\n", "print(f\"Number of tokens sent to the API: {num_tokens}\")" ], - "id": "777666bb5f08fab", "outputs": [ { "name": "stdout", @@ -707,40 +742,39 @@ "execution_count": 21 }, { + "cell_type": "code", + "id": "fdcaa76ede6692a7", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:33.990875Z", - "start_time": "2024-07-19T15:18:27.883215Z" + "end_time": "2025-03-01T16:03:44.773958Z", + "start_time": "2025-03-01T16:03:39.049094Z" } }, - "cell_type": "code", "source": [ - "draft_letter = connector.chat(prompt=prompt,\n", - " model=LLM,\n", - " temperature=0.1,\n", - " max_tokens=512)" + "draft_letter = connector.chat(prompt=prompt, model=LLM, temperature=0.1, max_tokens=512)" ], - "id": "fdcaa76ede6692a7", "outputs": [], "execution_count": 22 }, { + "cell_type": "code", + "id": "7fdc9e1c780ff8fc", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:34.001718Z", - "start_time": "2024-07-19T15:18:33.993466Z" + "end_time": "2025-03-01T16:03:44.788880Z", + "start_time": "2025-03-01T16:03:44.786983Z" } }, - "cell_type": "code", - "source": "pprint(draft_letter)", - "id": "7fdc9e1c780ff8fc", + "source": [ + "pprint(draft_letter)" + ], "outputs": [ { "data": { "text/plain": [ "" ], - "text/markdown": "[Your Name] \n[Your Address] \n[City, Zip Code] \n[Your Email] \n[Your Phone Number] \n[Date] \n\nHiring Manager \nOSI Assistance Foundation - Armenian Branch Office \n1 Pushkin Str., apt. 2 \nYerevan, Armenia \n\nDear Hiring Manager,\n\nI am writing to express my interest in the Chief Accountant/Finance Assistant position at the OSI Assistance Foundation, as advertised. With a university degree in finance and over a year of experience working in an international organization, I am confident in my ability to contribute effectively to your team.\n\nIn my previous role at [Your Previous Company], I was responsible for managing financial transactions, including grant payments and administrative expenses. This experience has equipped me with a solid understanding of International Accounting Standards (IAS) and Armenian taxation laws, as well as the reporting requirements necessary for compliance. I have developed strong organizational skills that enable me to handle multiple tasks efficiently while maintaining attention to detail.\n\nI am proficient in MS Excel and MS Access, which I have used extensively for data analysis and financial reporting. My ability to quickly learn new software and adapt to changing environments has been a key factor in my success. I pride myself on my discretion and ability to handle confidential information with the utmost professionalism.\n\nFluency in English, Armenian, and Russian allows me to communicate effectively with diverse stakeholders, enhancing collaboration and understanding within the team. 
I am self-motivated and committed to setting and achieving goals, which I believe aligns well with the values of the OSI Assistance Foundation.\n\nI am excited about the opportunity to bring my skills and experience to your organization and contribute to the important work you do. Thank you for considering my application. I look forward to the possibility of discussing how I can support the financial operations of the OSI Assistance Foundation.\n\nSincerely,\n\n[Your Name]" + "text/markdown": "[Your Name] \n[Your Address] \n[City, State, Zip] \n[Your Email] \n[Your Phone Number] \n[Date] \n\nHiring Manager \nOSI Assistance Foundation - Armenian Branch Office \n1 Pushkin Str., apt. 2 \nYerevan, Armenia \n\nDear Hiring Manager,\n\nI am writing to express my interest in the Chief Accountant/Finance Assistant position at the OSI Assistance Foundation, as advertised. With a university degree in finance and over a year of experience in an international organization, I am confident in my ability to contribute effectively to your team.\n\nIn my previous role at [Your Previous Company], I was responsible for managing financial transactions, including grant payments and administrative expenses. This experience has equipped me with a solid understanding of International Accounting Standards (IAS) and Armenian taxation laws, as well as the reporting requirements necessary for compliance. My strong organizational skills have allowed me to maintain accurate financial records while ensuring timely reporting and adherence to deadlines.\n\nI am proficient in MS Excel and MS Access, which I have used extensively for data analysis and financial reporting. My ability to quickly learn new software and adapt to changing environments has been a key asset in my previous positions. I pride myself on my discretion and ability to handle confidential information with the utmost professionalism.\n\nFluency in English, Armenian, and Russian enables me to communicate effectively with diverse stakeholders, fostering collaboration and understanding within the team. I am self-motivated and goal-oriented, always striving to improve my skills and contribute positively to my workplace.\n\nI am excited about the opportunity to join the OSI Assistance Foundation and support its mission through effective financial management. Thank you for considering my application. 
I look forward to the possibility of discussing how my qualifications align with the needs of your organization.\n\nSincerely,\n\n[Your Name]" }, "metadata": {}, "output_type": "display_data" @@ -749,33 +783,35 @@ "execution_count": 23 }, { + "cell_type": "code", + "id": "72e655639ecf67ec", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:34.016275Z", - "start_time": "2024-07-19T15:18:34.004639Z" + "end_time": "2025-03-01T16:03:44.827391Z", + "start_time": "2025-03-01T16:03:44.825356Z" } }, - "cell_type": "code", "source": [ "# Save the draft letter to a file\n", "output_file = OUTPUT_DIR / f\"draft-letter-using-{LLM}.txt\"\n", - "with io.open(output_file, 'w', encoding='utf-8') as f:\n", + "with io.open(output_file, \"w\", encoding=\"utf-8\") as f:\n", " f.write(draft_letter)" ], - "id": "72e655639ecf67ec", "outputs": [], "execution_count": 24 }, { + "cell_type": "code", + "id": "58c6a81261120d3a", "metadata": { "ExecuteTime": { - "end_time": "2024-07-19T15:18:34.023149Z", - "start_time": "2024-07-19T15:18:34.018419Z" + "end_time": "2025-03-01T16:03:44.872718Z", + "start_time": "2025-03-01T16:03:44.870933Z" } }, - "cell_type": "code", - "source": "print(\"Done! The draft letter is saved to the file: \", output_file)", - "id": "58c6a81261120d3a", + "source": [ + "print(\"Done! The draft letter is saved to the file: \", output_file)" + ], "outputs": [ { "name": "stdout", diff --git a/pyproject.toml b/pyproject.toml index 5ffbc3d..3f74a2e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "easy-letters" -version = "0.1.8" +version = "0.1.9" description = "A Python library for generating draft application letters using generative AI" authors = ["Hassan Abedi "] maintainers = ["Hassan Abedi "] @@ -14,22 +14,20 @@ repository = "https://github.com/habedi/easy-letters" python = "^3.10" openai = "^1.16.1" qdrant-client = "^1.8.2" -numpy = "^2.2.3" [tool.poetry.group.dev.dependencies] poetry-dynamic-versioning = "^1.4.0" pytest = "^8.2.2" pytest-mock = "^3.14.0" -pytest-cov = ">=5,<7" +pytest-cov = "^6.0.0" pandas = "^2.2.2" jupyter = "^1.0.0" -tiktoken = ">=0.7,<0.10" +tiktoken = "^0.9.0" ruff = "^0.9.9" - -[tool.poetry-dynamic-versioning] -enable = true -vcs = "git" -versioning = "semver" # Semantic Versioning +mypy = "^1.15.0" +icecream = "^2.1.4" +numpy = "^2.2.3" +notebook = "^7.3.2" #[build-system] #requires = ["poetry-core"] @@ -42,3 +40,71 @@ versioning = "semver" # Semantic Versioning [build-system] requires = ["hatchling"] build-backend = "hatchling.build" + +[tool.pytest.ini_options] +pythonpath = [".", "easy_letters", "notebooks"] +addopts = "tests/ --cov --doctest-modules --cov-report=xml -s" + +[tool.mypy] +python_version = "3.10" +ignore_missing_imports = true +disallow_untyped_calls = true +strict_optional = true +warn_redundant_casts = true + +[tool.poetry-dynamic-versioning] +enable = true +vcs = "git" +versioning = "semver" # Semantic Versioning + +# Ruff configuration +[tool.ruff] +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "venv" +] +line-length = 100 +indent-width = 4 +src = ["src", "tests", "bin", "notebooks"] +target-version = "py310" + +[tool.ruff.lint] +select = ["ANN", "D", "E", "F", "I"] +ignore = [ + # Ignore missing docstrings + "D100", "D101", "D102", "D103", "D104", "D105", "D106", "D107", +] 
+fixable = ["ALL"] +unfixable = [] +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = [] diff --git a/tests/__init__.py b/tests/__init__.py index 8ac1ecd..7c3cd66 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,4 +1,4 @@ import os from pathlib import Path -TEST_DATA_DIR = Path(os.path.join(os.path.dirname(__file__), 'test_data')) +TEST_DATA_DIR = Path(os.path.join(os.path.dirname(__file__), "test_data")) diff --git a/tests/shared.py b/tests/shared.py index 02459c6..1275834 100644 --- a/tests/shared.py +++ b/tests/shared.py @@ -4,8 +4,8 @@ # Sample documents and their embeddings for testing documents_with_embeddings = { - 'text': ["Document 1", "Document 2"], - 'embedding': [np.array([0.1, 0.2, 0.3]), np.array([0.4, 0.5, 0.6])] + "text": ["Document 1", "Document 2"], + "embedding": [np.array([0.1, 0.2, 0.3]), np.array([0.4, 0.5, 0.6])], } # Sample embedding to search for similar documents @@ -14,7 +14,7 @@ # The expected response (score is Cosine similarity) search_response = [ {"id": 0, "score": 1.0, "payload": {"text": "Document 1"}}, - {"id": 1, "score": 0.9746, "payload": {"text": "Document 2"}} + {"id": 1, "score": 0.9746, "payload": {"text": "Document 2"}}, ] diff --git a/tests/test_connectors.py b/tests/test_connectors.py index 1bbab4d..b1f4123 100644 --- a/tests/test_connectors.py +++ b/tests/test_connectors.py @@ -1,8 +1,8 @@ import numpy as np +from icecream import ic from easy_letters import Ranker -from tests.shared import (documents_with_embeddings, embedding_to_search, - search_response) +from tests.shared import documents_with_embeddings, embedding_to_search, search_response def test_find_similar(): @@ -20,7 +20,9 @@ def test_find_similar(): response = ranker.find_similar(embedding_to_search, collection_name, 2) # Assert - print(type(response)) - assert response[1].id == search_response[1]["id"] - assert np.isclose(response[1].score, search_response[1]["score"], atol=1e-4) - assert response[1].payload == search_response[1]["payload"] + ic(response) + ic(search_response) + ic(response[1]) + assert response[1]['id'] == search_response[1]["id"] + assert np.isclose(response[1]['score'], search_response[1]["score"], atol=1e-4) + assert response[1]['payload'] == search_response[1]["payload"] diff --git a/tests/test_data/README.md b/tests/test_data/README.md index 2cd50dd..4823a72 100644 --- a/tests/test_data/README.md +++ b/tests/test_data/README.md @@ -1,6 +1,6 @@ -# Data for Testing +# Datasets for Tests -The files in the `sample_application_letters` directory are +The files in the [sample_letters](sample_letters/) directory are from [here](https://huggingface.co/datasets/ShashiVish/cover-letter-dataset). -The files in the `sample_job_ads` directory are from [here](https://www.kaggle.com/datasets/madhab/jobposts). +The files in the [sample_ads](sample_ads/) directory are from [here](https://www.kaggle.com/datasets/madhab/jobposts). 
diff --git a/tests/test_similarity_search.py b/tests/test_similarity_search.py index 9c41db0..b1f4123 100644 --- a/tests/test_similarity_search.py +++ b/tests/test_similarity_search.py @@ -1,8 +1,8 @@ import numpy as np +from icecream import ic from easy_letters import Ranker -from tests.shared import (documents_with_embeddings, embedding_to_search, - search_response) +from tests.shared import documents_with_embeddings, embedding_to_search, search_response def test_find_similar(): @@ -20,7 +20,9 @@ def test_find_similar(): response = ranker.find_similar(embedding_to_search, collection_name, 2) # Assert - print("m", response) - assert response[1].id == search_response[1]["id"] - assert np.isclose(response[1].score, search_response[1]["score"], atol=1e-4) - assert response[1].payload == search_response[1]["payload"] + ic(response) + ic(search_response) + ic(response[1]) + assert response[1]['id'] == search_response[1]["id"] + assert np.isclose(response[1]['score'], search_response[1]["score"], atol=1e-4) + assert response[1]['payload'] == search_response[1]["payload"]
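Taken together, the notebook hunks in this patch implement a small retrieval-augmented drafting loop: embed the example letters, index them, retrieve the letters closest to the job ad, and splice them into the prompt. A condensed sketch of that flow follows. `OpenAIConnector`, `EmbeddingModels`, and `Ranker` are assumed to be `easy_letters` exports (the tests import `Ranker` that way); the sample texts and the model name are stand-ins for constants defined in notebook cells outside this excerpt.

```python
import os

import pandas as pd

from easy_letters import EmbeddingModels, OpenAIConnector, Ranker

LLM = "gpt-4o-mini"  # stand-in; the notebook's actual LLM constant is defined elsewhere
sample_job_ad = "Chief Accountant/Finance Assistant wanted ..."  # stand-in job ad text

letters_df = pd.DataFrame(
    {
        "id": ["letter_a", "letter_b"],  # stand-ins for the bulk-loaded letters
        "text": ["Dear Hiring Manager, ...", "I am writing to apply ..."],
    }
)

# Embed the example letters and index them in a vector collection.
connector = OpenAIConnector(os.getenv("OPENAI_API_KEY"))
letters_df["embedding"] = connector.embed(
    documents=letters_df["text"], model=EmbeddingModels.OPENAPI_EMS
)
ranker = Ranker()
ranker.make_collection(documents_with_embeddings=letters_df)

# Embed the job ad and retrieve the most similar letters (plain dicts after this patch).
ad_embedding = connector.embed(
    documents=[sample_job_ad], model=EmbeddingModels.OPENAPI_EMS
)[0]
top_letters = ranker.find_similar(ad_embedding, top_k=3, min_similarity=0.1)

# Splice the retrieved letters into the prompt and draft a new letter.
examples = "\n\n".join(hit["payload"]["text"] for hit in top_letters)
prompt = (
    f"Job ad:\n\n{sample_job_ad}\n\n"
    f"Examples of my previous application letters:\n\n{examples}\n\n"
    "Write a new application letter tailored to the job ad above."
)
draft_letter = connector.chat(prompt=prompt, model=LLM, temperature=0.1, max_tokens=512)
```

The `min_similarity` floor keeps weakly related letters out of the prompt, and capping `max_tokens` bounds the cost per draft; the notebook's `gen_num_tokens` helper (tiktoken's `r50k_base` encoding) can be used to check the prompt size before sending it.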