tmsincomb
diff --git a/‎.github/workflows/pytest.yml‎
Lines changed: 14 additions & 15 deletions b/‎.github/workflows/pytest.yml‎
Lines changed: 14 additions & 15 deletions
diff --git a/‎README.md‎
Lines changed: 17 additions & 8 deletions b/‎README.md‎
Lines changed: 17 additions & 8 deletions
diff --git a/‎docs/images/convert.png‎
1.72 MB b/‎docs/images/convert.png‎
1.72 MB
diff --git a/‎environment.yml‎
Lines changed: 8 additions & 0 deletions b/‎environment.yml‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎imagetocsv/examples/no-grid-index-label.png‎
244 KB b/‎imagetocsv/examples/no-grid-index-label.png‎
244 KB
diff --git a/‎imagetocsv/imagetocsv.py‎
Lines changed: 159 additions & 31 deletions b/‎imagetocsv/imagetocsv.py‎
Lines changed: 159 additions & 31 deletions
diff --git a/‎imagetocsv/string_modifiers.py‎
Lines changed: 10 additions & 1 deletion b/‎imagetocsv/string_modifiers.py‎
Lines changed: 10 additions & 1 deletion
@@ -8,28 +8,27 @@ on:
     branches: ["main", "development"]
 jobs:
   pytest:
+    name: Ex1 (${{ matrix.python-version }}, ${{ matrix.os }})
     runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
-        os: [macos-latest, windows-latest]
-        python-version: [3.7] # , 3.8, 3.9, "3.10", "3.11"]
+        os: [windows-latest, ubuntu-latest, macos-latest]
+        python-version: [3.7, 3.8, 3.9, "3.10", "3.11"]
     steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-      - name: Setup Python
-        uses: actions/setup-python@v4
+      - uses: actions/checkout@v3
+      - uses: conda-incubator/setup-miniconda@v2
         with:
+          auto-update-conda: true
           python-version: ${{ matrix.python-version }}
+          environment-file: environment.yml
       - name: Install dependencies (linux)
         if: runner.os == 'Linux'
         run: sudo apt install --yes libpoppler-cpp-dev pkg-config tesseract-ocr libtesseract-dev
-      - name: Install dependencies (macOS)
-        if: runner.os == 'macOS'
-        run: brew install pkg-config poppler tesseract
-      - name: Install dependencies (Windows)
-        if: runner.os == 'Windows'
-        run: choco install poppler tesseract
-      - name: Install Python Dependencies
-        run: pip install '.[dev]'
+      - name: Install dependencies
+        shell: bash -l {0}
+        run: |
+          pip install '.[dev]'
       - name: Unit Testing
-        run: pytest -xsv tests/test_imagetocsv.py
+        shell: bash -l {0}
+        run: pytest -xsvv tests/test_imagetocsv.py
@@ -5,8 +5,8 @@
 
 <div class="flex-container" align="center">
     <a href="https://github.com/jwillis0720/template-repo/commits/master">
-    <a href="https://img.shields.io/badge/Python-3.6%C3.%7C3.8%7C3.9%7C3.10-blue">
-    <img src="https://img.shields.io/badge/Python-3.6%7C3.7%7C3.8%7C3.9%7C3.10%7C3.11-blue"
+    <a href="https://img.shields.io/badge/Python-3.7%7C3.8%7C3.9%7C3.10%7C3.11-blue">
+    <img src="https://img.shields.io/badge/Python-3.7%7C3.8%7C3.9%7C3.10%7C3.11-blue"
         alt="Python Version">
     <a href="https://github.com/psf/black">
     <img src="https://img.shields.io/badge/code%20style-black-000000.svg"
@@ -15,6 +15,13 @@
     <a href="https://github.com/pre-commit/pre-commit">
     <img src="https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white"
         alt="pre commit">
+    <br>
+    <img src="https://img.shields.io/badge/mac%20os-000000?style=for-the-badge&logo=macos&logoColor=F0F0F0"
+        alt="MacOS">
+    <img src="https://img.shields.io/badge/Linux-FCC624?style=for-the-badge&logo=linux&logoColor=black"
+        alt="Linux">
+    <img src="https://img.shields.io/badge/Windows-0078D6?style=for-the-badge&logo=windows&logoColor=white"
+        alt="Windows">
 </div>
 
 <p align="center" style="color:green">
@@ -27,15 +34,17 @@
   <a href="#license">License</a>
 </p>
 
-## About
+# About
 Converts An Image to a CSV. This exists because Chorus 3.0 is bat-shit and only shows images for vital metadata.
+<img src="docs/images/convert.png" width="1025"/>
 
-<img src="docs/images/convert.svg" width="1025"/>
 
-
-# Installation
+# Installation for MacOS, Linux, and Windows
+### - Tesseract for OCR text recognition
+### - Poppler for PDF/text manipulation
 ```
-pip install imagetocsv
+$ conda install -c conda-forge tesseract==5.2.0 poppler==22.11.0
+$ pip install imagetocsv
 ```
 
 # Terminal Usage
@@ -58,7 +67,7 @@ Options:
 # Terminal Simple Examples
 ```bash
 $ imagetocsv myimage.png mytable.csv
-# For the source image this was built for
+# For the hardcoded options, use -p. The "-p bib" option supplies the Chorus 3.0 columns and headers, so you don't have to use the advanced options.
 $ imagetocsv -p bib myimage.png
 ```
 
 
@@ -0,0 +1,8 @@
+name: anaconda-client-env
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - tesseract
+  - poppler
+  - pip
@@ -1,52 +1,68 @@
 """Main module."""
 from __future__ import annotations
 
+import gc
 import re
-import tempfile
+import subprocess
 
 import cv2
+import numpy as np
+import numpy.typing as npt
 import pandas as pd
 import pdftotext
-import pytesseract
 
 from imagetocsv.string_modifiers import fix_common_mistakes
+from imagetocsv.tempfile import NamedTemporaryFile
 
+# import pytesseract
 
-def pdftocsv(file: str):
+
+
+def pdftocsv(file: str) -> list[list[str]]:
+    """Convert a pdf file to a list of lists of strings. We do this to keep layout information.
+
+    Parameters
+    ----------
+    file : str
+        Path to the pdf file.
+
+    Returns
+    -------
+    list[list[str]]
+        List of lists of strings.
+    """
     tmpchar = "*"
     special_chars = "%"
-    all_positions = set()
-    # file = '../tests/data/myimage.pdf'
+    all_positions: set[int] = set()
+
+    # Find start positions of all columns
     with open(file, "rb") as f:
         pdf = pdftotext.PDF(f, physical=True)[0]
         for line in pdf.split("\n"):
             for special_char in special_chars.split():
                 line = line.replace(f" {special_char}", f"{special_char} ")
                 line = line.replace(f"  {special_char}", f"{special_char}  ")
             line = f" {line.strip()} "
-            for i, word in enumerate(line.split()):
+            for word in line.split():
                 if not word:
                     continue
                 word = f" {word} "
-                # print(line)
                 positions = [m.start() for m in re.finditer(word, line)]
                 all_positions |= set(positions)
-            # print(line)
-        all_positions = sorted(list(all_positions))
-        all_positions = [p for p in all_positions]
-        all_positions[0] = all_positions[0]
-        if len(all_positions) > 1:
-            all_positions[-1] = all_positions[-1]
-
-        lines = []
+        ali_positions = sorted(list(all_positions))
+        ali_positions = [p for p in ali_positions]
+        ali_positions[0] = ali_positions[0]
+        if len(ali_positions) > 1:
+            ali_positions[-1] = ali_positions[-1]
+
+        # Add special temp character to empty string to empty cell
+        lines: list[str] = []
         for line in pdf.split("\n"):
-            # line = line.strip()
-            # print(line)
             if not [v for v in line.strip()]:
                 continue
             for special_char in special_chars.split():
-                line = line.replace(f" {special_char}", f"{special_char} ")
-                line = line.replace(f"  {special_char}", f"{special_char}  ")
+                line: str = line.replace(f" {special_char}", f"{special_char} ")
+                line: str = line.replace(f"  {special_char}", f"{special_char}  ")
             for pos in all_positions:
                 try:
                     if not line[pos].strip():
@@ -55,6 +71,8 @@ def pdftocsv(file: str):
                     line = line.ljust(pos, " ") + tmpchar
             lines.append(line)
 
+        # 1. replace empty cells with special char with empty string
+        # 2. fix any cells with common issues
         rows = []
         for line in lines:
             row = []
@@ -68,8 +86,25 @@ def pdftocsv(file: str):
     return rows
 
 
-def add_df_indexes_headers(df: pd.DataFrame, index_name: str, index: str, column_header: str):
-
+def add_df_indexes_headers(df: pd.DataFrame, index_name: str, index: str, column_header: str) -> pd.DataFrame:
+    """Add indexes and headers to a dataframe.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The dataframe to add indexes and headers to.
+    index_name : str, optional
+        Name of the index, by default None
+    index : list[str] | str, optional
+        Index values, by default None
+    column_header : list[str] | str, optional
+        Column header values, by default None
+
+    Returns
+    -------
+    pd.DataFrame
+        The dataframe with indexes and headers.
+    """
     if column_header:
         df.columns = column_header.split(",") if isinstance(column_header, str) else column_header
     if index:
@@ -80,26 +115,119 @@ def add_df_indexes_headers(df: pd.DataFrame, index_name: str, index: str, column
     return df
 
 
+def unsharp_mask(
+    image: npt.NDArray[np.uint8],
+    kernel_size: tuple[int, int] = (5, 5),
+    sigma: float = 1.0,
+    amount: float = 1.0,
+    threshold: float = 0,
+):
+    """Return a sharpened version of the image, using an unsharp mask.
+
+    Parameters
+    ----------
+    image : np.ndarray
+        image - The image to be sharpened.
+    kernel_size : tuple[int, int], optional
+        kernel_size - The size of the Gaussian blur kernel, by default (5, 5)
+    sigma : float, optional
+        sigma - The standard deviation of the Gaussian blur, by default 1.0
+    amount : float, optional
+        amount - The strength of the sharpening, by default 1.0
+    threshold : float, optional
+        threshold - The threshold for the mask, by default 0
+
+    Returns
+    -------
+    np.ndarray
+        The sharpened image.
+    """
+    blurred = cv2.GaussianBlur(image, kernel_size, sigma)  # blurred copy that the sharpening is measured against
+    sharpened = float(amount + 1) * image - float(amount) * blurred  # original boosted by (amount + 1) minus amount * blur
+    sharpened = np.maximum(sharpened, np.zeros(sharpened.shape))  # clamp from below at 0
+    sharpened = np.minimum(sharpened, 255 * np.ones(sharpened.shape))  # clamp from above at 255
+    sharpened = sharpened.round().astype(np.uint8)  # round and convert to 8-bit
+    if threshold > 0:  # masking is skipped entirely for the default threshold of 0
+        low_contrast_mask = np.absolute(image - blurred) < threshold  # NOTE(review): if both operands are uint8 this difference wraps modulo 256 before np.absolute - confirm, and consider a signed/float cast
+        np.copyto(sharpened, image, where=low_contrast_mask)  # restore original pixels wherever the mask is True
+    return sharpened
+
+
 def imagetocsv(
     file: str,
     index_name: str | None = None,
     index: list[str] | str | None = None,
     column_header: list[str] | str | None = None,
 ) -> pd.DataFrame:
-
+    """Convert an image file to a pandas DataFrame.
+
+    Parameters
+    ----------
+    file : str
+        Path to the image file.
+    index_name : str, optional
+        Name of the index, by default None
+    index : list[str] | str, optional
+        Index values, by default None
+    column_header : list[str] | str, optional
+        Column header values, by default None
+
+    Returns
+    -------
+    pd.DataFrame
+    """
     file = str(file)
 
     img = cv2.imread(file)
+    h, w, _ = img.shape
+    img = cv2.resize(img, (w * 3, h * 3))
+    img = unsharp_mask(img)
     grayImage = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
-    (_thresh, blackAndWhiteImage) = cv2.threshold(grayImage, 200, 255, cv2.THRESH_BINARY)
-
-    with tempfile.NamedTemporaryFile(delete=False) as fp:
-        custom_oem_psm_config = r"--oem 3 --psm 6 -c preserve_interword_spaces=1x1"
-        pdf: bytes = pytesseract.image_to_pdf_or_hocr(
-            blackAndWhiteImage, lang="eng", extension="pdf", config=custom_oem_psm_config
-        )
-        fp.write(pdf)
-        rows = pdftocsv(fp.name)
+    (_thresh, blackAndWhiteImage) = cv2.threshold(grayImage, 180, 255, cv2.THRESH_BINARY)
+
+    # TODO: see if we can use pytesseract to get the table for windows in version 0.3.0
+    # custom_oem_psm_config = r"""
+    #     --oem 3
+    #     --psm 6
+    #     -1 deu
+    #     -c tessedit_char_whitelist=0123456789.,%
+    #     -c preserve_interword_spaces=1
+    #     -c tessedit_create_pdf=1
+    # """
+    # pdf: bytes = pytesseract.image_to_pdf_or_hocr(
+    #     "blackAndWhiteImage.png", lang="eng", extension="pdf", config=custom_oem_psm_config
+    # )
+
+    tmp = NamedTemporaryFile(delete=False, mode=None)
+    prefix: str = tmp.name
+    # Tesseract cannot handle stdin so we need to write the image to a file first
+    cv2.imwrite(prefix + ".png", blackAndWhiteImage)
+    _ = subprocess.run(
+        [
+            "tesseract",
+            "--oem",
+            "3",
+            "--psm",
+            "6",
+            "-l",
+            "eng",
+            "-c",
+            "tessedit_char_whitelist=0123456789.,%",
+            "-c",
+            "preserve_interword_spaces=1",
+            "-c",
+            "tessedit_create_pdf=1",
+            prefix + ".png",
+            prefix,
+            # "pdf",
+        ],
+        capture_output=True,
+    )
+    # pdftotext for layout analysis
+    rows = pdftocsv(prefix + ".pdf")
+    # remove the temporary files in garbage collection for windows to handle it
+    del tmp
+    gc.collect()
 
     df = pd.DataFrame(rows)
     df = add_df_indexes_headers(df, index_name, index, column_header)
 
@@ -4,12 +4,21 @@
 def fix_common_mistakes(line: str):
     if not line:
         return ""
+
+    line = line.strip()
+
+    # Zero is a problem child. Needs special handling.
+    if line.lower() in ["00", "o0", "oo", "0o", "o", "o°", "°o", "fe", "°"]:
+        return "0"
+
     line = line.replace("@", "")  # this and the next two lines drop every "@", "#" and "," character
     line = line.replace("#", "")
     line = line.replace(",", "")
+
     if not line:  # nothing left once those characters are removed
         return ""
+
     if line[-1] == "%" and "." not in line:  # percentage that contains no decimal point
         line = line[:-3] + "." + line[len(line) - 3 :]  # insert "." before the last three characters, e.g. "1234%" -> "12.34%"
-    line = line.strip()
+
     return line