Skip to content

Commit 9d19be6

Browse files
authored
Merge pull request #1 from tmsincomb/linux
Linux & Windows
2 parents 4a7bc24 + d2a74b3 commit 9d19be6

File tree

14 files changed

+284
-81
lines changed

14 files changed

+284
-81
lines changed

.github/workflows/pytest.yml

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,28 +8,27 @@ on:
88
branches: ["main", "development"]
99
jobs:
1010
pytest:
11+
name: Ex1 (${{ matrix.python-version }}, ${{ matrix.os }})
1112
runs-on: ${{ matrix.os }}
1213
strategy:
14+
fail-fast: false
1315
matrix:
14-
os: [macos-latest, windows-latest]
15-
python-version: [3.7] # , 3.8, 3.9, "3.10", "3.11"]
16+
os: [windows-latest, ubuntu-latest, macos-latest]
17+
python-version: [3.7, 3.8, 3.9, "3.10", "3.11"]
1618
steps:
17-
- name: Checkout
18-
uses: actions/checkout@v3
19-
- name: Setup Python
20-
uses: actions/setup-python@v4
19+
- uses: actions/checkout@v3
20+
- uses: conda-incubator/setup-miniconda@v2
2121
with:
22+
auto-update-conda: true
2223
python-version: ${{ matrix.python-version }}
24+
environment-file: environment.yml
2325
- name: Install dependencies (linux)
2426
if: runner.os == 'Linux'
2527
run: sudo apt install --yes libpoppler-cpp-dev pkg-config tesseract-ocr libtesseract-dev
26-
- name: Install dependencies (macOS)
27-
if: runner.os == 'macOS'
28-
run: brew install pkg-config poppler tesseract
29-
- name: Install dependencies (Windows)
30-
if: runner.os == 'Windows'
31-
run: choco install poppler tesseract
32-
- name: Install Python Dependencies
33-
run: pip install '.[dev]'
28+
- name: Install dependencies
29+
shell: bash -l {0}
30+
run: |
31+
pip install '.[dev]'
3432
- name: Unit Testing
35-
run: pytest -xsv tests/test_imagetocsv.py
33+
shell: bash -l {0}
34+
run: pytest -xsvv tests/test_imagetocsv.py

README.md

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55

66
<div class="flex-container" align="center">
77
<a href="https://github.com/jwillis0720/template-repo/commits/master">
8-
<a href="https://img.shields.io/badge/Python-3.6%C3.%7C3.8%7C3.9%7C3.10-blue">
9-
<img src="https://img.shields.io/badge/Python-3.6%7C3.7%7C3.8%7C3.9%7C3.10%7C3.11-blue"
8+
<a href="https://img.shields.io/badge/Python-3.%7C3.8%7C3.9%7C3.10-blue">
9+
<img src="https://img.shields.io/badge/Python-3.7%7C3.8%7C3.9%7C3.10%7C3.11-blue"
1010
alt="Python Version">
1111
<a href="https://github.com/psf/black">
1212
<img src="https://img.shields.io/badge/code%20style-black-000000.svg"
@@ -15,6 +15,13 @@
1515
<a href="https://github.com/pre-commit/pre-commit">
1616
<img src="https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white"
1717
alt="pre commit">
18+
<br>
19+
<img src="https://img.shields.io/badge/mac%20os-000000?style=for-the-badge&logo=macos&logoColor=F0F0F0"
20+
alt="MacOS">
21+
<img src="https://img.shields.io/badge/Linux-FCC624?style=for-the-badge&logo=linux&logoColor=black"
22+
alt="Linux">
23+
<img src="https://img.shields.io/badge/Windows-0078D6?style=for-the-badge&logo=windows&logoColor=white"
24+
alt="Windows">
1825
</div>
1926

2027
<p align="center" style="color:green">
@@ -27,15 +34,17 @@
2734
<a href="#license">License</a>
2835
</p>
2936

30-
## About
37+
# About
3138
Converts An Image to a CSV. This exists because Chorus 3.0 is bat-shit and only shows images for vital metadata.
39+
<img src="docs/images/convert.png" width="1025"/>
3240

33-
<img src="docs/images/convert.svg" width="1025"/>
3441

35-
36-
# Installation
42+
# Installation for MacOS, Linux, and Windows
43+
### - Tesseract for the OCR text recognition
44+
### - Poppler for the pdf/text manipulation
3745
```
38-
pip install imagetocsv
46+
$ conda install -c conda-forge tesseract==5.2.0 poppler==22.11.0
47+
$ pip install imagetocsv
3948
```
4049

4150
# Terminal Usage
@@ -58,7 +67,7 @@ Options:
5867
# Terminal Simple Examples
5968
```bash
6069
$ imagetocsv myimage.png mytable.csv
61-
# For the source image this was built for
70+
# For the hardcoded options use -p. The "-p bib" option is for Chorus 3.0 columns and headers so you don't have to use the advanced options
6271
$ imagetocsv -p bib myimage.png
6372
```
6473

docs/images/convert.png

1.72 MB
Loading

environment.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
name: anaconda-client-env
2+
channels:
3+
- conda-forge
4+
- defaults
5+
dependencies:
6+
- tesseract
7+
- poppler
8+
- pip
244 KB
Loading

imagetocsv/imagetocsv.py

Lines changed: 159 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,68 @@
11
"""Main module."""
22
from __future__ import annotations
33

4+
import gc
45
import re
5-
import tempfile
6+
import subprocess
67

78
import cv2
9+
import numpy as np
10+
import numpy.typing as npt
811
import pandas as pd
912
import pdftotext
10-
import pytesseract
1113

1214
from imagetocsv.string_modifiers import fix_common_mistakes
15+
from imagetocsv.tempfile import NamedTemporaryFile
1316

17+
# import pytesseract
1418

15-
def pdftocsv(file: str):
19+
20+
21+
def pdftocsv(file: str) -> list[list[str]]:
22+
"""Convert a pdf file to a list of lists of strings. We do this to keep layout information.
23+
24+
Parameters
25+
----------
26+
file : str
27+
Path to the pdf file.
28+
29+
Returns
30+
-------
31+
list[list[str]]
32+
List of lists of strings.
33+
"""
1634
tmpchar = "*"
1735
special_chars = "%"
18-
all_positions = set()
19-
# file = '../tests/data/myimage.pdf'
36+
all_positions: set[int] = set()
37+
38+
# Find start positions of all columns
2039
with open(file, "rb") as f:
2140
pdf = pdftotext.PDF(f, physical=True)[0]
2241
for line in pdf.split("\n"):
2342
for special_char in special_chars.split():
2443
line = line.replace(f" {special_char}", f"{special_char} ")
2544
line = line.replace(f" {special_char}", f"{special_char} ")
2645
line = f" {line.strip()} "
27-
for i, word in enumerate(line.split()):
46+
for word in line.split():
2847
if not word:
2948
continue
3049
word = f" {word} "
31-
# print(line)
3250
positions = [m.start() for m in re.finditer(word, line)]
3351
all_positions |= set(positions)
34-
# print(line)
35-
all_positions = sorted(list(all_positions))
36-
all_positions = [p for p in all_positions]
37-
all_positions[0] = all_positions[0]
38-
if len(all_positions) > 1:
39-
all_positions[-1] = all_positions[-1]
40-
41-
lines = []
52+
ali_positions = sorted(list(all_positions))
53+
ali_positions = [p for p in ali_positions]
54+
ali_positions[0] = ali_positions[0]
55+
if len(ali_positions) > 1:
56+
ali_positions[-1] = ali_positions[-1]
57+
58+
# Add special temp character to empty string to empty cell
59+
lines: list[str] = []
4260
for line in pdf.split("\n"):
43-
# line = line.strip()
44-
# print(line)
4561
if not [v for v in line.strip()]:
4662
continue
4763
for special_char in special_chars.split():
48-
line = line.replace(f" {special_char}", f"{special_char} ")
49-
line = line.replace(f" {special_char}", f"{special_char} ")
64+
line: str = line.replace(f" {special_char}", f"{special_char} ")
65+
line: str = line.replace(f" {special_char}", f"{special_char} ")
5066
for pos in all_positions:
5167
try:
5268
if not line[pos].strip():
@@ -55,6 +71,8 @@ def pdftocsv(file: str):
5571
line = line.ljust(pos, " ") + tmpchar
5672
lines.append(line)
5773

74+
# 1. replace empty cells with special char with empty string
75+
# 2. fix any cells with common issues
5876
rows = []
5977
for line in lines:
6078
row = []
@@ -68,8 +86,25 @@ def pdftocsv(file: str):
6886
return rows
6987

7088

71-
def add_df_indexes_headers(df: pd.DataFrame, index_name: str, index: str, column_header: str):
72-
89+
def add_df_indexes_headers(df: pd.DataFrame, index_name: str, index: str, column_header: str) -> pd.DataFrame:
90+
"""Add indexes and headers to a dataframe.
91+
92+
Parameters
93+
----------
94+
df : pd.DataFrame
95+
The dataframe to add indexes and headers to.
96+
index_name : str, optional
97+
Name of the index, by default None
98+
index : list[str] | str, optional
99+
Index values, by default None
100+
column_header : list[str] | str, optional
101+
Column header values, by default None
102+
103+
Returns
104+
-------
105+
pd.DataFrame
106+
The dataframe with indexes and headers.
107+
"""
73108
if column_header:
74109
df.columns = column_header.split(",") if isinstance(column_header, str) else column_header
75110
if index:
@@ -80,26 +115,119 @@ def add_df_indexes_headers(df: pd.DataFrame, index_name: str, index: str, column
80115
return df
81116

82117

118+
def unsharp_mask(
    image: npt.NDArray[np.uint8],
    kernel_size: tuple[int, int] = (5, 5),
    sigma: float = 1.0,
    amount: float = 1.0,
    threshold: float = 0,
) -> npt.NDArray[np.uint8]:
    """Return a sharpened version of the image, using an unsharp mask.

    The image is blurred with a Gaussian kernel and the blurred copy is
    subtracted from an amplified copy of the original:
    ``(amount + 1) * image - amount * blurred``. The result is clipped to
    ``[0, 255]``, rounded and converted to ``uint8``.

    Parameters
    ----------
    image : np.ndarray
        The image to be sharpened.
    kernel_size : tuple[int, int], optional
        The size of the Gaussian blur kernel, by default (5, 5)
    sigma : float, optional
        The standard deviation of the Gaussian blur, by default 1.0
    amount : float, optional
        The strength of the sharpening, by default 1.0
    threshold : float, optional
        Pixels whose absolute difference from the blurred image is below
        this value keep their original value, by default 0 (disabled)

    Returns
    -------
    np.ndarray
        The sharpened image.
    """
    blurred = cv2.GaussianBlur(image, kernel_size, sigma)

    # Work in float so that neither the weighted difference nor the
    # contrast mask below can wrap around the way unsigned 8-bit
    # arithmetic does.
    image_f = image.astype(np.float64)
    blurred_f = blurred.astype(np.float64)

    sharpened = float(amount + 1) * image_f - float(amount) * blurred_f
    sharpened = np.clip(sharpened, 0, 255).round().astype(np.uint8)

    if threshold > 0:
        # Subtracting the raw uint8 arrays would wrap modulo 256 wherever
        # blurred > image, turning small negative differences into values
        # near 255 and wrongly excluding those pixels from the mask.
        low_contrast_mask = np.absolute(image_f - blurred_f) < threshold
        np.copyto(sharpened, image, where=low_contrast_mask)
    return sharpened
154+
155+
83156
def imagetocsv(
84157
file: str,
85158
index_name: str | None = None,
86159
index: list[str] | str | None = None,
87160
column_header: list[str] | str | None = None,
88161
) -> pd.DataFrame:
89-
162+
"""Convert an image file to a pandas DataFrame.
163+
164+
Parameters
165+
----------
166+
file : str
167+
Path to the image file.
168+
index_name : str, optional
169+
Name of the index, by default None
170+
index : list[str] | str, optional
171+
Index values, by default None
172+
column_header : list[str] | str, optional
173+
Column header values, by default None
174+
175+
Returns
176+
-------
177+
pd.DataFrame
178+
"""
90179
file = str(file)
91180

92181
img = cv2.imread(file)
182+
h, w, _ = img.shape
183+
img = cv2.resize(img, (w * 3, h * 3))
184+
img = unsharp_mask(img)
93185
grayImage = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
94-
(_thresh, blackAndWhiteImage) = cv2.threshold(grayImage, 200, 255, cv2.THRESH_BINARY)
95-
96-
with tempfile.NamedTemporaryFile(delete=False) as fp:
97-
custom_oem_psm_config = r"--oem 3 --psm 6 -c preserve_interword_spaces=1x1"
98-
pdf: bytes = pytesseract.image_to_pdf_or_hocr(
99-
blackAndWhiteImage, lang="eng", extension="pdf", config=custom_oem_psm_config
100-
)
101-
fp.write(pdf)
102-
rows = pdftocsv(fp.name)
186+
(_thresh, blackAndWhiteImage) = cv2.threshold(grayImage, 180, 255, cv2.THRESH_BINARY)
187+
188+
# TODO: see if we can use pytesseract to get the table for windows in version 0.3.0
189+
# custom_oem_psm_config = r"""
190+
# --oem 3
191+
# --psm 6
192+
# -1 deu
193+
# -c tessedit_char_whitelist=0123456789.,%
194+
# -c preserve_interword_spaces=1
195+
# -c tessedit_create_pdf=1
196+
# """
197+
# pdf: bytes = pytesseract.image_to_pdf_or_hocr(
198+
# "blackAndWhiteImage.png", lang="eng", extension="pdf", config=custom_oem_psm_config
199+
# )
200+
201+
tmp = NamedTemporaryFile(delete=False, mode=None)
202+
prefix: str = tmp.name
203+
# Tesseract cannot handle stdin so we need to write the image to a file first
204+
cv2.imwrite(prefix + ".png", blackAndWhiteImage)
205+
_ = subprocess.run(
206+
[
207+
"tesseract",
208+
"--oem",
209+
"3",
210+
"--psm",
211+
"6",
212+
"-l",
213+
"eng",
214+
"-c",
215+
"tessedit_char_whitelist=0123456789.,%",
216+
"-c",
217+
"preserve_interword_spaces=1",
218+
"-c",
219+
"tessedit_create_pdf=1",
220+
prefix + ".png",
221+
prefix,
222+
# "pdf",
223+
],
224+
capture_output=True,
225+
)
226+
# pdftotext for layout analysis
227+
rows = pdftocsv(prefix + ".pdf")
228+
# remove the temporary files in garbage collection for windows to handle it
229+
del tmp
230+
gc.collect()
103231

104232
df = pd.DataFrame(rows)
105233
df = add_df_indexes_headers(df, index_name, index, column_header)

imagetocsv/string_modifiers.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,21 @@
44
def fix_common_mistakes(line: str) -> str:
    """Clean up a single OCR'd cell value.

    Steps, in order:

    1. Empty input returns ``""``.
    2. Surrounding whitespace is stripped.
    3. Known misreadings of zero (``"oo"``, ``"o°"``, ``"fe"``, ...) are
       mapped to ``"0"`` (compared case-insensitively).
    4. The characters ``@``, ``#`` and ``,`` are removed.
    5. A value ending in ``%`` that contains no decimal point gets a ``.``
       inserted before its last three characters (the final two characters
       preceding the ``%`` become the fractional part), e.g. ``"1234%"``
       becomes ``"12.34%"``.

    Parameters
    ----------
    line : str
        Raw cell text.

    Returns
    -------
    str
        The cleaned cell text, or ``""`` if nothing is left.
    """
    if not line:
        return ""

    line = line.strip()

    # Zero is a problem child. Needs special handling.
    if line.lower() in ["00", "o0", "oo", "0o", "o", "o°", "°o", "fe", "°"]:
        return "0"

    line = line.replace("@", "")
    line = line.replace("#", "")
    line = line.replace(",", "")

    if not line:
        return ""

    if line[-1] == "%" and "." not in line:
        # Use a plain negative slice for the tail: ``len(line) - 3`` goes
        # negative for strings shorter than three characters and would
        # silently drop leading characters (``"5%"`` used to become ``".%"``).
        line = line[:-3] + "." + line[-3:]

    return line

0 commit comments

Comments
 (0)