11"""Main module."""
22from __future__ import annotations
33
4+ import gc
45import re
5- import tempfile
6+ import subprocess
67
78import cv2
9+ import numpy as np
10+ import numpy .typing as npt
811import pandas as pd
912import pdftotext
10- import pytesseract
1113
1214from imagetocsv .string_modifiers import fix_common_mistakes
15+ from imagetocsv .tempfile import NamedTemporaryFile
1316
17+ # import pytesseract
1418
15- def pdftocsv (file : str ):
19+
20+
21+ def pdftocsv (file : str ) -> list [list [str ]]:
22+ """Convert a pdf file to a list of lists of strings. We do this to keep layout information.
23+
24+ Parameters
25+ ----------
26+ file : str
27+ Path to the pdf file.
28+
29+ Returns
30+ -------
31+ list[list[str]]
32+ List of lists of strings.
33+ """
1634 tmpchar = "*"
1735 special_chars = "%"
18- all_positions = set ()
19- # file = '../tests/data/myimage.pdf'
36+ all_positions : set [int ] = set ()
37+
38+ # Find start positions of all columns
2039 with open (file , "rb" ) as f :
2140 pdf = pdftotext .PDF (f , physical = True )[0 ]
2241 for line in pdf .split ("\n " ):
2342 for special_char in special_chars .split ():
2443 line = line .replace (f" { special_char } " , f"{ special_char } " )
2544 line = line .replace (f" { special_char } " , f"{ special_char } " )
2645 line = f" { line .strip ()} "
27- for i , word in enumerate ( line .split () ):
46+ for word in line .split ():
2847 if not word :
2948 continue
3049 word = f" { word } "
31- # print(line)
3250 positions = [m .start () for m in re .finditer (word , line )]
3351 all_positions |= set (positions )
34- # print(line )
35- all_positions = sorted ( list ( all_positions ))
36- all_positions = [ p for p in all_positions ]
37- all_positions [ 0 ] = all_positions [ 0 ]
38- if len ( all_positions ) > 1 :
39- all_positions [ - 1 ] = all_positions [ - 1 ]
40-
41- lines = []
52+ ali_positions = sorted ( list ( all_positions ) )
53+ ali_positions = [ p for p in ali_positions ]
54+ ali_positions [ 0 ] = ali_positions [ 0 ]
55+ if len ( ali_positions ) > 1 :
56+ ali_positions [ - 1 ] = ali_positions [ - 1 ]
57+
58+ # Mark empty cells: where a column start position holds no text, insert the temporary placeholder character
59+ lines : list [ str ] = []
4260 for line in pdf .split ("\n " ):
43- # line = line.strip()
44- # print(line)
4561 if not [v for v in line .strip ()]:
4662 continue
4763 for special_char in special_chars .split ():
48- line = line .replace (f" { special_char } " , f"{ special_char } " )
49- line = line .replace (f" { special_char } " , f"{ special_char } " )
64+ line : str = line .replace (f" { special_char } " , f"{ special_char } " )
65+ line : str = line .replace (f" { special_char } " , f"{ special_char } " )
5066 for pos in all_positions :
5167 try :
5268 if not line [pos ].strip ():
@@ -55,6 +71,8 @@ def pdftocsv(file: str):
5571 line = line .ljust (pos , " " ) + tmpchar
5672 lines .append (line )
5773
74+ # 1. replace empty cells with special char with empty string
75+ # 2. fix any cells with common issues
5876 rows = []
5977 for line in lines :
6078 row = []
@@ -68,8 +86,25 @@ def pdftocsv(file: str):
6886 return rows
6987
7088
71- def add_df_indexes_headers (df : pd .DataFrame , index_name : str , index : str , column_header : str ):
72-
89+ def add_df_indexes_headers (df : pd .DataFrame , index_name : str , index : str , column_header : str ) -> pd .DataFrame :
90+ """Add indexes and headers to a dataframe.
91+
92+ Parameters
93+ ----------
94+ df : pd.DataFrame
95+ The dataframe to add indexes and headers to.
96+ index_name : str, optional
97+ Name of the index, by default None
98+ index : list[str] | str, optional
99+ Index values, by default None
100+ column_header : list[str] | str, optional
101+ Column header values, by default None
102+
103+ Returns
104+ -------
105+ pd.DataFrame
106+ The dataframe with indexes and headers.
107+ """
73108 if column_header :
74109 df .columns = column_header .split ("," ) if isinstance (column_header , str ) else column_header
75110 if index :
@@ -80,26 +115,119 @@ def add_df_indexes_headers(df: pd.DataFrame, index_name: str, index: str, column
80115 return df
81116
82117
def unsharp_mask(
    image: npt.NDArray[np.uint8],
    kernel_size: tuple[int, int] = (5, 5),
    sigma: float = 1.0,
    amount: float = 1.0,
    threshold: float = 0,
) -> npt.NDArray[np.uint8]:
    """Return a sharpened version of the image, using an unsharp mask.

    The image is blurred with a Gaussian kernel and the blurred copy is
    subtracted from an amplified original:
    ``(amount + 1) * image - amount * blurred``. The result is clipped to
    ``[0, 255]``, rounded and converted back to ``uint8``.

    Parameters
    ----------
    image : npt.NDArray[np.uint8]
        The image to be sharpened.
    kernel_size : tuple[int, int], optional
        The size of the Gaussian blur kernel, by default (5, 5)
    sigma : float, optional
        The standard deviation of the Gaussian blur, by default 1.0
    amount : float, optional
        The strength of the sharpening, by default 1.0
    threshold : float, optional
        When greater than 0, pixels whose absolute difference from the
        blurred image is below this value keep their original value,
        by default 0

    Returns
    -------
    npt.NDArray[np.uint8]
        The sharpened image.
    """
    blurred = cv2.GaussianBlur(image, kernel_size, sigma)

    # Do all arithmetic in float: subtracting uint8 arrays directly wraps
    # around modulo 256 whenever the blurred pixel is brighter than the
    # original, which corrupts the low-contrast mask below.
    image_f = image.astype(np.float64)
    blurred_f = blurred.astype(np.float64)
    strength = float(amount)

    sharpened = (strength + 1.0) * image_f - strength * blurred_f
    sharpened = np.clip(sharpened, 0, 255).round().astype(np.uint8)

    if threshold > 0:
        # Leave low-contrast regions untouched so noise is not amplified.
        low_contrast_mask = np.absolute(image_f - blurred_f) < threshold
        np.copyto(sharpened, image, where=low_contrast_mask)
    return sharpened
154+
155+
83156def imagetocsv (
84157 file : str ,
85158 index_name : str | None = None ,
86159 index : list [str ] | str | None = None ,
87160 column_header : list [str ] | str | None = None ,
88161) -> pd .DataFrame :
89-
162+ """Convert an image file to a pandas DataFrame.
163+
164+ Parameters
165+ ----------
166+ file : str
167+ Path to the image file.
168+ index_name : str, optional
169+ Name of the index, by default None
170+ index : list[str] | str, optional
171+ Index values, by default None
172+ column_header : list[str] | str, optional
173+ Column header values, by default None
174+
175+ Returns
176+ -------
177+ pd.DataFrame
178+ """
90179 file = str (file )
91180
92181 img = cv2 .imread (file )
182+ h , w , _ = img .shape
183+ img = cv2 .resize (img , (w * 3 , h * 3 ))
184+ img = unsharp_mask (img )
93185 grayImage = cv2 .cvtColor (img , cv2 .COLOR_RGB2GRAY )
94- (_thresh , blackAndWhiteImage ) = cv2 .threshold (grayImage , 200 , 255 , cv2 .THRESH_BINARY )
95-
96- with tempfile .NamedTemporaryFile (delete = False ) as fp :
97- custom_oem_psm_config = r"--oem 3 --psm 6 -c preserve_interword_spaces=1x1"
98- pdf : bytes = pytesseract .image_to_pdf_or_hocr (
99- blackAndWhiteImage , lang = "eng" , extension = "pdf" , config = custom_oem_psm_config
100- )
101- fp .write (pdf )
102- rows = pdftocsv (fp .name )
186+ (_thresh , blackAndWhiteImage ) = cv2 .threshold (grayImage , 180 , 255 , cv2 .THRESH_BINARY )
187+
188+ # TODO: see if we can use pytesseract to get the table for windows in version 0.3.0
189+ # custom_oem_psm_config = r"""
190+ # --oem 3
191+ # --psm 6
192+ # -1 deu
193+ # -c tessedit_char_whitelist=0123456789.,%
194+ # -c preserve_interword_spaces=1
195+ # -c tessedit_create_pdf=1
196+ # """
197+ # pdf: bytes = pytesseract.image_to_pdf_or_hocr(
198+ # "blackAndWhiteImage.png", lang="eng", extension="pdf", config=custom_oem_psm_config
199+ # )
200+
201+ tmp = NamedTemporaryFile (delete = False , mode = None )
202+ prefix : str = tmp .name
203+ # Tesseract cannot handle stdin so we need to write the image to a file first
204+ cv2 .imwrite (prefix + ".png" , blackAndWhiteImage )
205+ _ = subprocess .run (
206+ [
207+ "tesseract" ,
208+ "--oem" ,
209+ "3" ,
210+ "--psm" ,
211+ "6" ,
212+ "-l" ,
213+ "eng" ,
214+ "-c" ,
215+ "tessedit_char_whitelist=0123456789.,%" ,
216+ "-c" ,
217+ "preserve_interword_spaces=1" ,
218+ "-c" ,
219+ "tessedit_create_pdf=1" ,
220+ prefix + ".png" ,
221+ prefix ,
222+ # "pdf",
223+ ],
224+ capture_output = True ,
225+ )
226+ # pdftotext for layout analysis
227+ rows = pdftocsv (prefix + ".pdf" )
228+ # Drop the temp-file handle and force a garbage-collection pass so the temporary files can be removed (needed on Windows)
229+ del tmp
230+ gc .collect ()
103231
104232 df = pd .DataFrame (rows )
105233 df = add_df_indexes_headers (df , index_name , index , column_header )
0 commit comments