Skip to content

Commit d93e6f8

Browse files
included additional input types
1 parent 15864bf commit d93e6f8

File tree

6 files changed

+112
-31
lines changed

6 files changed

+112
-31
lines changed

README.md

+8-5
Original file line numberDiff line numberDiff line change
@@ -116,17 +116,20 @@ Now, based on the example of a national passport, let us take a closer look at t
116116
117117
## 🗹ToDo
118118
119-
- [x] Test for **mrva** and **mrvb** documents
120-
- [x] Add `wiki` page
119+
- [x] Include mrva and mrvb documents
120+
- [x] Add wiki page
121121
- [ ] Support numpy array as input
122122
- [x] Support mrz text as input
123-
- [ ] Support base64 as input
123+
- [x] Support base64 as input
124+
- [ ] Support pdf as input
124125
- [x] Function to return mrz text as output
125126
- [ ] Bulk process
126127
- [ ] Train Tesseract model with additional data
127-
- [ ] Add function parameter - is_checkdigit
128+
- [ ] Add function parameter - include_checkdigit
128129
- [ ] Add function - get_mrz_image
129-
- [ ] Support pdf as input
130+
- [x] Add function - validate_mrz
131+
- [ ] Add function - generate_mrz
132+
- [ ] Extract face image
130133
131134
## ⚖️License
132135

docs/LAGHIMA.png

-4.11 MB
Binary file not shown.

docs/mrz_fields_distribution.png

42 KB
Loading

fastmrz/fastmrz.py

+66-23
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import numpy as np
22
import cv2
33
import pytesseract
4-
from datetime import datetime
54
import os
6-
5+
import base64
6+
from datetime import datetime
7+
from io import BytesIO
8+
from PIL import Image
79

810
class FastMRZ:
911
def __init__(self, tesseract_path=""):
@@ -89,6 +91,16 @@ def _format_date(self, input_date):
8991

9092
return formatted_date
9193

94+
def _get_birth_date(self, birth_date_str, expiry_date_str):
    """Pick the century of a formatted birth date using the expiry date.

    Both arguments are read as strings whose first four characters are the
    year and whose month/day part starts at index 5 (for example
    ``"1995-05-20"``).

    Args:
        birth_date_str: Formatted date of birth.
        expiry_date_str: Formatted date of expiry of the same document.

    Returns:
        ``birth_date_str`` unchanged when the expiry year is later than the
        birth year; otherwise the same month/day with the year moved back
        by 100.

    Raises:
        ValueError: If either string does not start with four digits.
    """
    birth_year = int(birth_date_str[:4])
    expiry_year = int(expiry_date_str[:4])

    # A birth year that is not before the expiry year cannot be right, so it
    # is moved one century back. Presumably this compensates for the
    # two-digit years in the raw MRZ date fields -- confirm against
    # _format_date.
    if expiry_year > birth_year:
        return birth_date_str

    return f"{birth_year - 100}-{birth_date_str[5:]}"
103+
92104
def _is_valid(self, image):
93105
if isinstance(image, str):
94106
return bool(os.path.isfile(image))
@@ -103,23 +115,21 @@ def _get_mrz(self, image):
103115

104116
return self._cleanse_roi(mrz_roi)
105117

106-
def get_details(self, image, ignore_parse=False):
118+
def _base64_to_image_array(self, base64_string):
    """Decode a base64-encoded image into a NumPy pixel array.

    Args:
        base64_string: Base64 text (or bytes) of an encoded image file.
            A ``data:<mime>;base64,`` header is accepted and ignored.

    Returns:
        ``numpy.ndarray`` built from the decoded Pillow image. Images in
        ``RGB``, ``RGBA`` or ``L`` mode are returned as-is; every other
        mode is converted to ``RGB`` first.

    Raises:
        binascii.Error: If the payload is not valid base64.
        OSError: If Pillow cannot identify or read the decoded bytes.
    """
    # Browsers and many APIs send data URIs; only the part after the first
    # comma is the base64 payload.
    if isinstance(base64_string, str) and base64_string.lstrip().startswith("data:"):
        base64_string = base64_string.partition(",")[2]

    image_data = base64.b64decode(base64_string)
    with Image.open(BytesIO(image_data)) as decoded:
        # Palette ("P"), bilevel ("1"), CMYK, etc. do not turn into plain
        # colour pixel data under np.array(); normalise them to RGB.
        if decoded.mode in ("RGB", "RGBA", "L"):
            pixels = decoded
        else:
            pixels = decoded.convert("RGB")
        # NOTE(review): Pillow yields RGB channel order, whereas OpenCV
        # readers yield BGR -- confirm _get_mrz is not order-sensitive.
        return np.array(pixels)
125+
126+
def get_details_old(self, image, ignore_parse=False, include_checkdigit=True):
    """Legacy image-path entry point; forwards to :meth:`get_details`.

    Kept so existing callers keep working. The work is delegated to
    ``get_details(..., input_type="imagepath")`` so the two code paths
    cannot drift apart.

    Args:
        image: Image input checked by ``_is_valid`` (a file path string in
            the common case).
        ignore_parse: When true, return the raw MRZ text instead of the
            parsed dictionary.
        include_checkdigit: Forwarded unchanged to ``get_details``.

    Returns:
        Whatever ``get_details`` returns for the ``"imagepath"`` input type.
    """
    return self.get_details(
        image,
        input_type="imagepath",
        ignore_parse=ignore_parse,
        include_checkdigit=include_checkdigit,
    )
112132

113-
def _get_birth_date(self, birth_date_str, expiry_date_str):
114-
birth_year = int(birth_date_str[:4])
115-
expiry_year = int(expiry_date_str[:4])
116-
117-
if expiry_year > birth_year:
118-
return birth_date_str
119-
adjusted_year = birth_year - 100
120-
121-
return f"{adjusted_year}-{birth_date_str[5:]}"
122-
123133
def _parse_mrz(self, mrz_text):
124134
if not mrz_text:
125135
return {"status": "FAILURE", "message": "No MRZ detected"}
@@ -149,22 +159,22 @@ def _parse_mrz(self, mrz_text):
149159
mrz_code_dict["document_number"] = mrz_lines[1][:9].replace("<", "")
150160
mrz_code_dict["document_number_checkdigit"] = self._get_check_digit(mrz_code_dict["document_number"])
151161
if mrz_code_dict["document_number_checkdigit"] != mrz_lines[1][9]:
152-
return {"status": "FAILURE", "message": "document number checksum is not matching"}
162+
return {"status": "FAILURE", "message": "Document number checksum is not matching"}
153163

154164
mrz_code_dict["nationality_code"] = mrz_lines[1][10:13]
155165
if not mrz_code_dict["nationality_code"].isalpha():
156166
return {"status": "FAILURE", "message": "Invalid MRZ format"}
157167

158168
mrz_code_dict["birth_date"] = mrz_lines[1][13:19]
159169
if self._get_check_digit(mrz_code_dict["birth_date"]) != mrz_lines[1][19]:
160-
return {"status": "FAILURE", "message": "date of birth checksum is not matching"}
170+
return {"status": "FAILURE", "message": "Date of birth checksum is not matching"}
161171
mrz_code_dict["birth_date"] = self._format_date(mrz_code_dict["birth_date"])
162172

163173
mrz_code_dict["sex"] = mrz_lines[1][20]
164174

165175
mrz_code_dict["expiry_date"] = mrz_lines[1][21:27]
166176
if self._get_check_digit(mrz_code_dict["expiry_date"]) != mrz_lines[1][27]:
167-
return {"status": "FAILURE", "message": "date of expiry checksum is not matching"}
177+
return {"status": "FAILURE", "message": "Date of expiry checksum is not matching"}
168178
mrz_code_dict["expiry_date"] = self._format_date(mrz_code_dict["expiry_date"])
169179
mrz_code_dict["birth_date"] = self._get_birth_date(mrz_code_dict["birth_date"], mrz_code_dict["expiry_date"])
170180

@@ -179,7 +189,7 @@ def _parse_mrz(self, mrz_text):
179189

180190
if (mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines[1], mrz_code_dict["mrz_type"])
181191
and mrz_code_dict["mrz_type"] not in ("MRVA", "MRVB")):
182-
return {"status": "FAILURE", "message": "final checksum is not matching"}
192+
return {"status": "FAILURE", "message": "Final checksum is not matching"}
183193
else:
184194
mrz_code_dict["mrz_type"] = "TD1"
185195

@@ -193,21 +203,21 @@ def _parse_mrz(self, mrz_text):
193203
mrz_code_dict["document_number"] = mrz_lines[0][5:14]
194204
mrz_code_dict["document_number_checkdigit"] = self._get_check_digit(mrz_code_dict["document_number"])
195205
if mrz_code_dict["document_number_checkdigit"] != mrz_lines[0][14]:
196-
return {"status": "FAILURE", "message": "document number checksum is not matching"}
206+
return {"status": "FAILURE", "message": "Document number checksum is not matching"}
197207

198208
mrz_code_dict["optional_data_1"] = mrz_lines[0][15:].strip("<")
199209

200210
# Line 2
201211
mrz_code_dict["birth_date"] = mrz_lines[1][:6]
202212
if self._get_check_digit(mrz_code_dict["birth_date"]) != mrz_lines[1][6]:
203-
return {"status": "FAILURE", "message": "date of birth checksum is not matching"}
213+
return {"status": "FAILURE", "message": "Date of birth checksum is not matching"}
204214
mrz_code_dict["birth_date"] = self._format_date(mrz_code_dict["birth_date"])
205215

206216
mrz_code_dict["sex"] = mrz_lines[1][7]
207217

208218
mrz_code_dict["expiry_date"] = mrz_lines[1][8:14]
209219
if self._get_check_digit(mrz_code_dict["expiry_date"]) != mrz_lines[1][14]:
210-
return {"status": "FAILURE", "message": "date of expiry checksum is not matching"}
220+
return {"status": "FAILURE", "message": "Date of expiry checksum is not matching"}
211221
mrz_code_dict["expiry_date"] = self._format_date(mrz_code_dict["expiry_date"])
212222

213223
mrz_code_dict["birth_date"] = self._get_birth_date(mrz_code_dict["birth_date"], mrz_code_dict["expiry_date"])
@@ -218,7 +228,7 @@ def _parse_mrz(self, mrz_text):
218228

219229
mrz_code_dict["optional_data_2"] = mrz_lines[0][18:29].strip("<")
220230
if mrz_lines[1][-1] != self._get_final_check_digit(mrz_lines, mrz_code_dict["mrz_type"]):
221-
return {"status": "FAILURE", "message": "final checksum is not matching"}
231+
return {"status": "FAILURE", "message": "Final checksum is not matching"}
222232

223233
# Line 3
224234
names = mrz_lines[2].split("<<")
@@ -232,7 +242,40 @@ def _parse_mrz(self, mrz_text):
232242

233243
return mrz_code_dict
234244

235-
def get_details_mrz(self, mrz_text):
245+
def validate_mrz(self, mrz_text):
    """Report whether an MRZ string parses successfully.

    The text is cleaned with ``_cleanse_roi`` and handed to ``_parse_mrz``;
    the parser's verdict is condensed into a two-key dictionary.

    Args:
        mrz_text: Raw MRZ text, lines separated by newlines.

    Returns:
        ``{"is_valid": True, "message": "The given mrz is valid"}`` when the
        parser reports status ``"SUCCESS"``; otherwise
        ``{"is_valid": False, "message": <parser message>}``.
    """
    result = self._parse_mrz(self._cleanse_roi(mrz_text))

    if result.get("status") == "SUCCESS":
        return {"is_valid": True, "message": "The given mrz is valid"}

    # Pass the parser's own failure reason through to the caller.
    return {"is_valid": False, "message": result.get("message")}
253+
254+
def get_details(self, input_data, input_type="imagepath", ignore_parse=False, include_checkdigit=True):
    """Extract MRZ details from one of several input kinds.

    Args:
        input_data: The input; its meaning depends on ``input_type``.
        input_type: One of
            ``"imagepath"`` -- image input checked by ``_is_valid``;
            ``"numpy"`` -- an image already loaded as ``numpy.ndarray``;
            ``"base64"`` -- a base64-encoded image file;
            ``"text"`` -- MRZ text, cleaned with ``_cleanse_roi``;
            ``"pdf"`` -- reserved, not implemented yet.
        ignore_parse: When true, return the MRZ text instead of parsing it.
        include_checkdigit: Accepted for API compatibility; not used by
            this method at present.

    Returns:
        The MRZ text when ``ignore_parse`` is true, otherwise the result of
        ``_parse_mrz``. An image that fails validation or cannot be decoded
        yields ``{"status": "FAILURE", "message": "Invalid input image"}``.

    Raises:
        ValueError: If ``input_type`` is ``"numpy"`` but ``input_data`` is
            not an ndarray, or if ``input_type`` is unknown.
        NotImplementedError: If ``input_type`` is ``"pdf"``.
    """
    if input_type == "imagepath":
        if not self._is_valid(input_data):
            return {"status": "FAILURE", "message": "Invalid input image"}
        mrz_text = self._get_mrz(input_data)
    elif input_type == "numpy":
        if not isinstance(input_data, np.ndarray):
            raise ValueError("Input is not a valid NumPy array.")
        # Previously this branch validated the array and then fell through,
        # returning None; run it through the same pipeline as decoded
        # base64 images.
        mrz_text = self._get_mrz(input_data)
    elif input_type == "base64":
        try:
            image_array = self._base64_to_image_array(input_data)
        except (TypeError, ValueError, OSError):
            # Bad base64 (binascii.Error is a ValueError) or bytes that are
            # not a readable image: report it like an invalid image path.
            return {"status": "FAILURE", "message": "Invalid input image"}
        mrz_text = self._get_mrz(image_array)
    elif input_type == "text":
        mrz_text = self._cleanse_roi(input_data)
    elif input_type == "pdf":
        # Fail loudly instead of silently returning None.
        raise NotImplementedError("PDF input is not supported yet")
    else:
        raise ValueError(f"Unsupported input_type: {input_type}")

    return mrz_text if ignore_parse else self._parse_mrz(mrz_text)

fastmrz/main.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from fastmrz import FastMRZ
22
import json
3+
import base64
34

4-
fast_mrz = FastMRZ()
5+
# Rely on Tesseract being discoverable on PATH; a hard-coded install
# location only works on the machine it was written on.
fast_mrz = FastMRZ()
56
# Pass the path of the installed Tesseract OCR executable if it is not on the PATH environment variable
67
# fast_mrz = FastMRZ(tesseract_path=r'/path/to/tesseract/source')
78
passport_mrz = fast_mrz.get_details("../data/passport_uk.jpg")
@@ -16,6 +17,21 @@
1617

1718
print("\n")
1819

19-
passport_mrz = fast_mrz.get_details_mrz("P<GBRPUDARSAN<<HENERT<<<<<<<<<<<<<<<<<<<<<<<\n7077979792GBR9505209M1704224<<<<<<<<<<<<<<00")
20+
passport_mrz = fast_mrz.get_details("P<GBRPUDARSAN<<HENERT<<<<<<<<<<<<<<<<<<<<<<<\n7077979792GBR9505209M1704224<<<<<<<<<<<<<<00", input_type="text")
2021
print("JSON:")
2122
print(json.dumps(passport_mrz, indent=4))
23+
24+
print("\n")
25+
26+
is_valid = fast_mrz.validate_mrz("P<GBRPUDARSAN<<HENERT<<<<<<<<<<<<<<<<<<<<<<<\n7077979792GBR1505209M1704224<<<<<<<<<<<<<<00")
27+
print("MRZ VALIDITY CHECK:")
28+
print(json.dumps(is_valid, indent=4))
29+
30+
print("\n")
31+
# Read the sample image as raw bytes; the context manager closes the file
# even if the read fails.
with open("../data/passport_uk.jpg", "rb") as image_file:
    image_data = image_file.read()
34+
base64_string = base64.b64encode(image_data).decode("utf-8")
35+
passport_mrz = fast_mrz.get_details(base64_string, input_type="base64", ignore_parse=True)
36+
print("TEXT:")
37+
print(passport_mrz)

tests/test.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import os
44
from fastmrz import FastMRZ
55

6-
fast_mrz = FastMRZ()
6+
# Use an optional TESSERACT_PATH override rather than a machine-specific
# install path; the empty fallback equals the constructor's default.
fast_mrz = FastMRZ(tesseract_path=os.environ.get("TESSERACT_PATH", ""))
77

88

99
class TestFastMRZMethods(unittest.TestCase):
@@ -69,6 +69,25 @@ def test_read_mrz_mrvb(self):
6969
self.assertIsInstance(mrz_data, dict)
7070
self.assertIn("status", mrz_data.keys())
7171

72+
def test_validate_mrz(self):
    """The sample passport MRZ is reported as valid with the fixed message."""
    mrz_text = (
        "P<GBRPUDARSAN<<HENERT<<<<<<<<<<<<<<<<<<<<<<<\n"
        "7077979792GBR9505209M1704224<<<<<<<<<<<<<<00"
    )
    outcome = fast_mrz.validate_mrz(mrz_text)
    self.assertEqual(outcome, {"is_valid": True, "message": "The given mrz is valid"})
77+
78+
def test_validate_mrz_invalid_format(self):
    """An MRZ whose first line starts with unexpected text is rejected."""
    mrz_text = (
        "INVALIDTEXT<<HENERT<<<<<<<<<<<<<<<<<<<<<<<\n"
        "7077979792GBR9505209M1704224<<<<<<<<<<<<<<00"
    )
    outcome = fast_mrz.validate_mrz(mrz_text)
    # Only the shape of the failure is checked, not the exact reason.
    self.assertFalse(outcome["is_valid"])
    self.assertIn("message", outcome)
    self.assertIsInstance(outcome["message"], str)
84+
85+
def test_validate_mrz_invalid_check_digit(self):
    """The sample MRZ with its last character changed is rejected."""
    mrz_text = (
        "P<GBRPUDARSAN<<HENERT<<<<<<<<<<<<<<<<<<<<<<<\n"
        "7077979792GBR9505209M1704224<<<<<<<<<<<<<<01"
    )
    outcome = fast_mrz.validate_mrz(mrz_text)
    # Only the shape of the failure is checked, not the exact reason.
    self.assertFalse(outcome["is_valid"])
    self.assertIn("message", outcome)
    self.assertIsInstance(outcome["message"], str)
7291

7392
if __name__ == "__main__":
7493
unittest.main()

0 commit comments

Comments
 (0)