Skip to content

Commit

Permalink
Merge pull request #10 from aws-samples/idp-lending
Browse files Browse the repository at this point in the history
(enhance): Update Mortgage with Amazon Textract Analyze Lending
  • Loading branch information
anjanvb authored Jan 11, 2023
2 parents 224c303 + 206e466 commit 415866c
Show file tree
Hide file tree
Showing 25 changed files with 756 additions and 43,922 deletions.
441 changes: 219 additions & 222 deletions industry/mortgage/01-document-classification.ipynb

Large diffs are not rendered by default.

1,640 changes: 0 additions & 1,640 deletions industry/mortgage/02-document-extraction-1.ipynb

This file was deleted.

502 changes: 502 additions & 0 deletions industry/mortgage/02-document-extraction.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from textractcaller.t_call import call_textract, Textract_Features, Textract_Types\n",
Expand All @@ -39,7 +41,8 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
Expand Down Expand Up @@ -93,7 +96,7 @@
"metadata": {},
"outputs": [],
"source": [
"documentName = \"docs/Paystub.jpg\"\n",
"documentName = \"docs/Paystub.png\"\n",
"display(Image(filename=documentName, width=500))"
]
},
Expand All @@ -110,11 +113,15 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"resp = call_textract(input_document = f's3://{data_bucket}/idp-mortgage/textract/Paystub.jpg')\n",
"with open(documentName, 'rb') as document:\n",
" imageBytes = bytearray(document.read())\n",
"\n",
"resp = call_textract(input_document = imageBytes)\n",
"text = get_string(textract_json=resp, output_type=[Textract_Pretty_Print.LINES])\n",
"\n",
"#Call Amazon Comprehend Detect PII Entities API\n",
Expand Down Expand Up @@ -148,21 +155,24 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"from textractoverlayer.t_overlay import DocumentDimensions, get_bounding_boxes\n",
"\n",
"def redact_doc(s3document, localpath, redact_entities):\n",
" print(s3document)\n",
"def redact_doc(localpath, redact_entities):\n",
" try:\n",
" img = PImage.open(localpath)\n",
" draw = ImageDraw.Draw(img)\n",
"\n",
" # Use call_textract to get bounding boxes\n",
" # call_textract without the features parameter uses Textract Detect text\n",
" resp = call_textract(input_document = s3document)\n",
" with open(localpath, 'rb') as document:\n",
" imageBytes = bytearray(document.read())\n",
" \n",
" resp = call_textract(input_document = imageBytes)\n",
" document_dimension:DocumentDimensions = DocumentDimensions(doc_width=img.size[0], doc_height=img.size[1])\n",
" overlay=[Textract_Types.LINE, Textract_Types.WORD, Textract_Types.FORM, Textract_Types.CELL, Textract_Types.KEY, Textract_Types.VALUE]\n",
" bounding_box_list = get_bounding_boxes(textract_json=resp, document_dimensions=[document_dimension], overlay_features=overlay)\n",
Expand Down Expand Up @@ -197,7 +207,7 @@
" \n",
" #Generate the redacted/enriched document file and save to file system\n",
" opfile = Path(localpath).stem\n",
" opfile = f'{opfile}_redacted.jpg' \n",
" opfile = f'{opfile}_redacted.png' \n",
" img.save(opfile) \n",
" print(f'Done.... Redacted file saved: {opfile}')\n",
" return opfile\n",
Expand All @@ -215,10 +225,12 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"redact_doc(f's3://{data_bucket}/idp-mortgage/textract/Paystub.jpg','docs/Paystub.jpg',['NAME','SSN','DATE_TIME'])"
"redact_doc('docs/Paystub.png',['NAME','SSN','DATE_TIME'])"
]
},
{
Expand All @@ -232,11 +244,13 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"bank_document_local='docs/Paystub.jpg'\n",
"redacted_file='Paystub_redacted.jpg'\n",
"bank_document_local='docs/Paystub.png'\n",
"redacted_file='Paystub_redacted.png'\n",
"\n",
"print(f'\\nUnredacted Document\\t\\t\\t\\t\\t\\t\\tRedacted Document \\n')\n",
"\n",
Expand Down Expand Up @@ -267,22 +281,6 @@
"\n",
"In order to clean up the files uploaded into the S3 bucket, execute the following command. If you created this SageMaker Studio domain manually, then follow the SageMaker documentation to delete the Studio domain. If you created the Studio domain using a CloudFormation stack, delete the stack. If you are performing this lab as part of an instructor-led workshop, please follow the instructions shared by the instructor."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!aws s3 rm s3://{data_bucket}/idp-mortgage/ --recursive --exclude \"*\" --include \"textract/*\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Loading

0 comments on commit 415866c

Please sign in to comment.