Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions src-python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,50 @@ for page in doc.pages:

```

## Table Headers
Through the Table class you can retrieve header column names by calling the `get_header_field_names` method. By default it returns an array containing the detected column names. If the table has more than one header, the output is returned in the form of an array of arrays. Alternatively, you may pass a custom header processor function as an argument to reformat the column list(s) according to your own requirements. Please find a code sample below.

```python

from textractcaller.t_call import call_textract, Textract_Features
from trp.trp2 import TDocument, TDocumentSchema
from trp.t_pipeline import order_blocks_by_geo
import trp
import json

# Call Textract on the input document, requesting the FORMS and TABLES features.
j = call_textract(input_document="path_to_some_document (PDF, JPEG, PNG)", features=[Textract_Features.FORMS, Textract_Features.TABLES])
# Load the response through TDocumentSchema, pass it through order_blocks_by_geo,
# then dump it back so it can be handed to trp.Document.
t_doc = TDocumentSchema().load(j)
ordered_doc = order_blocks_by_geo(t_doc)
trp_doc = trp.Document(TDocumentSchema().dump(ordered_doc))

# Work with the table at index 2 on the first page.
page = trp_doc.pages[0]
table = page.tables[2]

def process_headers(header_cells):
    """Combine stacked header rows into one label per column.

    ``header_cells`` is an iterable of header rows, each an iterable of cell
    objects. A cell flagged ``_isChildOfMergedCell`` contributes its
    ``mergedText``; any other cell contributes its ``text``. Every label is
    stripped, then the rows are joined column by column with " / ".

    Returns a flat list with one string per column. Any number of header
    rows is supported: a single row yields its stripped labels unchanged and
    an empty input yields an empty list.
    """
    header_names = [
        [
            (cell.mergedText if cell._isChildOfMergedCell else cell.text).strip()
            for cell in header
        ]
        for header in header_cells
    ]
    # zip(*rows) walks the rows column by column and stops at the shortest
    # row, so a two-row header produces "top / bottom" pairs.
    return [" / ".join(column) for column in zip(*header_names)]


# When a processor function is supplied, get_header_field_names hands it the
# table's header cells and returns whatever the processor returns.
headers = table.get_header_field_names(process_headers)

```





## Test

- Clone the repo and run pytest
Expand Down
31 changes: 31 additions & 0 deletions src-python/tests/test_trp.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,34 @@ def test_table_with_header(caplog):

rows = table.rows_without_header
assert len(rows) == 7


def test_table_with_header_get_field_names(caplog):
    """Check get_header_field_names with a custom header processor.

    Loads the ``data/response.json`` fixture next to this file, takes the
    table at index 2 on the first page, and expects the processor's combined
    header list to contain six entries.
    """
    caplog.set_level(logging.DEBUG)
    p = os.path.dirname(os.path.realpath(__file__))
    # The context manager closes the fixture file even if JSON parsing fails.
    with open(os.path.join(p, "data", "response.json")) as f:
        j = json.load(f)
    doc = Document(j)

    page = doc.pages[0]
    table = page.tables[2]

    def process_headers(header_cells):
        """Merge the first two header rows into "top / bottom" labels."""
        header_names = []
        for header in header_cells:
            s = []
            for cell in header:
                # Children of a merged cell carry the label on mergedText.
                if cell._isChildOfMergedCell:
                    s.append(cell.mergedText.strip())
                else:
                    s.append(cell.text.strip())
            header_names.append(s)

        t = header_names[0]
        b = header_names[1]
        return [top + " / " + bottom for top, bottom in zip(t, b)]

    headers = table.get_header_field_names(process_headers)
    assert len(headers) == 6
5 changes: 4 additions & 1 deletion src-python/trp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,8 +460,11 @@ def _resolve_merged_cells(self, blockMap):
merged_cell = MergedCell(blockMap[cid], blockMap, self._rows)
self._merged_cells.append(merged_cell)

def get_header_field_names(self):
def get_header_field_names(self, header_proc_func=None):
header_cells = self.header
if header_proc_func != None:
return header_proc_func(header_cells)

header_names = []
for header in header_cells:
s = []
Expand Down