diff --git a/src-python/README.md b/src-python/README.md index 52cc078..52e49c4 100644 --- a/src-python/README.md +++ b/src-python/README.md @@ -254,6 +254,50 @@ for page in doc.pages: ``` +## Table Headers +Through the Table class you can retrieve header column names by calling the get_header_field_names method. By default it returns an array containing the detected column names. If the table has more than one header, the output is returned in the form of an array of arrays. You may alternatively pass a custom header processor function as an argument to reformat the column list(s) according to your own requirements. Please find a code sample below. + +```python + + from textractcaller.t_call import call_textract, Textract_Features + from trp.trp2 import TDocument, TDocumentSchema + from trp.t_pipeline import order_blocks_by_geo + import trp + import json + + j = call_textract(input_document="path_to_some_document (PDF, JPEG, PNG)", features=[Textract_Features.FORMS, Textract_Features.TABLES]) + t_doc = TDocumentSchema().load(j) + ordered_doc = order_blocks_by_geo(t_doc) + trp_doc = trp.Document(TDocumentSchema().dump(ordered_doc)) + + page = trp_doc.pages[0] + table = page.tables[2] + + def process_headers(header_cells): + header_names = [] + for header in header_cells: + s = [] + for cell in header: + if cell._isChildOfMergedCell: + s.append(cell.mergedText.strip()) + else: + s.append(cell.text.strip()) + header_names.append(s) + + t = header_names[0] + b = header_names[1] + header_names = [i + " / " + j for i, j in zip(t, b)] + return header_names + + + headers = table.get_header_field_names(process_headers) + +``` + + + + + ## Test - Clone the repo and run pytest diff --git a/src-python/tests/test_trp.py b/src-python/tests/test_trp.py index a9bd029..44c3560 100644 --- a/src-python/tests/test_trp.py +++ b/src-python/tests/test_trp.py @@ -141,3 +141,34 @@ def test_table_with_header(caplog): rows = table.rows_without_header assert len(rows) == 7 + + +def 
test_table_with_header_get_field_names(caplog): + caplog.set_level(logging.DEBUG) + p = os.path.dirname(os.path.realpath(__file__)) + f = open(os.path.join(p, "data", "response.json")) + j = json.load(f) + doc = Document(j) + + page = doc.pages[0] + table = page.tables[2] + + def process_headers(header_cells): + header_names = [] + for header in header_cells: + s = [] + for cell in header: + if cell._isChildOfMergedCell: + s.append(cell.mergedText.strip()) + else: + s.append(cell.text.strip()) + header_names.append(s) + + t = header_names[0] + b = header_names[1] + header_names = [i + " / " + j for i, j in zip(t, b)] + return header_names + + + headers = table.get_header_field_names(process_headers) + assert len(headers) == 6 diff --git a/src-python/trp/__init__.py b/src-python/trp/__init__.py index 78c74c6..027e784 100644 --- a/src-python/trp/__init__.py +++ b/src-python/trp/__init__.py @@ -460,8 +460,11 @@ def _resolve_merged_cells(self, blockMap): merged_cell = MergedCell(blockMap[cid], blockMap, self._rows) self._merged_cells.append(merged_cell) - def get_header_field_names(self): + def get_header_field_names(self, header_proc_func=None): header_cells = self.header + if header_proc_func != None: + return header_proc_func(header_cells) + header_names = [] for header in header_cells: s = []