aws-samples · nz6 · Jul 18, 2022 · Jul 29, 2022
diff --git a/src-python/README.md b/src-python/README.md
@@ -254,6 +254,50 @@ for page in doc.pages:
 
 ```
 
+## Table Headers
+Through the Table class you can retrieve header column names by calling the `get_header_field_names` method. By default it returns an array containing the detected column names. If the table has more than one header row, the output is returned in the form of an array of arrays. Alternatively, you may pass a custom header processor function as an argument to reformat the column list(s) according to your own requirements. Please find a code sample below.
+
+```python
+
+    from textractcaller.t_call import call_textract, Textract_Features
+    from trp.trp2 import TDocument, TDocumentSchema
+    from trp.t_pipeline import order_blocks_by_geo
+    import trp
+    import json
+
+    j = call_textract(input_document="path_to_some_document (PDF, JPEG, PNG)", features=[Textract_Features.FORMS, Textract_Features.TABLES])
+    t_doc = TDocumentSchema().load(j)
+    ordered_doc = order_blocks_by_geo(t_doc)
+    trp_doc = trp.Document(TDocumentSchema().dump(ordered_doc))
+
+    page = trp_doc.pages[0]
+    table = page.tables[2]
+
+    def process_headers(header_cells):
+        header_names = []
+        for header in header_cells:
+            s = []
+            for cell in header:
+                if cell._isChildOfMergedCell:
+                    s.append(cell.mergedText.strip())
+                else:
+                    s.append(cell.text.strip())
+            header_names.append(s)
+
+        t = header_names[0]
+        b = header_names[1]
+        header_names = [i + " / " + j for i, j in zip(t, b)]
+        return header_names
+
+
+    headers = table.get_header_field_names(process_headers)
+
+```
+
+
+
+
+
 ## Test
 
 - Clone the repo and run pytest

diff --git a/src-python/tests/test_trp.py b/src-python/tests/test_trp.py
@@ -141,3 +141,34 @@ def test_table_with_header(caplog):
 
     rows = table.rows_without_header
     assert len(rows) == 7
+
+
+def test_table_with_header_get_field_names(caplog):
+    caplog.set_level(logging.DEBUG)
+    p = os.path.dirname(os.path.realpath(__file__))
+    f = open(os.path.join(p, "data", "response.json"))
+    j = json.load(f)
+    doc = Document(j)
+
+    page = doc.pages[0]
+    table = page.tables[2]
+
+    def process_headers(header_cells):
+        header_names = []
+        for header in header_cells:
+            s = []
+            for cell in header:
+                if cell._isChildOfMergedCell:
+                    s.append(cell.mergedText.strip())
+                else:
+                    s.append(cell.text.strip())
+            header_names.append(s)
+
+        t = header_names[0]
+        b = header_names[1]
+        header_names = [i + " / " + j for i, j in zip(t, b)]
+        return header_names
+
+
+    headers = table.get_header_field_names(process_headers)
+    assert len(headers) == 6
diff --git a/src-python/trp/__init__.py b/src-python/trp/__init__.py
@@ -460,8 +460,11 @@ def _resolve_merged_cells(self, blockMap):
             merged_cell = MergedCell(blockMap[cid], blockMap, self._rows)
             self._merged_cells.append(merged_cell)
 
-    def get_header_field_names(self):
+    def get_header_field_names(self, header_proc_func=None):
         header_cells = self.header
+        if header_proc_func != None:
+            return header_proc_func(header_cells)
+
         header_names = []
         for header in header_cells:
             s = []