-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_table_extraction.py
More file actions
136 lines (113 loc) · 3.67 KB
/
test_table_extraction.py
File metadata and controls
136 lines (113 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""
Test script for table extraction from PDF files.
This script demonstrates how to query the BigQuery KB table
to retrieve table rows with their column structure.
"""
from bq_handler import BigQueryRAG
def test_table_extraction():
    """
    Print the table rows stored in the KB table, grouped by table.

    Runs a fixed query (rows flagged ``is_table_row``, ordered by
    ``table_id`` then ``row_number``, capped at 100) through
    ``BigQueryRAG.run_query`` and writes a report to stdout: a banner
    whenever ``table_id`` changes, followed by one entry per row.
    Returns early with a notice when the query yields nothing.
    """
    kb = BigQueryRAG(dataset="IDB")

    # Rows come back ordered so that all rows of one table are adjacent.
    sql = """
SELECT
file_id,
table_id,
row_number,
html_tag,
columns,
content,
parent_file_id,
ingested_at
FROM `aixr-401704.IDB.KB`
WHERE is_table_row = true
ORDER BY table_id, row_number
LIMIT 100
"""
    print("π Querying table rows from BigQuery...\n")
    records = kb.run_query(sql)

    if not records:
        print("β οΈ No table rows found yet. Process a PDF with tables first.")
        return

    banner = '=' * 80
    last_table_id = None
    for record in records:
        table_id = record['table_id']
        cells = record.get('columns')

        # A change in table_id marks the first row of the next table.
        if table_id != last_table_id:
            last_table_id = table_id
            print(f"\n{banner}")
            print(f"π TABLE: {record['file_id']}")
            print(f" Table ID: {table_id}")
            print(f"{banner}\n")
            # Column names are taken from this first row, when it has any.
            if cells:
                print(f"Columns: {' | '.join(cells.keys())}\n")

        print(f"Row {record['row_number']}:")
        if cells:
            for name, value in cells.items():
                print(f" - {name}: {value}")
        else:
            # No structured columns: fall back to the raw row content.
            print(f" Content: {record['content']}")
        print()
def test_table_search(search_query="revenue"):
    """
    Search table rows whose content contains a term and print the matches.

    This is a case-insensitive substring match (SQL ``LIKE``) on the
    ``content`` column of rows flagged ``is_table_row``; it is not a
    vector/embedding search. At most 20 rows are printed.

    Args:
        search_query: Text to look for. It is matched literally: LIKE
            wildcards (``%``, ``_``), backslashes and single quotes in the
            term are escaped before being placed in the query. Defaults
            to ``"revenue"``; change it to suit your data.
    """
    rag = BigQueryRAG(dataset="IDB")
    print(f"\nπ Searching for '{search_query}' in table rows...\n")

    # Escape in two layers so the term is matched literally and cannot
    # break out of the quoted SQL literal:
    #   1. LIKE level: prefix \, % and _ with a backslash.
    #   2. String-literal level: double every backslash, escape quotes.
    # For a plain word such as "revenue" both steps are no-ops.
    like_term = search_query.lower()
    for special in ("\\", "%", "_"):
        like_term = like_term.replace(special, "\\" + special)
    sql_term = like_term.replace("\\", "\\\\").replace("'", "\\'")

    query = f"""
SELECT
file_id,
table_id,
row_number,
columns,
content
FROM `aixr-401704.IDB.KB`
WHERE is_table_row = true
AND LOWER(content) LIKE '%{sql_term}%'
ORDER BY table_id, row_number
LIMIT 20
"""
    results = rag.run_query(query)
    if not results:
        print(f"β οΈ No results found for '{search_query}'")
        return

    for row in results:
        print(f"π File: {row['file_id']}")
        print(f" Row {row['row_number']}: {row['content']}")
        if row.get('columns'):
            print(f" Columns: {row['columns']}")
        print()
def get_table_stats():
    """
    Print summary counts for the table rows held in the knowledge base.

    Queries the KB table for the number of rows flagged ``is_table_row``
    and the number of distinct ``table_id`` and ``file_id`` values among
    them, then prints the three figures. When the query returns nothing,
    only the heading is printed.
    """
    kb = BigQueryRAG(dataset="IDB")
    sql = """
SELECT
COUNT(*) as total_table_rows,
COUNT(DISTINCT table_id) as total_tables,
COUNT(DISTINCT file_id) as total_files_with_tables
FROM `aixr-401704.IDB.KB`
WHERE is_table_row = true
"""
    print("\nπ Table Statistics:\n")
    rows = kb.run_query(sql)
    if not rows:
        return

    # The aggregate query yields a single row; only the first is read.
    summary = rows[0]
    print(f"Total Table Rows: {summary['total_table_rows']}")
    print(f"Total Tables: {summary['total_tables']}")
    print(f"Files with Tables: {summary['total_files_with_tables']}")
if __name__ == "__main__":
    # Script entry point: print the statistics summary, then the per-table
    # row dump. Both calls query BigQuery through BigQueryRAG.
    print("π PDF Table Extraction Test Suite\n")
    print("=" * 80)
    # Run tests
    get_table_stats()
    test_table_extraction()
    # test_table_search()  # Uncomment to test search
    print("\n✅ Test complete!")