11from pypdf import PdfReader
22from pypdf .generic ._data_structures import Destination
3+ from rich .live import Live
34from typing import Dict
45
5- def extract_toc_and_sections (reader : PdfReader ) -> Dict :
6+ def extract_toc_and_sections (reader : PdfReader , live : Live ) -> Dict :
67 """
78 Extract table of contents and corresponding sections from a single PDF.
89 If TOC is not available, fall back to heuristic section detection.
@@ -80,7 +81,7 @@ def process_outline(entries, level=1):
8081 if page_text :
8182 section_text += page_text + "\n \n "
8283 except Exception as e :
83- print (f"Error extracting text from page { p } : { e } " )
84+ live . console . print (f"Error extracting text from page { p } : { e } " )
8485
8586 # Store the section
8687 section_id = f"{ level } _{ title .replace (' ' , '_' )[:30 ]} _{ page_number } "
@@ -94,7 +95,7 @@ def process_outline(entries, level=1):
9495
9596 # If no TOC was found or no valid sections were extracted, use page-based sections
9697 if not result ['has_toc' ] or not result ['sections' ]:
97- print ("No TOC found or no valid sections extracted. Using page-based sections." )
98+ live . console . print ("No TOC found or no valid sections extracted. Using page-based sections." )
9899 result ['has_toc' ] = False
99100
100101 for page_num , page in enumerate (reader .pages ):
@@ -110,6 +111,6 @@ def process_outline(entries, level=1):
110111 }
111112
112113 except Exception as e :
113- print (f"Error extracting TOC: { e } " )
114+ live . console . print (f"Error extracting TOC: { e } " )
114115
115116 return result
0 commit comments