@@ -35,56 +35,71 @@ def process_word(word, ratio):
35
35
def process_document(doc_path, ratio):
    """Embolden part of every word in a document and save a modified copy.

    A ``.pdf`` input is first converted to ``.docx`` by running
    ``converter.py`` as a subprocess; a ``.docx`` input is used as-is.
    Every run that is not already bold is split into words, each word is
    passed through ``process_word`` and the run is rebuilt from the
    resulting (text, is_bold) pieces.  The result is written next to the
    input as ``<name>_modified.docx`` and that path is printed.

    Args:
        doc_path: Path to the input file; must end in ``.docx`` or ``.pdf``.
        ratio: Forwarded unchanged to ``process_word`` (the ratio of the
            characters of each word to make bold).

    Raises:
        SystemExit: With status 1 when the format is unsupported, the PDF
            conversion fails, the spaCy model cannot be loaded, or the
            document cannot be processed or saved.
    """
    doc_path = _resolve_docx_path(doc_path)

    # Decide the output location up front so the save below never refers
    # to a name that has not been assigned yet.
    # NOTE(review): the diff this was reconstructed from elides a few lines
    # just before this assignment; they are not reproduced here - confirm
    # nothing in them has side effects that are still needed.
    output_path = os.path.splitext(doc_path)[0] + '_modified.docx'

    # Load the spacy model for word recognition
    try:
        nlp = spacy.load('en_core_web_sm')
    except OSError as e:
        print("Error loading spaCy model:", e)
        sys.exit(1)

    try:
        # Open the .docx file
        word_doc = Document(doc_path)

        for paragraph in word_doc.paragraphs:
            _embolden_paragraph(paragraph, nlp, ratio)

        # Save the document; sys.exit raises SystemExit, which the outer
        # "except Exception" deliberately does not intercept.
        try:
            word_doc.save(output_path)
        except PermissionError as e:
            print("Error saving document:", e)
            sys.exit(1)
    except Exception as e:  # Catch any other unexpected errors
        print("Unexpected error processing document:", e)
        sys.exit(1)

    print(output_path)


def _resolve_docx_path(doc_path):
    """Return the path of a .docx to process, converting a .pdf if needed."""
    if doc_path.endswith('.docx'):
        print("Already in docx format")
        return doc_path

    if doc_path.endswith('.pdf'):
        # Swap only the extension; str.replace would also rewrite any
        # '.pdf' that appears earlier in the path.
        docx_path = os.path.splitext(doc_path)[0] + '.docx'
        try:
            # Run the conversion script with the same Python interpreter.
            # check=True is what makes a non-zero exit status raise
            # CalledProcessError instead of being silently ignored.
            subprocess.run(
                [sys.executable, 'converter.py',
                 '--pdf', doc_path, '--docx', docx_path],
                check=True,
            )
        except subprocess.CalledProcessError as e:
            print("Error converting PDF:", e)
            sys.exit(1)
        return docx_path

    print("Files of this format are not supported yet")
    print("Please use either .pdf or .docx files")
    # Non-zero status: this is an error exit like every other failure path.
    sys.exit(1)


def _embolden_paragraph(paragraph, nlp, ratio):
    """Rebuild every non-bold run of *paragraph* from process_word pieces."""
    # Snapshot the runs first: new runs are added to the paragraph below
    # and must not be visited again.
    for run in list(paragraph.runs):
        # Skip if the run is already bold
        if run.bold:
            continue

        # Split on single spaces and put the space back in front of every
        # word but the first, so joining the pieces restores the text.
        words = [
            ' ' + word if i != 0 else word
            for i, word in enumerate(run.text.split(' '))
        ]

        # Let spaCy tokenize each word, then split every token into
        # (text, is_bold) pieces.
        new_runs = []
        for word in words:
            for token in nlp(word):
                new_runs.extend(process_word(token.text, ratio))

        # Clear the original run
        run.text = ''

        # Add new runs with the appropriate formatting.
        # NOTE(review): add_run appends at the end of the paragraph, so in
        # a paragraph mixing bold and non-bold runs the rebuilt text may
        # land after the skipped bold runs - confirm, and if so insert
        # relative to `run` instead.
        for text, is_bold in new_runs:
            new_run = paragraph.add_run(text)
            new_run.bold = is_bold
100
-
101
114
102
115
def main ():
103
116
parser = argparse .ArgumentParser ()
0 commit comments