-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkeepLines.py
More file actions
69 lines (58 loc) · 2.51 KB
/
keepLines.py
File metadata and controls
69 lines (58 loc) · 2.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import re
# ---------------------------------------------------------------------------
# Configuration: edit the two folder paths below before running the script.
# ---------------------------------------------------------------------------
# Paths written for Bash or a terminal:
#   sourcePath - folder that holds the source files
#   destPath   - folder where the clean files should be saved
sourcePath = r'../corpus/stripped/'
destPath = r'/path to folder/foldername/'
# Paths written for Windows CMD (uncomment and adapt to use these instead):
#sourcePath = r"C:/Users/Tiff/Documents/UVic/Classes/ENGL598/samples/stripped/"
#destPath = r"C:/Users/Tiff/Documents/UVic/Classes/ENGL598/samples/oneString/"
# Placeholder for the name of the file currently being processed.
fileName = r''
# Patterns are compiled once here instead of once per processed file.
# One or more blank lines: the paragraph breaks of the raw text.
_PARAGRAPH_BREAK = re.compile(r'\n\n+')
# Two capital letters not followed by a word character (e.g. chapter
# headings IN ALL CAPS). The lookahead also succeeds at the very end of a
# paragraph, where there is no following character at all.
_ALL_CAPS = re.compile(r'[A-Z]{2}(?!\w)')
# A whole paragraph holding at most one word character (e.g. "I." at the
# start of a chapter). Anchored with \Z and used with match(), so it has to
# cover the paragraph from start to end.
_CHAP_NUM = re.compile(r'\W*\w?\W*\Z')
# Illustration captions.
_ILLUSTRATION = re.compile(r'\[Illustration:')
# Runs of two or more hyphens.
_DASH_RUN = re.compile(r'-{2,}')


def filterLines(lines):
    '''Return the paragraphs in *lines* that should be kept, in order.

    A paragraph is dropped when it
      * contains two capital letters not followed by a word character,
      * consists of at most one word character (this also drops blank and
        punctuation-only paragraphs),
      * contains "[Illustration:", or
      * contains an asterisk.

    lines -- iterable of paragraph strings.
    Returns a new list; *lines* itself is not modified.
    '''
    return [
        line for line in lines
        if not _ALL_CAPS.search(line)
        and not _CHAP_NUM.match(line)
        and not _ILLUSTRATION.search(line)
        and '*' not in line
    ]


def subChars(text):
    '''Return *text* with unwanted characters changed or deleted.

    Underscores, equals signs and vertical bars are deleted, the left
    single quotation mark becomes a plain apostrophe, and every run of two
    or more hyphens becomes one em dash.
    '''
    for unwanted in ('_', '=', '|'):
        text = text.replace(unwanted, '')
    # NOTE(review): only the left quote is converted; the right quote is
    # left alone, as before - confirm that this is intended.
    text = text.replace('‘', '\'')
    return _DASH_RUN.sub('—', text)


def cleanText(fulltext):
    '''Return the cleaned version of one file's full text.

    The text is split into paragraphs at runs of blank lines, unwanted
    paragraphs are removed with filterLines(), the rest are joined again
    with one blank line between them, and subChars() is applied.
    '''
    # str.split() treats its separator literally, so a regular-expression
    # split is required to break on "two or more newlines".
    paragraphs = _PARAGRAPH_BREAK.split(fulltext)
    return subChars('\n\n'.join(filterLines(paragraphs)))


def cleanFolder(sourceDir, destDir, fileNames=None):
    '''Clean files from *sourceDir* and save them under *destDir*.

    sourceDir -- folder holding the source files.
    destDir   -- existing folder that receives the clean files; each one
                 is saved under the same name as its source file.
    fileNames -- optional list of file names inside *sourceDir* to process.
                 When omitted, every entry of *sourceDir* is processed.

    Sub-folders are skipped. Errors raised while opening, reading or
    writing a file are not caught.
    '''
    if fileNames is None:
        fileNames = os.listdir(sourceDir)
    for name in fileNames:
        inPath = os.path.join(sourceDir, name)
        if os.path.isdir(inPath):
            continue  # os.listdir() also returns folders; they hold no text
        # Read-only mode is enough, and "with" closes the file afterwards.
        with open(inPath, 'r') as source:
            fulltext = source.read()
        with open(os.path.join(destDir, name), 'w') as dest:
            dest.write(cleanText(fulltext))


if __name__ == '__main__':
    # sourcePath and destPath are the folders configured near the top of
    # this script.
    # If you only want a specific set of files in your source folder, pass
    # their names (separated by commas) as a third argument:
    #cleanFolder(sourcePath, destPath, ['babylonVol1.txt', 'babylonVol2.txt', 'babylonVol3.txt', 'beckoningHand.txt'])
    #OR
    # process every file in the directory:
    cleanFolder(sourcePath, destPath)