-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathload_lacour.py
164 lines (149 loc) · 6.27 KB
/
load_lacour.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import logging
import os
import re
import xml.etree.ElementTree as ET
from itertools import groupby
from typing import Any, Dict, List, Tuple
def write_xml(file: str, output_dir: str) -> None:
    """Convert a txt transcript into an xml transcript file.

    The transcript is read with ``load_transcript`` and serialized as a
    ``Transcript`` root that holds one ``WebcastID`` element followed by one
    ``SpeakerSegment`` element per run of consecutive snippets sharing the
    same segment id. The tree is indented with tabs and written to
    ``{output_dir}/{webcast_id}_transcript.xml`` as UTF-8 with an xml
    declaration.

    Args:
        file (str): Location of the txt file.
        output_dir (str): Target location of the xml file.

    Raises:
        AssertionError: A speaker segment contains more than one distinct
            speaker name or speaker role.
    """

    def add_child(parent, tag, text=None):
        # Create a sub-element with the given text and a newline tail.
        node = ET.SubElement(parent, tag)
        node.text = text
        node.tail = "\n"
        return node

    entries, webcast_id = load_transcript(file, fformat="txt")

    document = ET.Element("Transcript")
    document.tail = "\n"
    add_child(document, "WebcastID", webcast_id)

    # groupby only merges adjacent items, so each group is one contiguous
    # run of snippets belonging to the same speaker segment.
    for _, group in groupby(entries, lambda entry: entry["segment_id"]):
        members = list(group)
        assert (
            len({m["speaker_name"] for m in members}) == 1
        ), "There are non unique names in the speaker segment"
        assert (
            len({m["speaker_role"] for m in members}) == 1
        ), "There are non unique roles in the speaker segment"

        head = members[0]
        segment_node = add_child(document, "SpeakerSegment")
        add_child(segment_node, "Role", head["speaker_role"])
        add_child(segment_node, "Name", head["speaker_name"])

        for member in members:
            snippet_node = add_child(segment_node, "Snippet")
            add_child(snippet_node, "Language", member["language"])
            add_child(snippet_node, "TimestampBegin", str(member["begin"]))
            add_child(snippet_node, "TimestampEnd", str(member["end"]))
            add_child(snippet_node, "Text", member["text"])

    xml_tree = ET.ElementTree(document)
    ET.indent(xml_tree, space="\t", level=0)
    xml_tree.write(
        f"{output_dir}/{webcast_id}_transcript.xml",
        encoding="utf-8",
        xml_declaration=True,
    )
def load_transcript(
    f: str, fformat: str = "txt"
) -> Tuple[List[Dict[str, Any]], str]:
    """Load a transcript from a file and parse it by snippets.

    Each snippet is a dictionary with the keys ``webcast_id``,
    ``segment_id``, ``snippet_id``, ``speaker_role``, ``speaker_name``,
    ``language``, ``begin``, ``end`` and ``text``.

    Args:
        f (str): Location of the file.
        fformat (str, optional): Defines the file format to load. Supports
            'txt' or 'xml'. Defaults to 'txt'.

    Raises:
        AssertionError: The file does not exist, or its extension does not
            match ``fformat``.
        NotImplementedError: File formats other than 'txt' or 'xml' are not
            supported.

    Returns:
        Tuple[List[Dict[str, Any]], str]: List of dictionaries containing
        snippets and the webcast id.
    """
    assert os.path.exists(f), f"File does not exist or is not accessible: {f}"

    if fformat == "txt":
        assert f.endswith(".txt"), f"File is not a txt file: {f}"
        return _load_txt_transcript(f)
    if fformat == "xml":
        assert f.endswith(".xml"), f"File is not an xml file: {f}"
        return _load_xml_transcript(f)

    # no other file formats are supported for now
    raise NotImplementedError(f"Unsupported file format: {fformat}")


def _load_txt_transcript(f: str) -> Tuple[List[Dict[str, Any]], str]:
    """Parse a txt transcript into snippet dictionaries and its webcast id.

    Lines starting with ``[[`` open a new speaker segment (``role;name``),
    lines starting with ``<<`` set the timing and language
    (``begin;end;lang``) for the snippets that follow, lines starting with a
    newline are skipped, and every other line becomes one snippet.
    """
    # The webcast id is the "<digits>_<digits>" part preceding "_transcript"
    # in the path. re.search returns None on a miss, so test for that
    # explicitly instead of subscripting the result.
    id_match = re.search(r"([0-9]+_[0-9]+)(?=_transcript)", f)
    if id_match is None:
        logging.warning(f"Could not find a valid webcast id in the file name {f}")
        # Same placeholder as the unknown speaker fields below, so parsing
        # can continue and the return value is always defined.
        w_id = "UNK"
    else:
        w_id = id_match[0]

    segment_id = -1
    snippet_id = -1
    role, name, lang = "UNK", "UNK", "UNK"
    begin, end = 0, 0
    segments = []

    with open(f, "r", encoding="utf-8-sig") as transcript:
        for idxl, line in enumerate(transcript):
            if line.startswith("\n"):
                continue

            if line.startswith("[["):
                segment_id += 1
                try:
                    role, name = line.split("[[")[1].split("]]")[0].split(";")
                except ValueError:
                    logging.warning(
                        f"There was an issue reading line {idxl} in {f}"
                    )
            elif line.startswith("<<"):
                try:
                    raw_begin, raw_end, raw_lang = (
                        line.split("<<")[1].split(">>")[0].split(";")
                    )
                    parsed_begin = float(raw_begin)
                    parsed_end = float(raw_end)
                except ValueError:
                    logging.warning(
                        f"There is an incorrectly formed speech tag in {f} in line {line}"
                    )
                else:
                    # Commit only after every field parsed, so a malformed
                    # tag never leaves string timestamps behind.
                    begin, end, lang = parsed_begin, parsed_end, raw_lang
            else:
                snippet_id += 1
                segments.append(
                    {
                        "webcast_id": w_id,
                        "segment_id": segment_id,
                        "snippet_id": snippet_id,
                        "speaker_role": role,
                        "speaker_name": name,
                        "language": lang,
                        "begin": begin,
                        "end": end,
                        "text": line.strip(),
                    }
                )

    return segments, w_id


def _load_xml_transcript(f: str) -> Tuple[List[Dict[str, Any]], str]:
    """Parse an xml transcript into snippet dictionaries and its webcast id.

    Reads the ``WebcastID`` element and every ``Snippet`` inside each
    ``SpeakerSegment`` of the document root. The webcast id is whatever
    ``findtext`` yields for ``WebcastID`` (``None`` if the element is
    absent).
    """
    root = ET.parse(f).getroot()
    # webcast_id is in field
    w_id = root.findtext("WebcastID")

    segments = []
    snippet_id = 0
    for segment_id, speaker_segment in enumerate(root.findall("SpeakerSegment")):
        role = speaker_segment.findtext("Role", "")
        name = speaker_segment.findtext("Name", "")
        for snippet in speaker_segment.findall("Snippet"):
            segments.append(
                {
                    "webcast_id": w_id,
                    "segment_id": segment_id,
                    "snippet_id": snippet_id,
                    "speaker_role": role,
                    "speaker_name": name,
                    "language": snippet.findtext("Language", ""),
                    "begin": float(snippet.findtext("TimestampBegin", "")),
                    "end": float(snippet.findtext("TimestampEnd", "")),
                    "text": snippet.findtext("Text", "").strip(),
                }
            )
            snippet_id += 1

    return segments, w_id