-
Notifications
You must be signed in to change notification settings - Fork 93
/
Copy pathparse.py
170 lines (138 loc) · 5.9 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import re
import string
import logging
from string import Formatter
from typing import List, TypedDict
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class PartialStringFormatter(string.Formatter):
    """A ``string.Formatter`` that tolerates missing or malformed fields.

    Named fields with no matching keyword argument are emitted back as
    ``{name}`` instead of raising ``KeyError``, and fields whose format spec
    is invalid for the resolved value are emitted back verbatim as
    ``{name:spec}``.
    """

    def get_value(self, key, args, kwds):
        """Resolve a replacement field's key to a value.

        Args:
            key: Field key; a ``str`` for named fields, an ``int`` for
                positional ones.
            args: Positional arguments passed to ``format``/``vformat``.
            kwds: Keyword arguments passed to ``format``/``vformat``.

        Returns:
            The matching value, or the literal placeholder ``"{key}"`` when a
            named field has no matching keyword argument.

        Raises:
            IndexError: If a positional key is out of range (the standard
                ``string.Formatter`` behaviour).
        """
        if isinstance(key, str):
            try:
                return kwds[key]
            except KeyError:
                # Unknown name: hand the placeholder back so it survives
                # formatting untouched.
                return "{" + key + "}"
        # Positional keys use the standard lookup. The result must be
        # returned, and the call must be bound to ``self``.
        return super().get_value(key, args, kwds)

    def format_field(self, value, format_spec):
        """Format ``value`` with ``format_spec``, tolerating invalid specs.

        Returns:
            The formatted string. If the spec is invalid and ``value`` is an
            unfilled placeholder (``"{name}"``), the placeholder with the spec
            re-attached (``"{name:spec}"``). Otherwise ``None``, which
            ``_vformat`` interprets as "emit the original field text".
        """
        try:
            return super().format_field(value, format_spec)
        except ValueError:
            # HACK: the value was an unfilled variable or not a variable at
            # all, so the format spec should be considered part of the
            # variable name.
            # The isinstance guard keeps non-string values (e.g. an int with
            # a bad spec) from raising AttributeError on ``startswith``.
            if (
                isinstance(value, str)
                and value.startswith("{")
                and value.endswith("}")
            ):
                return value[:-1] + ":" + format_spec + "}"
            return None

    def _vformat(
        self, format_string, args, kwargs, used_args, recursion_depth, auto_arg_index=0
    ):
        """Core formatting loop.

        Returns:
            A ``(formatted_text, auto_arg_index)`` tuple, matching the parent
            class's private ``_vformat`` contract.

        Raises:
            ValueError: On excessive nesting, or when automatic and manual
                field numbering are mixed.
        """
        # copied verbatim from parent class except for the # HACK
        if recursion_depth < 0:
            raise ValueError("Max string recursion exceeded")
        result = []
        for literal_text, field_name, format_spec, conversion in self.parse(
            format_string
        ):
            # output the literal text
            if literal_text:
                result.append(literal_text)

            # if there's a field, output it
            if field_name is not None:
                # this is some markup, find the object and do
                # the formatting

                # handle arg indexing when empty field_names are given.
                if field_name == "":
                    if auto_arg_index is False:
                        raise ValueError(
                            "cannot switch from manual field "
                            "specification to automatic field "
                            "numbering"
                        )
                    field_name = str(auto_arg_index)
                    auto_arg_index += 1
                elif field_name.isdigit():
                    if auto_arg_index:
                        raise ValueError(
                            "cannot switch from manual field "
                            "specification to automatic field "
                            "numbering"
                        )
                    # disable auto arg incrementing, if it gets
                    # used later on, then an exception will be raised
                    auto_arg_index = False

                # given the field_name, find the object it references
                # and the argument it came from
                obj, arg_used = self.get_field(field_name, args, kwargs)
                used_args.add(arg_used)

                # do any conversion on the resulting object
                obj = self.convert_field(obj, conversion)

                # expand the format spec, if needed
                format_spec, auto_arg_index = self._vformat(
                    format_spec,
                    args,
                    kwargs,
                    used_args,
                    recursion_depth - 1,
                    auto_arg_index=auto_arg_index,
                )

                # format the object and append to the result
                # HACK: if the format_spec is invalid, assume this field_name
                # was not meant to be a variable, and don't substitute anything
                formatted_field = self.format_field(obj, format_spec)
                if formatted_field is None:
                    result.append("{" + ":".join([field_name, format_spec]) + "}")
                else:
                    result.append(formatted_field)

        return "".join(result), auto_arg_index
# Shared module-level formatter instance; partial_str_format() delegates to it.
PartialStringFormat = PartialStringFormatter()
def partial_str_format(string, **kwargs):
    """
    Fill in the placeholders of ``string`` that have a matching keyword argument.

    Behaves like ``str.format`` called with keyword arguments only, except
    that the formatting is done by the module-level ``PartialStringFormat``
    instance, so named placeholders without a matching argument are kept in
    the output.
    """
    no_positional_args = ()
    return PartialStringFormat.vformat(string, no_positional_args, kwargs)
class TemplateChunks(TypedDict):
    """One chunk of a parsed template, as built by ``parse_template``."""

    # Literal text for "text" chunks; the field name without braces for "var" chunks.
    text: str
    # Index in the template where the chunk begins (the opening brace for "var" chunks).
    start: int
    # Index just past the chunk's last character (past the closing brace for "var" chunks).
    end: int
    # Chunk kind: "text" or "var".
    type: str
# Matches a single-brace placeholder such as ``{field_1}`` (letters, digits and
# underscores only) and captures the name in group 1. The lookbehind/lookahead
# reject a placeholder directly preceded by "{" or directly followed by "}".
match_fields_regex = re.compile(r"(?<!\{)\{([a-zA-Z0-9_]+)\}(?!})")
def parse_template(string, include_texts=True) -> List[TemplateChunks]:
    """
    Split a template string into its ``{field}`` placeholders and the text around them.

    Args:
        string (str): The template string to parse.
        include_texts (bool): When true, the non-empty stretches of text
            before, between and after the placeholders are returned as
            "text" chunks; when false, only "var" chunks are returned.

    Returns:
        List[TemplateChunks]: Chunks in order of appearance, each with the
        keys 'text', 'start', 'end' and 'type'. For a "var" chunk, 'text' is
        the field name without braces, while 'start' and 'end' span the whole
        placeholder including its braces.

    Example:
        >>> for chunk in parse_template("a {x} b"):
        ...     print(chunk)
        {'text': 'a ', 'start': 0, 'end': 2, 'type': 'text'}
        {'text': 'x', 'start': 2, 'end': 5, 'type': 'var'}
        {'text': ' b', 'start': 5, 'end': 7, 'type': 'text'}
    """
    pieces: List[TemplateChunks] = []

    def add_text(lo, hi):
        # Record the literal text in string[lo:hi], skipping empty stretches.
        if include_texts and lo < hi:
            pieces.append(
                {"text": string[lo:hi], "start": lo, "end": hi, "type": "text"}
            )

    cursor = 0  # index just past the previously consumed placeholder
    for found in match_fields_regex.finditer(string):
        begin, finish = found.span()
        add_text(cursor, begin)
        pieces.append(
            {"text": found.group(1), "start": begin, "end": finish, "type": "var"}
        )
        cursor = finish

    # Whatever follows the final placeholder (or the whole string if none matched).
    add_text(cursor, len(string))
    return pieces