regex_stone/regex.gawk at master · gragatrim/regex_stone · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#! /bin/gawk -f
BEGIN {
  # Filler words: language_parser does not translate these, it moves on to the following word.
  useless_words = "^(a|the|any|all|with)$"
  # Words that open or extend a bracketed character class.
  character_class_words = "^(characters|digits|lowercase|optional(ly)?|spaces|uppercase)$"
}

# Every non-empty record whose first character is not "#" is echoed next to its regex translation.
$0 ~ /^[^#]/ {
  print $0 " -> " language_parser_handler($0)
}

# Translates the current record by feeding its fields, left to right, to language_parser and
# concatenating whatever each call returns.
#
# The cursor `i` is deliberately NOT a local: language_parser moves it forward whenever a phrase
# consumes more than one field, and this loop then resumes after the consumed words.
#
# @args global current_line The whole record, i.e. $0 (the fields themselves are read via $i)
# @args local  output       Accumulates the regex text produced so far
#
# @return The regex text built from every field of the record
function language_parser_handler(current_line,    output) {
  i = 1
  while (i <= NF) {
    # language_parser may push i past the fields it consumed
    output = output language_parser($i, i)
    i++
  }
  return output
}

# Translates one word of the English description into its regex fragment. The branches below are tried in
# order and the first one whose condition holds produces the result; several of them look ahead at the
# following fields to recognise multi-word phrases ("one or more times", "between 2 and 5 times", ...).
# For now this is one long if/else-if chain; it may be tidied up later with smarter regexes.
#
# Side effects (none of these names are declared local, so they are awk globals):
#   i - the field cursor that language_parser_handler loops on. The quantifier branches set it to the last
#       word of the phrase they recognised, and the final guard raises it to current_field_index when it
#       is still behind that index.
#   k - loop offset used while gathering consecutive character-class words; recursive calls reuse it.
#   beginning_digit, ending_digit, matched_digits - capture arrays filled by the three-argument match().
#
# The switch statement and the three-argument form of match() are gawk extensions.
#
# @arg global current_word        This is the current field we are working on e.g. $1, $2
# @arg global current_field_index This is the index for the current field we are parsing. If we were parsing $23, this would be 23
# @arg local  parsed_value        This is a local variable that we use to organize what will be returned by this function
# @arg local  unformatted_return  Holds a nested character-class result so its outer brackets can be stripped
#
# @return This returns the parsed output of the field passed in
function language_parser(current_word, current_field_index,    parsed_value,unformatted_return) {
#THIS SHOULD ALWAYS BE FIRST!!!! There is no need to parse anything if the word is a filler word
  if (match(current_word, useless_words)) {
#Parse the following word instead, so that phrases such as "ends with" still produce the expected regex instead of nothing
    parsed_value = language_parser($(current_field_index + 1), (current_field_index + 1))
  } else if (match(current_word, /^"/)) {
#A word starting with a double quote opens a literal string
    parsed_value = literal_check(current_field_index)
  } else if (match(current_word, /^followed$/) || match(current_word, /^preceded$/)) {
#Only "followed by"/"preceded by" is translated; without the "by" parsed_value stays empty
    switch ($(current_field_index + 1)) {
      case "by":
        parsed_value = look_around_check(current_word, current_field_index)
        break
    }
  } else if (match(current_word $(current_field_index + 1) , /^endswith$/)) {
#The operand after "ends with" is always parsed; the only thing that varies is where the $ goes
      parsed_value = language_parser($(current_field_index + 2), (current_field_index + 2))
#We need to use i here instead of current_field_index because the parse above has moved i forward
    if (quantifier_check(i + 1)) {
#A quantifier follows the operand, so it has to be parsed before the $ is appended
      parsed_value = parsed_value language_parser($(i + 1), i + 1)  "$"
    } else {
      parsed_value = parsed_value "$"
    }
  } else if (match(current_word, /^\($/)) {
#A lone "(" opens a capture group
    parsed_value = capture_check(current_word, current_field_index)
  } else if (match(current_word, /^not$/)) {
    if (!(match($(current_field_index + 1), /^followed$/) || match($(current_field_index + 1), /^preceded$/))) {
    parsed_value = not_check(current_field_index)
    } else {
#Nothing to do here: the followed/preceded branch handles the negation when that word is parsed
    }
  } else if (match(current_word, /^one$/) &&
             match($(current_field_index + 1), /^or$/) &&
             match($(current_field_index + 2), /^more$/) &&
             match($(current_field_index + 3), /^times$/)) {
#"one or more times"; i is moved onto "times"
    parsed_value =  "+"
    i = current_field_index + 3
  } else if (match(current_word, /^more$/) &&
             match($(current_field_index + 1), /^than$/) &&
             match($(current_field_index + 2), /^zero$/) &&
             match($(current_field_index + 3), /^times$/)) {
#"more than zero times" is emitted as *
    parsed_value =  "*"
    i = current_field_index + 3
  } else if (match(current_word, /^between$/) &&
             match($(current_field_index + 1), /^[[:digit:]]+$/, beginning_digit) &&
             match($(current_field_index + 2), /^and$/) &&
             match($(current_field_index + 3), /^[[:digit:]]+$/, ending_digit) &&
             match($(current_field_index + 4), /^times$/)) {
#"between N and M times" becomes {N,M}
    parsed_value = "{" beginning_digit[0] "," ending_digit[0] "}"
    i = current_field_index + 4
  } else if (match(current_word, /^exactly$/) &&
             match($(current_field_index + 1), /^[[:digit:]]+$/, matched_digits) &&
             match($(current_field_index + 2), /^times$/)) {
#"exactly N times" becomes {N}
    parsed_value = "{" matched_digits[0] "}"
    i = current_field_index + 2
  } else  if (match(current_word, /^atleast$/) &&
              match($(current_field_index + 1), /^[[:digit:]]+$/, matched_digits) &&
              match($(current_field_index + 2), /^times$/)) {
#"atleast N times" becomes {N,}
    parsed_value = "{" matched_digits[0] ",}"
    i = current_field_index + 2
  } else if (match(current_word, /^uppercase$/) && match($(current_field_index + 1), /^letters?$/)) {
    parsed_value = "[" get_character_class("uletter")
#start at 2 since we need to skip "letters"
    for (k = 2; match($(current_field_index + k), character_class_words); k++) {
#Parse the next class word and splice its content in without the surrounding brackets
      unformatted_return = language_parser($(current_field_index + k), current_field_index + k)
      parsed_value = parsed_value substr(unformatted_return, 2, length(unformatted_return) - 2)
      if (i > k) {
        k = i
      }
    }
    parsed_value = parsed_value "]"
  } else if (match(current_word, /^lowercase$/) && match($(current_field_index + 1), /^letters?$/)) {
    parsed_value = "[" get_character_class("lletter")
#start at 2 since we need to skip "letters"
    for (k = 2; match($(current_field_index + k), character_class_words); k++) {
#Same splicing as in the uppercase branch above
      unformatted_return = language_parser($(current_field_index + k), current_field_index + k)
      parsed_value = parsed_value substr(unformatted_return, 2, length(unformatted_return) - 2)
      if (i > k) {
        k = i
      }
    }
    parsed_value = parsed_value "]"
  } else if (match(current_word, character_class_words)) {
    parsed_value = "[" get_character_class(current_word)
    #Walk the following fields until one is not a character-class word, so we know where the class ends
    for (k = 1; match($(current_field_index + k), character_class_words); k++) {
      unformatted_return = language_parser($(current_field_index + k), current_field_index + k)
      parsed_value = parsed_value substr(unformatted_return, 2, length(unformatted_return) - 2)
      if (i > k) {
        k = i
      }
    }
    parsed_value = parsed_value "]"
  } else {
#Anything unrecognised is passed through verbatim; this is how a closing paren reaches the output. TODO clean this up
    parsed_value = current_word
  }
#Make sure the global cursor is at least on the field that was just parsed
  if (i < current_field_index) {
    i = current_field_index
  }
  return parsed_value
}

# Maps a character-class keyword onto the text that goes inside a bracket expression.
# Singular and plural spellings are both accepted. "lletter"/"uletter" are the internal names the
# parser passes for "lowercase letters"/"uppercase letters". A keyword that is not listed below
# yields an empty result.
#
# @arg global text       The keyword to translate
# @arg local  class_body The translated text; left unset when nothing matches
#
# @return The character-class text for the keyword, or nothing for an unknown keyword
function get_character_class(text,    class_body) {
  if (match(text, /^digits?$/)) {
    class_body = "\\d"
  } else if (match(text, /^characters?$/)) {
    class_body = "\\w"
  } else if (match(text, /^spaces?$/)) {
    class_body = "\\s"
  } else if (match(text, /^lletters?$/)) {
    class_body = "a-z"
  } else if (match(text, /^uletters?$/)) {
    class_body = "A-Z"
  } else if (match(text, /^optional(ly)?$/)) {
    class_body = "?"
  }
  return class_body
}

# Builds a capture group: keeps the opening paren, parses every field up to the closing paren and
# finally parses the closing paren itself.
#
# The scan is bounded by NF. Without that bound a description with no closing ")" would keep reading
# empty fields past the end of the record forever.
#
# The scan index j is intentionally not declared local: other helpers in this file may read or advance
# the same global cursor while a group is being parsed.
#
# @arg global current_value       This is the current field's value being passed in (the opening paren)
# @arg global current_field_index This is the index of the field that was passed in
# @arg local  tmp_return          Holds the group built so far (several parsable words or nested groups)
#
# @return This returns the parsed capture group
function capture_check(current_value,current_field_index,    tmp_return) {
#let's save that opening paren
  tmp_return = current_value
#start at index + 1 so the parser is always handed the "current" word; stop at ")" or at the end of the record
  for (j = current_field_index + 1; j <= NF && !match($j, /^\)$/); j++) {
    tmp_return = tmp_return language_parser($(j), j)
#a multi-word phrase moved the global cursor i ahead; skip the words it already consumed
    if (i > j) {
      j = i
    }
  }
#parse the closing paren as well (past the end of the record this is an empty word and adds nothing)
  return tmp_return language_parser($(j), (j))
}

# Extracts a double-quoted literal that starts at the given field and returns the text between the
# quotes, joining the words of a multi-word literal with single spaces.
#
# Side effect: the global field cursor i is raised to the last field that belongs to the literal, so the
# caller's loop does not parse the quoted words again.
#
# Differences from the earlier implementation:
#  - the scan index is a local that is always initialised, so a one-word literal no longer advances the
#    cursor using a stale value left behind by an earlier call
#  - the search for the closing quote stops at NF, so an unterminated literal takes the rest of the record
#    instead of looping forever
#  - the cursor is moved directly instead of by calling language_parser with an empty word
#
# @arg global current_field_index This is the index of the current field being parsed (starts with a quote)
# @arg local  tmp_return          This holds the literal text collected so far
# @arg local  last_index          Index of the last field that is part of the literal
#
# @return The exact words between quotes
function literal_check(current_field_index,    tmp_return, last_index) {
  last_index = current_field_index
  if (match($(current_field_index), /\"$/)) {
#single word: drop the quote on each side
    tmp_return = substr($(current_field_index), 2, length($(current_field_index)) - 2)
  } else {
#first word without its opening quote
    tmp_return = substr($(current_field_index), 2)
#collect the words up to, but not including, the one that carries the closing quote
    for (last_index = current_field_index + 1; last_index <= NF && !match($(last_index), /\"$/); last_index++) {
      tmp_return = tmp_return " " $(last_index)
    }
    if (last_index <= NF) {
#final word, minus the closing quote
      tmp_return = tmp_return " " substr($(last_index), 1, length($(last_index)) - 1)
    } else {
#no closing quote anywhere: every remaining field was consumed
      last_index = NF
    }
  }
#only ever move the cursor forward
  if (i < last_index) {
    i = last_index
  }
  return tmp_return
}

# Builds a look-ahead or look-behind group for "followed by X" / "preceded by X", or the negative
# variant when the word directly before is exactly "not".
#
#   preceded by X      -> (?<=X)        not preceded by X -> (?<!X)
#   followed by X      -> (?=X)         not followed by X -> (?!X)
#
# All working values are locals. They used to be globals computed before the operand was parsed, so a
# nested look-around overwrote them and the outer group came out with the wrong type. The negation test
# is an exact comparison and is skipped for the first field, where $(index - 1) would be the whole record.
#
# @arg global current_record      This is the current field being parsed ("followed" or "preceded")
# @arg global current_field_index This is the index of the field being parsed
# @arg local  tmp_return          This holds the finished group
# @arg local  phrase              The keyword joined with the following field, e.g. "followedby"
# @arg local  negated             1 when the previous field is "not", otherwise 0
# @arg local  inner               The parsed operand that goes inside the group
#
# @return This returns the parsed look ahead/behind, or nothing when the phrase is not recognised
function look_around_check(current_record,current_field_index,    tmp_return, phrase, negated, inner) {
  phrase = current_record $(current_field_index + 1)
  negated = (current_field_index > 1 && $(current_field_index - 1) == "not")
#the operand is always parsed, which also moves the global cursor past it
  inner = language_parser($(current_field_index + 2), (current_field_index + 2))
  if (phrase == "precededby") {
    if (negated) {
      tmp_return = "(?<!" inner ")"
    } else {
      tmp_return = "(?<=" inner ")"
    }
  } else if (phrase == "followedby") {
    if (negated) {
      tmp_return = "(?!" inner ")"
    } else {
      tmp_return = "(?=" inner ")"
    }
  }
  return tmp_return
}

# Negates the word that follows "not" by wrapping it in a negated character class.
#
# When the parsed operand is itself a plain bracketed class (for example "[\d]" for "digits"), its outer
# brackets are removed first, so the result is "[^\d]" rather than the malformed "[^[\d]]". An operand
# that is already negated, is empty inside, or contains further brackets is wrapped unchanged.
#
# @arg global current_field_index This is the current field's index (the field holding "not")
# @arg local  tmp_return          This holds the parsed operand
# @arg local  body                The operand without its enclosing brackets
#
# @return This returns the notted value
function not_check(current_field_index,    tmp_return, body) {
  tmp_return = language_parser($(current_field_index + 1), (current_field_index + 1))
  if (length(tmp_return) > 2 &&
      substr(tmp_return, 1, 1) == "[" &&
      substr(tmp_return, 2, 1) != "^" &&
      substr(tmp_return, length(tmp_return), 1) == "]") {
    body = substr(tmp_return, 2, length(tmp_return) - 2)
#only unwrap a single, simple class
    if (index(body, "[") == 0 && index(body, "]") == 0) {
      tmp_return = body
    }
  }
  return "[^" tmp_return "]"
}

# Reports whether a quantifier phrase starts at the given field. Filler words are skipped first.
# The phrases recognised are the same ones language_parser translates: "one or more times",
# "more than zero times", "between N and M times", "exactly N times" and "atleast N times".
# This is mainly used so that "ends with" knows whether a quantifier has to go before the $.
#
# @arg global current_field_index This is the index of the field where the phrase would start
# @arg local  tmp_return          This is either 0 if no quantifier starts there, or 1 if one does
#
# @return This returns a 0 if the field doesn't start a quantifier, or a 1 if it does
function quantifier_check(current_field_index,    tmp_return) {
#filler word: the answer is whatever holds for the field after it
  if (match($(current_field_index), useless_words)) {
    return quantifier_check(current_field_index + 1)
  }
#each parenthesised group is one phrase; evaluation stops at the first phrase that matches completely
  tmp_return = (match($(current_field_index), /^one$/) &&
                match($(current_field_index + 1), /^or$/) &&
                match($(current_field_index + 2), /^more$/) &&
                match($(current_field_index + 3), /^times$/)) ||
               (match($(current_field_index), /^more$/) &&
                match($(current_field_index + 1), /^than$/) &&
                match($(current_field_index + 2), /^zero$/) &&
                match($(current_field_index + 3), /^times$/)) ||
               (match($(current_field_index), /^between$/) &&
                match($(current_field_index + 1), /^[[:digit:]]+$/, beginning_digit) &&
                match($(current_field_index + 2), /^and$/) &&
                match($(current_field_index + 3), /^[[:digit:]]+$/, ending_digit) &&
                match($(current_field_index + 4), /^times$/)) ||
               (match($(current_field_index), /^exactly$/) &&
                match($(current_field_index + 1), /^[[:digit:]]+$/, matched_digits) &&
                match($(current_field_index + 2), /^times$/)) ||
               (match($(current_field_index), /^atleast$/) &&
                match($(current_field_index + 1), /^[[:digit:]]+$/, matched_digits) &&
                match($(current_field_index + 2), /^times$/))
  return tmp_return
}