Improved JavaScript regex recognition for extracting JS messages

gitaarik · gitaarik · commit be485229b0f3 · 2022-02-19T20:16:32.000+01:00
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -1,6 +1,15 @@
 Babel Changelog
 ===============
 
+Next version
+--------------
+
+Bugfixes
+~~~~~~~~
+
+* Improved the regex used to recognize JavaScript regex literals. Previously the lexer failed to recognize
+  certain regexes (e.g. ones with an unescaped ``/`` inside a character class), which broke parsing of JS files.
+
 Version 2.9.1
 -------------
 
diff --git a/babel/messages/jslexer.py b/babel/messages/jslexer.py
@@ -24,7 +24,57 @@
 name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
 dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)
 division_re = re.compile(r'/=?')
-regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
+
+regex_re = re.compile(
+    r'''
+
+        # Opening slash of the regex
+        /
+
+        (?:
+
+            # 1) Blackslashed character
+            #
+            # Match a backslash `\` and then it's following character, allowing
+            # to blackslash the `/` for example.
+            (?:\\.)?
+
+            |
+
+            # 2) Regex character class `[a-z]`
+            #
+            # Match regex character class, like `[a-z]`. Inside a character
+            # class, a `/` character may appear, which does not close the
+            # regex. Therefore we allow it here inside a character class.
+            \[
+                (?:
+                    [^\]]*
+                    |
+                    \\\]
+                )*
+            \]
+
+            |
+
+            # 3) Other characters
+            #
+            # Match anything except a closing slash `/`, a backslash `\`, or a
+            # opening bracket `[`. Those last two will be handled by the other
+            # matchers.
+            [^/\\\[]*
+
+        )*
+
+        # Closing slash of the regex
+        /
+
+        # regex flags
+        [a-zA-Z]*
+
+    ''',
+    re.DOTALL + re.VERBOSE
+)
+
 line_re = re.compile(r'(\r\n|\n|\r)')
 line_join_re = re.compile(r'\\' + line_re.pattern)
 uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')