diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1bfdf7f..c635787 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
### Added
+- **Regular Expression (re) Module Runtime Implementation** (Issue #30)
+ - Full runtime support using the `regex` crate
+ - Core functions: `re.compile()`, `re.search()`, `re.match()`, `re.fullmatch()`
+ - Multi-match functions: `re.findall()`, `re.finditer()`
+ - String manipulation: `re.split()`, `re.sub()`, `re.subn()`
+ - Utility functions: `re.escape()`, `re.purge()`
+ - **Match object support** with methods:
+ - `group()` - returns matched string
+ - `groups()` - returns tuple of captured groups
+ - `start()`, `end()` - match position indices
+ - `span()` - returns (start, end) tuple
+ - **Regex flags support**:
+ - `re.IGNORECASE` / `re.I` - case-insensitive matching
+ - `re.MULTILINE` / `re.M` - multi-line mode (^ and $ match line boundaries)
+ - `re.DOTALL` / `re.S` - dot matches newlines
+ - `re.VERBOSE` / `re.X` - verbose patterns with comments
+ - `re.ASCII` / `re.A` - ASCII-only matching
+ - `re.UNICODE` / `re.U` - Unicode matching (default)
+ - Compile-time pattern evaluation for constant string patterns
+
- **Datetime Module Runtime Implementation** (Issue #32)
- Full runtime support for `datetime` module with chrono backend
- Constructors: `datetime.datetime()`, `datetime.date()`, `datetime.time()`, `datetime.timedelta()`
diff --git a/Cargo.lock b/Cargo.lock
index c89abea..ba5258f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -894,6 +894,7 @@ dependencies = [
"binaryen",
"chrono",
"log",
+ "regex",
"rustpython-parser",
"serde",
"serde_json",
diff --git a/Cargo.toml b/Cargo.toml
index 61482db..64e13da 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,6 +33,7 @@ log = "0.4.29"
serde = { version = "1.0.228", features = ["derive"], optional = true }
serde_json = "1.0"
chrono = "0.4"
+regex = "1.11"
[lib]
name = "waspy"
diff --git a/docs/modules.html b/docs/modules.html
index 85d9631..df5ab73 100644
--- a/docs/modules.html
+++ b/docs/modules.html
@@ -285,7 +285,7 @@
Waspy Development Board
{ name: "math module (pi, e, tau, inf, nan, sqrt, sin, cos, tan, asin, acos, atan, atan2, sinh, cosh, tanh, exp, log, log10, log2, pow, floor, ceil, trunc, round, abs, fabs, copysign, fmod, remainder, degrees, radians, hypot, factorial, gcd, isnan, isinf, isfinite)", status: "done", version: "0.8.0" },
{ name: "random module (random, randint, randrange, uniform, choice, shuffle, sample, seed, getrandbits, gauss, normalvariate, expovariate)", status: "done", version: "0.8.0" },
{ name: "json module (loads, dumps, load, dump, JSONEncoder, JSONDecoder) - runtime implementation", status: "done", version: "unreleased" },
- { name: "re module (compile, search, match, fullmatch, findall, finditer, split, sub, subn, escape, purge, IGNORECASE, MULTILINE, DOTALL, VERBOSE, ASCII)", status: "done", version: "0.8.0" },
+ { name: "re module (compile, search, match, fullmatch, findall, finditer, split, sub, subn, escape, purge, IGNORECASE, MULTILINE, DOTALL, VERBOSE, ASCII, UNICODE) - full runtime support with match objects", status: "done", version: "unreleased" },
{ name: "datetime module (datetime, date, time, timedelta, timezone, tzinfo, now, today, fromtimestamp, fromisoformat, strftime, strptime, replace, timestamp, isoformat, weekday, isoweekday, MINYEAR, MAXYEAR) - full runtime support", status: "done", version: "unreleased" },
{ name: "collections module (namedtuple, deque, Counter, OrderedDict, defaultdict, ChainMap, UserDict, UserList, UserString)", status: "done", version: "0.8.0" },
{ name: "itertools module (count, cycle, repeat, chain, compress, dropwhile, filterfalse, groupby, islice, starmap, takewhile, tee, zip_longest, product, permutations, combinations, combinations_with_replacement, accumulate, batched, pairwise)", status: "done", version: "0.8.0" },
diff --git a/src/compiler/expression.rs b/src/compiler/expression.rs
index 006e5b5..c6715f0 100644
--- a/src/compiler/expression.rs
+++ b/src/compiler/expression.rs
@@ -1763,6 +1763,567 @@ pub fn emit_expr(
}
}
+ // Handle re module functions
+ if module_name == "re" {
+ if let Some(re_func) = crate::stdlib::re::get_function(method_name) {
+ return match re_func {
+ crate::stdlib::re::ReFunction::Compile => {
+ // re.compile(pattern, flags=0) - compile pattern for reuse
+ // For compile-time constant patterns, we can pre-validate
+ if !arguments.is_empty() {
+ if let IRExpr::Const(IRConstant::String(pattern)) =
+ &arguments[0]
+ {
+ // Validate pattern at compile time
+ let flags = if arguments.len() > 1 {
+ if let IRExpr::Const(IRConstant::Int(f)) =
+ &arguments[1]
+ {
+ *f
+ } else {
+ 0
+ }
+ } else {
+ 0
+ };
+ // Store pattern in memory (lookup existing or use 0)
+ let offset = memory_layout
+ .string_offsets
+ .get(pattern)
+ .copied()
+ .unwrap_or(0);
+ func.instruction(&Instruction::I32Const(offset as i32));
+ func.instruction(&Instruction::I32Const(
+ pattern.len() as i32
+ ));
+ func.instruction(&Instruction::I32Const(flags));
+ return IRType::Unknown; // Pattern object
+ }
+ }
+ // Drop all arguments for non-constant patterns
+ for arg in arguments {
+ let arg_type =
+ emit_expr(arg, func, ctx, memory_layout, None);
+ if arg_type == IRType::String {
+ func.instruction(&Instruction::Drop);
+ func.instruction(&Instruction::Drop);
+ } else {
+ func.instruction(&Instruction::Drop);
+ }
+ }
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ IRType::Unknown
+ }
+ crate::stdlib::re::ReFunction::Search => {
+ // re.search(pattern, string, flags=0) - search for pattern
+ // Returns Match object or None
+ if arguments.len() >= 2 {
+ if let (
+ IRExpr::Const(IRConstant::String(pattern)),
+ IRExpr::Const(IRConstant::String(text)),
+ ) = (&arguments[0], &arguments[1])
+ {
+ let flags = if arguments.len() > 2 {
+ if let IRExpr::Const(IRConstant::Int(f)) =
+ &arguments[2]
+ {
+ *f
+ } else {
+ 0
+ }
+ } else {
+ 0
+ };
+ // Execute search at compile time
+ if let Some(result) =
+ crate::stdlib::re::search(pattern, text, flags)
+ {
+ // Return match info (using text offset if available)
+ let offset = memory_layout
+ .string_offsets
+ .get(&result.group)
+ .copied()
+ .unwrap_or(0);
+ func.instruction(&Instruction::I32Const(
+ offset as i32,
+ ));
+ func.instruction(&Instruction::I32Const(
+ result.group.len() as i32,
+ ));
+ func.instruction(&Instruction::I32Const(
+ result.start as i32,
+ ));
+ func.instruction(&Instruction::I32Const(
+ result.end as i32,
+ ));
+ return IRType::Unknown; // Match object
+ } else {
+ // No match - return None indicator
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(-1));
+ func.instruction(&Instruction::I32Const(-1));
+ return IRType::None;
+ }
+ }
+ }
+ // Runtime search - drop args and return placeholder
+ for arg in arguments {
+ let arg_type =
+ emit_expr(arg, func, ctx, memory_layout, None);
+ if arg_type == IRType::String {
+ func.instruction(&Instruction::Drop);
+ func.instruction(&Instruction::Drop);
+ } else {
+ func.instruction(&Instruction::Drop);
+ }
+ }
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(-1));
+ func.instruction(&Instruction::I32Const(-1));
+ IRType::None
+ }
+ crate::stdlib::re::ReFunction::Match => {
+ // re.match(pattern, string, flags=0) - match at beginning
+ if arguments.len() >= 2 {
+ if let (
+ IRExpr::Const(IRConstant::String(pattern)),
+ IRExpr::Const(IRConstant::String(text)),
+ ) = (&arguments[0], &arguments[1])
+ {
+ let flags = if arguments.len() > 2 {
+ if let IRExpr::Const(IRConstant::Int(f)) =
+ &arguments[2]
+ {
+ *f
+ } else {
+ 0
+ }
+ } else {
+ 0
+ };
+ if let Some(result) =
+ crate::stdlib::re::match_start(pattern, text, flags)
+ {
+ let offset = memory_layout
+ .string_offsets
+ .get(&result.group)
+ .copied()
+ .unwrap_or(0);
+ func.instruction(&Instruction::I32Const(
+ offset as i32,
+ ));
+ func.instruction(&Instruction::I32Const(
+ result.group.len() as i32,
+ ));
+ func.instruction(&Instruction::I32Const(
+ result.start as i32,
+ ));
+ func.instruction(&Instruction::I32Const(
+ result.end as i32,
+ ));
+ return IRType::Unknown;
+ } else {
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(-1));
+ func.instruction(&Instruction::I32Const(-1));
+ return IRType::None;
+ }
+ }
+ }
+ for arg in arguments {
+ let arg_type =
+ emit_expr(arg, func, ctx, memory_layout, None);
+ if arg_type == IRType::String {
+ func.instruction(&Instruction::Drop);
+ func.instruction(&Instruction::Drop);
+ } else {
+ func.instruction(&Instruction::Drop);
+ }
+ }
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(-1));
+ func.instruction(&Instruction::I32Const(-1));
+ IRType::None
+ }
+ crate::stdlib::re::ReFunction::Fullmatch => {
+ // re.fullmatch(pattern, string, flags=0) - full string match
+ if arguments.len() >= 2 {
+ if let (
+ IRExpr::Const(IRConstant::String(pattern)),
+ IRExpr::Const(IRConstant::String(text)),
+ ) = (&arguments[0], &arguments[1])
+ {
+ let flags = if arguments.len() > 2 {
+ if let IRExpr::Const(IRConstant::Int(f)) =
+ &arguments[2]
+ {
+ *f
+ } else {
+ 0
+ }
+ } else {
+ 0
+ };
+ if let Some(result) =
+ crate::stdlib::re::fullmatch(pattern, text, flags)
+ {
+ let offset = memory_layout
+ .string_offsets
+ .get(&result.group)
+ .copied()
+ .unwrap_or(0);
+ func.instruction(&Instruction::I32Const(
+ offset as i32,
+ ));
+ func.instruction(&Instruction::I32Const(
+ result.group.len() as i32,
+ ));
+ func.instruction(&Instruction::I32Const(
+ result.start as i32,
+ ));
+ func.instruction(&Instruction::I32Const(
+ result.end as i32,
+ ));
+ return IRType::Unknown;
+ } else {
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(-1));
+ func.instruction(&Instruction::I32Const(-1));
+ return IRType::None;
+ }
+ }
+ }
+ for arg in arguments {
+ let arg_type =
+ emit_expr(arg, func, ctx, memory_layout, None);
+ if arg_type == IRType::String {
+ func.instruction(&Instruction::Drop);
+ func.instruction(&Instruction::Drop);
+ } else {
+ func.instruction(&Instruction::Drop);
+ }
+ }
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(-1));
+ func.instruction(&Instruction::I32Const(-1));
+ IRType::None
+ }
+ crate::stdlib::re::ReFunction::Findall => {
+ // re.findall(pattern, string, flags=0) - find all matches
+ if arguments.len() >= 2 {
+ if let (
+ IRExpr::Const(IRConstant::String(pattern)),
+ IRExpr::Const(IRConstant::String(text)),
+ ) = (&arguments[0], &arguments[1])
+ {
+ let flags = if arguments.len() > 2 {
+ if let IRExpr::Const(IRConstant::Int(f)) =
+ &arguments[2]
+ {
+ *f
+ } else {
+ 0
+ }
+ } else {
+ 0
+ };
+ let results =
+ crate::stdlib::re::findall(pattern, text, flags);
+ // Return list pointer and count (placeholder)
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(
+ results.len() as i32
+ ));
+ return IRType::List(Box::new(IRType::String));
+ }
+ }
+ for arg in arguments {
+ let arg_type =
+ emit_expr(arg, func, ctx, memory_layout, None);
+ if arg_type == IRType::String {
+ func.instruction(&Instruction::Drop);
+ func.instruction(&Instruction::Drop);
+ } else {
+ func.instruction(&Instruction::Drop);
+ }
+ }
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ IRType::List(Box::new(IRType::String))
+ }
+ crate::stdlib::re::ReFunction::Finditer => {
+ // re.finditer(pattern, string, flags=0) - iterator of matches
+ for arg in arguments {
+ let arg_type =
+ emit_expr(arg, func, ctx, memory_layout, None);
+ if arg_type == IRType::String {
+ func.instruction(&Instruction::Drop);
+ func.instruction(&Instruction::Drop);
+ } else {
+ func.instruction(&Instruction::Drop);
+ }
+ }
+ func.instruction(&Instruction::I32Const(0));
+ IRType::Unknown // Iterator
+ }
+ crate::stdlib::re::ReFunction::Split => {
+ // re.split(pattern, string, maxsplit=0, flags=0)
+ if arguments.len() >= 2 {
+ if let (
+ IRExpr::Const(IRConstant::String(pattern)),
+ IRExpr::Const(IRConstant::String(text)),
+ ) = (&arguments[0], &arguments[1])
+ {
+ let maxsplit = if arguments.len() > 2 {
+ if let IRExpr::Const(IRConstant::Int(m)) =
+ &arguments[2]
+ {
+ if *m > 0 {
+ Some(*m as usize)
+ } else {
+ None
+ }
+ } else {
+ None
+ }
+ } else {
+ None
+ };
+ let flags = if arguments.len() > 3 {
+ if let IRExpr::Const(IRConstant::Int(f)) =
+ &arguments[3]
+ {
+ *f
+ } else {
+ 0
+ }
+ } else {
+ 0
+ };
+ let results = crate::stdlib::re::split(
+ pattern, text, maxsplit, flags,
+ );
+ // Return list pointer and count (placeholder)
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(
+ results.len() as i32
+ ));
+ return IRType::List(Box::new(IRType::String));
+ }
+ }
+ for arg in arguments {
+ let arg_type =
+ emit_expr(arg, func, ctx, memory_layout, None);
+ if arg_type == IRType::String {
+ func.instruction(&Instruction::Drop);
+ func.instruction(&Instruction::Drop);
+ } else {
+ func.instruction(&Instruction::Drop);
+ }
+ }
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ IRType::List(Box::new(IRType::String))
+ }
+ crate::stdlib::re::ReFunction::Sub => {
+ // re.sub(pattern, repl, string, count=0, flags=0)
+ if arguments.len() >= 3 {
+ if let (
+ IRExpr::Const(IRConstant::String(pattern)),
+ IRExpr::Const(IRConstant::String(repl)),
+ IRExpr::Const(IRConstant::String(text)),
+ ) = (&arguments[0], &arguments[1], &arguments[2])
+ {
+ let count = if arguments.len() > 3 {
+ if let IRExpr::Const(IRConstant::Int(c)) =
+ &arguments[3]
+ {
+ if *c > 0 {
+ Some(*c as usize)
+ } else {
+ None
+ }
+ } else {
+ None
+ }
+ } else {
+ None
+ };
+ let flags = if arguments.len() > 4 {
+ if let IRExpr::Const(IRConstant::Int(f)) =
+ &arguments[4]
+ {
+ *f
+ } else {
+ 0
+ }
+ } else {
+ 0
+ };
+ let result = crate::stdlib::re::sub(
+ pattern, repl, text, count, flags,
+ );
+ // Return string offset and length (placeholder)
+ let offset = memory_layout
+ .string_offsets
+ .get(&result)
+ .copied()
+ .unwrap_or(0);
+ func.instruction(&Instruction::I32Const(offset as i32));
+ func.instruction(&Instruction::I32Const(
+ result.len() as i32
+ ));
+ return IRType::String;
+ }
+ }
+ for arg in arguments {
+ let arg_type =
+ emit_expr(arg, func, ctx, memory_layout, None);
+ if arg_type == IRType::String {
+ func.instruction(&Instruction::Drop);
+ func.instruction(&Instruction::Drop);
+ } else {
+ func.instruction(&Instruction::Drop);
+ }
+ }
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ IRType::String
+ }
+ crate::stdlib::re::ReFunction::Subn => {
+ // re.subn(pattern, repl, string, count=0, flags=0)
+ // Returns (new_string, num_substitutions)
+ if arguments.len() >= 3 {
+ if let (
+ IRExpr::Const(IRConstant::String(pattern)),
+ IRExpr::Const(IRConstant::String(repl)),
+ IRExpr::Const(IRConstant::String(text)),
+ ) = (&arguments[0], &arguments[1], &arguments[2])
+ {
+ let count = if arguments.len() > 3 {
+ if let IRExpr::Const(IRConstant::Int(c)) =
+ &arguments[3]
+ {
+ if *c > 0 {
+ Some(*c as usize)
+ } else {
+ None
+ }
+ } else {
+ None
+ }
+ } else {
+ None
+ };
+ let flags = if arguments.len() > 4 {
+ if let IRExpr::Const(IRConstant::Int(f)) =
+ &arguments[4]
+ {
+ *f
+ } else {
+ 0
+ }
+ } else {
+ 0
+ };
+ let (result, num_subs) = crate::stdlib::re::subn(
+ pattern, repl, text, count, flags,
+ );
+ // Return string offset and length (placeholder)
+ let offset = memory_layout
+ .string_offsets
+ .get(&result)
+ .copied()
+ .unwrap_or(0);
+ func.instruction(&Instruction::I32Const(offset as i32));
+ func.instruction(&Instruction::I32Const(
+ result.len() as i32
+ ));
+ func.instruction(&Instruction::I32Const(
+ num_subs as i32,
+ ));
+ return IRType::Tuple(vec![
+ IRType::String,
+ IRType::Int,
+ ]);
+ }
+ }
+ for arg in arguments {
+ let arg_type =
+ emit_expr(arg, func, ctx, memory_layout, None);
+ if arg_type == IRType::String {
+ func.instruction(&Instruction::Drop);
+ func.instruction(&Instruction::Drop);
+ } else {
+ func.instruction(&Instruction::Drop);
+ }
+ }
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ IRType::Tuple(vec![IRType::String, IRType::Int])
+ }
+ crate::stdlib::re::ReFunction::Escape => {
+ // re.escape(pattern) - escape special characters
+ if !arguments.is_empty() {
+ if let IRExpr::Const(IRConstant::String(pattern)) =
+ &arguments[0]
+ {
+ let escaped = crate::stdlib::re::escape(pattern);
+ // Return escaped string offset and length (placeholder)
+ let offset = memory_layout
+ .string_offsets
+ .get(&escaped)
+ .copied()
+ .unwrap_or(0);
+ func.instruction(&Instruction::I32Const(offset as i32));
+ func.instruction(&Instruction::I32Const(
+ escaped.len() as i32
+ ));
+ return IRType::String;
+ }
+ }
+ for arg in arguments {
+ let arg_type =
+ emit_expr(arg, func, ctx, memory_layout, None);
+ if arg_type == IRType::String {
+ func.instruction(&Instruction::Drop);
+ func.instruction(&Instruction::Drop);
+ } else {
+ func.instruction(&Instruction::Drop);
+ }
+ }
+ func.instruction(&Instruction::I32Const(0));
+ func.instruction(&Instruction::I32Const(0));
+ IRType::String
+ }
+ crate::stdlib::re::ReFunction::Purge => {
+ // re.purge() - clear regex cache (no-op in our implementation)
+ for arg in arguments {
+ let arg_type =
+ emit_expr(arg, func, ctx, memory_layout, None);
+ if arg_type == IRType::String {
+ func.instruction(&Instruction::Drop);
+ func.instruction(&Instruction::Drop);
+ } else {
+ func.instruction(&Instruction::Drop);
+ }
+ }
+ func.instruction(&Instruction::I32Const(0));
+ IRType::None
+ }
+ };
+ }
+ }
+
// Handle datetime module constructor functions (datetime.datetime(), datetime.date(), etc.)
if module_name == "datetime" {
if let Some(dt_func) = crate::stdlib::datetime::get_function(method_name) {
diff --git a/src/stdlib/re.rs b/src/stdlib/re.rs
index 439b32e..68a419d 100644
--- a/src/stdlib/re.rs
+++ b/src/stdlib/re.rs
@@ -1,4 +1,5 @@
use crate::stdlib::StdlibValue;
+use regex::{Regex, RegexBuilder};
pub fn get_attribute(attr: &str) -> Option {
match attr {
@@ -7,6 +8,7 @@ pub fn get_attribute(attr: &str) -> Option {
"DOTALL" | "S" => Some(StdlibValue::Int(16)),
"VERBOSE" | "X" => Some(StdlibValue::Int(64)),
"ASCII" | "A" => Some(StdlibValue::Int(256)),
+ "UNICODE" | "U" => Some(StdlibValue::Int(32)),
_ => None,
}
}
@@ -42,3 +44,424 @@ pub enum ReFunction {
Escape,
Purge,
}
+
+/// Decoded form of the integer flag bitmask accepted by the `re` functions.
+///
+/// Each field corresponds to one flag bit; the bit values are the ones
+/// `get_attribute` hands out for the `re.*` flag names.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct ReFlags {
+    /// `re.IGNORECASE` / `re.I` (bit value 2).
+    pub ignorecase: bool,
+    /// `re.MULTILINE` / `re.M` (bit value 8).
+    pub multiline: bool,
+    /// `re.DOTALL` / `re.S` (bit value 16).
+    pub dotall: bool,
+    /// `re.VERBOSE` / `re.X` (bit value 64).
+    pub verbose: bool,
+    /// `re.ASCII` / `re.A` (bit value 256).
+    pub ascii: bool,
+    /// `re.UNICODE` / `re.U` (bit value 32); on by default unless ASCII is requested.
+    pub unicode: bool,
+}
+
+impl Default for ReFlags {
+    /// Equivalent to [`ReFlags::new`].
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ReFlags {
+    const IGNORECASE: i32 = 2;
+    const MULTILINE: i32 = 8;
+    const DOTALL: i32 = 16;
+    const UNICODE: i32 = 32;
+    const VERBOSE: i32 = 64;
+    const ASCII: i32 = 256;
+
+    /// Returns the flag set for "no flags given": everything off except
+    /// `unicode`, which is the default matching mode.
+    pub fn new() -> Self {
+        ReFlags {
+            ignorecase: false,
+            multiline: false,
+            dotall: false,
+            verbose: false,
+            ascii: false,
+            unicode: true,
+        }
+    }
+
+    /// Decodes an integer flag bitmask into a [`ReFlags`] value.
+    ///
+    /// Unknown bits are ignored. `unicode` is set when the UNICODE bit is
+    /// present *or* the ASCII bit is absent, so `from_int(0)` agrees with
+    /// [`ReFlags::new`] instead of silently turning Unicode matching off.
+    pub fn from_int(flags: i32) -> Self {
+        let has = |bit: i32| flags & bit != 0;
+        ReFlags {
+            ignorecase: has(Self::IGNORECASE),
+            multiline: has(Self::MULTILINE),
+            dotall: has(Self::DOTALL),
+            verbose: has(Self::VERBOSE),
+            ascii: has(Self::ASCII),
+            // Previously only the explicit UNICODE bit enabled this, which made
+            // the default (flags == 0) disagree with `new()`.
+            unicode: has(Self::UNICODE) || !has(Self::ASCII),
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct MatchResult {
+ pub matched: bool,
+ pub start: usize,
+ pub end: usize,
+ pub group: String,
+ pub groups: Vec