fjebaker · fjebaker · Aug 12, 2024 · Aug 10, 2024 · Aug 10, 2024 · Aug 10, 2024
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,32 @@
+# This file is for zig-specific build artifacts.
+
+.zig-cache/
+zig-out/
+build/
+build-*/
+docgen_tmp/
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+*.elf
+*.ko
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Libraries
+*.lib
+*.a
+*.la
+*.lo
+
+# Shared objects (inc. Windows DLLs)
+*.dll
+*.so
+*.so.*
+*.dylib
diff --git a/build.zig b/build.zig
@@ -4,6 +4,11 @@ pub fn build(b: *std.Build) void {
     const target = b.standardTargetOptions(.{});
     const optimize = b.standardOptimizeOption(.{});
 
+    const zg = b.dependency("zg", .{
+        .target = target,
+        .optimize = optimize,
+    });
+
     _ = b.addModule("fuzzig", .{ .root_source_file = b.path("src/root.zig") });
 
     const lib = b.addStaticLibrary(.{
@@ -13,6 +18,12 @@ pub fn build(b: *std.Build) void {
         .optimize = optimize,
     });
 
+    lib.root_module.addImport("code_point", zg.module("code_point"));
+    lib.root_module.addImport("GenCatData", zg.module("GenCatData"));
+    lib.root_module.addImport("CaseData", zg.module("CaseData"));
+    lib.root_module.addImport("Normalize", zg.module("Normalize"));
+    lib.root_module.addImport("CaseFold", zg.module("CaseFold"));
+
     b.installArtifact(lib);
 
     const lib_unit_tests = b.addTest(.{
@@ -21,6 +32,12 @@ pub fn build(b: *std.Build) void {
         .optimize = optimize,
     });
 
+    lib_unit_tests.root_module.addImport("code_point", zg.module("code_point"));
+    lib_unit_tests.root_module.addImport("GenCatData", zg.module("GenCatData"));
+    lib_unit_tests.root_module.addImport("CaseData", zg.module("CaseData"));
+    lib_unit_tests.root_module.addImport("Normalize", zg.module("Normalize"));
+    lib_unit_tests.root_module.addImport("CaseFold", zg.module("CaseFold"));
+
     const run_lib_unit_tests = b.addRunArtifact(lib_unit_tests);
 
     const test_step = b.step("test", "Run unit tests");

diff --git a/build.zig.zon b/build.zig.zon
@@ -38,6 +38,10 @@
         //    // computed. This field and `url` are mutually exclusive.
         //    .path = "foo",
         //},
+        .zg = .{
+            .url = "https://codeberg.org/dude_the_builder/zg/archive/v0.13.2.tar.gz",
+            .hash = "122055beff332830a391e9895c044d33b15ea21063779557024b46169fb1984c6e40",
+        },
     },
 
     // Specifies the set of files and directories that are included in this package.

diff --git a/src/root.zig b/src/root.zig
@@ -2,6 +2,13 @@ const std = @import("std");
 const utils = @import("utils.zig");
 const structures = @import("structures.zig");
 
+const code_point = @import("code_point");
+const GenCatData = @import("GenCatData");
+const CaseData = @import("CaseData");
+const Normalize = @import("Normalize");
+
+const Allocator = std.mem.Allocator;
+
 const CharacterType = utils.CharacterType;
 const MatrixT = structures.MatrixT;
 
@@ -100,6 +107,12 @@ pub fn Algorithm(
 
         impl: Impl,
 
+        /// Element type of the strings produced by `Impl.convertString`
+        /// (`u8` for `AsciiOptions`, `u21` for `UnicodeOptions`). Read from
+        /// the implementation so that other `Impl` types are supported, and
+        /// a missing declaration is reported against `Impl` at compile time.
+        const TypeOfCaracter = Impl.TypeOfCharacter;
+
         pub fn deinit(self: *Self) void {
             self.m.deinit();
             self.x.deinit();
@@ -117,6 +130,9 @@ pub fn Algorithm(
             max_needle: usize,
             impl: Impl,
         ) !Self {
+            var impl_with_allocator = impl;
+            impl_with_allocator.allocator = allocator;
+
             const rows = max_needle + 1;
             const cols = max_haystack + 1;
 
@@ -149,7 +165,7 @@ pub fn Algorithm(
                 .first_match_buffer = first_match_buffer,
                 .traceback_buffer = traceback_buffer,
                 .allocator = allocator,
-                .impl = impl,
+                .impl = impl_with_allocator,
             };
         }
 
@@ -179,34 +195,40 @@ pub fn Algorithm(
                 .score = 0,
             };
 
-            const rows = needle.len;
-            const cols = haystack.len;
+            const haystack_normal = self.impl.convertString(haystack);
+            defer self.allocator.free(haystack_normal);
+
+            const needle_normal = self.impl.convertString(needle);
+            defer self.allocator.free(needle_normal);
+
+            const rows = needle_normal.len;
+            const cols = haystack_normal.len;
 
             // resize the view into memory
             self.m.resizeNoAlloc(rows + 1, cols + 1);
             self.x.resizeNoAlloc(rows + 1, cols + 1);
             self.m_skip.resizeNoAlloc(rows + 1, cols + 1);
 
             const first_match_indices = utils.firstMatchesGeneric(
-                ElType,
+                TypeOfCaracter,
                 &self.impl,
                 Impl.eqlFunc,
                 self.first_match_buffer,
-                haystack,
-                needle,
+                haystack_normal,
+                needle_normal,
             ) orelse return null;
 
             self.reset(rows + 1, cols + 1, first_match_indices);
-            self.determineBonuses(haystack);
+            self.determineBonuses(TypeOfCaracter, haystack_normal);
 
-            try self.populateMatrices(haystack, needle, first_match_indices);
+            try self.populateMatrices(haystack_normal, needle_normal, first_match_indices);
             const col_max = self.findMaximalElement(
                 first_match_indices,
                 rows,
                 cols,
             );
 
-            const last_row_index = needle.len;
+            const last_row_index = needle_normal.len;
             const s = self.m.get(last_row_index, col_max);
             return .{
                 .score = s,
@@ -268,8 +290,8 @@ pub fn Algorithm(
             return buf;
         }
 
-        fn determineBonuses(self: *Self, haystack: []const ElType) void {
-            var prev: u8 = 0;
+        fn determineBonuses(self: *Self, T: type, haystack: []const T) void {
+            var prev: T = 0;
             for (1.., haystack) |i, h| {
                 self.role_bonus[i] = Impl.bonusFunc(&self.impl, scores, prev, h);
                 prev = h;
@@ -325,8 +347,8 @@ pub fn Algorithm(
 
         fn populateMatrices(
             self: *Self,
-            haystack: []const ElType,
-            needle: []const ElType,
+            haystack: []const TypeOfCaracter,
+            needle: []const TypeOfCaracter,
             first_match_indices: []const usize,
         ) !void {
             for (1.., needle) |i, n| {
@@ -455,6 +477,8 @@ pub fn Algorithm(
 pub const AsciiOptions = struct {
     const AsciiScores = Scores(i32);
 
+    pub const TypeOfCharacter = u8;
+
     case_sensitive: bool = true,
     case_penalize: bool = false,
     // treat spaces as wildcards for any kind of boundary
@@ -463,6 +487,13 @@ pub const AsciiOptions = struct {
 
     penalty_case_mistmatch: i32 = -2,
 
+    /// Allocator used by `convertString`; `Algorithm.init` overwrites it with its own.
+    allocator: Allocator = undefined,
+
+    fn convertString(a: *const AsciiOptions, string: []const u8) []const TypeOfCharacter {
+        return a.allocator.dupe(TypeOfCharacter, string) catch @panic("Memory error"); // caller frees the copy with `a.allocator`
+    }
+
     fn eqlFunc(a: *const AsciiOptions, h: u8, n: u8) bool {
         if (n == ' ' and a.wildcard_spaces) {
             return switch (h) {
@@ -508,9 +539,98 @@ pub const AsciiOptions = struct {
     }
 };
 
+/// Matching options for Unicode text, compared code point by code point after NFC.
+pub const UnicodeOptions = struct {
+    const UnicodeScores = Scores(i32);
+
+    /// Element type of the slices returned by `convertString`.
+    pub const TypeOfCharacter: type = u21;
+
+    case_sensitive: bool = true,
+    case_penalize: bool = false,
+    /// Treat separators in the needle as wildcards for any kind of boundary,
+    /// i.e. match with any `[^a-z,A-Z,0-9]` (here: neither letter nor number).
+    wildcard_spaces: bool = false,
+
+    penalty_case_mistmatch: i32 = -2,
+
+    /// Must be valid before any method runs; `Algorithm.init` stores its own here.
+    allocator: Allocator = undefined,
+
+    /// NFC-normalises `string` and returns its code points. The caller owns
+    /// the returned slice and frees it with `a.allocator`. Panics on failure.
+    fn convertString(a: *const UnicodeOptions, string: []const u8) []const TypeOfCharacter {
+        var norm_data: Normalize.NormData = undefined;
+        Normalize.NormData.init(&norm_data, a.allocator) catch @panic("Cannot normalize string");
+        defer norm_data.deinit();
+
+        const normalizer = Normalize{ .norm_data = &norm_data };
+        const nfc_result = normalizer.nfc(a.allocator, string) catch @panic("Cannot normalize string");
+        defer nfc_result.deinit();
+
+        var code_points = std.ArrayList(TypeOfCharacter).init(a.allocator);
+        defer code_points.deinit(); // no-op once `toOwnedSlice` has taken the buffer
+
+        var iter = code_point.Iterator{ .bytes = nfc_result.slice };
+        while (iter.next()) |c| code_points.append(c.code) catch @panic("Memory error");
+        return code_points.toOwnedSlice() catch @panic("Memory error");
+    }
+
+    /// Reports whether haystack code point `h` matches needle code point `n`.
+    /// With `wildcard_spaces`, a separator in the needle matches any haystack
+    /// code point that is neither a letter nor a number.
+    fn eqlFunc(a: *const UnicodeOptions, h: u21, n: u21) bool {
+        if (a.wildcard_spaces) {
+            // Only load the category tables when the wildcard rule can apply.
+            const gcd = GenCatData.init(a.allocator) catch @panic("Memory error");
+            defer gcd.deinit();
+            if (gcd.isSeparator(n)) return !(gcd.isLetter(h) or gcd.isNumber(h));
+        }
+        if (a.case_sensitive) return h == n;
+
+        const cd = CaseData.init(a.allocator) catch @panic("Memory error");
+        defer cd.deinit();
+        return cd.toLower(h) == cd.toLower(n);
+    }
+
+    /// Returns the match score for `h` against `n`, or null when they do not
+    /// match; adds `penalty_case_mistmatch` when `case_penalize` is set and they differ.
+    fn scoreFunc(
+        a: *const UnicodeOptions,
+        comptime scores: UnicodeScores,
+        h: u21,
+        n: u21,
+    ) ?i32 {
+        if (!a.eqlFunc(h, n)) return null;
+
+        const penalized = a.case_penalize and (h != n);
+        return if (penalized) scores.score_match + a.penalty_case_mistmatch else scores.score_match;
+    }
+
+    /// Positional bonus for code point `n` given the preceding code point `h`.
+    fn bonusFunc(
+        self: *const UnicodeOptions,
+        comptime scores: UnicodeScores,
+        h: u21,
+        n: u21,
+    ) i32 {
+        const p = CharacterType.fromUnicode(h, self.allocator);
+        const c = CharacterType.fromUnicode(n, self.allocator);
+
+        return switch (p.roleNextTo(c)) {
+            .Head => scores.bonus_head,
+            .Camel => scores.bonus_camel,
+            .Break => scores.bonus_break,
+            .Tail => scores.bonus_tail,
+        };
+    }
+};
+
 /// Default ASCII Fuzzy Finder
 pub const Ascii = Algorithm(u8, i32, .{}, AsciiOptions);
 
+pub const Unicode = Algorithm(u8, i32, .{}, UnicodeOptions); // default Unicode fuzzy finder; input is still `[]const u8`
+
 fn doTestScore(alg: *Ascii, haystack: []const u8, needle: []const u8, comptime score: i32) !void {
     const s = alg.score(haystack, needle);
 
@@ -521,6 +641,18 @@ fn doTestScore(alg: *Ascii, haystack: []const u8, needle: []const u8, comptime s
     try std.testing.expectEqual(score, s.?);
 }
 
+/// Scores `needle` against `haystack` with `alg` and expects `score`.
+/// A null `score` only prints the result, for exploring new cases.
+fn doTestScoreUnicode(alg: *Unicode, haystack: []const u8, needle: []const u8, comptime score: ?i32) !void {
+    const s = alg.score(haystack, needle);
+    if (score) |expected| {
+        // Compare against the optional so a missing match fails the test instead of panicking on `.?`.
+        try std.testing.expectEqual(@as(?i32, expected), s);
+    } else {
+        std.debug.print("SCORE : {d}\n", .{s orelse -1});
+    }
+}
+
 test "algorithm test" {
     const o = AsciiOptions.AsciiScores{};
 
@@ -714,3 +846,17 @@ test "traceback" {
     try doTestTraceback(&alg, "A" ++ "a" ** 20 ++ "B", "AB", &.{ 0, 21 });
     try doTestTraceback(&alg, "./src/main.zig", "main", &.{ 6, 7, 8, 9 });
 }
+
+test "Unicode search" {
+    // Default scores: the single matched code point is expected to earn `score_match`.
+    const scores = UnicodeOptions.UnicodeScores{};
+
+    // 128 and 32 are the size limits handed to `Unicode.init`.
+    var matcher = try Unicode.init(std.testing.allocator, 128, 32, .{});
+    defer matcher.deinit();
+
+    // The needle is one non-ASCII code point that occurs inside the haystack.
+    const haystack = "zig⚡ fast";
+    const needle = "⚡";
+    try doTestScoreUnicode(&matcher, haystack, needle, scores.score_match);
+}