Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unicode #2

Merged
merged 29 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
c73a3b1
Create .gitignore
alberic89 Aug 10, 2024
fc5e83d
Add working Unicode support
alberic89 Aug 10, 2024
33b305d
Merge pull request #1 from alberic89/unicode-implementation
fjebaker Aug 10, 2024
6ea5cce
feat: added simple benchmark suite
fjebaker Aug 10, 2024
e88ba67
chore: cleanup build.zig.zon
fjebaker Aug 10, 2024
d9de50d
feat: overhaul implementation
fjebaker Aug 10, 2024
692ee88
chore: rename fields to be consistent + function tables comptime
fjebaker Aug 10, 2024
5d77462
chore: rename implementations
fjebaker Aug 10, 2024
5250b26
No longer need to know the max size by advance
alberic89 Aug 10, 2024
4ae5d78
Remove allocator field in implementations
alberic89 Aug 10, 2024
67e3c14
Remove the TypeOfCaracter switch
alberic89 Aug 10, 2024
2fe86fc
make rebase adjustements
alberic89 Aug 10, 2024
0c7594d
WIP: make adjustement
alberic89 Aug 10, 2024
4e998d4
feat: init category and case unicode data only once
fjebaker Aug 10, 2024
068c73e
WIP
alberic89 Aug 11, 2024
4db079d
Solve unicode tool init problem
alberic89 Aug 12, 2024
cbb68f1
Merge remote-tracking branch 'upstream/unicode' into unicode
alberic89 Aug 12, 2024
b1f437a
fix: convertString of Unicode can fail
alberic89 Aug 12, 2024
abac757
Resize Matrix on need
alberic89 Aug 12, 2024
94f8f87
Let the user choice if he want to resize the matrix and buffer
alberic89 Aug 12, 2024
17903ad
fix: add errdefer in Unicode init
alberic89 Aug 12, 2024
4aebb97
fix: various style and optimization improvements
alberic89 Aug 12, 2024
ac1d1cd
Merge pull request #5 from alberic89/matrix-resize
fjebaker Aug 12, 2024
ad33235
feat: cleanup unicode support
fjebaker Aug 12, 2024
967df94
feat: pre-allocated buffer resizing
fjebaker Aug 12, 2024
2215837
feat: expose maximum haystack / needle query functions
fjebaker Aug 12, 2024
ed86b30
Merge branch 'main' into unicode
fjebaker Aug 12, 2024
3b435d5
ci: add unicode to ci test
fjebaker Aug 12, 2024
30a5f62
chore: bump version 0.1.0
fjebaker Aug 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# This file is for zig-specific build artifacts.

.zig-cache/
zig-out/
build/
build-*/
docgen_tmp/

# Compiled Object files
*.slo
*.lo
*.o
*.obj
*.elf
*.ko

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Libraries
*.lib
*.a
*.la
*.lo

# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib
17 changes: 17 additions & 0 deletions build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ pub fn build(b: *std.Build) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});

const zg = b.dependency("zg", .{
.target = target,
.optimize = optimize,
});

_ = b.addModule("fuzzig", .{ .root_source_file = b.path("src/root.zig") });

const lib = b.addStaticLibrary(.{
Expand All @@ -13,6 +18,12 @@ pub fn build(b: *std.Build) void {
.optimize = optimize,
});

lib.root_module.addImport("code_point", zg.module("code_point"));
lib.root_module.addImport("GenCatData", zg.module("GenCatData"));
lib.root_module.addImport("CaseData", zg.module("CaseData"));
lib.root_module.addImport("Normalize", zg.module("Normalize"));
lib.root_module.addImport("CaseFold", zg.module("CaseFold"));

b.installArtifact(lib);

const lib_unit_tests = b.addTest(.{
Expand All @@ -21,6 +32,12 @@ pub fn build(b: *std.Build) void {
.optimize = optimize,
});

lib_unit_tests.root_module.addImport("code_point", zg.module("code_point"));
lib_unit_tests.root_module.addImport("GenCatData", zg.module("GenCatData"));
lib_unit_tests.root_module.addImport("CaseData", zg.module("CaseData"));
lib_unit_tests.root_module.addImport("Normalize", zg.module("Normalize"));
lib_unit_tests.root_module.addImport("CaseFold", zg.module("CaseFold"));

const run_lib_unit_tests = b.addRunArtifact(lib_unit_tests);

const test_step = b.step("test", "Run unit tests");
Expand Down
4 changes: 4 additions & 0 deletions build.zig.zon
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@
// // computed. This field and `url` are mutually exclusive.
// .path = "foo",
//},
.zg = .{
.url = "https://codeberg.org/dude_the_builder/zg/archive/v0.13.2.tar.gz",
.hash = "122055beff332830a391e9895c044d33b15ea21063779557024b46169fb1984c6e40",
},
},

// Specifies the set of files and directories that are included in this package.
Expand Down
172 changes: 159 additions & 13 deletions src/root.zig
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@ const std = @import("std");
const utils = @import("utils.zig");
const structures = @import("structures.zig");

const code_point = @import("code_point");
const GenCatData = @import("GenCatData");
const CaseData = @import("CaseData");
const Normalize = @import("Normalize");

const Allocator = std.mem.Allocator;

const CharacterType = utils.CharacterType;
const MatrixT = structures.MatrixT;

Expand Down Expand Up @@ -100,6 +107,12 @@ pub fn Algorithm(

impl: Impl,

const TypeOfCaracter = switch (Impl) {
AsciiOptions => u8,
UnicodeOptions => u21,
else => unreachable,
};
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Algorithm generic struct already has an ElType argument, so we can use that instead of having to maintain a switch. In theory I still want downstream users to be able to modify the behaviour of the fuzzy finder without having to modify the source.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For me, there are two types of solutions:

  1. The user should give a standard string ([]const u8) to the fuzzy finder, choose the right algo (Ascii or Unicode) depending on the “type” of the text. In this case, ElType will always be u8, and TypeOfCaracter depends on the algo. The algo will make by himself all the conversion.
  2. Let the user make the conversions, and the algo will take the type of the strings (ElType). The problem is that the user will need a lib to make the conversion, for being able to use fuzzig. And fuzzig will also need a lib (surely the same) to process correctly the data.

The solution for me is the 1, to replace ElType by u8 where it is needed, and use it instead to provide the type that is waited by the UnicodeOption (or Ascii).
In this case, TypeOfCaracter is no longer needed (so we can delete the switch).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have done it in alberic89@073d10a


pub fn deinit(self: *Self) void {
self.m.deinit();
self.x.deinit();
Expand All @@ -117,6 +130,9 @@ pub fn Algorithm(
max_needle: usize,
impl: Impl,
) !Self {
var impl_with_allocator = impl;
impl_with_allocator.allocator = allocator;

const rows = max_needle + 1;
const cols = max_haystack + 1;

Expand Down Expand Up @@ -149,7 +165,7 @@ pub fn Algorithm(
.first_match_buffer = first_match_buffer,
.traceback_buffer = traceback_buffer,
.allocator = allocator,
.impl = impl,
.impl = impl_with_allocator,
};
}

Expand Down Expand Up @@ -179,34 +195,40 @@ pub fn Algorithm(
.score = 0,
};

const rows = needle.len;
const cols = haystack.len;
const haystack_normal = self.impl.convertString(haystack);
defer self.allocator.free(haystack_normal);

const needle_normal = self.impl.convertString(needle);
defer self.allocator.free(needle_normal);
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe use ArenaAllocator for the lifetime of a single solve? Then implementations that don't need to allocate don't have to pay the price of a dupe to satisfy the free? Or else let the implementation manage the lifetimes?


const rows = needle_normal.len;
const cols = haystack_normal.len;

// resize the view into memory
self.m.resizeNoAlloc(rows + 1, cols + 1);
self.x.resizeNoAlloc(rows + 1, cols + 1);
self.m_skip.resizeNoAlloc(rows + 1, cols + 1);

const first_match_indices = utils.firstMatchesGeneric(
ElType,
TypeOfCaracter,
&self.impl,
Impl.eqlFunc,
self.first_match_buffer,
haystack,
needle,
haystack_normal,
needle_normal,
) orelse return null;

self.reset(rows + 1, cols + 1, first_match_indices);
self.determineBonuses(haystack);
self.determineBonuses(TypeOfCaracter, haystack_normal);

try self.populateMatrices(haystack, needle, first_match_indices);
try self.populateMatrices(haystack_normal, needle_normal, first_match_indices);
const col_max = self.findMaximalElement(
first_match_indices,
rows,
cols,
);

const last_row_index = needle.len;
const last_row_index = needle_normal.len;
const s = self.m.get(last_row_index, col_max);
return .{
.score = s,
Expand Down Expand Up @@ -268,8 +290,8 @@ pub fn Algorithm(
return buf;
}

fn determineBonuses(self: *Self, haystack: []const ElType) void {
var prev: u8 = 0;
fn determineBonuses(self: *Self, T: type, haystack: []const T) void {
var prev: T = 0;
for (1.., haystack) |i, h| {
self.role_bonus[i] = Impl.bonusFunc(&self.impl, scores, prev, h);
prev = h;
Expand Down Expand Up @@ -325,8 +347,8 @@ pub fn Algorithm(

fn populateMatrices(
self: *Self,
haystack: []const ElType,
needle: []const ElType,
haystack: []const TypeOfCaracter,
needle: []const TypeOfCaracter,
first_match_indices: []const usize,
) !void {
for (1.., needle) |i, n| {
Expand Down Expand Up @@ -455,6 +477,8 @@ pub fn Algorithm(
pub const AsciiOptions = struct {
const AsciiScores = Scores(i32);

pub const TypeOfCharacter = u8;

case_sensitive: bool = true,
case_penalize: bool = false,
// treat spaces as wildcards for any kind of boundary
Expand All @@ -463,6 +487,13 @@ pub const AsciiOptions = struct {

penalty_case_mistmatch: i32 = -2,

/// Don't forget the allocator !!!
allocator: Allocator = undefined,
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's pass allocators in from the algorithm struct instead of having the options hold on to them.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed in commit alberic89@3bab90b
But many side effects. Maybe they will be attenuated with the futures changes (less need of an allocator).


fn convertString(a: *const AsciiOptions, string: []const u8) []const TypeOfCharacter {
return a.allocator.dupe(TypeOfCharacter, string) catch @panic("Memory error");
}

fn eqlFunc(a: *const AsciiOptions, h: u8, n: u8) bool {
if (n == ' ' and a.wildcard_spaces) {
return switch (h) {
Expand Down Expand Up @@ -508,9 +539,98 @@ pub const AsciiOptions = struct {
}
};

pub const UnicodeOptions = struct {
const UnicodeScores = Scores(i32);

pub const TypeOfCharacter: type = u21;

case_sensitive: bool = true,
case_penalize: bool = false,
// treat spaces as wildcards for any kind of boundary
// i.e. match with any `[^a-z,A-Z,0-9]`
wildcard_spaces: bool = false,

penalty_case_mistmatch: i32 = -2,

/// Don't forget the allocator !!!
allocator: Allocator = undefined,

fn convertString(a: *const UnicodeOptions, string: []const u8) []const TypeOfCharacter {
var norm_data: Normalize.NormData = undefined;
Normalize.NormData.init(&norm_data, a.allocator) catch @panic("Cannot normalize string");
defer norm_data.deinit();

const n = Normalize{ .norm_data = &norm_data };

const nfc_result = n.nfc(a.allocator, string) catch @panic("Cannot normalize string");
defer nfc_result.deinit();

var iter = code_point.Iterator{ .bytes = nfc_result.slice };

var converted_string = std.ArrayList(TypeOfCharacter).init(a.allocator);
defer converted_string.deinit();

while (iter.next()) |c| {
converted_string.append(c.code) catch @panic("Memory error");
}
return converted_string.toOwnedSlice() catch @panic("Memory error");
}

fn eqlFunc(a: *const UnicodeOptions, h: u21, n: u21) bool {
const gcd = GenCatData.init(a.allocator) catch @panic("Memory error");
defer gcd.deinit();
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The eqlFunc is called in the inner loop of the alignment solver, so it should ideally have a buffered allocator. As it currently is this will hurt performance with the alloc overhead.

if (gcd.isSeparator(n) and a.wildcard_spaces) {
if (gcd.isLetter(h) or gcd.isNumber(h) or gcd.isSymbol(h)) {
return true;
} else {
return false;
}
} else if (!a.case_sensitive) {
const cd = CaseData.init(a.allocator) catch @panic("Memory error");
defer cd.deinit();
return cd.toLower(h) == cd.toLower(n);
} else {
return h == n;
}
}

fn scoreFunc(
a: *const UnicodeOptions,
comptime scores: UnicodeScores,
h: u21,
n: u21,
) ?i32 {
if (!a.eqlFunc(h, n)) return null;

if (a.case_penalize and (h != n)) {
return scores.score_match + a.penalty_case_mistmatch;
}
return scores.score_match;
}

fn bonusFunc(
self: *const UnicodeOptions,
comptime scores: UnicodeScores,
h: u21,
n: u21,
) i32 {
const p = CharacterType.fromUnicode(h, self.allocator);
const c = CharacterType.fromUnicode(n, self.allocator);

return switch (p.roleNextTo(c)) {
.Head => scores.bonus_head,
.Camel => scores.bonus_camel,
.Break => scores.bonus_break,
.Tail => scores.bonus_tail,
};
}
};
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are both essentically identical to the Ascii functions which suggests we should pull them out so we only maintain the implementation once.


/// Default ASCII Fuzzy Finder
pub const Ascii = Algorithm(u8, i32, .{}, AsciiOptions);

pub const Unicode = Algorithm(u8, i32, .{}, UnicodeOptions);

fn doTestScore(alg: *Ascii, haystack: []const u8, needle: []const u8, comptime score: i32) !void {
const s = alg.score(haystack, needle);

Expand All @@ -521,6 +641,18 @@ fn doTestScore(alg: *Ascii, haystack: []const u8, needle: []const u8, comptime s
try std.testing.expectEqual(score, s.?);
}

fn doTestScoreUnicode(alg: *Unicode, haystack: []const u8, needle: []const u8, comptime score: ?i32) !void {
const s = alg.score(haystack, needle);

if (score == null) {
// const stderr = std.io.getStdErr().writer();
// try alg.debugPrint(stderr, haystack, needle);
std.debug.print("SCORE : {d}\n", .{s orelse -1});
} else {
try std.testing.expectEqual(score, s.?);
}
}

test "algorithm test" {
const o = AsciiOptions.AsciiScores{};

Expand Down Expand Up @@ -714,3 +846,17 @@ test "traceback" {
try doTestTraceback(&alg, "A" ++ "a" ** 20 ++ "B", "AB", &.{ 0, 21 });
try doTestTraceback(&alg, "./src/main.zig", "main", &.{ 6, 7, 8, 9 });
}

test "Unicode search" {
const o = UnicodeOptions.UnicodeScores{};

var alg = try Unicode.init(
std.testing.allocator,
128,
32,
.{},
);
defer alg.deinit();

try doTestScoreUnicode(&alg, "zig⚡ fast", "⚡", o.score_match);
}
Loading