@@ -6,6 +6,42 @@ static int bstrcmp(const void *l, const void *r)
6
6
return strcmp (l , (* (const char * * )r ));
7
7
}
8
8
9
/*
 * Return non-zero when the byte at s starts a character that should be
 * counted towards the comparison length, zero otherwise.
 *
 * Bytes that are NOT counted:
 *  - UTF-8 continuation bytes (0x80..0xbf);
 *  - the lead bytes 0xcc and 0xcd (treated as accent marks, following the
 *    original implementation);
 *  - the three-byte Hiragana sound marks e3 82 99 .. e3 82 9f.
 *
 * The look-ahead on s[1] and s[2] is safe for NUL-terminated input: s[1] is
 * only read when s[0] is non-zero, and s[2] only when s[1] is non-zero.
 */
static int utf_starts_counted_char(const unsigned char *s) {
    const unsigned char lead = s[0];

    if (lead >= 0x80 && lead <= 0xbf) {
        return 0;
    }
    if (lead == 0xcc || lead == 0xcd) {
        return 0;
    }
    if (lead == 0xe3 && s[1] == 0x82 && s[2] >= 0x99 && s[2] <= 0x9f) {
        return 0;
    }
    return 1;
}

/**
 * Does a strncmp-like comparison on UTF-8 strings. This mostly works by
 * comparing bytes while not counting continuation bytes and accent/sound
 * mark characters towards the length limit. This is not guaranteed to work
 * for all UTF-8 strings but is supposed to work for the bip39 word lists
 * in libwally.
 *
 * Only s1 is inspected to decide where characters start; s2 is compared
 * byte for byte against it.
 *
 * @param s1 NUL-terminated string whose characters are counted.
 * @param s2 NUL-terminated string to compare against.
 * @param n  Maximum number of counted characters of s1 to compare.
 * @return 0 when n is 0, or when s1 reaches its (n+1)-th counted character
 *         with every preceding byte equal to s2. Otherwise the difference of
 *         the first differing bytes (as unsigned char), which is also what is
 *         returned when either string ends first.
 *
 * NOTE(review): because the limit is only detected when s1 starts its
 * (n+1)-th character, a string of exactly n characters does not compare
 * equal to a longer string sharing that prefix (unlike strncmp). Confirm
 * whether callers rely on this before changing it.
 */
int utf_strncmp(const char *s1, const char *s2, size_t n) {
    const unsigned char *a = (const unsigned char *)s1;
    const unsigned char *b = (const unsigned char *)s2;
    size_t pos = 0;
    size_t counted = 0;

    /* Comparing zero characters is always "equal". Handling it up front also
     * avoids stepping the index back below zero at the very first byte. */
    if (n == 0) {
        return 0;
    }

    while (a[pos] != '\0' && b[pos] != '\0') {
        /* s1 is about to start character n+1: everything before it matched. */
        if (utf_starts_counted_char(a + pos) && ++counted > n) {
            return 0;
        }
        if (a[pos] != b[pos]) {
            return (int)a[pos] - (int)b[pos];
        }
        ++pos;
    }

    /* At least one string ended; the terminator takes part in the result. */
    return (int)a[pos] - (int)b[pos];
}
40
+
41
/*
 * bsearch() comparator: l is the key string itself, r points at an array
 * element holding a string pointer. The two are compared with utf_strncmp
 * using a limit of 4 characters.
 */
static int bstr4cmp(const void *l, const void *r) {
    const char *key = (const char *)l;
    const char *const *element = (const char *const *)r;

    return utf_strncmp(key, *element, 4);
}
44
+
9
45
/* https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogObvious */
10
46
static int get_bits (size_t n )
11
47
{
@@ -72,13 +108,14 @@ size_t wordlist_lookup_word(const struct words *w, const char *word)
72
108
const size_t size = sizeof (const char * );
73
109
const char * * found = NULL ;
74
110
75
- if (w -> sorted )
76
- found = (const char * * )bsearch (word , w -> indices , w -> len , size , bstrcmp );
77
- else {
111
+ if (w -> sorted ) {
112
+ found = (const char * * )bsearch (word , w -> indices , w -> len , size , bstr4cmp );
113
+ } else {
78
114
size_t i ;
79
115
for (i = 0 ; i < w -> len && !found ; ++ i )
80
- if (!strcmp (word , w -> indices [i ]))
116
+ if (!utf_strncmp (word , w -> indices [i ], 4 )) {
81
117
found = w -> indices + i ;
118
+ }
82
119
}
83
120
return found ? found - w -> indices + 1u : 0u ;
84
121
}
0 commit comments