Skip to content

Commit 378c5d3

Browse files
committed
Add bit_array.to_string_lossy
1 parent f690a8e commit 378c5d3

File tree

3 files changed

+176
-2
lines changed

3 files changed

+176
-2
lines changed

CHANGELOG.md

+6-2
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,17 @@
22

33
## Unreleased
44

5+
- The `uri` module gains the `empty` value, representing an empty URI which
6+
is equivalent to `""`.
7+
- The `bit_array` module gains the `to_string_lossy` function.
8+
9+
## v0.54.0 - 2025-02-04
10+
511
- The deprecated `drop_left`, `drop_right`, `pad_left`, `pad_right`,
612
`trim_left`, and `trim_right` functions have been removed.
713
- Fixed a bug that would result in `list.unique` having quadratic runtime.
814
- Fixed the implementation of `list.key_set` to be tail recursive.
915
- The `pop` and `pop_map` functions in the `list` module have been deprecated.
10-
- The `uri` module gains the `empty` value, representing an empty URI which
11-
equivalent to `""`.
1216

1317
## v0.53.0 - 2025-01-23
1418

src/gleam/bit_array.gleam

+148
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,154 @@ pub fn to_string(bits: BitArray) -> Result(String, Nil) {
9797
@external(erlang, "gleam_stdlib", "identity")
9898
fn unsafe_to_string(a: BitArray) -> String
9999

100+
/// Decodes a bit array into a string without failing on malformed input.
///
/// Data that can be read as UTF-8 is kept in the result. Any data that cannot
/// be decoded is handed to `map_invalid_bits`, and the string it returns is
/// spliced into the output where that data was. The callback may run several
/// times for a single input, once per invalid chunk.
///
/// ## Examples
///
/// ```gleam
/// to_string_lossy(<<"A":utf8, 0x80, "1":utf8, 0:size(5)>>, fn(_) { "�" })
/// // -> "A�1�"
/// ```
///
pub fn to_string_lossy(
  bits: BitArray,
  map_invalid_bits: fn(BitArray) -> String,
) -> String {
  // Decoding starts with an empty accumulator that the target-specific
  // implementation grows as it walks through the input.
  let initial_output = ""
  bits |> to_string_lossy_impl(map_invalid_bits, initial_output)
}
117+
118+
// Erlang implementation of `to_string_lossy`. Walks the bit array from the
// front, growing `acc` with either the decoded codepoint or the callback's
// replacement for data that does not match as a UTF-8 codepoint.
@target(erlang)
fn to_string_lossy_impl(
  bits: BitArray,
  map_invalid_bits: fn(BitArray) -> String,
  acc: String,
) -> String {
  case bits {
    // Nothing left to decode.
    <<>> -> acc

    // A codepoint is at the front: keep it as-is.
    <<codepoint:utf8_codepoint, remaining:bits>> -> {
      let decoded = string.from_utf_codepoints([codepoint])
      to_string_lossy_impl(remaining, map_invalid_bits, acc <> decoded)
    }

    // No codepoint starts here: swap the first byte for its replacement.
    <<invalid_byte:bytes-1, remaining:bits>> -> {
      let replacement = map_invalid_bits(invalid_byte)
      to_string_lossy_impl(remaining, map_invalid_bits, acc <> replacement)
    }

    // Less than one whole byte remains; replace it in one go.
    _ -> acc <> map_invalid_bits(bits)
  }
}
140+
141+
// JavaScript implementation of `to_string_lossy`. It mirrors the Erlang
// implementation but decodes UTF-8 by hand because it does not use the
// `utf8_codepoint` bit array segment type. Once the JavaScript target supports
// `utf8_codepoint` this function and its helpers should be removed.
//
// A multi-byte sequence is only accepted when it decodes to a value that
// really needs that many bytes (so overlong encodings are refused) and that
// `string.utf_codepoint` accepts. When a sequence is refused, each of its
// bytes is passed to `map_invalid_bits` on its own, so that invalid data is
// reported one byte at a time just like in the Erlang implementation.
@target(javascript)
fn to_string_lossy_impl(
  bits: BitArray,
  map_invalid_bits: fn(BitArray) -> String,
  acc: String,
) -> String {
  case bits {
    <<>> -> acc

    // 1-byte UTF-8 character
    <<b0, rest:bytes>> if b0 <= 0x7F -> {
      let acc = to_string_lossy_append(acc, b0, 0, [<<b0>>], map_invalid_bits)

      to_string_lossy_impl(rest, map_invalid_bits, acc)
    }

    // 2-byte UTF-8 character
    <<b0, b1, rest:bytes>>
      if b0 >= 0xC0 && b0 <= 0xDF && b1 >= 0x80 && b1 <= 0xBF
    -> {
      let codepoint_value =
        int.bitwise_and(b0, 0x1F) * 64 + int.bitwise_and(b1, 0x3F)

      // Anything below 0x80 fits in one byte, so two bytes would be overlong.
      let acc =
        to_string_lossy_append(
          acc,
          codepoint_value,
          0x80,
          [<<b0>>, <<b1>>],
          map_invalid_bits,
        )

      to_string_lossy_impl(rest, map_invalid_bits, acc)
    }

    // 3-byte UTF-8 character
    <<b0, b1, b2, rest:bytes>>
      if b0 >= 0xE0
      && b0 <= 0xEF
      && b1 >= 0x80
      && b1 <= 0xBF
      && b2 >= 0x80
      && b2 <= 0xBF
    -> {
      let codepoint_value =
        int.bitwise_and(b0, 0x0F)
        * 4096
        + int.bitwise_and(b1, 0x3F)
        * 64
        + int.bitwise_and(b2, 0x3F)

      // Anything below 0x800 fits in two bytes, so three would be overlong.
      let acc =
        to_string_lossy_append(
          acc,
          codepoint_value,
          0x800,
          [<<b0>>, <<b1>>, <<b2>>],
          map_invalid_bits,
        )

      to_string_lossy_impl(rest, map_invalid_bits, acc)
    }

    // 4-byte UTF-8 character
    <<b0, b1, b2, b3, rest:bytes>>
      if b0 >= 0xF0
      && b0 <= 0xF7
      && b1 >= 0x80
      && b1 <= 0xBF
      && b2 >= 0x80
      && b2 <= 0xBF
      && b3 >= 0x80
      && b3 <= 0xBF
    -> {
      let codepoint_value =
        int.bitwise_and(b0, 0x07)
        * 262_144
        + int.bitwise_and(b1, 0x3F)
        * 4096
        + int.bitwise_and(b2, 0x3F)
        * 64
        + int.bitwise_and(b3, 0x3F)

      // Anything below 0x10000 fits in three bytes, so four would be overlong.
      let acc =
        to_string_lossy_append(
          acc,
          codepoint_value,
          0x10000,
          [<<b0>>, <<b1>>, <<b2>>, <<b3>>],
          map_invalid_bits,
        )

      to_string_lossy_impl(rest, map_invalid_bits, acc)
    }

    // Not the start of any sequence recognised above: replace a single byte
    // and carry on with whatever follows it.
    <<x:bytes-1, rest:bytes>> ->
      to_string_lossy_impl(rest, map_invalid_bits, acc <> map_invalid_bits(x))

    // Whatever is left did not match any of the byte patterns above.
    _ -> acc <> map_invalid_bits(bits)
  }
}

// Appends the character for `codepoint_value` to `acc`, provided the value is
// at least `smallest_allowed` and `string.utf_codepoint` accepts it. Otherwise
// appends the replacement for each element of `raw_bytes`, in order.
@target(javascript)
fn to_string_lossy_append(
  acc: String,
  codepoint_value: Int,
  smallest_allowed: Int,
  raw_bytes: List(BitArray),
  map_invalid_bits: fn(BitArray) -> String,
) -> String {
  case
    codepoint_value >= smallest_allowed,
    string.utf_codepoint(codepoint_value)
  {
    True, Ok(codepoint) -> acc <> string.from_utf_codepoints([codepoint])
    _, _ -> to_string_lossy_append_invalid(acc, raw_bytes, map_invalid_bits)
  }
}

// Appends `map_invalid_bits(chunk)` to `acc` for every chunk, front to back.
@target(javascript)
fn to_string_lossy_append_invalid(
  acc: String,
  chunks: List(BitArray),
  map_invalid_bits: fn(BitArray) -> String,
) -> String {
  case chunks {
    [] -> acc
    [chunk, ..remaining] ->
      to_string_lossy_append_invalid(
        acc <> map_invalid_bits(chunk),
        remaining,
        map_invalid_bits,
      )
  }
}
247+
100248
/// Creates a new bit array by joining multiple binaries.
101249
///
102250
/// ## Examples

test/gleam/bit_array_test.gleam

+22
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,28 @@ pub fn to_string_erlang_only_test() {
233233
|> should.equal(Error(Nil))
234234
}
235235

236+
// Checks `to_string_lossy` on empty input, on input that mixes valid and
// invalid bytes, and on valid characters that take several bytes each.
pub fn to_string_lossy_test() {
  let replace = fn(_) { "�" }

  // Empty input gives an empty string and never needs a replacement.
  bit_array.to_string_lossy(<<>>, replace)
  |> should.equal("")

  // Each stray byte is swapped for the replacement; the "A" is kept.
  bit_array.to_string_lossy(<<0x80, "A":utf8, 0x81>>, replace)
  |> should.equal("�A�")

  // Test some codepoints that require 2/3/4 bytes to be stored as UTF-8
  bit_array.to_string_lossy(<<"£И한𐍈":utf8>>, replace)
  |> should.equal("£И한𐍈")
}
250+
251+
// Checks that trailing bits which don't make up a whole byte (a 4-bit segment
// here) are replaced after the valid text that precedes them. Only compiled
// for the Erlang target.
@target(erlang)
pub fn to_string_lossy_erlang_only_test() {
  let input = <<"ø":utf8, 50:4>>
  let output = bit_array.to_string_lossy(input, fn(_) { "�" })

  should.equal(output, "ø�")
}
257+
236258
pub fn is_utf8_test() {
237259
<<>>
238260
|> bit_array.is_utf8

0 commit comments

Comments
 (0)