@@ -97,6 +97,154 @@ pub fn to_string(bits: BitArray) -> Result(String, Nil) {
97
97
@ external ( erlang , "gleam_stdlib" , "identity" )
98
98
fn unsafe_to_string ( a : BitArray ) -> String
99
99
100
/// Converts a bit array to a string, substituting any data that is not valid
/// UTF-8. Invalid bits are handed to the `map_invalid_bits` callback, and the
/// string it returns is placed in the output where the invalid data was.
///
/// ## Examples
///
/// ```gleam
/// to_string_lossy(<<"A":utf8, 0x80, "1":utf8, 0:size(5)>>, fn(_) { "�" })
/// // -> "A�1�"
/// ```
///
pub fn to_string_lossy(
  bits: BitArray,
  map_invalid_bits: fn(BitArray) -> String,
) -> String {
  // Decoding starts with an empty accumulator.
  bits |> to_string_lossy_impl(map_invalid_bits, "")
}
117
+
118
// Erlang implementation of `to_string_lossy`.
//
// `bits` is the input still to be decoded, `map_invalid_bits` supplies the
// replacement text for data that cannot be decoded, and `acc` is the string
// built so far.
@target(erlang)
fn to_string_lossy_impl(
  bits: BitArray,
  map_invalid_bits: fn(BitArray) -> String,
  acc: String,
) -> String {
  case bits {
    // All input has been consumed.
    <<>> -> acc

    // The input starts with a UTF-8 codepoint: append it and carry on.
    <<codepoint:utf8_codepoint, remaining:bits>> -> {
      let decoded = string.from_utf_codepoints([codepoint])
      to_string_lossy_impl(remaining, map_invalid_bits, acc <> decoded)
    }

    // The leading byte does not start a codepoint: replace that single byte.
    <<invalid_byte:bytes-1, remaining:bits>> -> {
      let replacement = map_invalid_bits(invalid_byte)
      to_string_lossy_impl(remaining, map_invalid_bits, acc <> replacement)
    }

    // Less than one whole byte is left: replace it and finish.
    leftover -> acc <> map_invalid_bits(leftover)
  }
}
140
+
141
// JavaScript implementation of `to_string_lossy`. It decodes UTF-8 by hand
// because it does not use the `utf8_codepoint` bit array segment type. Once
// the JavaScript target supports `utf8_codepoint` this function should be
// removed in favour of the Erlang implementation.
//
// `bits` is the input still to be decoded, `map_invalid_bits` supplies the
// replacement text for data that is not valid UTF-8, and `acc` is the string
// built so far.
//
// A sequence that has the shape of a multi-byte character but is an overlong
// encoding, a surrogate, or above U+10FFFF is not valid UTF-8. Each of its
// bytes is then passed to `map_invalid_bits` on its own, exactly as the
// single-byte fallback clause would do: the lead byte does not start a valid
// sequence, and the bytes after it are continuation bytes (0x80 to 0xBF),
// which no clause accepts as the start of a character.
@target(javascript)
fn to_string_lossy_impl(
  bits: BitArray,
  map_invalid_bits: fn(BitArray) -> String,
  acc: String,
) -> String {
  case bits {
    <<>> -> acc

    // 1-byte UTF-8 character
    <<b0, rest:bytes>> if b0 <= 0x7F -> {
      let decoded = case string.utf_codepoint(b0) {
        Ok(codepoint) -> string.from_utf_codepoints([codepoint])
        Error(Nil) -> map_invalid_bits(<<b0>>)
      }

      to_string_lossy_impl(rest, map_invalid_bits, acc <> decoded)
    }

    // 2-byte UTF-8 character
    <<b0, b1, rest:bytes>>
      if b0 >= 0xC0 && b0 <= 0xDF && b1 >= 0x80 && b1 <= 0xBF
    -> {
      let codepoint_value =
        int.bitwise_and(b0, 0x1F) * 64 + int.bitwise_and(b1, 0x3F)

      // Values below 0x80 fit in one byte, so two bytes is an overlong
      // encoding.
      let decoded = case
        codepoint_value >= 0x80,
        string.utf_codepoint(codepoint_value)
      {
        True, Ok(codepoint) -> string.from_utf_codepoints([codepoint])
        _, _ -> map_invalid_bits(<<b0>>) <> map_invalid_bits(<<b1>>)
      }

      to_string_lossy_impl(rest, map_invalid_bits, acc <> decoded)
    }

    // 3-byte UTF-8 character
    <<b0, b1, b2, rest:bytes>>
      if b0 >= 0xE0
      && b0 <= 0xEF
      && b1 >= 0x80
      && b1 <= 0xBF
      && b2 >= 0x80
      && b2 <= 0xBF
    -> {
      let codepoint_value =
        int.bitwise_and(b0, 0x0F)
        * 4096
        + int.bitwise_and(b1, 0x3F)
        * 64
        + int.bitwise_and(b2, 0x3F)

      // Values below 0x800 fit in two bytes, so three bytes is an overlong
      // encoding. Surrogates are rejected by `string.utf_codepoint`.
      let decoded = case
        codepoint_value >= 0x800,
        string.utf_codepoint(codepoint_value)
      {
        True, Ok(codepoint) -> string.from_utf_codepoints([codepoint])
        _, _ ->
          map_invalid_bits(<<b0>>)
          <> map_invalid_bits(<<b1>>)
          <> map_invalid_bits(<<b2>>)
      }

      to_string_lossy_impl(rest, map_invalid_bits, acc <> decoded)
    }

    // 4-byte UTF-8 character
    <<b0, b1, b2, b3, rest:bytes>>
      if b0 >= 0xF0
      && b0 <= 0xF7
      && b1 >= 0x80
      && b1 <= 0xBF
      && b2 >= 0x80
      && b2 <= 0xBF
      && b3 >= 0x80
      && b3 <= 0xBF
    -> {
      let codepoint_value =
        int.bitwise_and(b0, 0x07)
        * 262_144
        + int.bitwise_and(b1, 0x3F)
        * 4096
        + int.bitwise_and(b2, 0x3F)
        * 64
        + int.bitwise_and(b3, 0x3F)

      // Values below 0x10000 fit in three bytes, so four bytes is an overlong
      // encoding. Values above U+10FFFF are rejected by
      // `string.utf_codepoint`.
      let decoded = case
        codepoint_value >= 0x10000,
        string.utf_codepoint(codepoint_value)
      {
        True, Ok(codepoint) -> string.from_utf_codepoints([codepoint])
        _, _ ->
          map_invalid_bits(<<b0>>)
          <> map_invalid_bits(<<b1>>)
          <> map_invalid_bits(<<b2>>)
          <> map_invalid_bits(<<b3>>)
      }

      to_string_lossy_impl(rest, map_invalid_bits, acc <> decoded)
    }

    // The leading byte does not start a valid character: replace that single
    // byte and continue with the rest.
    <<x:bytes-1, rest:bytes>> ->
      to_string_lossy_impl(rest, map_invalid_bits, acc <> map_invalid_bits(x))

    // Nothing above matched, so the remaining input is replaced as a whole.
    // NOTE(review): every clause above requires the tail to be `bytes`, so
    // input that is not a whole number of bytes presumably ends up here in
    // full, including any valid leading characters. Confirm whether the tail
    // segments should be `bits`, as in the Erlang implementation.
    _ -> acc <> map_invalid_bits(bits)
  }
}
247
+
100
248
/// Creates a new bit array by joining multiple binaries.
101
249
///
102
250
/// ## Examples
0 commit comments