generated from okikio/transferables
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbyte_methods.ts
221 lines (202 loc) · 9.02 KB
/
byte_methods.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
/**
* @module
* Provides utility functions for working with UTF-8 encoded characters in TypeScript.
* It includes methods for determining the byte length of UTF-8 characters, converting bytes to Unicode code points,
* extracting code points from buffers, and dealing with UTF-16 code units in strings.
*
* @example
* ```ts
* import { getByteLength, bytesToCodePoint, bytesToCodePointFromBuffer, codePointAt } from 'jsr:@okikio/codepoint-iterator/byte_methods';
*
* // Determine the byte length of a UTF-8 encoded character
* const leadByte = 0xF0; // Leading byte of a 4-byte UTF-8 character
* console.log(getByteLength(leadByte)); // Expected output: 4
*
* // Convert a sequence of UTF-8 bytes to a Unicode code point
* const bytes = [0xF0, 0x9F, 0x92, 0xA9]; // UTF-8 encoded representation of the 💩 emoji
* console.log(bytesToCodePoint(4, bytes)); // Expected output: 128169 (code point for 💩)
*
* // Extract a Unicode code point from a buffer
* const buffer = new Uint8Array([0xF0, 0x9F, 0x92, 0xA9]);
* console.log(bytesToCodePointFromBuffer(4, buffer, 0)); // Expected output: 128169
*
* // Calculate the Unicode code point of a character in a string
* const str = '🌍';
* console.log(codePointAt(str, 0)); // Expected output: 127757 (code point for 🌍)
* ```
*/
import {
BITS_FOR_2B,
BITS_FOR_3B,
BITS_FOR_4B,
LEAD_FOR_1B,
LEAD_FOR_2B,
LEAD_FOR_3B,
LEAD_FOR_4B,
LEAD_FOR_5B,
MASK_FOR_1B,
MASK_FOR_2B,
MASK_FOR_3B,
MASK_FOR_4B,
} from "./constants.ts";
/**
 * Reports the total number of bytes (lead byte included) that make up a UTF-8
 * encoded character, judging only from that character's lead byte.
 *
 * UTF-8 is a variable-width encoding: a character occupies between 1 and 4 bytes,
 * and the lead byte announces how long the sequence is. Knowing the length up front
 * lets a decoder pick the right number of bytes out of a `Uint8Array` for
 * multi-byte characters such as emoji and other symbols.
 *
 * The byte is compared against the `LEAD_FOR_*` bit patterns imported from
 * `./constants.ts`. Any byte that matches none of the multi-byte patterns is
 * reported as a 1-byte character rather than raising an error.
 *
 * @param byte The lead byte of a UTF-8 character.
 * @returns The number of bytes, from 1 to 4, occupied by the UTF-8 character.
 */
export function getByteLength(byte: number): number {
  // Below the first lead marker: a single-byte character.
  if (byte < LEAD_FOR_1B) return 1;

  // For each width, keep only the prefix bits selected by the next marker up
  // and check that they equal the marker for this width.
  if ((byte & LEAD_FOR_3B) === LEAD_FOR_2B) return 2;
  if ((byte & LEAD_FOR_4B) === LEAD_FOR_3B) return 3;
  if ((byte & LEAD_FOR_5B) === LEAD_FOR_4B) return 4;

  // Not a recognised lead byte: fall back to treating it as one byte.
  return 1;
}
/**
 * Converts the bytes of a single UTF-8 encoded character into its Unicode code point.
 *
 * A UTF-8 character is made of 1 to 4 bytes. Given how many bytes the character
 * occupies, this function masks the payload bits out of each byte and shifts them
 * into place to rebuild the code point.
 *
 * It works directly on the numeric byte values, so a caller holding a `Uint8Array`
 * does not need to decode it into a string first just to obtain code points.
 *
 * No validation is performed: the bytes are not checked for being well-formed UTF-8,
 * and a `byteLength` outside 1 to 4 simply returns the first byte unchanged.
 *
 * @param byteLength The number of bytes that make up the UTF-8 character (1 to 4).
 * @param bytes An array holding the `byteLength` bytes of the UTF-8 character, lead byte first.
 * @returns The Unicode code point of the UTF-8 character.
 */
export function bytesToCodePoint(byteLength: number, [byte1, byte2, byte3, byte4]: number[]): number {
  return (
    // 1-byte UTF-8 sequence: the byte is the code point
    byteLength === 1 ?
      byte1

    // 2-byte UTF-8 sequence
    : byteLength === 2 ?
      (MASK_FOR_2B & byte1) << BITS_FOR_2B |
      MASK_FOR_1B & byte2

    // 3-byte UTF-8 sequence
    : byteLength === 3 ?
      (MASK_FOR_3B & byte1) << BITS_FOR_3B |
      (MASK_FOR_1B & byte2) << BITS_FOR_2B |
      MASK_FOR_1B & byte3

    // 4-byte UTF-8 sequence
    // The third byte uses the same named shift as the second byte of the 3-byte
    // branch instead of a bare number, so every shift comes from constants.ts.
    // NOTE(review): assumes BITS_FOR_2B is the 6-bit continuation shift; confirm in constants.ts.
    : byteLength === 4 ?
      (MASK_FOR_4B & byte1) << BITS_FOR_4B |
      (MASK_FOR_1B & byte2) << BITS_FOR_3B |
      (MASK_FOR_1B & byte3) << BITS_FOR_2B |
      MASK_FOR_1B & byte4

    // Unexpected length: fall back to treating the first byte as a 1-byte sequence
    : byte1
  );
}
/**
 * Reads one UTF-8 encoded character out of a buffer, starting at `head`, and
 * returns its Unicode code point.
 *
 * The lead byte is read from `buffer[head]` as-is. The indices of the following
 * bytes are taken modulo `buffer.length`, so a character whose bytes run past the
 * end of the buffer continues reading from index 0.
 * NOTE(review): presumably this supports a circular buffer; confirm with callers.
 *
 * No validation is performed; a `byteLength` outside 1 to 4 returns the byte at
 * `head` unchanged.
 *
 * @param byteLength The byte length of the UTF-8 encoded character to extract.
 * @param buffer The buffer (array or Uint8Array) containing the UTF-8 data.
 * @param head The index in the buffer where the UTF-8 encoded character starts.
 * @returns The Unicode code point extracted from the buffer.
 */
export function bytesToCodePointFromBuffer<T extends number = number>(
  byteLength: number,
  buffer: Array<T> | Uint8Array,
  head: number
): number {
  // Length used to wrap the indices of the trailing bytes.
  const size = buffer.length;
  const lead = buffer[head];

  if (byteLength === 2) {
    const second = buffer[(head + 1) % size];
    return (MASK_FOR_2B & lead) << BITS_FOR_2B | MASK_FOR_1B & second;
  }

  if (byteLength === 3) {
    const second = buffer[(head + 1) % size];
    const third = buffer[(head + 2) % size];
    return (
      (MASK_FOR_3B & lead) << BITS_FOR_3B |
      (MASK_FOR_1B & second) << BITS_FOR_2B |
      MASK_FOR_1B & third
    );
  }

  if (byteLength === 4) {
    const second = buffer[(head + 1) % size];
    const third = buffer[(head + 2) % size];
    const fourth = buffer[(head + 3) % size];
    return (
      (MASK_FOR_4B & lead) << BITS_FOR_4B |
      (MASK_FOR_1B & second) << BITS_FOR_3B |
      (MASK_FOR_1B & third) << BITS_FOR_2B |
      MASK_FOR_1B & fourth
    );
  }

  // A length of 1, or any unexpected length: the lead byte is the result.
  return lead;
}
/**
 * Returns the Unicode code point that starts at a given position in a string.
 *
 * JavaScript strings are sequences of UTF-16 code units. Characters outside the
 * first 65,536 code points (emoji, for example) are stored as a surrogate pair:
 * a high surrogate (0xD800 to 0xDBFF) followed by a low surrogate (0xDC00 to 0xDFFF).
 * When `index` points at the high half of a valid pair, the two units are combined
 * into the full code point. In every other case (an ordinary character, a lone
 * surrogate, or the low half of a pair) the single code unit at `index` is returned.
 *
 * @param str The string to extract the code point from.
 * @param index The index of the UTF-16 code unit within the string.
 * @returns The code point at `index`, or `undefined` when `index` is out of bounds.
 */
export function codePointAt(str: string, index: number): number | undefined {
  const length = str.length;

  // Out-of-bounds positions have no code point.
  if (index < 0 || index >= length) return undefined;

  const lead = str.charCodeAt(index);

  // Only a high surrogate with a following code unit can begin a pair.
  const isHighSurrogate = lead >= 0xD800 && lead <= 0xDBFF;
  const hasNextUnit = length > index + 1;
  if (!isHighSurrogate || !hasNextUnit) return lead;

  // The pair is valid only if the next unit is a low surrogate.
  const trail = str.charCodeAt(index + 1);
  const isLowSurrogate = trail >= 0xDC00 && trail <= 0xDFFF;
  if (!isLowSurrogate) return lead;

  // Surrogate formula (https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae):
  //   (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000
  // The multiplication by 0x400 (1024, i.e. 2^10) is written as a left shift by 10.
  const highBits = (lead - 0xD800) << 10;
  const lowBits = trail - 0xDC00;
  return highBits + lowBits + 0x10000;
}