1
+ /**
2
+ * @module
3
+ * Provides utility functions for working with UTF-8 encoded characters in TypeScript.
4
+ * It includes methods for determining the byte length of UTF-8 characters, converting bytes to Unicode code points,
5
+ * extracting code points from buffers, and dealing with UTF-16 code units in strings.
6
+ *
7
+ * @example
8
+ * ```ts
9
+ * import { getByteLength, bytesToCodePoint, bytesToCodePointFromBuffer, codePointAt } from 'jsr:@okikio/codepoint-iterator/byte_methods';
10
+ *
11
+ * // Determine the byte length of a UTF-8 encoded character
12
+ * const leadByte = 0xF0; // Leading byte of a 4-byte UTF-8 character
13
+ * console.log(getByteLength(leadByte)); // Expected output: 4
14
+ *
15
+ * // Convert a sequence of UTF-8 bytes to a Unicode code point
16
+ * const bytes = [0xF0, 0x9F, 0x92, 0xA9]; // UTF-8 encoded representation of the 💩 emoji
17
+ * console.log(bytesToCodePoint(4, bytes)); // Expected output: 128169 (code point for 💩)
18
+ *
19
+ * // Extract a Unicode code point from a buffer
20
+ * const buffer = new Uint8Array([0xF0, 0x9F, 0x92, 0xA9]);
21
+ * console.log(bytesToCodePointFromBuffer(4, buffer, 0)); // Expected output: 128169
22
+ *
23
+ * // Calculate the Unicode code point of a character in a string
24
+ * const str = '🌍';
25
+ * console.log(codePointAt(str, 0)); // Expected output: 127757 (code point for 🌍)
26
+ * ```
27
+ */
28
+
1
29
import {
2
30
BITS_FOR_2B ,
3
31
BITS_FOR_3B ,
@@ -18,12 +46,15 @@ import {
18
46
/**
19
47
* Calculates the number of bytes required to represent a single UTF-8 character.
20
48
*
21
- * UTF-8 can be represented by 1 to 4 bytes.
49
+ * Determines the byte length of a UTF-8 encoded character based on its leading byte.
50
+ * This is crucial for correctly interpreting or encoding text in UTF-8,
51
+ * where characters may vary in byte length from 1 to 4 bytes.
52
+ *
22
53
* This function given the byte value of the leading byte for the utf-8 character
23
54
* calculates how many more bytes are required to represent the utf-8 character,
24
55
* this allows emojis and other symbols to be represented in utf-8.
25
56
*
26
- * @param byte - The lead byte of a UTF-8 character.
57
+ * @param byte The lead byte of a UTF-8 character.
27
58
* @returns The number of bytes in a Uint8Array required to represent the UTF-8 character (the number of bytes ranges from 1 to 4).
28
59
*/
29
60
export function getByteLength ( byte : number ) : number {
@@ -37,8 +68,8 @@ export function getByteLength(byte: number): number {
37
68
}
38
69
39
70
/**
40
- * UTF-8 bytes to codepoint.
41
- * Calculates the Unicode code point from the bytes of a UTF-8 character .
71
+ * Converts a sequence of bytes into a Unicode code point. This function is a key part of
72
+ * decoding UTF-8 encoded text, as it translates the raw bytes back into the characters they represent .
42
73
*
43
74
* UTF-8 can be represented by 1 to 4 bytes.
44
75
* This function given the byte length of the utf-8 character
@@ -48,10 +79,10 @@ export function getByteLength(byte: number): number {
48
79
* Due to the dynamic length of utf-8 characters,
49
80
* it's faster to just grab the bytes from the Uint8Array and then calculate its codepoint
50
81
* than trying to decode said Uint8Array into a string and then converting
51
- * said string into codepoints.
82
+ * said string into codepoints.
52
83
*
53
84
* @param byteLength The number of bytes in a Uint8Array required to represent a single UTF-8 character (the number of bytes ranges from 1 to 4).
54
- * @param [ bytes] - An array of length `byteLength` bytes that make up the UTF-8 character.
85
+ * @param bytes An array of length `byteLength` bytes that make up the UTF-8 character.
55
86
* @returns The Unicode code point of the UTF-8 character.
56
87
*/
57
88
export function bytesToCodePoint ( byteLength : number , [ byte1 , byte2 , byte3 , byte4 ] : number [ ] ) : number {
@@ -79,16 +110,20 @@ export function bytesToCodePoint(byteLength: number, [byte1, byte2, byte3, byte4
79
110
MASK_FOR_1B & byte4
80
111
81
112
// 1-byte UTF-8 sequence (fallback)
113
+ // Default to 1-byte sequence if length is unexpected
82
114
: byte1
83
115
) ;
84
116
}
85
117
86
- /**
87
- * Calculates the Unicode code point from a given buffer using indexed access.
88
- * @param byteLength - The number of bytes representing the code point.
89
- * @param buffer - The Uint8Array buffer containing the bytes.
90
- * @param head - The starting index of the code point in the buffer.
91
- * @returns The calculated Unicode code point.
118
+ /**
119
+ * Extracts a Unicode code point from a given buffer starting at a specified index.
120
+ * This method is useful for parsing a stream or array of data where UTF-8 characters
121
+ * are embedded within a larger set of binary data.
122
+ *
123
+ * @param byteLength The byte length of the UTF-8 encoded character to extract.
124
+ * @param buffer The buffer (array or Uint8Array) containing the UTF-8 data.
125
+ * @param head The index in the buffer where the UTF-8 encoded character starts.
126
+ * @returns The Unicode code point extracted from the buffer.
92
127
*/
93
128
export function bytesToCodePointFromBuffer < T extends number = number > (
94
129
byteLength : number ,
@@ -121,23 +156,26 @@ export function bytesToCodePointFromBuffer<T extends number = number>(
121
156
MASK_FOR_1B & buffer [ ( head + 3 ) % bufferSize ]
122
157
) ;
123
158
default :
159
+ // Default case for unexpected byteLength
124
160
return buffer [ head ] ;
125
161
}
126
162
}
127
163
128
164
/**
129
165
* Extracts the Unicode code point from a string at a given position.
130
- * @param str - The input string.
131
- * @param index - The position in the string to extract the code point from.
132
- * @returns A number represent the code point in UTF-16 code units.
166
+ *
167
+ * Calculates the Unicode code point of a character at a specific index in a string,
168
+ * taking into account UTF-16 encoding which may represent characters using one or two code units (surrogates).
169
+ * This function is particularly useful for strings containing emoji or other characters
170
+ * that may be represented as surrogate pairs in JavaScript.
171
+ *
172
+ * @param str The string to extract the code point from.
173
+ * @param index The index of the character within the string.
174
+ * @returns The Unicode code point of the character, considering potential surrogate pairs.
133
175
*/
134
- export function codePointAt ( str : string , index : number ) : number {
176
+ export function codePointAt ( str : string , index : number ) : number | undefined {
135
177
const size = str . length ;
136
-
137
- // Account for out-of-bounds indices:
138
- if ( index < 0 || index >= size ) {
139
- return undefined ;
140
- }
178
+ if ( index < 0 || index >= size ) return undefined ; // Guard clause for out-of-bounds index
141
179
142
180
// Get the first code unit
143
181
const first = str . charCodeAt ( index ) ;
@@ -174,9 +212,10 @@ export function codePointAt(str: string, index: number): number {
174
212
// Use bitwise shift instead of multiplication and addition
175
213
// Bitwise left shift (<< 10) is used here as an efficient way to multiply by 2^10 (or 2**10) (or 1024).
176
214
// This is equivalent to the expression (first - 0xD800) * 0x400, since 0x400 in decimal is 1024.
177
- return ( ( first - 0xD800 ) << 10 ) + ( second - 0xDC00 ) + 0x10000 ;
215
+ return ( ( first - 0xD800 ) << 10 ) + ( second - 0xDC00 ) + 0x10000 ; // Calculate and return surrogate pair code point
178
216
}
179
217
}
180
218
219
+ // Return the code unit if not a surrogate pair
181
220
return first ;
182
221
}
0 commit comments