@@ -17,19 +17,42 @@ extension BPETokenizer {
17
17
18
18
/// Read merges.txt file at URL into a dictionary mapping bigrams to the line number/rank/priority
///
/// The file is scanned as raw bytes and cut into lines at every `\n`. The rank stored
/// for a pair is the zero-based index of the line it appeared on; lines that
/// `parseMergesLine` rejects with `nil` still consume an index. If the same pair
/// appears more than once, the rank of its last occurrence is kept.
///
/// - Parameter url: Location of the merges file.
/// - Returns: A dictionary from each parsed token pair to its line index.
/// - Throws: Any error raised by `Data(contentsOf:)`, or whatever `parseMergesLine`
///   throws for a line it cannot parse.
static func readMerges(url: URL) throws -> [TokenPair: Int] {
    let data = try Data(contentsOf: url)
    let newline = UInt8(ascii: "\n")
    var merges = [TokenPair: Int]()
    var index = 0
    var line = [UInt8]()
    for byte in data {
        if byte == newline {
            if let pair = try parseMergesLine(line, index: index) {
                merges[pair] = index
            }
            // Reuse the same buffer for the next line instead of reallocating.
            line.removeAll(keepingCapacity: true)
            index += 1
        } else {
            line.append(byte)
        }
    }

    // A file whose last line is not newline-terminated leaves that line in the
    // buffer; parse it here so the final merge rule is not silently dropped.
    if !line.isEmpty, let pair = try parseMergesLine(line, index: index) {
        merges[pair] = index
    }

    return merges
}
38
+
39
/// Parse one line of a merges file, supplied as raw bytes.
///
/// - Parameters:
///   - line: The bytes of a single line.
///   - index: Zero-based index of the line; reported one-based in the thrown error.
/// - Returns: The token pair built from the two space-separated fields, or `nil`
///   when the line is empty or its first byte is `#`.
/// - Throws: `FileReadError.invalidMergeFileLine` when splitting the line on spaces
///   does not yield exactly two fields.
static func parseMergesLine(_ line: [UInt8], index: Int) throws -> TokenPair? {
    // Nothing to parse on an empty line or one that starts with '#'.
    guard let leadingByte = line.first, leadingByte != UInt8(ascii: "#") else {
        return nil
    }

    let fields = line.split(separator: UInt8(ascii: " "))
    guard fields.count == 2 else {
        throw FileReadError.invalidMergeFileLine(index + 1)
    }

    let first = String(bytes: fields[0])
    let second = String(bytes: fields[1])
    return TokenPair(first, second)
}
49
+ }
50
+
51
extension String {
    /// Creates a string by copying `bytes` directly into the new string's storage.
    ///
    /// - Parameter bytes: The code units to copy; the storage is sized to `bytes.count`.
    init(bytes: some Collection<UInt8>) {
        let byteCount = bytes.count
        self.init(unsafeUninitializedCapacity: byteCount) { storage in
            // `initialize(fromContentsOf:)` returns the index just past the last
            // element written; buffer indices start at 0, so this is the byte count.
            return storage.initialize(fromContentsOf: bytes)
        }
    }
}
0 commit comments