@@ -101,6 +101,72 @@ class File extends BinaryStream {
101
101
"Ccaron " , "ccaron " , "dmacron "
102
102
);
103
103
104
+ private function uniord (string $ c , string $ encoding = null ) {
105
+ if (function_exists ("mb_ord " )) {
106
+ if (PHP_VERSION_ID < 80000 && $ encoding === null ) {
107
+ // in PHP < 8 the encoding argument, if supplied, must be a valid encoding
108
+ $ encoding = "UTF-8 " ;
109
+ }
110
+ return mb_ord ($ c , $ encoding );
111
+ }
112
+
113
+ if ($ encoding != "UTF-8 " && $ encoding !== null ) {
114
+ $ c = mb_convert_encoding ($ c , "UTF-8 " , $ encoding );
115
+ }
116
+
117
+ $ length = mb_strlen (mb_substr ($ c , 0 , 1 ), '8bit ' );
118
+ $ ord = false ;
119
+ $ bytes = [];
120
+ $ numbytes = 1 ;
121
+ for ($ i = 0 ; $ i < $ length ; $ i ++) {
122
+ $ o = \ord ($ c [$ i ]); // get one string character at time
123
+ if (\count ($ bytes ) === 0 ) { // get starting octect
124
+ if ($ o <= 0x7F ) {
125
+ $ ord = $ o ;
126
+ $ numbytes = 1 ;
127
+ } elseif (($ o >> 0x05 ) === 0x06 ) { // 2 bytes character (0x06 = 110 BIN)
128
+ $ bytes [] = ($ o - 0xC0 ) << 0x06 ;
129
+ $ numbytes = 2 ;
130
+ } elseif (($ o >> 0x04 ) === 0x0E ) { // 3 bytes character (0x0E = 1110 BIN)
131
+ $ bytes [] = ($ o - 0xE0 ) << 0x0C ;
132
+ $ numbytes = 3 ;
133
+ } elseif (($ o >> 0x03 ) === 0x1E ) { // 4 bytes character (0x1E = 11110 BIN)
134
+ $ bytes [] = ($ o - 0xF0 ) << 0x12 ;
135
+ $ numbytes = 4 ;
136
+ } else {
137
+ $ ord = false ;
138
+ break ;
139
+ }
140
+ } elseif (($ o >> 0x06 ) === 0x02 ) { // bytes 2, 3 and 4 must start with 0x02 = 10 BIN
141
+ $ bytes [] = $ o - 0x80 ;
142
+ if (\count ($ bytes ) === $ numbytes ) {
143
+ // compose UTF-8 bytes to a single unicode value
144
+ $ o = $ bytes [0 ];
145
+ for ($ j = 1 ; $ j < $ numbytes ; $ j ++) {
146
+ $ o += ($ bytes [$ j ] << (($ numbytes - $ j - 1 ) * 0x06 ));
147
+ }
148
+ if ((($ o >= 0xD800 ) and ($ o <= 0xDFFF )) or ($ o >= 0x10FFFF )) {
149
+ // The definition of UTF-8 prohibits encoding character numbers between
150
+ // U+D800 and U+DFFF, which are reserved for use with the UTF-16
151
+ // encoding form (as surrogate pairs) and do not directly represent
152
+ // characters.
153
+ return false ;
154
+ } else {
155
+ $ ord = $ o ; // add char to array
156
+ }
157
+ // reset data for next char
158
+ $ bytes = [];
159
+ $ numbytes = 1 ;
160
+ }
161
+ } else {
162
+ $ ord = false ;
163
+ break ;
164
+ }
165
+ }
166
+
167
+ return $ ord ;
168
+ }
169
+
104
170
function getTable () {
105
171
$ this ->parseTableEntries ();
106
172
@@ -157,7 +223,7 @@ function utf8toUnicode($str) {
157
223
function getUnicodeCharMap () {
158
224
$ subtable = null ;
159
225
foreach ($ this ->getData ("cmap " , "subtables " ) as $ _subtable ) {
160
- if ($ _subtable ["platformID " ] == 0 || $ _subtable ["platformID " ] == 3 && $ _subtable ["platformSpecificID " ] == 1 ) {
226
+ if ($ _subtable ["platformID " ] == 0 || ( $ _subtable ["platformID " ] == 3 && $ _subtable ["platformSpecificID " ] == 1 ) ) {
161
227
$ subtable = $ _subtable ;
162
228
break ;
163
229
}
@@ -167,6 +233,51 @@ function getUnicodeCharMap() {
167
233
return $ subtable ["glyphIndexArray " ];
168
234
}
169
235
236
+ $ system_encodings = mb_list_encodings ();
237
+ $ system_encodings = array_change_key_case (array_fill_keys ($ system_encodings , true ), CASE_UPPER );
238
+ foreach ($ this ->getData ("cmap " , "subtables " ) as $ _subtable ) {
239
+ $ encoding = null ;
240
+ switch ($ _subtable ["platformID " ]) {
241
+ case 3 :
242
+ switch ($ _subtable ["platformSpecificID " ]) {
243
+ case 2 :
244
+ if (\array_key_exists ("SJIS " , $ system_encodings )) {
245
+ $ encoding = "SJIS " ;
246
+ }
247
+ break ;
248
+ case 3 :
249
+ if (\array_key_exists ("GB18030 " , $ system_encodings )) {
250
+ $ encoding = "GB18030 " ;
251
+ }
252
+ break ;
253
+ case 4 :
254
+ if (\array_key_exists ("BIG-5 " , $ system_encodings )) {
255
+ $ encoding = "BIG-5 " ;
256
+ }
257
+ break ;
258
+ case 5 :
259
+ if (\array_key_exists ("UHC " , $ system_encodings )) {
260
+ $ encoding = "UHC " ;
261
+ }
262
+ break ;
263
+ }
264
+ break ;
265
+ }
266
+ if ($ encoding ) {
267
+ $ glyphIndexArray = array ();
268
+ foreach ($ _subtable ["glyphIndexArray " ] as $ c => $ gid ) {
269
+ $ str = trim (pack ("N " , $ c ));
270
+ if (\strlen ($ str ) > 0 ) {
271
+ $ ord = $ this ->uniord ($ str , $ encoding );
272
+ if ($ ord > 0 ) {
273
+ $ glyphIndexArray [$ ord ] = $ gid ;
274
+ }
275
+ }
276
+ }
277
+ return $ glyphIndexArray ;
278
+ }
279
+ }
280
+
170
281
return null ;
171
282
}
172
283
0 commit comments