# raw_features.py
import re
import lief
import hashlib
import numpy as np
from sklearn.feature_extraction import FeatureHasher

class FeatureType(object):
''' Base class from which each feature type may inherit '''
name = ''
dim = 0
def __repr__(self):
return '{}({})'.format(self.name, self.dim)
def raw_features(self, bytez, lief_binary):
''' Generate a JSON-able representation of the file '''
        raise NotImplementedError
def process_raw_features(self, raw_obj):
''' Generate a feature vector from the raw features '''
        raise NotImplementedError
def feature_vector(self, bytez, lief_binary):
''' Directly calculate the feature vector from the sample itself. This should only be implemented differently
if there are significant speedups to be gained from combining the two functions. '''
return self.process_raw_features(self.raw_features(bytez, lief_binary))
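
# --- Illustrative sketch (not part of the original module) ---
# The FeatureType contract: subclasses implement raw_features() to emit a
# JSON-able object and process_raw_features() to turn that object into a
# fixed-length vector. FileSize below is a hypothetical example, not a
# feature the module defines.
class FileSize(FeatureType):
    name = 'filesize'
    dim = 1

    def raw_features(self, bytez, lief_binary):
        # lief_binary is unused; the byte length alone is the raw feature
        return {'size': len(bytez)}

    def process_raw_features(self, raw_obj):
        return np.asarray([raw_obj['size']], dtype=np.float32)
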
class ByteHistogram(FeatureType):
''' Byte histogram (count + non-normalized) over the entire binary file '''
name = 'histogram'
dim = 256
def __init__(self):
        super().__init__()
def raw_features(self, bytez, lief_binary):
counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256)
return counts.tolist()
def process_raw_features(self, raw_obj):
counts = np.array(raw_obj, dtype=np.float32)
sum_ = counts.sum()
normalized = counts / sum_
return normalized
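
# --- Illustrative sketch (not part of the original module) ---
# A minimal demo of ByteHistogram on synthetic bytes: raw_features() counts
# byte values with np.bincount and process_raw_features() rescales the 256
# counts into a distribution that sums to 1. `sample_bytes` is made up.
def _demo_byte_histogram():
    sample_bytes = b'\x00\x00\xff' * 100
    vec = ByteHistogram().feature_vector(sample_bytes, None)
    assert vec.shape == (256,) and abs(float(vec.sum()) - 1.0) < 1e-5
    return vec
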
class ByteEntropyHistogram(FeatureType):
''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015).
This roughly approximates the joint probability of byte value and local entropy.
See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info.
'''
name = 'byteentropy'
dim = 256
def __init__(self, step=1024, window=2048):
        super().__init__()
self.window = window
self.step = step
def _entropy_bin_counts(self, block):
# coarse histogram, 16 bytes per bin
c = np.bincount(block >> 4, minlength=16) # 16-bin histogram
p = c.astype(np.float32) / self.window
wh = np.where(c)[0]
        H = np.sum(-p[wh] * np.log2(
            p[wh])) * 2  # *2 because we halved the information: 256 bins (8 bits) down to 16 bins (4 bits)
Hbin = int(H * 2) # up to 16 bins (max entropy is 8 bits)
if Hbin == 16: # handle entropy = 8.0 bits
Hbin = 15
return Hbin, c
def raw_features(self, bytez, lief_binary):
        output = np.zeros((16, 16), dtype=int)  # np.int was removed from NumPy; plain int is equivalent here
a = np.frombuffer(bytez, dtype=np.uint8)
if a.shape[0] < self.window:
Hbin, c = self._entropy_bin_counts(a)
output[Hbin, :] += c
else:
# strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html
shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window)
strides = a.strides + (a.strides[-1],)
blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :]
# from the blocks, compute histogram
for block in blocks:
Hbin, c = self._entropy_bin_counts(block)
output[Hbin, :] += c
return output.flatten().tolist()
def process_raw_features(self, raw_obj):
counts = np.array(raw_obj, dtype=np.float32)
sum_ = counts.sum()
normalized = counts / sum_
return normalized
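
# --- Illustrative sketch (not part of the original module) ---
# A hedged demo of ByteEntropyHistogram: a constant buffer concentrates its
# mass in the lowest-entropy row of the 16x16 grid, while random bytes land
# in the highest rows. The buffer sizes are arbitrary.
def _demo_byte_entropy_histogram():
    fe = ByteEntropyHistogram()
    low = np.array(fe.raw_features(b'\x00' * 4096, None)).reshape(16, 16)
    rnd = np.array(fe.raw_features(np.random.bytes(4096), None)).reshape(16, 16)
    # row index of the dominant entropy bin: 0 for constant input, ~15 for random
    return low.sum(axis=1).argmax(), rnd.sum(axis=1).argmax()
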
class SectionInfo(FeatureType):
''' Information about section names, sizes and entropy. Uses hashing trick
to summarize all this section info into a feature vector.
'''
name = 'section'
dim = 5 + 50 + 50 + 50 + 50 + 50
def __init__(self):
        super().__init__()
@staticmethod
def _properties(s):
return [str(c).split('.')[-1] for c in s.characteristics_lists]
def raw_features(self, bytez, lief_binary):
if lief_binary is None:
return {"entry": "", "sections": []}
# properties of entry point, or if invalid, the first executable section
try:
entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name
except lief.not_found:
# bad entry point, let's find the first executable section
entry_section = ""
for s in lief_binary.sections:
if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists:
entry_section = s.name
break
raw_obj = {"entry": entry_section}
raw_obj["sections"] = [{
'name': s.name,
'size': s.size,
'entropy': s.entropy,
'vsize': s.virtual_size,
'props': self._properties(s)
} for s in lief_binary.sections]
return raw_obj
def process_raw_features(self, raw_obj):
sections = raw_obj['sections']
general = [
            len(sections),  # total number of sections
            # number of sections with a zero size
            sum(1 for s in sections if s['size'] == 0),
            # number of sections with an empty name
            sum(1 for s in sections if s['name'] == ""),
            # number of readable and executable sections
            sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']),
            # number of writable sections
            sum(1 for s in sections if 'MEM_WRITE' in s['props'])
]
# gross characteristics of each section
section_sizes = [(s['name'], s['size']) for s in sections]
section_sizes_hashed = FeatureHasher(50, input_type="pair").transform([section_sizes]).toarray()[0]
section_entropy = [(s['name'], s['entropy']) for s in sections]
section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0]
section_vsize = [(s['name'], s['vsize']) for s in sections]
section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0]
        # wrap the entry name in a list so it hashes as one token rather than per character
        entry_name_hashed = FeatureHasher(50, input_type="string").transform([[raw_obj['entry']]]).toarray()[0]
characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']]
characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0]
return np.hstack([
general, section_sizes_hashed, section_entropy_hashed, section_vsize_hashed, entry_name_hashed,
characteristics_hashed
]).astype(np.float32)
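
# --- Illustrative sketch (not part of the original module) ---
# A hedged demo of the hashing trick used above: FeatureHasher with
# input_type="pair" maps a variable-length list of (section_name, value)
# pairs into a fixed 50-dim vector, so any number of sections fits a
# constant-size feature block. The section names and sizes are made up.
def _demo_section_hashing():
    pairs = [('.text', 40960), ('.data', 4096), ('.rsrc', 1024)]
    hashed = FeatureHasher(50, input_type="pair").transform([pairs]).toarray()[0]
    assert hashed.shape == (50,)
    return hashed
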
class ImportsInfo(FeatureType):
''' Information about imported libraries and functions from the
import address table. Note that the total number of imported
functions is contained in GeneralFileInfo.
'''
name = 'imports'
dim = 1280
def __init__(self):
        super().__init__()
def raw_features(self, bytez, lief_binary):
imports = {}
if lief_binary is None:
return imports
for lib in lief_binary.imports:
if lib.name not in imports:
imports[lib.name] = [] # libraries can be duplicated in listing, extend instead of overwrite
# Clipping assumes there are diminishing returns on the discriminatory power of imported functions
# beyond the first 10000 characters, and this will help limit the dataset size
imports[lib.name].extend([entry.name[:10000] for entry in lib.entries])
return imports
def process_raw_features(self, raw_obj):
# unique libraries
libraries = list(set([l.lower() for l in raw_obj.keys()]))
libraries_hashed = FeatureHasher(256, input_type="string").transform([libraries]).toarray()[0]
# A string like "kernel32.dll:CreateFileMappingA" for each imported function
imports = [lib.lower() + ':' + e for lib, elist in raw_obj.items() for e in elist]
imports_hashed = FeatureHasher(1024, input_type="string").transform([imports]).toarray()[0]
# Two separate elements: libraries (alone) and fully-qualified names of imported functions
return np.hstack([libraries_hashed, imports_hashed]).astype(np.float32)
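
# --- Illustrative sketch (not part of the original module) ---
# A hedged demo of how ImportsInfo flattens its raw dict: library names hash
# into 256 buckets and "lib:function" strings into 1024, which together give
# the fixed 1280-dim output. The import names are made up.
def _demo_imports_hashing():
    raw = {'KERNEL32.dll': ['CreateFileMappingA', 'LoadLibraryA']}
    vec = ImportsInfo().process_raw_features(raw)
    assert vec.shape == (1280,)
    return vec
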
class ExportsInfo(FeatureType):
''' Information about exported functions. Note that the total number of exported
functions is contained in GeneralFileInfo.
'''
name = 'exports'
dim = 128
def __init__(self):
        super().__init__()
def raw_features(self, bytez, lief_binary):
if lief_binary is None:
return []
        # clip each name as in ImportsInfo so the raw feature size stays bounded
        clipped_exports = [export.name[:10000] for export in lief_binary.exported_functions]
return clipped_exports
def process_raw_features(self, raw_obj):
exports_hashed = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0]
return exports_hashed.astype(np.float32)

class GeneralFileInfo(FeatureType):
''' General information about the file '''
name = 'general'
dim = 10
def __init__(self):
        super().__init__()
def raw_features(self, bytez, lief_binary):
if lief_binary is None:
return {
'size': len(bytez),
'vsize': 0,
'has_debug': 0,
'exports': 0,
'imports': 0,
'has_relocations': 0,
'has_resources': 0,
'has_signature': 0,
'has_tls': 0,
'symbols': 0
}
return {
'size': len(bytez),
'vsize': lief_binary.virtual_size,
'has_debug': int(lief_binary.has_debug),
'exports': len(lief_binary.exported_functions),
'imports': len(lief_binary.imported_functions),
'has_relocations': int(lief_binary.has_relocations),
'has_resources': int(lief_binary.has_resources),
'has_signature': int(lief_binary.has_signature),
'has_tls': int(lief_binary.has_tls),
'symbols': len(lief_binary.symbols),
}
def process_raw_features(self, raw_obj):
return np.asarray(
[
raw_obj['size'], raw_obj['vsize'], raw_obj['has_debug'], raw_obj['exports'], raw_obj['imports'],
raw_obj['has_relocations'], raw_obj['has_resources'], raw_obj['has_signature'], raw_obj['has_tls'],
raw_obj['symbols']
],
dtype=np.float32)

class HeaderFileInfo(FeatureType):
    ''' Machine, architecture, OS, linker and other information extracted from the header '''
name = 'header'
dim = 62
def __init__(self):
        super().__init__()
def raw_features(self, bytez, lief_binary):
raw_obj = {}
raw_obj['coff'] = {'timestamp': 0, 'machine': "", 'characteristics': []}
raw_obj['optional'] = {
'subsystem': "",
'dll_characteristics': [],
'magic': "",
'major_image_version': 0,
'minor_image_version': 0,
'major_linker_version': 0,
'minor_linker_version': 0,
'major_operating_system_version': 0,
'minor_operating_system_version': 0,
'major_subsystem_version': 0,
'minor_subsystem_version': 0,
'sizeof_code': 0,
'sizeof_headers': 0,
'sizeof_heap_commit': 0
}
if lief_binary is None:
return raw_obj
raw_obj['coff']['timestamp'] = lief_binary.header.time_date_stamps
raw_obj['coff']['machine'] = str(lief_binary.header.machine).split('.')[-1]
raw_obj['coff']['characteristics'] = [str(c).split('.')[-1] for c in lief_binary.header.characteristics_list]
raw_obj['optional']['subsystem'] = str(lief_binary.optional_header.subsystem).split('.')[-1]
raw_obj['optional']['dll_characteristics'] = [
str(c).split('.')[-1] for c in lief_binary.optional_header.dll_characteristics_lists
]
raw_obj['optional']['magic'] = str(lief_binary.optional_header.magic).split('.')[-1]
raw_obj['optional']['major_image_version'] = lief_binary.optional_header.major_image_version
raw_obj['optional']['minor_image_version'] = lief_binary.optional_header.minor_image_version
raw_obj['optional']['major_linker_version'] = lief_binary.optional_header.major_linker_version
raw_obj['optional']['minor_linker_version'] = lief_binary.optional_header.minor_linker_version
raw_obj['optional'][
'major_operating_system_version'] = lief_binary.optional_header.major_operating_system_version
raw_obj['optional'][
'minor_operating_system_version'] = lief_binary.optional_header.minor_operating_system_version
raw_obj['optional']['major_subsystem_version'] = lief_binary.optional_header.major_subsystem_version
raw_obj['optional']['minor_subsystem_version'] = lief_binary.optional_header.minor_subsystem_version
raw_obj['optional']['sizeof_code'] = lief_binary.optional_header.sizeof_code
raw_obj['optional']['sizeof_headers'] = lief_binary.optional_header.sizeof_headers
raw_obj['optional']['sizeof_heap_commit'] = lief_binary.optional_header.sizeof_heap_commit
return raw_obj
def process_raw_features(self, raw_obj):
return np.hstack([
raw_obj['coff']['timestamp'],
FeatureHasher(10, input_type="string").transform([[raw_obj['coff']['machine']]]).toarray()[0],
FeatureHasher(10, input_type="string").transform([raw_obj['coff']['characteristics']]).toarray()[0],
FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['subsystem']]]).toarray()[0],
FeatureHasher(10, input_type="string").transform([raw_obj['optional']['dll_characteristics']]).toarray()[0],
FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['magic']]]).toarray()[0],
raw_obj['optional']['major_image_version'],
raw_obj['optional']['minor_image_version'],
raw_obj['optional']['major_linker_version'],
raw_obj['optional']['minor_linker_version'],
raw_obj['optional']['major_operating_system_version'],
raw_obj['optional']['minor_operating_system_version'],
raw_obj['optional']['major_subsystem_version'],
raw_obj['optional']['minor_subsystem_version'],
raw_obj['optional']['sizeof_code'],
raw_obj['optional']['sizeof_headers'],
raw_obj['optional']['sizeof_heap_commit'],
]).astype(np.float32)
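
# --- Illustrative sketch (not part of the original module) ---
# A hedged demo of the [[value]] wrapping used above: with
# input_type="string", each sample must be an iterable of tokens, so a lone
# categorical value is wrapped in its own list. Passing the bare string
# instead would hash its individual characters as separate tokens.
def _demo_categorical_hashing():
    whole_token = FeatureHasher(10, input_type="string").transform([['AMD64']]).toarray()[0]
    per_char = FeatureHasher(10, input_type="string").transform(['AMD64']).toarray()[0]
    assert whole_token.shape == per_char.shape == (10,)
    return whole_token, per_char
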

class StringExtractor(FeatureType):
''' Extracts strings from raw byte stream '''
name = 'strings'
dim = 1 + 1 + 1 + 96 + 1 + 1 + 1 + 1 + 1
def __init__(self):
        super().__init__()
# all consecutive runs of 0x20 - 0x7f that are 5+ characters
self._allstrings = re.compile(b'[\x20-\x7f]{5,}')
        # occurrences of the string 'C:\'. Not actually extracting the path
        self._paths = re.compile(b'c:\\\\', re.IGNORECASE)
        # occurrences of http:// or https://. Not actually extracting the URLs
        self._urls = re.compile(b'https?://', re.IGNORECASE)
        # occurrences of the string prefix HKEY_. Not actually extracting registry names
        self._registry = re.compile(b'HKEY_')
# crude evidence of an MZ header (dropper?) somewhere in the byte stream
self._mz = re.compile(b'MZ')
def raw_features(self, bytez, lief_binary):
allstrings = self._allstrings.findall(bytez)
if allstrings:
# statistics about strings:
string_lengths = [len(s) for s in allstrings]
avlength = sum(string_lengths) / len(string_lengths)
# map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive
as_shifted_string = [b - ord(b'\x20') for b in b''.join(allstrings)]
c = np.bincount(as_shifted_string, minlength=96) # histogram count
# distribution of characters in printable strings
csum = c.sum()
p = c.astype(np.float32) / csum
wh = np.where(c)[0]
H = np.sum(-p[wh] * np.log2(p[wh])) # entropy
else:
avlength = 0
c = np.zeros((96,), dtype=np.float32)
H = 0
csum = 0
return {
'numstrings': len(allstrings),
'avlength': avlength,
'printabledist': c.tolist(), # store non-normalized histogram
'printables': int(csum),
'entropy': float(H),
'paths': len(self._paths.findall(bytez)),
'urls': len(self._urls.findall(bytez)),
'registry': len(self._registry.findall(bytez)),
'MZ': len(self._mz.findall(bytez))
}
def process_raw_features(self, raw_obj):
hist_divisor = float(raw_obj['printables']) if raw_obj['printables'] > 0 else 1.0
return np.hstack([
raw_obj['numstrings'], raw_obj['avlength'], raw_obj['printables'],
np.asarray(raw_obj['printabledist']) / hist_divisor, raw_obj['entropy'], raw_obj['paths'], raw_obj['urls'],
raw_obj['registry'], raw_obj['MZ']
]).astype(np.float32)
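
# --- Illustrative sketch (not part of the original module) ---
# A hedged demo of StringExtractor on a handcrafted byte string holding one
# URL, one path prefix and one registry prefix; the counts come straight
# from the regexes above.
def _demo_string_extractor():
    fake = b'\x00\x01http://example.com\x00c:\\windows\x00HKEY_LOCAL_MACHINE\x00'
    raw = StringExtractor().raw_features(fake, None)
    assert raw['urls'] == 1 and raw['paths'] == 1 and raw['registry'] == 1
    return raw
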
class PEFeatureExtractor(object):
''' Extract useful features from a PE file, and return as a vector of fixed size. '''
features = [
ByteHistogram(), ByteEntropyHistogram(), GeneralFileInfo(),
HeaderFileInfo(), ExportsInfo(), SectionInfo()
] # ImportsInfo(), StringExtractor()
dim = sum([fe.dim for fe in features])
def raw_features(self, bytez):
try:
lief_binary = lief.PE.parse(list(bytez))
except (lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, RuntimeError) as e:
print("lief error: ", str(e))
lief_binary = None
        except Exception:  # everything else (e.g. ValueError) is re-raised; KeyboardInterrupt and SystemExit propagate on their own
raise
features = {"sha256": hashlib.sha256(bytez).hexdigest()}
features.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in self.features})
return features
def process_raw_features(self, raw_obj):
feature_vectors = [fe.process_raw_features(raw_obj[fe.name]) for fe in self.features]
return np.hstack(feature_vectors).astype(np.float32)
def feature_vector(self, bytez):
return self.process_raw_features(self.raw_features(bytez))
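
# --- Illustrative usage sketch (not part of the original module) ---
# A hedged end-to-end example: read a PE file from disk and produce the
# fixed-length feature vector. 'sample.exe' is a placeholder path, not a
# file this project ships.
if __name__ == '__main__':
    extractor = PEFeatureExtractor()
    with open('sample.exe', 'rb') as f:  # placeholder path
        bytez = f.read()
    vector = extractor.feature_vector(bytez)
    print(vector.shape, extractor.dim)  # vector length equals extractor.dim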