#!/usr/bin/env python3
# Copyright (c) 2015 Pawel Rozlach
# Copyright (c) 2014 Pawel Rozlach
# Copyright (c) 2014 Zadane.pl sp. z o.o.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

# Imports:
from pymisc.monitoring import ScriptStatus
from pymisc.script import RecoverableException, ScriptConfiguration, ScriptLock
import argparse
import logging
import logging.handlers as lh
import numpy
import os
import sys
import time
import yaml

# Defaults:
LOCKFILE_LOCATION = './' + os.path.basename(__file__) + '.lock'
CONFIGFILE_LOCATION = './' + os.path.basename(__file__) + '.conf'


class HistoryFile:
"""
Abstraction of all the operations on historical datapoints
This class takes care of storing, retreiving, and trimming of historical
datapoints, plus some additionall syntax checking.
Attributes:
_data: a nested hash with the data itself
_location: location of the file where data is stored betwean script runs
_max_averaging_window: please see class's init() method
_min_averaging_window: please see class's init() method
"""
_data = {}
_location = None
_max_averaging_window = None
_min_averaging_window = None
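
    # The on-disk history file mirrors the shape of cls._data. A sketch of
    # the expected YAML layout (all timestamps and values are hypothetical):
    #
    #   datapoints:
    #     memory:
    #       1420070400: 512.3
    #     disk:
    #       /var:
    #         inode:
    #           1420070400: 12345
    #         space:
    #           1420070400: 1024.5
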
@classmethod
def _remove_old_datapoints(cls):
"""
Remove all the datapoints older than cls._max_averaging_window from
the internal storage.
"""
cur_time = time.time()
averaging_border = cur_time - cls._max_averaging_window * 3600 * 24
cur_dict = cls._data['datapoints']['memory']
cls._data['datapoints']['memory'] = {x: cur_dict[x] for x in
cur_dict.keys() if x >
averaging_border}
for mountpoint in cls._data['datapoints']['disk'].keys():
for data_type in cls._data['datapoints']['disk'][mountpoint].keys():
cur_dict = cls._data['datapoints']['disk'][mountpoint][data_type]
cls._data['datapoints']['disk'][mountpoint][data_type] = \
{x: cur_dict[x] for x in cur_dict.keys()
if x > averaging_border}
@classmethod
def _verify_resource_types(cls, prefix=None, path=None, data_type=None):
        if prefix not in ['disk', 'memory']:
            raise ValueError('Unsupported prefix - only "disk" and "memory" '
                             'resources are recognized')
        if prefix == 'disk':
            if path is None or not os.path.exists(path) or \
                    data_type not in ['inode', 'space']:
                raise ValueError('a valid path and a data_type of "inode" '
                                 'or "space" are required for the "disk" '
                                 'prefix')
@classmethod
def init(cls, location, max_averaging_window, min_averaging_window):
"""
Initialize HistoryFIle class.
Class either fetches stored datapoints from the file or creates empty
storage. It takes care of setting some internal fields as well.
Args:
location: location of the file where data is stored or should be
stored. File is in YAML format.
max_averaging_window: maximum time span betwean the oldest and newest
datapoint. Points older that this are removed and are no longer
taken into consideration.
min_averaging_window: minimum time span betwean the oldest and newest
datapoint which permits calculation of the growth ratio.
"""
cls._max_averaging_window = max_averaging_window
cls._min_averaging_window = min_averaging_window
cls._location = location
try:
with open(location, 'r') as fh:
cls._data = yaml.load(fh)
except (IOError, yaml.YAMLError):
cls._data = {'datapoints': {'memory': {}, 'disk': {}}}
else:
cls._remove_old_datapoints()
@classmethod
def add_datapoint(cls, prefix, datapoint, path=None, data_type=None):
"""
Add a datapoint to the internal store.
This method takes care of some simple sanity-checking and addition of
the new datapoints.
Args:
prefix: either 'disk' or 'memory' - whether a datapoint is actually
a disk usage or memory usage
datapoint: current value of the resource
path: in case of the 'disk' resource - the path where device
relevant to the datapoint is mounted.
data_type: in case of the 'disk' respource - whether it is an inode
usage or disk space usage
Raises:
ValueError: input data is invalid
"""
cls._verify_resource_types(prefix, path, data_type)
float(datapoint)
cur_time = round(time.time())
if prefix == 'memory':
cls._data['datapoints'][prefix][cur_time] = datapoint
else:
if path not in cls._data['datapoints'][prefix].keys():
cls._data['datapoints'][prefix][path] = dict()
cls._data['datapoints'][prefix][path]['inode'] = dict()
cls._data['datapoints'][prefix][path]['space'] = dict()
cls._data['datapoints'][prefix][path][data_type][cur_time] = datapoint
@classmethod
def verify_dataspan(cls, prefix, path=None, data_type=None):
"""
Check whether we have enough data to calculate growth ratio.
This method calculates the difference between current timespan for
the given resource (memory or disk-inode or disk-space) and the
min_averaging_window.
Args:
prefix: same as for add_datapoint() method
path: same as for add_datapoint() method
data_type: same as for add_datapoint() method
Returns:
Difference expressed in number of days. If it is negative then
there is not enough data to process.
"""
cls._verify_resource_types(prefix, path, data_type)
dataspan = cls.get_dataspan(prefix, path, data_type)
return (dataspan - cls._min_averaging_window)
@classmethod
def get_dataspan(cls, prefix, path=None, data_type=None):
"""
Return the difference (in days) betwean oldest and latest data sample
for given reource type
Args:
prefix: same as for add_datapoint() method
path: same as for add_datapoint() method
data_type: same as for add_datapoint() method
Returns:
Data span for given rousource type expressed in days.
"""
cls._verify_resource_types(prefix, path, data_type)
if prefix == 'memory':
timestamps = cls._data['datapoints'][prefix].keys()
else:
timestamps = cls._data['datapoints'][prefix][path][data_type].keys()
dataspan = round((max(timestamps) - min(timestamps))/(3600*24), 2)
return dataspan
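
    # For example (hypothetical timestamps): two samples taken three days
    # apart, e.g. at t=0 and t=3 * 3600 * 24, give a dataspan of 3.0.
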
@classmethod
def get_datapoints(cls, prefix, path=None, data_type=None):
"""
Get all datapoints for given data type.
        This method returns, for the given resource type, all datapoints
        that are not older than the maximum averaging window.
Args:
prefix: same as for add_datapoint() method
path: same as for add_datapoint() method
data_type: same as for add_datapoint() method
Returns:
A dictionary with timestamps as keys and resource usages as values.
Raises:
ValueError: input data is invalid
"""
cls._verify_resource_types(prefix, path, data_type)
cls._remove_old_datapoints()
if prefix == 'disk':
datapoints = cls._data['datapoints'][prefix][path][data_type]
else:
datapoints = cls._data['datapoints'][prefix]
return datapoints
@classmethod
def clear_history(cls):
"""
Remove all datapoints.
"""
for res_type in cls._data['datapoints'].keys():
cls._data['datapoints'][res_type] = dict()
@classmethod
def save(cls):
"""
Save all the datapoints.
This method saves all datapoints not older than
(max_averaging_window - 1) * 3600 * 24 seconds to the the file provided
in init() call.
"""
cls._remove_old_datapoints()
with open(cls._location, 'w') as fh:
data = yaml.dump(cls._data, default_flow_style=False)
fh.write(data)


def fetch_memory_usage():
"""
Fetch current memory usage.
Returns:
A tuple: (memory used, memory total), in megabytes.
"""
# Calculation based on 'free' source:
# used = MemTotal - MemFree - Cached - Slab - Buffers
# total = MemTotal
with open('/proc/meminfo', 'r') as fh:
data = fh.read()
used = 0
total = 0
    # Using fh.readlines() would be more convenient but it makes testing
    # difficult
for line in data.split('\n'):
if line == '':
continue
tmp = line.split()
if tmp[0][:-1] in ['MemFree', 'Cached', 'Slab', 'Buffers']:
used -= int(tmp[1])
elif tmp[0][:-1] == 'MemTotal':
used += int(tmp[1])
total = int(tmp[1])
return round(used/1024, 2), round(total/1024, 2)
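

# A quick worked example of the formula above, using hypothetical
# /proc/meminfo values (in kB):
#
#   MemTotal: 2048000   MemFree: 512000   Cached: 256000
#   Slab:       64000   Buffers: 128000
#
#   used = 2048000 - 512000 - 256000 - 64000 - 128000 = 1088000 kB
#   fetch_memory_usage() would then return (1062.5, 2000.0) megabytes.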
def fetch_disk_usage(mountpoint):
"""
Fetch current disk usage.
Args:
mountpoint: path to mountpoint for which current usage data should be
fetched.
Returns:
A tuple: (disk usage, total disk space available), in megabytes.
"""
statvfs = os.statvfs(mountpoint)
cur_u = round(statvfs.f_frsize * (statvfs.f_blocks-statvfs.f_bavail)/1024**2, 2)
max_u = round(statvfs.f_frsize * statvfs.f_blocks/1024**2, 2)
return cur_u, max_u
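

# A minimal sketch of the statvfs arithmetic above, with hypothetical
# values: f_frsize=4096, f_blocks=1048576 and f_bavail=262144 give
#   cur_u = 4096 * (1048576 - 262144) / 1024**2 = 3072.0 MB
#   max_u = 4096 * 1048576 / 1024**2 = 4096.0 MB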
def fetch_inode_usage(mountpoint):
"""
Fetch current inode usage.
Args:
mountpoint: path to mountpoint for which current usage data should be
fetched.
Returns:
A tuple: (inode usage, total inodes available).
"""
statvfs = os.statvfs(mountpoint)
cur_u = statvfs.f_files - statvfs.f_ffree
max_u = statvfs.f_files
return cur_u, max_u


def find_planned_grow_ratio(cur_usage, max_usage, timeframe):
"""
Calculate 'ideal' growth ratio for a resource.
Units-agnostic function used to calculate ideal resource grow ratio,
basing soley on the available resources and given timeframe.
Args:
cur_usage: current resource usage
max_usage: how much of the resource there is in general
timeframe: for how long given resource should be sufficient
Returns:
See below :)
"""
return round(max_usage/timeframe, 2)
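

# For instance (hypothetical numbers): a 4096 MB volume that should last a
# year may grow by at most
#   find_planned_grow_ratio(cur_usage=1024, max_usage=4096, timeframe=365)
# which evaluates to 11.22 MB/day.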
def find_current_grow_ratio(datapoints):
"""
Find current grow ratio of the resource.
Units-agnostic function which calculates current resource grow ratio,
basing on the historic data. This is done using linear regression.
Assuming that resource growth during current timeframe can be approximed by
y = ax + b
then y is current usage, a is current growth ratio and b is usage generated
earlier, before the begining of our time window.
Args:
datapoints: a dictionary with timestamps as keys and resource usages as
values.
Returns:
resource-units/day with 2 digit precision.
"""
sorted_x = sorted(datapoints.keys())
y = numpy.array([datapoints[x] for x in sorted_x])
x = numpy.array(sorted_x)
A = numpy.vstack([x, numpy.ones(len(x))]).T
m, c = numpy.linalg.lstsq(A, y)[0]
slope, intercept = numpy.linalg.lstsq(A, y)[0]
return round(slope*3600*24, 2)
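

# A minimal sanity check (hypothetical data): two datapoints taken exactly
# one day apart, with usage growing by 10 units, should yield a ratio of
# 10.0 units/day:
#   find_current_grow_ratio({0: 0.0, 3600 * 24: 10.0}) == 10.0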
def parse_command_line():
parser = argparse.ArgumentParser(
description='Simple resource usage check',
epilog="Author: Pawel Rozlach <[email protected]>",
add_help=True,)
parser.add_argument(
'--version',
action='version',
version='1.0')
parser.add_argument(
"-c", "--config-file",
action='store',
required=True,
help="Location of the configuration file")
parser.add_argument(
"-v", "--verbose",
action='store_true',
required=False,
help="Provide extra logging messages.")
parser.add_argument(
"-s", "--std-err",
action='store_true',
required=False,
help="Log to stderr instead of syslog")
parser.add_argument(
"-d", "--clean-histdata",
action='store_true',
required=False,
help="ACK abnormal growth")
args = parser.parse_args()
return {'std_err': args.std_err,
'verbose': args.verbose,
'config_file': args.config_file,
'clean_histdata': args.clean_histdata,
}
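

# A sketch of a configuration file accepted by verify_conf() below. All
# values and paths are hypothetical; the keys match the ones fetched via
# ScriptConfiguration.get_val():
#
#   lockfile: /tmp/check_growth.lock
#   history_file: /var/tmp/check_growth.yml
#   timeframe: 365
#   max_averaging_window: 14
#   min_averaging_window: 2
#   memory_mon_enabled: true
#   memory_mon_warn_reduction: 20
#   memory_mon_crit_reduction: 40
#   disk_mon_enabled: true
#   disk_mon_warn_reduction: 20
#   disk_mon_crit_reduction: 40
#   disk_mountpoints:
#     - /
#     - /var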
def verify_conf():
msg = []
prefixes = []
timeframe = ScriptConfiguration.get_val('timeframe')
max_averaging_window = ScriptConfiguration.get_val('max_averaging_window')
min_averaging_window = ScriptConfiguration.get_val('min_averaging_window')
if timeframe <= 0:
msg.append('Timeframe should be a positive int.')
if max_averaging_window <= 0:
msg.append('Max averaging window should be a positive int.')
    if max_averaging_window >= 0.5 * timeframe:
        msg.append('Max averaging window should be less than ' +
                   '0.5 * timeframe.')
    if min_averaging_window >= max_averaging_window:
        msg.append('Max averaging window should be greater than ' +
                   'min averaging window.')
if ScriptConfiguration.get_val('memory_mon_enabled'):
prefixes.append('memory_mon_')
if ScriptConfiguration.get_val('disk_mon_enabled'):
prefixes.append('disk_mon_')
if not prefixes:
        msg.append('There should be at least one resource check enabled.')
for prefix in prefixes:
warn_reduction = ScriptConfiguration.get_val(prefix + 'warn_reduction')
crit_reduction = ScriptConfiguration.get_val(prefix + 'crit_reduction')
if warn_reduction <= 0:
msg.append(prefix + 'warn_reduction should be a positive int.')
if crit_reduction <= 0:
msg.append(prefix + 'crit_reduction should be a positive int.')
if warn_reduction >= crit_reduction:
msg.append(prefix + "warn_reduction should be lower than " +
prefix + "crit_reduction.")
if ScriptConfiguration.get_val('disk_mon_enabled'):
mountpoints = ScriptConfiguration.get_val('disk_mountpoints')
for mountpoint in mountpoints:
# ismount seems to not properly detect all mount types :/
# if not (os.path.exists(mountpoint) and os.path.ismount(mountpoint)):
if not os.path.exists(mountpoint):
msg.append('disk_mountpoint {0} '.format(mountpoint) +
'does not point to a valid mountpoint.')
# if there are problems with configuration file then there is no point
# in continuing:
if msg:
ScriptStatus.notify_immediate('unknown',
"Configuration file contains errors: " +
' '.join(msg))
# Everything is fine:
return


def main(config_file, std_err=False, verbose=True, clean_histdata=False):
"""
Main function of the script
Args:
config_file: file path of the config file to load
std_err: whether print logging output to stderr
verbose: whether to provide verbose logging messages
clean_histdata: all historical data should be cleared
"""
try:
# Configure logging:
fmt = logging.Formatter('%(filename)s[%(process)d] %(levelname)s: ' +
'%(message)s')
logger = logging.getLogger()
if verbose:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.INFO)
if std_err:
handler = logging.StreamHandler()
else:
handler = lh.SysLogHandler(address='/dev/log',
facility=lh.SysLogHandler.LOG_USER)
handler.setFormatter(fmt)
logger.addHandler(handler)
logger.debug("{0} is starting, ".format(os.path.basename(__file__)) +
"command line arguments: " +
"config_file={0}, ".format(config_file) +
"std_err={0}, ".format(std_err) +
"verbose={0}, ".format(verbose) +
"clean_histdata={0}".format(clean_histdata)
)
# FIXME - Remember to correctly configure syslog, otherwise rsyslog will
# discard messages
ScriptConfiguration.load_config(config_file)
logger.debug("Loaded configuration: " +
str(ScriptConfiguration.get_config())
)
# Initialize reporting to monitoring system:
ScriptStatus.init(nrpe_enable=True)
# Make sure that we are the only ones running on the server:
ScriptLock.init(ScriptConfiguration.get_val('lockfile'))
ScriptLock.aqquire()
# Some basic sanity checking:
verify_conf()
# We are all set, lets do some real work:
HistoryFile.init(location=ScriptConfiguration.get_val('history_file'),
max_averaging_window=ScriptConfiguration.get_val(
'max_averaging_window'),
min_averaging_window=ScriptConfiguration.get_val(
'min_averaging_window'))
if clean_histdata:
HistoryFile.clear_history()
HistoryFile.save()
ScriptStatus.notify_immediate('unknown',
'History data has been cleared.')
timeframe = ScriptConfiguration.get_val('timeframe')
# FIXME: not sure how to refactor this, copypaste does not seem the best
# solution :(
def do_status_processing(prefix, current_growth, planned_growth,
mountpoint=None, data_type=None):
            warn_thresh = 1 + (ScriptConfiguration.get_val(
                prefix + '_mon_warn_reduction')/100)
            crit_thresh = 1 + (ScriptConfiguration.get_val(
                prefix + '_mon_crit_reduction')/100)
            if prefix == 'disk' and data_type == 'inode':
                units = 'inodes/day'
            else:
                units = 'MB/day'
            if prefix == 'disk':
                rname = data_type + \
                    ' usage growth for mount {0}'.format(mountpoint)
            else:
                rname = '{0} usage growth'.format(prefix)
            rname = rname.capitalize()
            if current_growth > planned_growth * warn_thresh:
                msg = '{0} exceeds planned growth '.format(rname) + \
                      '- current: {0} {1}'.format(current_growth, units) + \
                      ', planned: {0} {1}.'.format(planned_growth, units)
                if current_growth > planned_growth * crit_thresh:
                    ScriptStatus.update('crit', msg)
                else:
                    ScriptStatus.update('warn', msg)
else:
ScriptStatus.update('ok',
'{0} is OK ({1} {2}).'.format(
rname, current_growth, units))
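
        # Threshold arithmetic above, with hypothetical settings:
        # warn_reduction=20 and crit_reduction=40 yield multipliers of 1.2
        # and 1.4, so a planned growth of 10 MB/day warns above 12 MB/day
        # and goes critical above 14 MB/day.
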
if ScriptConfiguration.get_val('memory_mon_enabled'):
cur_usage, max_usage = fetch_memory_usage()
HistoryFile.add_datapoint('memory', cur_usage)
tmp = HistoryFile.verify_dataspan('memory')
if tmp < 0:
ScriptStatus.update('unknown', 'There is not enough data ' +
'to calculate current memory ' +
'usage growth: {0} '.format(abs(tmp)) +
'days more is needed.')
else:
datapoints = HistoryFile.get_datapoints('memory')
planned_growth = find_planned_grow_ratio(cur_usage, max_usage,
timeframe)
current_growth = find_current_grow_ratio(datapoints)
logging.debug('memory -> ' +
'current_growth: {0}, '.format(current_growth) +
'planned_growth: {0}'.format(planned_growth))
do_status_processing('memory', current_growth, planned_growth)
if ScriptConfiguration.get_val('disk_mon_enabled'):
mountpoints = ScriptConfiguration.get_val('disk_mountpoints')
for dtype in ['space', 'inode']:
for mountpoint in mountpoints:
if dtype == 'inode':
cur_usage, max_usage = fetch_inode_usage(mountpoint)
else:
cur_usage, max_usage = fetch_disk_usage(mountpoint)
HistoryFile.add_datapoint('disk', cur_usage,
data_type=dtype,
path=mountpoint)
tmp = HistoryFile.verify_dataspan('disk',
data_type=dtype,
path=mountpoint)
if tmp < 0:
ScriptStatus.update('unknown',
'There is not enough data to ' +
'calculate current disk ' + dtype +
' usage growth for mountpoint ' +
'{0}: {1} '.format(
mountpoint, abs(tmp)) +
'days more is needed.')
else:
datapoints = HistoryFile.get_datapoints('disk',
data_type=dtype,
path=mountpoint)
planned_growth = find_planned_grow_ratio(cur_usage,
max_usage,
timeframe)
current_growth = find_current_grow_ratio(datapoints)
                        logging.debug('disk, ' +
                                      'mountpoint {0}, '.format(mountpoint) +
                                      'data_type {0}: '.format(dtype) +
                                      'current_growth: {0}, '.format(
                                          current_growth) +
                                      'planned_growth: {0}'.format(
                                          planned_growth))
do_status_processing('disk', current_growth, planned_growth,
mountpoint=mountpoint, data_type=dtype)
HistoryFile.save()
ScriptStatus.notify_agregated()
ScriptLock.release()
except RecoverableException as e:
msg = str(e)
logging.critical(msg)
ScriptStatus.notify_immediate('unknown', msg)
sys.exit(1)
    except AssertionError:
        # Unittests require this exception to propagate:
        raise
    except Exception as e:
        msg = "Exception occurred: {0}".format(e.__class__.__name__)
        logging.exception(msg)
        print(msg)  # We cannot use notify_immediate here :(
        sys.exit(3)