forked from librenms/librenms
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwrapper.py
694 lines (616 loc) · 23 KB
/
wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
#! /usr/bin/env python3
"""
wrapper A small tool which wraps services, discovery and poller php scripts
in order to run them as threads with Queue and workers
Authors: Orsiris de Jong <[email protected]>
Neil Lathwood <[email protected]>
Job Snijders <[email protected]>
Distributed poller code (c) 2015, GPLv3, Daniel Preussker <[email protected]>
All code parts that belong to Daniel are enclosed in EOC comments
Date: Sep 2021
Usage: This program accepts three command line arguments
- the number of threads (defaults to 1 for discovery / service, and 16 for poller)
- the wrapper type (service, discovery or poller)
- optional debug boolean
Ubuntu Linux: apt-get install python-mysqldb
FreeBSD: cd /usr/ports/*/py-MySQLdb && make install clean
RHEL 7: yum install MySQL-python
RHEL 8: dnf install mariadb-connector-c-devel gcc && python -m pip install mysqlclient
Tested on: Python 3.6.8 / PHP 7.2.11 / CentOS 8 / AlmaLinux 8.4
License: This program is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your
option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.
You should have received a copy of the GNU General Public License along
with this program. If not, see https://www.gnu.org/licenses/.
LICENSE.txt contains a copy of the full GPLv3 licensing conditions.
"""
import logging
import os
import queue
import re
import sys
import threading
import time
import uuid
from argparse import ArgumentParser
import LibreNMS
from LibreNMS.command_runner import command_runner
from LibreNMS.config import DBConfig
# Module-level logger. The __main__ section rebinds this name to the logger
# returned by LibreNMS.logger_get_logger().
logger = logging.getLogger(__name__)
# Timeout in seconds for any poller / service / discovery action per device
# Should be higher than stepping which defaults to 300
PER_DEVICE_TIMEOUT = 900
# 5 = no new discovered devices, 6 = unreachable device
VALID_EXIT_CODES = [0, 5, 6]
# Set by wrapper() from the distributed_poller* configuration keys
DISTRIBUTED_POLLING = False
# Sum of the elapsed times of all results seen by print_worker()
REAL_DURATION = 0
# Number of results processed by print_worker()
DISCOVERED_DEVICES_COUNT = 0
# Maps device_id -> elapsed seconds of its last result, filled by print_worker()
PER_DEVICE_DURATION = {}
# Incremented by poll_worker() for each device command it reports as failed
ERRORS = 0
# memcache client created by wrapper() when distributed polling is configured
MEMC = None
# Set by wrapper(): False when this process registered as master, True when
# it joined an existing master as a node
IS_NODE = None
# Stepping in seconds, assigned by wrapper() from the wrappers table or the
# rrd step configuration
STEPPING = None
# memcached key names, built by wrapper() from the wrapper type and TIME_TAG
MASTER_TAG = None
NODES_TAG = None
# Suffix appended to memcached key names; stays empty unless wrapper() sets it
TIME_TAG = ""
"""
Per wrapper type configuration
All time related variables are in seconds
"""
wrappers = {
    "service": {
        # Script name, joined to the installation directory by poll_worker()
        "executable": "check-services.php",
        # Command line token placed between the executable and the device id
        "option": "-h",
        # Table whose min/max device_id bound the lock range cleared by wrapper()
        "table_name": "services",
        # Expiry passed to memc_touch() by print_worker()
        "memc_touch_time": 10,
        # Default value of STEPPING for this wrapper type
        "stepping": 300,
        # Expiry of the nodes counter key; also the per-device duration above
        # which wrapper() reports a device as taking too long
        "nodes_stepping": 300,
        # Total run time above which wrapper() warns and exits with status 2
        "total_exec_time": 300,
    },
    "discovery": {
        "executable": "discovery.php",
        "option": "-h",
        "table_name": "devices",
        "memc_touch_time": 30,
        "stepping": 300,
        "nodes_stepping": 3600,
        "total_exec_time": 21600,
    },
    "poller": {
        "executable": "lnms",
        "option": "device:poll",
        "table_name": "devices",
        "memc_touch_time": 10,
        "stepping": 300,
        "nodes_stepping": 300,
        "total_exec_time": 300,
    },
}
"""
Threading helper functions
"""
# <<<EOC
def memc_alive(name):  # Type: str
    """
    Checks if memcache is working by injecting a random string and trying to read it again

    :param name: prefix of the temporary ``<name>.ping.<uuid>`` key
    :return: True when the value could be written and read back, False when
             it could not or when the client raised any error (this includes
             the case where no client has been created yet)
    """
    try:
        token = str(uuid.uuid4())
        ping_key = name + ".ping." + token
        MEMC.set(ping_key, token, 60)
        if MEMC.get(ping_key) == token:
            MEMC.delete(ping_key)
            return True
        return False
    except Exception:
        # Any client failure means "not alive". Catching Exception instead of
        # using a bare except lets KeyboardInterrupt / SystemExit propagate.
        return False
def memc_touch(key, _time):  # Type: str  # Type: int
    """
    Updates a memcache key wait time by storing its current value again
    with a new expiry.

    :param key: memcached key to refresh
    :param _time: expiry passed to the client's set() call
    :return: None; failures are ignored on purpose (best effort)
    """
    try:
        current = MEMC.get(key)
        MEMC.set(key, current, _time)
    except Exception:
        # Best effort: a failed refresh must not stop the caller. Exception
        # (not a bare except) so KeyboardInterrupt / SystemExit still propagate.
        pass
def get_time_tag(step):  # Type: int
    """
    Get current time tag: the current timestamp minus its remainder modulo
    the stepping (for a positive step, the start of the current interval).
    """
    now = int(time.time())
    remainder = now % step
    return now - remainder
# EOC
def print_worker(print_queue, wrapper_type):  # Type: Queue  # Type: str
    """
    Consume device results from print_queue, log them and update the run
    statistics (REAL_DURATION, PER_DEVICE_DURATION, DISCOVERED_DEVICES_COUNT).

    A separate queue and a single worker for printing information to the screen prevents
    the good old joke:
        Some people, when confronted with a problem, think,
        "I know, I'll use threads," and then they have two problems.

    While distributed polling is active the queue is polled without blocking
    so that the memcached keys can be refreshed between results; otherwise
    the worker simply blocks on the queue.

    This function never returns.

    :param print_queue: queue of [worker_id, device_id, elapsed_time, command, exit_code]
    :param wrapper_type: key into the wrappers table
    """
    global DISTRIBUTED_POLLING
    global REAL_DURATION
    global PER_DEVICE_DURATION
    global DISCOVERED_DEVICES_COUNT

    nodeso = 0
    while True:
        # <<<EOC
        if DISTRIBUTED_POLLING:
            touch_time = wrappers[wrapper_type]["memc_touch_time"]
            if not IS_NODE:
                # Master: keep the master key alive and track the node count
                memc_touch(MASTER_TAG, touch_time)
                nodes = MEMC.get(NODES_TAG)
                if nodes is None and not memc_alive(wrapper_type):
                    logger.warning(
                        "Lost Memcached. Taking over all devices. Nodes will quit shortly."
                    )
                    DISTRIBUTED_POLLING = False
                    nodes = nodeso
                # Compare by value; identity comparison of ints only works by
                # accident for small cached integers.
                if nodes != nodeso:
                    logger.info("{} Node(s) Total".format(nodes))
                    nodeso = nodes
            else:
                memc_touch(NODES_TAG, touch_time)
            try:
                (
                    worker_id,
                    device_id,
                    elapsed_time,
                    command,
                    exit_code,
                ) = print_queue.get(False)
            except queue.Empty:
                # Nothing to report yet: wait a bit, then refresh the keys again
                time.sleep(1)
                continue
        else:
            worker_id, device_id, elapsed_time, command, exit_code = print_queue.get()
        # EOC

        REAL_DURATION += elapsed_time
        PER_DEVICE_DURATION[device_id] = elapsed_time
        DISCOVERED_DEVICES_COUNT += 1
        if elapsed_time < STEPPING and exit_code in VALID_EXIT_CODES:
            logger.info(
                "worker {} finished device {} in {} seconds".format(
                    worker_id, device_id, elapsed_time
                )
            )
        else:
            logger.warning(
                "worker {} finished device {} in {} seconds with exit code {}".format(
                    worker_id, device_id, elapsed_time, exit_code
                )
            )
            logger.debug("Command was {}".format(command))
        print_queue.task_done()
def poll_worker(
    poll_queue,  # Type: Queue
    print_queue,  # Type: Queue
    config,  # Type: dict
    log_dir,  # Type: str
    wrapper_type,  # Type: str
    debug,  # Type: bool
    modules="",  # Type: string
):
    """
    This function will fork off single instances of the php process, record
    how long it takes, and push the resulting reports to the printer queue

    Runs forever, taking one device id at a time from poll_queue. In
    distributed mode a device is skipped when its memcached lock already
    exists or cannot be acquired, or when memcached is lost while running
    as a node.

    :param poll_queue: queue of device ids to process
    :param print_queue: queue receiving
                        [thread name, device_id, elapsed_time, command, exit_code]
    :param config: configuration dict; "distributed_poller_name" is read in
                   distributed mode
    :param log_dir: directory for the per-device log written when debug is set
    :param wrapper_type: key into the wrappers table
    :param debug: enable verbose output of the child command and write its
                  output to <log_dir>/<wrapper_type>_device_<id>.log
    :param modules: optional module string passed to the command with -m;
                    all whitespace is removed from it
    """
    global ERRORS

    while True:
        device_id = poll_queue.get()
        # <<<EOC
        if (
            not DISTRIBUTED_POLLING
            or MEMC.get("{}.device.{}{}".format(wrapper_type, device_id, TIME_TAG))
            is None
        ):
            if DISTRIBUTED_POLLING:
                # add() only succeeds when the key does not exist yet, so it
                # doubles as the per-device lock between nodes
                result = MEMC.add(
                    "{}.device.{}{}".format(wrapper_type, device_id, TIME_TAG),
                    config["distributed_poller_name"],
                    STEPPING,
                )
                if not result:
                    logger.info(
                        "The device {} appears to be being checked by another node".format(
                            device_id
                        )
                    )
                    poll_queue.task_done()
                    continue
                if not memc_alive(wrapper_type) and IS_NODE:
                    logger.warning(
                        "Lost Memcached, Not checking Device {} as Node. Master will check it.".format(
                            device_id
                        )
                    )
                    poll_queue.task_done()
                    continue
            # EOC
            try:
                start_time = time.time()

                device_log = os.path.join(
                    log_dir, "{}_device_{}.log".format(wrapper_type, device_id)
                )
                executable = os.path.join(
                    os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
                    wrappers[wrapper_type]["executable"],
                )
                command = "/usr/bin/env php {} {} {}".format(
                    executable, wrappers[wrapper_type]["option"], device_id
                )
                if modules is not None and len(str(modules).strip()):
                    # Raw string: "\s" in a normal literal is an invalid escape
                    module_str = re.sub(r"\s", "", str(modules).strip())
                    command = command + " -m {}".format(module_str)

                # enable debug output otherwise, set -q for lnms commands
                if wrappers[wrapper_type]["executable"] == "lnms":
                    command = command + (" -vv" if debug else " -q")
                elif debug:
                    command = command + " -d"

                exit_code, output = command_runner(
                    command,
                    shell=True,
                    timeout=PER_DEVICE_TIMEOUT,
                    valid_exit_codes=VALID_EXIT_CODES,
                )
                # NOTE(review): exit code 5 is listed in VALID_EXIT_CODES but is
                # still counted as an error here, as before - confirm intent.
                if exit_code not in [0, 6]:
                    logger.error(
                        "Thread {} exited with code {}".format(
                            threading.current_thread().name, exit_code
                        )
                    )
                    ERRORS += 1
                    logger.error(output)
                elif exit_code == 6:
                    # 6 is the "unreachable device" code. The previous test for
                    # 5 could never match because 5 is handled by the branch above.
                    logger.info("Unreachable device {}".format(device_id))
                    logger.debug(output)
                else:
                    logger.debug(output)
                if debug:
                    with open(device_log, "w", encoding="utf-8") as dev_log_file:
                        dev_log_file.write(output)

                elapsed_time = int(time.time() - start_time)
                print_queue.put(
                    [
                        threading.current_thread().name,
                        device_id,
                        elapsed_time,
                        command,
                        exit_code,
                    ]
                )
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception:
                # Keep the worker alive: log the failure and go on with the
                # next device
                logger.error("Unknown problem happened: ")
                logger.error("Traceback:", exc_info=True)
        poll_queue.task_done()
def wrapper(
    wrapper_type,  # Type: str
    amount_of_workers,  # Type: int
    config,  # Type: dict
    log_dir,  # Type: str
    _debug=False,  # Type: bool
    **kwargs,  # Type: dict, may contain modules
):  # -> None
    """
    Actual code that runs various php scripts, in single node mode or distributed poller mode

    Steps: set up the stepping and memcached key names, optionally register
    as master or node in memcached, load the device ids from the database,
    run them through a pool of poll_worker threads, log a summary, release
    the distributed state and update the pollers table (poller only).

    :param wrapper_type: key into the wrappers table
    :param amount_of_workers: number of worker threads; capped to the number
                              of devices
    :param config: configuration dict
    :param log_dir: directory handed to the workers for per-device logs
    :param _debug: passed to the workers as their debug flag
    :param kwargs: may contain "modules", passed to the workers
    :raises SystemExit: status 3 for an unknown wrapper type; status 2 when
                        this system is already registered as master or when
                        the run took longer than the configured
                        total_exec_time
    """
    global MEMC
    global IS_NODE
    global DISTRIBUTED_POLLING
    global MASTER_TAG
    global NODES_TAG
    global TIME_TAG
    global STEPPING

    # Reject unknown types before the wrappers table lookup below, which
    # would otherwise fail with a KeyError instead of the intended exit code
    if wrapper_type not in wrappers:
        logger.critical("Bogus wrapper type called")
        sys.exit(3)

    # Setup wrapper dependent variables
    STEPPING = wrappers[wrapper_type]["stepping"]
    if wrapper_type == "poller":
        if "rrd" in config and "step" in config["rrd"]:
            STEPPING = config["rrd"]["step"]
        # NOTE(review): the time tag is assumed to be poller-only (TIME_TAG
        # defaults to "" for the other types) - confirm.
        TIME_TAG = "." + str(get_time_tag(STEPPING))

    MASTER_TAG = "{}.master{}".format(wrapper_type, TIME_TAG)
    NODES_TAG = "{}.nodes{}".format(wrapper_type, TIME_TAG)

    # <<<EOC
    if "distributed_poller_group" in config:
        poller_group = str(config["distributed_poller_group"])
    else:
        poller_group = False

    if (
        "distributed_poller" in config
        and "distributed_poller_memcached_host" in config
        and "distributed_poller_memcached_port" in config
        and config["distributed_poller"]
    ):
        try:
            import memcache

            MEMC = memcache.Client(
                [
                    config["distributed_poller_memcached_host"]
                    + ":"
                    + str(config["distributed_poller_memcached_port"])
                ]
            )
            if str(MEMC.get(MASTER_TAG)) == config["distributed_poller_name"]:
                logger.info("This system is already joined as the service master.")
                sys.exit(2)
            if memc_alive(wrapper_type):
                if MEMC.get(MASTER_TAG) is None:
                    logger.info("Registered as Master")
                    MEMC.set(MASTER_TAG, config["distributed_poller_name"], 10)
                    MEMC.set(NODES_TAG, 0, wrappers[wrapper_type]["nodes_stepping"])
                    IS_NODE = False
                else:
                    logger.info(
                        "Registered as Node joining Master {}".format(
                            MEMC.get(MASTER_TAG)
                        )
                    )
                    IS_NODE = True
                    MEMC.incr(NODES_TAG)
                DISTRIBUTED_POLLING = True
            else:
                logger.warning(
                    "Could not connect to memcached, disabling distributed service checks."
                )
                DISTRIBUTED_POLLING = False
                IS_NODE = False
        except SystemExit:
            raise
        except ImportError:
            logger.critical("ERROR: missing memcache python module:")
            logger.critical("On deb systems: apt-get install python3-memcache")
            logger.critical("On other systems: pip3 install python-memcached")
            logger.critical("Disabling distributed discovery.")
            DISTRIBUTED_POLLING = False
    else:
        DISTRIBUTED_POLLING = False
    # EOC

    s_time = time.time()

    devices_list = []

    if wrapper_type == "service":
        # <<<EOC
        if poller_group is not False:
            query = (
                "SELECT DISTINCT(services.device_id) FROM services LEFT JOIN devices ON "
                "services.device_id = devices.device_id WHERE devices.poller_group IN({}) AND "
                "devices.disabled = 0".format(poller_group)
            )
        else:
            query = (
                "SELECT DISTINCT(services.device_id) FROM services LEFT JOIN devices ON "
                "services.device_id = devices.device_id WHERE devices.disabled = 0"
            )
        # EOC
    elif wrapper_type in ["discovery", "poller"]:
        # The devices are ordered by last_polled_timetaken, slowest first.
        # Putting the devices likely to be slow at the top of the queue
        # raises the chances of completing _all_ the work in about the time
        # it takes to process the slowest device.
        # <<<EOC
        if poller_group is not False:
            query = (
                "SELECT device_id FROM devices WHERE poller_group IN ({}) AND "
                "disabled = 0 ORDER BY last_polled_timetaken DESC".format(poller_group)
            )
        else:
            query = "SELECT device_id FROM devices WHERE disabled = 0 ORDER BY last_polled_timetaken DESC"
        # EOC
    else:
        # Defensive: only reachable for a type that is present in the
        # wrappers table but has no query defined here
        logger.critical("Bogus wrapper type called")
        sys.exit(3)

    maxlocks = 0
    minlocks = 0
    sconfig = DBConfig()
    sconfig.populate(config)
    db_connection = LibreNMS.DB(sconfig)
    cursor = db_connection.query(query)
    devices = cursor.fetchall()
    for row in devices:
        devices_list.append(int(row[0]))

    # <<<EOC
    if DISTRIBUTED_POLLING and not IS_NODE:
        # The master remembers the device id range so it can clear the
        # per-device locks once the run is over
        query = "SELECT max(device_id),min(device_id) FROM {}".format(
            wrappers[wrapper_type]["table_name"]
        )
        cursor = db_connection.query(query)
        devices = cursor.fetchall()
        maxlocks = devices[0][0] or 0
        minlocks = devices[0][1] or 0
    # EOC

    poll_queue = queue.Queue()
    print_queue = queue.Queue()

    # Don't have more threads than workers
    amount_of_devices = len(devices_list)
    if amount_of_workers > amount_of_devices:
        amount_of_workers = amount_of_devices

    logger.info(
        "starting the {} check at {} with {} threads for {} devices".format(
            wrapper_type,
            time.strftime("%Y-%m-%d %H:%M:%S"),
            amount_of_workers,
            amount_of_devices,
        )
    )

    for device_id in devices_list:
        poll_queue.put(device_id)

    for _ in range(amount_of_workers):
        # daemon=True replaces the deprecated Thread.setDaemon(True)
        worker = threading.Thread(
            target=poll_worker,
            kwargs={
                "poll_queue": poll_queue,
                "print_queue": print_queue,
                "config": config,
                "log_dir": log_dir,
                "wrapper_type": wrapper_type,
                "debug": _debug,
                "modules": kwargs.get("modules", ""),
            },
            daemon=True,
        )
        worker.start()

    pworker = threading.Thread(
        target=print_worker,
        kwargs={"print_queue": print_queue, "wrapper_type": wrapper_type},
        daemon=True,
    )
    pworker.start()

    # Wait until every device has been processed and every result reported
    poll_queue.join()
    print_queue.join()

    total_time = int(time.time() - s_time)

    end_msg = "{}-wrapper checked {} devices in {} seconds with {} workers with {} errors".format(
        wrapper_type, DISCOVERED_DEVICES_COUNT, total_time, amount_of_workers, ERRORS
    )
    if ERRORS == 0:
        logger.info(end_msg)
    else:
        logger.error(end_msg)

    # <<<EOC
    if DISTRIBUTED_POLLING or memc_alive(wrapper_type):
        master = MEMC.get(MASTER_TAG)
        if master == config["distributed_poller_name"] and not IS_NODE:
            logger.info("Wait for all service-nodes to finish")
            nodes = MEMC.get(NODES_TAG)
            while nodes is not None and nodes > 0:
                try:
                    time.sleep(1)
                    nodes = MEMC.get(NODES_TAG)
                except Exception:
                    # Keep waiting on client errors, but let
                    # KeyboardInterrupt / SystemExit end the wait
                    pass
            logger.info("Clearing Locks for {}".format(NODES_TAG))
            x = minlocks
            while x <= maxlocks:
                # Same key layout as the one poll_worker() uses to take the
                # lock; without TIME_TAG the time-tagged locks were never
                # matched
                MEMC.delete("{}.device.{}{}".format(wrapper_type, x, TIME_TAG))
                x = x + 1
            logger.info("{} Locks Cleared".format(x))
            logger.info("Clearing Nodes")
            MEMC.delete(MASTER_TAG)
            MEMC.delete(NODES_TAG)
        else:
            MEMC.decr(NODES_TAG)
        logger.info("Finished {}.".format(time.strftime("%Y-%m-%d %H:%M:%S")))
    # EOC

    # Update poller statistics
    if wrapper_type == "poller":
        query = "UPDATE pollers SET last_polled=NOW(), devices='{}', time_taken='{}' WHERE poller_name='{}'".format(
            DISCOVERED_DEVICES_COUNT, total_time, config["distributed_poller_name"]
        )
        cursor = db_connection.query(query)
        if cursor.rowcount < 1:
            # No row was updated: create the entry for this poller
            query = "INSERT INTO pollers SET poller_name='{}', last_polled=NOW(), devices='{}', time_taken='{}'".format(
                config["distributed_poller_name"], DISCOVERED_DEVICES_COUNT, total_time
            )
            db_connection.query(query)

    db_connection.close()

    if total_time > wrappers[wrapper_type]["total_exec_time"]:
        logger.warning(
            "the process took more than {} seconds to finish, you need faster hardware or more threads".format(
                wrappers[wrapper_type]["total_exec_time"]
            )
        )
        logger.warning(
            "in sequential style service checks the elapsed time would have been: {} seconds".format(
                REAL_DURATION
            )
        )
        show_stopper = False
        for device in PER_DEVICE_DURATION:
            if PER_DEVICE_DURATION[device] > wrappers[wrapper_type]["nodes_stepping"]:
                logger.warning(
                    "device {} is taking too long: {} seconds".format(
                        device, PER_DEVICE_DURATION[device]
                    )
                )
                show_stopper = True
        if show_stopper:
            logger.error(
                "Some devices are taking more than {} seconds, the script cannot recommend you what to do.".format(
                    wrappers[wrapper_type]["nodes_stepping"]
                )
            )
        else:
            recommend = int(total_time / STEPPING * amount_of_workers + 1)
            logger.warning(
                "Consider setting a minimum of {} threads. (This does not constitute professional advice!)".format(
                    recommend
                )
            )
        sys.exit(2)
if __name__ == "__main__":
    # Command line entry point: parse the arguments, load the LibreNMS
    # configuration, set up the log file and start the wrapper.
    parser = ArgumentParser(
        prog="wrapper.py",
        # argparse adds the "usage: " prefix itself
        usage="%(prog)s [options] <wrapper_type> [<workers>]\n"
        "wrapper_type = 'service', 'poller' or 'discovery'\n"
        "workers defaults to 1 for service and discovery, and 16 for poller "
        "(Do not set too high, or you will get an OOM)",
        description="Spawn multiple librenms php processes in parallel.",
    )
    parser.add_argument(
        "-d",
        "--debug",
        action="store_true",
        default=False,
        help="Enable debug output. WARNING: Leaving this enabled will consume a lot of disk space.",
    )
    parser.add_argument(
        "-m",
        "--modules",
        default="",
        help="Enable passing of a module string, modules are separated by comma",
    )
    parser.add_argument(
        dest="wrapper",
        default=None,
        help="Execute wrapper for 'service', 'poller' or 'discovery'",
    )
    # Optional positional value. With action="store_true" the positional
    # consumed no argument, so a worker count could never be given.
    parser.add_argument(
        dest="threads",
        action="store",
        nargs="?",
        default=None,
        help="Number of workers",
    )

    args = parser.parse_args()

    debug = args.debug
    modules = args.modules or ""
    wrapper_type = args.wrapper
    amount_of_workers = args.threads

    if wrapper_type not in ["service", "discovery", "poller"]:
        # parser.error() prints the message and exits with status 2
        parser.error("Invalid wrapper type '{}'".format(wrapper_type))

    config = LibreNMS.get_config_data(
        os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    )
    log_dir = config["log_dir"]
    log_file = os.path.join(log_dir, wrapper_type + ".log")
    logger = LibreNMS.logger_get_logger(log_file, debug=debug)

    # Defaults to 1 for service/discovery, 16 for poller
    default_workers = 16 if wrapper_type == "poller" else 1
    if amount_of_workers is None:
        amount_of_workers = default_workers
    else:
        try:
            amount_of_workers = int(amount_of_workers)
        except (ValueError, TypeError):
            amount_of_workers = 0
        # Fewer than one worker would leave the device queue unprocessed
        if amount_of_workers < 1:
            amount_of_workers = default_workers
            logger.warning(
                "Bogus number of workers given. Using default number ({}) of workers.".format(
                    amount_of_workers
                )
            )

    if wrapper_type in ["discovery", "poller"]:
        modules_validated = modules
    else:
        modules_validated = ""  # ignore module parameter

    wrapper(
        wrapper_type,
        amount_of_workers,
        config,
        log_dir,
        _debug=debug,
        modules=modules_validated,
    )