From 47387de03cbe129fd6d5811afc122559276a29aa Mon Sep 17 00:00:00 2001 From: kayn Date: Thu, 28 Aug 2014 13:13:06 +0200 Subject: [PATCH 1/9] initial commit - added testing lag report --- mongodb.py | 290 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 232 insertions(+), 58 deletions(-) diff --git a/mongodb.py b/mongodb.py index 25410eb..5700f6b 100644 --- a/mongodb.py +++ b/mongodb.py @@ -35,6 +35,178 @@ def submit(self, type, instance, value, db=None): v.values = [value, ] v.dispatch() + def set_read_preference(db): + if pymongo.version >= "2.1": + db.read_preference = pymongo.ReadPreference.SECONDARY + + #def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd): + def check_rep_lag(): + user = self.mongo_user + passwd = self.mongo_password + # Get mongo to tell us replica set member name when connecting locally + con = Connection(host=self.mongo_host, port=self.mongo_port, slave_okay=True) + db = con[self.mongo_db[0]] +# if "127.0.0.1" == host: +# host = con.admin.command("ismaster","1")["me"].split(':')[0] + + if percent: + warning = warning or 50 + critical = critical or 75 + else: + warning = warning or 600 + critical = critical or 3600 + rs_status = {} + slaveDelays = {} + try: + set_read_preference(con.admin) + + # Get replica set status + try: + rs_status = con.admin.command("replSetGetStatus") + except pymongo.errors.OperationFailure, e: + if e.code == None and str(e).find('failed: not running with --replSet"'): + print "OK - Not running with replSet" + return 0 + + serverVersion = tuple(con.server_info()['version'].split('.')) + if serverVersion >= tuple("2.0.0".split(".")): + # + # check for version greater then 2.0 + # + rs_conf = con.local.system.replset.find_one() + for member in rs_conf['members']: + if member.get('slaveDelay') is not None: + slaveDelays[member['host']] = member.get('slaveDelay') + else: + slaveDelays[member['host']] = 0 + + # Find the primary and/or the current node + 
primary_node = None + host_node = None + + for member in rs_status["members"]: + if member["stateStr"] == "PRIMARY": + primary_node = member + if member["name"].split(':')[0] == host and int(member["name"].split(':')[1]) == port: + host_node = member + + # Check if we're in the middle of an election and don't have a primary + if primary_node is None: + print "WARNING - No primary defined. In an election?" + return 1 + + # Check if we failed to find the current host + # below should never happen + if host_node is None: + print "CRITICAL - Unable to find host '" + host + "' in replica set." + return 2 + # Is the specified host the primary? + if host_node["stateStr"] == "PRIMARY": + if max_lag == False: + print "OK - This is the primary." + return 0 + else: + #get the maximal replication lag + data = "" + maximal_lag = 0 + for member in rs_status['members']: + if not member['stateStr'] == "ARBITER": + lastSlaveOpTime = member['optimeDate'] + replicationLag = abs(primary_node["optimeDate"] - lastSlaveOpTime).seconds - slaveDelays[member['name']] + data = data + member['name'] + " lag=%d;" % replicationLag + maximal_lag = max(maximal_lag, replicationLag) + if percent: + err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) + if err != 0: + return err + primary_timediff = replication_get_time_diff(con) + maximal_lag = int(float(maximal_lag) / float(primary_timediff) * 100) + message = "Maximal lag is " + str(maximal_lag) + " percents" + message += performance_data(perf_data, [(maximal_lag, "replication_lag_percent", warning, critical)]) + else: + message = "Maximal lag is " + str(maximal_lag) + " seconds" + message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)]) + return check_levels(maximal_lag, warning, critical, message) + elif host_node["stateStr"] == "ARBITER": + print "OK - This is an arbiter" + return 0 + + # Find the difference in optime between current node 
and PRIMARY + + optime_lag = abs(primary_node["optimeDate"] - host_node["optimeDate"]) + + if host_node['name'] in slaveDelays: + slave_delay = slaveDelays[host_node['name']] + elif host_node['name'].endswith(':27017') and host_node['name'][:-len(":27017")] in slaveDelays: + slave_delay = slaveDelays[host_node['name'][:-len(":27017")]] + else: + raise Exception("Unable to determine slave delay for {0}".format(host_node['name'])) + + try: # work starting from python2.7 + lag = optime_lag.total_seconds() + except: + lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600) + + if percent: + err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) + if err != 0: + return err + primary_timediff = replication_get_time_diff(con) + if primary_timediff != 0: + lag = int(float(lag) / float(primary_timediff) * 100) + else: + lag = 0 + message = "Lag is " + str(lag) + " percents" + message += performance_data(perf_data, [(lag, "replication_lag_percent", warning, critical)]) + else: + message = "Lag is " + str(lag) + " seconds" + message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)]) + return check_levels(lag, warning + slaveDelays[host_node['name']], critical + slaveDelays[host_node['name']], message) + else: + # + # less than 2.0 check + # + # Get replica set status + rs_status = con.admin.command("replSetGetStatus") + + # Find the primary and/or the current node + primary_node = None + host_node = None + for member in rs_status["members"]: + if member["stateStr"] == "PRIMARY": + primary_node = (member["name"], member["optimeDate"]) + if member["name"].split(":")[0].startswith(host): + host_node = member + + # Check if we're in the middle of an election and don't have a primary + if primary_node is None: + print "WARNING - No primary defined. In an election?" + sys.exit(1) + + # Is the specified host the primary? 
+ if host_node["stateStr"] == "PRIMARY": + print "OK - This is the primary." + sys.exit(0) + + # Find the difference in optime between current node and PRIMARY + optime_lag = abs(primary_node[1] - host_node["optimeDate"]) + lag = optime_lag.seconds + if percent: + err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1])) + if err != 0: + return err + primary_timediff = replication_get_time_diff(con) + lag = int(float(lag) / float(primary_timediff) * 100) + message = "Lag is " + str(lag) + " percents" + message += performance_data(perf_data, [(lag, "replication_lag_percent", warning, critical)]) + else: + message = "Lag is " + str(lag) + " seconds" + message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)]) + return check_levels(lag, warning, critical, message) + + except Exception, e: + return exit_with_general_critical(e) + def do_server_status(self): con = Connection(host=self.mongo_host, port=self.mongo_port, slave_okay=True) db = con[self.mongo_db[0]] @@ -45,63 +217,64 @@ def do_server_status(self): version = server_status['version'] at_least_2_4 = V(version) >= V('2.4.0') - # operations - for k, v in server_status['opcounters'].items(): - self.submit('total_operations', k, v) - - # memory - for t in ['resident', 'virtual', 'mapped']: - self.submit('memory', t, server_status['mem'][t]) - - # connections - self.submit('connections', 'connections', server_status['connections']['current']) +# # operations +# for k, v in server_status['opcounters'].items(): +# self.submit('total_operations', k, v) +# +# # memory +# for t in ['resident', 'virtual', 'mapped']: +# self.submit('memory', t, server_status['mem'][t]) +# +# # connections +# self.submit('connections', 'connections', server_status['connections']['current']) +# +# # locks +# if self.lockTotalTime is not None and self.lockTime is not None: +# if self.lockTime == server_status['globalLock']['lockTime']: +# value = 0.0 +# else: +# value = 
float(server_status['globalLock']['lockTime'] - self.lockTime) * 100.0 / float(server_status['globalLock']['totalTime'] - self.lockTotalTime) +# self.submit('percent', 'lock_ratio', value) +# +# self.lockTotalTime = server_status['globalLock']['totalTime'] +# self.lockTime = server_status['globalLock']['lockTime'] +# +# # indexes +# accesses = None +# misses = None +# index_counters = server_status['indexCounters'] if at_least_2_4 else server_status['indexCounters']['btree'] +# +# if self.accesses is not None: +# accesses = index_counters['accesses'] - self.accesses +# if accesses < 0: +# accesses = None +# misses = (index_counters['misses'] or 0) - (self.misses or 0) +# if misses < 0: +# misses = None +# if accesses and misses is not None: +# self.submit('cache_ratio', 'cache_misses', int(misses * 100 / float(accesses))) +# else: +# self.submit('cache_ratio', 'cache_misses', 0) +# self.accesses = index_counters['accesses'] +# self.misses = index_counters['misses'] +# +# for mongo_db in self.mongo_db: +# db = con[mongo_db] +# if self.mongo_user and self.mongo_password: +# db.authenticate(self.mongo_user, self.mongo_password) +# db_stats = db.command('dbstats') +# +# # stats counts +# self.submit('counter', 'object_count', db_stats['objects'], mongo_db) +# self.submit('counter', 'collections', db_stats['collections'], mongo_db) +# self.submit('counter', 'num_extents', db_stats['numExtents'], mongo_db) +# self.submit('counter', 'indexes', db_stats['indexes'], mongo_db) +# +# # stats sizes +# self.submit('file_size', 'storage', db_stats['storageSize'], mongo_db) +# self.submit('file_size', 'index', db_stats['indexSize'], mongo_db) +# self.submit('file_size', 'data', db_stats['dataSize'], mongo_db) - # locks - if self.lockTotalTime is not None and self.lockTime is not None: - if self.lockTime == server_status['globalLock']['lockTime']: - value = 0.0 - else: - value = float(server_status['globalLock']['lockTime'] - self.lockTime) * 100.0 / 
float(server_status['globalLock']['totalTime'] - self.lockTotalTime) - self.submit('percent', 'lock_ratio', value) - - self.lockTotalTime = server_status['globalLock']['totalTime'] - self.lockTime = server_status['globalLock']['lockTime'] - - # indexes - accesses = None - misses = None - index_counters = server_status['indexCounters'] if at_least_2_4 else server_status['indexCounters']['btree'] - - if self.accesses is not None: - accesses = index_counters['accesses'] - self.accesses - if accesses < 0: - accesses = None - misses = (index_counters['misses'] or 0) - (self.misses or 0) - if misses < 0: - misses = None - if accesses and misses is not None: - self.submit('cache_ratio', 'cache_misses', int(misses * 100 / float(accesses))) - else: - self.submit('cache_ratio', 'cache_misses', 0) - self.accesses = index_counters['accesses'] - self.misses = index_counters['misses'] - - for mongo_db in self.mongo_db: - db = con[mongo_db] - if self.mongo_user and self.mongo_password: - db.authenticate(self.mongo_user, self.mongo_password) - db_stats = db.command('dbstats') - - # stats counts - self.submit('counter', 'object_count', db_stats['objects'], mongo_db) - self.submit('counter', 'collections', db_stats['collections'], mongo_db) - self.submit('counter', 'num_extents', db_stats['numExtents'], mongo_db) - self.submit('counter', 'indexes', db_stats['indexes'], mongo_db) - - # stats sizes - self.submit('file_size', 'storage', db_stats['storageSize'], mongo_db) - self.submit('file_size', 'index', db_stats['indexSize'], mongo_db) - self.submit('file_size', 'data', db_stats['dataSize'], mongo_db) con.disconnect() @@ -121,5 +294,6 @@ def config(self, obj): collectd.warning("mongodb plugin: Unkown configuration key %s" % node.key) mongodb = MongoDB() -collectd.register_read(mongodb.do_server_status) -collectd.register_config(mongodb.config) +#collectd.register_read(mongodb.do_server_status) +mongodb.check_rep_lag() +#collectd.register_config(mongodb.config) From 
decb578f8eae3b6b29cd5da9f60235cf01c8938e Mon Sep 17 00:00:00 2001 From: kayn Date: Thu, 28 Aug 2014 13:22:00 +0200 Subject: [PATCH 2/9] added interpreter --- mongodb.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mongodb.py b/mongodb.py index 5700f6b..22b4af7 100644 --- a/mongodb.py +++ b/mongodb.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python # # Plugin to collectd statistics from MongoDB # From c5d3aa5c09c858b4cca4d104e1c88cd14acb7cc8 Mon Sep 17 00:00:00 2001 From: kayn Date: Thu, 28 Aug 2014 16:36:19 +0200 Subject: [PATCH 3/9] prepared to testing --- mongodb.py | 415 +++++++++++++++++++++++++++-------------------------- 1 file changed, 209 insertions(+), 206 deletions(-) diff --git a/mongodb.py b/mongodb.py index 22b4af7..fd5d711 100644 --- a/mongodb.py +++ b/mongodb.py @@ -3,16 +3,68 @@ # Plugin to collectd statistics from MongoDB # -import collectd +import imp + +foo = imp.load_source('collectd', '/root/mongodb/collectd.py') +collectd = foo +#import collectd +import pymongo from pymongo import Connection from distutils.version import StrictVersion as V +def set_read_preference(db): + if pymongo.version >= "2.1": + db.read_preference = pymongo.ReadPreference.SECONDARY + +def mongo_connect(host=None, port=None, ssl=False, user=None, passwd=None, replica=None): + try: + # ssl connection for pymongo > 2.3 + if pymongo.version >= "2.3": + if replica is None: + con = pymongo.MongoClient(host, port) + else: + con = pymongo.Connection(host, port, read_preference=pymongo.ReadPreference.SECONDARY, ssl=ssl, replicaSet=replica, network_timeout=10) + else: + if replica is None: + con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10) + else: + con = pymongo.Connection(host, port, slave_okay=True, network_timeout=10) + #con = pymongo.Connection(host, port, slave_okay=True, replicaSet=replica, network_timeout=10) + + if user and passwd: + db = con["admin"] + if not db.authenticate(user, passwd): + sys.exit("Username/Password incorrect") + except Exception, 
e: + if isinstance(e, pymongo.errors.AutoReconnect) and str(e).find(" is an arbiter") != -1: + # We got a pymongo AutoReconnect exception that tells us we connected to an Arbiter Server + # This means: Arbiter is reachable and can answer requests/votes - this is all we need to know from an arbiter + print "OK - State: 7 (Arbiter)" + sys.exit(0) + return exit_with_general_critical(e), None + return 0, con + +def replication_get_time_diff(con): + col = 'oplog.rs' + local = con.local + ol = local.system.namespaces.find_one({"name": "local.oplog.$main"}) + if ol: + col = 'oplog.$main' + firstc = local[col].find().sort("$natural", 1).limit(1) + lastc = local[col].find().sort("$natural", -1).limit(1) + first = firstc.next() + last = lastc.next() + tfirst = first["ts"] + tlast = last["ts"] + delta = tlast.time - tfirst.time + return delta + class MongoDB(object): def __init__(self): self.plugin_name = "mongo" - self.mongo_host = "127.0.0.1" + self.mongo_host = "172.18.3.21" self.mongo_port = 27017 self.mongo_db = ["admin", ] self.mongo_user = None @@ -36,31 +88,21 @@ def submit(self, type, instance, value, db=None): v.values = [value, ] v.dispatch() - def set_read_preference(db): - if pymongo.version >= "2.1": - db.read_preference = pymongo.ReadPreference.SECONDARY #def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd): - def check_rep_lag(): + def check_rep_lag(self, percent): + host = self.mongo_host + port = self.mongo_port user = self.mongo_user passwd = self.mongo_password + perf_data = False # Get mongo to tell us replica set member name when connecting locally con = Connection(host=self.mongo_host, port=self.mongo_port, slave_okay=True) db = con[self.mongo_db[0]] -# if "127.0.0.1" == host: -# host = con.admin.command("ismaster","1")["me"].split(':')[0] - if percent: - warning = warning or 50 - critical = critical or 75 - else: - warning = warning or 600 - critical = critical or 3600 rs_status = {} slaveDelays = {} try: - 
set_read_preference(con.admin) - # Get replica set status try: rs_status = con.admin.command("replSetGetStatus") @@ -69,144 +111,103 @@ def check_rep_lag(): print "OK - Not running with replSet" return 0 - serverVersion = tuple(con.server_info()['version'].split('.')) - if serverVersion >= tuple("2.0.0".split(".")): - # - # check for version greater then 2.0 - # - rs_conf = con.local.system.replset.find_one() - for member in rs_conf['members']: - if member.get('slaveDelay') is not None: - slaveDelays[member['host']] = member.get('slaveDelay') - else: - slaveDelays[member['host']] = 0 - - # Find the primary and/or the current node - primary_node = None - host_node = None - - for member in rs_status["members"]: - if member["stateStr"] == "PRIMARY": - primary_node = member - if member["name"].split(':')[0] == host and int(member["name"].split(':')[1]) == port: - host_node = member - - # Check if we're in the middle of an election and don't have a primary - if primary_node is None: - print "WARNING - No primary defined. In an election?" - return 1 - - # Check if we failed to find the current host - # below should never happen - if host_node is None: - print "CRITICAL - Unable to find host '" + host + "' in replica set." - return 2 - # Is the specified host the primary? - if host_node["stateStr"] == "PRIMARY": - if max_lag == False: - print "OK - This is the primary." 
- return 0 - else: - #get the maximal replication lag - data = "" - maximal_lag = 0 - for member in rs_status['members']: - if not member['stateStr'] == "ARBITER": - lastSlaveOpTime = member['optimeDate'] - replicationLag = abs(primary_node["optimeDate"] - lastSlaveOpTime).seconds - slaveDelays[member['name']] - data = data + member['name'] + " lag=%d;" % replicationLag - maximal_lag = max(maximal_lag, replicationLag) - if percent: - err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) - if err != 0: - return err - primary_timediff = replication_get_time_diff(con) - maximal_lag = int(float(maximal_lag) / float(primary_timediff) * 100) - message = "Maximal lag is " + str(maximal_lag) + " percents" - message += performance_data(perf_data, [(maximal_lag, "replication_lag_percent", warning, critical)]) - else: - message = "Maximal lag is " + str(maximal_lag) + " seconds" - message += performance_data(perf_data, [(maximal_lag, "replication_lag", warning, critical)]) - return check_levels(maximal_lag, warning, critical, message) - elif host_node["stateStr"] == "ARBITER": - print "OK - This is an arbiter" + rs_conf = con.local.system.replset.find_one() + for member in rs_conf['members']: + if member.get('slaveDelay') is not None: + slaveDelays[member['host']] = member.get('slaveDelay') + else: + slaveDelays[member['host']] = 0 + + # Find the primary and/or the current node + primary_node = None + host_node = None + + for member in rs_status["members"]: + if member["stateStr"] == "PRIMARY": + primary_node = member + if member["name"].split(':')[0] == host and int(member["name"].split(':')[1]) == port: + host_node = member + + # Check if we're in the middle of an election and don't have a primary + if primary_node is None: + print "WARNING - No primary defined. In an election?" 
+ return 1 + + # Check if we failed to find the current host + # below should never happen + if host_node is None: + print "CRITICAL - Unable to find host '" + host + "' in replica set." + return 2 + # Is the specified host the primary? + if host_node["stateStr"] == "PRIMARY": + if max_lag == False: + print "OK - This is the primary." return 0 - - # Find the difference in optime between current node and PRIMARY - - optime_lag = abs(primary_node["optimeDate"] - host_node["optimeDate"]) - - if host_node['name'] in slaveDelays: - slave_delay = slaveDelays[host_node['name']] - elif host_node['name'].endswith(':27017') and host_node['name'][:-len(":27017")] in slaveDelays: - slave_delay = slaveDelays[host_node['name'][:-len(":27017")]] else: - raise Exception("Unable to determine slave delay for {0}".format(host_node['name'])) - - try: # work starting from python2.7 - lag = optime_lag.total_seconds() - except: - lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600) - - if percent: - err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) - if err != 0: - return err - primary_timediff = replication_get_time_diff(con) - if primary_timediff != 0: - lag = int(float(lag) / float(primary_timediff) * 100) + #get the maximal replication lag + data = "" + maximal_lag = 0 + for member in rs_status['members']: + if not member['stateStr'] == "ARBITER": + lastSlaveOpTime = member['optimeDate'] + replicationLag = abs(primary_node["optimeDate"] - lastSlaveOpTime).seconds - slaveDelays[member['name']] + data = data + member['name'] + " lag=%d;" % replicationLag + maximal_lag = max(maximal_lag, replicationLag) + if percent: + err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) + if err != 0: + return err + primary_timediff = replication_get_time_diff(con) + maximal_lag = int(float(maximal_lag) / float(primary_timediff) * 100) + message = 
"Maximal lag is " + str(maximal_lag) + " percents" + print message + self.submit('replication', 'maximal-lag-percentage', str(maximal_lag)) else: - lag = 0 - message = "Lag is " + str(lag) + " percents" - message += performance_data(perf_data, [(lag, "replication_lag_percent", warning, critical)]) - else: - message = "Lag is " + str(lag) + " seconds" - message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)]) - return check_levels(lag, warning + slaveDelays[host_node['name']], critical + slaveDelays[host_node['name']], message) + message = "Maximal lag is " + str(maximal_lag) + " seconds" + print message + self.submit('replication', 'maximal-lag-seconds', str(maximal_lag)) + return str(maximal_lag) + elif host_node["stateStr"] == "ARBITER": + print "OK - This is an arbiter" + return 0 + + # Find the difference in optime between current node and PRIMARY + + optime_lag = abs(primary_node["optimeDate"] - host_node["optimeDate"]) + + if host_node['name'] in slaveDelays: + slave_delay = slaveDelays[host_node['name']] + elif host_node['name'].endswith(':27017') and host_node['name'][:-len(":27017")] in slaveDelays: + slave_delay = slaveDelays[host_node['name'][:-len(":27017")]] else: - # - # less than 2.0 check - # - # Get replica set status - rs_status = con.admin.command("replSetGetStatus") - - # Find the primary and/or the current node - primary_node = None - host_node = None - for member in rs_status["members"]: - if member["stateStr"] == "PRIMARY": - primary_node = (member["name"], member["optimeDate"]) - if member["name"].split(":")[0].startswith(host): - host_node = member - - # Check if we're in the middle of an election and don't have a primary - if primary_node is None: - print "WARNING - No primary defined. In an election?" - sys.exit(1) - - # Is the specified host the primary? - if host_node["stateStr"] == "PRIMARY": - print "OK - This is the primary." 
- sys.exit(0) - - # Find the difference in optime between current node and PRIMARY - optime_lag = abs(primary_node[1] - host_node["optimeDate"]) - lag = optime_lag.seconds - if percent: - err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1])) - if err != 0: - return err - primary_timediff = replication_get_time_diff(con) + raise Exception("Unable to determine slave delay for {0}".format(host_node['name'])) + + try: # work starting from python2.7 + lag = optime_lag.total_seconds() + except: + lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600) + + if percent: + err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) + if err != 0: + return err + primary_timediff = replication_get_time_diff(con) + if primary_timediff != 0: lag = int(float(lag) / float(primary_timediff) * 100) - message = "Lag is " + str(lag) + " percents" - message += performance_data(perf_data, [(lag, "replication_lag_percent", warning, critical)]) else: - message = "Lag is " + str(lag) + " seconds" - message += performance_data(perf_data, [(lag, "replication_lag", warning, critical)]) - return check_levels(lag, warning, critical, message) + lag = 0 + message = "Lag is " + str(lag) + " percents" + print message + self.submit('replication', 'lag-percentage', str(maximal_lag)) + else: + message = "Lag is " + str(lag) + " seconds" + print message + self.submit('replication', 'lag-seconds', str(maximal_lag)) + return str(lag) + #return check_levels(lag, warning + slaveDelays[host_node['name']], critical + slaveDelays[host_node['name']], message) except Exception, e: - return exit_with_general_critical(e) + return e def do_server_status(self): con = Connection(host=self.mongo_host, port=self.mongo_port, slave_okay=True) @@ -218,63 +219,63 @@ def do_server_status(self): version = server_status['version'] at_least_2_4 = V(version) >= V('2.4.0') -# # operations -# for k, v in 
server_status['opcounters'].items(): -# self.submit('total_operations', k, v) -# -# # memory -# for t in ['resident', 'virtual', 'mapped']: -# self.submit('memory', t, server_status['mem'][t]) -# -# # connections -# self.submit('connections', 'connections', server_status['connections']['current']) -# -# # locks -# if self.lockTotalTime is not None and self.lockTime is not None: -# if self.lockTime == server_status['globalLock']['lockTime']: -# value = 0.0 -# else: -# value = float(server_status['globalLock']['lockTime'] - self.lockTime) * 100.0 / float(server_status['globalLock']['totalTime'] - self.lockTotalTime) -# self.submit('percent', 'lock_ratio', value) -# -# self.lockTotalTime = server_status['globalLock']['totalTime'] -# self.lockTime = server_status['globalLock']['lockTime'] -# -# # indexes -# accesses = None -# misses = None -# index_counters = server_status['indexCounters'] if at_least_2_4 else server_status['indexCounters']['btree'] -# -# if self.accesses is not None: -# accesses = index_counters['accesses'] - self.accesses -# if accesses < 0: -# accesses = None -# misses = (index_counters['misses'] or 0) - (self.misses or 0) -# if misses < 0: -# misses = None -# if accesses and misses is not None: -# self.submit('cache_ratio', 'cache_misses', int(misses * 100 / float(accesses))) -# else: -# self.submit('cache_ratio', 'cache_misses', 0) -# self.accesses = index_counters['accesses'] -# self.misses = index_counters['misses'] -# -# for mongo_db in self.mongo_db: -# db = con[mongo_db] -# if self.mongo_user and self.mongo_password: -# db.authenticate(self.mongo_user, self.mongo_password) -# db_stats = db.command('dbstats') -# -# # stats counts -# self.submit('counter', 'object_count', db_stats['objects'], mongo_db) -# self.submit('counter', 'collections', db_stats['collections'], mongo_db) -# self.submit('counter', 'num_extents', db_stats['numExtents'], mongo_db) -# self.submit('counter', 'indexes', db_stats['indexes'], mongo_db) -# -# # stats sizes -# 
self.submit('file_size', 'storage', db_stats['storageSize'], mongo_db) -# self.submit('file_size', 'index', db_stats['indexSize'], mongo_db) -# self.submit('file_size', 'data', db_stats['dataSize'], mongo_db) + # operations + for k, v in server_status['opcounters'].items(): + self.submit('total_operations', k, v) + + # memory + for t in ['resident', 'virtual', 'mapped']: + self.submit('memory', t, server_status['mem'][t]) + + # connections + self.submit('connections', 'connections', server_status['connections']['current']) + + # locks + if self.lockTotalTime is not None and self.lockTime is not None: + if self.lockTime == server_status['globalLock']['lockTime']: + value = 0.0 + else: + value = float(server_status['globalLock']['lockTime'] - self.lockTime) * 100.0 / float(server_status['globalLock']['totalTime'] - self.lockTotalTime) + self.submit('percent', 'lock_ratio', value) + + self.lockTotalTime = server_status['globalLock']['totalTime'] + self.lockTime = server_status['globalLock']['lockTime'] + + # indexes + accesses = None + misses = None + index_counters = server_status['indexCounters'] if at_least_2_4 else server_status['indexCounters']['btree'] + + if self.accesses is not None: + accesses = index_counters['accesses'] - self.accesses + if accesses < 0: + accesses = None + misses = (index_counters['misses'] or 0) - (self.misses or 0) + if misses < 0: + misses = None + if accesses and misses is not None: + self.submit('cache_ratio', 'cache_misses', int(misses * 100 / float(accesses))) + else: + self.submit('cache_ratio', 'cache_misses', 0) + self.accesses = index_counters['accesses'] + self.misses = index_counters['misses'] + + for mongo_db in self.mongo_db: + db = con[mongo_db] + if self.mongo_user and self.mongo_password: + db.authenticate(self.mongo_user, self.mongo_password) + db_stats = db.command('dbstats') + + # stats counts + self.submit('counter', 'object_count', db_stats['objects'], mongo_db) + self.submit('counter', 'collections', 
db_stats['collections'], mongo_db) + self.submit('counter', 'num_extents', db_stats['numExtents'], mongo_db) + self.submit('counter', 'indexes', db_stats['indexes'], mongo_db) + + # stats sizes + self.submit('file_size', 'storage', db_stats['storageSize'], mongo_db) + self.submit('file_size', 'index', db_stats['indexSize'], mongo_db) + self.submit('file_size', 'data', db_stats['dataSize'], mongo_db) con.disconnect() @@ -295,6 +296,8 @@ def config(self, obj): collectd.warning("mongodb plugin: Unkown configuration key %s" % node.key) mongodb = MongoDB() -#collectd.register_read(mongodb.do_server_status) -mongodb.check_rep_lag() -#collectd.register_config(mongodb.config) +collectd.register_read(mongodb.do_server_status) +# lag in seconds +collectd.register_read(mongodb.check_rep_lag(True)) +collectd.register_read(mongodb.check_rep_lag(False)) +collectd.register_config(mongodb.config) From 02706291bd46f727ecf43bb99d2b73db4640918d Mon Sep 17 00:00:00 2001 From: kayn Date: Mon, 1 Sep 2014 08:51:35 +0200 Subject: [PATCH 4/9] remove condition for percent --- mongodb.py | 68 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/mongodb.py b/mongodb.py index fd5d711..6c543be 100644 --- a/mongodb.py +++ b/mongodb.py @@ -64,7 +64,7 @@ class MongoDB(object): def __init__(self): self.plugin_name = "mongo" - self.mongo_host = "172.18.3.21" + self.mongo_host = "127.0.0.1" self.mongo_port = 27017 self.mongo_db = ["admin", ] self.mongo_user = None @@ -90,7 +90,7 @@ def submit(self, type, instance, value, db=None): #def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd): - def check_rep_lag(self, percent): + def check_rep_lag(self): host = self.mongo_host port = self.mongo_port user = self.mongo_user @@ -153,19 +153,21 @@ def check_rep_lag(self, percent): replicationLag = abs(primary_node["optimeDate"] - lastSlaveOpTime).seconds - slaveDelays[member['name']] data = data + 
member['name'] + " lag=%d;" % replicationLag maximal_lag = max(maximal_lag, replicationLag) - if percent: - err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) - if err != 0: - return err - primary_timediff = replication_get_time_diff(con) - maximal_lag = int(float(maximal_lag) / float(primary_timediff) * 100) - message = "Maximal lag is " + str(maximal_lag) + " percents" - print message - self.submit('replication', 'maximal-lag-percentage', str(maximal_lag)) - else: - message = "Maximal lag is " + str(maximal_lag) + " seconds" - print message - self.submit('replication', 'maximal-lag-seconds', str(maximal_lag)) + + # send message with maximal lag + message = "Maximal lag is " + str(maximal_lag) + " seconds" + print message + self.submit('replication', 'maximal-lag-seconds', str(maximal_lag)) + + # send message with maximal lag in percentage + err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) + if err != 0: + return err + primary_timediff = replication_get_time_diff(con) + maximal_lag = int(float(maximal_lag) / float(primary_timediff) * 100) + message = "Maximal lag is " + str(maximal_lag) + " percents" + print message + self.submit('replication', 'maximal-lag-percentage', str(maximal_lag)) return str(maximal_lag) elif host_node["stateStr"] == "ARBITER": print "OK - This is an arbiter" @@ -187,22 +189,23 @@ def check_rep_lag(self, percent): except: lag = float(optime_lag.seconds + optime_lag.days * 24 * 3600) - if percent: - err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) - if err != 0: - return err - primary_timediff = replication_get_time_diff(con) - if primary_timediff != 0: - lag = int(float(lag) / float(primary_timediff) * 100) - else: - lag = 0 - message = "Lag is " + str(lag) + " percents" - print message - self.submit('replication', 
'lag-percentage', str(maximal_lag)) + # send message with lag + message = "Lag is " + str(lag) + " seconds" + print message + self.submit('replication', 'lag-seconds', str(maximal_lag)) + + # send message with lag in percentage + err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) + if err != 0: + return err + primary_timediff = replication_get_time_diff(con) + if primary_timediff != 0: + lag = int(float(lag) / float(primary_timediff) * 100) else: - message = "Lag is " + str(lag) + " seconds" - print message - self.submit('replication', 'lag-seconds', str(maximal_lag)) + lag = 0 + message = "Lag is " + str(lag) + " percents" + print message + self.submit('replication', 'lag-percentage', str(maximal_lag)) return str(lag) #return check_levels(lag, warning + slaveDelays[host_node['name']], critical + slaveDelays[host_node['name']], message) @@ -297,7 +300,6 @@ def config(self, obj): mongodb = MongoDB() collectd.register_read(mongodb.do_server_status) -# lag in seconds -collectd.register_read(mongodb.check_rep_lag(True)) -collectd.register_read(mongodb.check_rep_lag(False)) +# lag in seconds and percentage +collectd.register_read(mongodb.check_rep_lag) collectd.register_config(mongodb.config) From 53f3a532a62bcf5bbf33a3da3e8ea104bc98bd5e Mon Sep 17 00:00:00 2001 From: kayn Date: Tue, 2 Sep 2014 16:27:54 +0200 Subject: [PATCH 5/9] added part for lag from mzupan/nagios-plugin-mongodb --- mongodb.py | 163 ++++++++++++++++++++++++++--------------------------- 1 file changed, 80 insertions(+), 83 deletions(-) diff --git a/mongodb.py b/mongodb.py index 6c543be..31280d1 100644 --- a/mongodb.py +++ b/mongodb.py @@ -3,11 +3,7 @@ # Plugin to collectd statistics from MongoDB # -import imp - -foo = imp.load_source('collectd', '/root/mongodb/collectd.py') -collectd = foo -#import collectd +import collectd import pymongo from pymongo import Connection from distutils.version import StrictVersion as V @@ -89,16 
+85,80 @@ def submit(self, type, instance, value, db=None): v.dispatch() - #def check_rep_lag(con, host, port, warning, critical, percent, perf_data, max_lag, user, passwd): - def check_rep_lag(self): + def do_server_status(self): host = self.mongo_host port = self.mongo_port user = self.mongo_user passwd = self.mongo_password perf_data = False - # Get mongo to tell us replica set member name when connecting locally con = Connection(host=self.mongo_host, port=self.mongo_port, slave_okay=True) db = con[self.mongo_db[0]] + if self.mongo_user and self.mongo_password: + db.authenticate(self.mongo_user, self.mongo_password) + server_status = db.command('serverStatus') + + version = server_status['version'] + at_least_2_4 = V(version) >= V('2.4.0') + + # operations + for k, v in server_status['opcounters'].items(): + self.submit('total_operations', k, v) + + # memory + for t in ['resident', 'virtual', 'mapped']: + self.submit('memory', t, server_status['mem'][t]) + + # connections + self.submit('connections', 'connections', server_status['connections']['current']) + + # locks + if self.lockTotalTime is not None and self.lockTime is not None: + if self.lockTime == server_status['globalLock']['lockTime']: + value = 0.0 + else: + value = float(server_status['globalLock']['lockTime'] - self.lockTime) * 100.0 / float(server_status['globalLock']['totalTime'] - self.lockTotalTime) + self.submit('percent', 'lock_ratio', value) + + self.lockTotalTime = server_status['globalLock']['totalTime'] + self.lockTime = server_status['globalLock']['lockTime'] + + # indexes + accesses = None + misses = None + index_counters = server_status['indexCounters'] if at_least_2_4 else server_status['indexCounters']['btree'] + + if self.accesses is not None: + accesses = index_counters['accesses'] - self.accesses + if accesses < 0: + accesses = None + misses = (index_counters['misses'] or 0) - (self.misses or 0) + if misses < 0: + misses = None + if accesses and misses is not None: + 
self.submit('cache_ratio', 'cache_misses', int(misses * 100 / float(accesses))) + else: + self.submit('cache_ratio', 'cache_misses', 0) + self.accesses = index_counters['accesses'] + self.misses = index_counters['misses'] + + for mongo_db in self.mongo_db: + db = con[mongo_db] + if self.mongo_user and self.mongo_password: + db.authenticate(self.mongo_user, self.mongo_password) + db_stats = db.command('dbstats') + + # stats counts + self.submit('counter', 'object_count', db_stats['objects'], mongo_db) + self.submit('counter', 'collections', db_stats['collections'], mongo_db) + self.submit('counter', 'num_extents', db_stats['numExtents'], mongo_db) + self.submit('counter', 'indexes', db_stats['indexes'], mongo_db) + + # stats sizes + self.submit('file_size', 'storage', db_stats['storageSize'], mongo_db) + self.submit('file_size', 'index', db_stats['indexSize'], mongo_db) + self.submit('file_size', 'data', db_stats['dataSize'], mongo_db) + + # Replica check rs_status = {} slaveDelays = {} @@ -109,6 +169,7 @@ def check_rep_lag(self): except pymongo.errors.OperationFailure, e: if e.code == None and str(e).find('failed: not running with --replSet"'): print "OK - Not running with replSet" + con.disconnect() return 0 rs_conf = con.local.system.replset.find_one() @@ -131,17 +192,20 @@ def check_rep_lag(self): # Check if we're in the middle of an election and don't have a primary if primary_node is None: print "WARNING - No primary defined. In an election?" + con.disconnect() return 1 # Check if we failed to find the current host # below should never happen if host_node is None: print "CRITICAL - Unable to find host '" + host + "' in replica set." + con.disconnect() return 2 # Is the specified host the primary? if host_node["stateStr"] == "PRIMARY": if max_lag == False: print "OK - This is the primary." 
+ con.disconnect() return 0 else: #get the maximal replication lag @@ -162,21 +226,23 @@ def check_rep_lag(self): # send message with maximal lag in percentage err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) if err != 0: + con.disconnect() return err primary_timediff = replication_get_time_diff(con) maximal_lag = int(float(maximal_lag) / float(primary_timediff) * 100) message = "Maximal lag is " + str(maximal_lag) + " percents" print message self.submit('replication', 'maximal-lag-percentage', str(maximal_lag)) + con.disconnect() return str(maximal_lag) elif host_node["stateStr"] == "ARBITER": print "OK - This is an arbiter" + con.disconnect() return 0 # Find the difference in optime between current node and PRIMARY optime_lag = abs(primary_node["optimeDate"] - host_node["optimeDate"]) - if host_node['name'] in slaveDelays: slave_delay = slaveDelays[host_node['name']] elif host_node['name'].endswith(':27017') and host_node['name'][:-len(":27017")] in slaveDelays: @@ -192,11 +258,12 @@ def check_rep_lag(self): # send message with lag message = "Lag is " + str(lag) + " seconds" print message - self.submit('replication', 'lag-seconds', str(maximal_lag)) + self.submit('replication', 'lag-seconds', str(lag)) # send message with lag in percentage err, con = mongo_connect(primary_node['name'].split(':')[0], int(primary_node['name'].split(':')[1]), False, user, passwd) if err != 0: + con.disconnect() return err primary_timediff = replication_get_time_diff(con) if primary_timediff != 0: @@ -205,83 +272,15 @@ def check_rep_lag(self): lag = 0 message = "Lag is " + str(lag) + " percents" print message - self.submit('replication', 'lag-percentage', str(maximal_lag)) + self.submit('replication', 'lag-percentage', str(lag)) + con.disconnect() return str(lag) #return check_levels(lag, warning + slaveDelays[host_node['name']], critical + slaveDelays[host_node['name']], message) except Exception, e: + 
con.disconnect() return e - def do_server_status(self): - con = Connection(host=self.mongo_host, port=self.mongo_port, slave_okay=True) - db = con[self.mongo_db[0]] - if self.mongo_user and self.mongo_password: - db.authenticate(self.mongo_user, self.mongo_password) - server_status = db.command('serverStatus') - - version = server_status['version'] - at_least_2_4 = V(version) >= V('2.4.0') - - # operations - for k, v in server_status['opcounters'].items(): - self.submit('total_operations', k, v) - - # memory - for t in ['resident', 'virtual', 'mapped']: - self.submit('memory', t, server_status['mem'][t]) - - # connections - self.submit('connections', 'connections', server_status['connections']['current']) - - # locks - if self.lockTotalTime is not None and self.lockTime is not None: - if self.lockTime == server_status['globalLock']['lockTime']: - value = 0.0 - else: - value = float(server_status['globalLock']['lockTime'] - self.lockTime) * 100.0 / float(server_status['globalLock']['totalTime'] - self.lockTotalTime) - self.submit('percent', 'lock_ratio', value) - - self.lockTotalTime = server_status['globalLock']['totalTime'] - self.lockTime = server_status['globalLock']['lockTime'] - - # indexes - accesses = None - misses = None - index_counters = server_status['indexCounters'] if at_least_2_4 else server_status['indexCounters']['btree'] - - if self.accesses is not None: - accesses = index_counters['accesses'] - self.accesses - if accesses < 0: - accesses = None - misses = (index_counters['misses'] or 0) - (self.misses or 0) - if misses < 0: - misses = None - if accesses and misses is not None: - self.submit('cache_ratio', 'cache_misses', int(misses * 100 / float(accesses))) - else: - self.submit('cache_ratio', 'cache_misses', 0) - self.accesses = index_counters['accesses'] - self.misses = index_counters['misses'] - - for mongo_db in self.mongo_db: - db = con[mongo_db] - if self.mongo_user and self.mongo_password: - db.authenticate(self.mongo_user, 
self.mongo_password) - db_stats = db.command('dbstats') - - # stats counts - self.submit('counter', 'object_count', db_stats['objects'], mongo_db) - self.submit('counter', 'collections', db_stats['collections'], mongo_db) - self.submit('counter', 'num_extents', db_stats['numExtents'], mongo_db) - self.submit('counter', 'indexes', db_stats['indexes'], mongo_db) - - # stats sizes - self.submit('file_size', 'storage', db_stats['storageSize'], mongo_db) - self.submit('file_size', 'index', db_stats['indexSize'], mongo_db) - self.submit('file_size', 'data', db_stats['dataSize'], mongo_db) - - - con.disconnect() def config(self, obj): for node in obj.children: @@ -300,6 +299,4 @@ def config(self, obj): mongodb = MongoDB() collectd.register_read(mongodb.do_server_status) -# lag in seconds and percentage -collectd.register_read(mongodb.check_rep_lag) collectd.register_config(mongodb.config) From 1c3f2e6d92bc65508e5f696f561cad6f07cacced Mon Sep 17 00:00:00 2001 From: kayn Date: Tue, 14 Oct 2014 10:54:18 +0200 Subject: [PATCH 6/9] getting databases direct from mongo --- mongodb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mongodb.py b/mongodb.py index 31280d1..43a0f99 100644 --- a/mongodb.py +++ b/mongodb.py @@ -62,7 +62,6 @@ def __init__(self): self.plugin_name = "mongo" self.mongo_host = "127.0.0.1" self.mongo_port = 27017 - self.mongo_db = ["admin", ] self.mongo_user = None self.mongo_password = None @@ -92,6 +91,7 @@ def do_server_status(self): passwd = self.mongo_password perf_data = False con = Connection(host=self.mongo_host, port=self.mongo_port, slave_okay=True) + self.mongo_db = con.database_names() db = con[self.mongo_db[0]] if self.mongo_user and self.mongo_password: db.authenticate(self.mongo_user, self.mongo_password) From 4e33eee829152c82db2a2f91f830ce73ef6c4267 Mon Sep 17 00:00:00 2001 From: Pavel Pulec Date: Tue, 14 Oct 2014 10:58:11 +0200 Subject: [PATCH 7/9] list of databases is getting direct from mongo --- README.md | 1 - 1 file 
changed, 1 deletion(-) diff --git a/README.md b/README.md index fe2bfef..e4f5020 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,6 @@ The following is an example Collectd configuration for this plugin: Host "127.0.0.1" Password "password" - Database "admin" "db-prod" "db-dev" From 2e0f0702e207ae510ece6dd0564791630e10040c Mon Sep 17 00:00:00 2001 From: kayn Date: Tue, 14 Oct 2014 11:40:41 +0200 Subject: [PATCH 8/9] added support for manual configuration of monitored databases --- mongodb.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mongodb.py b/mongodb.py index 43a0f99..491d748 100644 --- a/mongodb.py +++ b/mongodb.py @@ -62,6 +62,7 @@ def __init__(self): self.plugin_name = "mongo" self.mongo_host = "127.0.0.1" self.mongo_port = 27017 + self.mongo_db = "" self.mongo_user = None self.mongo_password = None @@ -91,7 +92,8 @@ def do_server_status(self): passwd = self.mongo_password perf_data = False con = Connection(host=self.mongo_host, port=self.mongo_port, slave_okay=True) - self.mongo_db = con.database_names() + if not self.mongo_db: + self.mongo_db = con.database_names() db = con[self.mongo_db[0]] if self.mongo_user and self.mongo_password: db.authenticate(self.mongo_user, self.mongo_password) From 2dfed4fdd3e423f2ac43f37b970fed1e7897b72f Mon Sep 17 00:00:00 2001 From: kayn Date: Tue, 14 Oct 2014 11:41:46 +0200 Subject: [PATCH 9/9] added support for manual configuration of monitored databases --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index e4f5020..64feef2 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ The following is an example Collectd configuration for this plugin: Host "127.0.0.1" Password "password" + Database "admin"