|
| 1 | +#!/usr/bin/env python |
| 2 | +""" Check the status of all customer pods on every compute node; send status code "1" if none of the pods on a compute node are in the Running state """ |
| 3 | + |
| 4 | +# Adding the ignore because it does not like the naming of the script |
| 5 | +# to be different than the class name |
| 6 | +# pylint: disable=invalid-name |
| 7 | + |
| 8 | +# pylint: disable=wrong-import-position |
| 9 | +# pylint: disable=broad-except |
| 10 | +# pylint: disable=line-too-long |
| 11 | + |
| 12 | +import argparse |
| 13 | +import time |
| 14 | + |
| 15 | +import logging |
# Root logger shared by the whole script. Records carry a timestamp, the
# milliseconds elapsed since the logging module was loaded, and the level.
# Only warnings and above are shown unless -v raises the verbosity later.
logger = logging.getLogger()
logging.basicConfig(format='%(asctime)s - %(relativeCreated)6d - %(levelname)-8s - %(message)s')
logger.setLevel(logging.WARNING)
| 21 | + |
| 22 | +# Our jenkins server does not include these rpms. |
| 23 | +# In the future we might move this to a container where these |
| 24 | +# libs might exist |
| 25 | +#pylint: disable=import-error |
| 26 | + |
| 27 | +from openshift_tools.monitoring.ocutil import OCUtil |
| 28 | +from openshift_tools.monitoring.metric_sender import MetricSender |
| 29 | + |
| 30 | +ocutil = OCUtil() |
| 31 | + |
def runOCcmd_yaml(cmd, base_cmd='oc'):
    """Run a command through ocutil, logging the command line and its duration.

    Args:
        cmd: argument string handed to the base command (e.g. "get node ").
        base_cmd: executable name forwarded to ocutil; defaults to 'oc'.

    Returns:
        Whatever ``ocutil.run_user_cmd_yaml`` returns for the command.
    """
    command_line = " ".join((base_cmd, cmd))
    logger.info(command_line)

    started_at = time.time()
    parsed_output = ocutil.run_user_cmd_yaml(cmd, base_cmd=base_cmd)
    elapsed = time.time() - started_at

    logger.info("oc command took %s seconds", str(elapsed))
    return parsed_output
| 39 | + |
def parse_args(argv=None):
    """Parse the command-line options and raise the log level according to -v.

    Args:
        argv: optional list of argument strings to parse. When None (the
            default) argparse reads ``sys.argv[1:]``, so existing callers
            are unaffected; passing a list lets the parser be driven
            programmatically.

    Returns:
        argparse.Namespace with:
            skip_namespaces: list of one or more namespace names (required).
            verbose: number of times -v/--verbose was given (0 by default).
    """
    parser = argparse.ArgumentParser(description='Check all the nodes pods Status')
    parser.add_argument('-s', '--skip_namespaces', nargs='+', required=True,
                        help='namespaces exception list that we should avoid to check')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help='verbosity level, specify multiple')

    args = parser.parse_args(argv)

    # One -v switches to INFO, two or more to DEBUG; with no -v the level
    # is left untouched.
    if args.verbose > 1:
        logger.setLevel(logging.DEBUG)
    elif args.verbose > 0:
        logger.setLevel(logging.INFO)

    return args
| 56 | + |
def check_node_pods_status(nsList):
    """Check every node for the case where none of its pods are Running.

    For each node returned by ``oc get node`` two pod queries are made, both
    restricted to that node and excluding the namespaces in ``nsList``: one
    for all pods and one for pods whose phase is not Running. A node is
    flagged when it has at least one such pod and every one of them is in
    the not-Running set.

    Args:
        nsList: iterable of namespace names to exclude from the check.
            A falsy value (None or empty) means no namespace is excluded.

    Returns:
        1 if at least one node was flagged, otherwise 0.
    """
    # Field-selector suffix excluding each skipped namespace; it starts with
    # a comma so it can be appended directly after another selector term.
    ns_filter = "".join(",metadata.namespace!=" + ns for ns in (nsList or []))

    result_status = 0
    node_info = runOCcmd_yaml("get node ")
    for item in node_info['items']:
        node_name = item['metadata']['name']
        logger.info("Checking node: %s", node_name)

        node_selector = "spec.nodeName=" + node_name
        node_pods = runOCcmd_yaml(
            "get pods --all-namespaces --field-selector='" + node_selector + ns_filter + "'")

        # A node with no pods outside the skipped namespaces would compare
        # 0 == 0 below and be reported as broken; treat it as healthy instead.
        if not node_pods['items']:
            logger.debug("Node: %s has no pods outside the skipped namespaces", node_name)
            continue

        node_not_running_pods = runOCcmd_yaml(
            "get pods --all-namespaces --field-selector='" + node_selector +
            ",status.phase!=Running" + ns_filter + "'")

        if len(node_not_running_pods['items']) == len(node_pods['items']):
            result_status = 1
            # logger.warn is a deprecated alias; warning() is the supported name.
            logger.warning("Node: %s, all pods are not running", node_name)

    return result_status
| 73 | + |
def main():
    """Check pod status on all nodes and report the result as a metric.

    Parses the CLI options, runs the per-node pod check with the namespaces
    to skip, and sends the resulting 0/1 value under the key
    'openshift.nodes.pods.status'.
    """
    cli_args = parse_args()
    logger.debug("args: ")
    logger.debug(cli_args)

    status_code = check_node_pods_status(cli_args.skip_namespaces)

    # Send the value to zabbix.
    sender = MetricSender(verbose=cli_args.verbose)
    sender.add_metric({'openshift.nodes.pods.status': status_code})
    sender.send_metrics()
| 86 | + |
| 87 | +if __name__ == "__main__": |
| 88 | + main() |
0 commit comments