Skip to content

Commit ea085ae

Browse files
Add monitor scripts to check pods status on compute node
1 parent 8559bd8 commit ea085ae

File tree

1 file changed

+88
-0
lines changed

1 file changed

+88
-0
lines changed
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
#!/usr/bin/env python
2+
""" Check the status of the customer pods on every compute node and send status code "1" if there is a node on which none of the pods are in Running status """
3+
4+
# Adding the ignore because it does not like the naming of the script
5+
# to be different than the class name
6+
# pylint: disable=invalid-name
7+
8+
# pylint: disable=wrong-import-position
9+
# pylint: disable=broad-except
10+
# pylint: disable=line-too-long
11+
12+
import argparse
13+
import time
14+
15+
import logging
16+
# Root logger: timestamped records, WARN threshold by default.
# parse_args() lowers the threshold to INFO/DEBUG when -v is given.
logging.basicConfig(format='%(asctime)s - %(relativeCreated)6d - %(levelname)-8s - %(message)s')
logger = logging.getLogger()
logger.setLevel(logging.WARN)
21+
22+
# Our jenkins server does not include these rpms.
23+
# In the future we might move this to a container where these
24+
# libs might exist
25+
#pylint: disable=import-error
26+
27+
from openshift_tools.monitoring.ocutil import OCUtil
28+
from openshift_tools.monitoring.metric_sender import MetricSender
29+
30+
# Module-wide OCUtil instance; runOCcmd_yaml() issues its commands through it.
ocutil = OCUtil()
31+
32+
def runOCcmd_yaml(cmd, base_cmd='oc'):
    """ run a command through ocutil, logging the command line and its duration

    cmd:      arguments passed on to ocutil.run_user_cmd_yaml
    base_cmd: executable name handed to ocutil (defaults to 'oc')
    returns whatever ocutil.run_user_cmd_yaml returns
    """
    command_line = base_cmd + " " + cmd
    logger.info(command_line)

    started_at = time.time()
    output = ocutil.run_user_cmd_yaml(cmd, base_cmd=base_cmd)
    elapsed = time.time() - started_at

    logger.info("oc command took %s seconds", str(elapsed))
    return output
39+
40+
def parse_args():
    """ build the cli parser, parse the command line and apply the -v log level

    returns the argparse namespace with `skip_namespaces` (list) and `verbose` (count)
    """
    parser = argparse.ArgumentParser(description='Check all the nodes pods Status')
    parser.add_argument(
        '-s', '--skip_namespaces',
        nargs='+',
        required=True,
        help='namespaces exception list that we should avoid to check'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='count',
        default=0,
        help='verbosity level, specify multiple'
    )
    cli_args = parser.parse_args()

    # one -v selects INFO, two or more select DEBUG; without -v the level is left alone
    if cli_args.verbose > 1:
        logger.setLevel(logging.DEBUG)
    elif cli_args.verbose > 0:
        logger.setLevel(logging.INFO)

    return cli_args
56+
57+
def check_node_pods_status(nsList):
    """ flag nodes on which none of the checked pods are in Running phase

    nsList: namespaces to exclude from the check
    returns 1 if at least one node has pods (outside the excluded namespaces)
    and every one of them matches status.phase!=Running, otherwise 0
    """
    # one ",metadata.namespace!=<ns>" clause per excluded namespace; the leading
    # comma lets the filter be appended directly to the node selector
    nsFilter = "".join(",metadata.namespace!=" + ns for ns in nsList)

    result_status = 0
    node_info = runOCcmd_yaml("get node ")
    for item in node_info['items']:
        nodeName = item['metadata']['name']
        logger.info("Checking node: %s", nodeName)

        node_pods = runOCcmd_yaml("get pods --all-namespaces --field-selector='spec.nodeName="+nodeName+nsFilter+"'")
        total_pods = len(node_pods['items'])
        if total_pods == 0:
            # A node with no pods to check must not be reported: 0 == 0 would
            # otherwise raise a false "all pods are not running" alarm.
            logger.info("Node: %s, no pods to check", nodeName)
            continue

        node_not_running_pods = runOCcmd_yaml("get pods --all-namespaces --field-selector='spec.nodeName="+nodeName+",status.phase!=Running"+nsFilter+"'")
        if len(node_not_running_pods['items']) == total_pods:
            result_status = 1
            logger.warning("Node: %s, all pods are not running", nodeName)
    return result_status
73+
74+
def main():
    """ check the pods status of every node and report the result as a metric """
    args = parse_args()
    logger.debug("args: ")
    logger.debug(args)

    pods_status = check_node_pods_status(args.skip_namespaces)

    # send the value to zabbix
    metric_sender = MetricSender(verbose=args.verbose)
    metric_sender.add_metric({'openshift.nodes.pods.status': pods_status})
    metric_sender.send_metrics()
86+
87+
# Only run the check when this file is executed as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)