-
Notifications
You must be signed in to change notification settings - Fork 191
Description
[Bug]: test_mgr_enable_rook_backend_module Failed with CephHealthRecoveredException on ODF4.18
Steps to Reproduce
- ODF 4.18 Deployment
- Run test test_mgr_enable_rook_backend_module
Actual Behavior
Expected Behavior
Impact (likelihood of reproduction, impact on the cluster and on other tests, etc.)
Screenshots (if applicable)
Environment
- Test Suite(s):
- Platform(s):
- Version(s):
- OS:
Additional Context
def finalizer():
if not skipped:
multi_storagecluster_external_health_passed = False
try:
teardown = ocsci_config.RUN["cli_params"]["teardown"]
skip_ocs_deployment = ocsci_config.ENV_DATA["skip_ocs_deployment"]
ceph_cluster_installed = ocsci_config.RUN.get("cephcluster")
if not (
teardown
or skip_ocs_deployment
or mcg_only_deployment
or not ceph_cluster_installed
):
# We are allowing 20 re-tries for health check, to avoid teardown failures for cases like:
# "flip-flopping ceph health OK and warn because of:
# HEALTH_WARN Reduced data availability: 2 pgs peering
ceph_health_check(
namespace=ocsci_config.ENV_DATA["cluster_namespace"],
fix_ceph_health=True,
)
/home/jenkins/workspace/qe-deploy-ocs-cluster-prod/ocs-ci/tests/conftest.py:1834:
/home/jenkins/workspace/qe-deploy-ocs-cluster-prod/ocs-ci/ocs_ci/utility/utils.py:2704: in ceph_health_check
return retry(
/home/jenkins/workspace/qe-deploy-ocs-cluster-prod/ocs-ci/ocs_ci/utility/retry.py:79: in f_retry
return f(*args, **kwargs)
/home/jenkins/workspace/qe-deploy-ocs-cluster-prod/ocs-ci/ocs_ci/utility/utils.py:2744: in ceph_health_check_base
ceph_health_recover(health, namespace)
health_status = 'HEALTH_WARN 2 mgr modules have recently crashed\n'
namespace = 'openshift-storage'
def ceph_health_recover(health_status, namespace=None):
    """
    Function which tries to recover ceph health to be HEALTH OK

    Looks for the first known bad-health pattern in ``health_status``,
    collects logs, runs the associated fix function, and then re-checks
    the cluster health. The function never returns normally when a
    pattern matched — it raises one of the two exceptions below so the
    caller is always made aware a recovery was attempted.

    Args:
        health_status (str): Ceph health status
        namespace (str): Namespace of OCS

    Raises:
        CephHealthNotRecoveredException: When Ceph health was not recovered
        CephHealthRecoveredException: When Ceph health was recovered

    """
    # Table of known bad-health patterns: each entry pairs a regex with the
    # fix function to run and how patiently to re-check health afterwards.
    # Only the first matching entry is applied.
    ceph_health_fixes = [
        {
            "pattern": r"daemons have recently crashed",
            "func": ceph_health_resolve_crash,
            "func_args": [],
            "func_kwargs": {},
            "ceph_health_tries": 5,
            "ceph_health_delay": 30,
        },
        {
            "pattern": r"modules have recently crashed",
            "func": ceph_health_resolve_crash,
            "func_args": [],
            "func_kwargs": {},
            "ceph_health_tries": 5,
            "ceph_health_delay": 30,
        },
        {
            # NOTE(review): this regex only matches single-letter mon names
            # (mon.a, mon.b, ...) — confirm that covers all deployments.
            "pattern": r"slow ops, oldest one blocked for \d+ sec, mon\.([a-z]) has slow ops",
            "func": ceph_health_resolve_mon_slow_ops,
            "func_args": [health_status],
            "func_kwargs": {},
            "ceph_health_tries": 6,
            "ceph_health_delay": 30,
        },
        # TODO: Add more patterns and fix functions
    ]
    for fix_dict in ceph_health_fixes:
        pattern = fix_dict["pattern"]
        if re.search(pattern, health_status):
            log.info(
                "Trying to fix Ceph Health because we found in Health status the matching pattern"
                f": '{pattern}'!"
            )
            # Avoid circular dependencies, importing here
            from ocs_ci.ocs.utils import collect_ocs_logs

            # Collecting logs here before trying to fix issue
            timestamp = int(time.time())
            collect_ocs_logs(
                f"ceph_health_recover_{timestamp}",
                ocp=False,
                timeout=defaults.MUST_GATHER_TIMEOUT,
            )
            fix_dict["func"](
                *fix_dict.get("func_args", []), **fix_dict.get("func_kwargs", {})
            )
            try:
                ceph_health_check(
                    namespace,
                    tries=fix_dict.get("ceph_health_tries", 5),
                    delay=fix_dict.get("ceph_health_delay", 30),
                )
            except Exception as ex:
                # Chain the underlying failure so its traceback is preserved
                # in the report (previously lost by the bare re-raise).
                raise CephHealthNotRecoveredException(
                    f"Attempt to try to recover the Ceph Health failed! Exception: {ex}"
                ) from ex
            # Health is OK again, but raise anyway: recovery masking a real
            # product issue must not pass silently.
            raise CephHealthRecoveredException(
                "Ceph health was not OK and got forcibly recovered to not block other tests"
                f" after the issue: {pattern} !"
                " This might be because of product bug, so please do not ignore this error and"
                " analyze why this has happened!"
            )
E ocs_ci.ocs.exceptions.CephHealthRecoveredException: Ceph health was not OK and got forcibly recovered to not block other tests after the issue: modules have recently crashed ! This might be because of product bug, so please do not ignore this error and analyze why this has happened!
/home/jenkins/workspace/qe-deploy-ocs-cluster-prod/ocs-ci/ocs_ci/utility/utils.py:2679: CephHealthRecoveredException