Skip to content

[Bug]: test_mgr_enable_rook_backend_module Failed with CephHealthRecoveredException on ODF4.18 #14544

@suchita-g

Description

@suchita-g

Description

[Bug]: test_mgr_enable_rook_backend_module Failed with CephHealthRecoveredException on ODF4.18

Steps to Reproduce

  1. ODF 4.18 Deployment
  2. Run test test_mgr_enable_rook_backend_module

Actual Behavior

Expected Behavior

Impact (likelihood of reproduction, impact on the cluster and on other tests, etc.)

Screenshots (if applicable)

Environment

  • Test Suite(s):
  • Platform(s):
  • Version(s):
  • OS:

Additional Context

def finalizer():
if not skipped:
multi_storagecluster_external_health_passed = False
try:
teardown = ocsci_config.RUN["cli_params"]["teardown"]
skip_ocs_deployment = ocsci_config.ENV_DATA["skip_ocs_deployment"]
ceph_cluster_installed = ocsci_config.RUN.get("cephcluster")
if not (
teardown
or skip_ocs_deployment
or mcg_only_deployment
or not ceph_cluster_installed
):
# We are allowing 20 re-tries for health check, to avoid teardown failures for cases like:
# "flip-flopping ceph health OK and warn because of:
# HEALTH_WARN Reduced data availability: 2 pgs peering

          ceph_health_check(
                namespace=ocsci_config.ENV_DATA["cluster_namespace"],
                fix_ceph_health=True,
            )

/home/jenkins/workspace/qe-deploy-ocs-cluster-prod/ocs-ci/tests/conftest.py:1834:

/home/jenkins/workspace/qe-deploy-ocs-cluster-prod/ocs-ci/ocs_ci/utility/utils.py:2704: in ceph_health_check
return retry(
/home/jenkins/workspace/qe-deploy-ocs-cluster-prod/ocs-ci/ocs_ci/utility/retry.py:79: in f_retry
return f(*args, **kwargs)
/home/jenkins/workspace/qe-deploy-ocs-cluster-prod/ocs-ci/ocs_ci/utility/utils.py:2744: in ceph_health_check_base
ceph_health_recover(health, namespace)

health_status = 'HEALTH_WARN 2 mgr modules have recently crashed\n'
namespace = 'openshift-storage'

def ceph_health_recover(health_status, namespace=None):
    """
    Function which tries to recover ceph health to be HEALTH OK

    Args:
        health_status (str): Ceph health status
        namespace (str): Namespace of OCS

    Raises:
        CephHealthNotRecoveredException: When Ceph health was not recovered
        CephHealthRecoveredException: When Ceph health was recovered

    """
    # Table of known health issues: each entry maps a regex over the health
    # status text to a remediation callable, plus the retry budget for the
    # follow-up health check.
    known_remedies = [
        {
            "pattern": r"daemons have recently crashed",
            "func": ceph_health_resolve_crash,
            "func_args": [],
            "func_kwargs": {},
            "ceph_health_tries": 5,
            "ceph_health_delay": 30,
        },
        {
            "pattern": r"modules have recently crashed",
            "func": ceph_health_resolve_crash,
            "func_args": [],
            "func_kwargs": {},
            "ceph_health_tries": 5,
            "ceph_health_delay": 30,
        },
        {
            "pattern": r"slow ops, oldest one blocked for \d+ sec, mon\.([a-z]) has slow ops",
            "func": ceph_health_resolve_mon_slow_ops,
            "func_args": [health_status],
            "func_kwargs": {},
            "ceph_health_tries": 6,
            "ceph_health_delay": 30,
        },
        # TODO: Add more patterns and fix functions
    ]
    for remedy in known_remedies:
        pattern = remedy["pattern"]
        # Guard clause: skip remedies whose pattern does not appear in the
        # reported health status.
        if not re.search(pattern, health_status):
            continue
        log.info(
            "Trying to fix Ceph Health because we found in Health status the matching pattern"
            f": '{pattern}'!"
        )
        # Avoid circular dependencies, importing here
        from ocs_ci.ocs.utils import collect_ocs_logs

        # Collecting logs here before trying to fix issue
        timestamp = int(time.time())
        collect_ocs_logs(
            f"ceph_health_recover_{timestamp}",
            ocp=False,
            timeout=defaults.MUST_GATHER_TIMEOUT,
        )
        # Run the remediation callable with the arguments recorded in the table.
        remedy["func"](*remedy.get("func_args", []), **remedy.get("func_kwargs", {}))
        # Verify the remediation worked; any failure here means the cluster
        # could not be brought back to HEALTH_OK.
        try:
            ceph_health_check(
                namespace,
                tries=remedy.get("ceph_health_tries", 5),
                delay=remedy.get("ceph_health_delay", 30),
            )
        except Exception as ex:
            raise CephHealthNotRecoveredException(
                f"Attempt to try to recover the Ceph Health failed! Exception: {ex}"
            )

        # Recovery succeeded, but raise anyway so the unhealthy state is not
        # silently ignored by the caller.
        raise CephHealthRecoveredException(
            "Ceph health was not OK and got forcibly recovered to not block other tests"
            f" after the issue: {pattern} !"
            " This might be because of product bug, so please do not ignore this error and"
            " analyze why this has happened!"
        )

E ocs_ci.ocs.exceptions.CephHealthRecoveredException: Ceph health was not OK and got forcibly recovered to not block other tests after the issue: modules have recently crashed ! This might be because of product bug, so please do not ignore this error and analyze why this has happened!

/home/jenkins/workspace/qe-deploy-ocs-cluster-prod/ocs-ci/ocs_ci/utility/utils.py:2679: CephHealthRecoveredException

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions