Skip to content

Commit 6045c31

Browse files
Update _get_failure_rate_exceeded_error to reference Metric Fetching Errors (facebook#4780)
Summary: Pull Request resolved: facebook#4780. Updates the wording of the FailureRateExceededError, including informing the user when Metric Fetching Errors are to blame for the orchestrator exceeding the tolerated failure rate. Reviewed By: mpolson64. Differential Revision: D90888949. fbshipit-source-id: 4b50303be3f29350976bddbe90356e332beea81a
1 parent fbab2d5 commit 6045c31

File tree

1 file changed

+23
-6
lines changed

1 file changed

+23
-6
lines changed

ax/orchestration/orchestrator.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,12 +71,22 @@
7171
trials attached to the underlying Ax experiment '{experiment_name}'.
7272
"""
7373
FAILURE_EXCEEDED_MSG = (
74-
"Failure rate exceeds the tolerated trial failure rate of {f_rate} (at least "
75-
"{n_failed} out of first {n_ran} trials failed or were abandoned). Checks are "
76-
"triggered both at the end of an optimization and if at least {min_failed} trials "
77-
"have either failed, or have been abandoned, potentially automatically due to "
78-
"issues with the trial."
74+
"NOTE: This error is usually not caused by Ax. Please check any trial "
75+
"evaluation processes/jobs to see why they are failing, and ensure that they "
76+
"succeed over the entire range of the parameters defined in this optimization.\n\n"
77+
"Trials are failing or being abandoned at a rate {observed_rate} that exceeds the "
78+
"tolerated trial failure rate of {f_rate} (at least {n_failed} out of first "
79+
"{n_ran} trials failed or were abandoned). Checks are triggered both at the end "
80+
"of an optimization and if at least {min_failed} trials have been "
81+
"failed/abandoned, potentially automatically due to issues with the trial."
7982
)
83+
METRIC_FETCH_ERR_MESSAGE = (
84+
"A majority of the trial failures encountered are due to metric fetching errors. "
85+
"This could mean the metrics are flaky, broken, or misconfigured. Please check "
86+
"that the trial processes/jobs are successfully producing the expected metrics and "
87+
"that the metric is correctly configured."
88+
)
89+
8090
EXPECTED_STAGED_MSG = (
8191
"Expected all trials to be in status {expected} after running or staging, "
8292
"found {t_idx_to_status}."
@@ -2132,11 +2142,18 @@ def _get_failure_rate_exceeded_error(
21322142
num_ran_in_orchestrator: int,
21332143
) -> FailureRateExceededError:
21342144
return FailureRateExceededError(
2135-
FAILURE_EXCEEDED_MSG.format(
2145+
(
2146+
f"{METRIC_FETCH_ERR_MESSAGE}\n"
2147+
if self._num_trials_bad_due_to_err > num_bad_in_orchestrator / 2
2148+
else ""
2149+
)
2150+
+ " Original error message: "
2151+
+ FAILURE_EXCEEDED_MSG.format(
21362152
f_rate=self.options.tolerated_trial_failure_rate,
21372153
n_failed=num_bad_in_orchestrator,
21382154
n_ran=num_ran_in_orchestrator,
21392155
min_failed=self.options.min_failed_trials_for_failure_rate_check,
2156+
observed_rate=float(num_bad_in_orchestrator) / num_ran_in_orchestrator,
21402157
)
21412158
)
21422159

0 commit comments

Comments
 (0)