|
71 | 71 | trials attached to the underlying Ax experiment '{experiment_name}'. |
72 | 72 | """ |
73 | 73 | FAILURE_EXCEEDED_MSG = ( |
74 | | - "Failure rate exceeds the tolerated trial failure rate of {f_rate} (at least " |
75 | | - "{n_failed} out of first {n_ran} trials failed or were abandoned). Checks are " |
76 | | - "triggered both at the end of an optimization and if at least {min_failed} trials " |
77 | | - "have either failed, or have been abandoned, potentially automatically due to " |
78 | | - "issues with the trial." |
| 74 | + "NOTE: This error is usually not caused by Ax. Please check any trial " |
| 75 | + "evaluation processes/jobs to see why they are failing, and ensure that they " |
| 76 | + "succeed over the entire range of the parameters defined in this optimization.\n\n" |
| 77 | + "Trials are failing or being abandoned at a rate {observed_rate} that exceeds the " |
| 78 | + "tolerated trial failure rate of {f_rate} (at least {n_failed} out of first " |
| 79 | + "{n_ran} trials failed or were abandoned). Checks are triggered both at the end " |
| 80 | + "of an optimization and if at least {min_failed} trials have been " |
| 81 | + "failed/abandoned, potentially automatically due to issues with the trial." |
79 | 82 | ) |
| 83 | +METRIC_FETCH_ERR_MESSAGE = ( |
| 84 | + "A majority of the trial failures encountered are due to metric fetching errors. " |
| 85 | + "This could mean the metrics are flaky, broken, or misconfigured. Please check " |
| 86 | + "that the trial processes/jobs are successfully producing the expected metrics and " |
| 87 | + "that the metric is correctly configured." |
| 88 | +) |
| 89 | + |
80 | 90 | EXPECTED_STAGED_MSG = ( |
81 | 91 | "Expected all trials to be in status {expected} after running or staging, " |
82 | 92 | "found {t_idx_to_status}." |
@@ -2132,11 +2142,18 @@ def _get_failure_rate_exceeded_error( |
2132 | 2142 | num_ran_in_orchestrator: int, |
2133 | 2143 | ) -> FailureRateExceededError: |
2134 | 2144 | return FailureRateExceededError( |
2135 | | - FAILURE_EXCEEDED_MSG.format( |
| 2145 | + ( |
| 2146 | + f"{METRIC_FETCH_ERR_MESSAGE}\n" |
| 2147 | + if self._num_trials_bad_due_to_err > num_bad_in_orchestrator / 2 |
| 2148 | + else "" |
| 2149 | + ) |
| 2150 | + + " Original error message: " |
| 2151 | + + FAILURE_EXCEEDED_MSG.format( |
2136 | 2152 | f_rate=self.options.tolerated_trial_failure_rate, |
2137 | 2153 | n_failed=num_bad_in_orchestrator, |
2138 | 2154 | n_ran=num_ran_in_orchestrator, |
2139 | 2155 | min_failed=self.options.min_failed_trials_for_failure_rate_check, |
| 2156 | + observed_rate=float(num_bad_in_orchestrator) / num_ran_in_orchestrator, |
2140 | 2157 | ) |
2141 | 2158 | ) |
2142 | 2159 |
|
|
0 commit comments