@@ -102,34 +102,15 @@ json HttpCzarWorkerModule::_workerCzarComIssue() {
102102}
103103
104104json HttpCzarWorkerModule::_handleJobError (string const & func) {
105- LOGS (_log, LOG_LVL_DEBUG, " HttpCzarWorkerModule::_handleJobError start" );
105+ string const fName (" HttpCzarWorkerModule::_handleJobError" );
106+ LOGS (_log, LOG_LVL_DEBUG, fName << " start" );
106107 // Metadata-only responses for the file-based protocol should not have any data
107108
108109 // Parse and verify the json message and then kill the UberJob.
109110 try {
110- string const & repliInstanceId = cconfig::CzarConfig::instance ()->replicationInstanceId ();
111- string const & repliAuthKey = cconfig::CzarConfig::instance ()->replicationAuthKey ();
112111 auto const & jsReq = body ().objJson ;
113- auto jrMsg = protojson::UberJobErrorMsg::createFromJson (jsReq, repliInstanceId, repliAuthKey);
114-
115- auto const queryId = jrMsg->getQueryId ();
116- auto const czarId = jrMsg->getCzarId ();
117- auto const uberJobId = jrMsg->getUberJobId ();
118-
119- // Find UberJob
120- qdisp::Executive::Ptr exec = czar::Czar::getCzar ()->getExecutiveFromMap (queryId);
121- if (exec == nullptr ) {
122- throw invalid_argument (string (" HttpCzarWorkerModule::_handleJobError No executive for qid=" ) +
123- to_string (queryId) + " czar=" + to_string (czarId));
124- }
125- qdisp::UberJob::Ptr uj = exec->findUberJob (uberJobId);
126- if (uj == nullptr ) {
127- throw invalid_argument (string (" HttpCzarWorkerModule::_handleJobError No UberJob for qid=" ) +
128- to_string (queryId) + " ujId=" + to_string (uberJobId) +
129- " czar=" + to_string (czarId));
130- }
131-
132- auto importRes = uj->workerError (jrMsg->getErrorCode (), jrMsg->getErrorMsg ());
112+ auto jrMsg = protojson::UberJobErrorMsg::createFromJson (jsReq);
113+ auto importRes = czar::Czar::getCzar ()->handleUberJobErrorMsg (jrMsg, fName );
133114 return importRes;
134115 } catch (std::invalid_argument const & iaEx) {
135116 LOGS (_log, LOG_LVL_ERROR,
@@ -148,33 +129,7 @@ json HttpCzarWorkerModule::_handleJobReady(string const& func) {
148129 try {
149130 auto const & jsReq = body ().objJson ;
150131 auto jrMsg = protojson::UberJobReadyMsg::createFromJson (jsReq);
151-
152- // Find UberJob
153- auto queryId = jrMsg->getQueryId ();
154- auto czarId = jrMsg->getCzarId ();
155- auto uberJobId = jrMsg->getUberJobId ();
156- qdisp::Executive::Ptr exec = czar::Czar::getCzar ()->getExecutiveFromMap (queryId);
157- if (exec == nullptr ) {
158- LOGS (_log, LOG_LVL_WARN,
159- fName << " null exec QID:" << queryId << " ujId=" << uberJobId << " cz=" << czarId);
160- throw invalid_argument (string (" HttpCzarWorkerModule::_handleJobReady No executive for qid=" ) +
161- to_string (queryId) + " czar=" + to_string (czarId));
162- }
163-
164- qdisp::UberJob::Ptr uj = exec->findUberJob (uberJobId);
165- if (uj == nullptr ) {
166- LOGS (_log, LOG_LVL_WARN,
167- fName << " null uj QID:" << queryId << " ujId=" << uberJobId << " cz=" << czarId);
168- throw invalid_argument (string (" HttpCzarWorkerModule::_handleJobReady No UberJob for qid=" ) +
169- to_string (queryId) + " ujId=" + to_string (uberJobId) +
170- " czar=" + to_string (czarId));
171- }
172-
173- uj->setResultFileSize (jrMsg->getFileSize ());
174- exec->checkResultFileSize (jrMsg->getFileSize ());
175-
176- auto importRes =
177- uj->importResultFile (jrMsg->getFileUrl (), jrMsg->getRowCount (), jrMsg->getFileSize ());
132+ auto importRes = czar::Czar::getCzar ()->handleUberJobReadyMsg (jrMsg, fName );
178133 return importRes;
179134 } catch (std::invalid_argument const & iaEx) {
180135 LOGS (_log, LOG_LVL_ERROR,
@@ -185,14 +140,14 @@ json HttpCzarWorkerModule::_handleJobReady(string const& func) {
185140}
186141
187142json HttpCzarWorkerModule::_handleWorkerCzarComIssue (string const & func) {
188- LOGS (_log, LOG_LVL_DEBUG, " HttpCzarWorkerModule::_handleWorkerCzarComIssue start" );
143+ string const fName (" HttpCzarWorkerModule::_handleWorkerCzarComIssue" );
144+ LOGS (_log, LOG_LVL_DEBUG, fName << " start" );
189145 // Parse and verify the json message and then deal with the problems.
190146 try {
191- string const replicationInstanceId = cconfig::CzarConfig::instance ()->replicationInstanceId ();
192- string const replicationAuthKey = cconfig::CzarConfig::instance ()->replicationAuthKey ();
147+ protojson::AuthContext const authC ( cconfig::CzarConfig::instance ()->replicationInstanceId (),
148+ cconfig::CzarConfig::instance ()->replicationAuthKey () );
193149 auto const & jsReq = body ().objJson ;
194- auto wccIssue = protojson::WorkerCzarComIssue::createFromJson (jsReq, replicationInstanceId,
195- replicationAuthKey);
150+ auto wccIssue = protojson::WorkerCzarComIssue::createFromJson (jsReq, authC);
196151
197152 auto wId = wccIssue->getWorkerInfo ()->wId ;
198153 if (wccIssue->getThoughtCzarWasDead ()) {
@@ -209,7 +164,29 @@ json HttpCzarWorkerModule::_handleWorkerCzarComIssue(string const& func) {
209164 execPtr->killIncompleteUberJobsOnWorker (wId);
210165 }
211166 }
167+ // The response here includes the QueryId and UberJobId of all
168+ // uberjobs in the original message. If the czar cannot handle
169+ // one now, it won't be able to handle it later, so there's no
170+ // point in the worker sending it again.
171+ // Under normal circumstances, the czar should be able to
172+ // find and handle all failed transmits. Anything it can't find should
173+ // show up in completed query IDs, or failed uberJobs, and failing that
174+ // it should be garbage collected.
212175 auto jsRet = wccIssue->responseToJson ();
176+ auto failedTransmits = wccIssue->takeFailedTransmitsMap ();
177+ for (auto & [key, elem] : *failedTransmits) {
178+ protojson::UberJobStatusMsg::Ptr& statusMsg = elem;
179+ auto rdyMsg = dynamic_pointer_cast<protojson::UberJobReadyMsg>(statusMsg);
180+ if (rdyMsg != nullptr ) {
181+ bool const retry = true ;
182+ // Put the file on a queue to be collected later.
183+ czar::Czar::getCzar ()->handleUberJobReadyMsg (rdyMsg, fName , retry);
184+ } else {
185+ auto errMsg = dynamic_pointer_cast<protojson::UberJobErrorMsg>(statusMsg);
186+ // Kill the UberJob or user query depending on the error.
187+ czar::Czar::getCzar ()->handleUberJobErrorMsg (errMsg, fName );
188+ }
189+ }
213190 LOGS (_log, LOG_LVL_TRACE, " HttpCzarWorkerModule::_handleWorkerCzarComIssue jsRet=" << jsRet.dump ());
214191 return jsRet;
215192 } catch (std::invalid_argument const & iaEx) {
0 commit comments