Skip to content

Commit 9af2fe8

Browse files
authored
Bugfix: after the center restarts, the device status may be overwritten (#690)
<!-- Please provide brief information about the PR, what it contains & its purpose, new behaviors after the change. And let us know here if you need any help: https://github.com/microsoft/HydraLab/issues/new --> ## Description <!-- A few words to explain your changes --> ### Linked GitHub issue ID: # ## Pull Request Checklist <!-- Put an x in the boxes that apply. This is simply a reminder of what we are going to look for before merging your code. --> - [ ] Tests for the changes have been added (for bug fixes / features) - [x] Code compiles correctly and all tests pass. - [x] I've read the [contributing guide](https://github.com/microsoft/HydraLab/blob/main/CONTRIBUTING.md#making-changes-to-the-code) and followed the recommended practices. - [ ] [Wikis](https://github.com/microsoft/HydraLab/wiki) or [README](https://github.com/microsoft/HydraLab/blob/main/README.md) have been reviewed and added / updated if needed (for bug fixes / features) ### Does this introduce a breaking change? *If this introduces a breaking change for Hydra Lab users, please describe the impact and migration path.* - [ ] Yes - [x] No ## How you tested it *Please make sure the change is tested; you can test it by adding UTs, running local tests and sharing screenshots, etc.* Please check the type of change your PR introduces: - [x] Bugfix - [ ] Feature - [ ] Technical design - [ ] Build related changes - [ ] Refactoring (no functional changes, no api changes) - [ ] Code style update (formatting, renaming) or Documentation content changes - [ ] Other (please describe): ### Feature UI screenshots or Technical design diagrams *If this is a relatively large or complex change, kick it off by drawing the tech design with PlantUML and explaining why you chose the solution you did and what alternatives you considered, etc...*
1 parent 3b3015d commit 9af2fe8

File tree

2 files changed

+49
-9
lines changed

2 files changed

+49
-9
lines changed

agent/src/main/java/com/microsoft/hydralab/agent/socket/AgentWebSocketClient.java

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ public class AgentWebSocketClient extends WebSocketClient {
2222

2323
private boolean connectionActive = false;
2424
private int reconnectTime = 0;
25+
private int violatedReconnectTime = 0;
2526

2627
public AgentWebSocketClient(URI serverUri, AgentWebSocketClientService agentWebSocketClientService) {
2728
super(serverUri);
@@ -54,6 +55,7 @@ public void onMessage(ByteBuffer bytes) {
5455
agentWebSocketClientService.onMessage(message);
5556
if (Const.Path.DEVICE_LIST.equals(message.getPath())) {
5657
reconnectTime = 0;
58+
violatedReconnectTime = 0;
5759
}
5860
}
5961

@@ -65,13 +67,30 @@ public void onMessage(String message) {
6567

6668
@Override
6769
public void onClose(int code, String reason, boolean remote) {
68-
log.error("onClose {}, {}, {}", code, reason, remote);
70+
log.error("onClose {}, {}, {}, {}", code, reason, remote, reconnectTime);
6971
reconnectTime++;
7072
connectionActive = false;
71-
// if the connection is closed by server with 1008,1003, exit the agent
72-
if (code == CloseReason.CloseCodes.CANNOT_ACCEPT.getCode() || code == CloseReason.CloseCodes.VIOLATED_POLICY.getCode()) {
73+
// if the connection is closed by server with 1003, exit the agent
74+
if (code == CloseReason.CloseCodes.CANNOT_ACCEPT.getCode()) {
7375
System.exit(code);
7476
}
77+
78+
// if the connection is closed by server with 1008, wait and retry
79+
if (code == CloseReason.CloseCodes.VIOLATED_POLICY.getCode()) {
80+
violatedReconnectTime++;
81+
if(violatedReconnectTime > 5) {
82+
log.error("onClose, code: {}, reason: {}, remote: {}, reconnectTime: {}, {}", code, reason, remote, reconnectTime, violatedReconnectTime);
83+
System.exit(code);
84+
} else {
85+
// wait for 10 seconds and then retry
86+
try {
87+
log.info("onClose, code: {}, reason: {}, remote: {}, reconnectTime: {}, {} sleep 10 seconds", code, reason, remote, reconnectTime, violatedReconnectTime);
88+
Thread.sleep(10000);
89+
} catch (InterruptedException e) {
90+
log.error("onClose, sleep error", e);
91+
}
92+
}
93+
}
7594
}
7695

7796
@Override

center/src/main/java/com/microsoft/hydralab/center/service/DeviceAgentManagementService.java

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,16 @@
2121
import com.microsoft.hydralab.common.entity.common.AgentUpdateTask;
2222
import com.microsoft.hydralab.common.entity.common.AgentUser;
2323
import com.microsoft.hydralab.common.entity.common.AnalysisTask;
24+
import com.microsoft.hydralab.common.entity.common.BlockedDeviceInfo;
2425
import com.microsoft.hydralab.common.entity.common.DeviceInfo;
26+
import com.microsoft.hydralab.common.entity.common.DeviceOperation;
2527
import com.microsoft.hydralab.common.entity.common.Message;
2628
import com.microsoft.hydralab.common.entity.common.StatisticData;
2729
import com.microsoft.hydralab.common.entity.common.StorageFileInfo;
2830
import com.microsoft.hydralab.common.entity.common.Task;
2931
import com.microsoft.hydralab.common.entity.common.TestRun;
3032
import com.microsoft.hydralab.common.entity.common.TestTask;
3133
import com.microsoft.hydralab.common.entity.common.TestTaskSpec;
32-
import com.microsoft.hydralab.common.entity.common.BlockedDeviceInfo;
33-
import com.microsoft.hydralab.common.entity.common.DeviceOperation;
3434
import com.microsoft.hydralab.common.file.StorageServiceClientProxy;
3535
import com.microsoft.hydralab.common.management.device.DeviceType;
3636
import com.microsoft.hydralab.common.repository.BlockedDeviceInfoRepository;
@@ -68,11 +68,11 @@
6868
import java.util.Date;
6969
import java.util.HashMap;
7070
import java.util.HashSet;
71+
import java.util.Iterator;
7172
import java.util.List;
7273
import java.util.Map;
7374
import java.util.Optional;
7475
import java.util.Set;
75-
import java.util.Iterator;
7676
import java.util.concurrent.ConcurrentHashMap;
7777
import java.util.concurrent.TimeUnit;
7878
import java.util.concurrent.atomic.AtomicBoolean;
@@ -431,10 +431,10 @@ public void updateDeviceGroup(List<DeviceInfo> agentDeviceInfos, String agentId)
431431
DeviceInfo centerDevice = deviceListMap.get(agentDeviceInfo.getSerialNum());
432432
// if the status saved in Center is testing, the value will not be covered
433433
if (centerDevice != null && centerDevice.isTesting()) {
434-
log.warn("Center status: {}, Agent status: {}, status should be synced to CENTER's value when TESTING.", centerDevice.getStatus(), agentDeviceInfo.getStatus());
434+
log.info("Center status: {}, Agent status: {}, status should be synced to CENTER's value when TESTING.", centerDevice.getStatus(), agentDeviceInfo.getStatus());
435435
agentDeviceInfo.setStatus(DeviceInfo.TESTING);
436-
} else if (agentDeviceInfo.isTesting()) {
437-
log.warn("Test on the device is canceled, status of device in AGENT should be reset to ONLINE, otherwise TESTING would never be covered by agent");
436+
} else if (agentDeviceInfo.isTesting() && !checkTaskIsRunning(agentDeviceInfo)) {
437+
log.info("Test on the device is canceled, status of device in AGENT should be reset to ONLINE, otherwise TESTING would never be covered by agent");
438438
agentDeviceInfo.setStatus(DeviceInfo.ONLINE);
439439
}
440440

@@ -448,6 +448,27 @@ public void updateDeviceGroup(List<DeviceInfo> agentDeviceInfos, String agentId)
448448
}
449449
}
450450

451+
private boolean checkTaskIsRunning(DeviceInfo deviceInfo) {
452+
String taskId = deviceInfo.getRunningTaskId();
453+
if (taskId == null) {
454+
return false;
455+
}
456+
Task task = testDataService.getTaskDetail(taskId);
457+
// check if the task is running
458+
if (task == null || !Task.TaskStatus.RUNNING.equals(task.getStatus())) {
459+
return false;
460+
}
461+
462+
// check the timeout of the task
463+
if (task.getTimeOutSecond() > 0) {
464+
long timeout = task.getStartDate().getTime() + TimeUnit.SECONDS.toMillis(task.getTimeOutSecond());
465+
if (System.currentTimeMillis() > timeout) {
466+
return false;
467+
}
468+
}
469+
return true;
470+
}
471+
451472
//update Device Status : start task,complete task,device offline,device online
452473
public void updateDeviceStatus(String serialNum, String status, String testTaskId) {
453474
DeviceInfo device = deviceListMap.get(serialNum);

0 commit comments

Comments
 (0)