|
|
@@ -56,10 +56,6 @@ public class InstanceMonitorService {
|
|
|
@Resource
|
|
|
private TaskRecordService taskRecordService;
|
|
|
|
|
|
- // 最大重试次数
|
|
|
- @org.springframework.beans.factory.annotation.Value("${parser.task.max-retry:3}")
|
|
|
- private int maxRetry;
|
|
|
-
|
|
|
@PostConstruct
|
|
|
public void initParseInstance(){
|
|
|
log.info("开始初始化解析实例...");
|
|
|
@@ -71,7 +67,7 @@ public class InstanceMonitorService {
|
|
|
String instanceId = instanceManager.startParseInstance(port);
|
|
|
// 增加容器ID空值校验
|
|
|
if (instanceId == null || instanceId.isEmpty()) {
|
|
|
- log.error("初始化实例失败:进程创建失败,端口:{}", port);
|
|
|
+ log.error("初始化实例失败:实例创建失败,端口:{}", port);
|
|
|
portPool.releasePort(port); // 归还端口
|
|
|
continue;
|
|
|
}
|
|
|
@@ -104,8 +100,8 @@ public class InstanceMonitorService {
|
|
|
// 3. 校验活跃实例数,触发实例拉起
|
|
|
checkAndSpinUpInstance();
|
|
|
|
|
|
- // 4. 根据GPU负载动态扩缩容
|
|
|
- checkAndScaleUpByGpuLoad();
|
|
|
+ // 4. 检查并缩容空闲实例
|
|
|
+ checkAndScaleDownIdleInstance();
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
log.error("解析实例监控任务执行失败", e);
|
|
|
@@ -118,7 +114,6 @@ public class InstanceMonitorService {
|
|
|
private void checkHeartbeatTimeout() {
|
|
|
long now = System.currentTimeMillis();
|
|
|
Map<String, InstanceStatus> activeInstancePool = instancestatusRegistry.getActiveInstancePool();
|
|
|
- // 复制为新集合,避免遍历中修改原集合
|
|
|
Map<String, InstanceStatus> copyPool = new HashMap<>(activeInstancePool);
|
|
|
|
|
|
copyPool.forEach((instanceId, state) -> {
|
|
|
@@ -184,6 +179,9 @@ public class InstanceMonitorService {
|
|
|
instancestatus.setGpuUsage(data.getGpuUsage());
|
|
|
instancestatus.setLastHeartbeatTime(System.currentTimeMillis());
|
|
|
instancestatus.setGpuMemory(data.getGpuMemory());
|
|
|
+ if (data.getStatus() == 0) {
|
|
|
+ instancestatus.setLastIdleTime(System.currentTimeMillis());
|
|
|
+ }
|
|
|
log.info("实例{}状态查询成功,状态:{}", instanceId, data.getStatus());
|
|
|
|
|
|
} else {
|
|
|
@@ -268,53 +266,45 @@ public class InstanceMonitorService {
|
|
|
}
|
|
|
|
|
|
/**
- * 根据GPU负载检测并扩缩容实例
+     * Scales down at most one idle instance per invocation.
+     *
+     * When the effective active instance count is above
+     * {@code parserConfig.MIN_ACTIVE_INSTANCE}, the instance with status 0 and the
+     * smallest {@code lastIdleTime} is removed from the active pool, its process is
+     * terminated and only then is its port released. Failures are logged and never
+     * rethrown.
*/
- private void checkAndScaleUpByGpuLoad() {
- Map<String, InstanceStatus> activeInstancePool = instancestatusRegistry.getActiveInstancePool();
-
- // 计算所有活跃实例的平均GPU负载
- double avgGpuLoad = calculateAverageGpuLoad(activeInstancePool);
- log.info("当前平均GPU负载:{}%,阈值:{}%", avgGpuLoad, parserConfig.GPU_LOAD_THRESHOLD);
+    private void checkAndScaleDownIdleInstance() {
+        int currentActiveNum = getEffectiveActiveInstanceNum();
+        if (currentActiveNum > parserConfig.MIN_ACTIVE_INSTANCE) {
+            Map<String, InstanceStatus> activeInstancePool = instancestatusRegistry.getActiveInstancePool();

- // 判断是否需要扩容
- if (avgGpuLoad < parserConfig.GPU_LOAD_THRESHOLD) {
- int currentActiveNum = getEffectiveActiveInstanceNum();
+            // Pick the idle instance (status 0) with the smallest lastIdleTime, i.e. the one idle the longest.
+            // NOTE(review): no minimum idle duration is checked here, so an instance that became idle
+            // a moment ago can still be closed - confirm whether an idle-timeout threshold is intended.
+            InstanceStatus instanceToClose = activeInstancePool.values().stream()
+                    .filter(status -> status.getStatus() == 0 && status.getLastIdleTime() != null)
+                    .min(Comparator.comparingLong(InstanceStatus::getLastIdleTime))
+                    .orElse(null);

- // 检查是否可以扩容
- if (currentActiveNum < parserConfig.MAX_ACTIVE_INSTANCE) {
- ScaleUpInstance(currentActiveNum);
- } else {
- log.warn("GPU资源充足但已达到最大实例数({}),无法继续扩容", parserConfig.MAX_ACTIVE_INSTANCE);
- }
- }
+            if (instanceToClose != null) {
+                String instanceId = instanceToClose.getInstanceId();
+                // Records whether the process was stopped, so the rollback below only runs while it is still alive.
+                boolean terminated = false;

- //判断是否需要缩容
- if (avgGpuLoad > parserConfig.GPU_LOAD_THRESHOLD) {
- int currentActiveNum = getEffectiveActiveInstanceNum();
+                try {
+                    log.info("缩容空闲实例:{},端口:{}", instanceId, instanceToClose.getPort());
+                    // Take the instance out of rotation first: status 2 and removal from the active pool.
+                    instanceToClose.setStatus(2);
+                    activeInstancePool.remove(instanceId);
+                    // Stop the process before handing the port back, so the port cannot be
+                    // re-allocated while the old process may still be bound to it.
+                    instanceManager.terminateInstance(instanceId);
+                    terminated = true;

- // 检查是否可以缩容
- if (currentActiveNum > parserConfig.MIN_ACTIVE_INSTANCE){
- int needScaleNum = Math.min(parserConfig.GPU_SCALE_INSTANCE_NUM, currentActiveNum - parserConfig.MIN_ACTIVE_INSTANCE);
- ScaleDownInstance(needScaleNum, activeInstancePool);
+                    portPool.releasePort(instanceToClose.getPort());
+                    log.info("缩容实例{}成功,已关闭进程并释放端口{}", instanceId, instanceToClose.getPort());
+                } catch (Exception e) {
+                    log.error("缩容实例{}失败", instanceId, e);
+                    if (!terminated) {
+                        // Termination did not happen: the port is still held by this instance,
+                        // so restore the idle status and put the instance back into the pool.
+                        instanceToClose.setStatus(0);
+                        activeInstancePool.put(instanceId, instanceToClose);
+                    }
+                }
}
}
}
|
|
|
|
|
|
- /**
|
|
|
- * 计算所有活跃实例的平均GPU负载
|
|
|
- * @param activeInstancePool 活跃实例池
|
|
|
- * @return 平均GPU负载(百分比)
|
|
|
- */
|
|
|
- private double calculateAverageGpuLoad(Map<String, InstanceStatus> activeInstancePool) {
|
|
|
- return activeInstancePool.values().stream()
|
|
|
- .filter(status -> status.getStatus() == 1 || status.getStatus() == 0) // 只统计正常状态的实例
|
|
|
- .filter(status -> status.getGpuUsage() != null) // 过滤掉GPU使用率为null的实例
|
|
|
- .mapToDouble(InstanceStatus::getGpuUsage)
|
|
|
- .average()
|
|
|
- .orElse(0.0);
|
|
|
- }
|
|
|
-
|
|
|
//保存实例状态
|
|
|
private InstanceStatus saveInstanceStatus(String instanceId, Integer port) {
|
|
|
InstanceStatus instanceStatus = new InstanceStatus();
|
|
|
@@ -323,6 +313,7 @@ public class InstanceMonitorService {
|
|
|
instanceStatus.setLastHeartbeatTime(System.currentTimeMillis());
|
|
|
instanceStatus.setStatus(0);
|
|
|
instanceStatus.setInstanceId(instanceId);
|
|
|
+ instanceStatus.setLastIdleTime(System.currentTimeMillis());
|
|
|
// 获取并设置进程PID
|
|
|
Long pid = instanceManager.getPid(instanceId);
|
|
|
instanceStatus.setPid(pid);
|
|
|
@@ -332,77 +323,8 @@ public class InstanceMonitorService {
|
|
|
return instanceStatus;
|
|
|
}
|
|
|
|
|
|
- //增加实例
|
|
|
- private void ScaleUpInstance(int currentActiveNum){
|
|
|
- int needCreateNum = parserConfig.GPU_SCALE_INSTANCE_NUM;
|
|
|
- // 防止超过最大实例数
|
|
|
- needCreateNum = Math.min(needCreateNum, parserConfig.MAX_ACTIVE_INSTANCE - currentActiveNum);
|
|
|
-
|
|
|
- log.info("需要扩容{}个解析实例", needCreateNum);
|
|
|
-
|
|
|
- for (int i = 0; i < needCreateNum; i++) {
|
|
|
- // 使用PortPool分配端口
|
|
|
- Integer port = portPool.allocatePort();
|
|
|
- if(port != null){
|
|
|
- String instanceId = instanceManager.startParseInstance(port);
|
|
|
- // 增加实例ID空值校验
|
|
|
- if (instanceId == null || instanceId.isEmpty()) {
|
|
|
- log.error("GPU扩容实例失败:进程创建失败,端口:{}", port);
|
|
|
- portPool.releasePort(port); // 归还端口
|
|
|
- continue;
|
|
|
- }
|
|
|
- InstanceStatus instanceStatus = saveInstanceStatus(instanceId, port);
|
|
|
- log.info("基于GPU负载扩容,已创建实例,实例ID:{},端口:{}", instanceId, port);
|
|
|
- } else {
|
|
|
- log.warn("端口池已满,无法继续GPU扩容");
|
|
|
- break;
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- /**
|
|
|
- * 执行缩容操作:优先关闭负载最低的实例
|
|
|
- * @param needDownNum 需要缩容的实例数
|
|
|
- * @param activeInstancePool 活跃实例池
|
|
|
- */
|
|
|
- private void ScaleDownInstance(int needDownNum, Map<String, InstanceStatus> activeInstancePool) {
|
|
|
- // 1. 筛选出正常运行的实例,并按GPU负载升序排序(负载最低的优先关闭)
|
|
|
- List<Map.Entry<String, InstanceStatus>> sortedInstances = activeInstancePool.entrySet().stream()
|
|
|
- .filter(entry -> entry.getValue().getStatus() == 1 || entry.getValue().getStatus() == 0) // 仅正常实例
|
|
|
- .filter(entry -> entry.getValue().getGpuUsage() != null) // 有GPU负载数据
|
|
|
- .sorted(Comparator.comparingDouble(entry -> entry.getValue().getGpuUsage())) // 升序排序
|
|
|
- .limit(needDownNum) // 只取需要缩容的数量
|
|
|
- .collect(Collectors.toList());
|
|
|
-
|
|
|
- // 2. 逐个关闭实例
|
|
|
- for (Map.Entry<String, InstanceStatus> entry : sortedInstances) {
|
|
|
- String instanceId = entry.getKey();
|
|
|
- InstanceStatus instanceStatus = entry.getValue();
|
|
|
-
|
|
|
- log.info("开始缩容实例:{},GPU负载:{}%,端口:{}",
|
|
|
- instanceId, instanceStatus.getGpuUsage(), instanceStatus.getPort());
|
|
|
-
|
|
|
- try {
|
|
|
- // 标记实例为失联
|
|
|
- instanceStatus.setStatus(2);
|
|
|
- // 从活跃实例池移除
|
|
|
- activeInstancePool.remove(instanceId);
|
|
|
- // 释放端口
|
|
|
- portPool.releasePort(instanceStatus.getPort());
|
|
|
- // 关闭进程
|
|
|
- instanceManager.terminateInstance(instanceId);
|
|
|
-
|
|
|
- log.info("缩容实例{}成功,已关闭进程并释放端口{}", instanceId, instanceStatus.getPort());
|
|
|
- } catch (Exception e) {
|
|
|
- log.error("缩容实例{}失败", instanceId, e);
|
|
|
- // 缩容失败时,将实例重新加入活跃池(避免端口丢失)
|
|
|
- activeInstancePool.put(instanceId, instanceStatus);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
/**
|
|
|
* 执行多模态任务解析任务,阻塞调用底层解析器执行任务
|
|
|
- * 失败任务重试maxRetry次后,转入失败Topic
|
|
|
*/
|
|
|
public ExecuteResponse processMultimodalTask(Task task) {
|
|
|
Map<String, InstanceStatus> activeInstancePool = instancestatusRegistry.getActiveInstancePool();
|
|
|
@@ -421,12 +343,43 @@ public class InstanceMonitorService {
|
|
|
// 检查是否有空闲的解析实例
|
|
|
InstanceStatus idleInstance = findIdleInstance(activeInstancePool);
|
|
|
if (idleInstance == null) {
- log.debug("当前无空闲解析实例");
- taskLogService.logTaskFailure(taskId, "当前无空闲解析实例");
- // 更新任务状态为解析失败
- taskRecordRequest.setStatus(4);
- taskRecordService.updateStatus(taskRecordRequest);
- return ExecuteResponse.fail(300,"当前无空闲解析实例");
+            // No idle instance: try to scale up by one, but only while below the instance cap
+            // and while the average GPU load is under the configured threshold.
+            int currentActiveNum = getEffectiveActiveInstanceNum();
+            if (currentActiveNum < parserConfig.MAX_ACTIVE_INSTANCE) {
+                double avgGpuLoad = calculateAverageGpuLoad(activeInstancePool);
+                if (avgGpuLoad < parserConfig.GPU_LOAD_THRESHOLD) {
+                    log.info("无空闲实例,GPU资源充足,扩容一个实例");
+                    ScaleUpOneInstance();
+
+                    // Poll for an idle instance for at most 10 seconds, checking BEFORE each sleep
+                    // so an instance that is already idle is used without any delay.
+                    final long maxWaitMillis = 10_000L;
+                    final long pollIntervalMillis = 1_000L;
+                    long deadline = System.currentTimeMillis() + maxWaitMillis;
+                    idleInstance = findIdleInstance(activeInstancePool);
+                    while (idleInstance == null && System.currentTimeMillis() < deadline) {
+                        try {
+                            Thread.sleep(pollIntervalMillis);
+                        } catch (InterruptedException e) {
+                            // Preserve the interrupt flag for the caller and stop waiting.
+                            Thread.currentThread().interrupt();
+                            break;
+                        }
+                        idleInstance = findIdleInstance(activeInstancePool);
+                    }
+
+                    if (idleInstance != null) {
+                        log.info("新扩容实例已启动并空闲,开始处理任务");
+                    } else {
+                        // The reported duration is derived from the actual wait limit (the old text said 30s).
+                        log.warn("扩容后等待{}秒仍无空闲实例,任务失败", maxWaitMillis / 1000);
+                    }
+                }
+            }
+            if (idleInstance == null) {
+                log.debug("当前无空闲解析实例");
+                taskLogService.logTaskFailure(taskId, "当前无空闲解析实例");
+                // Mark the task record as parse-failed (status 4) before returning the failure.
+                taskRecordRequest.setStatus(4);
+                taskRecordService.updateStatus(taskRecordRequest);
+                return ExecuteResponse.fail(300,"当前无空闲解析实例");
+            }
}
|
|
|
|
|
|
// 更新任务状态为解析中
|
|
|
@@ -437,7 +390,7 @@ public class InstanceMonitorService {
|
|
|
taskLogService.logInstanceAllocation(taskId, idleInstance.getInstanceId());
|
|
|
|
|
|
// 执行任务解析
|
|
|
- ExecuteResponse response = executeTaskWithRetry(idleInstance, task);
|
|
|
+ ExecuteResponse response = executeTask(idleInstance, task);
|
|
|
|
|
|
// 根据执行结果更新任务状态
|
|
|
if ( response.getCode() == 200) {
|
|
|
@@ -481,41 +434,33 @@ public class InstanceMonitorService {
|
|
|
|
|
|
|
|
|
/**
- * 执行任务并处理重试逻辑
+     * Runs a task on the given instance with a single call to the parser (no retry).
+     *
+     * The instance status is set to 1 for the duration of the call and is always reset
+     * to 0, with a refreshed {@code lastIdleTime}, when the call finishes.
+     *
+     * @param instance the instance that executes the task
+     * @param task     the task to execute
+     * @return the parser response when it is non-null with code 200; otherwise a failure
+     *         response, including when the parser call throws
*/
- private ExecuteResponse executeTaskWithRetry(InstanceStatus instance, Task task) {
+    private ExecuteResponse executeTask(InstanceStatus instance, Task task) {
String instanceId = instance.getInstanceId();
- int retryCount = 0;
// 标记实例为运行中
instance.setStatus(1);

try {
- while (retryCount <= maxRetry ) {
- try {
- log.info("开始执行任务,实例:{},重试次数:{}/{},任务内容:{}",
- instanceId, retryCount, maxRetry, task.getFilePath());
-
- // 调用解析器执行任务
- ExecuteResponse response = callParser(instance, task);
-
- if (response != null && response.getCode() == 200) {
- log.info("任务执行成功,实例:{},响应:{}", instanceId, response);
- return response;
- } else {
- log.warn("任务执行返回失败,实例:{},响应:{},准备重试", instanceId, response);
- retryCount++;
- }
- } catch (Exception e) {
- log.error("任务执行异常,实例:{},重试次数:{}/{}", instanceId, retryCount, maxRetry, e);
- retryCount++;
- }
+            log.info("开始执行任务,实例:{},任务内容:{}", instanceId, task.getFilePath());

- }
+            // Hand the task to the parser exactly once.
+            ExecuteResponse parserResponse = callParser(instance, task);

- return ExecuteResponse.fail("任务执行失败,已达最大重试次数");
+            boolean succeeded = parserResponse != null && parserResponse.getCode() == 200;
+            if (!succeeded) {
+                // A null response or any code other than 200 is reported as a generic failure.
+                log.warn("任务执行失败,实例:{},响应:{}", instanceId, parserResponse);
+                return ExecuteResponse.fail("任务执行失败");
+            }
+            log.info("任务执行成功,实例:{},响应:{}", instanceId, parserResponse.getMessage());
+            return parserResponse;
+        } catch (Exception e) {
+            // Exceptions are converted into a failure response instead of propagating.
+            log.error("任务执行异常,实例:{}", instanceId, e);
+            return ExecuteResponse.fail("任务执行异常: " + e.getMessage());
} finally {
// 恢复实例状态为空闲
instance.setStatus(0);
+            instance.setLastIdleTime(System.currentTimeMillis());
}
}
|
|
|
|
|
|
@@ -569,6 +514,7 @@ public class InstanceMonitorService {
|
|
|
.memoryUsage(status.getMemoryUsage())
|
|
|
.gpuUsage(status.getGpuUsage())
|
|
|
.gpuMemory(status.getGpuMemory())
|
|
|
+ .lastIdleTime(status.getLastIdleTime())
|
|
|
.lastHeartbeatTime(status.getLastHeartbeatTime())
|
|
|
.build())
|
|
|
.collect(Collectors.toList());
|
|
|
@@ -619,5 +565,35 @@ public class InstanceMonitorService {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-}
|
|
|
+    /**
+     * Computes the arithmetic mean of the GPU usage reported by the instances in the pool.
+     * Instances whose GPU usage is {@code null} are ignored.
+     *
+     * NOTE(review): the removed version of this method also restricted the average to
+     * instances with status 0 or 1; this version has no status filter - confirm that is intended.
+     *
+     * @param activeInstancePool instances to average over, keyed by instance id
+     * @return the mean GPU usage, or {@code 0.0} when no instance reports a value
+     */
+    private double calculateAverageGpuLoad(Map<String, InstanceStatus> activeInstancePool) {
+        double averageLoad = activeInstancePool.values().stream()
+                .filter(instance -> instance.getGpuUsage() != null)
+                .mapToDouble(instance -> instance.getGpuUsage())
+                .average()
+                .orElse(0.0);
+        return averageLoad;
+    }
|
|
|
|
|
|
+    /**
+     * Starts one additional parse instance on a port taken from the port pool and
+     * registers its status.
+     *
+     * The allocated port is returned to the pool whenever the instance could not be
+     * started: when the port pool has no port nothing is allocated, when
+     * {@code startParseInstance} returns a null/empty id, and when it throws (the
+     * exception is rethrown unchanged after the port has been released).
+     */
+    private void ScaleUpOneInstance() {
+        Integer port = portPool.allocatePort();
+        if (port == null) {
+            log.warn("端口池已满,无法扩容");
+            return;
+        }
+
+        String instanceId;
+        try {
+            instanceId = instanceManager.startParseInstance(port);
+        } catch (RuntimeException e) {
+            // Without this the port would stay allocated forever after a failed start.
+            portPool.releasePort(port);
+            throw e;
+        }
+
+        if (instanceId == null || instanceId.isEmpty()) {
+            log.error("扩容实例失败:进程创建失败,端口:{}", port);
+            portPool.releasePort(port);
+            return;
+        }
+
+        // The status object returned by saveInstanceStatus is not needed here.
+        saveInstanceStatus(instanceId, port);
+        log.info("扩容,已创建实例,实例ID:{},端口:{}", instanceId, port);
+    }
|
|
|
+}
|