|
|
@@ -0,0 +1,390 @@
|
|
|
+package cn.com.yusys.manager.service;
|
|
|
+
|
|
|
+import cn.com.yusys.manager.common.ParseInstanceStatusRegistry;
|
|
|
+import cn.com.yusys.manager.config.ParserConfig;
|
|
|
+import cn.com.yusys.manager.model.InstanceStatus;
|
|
|
+import cn.com.yusys.manager.model.InstanceStatusResponse;
|
|
|
+import cn.com.yusys.manager.instanceManager.Impl.DockerInstanceManager;
|
|
|
+import cn.com.yusys.manager.util.ParseInstanceClient;
|
|
|
+import cn.com.yusys.manager.common.PortPool;
|
|
|
+import lombok.RequiredArgsConstructor;
|
|
|
+import lombok.extern.slf4j.Slf4j;
|
|
|
+import org.springframework.scheduling.annotation.Scheduled;
|
|
|
+import org.springframework.stereotype.Service;
|
|
|
+
|
|
|
+import javax.annotation.PostConstruct;
|
|
|
+import javax.annotation.Resource;
|
|
|
+import java.util.Comparator;
|
|
|
+import java.util.HashMap;
|
|
|
+import java.util.List;
|
|
|
+import java.util.Map;
|
|
|
+import java.util.concurrent.ConcurrentHashMap;
|
|
|
+import java.util.stream.Collectors;
|
|
|
+
|
|
|
+/**
|
|
|
+ * 解析实例监控核心服务
|
|
|
+ * 该服务负责管理解析实例的生命周期,包括实例的创建、状态监控、心跳检测和负载管理
|
|
|
+ */
|
|
|
+@Slf4j
|
|
|
+@Service
|
|
|
+@RequiredArgsConstructor
|
|
|
+public class InstanceMonitorService {
|
|
|
+
|
|
|
+ // Registry that exposes the pool of active instances (see getActiveInstancePool()).
|
|
|
+ @Resource
|
|
|
+ private ParseInstanceStatusRegistry instancestatusRegistry;
|
|
|
+
|
|
|
+ // Client used to query an instance's status interface. NOTE(review): this field is final, so the Lombok @RequiredArgsConstructor already supplies it through the constructor; the additional @Resource looks redundant - confirm.
|
|
|
+ @Resource
|
|
|
+ private final ParseInstanceClient instanceClient;
|
|
|
+
|
|
|
+ // Parser service configuration (instance limits, thresholds, image name). NOTE(review): final field that is both constructor-injected and annotated with @Resource - confirm which one is intended.
|
|
|
+ @Resource
|
|
|
+ private final ParserConfig parserConfig;
|
|
|
+
|
|
|
+ // Docker instance manager: starts and terminates the instance containers.
|
|
|
+ @Resource
|
|
|
+ private DockerInstanceManager dockerInstanceManager;
|
|
|
+
|
|
|
+ // Port pool used to allocate and release the ports handed to instances.
|
|
|
+ @Resource
|
|
|
+ private PortPool portPool;
|
|
|
+
|
|
|
+ @PostConstruct
|
|
|
+ public void initParseInstance(){
|
|
|
+ log.info("开始初始化解析实例...");
|
|
|
+ //启动时初始化实例
|
|
|
+ for (int i = 0; i < parserConfig.MIN_ACTIVE_INSTANCE; i++) {
|
|
|
+ // 4. 使用PortPool分配端口
|
|
|
+ Integer port = portPool.allocatePort();
|
|
|
+ if(port != null){
|
|
|
+ String containerId = dockerInstanceManager.startParseInstance(parserConfig.IMAGE_NAME, port);
|
|
|
+ // 增加容器ID空值校验
|
|
|
+ if (containerId == null || containerId.isEmpty()) {
|
|
|
+ log.error("初始化实例失败:Docker容器创建失败,端口:{}", port);
|
|
|
+ portPool.releasePort(port); // 归还端口
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ InstanceStatus instanceStatus = saveInstanceStatus(containerId, port);
|
|
|
+ } else {
|
|
|
+ log.error("初始化实例失败:无可用端口");
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ log.info("解析实例初始化完成,等待30秒后启动监控任务");
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 核心监控定时任务:
|
|
|
+ * - initialDelay = 30000:首次执行延迟30秒
|
|
|
+ * - fixedRate = 5000:之后每5秒执行一次
|
|
|
+ */
|
|
|
+ @Scheduled(initialDelay = 30000, fixedRate = 5000)
|
|
|
+ public void parserInstanceMonitor() {
|
|
|
+ Map<String, InstanceStatus> activeInstancePool = instancestatusRegistry.getActiveInstancePool();
|
|
|
+ try {
|
|
|
+ log.info("执行解析实例监控,当前活跃实例数:{}", activeInstancePool.size());
|
|
|
+
|
|
|
+ // 1. 主动调用/status接口更新实例状态
|
|
|
+ updateInstanceStatusByApi();
|
|
|
+
|
|
|
+ // 2. 心跳超时检测
|
|
|
+ checkHeartbeatTimeout();
|
|
|
+
|
|
|
+ // 3. 校验活跃实例数,触发实例拉起
|
|
|
+ checkAndSpinUpInstance();
|
|
|
+
|
|
|
+ // 4. 根据GPU负载动态扩缩容
|
|
|
+ checkAndScaleUpByGpuLoad();
|
|
|
+
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("解析实例监控任务执行失败", e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 心跳超时检测:超过阈值停止实例
|
|
|
+ */
|
|
|
+ private void checkHeartbeatTimeout() {
|
|
|
+ long now = System.currentTimeMillis();
|
|
|
+ Map<String, InstanceStatus> activeInstancePool = instancestatusRegistry.getActiveInstancePool();
|
|
|
+ // 复制为新集合,避免遍历中修改原集合
|
|
|
+ Map<String, InstanceStatus> copyPool = new HashMap<>(activeInstancePool);
|
|
|
+
|
|
|
+ copyPool.forEach((instanceId, state) -> {
|
|
|
+ if (state.getLastHeartbeatTime() == null || now - state.getLastHeartbeatTime() > parserConfig.HEARTBEAT_TIMEOUT) {
|
|
|
+ log.warn("实例{}心跳超时,标记为失联", instanceId);
|
|
|
+ state.setStatus(2);
|
|
|
+ // 原子操作移除实例
|
|
|
+ activeInstancePool.remove(instanceId);
|
|
|
+ // 使用PortPool释放端口
|
|
|
+ portPool.releasePort(state.getPort());
|
|
|
+ // 捕获Docker操作异常
|
|
|
+ try {
|
|
|
+ dockerInstanceManager.terminateInstance(state.getContainerId());
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.error("终止实例{}的Docker容器失败", instanceId, e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ });
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+    /**
+     * Polls the status interface of every instance in the active pool and stores the result.
+     *
+     * <p>Only instances whose status is 0 or 1 are polled. A successful answer overwrites
+     * the status and load figures, refreshes the heartbeat timestamp and resets the failure
+     * counter. Every kind of failure (exception, missing response, response without data)
+     * increments the failure counter; once the counter reaches
+     * {@code parserConfig.STATUS_QUERY_FAIL_COUNT} the instance is set to status 2.
+     */
+    private void updateInstanceStatusByApi() {
+        Map<String, InstanceStatus> activeInstancePool = instancestatusRegistry.getActiveInstancePool();
+        activeInstancePool.forEach(this::refreshInstanceStatus);
+    }
+
+    /**
+     * Queries one instance and applies the answer, or records a failed query.
+     *
+     * @param instanceId     key of the instance in the active pool
+     * @param instancestatus mutable state object of that instance
+     */
+    private void refreshInstanceStatus(String instanceId, InstanceStatus instancestatus) {
+        // Instances with any status other than 0 or 1 are skipped.
+        if (instancestatus.getStatus() != 1 && instancestatus.getStatus() != 0) {
+            return;
+        }
+
+        InstanceStatusResponse response;
+        try {
+            response = instanceClient.getInstanceStatus(instancestatus.getIp(), instancestatus.getPort());
+        } catch (Exception e) {
+            log.error("调用实例{}状态接口失败", instanceId, e);
+            recordStatusQueryFailure(instanceId, instancestatus);
+            return;
+        }
+
+        if (response == null) {
+            recordStatusQueryFailure(instanceId, instancestatus);
+            return;
+        }
+
+        InstanceStatusResponse.InstanceStatusData data = response.getData();
+        if (data == null) {
+            log.warn("实例{}状态接口返回数据为空", instanceId);
+            // This path used to skip the threshold check, so an instance that kept
+            // answering without data was never marked as lost by the status poll.
+            recordStatusQueryFailure(instanceId, instancestatus);
+            return;
+        }
+
+        instancestatus.setStatus(data.getStatus());
+        instancestatus.setStatusQueryFailCount(0); // a good answer resets the failure streak
+
+        instancestatus.setCpuUsage(data.getCpuUsage());
+        instancestatus.setMemoryUsage(data.getMemoryUsage());
+        instancestatus.setGpuUsage(data.getGpuUsage());
+        instancestatus.setLastHeartbeatTime(System.currentTimeMillis());
+        instancestatus.setGpuMemory(data.getGpuMemory());
+        log.info("实例{}状态查询成功,状态:{}", instanceId, data.getStatus());
+    }
+
+    /**
+     * Counts one failed status query and marks the instance as lost (status 2) once the
+     * counter reaches {@code parserConfig.STATUS_QUERY_FAIL_COUNT}.
+     *
+     * @param instanceId     key of the instance, used for logging
+     * @param instancestatus state object whose failure counter is incremented
+     */
+    private void recordStatusQueryFailure(String instanceId, InstanceStatus instancestatus) {
+        int failCount = instancestatus.getStatusQueryFailCount() + 1;
+        instancestatus.setStatusQueryFailCount(failCount);
+        log.warn("实例{}状态查询失败,累计失败次数:{}", instanceId, failCount);
+
+        if (failCount >= parserConfig.STATUS_QUERY_FAIL_COUNT) {
+            log.warn("实例{}连续{}次状态查询失败,标记为失联", instanceId, parserConfig.STATUS_QUERY_FAIL_COUNT);
+            instancestatus.setStatus(2);
+        }
+    }
|
|
|
+
|
|
|
+    /**
+     * Starts new instances when fewer than the configured minimum are healthy.
+     *
+     * <p>The number of instances to start is the gap to
+     * {@code parserConfig.MIN_ACTIVE_INSTANCE}, capped by the remaining room below
+     * {@code parserConfig.MAX_ACTIVE_INSTANCE}. Nothing is started when that number is not
+     * positive. A port whose container could not be started (exception or empty container
+     * id) is returned to the pool; the loop stops when no port is available.
+     */
+    private void checkAndSpinUpInstance() {
+        int currentActiveNum = getEffectiveActiveInstanceNum();
+        log.info("当前有效活跃实例数:{},最小要求:{}", currentActiveNum, parserConfig.MIN_ACTIVE_INSTANCE);
+
+        if (!needSpinUpInstance(currentActiveNum)) {
+            return;
+        }
+
+        int needCreateNum = Math.min(
+                parserConfig.MIN_ACTIVE_INSTANCE - currentActiveNum,
+                parserConfig.MAX_ACTIVE_INSTANCE - currentActiveNum);
+        if (needCreateNum <= 0) {
+            // Reached when the spin-up was requested for a reason other than being below
+            // the minimum; there is nothing to create, so do not log a misleading count.
+            return;
+        }
+
+        log.info("需要拉起{}个解析实例", needCreateNum);
+        for (int i = 0; i < needCreateNum; i++) {
+            Integer port = portPool.allocatePort();
+            if (port == null) {
+                log.error("创建实例失败:无可用端口");
+                break;
+            }
+
+            String containerId;
+            try {
+                containerId = dockerInstanceManager.startParseInstance(parserConfig.IMAGE_NAME, port);
+            } catch (Exception e) {
+                // Without this the allocated port would never be returned to the pool.
+                log.error("创建实例失败:Docker容器创建异常,端口:{}", port, e);
+                portPool.releasePort(port);
+                continue;
+            }
+
+            if (containerId == null || containerId.isEmpty()) {
+                log.error("创建实例失败:Docker容器创建失败,端口:{}", port);
+                portPool.releasePort(port); // give the unused port back
+                continue;
+            }
+            saveInstanceStatus(containerId, port);
+        }
+    }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 获取有效活跃实例数
|
|
|
+ */
|
|
|
+ private int getEffectiveActiveInstanceNum() {
|
|
|
+ Map<String, InstanceStatus> activeInstancePool = instancestatusRegistry.getActiveInstancePool();
|
|
|
+
|
|
|
+ return (int) activeInstancePool.values().stream()
|
|
|
+ .filter(info -> info.getStatus()==1 || info.getStatus()==0)
|
|
|
+ .count();
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 判断是否需要拉起实例
|
|
|
+ */
|
|
|
+ private boolean needSpinUpInstance(int currentActiveNum) {
|
|
|
+ // 条件1:活跃数小于最小值
|
|
|
+ boolean isLessThanMin = currentActiveNum < parserConfig.MIN_ACTIVE_INSTANCE;
|
|
|
+ // 条件2:任务积压超过阈值
|
|
|
+ boolean isTaskBacklog = getKafkaTaskBacklog() > parserConfig.TASK_BACKLOG_THRESHOLD;
|
|
|
+
|
|
|
+ return isLessThanMin || isTaskBacklog;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Returns the Kafka task backlog.
|
|
|
+ * Placeholder: no backlog is queried here, the method always returns 0.
|
|
|
+ *
|
|
|
+ * @return the backlog size; currently the constant 0
|
|
|
+ */
|
|
|
+ private int getKafkaTaskBacklog() {
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 根据GPU负载检测并扩缩容实例
|
|
|
+ */
|
|
|
+ private void checkAndScaleUpByGpuLoad() {
|
|
|
+ Map<String, InstanceStatus> activeInstancePool = instancestatusRegistry.getActiveInstancePool();
|
|
|
+
|
|
|
+ // 计算所有活跃实例的平均GPU负载
|
|
|
+ double avgGpuLoad = calculateAverageGpuLoad(activeInstancePool);
|
|
|
+ log.info("当前平均GPU负载:{}%,阈值:{}%", avgGpuLoad, parserConfig.GPU_LOAD_THRESHOLD);
|
|
|
+
|
|
|
+ // 判断是否需要扩容
|
|
|
+ if (avgGpuLoad < parserConfig.GPU_LOAD_THRESHOLD) {
|
|
|
+ int currentActiveNum = getEffectiveActiveInstanceNum();
|
|
|
+
|
|
|
+ // 检查是否可以扩容
|
|
|
+ if (currentActiveNum < parserConfig.MAX_ACTIVE_INSTANCE) {
|
|
|
+ ScaleUpInstance(currentActiveNum);
|
|
|
+ } else {
|
|
|
+ log.warn("GPU资源充足但已达到最大实例数({}),无法继续扩容", parserConfig.MAX_ACTIVE_INSTANCE);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ //判断是否需要缩容
|
|
|
+ if (avgGpuLoad > parserConfig.GPU_LOAD_THRESHOLD) {
|
|
|
+ int currentActiveNum = getEffectiveActiveInstanceNum();
|
|
|
+
|
|
|
+ // 检查是否可以缩容
|
|
|
+ if (currentActiveNum > parserConfig.MIN_ACTIVE_INSTANCE){
|
|
|
+ int needScaleNum = Math.min(parserConfig.GPU_SCALE_INSTANCE_NUM, currentActiveNum - parserConfig.MIN_ACTIVE_INSTANCE);
|
|
|
+ ScaleDownInstance(needScaleNum, activeInstancePool);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 计算所有活跃实例的平均GPU负载
|
|
|
+ * @param activeInstancePool 活跃实例池
|
|
|
+ * @return 平均GPU负载(百分比)
|
|
|
+ */
|
|
|
+ private double calculateAverageGpuLoad(Map<String, InstanceStatus> activeInstancePool) {
|
|
|
+ return activeInstancePool.values().stream()
|
|
|
+ .filter(status -> status.getStatus() == 1 || status.getStatus() == 0) // 只统计正常状态的实例
|
|
|
+ .filter(status -> status.getGpuUsage() != null) // 过滤掉GPU使用率为null的实例
|
|
|
+ .mapToDouble(InstanceStatus::getGpuUsage)
|
|
|
+ .average()
|
|
|
+ .orElse(0.0);
|
|
|
+ }
|
|
|
+
|
|
|
+ //保存实例状态
|
|
|
+ private InstanceStatus saveInstanceStatus(String containerId,Integer port) {
|
|
|
+ InstanceStatus instanceStatus = new InstanceStatus();
|
|
|
+ instanceStatus.setIp("127.0.0.1");
|
|
|
+ instanceStatus.setPort(port);
|
|
|
+ instanceStatus.setLastHeartbeatTime(System.currentTimeMillis());
|
|
|
+ instanceStatus.setStatus(0);
|
|
|
+
|
|
|
+ Map<String, InstanceStatus> activeInstancePool = instancestatusRegistry.getActiveInstancePool();
|
|
|
+
|
|
|
+ instanceStatus.setContainerId(containerId);
|
|
|
+ activeInstancePool.put(containerId, instanceStatus);
|
|
|
+ return instanceStatus;
|
|
|
+ }
|
|
|
+
|
|
|
+    /**
+     * Starts additional instances as part of GPU-load based scaling.
+     *
+     * <p>At most {@code parserConfig.GPU_SCALE_INSTANCE_NUM} instances are started, capped by
+     * the remaining room below {@code parserConfig.MAX_ACTIVE_INSTANCE}. A port whose
+     * container could not be started (exception or empty container id) is returned to the
+     * pool; the loop stops when no port is available.
+     *
+     * @param currentActiveNum number of currently healthy instances
+     */
+    private void ScaleUpInstance(int currentActiveNum){
+        int needCreateNum = Math.min(
+                parserConfig.GPU_SCALE_INSTANCE_NUM,
+                parserConfig.MAX_ACTIVE_INSTANCE - currentActiveNum);
+        if (needCreateNum <= 0) {
+            // Nothing to create; avoid logging a zero or negative scale-up count.
+            return;
+        }
+
+        log.info("需要扩容{}个解析实例", needCreateNum);
+
+        for (int i = 0; i < needCreateNum; i++) {
+            Integer port = portPool.allocatePort();
+            if (port == null) {
+                log.warn("端口池已满,无法继续GPU扩容");
+                break;
+            }
+
+            String containerId;
+            try {
+                containerId = dockerInstanceManager.startParseInstance(parserConfig.IMAGE_NAME, port);
+            } catch (Exception e) {
+                // Without this the allocated port would never be returned to the pool.
+                log.error("GPU扩容实例失败:Docker容器创建异常,端口:{}", port, e);
+                portPool.releasePort(port);
+                continue;
+            }
+
+            if (containerId == null || containerId.isEmpty()) {
+                log.error("GPU扩容实例失败:Docker容器创建失败,端口:{}", port);
+                portPool.releasePort(port); // give the unused port back
+                continue;
+            }
+            saveInstanceStatus(containerId, port);
+            log.info("基于GPU负载扩容,已创建实例,容器ID:{},端口:{}", containerId, port);
+        }
+    }
|
|
|
+    /**
+     * Shuts down instances, starting with the ones that report the lowest GPU usage.
+     *
+     * <p>Candidates are the instances with status 1 or 0 that have a GPU usage value;
+     * instances without GPU data are never selected. Each selected instance is set to
+     * status 2 and its container is terminated. Only after the container has been
+     * terminated is the instance removed from the pool and its port released. When the
+     * termination fails, the instance keeps its pool entry and its port and gets its
+     * previous status back, so a later pass can try again.
+     *
+     * @param needDownNum        number of instances to shut down
+     * @param activeInstancePool pool of active instances; entries are removed from it
+     */
+    private void ScaleDownInstance(int needDownNum, Map<String, InstanceStatus> activeInstancePool) {
+        // 1. Pick the healthy instances with the lowest GPU usage.
+        List<Map.Entry<String, InstanceStatus>> sortedInstances = activeInstancePool.entrySet().stream()
+                .filter(entry -> entry.getValue().getStatus() == 1 || entry.getValue().getStatus() == 0)
+                .filter(entry -> entry.getValue().getGpuUsage() != null)
+                .sorted(Comparator.comparingDouble(entry -> entry.getValue().getGpuUsage()))
+                .limit(needDownNum)
+                .collect(Collectors.toList());
+
+        // 2. Shut them down one by one.
+        for (Map.Entry<String, InstanceStatus> entry : sortedInstances) {
+            String instanceId = entry.getKey();
+            InstanceStatus instanceStatus = entry.getValue();
+
+            log.info("开始缩容实例:{},GPU负载:{}%,端口:{}",
+                    instanceId, instanceStatus.getGpuUsage(), instanceStatus.getPort());
+
+            int previousStatus = instanceStatus.getStatus();
+            try {
+                // Mark the instance first, then stop the container.
+                instanceStatus.setStatus(2);
+                dockerInstanceManager.terminateInstance(instanceStatus.getContainerId());
+            } catch (Exception e) {
+                log.error("缩容实例{}失败", instanceId, e);
+                // The container may still be running on its port: keep the pool entry and
+                // the port reservation, and restore the status. Previously the port had
+                // already been released here and the entry was re-added with status 2.
+                instanceStatus.setStatus(previousStatus);
+                continue;
+            }
+
+            // The container is gone, so the bookkeeping can be released safely.
+            activeInstancePool.remove(instanceId);
+            portPool.releasePort(instanceStatus.getPort());
+            log.info("缩容实例{}成功,已关闭容器并释放端口{}", instanceId, instanceStatus.getPort());
+        }
+    }
|
|
|
+}
|