Anomaly Recognition Module

jiaqiang, 1 month ago
Parent commit
9b37194a66

+ 1098 - 0
llmops/agents/anomaly_recognizer_agent.py

@@ -0,0 +1,1098 @@
+
+from langgraph.prebuilt import create_react_agent
+from langchain_openai import ChatOpenAI
+from typing import Dict, List, Any, Optional
+import pandas as pd
+import json
+from datetime import datetime
+from pathlib import Path
+import numpy as np
+from llmops.agents.tools.balance_info_missing_recognizer import BalanceInfoMissingRecognizer
+from llmops.agents.tools.inactive_account_recognizer import InactiveAccountRecognizer
+from llmops.agents.tools.balance_recognizer import BalanceContinuityRecognizer
+from llmops.agents.tools.night_transaction_recognizer import NightTransactionRecognizer
+from llmops.agents.tools.high_frequency_transaction_recognizer import HighFrequencyTransactionRecognizer
+from llmops.agents.tools.large_amount_transaction_recognizer import LargeAmountTransactionRecognizer
+from llmops.agents.tools.occasional_high_integer_transaction_recognizer import OccasionalHighIntegerTransactionRecognizer
+from llmops.agents.tools.low_interest_rate_recognizer import LowInterestRateRecognizer
+from llmops.agents.tools.over_book_transaction_recognizer import OverBookTransactionRecognizer
+from llmops.agents.data_manager import DataManager
+from llmops.config import LLM_API_KEY, LLM_BASE_URL, LLM_MODEL_NAME, anomaly_recognizer_config
+
+
+class AnomalyRecognitionAgent:
+    """Anomaly recognition agent."""
+
+    def __init__(self, csv_path: str, api_key: str, base_url: str = "https://api.deepseek.com",
+                 model_name: str = "deepseek-chat", config: Optional[Dict] = None):
+        """
+        Initialize the anomaly recognition agent.
+        """
+        self.csv_path = csv_path
+        self.llm = ChatOpenAI(
+            model=model_name,
+            api_key=api_key,
+            base_url=base_url,
+            temperature=0.1
+        )
+        self.config = config or {}
+        self.transaction_data = None
+        self.data_summary = {}
+        self.recognizer_tools = []
+        self.agent = None
+        self.recognition_results = {}
+
+        # Initialize the recognizer tools
+        self._initialize_recognizers()
+
+        # Initialize the ReAct agent (the LLM is always constructed above)
+        self._initialize_agent()
+
+    def _initialize_recognizers(self):
+        """Initialize all anomaly recognizer tools."""
+        # Balance-info-missing check
+        if self.config.get('enable_balance_missing_check', True):
+            balance_missing_config = self.config.get('balance_missing_check', {})
+            self.recognizer_tools.append(BalanceInfoMissingRecognizer(
+                csv_path=self.csv_path,
+                config={'balance_missing_check': balance_missing_config}
+            ))
+            print("✅ Initialized balance-info-missing checker (high priority)")
+
+        # Long-term inactive account recognizer
+        if self.config.get('enable_inactive_account_check', True):
+            inactive_account_config = self.config.get('inactive_account_check', {})
+            self.recognizer_tools.append(InactiveAccountRecognizer(
+                csv_path=self.csv_path,
+                config={'inactive_account_check': inactive_account_config}
+            ))
+            print("✅ Initialized inactive-account recognizer (high priority)")
+
+        # Balance continuity recognizer
+        if self.config.get('enable_balance_recognition', True):
+            self.recognizer_tools.append(BalanceContinuityRecognizer(csv_path=self.csv_path))
+            print("✅ Initialized balance-continuity recognizer")
+
+        # Night-time transaction recognizer
+        if self.config.get('enable_night_recognition', True):
+            night_config = self.config.get('night_recognition', {})
+            self.recognizer_tools.append(NightTransactionRecognizer(
+                csv_path=self.csv_path,
+                config={'night_transaction': night_config}
+            ))
+            print("✅ Initialized night-time transaction recognizer")
+
+        # High-frequency transaction recognizer
+        if self.config.get('enable_high_frequency_recognition', True):
+            high_freq_config = self.config.get('high_frequency_recognition', {})
+            self.recognizer_tools.append(HighFrequencyTransactionRecognizer(
+                csv_path=self.csv_path,
+                config={'high_frequency': high_freq_config}
+            ))
+            print("✅ Initialized high-frequency transaction recognizer")
+
+        # Large-amount transaction recognizer
+        if self.config.get('enable_large_amount_recognition', True):
+            large_amount_recognition_config = self.config.get('large_amount_recognition', {})
+            self.recognizer_tools.append(LargeAmountTransactionRecognizer(
+                csv_path=self.csv_path,
+                config={'large_amount_recognition': large_amount_recognition_config}
+            ))
+            print("✅ Initialized large-amount transaction recognizer")
+
+        # Occasional high-value round-number transaction recognizer
+        if self.config.get('enable_occasional_high_integer_recognition', True):
+            integer_config = self.config.get('occasional_high_integer_transaction', {})
+            self.recognizer_tools.append(OccasionalHighIntegerTransactionRecognizer(
+                csv_path=self.csv_path,
+                config={'occasional_high_integer_transaction': integer_config}
+            ))
+            print("✅ Initialized occasional high-value round-number transaction recognizer")
+
+        # Low-interest-rate settlement recognizer
+        if self.config.get('enable_low_interest_rate_recognition', True):
+            interest_config = self.config.get('low_interest_rate_recognition', {})
+            self.recognizer_tools.append(LowInterestRateRecognizer(
+                csv_path=self.csv_path,
+                config={'interest_rate_check': interest_config}
+            ))
+            print("✅ Initialized low-interest-rate settlement recognizer (high priority)")
+
+        # Suspected pass-through (book-transfer) transaction recognizer
+        if self.config.get('enable_over_book_transaction_recognition', True):  # config key uses "over_book", not "overbook"
+            overbook_config = self.config.get('over_book_transaction_recognition', {})
+            self.recognizer_tools.append(OverBookTransactionRecognizer(
+                csv_path=self.csv_path,
+                config={'over_book_transaction_recognition': overbook_config}
+            ))
+            print("✅ Initialized suspected pass-through transaction recognizer")
+
+        print(f"📋 Initialized {len(self.recognizer_tools)} recognizers in total")
+
+    def _initialize_agent(self):
+        """Initialize the ReAct agent."""
+        try:
+            # Make sure every tool has a clear description
+            for tool in self.recognizer_tools:
+                # If the description is too short, add an explanation
+                if len(tool.description) < 30:
+                    tool.description = f"Analyze bank statement data for {tool.display_name}"
+
+            # Create the agent
+            self.agent = create_react_agent(
+                model=self.llm,
+                tools=self.recognizer_tools
+            )
+
+            print("🤖 Anomaly recognition agent initialized successfully")
+            print(f"🛠️ Loaded {len(self.recognizer_tools)} tools:")
+
+            for i, tool in enumerate(self.recognizer_tools, 1):
+                print(f"  {i}. {tool.display_name} ({tool.name})")
+                print(f"     Description: {tool.description}")
+
+            # Check that the tools expose the expected interface
+            print("🧪 Testing tool availability...")
+            for tool in self.recognizer_tools:
+                try:
+                    # Verify the tool's basic attributes
+                    has_run = hasattr(tool, '_run')
+                    has_name = hasattr(tool, 'name')
+                    has_desc = hasattr(tool, 'description')
+                    print(
+                        f"  ✓ {tool.name}: interface complete" if all([has_run, has_name, has_desc]) else f"  ⚠️ {tool.name}: interface incomplete")
+                except Exception:
+                    print(f"  ❌ {tool.name}: test failed")
+        except Exception as e:
+            print(f"Agent initialization failed: {e}")
+            import traceback
+            traceback.print_exc()
+            self.agent = None
+
+    def load_transaction_data(self) -> pd.DataFrame:
+        """Load the transaction data."""
+        try:
+            print(f"📥 Loading transaction data: {self.csv_path}")
+            self.transaction_data = DataManager.load_from_standardized_csv(self.csv_path)
+            self.data_summary = self._generate_data_summary()
+            return self.transaction_data
+        except Exception as e:
+            print(f"Data loading failed: {e}")
+            raise
+
+    def _generate_data_summary(self) -> Dict[str, Any]:
+        """Generate a summary of the data."""
+        if self.transaction_data is None or len(self.transaction_data) == 0:
+            return {}
+
+        df = self.transaction_data
+
+        # Note: '收入' (income) and '支出' (expense) are the direction labels used
+        # in the standardized CSV data, so they are matched as-is.
+        summary = {
+            'transaction_count': len(df),
+            'date_range': {
+                'start': df['txDate'].min() if 'txDate' in df.columns else 'unknown',
+                'end': df['txDate'].max() if 'txDate' in df.columns else 'unknown'
+            },
+            'total_amount': float(df['txAmount'].sum()) if 'txAmount' in df.columns else 0,
+            'income_amount': float(df[df['txDirection'] == '收入']['txAmount'].sum())
+            if 'txAmount' in df.columns and 'txDirection' in df.columns else 0,
+            'expense_amount': float(df[df['txDirection'] == '支出']['txAmount'].sum())
+            if 'txAmount' in df.columns and 'txDirection' in df.columns else 0,
+            'average_amount': float(df['txAmount'].mean()) if 'txAmount' in df.columns else 0,
+            'max_amount': float(df['txAmount'].max()) if 'txAmount' in df.columns else 0,
+            'min_amount': float(df['txAmount'].min()) if 'txAmount' in df.columns else 0,
+            'unique_days': df['datetime'].dt.date.nunique() if 'datetime' in df.columns else 0,
+            'direction_distribution': df['txDirection'].value_counts().to_dict()
+            if 'txDirection' in df.columns else {}
+        }
+
+        return summary
+
+    def execute_full_recognition(self) -> Dict[str, Any]:
+        """Run the full anomaly recognition pipeline."""
+        if self.transaction_data is None:
+            raise ValueError("Load the transaction data first")
+
+        print("🔍 Starting bank statement anomaly recognition...")
+
+        # Clear previous results
+        self.recognition_results = {
+            'agent_results': None,
+            'direct_results': None,
+            'all_anomalies': [],
+            'summary': {}
+        }
+
+        # Direct recognition first (currently disabled)
+        # try:
+        #     direct_results = self._execute_direct_recognition()
+        #     self.recognition_results['direct_results'] = direct_results
+        # except Exception as e:
+        #     print(f"⚠️ Direct anomaly recognition failed: {e}")
+
+        # Agent-based recognition (if available)
+        if self.agent:
+            try:
+                agent_results = self._execute_agent_recognition()
+                self.recognition_results['agent_results'] = agent_results
+            except Exception as e:
+                print(f"⚠️ Agent anomaly recognition failed: {e}")
+        else:
+            print("⚠️ Agent not initialized; skipping agent recognition")
+
+        # Consolidate all recognized anomalies
+        self._consolidate_anomalies()
+
+        # Generate the recognition summary
+        self._generate_recognition_summary()
+
+        print("✅ Anomaly recognition finished")
+
+        return self.recognition_results
+
+    def _execute_direct_recognition(self) -> Dict[str, Any]:
+        """Run direct anomaly recognition, calling each recognizer in turn."""
+        print("🚀 Starting direct anomaly recognition...")
+
+        results = {}
+        all_anomalies = []
+
+        for recognizer in self.recognizer_tools:
+            try:
+                print(f"  🔍 Running {recognizer.display_name}...")
+                # Pass no arguments; the recognizer falls back to the csv_path given at init
+                result = recognizer._run()
+                results[recognizer.display_name] = result
+
+                # Process the result
+                if isinstance(result, str):
+                    # String result
+                    print(f"  📝 {recognizer.display_name}: {result[:100]}...")
+                elif isinstance(result, dict):
+                    # Dict result
+                    if 'identified_anomalies' in result:
+                        for anomaly in result['identified_anomalies']:
+                            anomaly['recognition_type'] = recognizer.display_name
+                            all_anomalies.append(anomaly)
+
+                    anomaly_count = result.get('identified_count', 0)
+                    status = result.get('recognition_status', 'unknown')
+                    print(f"  ✅ {recognizer.display_name}: done, found {anomaly_count} anomalies ({status})")
+
+            except Exception as e:
+                error_msg = f"{recognizer.display_name} recognition failed: {e}"
+                print(f"  ❌ {error_msg}")
+
+        return {
+            'results': results,
+            'all_anomalies': all_anomalies,
+            'total_recognizers': len(self.recognizer_tools),
+            'completed_recognizers': len(results)
+        }
+
+    def _execute_agent_recognition(self) -> Dict[str, Any]:
+        """Run agent-based anomaly recognition."""
+        print("🤖 Starting agent-based anomaly recognition...")
+
+        try:
+            # If a previous run already flagged missing balance info, warn the LLM about it
+            balance_missing_alert = ""
+            agent_results = self.recognition_results.get('agent_results', {})
+            if agent_results and 'all_anomalies' in agent_results:
+                for anomaly in agent_results['all_anomalies']:
+                    if anomaly.get('check_type') == 'balance_info_missing':
+                        balance_missing_alert = """
+                                ⚠️ **Important**:
+                                A data-integrity anomaly was detected: the bank statement lacks a balance field!
+                                This affects the accuracy of the following analyses:
+                                1. Balance continuity checks (may be impossible to run)
+                                2. Fund balance fluctuation analysis
+                                3. Matching transactions against balances
+
+                                Please take this limitation into account during analysis.
+                                """
+                        break
+
+            # Prepare the tool information
+            tools_info = self._prepare_tools_info_for_prompt()
+
+            # Generate the generic prompt, appending the alert if one was raised
+            prompt = self._generate_universal_prompt(tools_info) + balance_missing_alert
+
+            # Create the initial state
+            initial_state = {
+                "messages": [
+                    {
+                        "role": "system",
+                        "content": self._get_universal_system_prompt()
+                    },
+                    {
+                        "role": "user",
+                        "content": prompt
+                    }
+                ]
+            }
+
+            print("🔄 Running the agent...")
+            print("📋 Prompt sent:")
+            print("-" * 50)
+            print(prompt[:500] + "..." if len(prompt) > 500 else prompt)
+            print("-" * 50)
+
+            # Execute the agent
+            result = self.agent.invoke(initial_state)
+
+            print(f"✅ Agent finished with {len(result['messages'])} messages")
+
+            # Process the result
+            agent_output = self._process_agent_result(result)
+
+            # If no tool was called, fall back to the backup plan
+            if len(agent_output['tool_calls']) == 0:
+                print("⚠️ The agent called no tools; starting the backup plan...")
+                backup_result = self._execute_backup_recognition()
+                agent_output['all_anomalies'].extend(backup_result['all_anomalies'])
+                agent_output['backup_used'] = True
+
+            print(f"🤖 Final tally: {len(agent_output['tool_calls'])} tool calls, {len(agent_output['all_anomalies'])} anomalies")
+
+            return agent_output
+
+        except Exception as e:
+            error_msg = f"Agent recognition failed: {str(e)}"
+            print(f"❌ {error_msg}")
+            import traceback
+            traceback.print_exc()
+
+            return {
+                'final_output': f"Agent recognition failed: {error_msg}",
+                'tool_calls': [],
+                'tool_results': [],
+                'all_anomalies': [],
+                'error': str(e)
+            }
+
+    def _execute_backup_recognition(self) -> Dict[str, Any]:
+        """Backup recognition plan: call every tool directly."""
+        print("🔄 Starting backup recognition plan: calling all tools directly...")
+
+        backup_results = {
+            'all_anomalies': [],
+            'tool_results': [],
+            'tool_names': []
+        }
+
+        for recognizer in self.recognizer_tools:
+            print(f"  🔧 Calling {recognizer.display_name}...")
+            try:
+                result = recognizer._run(csv_path=self.csv_path)
+                backup_results['tool_results'].append(result)
+                backup_results['tool_names'].append(recognizer.name)
+
+                # Extract anomalies
+                if isinstance(result, dict):
+                    if 'identified_anomalies' in result:
+                        anomalies = result['identified_anomalies']
+                        for anomaly in anomalies:
+                            standardized = self._standardize_anomaly_record(anomaly, result)
+                            backup_results['all_anomalies'].append(standardized)
+                        print(f"    Found {len(anomalies)} anomalies")
+                    elif 'identified_count' in result:
+                        print(f"    Tool reported {result['identified_count']} anomalies (but no detailed records were found)")
+                else:
+                    print(f"    Tool returned a non-dict result: {type(result)}")
+
+            except Exception as e:
+                print(f"    ❌ Tool call failed: {e}")
+
+        print(f"✅ Backup plan done: called {len(backup_results['tool_names'])} tools, found {len(backup_results['all_anomalies'])} anomalies")
+
+        return backup_results
+
+    def _process_agent_result(self, result: Any) -> Dict[str, Any]:
+        """Process the agent's execution result."""
+        agent_output = {
+            'final_output': '',
+            'tool_calls': [],
+            'tool_results': [],
+            'all_anomalies': [],
+            'messages_analysis': []
+        }
+
+        # Walk through the message stream
+        for i, message in enumerate(result["messages"]):
+            msg_info = {
+                'index': i + 1,
+                'type': message.type,
+                'has_tool_calls': False,
+                'tool_call_count': 0
+            }
+
+            # Record tool calls
+            if hasattr(message, 'tool_calls') and message.tool_calls:
+                tool_calls = message.tool_calls
+                agent_output['tool_calls'].extend(tool_calls)
+                msg_info['has_tool_calls'] = True
+                msg_info['tool_call_count'] = len(tool_calls)
+
+                print(f"🛠️ Message {i + 1}: found {len(tool_calls)} tool calls")
+                for tc in tool_calls:
+                    print(f"  Tool: {tc.get('name', 'unknown')}")
+                    print(f"  Args: {tc.get('args', {})}")
+
+            # Handle tool return values
+            if message.type == 'tool':
+                content = message.content
+                agent_output['tool_results'].append(content)
+
+                # ============ Debug information ============
+                print(f"\n🔍 Tool return content type: {type(content)}")
+                if isinstance(content, dict):
+                    print(f"📋 Tool return dict keys: {list(content.keys())}")
+                    if 'identified_count' in content:
+                        print(f"📊 Anomaly count reported by the tool: {content['identified_count']}")
+                    if 'identified_anomalies' in content:
+                        print(f"📦 Length of the tool's anomaly list: {len(content['identified_anomalies'])}")
+                        # Show details of the first few anomalies
+                        for j, anomaly in enumerate(content['identified_anomalies'][:3], 1):
+                            print(
+                                f"  Anomaly {j}: ID={anomaly.get('txId')}, reason={anomaly.get('recognition_reason', '')[:50]}...")
+                elif isinstance(content, str):
+                    print(f"📝 Tool returned a string of length {len(content)}")
+                    print(f"  First 200 chars: {content[:200]}...")
+                # ============ End of debug information ============
+
+                # Extract anomaly data
+                anomalies = self._extract_anomalies_from_content(content)
+                if anomalies:
+                    print(f"✅ Extracted {len(anomalies)} anomalies from the tool result")
+                    agent_output['all_anomalies'].extend(anomalies)
+                else:
+                    print("⚠️ Extracted 0 anomalies from the tool result")
+
+                msg_info['content_type'] = type(content).__name__
+                msg_info['content_length'] = len(str(content))
+
+            # Record the final AI output
+            if message.type == 'ai' and i == len(result["messages"]) - 1:
+                agent_output['final_output'] = getattr(message, 'content', '')
+                msg_info['is_final'] = True
+                msg_info['output_length'] = len(agent_output['final_output'])
+
+                print(f"🤖 Final AI output ({msg_info['output_length']} chars):")
+                print("-" * 40)
+                print(agent_output['final_output'][:300] + "..." if len(agent_output['final_output']) > 300 else
+                      agent_output['final_output'])
+                print("-" * 40)
+
+            agent_output['messages_analysis'].append(msg_info)
+
+        return agent_output
+
+    def _extract_anomalies_from_content(self, content: Any) -> List[Dict[str, Any]]:
+        """Extract anomaly data from a tool result."""
+        anomalies = []
+
+        try:
+            print(f"🔍 Extracting anomalies, input type: {type(content)}")
+
+            # ============ Step 1: normalize to a dict ============
+            processed_content = None
+
+            if isinstance(content, dict):
+                print("  ✅ Already a dict, processing directly")
+                processed_content = content
+
+            elif isinstance(content, str):
+                print(f"  📝 Processing string content, length: {len(content)}")
+                print(f"    Preview: {content[:200]}...")
+
+                # Try multiple parsing strategies, starting at recursion depth 0
+                processed_content = self._parse_string_content(content, depth=0, max_depth=3)
+
+                if processed_content is None:
+                    print("  ❌ Could not parse string content, returning empty list")
+                    return anomalies
+
+            else:
+                print(f"  ⚠️ Unknown content type: {type(content)}, returning empty list")
+                return anomalies
+
+            # ============ Step 2: extract anomalies from the dict ============
+            if isinstance(processed_content, dict):
+                print(f"  📋 Processing dict, keys: {list(processed_content.keys())}")
+
+                # Field names that may hold anomalies (in priority order)
+                anomaly_fields = [
+                    'identified_anomalies',
+                    'all_anomalies',
+                    'anomalies',
+                    'abnormal_records',
+                    'identified_abnormalities'
+                ]
+
+                found_anomalies = False
+
+                for field in anomaly_fields:
+                    if field in processed_content:
+                        anomaly_list = processed_content[field]
+                        print(f"  ✅ Found field '{field}', type: {type(anomaly_list)}")
+
+                        if isinstance(anomaly_list, list):
+                            print(f"    List length: {len(anomaly_list)}")
+
+                            for i, anomaly in enumerate(anomaly_list):
+                                if isinstance(anomaly, dict):
+                                    standardized = self._standardize_anomaly_record(anomaly, processed_content)
+                                    anomalies.append(standardized)
+                                    print(f"    ✓ Standardized anomaly {i + 1}: ID={anomaly.get('txId', 'unknown')}")
+                                else:
+                                    print(f"    ⚠️ Anomaly record {i + 1} is not a dict: {type(anomaly)}")
+                                    # Try to convert non-dict anomalies
+                                    if hasattr(anomaly, '__dict__'):
+                                        anomaly_dict = anomaly.__dict__
+                                        standardized = self._standardize_anomaly_record(anomaly_dict, processed_content)
+                                        anomalies.append(standardized)
+
+                            found_anomalies = True
+                            print(f"  📊 Extracted {len(anomaly_list)} anomalies from field '{field}'")
+                            break  # Stop once a field matches
+                        else:
+                            print(f"  ⚠️ Field '{field}' is not a list: {type(anomaly_list)}")
+
+                # If no standard field matched, look for any list field holding dicts
+                if not found_anomalies:
+                    print("  🔎 No standard anomaly field found, scanning other list fields...")
+                    for key, value in processed_content.items():
+                        if isinstance(value, list) and len(value) > 0:
+                            print(f"    Found list field '{key}', length: {len(value)}, element type: {type(value[0])}")
+
+                            # Check whether the list elements are dicts (may contain anomalies)
+                            if len(value) > 0 and isinstance(value[0], dict):
+                                print(f"    ⚠️ List '{key}' contains dicts and may hold anomaly data")
+                                # Optionally extract this data:
+                                # for item in value:
+                                #     if isinstance(item, dict) and 'txId' in item:
+                                #         standardized = self._standardize_anomaly_record(item, processed_content)
+                                #         anomalies.append(standardized)
+
+            print(f"  🎯 Extracted {len(anomalies)} anomalies in total")
+
+        except Exception as e:
+            print(f"❌ Error while extracting anomaly data: {e}")
+            import traceback
+            traceback.print_exc()
+
+        return anomalies
+
+
+    def _standardize_anomaly_record(self, anomaly: Dict, source_content: Any) -> Dict[str, Any]:
+        """Standardize an anomaly record."""
+        if not isinstance(anomaly, dict):
+            anomaly = {'raw_data': str(anomaly)}
+
+        # Extract the recognizer name
+        recognizer_name = ''
+        if isinstance(source_content, dict):
+            recognizer_name = source_content.get('recognition_type', 'unknown')
+            # Pull extra information from execution_info if present
+            if 'execution_info' in source_content:
+                exec_info = source_content['execution_info']
+                recognizer_name = exec_info.get('display_name', recognizer_name)
+
+        # Make sure the required fields exist
+        standardized = {
+            'recognition_source': 'agent',
+            'recognition_type': recognizer_name,
+            'txId': str(anomaly.get('txId', anomaly.get('tx_id', ''))),
+            'txDate': str(anomaly.get('txDate', anomaly.get('tx_date', ''))),
+            'txTime': str(anomaly.get('txTime', anomaly.get('tx_time', ''))),
+            'txAmount': float(anomaly.get('txAmount', anomaly.get('tx_amount', 0))),
+            'txDirection': str(anomaly.get('txDirection', anomaly.get('tx_direction', ''))),
+            'recognition_reason': str(anomaly.get('recognition_reason', anomaly.get('reason', ''))),
+            'severity': str(anomaly.get('severity', 'medium')),
+            'status': str(anomaly.get('status', 'pending review')),
+            'raw_anomaly': anomaly  # Keep the original data
+        }
+
+        # Attach datetime information if present
+        if 'datetime' in anomaly and pd.notna(anomaly['datetime']):
+            standardized['datetime'] = str(anomaly['datetime'])
+
+        return standardized
+
+
+    def _generate_universal_prompt(self, tools_info: List[Dict[str, str]]) -> str:
+        """Generate the generic user prompt."""
+
+        # Build the tool list
+        tools_list = "\n".join([
+            f"{info['index']}. {info['display_name']} ({info['name']}): {info['short_desc']}"
+            for info in tools_info
+        ])
+
+        # Build the suggested order
+        suggested_order = " → ".join([info['display_name'] for info in tools_info])
+
+        # Build a tool-call example
+        first_tool = tools_info[0]
+        example_call = f"""{first_tool['name']}(csv_path="{self.csv_path}")"""
+
+        return f"""
+            # Bank statement anomaly recognition task
+
+            ## Data file:
+            {self.csv_path}
+
+            ## Available analysis tools ({len(tools_info)} in total):
+            {tools_list}
+
+            ## Execution requirements:
+            1. You **must use the tools above** for the analysis; tool calls may not be skipped
+            2. Suggested execution order: {suggested_order}
+            3. Every tool takes a csv_path argument with the value: {self.csv_path}
+            4. Integrate all tool results into a consolidated report
+
+            ## Tool-call example:
+            To call the first tool, use: {example_call}
+
+            ## Please begin:
+            Start the analysis by calling the {first_tool['display_name']} tool first.
+        """
+
+    def _prepare_tools_info_for_prompt(self) -> List[Dict[str, str]]:
+        """Prepare tool information for the prompt."""
+        tools_info = []
+
+        for i, tool in enumerate(self.recognizer_tools, 1):
+            tool_info = {
+                'index': i,
+                'name': tool.name,
+                'display_name': getattr(tool, 'display_name', tool.name),
+                'description': tool.description,
+                'short_desc': tool.description[:100] + "..." if len(tool.description) > 100 else tool.description
+            }
+            tools_info.append(tool_info)
+
+        return tools_info
+
+    def _get_universal_system_prompt(self) -> str:
+        """Get the generic system prompt."""
+        return """
+            You are an expert AI assistant for bank statement anomaly recognition.
+
+            ## Core rules:
+            1. You must use the provided tools to analyze the data
+            2. Do not skip the tools and answer directly
+            3. Every analysis must call at least one tool
+            4. Wait for a tool's result before continuing the analysis
+            5. Base the report on the tool results
+
+            ## Tool usage notes:
+            - Every tool takes a csv_path argument
+            - Use the file path supplied by the user
+            - Tools may be called one after another
+            - Record each tool's result
+
+            ## Output requirements:
+            - Summarize each tool's findings
+            - List every anomaly discovered
+            - Provide an overall risk assessment
+            - Give follow-up verification suggestions
+        """
+
+    def _debug_agent_execution(self, agent_output: Dict[str, Any]):
+        """Debug the agent execution."""
+        print("\n🔍 Agent execution debug info:")
+        print(f"  Tool call count: {len(agent_output['tool_calls'])}")
+
+        for i, tool_call in enumerate(agent_output['tool_calls']):
+            print(f"  Tool call {i + 1}:")
+            print(f"    Name: {tool_call.get('name', 'unknown')}")
+            print(f"    Args: {tool_call.get('args', {})}")
+
+        print(f"  Tool result count: {len(agent_output['tool_results'])}")
+        for i, result in enumerate(agent_output['tool_results']):
+            print(f"  Tool result {i + 1}: {str(result)[:150]}...")
+
+    def _consolidate_anomalies(self):
+        """Consolidate all recognized anomalies."""
+        all_anomalies = []
+
+        # # Collect anomalies from the direct recognition results (currently disabled)
+        # direct_results = self.recognition_results.get('direct_results', {})
+        # if 'all_anomalies' in direct_results:
+        #     all_anomalies.extend(direct_results['all_anomalies'])
+
+        # Collect anomalies from the agent results
+        agent_results = self.recognition_results.get('agent_results')
+        if agent_results and 'all_anomalies' in agent_results:
+            all_anomalies.extend(agent_results['all_anomalies'])
+
+        # Deduplicate on the (txId, recognition_type) pair
+        unique_anomalies = []
+        seen = set()
+
+        for anomaly in all_anomalies:
+            key = f"{anomaly.get('txId', '')}_{anomaly.get('recognition_type', '')}"
+            if key not in seen:
+                seen.add(key)
+                unique_anomalies.append(anomaly)
+
+        self.recognition_results['all_anomalies'] = unique_anomalies
+        print(f"📊 {len(unique_anomalies)} anomalies remain after consolidation")
+
+
+    def _generate_recognition_summary(self):
+        """Generate the recognition summary."""
+        all_anomalies = self.recognition_results.get('all_anomalies', [])
+
+        summary = {
+            'total_transactions': self.data_summary.get('transaction_count', 0),
+            'total_identified_anomalies': len(all_anomalies),
+            'recognition_ratio': f"{(len(all_anomalies) / self.data_summary.get('transaction_count', 1) * 100):.2f}%"
+            if self.data_summary.get('transaction_count', 0) > 0 else "0%",
+            'recognition_completion_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            'enabled_recognizers': len(self.recognizer_tools),
+            'anomaly_distribution': {}
+        }
+
+        # Count anomalies by type
+        for anomaly in all_anomalies:
+            anomaly_type = anomaly.get('recognition_type', 'unknown')
+            summary['anomaly_distribution'][anomaly_type] = summary['anomaly_distribution'].get(anomaly_type, 0) + 1
+
+        # Count anomalies by severity
+        severity_counts = {'high': 0, 'medium': 0, 'low': 0}
+        for anomaly in all_anomalies:
+            severity = anomaly.get('severity', 'medium')
+            severity_counts[severity] = severity_counts.get(severity, 0) + 1
+
+        summary['severity_distribution'] = severity_counts
+
+        self.recognition_results['summary'] = summary
+
+    def generate_recognition_report(self, output_dir: str = "outputs/reports") -> str:
+        """Generate the anomaly recognition report."""
+        try:
+            # Create the output directory
+            Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            report_id = f"anomaly_report_{timestamp}"
+
+            print("\n📊 Generating anomaly recognition report...")
+
+            # 1. Save the recognized anomaly records (CSV)
+            anomalies_path = self._save_anomalies_csv(output_dir, report_id)
+
+            # 2. Generate the detailed recognition report (JSON)
+            report_path = self._save_detailed_report(output_dir, report_id)
+
+            # 3. Generate the recognition summary (plain text)
+            summary_path = self._save_summary_txt(output_dir, report_id)
+
+            print("✅ Report generation finished")
+            print(f"  Anomaly records: {anomalies_path}")
+            print(f"  Detailed report: {report_path}")
+            print(f"  Recognition summary: {summary_path}")
+
+            return report_path
+
+        except Exception as e:
+            print(f"❌ Report generation failed: {e}")
+            raise
+
+    def _save_anomalies_csv(self, output_dir: str, report_id: str) -> str:
+        """Save the anomaly records as a CSV file."""
+        anomalies_df = pd.DataFrame(self.recognition_results['all_anomalies'])
+
+        # Define the column order
+        column_order = [
+            'recognition_type', 'txId', 'txDate', 'txTime', 'txAmount',
+            'txDirection', 'recognition_reason', 'severity', 'status'
+        ]
+
+        # Keep only the columns that exist
+        existing_columns = [col for col in column_order if col in anomalies_df.columns]
+        other_columns = [col for col in anomalies_df.columns if col not in column_order]
+
+        # Reorder the columns
+        anomalies_df = anomalies_df[existing_columns + other_columns]
+
+        # Save the CSV
+        anomalies_path = Path(output_dir) / f"{report_id}_anomalies.csv"
+        anomalies_df.to_csv(anomalies_path, index=False, encoding='utf-8-sig')
+
+        return str(anomalies_path)
+
+    def get_recognition_summary(self) -> Dict[str, Any]:
+        """Get the recognition summary."""
+        return self.recognition_results.get('summary', {})
+
+    def get_recognizer_stats(self) -> List[Dict[str, Any]]:
+        """Get per-recognizer statistics."""
+        stats = []
+        for recognizer in self.recognizer_tools:
+            stats.append(recognizer.get_summary())
+        return stats
+
+    def _save_detailed_report(self, output_dir: str, report_id: str) -> str:
+        """Save the detailed recognition report (JSON)."""
+        report_data = {
+            'report_metadata': {
+                'report_id': report_id,
+                'generation_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                'data_source': self.csv_path,
+                'recognition_method': 'hybrid' if self.recognition_results.get('agent_results') else 'direct'
+            },
+            'data_summary': self.data_summary,
+            'recognition_configuration': {
+                'enabled_recognizers': [
+                    {
+                        'name': tool.name,
+                        'display_name': tool.display_name,
+                        'description': tool.description[:100] + '...' if len(
+                            tool.description) > 100 else tool.description
+                    }
+                    for tool in self.recognizer_tools
+                ],
+                'total_recognizers': len(self.recognizer_tools),
+                'config': self.config
+            },
+            'recognition_results': {
+                'summary': self.recognition_results.get('summary', {}),
+                'direct_results_summary': {},
+                'agent_results_summary': {}
+            }
+        }
+
+        # Add the direct recognition result summary (currently disabled)
+        # direct_results = self.recognition_results.get('direct_results', {})
+        # if 'results' in direct_results:
+        #     for recognizer_name, result in direct_results['results'].items():
+        #         report_data['recognition_results']['direct_results_summary'][recognizer_name] = {
+        #             'identified_count': result.get('identified_count', 0),
+        #             'recognition_status': result.get('recognition_status', 'unknown'),
+        #             'execution_time': result.get('execution_info', {}).get('execution_time', '')
+        #         }
+
+        # Add the agent recognition result summary
+        agent_results = self.recognition_results.get('agent_results')
+        if agent_results:
+            report_data['recognition_results']['agent_results_summary'] = {
+                'iterations': agent_results.get('iterations', 0),
+                'tool_calls_count': len(agent_results.get('tool_calls', [])),
+                'final_output_preview': agent_results.get('final_output', '')[:500] + '...'
+                if agent_results.get('final_output') else 'none'
+            }
+
+        # Save the JSON report
+        report_path = Path(output_dir) / f"{report_id}.json"
+
+        def json_serializer(obj):
+            if isinstance(obj, (pd.Timestamp, datetime)):
+                return obj.strftime("%Y-%m-%d %H:%M:%S")
+            elif isinstance(obj, np.integer):
+                return int(obj)
+            elif isinstance(obj, np.floating):
+                return float(obj)
+            elif isinstance(obj, np.ndarray):
+                return obj.tolist()
+            elif pd.isna(obj):
+                return None
+            elif hasattr(obj, '__dict__'):
+                return str(obj)
+            return str(obj)
+
+        with open(report_path, 'w', encoding='utf-8') as f:
+            json.dump(report_data, f, ensure_ascii=False, indent=2, default=json_serializer)
+
+        return str(report_path)
+
+    def _save_summary_txt(self, output_dir: str, report_id: str) -> str:
+        """Save the recognition summary (plain text)."""
+        summary = self.recognition_results.get('summary', {})
+        anomaly_distribution = summary.get('anomaly_distribution', {})
+        severity_distribution = summary.get('severity_distribution', {})
+
+        summary_path = Path(output_dir) / f"{report_id}_summary.txt"
+
+        with open(summary_path, 'w', encoding='utf-8') as f:
+            f.write("=" * 70 + "\n")
+            f.write("Bank Statement Anomaly Recognition Report Summary\n")
+            f.write("=" * 70 + "\n\n")
+
+            # Report information
+            f.write("📅 Report information:\n")
+            f.write(f"  Report ID: {report_id}\n")
+            f.write(f"  Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+            f.write(f"  Data source: {self.csv_path}\n\n")
+
+            # Data overview
+            f.write("📈 Data overview:\n")
+            f.write(f"  Total transactions: {summary.get('total_transactions', 0):,}\n")
+            f.write(
+                f"  Date range: {self.data_summary.get('date_range', {}).get('start', 'unknown')} to {self.data_summary.get('date_range', {}).get('end', 'unknown')}\n")
+            f.write(f"  Total transaction amount: {self.data_summary.get('total_amount', 0):,.2f} CNY\n")
+            f.write(f"  Average transaction amount: {self.data_summary.get('average_amount', 0):,.2f} CNY\n\n")
+
+            # Recognition results
+            f.write("🔍 Anomaly recognition results:\n")
+            f.write(f"  Enabled recognizers: {summary.get('enabled_recognizers', 0)}\n")
+            f.write(f"  Recognized anomalies: {summary.get('total_identified_anomalies', 0)}\n")
+            f.write(f"  Anomaly recognition ratio: {summary.get('recognition_ratio', '0%')}\n\n")
+
+            # Anomaly type distribution
+            if anomaly_distribution:
+                f.write("📊 Anomaly type distribution:\n")
+                for anomaly_type, count in anomaly_distribution.items():
+                    f.write(f"  - {anomaly_type}: {count}\n")
+                f.write("\n")
+
+            # Severity distribution
+            if severity_distribution:
+                f.write("⚠️ Severity distribution:\n")
+                for severity, count in severity_distribution.items():
+                    f.write(f"  - {severity.upper()}: {count}\n")
+                f.write("\n")
+
+            f.write("\n" + "=" * 70 + "\n")
+            f.write("Report generation complete\n")
+            f.write("=" * 70 + "\n")
+
+        return str(summary_path)
+
+    def _parse_string_content(self, content: str, depth: int = 0, max_depth: int = 3) -> Optional[Dict]:
+        """Parse string content into a dict, supporting several formats, with recursion-depth control."""
+        import ast
+
+        # Recursion-depth guard
+        if depth >= max_depth:
+            print(f"    ⚠️ Reached maximum recursion depth {max_depth}, stopping")
+            return None
+
+        if not content or not isinstance(content, str):
+            return None
+
+        print(f"    [{depth}] Parsing string, length: {len(content)}")
+
+        # Attempt 1: JSON parsing (standard format, double quotes)
+        try:
+            parsed = json.loads(content)
+            if isinstance(parsed, dict):
+                print(f"    [{depth}] ✅ JSON parse succeeded")
+                return parsed
+            else:
+                print(f"    [{depth}] ⚠️ JSON parse succeeded but result is not a dict: {type(parsed)}")
+                # Wrap lists and other types in a dict
+                return {
+                    'parsed_content': parsed,
+                    'original_type': type(parsed).__name__,
+                    'parse_method': 'json'
+                }
+        except json.JSONDecodeError as e:
+            print(f"    [{depth}] ⚠️ JSON parse failed: {e}")
+
+        # Attempt 2: Python dict repr (single quotes)
+        # Clean the string first, stripping surrounding whitespace
+        cleaned_content = content.strip()
+        if cleaned_content.startswith('{') and cleaned_content.endswith('}'):
+            try:
+                parsed = ast.literal_eval(cleaned_content)  # Safely parse the Python literal
+                if isinstance(parsed, dict):
+                    print(f"    [{depth}] ✅ ast parse succeeded (Python dict repr)")
+                    return parsed
+                else:
+                    print(f"    [{depth}] ⚠️ ast parse succeeded but result is not a dict: {type(parsed)}")
+                    return {
+                        'parsed_content': parsed,
+                        'original_type': type(parsed).__name__,
+                        'parse_method': 'ast'
+                    }
+            except (SyntaxError, ValueError, TypeError) as e:
+                print(f"    [{depth}] ⚠️ ast parse failed: {e}")
+
+        # Attempt 3: complex strings that embed a dict (e.g. log output)
+        # Find the first '{' and the last '}' and try to extract the dict portion
+        start_idx = content.find('{')
+        end_idx = content.rfind('}')
+
+        if start_idx >= 0 and end_idx > start_idx:
+            dict_str = content[start_idx:end_idx + 1]
+
+            # Avoid recursing when the extracted substring equals the original (infinite recursion)
+            if dict_str == content:
+                print(f"    [{depth}] ⚠️ Extracted substring equals the original string, skipping recursion")
+                return None
+
+            print(f"    [{depth}] Trying extracted substring, length: {len(dict_str)}")
+            print(f"    [{depth}] First 100 chars of substring: {dict_str[:100]}...")
+
+            # Recursively parse the extracted portion, incrementing the depth counter
+            result = self._parse_string_content(dict_str, depth + 1, max_depth)
+            if result:
+                return result
+
+        # Attempt 4: a plain literal that ast.literal_eval can handle directly
+        # (literal_eval is used instead of eval, which would be unsafe here)
+        try:
+            parsed = ast.literal_eval(content)
+            print(f"    [{depth}] ⚠️ Direct literal parse succeeded: {type(parsed)}")
+            return {
+                'raw_content': content,
+                'parsed_content': parsed,
+                'original_type': type(parsed).__name__,
+                'parse_method': 'direct'
+            }
+        except Exception as e:
+            print(f"    [{depth}] ⚠️ Direct literal parse failed: {e}")
+
+        print(f"    [{depth}] ❌ All parsing strategies failed")
+        return None
+
+
+
+if __name__ == '__main__':
+    import os
+
+    # Disable LangChain/LangSmith tracing
+    os.environ["LANGCHAIN_TRACING_V2"] = "false"
+    os.environ["LANGCHAIN_API_KEY"] = ""
+    # Disable LangGraph tracing as well
+    os.environ["LANGSMITH_TRACING"] = "false"
+
+    file_name = "11111_data_standard_20260113_112906.csv"
+    curr_dir = os.path.dirname(os.path.abspath(__file__))
+    file_path = os.path.join(curr_dir, "..", "..", "data_files", file_name)
+    print(f"CSV file: {file_path}, exists: {os.path.exists(file_path)}")
+    agent = AnomalyRecognitionAgent(csv_path=file_path, api_key=LLM_API_KEY, base_url=LLM_BASE_URL, model_name=LLM_MODEL_NAME, config=anomaly_recognizer_config)
+    print("\n" + "=" * 60)
+    print("Starting the anomaly recognition pipeline")
+    print("=" * 60)
+
+    try:
+        # 1. Load the data
+        print("\n📥 Step 1: loading transaction data...")
+        transaction_data = agent.load_transaction_data()
+        print(f"  Loaded {len(transaction_data)} transaction records")
+
+        # 2. Run anomaly recognition
+        print("\n🔍 Step 2: running anomaly recognition...")
+        results = agent.execute_full_recognition()
+
+        # 3. Generate the report
+        print("\n📊 Step 3: generating the recognition report...")
+        report_path = agent.generate_recognition_report()
+
+    except Exception as e:
+        print(f"\n❌ An error occurred during execution: {e}")
+        import traceback
+        traceback.print_exc()
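
For reference, here is a minimal sketch of the shape of the `anomaly_recognizer_config` dict that `_initialize_recognizers` reads. The top-level key names are taken from the code above; the nested values are hypothetical placeholders, not the project's actual defaults:

    # Hypothetical config sketch; every recognizer can be toggled and configured independently.
    anomaly_recognizer_config = {
        'enable_balance_missing_check': True,
        'balance_missing_check': {'missing_severity': 'high'},
        'enable_inactive_account_check': True,
        'inactive_account_check': {},                 # recognizer-specific thresholds go here
        'enable_balance_recognition': True,
        'enable_night_recognition': True,
        'night_recognition': {},
        'enable_high_frequency_recognition': True,
        'high_frequency_recognition': {},
        'enable_large_amount_recognition': True,
        'large_amount_recognition': {},
        'enable_occasional_high_integer_recognition': True,
        'occasional_high_integer_transaction': {},
        'enable_low_interest_rate_recognition': True,
        'low_interest_rate_recognition': {},
        'enable_over_book_transaction_recognition': True,
        'over_book_transaction_recognition': {},
    }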

+ 44 - 2
llmops/agents/data_manager.py

@@ -23,7 +23,39 @@ class DataManager:
     }
 
     # Required fields
-    REQUIRED_FIELDS = ['txId', 'txDate', 'txAmount', 'txDirection']
+    REQUIRED_FIELDS = ['txId', 'txDate', 'txTime', 'txAmount', 'txDirection', 'txBalance', 'txSummary', 'txCounterparty']
+
+    @staticmethod
+    def display_data_info(df: pd.DataFrame):
+        """Display basic information about the data."""
+        print("\n📊 Standardized data overview:")
+        print(f"  Total records: {len(df)}")
+        print(f"  Columns: {len(df.columns)}")
+
+    @staticmethod
+    def load_from_standardized_csv(csv_path: str) -> pd.DataFrame:
+        """Load data from a standardized CSV file."""
+        try:
+            if not os.path.exists(csv_path):
+                print(f"Standardized CSV data file does not exist: {csv_path}")
+                raise ValueError(f"Standardized CSV data file does not exist: {csv_path}")
+
+            # Read the standardized CSV file
+            df = pd.read_csv(csv_path)
+            print(f"✅ Standardized CSV file loaded, {len(df)} rows")
+
+            # Validate the required fields
+            missing_columns = [col for col in DataManager.REQUIRED_FIELDS if col not in df.columns]
+            if missing_columns:
+                raise ValueError(f"CSV file is missing required fields: {missing_columns}")
+
+            # Display the data information
+            DataManager.display_data_info(df)
+
+            return df
+        except Exception as e:
+            print(f"❌ Failed to load standardized CSV file: {e}")
+            raise
 
     @staticmethod
     def load_from_file(file_path: str) -> Tuple[List[Dict[str, Any]], pd.DataFrame]:
@@ -309,4 +341,14 @@ class DataManager:
         csv_string = output.getvalue()
         output.close()
 
-        return csv_string
+        return csv_string
+
+
+if __name__ == '__main__':
+    import os
+    csv_file = "data_files/11111_data_standard_20260113_112906.csv"
+    curr_dir = os.path.dirname(os.path.abspath(__file__))
+    file_path = os.path.join(curr_dir, "..", "..", csv_file)
+    print(f"{file_path}, exists: {os.path.exists(file_path)}")
+    df = DataManager.load_from_standardized_csv(file_path)
+    print(f"Records: {df}")

+ 250 - 0
llmops/agents/tools/balance_info_missing_recognizer.py

@@ -0,0 +1,250 @@
+from typing import Dict, Any, Optional, Type, List
+from datetime import datetime
+from pydantic import BaseModel, Field
+
+from .enhanced_base_recognizer import EnhancedBaseRecognizer
+
+
+class BalanceInfoMissingInput(BaseModel):
+    """Input for the balance-info-missing recognizer tool."""
+    csv_path: Optional[str] = Field(
+        None,
+        description="CSV file path (optional). If csv_path was supplied at initialization, it does not need to be passed again."
+    )
+
+
+class BalanceInfoMissingRecognizer(EnhancedBaseRecognizer):
+    """
+    Balance-info-missing anomaly recognizer.
+
+    Anomaly rule:
+    If a bank statement's header does not list balance information, the key display item for fund
+    balances is absent and the core figures for the account's fund position cannot be read directly.
+    This violates the normal requirement that a bank statement fully present account fund movements
+    and balances, and constitutes an information-completeness deficiency.
+    """
+
+    args_schema: Type[BaseModel] = BalanceInfoMissingInput
+
+    # Balance-related fields to check
+    balance_columns_to_check: List[str] = Field(
+        ['txBalance'],
+        description="List of balance field names to check"
+    )
+
+    # Severity configuration
+    missing_severity: str = Field(
+        'high',
+        description="Severity of missing balance information (high/medium/low)"
+    )
+
+    # Impact description
+    impact_description: str = Field(
+        "Missing balance information prevents: 1) balance continuity checks 2) fund balance analysis 3) abnormal balance fluctuation detection 4) transaction authenticity verification",
+        description="Description of the impact of missing balance information"
+    )
+
+    def __init__(self, csv_path: str = None, config: Dict[str, Any] = None, **kwargs):
+        """
+        Initialize the balance-info-missing recognizer.
+
+        Args:
+            csv_path: CSV file path
+            config: configuration parameters
+            **kwargs: other parameters
+        """
+        super().__init__(
+            name="balance_info_missing_recognizer",
+            description="Check whether the bank statement lacks balance fields and flag information-completeness anomalies.",
+            display_name="Balance-info-missing checker",
+            csv_path=csv_path,
+            config=config,
+            **kwargs
+        )
+
+        # Read the configuration from config
+        balance_missing_config = self.get_config_value('balance_missing_check', {})
+        if balance_missing_config:
+            if 'balance_columns_to_check' in balance_missing_config:
+                self.balance_columns_to_check = balance_missing_config['balance_columns_to_check']
+            if 'missing_severity' in balance_missing_config:
+                self.missing_severity = balance_missing_config['missing_severity']
+
+        print(f"✅ {self.display_name} initialized")
+        print(f"  Fields to check: {', '.join(self.balance_columns_to_check)}")
+        print(f"  Missing severity: {self.missing_severity}")
+
+    def recognize(self, csv_path: str = None, **kwargs) -> Dict[str, Any]:
+        """
+        Check whether balance information is missing.
+
+        Args:
+            csv_path: CSV file path
+            **kwargs: other parameters
+
+        Returns:
+            Dict[str, Any]: recognition result
+        """
+        try:
+            # Load the standardized data via the parent class's load_data method
+            df = self.load_data(csv_path)
+
+            print(f"🔍 {self.display_name} starting check over {len(df)} records")
+            print("  Rule: a bank statement should include balance information; its absence is an information-completeness anomaly")
+
+            # Check all candidate balance fields
+            columns_found = []
+            columns_missing = []
+
+            for col in self.balance_columns_to_check:
+                if col in df.columns:
+                    columns_found.append(col)
+                    # Check whether the column holds valid (non-null) data
+                    non_null_count = df[col].notna().sum()
+                    print(f"  ✓ Found field '{col}', valid data: {non_null_count}/{len(df)}")
+                else:
+                    columns_missing.append(col)
+                    print(f"  ✗ Missing field '{col}'")
+
+            # Determine whether any balance field exists
+            has_any_balance_column = len(columns_found) > 0
+
+            # If no balance field was found, flag an anomaly
+            if not has_any_balance_column:
+                print("❌ Severe anomaly: no balance field found!")
+                print(f"  Missing fields: {', '.join(columns_missing)}")
+                print(f"  Impact: {self.impact_description}")
+
+                # Build an anomaly record (system-level anomaly)
+                anomaly = {
+                    'txId': 'SYSTEM_BALANCE_INFO_MISSING',
+                    'txDate': datetime.now().strftime('%Y-%m-%d'),
+                    'txTime': datetime.now().strftime('%H:%M:%S'),
+                    'datetime': datetime.now(),
+                    'txAmount': 0.0,
+                    'txDirection': 'system',
+                    'recognition_reason': f"The bank statement lacks balance fields, so the core figures for the account's fund position cannot be obtained. Missing fields: {', '.join(columns_missing)}.",
+                    'severity': self.missing_severity,
+                    'status': 'data integrity anomaly',
+                    'check_type': 'balance_info_missing',
+                    'impact_analysis': self.impact_description,
+                    'recommendation': f"Provide a complete bank statement that includes balance information, with at least one of the following fields: {', '.join(self.balance_columns_to_check)}"
+                }
+
+                # Format the anomaly record
+                formatted_anomaly = self.format_anomaly_record(
+                    row=None,
+                    reason=anomaly['recognition_reason'],
+                    severity=anomaly['severity'],
+                    check_type=anomaly['check_type'],
+                    impact_analysis=anomaly['impact_analysis'],
+                    recommendation=anomaly['recommendation'],
+                    txId=anomaly['txId'],
+                    txDate=anomaly['txDate'],
+                    txTime=anomaly['txTime'],
+                    txAmount=anomaly['txAmount'],
+                    txDirection=anomaly['txDirection']
+                )
+
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 1,
+                    'identified_anomalies': [formatted_anomaly],
+                    'recognition_status': 'completed',
+                    'missing_columns_analysis': {
+                        'has_any_balance_column': False,
+                        'columns_found': columns_found,
+                        'columns_missing': columns_missing,
+                        'columns_checked': self.balance_columns_to_check,
+                        'impact_level': 'severe - balance-related analyses are impossible'
+                    },
+                    'execution_info': {
+                        'total_records': len(df),
+                        'check_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+                        'data_file': csv_path or self.csv_path
+                    }
+                }
+            else:
+                print(f"✅ Check passed: found {len(columns_found)} balance field(s)")
+                print(f"  Available fields: {', '.join(columns_found)}")
+                print(f"  Missing fields: {', '.join(columns_missing) if columns_missing else 'none'}")
+
+                # Analyze the balance fields that were found
+                balance_analysis = {}
+                for col in columns_found:
+                    col_data = df[col]
+                    non_null_count = col_data.notna().sum()
+                    null_count = col_data.isna().sum()
+                    unique_count = col_data.nunique()
+
+                    balance_analysis[col] = {
+                        'non_null_count': non_null_count,
+                        'null_count': null_count,
+                        'null_percentage': f"{(null_count / len(df) * 100):.1f}%",
+                        'unique_values': unique_count,
+                        'data_type': str(col_data.dtype)
+                    }
+
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': 'completed',
+                    'missing_columns_analysis': {
+                        'has_any_balance_column': True,
+                        'columns_found': columns_found,
+                        'columns_missing': columns_missing,
+                        'columns_checked': self.balance_columns_to_check,
+                        'impact_level': 'normal - balance-related analyses can proceed',
+                        'balance_analysis': balance_analysis
+                    },
+                    'execution_info': {
+                        'total_records': len(df),
+                        'check_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+                        'data_file': csv_path or self.csv_path
+                    }
+                }
+
+        except FileNotFoundError as e:
+            error_result = {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': 'failed',
+                'error': f'File not found: {str(e)}'
+            }
+            print(f"❌ File loading failed: {str(e)}")
+            return error_result
+
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            error_result = {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': 'failed',
+                'error': f'Check failed: {str(e)}'
+            }
+            print(f"❌ Check failed: {str(e)}")
+            return error_result
+
+    def get_summary(self) -> Dict[str, Any]:
+        """Get the recognizer summary."""
+        summary = super().get_summary()
+        summary.update({
+            'balance_columns_to_check': self.balance_columns_to_check,
+            'missing_severity': self.missing_severity,
+            'check_type': 'data integrity check',
+            'priority': 'high'
+        })
+        return summary
+
+    def get_config_summary(self) -> Dict[str, Any]:
+        """Get the configuration summary."""
+        checked = self.balance_columns_to_check
+        if len(checked) > 3:
+            fields_desc = f"{len(checked)} fields: {', '.join(checked[:3])}..."
+        else:
+            fields_desc = f"{len(checked)} fields: {', '.join(checked)}"
+        return {
+            "fields_checked": fields_desc,
+            "missing_severity": self.missing_severity.upper(),
+            "check_priority": "high",
+            "anomaly_rule": "A bank statement lacking balance fields is an information-completeness deficiency",
+            "impact_analysis": "Balance continuity, fund balance analysis, and other key checks become impossible"
+        }
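
As a usage reference, a minimal standalone sketch of running this recognizer outside the agent (the file path and config values are hypothetical; `recognize()` returns the result dict shown above):

    from llmops.agents.tools.balance_info_missing_recognizer import BalanceInfoMissingRecognizer

    recognizer = BalanceInfoMissingRecognizer(
        csv_path="standardized_sample.csv",        # hypothetical path
        config={'balance_missing_check': {
            'balance_columns_to_check': ['txBalance'],
            'missing_severity': 'high',
        }},
    )
    result = recognizer.recognize()
    print(result['recognition_status'], result['identified_count'])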

+ 550 - 0
llmops/agents/tools/balance_recognizer.py

@@ -0,0 +1,550 @@
+
+from pydantic import BaseModel, Field
+from typing import Dict, Any, Optional, Type
+import pandas as pd
+from itertools import permutations
+
+from .enhanced_base_recognizer import EnhancedBaseRecognizer
+
+
+class BalanceRecognitionInput(BaseModel):
+    """Input for the balance recognition tool."""
+    csv_path: Optional[str] = Field(
+        None,
+        description="CSV file path (optional). If csv_path was supplied at initialization, it does not need to be passed again."
+    )
+
+    class Config:
+        arbitrary_types_allowed = True
+
+
+class BalanceContinuityRecognizer(EnhancedBaseRecognizer):
+    """Balance continuity anomaly recognizer (with smart sorting)."""
+
+    args_schema: Type[BaseModel] = BalanceRecognitionInput
+
+    # Configuration parameters
+    balance_tolerance: float = Field(
+        0.01,
+        description="Balance calculation tolerance: the allowed balance difference threshold"
+    )
+    enable_smart_sorting: bool = Field(
+        True,
+        description="Whether to enable smart sorting for transactions with identical timestamps"
+    )
+    max_permutation_search: int = Field(
+        6,  # 3! = 6, 4! = 24; 6 handles up to 3 transactions with the same timestamp
+        description="Maximum number of permutations to search, to prevent combinatorial explosion"
+    )
+
+    def __init__(self, csv_path: str = None, config: Dict[str, Any] = None, **kwargs):
+        """
+        Initialize the balance continuity anomaly recognizer.
+
+        Args:
+            csv_path: CSV file path
+            config: configuration parameters
+            **kwargs: other parameters
+        """
+        # Call the parent class's __init__
+        super().__init__(
+            name="balance_continuity_recognizer",
+            description="Recognize balance continuity anomalies in bank statements by verifying that the balance after each transaction is computed correctly. "
+                        "Supports smart sorting for transactions with identical timestamps.",
+            display_name="Balance continuity anomaly recognition",
+            csv_path=csv_path,
+            config=config,
+            **kwargs
+        )
+
+        # Read the configuration from config and update the Field attributes
+        balance_config = self.get_config_value('balance_recognition', {})
+        if balance_config:
+            if 'balance_tolerance' in balance_config:
+                self.balance_tolerance = balance_config['balance_tolerance']
+            if 'enable_smart_sorting' in balance_config:
+                self.enable_smart_sorting = balance_config['enable_smart_sorting']
+            if 'max_permutation_search' in balance_config:
+                self.max_permutation_search = balance_config['max_permutation_search']
+
+    def recognize(self, csv_path: str = None, **kwargs) -> Dict[str, Any]:
+        """识别余额连续性异常(带智能排序)"""
+
+        try:
+            # 使用父类的load_data方法加载标准化数据
+            df = self.load_data(csv_path)
+
+            print(f"🔍 {self.display_name}开始检查余额连续性,共 {len(df)} 条记录")
+            print(f"  余额容差: {self.balance_tolerance}")
+            print(f"  智能排序: {'启用' if self.enable_smart_sorting else '禁用'}")
+
+            # 检查必需字段
+            required_fields = ['txId', 'txDate', 'txTime', 'txAmount', 'txDirection', 'txBalance']
+            missing_fields = [field for field in required_fields if field not in df.columns]
+
+            if missing_fields:
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '失败',
+                    'error': f'缺少必需字段: {missing_fields}'
+                }
+
+            # ============ 智能排序处理 ============
+            if self.enable_smart_sorting and 'datetime' in df.columns:
+                df = self._apply_smart_sorting(df)
+            else:
+                # 简单排序:按时间,时间相同按ID
+                if 'datetime' in df.columns:
+                    df = df.sort_values(['datetime', 'txId'])
+                else:
+                    df = df.sort_values(['txDate', 'txTime', 'txId'])
+
+            print("📋 排序后的交易顺序:")
+            for i, (_, row) in enumerate(df.head(10).iterrows(), 1):
+                if 'datetime' in row and pd.notna(row['datetime']):
+                    time_str = row['datetime'].strftime("%Y-%m-%d %H:%M:%S")
+                else:
+                    time_str = f"{row['txDate']} {row['txTime']}"
+                dir_symbol = "→" if row['txDirection'] == '支出' else "←"
+
+                # txId was normalized to str during data standardization, safe to format
+                tx_id = row['txId']
+
+                print(
+                    f"  {i:2d}. ID:{tx_id:>4s} | {time_str} | {dir_symbol} {row['txAmount']:8.2f} | 余额:{row['txBalance']:8.2f}")
+
+            # ============ 开始余额连续性检查 ============
+            identified_anomalies = []
+            prev_balance = None
+            valid_transactions = 0
+
+            for idx, row in df.iterrows():
+                tx_id = row['txId']
+                current_balance = row['txBalance']
+                valid_transactions += 1
+
+                # 检查1:余额是否为空
+                if pd.isna(current_balance):
+                    anomaly = self.format_anomaly_record(
+                        row=row,
+                        reason='余额字段为空',
+                        severity='high',
+                        check_type='missing_balance',
+                        previous_balance=prev_balance
+                    )
+                    identified_anomalies.append(anomaly)
+                    print(f"  ❌ 交易ID {tx_id}: 余额字段为空")
+                    continue
+
+                # 检查2:余额连续性(如果不是第一条记录)
+                if prev_balance is not None:
+                    amount = row['txAmount']
+                    direction = str(row['txDirection']).strip()
+
+                    # 计算预期余额
+                    if direction == '收入':
+                        expected_balance = prev_balance + amount
+                    elif direction == '支出':
+                        expected_balance = prev_balance - amount
+                    else:
+                        # 未知方向,跳过检查
+                        print(f"  ⚠️ 交易ID {tx_id}: 未知的交易方向 '{direction}',跳过余额检查")
+                        prev_balance = current_balance
+                        continue
+
+                    # 检查余额是否连续(允许小误差)
+                    if pd.isna(expected_balance):
+                        # 预期余额计算异常
+                        anomaly = self.format_anomaly_record(
+                            row=row,
+                            reason=f'预期余额计算异常,可能金额字段有问题: amount={amount}',
+                            severity='high',
+                            check_type='calculation_error',
+                            previous_balance=prev_balance,
+                            expected_balance=expected_balance,
+                            actual_balance=current_balance
+                        )
+                        identified_anomalies.append(anomaly)
+                        print(f"  ❌ 交易ID {tx_id}: 预期余额计算异常")
+                    else:
+                        balance_diff = abs(expected_balance - current_balance)
+                        if balance_diff > self.balance_tolerance:
+                            anomaly = self.format_anomaly_record(
+                                row=row,
+                                reason=f'余额计算不连续,预期{expected_balance:.2f},实际{current_balance:.2f},差异{balance_diff:.2f}',
+                                severity='high',
+                                check_type='balance_discontinuity',
+                                previous_balance=prev_balance,
+                                expected_balance=expected_balance,
+                                actual_balance=current_balance,
+                                balance_difference=balance_diff,
+                                tolerance_threshold=self.balance_tolerance
+                            )
+                            identified_anomalies.append(anomaly)
+                            print(f"  ❌ 交易ID {tx_id}: 余额不连续,差异 {balance_diff:.2f}")
+
+                prev_balance = current_balance
+
+            print(f"✅ {self.display_name}检查完成")
+            print(f"  检查交易数: {valid_transactions}")
+            print(f"  发现异常数: {len(identified_anomalies)}")
+
+            # 统计不同类型异常的数量
+            missing_balance_count = len([a for a in identified_anomalies
+                                         if a.get('additional_info', {}).get('check_type') == 'missing_balance'])
+            discontinuity_count = len([a for a in identified_anomalies
+                                       if a.get('additional_info', {}).get('check_type') == 'balance_discontinuity'])
+            calculation_error_count = len([a for a in identified_anomalies
+                                           if a.get('additional_info', {}).get('check_type') == 'calculation_error'])
+
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': len(identified_anomalies),
+                'identified_anomalies': identified_anomalies,
+                'recognition_status': '完成',
+                'recognition_parameters': {
+                    'balance_tolerance': self.balance_tolerance,
+                    'enable_smart_sorting': self.enable_smart_sorting,
+                    'checked_transactions': valid_transactions,
+                    'data_source': csv_path or self._csv_path
+                },
+                'statistics': {
+                    'missing_balance_count': missing_balance_count,
+                    'discontinuity_count': discontinuity_count,
+                    'calculation_error_count': calculation_error_count,
+                    'first_valid_balance': (float(df['txBalance'].dropna().iloc[0])
+                                            if df['txBalance'].notna().any() else None),
+                    'last_valid_balance': (float(df['txBalance'].dropna().iloc[-1])
+                                           if df['txBalance'].notna().any() else None),
+                    'total_transactions': len(df),
+                    'valid_balance_count': df['txBalance'].notna().sum(),
+                    'avg_balance': float(df['txBalance'].mean()) if df['txBalance'].notna().any() else None
+                }
+            }
+
+        except FileNotFoundError as e:
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'文件不存在: {str(e)}'
+            }
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'数据加载或处理失败: {str(e)}'
+            }
+
+    # ==================== 智能排序核心算法 ====================
+
+    def _apply_smart_sorting(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        应用智能排序算法
+
+        处理步骤:
+        1. 按时间分组
+        2. 对每个时间组内的交易进行智能排序
+        3. 合并所有组
+        """
+        print("🧠 应用智能排序算法...")
+
+        # 首先按时间排序,得到时间组
+        df = df.sort_values('datetime')
+
+        # groupby drops rows whose datetime is NaT, so split those rows off here
+        # and re-append them at the end to avoid silently losing transactions
+        nat_rows = df[df['datetime'].isna()]
+        df = df[df['datetime'].notna()]
+
+        # 找出所有时间相同的交易组
+        time_groups = list(df.groupby('datetime'))
+
+        if len(time_groups) == len(df):
+            print("  ✅ 所有交易时间都不同,无需智能排序")
+            return pd.concat([df, nat_rows], ignore_index=True) if len(nat_rows) > 0 else df
+
+        # 处理每个时间组
+        sorted_groups = []
+        prev_group_last_balance = None
+
+        for i, (time_val, group) in enumerate(time_groups):
+            group_size = len(group)
+
+            if group_size == 1:
+                # 单笔交易,直接加入
+                sorted_groups.append(group)
+                if not group['txBalance'].isna().iloc[0]:
+                    prev_group_last_balance = group['txBalance'].iloc[0]
+                continue
+
+            # 多笔交易时间相同,需要智能排序
+            print(f"  🔍 时间组 {i + 1}/{len(time_groups)}: {time_val},共 {group_size} 笔交易")
+
+            # 获取前一组的最后一笔余额(如果有)
+            prev_balance = prev_group_last_balance
+
+            # 智能排序这个组
+            sorted_group = self._smart_sort_time_group(group, prev_balance)
+            sorted_groups.append(sorted_group)
+
+            # 更新前一组的最后一笔余额
+            if not sorted_group['txBalance'].isna().iloc[-1]:
+                prev_group_last_balance = sorted_group['txBalance'].iloc[-1]
+
+        # Merge all groups, re-appending any rows whose timestamps failed to parse
+        if len(nat_rows) > 0:
+            sorted_groups.append(nat_rows)
+        result_df = pd.concat(sorted_groups, ignore_index=True)
+        print(f"  ✅ 智能排序完成,处理了 {len(time_groups)} 个时间组")
+
+        return result_df
+
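+    # Worked example of the grouping step (illustrative): two transactions both
+    # stamped 2024-01-01 10:00:00 form one time group; the method below scores
+    # candidate orderings against the previous group's closing balance and keeps
+    # the order whose balances chain correctly.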
+    def _smart_sort_time_group(self, group: pd.DataFrame, prev_balance: float = None) -> pd.DataFrame:
+        """
+        智能排序一个时间组内的交易
+
+        策略:
+        1. 如果组内交易数 <= 3,尝试所有排列
+        2. 如果更多,使用启发式算法
+        """
+        group_size = len(group)
+
+        if group_size == 0:
+            return group
+
+        # 显示组内交易详情
+        print(f"    组内交易详情:")
+        for idx, (_, row) in enumerate(group.iterrows(), 1):
+            dir_symbol = "→" if row['txDirection'] == '支出' else "←"
+            balance_info = f"余额:{row['txBalance']:.2f}" if not pd.isna(row['txBalance']) else "余额:None"
+            print(f"      {idx}. ID:{row['txId']} {dir_symbol} {row['txAmount']:.2f} {balance_info}")
+
+        # 策略1:少量交易,尝试所有排列
+        if group_size <= 3:
+            return self._try_all_permutations(group, prev_balance)
+
+        # 策略2:较多交易,使用启发式算法
+        else:
+            return self._heuristic_sort(group, prev_balance)
+
+    def _try_all_permutations(self, group: pd.DataFrame, prev_balance: float = None) -> pd.DataFrame:
+        """
+        尝试所有可能的排列,选择最优的
+
+        适用于少量交易(<=3笔)
+        """
+        group_size = len(group)
+        print(f"    尝试 {group_size} 笔交易的所有排列 ({self._factorial(group_size)} 种可能)...")
+
+        # 如果是2笔交易,特殊处理(常见情况)
+        if group_size == 2:
+            return self._optimize_two_transactions(group, prev_balance)
+
+        # 生成所有排列
+        best_order = None
+        best_score = float('-inf')
+
+        # 限制最大尝试数,防止组合爆炸
+        max_tries = min(self.max_permutation_search, self._factorial(group_size))
+        permutations_tried = 0
+
+        for perm_indices in permutations(range(group_size)):
+            if permutations_tried >= max_tries:
+                break
+
+            perm_group = group.iloc[list(perm_indices)].reset_index(drop=True)
+            score = self._evaluate_order_quality(perm_group, prev_balance)
+
+            if score > best_score:
+                best_score = score
+                best_order = perm_group
+
+            permutations_tried += 1
+
+        if best_order is not None:
+            print(f"    找到最优排列,质量评分: {best_score:.2f}")
+            if best_score < 0.5:
+                print(f"    ⚠️ 警告:最优排列质量评分较低 ({best_score:.2f})")
+
+            # 显示最优顺序
+            print(f"    最优顺序:")
+            for idx, (_, row) in enumerate(best_order.iterrows(), 1):
+                dir_symbol = "→" if row['txDirection'] == '支出' else "←"
+                print(f"      {idx}. ID:{row['txId']} {dir_symbol} {row['txAmount']:.2f}")
+
+            return best_order
+
+        return group
+
+    def _optimize_two_transactions(self, group: pd.DataFrame, prev_balance: float = None) -> pd.DataFrame:
+        """
+        优化两笔时间相同交易的顺序
+
+        这是最常见的情况,专门优化
+        """
+        if len(group) != 2:
+            return group
+
+        row1, row2 = group.iloc[0], group.iloc[1]
+
+        # 计算两种顺序的质量评分
+        order1 = pd.DataFrame([row1, row2])  # 原始顺序
+        order2 = pd.DataFrame([row2, row1])  # 反转顺序
+
+        score1 = self._evaluate_order_quality(order1, prev_balance)
+        score2 = self._evaluate_order_quality(order2, prev_balance)
+
+        print(f"    顺序1 (ID {row1['txId']}→{row2['txId']}): 评分 {score1:.2f}")
+        print(f"    顺序2 (ID {row2['txId']}→{row1['txId']}): 评分 {score2:.2f}")
+
+        if score2 > score1:
+            print(f"    ✅ 选择顺序2: ID {row2['txId']} → ID {row1['txId']}")
+            return order2
+        else:
+            print(f"    ✅ 选择顺序1: ID {row1['txId']} → ID {row2['txId']}")
+            return order1
+
+    def _heuristic_sort(self, group: pd.DataFrame, prev_balance: float = None) -> pd.DataFrame:
+        """
+        启发式排序算法
+
+        适用于较多交易(>3笔)
+        启发式规则:
+        1. 先处理支出,后处理收入(常见模式)
+        2. 金额大的优先
+        3. 余额连续性验证
+        """
+        print(f"    使用启发式排序 ({len(group)} 笔交易)...")
+
+        group = group.copy()
+
+        # 启发式1:按交易方向排序
+        group['sort_direction'] = group['txDirection'].map({'支出': 0, '收入': 1})
+
+        # 启发式2:按金额排序(支出从大到小,收入从小到大)
+        def get_amount_sort_key(row):
+            if row['txDirection'] == '支出':
+                return -row['txAmount']  # 支出金额大的优先
+            else:
+                return row['txAmount']  # 收入金额小的优先
+
+        group['sort_amount'] = group.apply(get_amount_sort_key, axis=1)
+
+        # 排序
+        sorted_group = group.sort_values(['sort_direction', 'sort_amount', 'txId']).drop(
+            ['sort_direction', 'sort_amount'], axis=1)
+
+        # 验证排序质量
+        score = self._evaluate_order_quality(sorted_group, prev_balance)
+        print(f"    启发式排序质量评分: {score:.2f}")
+
+        if score < 0.3:
+            print(f"    ⚠️ 启发式排序质量较低,考虑使用原始顺序")
+            return group.drop(['sort_direction', 'sort_amount'], axis=1)
+
+        return sorted_group
+
+    def _evaluate_order_quality(self, ordered_group: pd.DataFrame, start_balance: float = None) -> float:
+        """
+        评估排序质量
+
+        基于余额连续性计算质量评分
+        返回0-1之间的分数,越高越好
+        """
+        if len(ordered_group) == 0:
+            return 0.0
+
+        current_balance = start_balance
+        total_score = 0.0
+        valid_checks = 0
+
+        for _, row in ordered_group.iterrows():
+            if pd.isna(row.get('txBalance')):
+                # 缺失余额,无法评估
+                continue
+
+            if current_balance is not None:
+                # 计算预期余额
+                expected = self._calculate_expected_balance(current_balance, row)
+
+                if expected is not None:
+                    diff = abs(expected - row['txBalance'])
+                    if diff <= self.balance_tolerance:
+                        total_score += 1.0  # 完美匹配
+                    elif diff <= self.balance_tolerance * 10:  # 允许10倍容差
+                        total_score += 0.5  # 部分匹配
+                    else:
+                        total_score -= 0.5  # 严重不匹配
+
+                    valid_checks += 1
+
+            # 更新当前余额
+            current_balance = row['txBalance']
+
+        # 归一化分数
+        if valid_checks > 0:
+            # 基础分数是余额连续性分数
+            continuity_score = total_score / valid_checks
+
+            # 额外加分:如果整个组的总金额与余额变化匹配
+            if start_balance is not None and not ordered_group['txBalance'].isna().all():
+                final_balance = ordered_group['txBalance'].iloc[-1]
+                total_change = sum(
+                    row['txAmount'] if row['txDirection'] == '收入' else -row['txAmount']
+                    for _, row in ordered_group.iterrows()
+                )
+                expected_final = start_balance + total_change
+                final_diff = abs(expected_final - final_balance)
+
+                if final_diff <= self.balance_tolerance:
+                    continuity_score += 0.2  # 额外加分
+                elif final_diff <= self.balance_tolerance * 10:
+                    continuity_score += 0.1
+
+            # 确保分数在0-1之间
+            return max(0.0, min(1.0, continuity_score))
+
+        return 0.5  # 没有足够信息,返回中性分数
+
+    def _calculate_expected_balance(self, prev_balance: float, row: pd.Series) -> Optional[float]:
+        """计算预期余额"""
+        if pd.isna(prev_balance):
+            return None
+
+        amount = row['txAmount']
+        direction = row['txDirection']
+
+        if direction == '收入':
+            return prev_balance + amount
+        elif direction == '支出':
+            return prev_balance - amount
+        else:
+            return None
+
+    def _factorial(self, n: int) -> int:
+        """计算阶乘(用于评估排列数)"""
+        result = 1
+        for i in range(2, n + 1):
+            result *= i
+        return result
+
+    # ==================== 其他方法 ====================
+
+    def _format_result_for_llm(self, result: Dict[str, Any]) -> str:
+        """将识别结果格式化为适合LLM理解的字符串"""
+        # Minimal implementation: a status line plus the first few anomaly reasons.
+        header = (f"{result.get('recognition_type', self.display_name)}: "
+                  f"{result.get('recognition_status', '未知')}, {result.get('identified_count', 0)} anomalies")
+        reasons = [f"- tx {a.get('txId', '?')}: {a.get('recognition_reason', '')}"
+                   for a in result.get('identified_anomalies', [])[:10]]
+        return "\n".join([header] + reasons)
+
+    def get_summary(self) -> Dict[str, Any]:
+        """获取识别器摘要"""
+        summary = super().get_summary()
+        summary.update({
+            'balance_tolerance': self.balance_tolerance,
+            'enable_smart_sorting': self.enable_smart_sorting,
+            'max_permutation_search': self.max_permutation_search,
+            'data_loaded': self._data is not None
+        })
+        return summary
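+
+
+# A minimal, self-contained sketch of the continuity rule applied above
+# (assumed sample data, not part of the commit). With the default tolerance
+# of 0.01, the second transaction is flagged: 1500 - 200 = 1300, but the
+# recorded balance is 1350.
+if __name__ == "__main__":
+    sample = pd.DataFrame({
+        'txAmount': [500.0, 200.0],
+        'txDirection': ['收入', '支出'],
+        'txBalance': [1500.0, 1350.0],
+    })
+    prev = 1000.0  # opening balance (assumed)
+    for i, r in sample.iterrows():
+        expected = prev + r['txAmount'] if r['txDirection'] == '收入' else prev - r['txAmount']
+        status = 'anomaly' if abs(expected - r['txBalance']) > 0.01 else 'ok'
+        print(f"tx {i}: expected {expected:.2f}, actual {r['txBalance']:.2f} -> {status}")
+        prev = r['txBalance']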

+ 485 - 0
llmops/agents/tools/enhanced_base_recognizer.py

@@ -0,0 +1,485 @@
+
+from langchain.tools import BaseTool
+from abc import abstractmethod
+from typing import Dict, Any
+import pandas as pd
+from datetime import datetime
+from pydantic import Field, PrivateAttr
+
+
+
+class EnhancedBaseRecognizer(BaseTool):
+    """增强版异常识别器基类 - 提供统一的数据处理和异常记录格式"""
+
+    name: str = Field(..., description="识别器名称")
+    description: str = Field(..., description="识别器描述")
+    display_name: str = Field("", description="显示名称")
+
+    # 使用 PrivateAttr 定义不需要验证的私有属性
+    _recognized_count: int = PrivateAttr(0)
+    _csv_path: str = PrivateAttr(None)  # 标准化后的csv文件路径
+    _data: pd.DataFrame = PrivateAttr(None)  # 加载的数据
+    _config: Dict[str, Any] = PrivateAttr(default_factory=dict)  # 配置参数; default_factory avoids a shared mutable default
+
+    def __init__(self, name: str, description: str, display_name: str = "",
+                 csv_path: str = None, config: Dict[str, Any] = None, **kwargs):
+        """
+        初始化增强版识别器
+
+        Args:
+            name: 识别器名称
+            description: 识别器描述
+            display_name: 显示名称
+            csv_path: CSV文件路径
+            config: 配置参数字典
+        """
+        # 确保display_name有默认值
+        if not display_name:
+            display_name = name
+
+        # 调用父类初始化
+        super().__init__(
+            name=name,
+            description=description,
+            **kwargs
+        )
+
+        # 设置属性
+        self.display_name = display_name
+        self._recognized_count = 0
+        self._csv_path = csv_path
+        self._config = config or {}
+        self._data = None
+
+    # ==================== 统一的数据处理方法 ====================
+
+    def load_data(self, csv_path: str = None) -> pd.DataFrame:
+        """
+        加载并标准化数据
+
+        Args:
+            csv_path: CSV文件路径,如果为None则使用初始化时的路径
+
+        Returns:
+            pd.DataFrame: 标准化后的数据
+
+        Raises:
+            ValueError: 如果没有提供数据路径
+            FileNotFoundError: 如果文件不存在
+        """
+        data_path = csv_path or self._csv_path
+        if not data_path:
+            raise ValueError("未提供数据路径,请在初始化时设置csv_path或调用时传入")
+
+        print(f"📥 {self.display_name} 正在加载数据: {data_path}")
+
+        try:
+            # 加载数据
+            df = pd.read_csv(data_path)
+            print(f"✅ 成功加载 {len(df)} 条交易记录")
+
+            # 数据标准化处理
+            df = self._standardize_data(df)
+
+            # 缓存数据
+            self._data = df
+            return df
+
+        except FileNotFoundError:
+            raise FileNotFoundError(f"文件不存在: {data_path}")
+        except Exception as e:
+            raise RuntimeError(f"数据加载失败: {str(e)}") from e
+
+    def _standardize_data(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        标准化数据格式
+
+        Args:
+            df: 原始数据
+
+        Returns:
+            pd.DataFrame: 标准化后的数据
+        """
+        df = df.copy()
+
+        # 1. 确保关键字段存在
+        required_fields = ['txId', 'txDate', 'txTime', 'txAmount', 'txDirection']
+        missing_fields = [f for f in required_fields if f not in df.columns]
+
+        if missing_fields:
+            print(f"⚠️ 警告:缺少字段 {missing_fields},某些检查可能无法进行")
+
+        # 2. 关键字段类型转换
+        # txId: 统一转换为字符串,确保格式化时不会出错
+        if 'txId' in df.columns:
+            df['txId'] = df['txId'].astype(str).str.strip()
+            print(f"  ✅ txId 已转换为字符串类型,共 {len(df)} 条记录")
+
+        # txAmount 和 txBalance: 转换为数值类型
+        if 'txAmount' in df.columns:
+            df['txAmount'] = pd.to_numeric(df['txAmount'], errors='coerce')
+            invalid_amounts = df['txAmount'].isna().sum()
+            if invalid_amounts > 0:
+                print(f"  ⚠️  有 {invalid_amounts} 条记录的 txAmount 无法转换为数值")
+
+        if 'txBalance' in df.columns:
+            df['txBalance'] = pd.to_numeric(df['txBalance'], errors='coerce')
+            invalid_balances = df['txBalance'].isna().sum()
+            if invalid_balances > 0:
+                print(f"  ⚠️  有 {invalid_balances} 条记录的 txBalance 无法转换为数值")
+
+        # 3. 字符串字段清理
+        if 'txDirection' in df.columns:
+            df['txDirection'] = df['txDirection'].astype(str).str.strip()
+            # 统计方向分布
+            direction_counts = df['txDirection'].value_counts()
+            print(f"  📊 交易方向分布: {dict(direction_counts)}")
+
+        if 'txSummary' in df.columns:
+            df['txSummary'] = df['txSummary'].astype(str).str.strip()
+
+        if 'txCounterparty' in df.columns:
+            df['txCounterparty'] = df['txCounterparty'].astype(str).str.strip()
+
+        # 4. 创建统一的datetime字段(如果日期时间字段存在)
+        if 'txDate' in df.columns and 'txTime' in df.columns:
+            try:
+                # 先确保是字符串
+                df['txDate'] = df['txDate'].astype(str).str.strip()
+                df['txTime'] = df['txTime'].astype(str).str.strip()
+
+                # 组合成datetime
+                datetime_str = df['txDate'] + ' ' + df['txTime']
+                df['datetime'] = pd.to_datetime(datetime_str, errors='coerce')
+
+                # 统计解析失败的数量
+                failed_parse = df['datetime'].isna().sum()
+                if failed_parse > 0:
+                    print(f"  ⚠️  有 {failed_parse} 条记录的时间解析失败")
+                else:
+                    print(f"  ✅ 所有 {len(df)} 条记录的时间解析成功")
+
+                # 提取时间组件
+                df['hour'] = df['datetime'].dt.hour
+                df['minute'] = df['datetime'].dt.minute
+                df['date_only'] = df['datetime'].dt.date
+                df['day_of_week'] = df['datetime'].dt.dayofweek  # 0=周一, 6=周日
+
+            except Exception as e:
+                print(f"⚠️ 时间解析失败: {e}")
+
+        # 5. 数据质量检查
+        print(f"📊 数据标准化完成统计:")
+        print(f"  总记录数: {len(df)}")
+        if 'txId' in df.columns:
+            unique_ids = df['txId'].nunique()
+            print(f"  唯一交易ID数: {unique_ids}")
+            if unique_ids != len(df):
+                print(f"  ⚠️  警告: 有 {len(df) - unique_ids} 条重复的交易ID")
+
+        if 'datetime' in df.columns:
+            date_range = df['datetime'].min(), df['datetime'].max()
+            print(f"  时间范围: {date_range[0]} 到 {date_range[1]}")
+            print(f"  总天数: {df['date_only'].nunique()}")
+
+        return df
+
+
+    # ==================== 统一的异常记录格式 ====================
+
+    def format_anomaly_record(self, row: pd.Series, reason: str,
+                              severity: str = "medium", **additional_info) -> Dict[str, Any]:
+        """
+        创建标准化的异常记录
+
+        Args:
+            row: 数据行(pandas Series)
+            reason: 异常原因描述
+            severity: 严重程度 (high/medium/low)
+            **additional_info: 额外信息
+
+        Returns:
+            Dict[str, Any]: 标准化异常记录
+        """
+        # 基础字段
+        record = {
+            'recognition_type': self.display_name,
+            'txId': str(row.get('txId', '')),
+            'txDate': str(row.get('txDate', '')),
+            'txTime': str(row.get('txTime', '')),
+            'txAmount': float(row.get('txAmount', 0)),
+            'txDirection': str(row.get('txDirection', '')),
+            'recognition_reason': reason,
+            'severity': severity,
+            'additional_info': additional_info,
+            'status': '待核查',
+            'recognizer_name': self.name
+        }
+
+        # 可选字段
+        optional_fields = ['txBalance', 'txSummary', 'txCounterparty']
+        for field in optional_fields:
+            if field in row and pd.notna(row[field]):
+                record[field] = row[field]
+
+        # 添加datetime信息(如果已标准化)
+        if 'datetime' in row and pd.notna(row['datetime']):
+            record['datetime'] = row['datetime'].strftime("%Y-%m-%d %H:%M:%S")
+
+        return record
+
+    # ==================== 配置管理 ====================
+
+    def get_config_value(self, key: str, default: Any = None) -> Any:
+        """
+        获取配置值
+
+        Args:
+            key: 配置键
+            default: 默认值
+
+        Returns:
+            配置值或默认值
+        """
+        return self._config.get(key, default)
+
+    # ==================== 工具方法 ====================
+
+    def is_night_time(self, hour: int, start_hour: int = 2, end_hour: int = 5) -> bool:
+        """判断是否为夜间时间(凌晨2-5点)"""
+        return start_hour <= hour <= end_hour
+
+    def is_integer_amount(self, amount: float, base_amount: float = 10000.0,
+                          tolerance: float = 0.01) -> bool:
+        """判断是否为整数金额(基准金额的整数倍)"""
+        if pd.isna(amount):
+            return False
+        return abs(amount % base_amount) < tolerance or abs(amount % base_amount - base_amount) < tolerance
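+
+    # Examples: 20000.0 is an exact multiple of 10000 -> True; 9999.995 falls
+    # within the 0.01 tolerance of the next multiple -> True; 10500.0 -> False.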
+
+    def calculate_time_difference(self, time1: datetime, time2: datetime,
+                                  unit: str = 'hours') -> float:
+        """计算两个时间点的时间差"""
+        if pd.isna(time1) or pd.isna(time2):
+            return float('inf')
+
+        diff_seconds = abs((time2 - time1).total_seconds())
+
+        if unit == 'hours':
+            return diff_seconds / 3600
+        elif unit == 'days':
+            return diff_seconds / 86400
+        elif unit == 'minutes':
+            return diff_seconds / 60
+        else:
+            return diff_seconds
+
+    # ==================== 抽象方法(子类必须实现) ====================
+
+    @abstractmethod
+    def recognize(self, **kwargs) -> Dict[str, Any]:
+        """
+        执行异常识别(子类必须实现)
+
+        Returns:
+            Dict[str, Any]: 识别结果,必须包含:
+                - recognition_type: 识别类型
+                - identified_count: 识别的异常数量
+                - identified_anomalies: 异常记录列表
+                - recognition_status: 识别状态
+        """
+        pass
+
+    # ==================== BaseTool接口实现 ====================
+
+    def _run(self, csv_path: str = None, **kwargs) -> Dict[str, Any]:
+        """
+        实现BaseTool的_run方法
+
+        Args:
+            csv_path: CSV文件路径
+            **kwargs: 其他参数
+
+        Returns:
+            Dict[str, Any]: 识别结果
+        """
+        try:
+            start_time = datetime.now()
+
+            # 执行识别
+            result = self.recognize(csv_path=csv_path, **kwargs)
+
+            # 对结果进行标准化处理
+            standardized_result = self._standardize_result(result)
+
+            # 确保结果包含必要字段
+            if 'recognition_type' not in standardized_result:
+                standardized_result['recognition_type'] = self.display_name
+
+            if 'identified_count' not in standardized_result:
+                standardized_result['identified_count'] = len(standardized_result.get('identified_anomalies', []))
+
+            if 'recognition_status' not in standardized_result:
+                standardized_result['recognition_status'] = '完成'
+
+            # 添加执行信息
+            execution_time = (datetime.now() - start_time).total_seconds()
+            standardized_result['execution_info'] = {
+                'recognizer_name': self.name,
+                'display_name': self.display_name,
+                'execution_time_seconds': execution_time,
+                'execution_time': start_time.strftime("%Y-%m-%d %H:%M:%S")
+            }
+
+            # 更新识别计数
+            self._recognized_count = standardized_result.get('identified_count', 0)
+
+            return standardized_result
+
+        except Exception as e:
+            error_msg = f"异常识别失败: {str(e)}"
+            print(f"❌ {self.display_name} - {error_msg}")
+
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': error_msg,
+                'execution_info': {
+                    'recognizer_name': self.name,
+                    'display_name': self.display_name,
+                    'error': str(e)
+                }
+            }
+
+
+    async def _arun(self, **kwargs):
+        raise NotImplementedError("异步识别不支持")
+
+
+    def _standardize_result(self, raw_result: Any) -> Dict[str, Any]:
+        """
+        标准化工具返回结果
+
+        Args:
+            raw_result: 原始结果(可能是dict、str、list等)
+
+        Returns:
+            标准化的字典结果
+        """
+        # 1. 如果已经是字典,直接返回
+        if isinstance(raw_result, dict):
+            return raw_result
+
+        # 2. 如果是字符串
+        elif isinstance(raw_result, str):
+            # 尝试解析JSON
+            try:
+                import json
+                parsed = json.loads(raw_result)
+                if isinstance(parsed, dict):
+                    return parsed
+                else:
+                    # JSON但不是字典(如list、str等)
+                    return {
+                        'recognition_type': self.display_name,
+                        'identified_count': 0,
+                        'identified_anomalies': [],
+                        'recognition_status': '完成',
+                        'raw_output': raw_result[:500]  # 截断长文本
+                    }
+            except json.JSONDecodeError:
+                # 不是JSON格式的字符串
+                # 检查是否是Python字典的字符串表示(如"{'key': 'value'}")
+                if raw_result.startswith('{') and raw_result.endswith('}'):
+                    try:
+                        # 尝试使用ast安全解析
+                        import ast
+                        parsed = ast.literal_eval(raw_result)
+                        if isinstance(parsed, dict):
+                            return parsed
+                    except (SyntaxError, ValueError):
+                        pass
+
+                # 无法解析,包装成标准格式
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '完成',
+                    'error': '工具返回了无法解析的字符串格式',
+                    'raw_output_preview': raw_result[:200]
+                }
+
+        # 3. 其他类型(list、tuple、数字等)
+        else:
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '完成',
+                'error': f'工具返回了非标准类型: {type(raw_result).__name__}',
+                'raw_output': str(raw_result)[:500]
+            }
+
+
+    # ==================== 其他方法 ====================
+
+    def get_summary(self) -> Dict[str, Any]:
+        """获取识别器摘要"""
+        return {
+            'name': self.name,
+            'display_name': self.display_name,
+            'description': self.description,
+            'recognized_count': self._recognized_count,
+            'csv_path': self._csv_path,
+            'config': self._config
+        }
+
+    def get_data_summary(self) -> Dict[str, Any]:
+        """获取数据摘要"""
+        if self._data is None:
+            return {}
+
+        df = self._data
+        summary = {
+            'total_records': len(df),
+            'date_range': None,
+            'amount_stats': None
+        }
+
+        if 'datetime' in df.columns and not df['datetime'].isna().all():
+            summary['date_range'] = {
+                'start': df['datetime'].min().strftime("%Y-%m-%d"),
+                'end': df['datetime'].max().strftime("%Y-%m-%d")
+            }
+
+        if 'txAmount' in df.columns:
+            summary['amount_stats'] = {
+                'mean': float(df['txAmount'].mean()),
+                'max': float(df['txAmount'].max()),
+                'min': float(df['txAmount'].min()),
+                'sum': float(df['txAmount'].sum()),
+                'std': float(df['txAmount'].std())
+            }
+
+        return summary
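+
+
+# A minimal sketch (assumptions flagged inline, not part of the commit) of how
+# a concrete recognizer plugs into this base class: subclass it, implement
+# recognize(), and reuse load_data() / format_anomaly_record().
+if __name__ == "__main__":
+    class _DemoRecognizer(EnhancedBaseRecognizer):
+        def recognize(self, csv_path: str = None, **kwargs) -> Dict[str, Any]:
+            df = self.load_data(csv_path)
+            anomalies = [
+                self.format_anomaly_record(row, reason='demo rule hit', severity='low')
+                for _, row in df.head(1).iterrows()
+            ]
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': len(anomalies),
+                'identified_anomalies': anomalies,
+                'recognition_status': '完成',
+            }
+
+    # 'demo.csv' is a placeholder path; _run() returns an error dict if it is missing.
+    demo = _DemoRecognizer(name='demo_recognizer', description='demo only', csv_path='demo.csv')
+    print(demo._run())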

+ 354 - 0
llmops/agents/tools/high_frequency_transaction_recognizer.py

@@ -0,0 +1,354 @@
+from pydantic import BaseModel, Field
+from typing import Dict, Any, Optional, Type
+import pandas as pd
+
+from .enhanced_base_recognizer import EnhancedBaseRecognizer
+
+
+class HighFrequencyInput(BaseModel):
+    """高频交易识别工具输入"""
+    csv_path: Optional[str] = Field(
+        None,
+        description="CSV文件路径(可选)。如果初始化时已提供csv_path,可以不用再次传入。"
+    )
+
+    class Config:
+        arbitrary_types_allowed = True
+
+
+class HighFrequencyTransactionRecognizer(EnhancedBaseRecognizer):
+    """
+    高频交易异常识别器
+
+    严格遵循业务规则定义:
+    银行流水高频交易(按小时维度)的定义为:
+    以"日期 + 小时"为统计单位,通过汇总该时间间隔内的交易笔数与交易金额,
+    识别出单小时交易笔数超过 10 笔的特定时段组合,
+    此类在短时间内(1 小时)集中发生、交易频次密集的资金收付行为,
+    即为银行流水高频交易,其核心特征是交易笔数在单位小时内达到预设阈值(10 笔),
+    体现出资金往来的集中性与活跃度异常。
+    """
+
+    args_schema: Type[BaseModel] = HighFrequencyInput
+
+    # 配置参数 - 严格按照业务规则
+    frequency_threshold: int = Field(
+        10,
+        description="高频交易阈值,每小时超过此笔数视为高频交易(业务规则要求10笔)"
+    )
+
+    def __init__(self, csv_path: str = None, config: Dict[str, Any] = None, **kwargs):
+        """
+        初始化高频交易识别器
+
+        Args:
+            csv_path: CSV文件路径
+            config: 配置参数
+            **kwargs: 其他参数
+        """
+        # 调用父类的 __init__
+        super().__init__(
+            name="high_frequency_recognizer",
+            description="识别银行流水中的高频交易异常,严格按照业务规则:单小时交易笔数超过10笔即为高频异常。",
+            display_name="高频交易异常识别",
+            csv_path=csv_path,
+            config=config,
+            **kwargs
+        )
+
+        # 从config获取配置
+        high_freq_config = self.get_config_value('high_frequency', {})
+        if high_freq_config and 'frequency_threshold' in high_freq_config:
+            self.frequency_threshold = high_freq_config['frequency_threshold']
+
+        print(f"✅ {self.display_name} 初始化完成")
+        print(f"  严格遵循业务规则:单小时交易笔数 > {self.frequency_threshold}笔 = 高频异常")
+
+    def _calculate_hourly_statistics(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        计算每小时的交易统计数据
+
+        Args:
+            df: 标准化后的交易数据
+
+        Returns:
+            pd.DataFrame: 每小时统计结果
+        """
+        # Extract date and hour on a copy so the caller's DataFrame is not mutated
+        df = df.copy()
+        df['date'] = df['datetime'].dt.date
+        df['hour'] = df['datetime'].dt.hour
+
+        # 按照业务规则:以"日期 + 小时"为统计单位
+        hour_stats = []
+
+        # 按日期+小时分组
+        for (date_val, hour_val), group in df.groupby(['date', 'hour']):
+            transaction_count = len(group)
+            amount_sum = group['txAmount'].sum()
+
+            # 统计交易方向分布
+            direction_counts = group['txDirection'].value_counts().to_dict()
+
+            hour_stats.append({
+                'date': date_val,
+                'hour': hour_val,
+                'transaction_count': transaction_count,
+                'amount_sum': amount_sum,
+                'amount_avg': amount_sum / transaction_count if transaction_count > 0 else 0,
+                'in_count': direction_counts.get('收入', 0),
+                'out_count': direction_counts.get('支出', 0)
+            })
+
+        return pd.DataFrame(hour_stats)
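+
+    # Design note: the explicit loop above keeps the per-direction counts easy
+    # to read; an equivalent vectorized sketch would be
+    #   df.groupby(['date', 'hour']).agg(transaction_count=('txId', 'size'),
+    #                                    amount_sum=('txAmount', 'sum'))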
+
+    def _identify_high_frequency_periods(self, hour_stats: pd.DataFrame) -> pd.DataFrame:
+        """
+        识别高频交易时段
+
+        Args:
+            hour_stats: 每小时统计数据
+
+        Returns:
+            pd.DataFrame: 标记了高频时段的结果
+        """
+        # 严格按照业务规则:单小时交易笔数超过10笔
+        hour_stats['is_high_frequency'] = hour_stats['transaction_count'] > self.frequency_threshold
+
+        return hour_stats
+
+    def _get_transactions_in_period(self, df: pd.DataFrame, date_val, hour_val: int) -> pd.DataFrame:
+        """
+        获取指定时段内的所有交易
+
+        Args:
+            df: 原始交易数据
+            date_val: 日期
+            hour_val: 小时
+
+        Returns:
+            pd.DataFrame: 指定时段内的交易
+        """
+        return df[
+            (df['datetime'].dt.date == date_val) &
+            (df['datetime'].dt.hour == hour_val)
+            ].copy()
+
+    def recognize(self, csv_path: str = None, **kwargs) -> Dict[str, Any]:
+        """
+        识别高频交易异常 - 严格按照业务规则
+
+        Args:
+            csv_path: CSV文件路径
+            **kwargs: 其他参数
+
+        Returns:
+            Dict[str, Any]: 识别结果
+        """
+        try:
+            # 使用父类的load_data方法加载标准化数据
+            df = self.load_data(csv_path)
+
+            print(f"🔍 {self.display_name}开始检查,共 {len(df)} 条记录")
+            print(f"  业务规则: 以'日期 + 小时'为统计单位")
+            print(f"  高频阈值: 单小时交易笔数 > {self.frequency_threshold}笔")
+
+            # 检查必需字段
+            required_fields = ['txId', 'datetime', 'txAmount', 'txDirection']
+            missing_fields = [field for field in required_fields if field not in df.columns]
+
+            if missing_fields:
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '失败',
+                    'error': f'缺少必需字段: {missing_fields}'
+                }
+
+            # 确保datetime列已正确解析
+            if not pd.api.types.is_datetime64_any_dtype(df['datetime']):
+                df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
+
+            # 检查是否有无效的时间数据
+            invalid_times = df['datetime'].isna().sum()
+            if invalid_times > 0:
+                print(f"⚠️  警告: 有 {invalid_times} 条记录的时间解析失败,将跳过这些记录")
+                df = df[df['datetime'].notna()]
+
+            if len(df) == 0:
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '完成',
+                    'statistics': {'total_valid_transactions': 0}
+                }
+
+            # ============ 按照业务规则:以"日期 + 小时"为统计单位 ============
+            hour_stats = self._calculate_hourly_statistics(df)
+
+            if len(hour_stats) == 0:
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '完成',
+                    'statistics': {
+                        'total_transactions': len(df),
+                        'hour_periods': 0,
+                        'high_frequency_periods': 0
+                    }
+                }
+
+            print(f"📊 按'日期+小时'分组统计完成,共 {len(hour_stats)} 个时段")
+
+            # ============ 识别高频时段 ============
+            hour_stats = self._identify_high_frequency_periods(hour_stats)
+
+            # 高频时段统计
+            high_freq_periods = hour_stats[hour_stats['is_high_frequency']]
+
+            print(f"📊 发现 {len(high_freq_periods)} 个高频时段(> {self.frequency_threshold}笔/小时)")
+
+            # ============ 生成异常记录 ============
+            identified_anomalies = []
+
+            # 为每个高频时段创建异常记录
+            for _, period_row in high_freq_periods.iterrows():
+                date_val = period_row['date']
+                hour_val = period_row['hour']
+
+                # 获取该时段内的所有交易
+                period_transactions = self._get_transactions_in_period(df, date_val, hour_val)
+
+                # 为每笔交易创建异常记录
+                for _, tx_row in period_transactions.iterrows():
+                    # 生成异常原因 - 严格按照业务规则描述
+                    reason = f"属于高频交易时段:{date_val} {hour_val:02d}:00-{hour_val + 1:02d}:00,该时段共{period_row['transaction_count']}笔交易,超过阈值{self.frequency_threshold}笔,体现资金往来的集中性与活跃度异常"
+
+                    # 额外信息
+                    additional_info = {
+                        'period_info': {
+                            'date': date_val.strftime('%Y-%m-%d'),
+                            'hour': int(hour_val),
+                            'start_time': f"{hour_val:02d}:00",
+                            'end_time': f"{hour_val + 1:02d}:00",
+                            'transaction_count': int(period_row['transaction_count']),
+                            'amount_sum': float(period_row['amount_sum']),
+                            'amount_avg': float(period_row['amount_avg']),
+                            'in_count': int(period_row['in_count']),
+                            'out_count': int(period_row['out_count'])
+                        },
+                        'business_rule': {
+                            'statistic_unit': "日期 + 小时",
+                            'threshold': self.frequency_threshold,
+                            'description': "单小时交易笔数超过阈值,体现资金往来的集中性与活跃度异常"
+                        }
+                    }
+
+                    anomaly = self.format_anomaly_record(
+                        row=tx_row,
+                        reason=reason,
+                        severity='high',  # 高频交易通常视为高风险
+                        check_type='high_frequency_transaction',
+                        **additional_info
+                    )
+                    identified_anomalies.append(anomaly)
+
+            # ============ 结果统计 ============
+            print(f"✅ {self.display_name}检查完成")
+            print(f"  检查交易总数: {len(df)}")
+            print(f"  统计时段总数: {len(hour_stats)}")
+            print(f"  高频时段数: {len(high_freq_periods)}")
+            print(f"  异常交易笔数: {len(identified_anomalies)}")
+
+            # 显示高频时段详情
+            if len(high_freq_periods) > 0:
+                print("📋 高频时段详情(按交易笔数排序):")
+                sorted_periods = high_freq_periods.sort_values('transaction_count', ascending=False)
+                for i, (_, row) in enumerate(sorted_periods.iterrows(), 1):
+                    time_range = f"{row['hour']:02d}:00-{row['hour'] + 1:02d}:00"
+                    print(f"  {i}. {row['date']} {time_range}: {row['transaction_count']}笔交易")
+                    print(f"     收入: {row['in_count']}笔,支出: {row['out_count']}笔")
+                    print(f"     总金额: ¥{row['amount_sum']:,.2f},笔均: ¥{row['amount_avg']:,.2f}")
+
+            # 显示整体统计
+            if len(hour_stats) > 0:
+                max_transactions = hour_stats['transaction_count'].max()
+                avg_transactions = hour_stats['transaction_count'].mean()
+                print(f"📊 整体统计: 最高{max_transactions}笔/小时,平均{avg_transactions:.1f}笔/小时")
+
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': len(identified_anomalies),
+                'identified_anomalies': identified_anomalies,
+                'recognition_status': '完成',
+                'recognition_parameters': {
+                    'frequency_threshold': self.frequency_threshold,
+                    'statistic_unit': "日期 + 小时",
+                    'business_rule': "单小时交易笔数超过阈值即视为高频交易异常"
+                },
+                'statistics': {
+                    'total_transactions': len(df),
+                    'total_periods': len(hour_stats),
+                    'high_frequency_periods': len(high_freq_periods),
+                    'period_statistics': {
+                        'max_transactions_per_hour': int(hour_stats['transaction_count'].max()),
+                        'min_transactions_per_hour': int(hour_stats['transaction_count'].min()),
+                        'avg_transactions_per_hour': float(hour_stats['transaction_count'].mean()),
+                        'max_amount_per_hour': float(hour_stats['amount_sum'].max()),
+                        'min_amount_per_hour': float(hour_stats['amount_sum'].min()),
+                        'avg_amount_per_hour': float(hour_stats['amount_sum'].mean())
+                    },
+                    'high_frequency_details': [
+                        {
+                            'date': row['date'].strftime('%Y-%m-%d'),
+                            'hour': int(row['hour']),
+                            'transaction_count': int(row['transaction_count']),
+                            'amount_sum': float(row['amount_sum']),
+                            'in_count': int(row['in_count']),
+                            'out_count': int(row['out_count'])
+                        }
+                        for _, row in high_freq_periods.iterrows()
+                    ] if len(high_freq_periods) > 0 else []
+                }
+            }
+
+        except FileNotFoundError as e:
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'文件不存在: {str(e)}'
+            }
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'数据加载或处理失败: {str(e)}'
+            }
+
+    def get_summary(self) -> Dict[str, Any]:
+        """获取识别器摘要"""
+        summary = super().get_summary()
+        summary.update({
+            'frequency_threshold': self.frequency_threshold,
+            'business_rule': '单小时交易笔数超过阈值即视为高频交易异常',
+            'data_loaded': self._data is not None
+        })
+        return summary
+
+    def get_config_summary(self) -> Dict[str, Any]:
+        """获取配置摘要"""
+        return {
+            "高频交易阈值": f"{self.frequency_threshold}笔/小时",
+            "统计单位": "日期 + 小时",
+            "检测逻辑": "单小时交易笔数 > 阈值 = 高频交易异常",
+            "业务规则描述": "识别短时间内集中发生、交易频次密集的资金收付行为"
+        }
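+
+
+# Self-contained sketch of the "date + hour" bucketing rule (assumed sample
+# data, not part of the commit): 11 transactions inside one hour exceed the
+# default threshold of 10 and would be flagged as one high-frequency period.
+if __name__ == "__main__":
+    ts = pd.to_datetime([f'2024-01-01 09:{m:02d}:00' for m in range(11)])
+    sample = pd.DataFrame({'datetime': ts, 'txAmount': [100.0] * 11})
+    counts = sample.groupby([sample['datetime'].dt.date, sample['datetime'].dt.hour]).size()
+    print(counts[counts > 10])  # one bucket: (2024-01-01, 9) with 11 transactions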

+ 450 - 0
llmops/agents/tools/inactive_account_recognizer.py

@@ -0,0 +1,450 @@
+
+from pydantic import BaseModel, Field
+from typing import Dict, Any, Optional, Type, List
+import pandas as pd
+
+from .enhanced_base_recognizer import EnhancedBaseRecognizer
+
+
+class InactiveAccountInput(BaseModel):
+    """长期无交易账户识别工具输入"""
+    csv_path: Optional[str] = Field(
+        None,
+        description="CSV文件路径(可选)。如果初始化时已提供csv_path,可以不用再次传入。"
+    )
+
+    class Config:
+        arbitrary_types_allowed = True
+
+
+class InactiveAccountRecognizer(EnhancedBaseRecognizer):
+    """
+    长期无交易账户识别器
+
+    异常规则定义:
+    若发现账户在指定周期内未产生任何流水明细,与正常经营或资金往来应具备的交易活跃度不符,
+    违背业务常理,可判定为流水存在异常,完整性存疑。
+
+    核心逻辑:
+    1. 数据只属于一个账户,无需账户标识字段
+    2. 从数据最早日期开始计算连续无交易天数
+    3. 检查是否存在长时间无交易的"空白期"
+    """
+
+    args_schema: Type[BaseModel] = InactiveAccountInput
+
+    # 配置参数(简化版)
+    inactive_period_days: int = Field(
+        30,
+        description="无交易天数阈值(天),连续无交易超过此天数视为异常"
+    )
+
+    # 严重程度配置
+    severity_level: str = Field(
+        'medium',
+        description="异常严重程度(high/medium/low)"
+    )
+
+    def __init__(self, csv_path: str = None, config: Dict[str, Any] = None, **kwargs):
+        """
+        初始化长期无交易账户识别器
+
+        Args:
+            csv_path: CSV文件路径
+            config: 配置参数
+            **kwargs: 其他参数
+        """
+        # 调用父类的 __init__
+        super().__init__(
+            name="inactive_account_recognizer",
+            description="识别在指定周期内无任何交易记录的异常账户,检查流水完整性。",
+            display_name="长期无交易账户识别器",
+            csv_path=csv_path,
+            config=config,
+            **kwargs
+        )
+
+        # 从config获取配置,更新Field属性
+        inactive_config = self.get_config_value('inactive_account_check', {})
+        if inactive_config:
+            config_mapping = {
+                'inactive_period_days': 'inactive_period_days',
+                'severity_level': 'severity_level'
+            }
+
+            for config_key, attr_name in config_mapping.items():
+                if config_key in inactive_config:
+                    setattr(self, attr_name, inactive_config[config_key])
+
+        print(f"✅ {self.display_name} 初始化完成")
+        print(f"  无交易天数阈值: {self.inactive_period_days}天")
+        print(f"  异常严重程度: {self.severity_level}")
+        print(f"  账户假设: 整个文件视为单个账户")
+        print(f"  基准日策略: 从最早交易日期开始检查")
+
+    def _check_long_inactive_periods(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
+        """
+        检查长时间无交易的空白期
+
+        Args:
+            df: 交易数据(已按时间排序)
+
+        Returns:
+            List[Dict[str, Any]]: 发现的空白期列表
+        """
+        if len(df) < 2:
+            return []
+
+        # 确保按时间排序
+        df = df.sort_values('datetime')
+
+        inactive_periods = []
+
+        # 检查交易之间的时间间隔
+        for i in range(len(df) - 1):
+            current_date = df.iloc[i]['datetime']
+            next_date = df.iloc[i + 1]['datetime']
+
+            # 计算天数差
+            days_diff = (next_date - current_date).days
+
+            # 如果间隔超过阈值,记录为空白期
+            if days_diff > self.inactive_period_days:
+                period_info = {
+                    'start_date': current_date,
+                    'end_date': next_date,
+                    'inactive_days': days_diff,
+                    'period_index': i,
+                    'next_tx_id': df.iloc[i + 1]['txId']
+                }
+                inactive_periods.append(period_info)
+
+        return inactive_periods
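+
+    # Worked example (illustrative): with inactive_period_days = 30, consecutive
+    # transactions dated 2024-01-01 and 2024-03-01 are 60 days apart, so that
+    # span is recorded as a single inactive period of 60 days.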
+
+    def _check_data_beginning_gap(self, df: pd.DataFrame, earliest_date: pd.Timestamp,
+                                  data_coverage_days: int) -> Optional[Dict[str, Any]]:
+        """
+        检查数据开始前是否有空白期
+
+        Args:
+            df: 交易数据
+            earliest_date: 数据中最早的交易日期
+            data_coverage_days: 数据覆盖的总天数
+
+        Returns:
+            空白期信息或None
+        """
+        # 如果数据覆盖天数足够长(比如超过60天),但前面部分没有交易
+        # 这本身可能就是一个空白期的迹象
+        if data_coverage_days > self.inactive_period_days:
+            # 找到第一条交易后的日期范围
+            second_date = df.sort_values('datetime').iloc[1]['datetime'] if len(df) > 1 else earliest_date
+            first_gap = (second_date - earliest_date).days if len(df) > 1 else 0
+
+            if first_gap > self.inactive_period_days:
+                return {
+                    'type': 'beginning_gap',
+                    'inactive_days': first_gap,
+                    'start_date': earliest_date,
+                    'end_date': second_date
+                }
+
+        return None
+
+    def recognize(self, csv_path: str = None, **kwargs) -> Dict[str, Any]:
+        """
+        识别长期无交易账户异常
+
+        Args:
+            csv_path: CSV文件路径
+            **kwargs: 其他参数
+
+        Returns:
+            Dict[str, Any]: 识别结果
+        """
+        try:
+            # 使用父类的load_data方法加载标准化数据
+            df = self.load_data(csv_path)
+
+            print(f"🔍 {self.display_name}开始检查,共 {len(df)} 条记录")
+            print(f"  检查规则: 从最早交易日期开始,检查是否存在连续{self.inactive_period_days}天以上的无交易空白期")
+
+            # 检查必需字段
+            if 'datetime' not in df.columns:
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '失败',
+                    'error': '缺少必需字段: datetime(时间信息)',
+                    'recommendation': '请确保数据包含有效的日期时间信息'
+                }
+
+            # ============ 数据基本情况分析 ============
+            earliest_date = df['datetime'].min()
+            latest_date = df['datetime'].max()
+            data_coverage_days = (latest_date - earliest_date).days + 1
+
+            print(f"📊 数据基本情况:")
+            print(f"  时间范围: {earliest_date.strftime('%Y-%m-%d')} 至 {latest_date.strftime('%Y-%m-%d')}")
+            print(f"  数据覆盖天数: {data_coverage_days}天")
+            print(f"  总交易笔数: {len(df)}")
+            print(f"  日均交易笔数: {len(df) / data_coverage_days:.2f}笔/天")
+
+            # ============ 检查数据是否足够 ============
+            if data_coverage_days < self.inactive_period_days:
+                print(f"⚠️ 数据不足: 仅覆盖{data_coverage_days}天,小于阈值{self.inactive_period_days}天")
+                print(f"  建议: 需要更长时间范围的数据才能准确判断")
+
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '完成',
+                    'recognition_parameters': {
+                        'inactive_period_days': self.inactive_period_days,
+                        'severity_level': self.severity_level,
+                        'data_coverage_days': data_coverage_days
+                    },
+                    'statistics': {
+                        'total_transactions': len(df),
+                        'data_coverage_days': data_coverage_days,
+                        'date_range': {
+                            'start': earliest_date.strftime('%Y-%m-%d'),
+                            'end': latest_date.strftime('%Y-%m-%d')
+                        },
+                        'transaction_frequency': len(df) / data_coverage_days if data_coverage_days > 0 else 0
+                    },
+                    'note': f'数据覆盖天数({data_coverage_days}天)不足阈值({self.inactive_period_days}天),无法准确判断'
+                }
+
+            # ============ 检查长时间空白期 ============
+            identified_anomalies = []
+
+            # 1. 检查交易之间的空白期
+            inactive_periods = self._check_long_inactive_periods(df)
+
+            for period in inactive_periods:
+                start_date = period['start_date']
+                end_date = period['end_date']
+                inactive_days = period['inactive_days']
+                next_tx_id = period['next_tx_id']
+
+                # 生成异常原因
+                reason = f"发现长时间无交易空白期:从{start_date.strftime('%Y-%m-%d')}到{end_date.strftime('%Y-%m-%d')},连续{inactive_days}天无任何交易,超过阈值{self.inactive_period_days}天"
+
+                print(f"  ❌ 发现空白期: {reason}")
+
+                # 获取下一笔交易的详细信息
+                next_tx_data = df[df['txId'] == next_tx_id]
+
+                if not next_tx_data.empty:
+                    next_tx = next_tx_data.iloc[0]
+
+                    # 使用真实交易数据创建异常记录
+                    anomaly = {
+                        'txId': str(next_tx_id),
+                        'txDate': str(next_tx['txDate']),
+                        'txTime': str(next_tx['txTime']),
+                        'datetime': next_tx['datetime'] if 'datetime' in next_tx else end_date,
+                        'txAmount': float(next_tx['txAmount']),
+                        'txDirection': str(next_tx['txDirection']),
+                        'recognition_reason': f"长期无交易异常:账户在{start_date.strftime('%Y-%m-%d')}至{end_date.strftime('%Y-%m-%d')}期间连续{inactive_days}天无任何交易,超过阈值{self.inactive_period_days}天。此笔交易({next_tx_id})为空白期后的首笔交易",
+                        'severity': self.severity_level,
+                        'status': '待核查',
+                        'check_type': 'inactive_account_period',
+                        'period_info': {
+                            'start_date': start_date.strftime('%Y-%m-%d'),
+                            'end_date': end_date.strftime('%Y-%m-%d'),
+                            'inactive_days': inactive_days,
+                            'threshold_days': self.inactive_period_days,
+                            'next_tx_id': next_tx_id,
+                            'next_tx_info': {
+                                'txDate': str(next_tx['txDate']),
+                                'txTime': str(next_tx['txTime']),
+                                'txAmount': float(next_tx['txAmount']),
+                                'txDirection': str(next_tx['txDirection'])
+                            }
+                        }
+                    }
+
+                    # 创建包含所有必要字段的Series
+                    anomaly_series = pd.Series({
+                        'txId': anomaly['txId'],
+                        'txDate': anomaly['txDate'],
+                        'txTime': anomaly['txTime'],
+                        'txAmount': anomaly['txAmount'],
+                        'txDirection': anomaly['txDirection'],
+                        'txBalance': next_tx.get('txBalance', None),
+                        'txSummary': next_tx.get('txSummary', ''),
+                        'txCounterparty': next_tx.get('txCounterparty', ''),
+                        'datetime': anomaly['datetime']
+                    })
+                else:
+                    # 如果找不到下一笔交易,使用改进的默认格式
+                    print(f"⚠️ 警告:未找到交易ID {next_tx_id} 的详细信息,使用默认格式")
+                    anomaly = {
+                        'txId': str(next_tx_id),
+                        'txDate': end_date.strftime('%Y-%m-%d'),
+                        'txTime': '23:59:59',
+                        'datetime': end_date,
+                        'txAmount': 0.0,
+                        'txDirection': '收入',  # 默认设为收入
+                        'recognition_reason': f"{reason}",
+                        'severity': self.severity_level,
+                        'status': '待核查',
+                        'check_type': 'inactive_account_period',
+                        'period_info': {
+                            'start_date': start_date.strftime('%Y-%m-%d'),
+                            'end_date': end_date.strftime('%Y-%m-%d'),
+                            'inactive_days': inactive_days,
+                            'threshold_days': self.inactive_period_days
+                        }
+                    }
+
+                    anomaly_series = pd.Series({
+                        'txId': anomaly['txId'],
+                        'txDate': anomaly['txDate'],
+                        'txTime': anomaly['txTime'],
+                        'txAmount': anomaly['txAmount'],
+                        'txDirection': anomaly['txDirection'],
+                        'datetime': anomaly['datetime']
+                    })
+
+                # 格式化异常记录
+                formatted_anomaly = self.format_anomaly_record(
+                    row=anomaly_series,
+                    reason=anomaly['recognition_reason'],
+                    severity=anomaly['severity'],
+                    check_type=anomaly['check_type'],
+                    **anomaly['period_info']
+                )
+                identified_anomalies.append(formatted_anomaly)
+
+            # 2. 检查整体交易活跃度(如果整个数据期交易都很少)
+            avg_transactions_per_day = len(df) / data_coverage_days
+            if avg_transactions_per_day < 0.1:  # 平均每天不足0.1笔交易
+                print(f"⚠️ 交易活跃度极低: 平均{avg_transactions_per_day:.2f}笔/天")
+
+                # 生成低活跃度异常
+                low_activity_reason = f"账户整体交易活跃度极低:{data_coverage_days}天内仅{len(df)}笔交易,平均{avg_transactions_per_day:.2f}笔/天,不符合正常资金往来特征"
+
+                # 使用最后一笔交易作为异常记录的基础
+                last_tx = df.sort_values('datetime').iloc[-1] if len(df) > 0 else None  # 按时间取最后一笔
+
+                if last_tx is not None:
+                    anomaly_series = pd.Series({
+                        'txId': 'LOW_ACTIVITY_OVERALL',
+                        'txDate': last_tx['txDate'],
+                        'txTime': last_tx['txTime'],
+                        'txAmount': float(last_tx['txAmount']),
+                        'txDirection': str(last_tx['txDirection']),
+                        'txBalance': last_tx.get('txBalance', None),
+                        'txSummary': last_tx.get('txSummary', ''),
+                        'txCounterparty': last_tx.get('txCounterparty', ''),
+                        'datetime': last_tx.get('datetime', latest_date)
+                    })
+                else:
+                    anomaly_series = pd.Series({
+                        'txId': 'LOW_ACTIVITY_OVERALL',
+                        'txDate': latest_date.strftime('%Y-%m-%d'),
+                        'txTime': '23:59:59',
+                        'txAmount': 0.0,
+                        'txDirection': '收入',
+                        'datetime': latest_date
+                    })
+
+                formatted_anomaly = self.format_anomaly_record(
+                    row=anomaly_series,
+                    reason=low_activity_reason,
+                    severity={'high': 'medium', 'medium': 'low'}.get(self.severity_level, self.severity_level),  # 降一级严重度(high→medium,medium→low)
+                    check_type='low_activity_overall',
+                    activity_metrics={
+                        'data_coverage_days': data_coverage_days,
+                        'total_transactions': len(df),
+                        'avg_transactions_per_day': avg_transactions_per_day
+                    }
+                )
+                identified_anomalies.append(formatted_anomaly)
+
+            # ============ 结果统计 ============
+            print(f"✅ {self.display_name}检查完成")
+            print(f"  检查结果:")
+            print(f"    空白期数量: {len(inactive_periods)}")
+            print(f"    异常记录数: {len(identified_anomalies)}")
+
+            if len(inactive_periods) == 0:
+                print(f"    ✅ 未发现超过{self.inactive_period_days}天的空白期")
+
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': len(identified_anomalies),
+                'identified_anomalies': identified_anomalies,
+                'recognition_status': '完成',
+                'recognition_parameters': {
+                    'inactive_period_days': self.inactive_period_days,
+                    'severity_level': self.severity_level,
+                    'check_strategy': '从最早交易日期开始检查空白期'
+                },
+                'statistics': {
+                    'total_transactions': len(df),
+                    'data_coverage_days': data_coverage_days,
+                    'date_range': {
+                        'start': earliest_date.strftime('%Y-%m-%d'),
+                        'end': latest_date.strftime('%Y-%m-%d')
+                    },
+                    'transaction_frequency': avg_transactions_per_day,
+                    'inactive_periods_count': len(inactive_periods),
+                    'inactive_periods_details': [
+                        {
+                            'start_date': p['start_date'].strftime('%Y-%m-%d'),
+                            'end_date': p['end_date'].strftime('%Y-%m-%d'),
+                            'inactive_days': p['inactive_days'],
+                            'next_tx_id': p['next_tx_id']
+                        }
+                        for p in inactive_periods
+                    ]
+                }
+            }
+
+        except FileNotFoundError as e:
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'文件不存在: {str(e)}'
+            }
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'数据加载或处理失败: {str(e)}'
+            }
+
+
+    def get_summary(self) -> Dict[str, Any]:
+        """获取识别器摘要"""
+        summary = super().get_summary()
+        summary.update({
+            'inactive_period_days': self.inactive_period_days,
+            'severity_level': self.severity_level,
+            'data_loaded': self._data is not None,
+            'check_strategy': '从最早交易日期开始检查连续无交易空白期'
+        })
+        return summary
+
+    def get_config_summary(self) -> Dict[str, Any]:
+        """获取配置摘要"""
+        return {
+            "无交易天数阈值": f"{self.inactive_period_days}天",
+            "异常严重程度": self.severity_level.upper(),
+            "检测逻辑": f"从最早交易日期开始,检查连续{self.inactive_period_days}天以上的无交易空白期",
+            "账户假设": "整个文件视为单个账户",
+            "基准日策略": "以数据中最早交易日期为起点",
+            "业务规则描述": "连续长时间无任何交易,与正常经营或资金往来的交易活跃度不符"
+        }
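
A minimal sketch of the gap rule this recognizer applies: sort by datetime, diff consecutive timestamps, and flag any gap above the threshold. Column names follow the module above; the 30-day threshold and the demo rows are invented for illustration.

    import pandas as pd

    def find_inactive_gaps(df: pd.DataFrame, threshold_days: int = 30) -> list:
        # Returns (gap_start, gap_end, gap_days, next_tx_id) for each oversized gap
        s = df.sort_values('datetime').reset_index(drop=True)
        day_diffs = s['datetime'].diff().dt.days
        return [
            (s.loc[i - 1, 'datetime'], s.loc[i, 'datetime'], int(d), s.loc[i, 'txId'])
            for i, d in day_diffs.items()
            if pd.notna(d) and d > threshold_days
        ]

    demo = pd.DataFrame({
        'txId': ['T1', 'T2'],
        'datetime': pd.to_datetime(['2024-01-01', '2024-02-15']),
    })
    print(find_inactive_gaps(demo))  # one 45-day gap, attributed to T2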

+ 539 - 0
llmops/agents/tools/large_amount_transaction_recognizer.py

@@ -0,0 +1,539 @@
+from pydantic import BaseModel, Field
+from typing import Dict, Any, Optional, Type, List
+import pandas as pd
+
+from .enhanced_base_recognizer import EnhancedBaseRecognizer
+
+
+class LargeAmountInput(BaseModel):
+    """大额交易识别工具输入"""
+    csv_path: Optional[str] = Field(
+        None,
+        description="CSV文件路径(可选)。如果初始化时已提供csv_path,可以不用再次传入。"
+    )
+
+    class Config:
+        arbitrary_types_allowed = True
+
+
+class LargeAmountTransactionRecognizer(EnhancedBaseRecognizer):
+    """
+    大额交易异常识别器
+
+    业务规则定义:
+    若交易对手方个人银行账户出现单次交易金额超过预设阈值(如 5 万元、20 万元等)的大额资金往来,
+    且该交易与账户日常交易规模、资金使用场景及个人经济活动特征不匹配,缺乏合理交易背景支撑,
+    可触发大额交易异常提示,需进一步核查该笔交易的真实性、合法性及资金来源与去向。
+    """
+
+    args_schema: Type[BaseModel] = LargeAmountInput
+
+    # 配置参数
+    amount_threshold: float = Field(
+        50000.0,
+        description="大额交易阈值(元),单次交易金额超过此值视为大额交易"
+    )
+
+    # 历史分析参数
+    history_days: int = Field(
+        90,
+        description="历史分析天数,用于分析账户日常交易规模"
+    )
+
+    outlier_multiplier: float = Field(
+        3.0,
+        description="异常倍数阈值,交易金额超过历史均值的多少倍视为异常"
+    )
+
+    # 背景分析参数
+    enable_background_check: bool = Field(
+        True,
+        description="是否启用交易背景检查"
+    )
+
+    # 合理背景关键词(常见的大额合理交易场景)
+    reasonable_background_keywords: List[str] = Field(
+        [
+            "工资", "奖金", "绩效", "年终奖", "报销", "货款", "租金",
+            "购房款", "装修款", "学费", "医疗费", "保险", "理财",
+            "投资款", "分红", "还款", "借款", "赠与", "遗产"
+        ],
+        description="合理的交易背景关键词,用于识别可能有合理背景的大额交易"
+    )
+
+    # 高风险关键词(可能需要关注的场景)
+    high_risk_keywords: List[str] = Field(
+        [
+            "赌博", "赌资", "彩票", "博彩", "虚拟货币", "比特币",
+            "地下钱庄", "洗钱", "套现", "非法", "不明", "无摘要"
+        ],
+        description="高风险关键词,出现这些词的大额交易需要重点关注"
+    )
+
+    def __init__(self, csv_path: str = None, config: Dict[str, Any] = None, **kwargs):
+        """
+        初始化大额交易识别器
+
+        Args:
+            csv_path: CSV文件路径
+            config: 配置参数
+            **kwargs: 其他参数
+        """
+        # 调用父类的 __init__
+        super().__init__(
+            name="large_amount_recognizer",
+            description="识别银行流水中的大额交易异常,检测单次交易金额超过阈值且与账户历史行为不匹配的交易。",
+            display_name="大额交易异常识别",
+            csv_path=csv_path,
+            config=config,
+            **kwargs
+        )
+
+        # 从config获取配置,更新Field属性
+        large_amount_config = self.get_config_value('large_amount_recognition', {})
+        if large_amount_config:
+            config_mapping = {
+                'amount_threshold': 'amount_threshold',
+                'history_days': 'history_days',
+                'outlier_multiplier': 'outlier_multiplier',
+                'enable_background_check': 'enable_background_check',
+                'reasonable_background_keywords': 'reasonable_background_keywords',
+                'high_risk_keywords': 'high_risk_keywords'
+            }
+
+            for config_key, attr_name in config_mapping.items():
+                if config_key in large_amount_config:
+                    setattr(self, attr_name, large_amount_config[config_key])
+
+        print(f"✅ {self.display_name} 初始化完成")
+        print(f"  金额阈值: ¥{self.amount_threshold:,.2f}")
+        print(f"  历史分析天数: {self.history_days}天")
+        print(f"  异常倍数阈值: {self.outlier_multiplier}倍")
+        print(f"  背景检查: {'启用' if self.enable_background_check else '禁用'}")
+        print(f"  合理背景关键词: {len(self.reasonable_background_keywords)}个")
+        print(f"  高风险关键词: {len(self.high_risk_keywords)}个")
+
+    def _analyze_account_history(self, df: pd.DataFrame, current_date: pd.Timestamp) -> Dict[str, Any]:
+        """
+        分析账户历史交易特征
+
+        Args:
+            df: 交易数据
+            current_date: 当前交易日期
+
+        Returns:
+            Dict[str, Any]: 账户历史交易特征
+        """
+        # 计算历史日期范围
+        history_start = current_date - pd.Timedelta(days=self.history_days)
+
+        # 筛选历史交易(当前日期之前的历史数据)
+        history_df = df[df['datetime'] < current_date]
+        history_df = history_df[history_df['datetime'] >= history_start]
+
+        if len(history_df) == 0:
+            return {
+                'has_history': False,
+                'message': f'无最近{self.history_days}天的历史交易数据'
+            }
+
+        # 计算历史交易特征
+        history_features = {
+            'has_history': True,
+            'history_days': self.history_days,
+            'total_transactions': len(history_df),
+            'avg_amount': float(history_df['txAmount'].mean()),
+            'max_amount': float(history_df['txAmount'].max()),
+            'min_amount': float(history_df['txAmount'].min()),
+            'std_amount': float(history_df['txAmount'].std()) if len(history_df) > 1 else 0.0,  # 单样本std为NaN,置0
+            'total_income': float(history_df[history_df['txDirection'] == '收入']['txAmount'].sum()),
+            'total_expense': float(history_df[history_df['txDirection'] == '支出']['txAmount'].sum()),
+            'income_count': len(history_df[history_df['txDirection'] == '收入']),
+            'expense_count': len(history_df[history_df['txDirection'] == '支出']),
+            'date_range': {
+                'start': history_df['datetime'].min().strftime('%Y-%m-%d'),
+                'end': history_df['datetime'].max().strftime('%Y-%m-%d')
+            }
+        }
+
+        return history_features
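
One detail worth calling out: the look-back window built above is half-open, covering transactions from history_days back up to, but excluding, the current transaction's timestamp. A compact restatement with placeholder dates:

    import pandas as pd

    current = pd.Timestamp('2024-06-30')
    start = current - pd.Timedelta(days=90)
    # history = df[(df['datetime'] >= start) & (df['datetime'] < current)]
    print(start)  # 2024-04-01: transactions on 2024-06-30 itself are excluded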
+
+    def _check_transaction_background(self, row: pd.Series) -> Dict[str, Any]:
+        """
+        检查交易背景合理性
+
+        Args:
+            row: 交易记录
+
+        Returns:
+            Dict[str, Any]: 背景检查结果
+        """
+        background_result = {
+            'has_reasonable_background': False,
+            'has_high_risk_indicator': False,
+            'reasonable_keywords_found': [],
+            'high_risk_keywords_found': [],
+            'summary': '',
+            'counterparty': '',
+            'summary_text': ''
+        }
+
+        if not self.enable_background_check:
+            return background_result
+
+        # 获取交易摘要和对手方信息
+        summary = str(row.get('txSummary', '')).lower()
+        counterparty = str(row.get('txCounterparty', '')).lower()
+
+        # 检查合理背景关键词
+        reasonable_found = []
+        for keyword in self.reasonable_background_keywords:
+            if keyword in summary or keyword in counterparty:
+                reasonable_found.append(keyword)
+
+        # 检查高风险关键词
+        high_risk_found = []
+        for keyword in self.high_risk_keywords:
+            if keyword in summary or keyword in counterparty:
+                high_risk_found.append(keyword)
+
+        # 判断是否有合理背景
+        has_reasonable_background = len(reasonable_found) > 0
+        has_high_risk = len(high_risk_found) > 0
+
+        # 生成背景描述
+        background_desc = []
+        if reasonable_found:
+            background_desc.append(f"合理背景: {', '.join(reasonable_found)}")
+        if high_risk_found:
+            background_desc.append(f"高风险关键词: {', '.join(high_risk_found)}")
+
+        background_result.update({
+            'has_reasonable_background': has_reasonable_background,
+            'has_high_risk_indicator': has_high_risk,
+            'reasonable_keywords_found': reasonable_found,
+            'high_risk_keywords_found': high_risk_found,
+            'summary': '; '.join(background_desc) if background_desc else '无特殊背景信息',
+            'counterparty': counterparty,
+            'summary_text': summary
+        })
+
+        return background_result
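
The scan above is a plain substring check over summary plus counterparty. Note that .lower() is a no-op for the default Chinese keywords but matters if English keywords are configured. A tiny illustration with invented values:

    summary = '货款 结算'.lower()
    counterparty = '某某贸易公司'.lower()
    keywords = ['工资', '货款', '租金']
    found = [k for k in keywords if k in summary or k in counterparty]
    print(found)  # ['货款'] -> has_reasonable_background would be True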
+
+    def _is_amount_outlier(self, amount: float, history_features: Dict[str, Any]) -> bool:
+        """
+        判断交易金额是否为异常值(与历史行为不匹配)
+
+        Args:
+            amount: 当前交易金额
+            history_features: 账户历史特征
+
+        Returns:
+            bool: 是否为异常值
+        """
+        if not history_features['has_history']:
+            # 无历史数据,无法判断是否为异常值
+            return False
+
+        if history_features['avg_amount'] == 0:
+            # 历史平均金额为0,无法判断
+            return False
+
+        # 判断是否超过历史平均值的异常倍数
+        is_outlier = amount > (history_features['avg_amount'] * self.outlier_multiplier)
+
+        return is_outlier
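
How the three signals combine is spelled out in recognize() below; this standalone sketch mirrors that decision flow with invented numbers (the high-risk-keyword escalation is omitted for brevity):

    def classify(amount, history_avg, has_reasonable_background,
                 threshold=50_000.0, multiplier=3.0):
        # Illustrative mirror of the flow in recognize(); not part of the module
        if abs(amount) < threshold:
            return 'not large: not checked'
        is_outlier = history_avg > 0 and abs(amount) > history_avg * multiplier
        if has_reasonable_background and not is_outlier:
            return 'skipped: reasonable background, in line with history'
        return 'flagged (medium)' if has_reasonable_background else 'flagged (high)'

    print(classify(60_000, 40_000, True))    # skipped
    print(classify(200_000, 40_000, True))   # flagged (medium)
    print(classify(60_000, 10_000, False))   # flagged (high)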
+
+    def recognize(self, csv_path: str = None, **kwargs) -> Dict[str, Any]:
+        """
+        识别大额交易异常
+
+        Args:
+            csv_path: CSV文件路径
+            **kwargs: 其他参数
+
+        Returns:
+            Dict[str, Any]: 识别结果
+        """
+        try:
+            # 使用父类的load_data方法加载标准化数据
+            df = self.load_data(csv_path)
+
+            print(f"🔍 {self.display_name}开始检查,共 {len(df)} 条记录")
+            print(f"  大额阈值: ¥{self.amount_threshold:,.2f}")
+            print(f"  检测规则: 大额金额 + 与历史不匹配 + 缺乏合理背景 = 大额交易异常")
+
+            # 检查必需字段
+            required_fields = ['txId', 'datetime', 'txAmount', 'txDirection']
+            missing_fields = [field for field in required_fields if field not in df.columns]
+
+            if missing_fields:
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '失败',
+                    'error': f'缺少必需字段: {missing_fields}'
+                }
+
+            # 确保datetime列已正确解析
+            if not pd.api.types.is_datetime64_any_dtype(df['datetime']):
+                df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
+
+            # 按时间排序,便于历史分析
+            df = df.sort_values('datetime')
+
+            # 复制一份用于分析(避免修改原始数据)
+            analysis_df = df.copy()
+
+            # ============ 识别大额交易 ============
+            # 根据业务规则:单次交易金额超过预设阈值
+            large_amount_mask = analysis_df['txAmount'].abs() >= self.amount_threshold
+            large_amount_transactions = analysis_df[large_amount_mask].copy()
+
+            if len(large_amount_transactions) == 0:
+                print(f"📊 未发现大额交易(≥¥{self.amount_threshold:,.2f})")
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '完成',
+                    'recognition_parameters': {
+                        'amount_threshold': self.amount_threshold,
+                        'history_days': self.history_days,
+                        'outlier_multiplier': self.outlier_multiplier,
+                        'enable_background_check': self.enable_background_check,
+                        'total_checked': len(df)
+                    },
+                    'statistics': {
+                        'total_transactions': len(df),
+                        'large_amount_transactions': 0,
+                        'max_amount': float(df['txAmount'].max()),
+                        'min_amount': float(df['txAmount'].min()),
+                        'avg_amount': float(df['txAmount'].mean())
+                    }
+                }
+
+            print(f"📊 发现 {len(large_amount_transactions)} 笔大额交易(≥¥{self.amount_threshold:,.2f})")
+
+            # ============ 分析每笔大额交易 ============
+            identified_anomalies = []
+            analyzed_transactions = []
+
+            for idx, row in large_amount_transactions.iterrows():
+                tx_id = str(row['txId'])
+                tx_date = row['datetime']
+                tx_amount = float(row['txAmount'])
+
+                print(f"  🔍 分析交易 {tx_id}: ¥{tx_amount:,.2f} ({row['txDirection']})")
+
+                # 1. 分析账户历史特征
+                history_features = self._analyze_account_history(analysis_df, tx_date)
+
+                # 2. 检查交易背景
+                background_check = self._check_transaction_background(row)
+
+                # 3. 判断是否为异常值(与历史不匹配)
+                is_amount_outlier = self._is_amount_outlier(abs(tx_amount), history_features)
+
+                # 4. 综合判断是否为异常
+                # 规则:大额 + (历史不匹配 或 缺乏合理背景) = 异常
+                is_abnormal = True  # 默认大额就是异常
+
+                # 如果有合理背景,可以降低异常级别
+                severity_level = 'high'
+                if background_check['has_reasonable_background']:
+                    if not is_amount_outlier:
+                        # 有合理背景且金额不异常,可能不是异常
+                        is_abnormal = False
+                        print(f"    ✅ 有合理背景且金额不异常,跳过")
+                        continue
+                    else:
+                        severity_level = 'medium'
+                        print(f"    ⚠️ 有合理背景但金额异常")
+
+                # 如果有高风险关键词,提高异常级别
+                if background_check['has_high_risk_indicator']:
+                    severity_level = 'high'
+                    print(f"    ⚠️ 发现高风险关键词")
+
+                # 记录分析结果
+                transaction_analysis = {
+                    'tx_id': tx_id,
+                    'date': tx_date.strftime('%Y-%m-%d'),
+                    'time': tx_date.strftime('%H:%M:%S'),
+                    'amount': tx_amount,
+                    'direction': row['txDirection'],
+                    'is_large_amount': True,
+                    'is_amount_outlier': is_amount_outlier,
+                    'history_features': history_features,
+                    'background_check': background_check,
+                    'is_abnormal': is_abnormal,
+                    'severity_level': severity_level
+                }
+                analyzed_transactions.append(transaction_analysis)
+
+                # 如果判断为异常,生成异常记录
+                if is_abnormal:
+                    # 生成异常原因
+                    reasons = []
+                    reasons.append(f"大额交易(¥{tx_amount:,.2f}≥¥{self.amount_threshold:,.2f})")
+
+                    if is_amount_outlier:
+                        if history_features['has_history']:
+                            avg_amount = history_features['avg_amount']
+                            outlier_ratio = abs(tx_amount) / avg_amount if avg_amount > 0 else float('inf')
+                            reasons.append(f"金额异常(超出历史均值{outlier_ratio:.1f}倍)")
+
+                    if not background_check['has_reasonable_background']:
+                        reasons.append("缺乏合理交易背景")
+
+                    if background_check['has_high_risk_indicator']:
+                        reasons.append("存在高风险关键词")
+
+                    reason_str = ",".join(reasons)
+
+                    # 额外信息
+                    additional_info = {
+                        'amount_analysis': {
+                            'threshold': self.amount_threshold,
+                            'is_outlier': is_amount_outlier,
+                            'outlier_ratio': (
+                                float(abs(tx_amount) / history_features['avg_amount'])
+                                if history_features['has_history'] and history_features['avg_amount'] > 0
+                                else None
+                            ),
+                            'history_avg': history_features['avg_amount'] if history_features['has_history'] else None
+                        },
+                        'background_analysis': background_check,
+                        'history_analysis': history_features
+                    }
+
+                    anomaly = self.format_anomaly_record(
+                        row=row,
+                        reason=f"大额交易异常: {reason_str},需核查真实性、合法性及资金来源去向",
+                        severity=severity_level,
+                        check_type='large_amount_transaction',
+                        **additional_info
+                    )
+                    identified_anomalies.append(anomaly)
+
+                    print(f"    ❌ 标记为异常: {reason_str}")
+                else:
+                    print(f"    ✅ 未标记为异常")
+
+            # ============ 结果统计 ============
+            print(f"✅ {self.display_name}检查完成")
+            print(f"  检查交易总数: {len(df)}")
+            print(f"  大额交易数: {len(large_amount_transactions)}")
+            print(f"  异常交易数: {len(identified_anomalies)}")
+            print(f"  通过检查数: {len(large_amount_transactions) - len(identified_anomalies)}")
+
+            # 显示大额交易统计
+            if len(large_amount_transactions) > 0:
+                print("📋 大额交易统计:")
+                total_large_amount = large_amount_transactions['txAmount'].sum()
+                avg_large_amount = large_amount_transactions['txAmount'].mean()
+                print(f"  总大额金额: ¥{total_large_amount:,.2f}")
+                print(f"  平均大额金额: ¥{avg_large_amount:,.2f}")
+
+                # 按方向统计
+                income_large = large_amount_transactions[large_amount_transactions['txDirection'] == '收入']
+                expense_large = large_amount_transactions[large_amount_transactions['txDirection'] == '支出']
+                print(f"  大额收入: {len(income_large)}笔, ¥{income_large['txAmount'].sum():,.2f}")
+                print(f"  大额支出: {len(expense_large)}笔, ¥{expense_large['txAmount'].sum():,.2f}")
+
+            # 显示异常交易示例
+            if len(identified_anomalies) > 0:
+                print("📋 大额异常交易示例:")
+                for i, anomaly in enumerate(identified_anomalies[:5], 1):
+                    time_str = f"{anomaly.get('txDate', '')} {anomaly.get('txTime', '')}"
+                    print(f"  {i}. ID:{anomaly['txId']} | {time_str} | {anomaly['txDirection']} "
+                          f"¥{anomaly['txAmount']:,.2f} | {anomaly['recognition_reason'][:50]}...")
+
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': len(identified_anomalies),
+                'identified_anomalies': identified_anomalies,
+                'recognition_status': '完成',
+                'recognition_parameters': {
+                    'amount_threshold': self.amount_threshold,
+                    'history_days': self.history_days,
+                    'outlier_multiplier': self.outlier_multiplier,
+                    'enable_background_check': self.enable_background_check,
+                    'total_checked': len(df),
+                    'large_transactions_found': len(large_amount_transactions)
+                },
+                'statistics': {
+                    'total_transactions': len(df),
+                    'large_amount_transactions': len(large_amount_transactions),
+                    'abnormal_large_transactions': len(identified_anomalies),
+                    'amount_statistics': {
+                        'max_amount': float(df['txAmount'].max()),
+                        'min_amount': float(df['txAmount'].min()),
+                        'avg_amount': float(df['txAmount'].mean()),
+                        'total_amount': float(df['txAmount'].sum()),
+                        'large_amount_total': float(large_amount_transactions['txAmount'].sum()),
+                        'large_amount_avg': (float(large_amount_transactions['txAmount'].mean())
+                                             if len(large_amount_transactions) > 0 else 0)
+                    },
+                    'direction_distribution': {
+                        'income_count': len(df[df['txDirection'] == '收入']),
+                        'expense_count': len(df[df['txDirection'] == '支出']),
+                        'large_income_count': len(
+                            large_amount_transactions[large_amount_transactions['txDirection'] == '收入']),
+                        'large_expense_count': len(
+                            large_amount_transactions[large_amount_transactions['txDirection'] == '支出'])
+                    },
+                    'background_analysis': {
+                        'reasonable_background_count': sum(
+                            1 for t in analyzed_transactions if t['background_check']['has_reasonable_background']),
+                        'high_risk_count': sum(
+                            1 for t in analyzed_transactions if t['background_check']['has_high_risk_indicator']),
+                        'outlier_count': sum(1 for t in analyzed_transactions if t['is_amount_outlier'])
+                    }
+                }
+            }
+
+        except FileNotFoundError as e:
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'文件不存在: {str(e)}'
+            }
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'数据加载或处理失败: {str(e)}'
+            }
+
+    def get_summary(self) -> Dict[str, Any]:
+        """获取识别器摘要"""
+        summary = super().get_summary()
+        summary.update({
+            'amount_threshold': self.amount_threshold,
+            'history_days': self.history_days,
+            'outlier_multiplier': self.outlier_multiplier,
+            'enable_background_check': self.enable_background_check,
+            'reasonable_background_keywords_count': len(self.reasonable_background_keywords),
+            'high_risk_keywords_count': len(self.high_risk_keywords),
+            'data_loaded': self._data is not None
+        })
+        return summary
+
+    def get_config_summary(self) -> Dict[str, Any]:
+        """获取配置摘要"""
+        return {
+            "大额阈值": f"¥{self.amount_threshold:,.2f}",
+            "历史分析天数": f"{self.history_days}天",
+            "异常倍数阈值": f"{self.outlier_multiplier}倍",
+            "背景检查": "启用" if self.enable_background_check else "禁用",
+            "检测逻辑": "大额金额 + 与历史不匹配 + 缺乏合理背景 = 大额交易异常",
+            "业务规则描述": "单次交易金额超过阈值且与账户历史行为不匹配,缺乏合理背景"
+        }
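
A hedged usage sketch for the recognizer above. The CSV path and threshold values are placeholders, and it assumes EnhancedBaseRecognizer.load_data resolves the path supplied at construction time:

    from llmops.agents.tools.large_amount_transaction_recognizer import LargeAmountTransactionRecognizer

    recognizer = LargeAmountTransactionRecognizer(
        csv_path='data/transactions.csv',  # placeholder path
        config={'large_amount_recognition': {
            'amount_threshold': 200_000.0,
            'history_days': 60,
        }},
    )
    result = recognizer.recognize()
    print(result['recognition_status'], result['identified_count'])
    for anomaly in result['identified_anomalies'][:3]:
        print(anomaly['txId'], anomaly['recognition_reason'])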

+ 509 - 0
llmops/agents/tools/low_interest_rate_recognizer.py

@@ -0,0 +1,509 @@
+
+from pydantic import BaseModel, Field
+from typing import Dict, Any, Optional, Type, List
+import pandas as pd
+import re
+
+from .enhanced_base_recognizer import EnhancedBaseRecognizer
+
+
+class LowInterestRateInput(BaseModel):
+    """低利率结息记录识别工具输入"""
+    csv_path: Optional[str] = Field(
+        None,
+        description="CSV文件路径(可选)。如果初始化时已提供csv_path,可以不用再次传入。"
+    )
+
+    class Config:
+        arbitrary_types_allowed = True
+
+
+class LowInterestRateRecognizer(EnhancedBaseRecognizer):
+    """
+    低利率结息记录识别器
+
+    异常规则定义:
+    银行流水结息核查中,若实际结息金额对应的有效利率显著低于同档期银行公布的
+    活期存款基准利率,或低于账户所属银行同期执行的活期存款利率标准,
+    且无合理利率下浮依据,可判定为结息记录存在异常,需进一步核查。
+
+    核心逻辑:
+    1. 识别结息记录(txSummary包含关键词,txDirection为收入)
+    2. 估算年化利率:结息金额 ÷ 结息时点余额 × 年化系数
+    3. 对比配置的基准利率和阈值
+    4. 标记利率异常低的结息记录
+    """
+
+    args_schema: Type[BaseModel] = LowInterestRateInput
+
+    # 配置参数
+    base_interest_rate: float = Field(
+        0.0035,
+        description="基准活期存款利率(如0.35%应输入为0.0035)"
+    )
+
+    threshold_ratio: float = Field(
+        0.5,
+        description="异常阈值比例,实际利率低于基准利率的比例时视为异常"
+    )
+
+    # 结息识别关键词
+    interest_keywords: List[str] = Field(
+        ['结息', '利息', '存款利息'],
+        description="识别结息记录的关键词列表"
+    )
+
+    # 计息参数
+    assumed_interest_days: int = Field(
+        90,
+        description="假设计息天数(通常为90天,按季结息)"
+    )
+
+    annual_days: int = Field(
+        360,
+        description="年计息天数(银行常用360天)"
+    )
+
+    # 计算所需的最小余额
+    min_balance_for_calc: float = Field(
+        100.0,
+        description="计算利率所需的最小余额(元),余额低于此值可能无法准确计算"
+    )
+
+    # 严重程度配置
+    severity_level: str = Field(
+        'medium',
+        description="异常严重程度(high/medium/low)"
+    )
+
+    # 模糊匹配设置
+    enable_fuzzy_match: bool = Field(
+        True,
+        description="是否启用模糊匹配(处理不规范的txSummary字段)"
+    )
+
+    def __init__(self, csv_path: str = None, config: Dict[str, Any] = None, **kwargs):
+        """
+        初始化低利率结息记录识别器
+
+        Args:
+            csv_path: CSV文件路径
+            config: 配置参数
+            **kwargs: 其他参数
+        """
+        # 调用父类的 __init__
+        super().__init__(
+            name="low_interest_rate_recognizer",
+            description="识别银行流水中利率异常低的结息记录,检测结息金额对应的有效利率是否显著低于基准利率。",
+            display_name="低利率结息记录识别器",
+            csv_path=csv_path,
+            config=config,
+            **kwargs
+        )
+
+        # 从config获取配置,更新Field属性
+        interest_config = self.get_config_value('interest_rate_check', {})
+        if interest_config:
+            config_mapping = {
+                'base_interest_rate': 'base_interest_rate',
+                'threshold_ratio': 'threshold_ratio',
+                'interest_keywords': 'interest_keywords',
+                'assumed_interest_days': 'assumed_interest_days',
+                'annual_days': 'annual_days',
+                'min_balance_for_calc': 'min_balance_for_calc',
+                'severity_level': 'severity_level',
+                'enable_fuzzy_match': 'enable_fuzzy_match'
+            }
+
+            for config_key, attr_name in config_mapping.items():
+                if config_key in interest_config:
+                    setattr(self, attr_name, interest_config[config_key])
+
+        print(f"✅ {self.display_name} 初始化完成")
+        print(f"  基准利率: {self.base_interest_rate:.4%}")
+        print(f"  异常阈值: 低于基准的{self.threshold_ratio:.0%}")
+        print(f"  识别关键词: {', '.join(self.interest_keywords)}")
+        print(f"  假设计息天数: {self.assumed_interest_days}天")
+        print(f"  年计息天数: {self.annual_days}天")
+        print(f"  最小计算余额: ¥{self.min_balance_for_calc:,.2f}")
+        print(f"  模糊匹配: {'启用' if self.enable_fuzzy_match else '禁用'}")
+
+    def _is_interest_record(self, summary: str, direction: str) -> bool:
+        """
+        判断是否为结息记录
+
+        Args:
+            summary: 交易摘要
+            direction: 交易方向
+
+        Returns:
+            bool: 是否为结息记录
+        """
+        # 交易方向必须是收入
+        if direction != '收入':
+            return False
+
+        # 检查摘要是否包含结息关键词
+        summary_lower = str(summary).lower()
+
+        if self.enable_fuzzy_match:
+            # 模糊匹配:检查是否包含任何关键词
+            for keyword in self.interest_keywords:
+                if keyword in summary_lower:
+                    return True
+            return False
+        else:
+            # 正则匹配:先对关键词转义,避免特殊字符被解释为正则元字符
+            pattern = '|'.join(re.escape(keyword) for keyword in self.interest_keywords)
+            return bool(re.search(pattern, summary_lower))
+
+    def _estimate_annual_interest_rate(self, interest_amount: float,
+                                       balance_at_interest: float) -> Optional[float]:
+        """
+        估算年化利率
+
+        公式:
+        年化利率 = (结息金额 ÷ 结息时点余额) × (年计息天数 ÷ 假设计息天数)
+
+        Args:
+            interest_amount: 结息金额(元)
+            balance_at_interest: 结息时点余额(元)
+
+        Returns:
+            Optional[float]: 估算的年化利率,如果无法计算则返回None
+        """
+        # 检查输入有效性
+        if pd.isna(interest_amount) or pd.isna(balance_at_interest):
+            return None
+
+        if balance_at_interest <= 0:
+            return None
+
+        if abs(interest_amount) < 0.01:  # 结息金额过小
+            return None
+
+        if balance_at_interest < self.min_balance_for_calc:
+            return None
+
+        try:
+            # 计算计息期利率(结息金额 ÷ 结息时点余额,覆盖整个假设计息期,并非日利率)
+            period_rate = interest_amount / balance_at_interest
+
+            # 年化利率 = 计息期利率 × (年计息天数 ÷ 假设计息天数)
+            annual_rate = period_rate * (self.annual_days / self.assumed_interest_days)
+
+            return annual_rate
+
+        except ZeroDivisionError:
+            return None
+        except Exception:
+            return None
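
A worked example of the annualization above, using invented figures: quarterly interest of ¥8.75 on a ¥10,000 balance, with the default 90 assumed interest days and a 360-day year.

    interest_amount, balance = 8.75, 10_000.0
    period_rate = interest_amount / balance   # 0.000875
    annual_rate = period_rate * (360 / 90)    # 0.0035, i.e. 0.35%
    assert abs(annual_rate - 0.0035) < 1e-12
    # 0.35% equals the default base rate, so this record would pass; ¥3.00 on
    # the same balance annualizes to 0.12%, below the default cut-off of
    # 0.35% * 0.5 = 0.175%, and would be flagged.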
+
+    def _is_abnormal_interest(self, annual_rate: float) -> bool:
+        """
+        判断结息利率是否异常
+
+        Args:
+            annual_rate: 估算的年化利率
+
+        Returns:
+            bool: 是否异常
+        """
+        if annual_rate is None:
+            return False
+
+        # 判断是否低于阈值
+        threshold_rate = self.base_interest_rate * self.threshold_ratio
+        return annual_rate < threshold_rate
+
+    def _generate_interest_reason(self, row: pd.Series, annual_rate: float) -> str:
+        """
+        生成异常原因描述
+
+        Args:
+            row: 交易记录
+            annual_rate: 估算的年化利率
+
+        Returns:
+            str: 异常原因描述
+        """
+        interest_amount = row['txAmount']
+        balance = row.get('txBalance', 0)
+        threshold_rate = self.base_interest_rate * self.threshold_ratio
+
+        reason_parts = []
+
+        # 利率对比
+        if annual_rate is not None:
+            rate_diff_percent = (self.base_interest_rate - annual_rate) / self.base_interest_rate * 100
+            reason_parts.append(
+                f"估算年化利率{annual_rate:.4%},显著低于基准利率{self.base_interest_rate:.4%}"
+                f"(低{rate_diff_percent:.1f}%,低于阈值{threshold_rate:.4%})"
+            )
+
+        # 金额信息
+        reason_parts.append(f"结息金额¥{interest_amount:,.2f},结息时点余额¥{balance:,.2f}")
+
+        # 补充信息
+        if annual_rate is not None and annual_rate < 0.0001:  # 利率极低
+            reason_parts.append("利率极低,可能存在异常")
+
+        return "结息利率异常: " + ",".join(reason_parts)
+
+    def recognize(self, csv_path: str = None, **kwargs) -> Dict[str, Any]:
+        """
+        识别低利率结息记录异常
+
+        Args:
+            csv_path: CSV文件路径
+            **kwargs: 其他参数
+
+        Returns:
+            Dict[str, Any]: 识别结果
+        """
+        try:
+            # 使用父类的load_data方法加载标准化数据
+            df = self.load_data(csv_path)
+
+            print(f"🔍 {self.display_name}开始检查,共 {len(df)} 条记录")
+            print(f"  检查规则: 结息记录的实际利率 < 基准利率({self.base_interest_rate:.4%}) × 阈值({self.threshold_ratio:.0%})")
+
+            # 检查必需字段
+            required_fields = ['txId', 'txSummary', 'txDirection', 'txAmount']
+            missing_fields = [field for field in required_fields if field not in df.columns]
+
+            if missing_fields:
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '失败',
+                    'error': f'缺少必需字段: {missing_fields}'
+                }
+
+            # 检查余额字段(可选但重要)
+            has_balance_field = 'txBalance' in df.columns
+            if not has_balance_field:
+                print(f"⚠️ 警告: 缺少txBalance字段,将无法准确计算利率")
+                print(f"  建议: 确保数据包含余额信息以进行准确的利率分析")
+
+            # ============ 识别结息记录 ============
+            print(f"🔍 正在识别结息记录...")
+
+            # 筛选可能的结息记录
+            interest_mask = df.apply(
+                lambda row: self._is_interest_record(row['txSummary'], row['txDirection']),
+                axis=1
+            )
+            interest_transactions = df[interest_mask].copy()
+
+            if len(interest_transactions) == 0:
+                print(f"📊 未发现结息记录")
+                print(f"  检查的关键词: {', '.join(self.interest_keywords)}")
+                print(f"  检查的交易方向: 收入")
+
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '完成',
+                    'recognition_parameters': {
+                        'base_interest_rate': self.base_interest_rate,
+                        'threshold_ratio': self.threshold_ratio,
+                        'interest_keywords': self.interest_keywords,
+                        'assumed_interest_days': self.assumed_interest_days,
+                        'total_checked': len(df)
+                    },
+                    'statistics': {
+                        'total_transactions': len(df),
+                        'interest_transactions': 0,
+                        'has_balance_field': has_balance_field
+                    }
+                }
+
+            print(f"📊 发现 {len(interest_transactions)} 笔结息记录")
+
+            # ============ 分析结息记录 ============
+            print(f"🔍 正在分析结息记录利率...")
+
+            identified_anomalies = []
+            interest_analyses = []
+
+            for idx, row in interest_transactions.iterrows():
+                tx_id = str(row['txId'])
+                interest_amount = float(row['txAmount'])
+                balance = float(row.get('txBalance', 0)) if has_balance_field else 0
+
+                print(f"  🔍 分析结息记录 {tx_id}: ¥{interest_amount:,.2f}")
+
+                # 1. 估算年化利率
+                annual_rate = None
+                if has_balance_field and balance >= self.min_balance_for_calc:
+                    annual_rate = self._estimate_annual_interest_rate(interest_amount, balance)
+                else:
+                    print(f"    ⚠️ 无法计算利率: 余额不足或缺少余额字段")
+
+                # 2. 判断是否异常
+                is_abnormal = False
+                if annual_rate is not None:
+                    is_abnormal = self._is_abnormal_interest(annual_rate)
+
+                # 记录分析结果
+                analysis = {
+                    'tx_id': tx_id,
+                    'interest_amount': interest_amount,
+                    'balance_at_interest': balance if has_balance_field else None,
+                    'estimated_annual_rate': annual_rate,
+                    'is_abnormal': is_abnormal,
+                    'can_calculate_rate': annual_rate is not None
+                }
+                interest_analyses.append(analysis)
+
+                # 3. 如果异常,生成异常记录
+                if is_abnormal:
+                    # 生成异常原因
+                    reason = self._generate_interest_reason(row, annual_rate)
+
+                    print(f"    ❌ 发现利率异常: {reason[:80]}...")
+
+                    # 额外信息
+                    additional_info = {
+                        'interest_analysis': {
+                            'estimated_annual_rate': annual_rate,
+                            'base_interest_rate': self.base_interest_rate,
+                            'threshold_rate': self.base_interest_rate * self.threshold_ratio,
+                            'interest_amount': interest_amount,
+                            'balance_at_interest': balance,
+                            'rate_calculation': {
+                                'assumed_interest_days': self.assumed_interest_days,
+                                'annual_days': self.annual_days,
+                                'min_balance_for_calc': self.min_balance_for_calc
+                            }
+                        }
+                    }
+
+                    anomaly = self.format_anomaly_record(
+                        row=row,
+                        reason=reason,
+                        severity=self.severity_level,
+                        check_type='low_interest_rate',
+                        **additional_info
+                    )
+                    identified_anomalies.append(anomaly)
+                elif annual_rate is not None:
+                    print(f"    ✅ 利率正常: {annual_rate:.4%} ≥ 阈值{self.base_interest_rate * self.threshold_ratio:.4%}")
+                else:
+                    print(f"    ⚠️ 无法判断: 缺少余额数据或余额不足")
+
+            # ============ 结果统计 ============
+            print(f"✅ {self.display_name}检查完成")
+            print(f"  检查结果:")
+            print(f"    结息记录总数: {len(interest_transactions)}")
+            print(f"    可计算利率记录: {sum(1 for a in interest_analyses if a['can_calculate_rate'])}")
+            print(f"    利率异常记录: {len(identified_anomalies)}")
+
+            # 显示结息统计
+            if len(interest_transactions) > 0:
+                print("📋 结息记录统计:")
+                total_interest = interest_transactions['txAmount'].sum()
+                avg_interest = interest_transactions['txAmount'].mean()
+                print(f"  总结息金额: ¥{total_interest:,.2f}")
+                print(f"  平均结息金额: ¥{avg_interest:,.2f}")
+
+                # 显示利率分布
+                valid_rates = [a['estimated_annual_rate'] for a in interest_analyses
+                               if a['estimated_annual_rate'] is not None]
+                if valid_rates:
+                    avg_rate = sum(valid_rates) / len(valid_rates)
+                    min_rate = min(valid_rates)
+                    max_rate = max(valid_rates)
+                    print(f"  平均估算利率: {avg_rate:.4%}")
+                    print(f"  最低估算利率: {min_rate:.4%}")
+                    print(f"  最高估算利率: {max_rate:.4%}")
+
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': len(identified_anomalies),
+                'identified_anomalies': identified_anomalies,
+                'recognition_status': '完成',
+                'recognition_parameters': {
+                    'base_interest_rate': self.base_interest_rate,
+                    'threshold_ratio': self.threshold_ratio,
+                    'interest_keywords': self.interest_keywords,
+                    'assumed_interest_days': self.assumed_interest_days,
+                    'annual_days': self.annual_days,
+                    'min_balance_for_calc': self.min_balance_for_calc,
+                    'has_balance_field': has_balance_field,
+                    'total_interest_records': len(interest_transactions)
+                },
+                'statistics': {
+                    'total_transactions': len(df),
+                    'interest_transactions': len(interest_transactions),
+                    'abnormal_interest_transactions': len(identified_anomalies),
+                    'interest_amount_statistics': {
+                        'total_interest': float(interest_transactions['txAmount'].sum()),
+                        'avg_interest': float(interest_transactions['txAmount'].mean()),
+                        'max_interest': float(interest_transactions['txAmount'].max()),
+                        'min_interest': float(interest_transactions['txAmount'].min())
+                    } if len(interest_transactions) > 0 else {},
+                    'rate_analysis': {
+                        'valid_rate_count': sum(1 for a in interest_analyses if a['can_calculate_rate']),
+                        'abnormal_rate_count': len(identified_anomalies),
+                        'rate_summary': (
+                            {'avg_rate': float(sum(valid_rates) / len(valid_rates))}
+                            if (valid_rates := [a['estimated_annual_rate'] for a in interest_analyses
+                                                if a['estimated_annual_rate'] is not None])
+                            else {}
+                        )
+                    }
+                }
+            }
+
+        except FileNotFoundError as e:
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'文件不存在: {str(e)}'
+            }
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'数据加载或处理失败: {str(e)}'
+            }
+
+    def get_summary(self) -> Dict[str, Any]:
+        """获取识别器摘要"""
+        summary = super().get_summary()
+        summary.update({
+            'base_interest_rate': self.base_interest_rate,
+            'threshold_ratio': self.threshold_ratio,
+            'interest_keywords_count': len(self.interest_keywords),
+            'assumed_interest_days': self.assumed_interest_days,
+            'annual_days': self.annual_days,
+            'min_balance_for_calc': self.min_balance_for_calc,
+            'severity_level': self.severity_level,
+            'enable_fuzzy_match': self.enable_fuzzy_match,
+            'data_loaded': self._data is not None
+        })
+        return summary
+
+    def get_config_summary(self) -> Dict[str, Any]:
+        """获取配置摘要"""
+        return {
+            "基准利率": f"{self.base_interest_rate:.4%}",
+            "异常阈值": f"低于基准的{self.threshold_ratio:.0%}",
+            "识别关键词": f"{len(self.interest_keywords)}个: {', '.join(self.interest_keywords[:3])}..."
+            if len(
+                self.interest_keywords) > 3 else f"{len(self.interest_keywords)}个: {', '.join(self.interest_keywords)}",
+            "计息参数": f"{self.assumed_interest_days}天/季,{self.annual_days}天/年",
+            "最小计算余额": f"¥{self.min_balance_for_calc:,.2f}",
+            "检测逻辑": f"结息记录的实际利率 < {self.base_interest_rate:.4%} × {self.threshold_ratio:.0%} = 异常",
+            "业务规则描述": "结息金额对应的有效利率显著低于基准利率,需核查结息真实性"
+        }

+ 473 - 0
llmops/agents/tools/night_transaction_recognizer.py

@@ -0,0 +1,473 @@
+from pydantic import BaseModel, Field
+from typing import Dict, Any, Optional, Type, List, Set
+import pandas as pd
+
+from .enhanced_base_recognizer import EnhancedBaseRecognizer
+
+
+class NightTransactionInput(BaseModel):
+    """夜间交易识别工具输入"""
+    csv_path: Optional[str] = Field(
+        None,
+        description="CSV文件路径(可选)。如果初始化时已提供csv_path,可以不用再次传入。"
+    )
+
+    class Config:
+        arbitrary_types_allowed = True
+
+
+class NightTransactionRecognizer(EnhancedBaseRecognizer):
+    """
+    夜间交易(2-5点)异常识别器
+
+    基于图2规则定义:
+    1. 时间范围:特指凌晨2点至5点期间发生的交易
+    2. 时段性质:偏离日常资金往来常规时段,通常为非经营、非生活活动的常规休息时段
+    3. 异常判定:
+       - 频繁出现资金收付记录
+       - 交易金额较大
+       - 缺乏合理交易背景
+    4. 合理性例外:特定行业经营需求、紧急资金周转等
+
+    核心逻辑:2-5点交易 + (高频或大额) + (无合理背景) = 异常夜间交易
+    """
+
+    args_schema: Type[BaseModel] = NightTransactionInput
+
+    # 配置参数
+    night_start_hour: int = Field(
+        2,
+        description="夜间检测开始小时(0-23),默认2点"
+    )
+    night_end_hour: int = Field(
+        5,
+        description="夜间检测结束小时(0-23),默认5点"
+    )
+    frequency_threshold_per_hour: int = Field(
+        3,
+        description="高频交易阈值,每小时超过此笔数视为高频"
+    )
+    large_amount_threshold: float = Field(
+        50000.0,
+        description="大额交易阈值(元),超过此金额的夜间交易视为大额异常"
+    )
+
+    # 行业特征关键词(24小时营业或夜间经营行业)
+    night_industry_keywords: List[str] = Field(
+        [
+            "酒店", "宾馆", "KTV", "酒吧", "夜总会", "网吧", "便利店",
+            "医院", "急救", "急诊", "消防", "公安", "保安", "物流",
+            "运输", "出租车", "网约车", "外卖", "配送"
+        ],
+        description="夜间经营行业关键词,用于识别可能有合理背景的交易"
+    )
+
+    # 紧急情况关键词
+    emergency_keywords: List[str] = Field(
+        [
+            "急救", "急诊", "抢救", "紧急", "urgent", "emergency",
+            "抢险", "救援", "救灾", "应急", "加急"
+        ],
+        description="紧急情况关键词,用于识别可能有合理背景的交易"
+    )
+
+    def __init__(self, csv_path: str = None, config: Dict[str, Any] = None, **kwargs):
+        """
+        初始化夜间交易识别器
+
+        Args:
+            csv_path: CSV文件路径
+            config: 配置参数
+            **kwargs: 其他参数
+        """
+        # 调用父类的 __init__
+        super().__init__(
+            name="night_transaction_recognizer",
+            description="识别银行流水中的夜间交易异常(2-5点),检测高频、大额等异常特征。",
+            display_name="夜间交易异常识别",
+            csv_path=csv_path,
+            config=config,
+            **kwargs
+        )
+
+        # 从config获取配置,更新Field属性
+        night_config = self.get_config_value('night_transaction', {})
+        if night_config:
+            config_mapping = {
+                'night_start_hour': 'night_start_hour',
+                'night_end_hour': 'night_end_hour',
+                'frequency_threshold_per_hour': 'frequency_threshold_per_hour',
+                'large_amount_threshold': 'large_amount_threshold',
+                'night_industry_keywords': 'night_industry_keywords',
+                'emergency_keywords': 'emergency_keywords'
+            }
+
+            for config_key, attr_name in config_mapping.items():
+                if config_key in night_config:
+                    setattr(self, attr_name, night_config[config_key])
+
+        # 验证时间配置
+        self._validate_time_config()
+
+        print(f"✅ {self.display_name} 初始化完成")
+        print(f"  夜间时段: {self.night_start_hour:02d}:00 - {self.night_end_hour:02d}:00")
+        print(f"  高频阈值: {self.frequency_threshold_per_hour}笔/小时")
+        print(f"  大额阈值: ¥{self.large_amount_threshold:,.2f}")
+        print(f"  夜间行业关键词: {len(self.night_industry_keywords)}个")
+        print(f"  紧急情况关键词: {len(self.emergency_keywords)}个")
+
+    def _validate_time_config(self):
+        """验证时间配置合理性"""
+        if not (0 <= self.night_start_hour <= 23):
+            raise ValueError(f"夜间开始小时必须在0-23之间: {self.night_start_hour}")
+
+        if not (0 <= self.night_end_hour <= 23):
+            raise ValueError(f"夜间结束小时必须在0-23之间: {self.night_end_hour}")
+
+        # 开始时间不早于结束时间时按跨午夜时段处理(两者相等时将覆盖全天24小时)
+        if self.night_start_hour >= self.night_end_hour:
+            print(f"⚠️ 注意:夜间开始时间({self.night_start_hour}:00) >= 结束时间({self.night_end_hour}:00),"
+                  f"将按跨午夜处理(两者相等时视为全天)")
+
+    def _is_in_night_period(self, hour: int) -> bool:
+        """
+        判断小时数是否在夜间时段内
+
+        Args:
+            hour: 小时数(0-23)
+
+        Returns:
+            bool: 是否在夜间时段
+        """
+        if self.night_start_hour < self.night_end_hour:
+            # 正常情况:开始时间 < 结束时间
+            return self.night_start_hour <= hour < self.night_end_hour
+        else:
+            # 跨午夜情况:开始时间 >= 结束时间
+            return hour >= self.night_start_hour or hour < self.night_end_hour
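
A quick self-check of the two branches above, covering both a plain and a cross-midnight window (values invented):

    def in_period(hour, start, end):
        return (start <= hour < end) if start < end else (hour >= start or hour < end)

    assert [h for h in range(24) if in_period(h, 2, 5)] == [2, 3, 4]
    assert [h for h in range(24) if in_period(h, 22, 5)] == [0, 1, 2, 3, 4, 22, 23]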
+
+    def _has_reasonable_background(self, row: pd.Series) -> bool:
+        """
+        判断交易是否有合理背景
+
+        根据图2规则,合理的背景包括:
+        1. 夜间特定行业经营需求
+        2. 紧急资金周转
+
+        Args:
+            row: 交易记录
+
+        Returns:
+            bool: 是否有合理背景
+        """
+        # 检查交易摘要中的关键词
+        summary = str(row.get('txSummary', '')).lower()
+        counterparty = str(row.get('txCounterparty', '')).lower()
+
+        # 合并检查文本
+        check_text = f"{summary} {counterparty}"
+
+        # 1. 检查是否为夜间行业
+        for keyword in self.night_industry_keywords:
+            if keyword.lower() in check_text:
+                return True
+
+        # 2. 检查是否为紧急情况
+        for keyword in self.emergency_keywords:
+            if keyword.lower() in check_text:
+                return True
+
+        # 3. 可以扩展其他合理背景检查逻辑
+
+        return False
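+    # 匹配示意:摘要"医院急诊缴费"包含紧急情况关键词"急诊",返回True;
+    # 匹配方式为子串匹配,待查文本与英文关键词均统一转为小写后比较。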
+
+    def _detect_high_frequency_transactions(self,
+                                            night_transactions: pd.DataFrame,
+                                            date_col: str = 'date_only') -> Set[str]:
+        """
+        检测高频夜间交易
+
+        Args:
+            night_transactions: 夜间交易数据
+            date_col: 日期列名
+
+        Returns:
+            Set[str]: 高频交易ID集合
+        """
+        high_freq_ids = set()
+
+        if len(night_transactions) == 0:
+            return high_freq_ids
+
+        # 按日期和小时分组统计
+        if 'hour' in night_transactions.columns and date_col in night_transactions.columns:
+            # 统计每小时交易笔数
+            hourly_counts = night_transactions.groupby([date_col, 'hour']).size()
+
+            for (trans_date, trans_hour), count in hourly_counts.items():
+                if count > self.frequency_threshold_per_hour:
+                    # 获取该小时的所有交易
+                    mask = (night_transactions[date_col] == trans_date) & \
+                           (night_transactions['hour'] == trans_hour)
+                    hour_transactions = night_transactions[mask]
+
+                    # 收集交易ID
+                    for tx_id in hour_transactions['txId'].unique():
+                        high_freq_ids.add(str(tx_id))
+
+                    print(f"  ⚠️  {trans_date} {trans_hour:02d}:00-{trans_hour + 1:02d}:00: "
+                          f"{count}笔交易,超过阈值{self.frequency_threshold_per_hour}笔")
+
+        return high_freq_ids
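+    # 数据形态示意:hourly_counts为MultiIndex Series,形如
+    #   (2024-01-02, 3) -> 7
+    # 表示2024-01-02 03:00-04:00共7笔;若7 > frequency_threshold_per_hour,
+    # 则该小时内全部交易的txId都会被计入高频集合。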
+
+    def _detect_large_amount_transactions(self, night_transactions: pd.DataFrame) -> Set[str]:
+        """
+        检测大额夜间交易
+
+        Args:
+            night_transactions: 夜间交易数据
+
+        Returns:
+            Set[str]: 大额交易ID集合
+        """
+        large_amount_ids = set()
+
+        if len(night_transactions) == 0:
+            return large_amount_ids
+
+        # 筛选大额交易
+        large_amount_mask = night_transactions['txAmount'].abs() >= self.large_amount_threshold
+        large_amount_tx = night_transactions[large_amount_mask]
+
+        for _, row in large_amount_tx.iterrows():
+            tx_id = str(row['txId'])
+            large_amount_ids.add(tx_id)
+
+        if len(large_amount_ids) > 0:
+            print(f"  ⚠️  发现 {len(large_amount_ids)} 笔大额夜间交易(≥¥{self.large_amount_threshold:,.2f})")
+
+        return large_amount_ids
+
+    def recognize(self, csv_path: str = None, **kwargs) -> Dict[str, Any]:
+        """
+        识别夜间交易异常
+
+        Args:
+            csv_path: CSV文件路径
+            **kwargs: 其他参数
+
+        Returns:
+            Dict[str, Any]: 识别结果
+        """
+        try:
+            # 使用父类的load_data方法加载标准化数据
+            df = self.load_data(csv_path)
+
+            print(f"🔍 {self.display_name}开始检查,共 {len(df)} 条记录")
+            print(f"  夜间时段: {self.night_start_hour:02d}:00 - {self.night_end_hour:02d}:00")
+            print(f"  检测规则: 夜间交易 + (高频或大额) + (无合理背景) = 异常")
+
+            # 检查必需字段
+            required_fields = ['txId', 'datetime', 'txAmount', 'txDirection']
+            missing_fields = [field for field in required_fields if field not in df.columns]
+
+            if missing_fields:
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '失败',
+                    'error': f'缺少必需字段: {missing_fields}'
+                }
+
+            # 确保datetime列已正确解析
+            if not pd.api.types.is_datetime64_any_dtype(df['datetime']):
+                df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
+
+            # 提取时间组件
+            df['hour'] = df['datetime'].dt.hour
+            df['minute'] = df['datetime'].dt.minute
+            df['date_only'] = df['datetime'].dt.date
+
+            # ============ 识别所有夜间交易 ============
+            night_mask = df['hour'].apply(self._is_in_night_period)
+            night_transactions = df[night_mask].copy()
+
+            if len(night_transactions) == 0:
+                print(f"✅ 未发现夜间交易({self.night_start_hour:02d}:00-{self.night_end_hour:02d}:00)")
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '完成',
+                    'recognition_parameters': {
+                        'night_period': f"{self.night_start_hour:02d}:00-{self.night_end_hour:02d}:00",
+                        'frequency_threshold': f"{self.frequency_threshold_per_hour}笔/小时",
+                        'large_amount_threshold': self.large_amount_threshold,
+                        'total_checked': len(df),
+                        'night_transactions_found': 0
+                    },
+                    'statistics': {
+                        'total_transactions': len(df),
+                        'night_transaction_count': 0,
+                        'night_transaction_ratio': 0.0
+                    }
+                }
+
+            print(f"📊 发现 {len(night_transactions)} 笔夜间交易")
+
+            # ============ 合理性背景分析 ============
+            reasonable_transactions = []
+            for _, row in night_transactions.iterrows():
+                if self._has_reasonable_background(row):
+                    # 统一转为字符串,与高频/大额检测中收集的txId类型保持一致
+                    reasonable_transactions.append(str(row['txId']))
+
+            if reasonable_transactions:
+                print(f"📊 其中 {len(reasonable_transactions)} 笔交易可能有合理背景(夜间行业/紧急情况)")
+
+            # ============ 异常特征检测 ============
+            identified_anomalies = []
+
+            # 1. 检测高频交易
+            high_freq_ids = self._detect_high_frequency_transactions(night_transactions)
+
+            # 2. 检测大额交易
+            large_amount_ids = self._detect_large_amount_transactions(night_transactions)
+
+            # 3. 合并异常交易ID(排除有合理背景的)
+            abnormal_ids = (high_freq_ids | large_amount_ids) - set(reasonable_transactions)
+
+            # 4. 生成异常记录
+            for tx_id in abnormal_ids:
+                # tx_id为字符串,这里将列同样转为字符串再比较,避免类型不一致导致匹配失败
+                mask = night_transactions['txId'].astype(str) == tx_id
+                if mask.any():
+                    row = night_transactions[mask].iloc[0]
+
+                    # 判断异常类型
+                    is_high_freq = tx_id in high_freq_ids
+                    is_large_amount = tx_id in large_amount_ids
+
+                    # 生成异常原因
+                    reasons = []
+                    if is_high_freq:
+                        reasons.append("高频夜间交易")
+                    if is_large_amount:
+                        reasons.append(f"大额夜间交易(¥{row['txAmount']:,.2f}≥¥{self.large_amount_threshold:,.2f})")
+
+                    reason_str = ",".join(reasons)
+
+                    # 附加信息:记录小时、异常特征与背景判断结果(合理背景交易已在上一步排除)
+                    additional_info = {
+                        'hour': int(row['hour']),
+                        'is_high_frequency': is_high_freq,
+                        'is_large_amount': is_large_amount,
+                        'amount': float(row['txAmount']),
+                        'has_reasonable_background': str(row['txId']) in reasonable_transactions
+                    }
+
+                    anomaly = self.format_anomaly_record(
+                        row=row,
+                        reason=f"夜间{reason_str},缺乏合理交易背景",
+                        severity='high' if is_large_amount else 'medium',
+                        check_type='night_transaction_abnormal',
+                        **additional_info
+                    )
+                    identified_anomalies.append(anomaly)
+
+            # ============ 结果统计和汇总 ============
+            print(f"✅ {self.display_name}检查完成")
+            print(f"  检查交易总数: {len(df)}")
+            print(f"  夜间交易数: {len(night_transactions)}")
+            print(f"  有合理背景: {len(reasonable_transactions)}")
+            print(f"  高频异常: {len(high_freq_ids)}")
+            print(f"  大额异常: {len(large_amount_ids)}")
+            print(f"  最终异常数: {len(identified_anomalies)}")
+
+            # 显示夜间交易时间分布
+            if len(night_transactions) > 0:
+                hour_distribution = night_transactions['hour'].value_counts().sort_index()
+                print("📋 夜间交易时间分布:")
+                for hour, count in hour_distribution.items():
+                    print(f"  {hour:02d}:00-{hour + 1:02d}:00: {count}笔")
+
+            # 显示前5笔异常交易详情
+            if len(identified_anomalies) > 0:
+                print("📋 异常夜间交易示例:")
+                for i, anomaly in enumerate(identified_anomalies[:5], 1):
+                    time_str = f"{anomaly.get('txDate', '')} {anomaly.get('txTime', '')}"
+                    print(
+                        f"  {i}. ID:{anomaly['txId']} | {time_str} | ¥{anomaly['txAmount']:,.2f} | {anomaly['recognition_reason']}")
+
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': len(identified_anomalies),
+                'identified_anomalies': identified_anomalies,
+                'recognition_status': '完成',
+                'recognition_parameters': {
+                    'night_period': f"{self.night_start_hour:02d}:00-{self.night_end_hour:02d}:00",
+                    'frequency_threshold_per_hour': self.frequency_threshold_per_hour,
+                    'large_amount_threshold': self.large_amount_threshold,
+                    'night_industry_keywords_count': len(self.night_industry_keywords),
+                    'emergency_keywords_count': len(self.emergency_keywords),
+                    'total_checked': len(df),
+                    'night_transactions_found': len(night_transactions),
+                    'reasonable_transactions': len(reasonable_transactions)
+                },
+                'statistics': {
+                    'total_transactions': len(df),
+                    'night_transaction_count': len(night_transactions),
+                    'night_transaction_ratio': len(night_transactions) / max(1, len(df)),
+                    'reasonable_transaction_count': len(reasonable_transactions),
+                    'high_frequency_count': len(high_freq_ids),
+                    'large_amount_count': len(large_amount_ids),
+                    'abnormal_night_transaction_count': len(identified_anomalies),
+                    'hour_distribution': {
+                        str(hour): int(count)
+                        for hour, count in night_transactions['hour'].value_counts().items()
+                    } if len(night_transactions) > 0 else {}
+                }
+            }
+
+        except FileNotFoundError as e:
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'文件不存在: {str(e)}'
+            }
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'数据加载或处理失败: {str(e)}'
+            }
+
+    def get_summary(self) -> Dict[str, Any]:
+        """获取识别器摘要"""
+        summary = super().get_summary()
+        summary.update({
+            'night_start_hour': self.night_start_hour,
+            'night_end_hour': self.night_end_hour,
+            'frequency_threshold_per_hour': self.frequency_threshold_per_hour,
+            'large_amount_threshold': self.large_amount_threshold,
+            'night_industry_keywords_count': len(self.night_industry_keywords),
+            'emergency_keywords_count': len(self.emergency_keywords),
+            'data_loaded': self._data is not None
+        })
+        return summary
+
+    def get_config_summary(self) -> Dict[str, Any]:
+        """获取配置摘要"""
+        return {
+            "夜间时段": f"{self.night_start_hour:02d}:00 - {self.night_end_hour:02d}:00",
+            "高频阈值": f"{self.frequency_threshold_per_hour}笔/小时",
+            "大额阈值": f"¥{self.large_amount_threshold:,.2f}",
+            "夜间行业关键词": f"{len(self.night_industry_keywords)}个",
+            "紧急情况关键词": f"{len(self.emergency_keywords)}个",
+            "检测逻辑": "夜间交易 + (高频或大额) + (无合理背景) = 异常"
+        }
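+
+
+# 使用示意(假设性示例;文件路径与配置取值仅作说明):
+# recognizer = NightTransactionRecognizer(
+#     csv_path="data/transactions.csv",
+#     config={'night_transaction': {'night_start_hour': 2, 'night_end_hour': 5}}
+# )
+# result = recognizer.recognize()
+# print(result['identified_count'], result['recognition_status'])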

+ 492 - 0
llmops/agents/tools/occasional_high_integer_transaction_recognizer.py

@@ -0,0 +1,492 @@
+from pydantic import BaseModel, Field
+from typing import Dict, Any, Optional, Type
+import pandas as pd
+
+from datetime import timedelta
+
+from .enhanced_base_recognizer import EnhancedBaseRecognizer
+
+
+class OccasionalHighIntegerTransactionInput(BaseModel):
+    """偶发高额整数交易识别工具输入"""
+    csv_path: Optional[str] = Field(
+        None,
+        description="CSV文件路径(可选)。如果初始化时已提供csv_path,可以不用再次传入。"
+    )
+
+    class Config:
+        arbitrary_types_allowed = True
+
+
+class OccasionalHighIntegerTransactionRecognizer(EnhancedBaseRecognizer):
+    """
+    偶发高额整数交易异常识别器
+
+    异常规则定义:
+    银行流水核查中,若存在金额为10,000元整数倍的交易,且该类交易金额超过整体流水平均交易金额的5倍
+    (构成极端异常值),同时此类极端异常交易呈现偶发且高频次发生的特征,可判定为金额维度存在
+    异常交易情形,需进一步核查交易真实性。
+    """
+
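+    # 数值示意(假设整体平均交易金额为¥8,000,outlier_multiplier=5):
+    # 极端异常值阈值 = 8,000 × 5 = ¥40,000;一笔¥50,000(=10,000×5)的交易
+    # 同时满足"整数倍"与"极端异常值"条件,再叠加偶发高频的时间特征即被标记。
+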
+    args_schema: Type[BaseModel] = OccasionalHighIntegerTransactionInput
+
+    # 整数倍基数
+    integer_multiple: float = Field(
+        10000.0,
+        description="整数倍基数(元),检查是否为该金额的整数倍"
+    )
+
+    # 异常倍数阈值
+    outlier_multiplier: float = Field(
+        5.0,
+        description="异常倍数阈值,交易金额超过整体平均交易金额的多少倍视为极端异常值"
+    )
+
+    # 频率分析参数
+    frequency_window_days: int = Field(
+        7,
+        description="频率分析时间窗口(天),用于判断是否为高频发生"
+    )
+
+    min_occurrences_for_high_frequency: int = Field(
+        3,
+        description="高频最小发生次数,在时间窗口内达到此次数视为高频"
+    )
+
+    # 偶发性判断参数
+    gap_std_threshold: float = Field(
+        2.0,
+        description="时间间隔标准差阈值,大于此值视为时间不规律(偶发)"
+    )
+
+    # 严重程度配置
+    severity_level: str = Field(
+        'high',
+        description="异常严重程度(high/medium/low)"
+    )
+
+    def __init__(self, csv_path: str = None, config: Dict[str, Any] = None, **kwargs):
+        """
+        初始化偶发高额整数交易识别器
+
+        Args:
+            csv_path: CSV文件路径
+            config: 配置参数
+            **kwargs: 其他参数
+        """
+        # 调用父类的 __init__
+        super().__init__(
+            name="occasional_high_integer_recognizer",
+            description="识别银行流水中偶发的高额整数倍交易异常,检测金额为整数倍、超过平均金额5倍且呈现偶发高频特征的交易。",
+            display_name="偶发高额整数交易异常识别",
+            csv_path=csv_path,
+            config=config,
+            **kwargs
+        )
+
+        # 从config获取配置,更新Field属性
+        integer_config = self.get_config_value('occasional_integer_transaction', {})
+        if integer_config:
+            config_mapping = {
+                'integer_multiple': 'integer_multiple',
+                'outlier_multiplier': 'outlier_multiplier',
+                'frequency_window_days': 'frequency_window_days',
+                'min_occurrences_for_high_frequency': 'min_occurrences_for_high_frequency',
+                'gap_std_threshold': 'gap_std_threshold',
+                'severity_level': 'severity_level'
+            }
+
+            for config_key, attr_name in config_mapping.items():
+                if config_key in integer_config:
+                    setattr(self, attr_name, integer_config[config_key])
+
+        print(f"✅ {self.display_name} 初始化完成")
+        print(f"  整数倍基数: ¥{self.integer_multiple:,.0f}")
+        print(f"  异常倍数阈值: {self.outlier_multiplier}倍")
+        print(f"  频率分析窗口: {self.frequency_window_days}天")
+        print(f"  高频最小次数: {self.min_occurrences_for_high_frequency}次")
+        print(f"  偶发判断阈值: 间隔标准差>{self.gap_std_threshold}")
+        print(f"  严重程度: {self.severity_level}")
+
+    def _is_integer_multiple(self, amount: float, tolerance: float = 0.01) -> bool:
+        """
+        判断金额是否为整数倍
+
+        Args:
+            amount: 交易金额
+            tolerance: 容差(元),解决浮点数精度问题
+
+        Returns:
+            bool: 是否为整数倍
+        """
+        if pd.isna(amount):
+            return False
+
+        # 计算余数
+        remainder = abs(amount % self.integer_multiple)
+
+        # 考虑浮点数精度,有两种情况:
+        # 1. 余数接近于0(如 20000.0001 % 10000 ≈ 0.0001)
+        # 2. 余数接近于整数倍基数本身(如 19999.9999 % 10000 ≈ 9999.9999)
+        return remainder < tolerance or abs(remainder - self.integer_multiple) < tolerance
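+    # 行为示意(tolerance=0.01,integer_multiple=10000):
+    #   _is_integer_multiple(30000.0)   -> True  (余数为0)
+    #   _is_integer_multiple(29999.995) -> True  (余数≈9999.995,接近基数)
+    #   _is_integer_multiple(30500.0)   -> False (余数为500)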
+
+    def _analyze_frequency_pattern(self, transactions: pd.DataFrame) -> Dict[str, Any]:
+        """
+        分析交易频率模式
+
+        Args:
+            transactions: 交易数据
+
+        Returns:
+            Dict[str, Any]: 频率分析结果
+        """
+        if len(transactions) < 2:
+            return {
+                'is_occasional': False,
+                'is_high_frequency': False,
+                'total_count': len(transactions),
+                'gap_std': 0.0,
+                'time_analysis': '数据不足,无法分析频率模式'
+            }
+
+        # 确保按时间排序
+        sorted_transactions = transactions.sort_values('datetime')
+
+        # 1. 计算时间间隔(天数)
+        time_diffs = sorted_transactions['datetime'].diff().dt.total_seconds() / 86400.0
+        time_diffs = time_diffs.dropna()
+
+        if len(time_diffs) == 0:
+            return {
+                'is_occasional': False,
+                'is_high_frequency': False,
+                'total_count': len(transactions),
+                'gap_std': 0.0,
+                'time_analysis': '时间间隔数据不足'
+            }
+
+        # 2. 判断是否偶发(时间间隔不规律)
+        gap_std = time_diffs.std()
+        is_occasional = gap_std > self.gap_std_threshold
+
+        # 3. 判断是否高频(在一定时间内多次发生)
+        # 按天统计发生次数
+        date_counts = sorted_transactions['datetime'].dt.date.value_counts()
+
+        # 获取去重后的交易日期序列,用于滑动窗口统计
+        dates = sorted(sorted_transactions['datetime'].dt.date.unique())
+
+        # 判断是否在时间窗口内多次发生(高频)
+        is_high_frequency = False
+        max_in_window = 0
+        if len(transactions) >= self.min_occurrences_for_high_frequency:
+            # 滑动窗口:检查是否存在frequency_window_days内发生min_occurrences_for_high_frequency次的情况
+            sliding_window_counts = []
+            for i in range(len(dates)):
+                window_start = dates[i]
+                window_end = window_start + timedelta(days=self.frequency_window_days)
+                count_in_window = sum(1 for d in dates if window_start <= d <= window_end)
+                sliding_window_counts.append(count_in_window)
+
+            max_in_window = max(sliding_window_counts) if sliding_window_counts else 0
+            is_high_frequency = max_in_window >= self.min_occurrences_for_high_frequency
+
+        return {
+            'is_occasional': is_occasional,
+            'is_high_frequency': is_high_frequency,
+            'total_count': len(transactions),
+            'gap_std': float(gap_std),
+            'gap_mean': float(time_diffs.mean()),
+            'date_counts': {str(d): int(c) for d, c in date_counts.items()},
+            'unique_dates': len(date_counts),
+            'date_range': {
+                'start': dates[0].strftime('%Y-%m-%d') if dates else '',
+                'end': dates[-1].strftime('%Y-%m-%d') if dates else '',
+                'total_days': (dates[-1] - dates[0]).days + 1 if len(dates) > 1 else 1
+            },
+            'time_analysis': f"时间间隔标准差: {gap_std:.2f}天,最大窗口内次数: {max_in_window}次"
+        }
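+    # 数值示意(假设frequency_window_days=7,min_occurrences_for_high_frequency=3,
+    # gap_std_threshold=2.0):若交易日期为1月1日、1月3日、1月5日、2月20日,
+    # 则7天窗口内最多发生3次 → 高频;间隔(2, 2, 46天)的标准差≈25.4天 > 2.0 → 偶发。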
+
+    def _generate_anomaly_reason(self, row: pd.Series, avg_amount: float,
+                                 frequency_info: Dict[str, Any]) -> str:
+        """
+        生成异常原因描述
+
+        Args:
+            row: 交易记录
+            avg_amount: 整体平均交易金额
+            frequency_info: 频率分析结果
+
+        Returns:
+            str: 异常原因描述
+        """
+        reasons = []
+
+        # 整数倍特征(取绝对值,兼容以负数表示的支出金额)
+        multiple = abs(row['txAmount']) / self.integer_multiple
+        reasons.append(f"金额为¥{self.integer_multiple:,.0f}的{multiple:.0f}倍整数")
+
+        # 极端异常值特征
+        if avg_amount > 0:
+            outlier_ratio = abs(row['txAmount']) / avg_amount
+            reasons.append(f"金额超出整体平均{outlier_ratio:.1f}倍")
+
+        # 频率特征
+        if frequency_info['is_occasional']:
+            reasons.append(f"交易时间不规律(间隔标准差{frequency_info['gap_std']:.1f}天)")
+
+        if frequency_info['is_high_frequency']:
+            reasons.append(f"高频发生({frequency_info['total_count']}次)")
+
+        return f"偶发高额整数交易异常: {','.join(reasons)},需核查交易真实性"
+
+    def recognize(self, csv_path: str = None, **kwargs) -> Dict[str, Any]:
+        """
+        识别偶发高额整数交易异常
+
+        Args:
+            csv_path: CSV文件路径
+            **kwargs: 其他参数
+
+        Returns:
+            Dict[str, Any]: 识别结果
+        """
+        try:
+            # 使用父类的load_data方法加载标准化数据
+            df = self.load_data(csv_path)
+
+            print(f"🔍 {self.display_name}开始检查,共 {len(df)} 条记录")
+            print(f"  检测规则: 整数倍({self.integer_multiple:,.0f}元) + 极端异常值({self.outlier_multiplier}倍) + 偶发高频 = 异常")
+
+            # 检查必需字段
+            required_fields = ['txId', 'datetime', 'txAmount', 'txDirection']
+            missing_fields = [field for field in required_fields if field not in df.columns]
+
+            if missing_fields:
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '失败',
+                    'error': f'缺少必需字段: {missing_fields}'
+                }
+
+            # 确保datetime列已正确解析
+            if not pd.api.types.is_datetime64_any_dtype(df['datetime']):
+                df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
+
+            # 按时间排序,便于频率分析
+            df = df.sort_values('datetime')
+
+            # 计算整体平均交易金额(绝对值)
+            avg_amount = df['txAmount'].abs().mean()
+            print(f"📊 整体平均交易金额: ¥{avg_amount:,.2f}")
+            print(f"  极端异常值阈值: ¥{avg_amount * self.outlier_multiplier:,.2f}")
+
+            # ============ 第一步:筛选整数倍交易 ============
+            integer_mask = df['txAmount'].apply(lambda x: self._is_integer_multiple(abs(x)))
+            integer_transactions = df[integer_mask].copy()
+
+            if len(integer_transactions) == 0:
+                print(f"📊 未发现{self.integer_multiple:,.0f}元整数倍交易")
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '完成',
+                    'statistics': {
+                        'total_transactions': len(df),
+                        'integer_transactions': 0,
+                        'avg_amount': float(avg_amount),
+                        'integer_multiple': self.integer_multiple
+                    }
+                }
+
+            print(f"📊 发现 {len(integer_transactions)} 笔{self.integer_multiple:,.0f}元整数倍交易")
+
+            # ============ 第二步:筛选极端异常值 ============
+            outlier_threshold = avg_amount * self.outlier_multiplier
+            outlier_mask = integer_transactions['txAmount'].abs() > outlier_threshold
+            outlier_transactions = integer_transactions[outlier_mask].copy()
+
+            if len(outlier_transactions) == 0:
+                print(f"📊 未发现极端异常值整数倍交易(金额≤¥{outlier_threshold:,.2f})")
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '完成',
+                    'statistics': {
+                        'total_transactions': len(df),
+                        'integer_transactions': len(integer_transactions),
+                        'outlier_transactions': 0,
+                        'avg_amount': float(avg_amount),
+                        'outlier_threshold': float(outlier_threshold)
+                    }
+                }
+
+            print(f"📊 发现 {len(outlier_transactions)} 笔极端异常值整数倍交易(金额>¥{outlier_threshold:,.2f})")
+
+            # ============ 第三步:分析频率模式 ============
+            frequency_info = self._analyze_frequency_pattern(outlier_transactions)
+            print(f"📈 频率分析结果:")
+            print(f"  总次数: {frequency_info['total_count']}")
+            print(f"  是否偶发: {'是' if frequency_info['is_occasional'] else '否'} (标准差={frequency_info['gap_std']:.2f}天)")
+            print(f"  是否高频: {'是' if frequency_info['is_high_frequency'] else '否'}")
+            print(f"  涉及天数: {frequency_info['unique_dates']}天")
+
+            # ============ 第四步:识别异常交易 ============
+            identified_anomalies = []
+
+            # 只有同时满足偶发且高频才标记为异常
+            if frequency_info['is_occasional'] and frequency_info['is_high_frequency']:
+                print(f"⚠️ 检测到偶发且高频的高额整数交易,开始标记异常...")
+
+                for idx, row in outlier_transactions.iterrows():
+                    # 生成异常原因
+                    reason = self._generate_anomaly_reason(row, avg_amount, frequency_info)
+
+                    # 额外信息
+                    additional_info = {
+                        'frequency_analysis': frequency_info,
+                        'amount_analysis': {
+                            'integer_multiple': self.integer_multiple,
+                            'outlier_multiplier': self.outlier_multiplier,
+                            'avg_amount': avg_amount,
+                            'outlier_threshold': outlier_threshold,
+                            'outlier_ratio': abs(row['txAmount']) / avg_amount if avg_amount > 0 else 0
+                        },
+                        'integer_analysis': {
+                            'multiple_count': abs(row['txAmount']) / self.integer_multiple,
+                            'is_integer_multiple': True
+                        }
+                    }
+
+                    # 生成异常记录
+                    anomaly = self.format_anomaly_record(
+                        row=row,
+                        reason=reason,
+                        severity=self.severity_level,
+                        check_type='occasional_high_integer_transaction',
+                        **additional_info
+                    )
+                    identified_anomalies.append(anomaly)
+
+                    print(f"  ❌ 标记异常: ID={row['txId']}, ¥{row['txAmount']:,.2f}, {reason[:60]}...")
+            else:
+                print(f"✅ 未检测到偶发高频模式,不标记异常")
+                print(f"  偶发性: {frequency_info['is_occasional']}")
+                print(f"  高频性: {frequency_info['is_high_frequency']}")
+
+            # ============ 第五步:结果统计 ============
+            print(f"✅ {self.display_name}检查完成")
+            print(f"  检查交易总数: {len(df)}")
+            print(f"  整数倍交易数: {len(integer_transactions)}")
+            print(f"  极端异常值数: {len(outlier_transactions)}")
+            print(f"  异常交易数: {len(identified_anomalies)}")
+
+            # 显示整数倍交易统计
+            if len(integer_transactions) > 0:
+                print("📋 整数倍交易统计:")
+                total_integer_amount = integer_transactions['txAmount'].sum()
+                avg_integer_amount = integer_transactions['txAmount'].mean()
+                print(f"  总整数倍金额: ¥{total_integer_amount:,.2f}")
+                print(f"  平均整数倍金额: ¥{avg_integer_amount:,.2f}")
+
+                # 倍数分布
+                integer_transactions['multiple'] = (integer_transactions['txAmount'] / self.integer_multiple).round()
+                multiple_counts = integer_transactions['multiple'].value_counts().sort_index()
+                print(f"  倍数分布: {dict(multiple_counts.head(10))}" +
+                      (", ..." if len(multiple_counts) > 10 else ""))
+
+                # 按方向统计
+                income_integer = integer_transactions[integer_transactions['txDirection'] == '收入']
+                expense_integer = integer_transactions[integer_transactions['txDirection'] == '支出']
+                print(f"  整数倍收入: {len(income_integer)}笔, ¥{income_integer['txAmount'].sum():,.2f}")
+                print(f"  整数倍支出: {len(expense_integer)}笔, ¥{expense_integer['txAmount'].sum():,.2f}")
+
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': len(identified_anomalies),
+                'identified_anomalies': identified_anomalies,
+                'recognition_status': '完成',
+                'recognition_parameters': {
+                    'integer_multiple': self.integer_multiple,
+                    'outlier_multiplier': self.outlier_multiplier,
+                    'frequency_window_days': self.frequency_window_days,
+                    'min_occurrences_for_high_frequency': self.min_occurrences_for_high_frequency,
+                    'gap_std_threshold': self.gap_std_threshold,
+                    'severity_level': self.severity_level,
+                    'avg_amount': float(avg_amount)
+                },
+                'statistics': {
+                    'total_transactions': len(df),
+                    'integer_transactions': len(integer_transactions),
+                    'outlier_transactions': len(outlier_transactions),
+                    'anomalous_transactions': len(identified_anomalies),
+                    'frequency_analysis': frequency_info,
+                    'amount_statistics': {
+                        'overall_avg': float(avg_amount),
+                        'overall_max': float(df['txAmount'].max()),
+                        'overall_min': float(df['txAmount'].min()),
+                        'integer_avg': float(integer_transactions['txAmount'].mean()) if len(
+                            integer_transactions) > 0 else 0,
+                        'integer_total': float(integer_transactions['txAmount'].sum()) if len(
+                            integer_transactions) > 0 else 0,
+                        'outlier_avg': float(outlier_transactions['txAmount'].mean()) if len(
+                            outlier_transactions) > 0 else 0
+                    },
+                    'direction_distribution': {
+                        'integer_income': len(integer_transactions[integer_transactions['txDirection'] == '收入']),
+                        'integer_expense': len(integer_transactions[integer_transactions['txDirection'] == '支出']),
+                        'outlier_income': len(outlier_transactions[outlier_transactions['txDirection'] == '收入']),
+                        'outlier_expense': len(outlier_transactions[outlier_transactions['txDirection'] == '支出'])
+                    }
+                }
+            }
+
+        except FileNotFoundError as e:
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'文件不存在: {str(e)}'
+            }
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'数据加载或处理失败: {str(e)}'
+            }
+
+    def get_summary(self) -> Dict[str, Any]:
+        """获取识别器摘要"""
+        summary = super().get_summary()
+        summary.update({
+            'integer_multiple': self.integer_multiple,
+            'outlier_multiplier': self.outlier_multiplier,
+            'frequency_window_days': self.frequency_window_days,
+            'min_occurrences_for_high_frequency': self.min_occurrences_for_high_frequency,
+            'gap_std_threshold': self.gap_std_threshold,
+            'severity_level': self.severity_level,
+            'data_loaded': self._data is not None
+        })
+        return summary
+
+    def get_config_summary(self) -> Dict[str, Any]:
+        """获取配置摘要"""
+        return {
+            "整数倍基数": f"¥{self.integer_multiple:,.0f}元",
+            "异常倍数阈值": f"{self.outlier_multiplier}倍",
+            "频率分析窗口": f"{self.frequency_window_days}天",
+            "高频最小次数": f"{self.min_occurrences_for_high_frequency}次",
+            "偶发判断阈值": f"标准差>{self.gap_std_threshold}天",
+            "检测逻辑": f"整数倍(¥{self.integer_multiple:,.0f}) + 极端异常值({self.outlier_multiplier}倍) + 偶发高频 = 异常",
+            "严重程度": self.severity_level.upper(),
+            "业务规则描述": "金额为整数倍、超过平均金额5倍且呈现偶发高频特征的交易需核查真实性"
+        }

+ 628 - 0
llmops/agents/tools/over_book_transaction_recognizer.py

@@ -0,0 +1,628 @@
+
+from pydantic import BaseModel, Field
+from typing import Dict, Any, Optional, Type, List, Tuple
+import pandas as pd
+from datetime import timedelta
+
+from .enhanced_base_recognizer import EnhancedBaseRecognizer
+
+
+class OverBookTransactionInput(BaseModel):
+    """疑似过账流水识别工具输入"""
+    csv_path: Optional[str] = Field(
+        None,
+        description="CSV文件路径(可选)。如果初始化时已提供csv_path,可以不用再次传入。"
+    )
+
+    class Config:
+        arbitrary_types_allowed = True
+
+
+class OverBookTransactionRecognizer(EnhancedBaseRecognizer):
+    """
+    疑似过账流水识别器
+
+    异常规则定义:
+    账户在接收大额资金入账后,7个自然日内即发生与该入账金额完全一致(或高度接近)的资金流出,
+    形成"入账-流出"的闭环资金流动,且缺乏合理商业背景、实际业务往来支撑或真实收付需求,
+    资金未发生实质性使用或流转,仅通过账户完成过渡划转,符合过账交易核心属性。
+
+    核心逻辑:
+    1. 筛选≥阈值金额的"收入"交易
+    2. 查找每笔大额收入后7天内的"支出"交易
+    3. 匹配金额(±容忍度范围内)
+    4. 分析交易背景合理性
+    5. 标记疑似过账的交易对
+    """
+
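+    # 数值示意:1月1日收入¥300,000,1月4日支出¥299,000(比例0.9967,偏差0.33% ≤ 1%容忍度),
+    # 停留3天 < 7天窗口 → 构成候选"入账-流出"对,再经背景合理性分析决定是否标记为异常。
+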
+    args_schema: Type[BaseModel] = OverBookTransactionInput
+
+    # 配置参数
+    amount_threshold: float = Field(
+        100000.0,
+        description="金额阈值(元),交易金额≥此值才进行检测"
+    )
+
+    time_window_days: int = Field(
+        7,
+        description="时间窗口(天),从收入发生日开始计算"
+    )
+
+    amount_tolerance: float = Field(
+        0.01,
+        description="金额容忍度,±此比例内视为金额匹配"
+    )
+
+    min_stay_time_hours: int = Field(
+        1,
+        description="最小停留时间(小时),避免即时进出被视为过账"
+    )
+
+    # 合理性判断参数
+    enable_background_check: bool = Field(
+        True,
+        description="是否启用交易背景合理性检查"
+    )
+
+    reasonable_background_keywords: List[str] = Field(
+        [
+            "工资发放", "奖金发放", "绩效发放", "报销款",
+            "货款", "租金收入", "投资款", "贷款", "还款",
+            "采购付款", "支付货款", "缴税", "缴费", "消费",
+            "日常支出", "生活支出", "业务往来", "贸易款"
+        ],
+        description="合理业务背景关键词列表"
+    )
+
+    high_risk_keywords: List[str] = Field(
+        [
+            "过账", "过渡", "走账", "倒账", "资金划转",
+            "临时周转", "无实际业务", "过渡资金", "资金过桥",
+            "代收代付", "代转", "垫资", "拆借", "内部往来"
+        ],
+        description="高风险关键词(过账特征)列表"
+    )
+
+    # 交易对手分析
+    enable_counterparty_check: bool = Field(
+        True,
+        description="是否启用交易对手关联性检查"
+    )
+
+    # 模式检测配置
+    detect_single_pair: bool = Field(
+        True,
+        description="检测单笔流入-流出对"
+    )
+
+    detect_split_pattern: bool = Field(
+        True,
+        description="检测拆分过账(一笔流入多笔流出)"
+    )
+
+    detect_merge_pattern: bool = Field(
+        True,
+        description="检测合并过账(多笔流入一笔流出)"
+    )
+
+    # 严重程度配置
+    severity_level: str = Field(
+        'high',
+        description="异常严重程度(high/medium/low)"
+    )
+
+    def __init__(self, csv_path: str = None, config: Dict[str, Any] = None, **kwargs):
+        """
+        初始化疑似过账流水识别器
+
+        Args:
+            csv_path: CSV文件路径
+            config: 配置参数
+            **kwargs: 其他参数
+        """
+        # 调用父类的 __init__
+        super().__init__(
+            name="over_book_transaction_recognizer",
+            description="识别疑似过账流水:大额资金短期内相同金额进出,缺乏真实业务背景。",
+            display_name="疑似过账流水识别器",
+            csv_path=csv_path,
+            config=config,
+            **kwargs
+        )
+
+        # 从config获取配置,更新Field属性
+        overbook_config = self.get_config_value('over_book_transaction_recognition', {})
+        if overbook_config:
+            config_mapping = {
+                'amount_threshold': 'amount_threshold',
+                'time_window_days': 'time_window_days',
+                'amount_tolerance': 'amount_tolerance',
+                'min_stay_time_hours': 'min_stay_time_hours',
+                'enable_background_check': 'enable_background_check',
+                'reasonable_background_keywords': 'reasonable_background_keywords',
+                'high_risk_keywords': 'high_risk_keywords',
+                'enable_counterparty_check': 'enable_counterparty_check',
+                'detect_single_pair': 'detect_single_pair',
+                'detect_split_pattern': 'detect_split_pattern',
+                'detect_merge_pattern': 'detect_merge_pattern',
+                'severity_level': 'severity_level'
+            }
+
+            for config_key, attr_name in config_mapping.items():
+                if config_key in overbook_config:
+                    setattr(self, attr_name, overbook_config[config_key])
+
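+        # 配置示例(假设性示意,取值仅作说明;键名需与上方config_mapping一致):
+        # config = {'over_book_transaction_recognition': {
+        #     'amount_threshold': 200000.0, 'time_window_days': 5,
+        #     'amount_tolerance': 0.005}}
+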
+        print(f"✅ {self.display_name} 初始化完成")
+        print(f"  金额阈值: ¥{self.amount_threshold:,.2f}")
+        print(f"  时间窗口: {self.time_window_days}天")
+        print(f"  金额容忍度: ±{self.amount_tolerance:.1%}")
+        print(f"  最小停留时间: {self.min_stay_time_hours}小时")
+        print(f"  背景检查: {'启用' if self.enable_background_check else '禁用'}")
+        print(f"  对手方检查: {'启用' if self.enable_counterparty_check else '禁用'}")
+        print(f"  检测模式: 单笔匹配/拆分/合并")
+        print(f"  异常严重程度: {self.severity_level.upper()}")
+
+    def _is_large_inflow(self, row: pd.Series) -> bool:
+        """
+        判断是否为需要检测的大额收入
+
+        Args:
+            row: 交易记录
+
+        Returns:
+            bool: 是否为大额收入
+        """
+        # 必须是收入方向
+        if row.get('txDirection') != '收入':
+            return False
+
+        # 金额必须达到阈值
+        amount = row.get('txAmount', 0)
+        if pd.isna(amount) or amount < self.amount_threshold:
+            return False
+
+        return True
+
+    def _find_matching_outflows(self, inflow: pd.Series, df: pd.DataFrame) -> List[pd.Series]:
+        """
+        查找匹配的流出交易
+
+        Args:
+            inflow: 大额收入记录
+            df: 完整数据集
+
+        Returns:
+            List[pd.Series]: 匹配的流出交易列表
+        """
+        if pd.isna(inflow.get('datetime')):
+            return []
+
+        inflow_time = inflow['datetime']
+        inflow_amount = inflow['txAmount']
+
+        # 计算时间窗口
+        time_end = inflow_time + timedelta(days=self.time_window_days)
+
+        # 筛选条件:时间窗口内、支出方向、金额匹配
+        mask = (
+                (df['datetime'] > inflow_time) &  # 晚于流入时间
+                (df['datetime'] <= time_end) &  # 在时间窗口内
+                (df['txDirection'] == '支出') &  # 支出方向
+                (df['txId'] != inflow['txId'])  # 排除同一笔交易
+        )
+
+        candidate_outflows = df[mask].copy()
+
+        # 金额匹配检查
+        matching_outflows = []
+        for _, outflow in candidate_outflows.iterrows():
+            outflow_amount = outflow['txAmount']
+
+            # 检查金额是否匹配(考虑容忍度)
+            amount_ratio = outflow_amount / inflow_amount
+            if abs(amount_ratio - 1.0) <= self.amount_tolerance:
+                matching_outflows.append(outflow)
+
+        return matching_outflows
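+    # 金额匹配示意(amount_tolerance=0.01):流入¥200,000后窗口内的支出
+    #   ¥199,000(比例0.995)与¥201,500(比例1.0075)均视为匹配,
+    #   ¥205,000(比例1.025)则不匹配。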
+
+    def _analyze_background_reasonableness(self, inflow: pd.Series, outflow: pd.Series) -> Tuple[bool, str]:
+        """
+        分析交易背景合理性
+
+        Args:
+            inflow: 流入交易记录
+            outflow: 流出交易记录
+
+        Returns:
+            Tuple[bool, str]: (是否合理, 合理性描述)
+        """
+        if not self.enable_background_check:
+            return True, "背景检查已禁用"
+
+        inflow_summary = str(inflow.get('txSummary', '')).lower()
+        outflow_summary = str(outflow.get('txSummary', '')).lower()
+
+        inflow_counterparty = str(inflow.get('txCounterparty', '')).lower()
+        outflow_counterparty = str(outflow.get('txCounterparty', '')).lower()
+
+        reasons = []
+        is_reasonable = True
+
+        # 1. 检查高风险关键词
+        has_high_risk = False
+        for keyword in self.high_risk_keywords:
+            if keyword in inflow_summary or keyword in outflow_summary:
+                reasons.append(f"包含高风险关键词: '{keyword}'")
+                has_high_risk = True
+                is_reasonable = False
+
+        # 2. 检查合理背景关键词(若已命中高风险关键词,则不因此判定为合理)
+        has_reasonable_keyword = False
+        for keyword in self.reasonable_background_keywords:
+            if keyword in inflow_summary or keyword in outflow_summary:
+                has_reasonable_keyword = True
+                reasons.append(f"包含合理背景关键词: '{keyword}'")
+
+        if has_reasonable_keyword and not has_high_risk:
+            is_reasonable = True
+
+        # 3. 检查交易对手关系(如果启用)
+        if self.enable_counterparty_check:
+            if inflow_counterparty == outflow_counterparty and inflow_counterparty not in ['', 'nan']:
+                reasons.append(f"相同交易对手: {inflow_counterparty}")
+                # 相同对手方可能是正常业务(如还款),也可能是过账嫌疑
+                if '还款' in inflow_summary or '还款' in outflow_summary:
+                    reasons.append("可能为正常还款业务")
+                    is_reasonable = True
+                else:
+                    reasons.append("相同对手方资金来回流动,需关注")
+                    is_reasonable = False
+
+        # 4. 检查停留时间(太短可能有问题)
+        stay_time_hours = (outflow['datetime'] - inflow['datetime']).total_seconds() / 3600
+        if stay_time_hours < self.min_stay_time_hours:
+            reasons.append(f"资金停留时间过短: {stay_time_hours:.1f}小时")
+            is_reasonable = False
+
+        # 5. 检查摘要信息完整性
+        if inflow_summary == '' or outflow_summary == '':
+            reasons.append("交易摘要信息不完整")
+            is_reasonable = False
+
+        # 生成描述
+        if not reasons:
+            description = "背景检查未发现明显异常"
+        else:
+            description = "; ".join(reasons)
+
+        return is_reasonable, description
+
+    def _format_over_book_reason(self, inflow: pd.Series, outflow: pd.Series,
+                                 background_analysis: str, stay_days: float) -> str:
+        """
+        生成过账异常原因描述
+
+        Args:
+            inflow: 流入交易记录
+            outflow: 流出交易记录
+            background_analysis: 背景分析结果
+            stay_days: 停留天数
+
+        Returns:
+            str: 异常原因描述
+        """
+        inflow_amount = inflow['txAmount']
+        outflow_amount = outflow['txAmount']
+
+        amount_diff = abs(outflow_amount - inflow_amount)
+        amount_diff_percent = (amount_diff / inflow_amount) * 100
+
+        reason_parts = [
+            f"疑似过账交易:收入¥{inflow_amount:,.2f}后{stay_days:.1f}天内支出¥{outflow_amount:,.2f}",
+            f"金额匹配度:差异¥{amount_diff:,.2f}({amount_diff_percent:.2f}%)"
+        ]
+
+        if stay_days < 1:
+            reason_parts.append(f"资金停留时间仅{stay_days * 24:.1f}小时")
+        else:
+            reason_parts.append(f"资金停留时间{stay_days:.1f}天")
+
+        if background_analysis:
+            reason_parts.append(f"背景分析:{background_analysis}")
+
+        return ";".join(reason_parts)
+
+    def recognize(self, csv_path: str = None, **kwargs) -> Dict[str, Any]:
+        """
+        识别疑似过账流水异常
+
+        Args:
+            csv_path: CSV文件路径
+            **kwargs: 其他参数
+
+        Returns:
+            Dict[str, Any]: 识别结果
+        """
+        try:
+            # 使用父类的load_data方法加载标准化数据
+            df = self.load_data(csv_path)
+
+            print(f"🔍 {self.display_name}开始检查,共 {len(df)} 条记录")
+            print(f"  检查规则: ≥¥{self.amount_threshold:,.2f}收入 → {self.time_window_days}天内 → 匹配金额支出")
+
+            # 检查必需字段
+            required_fields = ['txId', 'txDate', 'txTime', 'txAmount', 'txDirection', 'txSummary']
+            missing_fields = [field for field in required_fields if field not in df.columns]
+
+            if missing_fields:
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '失败',
+                    'error': f'缺少必需字段: {missing_fields}'
+                }
+
+            # 确保数据按时间排序
+            if 'datetime' not in df.columns:
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '失败',
+                    'error': '缺少datetime字段,无法进行时间序列分析'
+                }
+
+            df = df.sort_values('datetime').copy()
+
+            # ============ 识别大额收入交易 ============
+            print(f"🔍 正在识别大额收入交易...")
+
+            # 筛选大额收入
+            large_inflows_mask = df.apply(self._is_large_inflow, axis=1)
+            large_inflows = df[large_inflows_mask].copy()
+
+            if len(large_inflows) == 0:
+                print(f"📊 未发现≥¥{self.amount_threshold:,.2f}的大额收入记录")
+
+                return {
+                    'recognition_type': self.display_name,
+                    'identified_count': 0,
+                    'identified_anomalies': [],
+                    'recognition_status': '完成',
+                    'recognition_parameters': {
+                        'amount_threshold': self.amount_threshold,
+                        'time_window_days': self.time_window_days,
+                        'amount_tolerance': self.amount_tolerance,
+                        'total_checked': len(df)
+                    },
+                    'statistics': {
+                        'total_transactions': len(df),
+                        'large_inflows_count': 0,
+                        'max_transaction_amount': float(df['txAmount'].max()),
+                        'avg_transaction_amount': float(df['txAmount'].mean())
+                    }
+                }
+
+            print(f"📊 发现 {len(large_inflows)} 笔大额收入记录")
+            print(f"  大额收入金额范围: ¥{large_inflows['txAmount'].min():,.2f} - ¥{large_inflows['txAmount'].max():,.2f}")
+
+            # ============ 查找匹配的流出交易 ============
+            print(f"🔍 正在查找匹配的流出交易...")
+
+            identified_anomalies = []
+            transaction_pairs = []
+            match_statistics = {
+                'total_pairs_found': 0,
+                'reasonable_pairs': 0,
+                'suspicious_pairs': 0
+            }
+
+            for idx, inflow in large_inflows.iterrows():
+                inflow_id = str(inflow['txId'])
+                inflow_amount = inflow['txAmount']
+                inflow_date = inflow['datetime'].strftime('%Y-%m-%d %H:%M:%S')
+
+                print(f"  🔍 分析大额收入 {inflow_id}: ¥{inflow_amount:,.2f} ({inflow_date})")
+
+                # 查找匹配的流出
+                matching_outflows = self._find_matching_outflows(inflow, df)
+
+                if not matching_outflows:
+                    print(f"    ✅ 未发现匹配的流出交易")
+                    continue
+
+                print(f"    📊 发现 {len(matching_outflows)} 笔匹配流出")
+
+                # 分析每对交易
+                for outflow in matching_outflows:
+                    outflow_id = str(outflow['txId'])
+                    outflow_amount = outflow['txAmount']
+
+                    # 计算停留时间
+                    stay_time = outflow['datetime'] - inflow['datetime']
+                    stay_days = stay_time.total_seconds() / 86400
+
+                    # 分析背景合理性
+                    is_reasonable, background_analysis = self._analyze_background_reasonableness(inflow, outflow)
+
+                    # 记录交易对信息
+                    pair_info = {
+                        'inflow_id': inflow_id,
+                        'outflow_id': outflow_id,
+                        'inflow_amount': inflow_amount,
+                        'outflow_amount': outflow_amount,
+                        'amount_diff': abs(outflow_amount - inflow_amount),
+                        'stay_days': stay_days,
+                        'is_reasonable': is_reasonable,
+                        'background_analysis': background_analysis
+                    }
+                    transaction_pairs.append(pair_info)
+                    match_statistics['total_pairs_found'] += 1
+
+                    if is_reasonable:
+                        match_statistics['reasonable_pairs'] += 1
+                        print(f"    ✅ 交易对 {inflow_id}→{outflow_id}: 合理背景 ({background_analysis[:50]}...)")
+                    else:
+                        match_statistics['suspicious_pairs'] += 1
+
+                        # 生成异常原因
+                        reason = self._format_over_book_reason(inflow, outflow, background_analysis, stay_days)
+
+                        print(f"    ❌ 发现疑似过账: {inflow_id}→{outflow_id}")
+                        print(f"      原因: {reason[:80]}...")
+
+                        # 创建异常记录(记录流出交易作为异常点)
+                        additional_info = {
+                            'over_book_analysis': {
+                                'inflow_transaction': {
+                                    'txId': inflow_id,
+                                    'txDate': inflow['txDate'],
+                                    'txTime': inflow['txTime'],
+                                    'txAmount': inflow_amount,
+                                    'txSummary': inflow.get('txSummary', ''),
+                                    'txCounterparty': inflow.get('txCounterparty', '')
+                                },
+                                'outflow_transaction': {
+                                    'txId': outflow_id,
+                                    'txDate': outflow['txDate'],
+                                    'txTime': outflow['txTime'],
+                                    'txAmount': outflow_amount,
+                                    'txSummary': outflow.get('txSummary', ''),
+                                    'txCounterparty': outflow.get('txCounterparty', '')
+                                },
+                                'pair_analysis': {
+                                    'stay_days': stay_days,
+                                    'stay_hours': stay_days * 24,
+                                    'amount_match_ratio': outflow_amount / inflow_amount,
+                                    'background_check_result': background_analysis,
+                                    'is_reasonable': is_reasonable,
+                                    'detection_parameters': {
+                                        'amount_threshold': self.amount_threshold,
+                                        'time_window_days': self.time_window_days,
+                                        'amount_tolerance': self.amount_tolerance
+                                    }
+                                }
+                            }
+                        }
+
+                        # 使用流出交易作为异常记录主体
+                        anomaly = self.format_anomaly_record(
+                            row=outflow,
+                            reason=reason,
+                            severity=self.severity_level,
+                            check_type='over_book_transaction',
+                            **additional_info
+                        )
+                        identified_anomalies.append(anomaly)
+
+            # ============ 结果统计 ============
+            print(f"✅ {self.display_name}检查完成")
+            print(f"  检查结果:")
+            print(f"    大额收入记录: {len(large_inflows)} 笔")
+            print(f"    匹配交易对: {match_statistics['total_pairs_found']} 对")
+            print(f"    合理交易对: {match_statistics['reasonable_pairs']} 对")
+            print(f"    疑似过账对: {match_statistics['suspicious_pairs']} 对")
+            print(f"    异常记录数: {len(identified_anomalies)} 条")
+
+            # 显示详细信息
+            if match_statistics['suspicious_pairs'] > 0:
+                print("📋 疑似过账交易详情:")
+                for i, pair in enumerate(transaction_pairs[:5]):  # 显示前5条
+                    if not pair['is_reasonable']:
+                        print(f"  {i + 1}. {pair['inflow_id']}→{pair['outflow_id']}: "
+                              f"¥{pair['inflow_amount']:,.2f}→¥{pair['outflow_amount']:,.2f} "
+                              f"({pair['stay_days']:.1f}天)")
+
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': len(identified_anomalies),
+                'identified_anomalies': identified_anomalies,
+                'recognition_status': '完成',
+                'recognition_parameters': {
+                    'amount_threshold': self.amount_threshold,
+                    'time_window_days': self.time_window_days,
+                    'amount_tolerance': self.amount_tolerance,
+                    'min_stay_time_hours': self.min_stay_time_hours,
+                    'enable_background_check': self.enable_background_check,
+                    'enable_counterparty_check': self.enable_counterparty_check,
+                    'total_large_inflows': len(large_inflows)
+                },
+                'statistics': {
+                    'total_transactions': len(df),
+                    'large_inflows_count': len(large_inflows),
+                    'large_inflows_amount_stats': {
+                        'total': float(large_inflows['txAmount'].sum()),
+                        'avg': float(large_inflows['txAmount'].mean()),
+                        'max': float(large_inflows['txAmount'].max()),
+                        'min': float(large_inflows['txAmount'].min())
+                    } if len(large_inflows) > 0 else {},
+                    'match_statistics': match_statistics,
+                    'transaction_pairs_count': len(transaction_pairs),
+                    'suspicious_pairs_details': [
+                        {
+                            'inflow_id': p['inflow_id'],
+                            'outflow_id': p['outflow_id'],
+                            'inflow_amount': p['inflow_amount'],
+                            'outflow_amount': p['outflow_amount'],
+                            'stay_days': p['stay_days'],
+                            'background_analysis': p['background_analysis']
+                        }
+                        for p in transaction_pairs if not p['is_reasonable']
+                    ][:10]  # 只保留前10条详情
+                }
+            }
+
+        except FileNotFoundError as e:
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'文件不存在: {str(e)}'
+            }
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            return {
+                'recognition_type': self.display_name,
+                'identified_count': 0,
+                'identified_anomalies': [],
+                'recognition_status': '失败',
+                'error': f'数据加载或处理失败: {str(e)}'
+            }
+
+    def get_summary(self) -> Dict[str, Any]:
+        """获取识别器摘要"""
+        summary = super().get_summary()
+        summary.update({
+            'amount_threshold': self.amount_threshold,
+            'time_window_days': self.time_window_days,
+            'amount_tolerance': self.amount_tolerance,
+            'min_stay_time_hours': self.min_stay_time_hours,
+            'enable_background_check': self.enable_background_check,
+            'reasonable_keywords_count': len(self.reasonable_background_keywords),
+            'high_risk_keywords_count': len(self.high_risk_keywords),
+            'enable_counterparty_check': self.enable_counterparty_check,
+            'detect_patterns': {
+                'single_pair': self.detect_single_pair,
+                'split_pattern': self.detect_split_pattern,
+                'merge_pattern': self.detect_merge_pattern
+            },
+            'severity_level': self.severity_level,
+            'data_loaded': self._data is not None
+        })
+        return summary
+
+    def get_config_summary(self) -> Dict[str, Any]:
+        """获取配置摘要"""
+        return {
+            "金额阈值": f"¥{self.amount_threshold:,.2f}",
+            "时间窗口": f"{self.time_window_days}天",
+            "金额容忍度": f"±{self.amount_tolerance:.1%}",
+            "最小停留时间": f"{self.min_stay_time_hours}小时",
+            "背景检查": "启用" if self.enable_background_check else "禁用",
+            "合理关键词": f"{len(self.reasonable_background_keywords)}个",
+            "高风险关键词": f"{len(self.high_risk_keywords)}个",
+            "对手方检查": "启用" if self.enable_counterparty_check else "禁用",
+            "检测逻辑": f"大额收入后{self.time_window_days}天内出现匹配金额支出,且缺乏合理业务背景",
+            "业务规则描述": "资金短暂停留即流出,缺乏真实业务背景,疑似过账交易"
+        }

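Before moving on to the workflow wiring, it is worth seeing the pairing rule that the summary above describes in isolation. The following is a minimal sketch of that detection logic under stated assumptions: the signed txAmount convention, the txTime column and the match_over_book_pairs helper are illustrative, not the committed implementation.

import pandas as pd

def match_over_book_pairs(df: pd.DataFrame,
                          amount_threshold: float = 50000.0,
                          amount_tolerance: float = 0.01,
                          time_window_days: int = 7,
                          min_stay_time_hours: int = 1) -> list:
    # Assumes inflows are positive and outflows negative in 'txAmount',
    # and that 'txTime' is a datetime column.
    pairs = []
    inflows = df[df['txAmount'] >= amount_threshold]
    outflows = df[df['txAmount'] < 0]
    for _, inflow in inflows.iterrows():
        window_end = inflow['txTime'] + pd.Timedelta(days=time_window_days)
        # Outflows inside the window whose magnitude matches the inflow within tolerance
        candidates = outflows[
            (outflows['txTime'] > inflow['txTime']) &
            (outflows['txTime'] <= window_end) &
            ((outflows['txAmount'].abs() - inflow['txAmount']).abs()
             <= inflow['txAmount'] * amount_tolerance)
        ]
        for _, outflow in candidates.iterrows():
            stay = outflow['txTime'] - inflow['txTime']
            if stay >= pd.Timedelta(hours=min_stay_time_hours):
                pairs.append({
                    'inflow': inflow, 'outflow': outflow,
                    'stay_days': stay.total_seconds() / 86400,
                })
    return pairs

Each surviving pair would then still go through the background-keyword check before being reported as suspicious.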
+ 149 - 10
llmops/complete_agent_flow_rule.py

@@ -41,11 +41,13 @@ from llmops.workflow_state import (
     update_state_with_data_classified,
     convert_numpy_types,
     update_state_with_data_standardize,
-    update_state_with_report
+    update_state_with_report,
+    update_state_with_anomaly_recognition
 )
 from llmops.agents.outline_agent import generate_report_outline
 from llmops.agents.planning_agent import plan_next_action
 from llmops.agents.rules_engine_metric_calculation_agent import RulesEngineMetricCalculationAgent
+from llmops.agents.anomaly_recognizer_agent import AnomalyRecognitionAgent
 from llmops.agents.data_manager import DataManager
 import os
 from llmops.agents.data_classify_agent import data_classify
@@ -71,6 +73,8 @@ class CompleteAgentFlow:
 
         # Initialize the rules-engine agent
         self.rules_engine_agent = RulesEngineMetricCalculationAgent(api_key, base_url)
+        # Anomaly recognition agent (created lazily in the anomaly node)
+        self.anomaly_recognizer = None
 
         # Build the workflow graph
         self.workflow = self._create_workflow()
@@ -85,6 +89,7 @@ class CompleteAgentFlow:
         workflow.add_node("metric_calculator", self._metric_calculator_node)
         workflow.add_node("data_classify", self._data_classify_node)
         workflow.add_node("data_standardize", self._data_standardize_node)
+        workflow.add_node("anomaly_recognition", self._anomaly_recognition_node)
         workflow.add_node("report_generator", self._report_generator_node)
 
         # Set the entry point
@@ -99,6 +104,7 @@ class CompleteAgentFlow:
                 "metric_calculator": "metric_calculator",
                 "data_classify": "data_classify",
                 "data_standardize": "data_standardize",
+                "anomaly_recognition": "anomaly_recognition",
                 "report_generator": "report_generator",
                 END: END
             }
@@ -109,6 +115,7 @@ class CompleteAgentFlow:
         workflow.add_edge("data_classify", "planning_node")
         workflow.add_edge("outline_generator", "planning_node")
         workflow.add_edge("metric_calculator", "planning_node")
+        workflow.add_edge("anomaly_recognition", "planning_node")
         workflow.add_edge("report_generator", END)
 
         return workflow
@@ -143,6 +150,11 @@ class CompleteAgentFlow:
             print("→ 路由到 data_classify(分类打标)")
             return "data_classify"
 
+        # Anomaly recognition pending and standardized data available → run anomaly recognition
+        if not state.get("anomaly_recognition_completed", False) and state.get("standardized_file_path"):
+            print("→ 路由到 anomaly_recognition(异常识别)")
+            return "anomaly_recognition"
+
         # No outline yet → generate one
         if not state.get("outline_draft"):
             print("→ 路由到 outline_generator(生成大纲)")
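Condensed, the routing change above adds one predicate evaluated ahead of outline generation. A simplified sketch of the decision order follows (the committed router also handles the metric-calculation, classification and standardization branches not shown here):

def route(state: dict) -> str:
    # Anomaly recognition fires once standardized data exists and before
    # any outline is drafted, so the report step can rely on
    # state["anomaly_summary"] being populated.
    if not state.get("anomaly_recognition_completed", False) and state.get("standardized_file_path"):
        return "anomaly_recognition"
    if not state.get("outline_draft"):
        return "outline_generator"
    return "report_generator"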
@@ -332,7 +344,10 @@ class CompleteAgentFlow:
             # Fetch the outline and the computed metrics
             outline = state.get("outline_draft")
             computed_metrics = state.get("computed_metrics", {})
+            anomaly_summary = state.get("anomaly_summary", {})
+
             print(f"已经完成的计算指标:{computed_metrics}")
+            print(f"异常识别结果:{anomaly_summary.get('total_anomalies', 0)}条异常")
 
             if not outline:
                 raise ValueError("没有可用的报告大纲")
@@ -350,7 +365,8 @@ class CompleteAgentFlow:
                         outline.global_metrics) if outline.global_metrics else 0
                 },
                 "sections": [],
-                "metrics_detail": {}
+                # "metrics_detail": {},
+                "anomaly_analysis": anomaly_summary
             }
 
             chapter_num = 0
@@ -384,15 +400,26 @@ class CompleteAgentFlow:
                 final_report["sections"].append(section_content)
 
             # Add detailed per-metric information
-            for metric_req in outline.global_metrics:
-                metric_id = metric_req.metric_id
-                final_report["metrics_detail"][metric_id] = {
-                    "name": metric_req.metric_name,
-                    "logic": metric_req.calculation_logic,
-                    "required_fields": metric_req.required_fields,
-                    "computed": metric_id in computed_metrics,
-                    "value": computed_metrics.get(metric_id, {}).get("value", "N/A")
+            # for metric_req in outline.global_metrics:
+            #     metric_id = metric_req.metric_id
+            #     final_report["metrics_detail"][metric_id] = {
+            #         "name": metric_req.metric_name,
+            #         "logic": metric_req.calculation_logic,
+            #         "required_fields": metric_req.required_fields,
+            #         "computed": metric_id in computed_metrics,
+            #         "value": computed_metrics.get(metric_id, {}).get("value", "N/A")
+            #     }
+
+            # Append an anomaly-analysis section when anomalies were found
+            if anomaly_summary.get('total_anomalies', 0) > 0:
+                anomaly_section = {
+                    "section_id": "anomaly_analysis",
+                    "title": "交易异常识别分析",
+                    "description": "基于交易流水数据识别的异常交易情况分析",
+                    "content": self._generate_anomaly_analysis_content(anomaly_summary),
+                    "metrics": {}
                 }
+                final_report["sections"].append(anomaly_section)
 
             # Update the state
             new_state = update_state_with_report(state, final_report)
@@ -407,6 +434,7 @@ class CompleteAgentFlow:
             print(f"✅ 最终报告生成完成:{outline.report_title}")
             print(f"   章节数:{len(final_report['sections'])}")
             print(f"   计算指标:{len(computed_metrics)}/{len(outline.global_metrics)}")
+            print(f"   识别异常:{anomaly_summary.get('total_anomalies', 0)}条")
             print(f"   指标完成率:{len(computed_metrics) / max(len(outline.global_metrics), 1):.2%}")
 
             return convert_numpy_types(new_state)
@@ -417,6 +445,46 @@ class CompleteAgentFlow:
             new_state["errors"].append(f"报告完成错误: {str(e)}")
             return convert_numpy_types(new_state)
 
+    def _generate_anomaly_analysis_content(self, anomaly_summary: Dict[str, Any]) -> str:
+        """Build the markdown content for the anomaly-analysis section."""
+        total_anomalies = anomaly_summary.get('total_anomalies', 0)
+        anomaly_ratio = anomaly_summary.get('anomaly_ratio', '0%')
+
+        content = f"""## 交易异常识别分析
+
+### 异常识别概况
+本次分析共识别出 **{total_anomalies}** 条异常交易记录,异常识别率为 **{anomaly_ratio}**。
+
+### 异常类型分布
+"""
+
+        # Anomaly-type distribution table
+        anomaly_distribution = anomaly_summary.get('anomaly_distribution', {})
+        if anomaly_distribution:
+            content += "\n| 异常类型 | 数量 | 占比 |\n|----------|------|------|\n"
+            for anomaly_type, count in anomaly_distribution.items():
+                percentage = (count / total_anomalies * 100) if total_anomalies > 0 else 0
+                content += f"| {anomaly_type} | {count} | {percentage:.1f}% |\n"
+
+        # Severity distribution
+        severity_distribution = anomaly_summary.get('severity_distribution', {})
+        if severity_distribution:
+            content += "\n### 严重程度分布\n"
+            for severity, count in severity_distribution.items():
+                percentage = (count / total_anomalies * 100) if total_anomalies > 0 else 0
+                content += f"- **{severity.upper()}** 级别:{count} 条 ({percentage:.1f}%)\n"
+
+        content += """
+### 分析建议
+1. 建议对高风险异常进行重点核查
+2. 结合业务背景判断异常交易的真实性
+3. 建立异常交易监控机制
+"""
+
+        return content
+
+
     def _print_ai_selection_analysis(self, outline):
         """Print the reasoning behind the AI metric selection - fully generic version."""
         print()
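A quick way to sanity-check the section builder above is to feed it a hand-made summary; the values below and the flow instance are illustrative:

# `flow` is an already-constructed CompleteAgentFlow instance.
sample_summary = {
    "total_anomalies": 4,
    "anomaly_ratio": "1.3%",
    "anomaly_distribution": {"夜间交易": 3, "疑似过账交易": 1},
    "severity_distribution": {"high": 1, "medium": 3},
}
print(flow._generate_anomaly_analysis_content(sample_summary))
# Output contains the markdown row "| 夜间交易 | 3 | 75.0% |"
# and the severity bullet "- **HIGH** 级别:1 条 (25.0%)".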
@@ -653,6 +721,77 @@ class CompleteAgentFlow:
             return convert_numpy_types(new_state)
 
 
+    async def _anomaly_recognition_node(self, state: IntegratedWorkflowState) -> IntegratedWorkflowState:
+        """Anomaly-recognition workflow node."""
+        try:
+            print("🔍 正在执行异常识别...")
+
+            # Lazily create the anomaly recognition agent on first use
+            if self.anomaly_recognizer is None:
+                print("🤖 初始化异常识别智能体...")
+                # AnomalyRecognitionAgent is already imported at module level
+                from llmops.config import anomaly_recognizer_config
+
+                self.anomaly_recognizer = AnomalyRecognitionAgent(
+                    csv_path=state["standardized_file_path"],
+                    api_key=self.api_key,
+                    base_url=self.base_url,
+                    model_name=self.model_name,
+                    config=anomaly_recognizer_config
+                )
+
+            # Load the transaction data
+            print("📥 加载交易数据...")
+            transaction_data = self.anomaly_recognizer.load_transaction_data()
+
+            # Run the full anomaly-recognition pass
+            print("🔍 执行异常识别分析...")
+            recognition_results = self.anomaly_recognizer.execute_full_recognition()
+
+            # Generate the anomaly report
+            print("📊 生成异常识别报告...")
+            output_dir = "outputs/anomaly_reports"
+            report_path = self.anomaly_recognizer.generate_recognition_report(output_dir)
+
+            # Fold the results into the workflow state via update_state_with_anomaly_recognition
+            new_state = update_state_with_anomaly_recognition(
+                state,
+                recognition_results,
+                report_path
+            )
+
+            print(f"✅ 异常识别完成:发现 {recognition_results.get('summary', {}).get('total_identified_anomalies', 0)} 条异常")
+            return convert_numpy_types(new_state)
+
+        except Exception as e:
+            print(f"❌ 异常识别失败: {e}")
+            import traceback
+            traceback.print_exc()
+
+            # Mark the step as completed even on failure so the flow is not blocked
+            new_state = state.copy()
+            new_state["anomaly_recognition_completed"] = True  # fail open so the router does not loop
+            new_state["errors"].append(f"异常识别错误: {str(e)}")
+
+            # Fall back to an empty anomaly summary
+            new_state["anomaly_summary"] = {
+                "total_anomalies": 0,
+                "anomaly_ratio": "0%",
+                "severity_distribution": {},
+                "anomaly_distribution": {}
+            }
+
+            # Record the failure in the message log
+            new_state["messages"].append({
+                "role": "assistant",
+                "content": f"⚠️ 异常识别失败,跳过异常分析: {str(e)}",
+                "timestamp": datetime.now().isoformat()
+            })
+
+            return convert_numpy_types(new_state)
+
+
     def _decision_to_route(self, decision: str) -> str:
         """Map a planning decision to a workflow route."""
         decision_routes = {

+ 249 - 1
llmops/config.py

@@ -66,6 +66,246 @@ for path in PATHS.values():
 # ============================================================================
 # Configuration validation functions
 # ============================================================================
+# Anomaly recognition configuration
+anomaly_recognizer_config = {
+
+    'execution_mode': 'direct',  # 'direct' or 'agent'; direct mode is the default
+
+    # Enable/disable individual recognizers
+    'enable_balance_recognition': True,          # balance continuity
+    'enable_night_recognition': True,            # night-time transactions
+    'enable_high_frequency_recognition': True,   # high-frequency transactions
+    'enable_transfer_recognition': True,
+    'enable_large_amount_recognition': True,
+    'enable_balance_missing_check': True,
+    'enable_occasional_high_integer_recognition': True,
+    'enable_inactive_account_check': True,
+    'enable_low_interest_rate_recognition': True,        # interest-settlement anomalies
+    'enable_over_book_transaction_recognition': True,    # suspected pass-through (over-book) flows
+
+    # ============ Suspected pass-through (over-book) flow recognition ============
+    'over_book_transaction_recognition': {  # key name matches the enable flag above
+        # Enable/disable switch
+        'enabled': True,
+
+        # Amount detection parameters
+        'amount_threshold': 50000.0,
+        'amount_tolerance': 0.01,
+
+        # Time-window parameters
+        'time_window_days': 7,
+        'min_stay_time_hours': 1,
+
+        # Reasonableness-check parameters
+        'enable_background_check': True,
+
+        # Keywords indicating a legitimate business background
+        'reasonable_background_keywords': [
+            "工资发放", "奖金发放", "绩效发放", "报销款",
+            "货款", "租金收入", "投资款", "贷款", "还款",
+            "采购付款", "支付货款", "缴税", "缴费", "消费",
+            "日常支出", "生活支出", "业务往来", "贸易款"
+        ],
+
+        # High-risk keywords (pass-through signatures)
+        'high_risk_keywords': [
+            "过账", "过渡", "走账", "倒账", "资金划转",
+            "临时周转", "无实际业务", "过渡资金", "资金过桥",
+            "代收代付", "代转", "垫资", "拆借", "内部往来"
+        ],
+
+        # Counterparty correlation analysis
+        'enable_counterparty_check': False,
+        'suspicious_counterparty_patterns': [
+            "相同对手方", "关联公司", "个人账户", "无明显业务关系"
+        ],
+
+        # Severity level
+        'severity_level': 'high',
+
+        # Pattern-detection switches
+        'detect_single_pair': True,
+        'detect_split_pattern': True,
+        'detect_merge_pattern': True,
+
+        # Descriptions
+        'description': '识别疑似过账流水:大额资金在短期内以相同或相近金额进出账户,缺乏真实业务背景',
+        'business_rule': '银行流水核查中,若账户在接收大额资金入账后,7个自然日内发生与该入账金额完全一致(或高度接近)的资金流出,且缺乏合理商业背景、实际业务往来支撑或真实收付需求,资金未发生实质性使用或流转,仅通过账户完成过渡划转,可判定为疑似过账流水。',
+    },
+
+    'low_interest_rate_recognition': {
+        # Enable/disable switch
+        'enabled': True,
+
+        # Interest-rate detection parameters
+        'base_interest_rate': 0.0035,  # benchmark demand-deposit rate (0.35%)
+        'threshold_ratio': 0.5,  # effective rate below this fraction of the benchmark is anomalous
+
+        # Keywords identifying interest-settlement records
+        'interest_keywords': [
+            '结息', '利息', '存款利息', '存息', '利息收入',
+            '活期利息', '定期利息', '利息结算'
+        ],
+
+        # Interest-accrual parameters
+        'assumed_interest_days': 90,  # assumed accrual days (typically 90, quarterly settlement)
+        'annual_days': 360,  # days per year for accrual (banks commonly use 360)
+
+        # Minimum balance required for the calculation
+        'min_balance_for_calc': 100.0,  # minimum balance (CNY) needed to estimate a rate
+
+        # Severity level
+        'severity_level': 'medium',  # anomaly severity (high/medium/low)
+
+        # Fuzzy matching
+        'enable_fuzzy_match': True,  # whether to enable fuzzy keyword matching
+
+        # Descriptions
+        'description': '识别银行流水中利率异常低的结息记录,检测结息金额对应的有效利率是否显著低于基准利率',
+        'business_rule': '银行流水结息核查中,若实际结息金额对应的有效利率显著低于同档期银行公布的活期存款基准利率,或低于账户所属银行同期执行的活期存款利率标准,且无合理利率下浮依据,可判定为结息记录存在异常',
+
+        # Validation rules
+        'validation_rules': {
+            'base_interest_rate_range': (0.0001, 0.1),  # plausible rate range (0.01% - 10%)
+            'threshold_ratio_range': (0.1, 0.9),  # plausible threshold-ratio range
+            'min_balance_for_calc_min': 1.0,  # lower bound for the minimum balance
+            'assumed_interest_days_range': (1, 365),  # accrual-day range
+            'annual_days_allowed': [360, 365]  # allowed days-per-year values
+        }
+    },
+
+    'inactive_account_check': {
+        'inactive_period_days': 3,  # the only required setting
+        'severity_level': 'medium'  # optional
+    },
+
+    'balance_missing_check': {
+        'balance_columns_to_check': ['txBalance'],
+        'missing_severity': 'high',
+        'impact_description': '缺少余额信息将导致无法进行余额连续性检查、资金存量分析等关键异常识别'
+    },
+
+    'balance_recognition': {
+        'balance_tolerance': 0.01,
+        'enable_smart_sorting': True,  # newly added
+        'max_permutation_search': 6,  # newly added
+    },
+
+    # Per-recognizer parameter configuration
+    'integer_recognition': {
+        'amount_threshold': 10000.0,  # integer-amount threshold
+        'multiplier': 5.0,  # multiple-of-average threshold
+        'frequency': 3  # high-frequency threshold
+    },
+
+    # Night-transaction recognition configuration (full version)
+    'night_recognition': {
+        'night_start_hour': 2,  # detection window start hour (0-23)
+        'night_end_hour': 5,  # detection window end hour (0-23)
+        'frequency_threshold_per_hour': 3,  # more than this many transactions per hour counts as high frequency
+        'large_amount_threshold': 50000.0,  # large-amount threshold (CNY)
+
+        # Keywords for industries that legitimately operate at night
+        'night_industry_keywords': [
+            "酒店", "宾馆", "KTV", "酒吧", "夜总会", "网吧", "便利店",
+            "医院", "急救", "急诊", "消防", "公安", "保安", "物流",
+            "运输", "出租车", "网约车", "外卖", "配送"
+        ],
+
+        # Emergency-situation keywords
+        'emergency_keywords': [
+            "急救", "急诊", "抢救", "紧急", "urgent", "emergency",
+            "抢险", "救援", "救灾", "应急", "加急"
+        ],
+
+        # Reasonableness-check options
+        'enable_background_check': True,  # whether to check for a plausible transaction background
+    },
+
+    'high_frequency_recognition': {
+        'frequency_threshold': 2,  # more than this many transactions in one hour is flagged as anomalous
+        'statistic_unit': 'date_hour',  # aggregation unit: date + hour
+        'severity_level': 'high',  # severity: high/medium/low
+        'description': '单小时交易笔数超过阈值,体现资金往来的集中性与活跃度异常'
+    },
+
+    'transfer_recognition': {
+        'amount_threshold': 100000.0,  # large-amount threshold
+        'window_days': 7  # time window (days)
+    },
+
+    'large_amount_recognition': {
+        'amount_threshold': 10000.0,
+        'history_days': 90,
+        'outlier_multiplier': 3.0,
+        'enable_background_check': True,
+        'reasonable_background_keywords': [  # keywords indicating a legitimate background
+            "工资", "奖金", "绩效", "年终奖", "报销", "货款", "租金",
+            "购房款", "装修款", "学费", "医疗费", "保险", "理财",
+            "投资款", "分红", "还款", "借款", "赠与", "遗产"
+        ],
+        'high_risk_keywords': [  # high-risk keywords
+            "赌博", "赌资", "彩票", "博彩", "虚拟货币", "比特币",
+            "地下钱庄", "洗钱", "套现", "非法", "不明", "无摘要"
+        ],
+        'description': '单次交易金额超过阈值且与账户历史行为不匹配,缺乏合理背景'
+    },
+    'occasional_high_integer_transaction': {
+        # Detection-rule parameters
+        'integer_multiple': 10000.0,  # integer-multiple base (default ¥10,000)
+        'outlier_multiplier': 2.0,  # outlier multiple threshold (default 2x)
+
+        # Frequency-analysis parameters
+        'frequency_window_days': 7,  # frequency-analysis window (7 days)
+        'min_occurrences_for_high_frequency': 3,  # minimum occurrences to count as high frequency (3)
+        'gap_std_threshold': 2.0,  # sporadicity threshold (2-day standard deviation of gaps)
+
+        # Result parameters
+        'severity_level': 'high',  # severity (high)
+
+        # Description strings
+        'description': '金额为10,000元整数倍、超过整体平均金额5倍且呈现偶发高频特征的交易...',
+        'business_rule': '银行流水核查中,若存在金额为10,000元整数倍的交易...'
+    }
+}
+
+
+def validate_night_transaction_config(config):
+    """
+    Validate the night-transaction configuration.
+
+    Args:
+        config: night-transaction configuration dict
+
+    Returns:
+        list: list of error messages
+    """
+    errors = []
+
+    night_config = config.get('night_recognition', {})
+
+    # Check the hour range
+    start_hour = night_config.get('night_start_hour', 2)
+    end_hour = night_config.get('night_end_hour', 5)
+
+    if not (0 <= start_hour <= 23):
+        errors.append(f"夜间开始小时必须在0-23之间: {start_hour}")
+
+    if not (0 <= end_hour <= 23):
+        errors.append(f"夜间结束小时必须在0-23之间: {end_hour}")
+
+    # Check the thresholds
+    if night_config.get('frequency_threshold_per_hour', 3) <= 0:
+        errors.append("高频阈值必须大于0")
+
+    if night_config.get('large_amount_threshold', 50000.0) <= 0:
+        errors.append("大额阈值必须大于0")
+
+    return errors
+
+
 def validate_config():
     """
     Validate that the system configuration is correct
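Taken together, the low_interest_rate_recognition parameters above imply a simple effective-rate test. The sketch below illustrates that reading of the config; the function name and exact formula are assumptions for illustration, not the committed recognizer:

def is_low_interest(interest: float, balance: float,
                    base_interest_rate: float = 0.0035,
                    threshold_ratio: float = 0.5,
                    assumed_interest_days: int = 90,
                    annual_days: int = 360,
                    min_balance_for_calc: float = 100.0) -> bool:
    # Balances below the floor cannot yield a meaningful rate estimate
    if balance < min_balance_for_calc:
        return False
    # Annualize the credited interest over the assumed accrual period
    effective_rate = (interest / balance) * (annual_days / assumed_interest_days)
    # Anomalous when the effective rate falls below the benchmark fraction
    return effective_rate < base_interest_rate * threshold_ratio

# A ¥10 quarterly credit on a ¥50,000 balance annualizes to about 0.08%,
# below 0.5 * 0.35% = 0.175%, so it would be flagged.
assert is_low_interest(10.0, 50_000.0)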
@@ -87,6 +327,11 @@ def validate_config():
     if not DEEPSEEK_BASE_URL.startswith("https://"):
         errors.append("DEEPSEEK_BASE_URL必须是HTTPS URL")
 
+
+    # Also validate the night-transaction configuration
+    night_errors = validate_night_transaction_config(anomaly_recognizer_config)
+    errors.extend(night_errors)
+
     return errors
 
 # ============================================================================
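For reference, this is what the new validator reports for an out-of-range configuration (the input values are illustrative):

bad = {'night_recognition': {'night_start_hour': 26, 'frequency_threshold_per_hour': 0}}
print(validate_night_transaction_config(bad))
# ['夜间开始小时必须在0-23之间: 26', '高频阈值必须大于0']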
@@ -130,4 +375,7 @@ multimodal_api_url = "http://103.154.31.78:20012/api/file/read"
 
 # Whether to fetch section-writing prompts from the knowledge platform
 # 0: no 1: yes
-enable_kp_rc_prompts = 0
+enable_kp_rc_prompts = 0

+ 50 - 1
llmops/workflow_state.py

@@ -120,6 +120,12 @@ class IntegratedWorkflowState(TypedDict):
     completeness_score: float
     answer: Optional[str]  # 最终答案
 
+    # === Anomaly recognition layer ===
+    anomaly_recognition_completed: bool
+    anomaly_recognition_results: Optional[Dict[str, Any]]
+    anomaly_report_path: Optional[str]
+    anomaly_summary: Dict[str, Any]
+
     # === Conversation and message layer ===
     messages: List[Dict[str, Any]]  # Big Agent message format
     current_node: str
@@ -214,7 +220,13 @@ def create_initial_integrated_state(question: str, industry: str, original_file_
 
         # Calculation-mode configuration layer
         "use_rules_engine_only": False,
-        "use_traditional_engine_only": False
+        "use_traditional_engine_only": False,
+
+        # Anomaly recognition layer
+        "anomaly_recognition_completed": False,
+        "anomaly_recognition_results": None,
+        "anomaly_report_path": None,
+        "anomaly_summary": {},
     }
 
 
@@ -388,4 +400,41 @@ def update_state_with_data_standardize(state: IntegratedWorkflowState, is_succ:
         "timestamp": datetime.now().isoformat()
     })
 
+    return new_state
+
+
+def update_state_with_anomaly_recognition(state: IntegratedWorkflowState, recognition_results: Dict[str, Any],
+                                          report_path: str) -> IntegratedWorkflowState:
+    """
+    Update the workflow state with anomaly recognition results.
+
+    Args:
+        state: current state
+        recognition_results: anomaly recognition results
+        report_path: path to the anomaly report
+
+    Returns:
+        the updated state
+    """
+    new_state = state.copy()
+    new_state["anomaly_recognition_completed"] = True
+    new_state["anomaly_recognition_results"] = recognition_results
+    new_state["anomaly_report_path"] = report_path
+
+    # Extract the condensed summary
+    summary = recognition_results.get('summary', {})
+    new_state["anomaly_summary"] = {
+        "total_anomalies": summary.get('total_identified_anomalies', 0),
+        "anomaly_ratio": summary.get('recognition_ratio', '0%'),
+        "severity_distribution": summary.get('severity_distribution', {}),
+        "anomaly_distribution": summary.get('anomaly_distribution', {})
+    }
+
+    # Record a message in the conversation log
+    new_state["messages"].append({
+        "role": "assistant",
+        "content": f"🔍 异常识别完成:发现 {summary.get('total_identified_anomalies', 0)} 条异常",
+        "timestamp": datetime.now().isoformat()
+    })
+
     return new_state
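A minimal round trip through the updater above, with hand-made recognizer output; the values are illustrative, and state is assumed to come from create_initial_integrated_state:

results = {"summary": {
    "total_identified_anomalies": 4,
    "recognition_ratio": "1.3%",
    "severity_distribution": {"high": 1, "medium": 3},
    "anomaly_distribution": {"夜间交易": 3, "疑似过账交易": 1},
}}
state = update_state_with_anomaly_recognition(state, results, "outputs/anomaly_reports/demo.json")
assert state["anomaly_recognition_completed"] is True
assert state["anomaly_summary"]["total_anomalies"] == 4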