Przeglądaj źródła

提示词调整

jiayongqiang 5 dni temu
rodzic
commit
a828ea7c6c
29 zmienionych plików z 244 dodań i 168 usunięć
  1. 3 2
      agent/config.ini
  2. BIN
      agent/dist/agent-0.1.8.3-py3-none-any.whl
  3. BIN
      agent/dist/agent-0.1.8.3-py3-none-any.whl.zip
  4. BIN
      agent/dist/agent-0.1.8.3.tar.gz
  5. BIN
      agent/logs/aitagging-app.2026-04-10_10-20-00_803764.log.zip
  6. BIN
      agent/logs/aitagging-app.2026-05-06_18-50-04_192821.log.zip
  7. BIN
      agent/logs/aitagging-app.2026-05-07_16-18-49_120948.log.zip
  8. BIN
      agent/logs/aitagging-app.2026-05-08_09-57-47_624813.log.zip
  9. BIN
      agent/logs/aitagging-app.2026-05-12_10-11-58_363564.log.zip
  10. BIN
      agent/logs/aitagging-app.2026-05-13_15-07-47_974217.log.zip
  11. BIN
      agent/logs/aitagging-app.2026-05-14_10-30-41_494585.log.zip
  12. BIN
      agent/logs/aitagging-app.2026-05-15_10-29-03_563382.log.zip
  13. BIN
      agent/logs/aitagging-app.2026-05-20_15-26-06_549713.log.zip
  14. BIN
      agent/logs/aitagging-app.2026-05-21_11-09-34_131485.log.zip
  15. BIN
      agent/logs/aitagging-app.2026-06-02_15-32-55_749414.log.zip
  16. BIN
      agent/logs/aitagging-app.2026-06-16_11-37-39_368494.log.zip
  17. BIN
      agent/logs/aitagging-app.2026-06-22_16-10-14_996741.log.zip
  18. 8 48
      agent/src/agent/agent.py
  19. 11 1
      agent/src/agent/api_inner.py
  20. 37 29
      agent/src/agent/api_outter.py
  21. 60 43
      agent/src/agent/core/es.py
  22. 4 2
      agent/src/agent/core/vector.py
  23. 7 4
      agent/src/agent/main.py
  24. 4 35
      agent/tests/test_es.py
  25. 4 0
      agent/tests/test_reg1.py
  26. 74 0
      agent/tests/test_report.py
  27. BIN
      agent/tests/test_report.zip
  28. 27 4
      agent/tests/test_tagging.py
  29. 5 0
      agent/tests/test_vector.py

+ 3 - 2
agent/config.ini

@@ -18,10 +18,11 @@ timeout = 60
 
 [embedding]
 model = Qwen3-Embedding-8B
-base_url = http://10.192.72.11:18081/v1/embeddings
-api_key =
+base_url = http://10.192.72.12:18081/v1/embeddings
 default_dims = 4096
 enable_config = false
+api_key =
+
 
 [es]
 url = http://10.192.72.13:9200

BIN
agent/dist/agent-0.1.8.3-py3-none-any.whl


BIN
agent/dist/agent-0.1.8.3-py3-none-any.whl.zip


BIN
agent/dist/agent-0.1.8.3.tar.gz


BIN
agent/logs/aitagging-app.2026-04-10_10-20-00_803764.log.zip


BIN
agent/logs/aitagging-app.2026-05-06_18-50-04_192821.log.zip


BIN
agent/logs/aitagging-app.2026-05-07_16-18-49_120948.log.zip


BIN
agent/logs/aitagging-app.2026-05-08_09-57-47_624813.log.zip


BIN
agent/logs/aitagging-app.2026-05-12_10-11-58_363564.log.zip


BIN
agent/logs/aitagging-app.2026-05-13_15-07-47_974217.log.zip


BIN
agent/logs/aitagging-app.2026-05-14_10-30-41_494585.log.zip


BIN
agent/logs/aitagging-app.2026-05-15_10-29-03_563382.log.zip


BIN
agent/logs/aitagging-app.2026-05-20_15-26-06_549713.log.zip


BIN
agent/logs/aitagging-app.2026-05-21_11-09-34_131485.log.zip


BIN
agent/logs/aitagging-app.2026-06-02_15-32-55_749414.log.zip


BIN
agent/logs/aitagging-app.2026-06-16_11-37-39_368494.log.zip


BIN
agent/logs/aitagging-app.2026-06-22_16-10-14_996741.log.zip


+ 8 - 48
agent/src/agent/agent.py

@@ -19,6 +19,7 @@ model = config['llm']['model']
 # max_retries和timeout也从配置文件中读取,增加了默认值,以防止配置文件中缺失这两个参数导致的错误
 max_retries = config['llm'].getint("max_retries", 2)
 timeout = config['llm'].getint("timeout", 60)
+max_tokens = config['llm'].getint("max_tokens", 4096)
 logger.info(f"model_name:{model}, base_url:{base_url}")
 llm = init_chat_model(
     model_provider="openai", 
@@ -27,7 +28,8 @@ llm = init_chat_model(
     base_url=base_url,
     temperature= temperature,
     extra_body={"enable_thinking": False},
-    timeout=timeout
+    timeout=timeout,
+    max_tokens=max_tokens
 )
 
 class Lable(BaseModel):
@@ -39,7 +41,7 @@ class Lable(BaseModel):
     tag_name: str = Field(description="标签名称")
     tag_path: str = Field(description="标签路径")
     category_id: str = Field(description="标签类别ID")
-    desc: str = Field(description="给出简要的理由说明该标签为何被保留或剔除。")
+    desc: str = Field(description="给出简要的理由解释说明该标签为何被保留或剔除。输出解释内容不要超过100个字符。")
     passr:bool = Field(description="是否保留该标签,true表示保留,false表示剔除。")
 
 
@@ -57,8 +59,9 @@ def reflect_check_sync(context: str,is_marine: bool, labels: list[str]):
 1. 沿海行社标识参数为is_marine
     - is_marine=1表示是沿海行社,如果候选标签中包含海洋体系的标签,则在判定时可以适当放宽对海洋标签的语义关联要求;
     - is_marine=0表示不是沿海行社,判定时忽略此参数
-2. 不区分职业、投向、用途等属性的界限,凡包含相关内容即参与打标
-    """
+2. 职业、投向、用途、主营业务等属性分别独立判断,不考虑其他属性的关联关系,每个属性的判断结果独立,不依赖于其他属性的判断结果。
+3. 命中规则宽松一点,不要因为职业、投向、用途、主营业务这几个属性不相关或者互斥而否定候选标签。
+       """
     agent = create_agent(
         model = llm, 
         system_prompt=system_prompt,
@@ -72,49 +75,6 @@ def reflect_check_sync(context: str,is_marine: bool, labels: list[str]):
             ),
         ]
     )
-#     prompt = f"""
-#     一、角色与职责
-# 你是一名银行智能标签平台中的语义判定模型,负责基于贷款信息文本内容,对候选标签进行相关性判断和筛选。
-# 你的核心职责是从“贷款打标”的视角,评估文本中所描述的资产、项目、合同、用途或相关经济活动,是否与标签定义所描述的产业活动在语义上存在明确、合理、可引用的关联。
-
-# 二、基本工作原则
-# 1. 仅依据所提供的文本内容进行判断,不得引入外部资料。
-# 2. 不要求文本构成完整的行业介绍或业务说明。
-# 3. 不要求覆盖标签定义的全部要素。
-# 4. 所有结论必须能够回溯到具体文本证据。
-# 5. “语义不矛盾”不等于“语义有关联”。仅凭无法排除可能性就推荐标签属于错误判定。
-# 6. 若存在合理支持但信息不完整,可判定命中并通过不确定性说明体现风险,但绝不允许在没有任何语义连接点的情况下强行命中。
-# 7. 转贷操作词无效原则:如果文本中出现“无还本续贷”“借新还旧”“二押”“收回再贷”“压降转贷”等表述,这些属于贷款操作形式,其本身不能作为任何产业标签关联的实质依据。判定时必须忽略这些操作词,重点分析文本中括号内补充的内容,或上下文中的“投向”“用途”等字段里的实质性描述。若实质内容缺失,无法判断产业关联,则所有标签均判false。此原则适用于所有产业标签体系。
-
-# 三、关键判定标准(必须遵守)
-# 1. 关联性必须具体:文本中必须出现某个实体、行为、商品、服务、场景或用途,能够与标签定义中的产业活动形成直接或可解释的间接关联。
-# 2. 常识否定原则:若文本所描述的活动具有明显与该标签所属产业体系核心特征相悖的常识性冲突(例如在海洋经济体系中,茶叶种植等典型内陆活动不得推荐涉海标签),则不得推荐该标签。
-# 3. 排除情形必须严格执行:若文本内容明确符合标签定义中的排除说明,应判定为不命中;若标签定义无明确排除,但常识上显著冲突,也应判定为不命中。
-
-# 四、海洋经济体系专属规则
-# 本部分规则仅适用于标签所属产业体系为“海洋经济”的标签。体系归属可通过标签定义中的体系名称或标签路径前缀(如"海洋产业"/"海洋科研教育"/"海洋公共管理服务"/"海洋上游相关产业"/"海洋下游相关产业"等)识别。对于养老产业、科技金融等其他体系的标签,请忽略本部分所有内容,直接进入第五步进行标准语义判定。
-
-# 当前贷款发起行社的沿海标识为 {is_marine} (1=沿海行社,0=非沿海行社),该参数仅在处理海洋经济体系标签时生效。
-
-# 1. 教育场景排除规则
-# 如果文本中的用途涉及“学费”“住宿费”“培训费”“杂费”等教育类支出,且投向描述为“普通高等教育”“大学”“学院”等未明确指向海洋相关专业的学历教育:
-# a. 绝对不得推荐“海洋中等职业教育”“海洋社会人文科学研究”及其他海洋教育/科研类标签。
-# b. 除非文本中明确出现了“海洋”“航海”“水产”等涉海专业名称或科研机构,否则视为不命中。
-
-# 2. 沿海行社宽泛匹配规则
-# 当 {is_marine} = 1 时,启用以下扩展推理授权:
-# a. 对于文本中出现的住宿、餐饮、装修、仓储、物流、运输、批发零售等具有服务或流通属性的行业性描述,允许在语义上进行适度延伸,将标签定义中最相近的涉海服务类标签(如“涉海旅游消费”“涉海物流”“涉海批发经营”等)纳入候选判断。
-# b. 进行此类延伸时,文本中仍必须存在可关联的实体或行为,不得凭空推荐。延伸推荐的理由中应注明“基于沿海区域宽泛匹配推断,需人工确认”。
-# 当 {is_marine} = 0 时,禁用上述扩展推理,严格按照标准语义关联进行判断,不允许任何无直接证据的延伸推荐。
-
-# 五、候选标签
-# 以下是本次判定所依据的产业标签定义信息:
-#     {labels}
-
-# 六、贷款信息
-#     {context}   
-
-#     """
 
     user_message = f"""
 is_marine: {1 if is_marine else 0}
@@ -138,7 +98,7 @@ labels: {json.dumps(labels, ensure_ascii=False)}
         logger.info(f"{context} LLM result: {result}")
         # 只保留passr为true的标签,并将结果转换为字典列表格式
         result =  [r.dict() for r in result.labels ]
-        result = json.dumps(result, ensure_ascii=False)
+        
         l2 = datetime.now().isoformat()
         logger.info(f"Reflection check completed with uuid {uid}. timestamp: {l2}, consuming {(datetime.fromisoformat(l2) - datetime.fromisoformat(l1)).total_seconds()} seconds.")
 

+ 11 - 1
agent/src/agent/api_inner.py

@@ -9,6 +9,8 @@ from fastapi import BackgroundTasks
 from agent.logger import logger
 from agent.core.config import get_config_path
 from typing import Optional
+from agent.core.es import create_index
+
 config = get_config_path()
 
 router = APIRouter(prefix="/v1", tags=["平台内部接口"])
@@ -98,7 +100,7 @@ def load_category_2_es(category_id: str):
         "tag_reg": label[4],
         "tag_prompt": label[9],
         "category_id":label[10],
-        "tag_vector": get_embeddings([(label[6] or '')+"\n"+(label[3] or '')])[0] 
+        "tag_vector": get_embeddings([(label[1] or '')+"\n"+(label[6] or '')+"\n"+(label[3] or '')])[0] 
     } for label in labels])
     return labels
 
@@ -133,6 +135,14 @@ def synchronize_category(request: SynchronizeTagRequest):
         "message": "synchronize_category successful"
     }
 
+@router.get("/create_index_force")
+def create_index_force():
+    """Force re-creation of the Elasticsearch index via ``create_index(force=True)``.
+
+    Returns a fixed ``{"code": 200, "message": ...}`` payload once the call
+    returns; any exception raised by ``create_index`` is not handled here.
+
+    NOTE(review): this is a state-changing operation exposed as a plain GET
+    route, and no authentication is visible in this hunk -- confirm that access
+    is restricted upstream. Presumably the tag documents have to be
+    re-synchronised afterwards; verify against ``create_index``.
+    """
+    create_index(force=True)
+    return {
+        "code": 200,
+        "message": "create_index_force successful"
+    }
+
 class GenerateRegRequest(BaseModel):
     tag_name: str = Field(..., description="标签名称")
     tag_remark: str = Field(..., description="标签定义")

+ 37 - 29
agent/src/agent/api_outter.py

@@ -22,7 +22,7 @@ import requests
 
 
 config = get_config_path()
-CONCURRENCE = int(config['app']['concurrence'])
+CONCURRENCE = config['app'].getint("concurrence", 1)
 # background_semaphore = threading.BoundedSemaphore(CONCURRENCE)
 executor = ThreadPoolExecutor(max_workers=CONCURRENCE)
 ESB_CALLBACK = config['app']['esb_callback']
@@ -65,7 +65,7 @@ def reg_match(query:str, reglist:list[any]):
     result = []
     for id,reg in reglist:
         try:
-            if re.search(reg, query):
+            if re.search(reg, query,re.DOTALL | re.MULTILINE):
                 result.append(id)
         except re.error:
             continue
@@ -95,23 +95,23 @@ def execute_reg(log_id:str,tag_category_id:str,phrase: str)-> list:
     logger.info(f"[{log_id}] Regex filtering result: {result}")
     return result
 
-def vector_similarity_search(log_id:str,phrase: str,tag_ids:list[str]=None)-> list:
+def vector_similarity_search(log_id:str,phrase: str,tag_ids:list[str]=None):
     logger.info("Starting vector similarity search...")
     # 这里应该调用向量数据库进行相似度检索,返回相关标签id列表
     l1 = time.time()
-    query = get_embeddings([phrase])[0]
+    query = get_embeddings([f"{phrase}"])[0]
     l2 = time.time()
     logger.info(f"[{log_id}] Vector embedding time: {l2-l1}")
     l3 = time.time()
     rrf_score_threshold = float(config['es'].get('rrf_score_threshold',0.016))
-    results = bm25_vector_search(phrase,query,tag_ids=tag_ids,rrf_score_threshold=rrf_score_threshold)
+    results,full_results = bm25_vector_search(phrase,query,tag_ids=tag_ids,rrf_score_threshold=rrf_score_threshold)
     l4 = time.time()
     logger.info(f"[{log_id}] Vector search time: {l4-l3}")
     dao.execute(
             """UPDATE aitag_tag_log SET tagging_channel = %s WHERE id = %s""",
             (TAGGING_CHANNEL.VECTOR.value, log_id)    
         )
-    return results
+    return results,full_results
 
 def init_tag_log(request: TaggingRequest):
     id = uuid.uuid4().hex
@@ -195,20 +195,22 @@ order by r.tag_type,r.tag_nm,r.defined_rule desc"""
         if rules and len(rules) > 0:
             for matched in rules:
                 try:
-                    if re.search(matched[2], phrase):
+                    if re.search(matched[2], phrase,re.DOTALL | re.MULTILINE):
                         tag_info = dao.query("""select ati.id,ati.category_id, ati.tag_nm, ati.tag_path,ati.tag_code from aitag_tag_info ati left join aitag_tag_category atc on ati.category_id = atc.id where ati.tag_nm = %s and ati.is_delete = 0 and atc.category_code = %s""", (matched[1], matched[0]))
                         # 安全检查:只有当 tag_info 有数据时才加入结果
-                        if tag_info and len(tag_info) > 0 and tag_info[0][0] not in seen_ids:
-                            seen_ids.add(tag_info[0][0])
-                            result.append({
-                                "id": tag_info[0][0],
-                                "desc": generate_html(phrase, matched[2], tag_info[0][2]),
-                                "passr": True,
-                                "tag_code": tag_info[0][4],
-                                "tag_name": tag_info[0][2],
-                                "tag_path": tag_info[0][3],
-                                "category_id": tag_info[0][1]
-                            })
+                        if tag_info and len(tag_info) > 0:
+                            for ti in tag_info: 
+                                if  ti[0] not in seen_ids:
+                                    seen_ids.add(ti[0])
+                                    result.append({
+                                        "id": ti[0],
+                                        "desc": generate_html(phrase, matched[2], ti[2]),
+                                        "passr": True,
+                                        "tag_code": ti[4],
+                                        "tag_name": ti[2],
+                                        "tag_path": ti[3],
+                                        "category_id": ti[1]
+                                    })
                 except Exception as e:
                     logger.error(f"Defined rule match failed 1: {e}")
     except Exception as e:
@@ -236,32 +238,32 @@ def run_ai_pipeline(log_id: str, tag_category_id: str, phrase: str, instucde: Op
 
             # step0.5: 预设规则匹配,如果匹配成功则直接更新结果并结束打标流程
             defined_rule_result = defined_rule_match(phrase)    
-            if defined_rule_result:
-                logger.info(f"预设规则匹配成功,直接返回结果: {defined_rule_result}")
-                end_tagging_predefined_rule(log_id, json.dumps(defined_rule_result),business_attr)
-                return
+            # if defined_rule_result:
+            #     logger.info(f"预设规则匹配成功,直接返回结果: {defined_rule_result}")
+            #     end_tagging_predefined_rule(log_id, json.dumps(defined_rule_result),business_attr)
+            #     return
 
             # step1: 正则过滤
             result = execute_reg(log_id,tag_category_id,phrase)
             logger.info(f"正则过滤结果: {result}")
+
             # step2: 向量检索
-            # if not result or len(result) == 0 or len(result) >TOP_N: # 正则过滤结果过多或没有结果都进行向量检索,避免正则规则不完善导致的漏匹配问题,同时也避免正则规则过于宽泛导致的过多匹配问题
-            v_result = vector_similarity_search(log_id,phrase)
+            v_result,full_results = vector_similarity_search(log_id,phrase)
             logger.info(f"向量检索结果: {v_result}")
             # step2.5: 合并结果,取交集优先,交集为空则取并集
             if  result and len(result) > 0:
                 v_result1 = list(set(result) & set(v_result)) # 取交集,既满足正则规则又满足向量相似度的标签,优先级更高
                 if v_result1 and len(v_result1) > 0:
-                    result = v_result1
-                    logger.info(f"交集结果: {v_result1}")
+                    result = v_result
                 else:
                     result = list(set(result) | set(v_result)) # 取并集,满足正则规则或者满足向量相似度的标签
                     if result and len(result) > TOP_N:
-                       result = vector_similarity_search(log_id,phrase,tag_ids=result) # 如果合并后结果过多,则再次进行向量检索过滤一次
+                       result,full_results = vector_similarity_search(log_id,phrase,tag_ids=result) # 如果合并后结果过多,则再次进行向量检索过滤一次
                        logger.info(f"并集后再次向量检索结果: {result}")
             else:
                 result = v_result
             logger.info(f"最终候选结果: {result}")
+            logger.info(f"排序结果: {full_results}")
             # step3: LLM 打标
             if result and len(result) > 0:
                 try:
@@ -269,14 +271,20 @@ def run_ai_pipeline(log_id: str, tag_category_id: str, phrase: str, instucde: Op
                     logger.info(f"筛选结果: {tags}")
                     from agent.agent import reflect_check_sync
                     result, x_input = reflect_check_sync(phrase,is_marine, tags)
+                    if result and len(result) > 0:
+                        for item in result:
+                            _id = item["id"]
+                            if _id in full_results:
+                                item["rank"] = full_results[_id]["rank"]
+                                item["score"] = full_results[_id]["score"]
                 except Exception as e:
                     logger.error(f"LLM reflection check failed: {e}")
                     result = None
                     fail_tagging(log_id)
                     return
             # step4: 更新数据库
-            # 如果result是个空集合,插入None
-            end_tagging(log_id, result if result else None,x_input)
+            merged_list = list({item["id"]: item for item in result + defined_rule_result}.values())
+            end_tagging(log_id, json.dumps(merged_list, ensure_ascii=False),x_input)
             
     except Exception as e:
         logger.error(f"[{log_id}] Pipeline failed: {e}")

+ 60 - 43
agent/src/agent/core/es.py

@@ -18,7 +18,8 @@ es = Elasticsearch(
 )
 
 INDEX_NAME = "ai-tagging"
-if not es.indices.exists(index=INDEX_NAME):
+
+def _create_index():
     es.indices.create(
         index=INDEX_NAME,
         settings={
@@ -26,48 +27,57 @@ if not es.indices.exists(index=INDEX_NAME):
                 "similarity": {
                     "my_bm25": { 
                     "type": "BM25",
-                    "b": "0.9",   
-                    "k1": "0.6"   
+                    "b": 0.1,   
+                    "k1": 1.5,   
+                }
+            },
+            "analysis": {
+                "analyzer": {
+                    "my_custom_analyzer": {
+                        "type": "custom",
+                        "tokenizer": "standard", 
+                        "filter": [
+                            "lowercase", 
+                            "my_stop_filter" 
+                        ]
                     }
                 },
-                "analysis": {
-                    "analyzer": {
-                        "my_custom_analyzer": {
-                            "type": "custom",
-                            "tokenizer": "standard", 
-                            "filter": [
-                                "lowercase", 
-                                "my_stop_filter" 
-                            ]
-                        }
-                    },
-                    "filter": {
-                        "my_stop_filter": {
-                            "type": "stop",
-                            "stopwords": ["的", "了", "是", "在", "职","业","投","向","海","洋","水"] 
-                        }
+                "filter": {
+                    "my_stop_filter": {
+                        "type": "stop",
+                        "stopwords": ["的", "了", "是", "在", "职","业","投","向","海","洋","水"] 
                     }
                 }
-            },
+            }
         },
-        mappings={
-            "properties": {
-                "tag_code": {"type": "text"},
-                "tag_name": {"type": "text"},
-                "tag_path": {"type": "text","similarity": "my_bm25"},
-                "tag_level": {"type": "integer"},
-                "tag_remark": {"type": "text"},
-                "tag_reg": {"type": "text"},
-                "tag_prompt": {"type": "text"},
-                "category_id": {"type": "text"},
-                "tag_vector": {
-                    "type": "dense_vector",
-                    "dims": DIMS,
-                    "index": True,          # 必须为 True 才能做 kNN 搜索
-                    "similarity": "cosine"
-                }
+    },
+    mappings={
+        "properties": {
+            "tag_code": {"type": "text"},
+            "tag_name": {"type": "text"},
+            "tag_path": {"type": "text","similarity": "my_bm25"},
+            "tag_level": {"type": "integer"},
+            "tag_remark": {"type": "text"},
+            "tag_reg": {"type": "text"},
+            "tag_prompt": {"type": "text"},
+            "category_id": {"type": "text"},
+            "tag_vector": {
+                "type": "dense_vector",
+                "dims": DIMS,
+                "index": True,          # 必须为 True 才能做 kNN 搜索
+                "similarity": "cosine"
             }
-        })
+        }
+    })
+
+def create_index(force=False):
+    """Make sure the index exists; with ``force`` drop it first and rebuild it.
+
+    :param force: when true, an already existing index is deleted before
+        ``_create_index()`` is called; when false, an existing index is kept.
+    """
+    already_there = es.indices.exists(index=INDEX_NAME)
+    if already_there and not force:
+        # Index is present and no rebuild was requested: nothing to do.
+        return
+    if already_there:
+        es.indices.delete(index=INDEX_NAME)
+    _create_index()
+
 
 def upsert_document(doc_id, document):
     es.update(index=INDEX_NAME, id=doc_id, doc=document,doc_as_upsert=True)
@@ -144,11 +154,10 @@ def hybrid_search(query_vector):
     return [item["_id"] for item in r]
 
 def bm25_vector_search(querystr:str,query_vector,tag_ids:list[str]=None,rrf_score_threshold=0.016):
-    
     query = {
             "multi_match": {
                 "query": querystr,
-                "fields": ["tag_path", "tag_remark"]
+                "fields": ["tag_name^3","tag_reg^2","tag_path^2", "tag_remark"]
             }
         }
     if tag_ids and tag_ids != [] and len(tag_ids) > 0:
@@ -158,7 +167,7 @@ def bm25_vector_search(querystr:str,query_vector,tag_ids:list[str]=None,rrf_scor
                     {
                         "multi_match": {
                             "query": querystr,
-                            "fields": ["tag_path", "tag_remark"]
+                            "fields": ["tag_name^3","tag_reg^2","tag_path^2", "tag_remark"]
                         }
                     }
                 ],
@@ -200,7 +209,7 @@ def bm25_vector_search(querystr:str,query_vector,tag_ids:list[str]=None,rrf_scor
         doc_id = hit['_id']
         score = rrf_scores.get(doc_id, 0.0) 
         rrf_scores[doc_id] = score + (1.0 / (RRF_CONST + rank))
-    
+  
     for rank, hit in enumerate(resp_vector['hits']['hits'], start=1):
         hit["_source"]["tag_vector"] = None
         doc_id = hit['_id']
@@ -208,8 +217,16 @@ def bm25_vector_search(querystr:str,query_vector,tag_ids:list[str]=None,rrf_scor
         rrf_scores[doc_id] = score + (1.0 / (RRF_CONST + rank))
     
     sorted_ids = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
-    result =  [id for id,score in sorted_ids if score>rrf_score_threshold]
-    return result[:TOP_N]
+    result =  [id for id,score in sorted_ids]
+    full_results = {}
+    for rank, (id, score) in enumerate(sorted_ids, start=1):
+        full_results[str(id)] = {
+            "score": score,
+            "rank": rank
+        }
+    return result[:TOP_N],full_results
+
+create_index(force=False)
 
 if __name__ == "__main__":
     results = search_all()

+ 4 - 2
agent/src/agent/core/vector.py

@@ -13,6 +13,8 @@ enable_config = config["embedding"]["enable_config"]
 if isinstance(enable_config, str):
     enable_config = enable_config.lower() in ['true', '1', 'yes']
 def get_embeddings(texts):
+    prefix = "Represent the document for semantic search: "
+    texts = [prefix + text for text in texts]
     url = base_url
     headers = {
         "Authorization": api_key,
@@ -36,5 +38,5 @@ if __name__ == "__main__":
     texts = ["Hello, world!", "How are you?"]
     embeddings = get_embeddings(texts)
     # print(embeddings)
-    logger.info(f"Embedding for '{texts[0]}': len({embeddings[0]})") 
-    logger.info(f"Embedding for '{texts[0]}': {embeddings[0][:5]}...")  # 打印前5个维度的值
+    logger.info(f"Embedding for '{texts[0]}': {len(embeddings[0])}") 
+    # logger.info(f"Embedding for '{texts[0]}': {embeddings[0][:5]}...")  # 打印前5个维度的值

+ 7 - 4
agent/src/agent/main.py

@@ -8,15 +8,16 @@ from agent.api_outter import batch_run_async
 from agent.core.config import get_config_path
 
 logger.info("ai-tagging starting!")
-breaker = get_config_path()['app'].getboolean("breaker", False)
+config = get_config_path()
+breaker = config['app'].getboolean("breaker", False)
+CONCURRENCE = config['app'].getint("concurrence", 1)
 api_router = APIRouter()
 api_router.include_router(inner_router,prefix="/admin")
 api_router.include_router(outter_router,prefix="")
 
-from agent.core.config import get_config_path
-config = get_config_path()
+
 # 如果没有配置port,默认使用9876
-port = int(config['app'].get('port', 9876))
+port = config['app'].getint("port", 9876)
 
 # 1. 定义 lifespan 事件
 @asynccontextmanager
@@ -39,6 +40,8 @@ print('API routes initialized')
 def main():
     logger.info("Starting AI Tagging Service...")
     logger.info(f"挡板状态: {breaker}")
+    logger.info(f"并发数: {CONCURRENCE}")
+    logger.info(f"端口: {port}")
     uvicorn.run(app, host="0.0.0.0", port=port)
 
 if __name__ == "__main__":

+ 4 - 35
agent/tests/test_es.py

@@ -1,36 +1,5 @@
-from agent.core.es import es
-from agent.core.vector import get_embeddings
-query = '职业:电动机加工制造,投向:电动机加工制造,用途:电动机加工制造'
-resp_bm25 = es.search(
-        index='ai-tagging',
-        size=10,
-        query={
-            "multi_match": {
-                "query": query,
-                "fields": ["tag_path", "tag_remark", "tag_prompt"]
-            }
-        }
-    )
-for rank, hit in enumerate(resp_bm25['hits']['hits'], start=1):
-    hit["_source"]["tag_vector"] = None
-    doc_id = hit['_id']
-    tag_nm = hit['_source']['tag_name']
-    print(doc_id,tag_nm)
-print('-----------------')
+import requests
+
+
+def main():
+    """Call the admin endpoint that force-recreates the ES index and print the reply."""
+    # Explicit timeout: requests waits indefinitely when none is given.
+    response = requests.get(
+        "http://10.192.72.13:9876/api/aitag/admin/v1/create_index_force",
+        timeout=60,
+    )
+    print(response.text)
+
+
+# Guarded so that merely importing this module (e.g. pytest collecting
+# tests/test_*.py) does not trigger the destructive index rebuild.
+if __name__ == "__main__":
+    main()
 
-query_vector = get_embeddings([query])[0]
-resp_vector = es.search(
-        index='ai-tagging',
-        size=10,
-        knn={
-            "field": "tag_vector",
-            "query_vector": query_vector,
-            "k": 10,
-            "num_candidates": 100
-        }
-    )
-for rank, hit in enumerate(resp_vector['hits']['hits'], start=1):
-    hit["_source"]["tag_vector"] = None
-    doc_id = hit['_id']
-    tag_nm = hit['_source']['tag_name']
-    print(doc_id,tag_nm)

+ 4 - 0
agent/tests/test_reg1.py

@@ -1,6 +1,10 @@
 import re
+import json
 
 
 print(re.search('海洋天然气及可燃冰开采1', """职业:海洋天然气及可燃冰开采,
                 投向:海洋天然气及可燃冰开采1,
                 用途:海洋天然气及可燃冰开采"""))
+result = None
+result = json.dumps(result, ensure_ascii=False)
+print(result)

+ 74 - 0
agent/tests/test_report.py

@@ -0,0 +1,74 @@
+"""Ad-hoc statistics over ``aitag_tag_log``: hits per category and per retrieval rank."""
+import json
+from collections import Counter
+from datetime import datetime
+
+from agent.core.dao import query
+
+SQL = """
+select result, tagging_channel,id,phrase,ai_result_starttime,ai_result_endtime from aitag_tag_log  order by insert_time desc
+"""
+TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
+# Channel value counted as "tagged through the mapping rules".
+MAPPING_RULE_CHANNEL = 2
+# category_id -> counter name
+CATEGORY_COUNTERS = {
+    "f47ac10b-58cc-4372-a567-0e02b2c3d479": "marine",   # marine tags
+    "0a2dc889-6205-4cb2-be31-d67c6390a0d6": "tech",     # technology tags
+    "cd4de5d4-491f-4779-8d96-9246c861e907": "pension",  # elderly-care tags
+}
+
+
+def _to_datetime(value):
+    """Accept either a ``datetime`` or a ``YYYY-MM-DD HH:MM:SS`` string."""
+    if isinstance(value, datetime):
+        return value
+    return datetime.strptime(value, TIME_FORMAT)
+
+
+def _load_tags(raw):
+    """Return the tag dicts stored in one ``result`` cell.
+
+    The cell may arrive as JSON text or as an already decoded list; anything
+    that is not a list of dicts yields no tags instead of raising.
+    """
+    if isinstance(raw, (str, bytes, bytearray)):
+        try:
+            raw = json.loads(raw)
+        except ValueError:
+            return []
+    if not isinstance(raw, list):
+        return []
+    return [tag for tag in raw if isinstance(tag, dict)]
+
+
+def main():
+    """Query the tagging log and print the summary counters."""
+    rows = query(SQL)
+    total_count = len(rows)
+    haslabel_count = 0   # rows with at least one passed tag
+    maplabel_count = 0   # rows tagged through the mapping rules
+    rank_counts = Counter()      # rank -> number of passed tags at that rank
+    category_counts = Counter()  # counter name -> number of passed tags
+
+    for raw_result, channel, _log_id, phrase, started, ended in rows:
+        if ended is None:
+            # Tagging never finished for this row.
+            continue
+        if started is not None:
+            print(f"stat: {_to_datetime(ended)-_to_datetime(started)}")
+
+        if raw_result is None:
+            continue
+        if channel == MAPPING_RULE_CHANNEL:
+            maplabel_count += 1
+
+        passed = [tag for tag in _load_tags(raw_result) if tag.get("passr")]
+        if passed:
+            haslabel_count += 1
+        else:
+            print(f"phrase: {phrase}================>没有命中标签")
+
+        if channel == MAPPING_RULE_CHANNEL:
+            # Rank/category statistics only cover the non-mapping channels.
+            continue
+        for tag in passed:
+            rank_counts[tag.get("rank")] += 1
+            counter_name = CATEGORY_COUNTERS.get(tag.get("category_id"))
+            if counter_name:
+                category_counts[counter_name] += 1
+
+    print(f"总样本数: {total_count}")
+    print(f"有标签的样本数: {haslabel_count}")
+    print(f"命中海洋标签的数量: {category_counts['marine']}")
+    print(f"命中科技标签的数量: {category_counts['tech']}")
+    print(f"命中养老标签的数量: {category_counts['pension']}")
+    print(f"走映射规则映射标签的数量: {maplabel_count}")
+    for rank, label in enumerate("一二三四五", start=1):
+        print(f"排名第{label}并且最终命中的标签数量: {rank_counts[rank]}")
+
+
+# Guarded so that pytest collecting tests/test_*.py does not hit the database.
+if __name__ == "__main__":
+    main()

BIN
agent/tests/test_report.zip


+ 27 - 4
agent/tests/test_tagging.py

@@ -9,11 +9,34 @@ querys =[ '职业:电动机加工制造,投向:电动机加工制造,用途:电
  '投向:远洋货物运输,用途:经营运输车队,经营范围:',
  '职业:舒缓医疗服务,投向:舒缓医疗服务,用途:舒缓医疗服务',
  '职业:安宁照护服务,投向:安宁照护服务,用途:安宁照护服务',
- '主营范围:大都市方法 贷款投向:的萨芬撒飞洒 贷款用途:海洋服务']
+ '主营范围:大都市方法 贷款投向:的萨芬撒飞洒 贷款用途:海洋服务',
+ "职业:休养院 投向:休养院 用途:休养院",
+ "职业:大都市方法 投向:贷款投向:的萨芬撒飞洒 用途:贷款投向:的萨芬撒飞洒 经营范围:仪器仪表设备制造",
+ "其他电子设备制造",
+ "职业:地理遥感信息运营服务,投向:地理遥感信息运营服务,用途:地理遥感信息运营服务",
+ "文本输入:职业:,投向:百货零售,用途:计算机制造,主营业务:主营范围",
+ "养点海胆卖了补贴生活",
+ "批量DIY个人电脑,用来办公",# 13
+ """主营范围:xx贷款投向:15345345贷款用途:海洋服务""",
+ "职业:信息安全设备生产,投向:信息安全设备生产,用途:信息安全设备生产",
+ "贷款用途:23424,贷款投向:xxx, 主营范围:老年司法援助服务",
+ "海水淡化处理", #17
+ "职业:保健服务人员 贷款用途:经营养老院 贷款投向:家庭服务 经营范围:",#18
+ "职业: ;贷款用途:福建省颐宁养老有限公司工程建设使用;贷款投向:其他房屋建筑业;经营范围:老年人养护服务;旅游度假服务;住宿服务",#19
+ "职业:;\n贷款用途:补充医院流动资金; \n贷款投向:综合医院; \n经营范围:综合医院,营利性养老机构服务目,对医疗机构及养老机构的投资与管理",
+ "职业:老年保健服务人员,投向:百货零售,用途:百货零售,主营业务:养殖鲍鱼", #21
+ "职业:保健服务人员,投向:百货零售,用途:百货零售,主营业务:养殖鲍鱼", #22
+ "职业:养殖鲍鱼,投向:百货零售,用途:百货零售,主营业务:保健服务人员",
+ "70岁以上群体人员的保健、照顾服务"
+]
 
-for i in range(1):
-    # query = querys[i % len(querys)]
-    query = querys[6]
+target = 21
+
+for i in range(len(querys)):
+    query = querys[i]
+    print(query)
+    # if i!=target:
+    #     continue
     res = requests.post("http://10.192.72.13:9876/api/aitag/v1/tagging", json={
         "esb_seq_no":"abc",
         "business_attr": "test_attr64",

+ 5 - 0
agent/tests/test_vector.py

@@ -0,0 +1,5 @@
+from agent.core.vector import get_embeddings
+
+
+def main():
+    """Embed one sample sentence and print the length of the returned vector."""
+    texts = ["Hello, world!"]
+    embeddings = get_embeddings(texts)
+    print(f"Embedding for '{texts[0]}': {len(embeddings[0])}")
+
+
+# Guarded so that importing this module (e.g. pytest collecting
+# tests/test_*.py) does not call the embedding service.
+if __name__ == "__main__":
+    main()
+