jiayq
/
ai-tagging


			
				
					
						
						
							12345678910111213141516171819202122232425262728293031323334
							import re

def highlight_long_common_substrings(str_a: str, str_b: str) -> str:
    """Wrap every token of ``str_b`` that occurs in ``str_a`` in ``<strong>`` tags.

    A "token" is a run of two or more word characters in ``str_b`` (letters,
    digits, underscore and CJK characters); anything else, such as ``|`` or
    ``;``, acts as a separator. Each occurrence of a token inside ``str_a`` is
    replaced by ``<strong>token</strong>``.

    All tokens are matched in a single pass over the *original* ``str_a``.
    This guarantees that text which has already been highlighted is never
    scanned again, so a shorter token contained in a longer one cannot produce
    nested tags, and a token such as ``strong`` cannot corrupt the inserted
    markup.

    Args:
        str_a: Text to be highlighted.
        str_b: Text the tokens are extracted from.

    Returns:
        ``str_a`` with matching tokens wrapped in ``<strong>``; ``str_a``
        unchanged when ``str_b`` yields no token. No HTML escaping is
        performed on either input.
    """
    # Collect the distinct tokens (length >= 2) found in str_b.
    tokens = set(re.findall(r'[\w\u4e00-\u9fff]{2,}', str_b))
    if not tokens:
        # An empty alternation would match the empty string at every position.
        return str_a

    # Longest first, so that at any position the longest token wins over its
    # own prefixes; the lexical tie-break makes the result independent of set
    # iteration order (which varies with string hash randomization).
    ordered = sorted(tokens, key=lambda token: (-len(token), token))

    # re.escape keeps the alternation safe should a token ever contain a
    # character that is special in regular expressions.
    pattern = "|".join(map(re.escape, ordered))
    return re.sub(pattern, r'<strong>\g<0></strong>', str_a)

def generate_html(phrase, matched_rule, tag_name):
    """Build the one-line message describing a phrase-to-tag mapping.

    The phrase is first passed through ``highlight_long_common_substrings``
    together with ``matched_rule``; the highlighted phrase, the rule and the
    tag name are then embedded in a fixed message template.

    Args:
        phrase: The mapped text.
        matched_rule: The mapping rule the phrase was matched against.
        tag_name: The tag the phrase is mapped to.

    Returns:
        The formatted message string.
    """
    marked_phrase = highlight_long_common_substrings(phrase, matched_rule)
    return (
        f"映射文本【{marked_phrase}】"
        f"与映射规则【{matched_rule}】匹配，"
        f"映射为标签【{tag_name}】"
    )

# Demo input: the text that was mapped.
phrase = "投向:环境保护专用设备制造; 用途:养海马"
# Demo input: the rule it matched; '|' separates the alternatives.
matched_rule = "环境保护专用设备制造|环境监测专用仪器仪表制造"
# Demo input: the tag the phrase is mapped to.
tag_name = "环保专用设备仪器制造业"

# Print the message built for the demo inputs.
print(
    generate_html(
        phrase=phrase,
        matched_rule=matched_rule,
        tag_name=tag_name,
    )
)