2 月之前 · 0c4511677b
--- a/compare_ocr_results.py
+++ b/compare_ocr_results.py
@@ -0,0 +1,436 @@
 
				+import re
			
 
				+import difflib
			
 
				+import json
			
 
				+import argparse
			
 
				+from typing import Dict, List, Tuple
			
 
				+import markdown
			
 
				+from bs4 import BeautifulSoup
			
 
				+
			
 
				+class OCRResultComparator:
			
 
				+    def __init__(self):
			
 
				+        self.differences = []
			
 
				+        
			
 
				+    def normalize_text(self, text: str) -> str:
			
 
				+        """标准化文本：去除多余空格、回车等无效字符"""
			
 
				+        if not text:
			
 
				+            return ""
			
 
				+        # 去除多余的空白字符
			
 
				+        text = re.sub(r'\s+', ' ', text.strip())
			
 
				+        # 去除标点符号周围的空格
			
 
				+        text = re.sub(r'\s*([，。：；！？、])\s*', r'\1', text)
			
 
				+        return text
			
 
				+    
			
 
				+    def is_image_reference(self, text: str) -> bool:
			
 
				+        """判断是否为图片引用或描述"""
			
 
				+        image_keywords = [
			
 
				+            '图', '图片', '图像', 'image', 'figure', 'fig',
			
 
				+            '照片', '截图', '示意图', '流程图', '结构图'
			
 
				+        ]
			
 
				+        # 检查是否包含图片相关关键词
			
 
				+        for keyword in image_keywords:
			
 
				+            if keyword in text.lower():
			
 
				+                return True
			
 
				+        
			
 
				+        # 检查是否为Markdown图片语法
			
 
				+        if re.search(r'!\[.*?\]\(.*?\)', text):
			
 
				+            return True
			
 
				+            
			
 
				+        # 检查是否为HTML图片标签
			
 
				+        if re.search(r'<img[^>]*>', text, re.IGNORECASE):
			
 
				+            return True
			
 
				+            
			
 
				+        return False
			
 
				+    
			
 
				+    def extract_table_data(self, md_content: str) -> List[List[List[str]]]:
			
 
				+        """从Markdown中提取表格数据"""
			
 
				+        tables = []
			
 
				+        
			
 
				+        # 使用BeautifulSoup解析HTML表格
			
 
				+        soup = BeautifulSoup(md_content, 'html.parser')
			
 
				+        html_tables = soup.find_all('table')
			
 
				+        
			
 
				+        for table in html_tables:
			
 
				+            table_data = []
			
 
				+            rows = table.find_all('tr')
			
 
				+            
			
 
				+            for row in rows:
			
 
				+                cells = row.find_all(['td', 'th'])
			
 
				+                row_data = []
			
 
				+                for cell in cells:
			
 
				+                    cell_text = self.normalize_text(cell.get_text())
			
 
				+                    # 跳过图片内容
			
 
				+                    if not self.is_image_reference(cell_text):
			
 
				+                        row_data.append(cell_text)
			
 
				+                    else:
			
 
				+                        row_data.append("[图片内容-忽略]")
			
 
				+                        
			
 
				+                if row_data:  # 只添加非空行
			
 
				+                    table_data.append(row_data)
			
 
				+            
			
 
				+            if table_data:
			
 
				+                tables.append(table_data)
			
 
				+        
			
 
				+        return tables
			
 
				+    
			
 
				+    def extract_paragraphs(self, md_content: str) -> List[str]:
			
 
				+        """提取段落文本"""
			
 
				+        # 移除表格
			
 
				+        content = re.sub(r'<table>.*?</table>', '', md_content, flags=re.DOTALL)
			
 
				+        # 移除HTML标签
			
 
				+        content = re.sub(r'<[^>]+>', '', content)
			
 
				+        # 移除Markdown注释
			
 
				+        content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
			
 
				+        
			
 
				+        # 分割段落
			
 
				+        paragraphs = []
			
 
				+        lines = content.split('\n')
			
 
				+        for line in lines:
			
 
				+            normalized = self.normalize_text(line)
			
 
				+            if normalized and not normalized.startswith('#'):
			
 
				+                # 跳过图片内容
			
 
				+                if not self.is_image_reference(normalized):
			
 
				+                    paragraphs.append(normalized)
			
 
				+        
			
 
				+        return paragraphs
			
 
				+    
			
 
				+    def compare_tables(self, table1: List[List[str]], table2: List[List[str]]) -> List[Dict]:
			
 
				+        """比较表格数据"""
			
 
				+        differences = []
			
 
				+        
			
 
				+        # 确定最大行数
			
 
				+        max_rows = max(len(table1), len(table2))
			
 
				+        
			
 
				+        for i in range(max_rows):
			
 
				+            row1 = table1[i] if i < len(table1) else []
			
 
				+            row2 = table2[i] if i < len(table2) else []
			
 
				+            
			
 
				+            # 确定最大列数
			
 
				+            max_cols = max(len(row1), len(row2))
			
 
				+            
			
 
				+            for j in range(max_cols):
			
 
				+                cell1 = row1[j] if j < len(row1) else ""
			
 
				+                cell2 = row2[j] if j < len(row2) else ""
			
 
				+                
			
 
				+                # 跳过图片内容比较
			
 
				+                if "[图片内容-忽略]" in cell1 or "[图片内容-忽略]" in cell2:
			
 
				+                    continue
			
 
				+                
			
 
				+                if cell1 != cell2:
			
 
				+                    # 特别处理数字金额
			
 
				+                    if self.is_numeric(cell1) and self.is_numeric(cell2):
			
 
				+                        num1 = self.parse_number(cell1)
			
 
				+                        num2 = self.parse_number(cell2)
			
 
				+                        if abs(num1 - num2) > 0.001:  # 允许小数精度误差
			
 
				+                            differences.append({
			
 
				+                                'type': 'table_amount',
			
 
				+                                'position': f'行{i+1}列{j+1}',
			
 
				+                                'file1_value': cell1,
			
 
				+                                'file2_value': cell2,
			
 
				+                                'description': f'金额不一致: {cell1} vs {cell2}',
			
 
				+                                'row_index': i,
			
 
				+                                'col_index': j
			
 
				+                            })
			
 
				+                    else:
			
 
				+                        differences.append({
			
 
				+                            'type': 'table_text',
			
 
				+                            'position': f'行{i+1}列{j+1}',
			
 
				+                            'file1_value': cell1,
			
 
				+                            'file2_value': cell2,
			
 
				+                            'description': f'文本不一致: {cell1} vs {cell2}',
			
 
				+                            'row_index': i,
			
 
				+                            'col_index': j
			
 
				+                        })
			
 
				+        
			
 
				+        return differences
			
 
				+    
			
 
				+    def is_numeric(self, text: str) -> bool:
			
 
				+        """判断文本是否为数字"""
			
 
				+        if not text:
			
 
				+            return False
			
 
				+        # 移除千分位分隔符和负号
			
 
				+        clean_text = re.sub(r'[,，-]', '', text)
			
 
				+        try:
			
 
				+            float(clean_text)
			
 
				+            return True
			
 
				+        except ValueError:
			
 
				+            return False
			
 
				+    
			
 
				+    def parse_number(self, text: str) -> float:
			
 
				+        """解析数字"""
			
 
				+        if not text:
			
 
				+            return 0.0
			
 
				+        clean_text = re.sub(r'[,，]', '', text)
			
 
				+        try:
			
 
				+            return float(clean_text)
			
 
				+        except ValueError:
			
 
				+            return 0.0
			
 
				+    
			
 
				+    def compare_paragraphs(self, paras1: List[str], paras2: List[str]) -> List[Dict]:
			
 
				+        """比较段落文本"""
			
 
				+        differences = []
			
 
				+        
			
 
				+        # 使用difflib进行文本比较
			
 
				+        matcher = difflib.SequenceMatcher(None, paras1, paras2)
			
 
				+        
			
 
				+        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
			
 
				+            if tag == 'replace':
			
 
				+                for k in range(max(i2-i1, j2-j1)):
			
 
				+                    para1 = paras1[i1+k] if i1+k < i2 else ""
			
 
				+                    para2 = paras2[j1+k] if j1+k < j2 else ""
			
 
				+                    if para1 != para2:
			
 
				+                        differences.append({
			
 
				+                            'type': 'paragraph',
			
 
				+                            'position': f'段落{i1+k+1}',
			
 
				+                            'file1_value': para1,
			
 
				+                            'file2_value': para2,
			
 
				+                            'description': f'段落文本不一致',
			
 
				+                            'paragraph_index': i1+k
			
 
				+                        })
			
 
				+            elif tag == 'delete':
			
 
				+                for k in range(i1, i2):
			
 
				+                    differences.append({
			
 
				+                        'type': 'paragraph',
			
 
				+                        'position': f'段落{k+1}',
			
 
				+                        'file1_value': paras1[k],
			
 
				+                        'file2_value': "",
			
 
				+                        'description': f'文件1中存在但文件2中缺失的段落',
			
 
				+                        'paragraph_index': k
			
 
				+                    })
			
 
				+            elif tag == 'insert':
			
 
				+                for k in range(j1, j2):
			
 
				+                    differences.append({
			
 
				+                        'type': 'paragraph',
			
 
				+                        'position': f'段落{k+1}',
			
 
				+                        'file1_value': "",
			
 
				+                        'file2_value': paras2[k],
			
 
				+                        'description': f'文件2中存在但文件1中缺失的段落',
			
 
				+                        'paragraph_index': k
			
 
				+                    })
			
 
				+        
			
 
				+        return differences
			
 
				+    
			
 
				+    def compare_files(self, file1_path: str, file2_path: str) -> Dict:
			
 
				+        """比较两个文件"""
			
 
				+        # 读取文件
			
 
				+        with open(file1_path, 'r', encoding='utf-8') as f:
			
 
				+            content1 = f.read()
			
 
				+        
			
 
				+        with open(file2_path, 'r', encoding='utf-8') as f:
			
 
				+            content2 = f.read()
			
 
				+        
			
 
				+        # 提取表格和段落
			
 
				+        tables1 = self.extract_table_data(content1)
			
 
				+        tables2 = self.extract_table_data(content2)
			
 
				+        
			
 
				+        paras1 = self.extract_paragraphs(content1)
			
 
				+        paras2 = self.extract_paragraphs(content2)
			
 
				+        
			
 
				+        # 比较结果
			
 
				+        all_differences = []
			
 
				+        
			
 
				+        # 比较表格
			
 
				+        if tables1 and tables2:
			
 
				+            table_diffs = self.compare_tables(tables1[0], tables2[0])
			
 
				+            all_differences.extend(table_diffs)
			
 
				+        elif tables1 and not tables2:
			
 
				+            all_differences.append({
			
 
				+                'type': 'table_structure',
			
 
				+                'position': '表格结构',
			
 
				+                'file1_value': f'包含{len(tables1)}个表格',
			
 
				+                'file2_value': '无表格',
			
 
				+                'description': '文件1包含表格但文件2无表格'
			
 
				+            })
			
 
				+        elif not tables1 and tables2:
			
 
				+            all_differences.append({
			
 
				+                'type': 'table_structure',
			
 
				+                'position': '表格结构',
			
 
				+                'file1_value': '无表格',
			
 
				+                'file2_value': f'包含{len(tables2)}个表格',
			
 
				+                'description': '文件2包含表格但文件1无表格'
			
 
				+            })
			
 
				+        
			
 
				+        # 比较段落
			
 
				+        para_diffs = self.compare_paragraphs(paras1, paras2)
			
 
				+        all_differences.extend(para_diffs)
			
 
				+        
			
 
				+        # 统计信息
			
 
				+        stats = {
			
 
				+            'total_differences': len(all_differences),
			
 
				+            'table_differences': len([d for d in all_differences if d['type'].startswith('table')]),
			
 
				+            'paragraph_differences': len([d for d in all_differences if d['type'] == 'paragraph']),
			
 
				+            'amount_differences': len([d for d in all_differences if d['type'] == 'table_amount'])
			
 
				+        }
			
 
				+        
			
 
				+        return {
			
 
				+            'differences': all_differences,
			
 
				+            'statistics': stats,
			
 
				+            'file1_tables': len(tables1),
			
 
				+            'file2_tables': len(tables2),
			
 
				+            'file1_paragraphs': len(paras1),
			
 
				+            'file2_paragraphs': len(paras2),
			
 
				+            'file1_path': file1_path,
			
 
				+            'file2_path': file2_path
			
 
				+        }
			
 
				+    
			
 
				+    def generate_json_report(self, comparison_result: Dict, output_file: str):
			
 
				+        """生成JSON格式的比较报告"""
			
 
				+        report_data = {
			
 
				+            'comparison_summary': {
			
 
				+                'timestamp': re.sub(r'[^\w\-_\.]', '_', str(comparison_result.get('timestamp', ''))),
			
 
				+                'file1': comparison_result['file1_path'],
			
 
				+                'file2': comparison_result['file2_path'],
			
 
				+                'statistics': comparison_result['statistics'],
			
 
				+                'file_info': {
			
 
				+                    'file1_tables': comparison_result['file1_tables'],
			
 
				+                    'file2_tables': comparison_result['file2_tables'],
			
 
				+                    'file1_paragraphs': comparison_result['file1_paragraphs'],
			
 
				+                    'file2_paragraphs': comparison_result['file2_paragraphs']
			
 
				+                }
			
 
				+            },
			
 
				+            'differences': comparison_result['differences']
			
 
				+        }
			
 
				+        
			
 
				+        with open(output_file, 'w', encoding='utf-8') as f:
			
 
				+            json.dump(report_data, f, ensure_ascii=False, indent=2)
			
 
				+    
			
 
				+    def generate_markdown_report(self, comparison_result: Dict, output_file: str):
			
 
				+        """生成Markdown格式的比较报告"""
			
 
				+        with open(output_file, 'w', encoding='utf-8') as f:
			
 
				+            f.write("# OCR结果对比报告\n\n")
			
 
				+            
			
 
				+            # 基本信息
			
 
				+            f.write("## 基本信息\n\n")
			
 
				+            f.write(f"- **文件1**: `{comparison_result['file1_path']}`\n")
			
 
				+            f.write(f"- **文件2**: `{comparison_result['file2_path']}`\n")
			
 
				+            f.write(f"- **比较时间**: {comparison_result.get('timestamp', 'N/A')}\n\n")
			
 
				+            
			
 
				+            # 统计信息
			
 
				+            stats = comparison_result['statistics']
			
 
				+            f.write("## 统计信息\n\n")
			
 
				+            f.write(f"- 总差异数量: **{stats['total_differences']}**\n")
			
 
				+            f.write(f"- 表格差异: **{stats['table_differences']}**\n")
			
 
				+            f.write(f"- 金额差异: **{stats['amount_differences']}**\n")
			
 
				+            f.write(f"- 段落差异: **{stats['paragraph_differences']}**\n")
			
 
				+            f.write(f"- 文件1表格数: {comparison_result['file1_tables']}\n")
			
 
				+            f.write(f"- 文件2表格数: {comparison_result['file2_tables']}\n")
			
 
				+            f.write(f"- 文件1段落数: {comparison_result['file1_paragraphs']}\n")
			
 
				+            f.write(f"- 文件2段落数: {comparison_result['file2_paragraphs']}\n\n")
			
 
				+            
			
 
				+            # 差异摘要
			
 
				+            if stats['total_differences'] == 0:
			
 
				+                f.write("## 结论\n\n")
			
 
				+                f.write("🎉 **完美匹配！没有发现任何差异。**\n\n")
			
 
				+            else:
			
 
				+                f.write("## 差异摘要\n\n")
			
 
				+                
			
 
				+                # 按类型分组显示差异
			
 
				+                diff_by_type = {}
			
 
				+                for diff in comparison_result['differences']:
			
 
				+                    diff_type = diff['type']
			
 
				+                    if diff_type not in diff_by_type:
			
 
				+                        diff_by_type[diff_type] = []
			
 
				+                    diff_by_type[diff_type].append(diff)
			
 
				+                
			
 
				+                for diff_type, diffs in diff_by_type.items():
			
 
				+                    type_name = {
			
 
				+                        'table_amount': '💰 表格金额差异',
			
 
				+                        'table_text': '📝 表格文本差异',
			
 
				+                        'paragraph': '📄 段落差异',
			
 
				+                        'table_structure': '🏗️ 表格结构差异'
			
 
				+                    }.get(diff_type, f'❓ {diff_type}')
			
 
				+                    
			
 
				+                    f.write(f"### {type_name} ({len(diffs)}个)\n\n")
			
 
				+                    
			
 
				+                    for i, diff in enumerate(diffs, 1):
			
 
				+                        f.write(f"**{i}. {diff['position']}**\n")
			
 
				+                        f.write(f"- 文件1: `{diff['file1_value']}`\n")
			
 
				+                        f.write(f"- 文件2: `{diff['file2_value']}`\n")
			
 
				+                        f.write(f"- 说明: {diff['description']}\n\n")
			
 
				+            
			
 
				+            # 详细差异列表
			
 
				+            if comparison_result['differences']:
			
 
				+                f.write("## 详细差异列表\n\n")
			
 
				+                f.write("| 序号 | 类型 | 位置 | 文件1内容 | 文件2内容 | 描述 |\n")
			
 
				+                f.write("| --- | --- | --- | --- | --- | --- |\n")
			
 
				+                
			
 
				+                for i, diff in enumerate(comparison_result['differences'], 1):
			
 
				+                    f.write(f"| {i} | {diff['type']} | {diff['position']} | ")
			
 
				+                    f.write(f"`{diff['file1_value'][:50]}{'...' if len(diff['file1_value']) > 50 else ''}` | ")
			
 
				+                    f.write(f"`{diff['file2_value'][:50]}{'...' if len(diff['file2_value']) > 50 else ''}` | ")
			
 
				+                    f.write(f"{diff['description']} |\n")
			
 
				+
			
 
				+def main():
			
 
				+    """主函数"""
			
 
				+    parser = argparse.ArgumentParser(description='OCR结果对比工具')
			
 
				+    parser.add_argument('file1', help='第一个OCR结果文件路径')
			
 
				+    parser.add_argument('file2', help='第二个OCR结果文件路径')
			
 
				+    parser.add_argument('-o', '--output', default='comparison_report', 
			
 
				+                       help='输出文件名（不含扩展名）')
			
 
				+    parser.add_argument('-f', '--format', choices=['json', 'markdown', 'both'], 
			
 
				+                       default='markdown', help='输出格式: json, markdown, 或 both')
			
 
				+    parser.add_argument('--ignore-images', action='store_true', 
			
 
				+                       help='忽略图片内容（默认已启用）')
			
 
				+    
			
 
				+    args = parser.parse_args()
			
 
				+    
			
 
				+    comparator = OCRResultComparator()
			
 
				+    
			
 
				+    print("🔍 开始对比OCR结果...")
			
 
				+    print(f"📄 文件1: {args.file1}")
			
 
				+    print(f"📄 文件2: {args.file2}")
			
 
				+    print(f"📁 输出格式: {args.format}")
			
 
				+    print(f"🖼️  图片处理: {'忽略' if args.ignore_images else '对比'}")
			
 
				+    
			
 
				+    try:
			
 
				+        # 执行比较
			
 
				+        result = comparator.compare_files(args.file1, args.file2)
			
 
				+        
			
 
				+        # 添加时间戳
			
 
				+        import datetime
			
 
				+        result['timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
			
 
				+        
			
 
				+        # 生成报告
			
 
				+        if args.format in ['json', 'both']:
			
 
				+            json_file = f"{args.output}.json"
			
 
				+            comparator.generate_json_report(result, json_file)
			
 
				+            print(f"📄 JSON报告已保存至: {json_file}")
			
 
				+        
			
 
				+        if args.format in ['markdown', 'both']:
			
 
				+            md_file = f"{args.output}.md"
			
 
				+            comparator.generate_markdown_report(result, md_file)
			
 
				+            print(f"📝 Markdown报告已保存至: {md_file}")
			
 
				+        
			
 
				+        # 打印简要结果
			
 
				+        print(f"\n📊 对比完成！")
			
 
				+        print(f"   总差异数: {result['statistics']['total_differences']}")
			
 
				+        print(f"   表格差异: {result['statistics']['table_differences']}")
			
 
				+        print(f"   金额差异: {result['statistics']['amount_differences']}")
			
 
				+        print(f"   段落差异: {result['statistics']['paragraph_differences']}")
			
 
				+        
			
 
				+        # 打印前几个重要差异
			
 
				+        if result['differences']:
			
 
				+            print(f"\n🔍 前3个重要差异:")
			
 
				+            for i, diff in enumerate(result['differences'][:3], 1):
			
 
				+                print(f"   {i}. {diff['position']}: {diff['description']}")
			
 
				+                print(f"      文件1: '{diff['file1_value'][:50]}{'...' if len(diff['file1_value']) > 50 else ''}'")
			
 
				+                print(f"      文件2: '{diff['file2_value'][:50]}{'...' if len(diff['file2_value']) > 50 else ''}'")
			
 
				+        else:
			
 
				+            print(f"\n🎉 恭喜！两个文件内容完全一致！")
			
 
				+            
			
 
				+    except Exception as e:
			
 
				+        print(f"❌ 对比过程中出现错误: {e}")
			
 
				+        return 1
			
 
				+    
			
 
				+    return 0
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # 如果sys.argv没有被传入参数，则提供默认参数用于测试
			
 
				+    import sys
			
 
				+    if len(sys.argv) == 1:
			
 
				+        sys.argv.extend([
			
 
				+            '至远彩色印刷工业有限公司-2022年母公司_2.md', 'demo_54fa7ad0_page_1_nohf.md', 
			
 
				+            '-o', 'comparison_result', 
			
 
				+            '-f', 'both',
			
 
				+            '--ignore-images'])
			
 
				+
			
 
				+    exit(main())
			
--- a/ocr_by_vlm.py
+++ b/ocr_by_vlm.py
@@ -0,0 +1,391 @@
 
				+import os
			
 
				+import base64
			
 
				+import json
			
 
				+import time
			
 
				+import re
			
 
				+from pathlib import Path
			
 
				+from openai import OpenAI
			
 
				+from dotenv import load_dotenv
			
 
				+from typing import Any, Dict, List
			
 
				+
			
 
				+# 加载环境变量
			
 
				+load_dotenv()
			
 
				+
			
 
				+def normalize_financial_numbers(text: str) -> str:
			
 
				+    """
			
 
				+    标准化财务数字：将全角字符转换为半角字符
			
 
				+    
			
 
				+    Args:
			
 
				+        text: 原始文本
			
 
				+    
			
 
				+    Returns:
			
 
				+        标准化后的文本
			
 
				+    """
			
 
				+    if not text:
			
 
				+        return text
			
 
				+    
			
 
				+    # 定义全角到半角的映射
			
 
				+    fullwidth_to_halfwidth = {
			
 
				+        '０': '0', '１': '1', '２': '2', '３': '3', '４': '4',
			
 
				+        '５': '5', '６': '6', '７': '7', '８': '8', '９': '9',
			
 
				+        '，': ',',  # 全角逗号转半角逗号
			
 
				+        '。': '.',  # 全角句号转半角句号  
			
 
				+        '．': '.',  # 全角句点转半角句点
			
 
				+        '：': ':',  # 全角冒号转半角冒号
			
 
				+        '；': ';',  # 全角分号转半角分号
			
 
				+        '（': '(',  # 全角左括号转半角左括号
			
 
				+        '）': ')',  # 全角右括号转半角右括号
			
 
				+        '－': '-',  # 全角减号转半角减号
			
 
				+        '＋': '+',  # 全角加号转半角加号
			
 
				+        '％': '%',  # 全角百分号转半角百分号
			
 
				+    }
			
 
				+    
			
 
				+    # 执行字符替换
			
 
				+    normalized_text = text
			
 
				+    for fullwidth, halfwidth in fullwidth_to_halfwidth.items():
			
 
				+        normalized_text = normalized_text.replace(fullwidth, halfwidth)
			
 
				+    
			
 
				+    # 特别处理金额格式：识别数字模式并标准化
			
 
				+    # 匹配金额模式：数字+全角逗号+数字+小数点+数字
			
 
				+    amount_pattern = r'(\d+(?:[，,]\d{3})*(?:[。．.]\d{2})?)'
			
 
				+    
			
 
				+    def normalize_amount(match):
			
 
				+        amount = match.group(1)
			
 
				+        # 将全角逗号替换为半角逗号
			
 
				+        amount = amount.replace('，', ',')
			
 
				+        # 将全角句号、句点替换为半角小数点
			
 
				+        amount = re.sub(r'[。．]', '.', amount)
			
 
				+        return amount
			
 
				+    
			
 
				+    normalized_text = re.sub(amount_pattern, normalize_amount, normalized_text)
			
 
				+    
			
 
				+    return normalized_text
			
 
				+
			
 
				+def normalize_markdown_table(markdown_content: str) -> str:
			
 
				+    """
			
 
				+    专门处理Markdown表格中的数字标准化
			
 
				+    
			
 
				+    Args:
			
 
				+        markdown_content: Markdown内容
			
 
				+    
			
 
				+    Returns:
			
 
				+        标准化后的Markdown内容
			
 
				+    """
			
 
				+    # 使用BeautifulSoup处理HTML表格
			
 
				+    from bs4 import BeautifulSoup
			
 
				+    
			
 
				+    soup = BeautifulSoup(markdown_content, 'html.parser')
			
 
				+    tables = soup.find_all('table')
			
 
				+    
			
 
				+    for table in tables:
			
 
				+        cells = table.find_all(['td', 'th'])
			
 
				+        for cell in cells:
			
 
				+            original_text = cell.get_text()
			
 
				+            normalized_text = normalize_financial_numbers(original_text)
			
 
				+            
			
 
				+            # 如果内容发生了变化，更新单元格内容
			
 
				+            if original_text != normalized_text:
			
 
				+                cell.string = normalized_text
			
 
				+    
			
 
				+    # 返回更新后的HTML
			
 
				+    return str(soup)
			
 
				+
			
 
				+def ocr_with_vlm(image_path, output_dir="./", 
			
 
				+                        api_key=None, api_base=None, model_id=None, 
			
 
				+                        temperature=0.1, max_tokens=4096, timeout=180,
			
 
				+                        normalize_numbers=True):
			
 
				+    """
			
 
				+    使用VLM识别图片中的文本
			
 
				+    
			
 
				+    Args:
			
 
				+        image_path: 原图路径
			
 
				+        output_dir: 结果输出文件路径
			
 
				+        api_key: API密钥，如果为None则从环境变量获取
			
 
				+        api_base: API基础URL，如果为None则从环境变量获取
			
 
				+        model_id: 模型ID，如果为None则从环境变量获取
			
 
				+        temperature: 生成温度，默认0.1
			
 
				+        max_tokens: 最大输出token数，默认4096
			
 
				+        timeout: 请求超时时间，默认180秒
			
 
				+        normalize_numbers: 是否标准化数字格式，默认True
			
 
				+    """
			
 
				+    # 从参数或环境变量获取API配置
			
 
				+    api_key = api_key or os.getenv("YUSYS_MULTIMODAL_API_KEY")
			
 
				+    api_base = api_base or os.getenv("YUSYS_MULTIMODAL_API_BASE")
			
 
				+    model_id = model_id or os.getenv("YUSYS_MULTIMODAL_ID")
			
 
				+    
			
 
				+    if not api_key:
			
 
				+        raise ValueError("未找到API密钥，请通过参数传入或设置YUSYS_MULTIMODAL_API_KEY环境变量")
			
 
				+    if not api_base:
			
 
				+        raise ValueError("未找到API基础URL，请通过参数传入或设置YUSYS_MULTIMODAL_API_BASE环境变量")
			
 
				+    if not model_id:
			
 
				+        raise ValueError("未找到模型ID，请通过参数传入或设置YUSYS_MULTIMODAL_ID环境变量")
			
 
				+    
			
 
				+    # 去掉openai/前缀
			
 
				+    model_name = model_id.replace("openai/", "")
			
 
				+
			
 
				+    # 读取图片文件并转换为base64
			
 
				+    try:
			
 
				+        with open(image_path, "rb") as image_file:
			
 
				+            image_data = base64.b64encode(image_file.read()).decode('utf-8')
			
 
				+    except FileNotFoundError:
			
 
				+        raise FileNotFoundError(f"找不到图片文件: {image_path}")
			
 
				+    
			
 
				+    # 获取图片的MIME类型
			
 
				+    file_extension = Path(image_path).suffix.lower()
			
 
				+    mime_type_map = {
			
 
				+        '.jpg': 'image/jpeg',
			
 
				+        '.jpeg': 'image/jpeg',
			
 
				+        '.png': 'image/png',
			
 
				+        '.gif': 'image/gif',
			
 
				+        '.webp': 'image/webp'
			
 
				+    }
			
 
				+    mime_type = mime_type_map.get(file_extension, 'image/jpeg')
			
 
				+    
			
 
				+    # 构建分析提示词
			
 
				+    prompt = r'''You are an AI assistant specialized in converting PDF images to Markdown format. Please follow these instructions for the conversion:
			
 
				+
			
 
				+        1. Text Processing:
			
 
				+        - Accurately recognize all text content in the PDF image without guessing or inferring.
			
 
				+        - Convert the recognized text into Markdown format.
			
 
				+        - Maintain the original document structure, including headings, paragraphs, lists, etc.
			
 
				+        - For financial amounts, use standard half-width characters (e.g., use "," for thousands separator and "." for decimal point)
			
 
				+
			
 
				+        2. Mathematical Formula Processing:
			
 
				+        - Convert all mathematical formulas to LaTeX format.
			
 
				+        - Enclose inline formulas with \( \). For example: This is an inline formula \( E = mc^2 \)
			
 
				+        - Enclose block formulas with \\[ \\]. For example: \[ \frac{-b \pm \sqrt{b^2 - 4ac}}{2a} \]
			
 
				+
			
 
				+        3. Table Processing:
			
 
				+        - Convert tables to HTML format.
			
 
				+        - Wrap the entire table with <table> and </table>.
			
 
				+        - For financial data in tables, ensure numbers use standard format with half-width commas and periods
			
 
				+
			
 
				+        4. Figure Handling:
			
 
				+        - Ignore figures content in the PDF image. Do not attempt to describe or convert images.
			
 
				+
			
 
				+        5. Output Format:
			
 
				+        - Ensure the output Markdown document has a clear structure with appropriate line breaks between elements.
			
 
				+        - For complex layouts, try to maintain the original document's structure and format as closely as possible.
			
 
				+        - Use standard ASCII characters for punctuation and numbers
			
 
				+
			
 
				+        Please strictly follow these guidelines to ensure accuracy and consistency in the conversion. Your task is to accurately convert the content of the PDF image into Markdown format without adding any extra explanations or comments.
			
 
				+        '''
			
 
				+    
			
 
				+    # 创建OpenAI客户端
			
 
				+    client = OpenAI(
			
 
				+        api_key=api_key,
			
 
				+        base_url=api_base
			
 
				+    )
			
 
				+    
			
 
				+    # 构建消息内容
			
 
				+    messages: List[Dict[str, Any]] = [
			
 
				+        {
			
 
				+            "role": "user",
			
 
				+            "content": [
			
 
				+                {
			
 
				+                    "type": "text",
			
 
				+                    "text": prompt
			
 
				+                },
			
 
				+                {
			
 
				+                    "type": "image_url",
			
 
				+                    "image_url": {
			
 
				+                        "url": f"data:{mime_type};base64,{image_data}"
			
 
				+                    }
			
 
				+                }
			
 
				+            ]
			
 
				+        }
			
 
				+    ]
			
 
				+    
			
 
				+    try:
			
 
				+        print(f"正在通过模型 {model_name} 进行OCR...")
			
 
				+        print(f"API地址: {api_base}")
			
 
				+        print(f"数字标准化: {'启用' if normalize_numbers else '禁用'}")
			
 
				+        
			
 
				+        # 调用API
			
 
				+        response = client.chat.completions.create(
			
 
				+            model=model_name,
			
 
				+            messages=messages,  # type: ignore
			
 
				+            temperature=temperature,
			
 
				+            max_tokens=max_tokens,
			
 
				+            timeout=timeout
			
 
				+        )
			
 
				+        
			
 
				+        # 提取响应内容
			
 
				+        generated_text = response.choices[0].message.content
			
 
				+        
			
 
				+        if not generated_text:
			
 
				+            raise Exception("模型没有生成文本内容")
			
 
				+
			
 
				+        # 标准化数字格式（如果启用）
			
 
				+        original_text = generated_text
			
 
				+        if normalize_numbers:
			
 
				+            print("🔧 正在标准化数字格式...")
			
 
				+            generated_text = normalize_financial_numbers(generated_text)
			
 
				+            # 不用再调用表格标准化，避免重复处理
			
 
				+            # generated_text = normalize_markdown_table(generated_text)
			
 
				+            
			
 
				+            # 统计标准化的变化
			
 
				+            changes_count = len([1 for o, n in zip(original_text, generated_text) if o != n])
			
 
				+            if changes_count > 0:
			
 
				+                print(f"✅ 已标准化 {changes_count} 个字符（全角→半角）")
			
 
				+            else:
			
 
				+                print("ℹ️ 无需标准化（已是标准格式）")
			
 
				+
			
 
				+        print(f"✅ 成功使用模型 {model_id} 完成OCR!")
			
 
				+
			
 
				+        # 保存结果文件
			
 
				+        Path(output_dir).mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+        # 保存标准化后的Markdown文件
			
 
				+        markdown_path = Path(image_path).with_suffix('.md')
			
 
				+        markdown_path = Path(output_dir) / markdown_path.name
			
 
				+        markdown_path = markdown_path.resolve()
			
 
				+        with open(markdown_path, 'w', encoding='utf-8') as f:
			
 
				+            f.write(generated_text)
			
 
				+
			
 
				+        # 如果启用了标准化，也保存原始版本用于对比
			
 
				+        if normalize_numbers and original_text != generated_text:
			
 
				+            original_markdown_path = Path(output_dir) / f"{Path(image_path).stem}_original.md"
			
 
				+            with open(original_markdown_path, 'w', encoding='utf-8') as f:
			
 
				+                f.write(original_text)
			
 
				+            print(f"📄 原始OCR结果已保存到: {original_markdown_path}")
			
 
				+
			
 
				+        # 准备元数据
			
 
				+        ocr_result: Dict[str, Any] = {
			
 
				+            "processing_info": {
			
 
				+                "normalize_numbers": normalize_numbers,
			
 
				+                "changes_applied": original_text != generated_text if normalize_numbers else False,
			
 
				+                "character_changes_count": len([1 for o, n in zip(original_text, generated_text) if o != n]) if normalize_numbers else 0
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        result_path = Path(image_path).with_suffix('.json')
			
 
				+        result_path = Path(output_dir) / result_path.name
			
 
				+        result_path = result_path.resolve()
			
 
				+        
			
 
				+        # 添加元数据
			
 
				+        ocr_result["metadata"] = {
			
 
				+            "model_used": model_id,
			
 
				+            "api_base": api_base,
			
 
				+            "temperature": temperature,
			
 
				+            "max_tokens": max_tokens,
			
 
				+            "timeout": timeout,
			
 
				+            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
			
 
				+            "original_image": Path(image_path).resolve().as_posix(),
			
 
				+            "output_path": Path(markdown_path).resolve().as_posix(),
			
 
				+            "normalize_numbers": normalize_numbers
			
 
				+        }
			
 
				+        
			
 
				+        # 保存结果
			
 
				+        with open(result_path, 'w', encoding='utf-8') as f:
			
 
				+            json.dump(ocr_result, f, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+        print(f"📄 OCR结果已保存到: {markdown_path}")
			
 
				+        print(f"📊 元数据已保存到: {result_path}")
			
 
				+
			
 
				+        # 打印详细统计
			
 
				+        print("\n📊 OCR处理统计")
			
 
				+        print(f"   原始图片: {ocr_result['metadata']['original_image']}")
			
 
				+        print(f"   输出路径: {ocr_result['metadata']['output_path']}")
			
 
				+        print(f"   使用模型: {ocr_result['metadata']['model_used']}")
			
 
				+        print(f"   数字标准化: {ocr_result['metadata']['normalize_numbers']}")
			
 
				+        if normalize_numbers:
			
 
				+            print(f"   字符变化数: {ocr_result['processing_info']['character_changes_count']}")
			
 
				+            print(f"   应用了标准化: {ocr_result['processing_info']['changes_applied']}")
			
 
				+        print(f"   处理时间: {ocr_result['metadata']['timestamp']}")
			
 
				+
			
 
				+        return ocr_result
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        raise Exception(f"OCR任务失败: {e}")
			
 
				+
			
 
				+def batch_normalize_existing_files(input_dir: str, output_dir: str = None):
			
 
				+    """
			
 
				+    批量标准化已有的Markdown文件中的数字格式
			
 
				+    
			
 
				+    Args:
			
 
				+        input_dir: 输入目录
			
 
				+        output_dir: 输出目录，如果为None则覆盖原文件
			
 
				+    """
			
 
				+    input_path = Path(input_dir)
			
 
				+    output_path = Path(output_dir) if output_dir else input_path
			
 
				+    
			
 
				+    if not input_path.exists():
			
 
				+        raise ValueError(f"输入目录不存在: {input_dir}")
			
 
				+    
			
 
				+    output_path.mkdir(parents=True, exist_ok=True)
			
 
				+    
			
 
				+    md_files = list(input_path.glob("*.md"))
			
 
				+    
			
 
				+    if not md_files:
			
 
				+        print("⚠️ 未找到Markdown文件")
			
 
				+        return
			
 
				+    
			
 
				+    print(f"🔧 开始批量标准化 {len(md_files)} 个Markdown文件...")
			
 
				+    
			
 
				+    for md_file in md_files:
			
 
				+        print(f"   处理: {md_file.name}")
			
 
				+        
			
 
				+        # 读取原文件
			
 
				+        with open(md_file, 'r', encoding='utf-8') as f:
			
 
				+            original_content = f.read()
			
 
				+        
			
 
				+        # 标准化内容
			
 
				+        normalized_content = normalize_financial_numbers(original_content)
			
 
				+        normalized_content = normalize_markdown_table(normalized_content)
			
 
				+        
			
 
				+        # 保存标准化后的文件
			
 
				+        output_file = output_path / md_file.name
			
 
				+        with open(output_file, 'w', encoding='utf-8') as f:
			
 
				+            f.write(normalized_content)
			
 
				+        
			
 
				+        # 统计变化
			
 
				+        changes = len([1 for o, n in zip(original_content, normalized_content) if o != n])
			
 
				+        if changes > 0:
			
 
				+            print(f"     ✅ 标准化了 {changes} 个字符")
			
 
				+        else:
			
 
				+            print(f"     ℹ️ 无需更改")
			
 
				+    
			
 
				+    print(f"✅ 批量标准化完成！结果保存到: {output_path}")
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    import argparse
			
 
				+    
			
 
				+    parser = argparse.ArgumentParser(description='VLM OCR识别工具')
			
 
				+    parser.add_argument('image_path', nargs='?', help='图片文件路径')
			
 
				+    parser.add_argument('-o', '--output', default='./', help='输出目录')
			
 
				+    parser.add_argument('-t', '--temperature', type=float, default=0.1, help='生成温度')
			
 
				+    parser.add_argument('-m', '--max-tokens', type=int, default=4096, help='最大token数')
			
 
				+    parser.add_argument('--timeout', type=int, default=180, help='超时时间（秒）')
			
 
				+    parser.add_argument('--no-normalize', action='store_true', help='禁用数字标准化')
			
 
				+    parser.add_argument('--batch-normalize', help='批量标准化指定目录中的Markdown文件')
			
 
				+    
			
 
				+    args = parser.parse_args()
			
 
				+    
			
 
				+    if args.batch_normalize:
			
 
				+        # 批量标准化模式
			
 
				+        batch_normalize_existing_files(args.batch_normalize, args.output)
			
 
				+    elif args.image_path:
			
 
				+        # 单文件OCR模式
			
 
				+        try:
			
 
				+            result = ocr_with_vlm(
			
 
				+                image_path=args.image_path,
			
 
				+                output_dir=args.output,
			
 
				+                temperature=args.temperature,
			
 
				+                max_tokens=args.max_tokens,
			
 
				+                timeout=args.timeout,
			
 
				+                normalize_numbers=not args.no_normalize
			
 
				+            )
			
 
				+            print("\n🎉 OCR识别完成！")
			
 
				+        except Exception as e:
			
 
				+            print(f"❌ OCR识别失败: {e}")
			
 
				+    else:
			
 
				+        # 默认示例
			
 
				+        image_path = "至远彩色印刷工业有限公司-2022年母公司_2.png"
			
 
				+        try:
			
 
				+            result = ocr_with_vlm(image_path)
			
 
				+            print("\n🎉 OCR识别完成！")
			
 
				+        except Exception as e:
			
 
				+            print(f"❌ OCR识别失败: {e}")