# zhengchun/ocr_platform
"""PaddleX common utility functions."""
import json
import traceback
import warnings
import base64
from pathlib import Path
from PIL import Image
from typing import List, Dict, Any, Union
import numpy as np

# Make ocr_utils importable: put the directory two levels above this file's
# own directory at the front of sys.path, unless it is already listed.
import sys
ocr_platform_root = Path(__file__).parents[2]
if str(ocr_platform_root) not in sys.path:
    sys.path.insert(0, str(ocr_platform_root))

from ocr_utils import (
    normalize_markdown_table,
    normalize_financial_numbers
)

# Note: load_images_from_pdf is no longer needed here, because PDF-to-image
# conversion is handled centrally by ocr_utils.get_input_files().


def _count_char_changes(original: str, normalized: str) -> int:
    """Return how many characters differ between ``original`` and ``normalized``.

    Characters are compared position by position over the shared length and
    the length difference is added on top, so the result is non-zero whenever
    the two strings are not equal.
    """
    positional = sum(1 for old, new in zip(original, normalized) if old != new)
    return positional + abs(len(original) - len(normalized))


def _value_or(mapping: Dict[str, Any], key: str, default: Any) -> Any:
    """Return ``mapping[key]``, or ``default`` when the key is missing or None."""
    value = mapping.get(key)
    return default if value is None else value


def convert_pruned_result_to_json(pruned_result: Dict[str, Any],
                                  input_image_path: str,
                                  output_dir: str,
                                  filename: str,
                                  normalize_numbers: bool = True) -> tuple[str, Dict[str, Any]]:
    """
    Convert an API result into the standard JSON layout and write it to disk.

    The result is written to ``<output_dir>/<filename>.json``. When
    ``normalize_numbers`` is true, table blocks in ``parsing_res_list`` and
    every ``pred_html`` in ``table_res_list`` are passed through
    ``normalize_markdown_table``; if anything changed, the pre-normalization
    version is also written to ``<output_dir>/<filename>_original.json``.

    Args:
        pruned_result: Result dictionary returned by the API.
        input_image_path: Stored as the top-level ``input_path`` value.
        output_dir: Directory for the JSON file(s); created if missing.
        filename: Output file name without extension.
        normalize_numbers: Whether to run the normalization step.

    Returns:
        ``(output_directory, converted_json)``. Note that the first element is
        the resolved output *directory*, not the JSON file path. An empty
        ``pruned_result`` yields ``("", {})`` and writes nothing.

    Note:
        The item dictionaries inside ``parsing_res_list`` / ``table_res_list``
        are shared with ``pruned_result``, so normalization also updates the
        caller's dictionaries in place.
    """
    # Function-scope import keeps this block self-contained.
    import copy

    if not pruned_result:
        return "", {}

    # A sub-result that is present but None is treated like a missing one.
    doc_preprocessor = _value_or(pruned_result, 'doc_preprocessor_res', {})
    layout_det = _value_or(pruned_result, 'layout_det_res', {})
    overall_ocr = _value_or(pruned_result, 'overall_ocr_res', {})

    # Build the standard-format JSON structure.
    converted_json = {
        "input_path": input_image_path,
        "page_index": None,
        "model_settings": pruned_result.get('model_settings', {}),
        "parsing_res_list": _value_or(pruned_result, 'parsing_res_list', []),
        "doc_preprocessor_res": {
            "input_path": None,
            "page_index": None,
            "model_settings": doc_preprocessor.get('model_settings', {}),
            "angle": doc_preprocessor.get('angle', 0)
        },
        "layout_det_res": {
            "input_path": None,
            "page_index": None,
            "boxes": layout_det.get('boxes', [])
        },
        "overall_ocr_res": {
            "input_path": None,
            "page_index": None,
            "model_settings": overall_ocr.get('model_settings', {}),
            "dt_polys": overall_ocr.get('dt_polys', []),
            "text_det_params": overall_ocr.get('text_det_params', {}),
            "text_type": overall_ocr.get('text_type', 'general'),
            "textline_orientation_angles": overall_ocr.get('textline_orientation_angles', []),
            "text_rec_score_thresh": overall_ocr.get('text_rec_score_thresh', 0.0),
            "return_word_box": overall_ocr.get('return_word_box', False),
            "rec_texts": overall_ocr.get('rec_texts', []),
            "rec_scores": overall_ocr.get('rec_scores', []),
            "rec_polys": overall_ocr.get('rec_polys', []),
            "rec_boxes": overall_ocr.get('rec_boxes', [])
        },
        "table_res_list": _value_or(pruned_result, 'table_res_list', [])
    }

    changes_count = 0
    original_json = None

    if normalize_numbers:
        # Deep copy: the loops below modify list items in place, and a shallow
        # copy would share those items and lose the original text.
        original_json = copy.deepcopy(converted_json)

        # 1. Normalize table blocks in parsing_res_list (other block types
        #    are left untouched here).
        for item in converted_json['parsing_res_list']:
            if 'block_content' not in item or item.get('block_label') != 'table':
                continue
            original_content = item['block_content']
            normalized_content = normalize_markdown_table(original_content)
            if normalized_content != original_content:
                item['block_content'] = normalized_content
                changes_count += _count_char_changes(original_content, normalized_content)

        # 2. Normalize the HTML tables in table_res_list.
        for table_item in converted_json['table_res_list']:
            if 'pred_html' not in table_item:
                continue
            original_html = table_item['pred_html']
            normalized_html = normalize_markdown_table(original_html)
            if normalized_html != original_html:
                table_item['pred_html'] = normalized_html
                changes_count += _count_char_changes(original_html, normalized_html)

        # Compare the number of tables reported by the two lists.
        parsing_res_tables_count = sum(
            1 for item in converted_json['parsing_res_list']
            if item.get('block_label') == 'table'
        )
        table_res_list_count = len(converted_json['table_res_list'])
        # The flag only records that a mismatch was seen; nothing is repaired here.
        table_consistency_fixed = False
        if parsing_res_tables_count != table_res_list_count:
            warnings.warn(f"⚠️ Warning: {filename} Table count mismatch - parsing_res_list has {parsing_res_tables_count} tables, "
                          f"but table_res_list has {table_res_list_count} tables.")
            table_consistency_fixed = True

        converted_json['processing_info'] = {
            "normalize_numbers": normalize_numbers,
            "changes_applied": changes_count > 0,
            "character_changes_count": changes_count,
            "parsing_res_tables_count": parsing_res_tables_count,
            "table_res_list_count": table_res_list_count,
            "table_consistency_fixed": table_consistency_fixed
        }
    else:
        converted_json['processing_info'] = {
            "normalize_numbers": False,
            "changes_applied": False,
            "character_changes_count": 0
        }

    # Write the converted JSON.
    output_path = Path(output_dir).resolve()
    output_path.mkdir(parents=True, exist_ok=True)

    json_file_path = output_path / f"{filename}.json"
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(converted_json, f, ensure_ascii=False, indent=2)

    # Keep the pre-normalization version next to it for comparison.
    if original_json is not None and changes_count > 0:
        original_output_path = output_path / f"{filename}_original.json"
        with open(original_output_path, 'w', encoding='utf-8') as f:
            json.dump(original_json, f, ensure_ascii=False, indent=2)

    return str(output_path), converted_json

def save_image(image: Union[Image.Image, str, np.ndarray], output_path: str) -> str:
    """
    Write a single image to ``output_path``.

    Args:
        image: A PIL image, a base64-encoded string, or a numpy array.
        output_path: Destination file path.

    Returns:
        ``output_path`` as a string on success, or an empty string when
        saving fails (the error is printed instead of being raised).
    """
    try:
        if isinstance(image, str):
            # Base64 text: decode first, then write the raw bytes unchanged.
            decoded = base64.b64decode(image)
            with open(output_path, 'wb') as sink:
                sink.write(decoded)
        elif isinstance(image, (Image.Image, np.ndarray)):
            # Arrays are wrapped in a PIL image; PIL images are saved as they are.
            picture = image if isinstance(image, Image.Image) else Image.fromarray(image)
            picture.save(output_path)
        else:
            raise ValueError(f"Unsupported image type: {type(image)}")
    except Exception as e:
        print(f"❌ Error saving image {output_path}: {e}")
        return ""

    return str(output_path)

def save_output_images(output_images: Dict[str, Any], output_dir: str, output_filename: str) -> Dict[str, str]:
    """
    Save the output images returned by the API.

    Each entry is written to ``<output_dir>/<output_filename>_<name>.jpg``
    through ``save_image``.

    Args:
        output_images: Mapping of image name to image data (any type accepted
            by ``save_image``).
        output_dir: Output directory; created if missing.
        output_filename: Prefix used for every generated file name.

    Returns:
        Mapping of image name to saved file path. Only images that were
        actually written are included; an empty or missing ``output_images``
        yields ``{}`` without touching the file system.
    """
    if not output_images:
        return {}

    output_path = Path(output_dir).resolve()
    output_path.mkdir(parents=True, exist_ok=True)

    saved_images = {}

    for img_name, img_data in output_images.items():
        try:
            img_path = output_path / f"{output_filename}_{img_name}.jpg"
            # save_image reports failure by returning an empty string rather
            # than raising, so its return value decides whether to record it.
            saved_path = save_image(img_data, str(img_path))
            if saved_path:
                saved_images[img_name] = saved_path
            else:
                print(f"   Image data type: {type(img_data)}")
                if hasattr(img_data, 'shape'):
                    print(f"   Image shape: {img_data.shape}")

        except Exception as e:
            # Safety net for anything unexpected; one bad image must not
            # prevent the remaining ones from being saved.
            print(f"❌ Error saving image {img_name}: {e}")
            print(f"   Image data type: {type(img_data)}")
            if hasattr(img_data, 'shape'):
                print(f"   Image shape: {img_data.shape}")
            traceback.print_exc()

    return saved_images

def save_markdown_content(markdown_data: Dict[str, Any], output_dir: str,
                          filename: str, normalize_numbers: bool = True,
                          key_text: str = 'text', key_images: str = 'images',
                          json_data: Union[Dict[str, Any], None] = None) -> str:
    """
    Save Markdown content, with optional number normalization.

    When ``json_data`` is given, the work is delegated to
    ``save_markdown_content_enhanced`` and its result is returned. Otherwise
    the text under ``key_text`` is written to ``<output_dir>/<filename>.md``
    and the images under ``key_images`` are saved relative to ``output_dir``.
    If normalization changed the text, the unmodified text is also written to
    ``<output_dir>/<filename>_original.md``.

    Args:
        markdown_data: Mapping holding the Markdown text and its images.
        output_dir: Output directory; created if missing.
        filename: Output file name without extension.
        normalize_numbers: Whether to run ``normalize_markdown_table`` on the text.
        key_text: Key of the Markdown text inside ``markdown_data``.
        key_images: Key of the ``{relative_path: image}`` mapping.
        json_data: Optional structured result; takes priority when provided.

    Returns:
        Path of the written Markdown file, or ``""`` when both
        ``markdown_data`` and ``json_data`` are empty.
    """
    if not markdown_data and not json_data:
        return ""

    output_path = Path(output_dir).resolve()
    output_path.mkdir(parents=True, exist_ok=True)

    # Prefer json_data so the complete content is generated from it.
    # This returns before the image loop below, so entries under key_images
    # are not written on this path. NOTE(review): confirm that is intended.
    if json_data:
        return save_markdown_content_enhanced(json_data, str(output_path), filename, normalize_numbers)

    # A missing or None text is treated as empty.
    original_markdown_text = markdown_data.get(key_text) or ''
    markdown_text = original_markdown_text

    if normalize_numbers and markdown_text:
        markdown_text = normalize_markdown_table(markdown_text)

    # Compare the strings directly: a per-position diff stops at the shorter
    # string and would miss changes that only affect the length.
    content_changed = markdown_text != original_markdown_text

    md_file_path = output_path / f"{filename}.md"
    with open(md_file_path, 'w', encoding='utf-8') as f:
        f.write(markdown_text)

    # Keep the unmodified text next to it for comparison.
    if content_changed:
        original_output_path = output_path / f"{filename}_original.md"
        with open(original_output_path, 'w', encoding='utf-8') as f:
            f.write(original_markdown_text)

    # Save the images referenced by the Markdown; a missing or None mapping
    # means there is nothing to save.
    markdown_images = markdown_data.get(key_images) or {}
    for img_path, img_data in markdown_images.items():
        try:
            # NOTE(review): img_path is used as given; an absolute path or
            # '..' components would place the file outside output_dir.
            full_img_path = output_path / img_path
            full_img_path.parent.mkdir(parents=True, exist_ok=True)
            save_image(img_data, str(full_img_path))

        except Exception as e:
            print(f"❌ Error saving Markdown image {img_path}: {e}")

    return str(md_file_path)

def save_markdown_content_enhanced(json_data: Dict[str, Any], output_dir: str, 
                         filename: str, normalize_numbers: bool = True) -> str:
    """
    Build a Markdown file from both ``parsing_res_list`` and ``table_res_list``.

    Blocks are emitted in ``parsing_res_list`` order, joined by blank lines.
    Each table block takes the next ``pred_html`` from ``table_res_list``
    (falling back to its own content) and is wrapped in a centered ``<div>``.
    Entries of ``table_res_list`` that were not consumed are appended at the
    end when they carry non-empty HTML.

    Returns the path of the written ``<filename>.md``, or ``""`` when
    ``json_data`` is empty.
    """
    if not json_data:
        return ""

    target_dir = Path(output_dir).resolve()
    target_dir.mkdir(parents=True, exist_ok=True)

    blocks = json_data.get('parsing_res_list', [])
    tables = json_data.get('table_res_list', [])

    def render_table(html):
        # Optionally normalize, then wrap the table so it is displayed centered.
        if normalize_numbers:
            html = normalize_markdown_table(html)
        return f'<div style="text-align: center;">{html}</div>'

    sections = []
    consumed = 0  # how many table_res_list entries have been used so far

    for block in blocks:
        label = block.get('block_label', '')
        content = block.get('block_content', '')

        if label != 'table':
            # Non-table content is added as is, apart from number normalization.
            if normalize_numbers:
                content = normalize_financial_numbers(content)
            sections.append(content)
            continue

        # Prefer the detailed HTML from table_res_list while entries remain;
        # otherwise keep the block's own content.
        if consumed < len(tables):
            content = tables[consumed].get('pred_html', content)
            consumed += 1
        sections.append(render_table(content))

    # Append tables that table_res_list has beyond those in parsing_res_list.
    leftovers = tables[consumed:]
    for extra in leftovers:
        extra_html = extra.get('pred_html', '')
        if extra_html:
            sections.append(render_table(extra_html))

    md_file_path = target_dir / f"{filename}.md"
    with open(md_file_path, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(sections))

    parsed_table_total = sum(1 for item in blocks if item.get('block_label') == 'table')
    print(f"📄 Enhanced Markdown saved: {md_file_path}")
    print(f"   - parsing_res_list tables: {parsed_table_total}")
    print(f"   - table_res_list tables: {len(tables)}")
    print(f"   - remaining tables added: {len(leftovers)}")

    return str(md_file_path)