فهرست منبع

feat: 移动get_input_files函数导入位置,优化代码结构

zhch158_admin 1 ماه پیش
والد
کامیت
5febd95970
3 فایلهای تغییر یافته به همراه 8 افزوده شده و 77 حذف شده
  1. 4 4
      zhch/ppstructurev3_single_client.py
  2. 4 4
      zhch/ppstructurev3_single_process.py
  3. 0 69
      zhch/ppstructurev3_utils.py

+ 4 - 4
zhch/ppstructurev3_single_client.py

@@ -17,13 +17,13 @@ load_dotenv(override=True)
 
 from utils import (
     collect_pid_files,
+    get_input_files,
 )
 
 from ppstructurev3_utils import (
-   get_input_files,
-   convert_pruned_result_to_json,
-   save_output_images,
-   save_markdown_content
+    convert_pruned_result_to_json,
+    save_output_images,
+    save_markdown_content
 )
 
 def call_api_for_image(image_path: str, api_url: str, timeout: int = 300) -> Dict[str, Any]:

+ 4 - 4
zhch/ppstructurev3_single_process.py

@@ -25,13 +25,13 @@ load_dotenv(override=True)
 
 from utils import (
     collect_pid_files,
+    get_input_files,
 )
 
 from ppstructurev3_utils import (
-   get_input_files,
-   convert_pruned_result_to_json,
-   save_output_images,
-   save_markdown_content
+    convert_pruned_result_to_json,
+    save_output_images,
+    save_markdown_content
 )
 
 def process_images_unified(image_paths: List[str],

+ 0 - 69
zhch/ppstructurev3_utils.py

@@ -9,9 +9,6 @@ from typing import List, Dict, Any, Union
 import numpy as np
 
 from utils import (
-    get_image_files_from_dir,
-    get_image_files_from_list,
-    get_image_files_from_csv,
     load_images_from_pdf,
     normalize_markdown_table
 )
@@ -63,72 +60,6 @@ def convert_pdf_to_images(pdf_file: str, output_dir: str | None = None, dpi: int
         traceback.print_exc()
         return []
 
-def get_input_files(args) -> List[str]:
-    """
-    获取输入文件列表,统一处理PDF和图像文件
-    
-    Args:
-        args: 命令行参数
-        
-    Returns:
-        处理后的图像文件路径列表
-    """
-    input_files = []
-    
-    # 获取原始输入文件
-    if args.input_csv:
-        raw_files = get_image_files_from_csv(args.input_csv, "fail")
-    elif args.input_file_list:
-        raw_files = get_image_files_from_list(args.input_file_list)
-    elif args.input_file:
-        raw_files = [Path(args.input_file).resolve()]
-    else:
-        input_dir = Path(args.input_dir).resolve()
-        if not input_dir.exists():
-            print(f"❌ Input directory does not exist: {input_dir}")
-            return []
-        
-        # 获取所有支持的文件(图像和PDF)
-        image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
-        pdf_extensions = ['.pdf']
-        
-        raw_files = []
-        for ext in image_extensions + pdf_extensions:
-            raw_files.extend(list(input_dir.glob(f"*{ext}")))
-            raw_files.extend(list(input_dir.glob(f"*{ext.upper()}")))
-        
-        raw_files = [str(f) for f in raw_files]
-    
-    # 分别处理PDF和图像文件
-    pdf_count = 0
-    image_count = 0
-    
-    for file_path in raw_files:
-        file_path = Path(file_path)
-        
-        if file_path.suffix.lower() == '.pdf':
-            # 转换PDF为图像
-            print(f"📄 Processing PDF: {file_path.name}")
-            pdf_images = convert_pdf_to_images(
-                str(file_path), 
-                args.output_dir,
-                dpi=args.pdf_dpi
-            )
-            input_files.extend(pdf_images)
-            pdf_count += 1
-        else:
-            # 直接添加图像文件
-            if file_path.exists():
-                input_files.append(str(file_path))
-                image_count += 1
-    
-    print(f"📊 Input summary:")
-    print(f"  PDF files processed: {pdf_count}")
-    print(f"  Image files found: {image_count}")
-    print(f"  Total image files to process: {len(input_files)}")
-    
-    return sorted(list(set(str(f) for f in input_files)))
-
 def convert_pruned_result_to_json(pruned_result: Dict[str, Any], 
                               input_image_path: str, 
                               output_dir: str,