|
|
@@ -9,9 +9,6 @@ from typing import List, Dict, Any, Union
|
|
|
import numpy as np
|
|
|
|
|
|
from utils import (
|
|
|
- get_image_files_from_dir,
|
|
|
- get_image_files_from_list,
|
|
|
- get_image_files_from_csv,
|
|
|
load_images_from_pdf,
|
|
|
normalize_markdown_table
|
|
|
)
|
|
|
@@ -63,72 +60,6 @@ def convert_pdf_to_images(pdf_file: str, output_dir: str | None = None, dpi: int
|
|
|
traceback.print_exc()
|
|
|
return []
|
|
|
|
|
|
-def get_input_files(args) -> List[str]:
|
|
|
- """
|
|
|
- 获取输入文件列表,统一处理PDF和图像文件
|
|
|
-
|
|
|
- Args:
|
|
|
- args: 命令行参数
|
|
|
-
|
|
|
- Returns:
|
|
|
- 处理后的图像文件路径列表
|
|
|
- """
|
|
|
- input_files = []
|
|
|
-
|
|
|
- # 获取原始输入文件
|
|
|
- if args.input_csv:
|
|
|
- raw_files = get_image_files_from_csv(args.input_csv, "fail")
|
|
|
- elif args.input_file_list:
|
|
|
- raw_files = get_image_files_from_list(args.input_file_list)
|
|
|
- elif args.input_file:
|
|
|
- raw_files = [Path(args.input_file).resolve()]
|
|
|
- else:
|
|
|
- input_dir = Path(args.input_dir).resolve()
|
|
|
- if not input_dir.exists():
|
|
|
- print(f"❌ Input directory does not exist: {input_dir}")
|
|
|
- return []
|
|
|
-
|
|
|
- # 获取所有支持的文件(图像和PDF)
|
|
|
- image_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']
|
|
|
- pdf_extensions = ['.pdf']
|
|
|
-
|
|
|
- raw_files = []
|
|
|
- for ext in image_extensions + pdf_extensions:
|
|
|
- raw_files.extend(list(input_dir.glob(f"*{ext}")))
|
|
|
- raw_files.extend(list(input_dir.glob(f"*{ext.upper()}")))
|
|
|
-
|
|
|
- raw_files = [str(f) for f in raw_files]
|
|
|
-
|
|
|
- # 分别处理PDF和图像文件
|
|
|
- pdf_count = 0
|
|
|
- image_count = 0
|
|
|
-
|
|
|
- for file_path in raw_files:
|
|
|
- file_path = Path(file_path)
|
|
|
-
|
|
|
- if file_path.suffix.lower() == '.pdf':
|
|
|
- # 转换PDF为图像
|
|
|
- print(f"📄 Processing PDF: {file_path.name}")
|
|
|
- pdf_images = convert_pdf_to_images(
|
|
|
- str(file_path),
|
|
|
- args.output_dir,
|
|
|
- dpi=args.pdf_dpi
|
|
|
- )
|
|
|
- input_files.extend(pdf_images)
|
|
|
- pdf_count += 1
|
|
|
- else:
|
|
|
- # 直接添加图像文件
|
|
|
- if file_path.exists():
|
|
|
- input_files.append(str(file_path))
|
|
|
- image_count += 1
|
|
|
-
|
|
|
- print(f"📊 Input summary:")
|
|
|
- print(f" PDF files processed: {pdf_count}")
|
|
|
- print(f" Image files found: {image_count}")
|
|
|
- print(f" Total image files to process: {len(input_files)}")
|
|
|
-
|
|
|
- return sorted(list(set(str(f) for f in input_files)))
|
|
|
-
|
|
|
def convert_pruned_result_to_json(pruned_result: Dict[str, Any],
|
|
|
input_image_path: str,
|
|
|
output_dir: str,
|