|
|
@@ -710,9 +710,40 @@ def find_available_ocr_files(ocr_out_dir: str) -> List[str]:
|
|
|
for json_file in search_dir.rglob("*.json"):
|
|
|
available_files.append(str(json_file))
|
|
|
# 去重并排序
|
|
|
- available_files = sorted(list(set(available_files)))
|
|
|
+ # available_files = sorted(list(set(available_files)))
|
|
|
+ # 解析文件名并提取页码信息
|
|
|
+ file_info = []
|
|
|
+ for file_path in available_files:
|
|
|
+ file_name = Path(file_path).stem
|
|
|
+ # 提取页码 (例如从 "2023年度报告母公司_page_001" 中提取 "001")
|
|
|
+ if 'page_' in file_name:
|
|
|
+ try:
|
|
|
+ page_part = file_name.split('page_')[-1]
|
|
|
+ page_num = int(page_part)
|
|
|
+ file_info.append({
|
|
|
+ 'path': file_path,
|
|
|
+ 'page': page_num,
|
|
|
+ 'display_name': f"第{page_num}页"
|
|
|
+ })
|
|
|
+ except ValueError:
|
|
|
+ # 如果无法解析页码,使用文件名
|
|
|
+ file_info.append({
|
|
|
+ 'path': file_path,
|
|
|
+ 'page': len(file_info) + 1,
|
|
|
+ 'display_name': Path(file_path).stem
|
|
|
+ })
|
|
|
+ else:
|
|
|
+ # 对于没有page_的文件,按顺序编号
|
|
|
+ file_info.append({
|
|
|
+ 'path': file_path,
|
|
|
+ 'page': len(file_info) + 1,
|
|
|
+ 'display_name': Path(file_path).stem
|
|
|
+ })
|
|
|
+
|
|
|
+ # 按页码排序
|
|
|
+ file_info.sort(key=lambda x: x['page'])
|
|
|
|
|
|
- return available_files
|
|
|
+ return file_info
|
|
|
|
|
|
|
|
|
def get_ocr_tool_info(ocr_data: List) -> Dict:
|