|
|
@@ -117,7 +117,7 @@ class MinerUPaddleOCRMerger:
|
|
|
|
|
|
merged_data.append(merged_item)
|
|
|
|
|
|
- elif item['type'] in ['text', 'header']:
|
|
|
+ elif item['type'] in ['text', 'title']:
|
|
|
# 处理普通文本
|
|
|
merged_item = item.copy()
|
|
|
text = item.get('text', '')
|
|
|
@@ -128,14 +128,33 @@ class MinerUPaddleOCRMerger:
|
|
|
)
|
|
|
|
|
|
if matched_bbox:
|
|
|
- merged_item['bbox'] = matched_bbox['bbox']
|
|
|
- merged_item['bbox_source'] = 'paddle_ocr'
|
|
|
- merged_item['text_score'] = matched_bbox['score']
|
|
|
+ # merged_item['bbox'] = matched_bbox['bbox']
|
|
|
+ # merged_item['bbox_source'] = 'paddle_ocr'
|
|
|
+ # merged_item['text_score'] = matched_bbox['score']
|
|
|
+
|
|
|
+ # 沿用mineru的bbox, 就是要移动位置paddle_pointer, last_matched_index
|
|
|
# 标记为已使用
|
|
|
matched_bbox['used'] = True
|
|
|
|
|
|
merged_data.append(merged_item)
|
|
|
-
|
|
|
+ elif item['type'] == 'list':
|
|
|
+ # 处理列表项
|
|
|
+ merged_item = item.copy()
|
|
|
+ list_items = item.get('list_items', [])
|
|
|
+ sub_type = item.get('sub_type', 'unordered') # 有序或无序
|
|
|
+
|
|
|
+ for list_item in list_items:
|
|
|
+ # 查找匹配的 bbox
|
|
|
+ matched_bbox, paddle_pointer, last_matched_index = self._find_matching_bbox(
|
|
|
+ list_item, paddle_text_boxes, paddle_pointer, last_matched_index
|
|
|
+ )
|
|
|
+
|
|
|
+ if matched_bbox:
|
|
|
+ # 沿用mineru的bbox, 就是要移动位置paddle_pointer, last_matched_index
|
|
|
+ # 标记为已使用
|
|
|
+ matched_bbox['used'] = True
|
|
|
+
|
|
|
+ merged_data.append(merged_item)
|
|
|
else:
|
|
|
# 其他类型直接复制
|
|
|
merged_data.append(item.copy())
|
|
|
@@ -218,6 +237,11 @@ class MinerUPaddleOCRMerger:
|
|
|
|
|
|
# 由于minerU和Paddle的顺序基本一致, 也有不一致的地方, 所以需要向前找第一个未使用的位置
|
|
|
# MinerU和Paddle都可能识别错误,所以需要一个look_ahead_window来避免漏掉匹配
|
|
|
+ # 匹配时会遇到一些特殊情况,比如Paddle把两个连着的cell识别为一个字符串,MinerU将单元格上下2行识别为一行
|
|
|
+ # '1|2024-08-11|扫二维码付' minerU识别为“扫二维码付款”,Paddle识别为'12024-08-11扫二维码付'
|
|
|
+ # 款
|
|
|
+ # 字符串的顺序极大概率是一致的,所以如果短字符串是长字符串的子串,可以增加相似权重
|
|
|
+
|
|
|
search_start = last_match_index - 1
|
|
|
unused_count = 0
|
|
|
while search_start >= 0:
|
|
|
@@ -238,6 +262,12 @@ class MinerUPaddleOCRMerger:
|
|
|
continue
|
|
|
|
|
|
box_text = self._normalize_text(text_boxes[i]['text'])
|
|
|
+ # 精确匹配优先
|
|
|
+ if target_text == box_text:
|
|
|
+ if i >= start_index:
|
|
|
+ return text_boxes[i], i + 1, i
|
|
|
+ else:
|
|
|
+ return text_boxes[i], start_index, i
|
|
|
|
|
|
# 过滤过短的候选文本(避免单字符匹配)
|
|
|
if len(box_text) < 2:
|
|
|
@@ -247,21 +277,20 @@ class MinerUPaddleOCRMerger:
|
|
|
length_ratio = min(len(target_text), len(box_text)) / max(len(target_text), len(box_text))
|
|
|
if length_ratio < 0.3: # 长度差异超过70%则跳过
|
|
|
continue
|
|
|
-
|
|
|
- # 精确匹配优先
|
|
|
- if target_text == box_text:
|
|
|
- if i >= start_index:
|
|
|
- return text_boxes[i], i + 1, i
|
|
|
- else:
|
|
|
- return text_boxes[i], start_index, i
|
|
|
-
|
|
|
+
|
|
|
+ # 子串检查
|
|
|
+ shorter = target_text if len(target_text) < len(box_text) else box_text
|
|
|
+ longer = box_text if len(target_text) < len(box_text) else target_text
|
|
|
+ is_substring = shorter in longer
|
|
|
+
|
|
|
# 计算多种相似度
|
|
|
+ # token_sort_ratio = fuzz.token_sort_ratio(target_text, box_text)
|
|
|
partial_ratio = fuzz.partial_ratio(target_text, box_text)
|
|
|
- token_sort_ratio = fuzz.token_sort_ratio(target_text, box_text)
|
|
|
+ if is_substring:
|
|
|
+ partial_ratio += 10 # 子串时提升相似度
|
|
|
|
|
|
# Similarity check - only partial_ratio (including the substring bonus) has to reach the threshold
|
|
|
- if (partial_ratio >= self.similarity_threshold and
|
|
|
- token_sort_ratio >= 50): # token_sort 阈值稍低
|
|
|
+ if (partial_ratio >= self.similarity_threshold):
|
|
|
return text_boxes[i], start_index, last_match_index
|
|
|
|
|
|
return best_match, best_index, last_match_index
|
|
|
@@ -290,10 +319,12 @@ class MinerUPaddleOCRMerger:
|
|
|
output_path: Optional[str] = None, mineru_file: Optional[str] = None) -> str:
|
|
|
"""
|
|
|
生成增强的 Markdown(包含 bbox 信息的注释)
|
|
|
+ 参考 MinerU 的实现,支持标题、列表、表格标题等
|
|
|
|
|
|
Args:
|
|
|
merged_data: 合并后的数据
|
|
|
output_path: 输出路径(可选)
|
|
|
+ mineru_file: MinerU 源文件路径(用于复制图片)
|
|
|
|
|
|
Returns:
|
|
|
Markdown 内容
|
|
|
@@ -301,49 +332,145 @@ class MinerUPaddleOCRMerger:
|
|
|
md_lines = []
|
|
|
|
|
|
for item in merged_data:
|
|
|
- if item['type'] == 'header':
|
|
|
- text = item.get('text', '')
|
|
|
- bbox = item.get('bbox', [])
|
|
|
+ item_type = item.get('type', '')
|
|
|
+ bbox = item.get('bbox', [])
|
|
|
+
|
|
|
+ # 添加 bbox 注释
|
|
|
+ if bbox:
|
|
|
md_lines.append(f"<!-- bbox: {bbox} -->")
|
|
|
- md_lines.append(f"# {text}\n")
|
|
|
|
|
|
- elif item['type'] == 'text':
|
|
|
+ # 根据类型处理
|
|
|
+ if item_type == 'title':
|
|
|
+ # 标题 - 使用 text_level 确定标题级别
|
|
|
text = item.get('text', '')
|
|
|
- bbox = item.get('bbox', [])
|
|
|
- if bbox:
|
|
|
- md_lines.append(f"<!-- bbox: {bbox} -->")
|
|
|
- md_lines.append(f"{text}\n")
|
|
|
+ text_level = item.get('text_level', 1)
|
|
|
+ heading = '#' * min(text_level, 6) # 最多6级标题
|
|
|
+ md_lines.append(f"{heading} {text}\n")
|
|
|
|
|
|
- elif item['type'] == 'table':
|
|
|
- md_lines.append("<!-- 表格单元格包含 data-bbox 属性 -->\n")
|
|
|
- md_lines.append(item.get('table_body_with_bbox', item.get('table_body', '')))
|
|
|
- md_lines.append("\n")
|
|
|
+ elif item_type == 'text':
|
|
|
+ # 普通文本 - 可能也有 text_level
|
|
|
+ text = item.get('text', '')
|
|
|
+ text_level = item.get('text_level', 0)
|
|
|
+
|
|
|
+ if text_level > 0:
|
|
|
+ # 作为标题处理
|
|
|
+ heading = '#' * min(text_level, 6)
|
|
|
+ md_lines.append(f"{heading} {text}\n")
|
|
|
+ else:
|
|
|
+ # 普通段落
|
|
|
+ md_lines.append(f"{text}\n")
|
|
|
+
|
|
|
+ elif item_type == 'list':
|
|
|
+ # 列表
|
|
|
+ sub_type = item.get('sub_type', 'text')
|
|
|
+ list_items = item.get('list_items', [])
|
|
|
+
|
|
|
+ for list_item in list_items:
|
|
|
+ md_lines.append(f"{list_item}\n")
|
|
|
+
|
|
|
+ md_lines.append("") # 列表后添加空行
|
|
|
+
|
|
|
+ elif item_type == 'table':
|
|
|
+ # 表格标题
|
|
|
+ table_caption = item.get('table_caption', [])
|
|
|
+ if table_caption:
|
|
|
+ for caption in table_caption:
|
|
|
+ if caption: # 跳过空标题
|
|
|
+ md_lines.append(f"**{caption}**\n")
|
|
|
+
|
|
|
+ # 表格内容
|
|
|
+ table_body = item.get('table_body_with_bbox', item.get('table_body', ''))
|
|
|
+ if table_body:
|
|
|
+ md_lines.append(table_body)
|
|
|
+ md_lines.append("")
|
|
|
+
|
|
|
+ # 表格脚注
|
|
|
+ table_footnote = item.get('table_footnote', [])
|
|
|
+ if table_footnote:
|
|
|
+ for footnote in table_footnote:
|
|
|
+ if footnote:
|
|
|
+ md_lines.append(f"*{footnote}*")
|
|
|
+ md_lines.append("")
|
|
|
|
|
|
- elif item['type'] == 'image':
|
|
|
+ elif item_type == 'image':
|
|
|
+ # 图片
|
|
|
img_path = item.get('img_path', '')
|
|
|
- # 需要将minerU图像路径下的图片拷贝到输出目录
|
|
|
- if img_path and mineru_file:
|
|
|
+
|
|
|
+ # 复制图片到输出目录
|
|
|
+ if img_path and mineru_file and output_path:
|
|
|
mineru_dir = Path(mineru_file).parent
|
|
|
img_full_path = mineru_dir / img_path
|
|
|
if img_full_path.exists():
|
|
|
- # 需要将图片拷贝到输出目录
|
|
|
output_img_path = Path(output_path).parent / img_path
|
|
|
output_img_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
shutil.copy(img_full_path, output_img_path)
|
|
|
-
|
|
|
- bbox = item.get('bbox', [])
|
|
|
- if bbox:
|
|
|
- md_lines.append(f"<!-- bbox: {bbox} -->")
|
|
|
+
|
|
|
+ # 图片标题
|
|
|
+ image_caption = item.get('image_caption', [])
|
|
|
+ if image_caption:
|
|
|
+ for caption in image_caption:
|
|
|
+ if caption:
|
|
|
+ md_lines.append(f"**{caption}**\n")
|
|
|
+
|
|
|
+ # 插入图片
|
|
|
md_lines.append(f"\n")
|
|
|
+
|
|
|
+ # 图片脚注
|
|
|
+ image_footnote = item.get('image_footnote', [])
|
|
|
+ if image_footnote:
|
|
|
+ for footnote in image_footnote:
|
|
|
+ if footnote:
|
|
|
+ md_lines.append(f"*{footnote}*")
|
|
|
+ md_lines.append("")
|
|
|
+
|
|
|
+ elif item_type == 'equation':
|
|
|
+ # 公式
|
|
|
+ latex = item.get('latex', '')
|
|
|
+ if latex:
|
|
|
+ md_lines.append(f"$$\n{latex}\n$$\n")
|
|
|
+
|
|
|
+ elif item_type == 'inline_equation':
|
|
|
+ # 行内公式
|
|
|
+ latex = item.get('latex', '')
|
|
|
+ if latex:
|
|
|
+ md_lines.append(f"${latex}$\n")
|
|
|
+
|
|
|
+ elif item_type == 'page_number':
|
|
|
+ # 页码 - 通常跳过或作为注释
|
|
|
+ text = item.get('text', '')
|
|
|
+ md_lines.append(f"<!-- 页码: {text} -->\n")
|
|
|
+
|
|
|
+ elif item_type == 'header':
|
|
|
+ # 页眉
|
|
|
+ text = item.get('text', '')
|
|
|
+ md_lines.append(f"<!-- 页眉: {text} -->\n")
|
|
|
+
|
|
|
+ elif item_type == 'footer':
|
|
|
+ # 页脚
|
|
|
+ text = item.get('text', '')
|
|
|
+ if text:
|
|
|
+ md_lines.append(f"<!-- 页脚: {text} -->\n")
|
|
|
+
|
|
|
+ elif item_type == 'reference':
|
|
|
+ # 参考文献
|
|
|
+ text = item.get('text', '')
|
|
|
+ md_lines.append(f"> {text}\n")
|
|
|
+
|
|
|
+ else:
|
|
|
+ # 未知类型 - 尝试提取文本
|
|
|
+ text = item.get('text', '')
|
|
|
+ if text:
|
|
|
+ md_lines.append(f"{text}\n")
|
|
|
|
|
|
markdown_content = '\n'.join(md_lines)
|
|
|
|
|
|
+ # 保存文件
|
|
|
if output_path:
|
|
|
with open(output_path, 'w', encoding='utf-8') as f:
|
|
|
f.write(markdown_content)
|
|
|
|
|
|
return markdown_content
|
|
|
-
|
|
|
+
|
|
|
def extract_table_cells_with_bbox(self, merged_data: List[Dict]) -> List[Dict]:
|
|
|
"""
|
|
|
提取所有表格单元格及其 bbox 信息
|
|
|
@@ -581,7 +708,7 @@ def main():
|
|
|
algo_group.add_argument(
|
|
|
'-t', '--threshold',
|
|
|
type=int,
|
|
|
- default=85,
|
|
|
+ default=80,
|
|
|
help='文本相似度阈值(0-100,默认: 80)'
|
|
|
)
|
|
|
|
|
|
@@ -662,9 +789,9 @@ if __name__ == "__main__":
|
|
|
|
|
|
# 默认配置
|
|
|
default_config = {
|
|
|
- "mineru-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/mineru-vlm-2.5.3_Results/B用户_扫描流水_page_001.json",
|
|
|
- "paddle-file": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/data_PPStructureV3_Results/B用户_扫描流水_page_001.json",
|
|
|
- "output-dir": "/Users/zhch158/workspace/data/流水分析/B用户_扫描流水/merged_results",
|
|
|
+ "mineru-file": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/mineru-vlm-2.5.3_Results/A用户_单元格扫描流水_page_001.json",
|
|
|
+ "paddle-file": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/data_PPStructureV3_Results/A用户_单元格扫描流水_page_001.json",
|
|
|
+ "output-dir": "/Users/zhch158/workspace/data/流水分析/A用户_单元格扫描流水/merged_results",
|
|
|
# "mineru-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/mineru-vlm-2.5.3_Results",
|
|
|
# "paddle-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/data_PPStructureV3_Results",
|
|
|
# "output-dir": "/Users/zhch158/workspace/data/流水分析/德_内蒙古银行照/merged_results",
|