|
|
@@ -1,7 +1,10 @@
|
|
|
import copy
|
|
|
|
|
|
+from loguru import logger
|
|
|
+
|
|
|
from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
|
|
|
from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
|
|
+from magic_pdf.libs.language import detect_lang
|
|
|
|
|
|
LINE_STOP_FLAG = (
|
|
|
'.',
|
|
|
@@ -125,6 +128,9 @@ def __is_list_or_index_block(block):
|
|
|
|
|
|
# 添加所有文本,包括空行,保持与block['lines']长度一致
|
|
|
lines_text_list.append(line_text)
|
|
|
+ block_text = ''.join(lines_text_list)
|
|
|
+ block_lang = detect_lang(block_text)
|
|
|
+ # logger.info(f"block_lang: {block_lang}")
|
|
|
|
|
|
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
|
|
|
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
|
|
|
@@ -136,13 +142,16 @@ def __is_list_or_index_block(block):
|
|
|
if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height:
|
|
|
right_close_num += 1
|
|
|
else:
|
|
|
- # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
|
|
|
- # block宽的阈值可以小些,block窄的阈值要大
|
|
|
-
|
|
|
- if block_weight_radio >= 0.5:
|
|
|
+ # 类中文没有超长单词的情况,可以用统一的阈值
|
|
|
+ if block_lang in ['zh', 'ja', 'ko']:
|
|
|
closed_area = 0.26 * block_weight
|
|
|
else:
|
|
|
- closed_area = 0.36 * block_weight
|
|
|
+ # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
|
|
|
+ # block宽的阈值可以小些,block窄的阈值要大
|
|
|
+ if block_weight_radio >= 0.5:
|
|
|
+ closed_area = 0.26 * block_weight
|
|
|
+ else:
|
|
|
+ closed_area = 0.36 * block_weight
|
|
|
if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
|
|
|
right_not_close_num += 1
|
|
|
|