|
@@ -4,8 +4,8 @@ import statistics
|
|
|
import time
|
|
import time
|
|
|
from typing import List
|
|
from typing import List
|
|
|
|
|
|
|
|
-import torch
|
|
|
|
|
import fitz
|
|
import fitz
|
|
|
|
|
+import torch
|
|
|
from loguru import logger
|
|
from loguru import logger
|
|
|
|
|
|
|
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
|
@@ -16,17 +16,13 @@ from magic_pdf.libs.clean_memory import clean_memory
|
|
|
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
|
|
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
|
|
|
from magic_pdf.libs.convert_utils import dict_to_list
|
|
from magic_pdf.libs.convert_utils import dict_to_list
|
|
|
from magic_pdf.libs.hash_utils import compute_md5
|
|
from magic_pdf.libs.hash_utils import compute_md5
|
|
|
-
|
|
|
|
|
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
|
|
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
|
|
|
from magic_pdf.model.magic_model import MagicModel
|
|
from magic_pdf.model.magic_model import MagicModel
|
|
|
|
|
|
|
|
-os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
|
|
|
|
|
-os.environ['YOLO_VERBOSE'] = 'False' # disable yolo logger
|
|
|
|
|
-
|
|
|
|
|
try:
|
|
try:
|
|
|
import torchtext
|
|
import torchtext
|
|
|
|
|
|
|
|
- if torchtext.__version__ >= "0.18.0":
|
|
|
|
|
|
|
+ if torchtext.__version__ >= '0.18.0':
|
|
|
torchtext.disable_torchtext_deprecation_warning()
|
|
torchtext.disable_torchtext_deprecation_warning()
|
|
|
except ImportError:
|
|
except ImportError:
|
|
|
pass
|
|
pass
|
|
@@ -39,6 +35,9 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layo
|
|
|
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
|
|
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
|
|
|
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
|
|
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
|
|
|
|
|
|
|
|
|
|
+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
|
|
|
|
|
+os.environ['YOLO_VERBOSE'] = 'False' # disable yolo logger
|
|
|
|
|
+
|
|
|
|
|
|
|
|
def __replace_STX_ETX(text_str: str):
|
|
def __replace_STX_ETX(text_str: str):
|
|
|
"""Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
|
|
"""Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
|
|
@@ -90,7 +89,10 @@ def chars_to_content(span):
|
|
|
|
|
|
|
|
|
|
|
|
|
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
|
|
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
|
|
|
|
|
+<<<<<<< HEAD
|
|
|
LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
|
|
LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
|
|
|
|
|
+=======
|
|
|
|
|
+>>>>>>> 731f4bf (feat: add function definitions)
|
|
|
|
|
|
|
|
|
|
|
|
|
def fill_char_in_spans(spans, all_chars):
|
|
def fill_char_in_spans(spans, all_chars):
|
|
@@ -233,7 +235,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
|
|
|
# 初始化ocr模型
|
|
# 初始化ocr模型
|
|
|
atom_model_manager = AtomModelSingleton()
|
|
atom_model_manager = AtomModelSingleton()
|
|
|
ocr_model = atom_model_manager.get_atom_model(
|
|
ocr_model = atom_model_manager.get_atom_model(
|
|
|
- atom_model_name="ocr",
|
|
|
|
|
|
|
+ atom_model_name='ocr',
|
|
|
ocr_show_log=False,
|
|
ocr_show_log=False,
|
|
|
det_db_box_thresh=0.3,
|
|
det_db_box_thresh=0.3,
|
|
|
lang=lang
|
|
lang=lang
|
|
@@ -241,7 +243,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
|
|
|
|
|
|
|
|
for span in empty_spans:
|
|
for span in empty_spans:
|
|
|
# 对span的bbox截图再ocr
|
|
# 对span的bbox截图再ocr
|
|
|
- span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
|
|
|
|
|
|
|
+ span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
|
|
|
ocr_res = ocr_model.ocr(span_img, det=False)
|
|
ocr_res = ocr_model.ocr(span_img, det=False)
|
|
|
if ocr_res and len(ocr_res) > 0:
|
|
if ocr_res and len(ocr_res) > 0:
|
|
|
if len(ocr_res[0]) > 0:
|
|
if len(ocr_res[0]) > 0:
|
|
@@ -681,7 +683,7 @@ def parse_page_core(
|
|
|
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
|
|
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
|
|
|
if parse_mode == SupportedPdfParseMethod.TXT:
|
|
if parse_mode == SupportedPdfParseMethod.TXT:
|
|
|
|
|
|
|
|
- """使用新版本的混合ocr方案"""
|
|
|
|
|
|
|
+ """使用新版本的混合ocr方案."""
|
|
|
spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
|
|
spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
|
|
|
|
|
|
|
|
elif parse_mode == SupportedPdfParseMethod.OCR:
|
|
elif parse_mode == SupportedPdfParseMethod.OCR:
|
|
@@ -689,7 +691,6 @@ def parse_page_core(
|
|
|
else:
|
|
else:
|
|
|
raise Exception('parse_mode must be txt or ocr')
|
|
raise Exception('parse_mode must be txt or ocr')
|
|
|
|
|
|
|
|
-
|
|
|
|
|
"""先处理不需要排版的discarded_blocks"""
|
|
"""先处理不需要排版的discarded_blocks"""
|
|
|
discarded_block_with_spans, spans = fill_spans_in_blocks(
|
|
discarded_block_with_spans, spans = fill_spans_in_blocks(
|
|
|
all_discarded_blocks, spans, 0.4
|
|
all_discarded_blocks, spans, 0.4
|
|
@@ -762,8 +763,8 @@ def parse_page_core(
|
|
|
|
|
|
|
|
|
|
|
|
|
def pdf_parse_union(
|
|
def pdf_parse_union(
|
|
|
- dataset: Dataset,
|
|
|
|
|
model_list,
|
|
model_list,
|
|
|
|
|
+ dataset: Dataset,
|
|
|
imageWriter,
|
|
imageWriter,
|
|
|
parse_mode,
|
|
parse_mode,
|
|
|
start_page_id=0,
|
|
start_page_id=0,
|
|
@@ -832,4 +833,4 @@ def pdf_parse_union(
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
|
- pass
|
|
|
|
|
|
|
+ pass
|