|
|
@@ -2,28 +2,28 @@ import time
|
|
|
|
|
|
# from anyio import Path
|
|
|
|
|
|
-from pdf_tools.libs.commons import fitz, get_delta_time, get_img_s3_client
|
|
|
+from magic_pdf.libs.commons import fitz, get_delta_time, get_img_s3_client
|
|
|
import json
|
|
|
import os
|
|
|
import math
|
|
|
from loguru import logger
|
|
|
-from pdf_tools.layout.bbox_sort import (
|
|
|
+from magic_pdf.layout.bbox_sort import (
|
|
|
prepare_bboxes_for_layout_split,
|
|
|
)
|
|
|
-from pdf_tools.layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block
|
|
|
-from pdf_tools.libs.drop_reason import DropReason
|
|
|
-from pdf_tools.libs.markdown_utils import escape_special_markdown_char
|
|
|
-from pdf_tools.libs.safe_filename import sanitize_filename
|
|
|
-from pdf_tools.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
|
|
|
-from pdf_tools.pre_proc.detect_images import parse_images
|
|
|
-from pdf_tools.pre_proc.detect_tables import parse_tables # 获取tables的bbox
|
|
|
-from pdf_tools.pre_proc.detect_equation import parse_equations # 获取equations的bbox
|
|
|
-from pdf_tools.pre_proc.detect_header import parse_headers # 获取headers的bbox
|
|
|
-from pdf_tools.pre_proc.detect_page_number import parse_pageNos # 获取pageNos的bbox
|
|
|
-from pdf_tools.pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule # 获取footnotes的bbox
|
|
|
-from pdf_tools.pre_proc.detect_footer_by_model import parse_footers # 获取footers的bbox
|
|
|
-
|
|
|
-from pdf_tools.post_proc.detect_para import (
|
|
|
+from magic_pdf.layout.layout_sort import LAYOUT_UNPROC, get_bboxes_layout, get_columns_cnt_of_layout, sort_text_block
|
|
|
+from magic_pdf.libs.drop_reason import DropReason
|
|
|
+from magic_pdf.libs.markdown_utils import escape_special_markdown_char
|
|
|
+from magic_pdf.libs.safe_filename import sanitize_filename
|
|
|
+from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page
|
|
|
+from magic_pdf.pre_proc.detect_images import parse_images
|
|
|
+from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox
|
|
|
+from magic_pdf.pre_proc.detect_equation import parse_equations # 获取equations的bbox
|
|
|
+from magic_pdf.pre_proc.detect_header import parse_headers # 获取headers的bbox
|
|
|
+from magic_pdf.pre_proc.detect_page_number import parse_pageNos # 获取pageNos的bbox
|
|
|
+from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model, parse_footnotes_by_rule # 获取footnotes的bbox
|
|
|
+from magic_pdf.pre_proc.detect_footer_by_model import parse_footers # 获取footers的bbox
|
|
|
+
|
|
|
+from magic_pdf.post_proc.detect_para import (
|
|
|
ParaProcessPipeline,
|
|
|
TitleDetectionException,
|
|
|
TitleLevelException,
|
|
|
@@ -31,9 +31,9 @@ from pdf_tools.post_proc.detect_para import (
|
|
|
ParaMergeException,
|
|
|
DenseSingleLineBlockException,
|
|
|
)
|
|
|
-from pdf_tools.pre_proc.main_text_font import get_main_text_font
|
|
|
-from pdf_tools.pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock
|
|
|
-from pdf_tools.pre_proc.remove_footer_header import remove_headder_footer_one_page
|
|
|
+from magic_pdf.pre_proc.main_text_font import get_main_text_font
|
|
|
+from magic_pdf.pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock
|
|
|
+from magic_pdf.pre_proc.remove_footer_header import remove_headder_footer_one_page
|
|
|
|
|
|
'''
|
|
|
from para.para_pipeline import ParaProcessPipeline
|
|
|
@@ -46,19 +46,19 @@ from para.exceptions import (
|
|
|
)
|
|
|
'''
|
|
|
|
|
|
-from pdf_tools.libs.commons import read_file, join_path
|
|
|
-from pdf_tools.libs.pdf_image_tools import save_images_by_bboxes
|
|
|
-from pdf_tools.post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
|
|
|
-from pdf_tools.pre_proc.citationmarker_remove import remove_citation_marker
|
|
|
-from pdf_tools.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
|
|
|
-from pdf_tools.pre_proc.pdf_pre_filter import pdf_filter
|
|
|
-from pdf_tools.pre_proc.detect_footer_header_by_statistics import drop_footer_header
|
|
|
-from pdf_tools.pre_proc.construct_paras import construct_page_component
|
|
|
-from pdf_tools.pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
|
|
|
-from pdf_tools.post_proc.pdf_post_filter import pdf_post_filter
|
|
|
-from pdf_tools.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
|
|
|
-from pdf_tools.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
|
|
|
-from pdf_tools.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
|
|
|
+from magic_pdf.libs.commons import read_file, join_path
|
|
|
+from magic_pdf.libs.pdf_image_tools import save_images_by_bboxes
|
|
|
+from magic_pdf.post_proc.remove_footnote import merge_footnote_blocks, remove_footnote_blocks
|
|
|
+from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
|
|
|
+from magic_pdf.pre_proc.equations_replace import combine_chars_to_pymudict, remove_chars_in_text_blocks, replace_equations_in_textblock
|
|
|
+from magic_pdf.pre_proc.pdf_pre_filter import pdf_filter
|
|
|
+from magic_pdf.pre_proc.detect_footer_header_by_statistics import drop_footer_header
|
|
|
+from magic_pdf.pre_proc.construct_paras import construct_page_component
|
|
|
+from magic_pdf.pre_proc.fix_image import combine_images, fix_image_vertical, fix_seperated_image, include_img_title
|
|
|
+from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
|
|
|
+from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
|
|
|
+from magic_pdf.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
|
|
|
+from magic_pdf.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
|
|
|
|
|
|
denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
|
|
|
titleDetectionException_msg = TitleDetectionException().message
|
|
|
@@ -108,7 +108,7 @@ def parse_pdf_by_model(
|
|
|
debug_mode=False,
|
|
|
):
|
|
|
pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile)
|
|
|
- save_tmp_path = os.path.join(os.path.dirname(__file__), "../../..", "tmp", "unittest")
|
|
|
+ save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
|
|
|
md_bookname_save_path = ""
|
|
|
book_name = sanitize_filename(book_name)
|
|
|
if debug_mode:
|