Kaynağa Gözat

将模型和pymu坐标的转换逻辑抽象成方法

赵小蒙 1 yıl önce
ebeveyn
işleme
f62d1aa781

+ 9 - 0
magic_pdf/libs/coordinate_transform.py

@@ -0,0 +1,9 @@
+def get_scale_ratio(ocr_page_info, page):
+    """Return (horizontal, vertical) ratios of the model page size in ocr_page_info['page_info'] to page's 72-dpi pixmap size."""
+    rendered = page.get_pixmap(dpi=72)  # rendered only to read its pixel dimensions
+    pymu_size = (int(rendered.w), int(rendered.h))
+    model_info = ocr_page_info['page_info']
+    model_size = (model_info['width'], model_info['height'])
+    ratio_x = model_size[0] / pymu_size[0]  # callers divide model coordinates by these ratios
+    ratio_y = model_size[1] / pymu_size[1]
+    return ratio_x, ratio_y

+ 5 - 21
magic_pdf/pdf_parse_by_ocr.py

@@ -4,6 +4,7 @@ import time
 from loguru import logger
 
 from magic_pdf.libs.commons import read_file, join_path, fitz, get_img_s3_client, get_delta_time, get_docx_model_output
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.safe_filename import sanitize_filename
 from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
 from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
@@ -82,7 +83,7 @@ def parse_pdf_by_ocr(
         page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info)
         header_bboxes = parse_headers(page_id, page, ocr_page_info)
         footer_bboxes = parse_footers(page_id, page, ocr_page_info)
-        footnote_bboxes =  parse_footnotes_by_model(page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode)
+        footnote_bboxes = parse_footnotes_by_model(page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode)
 
         # 构建需要remove的bbox列表
         need_remove_spans_bboxes = []
@@ -90,35 +91,19 @@ def parse_pdf_by_ocr(
         need_remove_spans_bboxes.extend(header_bboxes)
         need_remove_spans_bboxes.extend(footer_bboxes)
         need_remove_spans_bboxes.extend(footnote_bboxes)
-        remove_bboxes.append(need_remove_spans_bboxes)
-
-
 
         layout_dets = ocr_page_info['layout_dets']
         spans = []
 
-        # 将模型坐标转换成pymu格式下的未缩放坐标
-        DPI = 72  # use this resolution
-        pix = page.get_pixmap(dpi=DPI)
-        pageL = 0
-        pageR = int(pix.w)
-        pageU = 0
-        pageD = int(pix.h)
-        width_from_json = ocr_page_info['page_info']['width']
-        height_from_json = ocr_page_info['page_info']['height']
-        LR_scaleRatio = width_from_json / (pageR - pageL)
-        UD_scaleRatio = height_from_json / (pageD - pageU)
+        # 计算模型坐标和pymu坐标的缩放比例
+        horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)
 
         for layout_det in layout_dets:
             category_id = layout_det['category_id']
             allow_category_id_list = [1, 7, 13, 14, 15]
             if category_id in allow_category_id_list:
                 x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
-                x0 = x0 / LR_scaleRatio
-                y0 = y0 / UD_scaleRatio
-                x1 = x1 / LR_scaleRatio
-                y1 = y1 / UD_scaleRatio
-                bbox = [int(x0), int(y0), int(x1), int(y1)]
+                bbox = [int(x0/horizontal_scale_ratio), int(y0/vertical_scale_ratio), int(x1/horizontal_scale_ratio), int(y1/vertical_scale_ratio)]
                 '''要删除的'''
                 #  3: 'header',      # 页眉
                 #  4: 'page number', # 页码
@@ -184,6 +169,5 @@ def parse_pdf_by_ocr(
         page_info = construct_page_component(page_id, blocks, layout_bboxes)
         pdf_info_dict[f"page_{page_id}"] = page_info
 
-    # logger.info(remove_bboxes)
     return pdf_info_dict
 

+ 6 - 16
magic_pdf/pre_proc/detect_footer_by_model.py

@@ -1,4 +1,5 @@
 from magic_pdf.libs.commons import fitz             # pyMuPDF库
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
 
 
 def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
@@ -8,23 +9,12 @@ def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
     :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
     :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
     """
-    DPI = 72  # use this resolution
-    pix = page.get_pixmap(dpi=DPI)
-    pageL = 0
-    pageR = int(pix.w)
-    pageU = 0
-    pageD = int(pix.h)
-    
 
     #--------- 通过json_from_DocXchain来获取 footer ---------#
     footer_bbox_from_DocXChain = []
 
-    
     xf_json = json_from_DocXchain_obj
-    width_from_json = xf_json['page_info']['width']
-    height_from_json = xf_json['page_info']['height']
-    LR_scaleRatio = width_from_json / (pageR - pageL)
-    UD_scaleRatio = height_from_json / (pageD - pageU)
+    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
 
     # {0: 'title',  # 标题
     # 1: 'figure', # 图片
@@ -42,10 +32,10 @@ def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
     #  13: 'embedding',     # 嵌入公式
     #  14: 'isolated'}      # 单行公式
     for xf in xf_json['layout_dets']:
-        L = xf['poly'][0] / LR_scaleRatio
-        U = xf['poly'][1] / UD_scaleRatio
-        R = xf['poly'][2] / LR_scaleRatio
-        D = xf['poly'][5] / UD_scaleRatio
+        L = xf['poly'][0] / horizontal_scale_ratio
+        U = xf['poly'][1] / vertical_scale_ratio
+        R = xf['poly'][2] / horizontal_scale_ratio
+        D = xf['poly'][5] / vertical_scale_ratio
         # L += pageL          # 有的页面,artBox偏移了。不在(0,0)
         # R += pageL
         # U += pageU

+ 6 - 15
magic_pdf/pre_proc/detect_footnote.py

@@ -1,5 +1,6 @@
 from collections import Counter
 from magic_pdf.libs.commons import fitz             # pyMuPDF库
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
 
 
 def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False):
@@ -9,22 +10,12 @@ def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_
     :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
     :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
     """
-    DPI = 72  # use this resolution
-    pix = page.get_pixmap(dpi=DPI)
-    pageL = 0
-    pageR = int(pix.w)
-    pageU = 0
-    pageD = int(pix.h)
-    
 
     #--------- 通过json_from_DocXchain来获取 footnote ---------#
     footnote_bbox_from_DocXChain = []
 
     xf_json = json_from_DocXchain_obj
-    width_from_json = xf_json['page_info']['width']
-    height_from_json = xf_json['page_info']['height']
-    LR_scaleRatio = width_from_json / (pageR - pageL)
-    UD_scaleRatio = height_from_json / (pageD - pageU)
+    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
 
     # {0: 'title',  # 标题
     # 1: 'figure', # 图片
@@ -42,10 +33,10 @@ def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_
     #  13: 'embedding',     # 嵌入公式
     #  14: 'isolated'}      # 单行公式
     for xf in xf_json['layout_dets']:
-        L = xf['poly'][0] / LR_scaleRatio
-        U = xf['poly'][1] / UD_scaleRatio
-        R = xf['poly'][2] / LR_scaleRatio
-        D = xf['poly'][5] / UD_scaleRatio
+        L = xf['poly'][0] / horizontal_scale_ratio
+        U = xf['poly'][1] / vertical_scale_ratio
+        R = xf['poly'][2] / horizontal_scale_ratio
+        D = xf['poly'][5] / vertical_scale_ratio
         # L += pageL          # 有的页面,artBox偏移了。不在(0,0)
         # R += pageL
         # U += pageU

+ 6 - 15
magic_pdf/pre_proc/detect_header.py

@@ -1,4 +1,5 @@
 from magic_pdf.libs.commons import fitz             # pyMuPDF库
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
 
 
 def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
@@ -8,22 +9,12 @@ def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
     :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
     :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
     """
-    DPI = 72  # use this resolution
-    pix = page.get_pixmap(dpi=DPI)
-    pageL = 0
-    pageR = int(pix.w)
-    pageU = 0
-    pageD = int(pix.h)
-    
 
     #--------- 通过json_from_DocXchain来获取 header ---------#
     header_bbox_from_DocXChain = []
 
     xf_json = json_from_DocXchain_obj
-    width_from_json = xf_json['page_info']['width']
-    height_from_json = xf_json['page_info']['height']
-    LR_scaleRatio = width_from_json / (pageR - pageL)
-    UD_scaleRatio = height_from_json / (pageD - pageU)
+    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
 
     # {0: 'title',  # 标题
     # 1: 'figure', # 图片
@@ -41,10 +32,10 @@ def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
     #  13: 'embedding',     # 嵌入公式
     #  14: 'isolated'}      # 单行公式
     for xf in xf_json['layout_dets']:
-        L = xf['poly'][0] / LR_scaleRatio
-        U = xf['poly'][1] / UD_scaleRatio
-        R = xf['poly'][2] / LR_scaleRatio
-        D = xf['poly'][5] / UD_scaleRatio
+        L = xf['poly'][0] / horizontal_scale_ratio
+        U = xf['poly'][1] / vertical_scale_ratio
+        R = xf['poly'][2] / horizontal_scale_ratio
+        D = xf['poly'][5] / vertical_scale_ratio
         # L += pageL          # 有的页面,artBox偏移了。不在(0,0)
         # R += pageL
         # U += pageU

+ 6 - 15
magic_pdf/pre_proc/detect_page_number.py

@@ -1,4 +1,5 @@
 from magic_pdf.libs.commons import fitz             # pyMuPDF库
+from magic_pdf.libs.coordinate_transform import get_scale_ratio
 
 
 def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
@@ -8,22 +9,12 @@ def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
     :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
     :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
     """
-    DPI = 72  # use this resolution
-    pix = page.get_pixmap(dpi=DPI)
-    pageL = 0
-    pageR = int(pix.w)
-    pageU = 0
-    pageD = int(pix.h)
-    
 
     #--------- 通过json_from_DocXchain来获取 pageNo ---------#
     pageNo_bbox_from_DocXChain = []
 
     xf_json = json_from_DocXchain_obj
-    width_from_json = xf_json['page_info']['width']
-    height_from_json = xf_json['page_info']['height']
-    LR_scaleRatio = width_from_json / (pageR - pageL)
-    UD_scaleRatio = height_from_json / (pageD - pageU)
+    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
 
     # {0: 'title',  # 标题
     # 1: 'figure', # 图片
@@ -41,10 +32,10 @@ def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
     #  13: 'embedding',     # 嵌入公式
     #  14: 'isolated'}      # 单行公式
     for xf in xf_json['layout_dets']:
-        L = xf['poly'][0] / LR_scaleRatio
-        U = xf['poly'][1] / UD_scaleRatio
-        R = xf['poly'][2] / LR_scaleRatio
-        D = xf['poly'][5] / UD_scaleRatio
+        L = xf['poly'][0] / horizontal_scale_ratio
+        U = xf['poly'][1] / vertical_scale_ratio
+        R = xf['poly'][2] / horizontal_scale_ratio
+        D = xf['poly'][5] / vertical_scale_ratio
         # L += pageL          # 有的页面,artBox偏移了。不在(0,0)
         # R += pageL
         # U += pageU