فهرست منبع

截图增加s3上传逻辑,移除宽或高为0的spans

赵小蒙 1 سال پیش
والد
کامیت
8a2736a53f
3 فایل تغییر یافته به همراه 11 افزوده شده و 11 حذف شده
  1. 7 7
      demo/ocr_demo.py
  2. 1 1
      magic_pdf/pdf_parse_by_ocr.py
  3. 3 3
      magic_pdf/pre_proc/ocr_cut_image.py

+ 7 - 7
demo/ocr_demo.py

@@ -4,7 +4,7 @@ import os
 from loguru import logger
 from pathlib import Path
 
-from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown, mk_mm_markdown
+from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown
 from magic_pdf.libs.commons import join_path
 from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
 
@@ -30,12 +30,12 @@ def read_json_file(file_path):
 
 
 if __name__ == '__main__':
-    # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
-    # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
+    ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
+    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
     # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
     # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
-    ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
-    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1.json"
+    # ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
+    # ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_fix.json"
     try:
         ocr_pdf_model_info = read_json_file(ocr_json_file_path)
         pth = Path(ocr_json_file_path)
@@ -56,8 +56,8 @@ if __name__ == '__main__':
         if not os.path.exists(parent_dir):
             os.makedirs(parent_dir)
 
-        # markdown_content = mk_nlp_markdown(pdf_info_dict)
-        markdown_content = mk_mm_markdown(pdf_info_dict)
+        # markdown_content = ocr_mk_nlp_markdown(pdf_info_dict)
+        markdown_content = ocr_mk_mm_markdown(pdf_info_dict)
 
         with open(text_content_save_path, "w", encoding="utf-8") as f:
             f.write(markdown_content)

+ 1 - 1
magic_pdf/pdf_parse_by_ocr.py

@@ -208,7 +208,7 @@ def parse_pdf_by_ocr(
         spans, dropped_text_block, dropped_image_block, dropped_table_block = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
 
         # 对image和table截图
-        spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
+        spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
 
         # 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
         displayed_list = []

+ 3 - 3
magic_pdf/pre_proc/ocr_cut_image.py

@@ -3,7 +3,7 @@ from magic_pdf.libs.ocr_content_type import ContentType
 from magic_pdf.libs.pdf_image_tools import cut_image
 
 
-def cut_image_and_table(spans, page, page_id, book_name, save_path):
+def cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client):
     def s3_return_path(type):
         return join_path(book_name, type)
 
@@ -13,8 +13,8 @@ def cut_image_and_table(spans, page, page_id, book_name, save_path):
     for span in spans:
         span_type = span['type']
         if span_type == ContentType.Image:
-            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
+            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'), s3_return_path=s3_return_path('images'), img_s3_client=img_s3_client)
         elif span_type == ContentType.Table:
-            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
+            span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'), s3_return_path=s3_return_path('tables'), img_s3_client=img_s3_client)
 
     return spans