Forráskód Böngészése

parse_pdf_by_ocr 逻辑更新

赵小蒙 1 éve
szülő
commit
a0be4652e6
2 módosított fájl, 11 hozzáadás és 11 törlés
  1. +9 -1
      demo/ocr_demo.py
  2. +2 -10
      magic_pdf/pdf_parse_by_ocr.py

+ 9 - 1
demo/ocr_demo.py

@@ -1,3 +1,4 @@
+import json
 import os
 
 from loguru import logger
@@ -20,9 +21,16 @@ def save_markdown(markdown_text, input_filepath):
         file.write(markdown_text)
 
 
+def read_json_file(file_path):
+    with open(file_path, 'r') as f:
+        data = json.load(f)
+    return data
+
+
 if __name__ == '__main__':
     ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_0.json"
-    pdf_info_dict = parse_pdf_by_ocr(ocr_json_file_path)
+    ocr_pdf_info = read_json_file(ocr_json_file_path)
+    pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
     markdown_text = mk_nlp_markdown(pdf_info_dict)
     logger.info(markdown_text)
     save_markdown(markdown_text, ocr_json_file_path)

+ 2 - 10
magic_pdf/pdf_parse_by_ocr.py

@@ -1,15 +1,7 @@
-import json
-
 from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
 from magic_pdf.libs.ocr_dict_merge import merge_spans
 
 
-def read_json_file(file_path):
-    with open(file_path, 'r') as f:
-        data = json.load(f)
-    return data
-
-
 def construct_page_component(page_id, text_blocks_preproc):
     return_dict = {
         'preproc_blocks': text_blocks_preproc,
@@ -19,11 +11,11 @@ def construct_page_component(page_id, text_blocks_preproc):
 
 
 def parse_pdf_by_ocr(
-    ocr_json_file_path,
+    ocr_pdf_info,
     start_page_id=0,
     end_page_id=None,
 ):
-    ocr_pdf_info = read_json_file(ocr_json_file_path)
+
     pdf_info_dict = {}
     end_page_id = end_page_id if end_page_id else len(ocr_pdf_info) - 1
     for page_id in range(start_page_id, end_page_id + 1):