1 éve · a0be4652e6
--- a/demo/ocr_demo.py
+++ b/demo/ocr_demo.py
@@ -1,3 +1,4 @@
 
				+import json
			
 
				 import os
			
 
				 
			
 
				 from loguru import logger
			
@@ -20,9 +21,16 @@ def save_markdown(markdown_text, input_filepath):
 
				         file.write(markdown_text)
			
 
				 
			
 
				 
			
 
				+def read_json_file(file_path):
			
 
				+    with open(file_path, 'r') as f:
			
 
				+        data = json.load(f)
			
 
				+    return data
			
 
				+
			
 
				+
			
 
				 if __name__ == '__main__':
			
 
				     ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_0.json"
			
 
				-    pdf_info_dict = parse_pdf_by_ocr(ocr_json_file_path)
			
 
				+    ocr_pdf_info = read_json_file(ocr_json_file_path)
			
 
				+    pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
			
 
				     markdown_text = mk_nlp_markdown(pdf_info_dict)
			
 
				     logger.info(markdown_text)
			
 
				     save_markdown(markdown_text, ocr_json_file_path)
			
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
@@ -1,15 +1,7 @@
 
				-import json
			
 
				-
			
 
				 from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
			
 
				 from magic_pdf.libs.ocr_dict_merge import merge_spans
			
 
				 
			
 
				 
			
 
				-def read_json_file(file_path):
			
 
				-    with open(file_path, 'r') as f:
			
 
				-        data = json.load(f)
			
 
				-    return data
			
 
				-
			
 
				-
			
 
				 def construct_page_component(page_id, text_blocks_preproc):
			
 
				     return_dict = {
			
 
				         'preproc_blocks': text_blocks_preproc,
			
@@ -19,11 +11,11 @@ def construct_page_component(page_id, text_blocks_preproc):
 
				 
			
 
				 
			
 
				 def parse_pdf_by_ocr(
			
 
				-    ocr_json_file_path,
			
 
				+    ocr_pdf_info,
			
 
				     start_page_id=0,
			
 
				     end_page_id=None,
			
 
				 ):
			
 
				-    ocr_pdf_info = read_json_file(ocr_json_file_path)
			
 
				+
			
 
				     pdf_info_dict = {}
			
 
				     end_page_id = end_page_id if end_page_id else len(ocr_pdf_info) - 1
			
 
				     for page_id in range(start_page_id, end_page_id + 1):