赵小蒙 hai 1 ano
pai
achega
7fcbae01fe
Modificáronse 4 ficheiros con 45 adicións e 37 borrados
  1. 3 1
      README.md
  2. 32 0
      demo/demo_commons.py
  3. 8 7
      demo/ocr_demo.py
  4. 2 29
      demo/text_demo.py

+ 3 - 1
README.md

@@ -28,7 +28,9 @@ pip install -r requirements.txt
 3.Run the main script
 
 ```sh
-use demo/demo_test.py
+use demo/text_demo.py
+or
+use demo/ocr_demo.py
 ```
 
 ### 版权说明

+ 32 - 0
demo/demo_commons.py

@@ -0,0 +1,32 @@
+import json
+
+from app.common.s3 import get_s3_config
+from magic_pdf.libs.commons import join_path, read_file, json_dump_path
+
+
+local_json_path = "Z:/format.json"
+local_jsonl_path = "Z:/format.jsonl"
+
+def get_json_from_local_or_s3(book_name=None):
+    if book_name is None:
+        with open(local_json_path, "r", encoding="utf-8") as json_file:
+            json_line = json_file.read()
+            json_object = json.loads(json_line)
+    else:
+        # error_log_path & json_dump_path
+        # 可配置从上述两个地址获取源json
+        json_path = join_path(json_dump_path, book_name + ".json")
+        s3_config = get_s3_config(json_path)
+        file_content = read_file(json_path, s3_config)
+        json_str = file_content.decode("utf-8")
+        # logger.info(json_str)
+        json_object = json.loads(json_str)
+    return json_object
+
+
+def write_json_to_local(jso, book_name=None):
+    if book_name is None:
+        with open(local_json_path, "w", encoding="utf-8") as file:
+            file.write(json.dumps(jso, ensure_ascii=False))
+    else:
+        pass

+ 8 - 7
demo/ocr_demo.py

@@ -5,7 +5,7 @@ from loguru import logger
 from pathlib import Path
 
 from app.common.s3 import get_s3_config
-from demo.demo_test import get_json_from_local_or_s3
+from demo.demo_commons import get_json_from_local_or_s3
 from magic_pdf.dict2md.ocr_mkcontent import (
     ocr_mk_mm_markdown_with_para,
     ocr_mk_nlp_markdown,
@@ -14,7 +14,7 @@ from magic_pdf.dict2md.ocr_mkcontent import (
     ocr_mk_mm_markdown_with_para_and_pagination,
     make_standard_format_with_para
 )
-from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.commons import join_path, read_file
 from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
 
 
@@ -43,7 +43,8 @@ def ocr_local_parse(ocr_pdf_path, ocr_json_file_path):
         ocr_pdf_model_info = read_json_file(ocr_json_file_path)
         pth = Path(ocr_json_file_path)
         book_name = pth.name
-        ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info)
+        pdf_bytes = read_file(ocr_pdf_path, None)
+        ocr_parse_core(book_name, pdf_bytes, ocr_pdf_model_info)
     except Exception as e:
         logger.exception(e)
 
@@ -54,20 +55,20 @@ def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
         # logger.info(json_object)
         s3_pdf_path = json_object["file_location"]
         s3_config = get_s3_config(s3_pdf_path)
+        pdf_bytes = read_file(s3_pdf_path, s3_config)
         ocr_pdf_model_info = json_object.get("doc_layout_result")
-        ocr_parse_core(book_name, s3_pdf_path, ocr_pdf_model_info, s3_config=s3_config)
+        ocr_parse_core(book_name, pdf_bytes, ocr_pdf_model_info)
     except Exception as e:
         logger.exception(e)
 
 
-def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0, s3_config=None):
+def ocr_parse_core(book_name, pdf_bytes, ocr_pdf_model_info, start_page_id=0):
     save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
     save_path = join_path(save_tmp_path, "md")
     save_path_with_bookname = os.path.join(save_path, book_name)
     text_content_save_path = f"{save_path_with_bookname}/book.md"
     pdf_info_dict = parse_pdf_by_ocr(
-        ocr_pdf_path,
-        s3_config,
+        pdf_bytes,
         ocr_pdf_model_info,
         save_path,
         book_name,

+ 2 - 29
demo/demo_test.py → demo/text_demo.py

@@ -5,6 +5,7 @@ from pathlib import Path
 
 import click
 
+from demo.demo_commons import get_json_from_local_or_s3, write_json_to_local, local_jsonl_path, local_json_path
 from magic_pdf.dict2md.mkcontent import mk_mm_markdown
 from magic_pdf.pipeline import (
     meta_scan,
@@ -13,38 +14,10 @@ from magic_pdf.pipeline import (
     pdf_intermediate_dict_to_markdown,
     save_tables_to_s3,
 )
-from magic_pdf.libs.commons import join_path, read_file, json_dump_path
-from app.common.s3 import get_s3_config
+from magic_pdf.libs.commons import join_path
 from loguru import logger
 
 
-local_json_path = "Z:/format.json"
-local_jsonl_path = "Z:/format.jsonl"
-
-
-def get_json_from_local_or_s3(book_name=None):
-    if book_name is None:
-        with open(local_json_path, "r", encoding="utf-8") as json_file:
-            json_line = json_file.read()
-            json_object = json.loads(json_line)
-    else:
-        # error_log_path & json_dump_path
-        # 可配置从上述两个地址获取源json
-        json_path = join_path(json_dump_path, book_name + ".json")
-        s3_config = get_s3_config(json_path)
-        file_content = read_file(json_path, s3_config)
-        json_str = file_content.decode("utf-8")
-        # logger.info(json_str)
-        json_object = json.loads(json_str)
-    return json_object
-
-
-def write_json_to_local(jso, book_name=None):
-    if book_name is None:
-        with open(local_json_path, "w", encoding="utf-8") as file:
-            file.write(json.dumps(jso, ensure_ascii=False))
-    else:
-        pass
 
 
 def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True):