Преглед изворни кода

s3_image_save_path统一配置

赵小蒙 пре годину дана
родитељ
комит
f10b4a501f
3 измењена фајла са 8 додавања и 5 брисања
  1. 3 2
      magic_pdf/dict2md/ocr_mkcontent.py
  2. 2 0
      magic_pdf/libs/commons.py
  3. 3 3
      magic_pdf/pipeline.py

+ 3 - 2
magic_pdf/dict2md/ocr_mkcontent.py

@@ -1,3 +1,4 @@
+from magic_pdf.libs.commons import s3_image_save_path, join_path
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.libs.ocr_content_type import ContentType
 
@@ -42,7 +43,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
                         if not span.get('image_path'):
                             continue
                         else:
-                            content = f"![](s3://mllm-raw-media/pdf2md_img/{span['image_path']})"
+                            content = f"![]({join_path(s3_image_save_path, span['image_path'])})"
                     else:
                         content = ocr_escape_special_markdown_char(span['content'])  # 转义特殊符号
                         if span['type'] == ContentType.InlineEquation:
@@ -73,7 +74,7 @@ def mk_mm_markdown2(pdf_info_dict:dict):
                     elif span_type == ContentType.InterlineEquation:
                         para_text += f"$$\n{span['content']}\n$$ "
                     elif span_type == ContentType.Image:
-                        para_text += f"![](s3://mllm-raw-media/pdf2md_img/{span['image_path']}) "
+                        para_text += f"![]({join_path(s3_image_save_path, span['image_path'])})"
             markdown.append(para_text)
 
     return '\n\n'.join(markdown)

+ 2 - 0
magic_pdf/libs/commons.py

@@ -24,6 +24,8 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
 # json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
 json_dump_path = "s3://llm-pdf-text/json_dump/"
 
+s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/"
+
 
 def get_top_percent_list(num_list, percent):
     """

+ 3 - 3
magic_pdf/pipeline.py

@@ -4,7 +4,7 @@ import time
 from urllib.parse import quote
 
 from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown
-from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time
+from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time, s3_image_save_path
 from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.dict2md.mkcontent import mk_nlp_markdown
@@ -287,7 +287,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
     #     jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
     else:
         try:
-            save_path = "s3://mllm-raw-media/pdf2md_img/"
+            save_path = s3_image_save_path
             image_s3_config = get_s3_config(save_path)
             start_time = time.time()  # 记录开始时间
             # 先打印一下book_name和解析开始的时间
@@ -328,7 +328,7 @@ def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
     file_id = jso.get('file_id')
     book_name = f"{data_source}/{file_id}"
     try:
-        save_path = "s3://mllm-raw-media/pdf2md_img/"
+        save_path = s3_image_save_path
         image_s3_config = get_s3_config(save_path)
         start_time = time.time()  # 记录开始时间
         # 先打印一下book_name和解析开始的时间