1 rok pred · 00f16239c6
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,85 @@
+# Build a wheel for the package and attach it to a GitHub release.
+# Triggered by pushing a tag that ends in "released", or manually via workflow_dispatch.
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Python package
+
+on:
+  push:
+    tags:
+      - '*released'
+  workflow_dispatch:
+
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        # NOTE: the artifact name below is fixed, so adding versions here
+        # requires a per-version artifact name (upload-artifact@v4 rejects duplicates).
+        python-version: ["3.10"]
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+      with:
+        # Full history (and tags) instead of a shallow clone.
+        fetch-depth: 0
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+
+    - name: Install wheel
+      run: |
+        python -m pip install wheel
+
+    - name: Build wheel
+      run: |
+        python setup.py bdist_wheel
+
+    - name: Upload artifact
+      uses: actions/upload-artifact@v4
+      with:
+        name: wheel-file
+        path: dist/*.whl
+        # Fail here, not later in the release job, if no wheel was produced.
+        if-no-files-found: error
+        retention-days: 30
+
+  release:
+    needs: [ build ]
+    # A GitHub release needs a tag; skip when the workflow is dispatched manually on a branch.
+    if: startsWith(github.ref, 'refs/tags/')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Download artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: wheel-file
+          path: dist
+
+      - name: Create and Upload Release
+        id: create_release
+        uses: softprops/action-gh-release@4634c16e79c963813287e889244c50009e7f0981
+        with:
+          files: './dist/*.whl'
+        env:
+          GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
+      # - name: Publish to PyPI
+      #   uses: pypa/gh-action-pypi-publish@release/v1
+      #   with:
+      #     user: __token__
+      #     password: ${{ secrets.PYPI_TOKEN }}
--- a/demo/ocr_demo.py
+++ b/demo/ocr_demo.py
@@ -116,7 +116,7 @@ if __name__ == '__main__':
 
				     pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
			
 
				     json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
			
 
				     # ocr_local_parse(pdf_path, json_file_path)
			
 
				-    book_name = "科数网/edu_00011318"
			
 
				+    book_name = "数学新星网/edu_00001236"
			
 
				     ocr_online_parse(book_name)
			
 
				     
			
 
				     pass
			
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -1,4 +1,3 @@
 
				-from magic_pdf.libs.commons import s3_image_save_path, join_path
			
 
				 from magic_pdf.libs.language import detect_lang
			
 
				 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
			
 
				 from magic_pdf.libs.ocr_content_type import ContentType
			
@@ -56,7 +55,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
 
				                         if not span.get('image_path'):
			
 
				                             continue
			
 
				                         else:
			
 
				-                            content = f"![]({join_path(s3_image_save_path, span['image_path'])})"
			
 
				+                            content = f"![]({span['image_path']})"
			
 
				                     else:
			
 
				                         content = ocr_escape_special_markdown_char(span['content'])  # 转义特殊符号
			
 
				                         if span['type'] == ContentType.InlineEquation:
			
@@ -123,7 +122,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
 
				                         content = f"\n$$\n{span['content']}\n$$\n"
			
 
				                     elif span_type in [ContentType.Image, ContentType.Table]:
			
 
				                         if mode == 'mm':
			
 
				-                            content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
			
 
				+                            content = f"\n![]({span['image_path']})\n"
			
 
				                         elif mode == 'nlp':
			
 
				                             pass
			
 
				                     if content != '':
			
@@ -195,13 +194,13 @@ def line_to_standard_format(line):
 
				                 if span['type'] == ContentType.Image:
			
 
				                     content = {
			
 
				                         'type': 'image',
			
 
				-                        'img_path': join_path(s3_image_save_path, span['image_path'])
			
 
				+                        'img_path': span['image_path']
			
 
				                     }
			
 
				                     return content
			
 
				                 elif span['type'] == ContentType.Table:
			
 
				                     content = {
			
 
				                         'type': 'table',
			
 
				-                        'img_path': join_path(s3_image_save_path, span['image_path'])
			
 
				+                        'img_path': span['image_path']
			
 
				                     }
			
 
				                     return content
			
 
				         else:
			
--- a/magic_pdf/libs/commons.py
+++ b/magic_pdf/libs/commons.py
@@ -24,7 +24,7 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
 
				 # json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
			
 
				 json_dump_path = "s3://llm-pdf-text/json_dump/"
			
 
				 
			
 
				-s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # TODO 基础库不应该有这些存在的路径，应该在业务代码中定义
			
 
				+# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径，应该在业务代码中定义
			
 
				 
			
 
				 
			
 
				 def get_top_percent_list(num_list, percent):
			
--- a/magic_pdf/libs/hash_utils.py
+++ b/magic_pdf/libs/hash_utils.py
@@ -0,0 +1,31 @@
+import hashlib
+
+
+def compute_md5(file_bytes):
+    """Return the MD5 digest of *file_bytes* as an uppercase hex string.
+
+    Args:
+        file_bytes: Bytes-like object to hash (e.g. the raw bytes of a file).
+
+    Returns:
+        The 32-character hexadecimal digest, in upper case.
+    """
+    return hashlib.md5(file_bytes).hexdigest().upper()
+
+
+def compute_sha256(input_string):
+    """Return the SHA-256 digest of *input_string* as a lowercase hex string.
+
+    Args:
+        input_string: Text to hash; it is encoded as UTF-8 first. Bytes-like
+            objects are also accepted and hashed as-is.
+
+    Returns:
+        The 64-character hexadecimal digest, in lower case.
+    """
+    if isinstance(input_string, (bytes, bytearray, memoryview)):
+        input_bytes = input_string
+    else:
+        # hashlib only consumes bytes, so text is encoded before hashing.
+        input_bytes = input_string.encode('utf-8')
+    return hashlib.sha256(input_bytes).hexdigest()
--- a/magic_pdf/libs/pdf_image_tools.py
+++ b/magic_pdf/libs/pdf_image_tools.py
@@ -7,6 +7,7 @@ import io
 
				 from magic_pdf.libs.commons import fitz
			
 
				 from loguru import logger
			
 
				 from magic_pdf.libs.commons import parse_bucket_key, join_path
			
 
				+from magic_pdf.libs.hash_utils import compute_sha256
			
 
				 
			
 
				 
			
 
				 def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_return_path=None, img_s3_client=None, upload_switch=True):
			
@@ -16,9 +17,13 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
 
				     """
			
 
				     # 拼接文件名
			
 
				     filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}.jpg"
			
 
				-    # 拼接路径
			
 
				-    image_save_path = join_path(save_parent_path, filename)
			
 
				+
			
 
				+    # 老版本返回不带bucket的路径
			
 
				     s3_img_path = join_path(s3_return_path, filename) if s3_return_path is not None else None
			
 
				+
			
 
				+    # 新版本生成s3的平铺路径
			
 
				+    s3_img_hash256_path = f"{compute_sha256(s3_img_path)}.jpg"
			
 
				+
			
 
				     # 打印图片文件名
			
 
				     # print(f"Saved {image_save_path}")
			
 
				 
			
@@ -42,12 +47,16 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
 
				     # 截取图片
			
 
				     pix = page.get_pixmap(clip=rect, matrix=zoom)
			
 
				 
			
 
				-    if image_save_path.startswith("s3://"):
			
 
				+    if save_parent_path.startswith("s3://"):
			
 
				         if not upload_switch:
			
 
				             pass
			
 
				         else:
			
 
				-            # 图片保存到s3
			
 
				-            bucket_name, bucket_key = parse_bucket_key(image_save_path)
			
 
				+            """图片保存到s3"""
			
 
				+            # 从save_parent_path获取bucket_name
			
 
				+            bucket_name, bucket_key = parse_bucket_key(save_parent_path)
			
 
				+            # 平铺路径赋值给bucket_key
			
 
				+            bucket_key = s3_img_hash256_path
			
 
				+
			
 
				             # 将字节流上传到s3
			
 
				             byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
			
 
				             file_obj = io.BytesIO(byte_data)
			
@@ -58,18 +67,21 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
 
				                 # img_s3_client_once.upload_fileobj(file_obj, bucket_name, bucket_key)
			
 
				             else:
			
 
				                 logger.exception("must input img_s3_client")
			
 
				-        return s3_img_path
			
 
				+        # return s3_img_path # 早期版本要求返回不带bucket的路径
			
 
				+        s3_image_save_path = f"s3://{bucket_name}/{s3_img_hash256_path}"  # 新版本返回平铺的s3路径
			
 
				+        return s3_image_save_path
			
 
				     else:
			
 
				         # 保存图片到本地
			
 
				         # 先检查一下image_save_path的父目录是否存在，如果不存在，就创建
			
 
				-        parent_dir = os.path.dirname(image_save_path)
			
 
				+        local_image_save_path = join_path(save_parent_path, filename)
			
 
				+        parent_dir = os.path.dirname(local_image_save_path)
			
 
				         if not os.path.exists(parent_dir):
			
 
				             os.makedirs(parent_dir)
			
 
				-        pix.save(image_save_path, jpg_quality=95)
			
 
				+        pix.save(local_image_save_path, jpg_quality=95)
			
 
				         # 为了直接能在markdown里看，这里把地址改为相对于mardown的地址
			
 
				-        pth = Path(image_save_path)
			
 
				-        image_save_path = f"{pth.parent.name}/{pth.name}"
			
 
				-        return image_save_path
			
 
				+        pth = Path(local_image_save_path)
			
 
				+        local_image_save_path = f"{pth.parent.name}/{pth.name}"
			
 
				+        return local_image_save_path
			
 
				 
			
 
				 
			
 
				 def save_images_by_bboxes(book_name: str, page_num: int, page: fitz.Page, save_path: str,
			
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
@@ -15,6 +15,7 @@ from magic_pdf.libs.commons import (
 
				 )
			
 
				 from magic_pdf.libs.coordinate_transform import get_scale_ratio
			
 
				 from magic_pdf.libs.drop_tag import DropTag
			
 
				+from magic_pdf.libs.hash_utils import compute_md5
			
 
				 from magic_pdf.libs.ocr_content_type import ContentType
			
 
				 from magic_pdf.libs.safe_filename import sanitize_filename
			
 
				 from magic_pdf.para.para_split import para_split
			
@@ -39,18 +40,18 @@ def parse_pdf_by_ocr(
 
				         pdf_bytes,
			
 
				         pdf_model_output,
			
 
				         save_path,
			
 
				-        book_name,
			
 
				+        book_name="",
			
 
				         pdf_model_profile=None,
			
 
				         image_s3_config=None,
			
 
				         start_page_id=0,
			
 
				         end_page_id=None,
			
 
				         debug_mode=False,
			
 
				 ):
			
 
				-
			
 
				+    pdf_bytes_md5 = compute_md5(pdf_bytes)
			
 
				     save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
			
 
				-    book_name = sanitize_filename(book_name)
			
 
				     md_bookname_save_path = ""
			
 
				     if debug_mode:
			
 
				+        book_name = sanitize_filename(book_name)
			
 
				         save_path = join_path(save_tmp_path, "md")
			
 
				         pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name)
			
 
				 
			
@@ -179,6 +180,8 @@ def parse_pdf_by_ocr(
 
				         spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
			
 
				 
			
 
				         '''对image和table截图'''
			
 
				+        if book_name == "":
			
 
				+            book_name = pdf_bytes_md5
			
 
				         spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
			
 
				 
			
 
				         '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
			
--- a/magic_pdf/pre_proc/ocr_cut_image.py
+++ b/magic_pdf/pre_proc/ocr_cut_image.py
@@ -4,6 +4,9 @@ from magic_pdf.libs.pdf_image_tools import cut_image
 
				 
			
 
				 
			
 
				 def cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client):
			
 
				+
			
 
				+    """spark环境book_name为pdf_bytes_md5，本地环境会传正常bookname"""
			
 
				+
			
 
				     def s3_return_path(type):
			
 
				         return join_path(book_name, type)
			
 
				 
			
--- a/magic_pdf/spark/spark_api.py
+++ b/magic_pdf/spark/spark_api.py
@@ -15,6 +15,7 @@
 
				 
			
 
				 
			
 
				 from magic_pdf.io import AbsReaderWriter
			
 
				+from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
			
 
				 
			
 
				 
			
 
				 def parse_txt_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
			
@@ -28,7 +29,14 @@ def parse_ocr_pdf(pdf_bytes:bytes,  pdf_models:list, imageWriter: AbsReaderWrite
 
				     """
			
 
				     解析ocr类pdf
			
 
				     """
			
 
				-    pass
			
 
				+    pdf_info_dict = parse_pdf_by_ocr(
			
 
				+        pdf_bytes,
			
 
				+        pdf_models,
			
 
				+        imageWriter,
			
 
				+        start_page_id=start_page,
			
 
				+        debug_mode=is_debug,
			
 
				+    )
			
 
				+    return pdf_info_dict
			
 
				 
			
 
				 
			
 
				 def parse_union_pdf(pdf_bytes:bytes,  pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,  *args, **kwargs):