Explorar o código

实现parse_ocr_pdf api,切图逻辑s3使用平铺地址,本地使用层级地址,删除预设s3_image_save_path

赵小蒙 hai 1 ano
pai
achega
00f16239c6

+ 77 - 0
.github/workflows/python-package.yml

@@ -0,0 +1,77 @@
+# This workflow builds a wheel with Python 3.10 and publishes it as a GitHub release; it runs when a tag matching '*released' is pushed or when triggered manually
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Python package
+
+on:
+  push:
+    tags:
+      - '*released'
+  workflow_dispatch:
+
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10"]
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+
+    - name: Install wheel
+      run: |
+        python -m pip install wheel
+
+    - name: Build wheel
+      run: |
+        python setup.py bdist_wheel
+
+    - name: Upload artifact
+      uses: actions/upload-artifact@v4
+      with:
+        name: wheel-file
+        path: dist/*.whl
+        retention-days: 30
+
+  release:
+    needs: [ build ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Download artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: wheel-file
+          path: dist
+
+      - name: Create and Upload Release
+        id: create_release
+        uses: softprops/action-gh-release@4634c16e79c963813287e889244c50009e7f0981
+        with:
+          files: './dist/*.whl'
+        env:
+          GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
+      # - name: Publish to PyPI
+      #   uses: pypa/gh-action-pypi-publish@release/v1
+      #   with:
+      #     user: __token__
+      #     password: ${{ secrets.PYPI_TOKEN }}

+ 1 - 1
demo/ocr_demo.py

@@ -116,7 +116,7 @@ if __name__ == '__main__':
     pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
     json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
     # ocr_local_parse(pdf_path, json_file_path)
-    book_name = "科数网/edu_00011318"
+    book_name = "数学新星网/edu_00001236"
     ocr_online_parse(book_name)
     
     pass

+ 4 - 5
magic_pdf/dict2md/ocr_mkcontent.py

@@ -1,4 +1,3 @@
-from magic_pdf.libs.commons import s3_image_save_path, join_path
 from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.libs.ocr_content_type import ContentType
@@ -56,7 +55,7 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
                         if not span.get('image_path'):
                             continue
                         else:
-                            content = f"![]({join_path(s3_image_save_path, span['image_path'])})"
+                            content = f"![]({span['image_path']})"
                     else:
                         content = ocr_escape_special_markdown_char(span['content'])  # 转义特殊符号
                         if span['type'] == ContentType.InlineEquation:
@@ -123,7 +122,7 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode):
                         content = f"\n$$\n{span['content']}\n$$\n"
                     elif span_type in [ContentType.Image, ContentType.Table]:
                         if mode == 'mm':
-                            content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
+                            content = f"\n![]({span['image_path']})\n"
                         elif mode == 'nlp':
                             pass
                     if content != '':
@@ -195,13 +194,13 @@ def line_to_standard_format(line):
                 if span['type'] == ContentType.Image:
                     content = {
                         'type': 'image',
-                        'img_path': join_path(s3_image_save_path, span['image_path'])
+                        'img_path': span['image_path']
                     }
                     return content
                 elif span['type'] == ContentType.Table:
                     content = {
                         'type': 'table',
-                        'img_path': join_path(s3_image_save_path, span['image_path'])
+                        'img_path': span['image_path']
                     }
                     return content
         else:

+ 1 - 1
magic_pdf/libs/commons.py

@@ -24,7 +24,7 @@ error_log_path = "s3://llm-pdf-text/err_logs/"
 # json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
 json_dump_path = "s3://llm-pdf-text/json_dump/"
 
-s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # TODO 基础库不应该有这些存在的路径,应该在业务代码中定义
+# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径,应该在业务代码中定义
 
 
 def get_top_percent_list(num_list, percent):

+ 15 - 0
magic_pdf/libs/hash_utils.py

@@ -0,0 +1,15 @@
+import hashlib
+
+
+def compute_md5(file_bytes):
+    hasher = hashlib.md5()
+    hasher.update(file_bytes)
+    return hasher.hexdigest().upper()
+
+
+def compute_sha256(input_string):
+    hasher = hashlib.sha256()
+    # 在Python3中,需要将字符串转化为字节对象才能被哈希函数处理
+    input_bytes = input_string.encode('utf-8')
+    hasher.update(input_bytes)
+    return hasher.hexdigest()

+ 23 - 11
magic_pdf/libs/pdf_image_tools.py

@@ -7,6 +7,7 @@ import io
 from magic_pdf.libs.commons import fitz
 from loguru import logger
 from magic_pdf.libs.commons import parse_bucket_key, join_path
+from magic_pdf.libs.hash_utils import compute_sha256
 
 
 def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str, s3_return_path=None, img_s3_client=None, upload_switch=True):
@@ -16,9 +17,13 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
     """
     # 拼接文件名
     filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}.jpg"
-    # 拼接路径
-    image_save_path = join_path(save_parent_path, filename)
+
+    # 老版本返回不带bucket的路径
     s3_img_path = join_path(s3_return_path, filename) if s3_return_path is not None else None
+
+    # 新版本生成s3的平铺路径
+    s3_img_hash256_path = f"{compute_sha256(s3_img_path)}.jpg"
+
     # 打印图片文件名
     # print(f"Saved {image_save_path}")
 
@@ -42,12 +47,16 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
     # 截取图片
     pix = page.get_pixmap(clip=rect, matrix=zoom)
 
-    if image_save_path.startswith("s3://"):
+    if save_parent_path.startswith("s3://"):
         if not upload_switch:
             pass
         else:
-            # 图片保存到s3
-            bucket_name, bucket_key = parse_bucket_key(image_save_path)
+            """图片保存到s3"""
+            # 从save_parent_path获取bucket_name
+            bucket_name, bucket_key = parse_bucket_key(save_parent_path)
+            # 平铺路径赋值给bucket_key
+            bucket_key = s3_img_hash256_path
+
             # 将字节流上传到s3
             byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
             file_obj = io.BytesIO(byte_data)
@@ -58,18 +67,21 @@ def cut_image(bbox: Tuple, page_num: int, page: fitz.Page, save_parent_path: str
                 # img_s3_client_once.upload_fileobj(file_obj, bucket_name, bucket_key)
             else:
                 logger.exception("must input img_s3_client")
-        return s3_img_path
+        # return s3_img_path # 早期版本要求返回不带bucket的路径
+        s3_image_save_path = f"s3://{bucket_name}/{s3_img_hash256_path}"  # 新版本返回平铺的s3路径
+        return s3_image_save_path
     else:
         # 保存图片到本地
         # 先检查一下local_image_save_path的父目录是否存在,如果不存在,就创建
-        parent_dir = os.path.dirname(image_save_path)
+        local_image_save_path = join_path(save_parent_path, filename)
+        parent_dir = os.path.dirname(local_image_save_path)
         if not os.path.exists(parent_dir):
             os.makedirs(parent_dir)
-        pix.save(image_save_path, jpg_quality=95)
+        pix.save(local_image_save_path, jpg_quality=95)
         # 为了直接能在markdown里看,这里把地址改为相对于markdown的地址
-        pth = Path(image_save_path)
-        image_save_path = f"{pth.parent.name}/{pth.name}"
-        return image_save_path
+        pth = Path(local_image_save_path)
+        local_image_save_path = f"{pth.parent.name}/{pth.name}"
+        return local_image_save_path
 
 
 def save_images_by_bboxes(book_name: str, page_num: int, page: fitz.Page, save_path: str,

+ 6 - 3
magic_pdf/pdf_parse_by_ocr.py

@@ -15,6 +15,7 @@ from magic_pdf.libs.commons import (
 )
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.drop_tag import DropTag
+from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.ocr_content_type import ContentType
 from magic_pdf.libs.safe_filename import sanitize_filename
 from magic_pdf.para.para_split import para_split
@@ -39,18 +40,18 @@ def parse_pdf_by_ocr(
         pdf_bytes,
         pdf_model_output,
         save_path,
-        book_name,
+        book_name="",
         pdf_model_profile=None,
         image_s3_config=None,
         start_page_id=0,
         end_page_id=None,
         debug_mode=False,
 ):
-
+    pdf_bytes_md5 = compute_md5(pdf_bytes)
     save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
-    book_name = sanitize_filename(book_name)
     md_bookname_save_path = ""
     if debug_mode:
+        book_name = sanitize_filename(book_name)
         save_path = join_path(save_tmp_path, "md")
         pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name)
 
@@ -179,6 +180,8 @@ def parse_pdf_by_ocr(
         spans, dropped_spans_by_removed_bboxes = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
 
         '''对image和table截图'''
+        if book_name == "":
+            book_name = pdf_bytes_md5
         spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
 
         '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''

+ 3 - 0
magic_pdf/pre_proc/ocr_cut_image.py

@@ -4,6 +4,9 @@ from magic_pdf.libs.pdf_image_tools import cut_image
 
 
 def cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client):
+
+    """spark环境book_name为pdf_bytes_md5,本地环境会传正常bookname"""
+
     def s3_return_path(type):
         return join_path(book_name, type)
 

+ 9 - 1
magic_pdf/spark/spark_api.py

@@ -15,6 +15,7 @@
 
 
 from magic_pdf.io import AbsReaderWriter
+from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
 
 
 def parse_txt_pdf(pdf_bytes:bytes, pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, **kwargs):
@@ -28,7 +29,14 @@ def parse_ocr_pdf(pdf_bytes:bytes,  pdf_models:list, imageWriter: AbsReaderWrite
     """
     解析ocr类pdf
     """
-    pass
+    pdf_info_dict = parse_pdf_by_ocr(
+        pdf_bytes,
+        pdf_models,
+        imageWriter,
+        start_page_id=start_page,
+        debug_mode=is_debug,
+    )
+    return pdf_info_dict
 
 
 def parse_union_pdf(pdf_bytes:bytes,  pdf_models:list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,  *args, **kwargs):