1 年之前 · 6b6f40f350
--- a/.github/workflows/cli.yml
+++ b/.github/workflows/cli.yml
@@ -0,0 +1,80 @@
 
				+# This workflow checks out the repository, reinstalls Python dependencies when requirements.txt changed, and runs the CLI tests on a self-hosted runner
			
 
				+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
			
 
				+
			
 
				+name: PDF
			
 
				+on:
			
 
				+  push:
			
 
				+    branches:
			
 
				+      - "master"
			
 
				+    paths-ignore:
			
 
				+      - "cmds/**"
			
 
				+      - "**.md"
			
 
				+  pull_request:
			
 
				+    branches:
			
 
				+      - "master"
			
 
				+    paths-ignore:
			
 
				+      - "cmds/**"
			
 
				+      - "**.md"
			
 
				+  workflow_dispatch:
			
 
				+jobs:
			
 
				+  cli-test:
			
 
				+    runs-on: pdf
			
 
				+    timeout-minutes: 40
			
 
				+    strategy:
			
 
				+      fail-fast: true
			
 
				+
			
 
				+    steps:
			
 
				+    - name: config-net
			
 
				+      # Proxy URL (incl. credentials) comes from the CI_PROXY_URL secret; $GITHUB_ENV makes it visible to later steps.
			
 
				+      run: |
			
 
				+        echo "http_proxy=${{ secrets.CI_PROXY_URL }}" >> "$GITHUB_ENV"
			
 
				+        echo "https_proxy=${{ secrets.CI_PROXY_URL }}" >> "$GITHUB_ENV"
			
 
				+    - name: PDF cli
			
 
				+      uses: actions/checkout@v3
			
 
				+      with:
			
 
				+        fetch-depth: 2
			
 
				+    - name: check-requirements
			
 
				+      run: |
			
 
				+        changed_files=$(git diff --name-only -r HEAD~1 HEAD)
			
 
				+        echo $changed_files
			
 
				+        if [[ $changed_files =~ "requirements.txt" ]]; then
			
 
				+          pip install -r requirements.txt
			
 
				+        fi
			
 
				+    - name: config-net-reset
			
 
				+      run: |
			
 
				+        echo "http_proxy=" >> "$GITHUB_ENV"
			
 
				+        echo "https_proxy=" >> "$GITHUB_ENV"
			
 
				+    - name: test_cli
			
 
				+      run: |
			
 
				+        echo $GITHUB_WORKSPACE
			
 
				+        cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli.py
			
 
				+
			
 
				+                                                                                                                            
			
 
				+  notify_to_feishu:
			
 
				+    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
			
 
				+    needs: [cli-test]
			
 
				+    runs-on: pdf
			
 
				+    steps:
			
 
				+    - name: get_actor
			
 
				+      run: |
			
 
				+          metion_list="quyuan"
			
 
				+          echo $GITHUB_ACTOR
			
 
				+          if [[ $GITHUB_ACTOR == "drunkpig" ]]; then
			
 
				+            metion_list="xuchao"
			
 
				+          elif [[ $GITHUB_ACTOR == "myhloli" ]]; then
			
 
				+            metion_list="zhaoxiaomeng"
			
 
				+          elif [[ $GITHUB_ACTOR == "icecraft" ]]; then
			
 
				+            metion_list="xurui1"
			
 
				+          fi
			
 
				+          echo $metion_list
			
 
				+          echo "METIONS=$metion_list" >> "$GITHUB_ENV"
			
 
				+          # METIONS becomes available to later steps only; expanding env.METIONS in this step would print an empty value.
			
 
				+
			
 
				+    - name: notify
			
 
				+      run: |
			
 
				+        curl  ${{ secrets.WEBHOOK_URL }} -H 'Content-Type: application/json'  -d '{
			
 
				+        "msgtype": "text",
			
 
				+        "text": {
			
 
				+            "mentioned_list": ["${{ env.METIONS }}"] , "content": "'${{ github.repository }}' GitHubAction Failed!\n 细节请查看：https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"
			
 
				+        } 
			
 
				+        }'     
			
--- a/magic_pdf/cli/__init__.py
+++ b/magic_pdf/cli/__init__.py
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -95,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
 
				     page_markdown = []
			
 
				     for para_block in paras_of_layout:
			
 
				         para_text = ''
			
 
				-        para_type = para_block.get('type')
			
 
				+        para_type = para_block['type']
			
 
				         if para_type == BlockType.Text:
			
 
				             para_text = merge_para_with_text(para_block)
			
 
				         elif para_type == BlockType.Title:
			
@@ -106,32 +106,30 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
 
				             if mode == 'nlp':
			
 
				                 continue
			
 
				             elif mode == 'mm':
			
 
				-                img_blocks = para_block.get('blocks')
			
 
				-                for img_block in img_blocks:
			
 
				-                    if img_block.get('type') == BlockType.ImageBody:
			
 
				-                        for line in img_block.get('lines'):
			
 
				+                for block in para_block['blocks']:
			
 
				+                    if block['type'] == BlockType.ImageBody:
			
 
				+                        for line in block['lines']:
			
 
				                             for span in line['spans']:
			
 
				-                                if span.get('type') == ContentType.Image:
			
 
				+                                if span['type'] == ContentType.Image:
			
 
				                                     para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
			
 
				-                for img_block in img_blocks:
			
 
				-                    if img_block.get('type') == BlockType.ImageCaption:
			
 
				-                        para_text += merge_para_with_text(img_block)
			
 
				+                for block in para_block['blocks']:
			
 
				+                    if block['type'] == BlockType.ImageCaption:
			
 
				+                        para_text += merge_para_with_text(block)
			
 
				         elif para_type == BlockType.Table:
			
 
				             if mode == 'nlp':
			
 
				                 continue
			
 
				             elif mode == 'mm':
			
 
				-                table_blocks = para_block.get('blocks')
			
 
				-                for table_block in table_blocks:
			
 
				-                    if table_block.get('type') == BlockType.TableBody:
			
 
				-                        for line in table_block.get('lines'):
			
 
				+                for block in para_block['blocks']:
			
 
				+                    if block['type'] == BlockType.TableBody:
			
 
				+                        for line in block['lines']:
			
 
				                             for span in line['spans']:
			
 
				-                                if span.get('type') == ContentType.Table:
			
 
				+                                if span['type'] == ContentType.Table:
			
 
				                                     para_text = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
			
 
				-                for table_block in table_blocks:
			
 
				-                    if table_block.get('type') == BlockType.TableCaption:
			
 
				-                        para_text += merge_para_with_text(table_block)
			
 
				-                    elif table_block.get('type') == BlockType.TableFootnote:
			
 
				-                        para_text += merge_para_with_text(table_block)
			
 
				+                for block in para_block['blocks']:
			
 
				+                    if block['type'] == BlockType.TableCaption:
			
 
				+                        para_text += merge_para_with_text(block)
			
 
				+                    elif block['type'] == BlockType.TableFootnote:
			
 
				+                        para_text += merge_para_with_text(block)
			
 
				 
			
 
				         if para_text.strip() == '':
			
 
				             continue
			
@@ -141,11 +139,11 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
 
				     return page_markdown
			
 
				 
			
 
				 
			
 
				-def merge_para_with_text(para):
			
 
				+def merge_para_with_text(para_block):
			
 
				     para_text = ''
			
 
				-    for line in para['lines']:
			
 
				+    for line in para_block['lines']:
			
 
				         for span in line['spans']:
			
 
				-            span_type = span.get('type')
			
 
				+            span_type = span['type']
			
 
				             content = ''
			
 
				             language = ''
			
 
				             if span_type == ContentType.Text:
			
@@ -159,6 +157,7 @@ def merge_para_with_text(para):
 
				                 content = f"${span['content']}$"
			
 
				             elif span_type == ContentType.InterlineEquation:
			
 
				                 content = f"\n$$\n{span['content']}\n$$\n"
			
 
				+
			
 
				             if content != '':
			
 
				                 if language == 'en':  # 英文语境下 content间需要空格分隔
			
 
				                     para_text += content + ' '
			
--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
@@ -132,7 +132,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
 
				     pdf_docs = fitz.open("pdf", pdf_bytes)
			
 
				     for i, page in enumerate(pdf_docs):
			
 
				         draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
			
 
				-        draw_bbox_without_number(i, dropped_bbox_list, page, [0, 255, 0], True)
			
 
				+        draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
			
 
				         draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True)  # color !
			
 
				         draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
			
 
				         draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
			
@@ -142,7 +142,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
 
				         draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
			
 
				         draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
			
 
				         draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
			
 
				-        draw_bbox_without_number(i, interequations_list, page, [160, 160, 160], True)
			
 
				+        draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)
			
 
				 
			
 
				     # Save the PDF
			
 
				     pdf_docs.save(f"{out_path}/layout.pdf")
			
--- a/magic_pdf/model/__init__.py
+++ b/magic_pdf/model/__init__.py
--- a/magic_pdf/pdf_parse_by_ocr_v2.py
+++ b/magic_pdf/pdf_parse_by_ocr_v2.py
@@ -61,7 +61,7 @@ def parse_pdf_by_ocr(pdf_bytes,
 
				         '''将所有区块的bbox整理到一起'''
			
 
				         all_bboxes = ocr_prepare_bboxes_for_layout_split(
			
 
				             img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
			
 
				-            interline_equation_blocks, page_w, page_h)
			
 
				+            interline_equations, page_w, page_h)
			
 
				 
			
 
				         '''根据区块信息计算layout'''
			
 
				         page_boundry = [0, 0, page_w, page_h]
			
--- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
@@ -57,8 +57,8 @@ def fix_text_overlap_title_blocks(all_bboxes):
 
				 
			
 
				     for text_block in text_blocks:
			
 
				         for title_block in title_blocks:
			
 
				-            text_block_bbox = text_block[0], text_block[1], text_block[2], text_block[3]
			
 
				-            title_block_bbox = title_block[0], title_block[1], title_block[2], title_block[3]
			
 
				+            text_block_bbox = text_block[:4]
			
 
				+            title_block_bbox = title_block[:4]
			
 
				             if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
			
 
				                 all_bboxes.remove(title_block)
			
 
				 
			
@@ -66,27 +66,37 @@ def fix_text_overlap_title_blocks(all_bboxes):
 
				 
			
 
				 
			
 
				 def remove_need_drop_blocks(all_bboxes, discarded_blocks):
			
 
				-    for block in all_bboxes.copy():
			
 
				+    need_remove = []
			
 
				+    for block in all_bboxes:
			
 
				         for discarded_block in discarded_blocks:
			
 
				-            block_bbox = block[0], block[1], block[2], block[3]
			
 
				+            block_bbox = block[:4]
			
 
				             if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6:
			
 
				-                all_bboxes.remove(block)
			
 
				+                if block not in need_remove:
			
 
				+                    need_remove.append(block)
			
 
				+                    break
			
 
				+
			
 
				+    if len(need_remove) > 0:
			
 
				+        for block in need_remove:
			
 
				+            all_bboxes.remove(block)
			
 
				     return all_bboxes
			
 
				 
			
 
				 
			
 
				 def remove_overlaps_min_blocks(all_bboxes):
			
 
				     #  删除重叠blocks中较小的那些
			
 
				-    for block1 in all_bboxes.copy():
			
 
				-        for block2 in all_bboxes.copy():
			
 
				+    need_remove = []
			
 
				+    for block1 in all_bboxes:
			
 
				+        for block2 in all_bboxes:
			
 
				             if block1 != block2:
			
 
				-                block1_bbox = [block1[0], block1[1], block1[2], block1[3]]
			
 
				-                block2_bbox = [block2[0], block2[1], block2[2], block2[3]]
			
 
				+                block1_bbox = block1[:4]
			
 
				+                block2_bbox = block2[:4]
			
 
				                 overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
			
 
				                 if overlap_box is not None:
			
 
				-                    bbox_to_remove = next(
			
 
				-                        (block for block in all_bboxes if [block[0], block[1], block[2], block[3]] == overlap_box),
			
 
				-                        None)
			
 
				-                    if bbox_to_remove is not None:
			
 
				-                        all_bboxes.remove(bbox_to_remove)
			
 
				+                    bbox_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
			
 
				+                    if bbox_to_remove is not None and bbox_to_remove not in need_remove:
			
 
				+                        need_remove.append(bbox_to_remove)
			
 
				+
			
 
				+    if len(need_remove) > 0:
			
 
				+        for block in need_remove:
			
 
				+            all_bboxes.remove(block)
			
 
				 
			
 
				     return all_bboxes
			
--- a/magic_pdf/pre_proc/ocr_span_list_modify.py
+++ b/magic_pdf/pre_proc/ocr_span_list_modify.py
@@ -9,16 +9,20 @@ from magic_pdf.libs.ocr_content_type import ContentType, BlockType
 
				 def remove_overlaps_min_spans(spans):
			
 
				     dropped_spans = []
			
 
				     #  删除重叠spans中较小的那些
			
 
				-    for span1 in spans.copy():
			
 
				-        for span2 in spans.copy():
			
 
				+    for span1 in spans:
			
 
				+        for span2 in spans:
			
 
				             if span1 != span2:
			
 
				                 overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
			
 
				                 if overlap_box is not None:
			
 
				-                    bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
			
 
				-                    if bbox_to_remove is not None:
			
 
				-                        spans.remove(bbox_to_remove)
			
 
				-                        bbox_to_remove['tag'] = DropTag.SPAN_OVERLAP
			
 
				-                        dropped_spans.append(bbox_to_remove)
			
 
				+                    span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
			
 
				+                    if span_need_remove is not None and span_need_remove not in dropped_spans:
			
 
				+                        dropped_spans.append(span_need_remove)
			
 
				+
			
 
				+    if len(dropped_spans) > 0:
			
 
				+        for span_need_remove in dropped_spans:
			
 
				+            spans.remove(span_need_remove)
			
 
				+            span_need_remove['tag'] = DropTag.SPAN_OVERLAP
			
 
				+
			
 
				     return spans, dropped_spans
			
 
				 
			
 
				 
			
@@ -29,11 +33,13 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
 
				     for span in spans:
			
 
				         for removed_bbox in need_remove_spans_bboxes:
			
 
				             if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
			
 
				-                need_remove_spans.append(span)
			
 
				-                break
			
 
				+                if span not in need_remove_spans:
			
 
				+                    need_remove_spans.append(span)
			
 
				+                    break
			
 
				 
			
 
				-    for span in need_remove_spans:
			
 
				-        spans.remove(span)
			
 
				+    if len(need_remove_spans) > 0:
			
 
				+        for span in need_remove_spans:
			
 
				+            spans.remove(span)
			
 
				 
			
 
				     return spans
			
 
				 
			
--- a/magic_pdf/user_api.py
+++ b/magic_pdf/user_api.py
@@ -74,7 +74,7 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
 
				                 debug_mode=is_debug,
			
 
				             )
			
 
				         except Exception as e:
			
 
				-            logger.error(f"{method.__name__} error: {e}")
			
 
				+            logger.exception(e)
			
 
				             return None
			
 
				 
			
 
				     pdf_info_dict = parse_pdf(parse_pdf_by_txt)
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,4 +17,6 @@ zh_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/zh_
 
				 scikit-learn>=1.0.2
			
 
				 nltk==3.8.1
			
 
				 s3pathlib>=2.1.1
			
 
				+pytest
			
 
				+# subprocess is part of the Python standard library and must not be listed as a pip requirement.
			
 
				 
			
--- a/tests/test_cli/conf/__init__.py
+++ b/tests/test_cli/conf/__init__.py
--- a/tests/test_cli/conf/conf.py
+++ b/tests/test_cli/conf/conf.py
@@ -0,0 +1,12 @@
 
				+import os
			
 
				+
			
 
				+# GitHub Actions exports GITHUB_WORKSPACE; fall back to the current directory
			
 
				+# so that importing this module outside CI does not fail with a TypeError.
			
 
				+_workspace = os.environ.get('GITHUB_WORKSPACE', os.getcwd())
			
 
				+
			
 
				+conf = {
			
 
				+    "code_path": _workspace,
			
 
				+    "pdf_dev_path": _workspace + "/tests/test_cli/pdf_dev",
			
 
				+    # NOTE(review): absolute path on a specific machine; confirm it matches where the CLI writes its results.
			
 
				+    "pdf_res_path": "/home/quyuan/code/Magic-PDF/Magic-PDF/Magic-PDF/data",
			
 
				+}
			
--- a/tests/test_cli/lib/common.py
+++ b/tests/test_cli/lib/common.py
@@ -0,0 +1,25 @@
 
				+import os
			
 
				+import subprocess
			
 
				+
			
 
				+
			
 
				+def check_shell(cmd):
			
 
				+    """Run ``cmd`` through the shell and assert that it exits with status 0."""
			
 
				+    # subprocess.check_output returns the captured stdout (bytes), never the
			
 
				+    # exit status, so comparing its result with 0 could not succeed.
			
 
				+    res = subprocess.call(cmd, shell=True)
			
 
				+    assert res == 0
			
 
				+
			
 
				+
			
 
				+def count_folders_and_check_contents(directory):
			
 
				+    """Assert that ``directory`` holds exactly 3 sub-folders, none of them empty."""
			
 
				+    # List every file and folder directly under the directory.
			
 
				+    contents = os.listdir(directory)
			
 
				+    folder_count = 0
			
 
				+    for item in contents:
			
 
				+        folder_path = os.path.join(directory, item)
			
 
				+        # Only sub-folders are counted.
			
 
				+        if os.path.isdir(folder_path):
			
 
				+            folder_count += 1
			
 
				+            # os.listdir never returns None; an empty folder yields [], which is falsy.
			
 
				+            assert os.listdir(folder_path)
			
 
				+    assert folder_count == 3
			
--- a/tests/test_cli/pdf_dev/14a75ee1-b88a-4fe7-bb10-62cbfabbfdec.html.json
+++ b/tests/test_cli/pdf_dev/14a75ee1-b88a-4fe7-bb10-62cbfabbfdec.html.json
--- a/tests/test_cli/pdf_dev/14a75ee1-b88a-4fe7-bb10-62cbfabbfdec.html.pdf
+++ b/tests/test_cli/pdf_dev/14a75ee1-b88a-4fe7-bb10-62cbfabbfdec.html.pdf
--- a/tests/test_cli/pdf_dev/2365839d-4116-45de-b2f0-3a740e1d6c20.html.json
+++ b/tests/test_cli/pdf_dev/2365839d-4116-45de-b2f0-3a740e1d6c20.html.json
--- a/tests/test_cli/pdf_dev/2365839d-4116-45de-b2f0-3a740e1d6c20.html.pdf
+++ b/tests/test_cli/pdf_dev/2365839d-4116-45de-b2f0-3a740e1d6c20.html.pdf
--- a/tests/test_cli/pdf_dev/24cb61a0-cace-460a-a42b-495a86caf88f.html.json
+++ b/tests/test_cli/pdf_dev/24cb61a0-cace-460a-a42b-495a86caf88f.html.json
--- a/tests/test_cli/pdf_dev/24cb61a0-cace-460a-a42b-495a86caf88f.html.pdf
+++ b/tests/test_cli/pdf_dev/24cb61a0-cace-460a-a42b-495a86caf88f.html.pdf
--- a/tests/test_cli/pdf_dev/300970fd-b34a-4656-a334-23059595b360.html.json
+++ b/tests/test_cli/pdf_dev/300970fd-b34a-4656-a334-23059595b360.html.json
--- a/tests/test_cli/pdf_dev/300970fd-b34a-4656-a334-23059595b360.html.pdf
+++ b/tests/test_cli/pdf_dev/300970fd-b34a-4656-a334-23059595b360.html.pdf
--- a/tests/test_cli/pdf_dev/40c595b5-3b62-4021-b8dd-5e445d223c47.html.json
+++ b/tests/test_cli/pdf_dev/40c595b5-3b62-4021-b8dd-5e445d223c47.html.json
--- a/tests/test_cli/pdf_dev/40c595b5-3b62-4021-b8dd-5e445d223c47.html.pdf
+++ b/tests/test_cli/pdf_dev/40c595b5-3b62-4021-b8dd-5e445d223c47.html.pdf
--- a/tests/test_cli/pdf_dev/416b8524-9a6f-4b49-b7d4-56ce5c825699.html.json
+++ b/tests/test_cli/pdf_dev/416b8524-9a6f-4b49-b7d4-56ce5c825699.html.json
--- a/tests/test_cli/pdf_dev/416b8524-9a6f-4b49-b7d4-56ce5c825699.html.pdf
+++ b/tests/test_cli/pdf_dev/416b8524-9a6f-4b49-b7d4-56ce5c825699.html.pdf
--- a/tests/test_cli/pdf_dev/658cbc48-9edd-4537-8b02-261c052a2845.html.json
+++ b/tests/test_cli/pdf_dev/658cbc48-9edd-4537-8b02-261c052a2845.html.json
--- a/tests/test_cli/pdf_dev/658cbc48-9edd-4537-8b02-261c052a2845.html.pdf
+++ b/tests/test_cli/pdf_dev/658cbc48-9edd-4537-8b02-261c052a2845.html.pdf
--- a/tests/test_cli/pdf_dev/789b3b75-b5ad-49c2-8ba1-e8719f7a1d42.html.json
+++ b/tests/test_cli/pdf_dev/789b3b75-b5ad-49c2-8ba1-e8719f7a1d42.html.json
--- a/tests/test_cli/pdf_dev/789b3b75-b5ad-49c2-8ba1-e8719f7a1d42.html.pdf
+++ b/tests/test_cli/pdf_dev/789b3b75-b5ad-49c2-8ba1-e8719f7a1d42.html.pdf
--- a/tests/test_cli/pdf_dev/9eb3c6a7-1564-4a10-8cfb-56c628e46208.html.json
+++ b/tests/test_cli/pdf_dev/9eb3c6a7-1564-4a10-8cfb-56c628e46208.html.json
--- a/tests/test_cli/pdf_dev/9eb3c6a7-1564-4a10-8cfb-56c628e46208.html.pdf
+++ b/tests/test_cli/pdf_dev/9eb3c6a7-1564-4a10-8cfb-56c628e46208.html.pdf
--- a/tests/test_cli/pdf_dev/b80cbc13-6655-42a8-a3a1-fe2db6eff883.html.json
+++ b/tests/test_cli/pdf_dev/b80cbc13-6655-42a8-a3a1-fe2db6eff883.html.json
--- a/tests/test_cli/pdf_dev/b80cbc13-6655-42a8-a3a1-fe2db6eff883.html.pdf
+++ b/tests/test_cli/pdf_dev/b80cbc13-6655-42a8-a3a1-fe2db6eff883.html.pdf
--- a/tests/test_cli/pdf_dev/bb72581d-bcbd-419c-ba55-a26af7c7f00d.html.json
+++ b/tests/test_cli/pdf_dev/bb72581d-bcbd-419c-ba55-a26af7c7f00d.html.json
--- a/tests/test_cli/pdf_dev/bb72581d-bcbd-419c-ba55-a26af7c7f00d.html.pdf
+++ b/tests/test_cli/pdf_dev/bb72581d-bcbd-419c-ba55-a26af7c7f00d.html.pdf
--- a/tests/test_cli/pdf_dev/ef36fc6f-d521-49b6-9846-85e565404632.html.json
+++ b/tests/test_cli/pdf_dev/ef36fc6f-d521-49b6-9846-85e565404632.html.json
--- a/tests/test_cli/pdf_dev/ef36fc6f-d521-49b6-9846-85e565404632.html.pdf
+++ b/tests/test_cli/pdf_dev/ef36fc6f-d521-49b6-9846-85e565404632.html.pdf
--- a/tests/test_cli/pdf_dev/p3_图文混排84.json
+++ b/tests/test_cli/pdf_dev/p3_图文混排84.json
--- a/tests/test_cli/pdf_dev/p3_图文混排84.pdf
+++ b/tests/test_cli/pdf_dev/p3_图文混排84.pdf
--- a/tests/test_cli/test_cli.py
+++ b/tests/test_cli/test_cli.py
@@ -0,0 +1,31 @@
 
				+import pytest
			
 
				+import os
			
 
				+from conf import conf
			
 
				+import subprocess
			
 
				+from lib import common
			
 
				+import logging
			
 
				+pdf_res_path = conf.conf["pdf_res_path"]
			
 
				+code_path = conf.conf["code_path"]
			
 
				+pdf_dev_path = conf.conf["pdf_dev_path"]
			
 
				+class TestCli:
			
 
				+   
			
 
				+    def test_pdf_specify_dir(self):
			
 
				+        """
			
 
				+        Input: PDFs plus model results from a specified directory; runs pdf-command on every PDF under pdf_dev_path, then checks pdf_res_path.
			
 
				+        """
			
 
				+        cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py  pdf-command  --pdf {}' % (code_path, pdf_dev_path)
			
 
				+        logging.info(cmd)
			
 
				+        common.check_shell(cmd)
			
 
				+        common.count_folders_and_check_contents(pdf_res_path)      
			
 
				+   
			
 
				+
			
 
				+    def test_pdf_specify_jsonl(self):
			
 
				+        """
			
 
				+        Input: jsonl. NOTE(review): unfinished - the command string is built but never formatted or executed, so this test always passes.
			
 
				+        """
			
 
				+        cmd = "cd %s && export PYTHONPATH=. && python " 
			
 
				+
			
 
				+ 
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    pytest.main()