Browse source

feat: add batch processing for OCR detection and implement new client and common utilities

myhloli 5 months ago
parent
commit
0a899f1af8

+ 189 - 0
mineru/api/vlm_middle_json_mkcontent.py

@@ -0,0 +1,189 @@
+import re
+from ..utils.enum_class import MakeMode, BlockType, ContentType
+
+
+def merge_para_with_text(para_block):
+
+    para_text = ''
+    for line in para_block['lines']:
+        for span in line['spans']:
+            content = span['content']
+            content = content.strip()
+
+            if content:
+                para_text += content
+
+    return para_text
+
+def mk_blocks_to_markdown(para_blocks, make_mode, img_buket_path=''):
+    page_markdown = []
+    for para_block in para_blocks:
+        para_text = ''
+        para_type = para_block['type']
+        if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX, BlockType.TITLE, BlockType.INTERLINE_EQUATION]:
+            para_text = merge_para_with_text(para_block)
+        elif para_type == BlockType.IMAGE:
+            if make_mode == MakeMode.NLP_MD:
+                continue
+            elif make_mode == MakeMode.MM_MD:
+                # Check whether the image block contains a footnote
+                has_image_footnote = any(block['type'] == BlockType.IMAGE_FOOTNOTE for block in para_block['blocks'])
+                # If a footnote exists, append it after the image body
+                if has_image_footnote:
+                    for block in para_block['blocks']:  # 1st: append image_caption
+                        if block['type'] == BlockType.IMAGE_CAPTION:
+                            para_text += merge_para_with_text(block) + '  \n'
+                    for block in para_block['blocks']:  # 2nd: append image_body
+                        if block['type'] == BlockType.IMAGE_BODY:
+                            for line in block['lines']:
+                                for span in line['spans']:
+                                    if span['type'] == ContentType.IMAGE:
+                                        if span.get('image_path', ''):
+                                            para_text += f"![]({img_buket_path}/{span['image_path']})"
+                    for block in para_block['blocks']:  # 3rd: append image_footnote
+                        if block['type'] == BlockType.IMAGE_FOOTNOTE:
+                            para_text += '  \n' + merge_para_with_text(block)
+                else:
+                    for block in para_block['blocks']:  # 1st: append image_body
+                        if block['type'] == BlockType.IMAGE_BODY:
+                            for line in block['lines']:
+                                for span in line['spans']:
+                                    if span['type'] == ContentType.IMAGE:
+                                        if span.get('image_path', ''):
+                                            para_text += f"![]({img_buket_path}/{span['image_path']})"
+                    for block in para_block['blocks']:  # 2nd: append image_caption
+                        if block['type'] == BlockType.IMAGE_CAPTION:
+                            para_text += '  \n' + merge_para_with_text(block)
+
+        elif para_type == BlockType.TABLE:
+            if make_mode == MakeMode.NLP_MD:
+                continue
+            elif make_mode == MakeMode.MM_MD:
+                for block in para_block['blocks']:  # 1st: append table_caption
+                    if block['type'] == BlockType.TABLE_CAPTION:
+                        para_text += merge_para_with_text(block) + '  \n'
+                for block in para_block['blocks']:  # 2nd: append table_body
+                    if block['type'] == BlockType.TABLE_BODY:
+                        for line in block['lines']:
+                            for span in line['spans']:
+                                if span['type'] == ContentType.TABLE:
+                                    # if processed by table model
+                                    if span.get('html', ''):
+                                        para_text += f"\n{span['html']}\n"
+                                    elif span.get('image_path', ''):
+                                        para_text += f"![]({img_buket_path}/{span['image_path']})"
+                for block in para_block['blocks']:  # 3rd: append table_footnote
+                    if block['type'] == BlockType.TABLE_FOOTNOTE:
+                        para_text += '\n' + merge_para_with_text(block) + '  '
+
+        if para_text.strip() == '':
+            continue
+        else:
+            # page_markdown.append(para_text.strip() + '  ')
+            page_markdown.append(para_text.strip())
+
+    return page_markdown
+
+
+def count_leading_hashes(text):
+    match = re.match(r'^(#+)', text)
+    return len(match.group(1)) if match else 0
+
+def strip_leading_hashes(text):
+    # Strip the leading '#' characters and any whitespace right after them
+    return re.sub(r'^#+\s*', '', text)
+
+
+def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
+    para_type = para_block['type']
+    para_content = {}
+    if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
+        para_content = {
+            'type': 'text',
+            'text': merge_para_with_text(para_block),
+        }
+    elif para_type == BlockType.TITLE:
+        title_content = merge_para_with_text(para_block)
+        title_level = count_leading_hashes(title_content)
+        para_content = {
+            'type': 'text',
+            'text': strip_leading_hashes(title_content),
+        }
+        if title_level != 0:
+            para_content['text_level'] = title_level
+    elif para_type == BlockType.INTERLINE_EQUATION:
+        para_content = {
+            'type': 'equation',
+            'text': merge_para_with_text(para_block),
+            'text_format': 'latex',
+        }
+    elif para_type == BlockType.IMAGE:
+        para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
+        for block in para_block['blocks']:
+            if block['type'] == BlockType.IMAGE_BODY:
+                for line in block['lines']:
+                    for span in line['spans']:
+                        if span['type'] == ContentType.IMAGE:
+                            if span.get('image_path', ''):
+                                para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
+            if block['type'] == BlockType.IMAGE_CAPTION:
+                para_content['img_caption'].append(merge_para_with_text(block))
+            if block['type'] == BlockType.IMAGE_FOOTNOTE:
+                para_content['img_footnote'].append(merge_para_with_text(block))
+    elif para_type == BlockType.TABLE:
+        para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
+        for block in para_block['blocks']:
+            if block['type'] == BlockType.TABLE_BODY:
+                for line in block['lines']:
+                    for span in line['spans']:
+                        if span['type'] == ContentType.TABLE:
+
+                            if span.get('html', ''):
+                                para_content['table_body'] = f"{span['html']}"
+
+                            if span.get('image_path', ''):
+                                para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
+
+            if block['type'] == BlockType.TABLE_CAPTION:
+                para_content['table_caption'].append(merge_para_with_text(block))
+            if block['type'] == BlockType.TABLE_FOOTNOTE:
+                para_content['table_footnote'].append(merge_para_with_text(block))
+
+    para_content['page_idx'] = page_idx
+
+    return para_content
+
+def union_make(pdf_info_dict: list,
+               make_mode: str,
+               img_buket_path: str = '',
+               ):
+    output_content = []
+    for page_info in pdf_info_dict:
+        paras_of_layout = page_info.get('para_blocks')
+        page_idx = page_info.get('page_idx')
+        if not paras_of_layout:
+            continue
+        if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
+            page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
+            output_content.extend(page_markdown)
+        elif make_mode == MakeMode.STANDARD_FORMAT:
+            for para_block in paras_of_layout:
+                para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx)
+                output_content.append(para_content)
+
+    if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
+        return '\n\n'.join(output_content)
+    elif make_mode == MakeMode.STANDARD_FORMAT:
+        return output_content
+    return None
+
+
+def get_title_level(block):
+    title_level = block.get('level', 1)
+    if title_level > 4:
+        title_level = 4
+    elif title_level < 1:
+        title_level = 0
+    return title_level
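For orientation, a minimal sketch of driving the new union_make entry point by hand. The one-page pdf_info below is a hypothetical stand-in for what the VLM backend emits; the 'text' type string mirrors BlockType.TEXT:

from mineru.api.vlm_middle_json_mkcontent import union_make
from mineru.utils.enum_class import MakeMode

# Hypothetical one-page pdf_info holding a single text paragraph.
pdf_info = [{
    'page_idx': 0,
    'para_blocks': [{
        'type': 'text',  # BlockType.TEXT
        'lines': [{'spans': [{'content': 'Hello world'}]}],
    }],
}]

print(union_make(pdf_info, MakeMode.NLP_MD))           # Hello world
print(union_make(pdf_info, MakeMode.STANDARD_FORMAT))  # [{'type': 'text', 'text': 'Hello world', 'page_idx': 0}]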

+ 157 - 42
mineru/backend/pipeline/batch_analyze.py

@@ -1,6 +1,8 @@
 import cv2
 from loguru import logger
 from tqdm import tqdm
+from collections import defaultdict
+import numpy as np
 
 from .model_init import AtomModelSingleton
 from ...utils.model_utils import crop_img, get_res_list_from_layout_res, get_coords_and_area
@@ -12,11 +14,12 @@ MFR_BASE_BATCH_SIZE = 16
 
 
 class BatchAnalyze:
-    def __init__(self, model_manager, batch_ratio: int, formula_enable, table_enable):
+    def __init__(self, model_manager, batch_ratio: int, formula_enable, table_enable, enable_ocr_det_batch: bool = True):
         self.batch_ratio = batch_ratio
         self.formula_enable = formula_enable
         self.table_enable = table_enable
         self.model_manager = model_manager
+        self.enable_ocr_det_batch = enable_ocr_det_batch
 
     def __call__(self, images_with_extra_info: list) -> list:
         if len(images_with_extra_info) == 0:
@@ -89,48 +92,160 @@ class BatchAnalyze:
                                                 'table_img':table_img,
                                               })
 
-        # Text box detection
-
-        for ocr_res_list_dict in tqdm(ocr_res_list_all_page, desc="OCR-det Predict"):
-            # Process each area that requires OCR processing
-            _lang = ocr_res_list_dict['lang']
-            # Get OCR results for this language's images
-            ocr_model = atom_model_manager.get_atom_model(
-                atom_model_name='ocr',
-                det_db_box_thresh=0.3,
-                lang=_lang
-            )
-            for res in ocr_res_list_dict['ocr_res_list']:
-                new_image, useful_list = crop_img(
-                    res, ocr_res_list_dict['np_array_img'], crop_paste_x=50, crop_paste_y=50
-                )
-                adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(
-                    ocr_res_list_dict['single_page_mfdetrec_res'], useful_list
-                )
+        # OCR detection
+        if self.enable_ocr_det_batch:
+            # Batch mode - group crops by language and resolution
+            # Collect every cropped image that needs OCR detection
+            all_cropped_images_info = []
+
+            for ocr_res_list_dict in ocr_res_list_all_page:
+                _lang = ocr_res_list_dict['lang']
+
+                for res in ocr_res_list_dict['ocr_res_list']:
+                    new_image, useful_list = crop_img(
+                        res, ocr_res_list_dict['np_array_img'], crop_paste_x=50, crop_paste_y=50
+                    )
+                    adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(
+                        ocr_res_list_dict['single_page_mfdetrec_res'], useful_list
+                    )
+
+                    # RGB to BGR conversion
+                    new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)
+
+                    all_cropped_images_info.append((
+                        new_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang
+                    ))
+
+            # Group by language
+            lang_groups = defaultdict(list)
+            for crop_info in all_cropped_images_info:
+                lang = crop_info[5]
+                lang_groups[lang].append(crop_info)
+
+            # For each language, group by resolution and process in batches
+            for lang, lang_crop_list in lang_groups.items():
+                if not lang_crop_list:
+                    continue
+
+                # logger.info(f"Processing OCR detection for language {lang} with {len(lang_crop_list)} images")
+
+                # Get the OCR model for this language
+                ocr_model = atom_model_manager.get_atom_model(
+                    atom_model_name='ocr',
+                    ocr_show_log=False,
+                    det_db_box_thresh=0.3,
+                    lang=lang
+                )
 
-                # OCR-det
-                new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)
-                ocr_res = ocr_model.ocr(
-                    new_image, mfd_res=adjusted_mfdetrec_res, rec=False
-                )[0]
-
-                # Integration results
-                if ocr_res:
-                    ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang)
-
-                    if res["category_id"] == 3:
-                        # Sum of the areas of all bboxes in ocr_result_list
-                        ocr_res_area = sum(get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
-                        # Ratio of ocr_res_area to the area of res
-                        res_area = get_coords_and_area(res)[4]
-                        if res_area > 0:
-                            ratio = ocr_res_area / res_area
-                            if ratio > 0.25:
-                                res["category_id"] = 1
-                            else:
-                                continue
-
-                    ocr_res_list_dict['layout_res'].extend(ocr_result_list)
+                # Group by resolution, padding as we go
+                resolution_groups = defaultdict(list)
+                for crop_info in lang_crop_list:
+                    cropped_img = crop_info[0]
+                    h, w = cropped_img.shape[:2]
+                    # Use a generous grouping tolerance to keep the number of groups small:
+                    # normalize sizes up to the next multiple of 32
+                    normalized_h = ((h + 32) // 32) * 32  # round up to a multiple of 32
+                    normalized_w = ((w + 32) // 32) * 32
+                    group_key = (normalized_h, normalized_w)
+                    resolution_groups[group_key].append(crop_info)
+
+                # Run batched detection on each resolution group
+                for group_key, group_crops in tqdm(resolution_groups.items(), desc=f"OCR-det {lang}"):
+                    raw_images = [crop_info[0] for crop_info in group_crops]
+
+                    # Target size: the group's max dimensions, rounded up to a multiple of 32
+                    max_h = max(img.shape[0] for img in raw_images)
+                    max_w = max(img.shape[1] for img in raw_images)
+                    target_h = ((max_h + 32 - 1) // 32) * 32
+                    target_w = ((max_w + 32 - 1) // 32) * 32
+
+                    # Pad every image to the shared target size
+                    batch_images = []
+                    for img in raw_images:
+                        h, w = img.shape[:2]
+                        # White canvas of the target size
+                        padded_img = np.ones((target_h, target_w, 3), dtype=np.uint8) * 255
+                        # Paste the original image into the top-left corner
+                        padded_img[:h, :w] = img
+                        batch_images.append(padded_img)
+
+                    # Batched detection
+                    batch_size = min(len(batch_images), self.batch_ratio * 16)
+                    # logger.debug(f"OCR-det batch: {batch_size} images, target size: {target_h}x{target_w}")
+                    batch_results = ocr_model.text_detector.batch_predict(batch_images, batch_size)
+
+                    # Unpack the per-image results
+                    for crop_info, (dt_boxes, elapse) in zip(group_crops, batch_results):
+                        new_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang = crop_info
+
+                        if dt_boxes is not None:
+                            # Build the OCR result format - each box is a list of four points
+                            ocr_res = [box.tolist() for box in dt_boxes]
+
+                            if ocr_res:
+                                ocr_result_list = get_ocr_result_list(
+                                    ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang
+                                )
+
+                                if res["category_id"] == 3:
+                                    # Sum of the areas of all bboxes in ocr_result_list
+                                    ocr_res_area = sum(
+                                        get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
+                                    # Ratio of ocr_res_area to the area of res
+                                    res_area = get_coords_and_area(res)[4]
+                                    if res_area > 0:
+                                        ratio = ocr_res_area / res_area
+                                        if ratio > 0.25:
+                                            res["category_id"] = 1
+                                        else:
+                                            continue
+
+                                ocr_res_list_dict['layout_res'].extend(ocr_result_list)
+        else:
+            # Original single-image processing mode
+            for ocr_res_list_dict in tqdm(ocr_res_list_all_page, desc="OCR-det Predict"):
+                # Process each area that requires OCR processing
+                _lang = ocr_res_list_dict['lang']
+                # Get OCR results for this language's images
+                ocr_model = atom_model_manager.get_atom_model(
+                    atom_model_name='ocr',
+                    ocr_show_log=False,
+                    det_db_box_thresh=0.3,
+                    lang=_lang
+                )
+                for res in ocr_res_list_dict['ocr_res_list']:
+                    new_image, useful_list = crop_img(
+                        res, ocr_res_list_dict['np_array_img'], crop_paste_x=50, crop_paste_y=50
+                    )
+                    adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(
+                        ocr_res_list_dict['single_page_mfdetrec_res'], useful_list
+                    )
+
+                    # OCR-det
+                    new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)
+                    ocr_res = ocr_model.ocr(
+                        new_image, mfd_res=adjusted_mfdetrec_res, rec=False
+                    )[0]
+
+                    # Integration results
+                    if ocr_res:
+                        ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'],
+                                                              new_image, _lang)
+
+                        if res["category_id"] == 3:
+                            # Sum of the areas of all bboxes in ocr_result_list
+                            ocr_res_area = sum(
+                                get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
+                            # Ratio of ocr_res_area to the area of res
+                            res_area = get_coords_and_area(res)[4]
+                            if res_area > 0:
+                                ratio = ocr_res_area / res_area
+                                if ratio > 0.25:
+                                    res["category_id"] = 1
+                                else:
+                                    continue
+
+                        ocr_res_list_dict['layout_res'].extend(ocr_result_list)
 
         # Table recognition
         if self.table_enable:
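The heart of the batch path is the bucket-by-rounded-resolution step above: crops are grouped so each group can be padded onto one canvas and stacked into a single tensor. A self-contained sketch of just that step, NumPy only (the committed code rounds with extra headroom, (h + 32) // 32 * 32, so sizes near a bucket boundary share a group):

from collections import defaultdict
import numpy as np

def group_and_pad(images):
    # Bucket images by size rounded up to a multiple of 32, then pad each
    # bucket onto a shared white canvas so it can be stacked into one batch.
    groups = defaultdict(list)
    for img in images:
        h, w = img.shape[:2]
        groups[((h + 31) // 32 * 32, (w + 31) // 32 * 32)].append(img)
    for (th, tw), imgs in groups.items():
        batch = np.full((len(imgs), th, tw, 3), 255, dtype=np.uint8)
        for i, img in enumerate(imgs):
            batch[i, :img.shape[0], :img.shape[1]] = img  # paste top-left
        yield batch  # one stackable batch per resolution bucket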

+ 2 - 2
mineru/backend/vlm/token_to_middle_json.py

@@ -3,7 +3,7 @@ import re
 from mineru.utils.cut_image import cut_image_and_table
 from mineru.utils.enum_class import BlockType, ContentType
 from mineru.utils.hash_utils import str_md5
-from mineru.utils.magic_model import fix_two_layer_blocks
+from mineru.utils.vlm_magic_model import fix_two_layer_blocks
 from mineru.version import __version__
 
 
@@ -113,7 +113,7 @@ def token_to_page_info(token, image_dict, page, image_writer, page_index) -> dic
     # Sort page_blocks by their index value
     page_blocks.sort(key=lambda x: x["index"])
 
-    page_info = {"para_blocks": page_blocks, "page_size": [width, height], "page_idx": page_index}
+    page_info = {"para_blocks": page_blocks, "discarded_blocks": [], "page_size": [width, height], "page_idx": page_index}
     return page_info
 
 

+ 91 - 0
mineru/cli/client.py

@@ -0,0 +1,91 @@
+# Copyright (c) Opendatalab. All rights reserved.
+import os
+import click
+from pathlib import Path
+from loguru import logger
+from ..version import __version__
+from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
+
+
+@click.command()
+@click.version_option(__version__,
+                      '--version',
+                      '-v',
+                      help='display the version and exit')
+@click.option(
+    '-p',
+    '--path',
+    'input_path',
+    type=click.Path(exists=True),
+    required=True,
+    help='local file path or directory; supports pdf, png, jpg, jpeg files',
+)
+@click.option(
+    '-o',
+    '--output-dir',
+    'output_dir',
+    type=click.Path(),
+    required=True,
+    help='output local directory',
+)
+@click.option(
+    '-b',
+    '--backend',
+    'backend',
+    type=click.Choice(['pipeline', 'vlm-huggingface', 'vlm-sglang-engine', 'vlm-sglang-client']),
+    help="""the backend for parsing pdf:
+    pipeline: More general.
+    vlm-huggingface: More general.
+    vlm-sglang-engine: Faster(engine).
+    vlm-sglang-client: Faster(client).
+    without method specified, huggingface will be used by default.""",
+    default='pipeline',
+)
+@click.option(
+    '-u',
+    '--url',
+    'server_url',
+    type=str,
+    help="""
+    When the backend is `vlm-sglang-client`, you need to specify the server_url, for example: `http://127.0.0.1:30000`
+    """,
+    default=None,
+)
+@click.option(
+    '-s',
+    '--start',
+    'start_page_id',
+    type=int,
+    help='The starting page for PDF parsing, beginning from 0.',
+    default=0,
+)
+@click.option(
+    '-e',
+    '--end',
+    'end_page_id',
+    type=int,
+    help='The ending page for PDF parsing, beginning from 0.',
+    default=None,
+)
+
+def main(input_path, output_dir, backend, server_url, start_page_id, end_page_id):
+    os.makedirs(output_dir, exist_ok=True)
+
+    def parse_doc(path: Path):
+        try:
+            file_name = str(Path(path).stem)
+            pdf_bits = read_fn(path)
+            do_parse(output_dir, file_name, pdf_bits, backend, server_url,
+                     start_page_id=start_page_id, end_page_id=end_page_id)
+        except Exception as e:
+            logger.exception(e)
+
+    if os.path.isdir(input_path):
+        for doc_path in Path(input_path).glob('*'):
+            if doc_path.suffix in pdf_suffixes + image_suffixes:
+                parse_doc(Path(doc_path))
+    else:
+        parse_doc(Path(input_path))
+
+if __name__ == '__main__':
+    main()
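Since main is a plain click command, it can be smoke-tested without installing a console script by passing argv directly; the file paths here are hypothetical:

from mineru.cli.client import main

# standalone_mode=False stops click from calling sys.exit(), which makes
# this form convenient in tests and notebooks.
main(['-p', 'demo.pdf', '-o', './output', '-b', 'pipeline'], standalone_mode=False)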

+ 153 - 0
mineru/cli/common.py

@@ -0,0 +1,153 @@
+# Copyright (c) Opendatalab. All rights reserved.
+import io
+import json
+import os
+from pathlib import Path
+
+import pypdfium2 as pdfium
+from loguru import logger
+from ..api.vlm_middle_json_mkcontent import union_make
+from ..backend.vlm.vlm_analyze import doc_analyze
+from ..data.data_reader_writer import FileBasedDataWriter
+from ..utils.draw_bbox import draw_layout_bbox, draw_span_bbox
+from ..utils.enum_class import MakeMode
+from ..utils.pdf_image_tools import images_bytes_to_pdf_bytes
+
+pdf_suffixes = [".pdf"]
+image_suffixes = [".png", ".jpeg", ".jpg"]
+
+
+def read_fn(path: Path):
+    with open(str(path), "rb") as input_file:
+        file_bytes = input_file.read()
+        if path.suffix in image_suffixes:
+            return images_bytes_to_pdf_bytes(file_bytes)
+        elif path.suffix in pdf_suffixes:
+            return file_bytes
+        else:
+            raise Exception(f"Unknown file suffix: {path.suffix}")
+
+
+def prepare_env(output_dir, pdf_file_name):
+    local_parent_dir = os.path.join(output_dir, pdf_file_name)
+
+    local_image_dir = os.path.join(str(local_parent_dir), "images")
+    local_md_dir = local_parent_dir
+    os.makedirs(local_image_dir, exist_ok=True)
+    os.makedirs(local_md_dir, exist_ok=True)
+    return local_image_dir, local_md_dir
+
+
+def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=None):
+
+    # Load the PDF from the byte data
+    pdf = pdfium.PdfDocument(pdf_bytes)
+
+    # Determine the final page
+    end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf) - 1
+    if end_page_id > len(pdf) - 1:
+        logger.warning("end_page_id is out of range, use pdf_docs length")
+        end_page_id = len(pdf) - 1
+
+    # Create a new PDF document
+    output_pdf = pdfium.PdfDocument.new()
+
+    # Select the page indices to import
+    page_indices = list(range(start_page_id, end_page_id + 1))
+
+    # Import the selected pages from the source PDF into the new one
+    output_pdf.import_pages(pdf, page_indices)
+
+    # Save the new PDF into an in-memory buffer
+    output_buffer = io.BytesIO()
+    output_pdf.save(output_buffer)
+
+    # Get the raw bytes
+    output_bytes = output_buffer.getvalue()
+
+    return output_bytes
+
+
+def do_parse(
+    output_dir,
+    pdf_file_name,
+    pdf_bytes,
+    backend="pipeline",
+    model_path="jinzhenj/OEEzRkQ3RTAtMDMx-0415",  # TODO: change to formal path after release.
+    server_url=None,
+    f_draw_layout_bbox=True,
+    f_draw_span_bbox=False,
+    f_dump_md=True,
+    f_dump_middle_json=True,
+    f_dump_model_output=True,
+    f_dump_orig_pdf=True,
+    f_dump_content_list=True,
+    f_make_md_mode=MakeMode.MM_MD,
+    start_page_id=0,
+    end_page_id=None,
+):
+    if backend == 'pipeline':
+        f_draw_span_bbox = True
+
+    pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
+    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name)
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
+
+    middle_json, infer_result = doc_analyze(pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url)
+    pdf_info = middle_json["pdf_info"]
+
+    if f_draw_layout_bbox:
+        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
+
+    if f_draw_span_bbox:
+        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
+
+    if f_dump_orig_pdf:
+        md_writer.write(
+            f"{pdf_file_name}_origin.pdf",
+            pdf_bytes,
+        )
+
+    if f_dump_md:
+        image_dir = str(os.path.basename(local_image_dir))
+        md_content_str = union_make(pdf_info, f_make_md_mode, image_dir)
+        md_writer.write_string(
+            f"{pdf_file_name}.md",
+            md_content_str,
+        )
+
+    if f_dump_content_list:
+        image_dir = str(os.path.basename(local_image_dir))
+        content_list = union_make(pdf_info, MakeMode.STANDARD_FORMAT, image_dir)
+        md_writer.write_string(
+            f"{pdf_file_name}_content_list.json",
+            json.dumps(content_list, ensure_ascii=False, indent=4),
+        )
+
+    if f_dump_middle_json:
+        md_writer.write_string(
+            f"{pdf_file_name}_middle.json",
+            json.dumps(middle_json, ensure_ascii=False, indent=4),
+        )
+
+    if f_dump_model_output:
+        model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
+        md_writer.write_string(
+            f"{pdf_file_name}_model_output.txt",
+            model_output,
+        )
+
+    logger.info(f"local output dir is {local_md_dir}")
+
+    return infer_result
+
+
+if __name__ == "__main__":
+    pdf_path = "../../demo/demo2.pdf"
+    with open(pdf_path, "rb") as f:
+        try:
+            result = do_parse("./output", Path(pdf_path).stem, f.read())
+            # Print the inference result as JSON
+            print(json.dumps(result, ensure_ascii=False, indent=4))
+        except Exception as e:
+            logger.exception(e)
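A small sketch of using convert_pdf_bytes_to_bytes_by_pypdfium2 on its own to slice out a page range (file names are hypothetical):

from pathlib import Path
from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2

pdf_bytes = Path('demo.pdf').read_bytes()
# Keep pages 0-4; an out-of-range end_page_id is clamped with a warning.
first_five = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=4)
Path('demo_p1-5.pdf').write_bytes(first_five)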

+ 122 - 0
mineru/model/ocr/paddleocr2pytorch/tools/infer/predict_det.py

@@ -117,6 +117,128 @@ class TextDetector(BaseOCRV20):
         self.net.eval()
         self.net.to(self.device)
 
+    def _batch_process_same_size(self, img_list):
+        """
+        Run batched detection over images that share the same size.
+
+        Args:
+            img_list: list of images with identical dimensions
+
+        Returns:
+            batch_results: list of per-image results
+            total_elapse: total elapsed time
+        """
+        starttime = time.time()
+
+        # Preprocess every image
+        batch_data = []
+        batch_shapes = []
+        ori_imgs = []
+
+        for img in img_list:
+            ori_im = img.copy()
+            ori_imgs.append(ori_im)
+
+            data = {'image': img}
+            data = transform(data, self.preprocess_op)
+            if data is None:
+                # If preprocessing fails, return empty results
+                return [(None, 0) for _ in img_list], 0
+
+            img_processed, shape_list = data
+            batch_data.append(img_processed)
+            batch_shapes.append(shape_list)
+
+        # Stack into batch tensors
+        try:
+            batch_tensor = np.stack(batch_data, axis=0)
+            batch_shapes = np.stack(batch_shapes, axis=0)
+        except Exception as e:
+            # If stacking fails, fall back to per-image processing
+            batch_results = []
+            for img in img_list:
+                dt_boxes, elapse = self.__call__(img)
+                batch_results.append((dt_boxes, elapse))
+            return batch_results, time.time() - starttime
+
+        # Batched inference
+        with torch.no_grad():
+            inp = torch.from_numpy(batch_tensor)
+            inp = inp.to(self.device)
+            outputs = self.net(inp)
+
+        # Collect the model outputs
+        preds = {}
+        if self.det_algorithm == "EAST":
+            preds['f_geo'] = outputs['f_geo'].cpu().numpy()
+            preds['f_score'] = outputs['f_score'].cpu().numpy()
+        elif self.det_algorithm == 'SAST':
+            preds['f_border'] = outputs['f_border'].cpu().numpy()
+            preds['f_score'] = outputs['f_score'].cpu().numpy()
+            preds['f_tco'] = outputs['f_tco'].cpu().numpy()
+            preds['f_tvo'] = outputs['f_tvo'].cpu().numpy()
+        elif self.det_algorithm in ['DB', 'PSE', 'DB++']:
+            preds['maps'] = outputs['maps'].cpu().numpy()
+        elif self.det_algorithm == 'FCE':
+            for i, (k, output) in enumerate(outputs.items()):
+                preds['level_{}'.format(i)] = output.cpu().numpy()
+        else:
+            raise NotImplementedError
+
+        # Post-process each image's result
+        batch_results = []
+        total_elapse = time.time() - starttime
+
+        for i in range(len(img_list)):
+            # Slice out this image's predictions
+            single_preds = {}
+            for key, value in preds.items():
+                if isinstance(value, np.ndarray):
+                    single_preds[key] = value[i:i + 1]  # keep the batch dimension
+                else:
+                    single_preds[key] = value
+
+            # Post-processing
+            post_result = self.postprocess_op(single_preds, batch_shapes[i:i + 1])
+            dt_boxes = post_result[0]['points']
+
+            # Filter and clip the detection boxes
+            if (self.det_algorithm == "SAST" and
+                self.det_sast_polygon) or (self.det_algorithm in ["PSE", "FCE"] and
+                                           self.postprocess_op.box_type == 'poly'):
+                dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_imgs[i].shape)
+            else:
+                dt_boxes = self.filter_tag_det_res(dt_boxes, ori_imgs[i].shape)
+
+            batch_results.append((dt_boxes, total_elapse / len(img_list)))
+
+        return batch_results, total_elapse
+
+    def batch_predict(self, img_list, max_batch_size=8):
+        """
+        Batched prediction over multiple images.
+
+        Args:
+            img_list: list of images (expected to share the same size)
+            max_batch_size: maximum batch size
+
+        Returns:
+            batch_results: list of (dt_boxes, elapse) tuples, one per image
+        """
+        if not img_list:
+            return []
+
+        batch_results = []
+
+        # Process in chunks of max_batch_size
+        for i in range(0, len(img_list), max_batch_size):
+            batch_imgs = img_list[i:i + max_batch_size]
+            # all images in a chunk are expected to share the same size (callers group by resolution)
+            batch_dt_boxes, batch_elapse = self._batch_process_same_size(batch_imgs)
+            batch_results.extend(batch_dt_boxes)
+
+        return batch_results
+
     def order_points_clockwise(self, pts):
         """
         reference from: https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py
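batch_predict leaves size grouping to the caller: _batch_process_same_size stacks each chunk with np.stack, which only succeeds when every crop shares one shape (mixed shapes hit the per-image fallback). A hedged sketch of the calling convention, with detector construction elided:

def detect_same_size_crops(text_detector, crops):
    # text_detector: an already-initialized TextDetector (construction elided);
    # crops: HxWx3 uint8 arrays with identical shapes, e.g. one padded
    # resolution group produced by BatchAnalyze above.
    results = text_detector.batch_predict(crops, max_batch_size=8)
    return [dt_boxes for dt_boxes, _elapse in results if dt_boxes is not None]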

+ 115 - 7
mineru/utils/draw_bbox.py

@@ -4,7 +4,7 @@ from io import BytesIO
 from PyPDF2 import PdfReader, PdfWriter
 from reportlab.pdfgen import canvas
 
-from .enum_class import BlockType
+from .enum_class import BlockType, ContentType
 
 
 def draw_bbox_without_number(i, bbox_list, page, c, rgb_config, fill_config):
@@ -54,7 +54,7 @@ def draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_config, draw_b
 
 
 def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
-    # dropped_bbox_list = []
+    dropped_bbox_list = []
     tables_list, tables_body_list = [], []
     tables_caption_list, tables_footnote_list = [], []
     imgs_list, imgs_body_list, imgs_caption_list = [], [], []
@@ -65,7 +65,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
     lists_list = []
     indexs_list = []
     for page in pdf_info:
-        # page_dropped_list = []
+        page_dropped_list = []
         tables, tables_body, tables_caption, tables_footnote = [], [], [], []
         imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
         titles = []
@@ -74,9 +74,9 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
         lists = []
         indices = []
 
-        # for dropped_bbox in page['discarded_blocks']:
-        #     page_dropped_list.append(dropped_bbox['bbox'])
-        # dropped_bbox_list.append(page_dropped_list)
+        for dropped_bbox in page['discarded_blocks']:
+            page_dropped_list.append(dropped_bbox['bbox'])
+        dropped_bbox_list.append(page_dropped_list)
         for block in page["para_blocks"]:
             bbox = block["bbox"]
             if block["type"] == BlockType.TABLE:
@@ -164,7 +164,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
         # Create the canvas using the original PDF's page size
         c = canvas.Canvas(packet, pagesize=custom_page_size)
 
-        # c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True)
+        c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True)
         c = draw_bbox_without_number(i, tables_body_list, page, c, [204, 204, 0], True)
         c = draw_bbox_without_number(i, tables_caption_list, page, c, [255, 255, 102], True)
         c = draw_bbox_without_number(i, tables_footnote_list, page, c, [229, 255, 204], True)
@@ -190,6 +190,114 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
         output_pdf.write(f)
 
 
+def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
+    text_list = []
+    inline_equation_list = []
+    interline_equation_list = []
+    image_list = []
+    table_list = []
+    dropped_list = []
+    next_page_text_list = []
+    next_page_inline_equation_list = []
+
+    def get_span_info(span):
+        if span['type'] == ContentType.TEXT:
+            if span.get('cross_page', False):
+                next_page_text_list.append(span['bbox'])
+            else:
+                page_text_list.append(span['bbox'])
+        elif span['type'] == ContentType.INLINE_EQUATION:
+            if span.get('cross_page', False):
+                next_page_inline_equation_list.append(span['bbox'])
+            else:
+                page_inline_equation_list.append(span['bbox'])
+        elif span['type'] == ContentType.INTERLINE_EQUATION:
+            page_interline_equation_list.append(span['bbox'])
+        elif span['type'] == ContentType.IMAGE:
+            page_image_list.append(span['bbox'])
+        elif span['type'] == ContentType.TABLE:
+            page_table_list.append(span['bbox'])
+
+    for page in pdf_info:
+        page_text_list = []
+        page_inline_equation_list = []
+        page_interline_equation_list = []
+        page_image_list = []
+        page_table_list = []
+        page_dropped_list = []
+
+        # Pull in cross-page spans carried over from the previous page
+        if len(next_page_text_list) > 0:
+            page_text_list.extend(next_page_text_list)
+            next_page_text_list.clear()
+        if len(next_page_inline_equation_list) > 0:
+            page_inline_equation_list.extend(next_page_inline_equation_list)
+            next_page_inline_equation_list.clear()
+
+        # Build dropped_list
+        for block in page['discarded_blocks']:
+            if block['type'] == BlockType.DISCARDED:
+                for line in block['lines']:
+                    for span in line['spans']:
+                        page_dropped_list.append(span['bbox'])
+        dropped_list.append(page_dropped_list)
+        # Build the remaining span lists
+        # for block in page['para_blocks']:  # spans from the blocks before paragraph merging work just as well
+        for block in page['preproc_blocks']:
+            if block['type'] in [
+                BlockType.TEXT,
+                BlockType.TITLE,
+                BlockType.INTERLINE_EQUATION,
+                BlockType.LIST,
+                BlockType.INDEX,
+            ]:
+                for line in block['lines']:
+                    for span in line['spans']:
+                        get_span_info(span)
+            elif block['type'] in [BlockType.IMAGE, BlockType.TABLE]:
+                for sub_block in block['blocks']:
+                    for line in sub_block['lines']:
+                        for span in line['spans']:
+                            get_span_info(span)
+        text_list.append(page_text_list)
+        inline_equation_list.append(page_inline_equation_list)
+        interline_equation_list.append(page_interline_equation_list)
+        image_list.append(page_image_list)
+        table_list.append(page_table_list)
+
+    pdf_bytes_io = BytesIO(pdf_bytes)
+    pdf_docs = PdfReader(pdf_bytes_io)
+    output_pdf = PdfWriter()
+
+    for i, page in enumerate(pdf_docs.pages):
+        # Get the original page size
+        page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3])
+        custom_page_size = (page_width, page_height)
+
+        packet = BytesIO()
+        # Create the canvas using the original PDF's page size
+        c = canvas.Canvas(packet, pagesize=custom_page_size)
+
+        # Draw the current page's data
+        draw_bbox_without_number(i, text_list, page, c, [255, 0, 0], False)
+        draw_bbox_without_number(i, inline_equation_list, page, c, [0, 255, 0], False)
+        draw_bbox_without_number(i, interline_equation_list, page, c, [0, 0, 255], False)
+        draw_bbox_without_number(i, image_list, page, c, [255, 204, 0], False)
+        draw_bbox_without_number(i, table_list, page, c, [204, 0, 255], False)
+        draw_bbox_without_number(i, dropped_list, page, c, [158, 158, 158], False)
+
+        c.save()
+        packet.seek(0)
+        overlay_pdf = PdfReader(packet)
+
+        page.merge_page(overlay_pdf.pages[0])
+        output_pdf.add_page(page)
+
+    # Save the PDF
+    with open(f"{out_path}/{filename}", "wb") as f:
+        output_pdf.write(f)
+
+
 if __name__ == "__main__":
     # Read the PDF file
     pdf_path = "examples/demo1.pdf"

+ 2 - 0
mineru/utils/enum_class.py

@@ -12,6 +12,7 @@ class BlockType:
     INTERLINE_EQUATION = 'interline_equation'
     LIST = 'list'
     INDEX = 'index'
+    DISCARDED = 'discarded'
 
 
 class ContentType:
@@ -19,6 +20,7 @@ class ContentType:
     TABLE = 'table'
     TEXT = 'text'
     INTERLINE_EQUATION = 'interline_equation'
+    INLINE_EQUATION = 'inline_equation'
 
 
 class MakeMode:

+ 0 - 0
mineru/backend/pipeline/magic_model.py → mineru/utils/pipeline_magic_model.py


+ 0 - 0
mineru/utils/magic_model.py → mineru/utils/vlm_magic_model.py