浏览代码

fix: solve conflicts

myhloli 1 年之前
父节点
当前提交
a4c72e2e33

+ 0 - 3
LICENSE.md

@@ -660,6 +660,3 @@ if any, to sign a "copyright disclaimer" for the program, if necessary.
 For more information on this, and how to apply and follow the GNU AGPL, see
 For more information on this, and how to apply and follow the GNU AGPL, see
 <https://www.gnu.org/licenses/>.
 <https://www.gnu.org/licenses/>.
 
 
-
-
-$^1$

文件差异内容过多而无法显示
+ 0 - 1
README.md


文件差异内容过多而无法显示
+ 0 - 1
README_zh-CN.md


+ 0 - 167
app.py

@@ -1,167 +0,0 @@
-# Copyright (c) Opendatalab. All rights reserved.
-
-import base64
-import os
-import time
-import zipfile
-from pathlib import Path
-import re
-
-from loguru import logger
-
-from magic_pdf.libs.hash_utils import compute_sha256
-from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
-from magic_pdf.tools.common import do_parse, prepare_env
-
-os.system("pip install gradio")
-os.system("pip install gradio-pdf")
-import gradio as gr
-from gradio_pdf import PDF
-
-
-def read_fn(path):
-    disk_rw = DiskReaderWriter(os.path.dirname(path))
-    return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
-
-
-def parse_pdf(doc_path, output_dir, end_page_id):
-    os.makedirs(output_dir, exist_ok=True)
-
-    try:
-        file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
-        pdf_data = read_fn(doc_path)
-        parse_method = "auto"
-        local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
-        do_parse(
-            output_dir,
-            file_name,
-            pdf_data,
-            [],
-            parse_method,
-            False,
-            end_page_id=end_page_id,
-        )
-        return local_md_dir, file_name
-    except Exception as e:
-        logger.exception(e)
-
-
-def compress_directory_to_zip(directory_path, output_zip_path):
-    """
-    压缩指定目录到一个 ZIP 文件。
-
-    :param directory_path: 要压缩的目录路径
-    :param output_zip_path: 输出的 ZIP 文件路径
-    """
-    try:
-        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
-
-            # 遍历目录中的所有文件和子目录
-            for root, dirs, files in os.walk(directory_path):
-                for file in files:
-                    # 构建完整的文件路径
-                    file_path = os.path.join(root, file)
-                    # 计算相对路径
-                    arcname = os.path.relpath(file_path, directory_path)
-                    # 添加文件到 ZIP 文件
-                    zipf.write(file_path, arcname)
-        return 0
-    except Exception as e:
-        logger.exception(e)
-        return -1
-
-
-def image_to_base64(image_path):
-    with open(image_path, "rb") as image_file:
-        return base64.b64encode(image_file.read()).decode('utf-8')
-
-
-def replace_image_with_base64(markdown_text, image_dir_path):
-    # 匹配Markdown中的图片标签
-    pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'
-
-    # 替换图片链接
-    def replace(match):
-        relative_path = match.group(1)
-        full_path = os.path.join(image_dir_path, relative_path)
-        base64_image = image_to_base64(full_path)
-        return f"![{relative_path}](data:image/jpeg;base64,{base64_image})"
-
-    # 应用替换
-    return re.sub(pattern, replace, markdown_text)
-
-
-def to_markdown(file_path, end_pages):
-    # 获取识别的md文件以及压缩包文件路径
-    local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1)
-    archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
-    zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
-    if zip_archive_success == 0:
-        logger.info("压缩成功")
-    else:
-        logger.error("压缩失败")
-    md_path = os.path.join(local_md_dir, file_name + ".md")
-    with open(md_path, 'r', encoding='utf-8') as f:
-        txt_content = f.read()
-    md_content = replace_image_with_base64(txt_content, local_md_dir)
-    # 返回转换后的PDF路径
-    new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
-
-    return md_content, txt_content, archive_zip_path, new_pdf_path
-
-
-# def show_pdf(file_path):
-#     with open(file_path, "rb") as f:
-#         base64_pdf = base64.b64encode(f.read()).decode('utf-8')
-#     pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" ' \
-#                   f'width="100%" height="1000" type="application/pdf">'
-#     return pdf_display
-
-
-latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
-                    {"left": '$', "right": '$', "display": False}]
-
-
-def init_model():
-    from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
-    try:
-        model_manager = ModelSingleton()
-        txt_model = model_manager.get_model(False, False)
-        logger.info(f"txt_model init final")
-        ocr_model = model_manager.get_model(True, False)
-        logger.info(f"ocr_model init final")
-        return 0
-    except Exception as e:
-        logger.exception(e)
-        return -1
-
-
-model_init = init_model()
-logger.info(f"model_init: {model_init}")
-
-
-if __name__ == "__main__":
-    with gr.Blocks() as demo:
-        with gr.Row():
-            with gr.Column(variant='panel', scale=5):
-                pdf_show = gr.Markdown()
-                max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
-                with gr.Row() as bu_flow:
-                    change_bu = gr.Button("Convert")
-                    clear_bu = gr.ClearButton([pdf_show], value="Clear")
-                pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)
-
-            with gr.Column(variant='panel', scale=5):
-                output_file = gr.File(label="convert result", interactive=False)
-                with gr.Tabs():
-                    with gr.Tab("Markdown rendering"):
-                        md = gr.Markdown(label="Markdown rendering", height=900, show_copy_button=True,
-                                         latex_delimiters=latex_delimiters, line_breaks=True)
-                    with gr.Tab("Markdown text"):
-                        md_text = gr.TextArea(lines=45, show_copy_button=True)
-        change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages], outputs=[md, md_text, output_file, pdf_show])
-        clear_bu.add([md, pdf_show, md_text, output_file])
-
-    demo.launch()
-

+ 0 - 4
magic_pdf/libs/version.py

@@ -1,5 +1 @@
-<<<<<<< HEAD
-__version__ = "0.7.1"
-=======
 __version__ = "0.8.0"
 __version__ = "0.8.0"
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999

+ 0 - 4
magic_pdf/model/doc_analyze_by_custom_model.py

@@ -106,11 +106,7 @@ def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None):
 
 
 
 
 def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
 def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
-<<<<<<< HEAD
                 start_page_id=0, end_page_id=None, lang=None):
                 start_page_id=0, end_page_id=None, lang=None):
-=======
-                start_page_id=0, end_page_id=None):
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
 
 
     model_manager = ModelSingleton()
     model_manager = ModelSingleton()
     custom_model = model_manager.get_model(ocr, show_log, lang)
     custom_model = model_manager.get_model(ocr, show_log, lang)

+ 0 - 13
magic_pdf/model/pdf_extract_kit.py

@@ -74,16 +74,11 @@ def layout_model_init(weight, config_file, device):
     return model
     return model
 
 
 
 
-<<<<<<< HEAD
 def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None):
 def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None):
     if lang is not None:
     if lang is not None:
         model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang)
         model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang)
     else:
     else:
         model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh)
         model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh)
-=======
-def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3):
-    model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh)
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
     return model
     return model
 
 
 
 
@@ -142,12 +137,8 @@ def atom_model_init(model_name: str, **kwargs):
     elif model_name == AtomicModel.OCR:
     elif model_name == AtomicModel.OCR:
         atom_model = ocr_model_init(
         atom_model = ocr_model_init(
             kwargs.get("ocr_show_log"),
             kwargs.get("ocr_show_log"),
-<<<<<<< HEAD
             kwargs.get("det_db_box_thresh"),
             kwargs.get("det_db_box_thresh"),
             kwargs.get("lang")
             kwargs.get("lang")
-=======
-            kwargs.get("det_db_box_thresh")
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
         )
         )
     elif model_name == AtomicModel.Table:
     elif model_name == AtomicModel.Table:
         atom_model = table_model_init(
         atom_model = table_model_init(
@@ -244,12 +235,8 @@ class CustomPEKModel:
             self.ocr_model = atom_model_manager.get_atom_model(
             self.ocr_model = atom_model_manager.get_atom_model(
                 atom_model_name=AtomicModel.OCR,
                 atom_model_name=AtomicModel.OCR,
                 ocr_show_log=show_log,
                 ocr_show_log=show_log,
-<<<<<<< HEAD
                 det_db_box_thresh=0.3,
                 det_db_box_thresh=0.3,
                 lang=self.lang
                 lang=self.lang
-=======
-                det_db_box_thresh=0.3
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
             )
             )
         # init table model
         # init table model
         if self.apply_table:
         if self.apply_table:

+ 0 - 7
magic_pdf/pipe/AbsPipe.py

@@ -17,11 +17,7 @@ class AbsPipe(ABC):
     PIP_TXT = "txt"
     PIP_TXT = "txt"
 
 
     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-<<<<<<< HEAD
                  start_page_id=0, end_page_id=None, lang=None):
                  start_page_id=0, end_page_id=None, lang=None):
-=======
-                 start_page_id=0, end_page_id=None):
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
         self.pdf_bytes = pdf_bytes
         self.pdf_bytes = pdf_bytes
         self.model_list = model_list
         self.model_list = model_list
         self.image_writer = image_writer
         self.image_writer = image_writer
@@ -29,10 +25,7 @@ class AbsPipe(ABC):
         self.is_debug = is_debug
         self.is_debug = is_debug
         self.start_page_id = start_page_id
         self.start_page_id = start_page_id
         self.end_page_id = end_page_id
         self.end_page_id = end_page_id
-<<<<<<< HEAD
         self.lang = lang
         self.lang = lang
-=======
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
     
     
     def get_compress_pdf_mid_data(self):
     def get_compress_pdf_mid_data(self):
         return JsonCompressor.compress_json(self.pdf_mid_data)
         return JsonCompressor.compress_json(self.pdf_mid_data)

+ 0 - 9
magic_pdf/pipe/OCRPipe.py

@@ -10,25 +10,16 @@ from magic_pdf.user_api import parse_ocr_pdf
 class OCRPipe(AbsPipe):
 class OCRPipe(AbsPipe):
 
 
     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-<<<<<<< HEAD
                  start_page_id=0, end_page_id=None, lang=None):
                  start_page_id=0, end_page_id=None, lang=None):
         super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)
         super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)
-=======
-                 start_page_id=0, end_page_id=None):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
 
 
     def pipe_classify(self):
     def pipe_classify(self):
         pass
         pass
 
 
     def pipe_analyze(self):
     def pipe_analyze(self):
         self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
         self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
-<<<<<<< HEAD
                                       start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                       start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                       lang=self.lang)
                                       lang=self.lang)
-=======
-                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
 
 
     def pipe_parse(self):
     def pipe_parse(self):
         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,

+ 0 - 9
magic_pdf/pipe/TXTPipe.py

@@ -11,25 +11,16 @@ from magic_pdf.user_api import parse_txt_pdf
 class TXTPipe(AbsPipe):
 class TXTPipe(AbsPipe):
 
 
     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-<<<<<<< HEAD
                  start_page_id=0, end_page_id=None, lang=None):
                  start_page_id=0, end_page_id=None, lang=None):
         super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)
         super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)
-=======
-                 start_page_id=0, end_page_id=None):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
 
 
     def pipe_classify(self):
     def pipe_classify(self):
         pass
         pass
 
 
     def pipe_analyze(self):
     def pipe_analyze(self):
         self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
         self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
-<<<<<<< HEAD
                                       start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                       start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                       lang=self.lang)
                                       lang=self.lang)
-=======
-                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
 
 
     def pipe_parse(self):
     def pipe_parse(self):
         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,

+ 0 - 17
magic_pdf/pipe/UNIPipe.py

@@ -14,15 +14,9 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 class UNIPipe(AbsPipe):
 class UNIPipe(AbsPipe):
 
 
     def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
     def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
-<<<<<<< HEAD
                  start_page_id=0, end_page_id=None, lang=None):
                  start_page_id=0, end_page_id=None, lang=None):
         self.pdf_type = jso_useful_key["_pdf_type"]
         self.pdf_type = jso_useful_key["_pdf_type"]
         super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id, lang)
         super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id, lang)
-=======
-                 start_page_id=0, end_page_id=None):
-        self.pdf_type = jso_useful_key["_pdf_type"]
-        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id)
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
         if len(self.model_list) == 0:
         if len(self.model_list) == 0:
             self.input_model_is_empty = True
             self.input_model_is_empty = True
         else:
         else:
@@ -34,30 +28,19 @@ class UNIPipe(AbsPipe):
     def pipe_analyze(self):
     def pipe_analyze(self):
         if self.pdf_type == self.PIP_TXT:
         if self.pdf_type == self.PIP_TXT:
             self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
             self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
-<<<<<<< HEAD
                                           start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                           start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                           lang=self.lang)
                                           lang=self.lang)
         elif self.pdf_type == self.PIP_OCR:
         elif self.pdf_type == self.PIP_OCR:
             self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
             self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
                                           start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                           start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                           lang=self.lang)
                                           lang=self.lang)
-=======
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
-        elif self.pdf_type == self.PIP_OCR:
-            self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
 
 
     def pipe_parse(self):
     def pipe_parse(self):
         if self.pdf_type == self.PIP_TXT:
         if self.pdf_type == self.PIP_TXT:
             self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
             self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                                 is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
                                                 is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
-<<<<<<< HEAD
                                                 start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                                 start_page_id=self.start_page_id, end_page_id=self.end_page_id,
                                                 lang=self.lang)
                                                 lang=self.lang)
-=======
-                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id)
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
         elif self.pdf_type == self.PIP_OCR:
         elif self.pdf_type == self.PIP_OCR:
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                               is_debug=self.is_debug,
                                               is_debug=self.is_debug,

+ 0 - 4
magic_pdf/resources/model_config/model_configs.yaml

@@ -10,10 +10,6 @@ config:
 weights:
 weights:
   layout: Layout/model_final.pth
   layout: Layout/model_final.pth
   mfd: MFD/weights.pt
   mfd: MFD/weights.pt
-<<<<<<< HEAD
   mfr: MFR/unimernet_base
   mfr: MFR/unimernet_base
-=======
-  mfr: MFR/UniMERNet
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
   struct_eqtable: TabRec/StructEqTable
   struct_eqtable: TabRec/StructEqTable
   TableMaster: TabRec/TableMaster
   TableMaster: TabRec/TableMaster

+ 0 - 10
magic_pdf/tools/cli.py

@@ -45,7 +45,6 @@ without method specified, auto will be used by default.""",
     default='auto',
     default='auto',
 )
 )
 @click.option(
 @click.option(
-<<<<<<< HEAD
     '-l',
     '-l',
     '--lang',
     '--lang',
     'lang',
     'lang',
@@ -58,8 +57,6 @@ without method specified, auto will be used by default.""",
     default=None,
     default=None,
 )
 )
 @click.option(
 @click.option(
-=======
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
     '-d',
     '-d',
     '--debug',
     '--debug',
     'debug_able',
     'debug_able',
@@ -83,11 +80,7 @@ without method specified, auto will be used by default.""",
     help='The ending page for PDF parsing, beginning from 0.',
     help='The ending page for PDF parsing, beginning from 0.',
     default=None,
     default=None,
 )
 )
-<<<<<<< HEAD
 def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
 def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
-=======
-def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
     model_config.__use_inside_model__ = True
     model_config.__use_inside_model__ = True
     model_config.__model_mode__ = 'full'
     model_config.__model_mode__ = 'full'
     os.makedirs(output_dir, exist_ok=True)
     os.makedirs(output_dir, exist_ok=True)
@@ -109,10 +102,7 @@ def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
                 debug_able,
                 debug_able,
                 start_page_id=start_page_id,
                 start_page_id=start_page_id,
                 end_page_id=end_page_id,
                 end_page_id=end_page_id,
-<<<<<<< HEAD
                 lang=lang
                 lang=lang
-=======
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
             )
             )
 
 
         except Exception as e:
         except Exception as e:

+ 0 - 10
magic_pdf/user_api.py

@@ -71,11 +71,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
 
 
 def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
 def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
                     input_model_is_empty: bool = False,
                     input_model_is_empty: bool = False,
-<<<<<<< HEAD
                     start_page_id=0, end_page_id=None, lang=None,
                     start_page_id=0, end_page_id=None, lang=None,
-=======
-                    start_page_id=0, end_page_id=None,
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
                     *args, **kwargs):
                     *args, **kwargs):
     """
     """
     ocr和文本混合的pdf,全部解析出来
     ocr和文本混合的pdf,全部解析出来
@@ -99,17 +95,11 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
     if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
     if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
         logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
         logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
         if input_model_is_empty:
         if input_model_is_empty:
-<<<<<<< HEAD
             pdf_models = doc_analyze(pdf_bytes,
             pdf_models = doc_analyze(pdf_bytes,
                                      ocr=True,
                                      ocr=True,
                                      start_page_id=start_page_id,
                                      start_page_id=start_page_id,
                                      end_page_id=end_page_id,
                                      end_page_id=end_page_id,
                                      lang=lang)
                                      lang=lang)
-=======
-            pdf_models = doc_analyze(pdf_bytes, ocr=True,
-                                     start_page_id=start_page_id,
-                                     end_page_id=end_page_id)
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
         if pdf_info_dict is None:
         if pdf_info_dict is None:
             raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
             raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")

+ 0 - 26
mv_pdf.py

@@ -1,26 +0,0 @@
-import os
-import shutil
-
-def move_pdfs(root_folder, destination_folder):
-    # 遍历根目录及其子目录中的所有文件
-    for root, dirs, files in os.walk(root_folder):
-        for file in files:
-            if file.endswith('.pdf'):
-                # 构建完整的文件路径
-                src_path = os.path.join(root, file)
-                # 构建目标路径
-                dst_path = os.path.join(destination_folder, file)
-                
-                # 移动文件
-                shutil.move(src_path, dst_path)
-                print(f'Moved {file} to {destination_folder}')
-
-# 使用方法
-root_folder = r'D:\mineru\datasets\datasets'  # 源文件夹路径
-destination_folder = r'D:\mineru\datasets\pdf'  # 目标文件夹路径
-
-# 创建目标文件夹如果不存在
-if not os.path.exists(destination_folder):
-    os.makedirs(destination_folder)
-
-move_pdfs(root_folder, destination_folder)

+ 0 - 3
projects/README.md

@@ -3,9 +3,6 @@
 ## Project List
 ## Project List
 
 
 - [llama_index_rag](./llama_index_rag/README.md): Build a lightweight RAG system based on llama_index
 - [llama_index_rag](./llama_index_rag/README.md): Build a lightweight RAG system based on llama_index
-<<<<<<< HEAD
 - [gradio_app](./gradio_app/README.md): Build a web app based on gradio
 - [gradio_app](./gradio_app/README.md): Build a web app based on gradio
 
 
 
 
-=======
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999

+ 0 - 3
projects/README_zh-CN.md

@@ -3,8 +3,5 @@
 ## 项目列表
 ## 项目列表
 
 
 - [llama_index_rag](./llama_index_rag/README_zh-CN.md): 基于 llama_index 构建轻量级 RAG 系统
 - [llama_index_rag](./llama_index_rag/README_zh-CN.md): 基于 llama_index 构建轻量级 RAG 系统
-<<<<<<< HEAD
 - [gradio_app](./gradio_app/README_zh-CN.md): 基于 Gradio 的 Web 应用
 - [gradio_app](./gradio_app/README_zh-CN.md): 基于 Gradio 的 Web 应用
 
 
-=======
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999

+ 0 - 20
projects/llama_index_rag/README_zh-CN.md

@@ -59,10 +59,6 @@ Server: Docker Engine - Community
 ```bash
 ```bash
 # install
 # install
 pip install modelscope==1.14.0
 pip install modelscope==1.14.0
-<<<<<<< HEAD
-
-=======
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
 pip install llama-index-vector-stores-elasticsearch==0.2.0
 pip install llama-index-vector-stores-elasticsearch==0.2.0
 pip install llama-index-embeddings-dashscope==0.2.0
 pip install llama-index-embeddings-dashscope==0.2.0
 pip install llama-index-core==0.10.68
 pip install llama-index-core==0.10.68
@@ -74,19 +70,12 @@ pip install accelerate==0.33.0
 pip uninstall transformer-engine
 pip uninstall transformer-engine
 ```
 ```
 
 
-<<<<<<< HEAD
 
 
-=======
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
 ## 示例
 ## 示例
 
 
 ````bash
 ````bash
 cd  projects/llama_index_rag
 cd  projects/llama_index_rag
 
 
-<<<<<<< HEAD
-
-=======
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
 docker compose up -d
 docker compose up -d
 
 
 or
 or
@@ -94,20 +83,14 @@ or
 docker-compose up -d
 docker-compose up -d
 
 
 
 
-<<<<<<< HEAD
 # 配置环境变量
 # 配置环境变量
 
 
-=======
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
 export ES_USER=elastic
 export ES_USER=elastic
 export ES_PASSWORD=llama_index
 export ES_PASSWORD=llama_index
 export ES_URL=http://127.0.0.1:9200
 export ES_URL=http://127.0.0.1:9200
 export DASHSCOPE_API_KEY={some_key}
 export DASHSCOPE_API_KEY={some_key}
 
 
-<<<<<<< HEAD
 
 
-=======
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
 DASHSCOPE_API_KEY 开通参考[文档](https://help.aliyun.com/zh/dashscope/opening-service)
 DASHSCOPE_API_KEY 开通参考[文档](https://help.aliyun.com/zh/dashscope/opening-service)
 
 
 # 未导入数据,查询问题。返回通义千问默认答案
 # 未导入数据,查询问题。返回通义千问默认答案
@@ -135,10 +118,7 @@ python data_ingestion.py -p example/data/declaration_of_the_rights_of_man_1789.p
 
 
 
 
 # 导入数据后,查询问题。通义千问模型会根据 RAG 系统的检索结果,结合上下文,给出答案。
 # 导入数据后,查询问题。通义千问模型会根据 RAG 系统的检索结果,结合上下文,给出答案。
-<<<<<<< HEAD
 
 
-=======
->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
 python query.py -q 'how about the rights of men'
 python query.py -q 'how about the rights of men'
 
 
 ## outputs
 ## outputs

二进制
tests.zip


部分文件因为文件数量过多而无法显示