1 年之前 · a4c72e2e33
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -660,6 +660,3 @@ if any, to sign a "copyright disclaimer" for the program, if necessary.
 
															 For more information on this, and how to apply and follow the GNU AGPL, see
														
 
															 <https://www.gnu.org/licenses/>.
														
 
															-
														
 
															-
														
 
															-$^1$
														
--- a/README.md
+++ b/README.md
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
--- a/app.py
+++ b/app.py
@@ -1,167 +0,0 @@
 
															-# Copyright (c) Opendatalab. All rights reserved.
														
 
															-
														
 
															-import base64
														
 
															-import os
														
 
															-import time
														
 
															-import zipfile
														
 
															-from pathlib import Path
														
 
															-import re
														
 
															-
														
 
															-from loguru import logger
														
 
															-
														
 
															-from magic_pdf.libs.hash_utils import compute_sha256
														
 
															-from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
														
 
															-from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
														
 
															-from magic_pdf.tools.common import do_parse, prepare_env
														
 
															-
														
 
															-os.system("pip install gradio")
														
 
															-os.system("pip install gradio-pdf")
														
 
															-import gradio as gr
														
 
															-from gradio_pdf import PDF
														
 
															-
														
 
															-
														
 
															-def read_fn(path):
														
 
															-    disk_rw = DiskReaderWriter(os.path.dirname(path))
														
 
															-    return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
														
 
															-
														
 
															-
														
 
															-def parse_pdf(doc_path, output_dir, end_page_id):
														
 
															-    os.makedirs(output_dir, exist_ok=True)
														
 
															-
														
 
															-    try:
														
 
															-        file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
														
 
															-        pdf_data = read_fn(doc_path)
														
 
															-        parse_method = "auto"
														
 
															-        local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
														
 
															-        do_parse(
														
 
															-            output_dir,
														
 
															-            file_name,
														
 
															-            pdf_data,
														
 
															-            [],
														
 
															-            parse_method,
														
 
															-            False,
														
 
															-            end_page_id=end_page_id,
														
 
															-        )
														
 
															-        return local_md_dir, file_name
														
 
															-    except Exception as e:
														
 
															-        logger.exception(e)
														
 
															-
														
 
															-
														
 
															-def compress_directory_to_zip(directory_path, output_zip_path):
														
 
															-    """
														
 
															-    压缩指定目录到一个 ZIP 文件。
														
 
															-
														
 
															-    :param directory_path: 要压缩的目录路径
														
 
															-    :param output_zip_path: 输出的 ZIP 文件路径
														
 
															-    """
														
 
															-    try:
														
 
															-        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
														
 
															-
														
 
															-            # 遍历目录中的所有文件和子目录
														
 
															-            for root, dirs, files in os.walk(directory_path):
														
 
															-                for file in files:
														
 
															-                    # 构建完整的文件路径
														
 
															-                    file_path = os.path.join(root, file)
														
 
															-                    # 计算相对路径
														
 
															-                    arcname = os.path.relpath(file_path, directory_path)
														
 
															-                    # 添加文件到 ZIP 文件
														
 
															-                    zipf.write(file_path, arcname)
														
 
															-        return 0
														
 
															-    except Exception as e:
														
 
															-        logger.exception(e)
														
 
															-        return -1
														
 
															-
														
 
															-
														
 
															-def image_to_base64(image_path):
														
 
															-    with open(image_path, "rb") as image_file:
														
 
															-        return base64.b64encode(image_file.read()).decode('utf-8')
														
 
															-
														
 
															-
														
 
															-def replace_image_with_base64(markdown_text, image_dir_path):
														
 
															-    # 匹配Markdown中的图片标签
														
 
															-    pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'
														
 
															-
														
 
															-    # 替换图片链接
														
 
															-    def replace(match):
														
 
															-        relative_path = match.group(1)
														
 
															-        full_path = os.path.join(image_dir_path, relative_path)
														
 
															-        base64_image = image_to_base64(full_path)
														
 
															-        return f"![{relative_path}](data:image/jpeg;base64,{base64_image})"
														
 
															-
														
 
															-    # 应用替换
														
 
															-    return re.sub(pattern, replace, markdown_text)
														
 
															-
														
 
															-
														
 
															-def to_markdown(file_path, end_pages):
														
 
															-    # 获取识别的md文件以及压缩包文件路径
														
 
															-    local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1)
														
 
															-    archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
														
 
															-    zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
														
 
															-    if zip_archive_success == 0:
														
 
															-        logger.info("压缩成功")
														
 
															-    else:
														
 
															-        logger.error("压缩失败")
														
 
															-    md_path = os.path.join(local_md_dir, file_name + ".md")
														
 
															-    with open(md_path, 'r', encoding='utf-8') as f:
														
 
															-        txt_content = f.read()
														
 
															-    md_content = replace_image_with_base64(txt_content, local_md_dir)
														
 
															-    # 返回转换后的PDF路径
														
 
															-    new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
														
 
															-
														
 
															-    return md_content, txt_content, archive_zip_path, new_pdf_path
														
 
															-
														
 
															-
														
 
															-# def show_pdf(file_path):
														
 
															-#     with open(file_path, "rb") as f:
														
 
															-#         base64_pdf = base64.b64encode(f.read()).decode('utf-8')
														
 
															-#     pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" ' \
														
 
															-#                   f'width="100%" height="1000" type="application/pdf">'
														
 
															-#     return pdf_display
														
 
															-
														
 
															-
														
 
															-latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
														
 
															-                    {"left": '$', "right": '$', "display": False}]
														
 
															-
														
 
															-
														
 
															-def init_model():
														
 
															-    from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
														
 
															-    try:
														
 
															-        model_manager = ModelSingleton()
														
 
															-        txt_model = model_manager.get_model(False, False)
														
 
															-        logger.info(f"txt_model init final")
														
 
															-        ocr_model = model_manager.get_model(True, False)
														
 
															-        logger.info(f"ocr_model init final")
														
 
															-        return 0
														
 
															-    except Exception as e:
														
 
															-        logger.exception(e)
														
 
															-        return -1
														
 
															-
														
 
															-
														
 
															-model_init = init_model()
														
 
															-logger.info(f"model_init: {model_init}")
														
 
															-
														
 
															-
														
 
															-if __name__ == "__main__":
														
 
															-    with gr.Blocks() as demo:
														
 
															-        with gr.Row():
														
 
															-            with gr.Column(variant='panel', scale=5):
														
 
															-                pdf_show = gr.Markdown()
														
 
															-                max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
														
 
															-                with gr.Row() as bu_flow:
														
 
															-                    change_bu = gr.Button("Convert")
														
 
															-                    clear_bu = gr.ClearButton([pdf_show], value="Clear")
														
 
															-                pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)
														
 
															-
														
 
															-            with gr.Column(variant='panel', scale=5):
														
 
															-                output_file = gr.File(label="convert result", interactive=False)
														
 
															-                with gr.Tabs():
														
 
															-                    with gr.Tab("Markdown rendering"):
														
 
															-                        md = gr.Markdown(label="Markdown rendering", height=900, show_copy_button=True,
														
 
															-                                         latex_delimiters=latex_delimiters, line_breaks=True)
														
 
															-                    with gr.Tab("Markdown text"):
														
 
															-                        md_text = gr.TextArea(lines=45, show_copy_button=True)
														
 
															-        change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages], outputs=[md, md_text, output_file, pdf_show])
														
 
															-        clear_bu.add([md, pdf_show, md_text, output_file])
														
 
															-
														
 
															-    demo.launch()
														
 
															-
														
--- a/magic_pdf/libs/version.py
+++ b/magic_pdf/libs/version.py
@@ -1,5 +1 @@
 
															-<<<<<<< HEAD
														
 
															-__version__ = "0.7.1"
														
 
															-=======
														
 
															 __version__ = "0.8.0"
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
--- a/magic_pdf/model/doc_analyze_by_custom_model.py
+++ b/magic_pdf/model/doc_analyze_by_custom_model.py
@@ -106,11 +106,7 @@ def custom_model_init(ocr: bool = False, show_log: bool = False, lang=None):
 
															 def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
														
 
															-<<<<<<< HEAD
														
 
															                 start_page_id=0, end_page_id=None, lang=None):
														
 
															-=======
														
 
															-                start_page_id=0, end_page_id=None):
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															     model_manager = ModelSingleton()
														
 
															     custom_model = model_manager.get_model(ocr, show_log, lang)
														
--- a/magic_pdf/model/pdf_extract_kit.py
+++ b/magic_pdf/model/pdf_extract_kit.py
@@ -74,16 +74,11 @@ def layout_model_init(weight, config_file, device):
 
															     return model
														
 
															-<<<<<<< HEAD
														
 
															 def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None):
														
 
															     if lang is not None:
														
 
															         model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang)
														
 
															     else:
														
 
															         model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh)
														
 
															-=======
														
 
															-def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3):
														
 
															-    model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh)
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															     return model
														
@@ -142,12 +137,8 @@ def atom_model_init(model_name: str, **kwargs):
 
															     elif model_name == AtomicModel.OCR:
														
 
															         atom_model = ocr_model_init(
														
 
															             kwargs.get("ocr_show_log"),
														
 
															-<<<<<<< HEAD
														
 
															             kwargs.get("det_db_box_thresh"),
														
 
															             kwargs.get("lang")
														
 
															-=======
														
 
															-            kwargs.get("det_db_box_thresh")
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															         )
														
 
															     elif model_name == AtomicModel.Table:
														
 
															         atom_model = table_model_init(
														
@@ -244,12 +235,8 @@ class CustomPEKModel:
 
															             self.ocr_model = atom_model_manager.get_atom_model(
														
 
															                 atom_model_name=AtomicModel.OCR,
														
 
															                 ocr_show_log=show_log,
														
 
															-<<<<<<< HEAD
														
 
															                 det_db_box_thresh=0.3,
														
 
															                 lang=self.lang
														
 
															-=======
														
 
															-                det_db_box_thresh=0.3
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															             )
														
 
															         # init table model
														
 
															         if self.apply_table:
														
--- a/magic_pdf/pipe/AbsPipe.py
+++ b/magic_pdf/pipe/AbsPipe.py
@@ -17,11 +17,7 @@ class AbsPipe(ABC):
 
															     PIP_TXT = "txt"
														
 
															     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
														
 
															-<<<<<<< HEAD
														
 
															                  start_page_id=0, end_page_id=None, lang=None):
														
 
															-=======
														
 
															-                 start_page_id=0, end_page_id=None):
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															         self.pdf_bytes = pdf_bytes
														
 
															         self.model_list = model_list
														
 
															         self.image_writer = image_writer
														
@@ -29,10 +25,7 @@ class AbsPipe(ABC):
 
															         self.is_debug = is_debug
														
 
															         self.start_page_id = start_page_id
														
 
															         self.end_page_id = end_page_id
														
 
															-<<<<<<< HEAD
														
 
															         self.lang = lang
														
 
															-=======
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															     def get_compress_pdf_mid_data(self):
														
 
															         return JsonCompressor.compress_json(self.pdf_mid_data)
														
--- a/magic_pdf/pipe/OCRPipe.py
+++ b/magic_pdf/pipe/OCRPipe.py
@@ -10,25 +10,16 @@ from magic_pdf.user_api import parse_ocr_pdf
 
															 class OCRPipe(AbsPipe):
														
 
															     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
														
 
															-<<<<<<< HEAD
														
 
															                  start_page_id=0, end_page_id=None, lang=None):
														
 
															         super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)
														
 
															-=======
														
 
															-                 start_page_id=0, end_page_id=None):
														
 
															-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															     def pipe_classify(self):
														
 
															         pass
														
 
															     def pipe_analyze(self):
														
 
															         self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
														
 
															-<<<<<<< HEAD
														
 
															                                       start_page_id=self.start_page_id, end_page_id=self.end_page_id,
														
 
															                                       lang=self.lang)
														
 
															-=======
														
 
															-                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															     def pipe_parse(self):
														
 
															         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
														
--- a/magic_pdf/pipe/TXTPipe.py
+++ b/magic_pdf/pipe/TXTPipe.py
@@ -11,25 +11,16 @@ from magic_pdf.user_api import parse_txt_pdf
 
															 class TXTPipe(AbsPipe):
														
 
															     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
														
 
															-<<<<<<< HEAD
														
 
															                  start_page_id=0, end_page_id=None, lang=None):
														
 
															         super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)
														
 
															-=======
														
 
															-                 start_page_id=0, end_page_id=None):
														
 
															-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															     def pipe_classify(self):
														
 
															         pass
														
 
															     def pipe_analyze(self):
														
 
															         self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
														
 
															-<<<<<<< HEAD
														
 
															                                       start_page_id=self.start_page_id, end_page_id=self.end_page_id,
														
 
															                                       lang=self.lang)
														
 
															-=======
														
 
															-                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															     def pipe_parse(self):
														
 
															         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
														
--- a/magic_pdf/pipe/UNIPipe.py
+++ b/magic_pdf/pipe/UNIPipe.py
@@ -14,15 +14,9 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 
															 class UNIPipe(AbsPipe):
														
 
															     def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
														
 
															-<<<<<<< HEAD
														
 
															                  start_page_id=0, end_page_id=None, lang=None):
														
 
															         self.pdf_type = jso_useful_key["_pdf_type"]
														
 
															         super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id, lang)
														
 
															-=======
														
 
															-                 start_page_id=0, end_page_id=None):
														
 
															-        self.pdf_type = jso_useful_key["_pdf_type"]
														
 
															-        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id)
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															         if len(self.model_list) == 0:
														
 
															             self.input_model_is_empty = True
														
 
															         else:
														
@@ -34,30 +28,19 @@ class UNIPipe(AbsPipe):
 
															     def pipe_analyze(self):
														
 
															         if self.pdf_type == self.PIP_TXT:
														
 
															             self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
														
 
															-<<<<<<< HEAD
														
 
															                                           start_page_id=self.start_page_id, end_page_id=self.end_page_id,
														
 
															                                           lang=self.lang)
														
 
															         elif self.pdf_type == self.PIP_OCR:
														
 
															             self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
														
 
															                                           start_page_id=self.start_page_id, end_page_id=self.end_page_id,
														
 
															                                           lang=self.lang)
														
 
															-=======
														
 
															-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
														
 
															-        elif self.pdf_type == self.PIP_OCR:
														
 
															-            self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
														
 
															-                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															     def pipe_parse(self):
														
 
															         if self.pdf_type == self.PIP_TXT:
														
 
															             self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
														
 
															                                                 is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
														
 
															-<<<<<<< HEAD
														
 
															                                                 start_page_id=self.start_page_id, end_page_id=self.end_page_id,
														
 
															                                                 lang=self.lang)
														
 
															-=======
														
 
															-                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id)
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															         elif self.pdf_type == self.PIP_OCR:
														
 
															             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
														
 
															                                               is_debug=self.is_debug,
														
--- a/magic_pdf/resources/model_config/model_configs.yaml
+++ b/magic_pdf/resources/model_config/model_configs.yaml
@@ -10,10 +10,6 @@ config:
 
															 weights:
														
 
															   layout: Layout/model_final.pth
														
 
															   mfd: MFD/weights.pt
														
 
															-<<<<<<< HEAD
														
 
															   mfr: MFR/unimernet_base
														
 
															-=======
														
 
															-  mfr: MFR/UniMERNet
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															   struct_eqtable: TabRec/StructEqTable
														
 
															   TableMaster: TabRec/TableMaster
														
--- a/magic_pdf/tools/cli.py
+++ b/magic_pdf/tools/cli.py
@@ -45,7 +45,6 @@ without method specified, auto will be used by default.""",
 
															     default='auto',
														
 
															 )
														
 
															 @click.option(
														
 
															-<<<<<<< HEAD
														
 
															     '-l',
														
 
															     '--lang',
														
 
															     'lang',
														
@@ -58,8 +57,6 @@ without method specified, auto will be used by default.""",
 
															     default=None,
														
 
															 )
														
 
															 @click.option(
														
 
															-=======
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															     '-d',
														
 
															     '--debug',
														
 
															     'debug_able',
														
@@ -83,11 +80,7 @@ without method specified, auto will be used by default.""",
 
															     help='The ending page for PDF parsing, beginning from 0.',
														
 
															     default=None,
														
 
															 )
														
 
															-<<<<<<< HEAD
														
 
															 def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
														
 
															-=======
														
 
															-def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															     model_config.__use_inside_model__ = True
														
 
															     model_config.__model_mode__ = 'full'
														
 
															     os.makedirs(output_dir, exist_ok=True)
														
@@ -109,10 +102,7 @@ def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
 
															                 debug_able,
														
 
															                 start_page_id=start_page_id,
														
 
															                 end_page_id=end_page_id,
														
 
															-<<<<<<< HEAD
														
 
															                 lang=lang
														
 
															-=======
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															             )
														
 
															         except Exception as e:
														
--- a/magic_pdf/user_api.py
+++ b/magic_pdf/user_api.py
@@ -71,11 +71,7 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
 
															 def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
														
 
															                     input_model_is_empty: bool = False,
														
 
															-<<<<<<< HEAD
														
 
															                     start_page_id=0, end_page_id=None, lang=None,
														
 
															-=======
														
 
															-                    start_page_id=0, end_page_id=None,
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															                     *args, **kwargs):
														
 
															     """
														
 
															     ocr和文本混合的pdf，全部解析出来
														
@@ -99,17 +95,11 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
 
															     if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
														
 
															         logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
														
 
															         if input_model_is_empty:
														
 
															-<<<<<<< HEAD
														
 
															             pdf_models = doc_analyze(pdf_bytes,
														
 
															                                      ocr=True,
														
 
															                                      start_page_id=start_page_id,
														
 
															                                      end_page_id=end_page_id,
														
 
															                                      lang=lang)
														
 
															-=======
														
 
															-            pdf_models = doc_analyze(pdf_bytes, ocr=True,
														
 
															-                                     start_page_id=start_page_id,
														
 
															-                                     end_page_id=end_page_id)
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
														
 
															         if pdf_info_dict is None:
														
 
															             raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
														
--- a/mv_pdf.py
+++ b/mv_pdf.py
@@ -1,26 +0,0 @@
 
															-import os
														
 
															-import shutil
														
 
															-
														
 
															-def move_pdfs(root_folder, destination_folder):
														
 
															-    # 遍历根目录及其子目录中的所有文件
														
 
															-    for root, dirs, files in os.walk(root_folder):
														
 
															-        for file in files:
														
 
															-            if file.endswith('.pdf'):
														
 
															-                # 构建完整的文件路径
														
 
															-                src_path = os.path.join(root, file)
														
 
															-                # 构建目标路径
														
 
															-                dst_path = os.path.join(destination_folder, file)
														
 
															-                
														
 
															-                # 移动文件
														
 
															-                shutil.move(src_path, dst_path)
														
 
															-                print(f'Moved {file} to {destination_folder}')
														
 
															-
														
 
															-# 使用方法
														
 
															-root_folder = r'D:\mineru\datasets\datasets'  # 源文件夹路径
														
 
															-destination_folder = r'D:\mineru\datasets\pdf'  # 目标文件夹路径
														
 
															-
														
 
															-# 创建目标文件夹如果不存在
														
 
															-if not os.path.exists(destination_folder):
														
 
															-    os.makedirs(destination_folder)
														
 
															-
														
 
															-move_pdfs(root_folder, destination_folder)
														
--- a/projects/README.md
+++ b/projects/README.md
@@ -3,9 +3,6 @@
 
															 ## Project List
														
 
															 - [llama_index_rag](./llama_index_rag/README.md): Build a lightweight RAG system based on llama_index
														
 
															-<<<<<<< HEAD
														
 
															 - [gradio_app](./gradio_app/README.md): Build a web app based on gradio
														
 
															-=======
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
--- a/projects/README_zh-CN.md
+++ b/projects/README_zh-CN.md
@@ -3,8 +3,5 @@
 
															 ## 项目列表
														
 
															 - [llama_index_rag](./llama_index_rag/README_zh-CN.md): 基于 llama_index 构建轻量级 RAG 系统
														
 
															-<<<<<<< HEAD
														
 
															 - [gradio_app](./gradio_app/README_zh-CN.md): 基于 Gradio 的 Web 应用
														
 
															-=======
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
--- a/projects/llama_index_rag/README_zh-CN.md
+++ b/projects/llama_index_rag/README_zh-CN.md
@@ -59,10 +59,6 @@ Server: Docker Engine - Community
 
															 ```bash
														
 
															 # install
														
 
															 pip install modelscope==1.14.0
														
 
															-<<<<<<< HEAD
														
 
															-
														
 
															-=======
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															 pip install llama-index-vector-stores-elasticsearch==0.2.0
														
 
															 pip install llama-index-embeddings-dashscope==0.2.0
														
 
															 pip install llama-index-core==0.10.68
														
@@ -74,19 +70,12 @@ pip install accelerate==0.33.0
 
															 pip uninstall transformer-engine
														
 
															 ```
														
 
															-<<<<<<< HEAD
														
 
															-=======
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															 ## 示例
														
 
															 ````bash
														
 
															 cd  projects/llama_index_rag
														
 
															-<<<<<<< HEAD
														
 
															-
														
 
															-=======
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															 docker compose up -d
														
 
															 or
														
@@ -94,20 +83,14 @@ or
 
															 docker-compose up -d
														
 
															-<<<<<<< HEAD
														
 
															 # 配置环境变量
														
 
															-=======
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															 export ES_USER=elastic
														
 
															 export ES_PASSWORD=llama_index
														
 
															 export ES_URL=http://127.0.0.1:9200
														
 
															 export DASHSCOPE_API_KEY={some_key}
														
 
															-<<<<<<< HEAD
														
 
															-=======
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															 DASHSCOPE_API_KEY 开通参考[文档](https://help.aliyun.com/zh/dashscope/opening-service)
														
 
															 # 未导入数据，查询问题。返回通义千问默认答案
														
@@ -135,10 +118,7 @@ python data_ingestion.py -p example/data/declaration_of_the_rights_of_man_1789.p
 
															 # 导入数据后，查询问题。通义千问模型会根据 RAG 系统的检索结果，结合上下文，给出答案。
														
 
															-<<<<<<< HEAD
														
 
															-=======
														
 
															->>>>>>> 0140d7d271ac3b1561ca2272030e9e038b469999
														
 
															 python query.py -q 'how about the rights of men'
														
 
															 ## outputs
														
--- a/tests.zip
+++ b/tests.zip