Browse Source

refactor: simplify file reading function and improve input validation

myhloli cách đây 5 tháng
mục cha
commit
02898cdd81
2 tập tin đã thay đổi với 12 bổ sung và 26 xóa
  1. + 3 - 1
      mineru/cli/common.py
  2. + 9 - 25
      projects/gradio_app/app.py

+ 3 - 1
mineru/cli/common.py

@@ -23,7 +23,9 @@ pdf_suffixes = [".pdf"]
 image_suffixes = [".png", ".jpeg", ".jpg"]
 
 
-def read_fn(path: Path):
+def read_fn(path):
+    if not isinstance(path, Path):
+        path = Path(path)
     with open(str(path), "rb") as input_file:
         file_bytes = input_file.read()
         if path.suffix in image_suffixes:

+ 9 - 25
projects/gradio_app/app.py

@@ -12,16 +12,10 @@ import gradio as gr
 from gradio_pdf import PDF
 from loguru import logger
 
-from mineru.cli.common import prepare_env, do_parse
-from mineru.data.data_reader_writer import FileBasedDataReader
+from mineru.cli.common import prepare_env, do_parse, read_fn
 from mineru.utils.hash_utils import str_sha256
 
 
-def read_fn(path):
-    disk_rw = FileBasedDataReader(os.path.dirname(path))
-    return disk_rw.read(os.path.basename(path))
-
-
 def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language):
     os.makedirs(output_dir, exist_ok=True)
 
@@ -120,19 +114,6 @@ latex_delimiters = [
 ]
 
 
-def init_model():
-    try:
-        pass
-        return 0
-    except Exception as e:
-        logger.exception(e)
-        return -1
-
-
-model_init = init_model()
-logger.info(f'model_init: {model_init}')
-
-
 with open('header.html', 'r') as file:
     header = file.read()
 
@@ -162,6 +143,8 @@ all_lang.extend([*other_lang, *add_lang])
 
 
 def to_pdf(file_path):
+    if file_path is None:
+        return None
     pdf_bytes = read_fn(file_path)
     # 将pdfbytes 写入到uuid.pdf中
     # 生成唯一的文件名
@@ -182,14 +165,15 @@ if __name__ == '__main__':
         gr.HTML(header)
         with gr.Row():
             with gr.Column(variant='panel', scale=5):
-                file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
-                max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
                 with gr.Row():
-                    with gr.Column():
-                        is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
-                    with gr.Column():
+                    file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
+                with gr.Row(equal_height=True):
+                    with gr.Column(scale=3):
+                        max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
+                    with gr.Column(scale=1):
                         language = gr.Dropdown(all_lang, label='Language', value='ch')
                 with gr.Row():
+                    is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
                     formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
                     table_enable = gr.Checkbox(label='Enable table recognition(test)', value=True)
                 with gr.Row():