| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189 |
- import uuid
- import os
- from base64 import b64encode
- import uvicorn
- import argparse
- from pathlib import Path
- from glob import glob
- from fastapi import FastAPI, UploadFile, File, Form
- from fastapi.responses import JSONResponse
- from typing import List, Optional
- from loguru import logger
- from mineru.cli.common import aio_do_parse, read_fn
- from mineru.version import __version__
# Application instance: the /file_parse route below is registered on it, and
# main() points uvicorn at it through the "mineru.cli.fast_api:app" import string.
app = FastAPI()
def encode_image(image_path: str) -> str:
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: Filesystem path of the file to encode.

    Returns:
        The base64 encoding of the file's raw bytes, decoded to ``str``.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = b64encode(raw_bytes)
    return encoded.decode()
# Lower-cased filename suffixes accepted for upload; anything else is answered with a 400.
_SUPPORTED_SUFFIXES = (".pdf", ".png", ".jpeg", ".jpg")


def _read_result_text(parse_dir: str, pdf_name: str, file_suffix_identifier: str) -> Optional[str]:
    """Return the UTF-8 text of ``<parse_dir>/<pdf_name><suffix>``, or None if the file is absent."""
    result_file_path = os.path.join(parse_dir, f"{pdf_name}{file_suffix_identifier}")
    if not os.path.exists(result_file_path):
        return None
    with open(result_file_path, "r", encoding="utf-8") as fp:
        return fp.read()


def _collect_images(parse_dir: str) -> dict:
    """Map each ``images/*.jpg`` file name under ``parse_dir`` to a JPEG data URI."""
    # Path.glob applies the pattern to the last component only, so glob
    # metacharacters in the upload-derived directory name (e.g. "paper[1]")
    # cannot make the lookup silently return nothing.
    image_paths = sorted(Path(parse_dir, "images").glob("*.jpg"))
    return {
        image_path.name: f"data:image/jpeg;base64,{encode_image(str(image_path))}"
        for image_path in image_paths
    }


@app.post(path="/file_parse")
async def parse_pdf(
    files: List[UploadFile] = File(...),
    output_dir: str = Form("./output"),
    lang_list: List[str] = Form(["ch"]),
    backend: str = Form("pipeline"),
    parse_method: str = Form("auto"),
    formula_enable: bool = Form(True),
    table_enable: bool = Form(True),
    server_url: Optional[str] = Form(None),
    reuturn_md: bool = Form(True),
    reuturn_middle_json: bool = Form(False),
    return_model_output: bool = Form(False),
    reuturn_content_list: bool = Form(False),
    return_images: bool = Form(False),
    start_page_id: int = Form(0),
    end_page_id: int = Form(99999),
):
    """Parse uploaded PDF/image files and return the requested outputs as JSON.

    Each upload is written to a per-request directory, loaded with ``read_fn``,
    and the whole batch is handed to ``aio_do_parse``. The files it produces are
    then read back and returned, keyed by the upload's file stem.

    Args:
        files: Uploaded files; only .pdf, .png, .jpeg and .jpg are accepted.
        output_dir: Base directory; a UUID-named sub-directory is created in it
            for this request.
        lang_list: Languages forwarded to ``aio_do_parse``. If its length differs
            from the number of files, the first entry (or "ch" when empty) is
            applied to every file.
        backend: Forwarded to ``aio_do_parse``. When it starts with "pipeline",
            results are read from ``<name>/<parse_method>``, otherwise from
            ``<name>/vlm``.
        parse_method: Forwarded to ``aio_do_parse``; also names the result
            sub-directory for pipeline backends.
        formula_enable: Forwarded to ``aio_do_parse`` as ``p_formula_enable``.
        table_enable: Forwarded to ``aio_do_parse`` as ``p_table_enable``.
        server_url: Forwarded to ``aio_do_parse`` unchanged.
        reuturn_md: Dump and return the ``.md`` output as ``md_content``.
        reuturn_middle_json: Dump and return ``_middle.json`` as ``middle_json``.
        return_model_output: Dump and return the model output as ``model_output``.
        reuturn_content_list: Dump and return ``_content_list.json`` as
            ``content_list``.
        return_images: Return ``images/*.jpg`` as base64 data URIs under ``images``.
        start_page_id: Forwarded to ``aio_do_parse`` unchanged.
        end_page_id: Forwarded to ``aio_do_parse`` unchanged.

    Returns:
        A JSONResponse: 200 with ``status``, ``backend``, ``version`` and
        ``results``; 400 for an unsupported file type or a file that cannot be
        loaded; 500 for any other failure.
    """
    # NOTE: the "reuturn_*" spellings are the form-field names clients already
    # send, so they are kept as-is for backward compatibility.
    try:
        # Create a unique output directory for this request.
        unique_dir = os.path.join(output_dir, str(uuid.uuid4()))
        os.makedirs(unique_dir, exist_ok=True)

        # Load every uploaded file into memory via read_fn.
        pdf_file_names = []
        pdf_bytes_list = []
        for file in files:
            # An upload part may carry no filename; treat it as "no suffix" so it
            # is rejected with a 400 instead of raising inside Path().
            file_path = Path(file.filename or "")
            if file_path.suffix.lower() not in _SUPPORTED_SUFFIXES:
                return JSONResponse(
                    status_code=400,
                    content={"error": f"不支持的文件类型: {file_path.suffix}"}
                )

            content = await file.read()

            # read_fn works on a path, so stage the upload as a temporary file.
            # file_path.name drops any directory components sent by the client.
            temp_path = Path(unique_dir) / file_path.name
            with open(temp_path, "wb") as f:
                f.write(content)
            try:
                pdf_bytes = read_fn(temp_path)
            except Exception as e:
                logger.exception(e)
                return JSONResponse(
                    status_code=400,
                    content={"error": f"处理文件失败: {str(e)}"}
                )
            finally:
                # Always remove the staged file, including when read_fn failed.
                temp_path.unlink(missing_ok=True)

            pdf_bytes_list.append(pdf_bytes)
            pdf_file_names.append(file_path.stem)

        # Make the language list line up with the number of files.
        actual_lang_list = lang_list
        if len(actual_lang_list) != len(pdf_file_names):
            # On a length mismatch fall back to the first language, or "ch".
            fallback_lang = actual_lang_list[0] if actual_lang_list else "ch"
            actual_lang_list = [fallback_lang] * len(pdf_file_names)

        # Run the asynchronous parsing routine.
        await aio_do_parse(
            output_dir=unique_dir,
            pdf_file_names=pdf_file_names,
            pdf_bytes_list=pdf_bytes_list,
            p_lang_list=actual_lang_list,
            backend=backend,
            parse_method=parse_method,
            p_formula_enable=formula_enable,
            p_table_enable=table_enable,
            server_url=server_url,
            f_draw_layout_bbox=False,
            f_draw_span_bbox=False,
            f_dump_md=reuturn_md,
            f_dump_middle_json=reuturn_middle_json,
            f_dump_model_output=return_model_output,
            f_dump_orig_pdf=False,
            f_dump_content_list=reuturn_content_list,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
        )

        # Collect the requested outputs for each file.
        is_pipeline = backend.startswith("pipeline")
        model_output_suffix = "_model.json" if is_pipeline else "_model_output.txt"
        result_dict = {}
        for pdf_name in pdf_file_names:
            data = {}
            result_dict[pdf_name] = data

            parse_dir = os.path.join(
                unique_dir, pdf_name, parse_method if is_pipeline else "vlm"
            )
            if not os.path.exists(parse_dir):
                # Nothing was written for this file; leave its entry empty.
                continue

            if reuturn_md:
                data["md_content"] = _read_result_text(parse_dir, pdf_name, ".md")
            if reuturn_middle_json:
                data["middle_json"] = _read_result_text(parse_dir, pdf_name, "_middle.json")
            if return_model_output:
                data["model_output"] = _read_result_text(parse_dir, pdf_name, model_output_suffix)
            if reuturn_content_list:
                data["content_list"] = _read_result_text(parse_dir, pdf_name, "_content_list.json")
            if return_images:
                data["images"] = _collect_images(parse_dir)

        return JSONResponse(
            status_code=200,
            content={
                "status": "success",
                "backend": backend,
                "version": __version__,
                "results": result_dict
            }
        )
    except Exception as e:
        # Top-level boundary: log the traceback and report the failure as a 500.
        logger.exception(e)
        return JSONResponse(
            status_code=500,
            content={"error": str(e)}
        )
def main():
    """Command-line entry point that starts the MinerU FastAPI server.

    Reads ``--host``, ``--port`` and ``--reload`` from the command line, prints
    the service and API-documentation URLs, then hands control to uvicorn.
    """
    arg_parser = argparse.ArgumentParser(description='Start MinerU FastAPI Service')
    arg_parser.add_argument(
        '--host',
        type=str,
        default='127.0.0.1',
        help='Server host (default: 127.0.0.1)',
    )
    arg_parser.add_argument(
        '--port',
        type=int,
        default=8000,
        help='Server port (default: 8000)',
    )
    arg_parser.add_argument(
        '--reload',
        action='store_true',
        help='Enable auto-reload (development mode)',
    )
    args = arg_parser.parse_args()

    # Tell the operator where the service and its generated docs will be reachable.
    startup_banner = (
        f"Start MinerU FastAPI Service: http://{args.host}:{args.port}",
        "The API documentation can be accessed at the following address:",
        f"- Swagger UI: http://{args.host}:{args.port}/docs",
        f"- ReDoc: http://{args.host}:{args.port}/redoc",
    )
    for banner_line in startup_banner:
        print(banner_line)

    # The app is passed as an import string rather than as an object.
    server_options = {
        "host": args.host,
        "port": args.port,
        "reload": args.reload,
    }
    uvicorn.run("mineru.cli.fast_api:app", **server_options)
# Start the server only when this file is executed directly as a script, not
# when it is imported (e.g. by uvicorn resolving "mineru.cli.fast_api:app").
if __name__ == "__main__":
    main()
|