Преглед на файлове

feat: mineru web and web_api

houlinfeng преди 1 година
родител
ревизия
38086572c8

+ 2 - 1
.gitignore

@@ -38,4 +38,5 @@ source.dev.env
 
 tmp
 
-projects/web/node_modules
+projects/web/node_modules
+projects/web/dist

+ 1 - 1
projects/web/README.md

@@ -8,5 +8,5 @@ npm install -g pnpm
 3. build
 ```
 1.pnpm run build
-2.npm run buil
+2.npm run build
 ```

+ 2 - 0
projects/web/package.json

@@ -22,9 +22,11 @@
     "ahooks": "^3.8.1",
     "antd": "^5.20.3",
     "axios": "^1.7.5",
+    "canvas": "^2.11.2",
     "classnames": "^2.5.1",
     "js-cookie": "^3.0.5",
     "lodash": "^4.17.21",
+    "path2d": "^0.2.1",
     "qs": "^6.13.0",
     "react": "^18.3.1",
     "react-copy-to-clipboard": "^5.1.0",

+ 3 - 3
projects/web/src/pages/home.tsx

@@ -3,7 +3,7 @@
 import ErrorBoundary from "@/components/error-boundary";
 import styles from "./home.module.scss";
 import { SlotID, Path } from "@/constant/route";
-import { BrowserRouter, Routes, Route, Outlet } from "react-router-dom";
+import { HashRouter, Routes, Route, Outlet } from "react-router-dom";
 import { ExtractorSide } from "./extract-side";
 import { LanguageProvider } from "@/context/language-provider";
 import PDFUpload from "@/pages/extract/components/pdf-upload";
@@ -70,9 +70,9 @@ export function Home() {
   return (
     <ErrorBoundary>
       <LanguageProvider>
-        <BrowserRouter>
+        <HashRouter>
           <Screen />
-        </BrowserRouter>
+        </HashRouter>
       </LanguageProvider>
     </ErrorBoundary>
   );

+ 24 - 15
projects/web_api/README.md

@@ -1,34 +1,43 @@
-## 安装
+## Mineru 本地API服务
 
 MinerU
 
-```bash
-# mineru已安装则跳过此步骤
+```
+# 服务依赖mineru,请先确保mineru已安装
+```
 
-git clone https://github.com/opendatalab/MinerU.git
-cd MinerU
+1. 打包前端界面
 
-conda create -n MinerU python=3.10
-conda activate MinerU
-pip install .[full] --extra-index-url https://wheels.myhloli.com
+```bash
+# 先进入前端目录
+cd projects/web
+# 打包前端项目
+npm install -g yarn
+yarn install
+yarn build
 ```
 
-第三方软件
+2. 安装服务依赖
 
 ```bash
+# 先进入后端目录
 cd projects/web_api
-pip install poetry
-portey install
+# 安装依赖
+pip3 install -r requirements.txt  -i https://pypi.tuna.tsinghua.edu.cn/simple
 ```
 
-启动服务
+3. 启动服务
 
 ```bash
-cd web_api
-python app.py
+# 进入程序目录
+cd projects/web_api/web_api
+# 启动服务
+python3 app.py
+# 在浏览器访问启动的地址即可访问界面
 ```
 
-接口文档
+ps:接口文档
+
 ```
 在浏览器打开 mineru-web接口文档.html
 ```

Файловите разлики са ограничени, защото са твърде много
+ 0 - 0
projects/web_api/mineru-web接口文档.html


+ 13 - 0
projects/web_api/requirements.txt

@@ -0,0 +1,13 @@
+flask-cors
+flask-jwt-extended
+flask-marshmallow
+flask-migrate
+flask-restful
+flask-sqlalchemy
+flask
+greenlet
+loguru
+marshmallow-sqlalchemy
+marshmallow
+pyjwt
+pyyaml

+ 3 - 1
projects/web_api/web_api/api/__init__.py

@@ -4,7 +4,7 @@ from common.web_hook import before_request
 from common.logger import setup_log
 
 root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-print("root_dir", root_dir)
+
 
 def _register_db(flask_app):
     from common import import_models
@@ -30,6 +30,8 @@ def create_app(config):
     ma.init_app(app=app)
     from .analysis import analysis_blue
     app.register_blueprint(analysis_blue)
+    from .react_app import react_app_blue
+    app.register_blueprint(react_app_blue)
 
     app.before_request(before_request)
 

+ 3 - 1
projects/web_api/web_api/api/analysis/__init__.py

@@ -4,6 +4,7 @@ from .upload_view import UploadPdfView
 from .analysis_view import AnalysisTaskView, AnalysisTaskProgressView
 from .img_md_view import ImgView, MdView
 from .task_view import TaskView, HistoricalTasksView, DeleteTaskView
+from .markdown_view import MarkdownView
 
 analysis_blue = Blueprint('analysis', __name__)
 
@@ -15,4 +16,5 @@ api_v2.add_resource(ImgView, '/analysis/pdf_img')
 api_v2.add_resource(MdView, '/analysis/pdf_md')
 api_v2.add_resource(TaskView, '/extract/taskQueue')
 api_v2.add_resource(HistoricalTasksView, '/extract/list')
-api_v2.add_resource(DeleteTaskView, '/extract/task/<int:id>')
+api_v2.add_resource(DeleteTaskView, '/extract/task/<int:id>')
+api_v2.add_resource(MarkdownView, '/extract/markdown')

+ 5 - 4
projects/web_api/web_api/api/analysis/analysis_view.py

@@ -1,5 +1,6 @@
 import json
 import threading
+from multiprocessing import Process
 from pathlib import Path
 from flask import request, current_app, url_for
 from flask_restful import Resource
@@ -212,10 +213,10 @@ class AnalysisTaskView(Resource):
                     pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER']
                     pdf_dir = f"{current_app.static_folder}/{pdf_analysis_folder}/{file_stem}"
                     image_dir = f"{pdf_dir}/images"
-                    t = threading.Thread(target=analysis_pdf_task,
-                                         args=(pdf_dir, image_dir, file_path, analysis_task.is_ocr,
-                                               analysis_task.analysis_pdf_id))
-                    t.start()
+                    process = Process(target=analysis_pdf_task,
+                                      args=(pdf_dir, image_dir, file_path, analysis_task.is_ocr,
+                                            analysis_task.analysis_pdf_id))
+                    process.start()
 
                 # 生成文件的URL路径
                 file_url = url_for('analysis.uploadpdfview', filename=analysis_task.file_name, as_attachment=False)

+ 44 - 0
projects/web_api/web_api/api/analysis/markdown_view.py

@@ -0,0 +1,44 @@
+import json
+from pathlib import Path
+from flask import request, current_app
+from flask_restful import Resource
+from common.custom_response import generate_response
+
+
+class MarkdownView(Resource):
+    """REST resource for editing the markdown files stored for an analysed PDF."""
+
+    def put(self):
+        """
+        Edit markdown.
+
+        Reads a JSON request body with two fields:
+
+        * ``file_key`` - non-empty string; the first directory under the
+          analysis folder whose name starts with it is the one edited.
+        * ``data`` - non-empty object mapping a markdown file stem to the new
+          text for ``<stem>.md``.
+
+        Only ``.md`` files that already exist directly inside the matched
+        directory are overwritten; unknown stems are ignored. Afterwards
+        ``full.md`` is rebuilt by concatenating every other ``.md`` file in
+        that directory, each followed by a newline.
+
+        Returns the default ``generate_response()`` on success and a
+        ``code=400`` response when the body, ``data`` or ``file_key`` is
+        invalid.
+        """
+        # A malformed or non-object body is a client error, not a server crash.
+        try:
+            params = json.loads(request.data)
+        except ValueError:
+            return generate_response(code=400, msg="invalid JSON body", msgZH="请求体不是合法的JSON")
+        if not isinstance(params, dict):
+            return generate_response(code=400, msg="invalid JSON body", msgZH="请求体不是合法的JSON")
+
+        file_key = params.get('file_key')
+        data = params.get('data', {})
+        if not data:
+            return generate_response(code=400, msg="empty data", msgZH="数据为空,无法更新markdown")
+        # Validate everything up front so a bad entry cannot abort the request
+        # after some files have already been rewritten.
+        if not isinstance(data, dict) or not all(isinstance(text, str) for text in data.values()):
+            return generate_response(code=400, msg="invalid data", msgZH="数据格式错误,无法更新markdown")
+        # An empty prefix would match every directory and a non-string one
+        # would make str.startswith raise, so reject both.
+        if not isinstance(file_key, str) or not file_key:
+            return generate_response(code=400, msg="Invalid file_key", msgZH="文件哈希错误")
+
+        pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER']
+        analysis_root = Path(f"{current_app.static_folder}/{pdf_analysis_folder}")
+        markdown_file_dir = None
+        if analysis_root.is_dir():
+            # Only directories qualify: the files edited below live inside the match.
+            markdown_file_dir = next(
+                (entry for entry in analysis_root.iterdir()
+                 if entry.is_dir() and entry.name.startswith(file_key)),
+                None,
+            )
+        if markdown_file_dir is None:
+            return generate_response(code=400, msg="Invalid file_key", msgZH="文件哈希错误")
+
+        for stem, text in data.items():
+            # Skip stems carrying path components (e.g. "../x") so the write
+            # cannot land outside the matched directory.
+            if Path(stem).name != stem:
+                continue
+            md_path = markdown_file_dir / f"{stem}.md"
+            if md_path.exists():
+                md_path.write_text(text, encoding="utf-8")
+
+        # iterdir() yields entries in arbitrary order, so sort to make full.md
+        # reproducible: numeric stems in numeric order first, then the rest by
+        # name. NOTE(review): assumes numeric stems are sequence numbers - confirm.
+        parts = sorted(
+            (entry for entry in markdown_file_dir.iterdir()
+             if entry.is_file() and entry.suffix == ".md" and entry.stem != "full"),
+            key=lambda entry: (0, int(entry.stem), entry.name) if entry.stem.isdecimal()
+            else (1, 0, entry.name),
+        )
+        full_content = "".join(part.read_text(encoding="utf-8") + "\n" for part in parts)
+        (markdown_file_dir / "full.md").write_text(full_content, encoding="utf-8")
+        return generate_response()

+ 1 - 0
projects/web_api/web_api/api/extentions.py

@@ -59,3 +59,4 @@ db = SQLAlchemy()
 migrate = Migrate()
 jwt = JWTManager()
 ma = Marshmallow()
+folder = app.config.get("REACT_APP_DIST")

+ 11 - 0
projects/web_api/web_api/api/react_app/__init__.py

@@ -0,0 +1,11 @@
+from pathlib import Path
+from flask import Blueprint
+from ..extentions import app, Api
+from .react_app_view import ReactAppView
+from loguru import logger
+
+# Directory holding the built front-end. Taken from the REACT_APP_DIST config
+# entry (default "../../web/dist/"); a relative value is made absolute by
+# Path.resolve(), i.e. against the process's current working directory.
+folder = Path(app.config.get("REACT_APP_DIST", "../../web/dist/")).resolve()
+logger.info(f"react_app folder: {folder}")
+# The same directory is used for both static files and templates, and static
+# files are exposed with an empty URL path prefix.
+react_app_blue = Blueprint('react_app', __name__,  static_folder=folder, static_url_path='', template_folder=folder)
+react_app_api = Api(react_app_blue, prefix='')
+# Route "/" to the view that renders index.html.
+react_app_api.add_resource(ReactAppView, '/')

+ 11 - 0
projects/web_api/web_api/api/react_app/react_app_view.py

@@ -0,0 +1,11 @@
+from flask import render_template, Response
+from flask_restful import Resource
+
+
+class ReactAppView(Resource):
+    """Resource that returns the front-end entry page."""
+
+    def get(self):
+        """Render ``index.html`` and return it as a ``text/html`` response."""
+        # Wrap the rendered page in a custom response object
+        return Response(render_template('index.html'), mimetype='text/html')

+ 2 - 0
projects/web_api/web_api/config/config.yaml

@@ -11,6 +11,8 @@ BaseConfig: &base
   JWT_ACCESS_TOKEN_EXPIRES: 3600
   PDF_UPLOAD_FOLDER: "upload_pdf"
   PDF_ANALYSIS_FOLDER: "analysis_pdf"
+  # 前端项目打包的路径
+  REACT_APP_DIST: "../../web/dist/"
 
 # 开发配置
 DevelopmentConfig:

BIN
projects/web_api/web_api/config/mineru_web.db


Някои файлове не бяха показани, защото твърде много файлове са промени