|
@@ -1,20 +1,23 @@
|
|
|
import json
|
|
import json
|
|
|
-import re
|
|
|
|
|
import os
|
|
import os
|
|
|
import shutil
|
|
import shutil
|
|
|
import traceback
|
|
import traceback
|
|
|
from pathlib import Path
|
|
from pathlib import Path
|
|
|
|
|
+
|
|
|
|
|
+from common.error_types import ApiException
|
|
|
|
|
+from common.mk_markdown.mk_markdown import \
|
|
|
|
|
+ ocr_mk_mm_markdown_with_para_and_pagination
|
|
|
from flask import current_app, url_for
|
|
from flask import current_app, url_for
|
|
|
-from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
|
|
|
|
-from magic_pdf.pipe.UNIPipe import UNIPipe
|
|
|
|
|
|
|
+from loguru import logger
|
|
|
|
|
+
|
|
|
import magic_pdf.model as model_config
|
|
import magic_pdf.model as model_config
|
|
|
|
|
+from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
|
|
from magic_pdf.libs.json_compressor import JsonCompressor
|
|
from magic_pdf.libs.json_compressor import JsonCompressor
|
|
|
-from common.mk_markdown.mk_markdown import ocr_mk_mm_markdown_with_para_and_pagination
|
|
|
|
|
|
|
+from magic_pdf.pipe.UNIPipe import UNIPipe
|
|
|
|
|
+
|
|
|
|
|
+from ..extensions import app, db
|
|
|
from .ext import find_file
|
|
from .ext import find_file
|
|
|
-from ..extentions import app, db
|
|
|
|
|
from .models import AnalysisPdf, AnalysisTask
|
|
from .models import AnalysisPdf, AnalysisTask
|
|
|
-from common.error_types import ApiException
|
|
|
|
|
-from loguru import logger
|
|
|
|
|
|
|
|
|
|
model_config.__use_inside_model__ = True
|
|
model_config.__use_inside_model__ = True
|
|
|
|
|
|
|
@@ -22,51 +25,51 @@ model_config.__use_inside_model__ = True
|
|
|
def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False):
    """Run the UNIPipe pipeline on a PDF and build per-page markdown.

    :param image_url_prefix: passed through to
        ``ocr_mk_mm_markdown_with_para_and_pagination`` together with the
        parsed page list
    :param image_dir: directory handed to ``FileBasedDataWriter``; the writer
        is given to the pipeline as its image writer
    :param pdf_bytes: PDF content handed to ``UNIPipe``
    :param is_ocr: when true the pipeline is created with ``_pdf_type``
        ``'ocr'`` and classification is skipped; otherwise ``_pdf_type`` is
        empty and ``pipe_classify()`` is called
    :return: ``(md_content, bbox_info)`` where ``md_content`` is the JSON
        string (``ensure_ascii=False``) of the markdown builder's result and
        ``bbox_info`` is the output of ``get_bbox_info``. If any ``Exception``
        is raised, the traceback is logged and ``None`` is returned instead.
    :raises SystemExit: with code 1 when no model list is available and the
        built-in model is disabled
    """
    try:
        # An empty model list makes the pipeline fall back to the built-in model.
        model_json = []
        logger.info(f'is_ocr: {is_ocr}')

        # The two modes differ only in the declared pdf type and in whether
        # the pipeline has to classify the document itself.
        jso_useful_key = {
            '_pdf_type': 'ocr' if is_ocr else '',
            'model_list': model_json,
        }
        image_writer = FileBasedDataWriter(image_dir)
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
        if not is_ocr:
            pipe.pipe_classify()

        # Without externally supplied model data, analyse with the built-in model.
        if not model_json:
            if model_config.__use_inside_model__:
                pipe.pipe_analyze()
            else:
                logger.error('need model list input')
                # Same exception the original ``exit(1)`` raised, without
                # depending on the interactive helper injected by ``site``.
                raise SystemExit(1)

        pipe.pipe_parse()
        pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data())
        pdf_info_list = pdf_mid_data['pdf_info']
        md_content = json.dumps(
            ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_url_prefix),
            ensure_ascii=False,
        )
        bbox_info = get_bbox_info(pdf_info_list)
        return md_content, bbox_info
    except Exception:
        # Best-effort: failures are logged and reported to the caller as None.
        logger.error(traceback.format_exc())
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_bbox_info(data):
    """Extract the block and page-geometry fields from each parsed page.

    :param data: iterable of per-page mappings (anything offering ``.get``)
    :return: list with one dict per page, in input order, holding exactly the
        keys ``preproc_blocks``, ``page_idx``, ``page_size`` and
        ``discarded_blocks``. Missing block lists default to ``[]``; a missing
        ``page_idx`` or ``page_size`` is ``None``.
    """
    return [
        {
            'preproc_blocks': page.get('preproc_blocks', []),
            'page_idx': page.get('page_idx'),
            'page_size': page.get('page_size'),
            'discarded_blocks': page.get('discarded_blocks', []),
        }
        for page in data
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
|
|
def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
|
|
|
- """
|
|
|
|
|
- 解析pdf
|
|
|
|
|
|
|
+ """解析pdf.
|
|
|
|
|
+
|
|
|
:param pdf_dir: pdf解析目录
|
|
:param pdf_dir: pdf解析目录
|
|
|
:param image_dir: 图片目录
|
|
:param image_dir: 图片目录
|
|
|
:param pdf_path: pdf路径
|
|
:param pdf_path: pdf路径
|
|
@@ -75,8 +78,8 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
|
|
|
:return:
|
|
:return:
|
|
|
"""
|
|
"""
|
|
|
try:
|
|
try:
|
|
|
- logger.info(f"start task: {pdf_path}")
|
|
|
|
|
- logger.info(f"image_dir: {image_dir}")
|
|
|
|
|
|
|
+ logger.info(f'start task: {pdf_path}')
|
|
|
|
|
+ logger.info(f'image_dir: {image_dir}')
|
|
|
if not Path(image_dir).exists():
|
|
if not Path(image_dir).exists():
|
|
|
Path(image_dir).mkdir(parents=True, exist_ok=True)
|
|
Path(image_dir).mkdir(parents=True, exist_ok=True)
|
|
|
else:
|
|
else:
|
|
@@ -96,26 +99,26 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
|
|
|
# ############ markdown #############
|
|
# ############ markdown #############
|
|
|
pdf_name = Path(pdf_path).name
|
|
pdf_name = Path(pdf_path).name
|
|
|
|
|
|
|
|
- full_md_content = ""
|
|
|
|
|
|
|
+ full_md_content = ''
|
|
|
for item in json.loads(md_content):
|
|
for item in json.loads(md_content):
|
|
|
- full_md_content += item["md_content"] + "\n"
|
|
|
|
|
|
|
+ full_md_content += item['md_content'] + '\n'
|
|
|
|
|
|
|
|
- full_md_name = "full.md"
|
|
|
|
|
- with open(f"{pdf_dir}/{full_md_name}", "w", encoding="utf-8") as file:
|
|
|
|
|
|
|
+ full_md_name = 'full.md'
|
|
|
|
|
+ with open(f'{pdf_dir}/{full_md_name}', 'w', encoding='utf-8') as file:
|
|
|
file.write(full_md_content)
|
|
file.write(full_md_content)
|
|
|
with app.app_context():
|
|
with app.app_context():
|
|
|
full_md_link = url_for('analysis.mdview', filename=full_md_name, as_attachment=False)
|
|
full_md_link = url_for('analysis.mdview', filename=full_md_name, as_attachment=False)
|
|
|
- full_md_link = f"{full_md_link}&pdf={pdf_name}"
|
|
|
|
|
|
|
+ full_md_link = f'{full_md_link}&pdf={pdf_name}'
|
|
|
|
|
|
|
|
md_link_list = []
|
|
md_link_list = []
|
|
|
with app.app_context():
|
|
with app.app_context():
|
|
|
for n, md in enumerate(json.loads(md_content)):
|
|
for n, md in enumerate(json.loads(md_content)):
|
|
|
- md_content = md["md_content"]
|
|
|
|
|
|
|
+ md_content = md['md_content']
|
|
|
md_name = f"{md.get('page_no', n)}.md"
|
|
md_name = f"{md.get('page_no', n)}.md"
|
|
|
- with open(f"{pdf_dir}/{md_name}", "w", encoding="utf-8") as file:
|
|
|
|
|
|
|
+ with open(f'{pdf_dir}/{md_name}', 'w', encoding='utf-8') as file:
|
|
|
file.write(md_content)
|
|
file.write(md_content)
|
|
|
md_url = url_for('analysis.mdview', filename=md_name, as_attachment=False)
|
|
md_url = url_for('analysis.mdview', filename=md_name, as_attachment=False)
|
|
|
- md_link_list.append(f"{md_url}&pdf={pdf_name}")
|
|
|
|
|
|
|
+ md_link_list.append(f'{md_url}&pdf={pdf_name}')
|
|
|
|
|
|
|
|
with app.app_context():
|
|
with app.app_context():
|
|
|
with db.auto_commit():
|
|
with db.auto_commit():
|
|
@@ -129,8 +132,8 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
|
|
|
analysis_task_object = AnalysisTask.query.filter_by(analysis_pdf_id=analysis_pdf_id).first()
|
|
analysis_task_object = AnalysisTask.query.filter_by(analysis_pdf_id=analysis_pdf_id).first()
|
|
|
analysis_task_object.status = 1
|
|
analysis_task_object.status = 1
|
|
|
db.session.add(analysis_task_object)
|
|
db.session.add(analysis_task_object)
|
|
|
- logger.info(f"finished!")
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
|
|
+ logger.info('finished!')
|
|
|
|
|
+ except Exception as e: # noqa: F841
|
|
|
logger.error(traceback.format_exc())
|
|
logger.error(traceback.format_exc())
|
|
|
with app.app_context():
|
|
with app.app_context():
|
|
|
with db.auto_commit():
|
|
with db.auto_commit():
|
|
@@ -141,7 +144,7 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
|
|
|
analysis_task_object = AnalysisTask.query.filter_by(analysis_pdf_id=analysis_pdf_id).first()
|
|
analysis_task_object = AnalysisTask.query.filter_by(analysis_pdf_id=analysis_pdf_id).first()
|
|
|
analysis_task_object.status = 1
|
|
analysis_task_object.status = 1
|
|
|
db.session.add(analysis_task_object)
|
|
db.session.add(analysis_task_object)
|
|
|
- raise ApiException(code=500, msg="PDF parsing failed", msgZH="pdf解析失败")
|
|
|
|
|
|
|
+ raise ApiException(code=500, msg='PDF parsing failed', msgZH='pdf解析失败')
|
|
|
finally:
|
|
finally:
|
|
|
# 执行pending
|
|
# 执行pending
|
|
|
with app.app_context():
|
|
with app.app_context():
|
|
@@ -149,12 +152,12 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
|
|
|
AnalysisTask.update_date.asc()).first()
|
|
AnalysisTask.update_date.asc()).first()
|
|
|
if analysis_task_object:
|
|
if analysis_task_object:
|
|
|
pdf_upload_folder = current_app.config['PDF_UPLOAD_FOLDER']
|
|
pdf_upload_folder = current_app.config['PDF_UPLOAD_FOLDER']
|
|
|
- upload_dir = f"{current_app.static_folder}/{pdf_upload_folder}"
|
|
|
|
|
|
|
+ upload_dir = f'{current_app.static_folder}/{pdf_upload_folder}'
|
|
|
file_path = find_file(analysis_task_object.file_key, upload_dir)
|
|
file_path = find_file(analysis_task_object.file_key, upload_dir)
|
|
|
file_stem = Path(file_path).stem
|
|
file_stem = Path(file_path).stem
|
|
|
pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER']
|
|
pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER']
|
|
|
- pdf_dir = f"{current_app.static_folder}/{pdf_analysis_folder}/{file_stem}"
|
|
|
|
|
- image_dir = f"{pdf_dir}/images"
|
|
|
|
|
|
|
+ pdf_dir = f'{current_app.static_folder}/{pdf_analysis_folder}/{file_stem}'
|
|
|
|
|
+ image_dir = f'{pdf_dir}/images'
|
|
|
with db.auto_commit():
|
|
with db.auto_commit():
|
|
|
analysis_pdf_object = AnalysisPdf.query.filter_by(id=analysis_task_object.analysis_pdf_id).first()
|
|
analysis_pdf_object = AnalysisPdf.query.filter_by(id=analysis_task_object.analysis_pdf_id).first()
|
|
|
analysis_pdf_object.status = 0
|
|
analysis_pdf_object.status = 0
|
|
@@ -164,4 +167,4 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
|
|
|
db.session.add(analysis_task_object)
|
|
db.session.add(analysis_task_object)
|
|
|
analysis_pdf_task(pdf_dir, image_dir, file_path, analysis_task_object.is_ocr, analysis_task_object.analysis_pdf_id)
|
|
analysis_pdf_task(pdf_dir, image_dir, file_path, analysis_task_object.is_ocr, analysis_task_object.analysis_pdf_id)
|
|
|
else:
|
|
else:
|
|
|
- logger.info(f"all task finished!")
|
|
|
|
|
|
|
+ logger.info('all task finished!')
|