|
@@ -1,9 +1,12 @@
|
|
|
-import re
|
|
|
|
|
|
|
+import time
|
|
|
|
|
+from loguru import logger
|
|
|
|
|
|
|
|
|
|
+from mineru.utils.config_reader import get_llm_aided_config
|
|
|
from mineru.utils.cut_image import cut_image_and_table
|
|
from mineru.utils.cut_image import cut_image_and_table
|
|
|
from mineru.utils.enum_class import BlockType, ContentType
|
|
from mineru.utils.enum_class import BlockType, ContentType
|
|
|
from mineru.utils.hash_utils import str_md5
|
|
from mineru.utils.hash_utils import str_md5
|
|
|
from mineru.backend.vlm.vlm_magic_model import MagicModel
|
|
from mineru.backend.vlm.vlm_magic_model import MagicModel
|
|
|
|
|
+from mineru.utils.llm_aided import llm_aided_title
|
|
|
from mineru.version import __version__
|
|
from mineru.version import __version__
|
|
|
|
|
|
|
|
|
|
|
|
@@ -48,6 +51,19 @@ def result_to_middle_json(token_list, images_list, pdf_doc, image_writer):
|
|
|
image_dict = images_list[index]
|
|
image_dict = images_list[index]
|
|
|
page_info = token_to_page_info(token, image_dict, page, image_writer, index)
|
|
page_info = token_to_page_info(token, image_dict, page, image_writer, index)
|
|
|
middle_json["pdf_info"].append(page_info)
|
|
middle_json["pdf_info"].append(page_info)
|
|
|
|
|
+
|
|
|
|
|
+ """llm优化"""
|
|
|
|
|
+ llm_aided_config = get_llm_aided_config()
|
|
|
|
|
+
|
|
|
|
|
+ if llm_aided_config is not None:
|
|
|
|
|
+ """标题优化"""
|
|
|
|
|
+ title_aided_config = llm_aided_config.get('title_aided', None)
|
|
|
|
|
+ if title_aided_config is not None:
|
|
|
|
|
+ if title_aided_config.get('enable', False):
|
|
|
|
|
+ llm_aided_title_start_time = time.time()
|
|
|
|
|
+ llm_aided_title(middle_json["pdf_info"], title_aided_config)
|
|
|
|
|
+ logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}')
|
|
|
|
|
+
|
|
|
# 关闭pdf文档
|
|
# 关闭pdf文档
|
|
|
pdf_doc.close()
|
|
pdf_doc.close()
|
|
|
return middle_json
|
|
return middle_json
|