Преглед на файлове

fix: add new enum values and improve MIN_BATCH_INFERENCE_SIZE documentation in pipeline_analyze.py

myhloli преди 5 месеца
родител
ревизия
58b8e8a912
променени са 2 файла, в които са добавени 8 реда и са изтрити 2 реда
  1. +6 -2
      mineru/backend/pipeline/pipeline_analyze.py
  2. +2 -0
      mineru/utils/enum_class.py

+ 6 - 2
mineru/backend/pipeline/pipeline_analyze.py

@@ -76,7 +76,11 @@ def doc_analyze(
         formula_enable=True,
         table_enable=True,
 ):
-    MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
+    """
+    适当调大MIN_BATCH_INFERENCE_SIZE可以提高性能,可能会增加显存使用量,
+    可通过环境变量MINERU_MIN_BATCH_INFERENCE_SIZE设置,默认值为100。
+    """
+    min_batch_inference_size = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
 
     # 收集所有页面信息
     all_pages_info = []  # 存储(dataset_index, page_index, img, ocr, lang, width, height)
@@ -109,7 +113,7 @@ def doc_analyze(
 
     # 准备批处理
     images_with_extra_info = [(info[2], info[3], info[4]) for info in all_pages_info]
-    batch_size = MIN_BATCH_INFERENCE_SIZE
+    batch_size = min_batch_inference_size
     batch_images = [
         images_with_extra_info[i:i + batch_size]
         for i in range(0, len(images_with_extra_info), batch_size)

+ 2 - 0
mineru/utils/enum_class.py

@@ -33,9 +33,11 @@ class CategoryId:
     TableCaption = 6
     TableFootnote = 7
     InterlineEquation_Layout = 8
+    InterlineEquationNumber_Layout = 9
     InlineEquation = 13
     InterlineEquation_YOLO = 14
     OcrText = 15
+    LowScoreText = 16
     ImageFootnote = 101