Преглед на файлове

perf(language_detection): optimize batch size for language detection model

- Increase batch size from 8 to 256 for language detection inference
- Add timing measurement for language detection process
myhloli преди 9 месеца
родител
ревизия
e4e4eef1f8
променени са 2 файла, в които са добавени 15 реда и са изтрити 4 реда
  1. 11 1
      magic_pdf/libs/pdf_check.py
  2. 4 3
      magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py

+ 11 - 1
magic_pdf/libs/pdf_check.py

@@ -4,6 +4,7 @@ from loguru import logger
 import re
 from io import BytesIO
 from pdfminer.high_level import extract_text
+from pdfminer.layout import LAParams
 
 
 def calculate_sample_count(total_page: int):
@@ -41,7 +42,16 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
     sample_docs = extract_pages(src_pdf_bytes)
     sample_pdf_bytes = sample_docs.tobytes()
     sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
-    text = extract_text(sample_pdf_file_like_object)
+    laparams = LAParams(
+        line_overlap=0.5,
+        char_margin=2.0,
+        line_margin=0.5,
+        word_margin=0.1,
+        boxes_flow=None,
+        detect_vertical=False,
+        all_texts=False,
+    )
+    text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams)
     text = text.replace("\n", "")
     # logger.info(text)
     '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''

+ 4 - 3
magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py

@@ -1,4 +1,5 @@
 # Copyright (c) Opendatalab. All rights reserved.
+import time
 from collections import Counter
 from uuid import uuid4
 
@@ -102,9 +103,9 @@ class YOLOv11LangDetModel(object):
             temp_images = split_images(image)
             for temp_image in temp_images:
                 all_images.append(resize_images_to_224(temp_image))
-
-        images_lang_res = self.batch_predict(all_images, batch_size=8)
-        # logger.info(f"images_lang_res: {images_lang_res}")
+        # langdetect_start = time.time()
+        images_lang_res = self.batch_predict(all_images, batch_size=256)
+        # logger.info(f"image number of langdetect: {len(images_lang_res)}, langdetect time: {round(time.time() - langdetect_start, 2)}")
         if len(images_lang_res) > 0:
             count_dict = Counter(images_lang_res)
             language = max(count_dict, key=count_dict.get)