Просмотр исходного кода

fix: refactor PDF processing logic to ensure proper resource management and improve error handling

myhloli 3 месяца назад
Родитель
Коммит
e429c5a840
1 изменённый файл: 19 добавлено и 13 удалено
  1. 19 13
      mineru/utils/pdf_classify.py

+ 19 - 13
mineru/utils/pdf_classify.py

@@ -24,11 +24,11 @@ def classify(pdf_bytes):
     Returns:
         str: 'txt' 表示可以直接提取文本,'ocr' 表示需要OCR
     """
-    try:
-        # 从字节数据加载PDF
-        sample_pdf_bytes = extract_pages(pdf_bytes)
-        pdf = pdfium.PdfDocument(sample_pdf_bytes)
 
+    # 从字节数据加载PDF
+    sample_pdf_bytes = extract_pages(pdf_bytes)
+    pdf = pdfium.PdfDocument(sample_pdf_bytes)
+    try:
         # 获取PDF页数
         page_count = len(pdf)
 
@@ -42,19 +42,25 @@ def classify(pdf_bytes):
         # 设置阈值:如果每页平均少于50个有效字符,认为需要OCR
         chars_threshold = 50
 
+        # 检查平均字符数和无效字符
         if (get_avg_cleaned_chars_per_page(pdf, pages_to_check) < chars_threshold) or detect_invalid_chars(sample_pdf_bytes):
             return 'ocr'
-        else:
 
-            if get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check) >= 0.8:
-                return 'ocr'
+        # 检查图像覆盖率
+        if get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check) >= 0.8:
+            return 'ocr'
+
+        return 'txt'
 
-            return 'txt'
     except Exception as e:
         logger.error(f"判断PDF类型时出错: {e}")
         # 出错时默认使用OCR
         return 'ocr'
 
+    finally:
+        # 无论执行哪个路径,都确保PDF被关闭
+        pdf.close()
+
 
 def get_avg_cleaned_chars_per_page(pdf_doc, pages_to_check):
     # 总字符数
@@ -78,8 +84,6 @@ def get_avg_cleaned_chars_per_page(pdf_doc, pages_to_check):
 
     # logger.debug(f"PDF分析: 平均每页清理后{avg_cleaned_chars_per_page:.1f}字符")
 
-    pdf_doc.close()  # 关闭PDF文档
-
     return avg_cleaned_chars_per_page
 
 
@@ -158,6 +162,9 @@ def get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check):
 
         page_count += 1
 
+    # 关闭资源
+    pdf_stream.close()
+
     # 如果没有处理任何页面,返回0
     if page_count == 0:
         return 0.0
@@ -166,9 +173,6 @@ def get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check):
     high_coverage_ratio = high_image_coverage_pages / page_count
     # logger.debug(f"PDF分析: 高图像覆盖页面比例: {high_coverage_ratio:.2f}")
 
-    # 关闭资源
-    pdf_stream.close()
-
     return high_coverage_ratio
 
 
@@ -205,6 +209,7 @@ def extract_pages(src_pdf_bytes: bytes) -> bytes:
     try:
         # 将选择的页面导入新文档
         sample_docs.import_pages(pdf, page_indices)
+        pdf.close()
 
         # 将新PDF保存到内存缓冲区
         output_buffer = BytesIO()
@@ -213,6 +218,7 @@ def extract_pages(src_pdf_bytes: bytes) -> bytes:
         # 获取字节数据
         return output_buffer.getvalue()
     except Exception as e:
+        pdf.close()
         logger.exception(e)
         return b''  # 出错时返回空字节