# enhanced_doc_orientation.py (7.2 KB)
# zhch/custom_modules/enhanced_doc_orientation.py
"""Enhanced document orientation classifier combining OCR and CNN."""
from typing import List, Optional, Union

import cv2
import numpy as np
import onnxruntime
from paddlex import create_model
from PIL import Image
  9. class EnhancedDocOrientationClassify:
  10. """
  11. 增强版文档方向分类器
  12. 参考 MinerU 的多阶段判断逻辑:
  13. 1. 快速过滤: 宽高比检查
  14. 2. OCR 分析: 文本框方向判断
  15. 3. CNN 分类: 精确角度预测
  16. """
  17. def __init__(
  18. self,
  19. cnn_model_name: str = "PP-LCNet_x1_0_doc_ori",
  20. ocr_model_name: str = "PP-OCRv5_server_det",
  21. aspect_ratio_threshold: float = 1.2,
  22. vertical_box_ratio_threshold: float = 0.28,
  23. min_vertical_boxes: int = 3,
  24. text_box_aspect_ratio: float = 0.8,
  25. ):
  26. """
  27. Args:
  28. cnn_model_name: CNN 方向分类模型名称
  29. ocr_model_name: OCR 文本检测模型名称
  30. aspect_ratio_threshold: 图像宽高比阈值(> 此值才进行 OCR 检测)
  31. vertical_box_ratio_threshold: 垂直文本框占比阈值
  32. min_vertical_boxes: 最少垂直文本框数量
  33. text_box_aspect_ratio: 文本框宽高比阈值(< 此值为垂直文本)
  34. """
  35. # 1. 加载 CNN 方向分类模型
  36. self.cnn_model = create_model(cnn_model_name)
  37. # 2. 加载 OCR 文本检测模型
  38. self.ocr_detector = create_model(ocr_model_name)
  39. # 3. 参数设置
  40. self.aspect_ratio_threshold = aspect_ratio_threshold
  41. self.vertical_box_ratio_threshold = vertical_box_ratio_threshold
  42. self.min_vertical_boxes = min_vertical_boxes
  43. self.text_box_aspect_ratio = text_box_aspect_ratio
  44. # 4. 标签映射
  45. self.labels = ["0", "90", "180", "270"]
  46. def predict(
  47. self,
  48. img: Union[str, np.ndarray, Image.Image],
  49. use_ocr_filter: bool = True,
  50. batch_size: int = 1,
  51. ) -> dict:
  52. """
  53. 预测图像方向
  54. Args:
  55. img: 输入图像
  56. use_ocr_filter: 是否使用 OCR 过滤(False 则直接使用 CNN)
  57. batch_size: 批处理大小
  58. Returns:
  59. {
  60. "orientation": "0", # "0", "90", "180", "270"
  61. "confidence": 0.95,
  62. "method": "ocr_filter" or "cnn",
  63. "details": {...}
  64. }
  65. """
  66. # 统一输入格式为 numpy array
  67. if isinstance(img, str):
  68. img = cv2.imread(img)
  69. img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
  70. elif isinstance(img, Image.Image):
  71. img = np.array(img)
  72. # ============================================
  73. # 阶段1: 快速过滤(宽高比检查)
  74. # ============================================
  75. img_height, img_width = img.shape[:2]
  76. img_aspect_ratio = img_height / img_width if img_width > 0 else 1.0
  77. if not use_ocr_filter or img_aspect_ratio <= self.aspect_ratio_threshold:
  78. # 横向图像,直接返回 0 度
  79. return {
  80. "orientation": "0",
  81. "confidence": 1.0,
  82. "method": "aspect_ratio_filter",
  83. "details": {
  84. "img_aspect_ratio": img_aspect_ratio,
  85. "threshold": self.aspect_ratio_threshold,
  86. "reason": "Image is landscape, no rotation needed"
  87. }
  88. }
  89. # ============================================
  90. # 阶段2: OCR 文本框分析
  91. # ============================================
  92. # 转换为 BGR(PaddleOCR 需要)
  93. bgr_img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
  94. # OCR 检测文本框
  95. det_result = self.ocr_detector.predict(bgr_img)
  96. det_boxes = det_result.get("boxes", [])
  97. if not det_boxes:
  98. # 没有检测到文本框,使用 CNN
  99. return self._cnn_predict(img, reason="No text boxes detected")
  100. # 分析文本框的方向
  101. vertical_count = 0
  102. for box in det_boxes:
  103. # 提取文本框坐标
  104. coords = box.get("coordinate", [])
  105. if len(coords) < 4:
  106. continue
  107. # 计算文本框的宽度和高度
  108. # PaddleX 返回的是 [x1, y1, x2, y2] 格式
  109. x1, y1, x2, y2 = coords[:4]
  110. width = abs(x2 - x1)
  111. height = abs(y2 - y1)
  112. aspect_ratio = width / height if height > 0 else 1.0
  113. # 判断是否为垂直文本框
  114. if aspect_ratio < self.text_box_aspect_ratio:
  115. vertical_count += 1
  116. # 计算垂直文本框占比
  117. vertical_ratio = vertical_count / len(det_boxes) if det_boxes else 0
  118. # 判断是否需要旋转
  119. is_rotated = (
  120. vertical_ratio >= self.vertical_box_ratio_threshold
  121. and vertical_count >= self.min_vertical_boxes
  122. )
  123. if not is_rotated:
  124. # 文本框正常,不需要旋转
  125. return {
  126. "orientation": "0",
  127. "confidence": 1.0,
  128. "method": "ocr_filter",
  129. "details": {
  130. "total_boxes": len(det_boxes),
  131. "vertical_boxes": vertical_count,
  132. "vertical_ratio": vertical_ratio,
  133. "threshold": self.vertical_box_ratio_threshold,
  134. "reason": "Text boxes are mostly horizontal"
  135. }
  136. }
  137. # ============================================
  138. # 阶段3: CNN 精确分类
  139. # ============================================
  140. return self._cnn_predict(
  141. img,
  142. reason=f"OCR detected rotation (vertical_ratio={vertical_ratio:.2f})",
  143. ocr_details={
  144. "total_boxes": len(det_boxes),
  145. "vertical_boxes": vertical_count,
  146. "vertical_ratio": vertical_ratio,
  147. }
  148. )
  149. def _cnn_predict(self, img: np.ndarray, reason: str = "", ocr_details: dict = None) -> dict:
  150. """使用 CNN 模型预测方向"""
  151. # 转换为 PIL Image(PaddleX 需要)
  152. if isinstance(img, np.ndarray):
  153. img = Image.fromarray(img)
  154. # CNN 推理
  155. result = self.cnn_model.predict(img)
  156. # 提取结果
  157. orientation = result.get("label", "0")
  158. confidence = result.get("score", 0.0)
  159. return {
  160. "orientation": orientation,
  161. "confidence": confidence,
  162. "method": "cnn",
  163. "details": {
  164. "reason": reason,
  165. "ocr_analysis": ocr_details or {},
  166. "cnn_scores": result.get("label_names", [])
  167. }
  168. }
  169. def batch_predict(
  170. self,
  171. imgs: List[Union[str, np.ndarray, Image.Image]],
  172. use_ocr_filter: bool = True,
  173. batch_size: int = 8,
  174. ) -> List[dict]:
  175. """批量预测"""
  176. results = []
  177. for img in imgs:
  178. result = self.predict(img, use_ocr_filter=use_ocr_filter)
  179. results.append(result)
  180. return results