doc_analyze_by_pp_structurev2.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. import random
  2. import fitz
  3. import cv2
  4. from paddleocr import PPStructure
  5. from PIL import Image
  6. from loguru import logger
  7. import numpy as np
  def region_to_bbox(region):
      """Collapse a corner-point region into a flat ``[x0, y0, x1, y1]`` box.

      The box is built from the first point (index 0) and the third point
      (index 2) of *region*; the other points are ignored.
      """
      first_corner = region[0]
      third_corner = region[2]
      return [first_corner[0], first_corner[1], third_corner[0], third_corner[1]]
  def dict_compare(d1, d2):
      """Return whether *d1* and *d2* contain exactly the same key/value pairs."""
      # Comparing the items views (rather than hashing the pairs) keeps this
      # usable when values are unhashable, e.g. lists.
      first_items = d1.items()
      second_items = d2.items()
      return first_items == second_items
  def remove_duplicates_dicts(lst):
      """Return the dicts of *lst* with later duplicates dropped.

      Two dicts count as duplicates when their key/value pairs are equal.
      The first occurrence is kept and the original order is preserved.
      """
      kept = []
      for candidate in lst:
          for seen in kept:
              if candidate.items() == seen.items():
                  # An equal dict was already kept; skip this one.
                  break
          else:
              kept.append(candidate)
      return kept
  def load_imags_from_pdf(pdf_bytes: bytes, dpi=200):
      """Render every page of an in-memory PDF to a BGR image.

      Args:
          pdf_bytes: Raw bytes of the PDF document.
          dpi: Rendering resolution. Each page is scaled by ``dpi / 72``
              in both directions.

      Returns:
          A list with one dict per page, in page order. Each dict holds the
          rendered page as a BGR numpy array under ``"img"`` and the pixmap
          size in pixels under ``"width"`` and ``"height"``.
      """
      imgs = []
      # The zoom matrix only depends on ``dpi``, so build it once for all pages.
      mat = fitz.Matrix(dpi / 72, dpi / 72)
      with fitz.open("pdf", pdf_bytes) as doc:
          for index in range(doc.page_count):
              page = doc[index]
              pm = page.get_pixmap(matrix=mat, alpha=False)
              # NOTE: pages are always rendered at the requested dpi. Falling
              # back to an unscaled render (fitz.Matrix(1, 1)) when the width or
              # height exceeds 2000 pixels was considered but is not enabled.
              img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
              # Convert PIL's RGB channel order to OpenCV's BGR order.
              img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
              imgs.append({"img": img, "width": pm.width, "height": pm.height})
      return imgs
  class CustomPaddleModel:
      """Callable wrapper around a ``PPStructure`` layout model.

      Calling an instance runs the model on one image and returns a list of
      dicts: the model's region entries, each tagged with a numeric
      ``category_id``, followed by one entry per OCR span collected from the
      regions' ``res`` payloads. Exact duplicates are removed.
      """

      # Adapts the paddle layout type to the category number.
      #   title: 0            title
      #   text / reference: 1 text
      #   header / footer: 2  abandon
      #   figure: 3           image
      #   figure_caption: 4   image caption
      #   table: 5            table
      #   table_caption: 6    table caption
      #   equation: 8         interline equation block
      # (Number 14, interline equation text, is not assigned here.)
      _CATEGORY_IDS = {
          "title": 0,
          "text": 1,
          "reference": 1,
          "header": 2,
          "footer": 2,
          "figure": 3,
          "figure_caption": 4,
          "table": 5,
          "table_caption": 6,
          "equation": 8,
      }

      # Category number attached to every OCR text span.
      _SPAN_CATEGORY_ID = 15

      def __init__(self, ocr: bool = False, show_log: bool = False):
          """Create the underlying ``PPStructure`` model.

          Args:
              ocr: Forwarded to ``PPStructure`` as its ``ocr`` option.
              show_log: Forwarded to ``PPStructure`` as its ``show_log`` option.
          """
          self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)

      def __call__(self, img):
          """Run the model on *img* and return the normalized detections.

          Args:
              img: The image handed to the ``PPStructure`` model unchanged.

          Returns:
              A list of dicts: region entries (with ``category_id`` and
              ``score`` filled in, ``img`` and ``res`` removed) followed by
              span entries with ``category_id``, ``bbox``, ``score`` and
              ``text`` keys, with exact duplicates removed.
          """
          result = self.model(img)
          spans = []
          for line in result:
              # Drop the region's "img" entry from the output.
              line.pop("img", None)

              region_type = line["type"]
              category_id = self._CATEGORY_IDS.get(region_type)
              if category_id is None:
                  # Unknown types are reported and left without a category_id.
                  logger.warning(f"unknown type: {region_type}")
              else:
                  line["category_id"] = category_id

              # Compatibility with paddleocr versions that do not output a
              # score: substitute a random value in [0.5, 1.0).
              if line.get("score") is None:
                  line["score"] = 0.5 + random.random() * 0.5

              # Lift the region's OCR results out into top-level span entries.
              res = line.pop("res", None)
              if res is not None:
                  for span in res:
                      spans.append(
                          {
                              "category_id": self._SPAN_CATEGORY_ID,
                              "bbox": region_to_bbox(span["text_region"]),
                              "score": span["confidence"],
                              "text": span["text"],
                          }
                      )

          # Spans are appended only after the loop so the list being iterated
          # is never modified mid-iteration.
          if spans:
              result.extend(spans)

          return remove_duplicates_dicts(result)
  def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
      """Run layout analysis on every page of a PDF.

      Args:
          pdf_bytes: Raw bytes of the PDF document.
          ocr: Passed to ``CustomPaddleModel`` to configure the model.
          show_log: Passed to ``CustomPaddleModel`` to configure the model.

      Returns:
          A list with one dict per page, in page order. Each dict has the
          model detections under ``"layout_dets"`` and a ``"page_info"`` dict
          with the zero-based ``page_no`` and the rendered page ``height`` and
          ``width``.
      """
      imgs = load_imags_from_pdf(pdf_bytes)
      # Forward the caller's options so they actually configure the model.
      custom_paddle = CustomPaddleModel(ocr=ocr, show_log=show_log)
      model_json = []
      for index, img_dict in enumerate(imgs):
          result = custom_paddle(img_dict["img"])
          page_info = {
              "page_no": index,
              "height": img_dict["height"],
              "width": img_dict["width"],
          }
          model_json.append({"layout_dets": result, "page_info": page_info})
      return model_json