# doc_analyze_by_pp_structurev2.py
  1. import random
  2. import fitz
  3. import cv2
  4. from paddleocr import PPStructure
  5. from PIL import Image
  6. from loguru import logger
  7. import numpy as np
  8. def region_to_bbox(region):
  9. x0 = region[0][0]
  10. y0 = region[0][1]
  11. x1 = region[2][0]
  12. y1 = region[2][1]
  13. return [x0, y0, x1, y1]
  14. def dict_compare(d1, d2):
  15. return d1.items() == d2.items()
  16. def remove_duplicates_dicts(lst):
  17. unique_dicts = []
  18. for dict_item in lst:
  19. if not any(dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts):
  20. unique_dicts.append(dict_item)
  21. return unique_dicts
  22. def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
  23. ocr_engine = PPStructure(table=False, ocr=ocr, show_log=show_log)
  24. imgs = []
  25. with fitz.open("pdf", pdf_bytes) as doc:
  26. for index in range(0, doc.page_count):
  27. page = doc[index]
  28. dpi = 200
  29. mat = fitz.Matrix(dpi / 72, dpi / 72)
  30. pm = page.get_pixmap(matrix=mat, alpha=False)
  31. # if width or height > 2000 pixels, don't enlarge the image
  32. # if pm.width > 2000 or pm.height > 2000:
  33. # pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
  34. img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
  35. img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
  36. img_dict = {
  37. "img": img,
  38. "width": pm.width,
  39. "height": pm.height
  40. }
  41. imgs.append(img_dict)
  42. model_json = []
  43. for index, img_dict in enumerate(imgs):
  44. img = img_dict['img']
  45. page_width = img_dict['width']
  46. page_height = img_dict['height']
  47. result = ocr_engine(img)
  48. spans = []
  49. need_remove = []
  50. for line in result:
  51. line.pop('img')
  52. '''
  53. 为paddle输出适配type no.
  54. title: 0 # 标题
  55. text: 1 # 文本
  56. header: 2 # abandon
  57. footer: 2 # abandon
  58. reference: 1 # 文本 or abandon
  59. equation: 8 # 行间公式 block
  60. equation: 14 # 行间公式 text
  61. figure: 3 # 图片
  62. figure_caption: 4 # 图片描述
  63. table: 5 # 表格
  64. table_caption: 6 # 表格描述
  65. '''
  66. if line['type'] == 'title':
  67. line['category_id'] = 0
  68. elif line['type'] in ['text', 'reference']:
  69. line['category_id'] = 1
  70. elif line['type'] == 'figure':
  71. line['category_id'] = 3
  72. elif line['type'] == 'figure_caption':
  73. line['category_id'] = 4
  74. elif line['type'] == 'table':
  75. line['category_id'] = 5
  76. elif line['type'] == 'table_caption':
  77. line['category_id'] = 6
  78. elif line['type'] == 'equation':
  79. line['category_id'] = 8
  80. elif line['type'] in ['header', 'footer']:
  81. line['category_id'] = 2
  82. else:
  83. logger.warning(f"unknown type: {line['type']}")
  84. line['score'] = 0.5 + random.random() * 0.5
  85. res = line.pop('res', None)
  86. if res is not None and len(res) > 0:
  87. for span in res:
  88. new_span = {'category_id': 15,
  89. 'bbox': region_to_bbox(span['text_region']),
  90. 'score': span['confidence'],
  91. 'text': span['text']
  92. }
  93. spans.append(new_span)
  94. if len(spans) > 0:
  95. result.extend(spans)
  96. result = remove_duplicates_dicts(result)
  97. page_info = {
  98. "page_no": index,
  99. "height": page_height,
  100. "width": page_width
  101. }
  102. page_dict = {
  103. "layout_dets": result,
  104. "page_info": page_info
  105. }
  106. model_json.append(page_dict)
  107. return model_json