|
|
@@ -372,7 +372,6 @@ def parse_ppstructv3_data(data: Dict, config: Dict) -> List[Dict]:
|
|
|
|
|
|
return parsed_data
|
|
|
|
|
|
-
|
|
|
def parse_table_recognition_v2_data(data: Dict, config: Dict) -> List[Dict]:
|
|
|
tool_config = config['ocr']['tools']['table_recognition_v2']
|
|
|
parsed_data = []
|
|
|
@@ -385,31 +384,42 @@ def parse_table_recognition_v2_data(data: Dict, config: Dict) -> List[Dict]:
|
|
|
continue
|
|
|
|
|
|
html_text = item.get(tool_config['text_field'], '')
|
|
|
- bbox = item.get(tool_config['bbox_field'], [])
|
|
|
- if bbox and len(bbox) >= 4:
|
|
|
- bbox = bbox[:4]
|
|
|
+
|
|
|
+ # Compute the overall bbox of the table
|
|
|
+ cell_boxes_raw = item.get(tool_config['bbox_field'], [])
|
|
|
+ if cell_boxes_raw:
|
|
|
+ x1_list = [box[0] for box in cell_boxes_raw]
|
|
|
+ y1_list = [box[1] for box in cell_boxes_raw]
|
|
|
+ x2_list = [box[2] for box in cell_boxes_raw]
|
|
|
+ y2_list = [box[3] for box in cell_boxes_raw]
|
|
|
+ table_bbox = [
|
|
|
+ float(min(x1_list)),
|
|
|
+ float(min(y1_list)),
|
|
|
+ float(max(x2_list)),
|
|
|
+ float(max(y2_list))
|
|
|
+ ]
|
|
|
else:
|
|
|
- bbox = [0, 0, 0, 0]
|
|
|
+ table_bbox = [0.0, 0.0, 0.0, 0.0]
|
|
|
|
|
|
parsed_data.append({
|
|
|
'text': str(html_text).strip(),
|
|
|
- 'bbox': bbox,
|
|
|
+ 'bbox': table_bbox,
|
|
|
'category': item.get(tool_config.get('category_field', ''), 'table'),
|
|
|
'confidence': item.get(tool_config.get('confidence_field', ''), config['ocr']['default_confidence']),
|
|
|
'source_tool': 'table_recognition_v2',
|
|
|
})
|
|
|
|
|
|
- rec_texts = get_nested_value(data, tool_config.get('rec_texts_field', ''))
|
|
|
- rec_boxes = get_nested_value(data, tool_config.get('rec_boxes_field', ''))
|
|
|
- if isinstance(rec_texts, list) and isinstance(rec_boxes, list):
|
|
|
- for i, (text, box) in enumerate(zip(rec_texts, rec_boxes)):
|
|
|
- if text and isinstance(box, list) and len(box) >= 4:
|
|
|
- parsed_data.append({
|
|
|
- 'text': str(text).strip(),
|
|
|
- 'bbox': box[:4],
|
|
|
- 'category': 'OCR_Text',
|
|
|
- 'source_tool': 'ppstructv3_ocr'
|
|
|
- })
|
|
|
+ rec_texts = get_nested_value(item, tool_config.get('rec_texts_field', ''))
|
|
|
+ rec_boxes = get_nested_value(item, tool_config.get('rec_boxes_field', ''))
|
|
|
+ if isinstance(rec_texts, list) and isinstance(rec_boxes, list):
|
|
|
+ for i, (text, box) in enumerate(zip(rec_texts, rec_boxes)):
|
|
|
+ if text and isinstance(box, list) and len(box) >= 4:
|
|
|
+ parsed_data.append({
|
|
|
+ 'text': str(text).strip(),
|
|
|
+ 'bbox': box[:4],
|
|
|
+ 'category': 'OCR_Text',
|
|
|
+ 'source_tool': 'table_recognition_v2'
|
|
|
+ })
|
|
|
|
|
|
return parsed_data
|
|
|
|