"""Evaluate how ``sort_with_layout`` orders ground-truth page boxes.

For every annotated page in the validation dataset, the ground-truth boxes
(minus a few ignored categories) are passed through ``sort_with_layout`` and
the returned order is scored with ``cal_edit_distance``.  The mean score and
the per-page records, sorted by descending edit distance, are written to
``vali_bbox_sort_result.json``.

Only the sort order is evaluated here; box detection quality is out of scope
for this script.
"""
import json

import numpy as np
import tqdm

from validation import cal_edit_distance, format_gt_bbox
from magic_pdf.layout.layout_sort import sort_with_layout

# Validation dataset to read and the report file to write.
DATASET_PATH = '/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset_final_rotated_formulafix_highdpi_scihub.json'
RESULT_PATH = 'vali_bbox_sort_result.json'

# Categories dropped before sorting.  Captions are excluded because the case
# where a caption shares the same order index as its figure/table is not
# handled yet.
# NOTE(review): 'inline_fomula' is spelled without the 'r' while
# 'formula_caption' has it; these strings must match the dataset's label
# names -- TODO confirm which spelling the annotations actually use.
IGNORE_CATEGORIES = (
    'abandon',
    'figure_caption',
    'table_caption',
    'formula_caption',
    'inline_fomula',
)

# JSON is UTF-8 by specification; do not depend on the locale's encoding.
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
    samples = json.load(f)

edit_distance_dict = []  # one record per page that produced a sorted result
edit_distance_list = []  # bare scores, used for the mean

# tqdm wraps the sized collection (not the enumerate iterator) so that the
# progress bar can show a total and an ETA.
for i, sample in enumerate(tqdm.tqdm(samples)):
    s3_pdf_path = sample['s3_path']
    page_num = sample['page']
    gt_annos = sample['annotations']
    page_width = gt_annos['width']
    page_height = gt_annos['height']

    # Measure the accuracy of the sort function on the ground-truth boxes.
    # A fresh list is passed on every call, as in the original script.
    gt_bboxes = format_gt_bbox(gt_annos, list(IGNORE_CATEGORIES))
    sorted_bboxes, _ = sort_with_layout(gt_bboxes, page_width, page_height)

    # Pages for which the sorter returned nothing are left out of the report.
    if not sorted_bboxes:
        continue

    edit_distance = cal_edit_distance(sorted_bboxes)
    edit_distance_list.append(edit_distance)
    edit_distance_dict.append({
        "sample_id": i,
        "s3_path": s3_pdf_path,
        "page_num": page_num,
        # NOTE(review): key name kept unchanged for existing consumers of the
        # report; the value is the sample's 'page_path'.
        "page_s2_path": sample['page_path'],
        "edit_distance": edit_distance
    })

# np.mean([]) yields nan (with a RuntimeWarning), which json.dump would emit
# as the non-standard token NaN.  Report null when no page was scored.
if edit_distance_list:
    edit_distance_mean = float(np.mean(edit_distance_list))
else:
    edit_distance_mean = None
print('edit_distance_mean', edit_distance_mean)

# Largest edit distance first.
edit_distance_dict_sorted = sorted(
    edit_distance_dict,
    key=lambda x: x['edit_distance'],
    reverse=True,
)

result = {
    "edit_distance_mean": edit_distance_mean,
    "edit_distance_dict_sorted": edit_distance_dict_sorted
}
with open(RESULT_PATH, 'w', encoding='utf-8') as f:
    json.dump(result, f)