| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205 |
- # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- from cProfile import label
- import os
- from collections import defaultdict
- import matplotlib.pyplot as plt
- from matplotlib.backends.backend_agg import FigureCanvasAgg
- import numpy as np
- from PIL import Image, ImageOps
- import cv2
- import json
- from .....utils.file_interface import custom_open
- # show data samples
def _parse_det_label(label_str):
    """Parse one JSON text-detection label into a list of point arrays.

    Returns a list of ``(N, 2)`` numpy arrays (one per annotated box), or
    ``None`` when the label is not valid JSON or does not follow the expected
    ``[{"points": [[x, y], ...], "transcription": str}, ...]`` layout.
    """
    try:
        items = json.loads(label_str)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return None
    if not isinstance(items, list):
        return None

    boxes = []
    for item in items:
        if not isinstance(item, dict):
            return None
        if 'points' not in item or 'transcription' not in item:
            return None
        try:
            box = np.array(item['points'])
        except ValueError:
            # Ragged point lists are rejected by recent numpy versions.
            return None
        # Guard ndim first: a flat list of numbers has no second axis.
        if box.ndim != 2 or box.shape[1] != 2:
            return None
        if not isinstance(item['transcription'], str):
            return None
        boxes.append(box)
    return boxes


def simple_analyse(dataset_path, max_recorded_sample_cnts=20, show_label=True):
    """
    Count the samples of each sub-dataset and record a bounded number of them.

    Reads ``train.txt``, ``val.txt`` and ``test.txt`` under ``dataset_path``.
    Every non-empty line is ``<image path><TAB><json label>``; lines of
    ``test.txt`` may omit the label. Only the first
    ``max_recorded_sample_cnts`` samples of each file are recorded, and only
    those have their label content validated.

    Args:
        dataset_path (str): dataset root directory.
        max_recorded_sample_cnts (int, optional): maximum number of samples
            recorded per sub-dataset. Default: 20.
        show_label (bool, optional): if True, the label entries of the result
            are images with the boxes drawn (see ``show_label_img``);
            otherwise they are dicts ``{'img_path': ..., 'box': [...]}``.
            Default: True.

    Returns:
        tuple | list: on success a 10-tuple ``(message, train count,
        val count, test count, train image paths, val image paths,
        test image paths, train labels, val labels, test labels)``.
        On failure a 10-item list whose first item is the error message and
        whose remaining items are ``None``.
    """
    tags = ['train', 'val', 'test']
    delim = '\t'
    invalid_label_msg = "数据集的标注文件不符合规范"

    def _failure(message):
        # Same shape as the success result: message followed by 9 slots.
        return [message] + [None] * 9

    sample_cnts = defaultdict(int)
    img_paths = defaultdict(list)
    lab_records = defaultdict(list)

    for tag in tags:
        file_list = os.path.join(dataset_path, f'{tag}.txt')
        if not os.path.exists(file_list):
            # train/val lists are mandatory, the test list is optional.
            if tag in ('train', 'val'):
                return _failure("数据集不符合规范,请先通过数据校准")
            continue

        with custom_open(file_list, 'r') as f:
            all_lines = f.readlines()

        # Each line corresponds to a sample (blank lines are counted too).
        sample_cnts[tag] = len(all_lines)
        valid_num_parts = (2, ) if tag in ('train', 'val') else (1, 2)

        for line in all_lines:
            content = line.strip("\n")
            if len(content) < 1:
                continue
            parts = content.split(delim)
            # Lines holding a single character are tolerated here, as in the
            # original check.
            if len(parts) not in valid_num_parts and len(content) > 1:
                return _failure(invalid_label_msg)

            if len(parts) == 2:
                img_path, lab_path = parts
            else:
                img_path, lab_path = parts[0], None

            if len(img_paths[tag]) >= max_recorded_sample_cnts:
                continue

            img_path = os.path.join(dataset_path, img_path)
            # Reset per sample so an unlabeled line never reuses the boxes of
            # the previous sample.
            boxes = []
            if lab_path is not None:
                boxes = _parse_det_label(lab_path)
                if boxes is None:
                    return _failure(invalid_label_msg)

            img_paths[tag].append(img_path)
            if show_label:
                lab_records[tag].append(show_label_img(img_path, boxes))
            else:
                lab_records[tag].append({'img_path': img_path, 'box': boxes})

    return ("完成数据分析", sample_cnts[tags[0]], sample_cnts[tags[1]],
            sample_cnts[tags[2]], img_paths[tags[0]], img_paths[tags[1]],
            img_paths[tags[2]], lab_records[tags[0]], lab_records[tags[1]],
            lab_records[tags[2]])
def show_label_img(img_path, dt_boxes):
    """Draw OCR detection boxes on an image.

    Args:
        img_path (str): path of the image to load with ``cv2.imread``.
        dt_boxes (Iterable): boxes to draw; each one is reshaped to an
            ``(N, 2)`` int32 array of points and drawn as a closed polygon.

    Returns:
        numpy.ndarray: the annotated image with its channel axis reversed
        (OpenCV loads BGR, so the result is RGB).

    Raises:
        ValueError: if the image cannot be read.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Failed to read image: {img_path}")

    for box in dt_boxes:
        polygon = np.array(box).astype(np.int32).reshape(-1, 2)
        cv2.polylines(img, [polygon], True, color=(0, 255, 0), thickness=3)
    return img[:, :, ::-1]
def _render_ratio_histogram(values, xlabel, color):
    """Plot a histogram of ``values`` and return it as an (H, W, 3) RGB uint8 array."""
    # One bin per 10 units of spread; Axes.hist needs at least one bin, which
    # also covers an empty list or values that are all (nearly) equal.
    if values:
        bins = max(1, int((max(values) - min(values)) // 10))
    else:
        bins = 1

    fig, ax = plt.subplots()
    try:
        ax.hist(values, bins=bins, rwidth=0.8, color=color)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('number')
        canvas = FigureCanvasAgg(fig)
        canvas.draw()
        # buffer_rgba() replaces the deprecated tostring_rgb(); drop alpha.
        rgba = np.asarray(canvas.buffer_rgba())
        return rgba[:, :, :3].copy()
    finally:
        # Release the figure so repeated calls do not accumulate in pyplot.
        plt.close(fig)


def deep_analyse(dataset_path, output):
    """Plot the distribution of box width/height relative to the image size.

    Collects every sample of the dataset through ``simple_analyse``, computes
    for each box its bounding width and height divided by the image width and
    height (scaled by 1000), and writes the two histograms side by side to
    ``<output>/histogram.png``.

    Args:
        dataset_path (str): dataset root directory.
        output (str): directory the histogram image is written to; created
            if it does not exist.

    Returns:
        dict: ``{"histogram": "check_dataset/histogram.png"}`` (joined with
        the platform path separator).

    Raises:
        ValueError: if ``simple_analyse`` reports an invalid dataset.
    """
    sample_results = simple_analyse(
        dataset_path, max_recorded_sample_cnts=float('inf'), show_label=False)
    # On failure simple_analyse returns its message followed by None slots.
    if any(part is None for part in sample_results[-3:]):
        raise ValueError(str(sample_results[0]))
    lab_infos = sample_results[-3] + sample_results[-2] + sample_results[-1]

    ratios_w = []
    ratios_h = []
    for info in lab_infos:
        with Image.open(info['img_path']) as raw_img:
            # Apply the EXIF orientation so width/height match the label
            # coordinates.
            img_w, img_h = ImageOps.exif_transpose(raw_img).size
        for box in info['box']:
            box = np.array(box).astype(np.int32).reshape(-1, 2)
            box_w, box_h = np.max(box, axis=0) - np.min(box, axis=0)
            ratios_w.append(box_w / img_w * 1000)
            ratios_h.append(box_h / img_h * 1000)

    width_hist = _render_ratio_histogram(ratios_w, 'Width rate *1000',
                                         'yellowgreen')
    height_hist = _render_ratio_histogram(ratios_h, 'Height rate *1000',
                                          'pink')

    os.makedirs(output, exist_ok=True)
    fig_path = os.path.join(output, "histogram.png")
    img_array = np.concatenate((width_hist, height_hist), axis=1)
    # The plots are RGB while cv2.imwrite expects BGR channel order.
    cv2.imwrite(fig_path, cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR))
    return {"histogram": os.path.join("check_dataset", "histogram.png")}
|