# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pickle
import traceback
import os.path as osp
import multiprocessing as mp

from .cls_dataset import ClsDataset
from .det_dataset import DetDataset
from .seg_dataset import SegDataset
from .ins_seg_dataset import InsSegDataset
from ..utils import (set_folder_status, get_folder_status, DatasetStatus,
                     DownloadStatus, download, list_files)

# Demo dataset archives. `dataset_url_list` is indexed by the project-type
# enum value (see `download_demo_dataset`); `dataset_url_dict` is keyed by
# dataset type name.
dataset_url_list = [
    'https://bj.bcebos.com/paddlex/demos/vegetables_cls.tar.gz',
    'https://bj.bcebos.com/paddlex/demos/insect_det.tar.gz',
    'https://bj.bcebos.com/paddlex/demos/optic_disc_seg.tar.gz',
    'https://bj.bcebos.com/paddlex/demos/xiaoduxiong_ins_det.tar.gz',
    'https://bj.bcebos.com/paddlex/demos/remote_sensing_seg.tar.gz'
]
dataset_url_dict = {
    'classification':
    'https://bj.bcebos.com/paddlex/demos/vegetables_cls.tar.gz',
    'detection': 'https://bj.bcebos.com/paddlex/demos/insect_det.tar.gz',
    'segmentation':
    'https://bj.bcebos.com/paddlex/demos/optic_disc_seg.tar.gz',
    'instance_segmentation':
    'https://bj.bcebos.com/paddlex/demos/xiaoduxiong_ins_det.tar.gz'
}

def _check_and_copy(dataset, dataset_path, source_path):
    """Validate the dataset under `source_path`, then copy it into
    `dataset_path`, recording progress and failures as folder status."""
    try:
        dataset.check_dataset(source_path)
    except Exception:
        error_info = traceback.format_exc()
        set_folder_status(dataset_path, DatasetStatus.XCHECKFAIL, error_info)
        return
    set_folder_status(dataset_path, DatasetStatus.XCOPYING, os.getpid())
    try:
        dataset.copy_dataset(source_path, dataset.all_files)
    except Exception:
        error_info = traceback.format_exc()
        set_folder_status(dataset_path, DatasetStatus.XCOPYFAIL, error_info)
        return
    # If the uploaded dataset already comes with a train/val split,
    # mark it as split.
    if len(dataset.train_files) != 0:
        set_folder_status(dataset_path, DatasetStatus.XSPLITED)

def import_dataset(dataset_id, dataset_type, dataset_path, source_path):
    """Validate and copy a dataset in a background process.

    Returns the started `multiprocessing.Process`; progress can be
    polled with `get_dataset_status`."""
    set_folder_status(dataset_path, DatasetStatus.XCHECKING)
    if dataset_type == 'classification':
        ds = ClsDataset(dataset_id, dataset_path)
    elif dataset_type == 'detection':
        ds = DetDataset(dataset_id, dataset_path)
    elif dataset_type == 'segmentation':
        ds = SegDataset(dataset_id, dataset_path)
    elif dataset_type == 'instance_segmentation':
        ds = InsSegDataset(dataset_id, dataset_path)
    else:
        raise ValueError("Unknown dataset type: {}".format(dataset_type))
    p = mp.Process(
        target=_check_and_copy, args=(ds, dataset_path, source_path))
    p.start()
    return p
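
# A minimal usage sketch, assuming a prepared upload directory; the id and
# paths below are hypothetical:
#
#   p = import_dataset(1, 'classification', '/workspace/datasets/D0001',
#                      '/workspace/uploads/vegetables_cls')
#   p.join()  # or poll get_dataset_status() instead of blocking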

def _download_proc(url, target_path, dataset_type):
    """Download the dataset archive into `target_path/dataset_type` and
    decompress it there."""
    # Download the dataset archive
    from paddlex.utils import decompress
    target_path = osp.join(target_path, dataset_type)
    fname = download(url, target_path)
    # Decompress the archive
    decompress(fname)
    set_folder_status(target_path, DownloadStatus.XDDECOMPRESSED)

def download_demo_dataset(prj_type, target_path):
    """Download and decompress a demo dataset in a background process.

    `prj_type` is a project-type enum whose `value` indexes
    `dataset_url_list` and whose `name` names the target subdirectory."""
    url = dataset_url_list[prj_type.value]
    dataset_type = prj_type.name
    p = mp.Process(
        target=_download_proc, args=(url, target_path, dataset_type))
    p.start()
    return p
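
# Usage sketch; `ProjectType` is a hypothetical stand-in for whatever enum
# the caller uses (it only needs `.value` and `.name` as described above):
#
#   p = download_demo_dataset(ProjectType.classification, '/workspace/demos')
#   p.join()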

def get_dataset_status(dataset_id, dataset_type, dataset_path):
    """Return the current (status, message) of a dataset folder.

    While copying, `message` is a dict with the worker `pid` and the copy
    `percent`. If the statistics file is missing after a finished import,
    the import is re-triggered and the status reverts to XCHECKING."""
    status, message = get_folder_status(dataset_path, True)
    if status is None:
        status = DatasetStatus.XEMPTY
    if status == DatasetStatus.XCOPYING:
        items = message.strip().split()
        pid = None
        if len(items) < 2:
            percent = 0.0
        else:
            pid = int(items[0])
            if int(items[1]) == 0:
                percent = 1.0
            else:
                # Estimate progress from the number of files already
                # present in the target folder.
                copied_files_num = len(list_files(dataset_path)) - 1
                percent = copied_files_num * 1.0 / int(items[1])
        message = {'pid': pid, 'percent': percent}
    if status == DatasetStatus.XCOPYDONE or status == DatasetStatus.XSPLITED:
        if not osp.exists(osp.join(dataset_path, 'statis.pkl')):
            # The statistics file is missing; re-run the import to rebuild it.
            p = import_dataset(dataset_id, dataset_type, dataset_path,
                               dataset_path)
            status = DatasetStatus.XCHECKING
    return status, message
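
# Polling sketch; the id and path are hypothetical:
#
#   status, message = get_dataset_status(1, 'classification', dataset_path)
#   if status == DatasetStatus.XCOPYING:
#       print('copy progress: {:.0%}'.format(message['percent']))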

def split_dataset(dataset_id, dataset_type, dataset_path, val_split,
                  test_split):
    """Split an imported dataset into train/val/test subsets.

    `val_split` and `test_split` are the fractions assigned to the
    validation and test sets."""
    status, message = get_folder_status(dataset_path, True)
    if status != DatasetStatus.XCOPYDONE and status != DatasetStatus.XSPLITED:
        raise Exception("Dataset import has not finished; wait for the "
                        "import to succeed before splitting.")
    if not osp.exists(osp.join(dataset_path, 'statis.pkl')):
        raise Exception("The dataset needs to be re-checked; refresh the "
                        "dataset before splitting.")
    if dataset_type == 'classification':
        ds = ClsDataset(dataset_id, dataset_path)
    elif dataset_type == 'detection':
        ds = DetDataset(dataset_id, dataset_path)
    elif dataset_type == 'segmentation':
        ds = SegDataset(dataset_id, dataset_path)
    elif dataset_type == 'instance_segmentation':
        ds = InsSegDataset(dataset_id, dataset_path)
    else:
        raise ValueError("Unknown dataset type: {}".format(dataset_type))
    ds.load_statis_info()
    ds.split(val_split, test_split)
    set_folder_status(dataset_path, DatasetStatus.XSPLITED)
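
# Usage sketch (the fractions are illustrative):
#
#   split_dataset(1, 'classification', dataset_path,
#                 val_split=0.2, test_split=0.1)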

def get_dataset_details(dataset_path):
    """Return the dataset statistics loaded from `statis.pkl`, or None if
    the dataset is not imported yet or the statistics file is missing."""
    status, message = get_folder_status(dataset_path, True)
    if status == DatasetStatus.XCOPYDONE or status == DatasetStatus.XSPLITED:
        statis_file = osp.join(dataset_path, 'statis.pkl')
        if osp.exists(statis_file):
            with open(statis_file, 'rb') as f:
                details = pickle.load(f)
            return details
    return None
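
# Usage sketch:
#
#   details = get_dataset_details(dataset_path)  # None until import finishes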