# zhengchun/PaddleX

# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os.path as osp
from collections import defaultdict
from pathlib import Path

import pandas as pd

from .....utils.errors import DatasetFileNotFoundError


def check(dataset_dir, output, sample_num=10):
    """Validate a CSV dataset directory and collect preview information.

    The directory must contain both ``train.csv`` and ``val.csv``. For each
    split, the first ``sample_num`` rows are written to
    ``<output>/demo_data/<split>.csv`` and are also returned as a preview
    table.

    Args:
        dataset_dir: Path to the dataset directory.
        output: Directory under which the ``demo_data`` folder is created.
        sample_num: Number of leading rows kept in the demo files and in the
            returned preview tables. Defaults to 10.

    Returns:
        dict: ``train_samples`` / ``val_samples`` hold the row count of each
        split; ``train_table`` / ``val_table`` hold a list whose first item
        is the list of column names, followed by the previewed rows as lists.

    Raises:
        DatasetFileNotFoundError: If ``dataset_dir`` is not an existing
            directory, or if ``train.csv`` or ``val.csv`` is missing from it.
    """
    dataset_dir = osp.abspath(dataset_dir)
    # isdir() is already False for paths that do not exist.
    if not osp.isdir(dataset_dir):
        raise DatasetFileNotFoundError(file_path=dataset_dir)

    vis_save_dir = osp.join(output, "demo_data")
    sample_cnts = {}
    tables = {}

    # Both splits are mandatory, so a missing file list is always an error.
    for tag in ("train", "val"):
        file_list = osp.join(dataset_dir, f"{tag}.csv")
        if not osp.exists(file_list):
            raise DatasetFileNotFoundError(
                file_path=file_list,
                solution=(
                    "Ensure that both `train.csv` and `val.csv` exist in "
                    f"{dataset_dir}"
                ),
            )

        df = pd.read_csv(file_list)
        sample_cnts[tag] = len(df)

        # Use a single slice for both the demo file and the returned table so
        # that they always show the same rows (the table previously ignored
        # `sample_num` and was fixed at 10 rows).
        preview_df = df.iloc[:sample_num, :]
        Path(vis_save_dir).mkdir(parents=True, exist_ok=True)
        preview_df.to_csv(osp.join(vis_save_dir, f"{tag}.csv"), index=False)
        tables[tag] = [df.columns.to_list()] + preview_df.values.tolist()

    return {
        "train_samples": sample_cnts["train"],
        "train_table": tables["train"],
        "val_samples": sample_cnts["val"],
        "val_table": tables["val"],
    }