# Source repository: zhengchun/PaddleX
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import os
import io
import sys
import abc
import shlex
import locale
import asyncio

from .utils.arg import CLIArgument
from .utils.subprocess import run_cmd as _run_cmd, CompletedProcess

from ...utils import logging
from ...utils.misc import abspath
from ...utils.flags import DRY_RUN
from ...utils.errors import raise_unsupported_api_error, CalledProcessError

__all__ = ['BaseRunner', 'InferOnlyRunner']


class BaseRunner(metaclass=abc.ABCMeta):
    """
    Abstract base class of Runner.

    Runner is responsible for executing training/inference/compression commands.

    Subclasses implement the task entry points (`train`, `evaluate`,
    `predict`, `export`, `infer`, `compression`). This base class supplies the
    shared plumbing: building the launcher prefix (`distributed`), parsing
    device strings (`parse_device`) and executing commands (`run_cmd`).
    """

    def __init__(self, runner_root_path):
        """
        Initialize the instance.

        Args:
            runner_root_path (str): Path of the directory where the scripts reside.
        """
        super().__init__()

        self.runner_root_path = abspath(runner_root_path)
        # Path to python interpreter
        self.python = sys.executable

    def prepare(self):
        """
        Make preparations for the execution of commands.

        For example, download prerequisites and install dependencies.
        """
        # By default we do nothing
        pass

    @abc.abstractmethod
    def train(self, config_path, cli_args, device, ips, save_dir, do_eval=True):
        """
        Execute model training command.

        Args:
            config_path (str): Path of the configuration file.
            cli_args (list[base.utils.arg.CLIArgument]): List of command-line
                arguments.
            device (str): A string that describes the device(s) to use, e.g.,
                'cpu', 'xpu:0', 'gpu:1,2'.
            ips (str|None): Paddle cluster node ips, e.g.,
                '192.168.0.16,192.168.0.17'.
            save_dir (str): Directory to save log files.
            do_eval (bool, optional): Whether to perform model evaluation during
                training. Default: True.

        Returns:
            paddlex.repo_apis.base.utils.subprocess.CompletedProcess
        """
        raise NotImplementedError

    @abc.abstractmethod
    def evaluate(self, config_path, cli_args, device, ips):
        """
        Execute model evaluation command.

        Args:
            config_path (str): Path of the configuration file.
            cli_args (list[base.utils.arg.CLIArgument]): List of command-line
                arguments.
            device (str): A string that describes the device(s) to use, e.g.,
                'cpu', 'xpu:0', 'gpu:1,2'.
            ips (str|None): Paddle cluster node ips, e.g.,
                '192.168.0.16,192.168.0.17'.

        Returns:
            paddlex.repo_apis.base.utils.subprocess.CompletedProcess
        """
        raise NotImplementedError

    @abc.abstractmethod
    def predict(self, config_path, cli_args, device):
        """
        Execute prediction command.

        Args:
            config_path (str): Path of the configuration file.
            cli_args (list[base.utils.arg.CLIArgument]): List of command-line
                arguments.
            device (str): A string that describes the device(s) to use, e.g.,
                'cpu', 'xpu:0', 'gpu:1,2'.

        Returns:
            paddlex.repo_apis.base.utils.subprocess.CompletedProcess
        """
        raise NotImplementedError

    @abc.abstractmethod
    def export(self, config_path, cli_args, device):
        """
        Execute model export command.

        Args:
            config_path (str): Path of the configuration file.
            cli_args (list[base.utils.arg.CLIArgument]): List of command-line
                arguments.
            device (str): A string that describes the device(s) to use, e.g.,
                'cpu', 'xpu:0', 'gpu:1,2'.

        Returns:
            paddlex.repo_apis.base.utils.subprocess.CompletedProcess
        """
        raise NotImplementedError

    @abc.abstractmethod
    def infer(self, config_path, cli_args, device):
        """
        Execute model inference command.

        Args:
            config_path (str): Path of the configuration file.
            cli_args (list[base.utils.arg.CLIArgument]): List of command-line
                arguments.
            device (str): A string that describes the device(s) to use, e.g.,
                'cpu', 'xpu:0', 'gpu:1,2'.

        Returns:
            paddlex.repo_apis.base.utils.subprocess.CompletedProcess
        """
        raise NotImplementedError

    @abc.abstractmethod
    def compression(self, config_path, train_cli_args, export_cli_args, device,
                    train_save_dir):
        """
        Execute model compression (quantization aware training and model export)
            commands.

        Args:
            config_path (str): Path of the configuration file.
            train_cli_args (list[base.utils.arg.CLIArgument]): List of
                command-line arguments used for model training.
            export_cli_args (list[base.utils.arg.CLIArgument]): List of
                command-line arguments used for model export.
            device (str): A string that describes the device(s) to use, e.g.,
                'cpu', 'xpu:0', 'gpu:1,2'.
            train_save_dir (str): Directory to store model snapshots.

        Returns:
            tuple[paddlex.repo_apis.base.utils.subprocess.CompletedProcess]
        """
        raise NotImplementedError

    def distributed(self, device, ips=None, log_dir=None):
        """
        Build the launcher prefix and environment for the requested device(s).

        Args:
            device (str|None): A string that describes the device(s) to use,
                e.g., 'cpu', 'xpu:0', 'gpu:1,2'. If None, or if no device IDs
                are given, the bare interpreter is used.
            ips (str|None, optional): Paddle cluster node ips. Only used when
                more than one device ID is given. Default: None.
            log_dir (str|None, optional): Directory under which the distributed
                launch logs are placed. Only used when more than one device ID
                is given. If None, the current working directory is used.
                Default: None.

        Returns:
            tuple[list[str], dict|None]: The first element is the command
                prefix: the python interpreter, followed by the
                `paddle.distributed.launch` options when more than one device
                ID is given. The second element is a copy of `os.environ` with
                the device-visibility variable set when exactly one device ID
                is given, and None otherwise.

        Raises:
            ValueError: If `device` is rejected by `parse_device`.
        """
        args = [self.python]
        if device is None:
            return args, None

        device_type, dev_ids = self.parse_device(device)
        if not dev_ids:
            return args, None

        joined_ids = ','.join(dev_ids)
        if len(dev_ids) == 1:
            # Pin the single device through the visibility variable of its
            # device type. Anything not listed falls back to the CUDA one.
            visibility_vars = {
                'xpu': 'XPU_VISIBLE_DEVICES',
                'npu': 'ASCEND_RT_VISIBLE_DEVICES',
                'mlu': 'MLU_VISIBLE_DEVICES',
            }
            new_env = os.environ.copy()
            new_env[visibility_vars.get(device_type,
                                        'CUDA_VISIBLE_DEVICES')] = joined_ids
            return args, new_env

        # More than one device: go through the distributed launcher
        args.extend(['-m', 'paddle.distributed.launch'])
        args.extend(['--devices', joined_ids])
        if ips is not None:
            args.extend(['--ips', ips])
        if log_dir is None:
            log_dir = os.getcwd()
        args.extend(['--log_dir', self._get_dist_train_log_dir(log_dir)])
        return args, None

    def parse_device(self, device):
        """
        Split a device string into its device type and device IDs.

        Args:
            device (str): A string that describes the device(s) to use, e.g.,
                'cpu', 'xpu:0', 'gpu:1,2'.

        Returns:
            tuple[str, list[str]]: Device type and list of device IDs (as
                strings). The list is empty when `device` contains no ':'.

        Raises:
            ValueError: If the device type is not one of 'cpu', 'gpu', 'xpu',
                'npu', 'mlu', or if any device ID is not made of digits.
        """
        # According to https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/device/set_device_cn.html
        # Split on the first ':' only, so that a malformed input such as
        # 'gpu:0:1' is rejected by the ID check below with a meaningful
        # message rather than by a tuple-unpacking error.
        device_type, sep, id_str = device.partition(':')
        dev_ids = id_str.split(',') if sep else []
        if device_type not in ('cpu', 'gpu', 'xpu', 'npu', 'mlu'):
            raise ValueError("Unsupported device type.")
        for dev_id in dev_ids:
            if not dev_id.isdigit():
                raise ValueError("Device ID must be an integer.")
        return device_type, dev_ids

    def run_cmd(self,
                cmd,
                env=None,
                switch_wdir=True,
                silent=False,
                echo=True,
                capture_output=False,
                log_path=None):
        """
        Execute a command and return its result.

        If the `DRY_RUN` flag is set, the command is only logged and a mock
        result with return code 0 is returned.

        Args:
            cmd (list): Command to execute. Elements that are `CLIArgument`
                objects are expanded in place into their `lst` items.
            env (dict|None, optional): Environment forwarded to the subprocess
                helper. Default: None.
            switch_wdir (bool|str, optional): If a non-empty string, the
                command runs in that path relative to `runner_root_path`. If
                otherwise truthy, it runs in `runner_root_path`. If falsy, no
                working directory is passed to the helper. Default: True.
            silent (bool, optional): Forwarded to the subprocess helper.
                Default: False.
            echo (bool, optional): Forwarded to the subprocess helper.
                Default: True.
            capture_output (bool, optional): If True, stdout and stderr of the
                command are piped, mirrored to `sys.stdout`/`sys.stderr` and
                recorded in the returned object. Default: False.
            log_path (str|None, optional): File that additionally receives the
                recorded stdout and stderr. Ignored (with a warning) when
                `capture_output` is False. Default: None.

        Returns:
            paddlex.repo_apis.base.utils.subprocess.CompletedProcess

        Raises:
            CalledProcessError: If the command exits with a non-zero code.
        """
        # Flatten `CLIArgument` objects into plain command-line tokens
        flat_cmd = []
        for ele in cmd:
            if isinstance(ele, CLIArgument):
                flat_cmd.extend(ele.lst)
            else:
                flat_cmd.append(ele)
        cmd = flat_cmd

        if DRY_RUN:
            # TODO: Accommodate Windows system
            logging.info(' '.join(shlex.quote(x) for x in cmd))
            # Mock return
            return CompletedProcess(
                cmd, returncode=0, stdout=str(cmd), stderr=None)

        if switch_wdir:
            if isinstance(switch_wdir, str):
                # In this case `switch_wdir` specifies a relative path
                cwd = os.path.join(self.runner_root_path, switch_wdir)
            else:
                cwd = self.runner_root_path
        else:
            cwd = None

        if not capture_output:
            if log_path is not None:
                logging.warning(
                    "`log_path` will be ignored when `capture_output` is False.")

            cp = _run_cmd(
                cmd,
                env=env,
                cwd=cwd,
                silent=silent,
                echo=echo,
                pipe_stdout=False,
                pipe_stderr=False,
                blocking=True)
            cp = CompletedProcess(
                cp.args, cp.returncode, stdout=cp.stdout, stderr=cp.stderr)
        else:
            # Non-blocking call with stdout and stderr piped
            with io.StringIO() as stdout_buf, io.StringIO() as stderr_buf:
                proc_call = _run_cmd(
                    cmd,
                    env=env,
                    cwd=cwd,
                    echo=echo,
                    silent=silent,
                    pipe_stdout=True,
                    pipe_stderr=True,
                    blocking=False,
                    async_run=True)
                out_files = [stdout_buf]
                err_files = [stderr_buf]
                log_file = None
                try:
                    if log_path is not None:
                        log_dir = os.path.dirname(log_path)
                        # A bare file name has an empty dirname, which
                        # `os.makedirs` would reject.
                        if log_dir:
                            os.makedirs(log_dir, exist_ok=True)
                        log_file = open(log_path, 'w', encoding='utf-8')
                        logging.info(f"\nLog path: {os.path.abspath(log_path)} \n")
                        out_files.append(log_file)
                        err_files.append(log_file)
                    retcode = asyncio.run(
                        self._tee_proc_call(proc_call, out_files, err_files))
                finally:
                    # The log file is opened inside the `try`, so it is closed
                    # on every exit path.
                    if log_file is not None:
                        log_file.close()
                cp = CompletedProcess(cmd, retcode,
                                      stdout_buf.getvalue(),
                                      stderr_buf.getvalue())

        if cp.returncode != 0:
            raise CalledProcessError(
                cp.returncode, cp.args, output=cp.stdout, stderr=cp.stderr)
        return cp

    @staticmethod
    async def _tee_stream(in_stream, out_stream, files):
        """
        Mirror a byte stream to `out_stream` and record it as text in `files`.

        Refer to
        https://stackoverflow.com/questions/17190221/subprocess-popen-cloning-stdout-and-stderr-both-to-terminal-and-variables/25960956

        Args:
            in_stream: Object with an awaitable `read(n)` method that returns
                bytes and returns b'' at end of stream.
            out_stream: Text stream used for display. If it has a `buffer`
                attribute, raw bytes are written to that buffer; otherwise
                decoded lines are written to the stream itself.
            files (list): Text file-like objects that receive every decoded
                line.
        """
        # According to
        # https://docs.python.org/3/library/subprocess.html#frequently-used-arguments
        encoding = locale.getpreferredencoding(False)
        raw_out = getattr(out_stream, 'buffer', None)
        pending = []

        def _emit():
            # Undecodable bytes are replaced instead of raising, so that odd
            # output from the child cannot abort the capture.
            text = b''.join(pending).decode(encoding, errors='replace')
            if raw_out is None:
                # We use line buffering
                out_stream.write(text)
            else:
                raw_out.flush()
            for f in files:
                f.write(text)
            pending.clear()

        while True:
            char = await in_stream.read(1)
            if char == b'':
                break
            if raw_out is not None:
                raw_out.write(char)
            pending.append(char)
            # NOTE: In order to get tqdm progress bars to produce normal outputs
            # we treat '\r' as an ending character of line
            if char in (b'\n', b'\r'):
                _emit()

        # Emit whatever is left when the stream ends without a line terminator
        if pending:
            _emit()

    async def _tee_proc_call(self, proc_call, out_files, err_files):
        """
        Await the process, tee its stdout and stderr, and return its exit code.

        Args:
            proc_call: Awaitable that yields a process object exposing
                `stdout`, `stderr` and an awaitable `wait()`.
            out_files (list): Text file-like objects recording stdout.
            err_files (list): Text file-like objects recording stderr.

        Returns:
            The value returned by `proc.wait()`.
        """
        proc = await proc_call
        await asyncio.gather(
            self._tee_stream(proc.stdout, sys.stdout, out_files),
            self._tee_stream(proc.stderr, sys.stderr, err_files))
        # NOTE: https://docs.python.org/3/library/subprocess.html#subprocess.Popen.wait
        retcode = await proc.wait()
        return retcode

    def _get_dist_train_log_dir(self, log_dir):
        """ Return the 'distributed_train_logs' sub-directory of `log_dir`. """
        return os.path.join(log_dir, 'distributed_train_logs')

    def _get_train_log_path(self, log_dir):
        """ Return the path of 'train.log' inside `log_dir`. """
        return os.path.join(log_dir, 'train.log')


class InferOnlyRunner(BaseRunner):
    """
    Runner whose training, evaluation, prediction, export and compression
    entry points are all reported as unsupported.

    `infer` is not defined here, so it stays abstract and has to be provided
    by concrete subclasses.
    """

    def train(self, *args, **kwargs):
        """ Report that `train` is not supported by this runner. """
        raise_unsupported_api_error(self.__class__, 'train')

    def evaluate(self, *args, **kwargs):
        """ Report that `evaluate` is not supported by this runner. """
        # The API name passed here was previously misspelled as 'evalaute'
        raise_unsupported_api_error(self.__class__, 'evaluate')

    def predict(self, *args, **kwargs):
        """ Report that `predict` is not supported by this runner. """
        raise_unsupported_api_error(self.__class__, 'predict')

    def export(self, *args, **kwargs):
        """ Report that `export` is not supported by this runner. """
        raise_unsupported_api_error(self.__class__, 'export')

    def compression(self, *args, **kwargs):
        """ Report that `compression` is not supported by this runner. """
        raise_unsupported_api_error(self.__class__, 'compression')