paddlevideo/loader/builder.py

# Copyright (c) 2020  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import signal
import os
import paddle
from paddle.io import DataLoader, DistributedBatchSampler
from .registry import DATASETS, PIPELINES
from ..utils.build_utils import build
from .pipelines.compose import Compose
from paddlevideo.utils import get_logger
from paddlevideo.utils.multigrid import DistributedShortSampler
import numpy as np

# Module-wide logger, obtained from the project's get_logger helper.
logger = get_logger("paddlevideo")


def build_pipeline(cfg):
    """Build a data-processing pipeline from its config.

    Args:
        cfg: pipeline config handed to ``Compose``; may be None.

    Returns:
        A ``Compose`` instance wrapping ``cfg``, or None when ``cfg`` is None.
    """
    # Identity check: `== None` would dispatch to a custom __eq__ on the
    # config object, which is not what "no config given" means.
    if cfg is None:
        return None
    return Compose(cfg)


def build_dataset(cfg):
    """Build a dataset together with its sample pipeline.

    Args:
        cfg: a two-item sequence ``(dataset config, pipeline config)``.
            The dataset config is modified in place: its ``pipeline``
            attribute is set to the pipeline built from the second item.

    Returns:
        dataset: whatever ``build`` creates from the dataset config using
            the ``DATASETS`` registry and ``key="format"``.
    """
    # XXX: the config pair is unpacked positionally; callers must keep the
    # (dataset, pipeline) order.
    dataset_cfg, pipeline_cfg = cfg
    dataset_cfg.pipeline = build_pipeline(pipeline_cfg)
    return build(dataset_cfg, DATASETS, key="format")


def build_batch_pipeline(cfg):
    """Build a batch-level operator from ``cfg`` via the ``PIPELINES`` registry.

    Args:
        cfg: config passed unchanged to ``build``.

    Returns:
        The object created by ``build(cfg, PIPELINES)``.
    """
    return build(cfg, PIPELINES)


def build_dataloader(dataset,
                     batch_size,
                     num_workers,
                     places,
                     shuffle=True,
                     drop_last=True,
                     multigrid=False,
                     collate_fn_cfg=None,
                     **kwargs):
    """Build a Paddle DataLoader driven by a distributed batch sampler.

    Args:
        dataset (paddle.dataset): A PaddlePaddle dataset object.
        batch_size (int): batch size on single card. When ``multigrid`` is
            set it is forwarded as ``batch_sizes`` to
            ``DistributedShortSampler``.
        num_workers (int): forwarded to ``DataLoader(num_workers=...)``.
        places: forwarded to ``DataLoader(places=...)``.
        shuffle (bool): whether to shuffle the data at every epoch. Only
            used when ``multigrid`` is False.
        drop_last (bool): forwarded to ``DistributedBatchSampler``. Only
            used when ``multigrid`` is False.
        multigrid (bool): use ``DistributedShortSampler`` instead of
            ``DistributedBatchSampler``.
        collate_fn_cfg: config of a batch-level operator (e.g. mixup,
            cutmix). When not None, batches are collated by
            ``mix_collate_fn``; otherwise ``collate_fn=None`` is passed to
            the DataLoader.
        **kwargs: extra keyword arguments forwarded to ``DataLoader``.

    Returns:
        The constructed ``paddle.io.DataLoader``.
    """
    if multigrid:
        # NOTE(review): this branch always shuffles and drops the last
        # batch, ignoring the `shuffle` / `drop_last` arguments -- confirm
        # this is intended.
        sampler = DistributedShortSampler(dataset,
                                          batch_sizes=batch_size,
                                          shuffle=True,
                                          drop_last=True)
    else:
        sampler = DistributedBatchSampler(dataset,
                                          batch_size=batch_size,
                                          shuffle=shuffle,
                                          drop_last=drop_last)

    # NOTE(shipping): when a mix operator such as mixup or cutmix is switched
    # on, a batch like
    #   [[img, label, attribute, ...], [img, label, attribute, ...], ...]
    # is re-collated to
    #   [[img, img, ...], [label, label, ...], [attribute, attribute, ...], ...]
    # i.e. it is transposed so that each field is stacked across samples.

    def mix_collate_fn(batch):
        """Apply the batch operator, then stack each field across samples."""
        # The operator is built for every batch, exactly as before.
        # NOTE(review): it could probably be built once outside this
        # function -- confirm the operator keeps no per-batch state first.
        pipeline = build_batch_pipeline(collate_fn_cfg)
        batch = pipeline(batch)
        # zip(*batch) groups the i-th field of every sample together; this
        # assumes all samples carry the same number of fields.
        return [np.stack(slot, axis=0) for slot in zip(*batch)]

    data_loader = DataLoader(
        dataset,
        batch_sampler=sampler,
        places=places,
        num_workers=num_workers,
        collate_fn=mix_collate_fn if collate_fn_cfg is not None else None,
        return_list=True,
        **kwargs)

    return data_loader


def term_mp(sig_num, frame):
    """Signal handler that kills the whole process group of this process.

    Logs the current pid and its process group id, then sends SIGKILL to
    that group (which includes the current process itself).

    Args:
        sig_num: signal number supplied by the signal machinery (unused).
        frame: current stack frame supplied by the signal machinery (unused).
    """
    own_pid = os.getpid()
    group_id = os.getpgid(own_pid)
    message = "main proc {} exit, kill process group {}".format(
        own_pid, group_id)
    logger.info(message)
    os.killpg(group_id, signal.SIGKILL)


# Import-time side effect: install term_mp as the handler for SIGINT and
# SIGTERM, so either signal kills this process's whole process group.
signal.signal(signal.SIGINT, term_mp)
signal.signal(signal.SIGTERM, term_mp)