# paddlevideo/loader/pipelines/decode.py

#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
try:
    import av
except ImportError as e:
    print(
        f"{e}, [av] package and it's dependencies is required for TimeSformer and other models."
    )
import cv2
import pickle
import decord as de
import math
import random
from ..registry import PIPELINES


def get_start_end_idx(video_size, clip_size, clip_idx, num_clips):
    """
    Choose the start and end index of a clip inside a video.

    Args:
        video_size: length of the video, in frames.
        clip_size: length of the clip, in frames (may be fractional).
        clip_idx: -1 draws the start position at random; any other value
            places the clip at ``delta * clip_idx / num_clips``.
        num_clips: divisor used for the deterministic placement.
    return:
        Tuple ``(start_idx, end_idx)`` with
        ``end_idx = start_idx + clip_size - 1``.
    """
    # Largest usable start offset; clamps to 0 when the clip does not fit.
    max_start = max(video_size - clip_size, 0)
    if clip_idx != -1:
        # Deterministic placement driven by the clip index.
        start_idx = max_start * clip_idx / num_clips
    else:
        # Random temporal sampling anywhere in [0, max_start].
        start_idx = random.uniform(0, max_start)
    return start_idx, start_idx + clip_size - 1


@PIPELINES.register()
class VideoDecoder(object):
    """
    Decode a video file to frames.

    Args:
        backend: decoding library to use: 'cv2', 'decord' or 'pyav'.
        mode: 'train', 'valid' or 'test'. Only read by the 'pyav' backend,
            where 'train'/'valid' sample the clip position at random and
            'test' uses clip index 0.
        sampling_rate: only read by the 'pyav' backend, see below.
        num_seg: only read by the 'pyav' backend, see below.
        num_clips: stored on the instance; the 'pyav' backend always uses 1.
        target_fps: only read by the 'pyav' backend, see below.

    For the 'pyav' backend the clip size in frames is
    ``sampling_rate * num_seg / target_fps * fps`` where ``fps`` is the
    average rate reported by the video stream.
    """
    def __init__(self,
                 backend='cv2',
                 mode='train',
                 sampling_rate=32,
                 num_seg=8,
                 num_clips=1,
                 target_fps=30):

        self.backend = backend
        # params below only for TimeSformer
        self.mode = mode
        self.sampling_rate = sampling_rate
        self.num_seg = num_seg
        self.num_clips = num_clips
        self.target_fps = target_fps

    def __call__(self, results):
        """
        Perform video decode operations.

        Reads ``results['filename']`` and always writes ``results['format']``
        ('video') and ``results['backend']``. Depending on the backend:

        * 'cv2': ``results['frames']`` is a list of numpy arrays and
          ``results['frames_len']`` its length.
        * 'decord': ``results['frames']`` is the ``decord.VideoReader``
          itself and ``results['frames_len']`` its length.
        * 'pyav': ``results['frames']`` is a list of RGB numpy arrays, plus
          ``results['frames_len']``, ``results['start_idx']`` and
          ``results['end_idx']``.

        return:
            The same ``results`` object, updated in place.
        raises:
            NotImplementedError: for an unknown backend, or an unknown mode
                with the 'pyav' backend.
        """
        file_path = results['filename']
        results['format'] = 'video'
        results['backend'] = self.backend

        if self.backend == 'cv2':
            frames = self._decode_cv2(file_path)
            results['frames'] = frames
            results['frames_len'] = len(frames)

        elif self.backend == 'decord':
            container = de.VideoReader(file_path)
            results['frames'] = container
            results['frames_len'] = len(container)

        elif self.backend == 'pyav':  # for TimeSformer
            self._decode_pyav(file_path, results)

        else:
            raise NotImplementedError
        return results

    @staticmethod
    def _decode_cv2(file_path):
        """Read every frame of file_path with OpenCV, channel order reversed."""
        cap = cv2.VideoCapture(file_path)
        try:
            videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            frames = []
            for _ in range(videolen):
                ret, frame = cap.read()
                # maybe first frame is empty
                if not ret:
                    continue
                # Reverse the last axis (OpenCV decodes to BGR order).
                frames.append(frame[:, :, ::-1])
        finally:
            # Release the capture handle even if reading fails.
            cap.release()
        return frames

    def _decode_pyav(self, file_path, results):
        """Decode one clip (or the whole video) with PyAV into results."""
        if self.mode in ["train", "valid"]:
            clip_idx = -1
        elif self.mode in ["test"]:
            clip_idx = 0
        else:
            raise NotImplementedError

        num_clips = 1  # always be 1

        container = av.open(file_path)
        try:
            # Raises IndexError when the file has no video stream.
            stream = container.streams.video[0]

            # decode process
            fps = float(stream.average_rate)
            frames_length = stream.frames
            duration = stream.duration
            clip_sz = self.sampling_rate * self.num_seg / self.target_fps * fps

            if duration is None or frames_length <= 0:
                # If failed to fetch the decoding information (no duration,
                # or a frame count of 0 which would make the timebase below
                # divide by zero), decode the entire video.
                decode_all_video = True
                video_start_pts, video_end_pts = 0, math.inf
            else:
                decode_all_video = False
                start_idx, end_idx = get_start_end_idx(frames_length, clip_sz,
                                                       clip_idx, num_clips)
                # Stream time units per frame, used to turn frame indices
                # into presentation timestamps.
                timebase = duration / frames_length
                video_start_pts = int(start_idx * timebase)
                video_end_pts = int(end_idx * timebase)

            # Seek a little before the wanted start; frames that precede it
            # are skipped in the loop below.
            margin = 1024
            seek_offset = max(video_start_pts - margin, 0)
            container.seek(seek_offset,
                           any_frame=False,
                           backward=True,
                           stream=stream)

            tmp_frames = {}
            for frame in container.decode(video=0):
                if frame.pts < video_start_pts:
                    continue
                tmp_frames[frame.pts] = frame
                if frame.pts > video_end_pts:
                    # Keep the first frame past the end, then stop.
                    break
            # Frames may not arrive in presentation order; sort by pts.
            video_frames = [tmp_frames[pts] for pts in sorted(tmp_frames)]
        finally:
            # Close the container on success and on any decoding error.
            container.close()

        frames = [frame.to_rgb().to_ndarray() for frame in video_frames]

        # If decode all video, -1 in train and valid, 0 in test;
        # else, always 0 in train, valid and test, as we have selected
        # clip size frames when decoding.
        start_idx, end_idx = get_start_end_idx(
            len(frames), clip_sz, clip_idx if decode_all_video else 0, 1)
        results['frames'] = frames
        results['frames_len'] = len(frames)
        results['start_idx'] = start_idx
        results['end_idx'] = end_idx


@PIPELINES.register()
class FrameDecoder(object):
    """
    Tag results as 'frame' format; nothing is decoded here.
    """
    def __init__(self):
        """Takes no configuration."""

    def __call__(self, results):
        """
        Set results['format'] to 'frame'.
        return:
            The same results object that was passed in.
        """
        results['format'] = 'frame'
        return results


@PIPELINES.register()
class MRIDecoder(object):
    """
    Tag results as 'MRI' format; nothing is decoded here.
    """
    def __init__(self):
        """Takes no configuration."""

    def __call__(self, results):
        """
        Set results['format'] to 'MRI'.
        return:
            The same results object that was passed in.
        """
        results['format'] = 'MRI'
        return results


@PIPELINES.register()
class FeatureDecoder(object):
    """
    Perform feature decode operations, e.g. youtube8m.

    Args:
        num_classes: length of the one-hot label vector.
        max_len: number of rows each feature matrix is zero-padded to.
        has_label: whether to read the record's label and emit
            results['labels'].
    """
    def __init__(self, num_classes, max_len=512, has_label=True):
        self.max_len = max_len
        self.num_classes = num_classes
        self.has_label = has_label

    def __call__(self, results):
        """
        Perform feature decode operations.

        Steps: load the pickle at results['filename'], pull out the rgb and
        audio features, dequantize them, then zero-pad to max_len rows.

        Writes, for prefix in ('rgb_', 'audio_'): prefix + 'len' (row count
        before padding), prefix + 'data' (padded float32 features) and
        prefix + 'mask' (float32, ones for real rows, zeros for padding).
        Writes results['labels'] (float32 one-hot) when has_label is set.

        return:
            The same results object, updated in place.
        raises:
            ValueError: from numpy when a feature has more than max_len rows
                (the padding block would need a negative row count).
        """
        filepath = results['filename']
        # NOTE: pickle can execute arbitrary code while loading; only use
        # this decoder on trusted feature files.
        with open(filepath, 'rb') as f:
            record = pickle.load(f, encoding='bytes')

        nframes = self._lookup(record, 'nframes')
        rgb = self._lookup(record, 'feature').astype(float)
        audio = self._lookup(record, 'audio').astype(float)
        if self.has_label:
            label = self._lookup(record, 'label')
            one_hot_label = self.make_one_hot(label, self.num_classes)

        # Keep only the first nframes rows of each feature matrix.
        rgb = rgb[0:nframes, :]
        audio = audio[0:nframes, :]

        rgb = self.dequantize(rgb,
                              max_quantized_value=2.,
                              min_quantized_value=-2.)
        audio = self.dequantize(audio,
                                max_quantized_value=2,
                                min_quantized_value=-2)

        if self.has_label:
            results['labels'] = one_hot_label.astype("float32")

        for prefix, feat in (("rgb_", rgb), ("audio_", audio)):
            results[prefix + 'len'] = feat.shape[0]
            # feat pad step 1. padding
            feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),
                                dtype=np.float32)
            feat_pad = np.concatenate((feat, feat_add), axis=0)
            results[prefix + 'data'] = feat_pad.astype("float32")
            # feat pad step 2. mask: ones over real rows, the zero padding
            # block is reused for the padded rows.
            feat_mask = np.concatenate(
                (np.ones(feat.shape, dtype=np.float32), feat_add), axis=0)
            results[prefix + 'mask'] = feat_mask.astype("float32")

        return results

    @staticmethod
    def _lookup(record, key):
        """Return record[key], falling back to the bytes form of key."""
        # Records loaded with encoding='bytes' may carry bytes keys.
        return record[key] if key in record else record[key.encode()]

    def dequantize(self,
                   feat_vector,
                   max_quantized_value=2.,
                   min_quantized_value=-2.):
        """
        Dequantize the feature from the byte format to the float format.

        Computes ``feat_vector * (range / 255) + (range / 512 + min)`` with
        ``range = max_quantized_value - min_quantized_value``.
        """

        assert max_quantized_value > min_quantized_value
        quantized_range = max_quantized_value - min_quantized_value
        scalar = quantized_range / 255.0
        bias = (quantized_range / 512.0) + min_quantized_value

        return feat_vector * scalar + bias

    def make_one_hot(self, label, dim=3862):
        """
        Build a float vector of length dim with 1 at every index in label.
        """
        one_hot_label = np.zeros(dim, dtype=float)
        for ind in label:
            one_hot_label[int(ind)] = 1
        return one_hot_label


@PIPELINES.register()
class ActionFeatureDecoder(object):
    """
    Perform feature decode operations on footballaction.

    Args:
        num_classes: stored on the instance; not read by this class.
        max_len: number of rows each feature matrix is zero-padded to.
        has_label: stored on the instance; not read by this class.
    """
    def __init__(self, num_classes, max_len=512, has_label=True):
        self.max_len = max_len
        self.num_classes = num_classes
        self.has_label = has_label

    def __call__(self, results):
        """
        Perform feature decode operations.

        Steps: load the pickle at results['filename'], read the
        'image_feature' and 'audio_feature' arrays plus 'label_info', then
        zero-pad both features to max_len rows.

        Writes results['labels'] (array holding the single int label),
        results['iou_norm'] (float) and, for prefix in ('rgb_', 'audio_'):
        prefix + 'len' (row count before padding), prefix + 'data' (padded
        float32 features) and prefix + 'mask' (float32, ones for real rows,
        zeros for padding).

        return:
            The same results object, updated in place.
        raises:
            ValueError: from numpy when a feature has more than max_len rows
                (the padding block would need a negative row count).
        """
        filepath = results['filename']
        # NOTE: pickle can execute arbitrary code while loading; only use
        # this decoder on trusted feature files.
        with open(filepath, 'rb') as f:
            pkl_data = pickle.load(f, encoding='bytes')

        rgb = pkl_data['image_feature'].astype(float)
        audio = pkl_data['audio_feature'].astype(float)
        label_id_info = pkl_data['label_info']
        # The label entry is used as one scalar label. The former random
        # pick between two labels could never run, because the label was
        # always wrapped in a one-element list first.
        label_one = int(label_id_info['label'])
        iou_norm = float(label_id_info['norm_iou'])
        results['labels'] = np.array([label_one])
        results['iou_norm'] = iou_norm

        for prefix, feat in (("rgb_", rgb), ("audio_", audio)):
            results[prefix + 'len'] = feat.shape[0]
            # feat pad step 1. padding
            feat_add = np.zeros((self.max_len - feat.shape[0], feat.shape[1]),
                                dtype=np.float32)
            feat_pad = np.concatenate((feat, feat_add), axis=0)
            results[prefix + 'data'] = feat_pad.astype("float32")
            # feat pad step 2. mask: ones over real rows, the zero padding
            # block is reused for the padded rows.
            feat_mask = np.concatenate(
                (np.ones(feat.shape, dtype=np.float32), feat_add), axis=0)
            results[prefix + 'mask'] = feat_mask.astype("float32")

        return results