move RoutedDecoder Dataset to DataPipe (pytorch#51704)

lixinyu · facebook-github-bot · commit 482b94ae510d · 2021-02-09T03:31:07.000-08:00
Summary: Pull Request resolved: pytorch#51704 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D26245910 Pulled By: glaringlee fbshipit-source-id: 91e3c9f8a6c1209c1a1a752ba29a80dbd9bf4119
diff --git a/test/test_datapipe.py b/test/test_datapipe.py
@@ -5,6 +5,8 @@
 import warnings
 import tarfile
 import zipfile
+import numpy as np
+from PIL import Image
 
 import torch
 from torch.testing._internal.common_utils import (TestCase, run_tests)
@@ -13,6 +15,10 @@
 
 import torch.utils.data.datapipes as dp
 
+from torch.utils.data.datapipes.utils.decoder import (
+    basichandlers as decoder_basichandlers,
+    imagehandler as decoder_imagehandler)
+
 def create_temp_dir_and_files():
     # The temp dir and files within it will be released and deleted in tearDown().
     # Adding `noqa: P201` to avoid mypy's warning on not releasing the dir handle within this function.
@@ -153,6 +159,25 @@ def test_readfilesfromzip_iterable_datapipe(self):
             self.assertEqual(data_refs[i][1].read(), open(self.temp_files[i], 'rb').read())
 
 
+    def test_routeddecoder_iterable_datapipe(self):
+        """RoutedDecoder should decode each (pathname, stream) record by extension.
+
+        A 2x2 red PNG is written into the temp dir, then the dir is listed for
+        '*.png' and '*.txt' files, loaded from disk and routed through an
+        'rgb' image handler followed by the basic handlers.
+        """
+        temp_dir = self.temp_dir.name
+        temp_pngfile_pathname = os.path.join(temp_dir, "test_png.png")
+        img = Image.new('RGB', (2, 2), color='red')
+        img.save(temp_pngfile_pathname)
+        datapipe1 = dp.iter.ListDirFiles(temp_dir, ['*.png', '*.txt'])
+        datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1)
+        datapipe3 = dp.iter.RoutedDecoder(datapipe2, handlers=[decoder_imagehandler('rgb')])
+        datapipe3.add_handler(decoder_basichandlers)
+
+        for rec in datapipe3:
+            ext = os.path.splitext(rec[0])[1]
+            if ext == '.png':
+                # 'rgb' decodes to a float numpy array scaled to [0, 1]; pure red is (1, 0, 0).
+                expected = np.array([[[1., 0., 0.], [1., 0., 0.]], [[1., 0., 0.], [1., 0., 0.]]], dtype=np.single)
+                self.assertTrue(np.array_equal(rec[1], expected))
+            else:
+                # Every other matched file is expected to decode to its UTF-8 text.
+                # Use a context manager so the reference file handle is closed promptly.
+                with open(rec[0], 'rb') as f:
+                    self.assertEqual(rec[1], f.read().decode('utf-8'))
+
+
 class IDP_NoLen(IterDataPipe):
     def __init__(self, input_dp):
         super().__init__()
diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py
@@ -2,11 +2,12 @@
 from torch.utils.data.datapipes.iter.loadfilesfromdisk import LoadFilesFromDiskIterDataPipe as LoadFilesFromDisk
 from torch.utils.data.datapipes.iter.readfilesfromtar import ReadFilesFromTarIterDataPipe as ReadFilesFromTar
 from torch.utils.data.datapipes.iter.readfilesfromzip import ReadFilesFromZipIterDataPipe as ReadFilesFromZip
+from torch.utils.data.datapipes.iter.routeddecoder import RoutedDecoderIterDataPipe as RoutedDecoder
 
 # Functional DataPipe
 from torch.utils.data.datapipes.iter.batch import BatchIterDataPipe as Batch, BucketBatchIterDataPipe as BucketBatch
 from torch.utils.data.datapipes.iter.callable import CallableIterDataPipe as Callable, CollateIterDataPipe as Collate
 from torch.utils.data.datapipes.iter.sampler import SamplerIterDataPipe as Sampler
 
-__all__ = ['ListDirFiles', 'LoadFilesFromDisk', 'ReadFilesFormTar', 'ReadFilesFromZip'
+# Public names exported by a star-import; every entry must match an alias imported above.
+__all__ = ['ListDirFiles', 'LoadFilesFromDisk', 'ReadFilesFromTar', 'ReadFilesFromZip', 'RoutedDecoder',
            'Batch', 'BucketBatch', 'Callable', 'Collate', 'Sampler']
diff --git a/torch/utils/data/datapipes/iter/routeddecoder.py b/torch/utils/data/datapipes/iter/routeddecoder.py
@@ -0,0 +1,47 @@
+from torch.utils.data import IterDataPipe
+from torch.utils.data.datapipes.utils.decoder import (
+    Decoder,
+    basichandlers as decoder_basichandlers,
+    imagehandler as decoder_imagehandler)
+
+from typing import Iterable, Iterator, Union, List, Tuple, Any, Callable
+from io import BufferedIOBase
+
+class RoutedDecoderIterDataPipe(IterDataPipe):
+    r""" :class:`RoutedDecoderIterDataPipe`.
+
+    Iterable datapipe that takes ``(pathname, binary stream)`` tuples from the
+    source datapipe, runs each one through the decoder handlers, and yields
+    ``(pathname, decoded data)`` tuples.
+    args:
+        datapipe: Iterable datapipe that provides pathname and binary stream in tuples
+        handlers: user defined decoder handlers; if None or empty, the basic
+            handlers and the 'torch' image handler are used
+        length: a nominal length of the datapipe; -1 (the default) means the
+            length is unknown and `len()` raises NotImplementedError
+    """
+
+    def __init__(
+            self,
+            datapipe : Iterable[Tuple[str, BufferedIOBase]],
+            *,
+            handlers : Union[None, List[Callable]] = None,
+            length: int = -1):
+        super().__init__()
+        self.datapipe : Iterable[Tuple[str, BufferedIOBase]] = datapipe
+        self.length : int = length
+        # Fall back to the stock handler set when the caller supplies none.
+        if not handlers:
+            handlers = [decoder_basichandlers, decoder_imagehandler('torch')]
+        self.decoder = Decoder(handlers)
+
+    def add_handler(self, handler : Callable) -> None:
+        """Register one more handler with the underlying decoder."""
+        self.decoder.add_handler(handler)
+
+    def __iter__(self) -> Iterator[Tuple[str, Any]]:
+        for record in self.datapipe:
+            pathname = record[0]
+            # The decoder returns a dict keyed by pathname; pick out this record's entry.
+            decoded = self.decoder(record)
+            yield (pathname, decoded[pathname])
+
+    def __len__(self):
+        if self.length != -1:
+            return self.length
+        raise NotImplementedError
diff --git a/torch/utils/data/datapipes/utils/decoder.py b/torch/utils/data/datapipes/utils/decoder.py
@@ -0,0 +1,270 @@
+# This file takes part of its implementation from NVIDIA's webdataset:
+# https://github.com/tmbdev/webdataset/blob/master/webdataset/autodecode.py
+
+import pickle
+import re
+import os
+
+import numpy as np
+import PIL
+import PIL.Image
+import json
+import tempfile
+import io
+
+
+################################################################
+# handle basic datatypes
+################################################################
+
+
+def basichandlers(key, data):
+    """Decode `data` according to the file extension found in `key`.
+
+    The extension is everything after the last "." in `key` (the whole key
+    when it contains no dot) and is compared case-sensitively.
+
+    Supported extensions:
+        txt, text, transcript: UTF-8 decoded string
+        cls, cls2, class, count, index, inx, id: int (None if it cannot be parsed)
+        json, jsn: result of `json.loads`
+        pyd, pickle: result of `pickle.loads`
+        pt: result of `torch.load`
+
+    args:
+        key: name (e.g. a pathname) whose extension selects the decoder
+        data: raw bytes to decode
+    Returns the decoded object, or None when the extension is not handled here.
+    """
+
+    extension = re.sub(r".*[.]", "", key)
+
+    # Every membership test is made against a list of whole extensions. Testing
+    # against the raw space-separated string would be a substring match and
+    # would accept fragments such as "tex", "js" or even the empty extension.
+    if extension in "txt text transcript".split():
+        return data.decode("utf-8")
+
+    if extension in "cls cls2 class count index inx id".split():
+        try:
+            return int(data)
+        except ValueError:
+            return None
+
+    if extension in "json jsn".split():
+        return json.loads(data)
+
+    # NOTE(security): unpickling (here and via torch.load below) can execute
+    # arbitrary code; only feed this handler data from trusted sources.
+    if extension in "pyd pickle".split():
+        return pickle.loads(data)
+
+    if extension in "pt".split():
+        import torch
+        stream = io.BytesIO(data)
+        return torch.load(stream)
+
+    # if extension in "ten tb".split():
+    #     from . import tenbin
+    #     return tenbin.decode_buffer(data)
+
+    # if extension in "mp msgpack msg".split():
+    #     import msgpack
+    #     return msgpack.unpackb(data)
+
+    return None
+
+
+################################################################
+# handle images
+################################################################
+
+# Maps an imagespec name to a (array type, element type, colour mode) triple:
+#   array type   - "numpy", "torch" or "pil"
+#   element type - "uint8" or "float"; None for the PIL specs
+#   colour mode  - "l", "rgb" or "rgba"
+imagespecs = {
+    "l8": ("numpy", "uint8", "l"),
+    "rgb8": ("numpy", "uint8", "rgb"),
+    "rgba8": ("numpy", "uint8", "rgba"),
+    "l": ("numpy", "float", "l"),
+    "rgb": ("numpy", "float", "rgb"),
+    "rgba": ("numpy", "float", "rgba"),
+    "torchl8": ("torch", "uint8", "l"),
+    "torchrgb8": ("torch", "uint8", "rgb"),
+    "torchrgba8": ("torch", "uint8", "rgba"),
+    "torchl": ("torch", "float", "l"),
+    "torchrgb": ("torch", "float", "rgb"),
+    "torch": ("torch", "float", "rgb"),
+    "torchrgba": ("torch", "float", "rgba"),
+    "pill": ("pil", None, "l"),
+    "pil": ("pil", None, "rgb"),
+    "pilrgb": ("pil", None, "rgb"),
+    "pilrgba": ("pil", None, "rgba"),
+}
+
+def handle_extension(extensions, f):
+    """
+    Returns a decoder handler function for the list of extensions.
+    Extensions can be a space separated list of extensions.
+    Extensions can contain dots, in which case the corresponding number
+    of extension components must be present in the key given to f.
+    Comparisons are case insensitive.
+    Examples:
+    handle_extension("jpg jpeg", my_decode_jpg)  # invoked for any file.jpg
+    handle_extension("seg.jpg", special_case_jpg)  # invoked only for file.seg.jpg
+
+    args:
+        extensions: space separated extensions to match
+        f: function called as f(data) when the key matches one of them
+    The returned handler takes (key, data) and returns f(data) on a match,
+    otherwise None.
+    """
+
+    # Pre-split every candidate into its dot-separated components once.
+    targets = [target.split(".") for target in extensions.lower().split()]
+
+    def g(key, data):
+        components = key.lower().split(".")
+
+        for target in targets:
+            # A candidate with more components than the key can never match.
+            if len(target) > len(components):
+                continue
+
+            if components[-len(target):] == target:
+                return f(data)
+        # Only give up after ALL candidates were tried; returning inside the
+        # loop would ignore every extension after the first one.
+        return None
+    return g
+
+
+class ImageHandler:
+    """
+    Decode image data using the given `imagespec`.
+    The `imagespec` specifies whether the image is decoded
+    to numpy/torch/pil, decoded to uint8/float, and decoded
+    to l/rgb/rgba:
+
+    - l8: numpy uint8 l
+    - rgb8: numpy uint8 rgb
+    - rgba8: numpy uint8 rgba
+    - l: numpy float l
+    - rgb: numpy float rgb
+    - rgba: numpy float rgba
+    - torchl8: torch uint8 l
+    - torchrgb8: torch uint8 rgb
+    - torchrgba8: torch uint8 rgba
+    - torchl: torch float l
+    - torchrgb: torch float rgb
+    - torch: torch float rgb
+    - torchrgba: torch float rgba
+    - pill: pil None l
+    - pil: pil None rgb
+    - pilrgb: pil None rgb
+    - pilrgba: pil None rgba
+
+    Float results are the uint8 pixel values divided by 255.
+    Only keys ending in jpg/jpeg/png/ppm/pgm/pbm/pnm (case insensitive) are
+    decoded; for any other key the handler returns None.
+    """
+    def __init__(self, imagespec):
+        # Normalise the case BEFORE validating, so the stored spec is always a
+        # valid key of `imagespecs` and mixed-case names such as "RGB" work.
+        spec = imagespec.lower()
+        assert spec in imagespecs, "unknown image specification: {}".format(imagespec)
+        self.imagespec = spec
+
+    def __call__(self, key, data):
+        extension = re.sub(r".*[.]", "", key)
+        if extension.lower() not in "jpg jpeg png ppm pgm pbm pnm".split():
+            return None
+
+        atype, etype, mode = imagespecs[self.imagespec]
+
+        with io.BytesIO(data) as stream:
+            img = PIL.Image.open(stream)
+            # Force the pixel data to be read while the stream is still open.
+            img.load()
+            img = img.convert(mode.upper())
+            if atype == "pil":
+                return img
+            elif atype == "numpy":
+                result = np.asarray(img)
+                assert result.dtype == np.uint8, "numpy image array should be type uint8, but got {}".format(result.dtype)
+                if etype == "uint8":
+                    return result
+                else:
+                    return result.astype("f") / 255.0
+            elif atype == "torch":
+                import torch
+
+                result = np.asarray(img)
+                assert result.dtype == np.uint8, "numpy image array should be type uint8, but got {}".format(result.dtype)
+
+                if result.ndim == 2:
+                    # Grayscale ("l") images come back as 2-D (H, W) arrays, on
+                    # which transpose(2, 0, 1) raises. Add a leading channel
+                    # axis so the layout is (1, H, W), in line with the
+                    # channel-first layout produced for colour images below.
+                    result = result[np.newaxis, :, :]
+                else:
+                    # (H, W, C) -> (C, H, W)
+                    result = result.transpose(2, 0, 1)
+                # np.array() makes a fresh copy for torch.tensor to consume.
+                tensor = torch.tensor(np.array(result))
+                if etype == "uint8":
+                    return tensor
+                else:
+                    return tensor / 255.0
+            return None
+
+def imagehandler(imagespec):
+    """Create an :class:`ImageHandler` decoding to the given `imagespec`."""
+    handler = ImageHandler(imagespec)
+    return handler
+
+
+################################################################
+# torch video
+################################################################
+
+
+def torch_video(key, data):
+    """Decode video bytes with `torchvision.io.read_video`.
+
+    args:
+        key: name whose extension (text after the last ".") selects the decoder
+        data: raw bytes of the video file
+    Returns whatever `torchvision.io.read_video` returns, or None when the
+    extension is not one of mp4/ogv/mjpeg/avi/mov/h264/mpg/webm/wmv.
+    """
+    extension = re.sub(r".*[.]", "", key)
+    if extension not in "mp4 ogv mjpeg avi mov h264 mpg webm wmv".split():
+        return None
+
+    # add `type: ignore` to avoid mypy's warning on import missing
+    import torchvision.io  # type: ignore
+    # The reader works on a file path, so the bytes are spilled to a temp file.
+    with tempfile.TemporaryDirectory() as dirname:
+        fname = os.path.join(dirname, f"file.{extension}")
+        with open(fname, "wb") as stream:
+            stream.write(data)
+        # Read only after the write handle is closed: while it is still open
+        # the buffered bytes may not have been flushed to disk yet.
+        return torchvision.io.read_video(fname)
+
+
+################################################################
+# torchaudio
+################################################################
+
+
+def torch_audio(key, data):
+    """Decode audio bytes with `torchaudio.load`.
+
+    args:
+        key: name whose extension (text after the last ".") selects the decoder
+        data: raw bytes of the audio file
+    Returns whatever `torchaudio.load` returns, or None when the extension is
+    not one of flac/mp3/sox/wav/m4a/ogg/wma.
+    """
+    extension = re.sub(r".*[.]", "", key)
+    if extension not in ["flac", "mp3", "sox", "wav", "m4a", "ogg", "wma"]:
+        return None
+
+    # add `type: ignore` to avoid mypy's warning on import missing
+    import torchaudio  # type: ignore
+    # The loader works on a file path, so the bytes are spilled to a temp file.
+    with tempfile.TemporaryDirectory() as dirname:
+        fname = os.path.join(dirname, f"file.{extension}")
+        with open(fname, "wb") as stream:
+            stream.write(data)
+        # Load only after the write handle is closed: while it is still open
+        # the buffered bytes may not have been flushed to disk yet.
+        return torchaudio.load(fname)
+
+
+
+################################################################
+# a sample decoder
+################################################################
+
+
+class Decoder:
+    """
+    Decode key/data sets using a list of handlers.
+    For each key/data item, this iterates through the list of
+    handlers until some handler returns something other than None.
+    Each handler is called as handler(key, data).
+    """
+
+    def __init__(self, handlers):
+        # Keep a private copy: add_handler() must not grow the caller's list,
+        # and a None/empty argument becomes an empty handler list.
+        self.handlers = list(handlers) if handlers else []
+
+    def add_handler(self, handler):
+        """Append `handler` to the end of the handler list; falsy values are ignored."""
+        if not handler:
+            return
+        if not self.handlers:
+            self.handlers = [handler]
+        else:
+            self.handlers.append(handler)
+
+    def decode1(self, key, data):
+        """Decode a single item.
+
+        Falsy data (None, empty bytes, ...) is returned unchanged. Otherwise the
+        first non-None handler result is returned; if every handler returns
+        None the (already read) raw data is returned.
+        """
+        if not data:
+            return data
+
+        # if data is a stream handle, we need to read all the content before decoding
+        if isinstance(data, (io.BufferedIOBase, io.RawIOBase)):
+            data = data.read()
+
+        for f in self.handlers:
+            result = f(key, data)
+            if result is not None:
+                return result
+        return data
+
+    def decode(self, data):
+        """Decode one (key, data) tuple or an iterable of such tuples.
+
+        Returns a dict mapping each key to its decoded value; None input
+        yields an empty dict.
+        """
+        result = {}
+        # single data tuple(pathname, data stream)
+        if isinstance(data, tuple):
+            data = [data]
+
+        if data is not None:
+            for k, v in data:
+                # TODO: xinyu, figure out why Nvidia do this?
+                # Bytes stored under a key starting with "_" are only UTF-8
+                # decoded and bypass the handlers. Slicing (rather than k[0])
+                # keeps an empty key from raising IndexError.
+                if k[:1] == "_":
+                    if isinstance(v, bytes):
+                        v = v.decode("utf-8")
+                        result[k] = v
+                        continue
+                result[k] = self.decode1(k, v)
+        return result
+
+    def __call__(self, data):
+        return self.decode(data)