
Commit 015cabf

lixinyu authored and facebook-github-bot committed
move GroupByFilename Dataset to DataPipe (pytorch#51709)
Summary: Pull Request resolved: pytorch#51709

Move GroupByFilename Dataset to DataPipe

Test Plan: Imported from OSS

Reviewed By: ejguan

Differential Revision: D26263585

Pulled By: glaringlee

fbshipit-source-id: 00e3e13b47b89117f1ccfc4cd6239940a40d071e
1 parent 482b94a commit 015cabf

File tree: 3 files changed (+144, -1 lines changed)

test/test_datapipe.py (+33)
@@ -19,6 +19,7 @@
     basichandlers as decoder_basichandlers,
     imagehandler as decoder_imagehandler)
 
+
 def create_temp_dir_and_files():
     # The temp dir and files within it will be released and deleted in tearDown().
     # Adding `noqa: P201` to avoid mypy's warning on not releasing the dir handle within this function.
@@ -178,6 +179,38 @@ def test_routeddecoder_iterable_datapipe(self):
         self.assertTrue(rec[1] == open(rec[0], 'rb').read().decode('utf-8'))
 
 
+    def test_groupbykey_iterable_datapipe(self):
+        temp_dir = self.temp_dir.name
+        temp_tarfile_pathname = os.path.join(temp_dir, "test_tar.tar")
+        file_list = [
+            "a.png", "b.png", "c.json", "a.json", "c.png", "b.json", "d.png",
+            "d.json", "e.png", "f.json", "g.png", "f.png", "g.json", "e.json",
+            "h.txt", "h.json"]
+        with tarfile.open(temp_tarfile_pathname, "w:gz") as tar:
+            for file_name in file_list:
+                file_pathname = os.path.join(temp_dir, file_name)
+                with open(file_pathname, 'w') as f:
+                    f.write('12345abcde')
+                tar.add(file_pathname)
+
+        datapipe1 = dp.iter.ListDirFiles(temp_dir, '*.tar')
+        datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1)
+        datapipe3 = dp.iter.ReadFilesFromTar(datapipe2)
+        datapipe4 = dp.iter.GroupByKey(datapipe3, group_size=2)
+
+        expected_result = [("a.png", "a.json"), ("c.png", "c.json"), ("b.png", "b.json"), ("d.png", "d.json"), (
+            "f.png", "f.json"), ("g.png", "g.json"), ("e.png", "e.json"), ("h.json", "h.txt")]
+
+        count = 0
+        for rec, expected in zip(datapipe4, expected_result):
+            count = count + 1
+            self.assertEqual(os.path.basename(rec[0][0]), expected[0])
+            self.assertEqual(os.path.basename(rec[1][0]), expected[1])
+            self.assertEqual(rec[0][1].read(), b'12345abcde')
+            self.assertEqual(rec[1][1].read(), b'12345abcde')
+        self.assertEqual(count, 8)
+
+
 class IDP_NoLen(IterDataPipe):
     def __init__(self, input_dp):
         super().__init__()
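The expected pairs follow default_sort_data_fn from the new groupbykey.py below: a non-text extension such as .png sorts ahead of a text extension, while .txt and .json both count as text extensions and therefore fall back to alphabetical order, which is why the last group is ("h.json", "h.txt"). A minimal sketch of that behavior (the None payloads are placeholders):

from torch.utils.data.datapipes.iter.groupbykey import default_sort_data_fn

# .png is not a text extension, so it sorts ahead of .json
print(default_sort_data_fn([("a.json", None), ("a.png", None)]))
# -> [('a.png', None), ('a.json', None)]

# .txt and .json are both text extensions, so plain alphabetical order applies
print(default_sort_data_fn([("h.txt", None), ("h.json", None)]))
# -> [('h.json', None), ('h.txt', None)]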

torch/utils/data/datapipes/iter/__init__.py (+2, -1)
@@ -3,11 +3,12 @@
 from torch.utils.data.datapipes.iter.readfilesfromtar import ReadFilesFromTarIterDataPipe as ReadFilesFromTar
 from torch.utils.data.datapipes.iter.readfilesfromzip import ReadFilesFromZipIterDataPipe as ReadFilesFromZip
 from torch.utils.data.datapipes.iter.routeddecoder import RoutedDecoderIterDataPipe as RoutedDecoder
+from torch.utils.data.datapipes.iter.groupbykey import GroupByKeyIterDataPipe as GroupByKey
 
 # Functional DataPipe
 from torch.utils.data.datapipes.iter.batch import BatchIterDataPipe as Batch, BucketBatchIterDataPipe as BucketBatch
 from torch.utils.data.datapipes.iter.callable import CallableIterDataPipe as Callable, CollateIterDataPipe as Collate
 from torch.utils.data.datapipes.iter.sampler import SamplerIterDataPipe as Sampler
 
-__all__ = ['ListDirFiles', 'LoadFilesFromDisk', 'ReadFilesFormTar', 'ReadFilesFromZip', 'RoutedDecoder',
+__all__ = ['ListDirFiles', 'LoadFilesFromDisk', 'ReadFilesFormTar', 'ReadFilesFromZip', 'RoutedDecoder', 'GroupByKey',
            'Batch', 'BucketBatch', 'Callable', 'Collate', 'Sampler']
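With the GroupByKey alias exported here, the pipe is reachable through the dp.iter namespace used in the test above. A small sketch, assuming dp is bound the same way test_datapipe.py binds it (the toy in-memory source is hypothetical; a real pipeline would feed (pathname, stream) tuples):

import torch.utils.data.datapipes as dp

# a toy in-memory source standing in for a real (pathname, stream) datapipe
source = [("x.png", b""), ("x.json", b"")]
grouped = dp.iter.GroupByKey(source, group_size=2)
print(list(grouped))  # [[('x.png', b''), ('x.json', b'')]]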
torch/utils/data/datapipes/iter/groupbykey.py (+109, new file)
@@ -0,0 +1,109 @@
+from torch.utils.data import IterDataPipe
+from typing import Dict, List, Tuple, Any, Callable, Iterable, Iterator, Union
+
+import os
+import functools
+
+
+# The default group key is the file pathname without the extension.
+# It assumes the passed-in data is a tuple whose first item is the file's pathname.
+def default_group_key_fn(dataitem: Tuple[str, Any]):
+    return os.path.splitext(dataitem[0])[0]
+
+
+def default_sort_data_fn(datalist: List[Tuple[str, Any]]):
+    txt_ext = ['.json', '.jsn', '.txt', '.text']
+
+    def cmp_fn(a: Tuple[str, Any], b: Tuple[str, Any]):
+        a_is_txt = os.path.splitext(a[0])[1] in txt_ext
+        b_is_txt = os.path.splitext(b[0])[1] in txt_ext
+
+        # if a is a text file but b is not, b goes first
+        if a_is_txt and not b_is_txt:
+            return 1
+        # if a is not a text file but b is, a goes first
+        if not a_is_txt and b_is_txt:
+            return -1
+        # if a and b are both text files, or both not, sort in alphabetical order
+        if a[0] < b[0]:
+            return -1
+        elif a[0] > b[0]:
+            return 1
+        return 0
+
+    return sorted(datalist, key=functools.cmp_to_key(cmp_fn))
+
+
+class GroupByKeyIterDataPipe(IterDataPipe):
+    r""" :class:`GroupByKeyIterDataPipe`.
+
+    Iterable datapipe that groups data from the input iterable by keys generated from `group_key_fn`,
+    and yields a list with `group_size` items in it; each item in the list is a tuple of key and data.
+
+    args:
+        datapipe: Iterable datapipe that provides the data (typically tuples of a str key (e.g. pathname) and a data stream)
+        group_size: the size of each group
+        max_buffer_size: the maximum size of the stream buffer, which holds data that has been iterated but not yet grouped
+        group_key_fn: a function used to generate the group key from the data in the input datapipe
+        sort_data_fn: a function used to sort the grouped data before it is yielded
+        length: a nominal length of the datapipe
+    """
+
+    def __init__(
+            self,
+            datapipe: Iterable[Tuple[str, Any]],
+            *,
+            group_size: int,
+            max_buffer_size: Union[int, None] = None,
+            group_key_fn: Callable = default_group_key_fn,
+            sort_data_fn: Callable = default_sort_data_fn,
+            length: int = -1):
+        super().__init__()
+
+        assert group_size > 0
+        self.datapipe: Iterable[Tuple[str, Any]] = datapipe
+        self.group_size: int = group_size
+
+        # the default max buffer size is group_size * 10
+        self.max_buffer_size = max_buffer_size if max_buffer_size is not None else group_size * 10
+        assert self.max_buffer_size >= self.group_size
+
+        self.group_key_fn: Callable = group_key_fn
+        self.sort_data_fn: Callable = sort_data_fn
+        self.curr_buffer_size: int = 0
+        self.stream_buffer: Dict[str, List[Tuple[str, Any]]] = {}
+        self.length: int = length
+
+
+    def __iter__(self) -> Iterator[list]:
+        if self.group_size == 1:
+            for data in self.datapipe:
+                yield [data]
+        else:
+            for data in self.datapipe:
+                key = self.group_key_fn(data)
+                if key not in self.stream_buffer:
+                    self.stream_buffer[key] = []
+                res = self.stream_buffer[key]
+                res.append(data)
+                if len(res) == self.group_size:
+                    yield self.sort_data_fn(res)
+                    del self.stream_buffer[key]
+                    self.curr_buffer_size = self.curr_buffer_size - self.group_size + 1
+                else:
+                    if self.curr_buffer_size == self.max_buffer_size:
+                        raise OverflowError(
+                            "stream_buffer overflowed; please adjust the order of data "
+                            "in the input datapipe or increase the buffer size!")
+                    self.curr_buffer_size = self.curr_buffer_size + 1
+
+        if self.curr_buffer_size > 0:
+            msg = "Not able to group [{}] with group size {}.".format(
+                ','.join([v[0] for _, vs in self.stream_buffer.items() for v in vs]), str(self.group_size))
+            raise RuntimeError(msg)
+
+
+    def __len__(self):
+        if self.length == -1:
+            raise NotImplementedError
+        return self.length
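The buffer bookkeeping in __iter__ is easiest to see on an out-of-order source: items wait in stream_buffer until group_size entries share a key, and anything still unmatched when the source is exhausted raises the RuntimeError at the bottom of __iter__. A minimal sketch (the filenames and payloads are made up for illustration):

from torch.utils.data.datapipes.iter.groupbykey import GroupByKeyIterDataPipe

# hypothetical out-of-order source of (pathname, payload) tuples
source = [("a.png", 0), ("b.png", 1), ("a.json", 2), ("b.json", 3)]
pipe = GroupByKeyIterDataPipe(source, group_size=2)
for group in pipe:
    print([name for name, _ in group])
# -> ['a.png', 'a.json']
# -> ['b.png', 'b.json']

# an item whose partner never arrives surfaces once the source is exhausted:
try:
    list(GroupByKeyIterDataPipe([("only.png", 0)], group_size=2))
except RuntimeError as e:
    print(e)  # Not able to group [only.png] with group size 2.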
