Fix/1110 (openml#1117)

PGijsbers · web-flow · commit b4c868a791f3 · 2021-10-28T09:49:44.000+02:00
Update the function signatures of create_study and create_benchmark_suite, and allow for empty studies (i.e. with no runs).
diff --git a/doc/progress.rst b/doc/progress.rst
@@ -8,7 +8,7 @@ Changelog
 
 0.13.0
 ~~~~~~
-
+ * FIX#1110: Make arguments to ``create_study`` and ``create_benchmark_suite`` that are defined as optional by the OpenML XSD actually optional.
  * MAIN#1088: Do CI for Windows on Github Actions instead of Appveyor.
 
 
diff --git a/openml/study/functions.py b/openml/study/functions.py
@@ -3,7 +3,6 @@
 from typing import cast, Dict, List, Optional, Union
 import warnings
 
-import dateutil.parser
 import xmltodict
 import pandas as pd
 
@@ -94,7 +93,6 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy:
     description = result_dict["oml:description"]
     status = result_dict["oml:status"]
     creation_date = result_dict["oml:creation_date"]
-    creation_date_as_date = dateutil.parser.parse(creation_date)
     creator = result_dict["oml:creator"]
 
     # tags is legacy. remove once no longer needed.
@@ -106,35 +104,18 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy:
                 current_tag["window_start"] = tag["oml:window_start"]
             tags.append(current_tag)
 
-    if "oml:data" in result_dict:
-        datasets = [int(x) for x in result_dict["oml:data"]["oml:data_id"]]
-    else:
-        raise ValueError("No datasets attached to study {}!".format(id_))
-    if "oml:tasks" in result_dict:
-        tasks = [int(x) for x in result_dict["oml:tasks"]["oml:task_id"]]
-    else:
-        raise ValueError("No tasks attached to study {}!".format(id_))
+    def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]:
+        if result_dict.get(key) is not None:
+            return [int(oml_id) for oml_id in result_dict[key][subkey]]
+        return None
 
-    if main_entity_type in ["runs", "run"]:
+    datasets = get_nested_ids_from_result_dict("oml:data", "oml:data_id")
+    tasks = get_nested_ids_from_result_dict("oml:tasks", "oml:task_id")
 
-        if "oml:flows" in result_dict:
-            flows = [int(x) for x in result_dict["oml:flows"]["oml:flow_id"]]
-        else:
-            raise ValueError("No flows attached to study {}!".format(id_))
-        if "oml:setups" in result_dict:
-            setups = [int(x) for x in result_dict["oml:setups"]["oml:setup_id"]]
-        else:
-            raise ValueError("No setups attached to study {}!".format(id_))
-        if "oml:runs" in result_dict:
-            runs = [
-                int(x) for x in result_dict["oml:runs"]["oml:run_id"]
-            ]  # type: Optional[List[int]]
-        else:
-            if creation_date_as_date < dateutil.parser.parse("2019-01-01"):
-                # Legacy studies did not require runs
-                runs = None
-            else:
-                raise ValueError("No runs attached to study {}!".format(id_))
+    if main_entity_type in ["runs", "run"]:
+        flows = get_nested_ids_from_result_dict("oml:flows", "oml:flow_id")
+        setups = get_nested_ids_from_result_dict("oml:setups", "oml:setup_id")
+        runs = get_nested_ids_from_result_dict("oml:runs", "oml:run_id")
 
         study = OpenMLStudy(
             study_id=study_id,
@@ -177,9 +158,9 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy:
 def create_study(
     name: str,
     description: str,
-    run_ids: List[int],
-    alias: Optional[str],
-    benchmark_suite: Optional[int],
+    run_ids: Optional[List[int]] = None,
+    alias: Optional[str] = None,
+    benchmark_suite: Optional[int] = None,
 ) -> OpenMLStudy:
     """
     Creates an OpenML study (collection of data, tasks, flows, setups and run),
@@ -188,16 +169,19 @@ def create_study(
 
     Parameters
     ----------
-    alias : str (optional)
-        a string ID, unique on server (url-friendly)
     benchmark_suite : int (optional)
         the benchmark suite (another study) upon which this study is ran.
     name : str
         the name of the study (meta-info)
     description : str
         brief description (meta-info)
-    run_ids : list
-        a list of run ids associated with this study
+    run_ids : list, optional
+        a list of run ids associated with this study;
+        these can also be added later with ``attach_to_study``.
+    alias : str (optional)
+        a string ID, unique on server (url-friendly)
+    benchmark_suite : int (optional)
+        the ID of the suite for which this study contains run results
 
     Returns
     -------
@@ -217,28 +201,29 @@ def create_study(
         data=None,
         tasks=None,
         flows=None,
-        runs=run_ids,
+        runs=run_ids if run_ids != [] else None,
         setups=None,
     )
 
 
 def create_benchmark_suite(
-    name: str, description: str, task_ids: List[int], alias: Optional[str],
+    name: str, description: str, task_ids: List[int], alias: Optional[str] = None,
 ) -> OpenMLBenchmarkSuite:
     """
     Creates an OpenML benchmark suite (collection of entity types, where
     the tasks are the linked entity)
 
     Parameters
     ----------
-    alias : str (optional)
-        a string ID, unique on server (url-friendly)
     name : str
         the name of the study (meta-info)
     description : str
         brief description (meta-info)
     task_ids : list
         a list of task ids associated with this study
+        (more can be added later with ``attach_to_suite``).
+    alias : str (optional)
+        a string ID, unique on server (url-friendly)
 
     Returns
     -------
diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py
@@ -1,4 +1,5 @@
 # License: BSD 3-Clause
+from typing import Optional, List
 
 import openml
 import openml.study
@@ -114,6 +115,31 @@ def test_publish_benchmark_suite(self):
         self.assertEqual(study_downloaded.status, "deactivated")
         # can't delete study, now it's not longer in preparation
 
+    def _test_publish_empty_study_is_allowed(self, explicit: bool):
+        runs: Optional[List[int]] = [] if explicit else None
+        kind = "explicit" if explicit else "implicit"
+
+        study = openml.study.create_study(
+            name=f"empty-study-{kind}",
+            description=f"a study with no runs attached {kind}ly",
+            run_ids=runs,
+        )
+
+        study.publish()
+        TestBase._mark_entity_for_removal("study", study.id)
+        TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], study.id))
+
+        self.assertGreater(study.id, 0)
+        study_downloaded = openml.study.get_study(study.id)
+        self.assertEqual(study_downloaded.main_entity_type, "run")
+        self.assertIsNone(study_downloaded.runs)
+
+    def test_publish_empty_study_explicit(self):
+        self._test_publish_empty_study_is_allowed(explicit=True)
+
+    def test_publish_empty_study_implicit(self):
+        self._test_publish_empty_study_is_allowed(explicit=False)
+
     @pytest.mark.flaky()
     def test_publish_study(self):
         # get some random runs to attach
@@ -214,7 +240,7 @@ def test_study_attach_illegal(self):
 
     def test_study_list(self):
         study_list = openml.study.list_studies(status="in_preparation")
-        # might fail if server is recently resetted
+        # might fail if server is recently reset
         self.assertGreaterEqual(len(study_list), 2)
 
     def test_study_list_output_format(self):