all extension convert implemented

IndEcol · Aug 15, 2024 · 6c131d8 · 6c131d8
1 parent 6ae4ba2
commit 6c131d8
Show file tree

Hide file tree

Showing 2 changed files with 162 additions and 10 deletions.
diff --git a/pymrio/core/mriosystem.py b/pymrio/core/mriosystem.py
@@ -3245,23 +3245,144 @@ def remove_extension(self, ext):
 
         return self
 
-    def convert_extensions(
-        self, df_map, extension_name, agg_func="sum", drop_not_bridged_index=True
+    def extension_convert(
+        self,
+        df_map,
+        extension_name,
+        extension_col_name="extension",
+        agg_func="sum",
+        drop_not_bridged_index=True,
+        unit_column_orig="unit_orig",
+        unit_column_new="unit_new",
+        ignore_columns=None,
     ):
-        """Builds a new extension based on conversion of existing ones
+        """Apply the convert function to all extensions
+
+        Internally that calls the Extension.convert function for all extensions.
+
+        See the Extension.convert function for more details.
+        TODO: put all details here
+
+
+        Parameters
+        ----------
+
+        df_map : pd.DataFrame
+            The DataFrame with the mapping of the old to the new classification.
+            This requires a specific structure:
+
+            - Constraining data (e.g. stressors, regions, sectors) can be
+            either in the index or columns of df_orig. The need to have the same
+            name as the named index or column in df_orig. The algorithm searches
+            for matching data in df_orig based on all constraining columns in df_map.
+
+            - Bridge columns are columns with '__' in the name. These are used to
+            map (bridge) some/all of the constraining columns in df_orig to the new
+            classification.
+
+            - One column "factor", which gives the multiplication factor for the
+            conversion. If it is missing, it is set to 1.
+
+
+            This is better explained with an example.
+            Assuming a original dataframe df_orig with
+            index names 'stressor' and 'compartment' and column name 'region',
+            the characterizing dataframe could have the following structure (column names):
+
+            stressor ... original index name
+            compartment ... original index name
+            region ... original column name
+            factor ... the factor for multiplication/characterization
+                If no factor is given, the factor is assumed to be 1.
+                This can be used, to simplify renaming/aggregation mappings.
+            impact__stressor ... the new index name,
+                replacing the previous index name "stressor".
+                Thus here "stressor" will be renamed to "impact", and the row index
+                will be renamed by the entries here.
+            compartment__compartment ... the new compartment,
+                replacing the original compartment. No rename of column happens here,
+                still row index will be renamed as given here.
+
+            the columns with __ are called bridge columns, they are used
+            to match the original index. The new dataframe with have index names
+            based on the first part of the bridge column, in the order
+            in which the bridge columns are given in the mapping dataframe.
 
-        Calls convert function based on data given in df_map
+            "region" is constraining column, these can either be for the index or column
+            in df_orig. In case both exist, the one in index is preferred.
+
+        extension_name: str
+            The name of the new extension returned
 
-        Difference to df_map: runs across all extensions.
-        Internally, this call extension_extract through all extensions
-        and then calls the convert function on the temporarily extracted
-        extension.
+        extension_col_name : str, optional
+            Name of the column specifying the extension name in df_map.
+            The entry in df_map here can either be the name returned by Extension.name or the
+            name of the Extension instance.
+            Default: 'extension'
 
-        Switch: also return the extracted raw_data
+        agg_func : str or func
+            the aggregation function to use for multiple matchings (summation by default)
 
-        df_map difference to other df_map: needs a column with the extension name
+        drop_not_bridged_index : bool, optional
+            What to do with index levels in df_orig not appearing in the bridge columns.
+            If True, drop them after aggregation across these, if False,
+            pass them through to the result.
+
+            *Note:* Only index levels will be dropped, not columns.
+
+            In case some index levels need to be dropped, and some not
+            make a bridge column for the ones to be dropped and map all to the same name.
+            Then drop this index level after the conversion.
+
+        unit_column_orig : str, optional
+            Name of the column in df_map with the original unit.
+            This will be used to check if the unit matches the original unit in the extension.
+            Default is "unit_orig", if None, no check is performed.
+
+        unit_column_new : str, optional
+            Name of the column in df_map with the new unit to be assigned to the new extension.
+            Default is "unit_new", if None same unit as in df_orig TODO EXPLAIN BETTER, THINK WARNING
+
+        ignore_columns : list, optional
+            List of column names in df_map which should be ignored.
+            These could be columns with additional information, etc.
+            The unit columns given in unit_column_orig and unit_column_new
+            are ignored by default.
+
+
+        TODO: remove after explain
+        Extension for extensions:
+        extension ... extension name
+        unit_orig ... the original unit (optional, for double check with the unit)
+        unit_new ... the new unit to be set for the extension
 
         """
+        if not ignore_columns:
+            ignore_columns = []
+        ignore_columns.append(extension_col_name)
+
+        extensions_to_consider = df_map.loc[:, extension_col_name].unique()
+
+        gather = dict()
+
+        for ext in extensions_to_consider:
+            gather.update(
+                self._apply_extension_method(
+                    extensions=ext,
+                    method="convert",
+                    df_map=df_map[df_map[extension_col_name] == ext],
+                    agg_func=agg_func,
+                    extension_name=extension_name,
+                    drop_not_bridged_index=drop_not_bridged_index,
+                    unit_column_orig=unit_column_orig,
+                    unit_column_new=unit_column_new,
+                    ignore_columns=ignore_columns,
+                )
+            )
+
+        result_ext = concate_extension(list(gather.values()), name=extension_name)
+
+        return result_ext
 
         # look for extension name in df_map
         # make unique extension list, and call extension_extract for all

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -554,6 +554,7 @@ def test_characterize_extension(fix_testmrio):
 
 
 def test_extension_convert(fix_testmrio):
+    """Testing the convert function within extensions"""
     tt_pre = fix_testmrio.testmrio.copy()
     df_map = pd.DataFrame(
         columns=[
@@ -638,6 +639,36 @@ def test_extension_convert(fix_testmrio):
     assert tt_post.post_calc.unit.loc["water_emissions", "unit"] == "g"
 
 
+def test_extension_convert_full(fix_testmrio):
+    """Testing the convert function called from the full MRIO"""
+
+    tt_pre = fix_testmrio.testmrio.copy()
+
+    df_map = pd.DataFrame(
+        columns=[
+            "extension",
+            "stressor",
+            "compartment",
+            "total__stressor",
+            "factor",
+            "unit_orig",
+            "unit_new",
+        ],
+        data=[
+            ["Emissions", "emis.*", "air|water", "total_sum_tonnes", 1e-3, "kg", "t"],
+            ["emissions", "emission_type[1|2]", ".*", "total_sum", 1, "kg", "kg"],
+            ["emissions", "emission_type1", ".*", "air_emissions", 1e-3, "kg", "t"],
+            ["Emissions", "emission_type2", ".*", "water_emissions", 1000, "kg", "g"],
+        ],
+    )
+
+    x = tt_pre.extension_convert(df_map, extension_name="emissions_new_pre_calc")
+    # CONT:
+    # write test with units
+    # make a second extensions are check running over 2
+    # cleanup docstrings and write docs
+
+
 def test_extension_convert_test_unit_fail(fix_testmrio):
     df_fail1 = pd.DataFrame(
         columns=[