Skip to content

Commit

Permalink
added testcase for rename via convert
Browse files Browse the repository at this point in the history
  • Loading branch information
konstantinstadler committed Jul 11, 2024
1 parent 1fad0b2 commit 6181066
Show file tree
Hide file tree
Showing 15 changed files with 116 additions and 85 deletions.
6 changes: 2 additions & 4 deletions doc/source/notebooks/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# Pymrio contains several possibilities to convert data from one system to another.

# %% [markdown]
# The term *convert* is meant very general here, it contains
# The term *convert* is meant very general here, it contains
# - finding and extracting data based on indices across a table or an mrio(-extension) system based on name and potentially constrained by sector/region or any other specification
# - converting the names of the found indices
# - adjusting the numerical values of the data, e.g. for unit conversion or characterisation
Expand All @@ -33,7 +33,7 @@


# %% [markdown]
# Irrespectively of the table or the mrio system, the convert function always follows the same pattern.
# Irrespectively of the table or the mrio system, the convert function always follows the same pattern.
# It requires a bridge table, which contains the mapping of the indices of the source data to the indices of the target data.
# This bridge table has to follow a specific format, depending on the table to be converted.

Expand Down Expand Up @@ -62,5 +62,3 @@

# %% [markdown]
# ## Converting a pymrio extension


3 changes: 2 additions & 1 deletion doc/source/notebooks/load_save_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@
# %% [markdown]
# Here, we use the included small test MRIO system to highlight the different functions. The same functions are available for any MRIO loaded into pymrio. Expect, however, significantly decreased performance due to the size of real MRIO systems.

import os

# %%
import pymrio
import os

io = pymrio.load_test().calc_all()

Expand Down
1 change: 0 additions & 1 deletion doc/source/notebooks/stressor_characterization.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@

import pandas as pd


import pymrio
from pymrio.core.constants import PYMRIO_PATH # noqa

Expand Down
3 changes: 1 addition & 2 deletions pymrio/tools/iodownloader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
""" Utility functions for automatic downloading of public MRIO databases
"""
"""Utility functions for automatic downloading of public MRIO databases"""

import getpass
import itertools
Expand Down
2 changes: 1 addition & 1 deletion pymrio/tools/iomath.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Mathematical functions for input output calculations
"""Mathematical functions for input output calculations
All methods here should follow the functional programming paradigm
Expand Down
3 changes: 1 addition & 2 deletions pymrio/tools/iometadata.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
""" Meta data for provenance and version tracking in pymrio
"""
"""Meta data for provenance and version tracking in pymrio"""

import datetime
import getpass
Expand Down
67 changes: 30 additions & 37 deletions pymrio/tools/ioutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -999,13 +999,13 @@ def _index_regex_matcher(_dfs_idx, _method, _find_all=None, **kwargs):


def convert(df_orig, df_map, agg_func="sum", drop_not_bridged=True):
""" Convert a DataFrame to a new classification
"""Convert a DataFrame to a new classification
Parameters
----------
df_orig : pd.DataFrame
The DataFrame to process. All matching occurs on the index.
Thus stack the tables if necessary.
Stack tables if necessary.
df_map : pd.DataFrame
The DataFrame with the mapping of the old to the new classification.
Expand All @@ -1014,17 +1014,24 @@ def convert(df_orig, df_map, agg_func="sum", drop_not_bridged=True):
and one column for each new index level in the characterized result dataframe.
This is better explained with an example.
Assuming a dataframe with index names 'stressor' and 'compartment'
the characterizing dataframe would have the following structure:
Assuming an original dataframe df_orig with
index names 'stressor' and 'compartment'
the characterizing dataframe would have the following structure (column names):
stressor ... original index name
compartment ... original index name
factor ... the factor for multiplication
impact__stressor ... the new index name, replacing the previous index name "stressor"
compartment__compartment ... the new compartment, replacing the original compartment
the columsn with __ we call bridge columns, they are used to match the original index
the new dataframe with have index names based on the first part of the bridge column, in the order
factor ... the factor for multiplication/characterization
impact__stressor ... the new index name,
replacing the previous index name "stressor".
Thus here "stressor" will be renamed to "impact", and the row index
will be renamed by the entries here.
compartment__compartment ... the new compartment,
replacing the original compartment. No rename of column happens here,
still row index will be renamed as given here.
the columns with __ are called bridge columns; they are used
to match the original index. The new dataframe will have index names
based on the first part of the bridge column, in the order
in which the bridge columns are given in the mapping dataframe.
The structure "stressor" and "impact__stressor" is important.
Expand All @@ -1049,30 +1056,10 @@ def convert(df_orig, df_map, agg_func="sum", drop_not_bridged=True):
"""

# df_orig = pd.DataFrame(
# data=5,
# index=pd.MultiIndex.from_product([["em1", "em2"], ["air", "water"]]),
# columns=pd.MultiIndex.from_product([["r1", "c1"], ["r2", "c2"]]),
# )
# df_orig.columns.names = ["reg", "sec"]
# df_orig.index.names = ["em_type", "compart"]
#
# df_map = pd.DataFrame(
# columns=["em_type", "compart", "total__em_type", "factor"],
# data=[
# ["em.*", "air|water", "total_regex", 2],
# ["em1", "air", "total_sum", 2],
# ["em1", "water", "total_sum", 2],
# ["em2", "air", "total_sum", 2],
# ["em2", "water", "total_sum", 2],
# ["em1", "air", "all_air", 0.5],
# ["em2", "air", "all_air", 0.5],
# ],
# )
#
#
bridge_columns = [col for col in df_map.columns if "__" in col]
unique_new_index = df_map.loc[:, bridge_columns].drop_duplicates().set_index(bridge_columns).index
unique_new_index = (
df_map.loc[:, bridge_columns].drop_duplicates().set_index(bridge_columns).index
)

bridge_components = namedtuple("bridge_components", ["new", "orig", "raw"])
bridges = []
Expand All @@ -1086,10 +1073,14 @@ def convert(df_orig, df_map, agg_func="sum", drop_not_bridged=True):
else:
raise ValueError(f"Column {col} contains more then one '__'")
assert bridge.orig in df_map.columns, f"Column {bridge.new} not in df_map"
assert bridge.orig in df_orig.index.names, f"Column {bridge.orig} not in df_orig"
assert (
bridge.orig in df_orig.index.names
), f"Column {bridge.orig} not in df_orig"
bridges.append(bridge)

orig_index_not_bridged = [ix for ix in df_orig.index.names if ix not in [b.orig for b in bridges]]
orig_index_not_bridged = [
ix for ix in df_orig.index.names if ix not in [b.orig for b in bridges]
]

df_map = df_map.set_index(bridge_columns)
res_collector = []
Expand Down Expand Up @@ -1125,7 +1116,7 @@ def convert(df_orig, df_map, agg_func="sum", drop_not_bridged=True):
)

# CONT: test cases for wrong input
# CONT: test cases for just rename
# CONT: docs for just rename (see tests already done)
# CONT: docs with test cases
res_collector.append(
df_collected.groupby(by=df_collected.index.names).agg(agg_func)
Expand All @@ -1137,7 +1128,9 @@ def convert(df_orig, df_map, agg_func="sum", drop_not_bridged=True):
all_result = all_result.reset_index(level=orig_index_not_bridged, drop=True)
else:
# move the not bridged index levels to the end of the index
new_index = [ix for ix in all_result.index.names if ix not in orig_index_not_bridged]
new_index = [
ix for ix in all_result.index.names if ix not in orig_index_not_bridged
]
all_result = all_result.reorder_levels(new_index + orig_index_not_bridged)

agg_all = all_result.groupby(by=all_result.index.names).agg(agg_func)
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3 :: Only",
"License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)",
"Development Status :: 4 - Beta",
Expand Down
6 changes: 4 additions & 2 deletions tests/test_aggregation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Tests the aggregation functionality in pymrio
"""Tests the aggregation functionality in pymrio
This only test the top-level aggregation function.
For the low-level function 'build_agg_vec' and 'build_agg_matrix'
Expand Down Expand Up @@ -211,7 +211,9 @@ def test_wrong_inputs():
with pytest.raises(ValueError) as VA_region_name:
reg_agg = range(len(io.get_regions()))
_ = io.aggregate(
region_agg=reg_agg, region_names=["a", "b"], inplace=False # noqa
region_agg=reg_agg,
region_names=["a", "b"],
inplace=False, # noqa
)
assert "region aggregation" in str(VA_region_name.value).lower()

Expand Down
3 changes: 1 addition & 2 deletions tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
""" Testing core functionality of pymrio
"""
"""Testing core functionality of pymrio"""

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion tests/test_integration.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Testing functions for the full run based on
"""Testing functions for the full run based on
the small MRIO given within pymrio.
This tests the full computation and fileio.
Expand Down
2 changes: 1 addition & 1 deletion tests/test_math.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" test cases for all mathematical functions """
"""test cases for all mathematical functions"""

import os
import sys
Expand Down
20 changes: 10 additions & 10 deletions tests/test_outputs.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
""" Test for producing graphical outputs
"""Test for producing graphical outputs
The report functionality is tested separately
in test_integration
The report functionality is tested separately
in test_integration
Note
----
Note
----
Here we use the values returned from the plotted graph
for testing. Regression tests against plotted graphs,
as provided by image_comparison decorator of matplotlib,
are not used since this is deprecated and also not consistent
across different plotting engines.
Here we use the values returned from the plotted graph
for testing. Regression tests against plotted graphs,
as provided by image_comparison decorator of matplotlib,
are not used since this is deprecated and also not consistent
across different plotting engines.
"""

Expand Down
2 changes: 1 addition & 1 deletion tests/test_parsers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Tests the parsing of different MRIOs """
"""Tests the parsing of different MRIOs"""

import os
import sys
Expand Down
80 changes: 60 additions & 20 deletions tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" test cases for all util functions """
"""test cases for all util functions"""

import os
import string
Expand All @@ -20,13 +20,13 @@
from pymrio.tools.ioutil import ( # noqa
build_agg_matrix,
build_agg_vec,
convert,
diagonalize_blocks,
filename_from_url,
find_first_number,
index_contains,
index_fullmatch,
index_match,
convert,
set_block,
sniff_csv_format,
)
Expand Down Expand Up @@ -355,34 +355,68 @@ def test_util_regex():
assert len(df_none_match_index) == 0



def test_convert_rename():
""" Testing the renaming of one table"""
"""Testing the renaming of one table"""

to_char = pd.DataFrame(
data=5,
data=99.0,
index=pd.MultiIndex.from_product([["em1", "em2", "emA"], ["air", "water"]]),
columns=pd.MultiIndex.from_product([["r1", "c1"], ["r2", "c2"]]),
)

to_char.columns.names = ["reg", "sec"]
to_char.index.names = ["em_type", "compart"]


rename_bridge = pd.DataFrame(
columns=["em_type", "compart", "stressor__em_type", "factor"],
rename_bridge_simple = pd.DataFrame(
columns=["em_type", "stressor__em_type", "factor"],
data=[
["em1", "air|water", "emission-1", 2],
# ["em1", "air", "total_sum", 2],
# ["em1", "water", "total_sum", 2],
# ["em2", "air", "total_sum", 2],
# ["em2", "water", "total_sum", 2],
# ["em1", "air", "all_air", 0.5],
# ["em2", "air", "all_air", 0.5],
["em1", "emission-1", 1],
["em2", "emission2", 1],
["emA", "emission A", 1],
],
)

char_res = convert(to_char, rename_bridge)
char_res_keep_comp = convert(to_char, rename_bridge_simple, drop_not_bridged=False)
assert all(char_res_keep_comp.columns == to_char.columns)
assert all(
char_res_keep_comp.index.get_level_values("compart")
== to_char.index.get_level_values("compart")
)
npt.assert_allclose(char_res_keep_comp.values, to_char.values)

pdt.assert_index_equal(
char_res_keep_comp.index.get_level_values("stressor"),
pd.Index(
[
"emission A",
"emission A",
"emission-1",
"emission-1",
"emission2",
"emission2",
],
dtype="object",
name="stressor",
),
)

char_res_agg_comp = convert(to_char, rename_bridge_simple, drop_not_bridged=True)

assert all(char_res_agg_comp.columns == to_char.columns)
assert char_res_agg_comp.sum().sum() == to_char.sum().sum()

pdt.assert_index_equal(
char_res_agg_comp.index,
pd.Index(
[
"emission A",
"emission-1",
"emission2",
],
dtype="object",
name="stressor",
),
)


def test_convert_characterize():
Expand Down Expand Up @@ -439,20 +473,26 @@ def test_convert_characterize():
exp_res1B = pd.DataFrame(
columns=to_char.columns,
index=pd.MultiIndex.from_tuples(
[("all_air", "air"), ("total_regex", "air"), ("total_regex", "water"), ("total_sum", "air"), ("total_sum", "water")]),
[
("all_air", "air"),
("total_regex", "air"),
("total_regex", "water"),
("total_sum", "air"),
("total_sum", "water"),
]
),
data=[
[5, 5, 5, 5],
[20, 20, 20, 20],
[20, 20, 20, 20],
[20, 20, 20, 20],
[20, 20, 20, 20],
]
],
)
exp_res1B = exp_res1B.astype(float)
exp_res1B.index.names = res1B.index.names

pdt.assert_frame_equal(res1B, exp_res1B)

pdt.assert_frame_equal(res1B, exp_res1B)

# TEST2 with impact per compartment (two index levels in the result)

Expand Down

0 comments on commit 6181066

Please sign in to comment.