From f5e7aff6a72df2f511b8f9eaed2476dbe454b513 Mon Sep 17 00:00:00 2001 From: Konstantin Stadler Date: Tue, 16 Jul 2024 22:25:29 +0200 Subject: [PATCH] linting --- doc/source/notebooks/convert.py | 212 ++++++++++++++++++-------------- pymrio/__init__.py | 2 +- pymrio/tools/ioutil.py | 26 ++-- tests/test_util.py | 127 +++++++++++-------- 4 files changed, 207 insertions(+), 160 deletions(-) diff --git a/doc/source/notebooks/convert.py b/doc/source/notebooks/convert.py index 52ac4e72..35e0f6c7 100644 --- a/doc/source/notebooks/convert.py +++ b/doc/source/notebooks/convert.py @@ -16,24 +16,24 @@ # # Convert and Characterize MRIO satellite accounts and results # %% [markdown] -# Here we discuss the possibilities for converting MRIO satellite accounts (Extensions) +# Here we discuss the possibilities for converting MRIO satellite accounts (Extensions) # and results. # The term *convert* is used very broadly here, it includes the following tasks: # # - renaming the index names of results/extensions -# - adjusting the numerical values of the data, +# - adjusting the numerical values of the data, # e.g. for unit conversion or characterisation # - finding and extracting data based on indicies across a table or an mrio(-extension). -# This can be system based on name and potentially constrained by sector/region +# This can be system based on name and potentially constrained by sector/region # or any other specification. # - Aggregation/Summation of satellite accounts # - Characterization of stressors to impact categories # -# We will cover each of these points in the examples below. -# We will start with applying the conversion to a single table +# We will cover each of these points in the examples below. +# We will start with applying the conversion to a single table # and then cover the conversion of a full MRIO extension. # -# For the connected topic of *Aggregation of MRIOs* +# For the connected topic of *Aggregation of MRIOs* # see the [Aggregation](./aggregation_examples.ipynb) page. # %% [markdown] @@ -44,15 +44,15 @@ # the indices of the source data to the indices of the target data. # %% [markdown] -# This tables requires headers (columns) corresponding to the column headers +# This tables requires headers (columns) corresponding to the column headers # of the source data as well as bridge columns which specify the new target index. -# The later are indicated by "NewIndex__OldIndex" - **the important part are -# the two underscore in the column name**. Another column named "factor" specifies -# the multiplication factor for the conversion. +# The later are indicated by "NewIndex__OldIndex" - **the important part are +# the two underscore in the column name**. Another column named "factor" specifies +# the multiplication factor for the conversion. # Finally, additional columns can be used to indicate units and other information. # %% [markdown] -# All mapping occurs on the index of the original data. +# All mapping occurs on the index of the original data. # Thus the data to be converted needs to be in long matrix format, at least for the index # levels which are considered in the conversion. # TODO: In case conversion happens on MRIO Extensions this conversion happens automatically. @@ -69,35 +69,43 @@ # %% import pandas as pd + import pymrio ghg_result = pd.DataFrame( -columns=["Region1", "Region2", "Region3"], -index=pd.MultiIndex.from_tuples( - [ - ("Carbon Dioxide", "Air"), - ("Methane", "air"), - ] -), -data=[[5, 6, 7], [0.5, 0.6, 0.7]], + columns=["Region1", "Region2", "Region3"], + index=pd.MultiIndex.from_tuples( + [ + ("Carbon Dioxide", "Air"), + ("Methane", "air"), + ] + ), + data=[[5, 6, 7], [0.5, 0.6, 0.7]], ) ghg_result.index.names = ["stressor", "compartment"] ghg_result.columns.names = ["region"] # %% [markdown] -# Our first task here is to rename to the chemical names of the stressors +# Our first task here is to rename to the chemical names of the stressors # and fix the compartment spelling. -# %% +# %% ghg_map = pd.DataFrame( -columns=["stressor", "compartment", "chem_stressor__stressor", "compartment__compartment", "factor"], -data=[["Carbon Dioxide", "[A|a]ir", "CO2", "Air", 1.0], - ["Methane", "[A|a]ir", "CH4", "Air", 1.0] - ], + columns=[ + "stressor", + "compartment", + "chem_stressor__stressor", + "compartment__compartment", + "factor", + ], + data=[ + ["Carbon Dioxide", "[A|a]ir", "CO2", "Air", 1.0], + ["Methane", "[A|a]ir", "CH4", "Air", 1.0], + ], ) ghg_map -# %% +# %% ghg_new = pymrio.convert(ghg_result, ghg_map) ghg_new @@ -105,23 +113,29 @@ # Explanation: The column headers indicates that the stressor index level # should be renamed from "stressor" to "chem_stressor" and the compartment index level # should stay the same (NewName__OldName). The factor column is not used in this case. -# All renaming columns consider regular expressions, +# All renaming columns consider regular expressions, # so that the spelling of the compartment can be fixed in one go. # %% [markdown] # For simple rename (and aggregation cases, see below) we can omit the factor column. # Thus we obtain the same result with the following mapping table: -# %% +# %% ghg_map_wo_factor = pd.DataFrame( -columns=["stressor", "compartment", "chem_stressor__stressor", "compartment__compartment"], -data=[["Carbon Dioxide", "[A|a]ir", "CO2", "Air"], - ["Methane", "[A|a]ir", "CH4", "Air"] - ], + columns=[ + "stressor", + "compartment", + "chem_stressor__stressor", + "compartment__compartment", + ], + data=[ + ["Carbon Dioxide", "[A|a]ir", "CO2", "Air"], + ["Methane", "[A|a]ir", "CH4", "Air"], + ], ) ghg_map_wo_factor -# %% +# %% ghg_new_wo_factor = pymrio.convert(ghg_result, ghg_map_wo_factor) ghg_new_wo_factor @@ -136,14 +150,14 @@ # %% ghg_result_ton = pd.DataFrame( -columns=["Region1", "Region2", "Region3"], -index=pd.MultiIndex.from_tuples( - [ - ("Carbon Dioxide", "Air"), - ("Methane", "air"), - ] -), -data=[[5, 6, 7], [0.5, 0.6, 0.7]], + columns=["Region1", "Region2", "Region3"], + index=pd.MultiIndex.from_tuples( + [ + ("Carbon Dioxide", "Air"), + ("Methane", "air"), + ] + ), + data=[[5, 6, 7], [0.5, 0.6, 0.7]], ) ghg_result_ton.index.names = ["stressor", "compartment"] ghg_result_ton.columns.names = ["region"] @@ -154,10 +168,17 @@ ghg_map_to_kg = pd.DataFrame( -columns=["stressor", "compartment", "chem_stressor__stressor", "compartment__compartment", "factor"], -data=[["Carbon Dioxide", "[A|a]ir", "CO2", "Air", 1000], - ["Methane", "[A|a]ir", "CH4", "Air", 1000] - ], + columns=[ + "stressor", + "compartment", + "chem_stressor__stressor", + "compartment__compartment", + "factor", + ], + data=[ + ["Carbon Dioxide", "[A|a]ir", "CO2", "Air", 1000], + ["Methane", "[A|a]ir", "CH4", "Air", 1000], + ], ) ghg_map_to_kg @@ -169,7 +190,6 @@ # TODO: unit conversion extensions - # %% [markdown] # ## Characterization @@ -184,29 +204,30 @@ # An simple example is a conversion/aggregation based on GWP100 characterization factors. # Here, we continue with the unit converted and cleanup dataframe from above: -# %% +# %% ghg_new_kg # %% [markdown] -# We define a general purpose characterization map for GHG emissions -# (based on +# We define a general purpose characterization map for GHG emissions +# (based on # [AR6 GWP100 and GWP20 factors](https://www.ipcc.ch/report/ar6/wg1/downloads/report/IPCC_AR6_WGI_Chapter07.pdf) # ,with some simplifications): # %% GWP_characterization = pd.DataFrame( -columns=["chem_stressor", "GWP__chem_stressor", "factor"], -data=[["CO2", "GWP100", 1], - ["CH4", "GWP100", 29], - ["NHx", "GWP100", 273], - ["CO2", "GWP20", 1], - ["CH4", "GWP20", 80], - ["NHx", "GWP20", 273], - ["CO2", "GWP500", 1], - ["CH4", "GWP500", 8], - ["NHx", "GWP500", 130], - ], + columns=["chem_stressor", "GWP__chem_stressor", "factor"], + data=[ + ["CO2", "GWP100", 1], + ["CH4", "GWP100", 29], + ["NHx", "GWP100", 273], + ["CO2", "GWP20", 1], + ["CH4", "GWP20", 80], + ["NHx", "GWP20", 273], + ["CO2", "GWP500", 1], + ["CH4", "GWP500", 8], + ["NHx", "GWP500", 130], + ], ) GWP_characterization @@ -216,21 +237,23 @@ # %% [markdown] -# As we can see, GWP_characterization can include factors for stressors not actually +# As we can see, GWP_characterization can include factors for stressors not actually # present in the data. # These are silently ignored in the conversion process. -# We also did not specify the compartment and assumed the same factors apply +# We also did not specify the compartment and assumed the same factors apply # independent of the compartment (we could pass through the compartment to # the new result table via passing drop_not_bridge=False to the convert function). # %% -GWP_result_with_comp = pymrio.convert(ghg_new_kg, GWP_characterization, drop_not_bridged=False) +GWP_result_with_comp = pymrio.convert( + ghg_new_kg, GWP_characterization, drop_not_bridged=False +) GWP_result_with_comp # %% [markdown] -# All stressors mapped to the same "impact" are first converted via the +# All stressors mapped to the same "impact" are first converted via the # value given in the factor column -# and then summed up (the aggregation function can be changed +# and then summed up (the aggregation function can be changed # via the `agg_func` parameter). # %% [markdown] @@ -242,17 +265,25 @@ # (The same principle applies to sector specific factors.) # For that, we assume some land use results for different regions: -# %% +# %% land_use_result = pd.DataFrame( -columns=["Region1", "Region2", "Region3"], -index=["Wheat", "Maize", "Rice", "Pasture", "Forest extensive", "Forest intensive",], -data=[[3, 10, 1], - [5, 20, 3], - [0, 12, 34], - [12, 34, 9], - [32, 27, 11], - [43, 17, 24], - ], + columns=["Region1", "Region2", "Region3"], + index=[ + "Wheat", + "Maize", + "Rice", + "Pasture", + "Forest extensive", + "Forest intensive", + ], + data=[ + [3, 10, 1], + [5, 20, 3], + [0, 12, 34], + [12, 34, 9], + [32, 27, 11], + [43, 17, 24], + ], ) land_use_result.index.names = ["stressor"] land_use_result.columns.names = ["region"] @@ -260,25 +291,25 @@ # %% [markdown] # Now we setup a pseudo characterization table for converting the land use data into -# biodiversity impacts. We assume, that the characterization factors vary based on +# biodiversity impacts. We assume, that the characterization factors vary based on # land use type and region. # %% [markdown] landuse_characterization = pd.DataFrame( -columns=["stressor", "BioDiv__stressor", "region", "factor"], -data=[ - ["Wheat|Maize", "BioImpact", "Region1", 3], - ["Wheat", "BioImpact", "Region[2,3]", 4], - ["Maize", "BioImpact", "Region[2,3]", 7], - ["Rice", "BioImpact", "Region1", 12], - ["Rice", "BioImpact", "Region2", 12], - ["Rice", "BioImpact", "Region3", 12], - ["Pasture", "BioImpact", "Region[1,2,3]", 12], - ["Forest.*", "BioImpact", "Region1", 2], - ["Forest.*", "BioImpact", "Region2", 3], - ["Forest ext.*", "BioImpact", "Region3", 1], - ["Forest int.*", "BioImpact", "Region3", 3], - ], + columns=["stressor", "BioDiv__stressor", "region", "factor"], + data=[ + ["Wheat|Maize", "BioImpact", "Region1", 3], + ["Wheat", "BioImpact", "Region[2,3]", 4], + ["Maize", "BioImpact", "Region[2,3]", 7], + ["Rice", "BioImpact", "Region1", 12], + ["Rice", "BioImpact", "Region2", 12], + ["Rice", "BioImpact", "Region3", 12], + ["Pasture", "BioImpact", "Region[1,2,3]", 12], + ["Forest.*", "BioImpact", "Region1", 2], + ["Forest.*", "BioImpact", "Region2", 3], + ["Forest ext.*", "BioImpact", "Region3", 1], + ["Forest int.*", "BioImpact", "Region3", 3], + ], ) landuse_characterization @@ -292,7 +323,6 @@ # CONT: start working on convert for extensions/mrio method - # %% [markdown] # Irrespectively of the table or the mrio system, the convert function always follows the same pattern. # It requires a bridge table, which contains the mapping of the indicies of the source data to the indicies of the target data. diff --git a/pymrio/__init__.py b/pymrio/__init__.py index 1c757c7a..515a6f88 100644 --- a/pymrio/__init__.py +++ b/pymrio/__init__.py @@ -69,10 +69,10 @@ from pymrio.tools.ioutil import ( build_agg_matrix, build_agg_vec, + convert, convert_to_long, index_contains, index_fullmatch, index_match, - convert, ) from pymrio.version import __version__ diff --git a/pymrio/tools/ioutil.py b/pymrio/tools/ioutil.py index e0b9ede0..3c8ce1f8 100644 --- a/pymrio/tools/ioutil.py +++ b/pymrio/tools/ioutil.py @@ -997,13 +997,13 @@ def _index_regex_matcher(_dfs_idx, _method, _find_all=None, **kwargs): return _dfs_idx + def check_df_map(df_orig, df_map): - """ Check which entries of df_map would be in effect given df_orig - """ - # TODO: we need a way to check for spelling mistakes + """Check which entries of df_map would be in effect given df_orig""" + # TODO: we need a way to check for spelling mistakes # and other hickups sneaking into df_map. # In this function, we check for each line of df_map which entries - # would be in effect given df_orig. + # would be in effect given df_orig. pass @@ -1013,12 +1013,12 @@ def convert(df_orig, df_map, agg_func="sum", drop_not_bridged_index=True): Parameters ---------- df_orig : pd.DataFrame - The DataFrame to process. + The DataFrame to process. The index levels need to be named (df.index.name needs to - be set for all levels). All index to be bridged to new + be set for all levels). All index to be bridged to new names need to be in the index (these are columns indicated with two underscores '__' in the mapping dataframe, df_map). - Other constraining conditions (e.g. regions, sectors) can be either + Other constraining conditions (e.g. regions, sectors) can be either in the index or columns. The values in index are preferred. df_map : pd.DataFrame @@ -1062,13 +1062,13 @@ def convert(df_orig, df_map, agg_func="sum", drop_not_bridged_index=True): drop_not_bridged_index : bool, optional What to do with index levels in df_orig not appearing in the bridge columns. - If True, drop them (aggregation across these), if False, - pass them through to the result. + If True, drop them (aggregation across these), if False, + pass them through to the result. *Note:* Only index levels will be dropped, not columns. In case some index levels need to be dropped, and some not - make a bridge column for the ones to be dropped and map all to the same name. + make a bridge column for the ones to be dropped and map all to the same name. Then drop this index level after the conversion. @@ -1090,7 +1090,6 @@ def convert(df_orig, df_map, agg_func="sum", drop_not_bridged_index=True): if isinstance(df_orig, pd.Series): df_orig = pd.DataFrame(df_orig) - # some consitency checks of arguments and restructuring if everything is ok if len(bridge_columns) == 0: raise ValueError("No columns with '__' in the mapping DataFrame") @@ -1105,7 +1104,7 @@ def convert(df_orig, df_map, agg_func="sum", drop_not_bridged_index=True): raise ValueError(f"Column {bridge.orig} not in df_orig") else: bridges.append(bridge) - + orig_index_not_bridged = [ ix for ix in df_orig.index.names if ix not in [b.orig for b in bridges] ] @@ -1190,6 +1189,5 @@ def convert(df_orig, df_map, agg_func="sum", drop_not_bridged_index=True): except TypeError: # case where there is only one index level pass - - return all_result.groupby(by=all_result.index.names).agg(agg_func) + return all_result.groupby(by=all_result.index.names).agg(agg_func) diff --git a/tests/test_util.py b/tests/test_util.py index 4754c287..2db5024a 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -376,7 +376,9 @@ def test_convert_rename(): ], ) - char_res_keep_comp = convert(to_char, rename_bridge_simple, drop_not_bridged_index=False) + char_res_keep_comp = convert( + to_char, rename_bridge_simple, drop_not_bridged_index=False + ) assert all(char_res_keep_comp.columns == to_char.columns) assert all( char_res_keep_comp.index.get_level_values("compart") @@ -400,7 +402,9 @@ def test_convert_rename(): ), ) - char_res_agg_comp = convert(to_char, rename_bridge_simple, drop_not_bridged_index=True) + char_res_agg_comp = convert( + to_char, rename_bridge_simple, drop_not_bridged_index=True + ) assert all(char_res_agg_comp.columns == to_char.columns) assert char_res_agg_comp.sum().sum() == to_char.sum().sum() @@ -418,8 +422,6 @@ def test_convert_rename(): ), ) - - # without factor should give the same result rename_bridge_simple_wo_factor = pd.DataFrame( columns=["em_type", "stressor__em_type"], @@ -430,13 +432,11 @@ def test_convert_rename(): ], ) - char_res_keep_comp_wo_factor = convert(to_char, - rename_bridge_simple_wo_factor, - drop_not_bridged_index=False) + char_res_keep_comp_wo_factor = convert( + to_char, rename_bridge_simple_wo_factor, drop_not_bridged_index=False + ) - pdt.assert_frame_equal( - char_res_keep_comp_wo_factor, - char_res_keep_comp) + pdt.assert_frame_equal(char_res_keep_comp_wo_factor, char_res_keep_comp) def test_convert_characterize(): @@ -488,9 +488,7 @@ def test_convert_characterize(): # TEST1B: with only impact (one index level in the result) , keep compartments as these are not dropped now - res1B = convert(to_char, - map_test1, - drop_not_bridged_index=False) + res1B = convert(to_char, map_test1, drop_not_bridged_index=False) exp_res1B = pd.DataFrame( columns=to_char.columns, @@ -593,47 +591,58 @@ def test_convert_characterize(): pdt.assert_frame_equal(res3, exp_res3) - # TEST 4, with one constraints in the columns + # TEST 4, with one constraints in the columns map_test4 = pd.DataFrame( - columns=["Region1", "Region2", "Region3"], - index=["Wheat", "Maize", "Rice", "Pasture", "Forest extensive", "Forest intensive",], - data=[[3, 10, 1], - [5, 20, 3], - [0, 12, 34], - [12, 34, 9], - [32, 27, 11], - [43, 17, 24], + columns=["Region1", "Region2", "Region3"], + index=[ + "Wheat", + "Maize", + "Rice", + "Pasture", + "Forest extensive", + "Forest intensive", + ], + data=[ + [3, 10, 1], + [5, 20, 3], + [0, 12, 34], + [12, 34, 9], + [32, 27, 11], + [43, 17, 24], ], ) map_test4.index.names = ["stressor"] map_test4.columns.names = ["region"] char4 = pd.DataFrame( - columns=["stressor", "BioDiv__stressor", "region", "factor"], - data=[ - ["Wheat|Maize", "BioImpact", "Region1", 3], - ["Wheat", "BioImpact", "Region[2,3]", 4], - ["Maize", "BioImpact", "Region[2,3]", 7], - ["Rice", "BioImpact", "Region1", 12], - ["Rice", "BioImpact", "Region2", 12], - ["Rice", "BioImpact", "Region3", 12], - ["Pasture", "BioImpact", "Region[1,2,3]", 12], - ["Forest.*", "BioImpact", "Region1", 2], - ["Forest.*", "BioImpact", "Region2", 3], - ["Forest ext.*", "BioImpact", "Region3", 1], - ["Forest int.*", "BioImpact", "Region3", 3], + columns=["stressor", "BioDiv__stressor", "region", "factor"], + data=[ + ["Wheat|Maize", "BioImpact", "Region1", 3], + ["Wheat", "BioImpact", "Region[2,3]", 4], + ["Maize", "BioImpact", "Region[2,3]", 7], + ["Rice", "BioImpact", "Region1", 12], + ["Rice", "BioImpact", "Region2", 12], + ["Rice", "BioImpact", "Region3", 12], + ["Pasture", "BioImpact", "Region[1,2,3]", 12], + ["Forest.*", "BioImpact", "Region1", 2], + ["Forest.*", "BioImpact", "Region2", 3], + ["Forest ext.*", "BioImpact", "Region3", 1], + ["Forest int.*", "BioImpact", "Region3", 3], ], ) expected4 = pd.DataFrame( columns=["Region1", "Region2", "Region3"], index=["BioImpact"], - data=[[ - 3*3 + 5*3 + 0*12 + 12*12 + 32*2 + 43*2, - 10*4 + 20*7 + 12*12 + 34*12 + 27*3 + 17*3, - 1*4 + 3*7 + 34*12 + 9*12 + 11*1 + 24*3, - ]]) + data=[ + [ + 3 * 3 + 5 * 3 + 0 * 12 + 12 * 12 + 32 * 2 + 43 * 2, + 10 * 4 + 20 * 7 + 12 * 12 + 34 * 12 + 27 * 3 + 17 * 3, + 1 * 4 + 3 * 7 + 34 * 12 + 9 * 12 + 11 * 1 + 24 * 3, + ] + ], + ) expected4.columns.names = ["region"] expected4.index.names = ["BioDiv"] @@ -643,32 +652,42 @@ def test_convert_characterize(): map_test4_stack = map_test4.stack(level="region") - char4_calc_stack = convert(map_test4_stack, - char4, - drop_not_bridged_index=False).unstack(level="region")[0] + char4_calc_stack = convert( + map_test4_stack, char4, drop_not_bridged_index=False + ).unstack(level="region")[0] pdt.assert_frame_equal(char4_calc_nostack, char4_calc_stack) # TEST 4, with sectors in addition map_test5 = pd.DataFrame( - columns=pd.MultiIndex.from_product([ - ["Region1", "Region2", "Region3"], - ["SecA", "SecB"]]), - index=["Wheat", "Maize", "Rice", "Pasture", "Forest extensive", "Forest intensive",], - data=[[1.2,1.8, 7,3, 1,0], - [5,0, 12,8, 1.5,1.5], - [0,0, 6,6, 30,4], - [10,2, 14,20, 6,3], - [30,2, 20,7, 10,1], - [23, 20, 7.0, 10.0, 14, 10], + columns=pd.MultiIndex.from_product( + [["Region1", "Region2", "Region3"], ["SecA", "SecB"]] + ), + index=[ + "Wheat", + "Maize", + "Rice", + "Pasture", + "Forest extensive", + "Forest intensive", + ], + data=[ + [1.2, 1.8, 7, 3, 1, 0], + [5, 0, 12, 8, 1.5, 1.5], + [0, 0, 6, 6, 30, 4], + [10, 2, 14, 20, 6, 3], + [30, 2, 20, 7, 10, 1], + [23, 20, 7.0, 10.0, 14, 10], ], ) map_test5.index.names = ["stressor"] map_test5.columns.names = ["region", "sector"] char5_res = convert(map_test5, char4) - pdt.assert_frame_equal(char5_res.T.groupby(level="region").sum().T, char4_calc_nostack.astype("float")) + pdt.assert_frame_equal( + char5_res.T.groupby(level="region").sum().T, char4_calc_nostack.astype("float") + ) def test_convert_wrong_inputs():