diff --git a/docs/api_table_two/bind.Rmd b/docs/api_table_two/bind.Rmd new file mode 100644 index 00000000..da38093f --- /dev/null +++ b/docs/api_table_two/bind.Rmd @@ -0,0 +1,98 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.13.7 + kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +```{python nbsphinx=hidden} +import pandas as pd +pd.set_option("display.max_rows", 5) +``` + +## Bind rows and columns + +```{python} +from siuba import _, bind_rows, bind_cols +from siuba.data import starwars + +one = starwars[:4] +two = starwars[9:12] +``` + +```{python} +one +``` + +```{python} +two +``` + +### DataFrames as arguments + +```{python} +bind_rows(one, two) +``` + +### (Skip) The contents of lists are spliced automatically + +```{python} +#bind_rows([one, two]) +``` + +### Dictionaries as arguments + +```{python} +# Note that to support this, need to bind_rows.register(dict) +# or set the default method, bind_rows.register(object), and +# handle different arguments there +# Currently bind_rows.dispatch(object) returns something called a pipeable, +# so that functions like mtcars >> head(2) work. This way, user's will +# have to explicitly create a pipe using bind_rows(_, ...) +bind_rows( + {"a": 1, "b": 2}, + {"a": 3, "b": 4} +) +``` + + +### Mixing dictionaries and DataFrames + +```{python} +bind_rows( + {"a": 1, "b": 2}, + pd.DataFrame({"a": [3, 4], "b": [5, 6]}), + {"a": 7, "b": 8} +) +``` + +### Supplying \_id argument + +```{python} +bind_rows(one, two, _id="id") +``` + +```{python} +bind_rows(a=one, b=two, _id="id") +``` + +```{python} +bind_rows(**{"group 1": one, "group 2": two}, _id="groups") +``` + +### Data can have different columns when row binding + +```{python} +bind_rows(pd.DataFrame({"x": [1, 2, 3]}), pd.DataFrame({"y": [1, 2, 3, 4]})) +``` + +### TODO: Add in bind_cols examples + +see: https://dplyr.tidyverse.org/reference/bind.html diff --git a/siuba/data/__init__.py b/siuba/data/__init__.py index 58cc4d1e..ab1eecae 100644 --- a/siuba/data/__init__.py +++ b/siuba/data/__init__.py @@ -27,3 +27,7 @@ ["cyl", "mpg", "hp"] ) + +# Starwars -------------------------------------------------------------------- +_fname_sw = pkg_resources.resource_filename("siuba.data", "starwars.csv") +starwars = pd.read_csv(_fname_sw) diff --git a/siuba/data/starwars.csv b/siuba/data/starwars.csv new file mode 100644 index 00000000..41f86ce9 --- /dev/null +++ b/siuba/data/starwars.csv @@ -0,0 +1,88 @@ +name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,films,vehicles,starships +Luke Skywalker,172,77,blond,fair,blue,19,male,masculine,Tatooine,Human,"The Empire Strikes Back, Revenge of the Sith, Return of the Jedi, A New Hope, The Force Awakens","Snowspeeder, Imperial Speeder Bike","X-wing, Imperial shuttle" +C-3PO,167,75,NA,gold,yellow,112,none,masculine,Tatooine,Droid,"The Empire Strikes Back, Attack of the Clones, The Phantom Menace, Revenge of the Sith, Return of the Jedi, A New Hope",, +R2-D2,96,32,NA,"white, blue",red,33,none,masculine,Naboo,Droid,"The Empire Strikes Back, Attack of the Clones, The Phantom Menace, Revenge of the Sith, Return of the Jedi, A New Hope, The Force Awakens",, +Darth Vader,202,136,none,white,yellow,41.9,male,masculine,Tatooine,Human,"The Empire Strikes Back, Revenge of the Sith, Return of the Jedi, A New Hope",,TIE Advanced x1 +Leia 
Organa,150,49,brown,light,brown,19,female,feminine,Alderaan,Human,"The Empire Strikes Back, Revenge of the Sith, Return of the Jedi, A New Hope, The Force Awakens",Imperial Speeder Bike, +Owen Lars,178,120,"brown, grey",light,blue,52,male,masculine,Tatooine,Human,"Attack of the Clones, Revenge of the Sith, A New Hope",, +Beru Whitesun lars,165,75,brown,light,blue,47,female,feminine,Tatooine,Human,"Attack of the Clones, Revenge of the Sith, A New Hope",, +R5-D4,97,32,NA,"white, red",red,NA,none,masculine,Tatooine,Droid,A New Hope,, +Biggs Darklighter,183,84,black,light,brown,24,male,masculine,Tatooine,Human,A New Hope,,X-wing +Obi-Wan Kenobi,182,77,"auburn, white",fair,blue-gray,57,male,masculine,Stewjon,Human,"The Empire Strikes Back, Attack of the Clones, The Phantom Menace, Revenge of the Sith, Return of the Jedi, A New Hope",Tribubble bongo,"Jedi starfighter, Trade Federation cruiser, Naboo star skiff, Jedi Interceptor, Belbullab-22 starfighter" +Anakin Skywalker,188,84,blond,fair,blue,41.9,male,masculine,Tatooine,Human,"Attack of the Clones, The Phantom Menace, Revenge of the Sith","Zephyr-G swoop bike, XJ-6 airspeeder","Trade Federation cruiser, Jedi Interceptor, Naboo fighter" +Wilhuff Tarkin,180,NA,"auburn, grey",fair,blue,64,male,masculine,Eriadu,Human,"Revenge of the Sith, A New Hope",, +Chewbacca,228,112,brown,unknown,blue,200,male,masculine,Kashyyyk,Wookiee,"The Empire Strikes Back, Revenge of the Sith, Return of the Jedi, A New Hope, The Force Awakens",AT-ST,"Millennium Falcon, Imperial shuttle" +Han Solo,180,80,brown,fair,brown,29,male,masculine,Corellia,Human,"The Empire Strikes Back, Return of the Jedi, A New Hope, The Force Awakens",,"Millennium Falcon, Imperial shuttle" +Greedo,173,74,NA,green,black,44,male,masculine,Rodia,Rodian,A New Hope,, +Jabba Desilijic Tiure,175,1358,NA,"green-tan, brown",orange,600,hermaphroditic,masculine,Nal Hutta,Hutt,"The Phantom Menace, Return of the Jedi, A New Hope",, +Wedge Antilles,170,77,brown,fair,hazel,21,male,masculine,Corellia,Human,"The Empire Strikes Back, Return of the Jedi, A New Hope",Snowspeeder,X-wing +Jek Tono Porkins,180,110,brown,fair,blue,NA,male,masculine,Bestine IV,Human,A New Hope,,X-wing +Yoda,66,17,white,green,brown,896,male,masculine,NA,Yoda's species,"The Empire Strikes Back, Attack of the Clones, The Phantom Menace, Revenge of the Sith, Return of the Jedi",, +Palpatine,170,75,grey,pale,yellow,82,male,masculine,Naboo,Human,"The Empire Strikes Back, Attack of the Clones, The Phantom Menace, Revenge of the Sith, Return of the Jedi",, +Boba Fett,183,78.2,black,fair,brown,31.5,male,masculine,Kamino,Human,"The Empire Strikes Back, Attack of the Clones, Return of the Jedi",,Slave 1 +IG-88,200,140,none,metal,red,15,none,masculine,NA,Droid,The Empire Strikes Back,, +Bossk,190,113,none,green,red,53,male,masculine,Trandosha,Trandoshan,The Empire Strikes Back,, +Lando Calrissian,177,79,black,dark,brown,31,male,masculine,Socorro,Human,"The Empire Strikes Back, Return of the Jedi",,Millennium Falcon +Lobot,175,79,none,light,blue,37,male,masculine,Bespin,Human,The Empire Strikes Back,, +Ackbar,180,83,none,brown mottle,orange,41,male,masculine,Mon Cala,Mon Calamari,"Return of the Jedi, The Force Awakens",, +Mon Mothma,150,NA,auburn,fair,blue,48,female,feminine,Chandrila,Human,Return of the Jedi,, +Arvel Crynyd,NA,NA,brown,fair,brown,NA,male,masculine,NA,Human,Return of the Jedi,,A-wing +Wicket Systri Warrick,88,20,brown,brown,brown,8,male,masculine,Endor,Ewok,Return of the Jedi,, +Nien 
Nunb,160,68,none,grey,black,NA,male,masculine,Sullust,Sullustan,Return of the Jedi,,Millennium Falcon +Qui-Gon Jinn,193,89,brown,fair,blue,92,male,masculine,NA,Human,The Phantom Menace,Tribubble bongo, +Nute Gunray,191,90,none,mottled green,red,NA,male,masculine,Cato Neimoidia,Neimodian,"Attack of the Clones, The Phantom Menace, Revenge of the Sith",, +Finis Valorum,170,NA,blond,fair,blue,91,male,masculine,Coruscant,Human,The Phantom Menace,, +Jar Jar Binks,196,66,none,orange,orange,52,male,masculine,Naboo,Gungan,"Attack of the Clones, The Phantom Menace",, +Roos Tarpals,224,82,none,grey,orange,NA,male,masculine,Naboo,Gungan,The Phantom Menace,, +Rugor Nass,206,NA,none,green,orange,NA,male,masculine,Naboo,Gungan,The Phantom Menace,, +Ric Olié,183,NA,brown,fair,blue,NA,NA,NA,Naboo,NA,The Phantom Menace,,Naboo Royal Starship +Watto,137,NA,black,"blue, grey",yellow,NA,male,masculine,Toydaria,Toydarian,"Attack of the Clones, The Phantom Menace",, +Sebulba,112,40,none,"grey, red",orange,NA,male,masculine,Malastare,Dug,The Phantom Menace,, +Quarsh Panaka,183,NA,black,dark,brown,62,NA,NA,Naboo,NA,The Phantom Menace,, +Shmi Skywalker,163,NA,black,fair,brown,72,female,feminine,Tatooine,Human,"Attack of the Clones, The Phantom Menace",, +Darth Maul,175,80,none,red,yellow,54,male,masculine,Dathomir,Zabrak,The Phantom Menace,Sith speeder,Scimitar +Bib Fortuna,180,NA,none,pale,pink,NA,male,masculine,Ryloth,Twi'lek,Return of the Jedi,, +Ayla Secura,178,55,none,blue,hazel,48,female,feminine,Ryloth,Twi'lek,"Attack of the Clones, The Phantom Menace, Revenge of the Sith",, +Dud Bolt,94,45,none,"blue, grey",yellow,NA,male,masculine,Vulpter,Vulptereen,The Phantom Menace,, +Gasgano,122,NA,none,"white, blue",black,NA,male,masculine,Troiken,Xexto,The Phantom Menace,, +Ben Quadinaros,163,65,none,"grey, green, yellow",orange,NA,male,masculine,Tund,Toong,The Phantom Menace,, +Mace Windu,188,84,none,dark,brown,72,male,masculine,Haruun Kal,Human,"Attack of the Clones, The Phantom Menace, Revenge of the Sith",, +Ki-Adi-Mundi,198,82,white,pale,yellow,92,male,masculine,Cerea,Cerean,"Attack of the Clones, The Phantom Menace, Revenge of the Sith",, +Kit Fisto,196,87,none,green,black,NA,male,masculine,Glee Anselm,Nautolan,"Attack of the Clones, The Phantom Menace, Revenge of the Sith",, +Eeth Koth,171,NA,black,brown,brown,NA,male,masculine,Iridonia,Zabrak,"The Phantom Menace, Revenge of the Sith",, +Adi Gallia,184,50,none,dark,blue,NA,female,feminine,Coruscant,Tholothian,"The Phantom Menace, Revenge of the Sith",, +Saesee Tiin,188,NA,none,pale,orange,NA,male,masculine,Iktotch,Iktotchi,"The Phantom Menace, Revenge of the Sith",, +Yarael Poof,264,NA,none,white,yellow,NA,male,masculine,Quermia,Quermian,The Phantom Menace,, +Plo Koon,188,80,none,orange,black,22,male,masculine,Dorin,Kel Dor,"Attack of the Clones, The Phantom Menace, Revenge of the Sith",,Jedi starfighter +Mas Amedda,196,NA,none,blue,blue,NA,male,masculine,Champala,Chagrian,"Attack of the Clones, The Phantom Menace",, +Gregar Typho,185,85,black,dark,brown,NA,male,masculine,Naboo,Human,Attack of the Clones,,Naboo fighter +Cordé,157,NA,brown,light,brown,NA,female,feminine,Naboo,Human,Attack of the Clones,, +Cliegg Lars,183,NA,brown,fair,blue,82,male,masculine,Tatooine,Human,Attack of the Clones,, +Poggle the Lesser,183,80,none,green,yellow,NA,male,masculine,Geonosis,Geonosian,"Attack of the Clones, Revenge of the Sith",, +Luminara Unduli,170,56.2,black,yellow,blue,58,female,feminine,Mirial,Mirialan,"Attack of the Clones, Revenge of the Sith",, +Barriss 
Offee,166,50,black,yellow,blue,40,female,feminine,Mirial,Mirialan,Attack of the Clones,, +Dormé,165,NA,brown,light,brown,NA,female,feminine,Naboo,Human,Attack of the Clones,, +Dooku,193,80,white,fair,brown,102,male,masculine,Serenno,Human,"Attack of the Clones, Revenge of the Sith",Flitknot speeder, +Bail Prestor Organa,191,NA,black,tan,brown,67,male,masculine,Alderaan,Human,"Attack of the Clones, Revenge of the Sith",, +Jango Fett,183,79,black,tan,brown,66,male,masculine,Concord Dawn,Human,Attack of the Clones,, +Zam Wesell,168,55,blonde,"fair, green, yellow",yellow,NA,female,feminine,Zolan,Clawdite,Attack of the Clones,Koro-2 Exodrive airspeeder, +Dexter Jettster,198,102,none,brown,yellow,NA,male,masculine,Ojom,Besalisk,Attack of the Clones,, +Lama Su,229,88,none,grey,black,NA,male,masculine,Kamino,Kaminoan,Attack of the Clones,, +Taun We,213,NA,none,grey,black,NA,female,feminine,Kamino,Kaminoan,Attack of the Clones,, +Jocasta Nu,167,NA,white,fair,blue,NA,female,feminine,Coruscant,Human,Attack of the Clones,, +Ratts Tyerell,79,15,none,"grey, blue",unknown,NA,male,masculine,Aleen Minor,Aleena,The Phantom Menace,, +R4-P17,96,NA,none,"silver, red","red, blue",NA,none,feminine,NA,Droid,"Attack of the Clones, Revenge of the Sith",, +Wat Tambor,193,48,none,"green, grey",unknown,NA,male,masculine,Skako,Skakoan,Attack of the Clones,, +San Hill,191,NA,none,grey,gold,NA,male,masculine,Muunilinst,Muun,Attack of the Clones,, +Shaak Ti,178,57,none,"red, blue, white",black,NA,female,feminine,Shili,Togruta,"Attack of the Clones, Revenge of the Sith",, +Grievous,216,159,none,"brown, white","green, yellow",NA,male,masculine,Kalee,Kaleesh,Revenge of the Sith,Tsmeu-6 personal wheel bike,Belbullab-22 starfighter +Tarfful,234,136,brown,brown,blue,NA,male,masculine,Kashyyyk,Wookiee,Revenge of the Sith,, +Raymus Antilles,188,79,brown,light,brown,NA,male,masculine,Alderaan,Human,"Revenge of the Sith, A New Hope",, +Sly Moore,178,48,none,pale,white,NA,NA,NA,Umbara,NA,"Attack of the Clones, Revenge of the Sith",, +Tion Medon,206,80,none,grey,black,NA,male,masculine,Utapau,Pau'an,Revenge of the Sith,, +Finn,NA,NA,black,dark,dark,NA,male,masculine,NA,Human,The Force Awakens,, +Rey,NA,NA,brown,light,hazel,NA,female,feminine,NA,Human,The Force Awakens,, +Poe Dameron,NA,NA,brown,light,brown,NA,male,masculine,NA,Human,The Force Awakens,,T-70 X-wing fighter +BB8,NA,NA,none,none,black,NA,none,masculine,NA,Droid,The Force Awakens,, +Captain Phasma,NA,NA,unknown,unknown,unknown,NA,NA,NA,NA,NA,The Force Awakens,, +Padmé Amidala,165,45,brown,light,brown,46,female,feminine,Naboo,Human,"Attack of the Clones, The Phantom Menace, Revenge of the Sith",,"H-type Nubian yacht, Naboo star skiff, Naboo fighter" diff --git a/siuba/dply/verbs.py b/siuba/dply/verbs.py index aec4c390..0bac7ee5 100644 --- a/siuba/dply/verbs.py +++ b/siuba/dply/verbs.py @@ -6,20 +6,21 @@ from pandas.core.groupby import DataFrameGroupBy from pandas.core.dtypes.inference import is_scalar from siuba.siu import ( - Symbolic, Call, strip_symbolic, create_sym_call, + Symbolic, Call, strip_symbolic, create_sym_call, MetaArg, BinaryOp, _SliceOpIndex, Lazy, singledispatch2, pipe_no_args, Pipeable, pipe ) DPLY_FUNCTIONS = ( # Dply ---- - "group_by", "ungroup", + "group_by", "ungroup", "select", "rename", "mutate", "transmute", "filter", "summarize", "arrange", "distinct", "count", "add_count", "head", "top_n", + "bind_cols", "bind_rows", # Tidy ---- "spread", "gather", "nest", "unnest", @@ -67,7 +68,7 @@ def install_pd_siu(): def _repr_grouped_df_html_(self): 
obj_repr = self.obj._repr_html_() - + # user can config pandas not to return html representation, in which case # the ipython behavior should fall back to repr if obj_repr is None: @@ -161,9 +162,9 @@ def mutate(__data, **kwargs): :: from siuba.data import mtcars mtcars >> mutate(cyl2 = _.cyl * 2, cyl4 = _.cyl2 * 2) - + """ - + orig_cols = __data.columns result = __data.assign(**kwargs) @@ -179,7 +180,7 @@ def _mutate(__data, **kwargs): orig_index = __data.obj.index df = __data.apply(lambda d: d.assign(**kwargs)) - + # will drop all but original index group_by_lvls = list(range(df.index.nlevels - 1)) g_df = df.reset_index(group_by_lvls, drop = True).loc[orig_index].groupby(groupings) @@ -239,7 +240,7 @@ def filter(__data, *args): :: from siuba.data import mtcars # keep rows where cyl is 4 and mpg is less than 25 - mtcars >> filter(mtcars, _.cyl == 4, _.mpg < 25) + mtcars >> filter(mtcars, _.cyl == 4, _.mpg < 25) """ crnt_indx = True @@ -297,7 +298,7 @@ def summarize(__data, **kwargs): :: from siuba.data import mtcars mtcars >> summarize(mean = _.disp.mean(), n = n(_)) - + """ results = {} for k, v in kwargs.items(): @@ -310,17 +311,17 @@ def summarize(__data, **kwargs): # keep result, but use underlying array to avoid crazy index issues # on DataFrame construction (#138) results[k] = res.array if isinstance(res, pd.Series) else res - + # must pass index, or raises error when using all scalar values return DataFrame(results, index = [0]) - + @summarize.register(DataFrameGroupBy) def _summarize(__data, **kwargs): df_summarize = summarize.registry[pd.DataFrame] df = __data.apply(df_summarize, **kwargs) - + group_by_lvls = list(range(df.index.nlevels - 1)) out = df.reset_index(group_by_lvls) out.index = pd.RangeIndex(df.shape[0]) @@ -339,7 +340,7 @@ def transmute(__data, *args, **kwargs): f_mutate = mutate.registry[pd.DataFrame] - df = f_mutate(__data, **kwargs) + df = f_mutate(__data, **kwargs) return df[[*arg_vars, *kwargs.keys()]] @@ -441,7 +442,7 @@ def var_slice(colnames, x): def var_put_cols(name, var, cols): if isinstance(name, list) and var.alias is not None: raise Exception("Cannot assign name to multiple columns") - + names = [name] if not isinstance(name, list) else name for name in names: @@ -453,9 +454,9 @@ def var_put_cols(name, var, cols): def flatten_var(var): if isinstance(var, Var) and isinstance(var.name, (tuple, list)): return [var.to_copy(name = x) for x in var.name] - + return [var] - + @@ -511,7 +512,7 @@ def var_create(*args): all_vars.append(arg(vl)) else: all_vars.append(arg) - + return all_vars @singledispatch2(DataFrame) @@ -528,7 +529,7 @@ def select(__data, *args, **kwargs): to_rename = {k: v for k,v in od.items() if v is not None} return __data[list(od)].rename(columns = to_rename) - + @select.register(DataFrameGroupBy) def _select(__data, *args, **kwargs): raise Exception("Selecting columns of grouped DataFrame currently not allowed") @@ -567,12 +568,12 @@ def _call_strip_ascending(f): def arrange(__data, *args): # TODO: # - add arguments to pass to sort_values (e.g. ascending, kind) - # + # # basically need some (1) select behavior, (2) mutate-like behavior # df.sort_values is the obvious candidate, but only takes names, not expressions # to work around this, we make a shallow copy of data, and add sorting columns # then drop them at the end - # + # # sort order is determined by using a unary w/ Call e.g. 
-_.repo df = __data.copy(deep = False) @@ -645,7 +646,7 @@ def distinct(__data, *args, _keep_all = False, **kwargs): return tmp_data[list(cols)] return tmp_data - + @distinct.register(DataFrameGroupBy) def _distinct(__data, *args, _keep_all = False, **kwargs): df = __data.apply(lambda x: distinct(x, *args, _keep_all = _keep_all, **kwargs)) @@ -750,7 +751,7 @@ def _count_group(data, *args): out_col = "n" while out_col in crnt_cols: out_col = out_col + "n" - return + return @singledispatch2((pd.DataFrame, DataFrameGroupBy)) @@ -768,7 +769,7 @@ def count(__data, *args, wt = None, sort = False, **kwargs): no_grouping_vars = not args and not kwargs and isinstance(__data, pd.DataFrame) if wt is None: - if no_grouping_vars: + if no_grouping_vars: # no groups, just use number of rows counts = pd.DataFrame({'tmp': [__data.shape[0]]}) else: @@ -807,7 +808,7 @@ def add_count(__data, *args, wt = None, sort = False, **kwargs): on = list(counts.columns)[:-1] return __data.merge(counts, on = on) - + # Tally ======================================================================= @@ -848,7 +849,7 @@ def _fast_split_df(g_df): @singledispatch2(pd.DataFrame) def nest(__data, *args, key = "data"): """Nest columns within a DataFrame. - + Args: ___data: a DataFrame @@ -862,7 +863,7 @@ def nest(__data, *args, key = "data"): :: from siuba.data import mtcars mtcars >> nest(-_.cyl) - + """ # TODO: copied from select function var_list = var_create(*args) @@ -909,7 +910,7 @@ def _nest(__data, *args, key = "data"): @singledispatch2(pd.DataFrame) def unnest(__data, key = "data"): """Unnest a column holding nested data (e.g. Series of lists or DataFrames). - + Args: ___data: a DataFrame key: the name of the column to be unnested. @@ -921,7 +922,7 @@ def unnest(__data, key = "data"): import pandas as pd df = pd.DataFrame({'id': [1,2], 'data': [['a', 'b'], ['c', 'd']]}) df >> unnest() - + """ # TODO: currently only takes key, not expressions nrows_nested = __data[key].apply(len, convert_dtype = True) @@ -936,13 +937,13 @@ def unnest(__data, key = "data"): # may be a better approach using a multi-index long_grp = __data.loc[indx_nested, grp_keys].reset_index(drop = True) - + return long_grp.join(long_data) def _convert_nested_entry(x): if isinstance(x, (tuple, list)): return pd.Series(x) - + return x @@ -1027,13 +1028,13 @@ def anti_join(left, right = None, on = None): # copied from semi_join if isinstance(on, Mapping): left_on, right_on = zip(*on.items()) - else: + else: left_on = right_on = on # manually perform merge, up to getting pieces need for indexing merger = _MergeOperation(left, right, left_on = left_on, right_on = right_on) _, l_indx, _ = merger._get_join_info() - + # use the left table's indexer to exclude those rows range_indx = pd.RangeIndex(len(left)) return left.iloc[range_indx.difference(l_indx),:] @@ -1044,6 +1045,61 @@ def anti_join(left, right = None, on = None): inner_join = partial(join, how = "inner") +# Binding ===================================================================== + +@singledispatch2(pd.DataFrame) +def bind_rows(*args, _id=None, **kwargs): + """Concatenate DataFrames by index/rows. + Similar to join, you must specify all involved DataFrames (including _). + + Args: + *args: the DataFrame/dict-equivalents to concatenate. + _id: column name of identifiers to link each row to its original DataFrame. + Labels are taken from named arguments (kwargs). + If labels are not supplied, a numerical sequence is used instead. + **kwargs: labels with DataFrame/dict-equivalents. 
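+
+    Examples
+    --------
+    A minimal sketch; these calls mirror the examples in
+    docs/api_table_two/bind.Rmd (starwars ships with siuba.data)::
+
+        from siuba import bind_rows
+        from siuba.data import starwars
+
+        one = starwars[:4]
+        two = starwars[9:12]
+
+        # stack rows; inputs may be DataFrames or dict-equivalents
+        bind_rows(one, two)
+
+        # label each input's rows with an identifier column
+        bind_rows(a=one, b=two, _id="id")
+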
+ """ + + if not all(isinstance(x, DataFrame) or isinstance(x, dict) for x in args): + raise Exception("all elements must be type DataFrame or dict") + + if not all(isinstance(x, DataFrame) or isinstance(x, dict) for x in kwargs.values()): + raise Exception("all named elements must be type DataFrame or dict") + + args = [df.copy() if isinstance(df, DataFrame) else DataFrame(df).copy() for df in args] + kwargs = {label: df.copy() if isinstance(df, DataFrame) else DataFrame(df).copy() for label, df in kwargs.items()} + + if _id: + dfs = {**(dict(zip(range(len(args)), args)) or {}), **kwargs} + for label, df in dfs.items(): + df.insert(0, _id, label) + dfs = dfs.values() + else: + dfs = (args or []) + kwargs.values() + + return pd.concat(dfs, axis=0) + + +@singledispatch2(pd.DataFrame) +def bind_cols(*args, **kwargs): + """Concatenate DataFrames by columns. + Similar to join, you must specify all involved DataFrames (including _). + + Args: + *args: the DataFrames to concatenate + + """ + if not all(isinstance(x, DataFrame) for x in args): + raise Exception("All elements must be type DataFrame.") + + if len(kwargs): + raise NotImplementedError("extra arguments not currently supported") + + args = [df.copy().reset_index(drop=True) for df in args] + + return pd.concat(args, axis=1) + + # Head ======================================================================== @singledispatch2(pd.DataFrame) @@ -1083,12 +1139,12 @@ def top_n(__data, n, wt = None): """ # NOTE: using min_rank, since it can return a lazy expr for min_rank(ing) - # but I would rather not have it imported in verbs. will be more + # but I would rather not have it imported in verbs. will be more # reasonable if each verb were its own file? need abstract verb / vector module. # vector imports experimental right now, so we need to invert deps - # TODO: + # TODO: # * what if wt is a string? should remove all str -> expr in verbs like group_by etc.. - # * verbs like filter allow lambdas, but this func breaks with that + # * verbs like filter allow lambdas, but this func breaks with that from .vector import min_rank if wt is None: sym_wt = getattr(Symbolic(MetaArg("_")), __data.columns[-1]) @@ -1144,15 +1200,15 @@ def spread(__data, key, value, fill = None, reset_index = True): id_cols = [col for col in __data.columns if col not in (key_col, val_col)] wide = __data.set_index(id_cols + [key_col]).unstack(level = -1) - + if fill is not None: wide.fillna(fill, inplace = True) - + # remove multi-index from both rows and cols wide.columns = wide.columns.droplevel().rename(None) if reset_index: wide.reset_index(inplace = True) - + return wide @@ -1197,13 +1253,13 @@ def complete(__data, *args, fill = None): # e.g. 
NAs will turn int -> float on_cols = list(expanded.columns) df = __data.merge(expanded, how = "right", on = on_cols) - + if fill is not None: for col_name, val in fill.items(): df[col_name].fillna(val, inplace = True) return df - + # Separate/Unit/Extract ============================================================ import warnings @@ -1248,11 +1304,11 @@ def separate(__data, col, into, sep = r"[^a-zA-Z0-9]", n_into = len(into) col_name = simple_varname(col) - + # splitting column ---- all_splits = __data[col_name].str.split(sep, expand = True) n_split_cols = len(all_splits.columns) - + # handling too many or too few splits ---- if n_split_cols < n_into: # too few columns @@ -1271,12 +1327,12 @@ def separate(__data, col, into, sep = r"[^a-zA-Z0-9]", # end up with only the into columns, correctly named ---- new_names = dict(zip(range(n_into), into)) keep_splits = all_splits.iloc[:, :n_into].rename(columns = new_names) - + out = pd.concat([__data, keep_splits], axis = 1) # attempt to convert columns to numeric ---- if convert: - # TODO: better strategy here? + # TODO: better strategy here? for k in into: try: out[k] = pd.to_numeric(out[k]) @@ -1405,14 +1461,14 @@ def extract( # attempt to convert columns to numeric ---- if convert: - # TODO: better strategy here? + # TODO: better strategy here? for k in keep_splits: try: keep_splits[k] = pd.to_numeric(keep_splits[k]) except ValueError: pass - + out = pd.concat([__data, keep_splits], axis = 1) if remove: diff --git a/siuba/tests/test_verb_bind.py b/siuba/tests/test_verb_bind.py new file mode 100644 index 00000000..0a4e7f02 --- /dev/null +++ b/siuba/tests/test_verb_bind.py @@ -0,0 +1,216 @@ +import pandas as pd +import pytest + +from datetime import timedelta, date, datetime +from string import ascii_letters +from pandas.testing import assert_frame_equal, assert_series_equal + +from siuba.dply.verbs import bind_cols, bind_rows +from .helpers import data_frame + +@pytest.mark.skip +def test_bind_cols_shallow_copies(): + # https://github.com/tidyverse/dplyr/blob/main/tests/testthat/test-bind.R#L3 + pass + + +@pytest.mark.skip +def test_bind_cols_lists(): + # see https://github.com/tidyverse/dplyr/issues/1104 + # the siuba analog would probably be dictionaries? 
+    exp = data_frame(x = 1, y = "a", z = 2)
+
+    pass
+
+
+# Note: omitting other bind_cols list-based tests
+
+@pytest.mark.skip
+def test_that_bind_cols_repairs_names():
+    pass
+
+
+@pytest.mark.skip
+def test_that_bind_cols_honors_name_repair():
+    pass
+
+
+# rows ------------------------------------------------------------------------
+
+@pytest.fixture
+def df_var():
+    today = date.today()
+    now = datetime.now()
+    return data_frame(
+        l = [True, False, False],
+        i = [1, 1, 2],
+        d = [today + timedelta(days=i) for i in [1, 1, 2]],
+        f = pd.Categorical(["a", "a", "b"]),
+        n = [1.5, 1.5, 2.5],
+        t = [now + timedelta(seconds=i) for i in [1, 1, 2]],
+        c = ["a", "a", "b"],
+    )
+
+
+def test_bind_rows_equiv_to_concat(df_var):
+    exp = pd.concat([df_var, df_var, df_var], axis=0)
+    res = bind_rows(df_var, df_var, df_var)
+
+    assert_frame_equal(res, exp)
+
+
+def test_bind_rows_reorders_columns(df_var):
+    new_order = list(df_var.columns[3::-1]) + list(df_var.columns[:3:-1])
+    df_var_scramble = df_var[new_order]
+
+    assert_frame_equal(
+        bind_rows(df_var, df_var_scramble),
+        bind_rows(df_var, df_var)
+    )
+
+
+@pytest.mark.skip
+def test_bind_rows_ignores_null():
+    pass
+
+
+def test_bind_rows_list_columns():
+    vals = [[1,2], [1,2,3]]
+
+    dfl = data_frame(x = vals)
+    res = bind_rows(dfl, dfl)
+
+    exp = data_frame(x = vals*2, _index = [0,1]*2)
+
+    assert_frame_equal(res, exp)
+
+
+@pytest.mark.xfail
+def test_bind_rows_list_of_dfs():
+    # https://github.com/tidyverse/dplyr/issues/1389
+    df = data_frame(x = 1)
+
+    res = bind_rows([df, df], [df, df])
+    assert len(res) == 4
+    assert_frame_equal(res, bind_rows(*[df]*4))
+
+
+def test_bind_rows_handles_dfs_no_rows():
+    df1 = data_frame(x = 1, y = pd.Categorical(["a"]))
+    df0 = df1.loc[pd.Index([]), :]
+
+    assert_frame_equal(bind_rows(df0), df0)
+    assert_frame_equal(bind_rows(df0, df0), df0)
+    assert_frame_equal(bind_rows(df0, df1), df1)
+
+
+def test_bind_rows_handles_dfs_no_cols():
+    df1 = data_frame(x = 1, y = pd.Categorical(["a"]))
+    df0 = df1.loc[:, pd.Index([])]
+
+    assert_frame_equal(bind_rows(df0), df0)
+    assert bind_rows(df0, df0).shape == (2, 0)
+
+
+@pytest.mark.skip
+def test_bind_rows_lists_with_nulls():
+    pass
+
+
+@pytest.mark.skip
+def test_bind_rows_lists_with_list_values():
+    pass
+
+
+def test_that_bind_rows_order_even_no_cols():
+    df2 = data_frame(x = 2, y = "b")
+    df1 = df2.loc[:, pd.Index([])]
+
+    res = bind_rows(df1, df2).convert_dtypes()
+
+    indx = [0,0]
+    assert_series_equal(res.x, pd.Series([pd.NA, 2], index=indx, dtype="Int64", name="x"))
+    assert_series_equal(res.y, pd.Series([pd.NA, "b"], index=indx, dtype="string", name="y"))
+
+
+# Column coercion -------------------------------------------------------------
+
+# Note: I think most of these are handled by pandas or unavoidable
+
+@pytest.mark.xfail
+def test_bind_rows_creates_column_of_identifiers():
+    df = data_frame(x = [1,2,3], y = ["a", "b", "c"])
+    data1 = df.iloc[1:,]
+    data2 = df.iloc[:1,]
+
+    out = bind_rows(data1, data2, _id = "col")
+
+    # Note: omitted test of bind_rows(list(...))
+
+    assert out.columns[0] == "col"
+
+    # TODO(question): should it use 0 indexing? Would say yes, since then it just
+    # corresponds to the arg index
+    assert (out.col == ["0", "0", "1"]).all()
+
+    out_labelled = bind_rows(zero = data1, one = data2, _id = "col")
+    assert (out_labelled.col == ["zero", "zero", "one"]).all()
+
+
+@pytest.mark.xfail
+def test_bind_cols_accepts_null():
+    df1 = data_frame(a = list(range(10)), b = list(range(10)))
+    df2 = data_frame(c = list(range(10)), d = list(range(10)))
+
+    res1 = bind_cols(df1, df2)
+    res2 = bind_cols(None, df1, df2)
+    res3 = bind_cols(df1, None, df2)
+    res4 = bind_cols(df1, df2, None)
+
+    assert_frame_equal(res1, res2)
+    assert_frame_equal(res1, res3)
+    assert_frame_equal(res1, res4)
+
+
+@pytest.mark.skip
+def test_bind_rows_handles_0_len_named_list():
+    pass
+
+
+@pytest.mark.xfail
+def test_bind_rows_infers_classes_from_first_result():
+    # TODO(question): is this what pd.concat does? DataFrames are subclassable..
+    pass
+
+
+@pytest.mark.skip
+def test_bind_rows_sub_df_columns():
+    pass
+
+
+@pytest.mark.xfail
+def test_bind_rows_handles_rowwise_vectors():
+    tbl = bind_rows(
+        data_frame(a = "foo", b = "bar"),
+        dict(a = "A", b = "B"),
+    )
+
+    assert_frame_equal(tbl, data_frame(a = ["foo", "A"], b = ["bar", "B"]))
+
+
+@pytest.mark.skip
+def test_bind_rows_lists_of_df_like_lists():
+    # I think this mostly exists because R has to use do.call(...), while
+    # python can easily splat with *[...]
+    pass
+
+
+def test_bind_rows_handles_lists():
+    # see https://github.com/tidyverse/dplyr/issues/1104
+    [dict(x = 1, y = "a"), dict(x = 2, y = "b")]
+
+
+# Vectors ---------------------------------------------------------------------
+
+# Note: seems like bind_col tests here are overkill?
+# bind_cols vector features are similar to mutate
+
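+
+
+# Note: a minimal, hedged sketch of a bind_cols check (not a dplyr port). The
+# positional-alignment expectation below is an assumption based on the current
+# implementation, which resets each frame's index before pd.concat(axis=1).
+def test_bind_cols_resets_index_before_binding():
+    df1 = data_frame(a = [1, 2], _index = [0, 1])
+    df2 = data_frame(b = [3, 4], _index = [10, 11])
+
+    res = bind_cols(df1, df2)
+
+    # rows are matched by position, not by index label
+    assert_frame_equal(res, data_frame(a = [1, 2], b = [3, 4]))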