Skip to content

Commit cdda3e7

Browse files
committed
better dc default
umap: match transpose; index type-spec concat; type-spec concat dc for comp_cluster; dirty_cat as default (cc passes most tests); source cu_cat from PyPI; remove cc tests (now covered in dc tests); init: 1 dirty_cat > 2 cu_cat; use constants throughout, then revert from constants; better dc default
1 parent 5a69233 commit cdda3e7

File tree

4 files changed

+74
-76
lines changed

4 files changed

+74
-76
lines changed

graphistry/feature_utils.py

Lines changed: 53 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ def resolve_feature_engine(
201201
feature_engine: FeatureEngine,
202202
) -> FeatureEngineConcrete: # noqa
203203

204-
if feature_engine in ["none", "pandas", DIRTY_CAT, "torch", CUDA_CAT]:
204+
if feature_engine in ["none", "pandas", "dirty_cat", "torch", "cu_cat"]:
205205
return feature_engine # type: ignore
206206
if feature_engine == "auto":
207207
has_dependancy_text_, _, _ = lazy_import_has_dependancy_text()
@@ -967,19 +967,19 @@ def process_dirty_dataframes(
967967
the data encoder, and the label encoder.
968968
"""
969969

970-
if feature_engine == CUDA_CAT:
970+
if feature_engine == "cu_cat":
971971
assert_imported_cucat()
972-
from cu_cat import SuperVectorizer, GapEncoder # , SimilarityEncoder
972+
from cu_cat import SuperVectorizer, GapEncoder
973973
from cuml.preprocessing import FunctionTransformer
974-
975-
else: # if feature_engine == "dirty_cat": # DIRTY_CAT
976-
from dirty_cat import SuperVectorizer, GapEncoder # , SimilarityEncoder
974+
975+
else:
976+
from dirty_cat import SuperVectorizer, GapEncoder
977977
from sklearn.preprocessing import FunctionTransformer
978978

979979
t = time()
980980

981981
if not is_dataframe_all_numeric(ndf):
982-
if feature_engine == CUDA_CAT:
982+
if feature_engine == "cu_cat":
983983
data_encoder = SuperVectorizer(
984984
auto_cast=True,
985985
cardinality_threshold=cardinality_threshold_target,
@@ -1010,9 +1010,9 @@ def process_dirty_dataframes(
10101010
features_transformed = data_encoder.get_feature_names_out()
10111011

10121012
all_transformers = data_encoder.transformers
1013-
if feature_engine == CUDA_CAT:
1013+
if feature_engine == "cu_cat":
10141014
logger.info(f"-Shape of [[cu_cat fit]] data {X_enc.shape}")
1015-
elif feature_engine == DIRTY_CAT:
1015+
else:
10161016
logger.info(f"-Shape of [[dirty_cat fit]] data {X_enc.shape}")
10171017
logger.debug(f"-Transformers: \n{all_transformers}\n")
10181018
logger.debug(
@@ -1058,7 +1058,7 @@ def process_dirty_dataframes(
10581058
t2 = time()
10591059
logger.debug("-Fitting Targets --\n%s", y.columns)
10601060

1061-
if feature_engine == CUDA_CAT:
1061+
if feature_engine == "cu_cat":
10621062
label_encoder = SuperVectorizer(
10631063
auto_cast=True,
10641064
cardinality_threshold=cardinality_threshold_target,
@@ -1486,10 +1486,17 @@ def process_edge_dataframes(
14861486
other_df, y
14871487
)
14881488
# add the two datasets together
1489-
if feature_engine == 'pandas':
1490-
X_enc = pd.concat([T, X_enc], axis=1)
1491-
elif feature_engine == 'cudf':
1489+
has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf()
1490+
T_type = str(getmodule(T))
1491+
X_type = str(getmodule(X_enc))
1492+
if 'cudf' in T_type and 'cudf' in X_type:
14921493
X_enc = cudf.concat([T, X_enc], axis=1)
1494+
elif 'pd' in T_type and 'pd' in X_type:
1495+
X_enc = pd.concat([T, X_enc], axis=1)
1496+
elif 'cudf' in T_type and 'pd' in X_type:
1497+
X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1)
1498+
elif 'pd' in T_type and 'cudf' in X_type:
1499+
X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1)
14931500
# then scale them
14941501
X_encs, y_encs, scaling_pipeline, scaling_pipeline_target = smart_scaler( # noqa
14951502
X_enc,
@@ -1556,21 +1563,17 @@ def process_edge_dataframes(
15561563
if not X_enc.empty and not T.empty:
15571564
logger.debug("-" * 60)
15581565
logger.debug("<= Found Edges and Dirty_cat encoding =>")
1566+
has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf()
15591567
T_type = str(getmodule(T))
15601568
X_type = str(getmodule(X_enc))
15611569
if 'cudf' in T_type and 'cudf' in X_type:
15621570
X_enc = cudf.concat([T, X_enc], axis=1)
15631571
elif 'pd' in T_type and 'pd' in X_type:
15641572
X_enc = pd.concat([T, X_enc], axis=1)
1565-
else:
1566-
try:
1567-
X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1)
1568-
except:
1569-
pass
1570-
try:
1571-
X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1)
1572-
except:
1573-
pass
1573+
elif 'cudf' in T_type and 'pd' in X_type:
1574+
X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1)
1575+
elif 'pd' in T_type and 'cudf' in X_type:
1576+
X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1)
15741577
elif not T.empty and X_enc.empty:
15751578
logger.debug("-" * 60)
15761579
logger.debug("<= Found only Edges =>")
@@ -1750,7 +1753,18 @@ def transform(
17501753

17511754
# concat text to dirty_cat, with text in front.
17521755
if not tX.empty and not X.empty:
1753-
X = pd.concat([tX, X], axis=1)
1756+
has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf()
1757+
T_type = str(getmodule(tX))
1758+
X_type = str(getmodule(X))
1759+
if 'cudf' in T_type and 'cudf' in X_type:
1760+
X = cudf.concat([tX, X], axis=1)
1761+
elif 'pd' in T_type and 'pd' in X_type:
1762+
X = pd.concat([tX, X], axis=1)
1763+
elif 'cudf' in T_type and 'pd' in X_type:
1764+
X = cudf.concat([cudf.from_pandas(tX), X], axis=1)
1765+
elif 'pd' in T_type and 'cudf' in X_type:
1766+
X = cudf.concat([tX, cudf.from_pandas(X)], axis=1)
1767+
# X = pd.concat([tX, X], axis=1)
17541768
logger.info("--Combining both Textual and Numeric/Dirty_Cat")
17551769
elif not tX.empty and X.empty:
17561770
X = tX # textual
@@ -1765,7 +1779,18 @@ def transform(
17651779

17661780
# now if edges, add T at front
17671781
if kind == "edges":
1768-
X = pd.concat([T, X], axis=1) # edges, text, dirty_cat
1782+
# X = pd.concat([T, X], axis=1) # edges, text, dirty_cat
1783+
has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf()
1784+
T_type = str(getmodule(T))
1785+
X_type = str(getmodule(X))
1786+
if 'cudf' in T_type and 'cudf' in X_type:
1787+
X = cudf.concat([T, X], axis=1)
1788+
elif 'pd' in T_type and 'pd' in X_type:
1789+
X = pd.concat([T, X], axis=1)
1790+
elif 'cudf' in T_type and 'pd' in X_type:
1791+
X = cudf.concat([cudf.from_pandas(T), X], axis=1)
1792+
elif 'pd' in T_type and 'cudf' in X_type:
1793+
X = cudf.concat([T, cudf.from_pandas(X)], axis=1)
17691794
logger.info("-Combining MultiLabelBinarizer with previous features")
17701795

17711796
logger.info("-" * 40)
@@ -2656,10 +2681,11 @@ def featurize(
26562681
"""
26572682
feature_engine = resolve_feature_engine(feature_engine)
26582683

2659-
if feature_engine == 'dirty_cat':
2660-
assert_imported_min()
2661-
elif feature_engine == 'cu_cat':
2684+
2685+
if feature_engine == "cu_cat":
26622686
assert_imported_cucat()
2687+
else:
2688+
assert_imported_min()
26632689

26642690
if inplace:
26652691
res = self

graphistry/tests/test_feature_utils.py

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -444,48 +444,5 @@ def test_edge_scaling(self):
444444
return_scalers=True)
445445

446446

447-
class TestFeaturizeGetMethodsCucat(unittest.TestCase):
448-
449-
@pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
450-
@pytest.mark.skipif(not is_test_cudf, reason="requires cudf")
451-
def setUp(self) -> None:
452-
_, _, cudf = lazy_import_has_dependancy_cudf()
453-
ndf_malware = pd.read_csv("graphistry/tests/data/malware_capture_bot.csv", index_col=0)
454-
g = graphistry.nodes(cudf.from_pandas(ndf_malware))
455-
456-
g2 = g.featurize(y=cudf.from_pandas(double_target_reddit), # ngrams
457-
use_ngrams=True,
458-
ngram_range=(1, 4)
459-
)
460-
461-
g3 = g.featurize(**topic_model, feature_engine="cu_cat") # topic model
462-
self.g = g
463-
self.g2 = g2
464-
self.g3 = g3
465-
466-
@pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
467-
@pytest.mark.skipif(not is_test_cudf, reason="requires cudf")
468-
def test_get_col_matrix(self):
469-
_, _, cudf = lazy_import_has_dependancy_cudf()
470-
# no edges so this should be None
471-
assert self.g2.get_matrix(kind='edges') is None
472-
473-
# test target methods
474-
assert all(self.g2.get_matrix(target=True).columns == self.g2._node_target.columns)
475-
# assert self.g2.get_matrix('Anxiety', target=True).shape[0] == len(self.g2._node_target)
476-
# test str vs list
477-
# assert (self.g2.get_matrix('Anxiety', target=True) == self.g2.get_matrix(['Anxiety'], target=True)).all().values[0]
478-
479-
# assert list(self.g2.get_matrix(['Anxiety', 'education', 'computer'], target=True).columns) == ['label_Anxiety', 'label_education', 'label_computervision']
480-
481-
# test feature methods
482-
# ngrams
483-
assert (self.g2.get_matrix().columns == self.g2._node_features.columns).all()
484-
# assert list(self.g2.get_matrix('what').columns) == what, list(self.g2.get_matrix('what').columns)
485-
486-
# topic
487-
assert all(self.g3.get_matrix().columns == self.g3._node_features.columns)
488-
489-
490447
if __name__ == "__main__":
491448
unittest.main()

graphistry/umap_utils.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -728,12 +728,27 @@ def _bind_xy_from_umap(
728728
emb = res._edge_embedding
729729

730730
if isinstance(df, type(emb)):
731-
df[x_name] = emb.values.T[0]
732-
df[y_name] = emb.values.T[1]
731+
try:
732+
df[x_name] = emb.values.T[0]
733+
df[y_name] = emb.values.T[1]
734+
except:
735+
pass
736+
try:
737+
df[x_name] = emb.values[0]
738+
df[y_name] = emb.values[1]
739+
except:
740+
pass
733741
elif isinstance(df, pd.DataFrame) and 'cudf' in str(getmodule(emb)):
734-
df[x_name] = emb.to_numpy().T[0]
735-
df[y_name] = emb.to_numpy().T[1]
736-
742+
try:
743+
df[x_name] = emb.to_numpy().T[0]
744+
df[y_name] = emb.to_numpy().T[1]
745+
except:
746+
pass
747+
try:
748+
df[x_name] = emb.to_numpy()[0]
749+
df[y_name] = emb.to_numpy()[1]
750+
except:
751+
pass
737752
res = res.nodes(df) if kind == "nodes" else res.edges(df)
738753

739754
if encode_weight and kind == "nodes":

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def unique_flatten_dict(d):
4747
# https://github.com/facebookresearch/faiss/issues/1589 for faiss-cpu 1.6.1, #'setuptools==67.4.0' removed
4848
base_extras_heavy['ai'] = base_extras_heavy['umap-learn'] + ['scipy', 'dgl', 'torch<2', 'sentence-transformers', 'faiss-cpu', 'joblib']
4949

50-
base_extras_heavy['cu_cat'] = base_extras_heavy['ai'] + ['cu_cat @ git+http://github.com/graphistry/[email protected]']
50+
base_extras_heavy['cu_cat'] = base_extras_heavy['ai'] + ['cu_cat']
5151

5252
base_extras = {**base_extras_light, **base_extras_heavy}
5353

0 commit comments

Comments (0)