Skip to content

Commit cdda3e7

Browse files
committed
better dc default
umap: match transpose; index type-spec concat; type-spec concat dc for comp_cluster; dirty_cat as default (cc passes most tests); source cu_cat from PyPI; remove cc tests (now covered in dc tests); init: 1 dirty_cat > 2 cu_cat; use constants throughout, then revert from constants; better dc default
1 parent 5a69233 commit cdda3e7

File tree

4 files changed

+74
-76
lines changed

4 files changed

+74
-76
lines changed

graphistry/feature_utils.py

Lines changed: 53 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ def resolve_feature_engine(
201201
feature_engine: FeatureEngine,
202202
) -> FeatureEngineConcrete: # noqa
203203

204-
if feature_engine in ["none", "pandas", DIRTY_CAT, "torch", CUDA_CAT]:
204+
if feature_engine in ["none", "pandas", "dirty_cat", "torch", "cu_cat"]:
205205
return feature_engine # type: ignore
206206
if feature_engine == "auto":
207207
has_dependancy_text_, _, _ = lazy_import_has_dependancy_text()
@@ -967,19 +967,19 @@ def process_dirty_dataframes(
967967
the data encoder, and the label encoder.
968968
"""
969969

970-
if feature_engine == CUDA_CAT:
970+
if feature_engine == "cu_cat":
971971
assert_imported_cucat()
972-
from cu_cat import SuperVectorizer, GapEncoder # , SimilarityEncoder
972+
from cu_cat import SuperVectorizer, GapEncoder
973973
from cuml.preprocessing import FunctionTransformer
974-
975-
else: # if feature_engine == "dirty_cat": # DIRTY_CAT
976-
from dirty_cat import SuperVectorizer, GapEncoder # , SimilarityEncoder
974+
975+
else:
976+
from dirty_cat import SuperVectorizer, GapEncoder
977977
from sklearn.preprocessing import FunctionTransformer
978978

979979
t = time()
980980

981981
if not is_dataframe_all_numeric(ndf):
982-
if feature_engine == CUDA_CAT:
982+
if feature_engine == "cu_cat":
983983
data_encoder = SuperVectorizer(
984984
auto_cast=True,
985985
cardinality_threshold=cardinality_threshold_target,
@@ -1010,9 +1010,9 @@ def process_dirty_dataframes(
10101010
features_transformed = data_encoder.get_feature_names_out()
10111011

10121012
all_transformers = data_encoder.transformers
1013-
if feature_engine == CUDA_CAT:
1013+
if feature_engine == "cu_cat":
10141014
logger.info(f"-Shape of [[cu_cat fit]] data {X_enc.shape}")
1015-
elif feature_engine == DIRTY_CAT:
1015+
else:
10161016
logger.info(f"-Shape of [[dirty_cat fit]] data {X_enc.shape}")
10171017
logger.debug(f"-Transformers: \n{all_transformers}\n")
10181018
logger.debug(
@@ -1058,7 +1058,7 @@ def process_dirty_dataframes(
10581058
t2 = time()
10591059
logger.debug("-Fitting Targets --\n%s", y.columns)
10601060

1061-
if feature_engine == CUDA_CAT:
1061+
if feature_engine == "cu_cat":
10621062
label_encoder = SuperVectorizer(
10631063
auto_cast=True,
10641064
cardinality_threshold=cardinality_threshold_target,
@@ -1486,10 +1486,17 @@ def process_edge_dataframes(
14861486
other_df, y
14871487
)
14881488
# add the two datasets together
1489-
if feature_engine == 'pandas':
1490-
X_enc = pd.concat([T, X_enc], axis=1)
1491-
elif feature_engine == 'cudf':
1489+
has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf()
1490+
T_type = str(getmodule(T))
1491+
X_type = str(getmodule(X_enc))
1492+
if 'cudf' in T_type and 'cudf' in X_type:
14921493
X_enc = cudf.concat([T, X_enc], axis=1)
1494+
elif 'pd' in T_type and 'pd' in X_type:
1495+
X_enc = pd.concat([T, X_enc], axis=1)
1496+
elif 'cudf' in T_type and 'pd' in X_type:
1497+
X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1)
1498+
elif 'pd' in T_type and 'cudf' in X_type:
1499+
X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1)
14931500
# then scale them
14941501
X_encs, y_encs, scaling_pipeline, scaling_pipeline_target = smart_scaler( # noqa
14951502
X_enc,
@@ -1556,21 +1563,17 @@ def process_edge_dataframes(
15561563
if not X_enc.empty and not T.empty:
15571564
logger.debug("-" * 60)
15581565
logger.debug("<= Found Edges and Dirty_cat encoding =>")
1566+
has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf()
15591567
T_type = str(getmodule(T))
15601568
X_type = str(getmodule(X_enc))
15611569
if 'cudf' in T_type and 'cudf' in X_type:
15621570
X_enc = cudf.concat([T, X_enc], axis=1)
15631571
elif 'pd' in T_type and 'pd' in X_type:
15641572
X_enc = pd.concat([T, X_enc], axis=1)
1565-
else:
1566-
try:
1567-
X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1)
1568-
except:
1569-
pass
1570-
try:
1571-
X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1)
1572-
except:
1573-
pass
1573+
elif 'cudf' in T_type and 'pd' in X_type:
1574+
X_enc = cudf.concat([cudf.from_pandas(T), X_enc], axis=1)
1575+
elif 'pd' in T_type and 'cudf' in X_type:
1576+
X_enc = cudf.concat([T, cudf.from_pandas(X_enc)], axis=1)
15741577
elif not T.empty and X_enc.empty:
15751578
logger.debug("-" * 60)
15761579
logger.debug("<= Found only Edges =>")
@@ -1750,7 +1753,18 @@ def transform(
17501753

17511754
# concat text to dirty_cat, with text in front.
17521755
if not tX.empty and not X.empty:
1753-
X = pd.concat([tX, X], axis=1)
1756+
has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf()
1757+
T_type = str(getmodule(tX))
1758+
X_type = str(getmodule(X))
1759+
if 'cudf' in T_type and 'cudf' in X_type:
1760+
X = cudf.concat([tX, X], axis=1)
1761+
elif 'pd' in T_type and 'pd' in X_type:
1762+
X = pd.concat([tX, X], axis=1)
1763+
elif 'cudf' in T_type and 'pd' in X_type:
1764+
X = cudf.concat([cudf.from_pandas(tX), X], axis=1)
1765+
elif 'pd' in T_type and 'cudf' in X_type:
1766+
X = cudf.concat([tX, cudf.from_pandas(X)], axis=1)
1767+
# X = pd.concat([tX, X], axis=1)
17541768
logger.info("--Combining both Textual and Numeric/Dirty_Cat")
17551769
elif not tX.empty and X.empty:
17561770
X = tX # textual
@@ -1765,7 +1779,18 @@ def transform(
17651779

17661780
# now if edges, add T at front
17671781
if kind == "edges":
1768-
X = pd.concat([T, X], axis=1) # edges, text, dirty_cat
1782+
# X = pd.concat([T, X], axis=1) # edges, text, dirty_cat
1783+
has_dependancy_cudf_, import_exn, cudf = lazy_import_has_dependancy_cudf()
1784+
T_type = str(getmodule(T))
1785+
X_type = str(getmodule(X))
1786+
if 'cudf' in T_type and 'cudf' in X_type:
1787+
X = cudf.concat([T, X], axis=1)
1788+
elif 'pd' in T_type and 'pd' in X_type:
1789+
X = pd.concat([T, X], axis=1)
1790+
elif 'cudf' in T_type and 'pd' in X_type:
1791+
X = cudf.concat([cudf.from_pandas(T), X], axis=1)
1792+
elif 'pd' in T_type and 'cudf' in X_type:
1793+
X = cudf.concat([T, cudf.from_pandas(X)], axis=1)
17691794
logger.info("-Combining MultiLabelBinarizer with previous features")
17701795

17711796
logger.info("-" * 40)
@@ -2656,10 +2681,11 @@ def featurize(
26562681
"""
26572682
feature_engine = resolve_feature_engine(feature_engine)
26582683

2659-
if feature_engine == 'dirty_cat':
2660-
assert_imported_min()
2661-
elif feature_engine == 'cu_cat':
2684+
2685+
if feature_engine == "cu_cat":
26622686
assert_imported_cucat()
2687+
else:
2688+
assert_imported_min()
26632689

26642690
if inplace:
26652691
res = self

graphistry/tests/test_feature_utils.py

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -444,48 +444,5 @@ def test_edge_scaling(self):
444444
return_scalers=True)
445445

446446

447-
class TestFeaturizeGetMethodsCucat(unittest.TestCase):
448-
449-
@pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
450-
@pytest.mark.skipif(not is_test_cudf, reason="requires cudf")
451-
def setUp(self) -> None:
452-
_, _, cudf = lazy_import_has_dependancy_cudf()
453-
ndf_malware = pd.read_csv("graphistry/tests/data/malware_capture_bot.csv", index_col=0)
454-
g = graphistry.nodes(cudf.from_pandas(ndf_malware))
455-
456-
g2 = g.featurize(y=cudf.from_pandas(double_target_reddit), # ngrams
457-
use_ngrams=True,
458-
ngram_range=(1, 4)
459-
)
460-
461-
g3 = g.featurize(**topic_model, feature_engine="cu_cat") # topic model
462-
self.g = g
463-
self.g2 = g2
464-
self.g3 = g3
465-
466-
@pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
467-
@pytest.mark.skipif(not is_test_cudf, reason="requires cudf")
468-
def test_get_col_matrix(self):
469-
_, _, cudf = lazy_import_has_dependancy_cudf()
470-
# no edges so this should be None
471-
assert self.g2.get_matrix(kind='edges') is None
472-
473-
# test target methods
474-
assert all(self.g2.get_matrix(target=True).columns == self.g2._node_target.columns)
475-
# assert self.g2.get_matrix('Anxiety', target=True).shape[0] == len(self.g2._node_target)
476-
# test str vs list
477-
# assert (self.g2.get_matrix('Anxiety', target=True) == self.g2.get_matrix(['Anxiety'], target=True)).all().values[0]
478-
479-
# assert list(self.g2.get_matrix(['Anxiety', 'education', 'computer'], target=True).columns) == ['label_Anxiety', 'label_education', 'label_computervision']
480-
481-
# test feature methods
482-
# ngrams
483-
assert (self.g2.get_matrix().columns == self.g2._node_features.columns).all()
484-
# assert list(self.g2.get_matrix('what').columns) == what, list(self.g2.get_matrix('what').columns)
485-
486-
# topic
487-
assert all(self.g3.get_matrix().columns == self.g3._node_features.columns)
488-
489-
490447
if __name__ == "__main__":
491448
unittest.main()

graphistry/umap_utils.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -728,12 +728,27 @@ def _bind_xy_from_umap(
728728
emb = res._edge_embedding
729729

730730
if isinstance(df, type(emb)):
731-
df[x_name] = emb.values.T[0]
732-
df[y_name] = emb.values.T[1]
731+
try:
732+
df[x_name] = emb.values.T[0]
733+
df[y_name] = emb.values.T[1]
734+
except:
735+
pass
736+
try:
737+
df[x_name] = emb.values[0]
738+
df[y_name] = emb.values[1]
739+
except:
740+
pass
733741
elif isinstance(df, pd.DataFrame) and 'cudf' in str(getmodule(emb)):
734-
df[x_name] = emb.to_numpy().T[0]
735-
df[y_name] = emb.to_numpy().T[1]
736-
742+
try:
743+
df[x_name] = emb.to_numpy().T[0]
744+
df[y_name] = emb.to_numpy().T[1]
745+
except:
746+
pass
747+
try:
748+
df[x_name] = emb.to_numpy()[0]
749+
df[y_name] = emb.to_numpy()[1]
750+
except:
751+
pass
737752
res = res.nodes(df) if kind == "nodes" else res.edges(df)
738753

739754
if encode_weight and kind == "nodes":

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def unique_flatten_dict(d):
4747
# https://github.com/facebookresearch/faiss/issues/1589 for faiss-cpu 1.6.1, #'setuptools==67.4.0' removed
4848
base_extras_heavy['ai'] = base_extras_heavy['umap-learn'] + ['scipy', 'dgl', 'torch<2', 'sentence-transformers', 'faiss-cpu', 'joblib']
4949

50-
base_extras_heavy['cu_cat'] = base_extras_heavy['ai'] + ['cu_cat @ git+http://github.com/graphistry/[email protected]']
50+
base_extras_heavy['cu_cat'] = base_extras_heavy['ai'] + ['cu_cat']
5151

5252
base_extras = {**base_extras_light, **base_extras_heavy}
5353

0 commit comments

Comments (0)