diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index b32442379f2..9c83535b771 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index f044aa8fbbc..a559be18077 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,24 +5,24 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-ucx1.15.0-openmpi-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ucx1.15.0-openmpi-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/cuda:24.8": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": { "version": "11.8", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - 
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/cuda", diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index c4149f04bed..ca10c04edee 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index 5e42537b46d..6e2bf45700a 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,24 +5,24 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ucx1.15.0-openmpi-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - 
"ghcr.io/rapidsai/devcontainers/features/cuda:24.8": { + "ghcr.io/rapidsai/devcontainers/features/cuda:24.10": { "version": "12.5", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/cuda", diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 03d3162bdbf..bc489ffd3f0 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -47,7 +47,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ 
inputs.sha }} wheel-build-pylibcugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -77,13 +77,13 @@ jobs: date: ${{ inputs.date }} script: ci/build_wheel_pylibcugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-24.08 + extra-repo-sha: branch-24.10 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY node_type: cpu32 wheel-publish-pylibcugraph: needs: wheel-build-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -93,7 +93,7 @@ jobs: wheel-build-cugraph: needs: wheel-publish-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,12 +101,12 @@ jobs: date: ${{ inputs.date }} script: ci/build_wheel_cugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-24.08 + extra-repo-sha: branch-24.10 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY wheel-publish-cugraph: needs: wheel-build-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -116,7 +116,7 @@ jobs: wheel-build-nx-cugraph: needs: wheel-publish-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -126,7 +126,7 @@ jobs: wheel-publish-nx-cugraph: needs: wheel-build-nx-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -136,7 +136,7 @@ jobs: wheel-build-cugraph-dgl: needs: wheel-publish-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -146,7 +146,7 @@ jobs: wheel-publish-cugraph-dgl: needs: wheel-build-cugraph-dgl secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -156,7 +156,7 @@ jobs: wheel-build-cugraph-pyg: needs: wheel-publish-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -166,7 +166,7 @@ jobs: wheel-publish-cugraph-pyg: needs: wheel-build-cugraph-pyg secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -175,7 +175,7 @@ jobs: package-name: cugraph-pyg 
wheel-build-cugraph-equivariant: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -185,7 +185,7 @@ jobs: wheel-publish-cugraph-equivariant: needs: wheel-build-cugraph-equivariant secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 94ed08d96d7..dacd9a93399 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -34,29 +34,29 @@ jobs: - wheel-tests-cugraph-equivariant - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: pull-request node_type: cpu32 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 
+ uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: pull-request enable_check_symbols: true @@ -64,19 +64,19 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: pull-request conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -86,7 +86,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -96,63 +96,63 @@ jobs: wheel-build-pylibcugraph: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: ci/build_wheel_pylibcugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-24.08 + extra-repo-sha: branch-24.10 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY node_type: cpu32 wheel-tests-pylibcugraph: needs: wheel-build-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel_pylibcugraph.sh wheel-build-cugraph: needs: wheel-tests-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: ci/build_wheel_cugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-24.08 + extra-repo-sha: branch-24.10 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY wheel-tests-cugraph: needs: wheel-build-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel_cugraph.sh wheel-build-nx-cugraph: needs: wheel-tests-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: ci/build_wheel_nx-cugraph.sh wheel-tests-nx-cugraph: needs: wheel-build-nx-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel_nx-cugraph.sh wheel-build-cugraph-dgl: needs: wheel-tests-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: ci/build_wheel_cugraph-dgl.sh wheel-tests-cugraph-dgl: needs: wheel-build-cugraph-dgl secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel_cugraph-dgl.sh @@ -160,35 +160,35 @@ jobs: wheel-build-cugraph-pyg: needs: wheel-tests-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: ci/build_wheel_cugraph-pyg.sh wheel-tests-cugraph-pyg: needs: wheel-build-cugraph-pyg secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel_cugraph-pyg.sh matrix_filter: map(select(.ARCH == "amd64")) wheel-build-cugraph-equivariant: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: ci/build_wheel_cugraph-equivariant.sh wheel-tests-cugraph-equivariant: needs: wheel-build-cugraph-equivariant secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel_cugraph-equivariant.sh matrix_filter: map(select(.ARCH == "amd64")) devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 with: arch: '["amd64"]' cuda: '["12.5"]' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 03d8b73a412..957d29ce72b 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -26,7 +26,7 @@ jobs: symbol_exclusions: (cugraph::ops|hornet|void writeEdgeCountsKernel|void markUniqueOffsetsKernel) conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibcugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: script: ci/test_wheel_pylibcugraph.sh wheel-tests-cugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -60,7 +60,7 @@ jobs: script: ci/test_wheel_cugraph.sh wheel-tests-nx-cugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: script: ci/test_wheel_nx-cugraph.sh wheel-tests-cugraph-dgl: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: matrix_filter: map(select(.ARCH == "amd64")) wheel-tests-cugraph-pyg: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: matrix_filter: map(select(.ARCH == "amd64")) wheel-tests-cugraph-equivariant: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/VERSION b/VERSION index ec8489fda92..7c7ba04436f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.08.00 +24.10.00 diff --git a/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py b/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py index face22c9283..8c46095a7da 100644 --- a/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py +++ b/benchmarks/cugraph/pytest-based/bench_cugraph_uniform_neighbor_sample.py @@ -48,6 +48,11 @@ _seed = 42 +############################################################################### +# Helpers +############################################################################### + + def create_graph(graph_data): """ Create a graph instance based on the data to be loaded/generated, return a @@ -107,9 +112,6 @@ def create_mg_graph(graph_data): Create a graph instance based on the data to be loaded/generated, return a tuple containing (graph_obj, num_verts, client, cluster) """ - # range starts at 1 to let let 0 be used by benchmark/client process - visible_devices = os.getenv("DASK_WORKER_DEVICES", 
"1,2,3,4") - (client, cluster) = start_dask_client( # enable_tcp_over_ucx=True, # enable_infiniband=False, @@ -117,7 +119,6 @@ def create_mg_graph(graph_data): # enable_rdmacm=False, protocol="ucx", rmm_pool_size="28GB", - dask_worker_devices=visible_devices, ) rmm.reinitialize(pool_allocator=True) @@ -261,6 +262,9 @@ def uns_func(*args, **kwargs): ################################################################################ # Benchmarks +############################################################################### +@pytest.mark.managedmem_off +@pytest.mark.poolallocator_on @pytest.mark.parametrize("batch_size", params.batch_sizes.values()) @pytest.mark.parametrize("fanout", [params.fanout_10_25, params.fanout_5_10_15]) @pytest.mark.parametrize( diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index ce2488ad0bf..08c22fca02e 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -69,18 +69,19 @@ DEPENDENCIES=( pyraft raft-dask rmm - ucx-py rapids-dask-dependency ) -for DEP in "${DEPENDENCIES[@]}"; do - for FILE in dependencies.yaml conda/environments/*.yaml python/cugraph-{pyg,dgl}/conda/*.yaml; do +for FILE in dependencies.yaml conda/environments/*.yaml python/cugraph-{pyg,dgl}/conda/*.yaml; do + for DEP in "${DEPENDENCIES[@]}"; do sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" - sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0/g" "${FILE}" done - for FILE in python/**/pyproject.toml python/**/**/pyproject.toml; do + sed_runner "/-.* ucx-py\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0/g" "${FILE}" +done +for FILE in python/**/pyproject.toml python/**/**/pyproject.toml; do + for DEP in "${DEPENDENCIES[@]}"; do sed_runner "/\"${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}" - sed_runner "/\"ucx-py==/ 
s/==.*\"/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0\"/g" "${FILE}" done + sed_runner "/\"ucx-py\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*\"/==${NEXT_UCX_PY_VERSION}.*,>=0.0.0a0\"/g" "${FILE}" done # ucx-py version diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 5d2c942cd0c..f0b86c791f8 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -16,22 +16,22 @@ dependencies: - cuda-nvtx - cuda-version=11.8 - cudatoolkit -- cudf==24.8.*,>=0.0.0a0 +- cudf==24.10.*,>=0.0.0a0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.8.*,>=0.0.0a0 -- dask-cudf==24.8.*,>=0.0.0a0 +- dask-cuda==24.10.*,>=0.0.0a0 +- dask-cudf==24.10.*,>=0.0.0a0 - doxygen - fsspec>=0.6.0 - gcc_linux-64=11.* - graphviz - ipython -- libcudf==24.8.*,>=0.0.0a0 -- libcugraphops==24.8.*,>=0.0.0a0 -- libraft-headers==24.8.*,>=0.0.0a0 -- libraft==24.8.*,>=0.0.0a0 -- librmm==24.8.*,>=0.0.0a0 +- libcudf==24.10.*,>=0.0.0a0 +- libcugraphops==24.10.*,>=0.0.0a0 +- libraft-headers==24.10.*,>=0.0.0a0 +- libraft==24.10.*,>=0.0.0a0 +- librmm==24.10.*,>=0.0.0a0 - nbsphinx - nccl>=2.9.9 - networkx>=2.5.1 @@ -46,10 +46,11 @@ dependencies: - packaging>=21 - pandas - pre-commit +- pydantic - pydata-sphinx-theme -- pylibcugraphops==24.8.*,>=0.0.0a0 -- pylibraft==24.8.*,>=0.0.0a0 -- pylibwholegraph==24.8.*,>=0.0.0a0 +- pylibcugraphops==24.10.*,>=0.0.0a0 +- pylibraft==24.10.*,>=0.0.0a0 +- pylibwholegraph==24.10.*,>=0.0.0a0 - pytest - pytest-benchmark - pytest-cov @@ -57,12 +58,12 @@ dependencies: - pytest-xdist - python-louvain - pytorch>=2.0,<2.2.0a0 -- raft-dask==24.8.*,>=0.0.0a0 +- raft-dask==24.10.*,>=0.0.0a0 - rapids-build-backend>=0.3.1,<0.4.0.dev0 -- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- rapids-dask-dependency==24.10.*,>=0.0.0a0 - recommonmark - requests -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - scikit-build-core>=0.7.0 - scikit-learn>=0.23.1 - scipy @@ -72,8 +73,9 @@ 
dependencies: - sphinx<6 - sphinxcontrib-websupport - thriftpy2!=0.5.0,!=0.5.1 +- torchdata - ucx-proc=*=gpu -- ucx-py==0.39.*,>=0.0.0a0 +- ucx-py==0.40.*,>=0.0.0a0 - wget - wheel name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index f8a95169ddd..ebded3eec92 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -18,26 +18,26 @@ dependencies: - cuda-nvtx-dev - cuda-profiler-api - cuda-version=12.5 -- cudf==24.8.*,>=0.0.0a0 +- cudf==24.10.*,>=0.0.0a0 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.8.*,>=0.0.0a0 -- dask-cudf==24.8.*,>=0.0.0a0 +- dask-cuda==24.10.*,>=0.0.0a0 +- dask-cudf==24.10.*,>=0.0.0a0 - doxygen - fsspec>=0.6.0 - gcc_linux-64=11.* - graphviz - ipython - libcublas-dev -- libcudf==24.8.*,>=0.0.0a0 -- libcugraphops==24.8.*,>=0.0.0a0 +- libcudf==24.10.*,>=0.0.0a0 +- libcugraphops==24.10.*,>=0.0.0a0 - libcurand-dev - libcusolver-dev - libcusparse-dev -- libraft-headers==24.8.*,>=0.0.0a0 -- libraft==24.8.*,>=0.0.0a0 -- librmm==24.8.*,>=0.0.0a0 +- libraft-headers==24.10.*,>=0.0.0a0 +- libraft==24.10.*,>=0.0.0a0 +- librmm==24.10.*,>=0.0.0a0 - nbsphinx - nccl>=2.9.9 - networkx>=2.5.1 @@ -51,10 +51,11 @@ dependencies: - packaging>=21 - pandas - pre-commit +- pydantic - pydata-sphinx-theme -- pylibcugraphops==24.8.*,>=0.0.0a0 -- pylibraft==24.8.*,>=0.0.0a0 -- pylibwholegraph==24.8.*,>=0.0.0a0 +- pylibcugraphops==24.10.*,>=0.0.0a0 +- pylibraft==24.10.*,>=0.0.0a0 +- pylibwholegraph==24.10.*,>=0.0.0a0 - pytest - pytest-benchmark - pytest-cov @@ -62,12 +63,12 @@ dependencies: - pytest-xdist - python-louvain - pytorch>=2.0,<2.2.0a0 -- raft-dask==24.8.*,>=0.0.0a0 +- raft-dask==24.10.*,>=0.0.0a0 - rapids-build-backend>=0.3.1,<0.4.0.dev0 -- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- rapids-dask-dependency==24.10.*,>=0.0.0a0 - recommonmark - requests -- rmm==24.8.*,>=0.0.0a0 +- 
rmm==24.10.*,>=0.0.0a0 - scikit-build-core>=0.7.0 - scikit-learn>=0.23.1 - scipy @@ -77,8 +78,9 @@ dependencies: - sphinx<6 - sphinxcontrib-websupport - thriftpy2!=0.5.0,!=0.5.1 +- torchdata - ucx-proc=*=gpu -- ucx-py==0.39.*,>=0.0.0a0 +- ucx-py==0.40.*,>=0.0.0a0 - wget - wheel name: all_cuda-125_arch-x86_64 diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml index ca4fdb7f2fc..d1cf6fcd9e9 100644 --- a/conda/recipes/cugraph-dgl/meta.yaml +++ b/conda/recipes/cugraph-dgl/meta.yaml @@ -22,14 +22,17 @@ requirements: host: - python - rapids-build-backend>=0.3.1,<0.4.0.dev0 + - setuptools>=61.0.0 run: - cugraph ={{ version }} - dgl >=1.1.0.cu* - numba >=0.57 - numpy >=1.23,<2.0a0 - pylibcugraphops ={{ minor_version }} + - tensordict >=0.1.2 - python - - pytorch + - pytorch >=2.0 + - cupy >=12.0.0 tests: imports: diff --git a/conda/recipes/cugraph-equivariant/meta.yaml b/conda/recipes/cugraph-equivariant/meta.yaml index 3b56c1d6b08..9dc9d51fa48 100644 --- a/conda/recipes/cugraph-equivariant/meta.yaml +++ b/conda/recipes/cugraph-equivariant/meta.yaml @@ -22,6 +22,7 @@ requirements: host: - python - rapids-build-backend>=0.3.1,<0.4.0.dev0 + - setuptools>=61.0.0 run: - pylibcugraphops ={{ minor_version }} - python diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 9833a78d88b..2e1788ac0c6 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -24,8 +24,8 @@ requirements: host: - cython >=3.0.0 - python - - scikit-build-core >=0.7.0 - rapids-build-backend>=0.3.1,<0.4.0.dev0 + - setuptools>=61.0.0 run: - rapids-dask-dependency ={{ minor_version }} - numba >=0.57 diff --git a/conda/recipes/cugraph-service/conda_build_config.yaml b/conda/recipes/cugraph-service/conda_build_config.yaml index 23aa872be9b..2ac251ab10a 100644 --- a/conda/recipes/cugraph-service/conda_build_config.yaml +++ b/conda/recipes/cugraph-service/conda_build_config.yaml @@ -1,2 +1,2 @@ 
ucx_py_version: - - "0.39.*" + - "0.40.*" diff --git a/conda/recipes/cugraph-service/meta.yaml b/conda/recipes/cugraph-service/meta.yaml index 225f40fe2ec..c1027582c78 100644 --- a/conda/recipes/cugraph-service/meta.yaml +++ b/conda/recipes/cugraph-service/meta.yaml @@ -30,6 +30,7 @@ outputs: - pip - python - rapids-build-backend>=0.3.1,<0.4.0.dev0 + - setuptools>=61.0.0 run: - python - thriftpy2 >=0.4.15,!=0.5.0,!=0.5.1 @@ -51,7 +52,7 @@ outputs: host: - pip - python - - setuptools + - setuptools>=61.0.0 - wheel - rapids-build-backend>=0.3.1,<0.4.0.dev0 run: diff --git a/conda/recipes/cugraph/conda_build_config.yaml b/conda/recipes/cugraph/conda_build_config.yaml index 0c73036610f..2525441f92d 100644 --- a/conda/recipes/cugraph/conda_build_config.yaml +++ b/conda/recipes/cugraph/conda_build_config.yaml @@ -20,4 +20,4 @@ c_stdlib_version: - "2.17" ucx_py_version: - - "0.39.*" + - "0.40.*" diff --git a/conda/recipes/nx-cugraph/meta.yaml b/conda/recipes/nx-cugraph/meta.yaml index 3b2d6067f9d..d67287be757 100644 --- a/conda/recipes/nx-cugraph/meta.yaml +++ b/conda/recipes/nx-cugraph/meta.yaml @@ -22,6 +22,7 @@ requirements: host: - python - rapids-build-backend>=0.3.1,<0.4.0.dev0 + - setuptools>=61.0.0 run: - pylibcugraph ={{ version }} - networkx >=3.0 diff --git a/conda/recipes/pylibcugraph/conda_build_config.yaml b/conda/recipes/pylibcugraph/conda_build_config.yaml index 0c73036610f..2525441f92d 100644 --- a/conda/recipes/pylibcugraph/conda_build_config.yaml +++ b/conda/recipes/pylibcugraph/conda_build_config.yaml @@ -20,4 +20,4 @@ c_stdlib_version: - "2.17" ucx_py_version: - - "0.39.*" + - "0.40.*" diff --git a/conda/recipes/pylibcugraph/meta.yaml b/conda/recipes/pylibcugraph/meta.yaml index 57ec1e7418f..15632cfcc0e 100644 --- a/conda/recipes/pylibcugraph/meta.yaml +++ b/conda/recipes/pylibcugraph/meta.yaml @@ -75,7 +75,9 @@ requirements: - cuda-cudart {% endif %} - libcugraph ={{ version }} + - pylibraft ={{ minor_version }} - python + - rmm ={{ minor_version }} 
tests: requirements: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 555bc44eb26..f3873ae0f9e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -284,6 +284,9 @@ set(CUGRAPH_SOURCES src/community/k_truss_sg_v64_e64.cu src/community/k_truss_sg_v32_e32.cu src/community/k_truss_sg_v32_e64.cu + src/community/k_truss_mg_v64_e64.cu + src/community/k_truss_mg_v32_e32.cu + src/community/k_truss_mg_v32_e64.cu src/lookup/lookup_src_dst_mg_v32_e32.cu src/lookup/lookup_src_dst_mg_v32_e64.cu src/lookup/lookup_src_dst_mg_v64_e64.cu diff --git a/cpp/include/cugraph/detail/shuffle_wrappers.hpp b/cpp/include/cugraph/detail/shuffle_wrappers.hpp index 69d48098a5d..7dffcce298a 100644 --- a/cpp/include/cugraph/detail/shuffle_wrappers.hpp +++ b/cpp/include/cugraph/detail/shuffle_wrappers.hpp @@ -53,7 +53,8 @@ std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -86,14 +87,15 @@ shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( * (exclusive) vertex ID. * * @return Tuple of vectors storing shuffled major vertices, minor vertices and optional weights, - * edge ids and edge types + * edge ids and edge types and rx counts */ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, diff --git a/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp index 63d7fd9685e..61ad833a529 100644 --- a/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp +++ b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp @@ -251,7 +251,8 @@ class per_device_edgelist_t { store_transposed ? 
src_[0] : dst_[0], tmp_wgt, tmp_edge_id, - tmp_edge_type) = + tmp_edge_type, + std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( handle.raft_handle(), store_transposed ? std::move(dst_[0]) : std::move(src_[0]), diff --git a/cpp/include/cugraph_c/similarity_algorithms.h b/cpp/include/cugraph_c/similarity_algorithms.h index 5b8462a1666..12f55132fc7 100644 --- a/cpp/include/cugraph_c/similarity_algorithms.h +++ b/cpp/include/cugraph_c/similarity_algorithms.h @@ -145,6 +145,34 @@ cugraph_error_code_t cugraph_overlap_coefficients(const cugraph_resource_handle_ cugraph_similarity_result_t** result, cugraph_error_t** error); +/** + * @brief Perform cosine similarity computation + * + * Compute the similarity for the specified vertex_pairs + * + * Note that cosine similarity must run on a symmetric graph. + * + * @param [in] handle Handle for accessing resources + * @param [in] graph Pointer to graph + * @param [in] vertex_pairs Vertex pair for input + * @param [in] use_weight If true consider the edge weight in the graph, if false use an + * edge weight of 1 + * @param [in] do_expensive_check A flag to run expensive checks for input arguments (if set to + * `true`). + * @param [out] result Opaque pointer to similarity results + * @param [out] error Pointer to an error object storing details of any error. 
Will + * be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_vertex_pairs_t* vertex_pairs, + bool_t use_weight, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error); + /** * @brief Perform All-Pairs Jaccard similarity computation * @@ -259,6 +287,44 @@ cugraph_error_code_t cugraph_all_pairs_overlap_coefficients( cugraph_similarity_result_t** result, cugraph_error_t** error); +/** + * @brief Perform All Pairs cosine similarity computation + * + * Compute the similarity for all vertex pairs derived from the two-hop neighbors + * of an optional specified vertex list. This function will identify the two-hop + * neighbors of the specified vertices (all vertices in the graph if not specified) + * and compute similarity for those vertices. + * + * If the topk parameter is specified then the result will only contain the top k + * highest scoring results. + * + * Note that cosine similarity must run on a symmetric graph. + * + * @param [in] handle Handle for accessing resources + * @param [in] graph Pointer to graph + * @param [in] vertices Vertex list for input. If null then compute based on + * all vertices in the graph. + * @param [in] use_weight If true consider the edge weight in the graph, if false use an + * edge weight of 1 + * @param [in] topk Specify how many answers to return. Specifying SIZE_MAX + * will return all values. + * @param [in] do_expensive_check A flag to run expensive checks for input arguments (if set to + * `true`). + * @param [out] result Opaque pointer to similarity results + * @param [out] error Pointer to an error object storing details of any error. 
Will + * be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_all_pairs_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error); + #ifdef __cplusplus } #endif diff --git a/cpp/src/c_api/graph_functions.cpp b/cpp/src/c_api/graph_functions.cpp index 91371b988b3..df741a349d2 100644 --- a/cpp/src/c_api/graph_functions.cpp +++ b/cpp/src/c_api/graph_functions.cpp @@ -72,7 +72,7 @@ struct create_vertex_pairs_functor : public cugraph::c_api::abstract_functor { second_copy.data(), second_->as_type(), second_->size_, handle_.get_stream()); if constexpr (multi_gpu) { - std::tie(first_copy, second_copy, std::ignore, std::ignore, std::ignore) = + std::tie(first_copy, second_copy, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, edge_t, diff --git a/cpp/src/c_api/graph_mg.cpp b/cpp/src/c_api/graph_mg.cpp index 22ceea3f629..cc4acd31743 100644 --- a/cpp/src/c_api/graph_mg.cpp +++ b/cpp/src/c_api/graph_mg.cpp @@ -167,7 +167,8 @@ struct create_graph_functor : public cugraph::c_api::abstract_functor { store_transposed ? edgelist_srcs : edgelist_dsts, edgelist_weights, edgelist_edge_ids, - edgelist_edge_types) = + edgelist_edge_types, + std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( handle_, std::move(store_transposed ? 
edgelist_dsts : edgelist_srcs), diff --git a/cpp/src/c_api/k_truss.cpp b/cpp/src/c_api/k_truss.cpp index 18e256b022a..37a0672676e 100644 --- a/cpp/src/c_api/k_truss.cpp +++ b/cpp/src/c_api/k_truss.cpp @@ -60,10 +60,7 @@ struct k_truss_functor : public cugraph::c_api::abstract_functor { { if constexpr (!cugraph::is_candidate::value) { unsupported(); - } else if constexpr (multi_gpu) { - unsupported(); } else { - // k_truss expects store_transposed == false if constexpr (store_transposed) { error_code_ = cugraph::c_api:: transpose_storage( diff --git a/cpp/src/c_api/similarity.cpp b/cpp/src/c_api/similarity.cpp index aa54fc6dee7..36f1a74f3e0 100644 --- a/cpp/src/c_api/similarity.cpp +++ b/cpp/src/c_api/similarity.cpp @@ -212,6 +212,22 @@ struct all_pairs_similarity_functor : public cugraph::c_api::abstract_functor { : std::nullopt, topk_ != SIZE_MAX ? std::make_optional(topk_) : std::nullopt); + cugraph::unrenumber_int_vertices( + handle_, + v1.data(), + v1.size(), + number_map->data(), + graph_view.vertex_partition_range_lasts(), + false); + + cugraph::unrenumber_int_vertices( + handle_, + v2.data(), + v2.size(), + number_map->data(), + graph_view.vertex_partition_range_lasts(), + false); + result_ = new cugraph::c_api::cugraph_similarity_result_t{ new cugraph::c_api::cugraph_type_erased_device_array_t(similarity_coefficients, graph_->weight_type_), @@ -274,6 +290,33 @@ struct sorensen_functor { } }; +struct cosine_functor { + template + rmm::device_uvector operator()( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs) + { + return cugraph::cosine_similarity_coefficients( + handle, graph_view, edge_weight_view, vertex_pairs); + } + + template + std::tuple, + rmm::device_uvector, + rmm::device_uvector> + operator()(raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional 
topk) + { + return cugraph::cosine_similarity_all_pairs_coefficients( + handle, graph_view, edge_weight_view, vertices, topk); + } +}; + struct overlap_functor { template rmm::device_uvector operator()( @@ -300,6 +343,33 @@ struct overlap_functor { } }; +struct cosine_similarity_functor { + template + rmm::device_uvector operator()( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs) + { + return cugraph::cosine_similarity_coefficients( + handle, graph_view, edge_weight_view, vertex_pairs); + } + + template + std::tuple, + rmm::device_uvector, + rmm::device_uvector> + operator()(raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk) + { + return cugraph::cosine_similarity_all_pairs_coefficients( + handle, graph_view, edge_weight_view, vertices, topk); + } +}; + } // namespace extern "C" cugraph_type_erased_device_array_view_t* cugraph_similarity_result_get_similarity( @@ -391,6 +461,28 @@ extern "C" cugraph_error_code_t cugraph_overlap_coefficients( return cugraph::c_api::run_algorithm(graph, functor, result, error); } +extern "C" cugraph_error_code_t cugraph_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_vertex_pairs_t* vertex_pairs, + bool_t use_weight, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error) +{ + if (use_weight) { + CAPI_EXPECTS( + reinterpret_cast(graph)->edge_weights_ != nullptr, + CUGRAPH_INVALID_INPUT, + "use_weight is true but edge weights are not provided.", + *error); + } + similarity_functor functor( + handle, graph, vertex_pairs, cosine_similarity_functor{}, use_weight, do_expensive_check); + + return cugraph::c_api::run_algorithm(graph, functor, result, error); +} + extern "C" cugraph_error_code_t 
cugraph_all_pairs_jaccard_coefficients( const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, @@ -459,3 +551,26 @@ extern "C" cugraph_error_code_t cugraph_all_pairs_overlap_coefficients( return cugraph::c_api::run_algorithm(graph, functor, result, error); } + +extern "C" cugraph_error_code_t cugraph_all_pairs_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error) +{ + if (use_weight) { + CAPI_EXPECTS( + reinterpret_cast(graph)->edge_weights_ != nullptr, + CUGRAPH_INVALID_INPUT, + "use_weight is true but edge weights are not provided.", + *error); + } + all_pairs_similarity_functor functor( + handle, graph, vertices, overlap_functor{}, use_weight, topk, do_expensive_check); + + return cugraph::c_api::run_algorithm(graph, functor, result, error); +} diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh index 99fc1cd6fae..272e3d71f83 100644 --- a/cpp/src/community/detail/refine_impl.cuh +++ b/cpp/src/community/detail/refine_impl.cuh @@ -627,6 +627,7 @@ refine_clustering( store_transposed ? d_srcs : d_dsts, d_weights, std::ignore, + std::ignore, std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/cpp/src/community/edge_triangle_count_impl.cuh b/cpp/src/community/edge_triangle_count_impl.cuh index c4277e240be..225687c4cf0 100644 --- a/cpp/src/community/edge_triangle_count_impl.cuh +++ b/cpp/src/community/edge_triangle_count_impl.cuh @@ -250,7 +250,7 @@ edge_property_t, edge_t> edge_t handle.get_stream()); // There are still multiple copies here but is it worth sorting and reducing again? 
- std::tie(pair_srcs, pair_dsts, std::ignore, pair_count, std::ignore) = + std::tie(pair_srcs, pair_dsts, std::ignore, pair_count, std::ignore, std::ignore) = shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning +#include #include #include #include @@ -41,351 +43,6 @@ namespace cugraph { -template -struct unroll_edge { - size_t num_valid_edges{}; - raft::device_span num_triangles{}; - EdgeIterator edge_to_unroll_first{}; - EdgeIterator transposed_valid_edge_first{}; - EdgeIterator transposed_valid_edge_last{}; - EdgeIterator transposed_invalid_edge_last{}; - - __device__ thrust::tuple operator()(edge_t i) const - { - // edges are sorted with destination as key so reverse the edge when looking it - auto pair = thrust::make_tuple(thrust::get<1>(*(edge_to_unroll_first + i)), - thrust::get<0>(*(edge_to_unroll_first + i))); - // Find its position in either partition of the transposed edgelist - // An edge can be in found in either of the two partitions (valid or invalid) - auto itr = thrust::lower_bound( - thrust::seq, transposed_valid_edge_last, transposed_invalid_edge_last, pair); - size_t idx{}; - if (itr != transposed_invalid_edge_last && *itr == pair) { - idx = - static_cast(thrust::distance(transposed_valid_edge_last, itr) + num_valid_edges); - } else { - // The edge must be in the first boundary - itr = thrust::lower_bound( - thrust::seq, transposed_valid_edge_first, transposed_valid_edge_last, pair); - assert(*itr == pair); - idx = thrust::distance(transposed_valid_edge_first, itr); - } - cuda::atomic_ref atomic_counter(num_triangles[idx]); - auto r = atomic_counter.fetch_sub(edge_t{1}, cuda::std::memory_order_relaxed); - } -}; - -// FIXME: May re-locate this function as a general utility function for graph algorithm -// implementations. 
-template -rmm::device_uvector compute_prefix_sum(raft::handle_t const& handle, - raft::device_span sorted_vertices, - raft::device_span query_vertices) -{ - rmm::device_uvector prefix_sum(query_vertices.size() + 1, handle.get_stream()); - - auto count_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [query_vertices, - num_edges = sorted_vertices.size(), - sorted_vertices = sorted_vertices.begin()] __device__(size_t idx) { - auto itr_lower = thrust::lower_bound( - thrust::seq, sorted_vertices, sorted_vertices + num_edges, query_vertices[idx]); - - auto itr_upper = thrust::upper_bound( - thrust::seq, itr_lower, sorted_vertices + num_edges, query_vertices[idx]); - vertex_t dist = thrust::distance(itr_lower, itr_upper); - - return dist; - })); - - thrust::exclusive_scan(handle.get_thrust_policy(), - count_first, - count_first + query_vertices.size() + 1, - prefix_sum.begin()); - - return prefix_sum; -} - -template -edge_t remove_overcompensating_edges(raft::handle_t const& handle, - size_t buffer_size, - EdgeIterator potential_closing_or_incoming_edges, - EdgeIterator incoming_or_potential_closing_edges, - raft::device_span invalid_edgelist_srcs, - raft::device_span invalid_edgelist_dsts) -{ - // To avoid over-compensating, check whether the 'potential_closing_edges' - // are within the invalid edges. 
If yes, the was already unrolled - auto edges_not_overcomp = thrust::remove_if( - handle.get_thrust_policy(), - thrust::make_zip_iterator(potential_closing_or_incoming_edges, - incoming_or_potential_closing_edges), - thrust::make_zip_iterator(potential_closing_or_incoming_edges + buffer_size, - incoming_or_potential_closing_edges + buffer_size), - [num_invalid_edges = invalid_edgelist_dsts.size(), - invalid_first = - thrust::make_zip_iterator(invalid_edgelist_dsts.begin(), invalid_edgelist_srcs.begin()), - invalid_last = thrust::make_zip_iterator(invalid_edgelist_dsts.end(), - invalid_edgelist_srcs.end())] __device__(auto e) { - auto potential_edge = thrust::get<0>(e); - auto transposed_potential_or_incoming_edge = - thrust::make_tuple(thrust::get<1>(potential_edge), thrust::get<0>(potential_edge)); - auto itr = thrust::lower_bound( - thrust::seq, invalid_first, invalid_last, transposed_potential_or_incoming_edge); - return (itr != invalid_last && *itr == transposed_potential_or_incoming_edge); - }); - - auto dist = thrust::distance(thrust::make_zip_iterator(potential_closing_or_incoming_edges, - incoming_or_potential_closing_edges), - edges_not_overcomp); - - return dist; -} - -template -void unroll_p_r_or_q_r_edges(raft::handle_t const& handle, - graph_view_t& graph_view, - size_t num_invalid_edges, - size_t num_valid_edges, - raft::device_span edgelist_srcs, - raft::device_span edgelist_dsts, - raft::device_span num_triangles) -{ - auto prefix_sum_valid = compute_prefix_sum( - handle, - raft::device_span(edgelist_dsts.data(), num_valid_edges), - raft::device_span(edgelist_dsts.data() + num_valid_edges, num_invalid_edges)); - - auto prefix_sum_invalid = compute_prefix_sum( - handle, - raft::device_span(edgelist_dsts.data() + num_valid_edges, num_invalid_edges), - raft::device_span(edgelist_dsts.data() + num_valid_edges, num_invalid_edges)); - - auto potential_closing_edges = allocate_dataframe_buffer>( - prefix_sum_valid.back_element(handle.get_stream()) + - 
prefix_sum_invalid.back_element(handle.get_stream()), - handle.get_stream()); - - auto incoming_edges_to_r = allocate_dataframe_buffer>( - prefix_sum_valid.back_element(handle.get_stream()) + - prefix_sum_invalid.back_element(handle.get_stream()), - handle.get_stream()); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_invalid_edges), - [num_valid_edges, - num_invalid_edges, - invalid_dst_first = edgelist_dsts.begin() + num_valid_edges, - invalid_src_first = edgelist_srcs.begin() + num_valid_edges, - valid_src_first = edgelist_srcs.begin(), - valid_dst_first = edgelist_dsts.begin(), - prefix_sum_valid = prefix_sum_valid.data(), - prefix_sum_invalid = prefix_sum_invalid.data(), - potential_closing_edges = get_dataframe_buffer_begin(potential_closing_edges), - incoming_edges_to_r = get_dataframe_buffer_begin(incoming_edges_to_r)] __device__(auto idx) { - auto src = invalid_src_first[idx]; - auto dst = invalid_dst_first[idx]; - auto dst_array_end_valid = valid_dst_first + num_valid_edges; - - auto itr_lower_valid = - thrust::lower_bound(thrust::seq, valid_dst_first, dst_array_end_valid, dst); - auto idx_lower_valid = thrust::distance( - valid_dst_first, - itr_lower_valid); // Need a binary search to find the begining of the range - - auto invalid_end_dst = invalid_dst_first + num_invalid_edges; - - auto itr_lower_invalid = - thrust::lower_bound(thrust::seq, invalid_dst_first, invalid_end_dst, dst); - auto idx_lower_invalid = thrust::distance( - invalid_dst_first, - itr_lower_invalid); // Need a binary search to find the begining of the range - - auto incoming_edges_to_r_first_valid = thrust::make_zip_iterator( - valid_src_first + idx_lower_valid, thrust::make_constant_iterator(dst)); - thrust::copy( - thrust::seq, - incoming_edges_to_r_first_valid, - incoming_edges_to_r_first_valid + (prefix_sum_valid[idx + 1] - prefix_sum_valid[idx]), - incoming_edges_to_r + prefix_sum_valid[idx] + 
prefix_sum_invalid[idx]); - - auto incoming_edges_to_r_first_invalid = thrust::make_zip_iterator( - invalid_src_first + idx_lower_invalid, thrust::make_constant_iterator(dst)); - thrust::copy( - thrust::seq, - incoming_edges_to_r_first_invalid, - incoming_edges_to_r_first_invalid + (prefix_sum_invalid[idx + 1] - prefix_sum_invalid[idx]), - - incoming_edges_to_r + prefix_sum_invalid[idx] + prefix_sum_valid[idx + 1]); - - if constexpr (is_q_r_edge) { - auto potential_closing_edges_first_valid = thrust::make_zip_iterator( - valid_src_first + idx_lower_valid, thrust::make_constant_iterator(src)); - thrust::copy( - thrust::seq, - potential_closing_edges_first_valid, - potential_closing_edges_first_valid + (prefix_sum_valid[idx + 1] - prefix_sum_valid[idx]), - potential_closing_edges + prefix_sum_valid[idx] + prefix_sum_invalid[idx]); - - auto potential_closing_edges_first_invalid = thrust::make_zip_iterator( - invalid_src_first + idx_lower_invalid, thrust::make_constant_iterator(src)); - thrust::copy(thrust::seq, - potential_closing_edges_first_invalid, - potential_closing_edges_first_invalid + - (prefix_sum_invalid[idx + 1] - prefix_sum_invalid[idx]), - potential_closing_edges + prefix_sum_invalid[idx] + prefix_sum_valid[idx + 1]); - - } else { - auto potential_closing_edges_first_valid = thrust::make_zip_iterator( - thrust::make_constant_iterator(src), valid_src_first + idx_lower_valid); - thrust::copy( - thrust::seq, - potential_closing_edges_first_valid, - potential_closing_edges_first_valid + (prefix_sum_valid[idx + 1] - prefix_sum_valid[idx]), - potential_closing_edges + prefix_sum_valid[idx] + prefix_sum_invalid[idx]); - - auto potential_closing_edges_first_invalid = thrust::make_zip_iterator( - thrust::make_constant_iterator(src), invalid_src_first + idx_lower_invalid); - thrust::copy( - thrust::seq, - potential_closing_edges_first_invalid, - potential_closing_edges_first_invalid + - (prefix_sum_invalid[idx + 1] - prefix_sum_invalid[idx]), - 
potential_closing_edges + prefix_sum_invalid[idx] + (prefix_sum_valid[idx + 1])); - } - }); - - auto edges_exist = graph_view.has_edge( - handle, - raft::device_span(std::get<0>(potential_closing_edges).data(), - std::get<0>(potential_closing_edges).size()), - raft::device_span(std::get<1>(potential_closing_edges).data(), - std::get<1>(potential_closing_edges).size())); - - auto edge_to_existance = thrust::make_zip_iterator( - thrust::make_zip_iterator(get_dataframe_buffer_begin(potential_closing_edges), - get_dataframe_buffer_begin(incoming_edges_to_r)), - edges_exist.begin()); - - auto has_edge_last = thrust::remove_if(handle.get_thrust_policy(), - edge_to_existance, - edge_to_existance + edges_exist.size(), - [] __device__(auto e) { - auto edge_exists = thrust::get<1>(e); - return edge_exists == 0; - }); - - auto num_edge_exists = thrust::distance(edge_to_existance, has_edge_last); - - // After pushing the non-existant edges to the second partition, - // remove them by resizing both vertex pair buffer - resize_dataframe_buffer(potential_closing_edges, num_edge_exists, handle.get_stream()); - resize_dataframe_buffer(incoming_edges_to_r, num_edge_exists, handle.get_stream()); - - auto num_edges_not_overcomp = - remove_overcompensating_edges( - handle, - num_edge_exists, - get_dataframe_buffer_begin(potential_closing_edges), - get_dataframe_buffer_begin(incoming_edges_to_r), - raft::device_span(edgelist_srcs.data() + num_valid_edges, num_invalid_edges), - raft::device_span(edgelist_dsts.data() + num_valid_edges, num_invalid_edges)); - - // After pushing the non-existant edges to the second partition, - // remove them by resizing both vertex pair buffer - resize_dataframe_buffer(potential_closing_edges, num_edges_not_overcomp, handle.get_stream()); - resize_dataframe_buffer(incoming_edges_to_r, num_edges_not_overcomp, handle.get_stream()); - - // Extra check for 'incoming_edges_to_r' - if constexpr (!is_q_r_edge) { - // Exchange the arguments (incoming_edges_to_r, 
num_edges_not_overcomp) order - // To also check if the 'incoming_edges_to_r' belong the the invalid_edgelist - num_edges_not_overcomp = - remove_overcompensating_edges( - handle, - num_edges_not_overcomp, - get_dataframe_buffer_begin(incoming_edges_to_r), - get_dataframe_buffer_begin(potential_closing_edges), - raft::device_span(edgelist_srcs.data() + num_valid_edges, - num_invalid_edges), - raft::device_span(edgelist_dsts.data() + num_valid_edges, - num_invalid_edges)); - - resize_dataframe_buffer(potential_closing_edges, num_edges_not_overcomp, handle.get_stream()); - resize_dataframe_buffer(incoming_edges_to_r, num_edges_not_overcomp, handle.get_stream()); - } - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_zip_iterator(get_dataframe_buffer_begin(potential_closing_edges), - get_dataframe_buffer_begin(incoming_edges_to_r)), - thrust::make_zip_iterator( - get_dataframe_buffer_begin(potential_closing_edges) + num_edges_not_overcomp, - get_dataframe_buffer_begin(incoming_edges_to_r) + num_edges_not_overcomp), - [num_triangles = num_triangles.begin(), - num_valid_edges, - invalid_first = thrust::make_zip_iterator(edgelist_dsts.begin() + num_valid_edges, - edgelist_srcs.begin() + num_valid_edges), - invalid_last = thrust::make_zip_iterator( - edgelist_dsts.end(), edgelist_srcs.end())] __device__(auto potential_or_incoming_e) { - auto potential_e = thrust::get<0>(potential_or_incoming_e); - auto incoming_e_to_r = thrust::get<1>(potential_or_incoming_e); - // thrust::tuple> transposed_invalid_edge_; - auto transposed_invalid_edge = - thrust::make_tuple(thrust::get<1>(incoming_e_to_r), thrust::get<1>(potential_e)); - - if constexpr (!is_q_r_edge) { - transposed_invalid_edge = - thrust::make_tuple(thrust::get<1>(incoming_e_to_r), thrust::get<0>(potential_e)); - } - auto itr = - thrust::lower_bound(thrust::seq, invalid_first, invalid_last, transposed_invalid_edge); - if (itr != invalid_last) { assert(*itr == transposed_invalid_edge); } - auto dist = 
thrust::distance(invalid_first, itr) + num_valid_edges; - - cuda::atomic_ref atomic_counter(num_triangles[dist]); - auto r = atomic_counter.fetch_sub(edge_t{1}, cuda::std::memory_order_relaxed); - }); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_edges_not_overcomp), - unroll_edge{ - num_valid_edges, - raft::device_span(num_triangles.data(), num_triangles.size()), - get_dataframe_buffer_begin(potential_closing_edges), - thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_srcs.begin()), - thrust::make_zip_iterator(edgelist_dsts.begin() + num_valid_edges, - edgelist_srcs.begin() + num_valid_edges), - thrust::make_zip_iterator(edgelist_dsts.end(), edgelist_srcs.end())}); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_edges_not_overcomp), - unroll_edge{ - num_valid_edges, - raft::device_span(num_triangles.data(), num_triangles.size()), - get_dataframe_buffer_begin(incoming_edges_to_r), - thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_srcs.begin()), - thrust::make_zip_iterator(edgelist_dsts.begin() + num_valid_edges, - edgelist_srcs.begin() + num_valid_edges), - thrust::make_zip_iterator(edgelist_dsts.end(), edgelist_srcs.end())}); -} - namespace { template @@ -434,28 +91,6 @@ struct extract_low_to_high_degree_edges_t { } }; -template -struct generate_p_r_or_q_r_from_p_q { - size_t chunk_start{}; - raft::device_span intersection_offsets{}; - raft::device_span intersection_indices{}; - raft::device_span invalid_srcs{}; - raft::device_span invalid_dsts{}; - - __device__ thrust::tuple operator()(edge_t i) const - { - auto itr = thrust::upper_bound( - thrust::seq, intersection_offsets.begin() + 1, intersection_offsets.end(), i); - auto idx = thrust::distance(intersection_offsets.begin() + 1, itr); - - if constexpr (generate_p_r) { - return thrust::make_tuple(invalid_srcs[chunk_start + idx], 
intersection_indices[i]); - - } else { - return thrust::make_tuple(invalid_dsts[chunk_start + idx], intersection_indices[i]); - } - } -}; } // namespace template @@ -470,8 +105,6 @@ k_truss(raft::handle_t const& handle, { // 1. Check input arguments. - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - CUGRAPH_EXPECTS(graph_view.is_symmetric(), "Invalid input arguments: K-truss currently supports undirected graphs only."); CUGRAPH_EXPECTS(!graph_view.is_multigraph(), @@ -497,7 +130,7 @@ k_truss(raft::handle_t const& handle, exclude_self_loop_t{}); if constexpr (multi_gpu) { - std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore) = + std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore, std::ignore) = detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning core_number_span{core_numbers.data(), core_numbers.size()}; - rmm::device_uvector srcs{0, handle.get_stream()}; - rmm::device_uvector dsts{0, handle.get_stream()}; - std::tie(srcs, dsts, wgts) = k_core(handle, - cur_graph_view, - edge_weight_view, - k - 1, - std::make_optional(k_core_degree_type_t::OUT), - std::make_optional(core_number_span)); + auto [srcs, dsts, wgts] = k_core(handle, + cur_graph_view, + edge_weight_view, + k - 1, + std::make_optional(k_core_degree_type_t::OUT), + std::make_optional(core_number_span)); if constexpr (multi_gpu) { - std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore) = + std::tie(srcs, dsts, wgts, std::ignore, std::ignore, std::ignore) = detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( - handle, std::move(srcs), std::move(dsts), std::nullopt, std::nullopt, std::nullopt); + handle, std::move(srcs), std::move(dsts), std::move(wgts), std::nullopt, std::nullopt); } std::optional> tmp_renumber_map{std::nullopt}; - std::tie(*modified_graph, edge_weight, std::ignore, std::ignore, tmp_renumber_map) = create_graph_from_edgelist( handle, @@ -566,7 +197,7 @@ k_truss(raft::handle_t const& handle, 
std::nullopt, std::nullopt, cugraph::graph_properties_t{true, graph_view.is_multigraph()}, - false); + true); modified_graph_view = (*modified_graph).view(); @@ -577,10 +208,11 @@ k_truss(raft::handle_t const& handle, (*renumber_map).data(), *vertex_partition_range_lasts); } + renumber_map = std::move(tmp_renumber_map); } - // 4. Keep only the edges from a low-degree vertex to a high-degree vertex. + // 3. Keep only the edges from a low-degree vertex to a high-degree vertex. { auto cur_graph_view = modified_graph_view ? *modified_graph_view : graph_view; @@ -625,7 +257,7 @@ k_truss(raft::handle_t const& handle, } if constexpr (multi_gpu) { - std::tie(srcs, dsts, wgts, std::ignore, std::ignore) = + std::tie(srcs, dsts, wgts, std::ignore, std::ignore, std::ignore) = detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning edgelist_srcs(0, handle.get_stream()); - rmm::device_uvector edgelist_dsts(0, handle.get_stream()); - std::optional> num_triangles{std::nullopt}; - std::optional> edgelist_wgts{std::nullopt}; edge_weight_view = edge_weight ? 
std::make_optional((*edge_weight).view()) : std::optional>{std::nullopt}; - auto prop_num_triangles = edge_triangle_count(handle, cur_graph_view); - - std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts, num_triangles, std::ignore) = - decompress_to_edgelist( - handle, - cur_graph_view, - edge_weight_view, - // FIXME: Update 'decompress_edgelist' to support int32_t and int64_t values - std::make_optional(prop_num_triangles.view()), - std::optional>{std::nullopt}, - std::optional>(std::nullopt)); - auto transposed_edge_first = - thrust::make_zip_iterator(edgelist_dsts.begin(), edgelist_srcs.begin()); + cugraph::edge_property_t edge_mask(handle, cur_graph_view); + cugraph::fill_edge_property(handle, cur_graph_view, edge_mask.mutable_view(), bool{true}); - auto edge_first = thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_dsts.begin()); + while (true) { + // FIXME: This approach is very expensive when invalidating only few edges per iteration + // and should be address. + auto edge_triangle_counts = + edge_triangle_count(handle, cur_graph_view); - auto transposed_edge_triangle_count_pair_first = - thrust::make_zip_iterator(transposed_edge_first, (*num_triangles).begin()); + // Mask all the edges that have k - 2 count - thrust::sort_by_key(handle.get_thrust_policy(), - transposed_edge_first, - transposed_edge_first + edgelist_srcs.size(), - (*num_triangles).begin()); + auto prev_number_of_edges = cur_graph_view.compute_number_of_edges(handle); - cugraph::edge_property_t edge_mask(handle, cur_graph_view); - cugraph::fill_edge_property(handle, cur_graph_view, edge_mask.mutable_view(), true); - cur_graph_view.attach_edge_mask(edge_mask.view()); - - while (true) { - // 'invalid_transposed_edge_triangle_count_first' marks the beginning of the edges to be - // removed 'invalid_transposed_edge_triangle_count_first' + edgelist_srcs.size() marks the end - // of the edges to be removed 'edge_triangle_count_pair_first' marks the begining of the valid - // edges. 
- auto invalid_transposed_edge_triangle_count_first = - thrust::stable_partition(handle.get_thrust_policy(), - transposed_edge_triangle_count_pair_first, - transposed_edge_triangle_count_pair_first + edgelist_srcs.size(), - [k] __device__(auto e) { - auto num_triangles = thrust::get<1>(e); - return num_triangles >= k - 2; - }); - auto num_invalid_edges = static_cast( - thrust::distance(invalid_transposed_edge_triangle_count_first, - transposed_edge_triangle_count_pair_first + edgelist_srcs.size())); - - if (num_invalid_edges == 0) { break; } - - auto num_valid_edges = edgelist_srcs.size() - num_invalid_edges; - - // case 1. For the (p, q), find intersection 'r'. - - // nbr_intersection requires the edges to be sort by 'src' - // sort the invalid edges by src for nbr intersection - size_t edges_to_intersect_per_iteration = - static_cast(handle.get_device_properties().multiProcessorCount) * (1 << 17); - - size_t prev_chunk_size = 0; - size_t chunk_num_invalid_edges = num_invalid_edges; - - auto num_chunks = - raft::div_rounding_up_safe(edgelist_srcs.size(), edges_to_intersect_per_iteration); - - for (size_t i = 0; i < num_chunks; ++i) { - auto chunk_size = std::min(edges_to_intersect_per_iteration, chunk_num_invalid_edges); - thrust::sort_by_key(handle.get_thrust_policy(), - edge_first + num_valid_edges, - edge_first + edgelist_srcs.size(), - (*num_triangles).begin() + num_valid_edges); - - auto [intersection_offsets, intersection_indices] = - detail::nbr_intersection(handle, - cur_graph_view, - cugraph::edge_dummy_property_t{}.view(), - edge_first + num_valid_edges + prev_chunk_size, - edge_first + num_valid_edges + prev_chunk_size + chunk_size, - std::array{true, true}, - do_expensive_check); - - // Update the number of triangles of each (p, q) edges by looking at their intersection - // size. 
- thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chunk_size), - [chunk_start = prev_chunk_size, - num_triangles = raft::device_span((*num_triangles).data() + num_valid_edges, - num_invalid_edges), - intersection_offsets = raft::device_span( - intersection_offsets.data(), intersection_offsets.size())] __device__(auto i) { - num_triangles[chunk_start + i] -= - (intersection_offsets[i + 1] - intersection_offsets[i]); - }); - - // FIXME: Find a way to not have to maintain a dataframe_buffer - auto vertex_pair_buffer_p_r_edge_p_q = - allocate_dataframe_buffer>(intersection_indices.size(), - handle.get_stream()); - thrust::tabulate( - handle.get_thrust_policy(), - get_dataframe_buffer_begin(vertex_pair_buffer_p_r_edge_p_q), - get_dataframe_buffer_end(vertex_pair_buffer_p_r_edge_p_q), - generate_p_r_or_q_r_from_p_q{ - prev_chunk_size, - raft::device_span(intersection_offsets.data(), - intersection_offsets.size()), - raft::device_span(intersection_indices.data(), - intersection_indices.size()), - raft::device_span(edgelist_srcs.data() + num_valid_edges, num_invalid_edges), - raft::device_span(edgelist_dsts.data() + num_valid_edges, - num_invalid_edges)}); - - auto vertex_pair_buffer_q_r_edge_p_q = - allocate_dataframe_buffer>(intersection_indices.size(), - handle.get_stream()); - thrust::tabulate( - handle.get_thrust_policy(), - get_dataframe_buffer_begin(vertex_pair_buffer_q_r_edge_p_q), - get_dataframe_buffer_end(vertex_pair_buffer_q_r_edge_p_q), - generate_p_r_or_q_r_from_p_q{ - prev_chunk_size, - raft::device_span(intersection_offsets.data(), - intersection_offsets.size()), - raft::device_span(intersection_indices.data(), - intersection_indices.size()), - raft::device_span(edgelist_srcs.data() + num_valid_edges, num_invalid_edges), - raft::device_span(edgelist_dsts.data() + num_valid_edges, - num_invalid_edges)}); - - // Unrolling the edges require the edges to be sorted by destination - // re-sort 
the invalid edges by 'dst' - thrust::sort_by_key(handle.get_thrust_policy(), - transposed_edge_first + num_valid_edges, - transposed_edge_first + edgelist_srcs.size(), - (*num_triangles).begin() + num_valid_edges); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(intersection_indices.size()), - unroll_edge{ - num_valid_edges, - raft::device_span((*num_triangles).data(), (*num_triangles).size()), - get_dataframe_buffer_begin(vertex_pair_buffer_p_r_edge_p_q), - transposed_edge_first, - transposed_edge_first + num_valid_edges, - transposed_edge_first + edgelist_srcs.size()}); - - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(intersection_indices.size()), - unroll_edge{ - num_valid_edges, - raft::device_span((*num_triangles).data(), (*num_triangles).size()), - get_dataframe_buffer_begin(vertex_pair_buffer_q_r_edge_p_q), - transposed_edge_first, - transposed_edge_first + num_valid_edges, - transposed_edge_first + edgelist_srcs.size()}); - - prev_chunk_size += chunk_size; - chunk_num_invalid_edges -= chunk_size; - } - // case 2: unroll (q, r) - // For each (q, r) edges to unroll, find the incoming edges to 'r' let's say from 'p' and - // create the pair (p, q) - cugraph::unroll_p_r_or_q_r_edges( - handle, - cur_graph_view, - num_invalid_edges, - num_valid_edges, - raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), - raft::device_span(edgelist_dsts.data(), edgelist_dsts.size()), - raft::device_span((*num_triangles).data(), (*num_triangles).size())); - - // case 3: unroll (p, r) - cugraph::unroll_p_r_or_q_r_edges( + cugraph::transform_e( handle, cur_graph_view, - num_invalid_edges, - num_valid_edges, - raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), - raft::device_span(edgelist_dsts.data(), edgelist_dsts.size()), - raft::device_span((*num_triangles).data(), (*num_triangles).size())); - - // Remove edges 
that have a triangle count of zero. Those should not be accounted - // for during the unroling phase. - auto edges_with_triangle_last = thrust::stable_partition( - handle.get_thrust_policy(), - transposed_edge_triangle_count_pair_first, - transposed_edge_triangle_count_pair_first + (*num_triangles).size(), - [] __device__(auto e) { - auto num_triangles = thrust::get<1>(e); - return num_triangles > 0; - }); - - auto num_edges_with_triangles = static_cast( - thrust::distance(transposed_edge_triangle_count_pair_first, edges_with_triangle_last)); - - thrust::sort(handle.get_thrust_policy(), - thrust::make_zip_iterator(edgelist_srcs.begin() + num_edges_with_triangles, - edgelist_dsts.begin() + num_edges_with_triangles), - thrust::make_zip_iterator(edgelist_srcs.end(), edgelist_dsts.end())); - - cugraph::edge_bucket_t edges_with_no_triangle(handle); - edges_with_no_triangle.insert(edgelist_srcs.begin() + num_edges_with_triangles, - edgelist_srcs.end(), - edgelist_dsts.begin() + num_edges_with_triangles); - - cur_graph_view.clear_edge_mask(); - if (edge_weight_view) { - cugraph::transform_e( - handle, - cur_graph_view, - edges_with_no_triangle, - cugraph::edge_src_dummy_property_t{}.view(), - cugraph::edge_dst_dummy_property_t{}.view(), - *edge_weight_view, - [] __device__(auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, auto wgt) { - return false; - }, - edge_mask.mutable_view(), - false); - } else { - cugraph::transform_e( - handle, - cur_graph_view, - edges_with_no_triangle, - cugraph::edge_src_dummy_property_t{}.view(), - cugraph::edge_dst_dummy_property_t{}.view(), - cugraph::edge_dummy_property_t{}.view(), - [] __device__( - auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) { - return false; - }, - edge_mask.mutable_view(), - false); - } + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + edge_triangle_counts.view(), + [k] __device__(auto src, auto dst, thrust::nullopt_t, 
thrust::nullopt_t, auto count) { + return count >= k - 2; + }, + edge_mask.mutable_view(), + false); + cur_graph_view.attach_edge_mask(edge_mask.view()); - edgelist_srcs.resize(num_edges_with_triangles, handle.get_stream()); - edgelist_dsts.resize(num_edges_with_triangles, handle.get_stream()); - (*num_triangles).resize(num_edges_with_triangles, handle.get_stream()); + if (prev_number_of_edges == cur_graph_view.compute_number_of_edges(handle)) { break; } } + rmm::device_uvector edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector edgelist_dsts(0, handle.get_stream()); + std::optional> edgelist_wgts{std::nullopt}; + std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts, std::ignore, std::ignore) = decompress_to_edgelist( handle, @@ -923,7 +340,8 @@ k_truss(raft::handle_t const& handle, edge_weight_view ? std::make_optional(*edge_weight_view) : std::nullopt, std::optional>{std::nullopt}, std::optional>{std::nullopt}, - std::optional>(std::nullopt)); + std::make_optional( + raft::device_span((*renumber_map).data(), (*renumber_map).size()))); std::tie(edgelist_srcs, edgelist_dsts, edgelist_wgts) = symmetrize_edgelist(handle, diff --git a/cpp/src/community/k_truss_mg_v32_e32.cu b/cpp/src/community/k_truss_mg_v32_e32.cu new file mode 100644 index 00000000000..4feb69f6098 --- /dev/null +++ b/cpp/src/community/k_truss_mg_v32_e32.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "community/k_truss_impl.cuh" + +namespace cugraph { + +// MG instantiation + +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss(raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + int32_t k, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss(raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + int32_t k, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/community/k_truss_mg_v32_e64.cu b/cpp/src/community/k_truss_mg_v32_e64.cu new file mode 100644 index 00000000000..b07f9382612 --- /dev/null +++ b/cpp/src/community/k_truss_mg_v32_e64.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "community/k_truss_impl.cuh" + +namespace cugraph { + +// MG instantiation + +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss(raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + int64_t k, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss(raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + int64_t k, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/community/k_truss_mg_v64_e64.cu b/cpp/src/community/k_truss_mg_v64_e64.cu new file mode 100644 index 00000000000..1c730fe272d --- /dev/null +++ b/cpp/src/community/k_truss_mg_v64_e64.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "community/k_truss_impl.cuh" + +namespace cugraph { + +// MG instantiation + +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss(raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + int64_t k, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss(raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + int64_t k, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/community/triangle_count_impl.cuh b/cpp/src/community/triangle_count_impl.cuh index 0b453cfe262..e902901cd36 100644 --- a/cpp/src/community/triangle_count_impl.cuh +++ b/cpp/src/community/triangle_count_impl.cuh @@ -439,7 +439,7 @@ void triangle_count(raft::handle_t const& handle, extract_low_to_high_degree_edges_t{}); if constexpr (multi_gpu) { - std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore) = + std::tie(srcs, dsts, std::ignore, std::ignore, std::ignore, std::ignore) = detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(edge_buffer), std::ignore, std::ignore, + std::ignore, std::ignore) = detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/cpp/src/cores/k_core_impl.cuh b/cpp/src/cores/k_core_impl.cuh index 06402cc3382..2c5bf987a47 100644 --- a/cpp/src/cores/k_core_impl.cuh +++ b/cpp/src/cores/k_core_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,8 +37,6 @@ k_core(raft::handle_t const& handle, std::optional> core_numbers, bool do_expensive_check) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - rmm::device_uvector computed_core_numbers(0, handle.get_stream()); if (!core_numbers) { diff --git a/cpp/src/link_prediction/similarity_impl.cuh b/cpp/src/link_prediction/similarity_impl.cuh index 487f31e5e03..b39895129dc 100644 --- a/cpp/src/link_prediction/similarity_impl.cuh +++ b/cpp/src/link_prediction/similarity_impl.cuh @@ -408,7 +408,7 @@ all_pairs_similarity(raft::handle_t const& handle, // shuffle vertex pairs auto vertex_partition_range_lasts = graph_view.vertex_partition_range_lasts(); - std::tie(v1, v2, std::ignore, std::ignore, std::ignore) = + std::tie(v1, v2, std::ignore, std::ignore, std::ignore, std::ignore) = detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning == cugraph::ops::graph::INVALID_ID); using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; using key_t = typename thrust::iterator_traits::value_type; int minor_comm_size{1}; diff --git a/cpp/src/prims/key_store.cuh b/cpp/src/prims/key_store.cuh index 56be1456d0b..b8e17145590 100644 --- a/cpp/src/prims/key_store.cuh +++ b/cpp/src/prims/key_store.cuh @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -322,7 +323,7 @@ class key_cuco_store_t { static_cast(static_cast(num_keys) / load_factor), static_cast(num_keys) + 1); // cuco::static_map requires at least one empty slot - auto stream_adapter = rmm::mr::make_stream_allocator_adaptor( + auto stream_adapter = rmm::mr::stream_allocator_adaptor( rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()), stream); cuco_store_ = std::make_unique(cuco_size, diff --git a/cpp/src/prims/kv_store.cuh b/cpp/src/prims/kv_store.cuh index de233fd583b..a4e644b361e 100644 --- a/cpp/src/prims/kv_store.cuh +++ b/cpp/src/prims/kv_store.cuh @@ -820,7 +820,7 @@ class 
kv_cuco_store_t { static_cast(static_cast(num_keys) / load_factor), static_cast(num_keys) + 1); // cuco::static_map requires at least one empty slot - auto stream_adapter = rmm::mr::make_stream_allocator_adaptor( + auto stream_adapter = rmm::mr::stream_allocator_adaptor( rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()), stream); if constexpr (std::is_arithmetic_v) { cuco_store_ = diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index fb1dee1a92f..ed0a70e570f 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -349,7 +349,8 @@ coarsen_graph(raft::handle_t const& handle, // 1-2. globally shuffle - std::tie(edgelist_majors, edgelist_minors, edgelist_weights, std::ignore, std::ignore) = + std::tie( + edgelist_majors, edgelist_minors, edgelist_weights, std::ignore, std::ignore, std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, edge_t, @@ -475,6 +476,7 @@ coarsen_graph(raft::handle_t const& handle, reversed_edgelist_minors, reversed_edgelist_weights, std::ignore, + std::ignore, std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/cpp/src/structure/symmetrize_edgelist_impl.cuh b/cpp/src/structure/symmetrize_edgelist_impl.cuh index a6a4c0947c7..1fd566938eb 100644 --- a/cpp/src/structure/symmetrize_edgelist_impl.cuh +++ b/cpp/src/structure/symmetrize_edgelist_impl.cuh @@ -295,6 +295,7 @@ symmetrize_edgelist(raft::handle_t const& handle, upper_triangular_majors, upper_triangular_weights, std::ignore, + std::ignore, std::ignore) = detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_vertex_pairs_with_values_by_gpu_id_impl( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ 
-178,25 +179,27 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( handle.get_stream()); handle.sync_stream(); + std::vector rx_counts{}; + if (mem_frugal_flag) { // trade-off potential parallelism to lower peak memory - std::tie(majors, std::ignore) = + std::tie(majors, rx_counts) = shuffle_values(comm, majors.begin(), h_tx_value_counts, handle.get_stream()); - std::tie(minors, std::ignore) = + std::tie(minors, rx_counts) = shuffle_values(comm, minors.begin(), h_tx_value_counts, handle.get_stream()); if (weights) { - std::tie(weights, std::ignore) = + std::tie(weights, rx_counts) = shuffle_values(comm, (*weights).begin(), h_tx_value_counts, handle.get_stream()); } if (edge_ids) { - std::tie(edge_ids, std::ignore) = + std::tie(edge_ids, rx_counts) = shuffle_values(comm, (*edge_ids).begin(), h_tx_value_counts, handle.get_stream()); } if (edge_types) { - std::tie(edge_types, std::ignore) = + std::tie(edge_types, rx_counts) = shuffle_values(comm, (*edge_types).begin(), h_tx_value_counts, handle.get_stream()); } } else { @@ -204,7 +207,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( if (edge_ids) { if (edge_types) { std::forward_as_tuple(std::tie(majors, minors, weights, edge_ids, edge_types), - std::ignore) = + rx_counts) = shuffle_values(comm, thrust::make_zip_iterator(majors.begin(), minors.begin(), @@ -214,7 +217,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( h_tx_value_counts, handle.get_stream()); } else { - std::forward_as_tuple(std::tie(majors, minors, weights, edge_ids), std::ignore) = + std::forward_as_tuple(std::tie(majors, minors, weights, edge_ids), rx_counts) = shuffle_values(comm, thrust::make_zip_iterator( majors.begin(), minors.begin(), weights->begin(), edge_ids->begin()), @@ -223,14 +226,14 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( } } else { if (edge_types) { - std::forward_as_tuple(std::tie(majors, minors, weights, edge_types), std::ignore) = + std::forward_as_tuple(std::tie(majors, minors, weights, edge_types), rx_counts) = 
shuffle_values(comm, thrust::make_zip_iterator( majors.begin(), minors.begin(), weights->begin(), edge_types->begin()), h_tx_value_counts, handle.get_stream()); } else { - std::forward_as_tuple(std::tie(majors, minors, weights), std::ignore) = shuffle_values( + std::forward_as_tuple(std::tie(majors, minors, weights), rx_counts) = shuffle_values( comm, thrust::make_zip_iterator(majors.begin(), minors.begin(), weights->begin()), h_tx_value_counts, @@ -240,7 +243,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( } else { if (edge_ids) { if (edge_types) { - std::forward_as_tuple(std::tie(majors, minors, edge_ids, edge_types), std::ignore) = + std::forward_as_tuple(std::tie(majors, minors, edge_ids, edge_types), rx_counts) = shuffle_values( comm, thrust::make_zip_iterator( @@ -248,7 +251,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( h_tx_value_counts, handle.get_stream()); } else { - std::forward_as_tuple(std::tie(majors, minors, edge_ids), std::ignore) = shuffle_values( + std::forward_as_tuple(std::tie(majors, minors, edge_ids), rx_counts) = shuffle_values( comm, thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_ids->begin()), h_tx_value_counts, @@ -256,13 +259,13 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( } } else { if (edge_types) { - std::forward_as_tuple(std::tie(majors, minors, edge_types), std::ignore) = shuffle_values( + std::forward_as_tuple(std::tie(majors, minors, edge_types), rx_counts) = shuffle_values( comm, thrust::make_zip_iterator(majors.begin(), minors.begin(), edge_types->begin()), h_tx_value_counts, handle.get_stream()); } else { - std::forward_as_tuple(std::tie(majors, minors), std::ignore) = + std::forward_as_tuple(std::tie(majors, minors), rx_counts) = shuffle_values(comm, thrust::make_zip_iterator(majors.begin(), minors.begin()), h_tx_value_counts, @@ -276,7 +279,8 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( std::move(minors), std::move(weights), std::move(edge_ids), - std::move(edge_types)); + 
std::move(edge_types), + std::move(rx_counts)); } } // namespace @@ -288,7 +292,8 @@ std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -320,7 +325,8 @@ std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -366,7 +372,8 @@ std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& edge_srcs, rmm::device_uvector&& edge_dsts, diff --git a/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu b/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu index 7943ebcd5f4..db5a7c0e9bd 100644 --- a/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu +++ b/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e32.cu @@ -36,7 +36,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -49,7 +50,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -62,7 +64,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -76,7 +79,8 @@ template std::tuple, 
rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -92,7 +96,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& majors, rmm::device_uvector&& minors, @@ -104,7 +109,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& majors, rmm::device_uvector&& minors, diff --git a/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu b/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu index 230fce435f9..d79b2379224 100644 --- a/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu +++ b/cpp/src/utilities/shuffle_vertex_pairs_mg_v32_e64.cu @@ -35,7 +35,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -48,7 +49,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -61,7 +63,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -75,7 +78,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> 
shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -91,7 +95,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& majors, rmm::device_uvector&& minors, @@ -103,7 +108,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& majors, rmm::device_uvector&& minors, diff --git a/cpp/src/utilities/shuffle_vertex_pairs_mg_v64_e64.cu b/cpp/src/utilities/shuffle_vertex_pairs_mg_v64_e64.cu index 8134c41696b..605976f7076 100644 --- a/cpp/src/utilities/shuffle_vertex_pairs_mg_v64_e64.cu +++ b/cpp/src/utilities/shuffle_vertex_pairs_mg_v64_e64.cu @@ -36,7 +36,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -49,7 +50,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -62,7 +64,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, rmm::device_uvector&& majors, @@ -76,7 +79,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( raft::handle_t const& handle, 
rmm::device_uvector&& majors, @@ -92,7 +96,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& majors, rmm::device_uvector&& minors, @@ -104,7 +109,8 @@ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& majors, rmm::device_uvector&& minors, diff --git a/cpp/src/utilities/shuffle_vertices_mg_v32_integral.cu b/cpp/src/utilities/shuffle_vertices_mg_v32_integral.cu index 0c91eb546d6..db7be5a3031 100644 --- a/cpp/src/utilities/shuffle_vertices_mg_v32_integral.cu +++ b/cpp/src/utilities/shuffle_vertices_mg_v32_integral.cu @@ -40,6 +40,13 @@ shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( rmm::device_uvector&& d_values, std::vector const& vertex_partition_range_lasts); +template std::tuple, rmm::device_uvector> +shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( + raft::handle_t const& handle, + rmm::device_uvector&& d_vertices, + rmm::device_uvector&& d_values, + std::vector const& vertex_partition_range_lasts); + template rmm::device_uvector shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( raft::handle_t const& handle, rmm::device_uvector&& d_vertices); diff --git a/cpp/src/utilities/shuffle_vertices_mg_v64_integral.cu b/cpp/src/utilities/shuffle_vertices_mg_v64_integral.cu index 5abce7c0783..7d968006bc7 100644 --- a/cpp/src/utilities/shuffle_vertices_mg_v64_integral.cu +++ b/cpp/src/utilities/shuffle_vertices_mg_v64_integral.cu @@ -35,6 +35,13 @@ shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( rmm::device_uvector&& d_values, std::vector const& vertex_partition_range_lasts); +template std::tuple, rmm::device_uvector> +shuffle_int_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( + raft::handle_t const& 
handle, + rmm::device_uvector&& d_vertices, + rmm::device_uvector&& d_values, + std::vector const& vertex_partition_range_lasts); + template std::tuple, rmm::device_uvector> shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( raft::handle_t const& handle, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 771b50846bd..2289841ff19 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -504,7 +504,7 @@ ConfigureTest(CORE_NUMBER_TEST cores/core_number_test.cpp) ConfigureTest(K_CORE_TEST cores/k_core_test.cpp) ################################################################################################### -# - K-truss tests --------------------------------------------------------------------------------- +# - K-truss tests -------------------------------------------------------------------------- ConfigureTest(K_TRUSS_TEST community/k_truss_test.cpp) ################################################################################################### @@ -623,6 +623,10 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG EDGE TRIANGLE COUNT tests -------------------------------------------------------------- ConfigureTestMG(MG_EDGE_TRIANGLE_COUNT_TEST community/mg_edge_triangle_count_test.cpp) + ############################################################################################### + # - MG K-TRUSS tests -------------------------------------------------------------------------- + ConfigureTestMG(MG_K_TRUSS_TEST community/mg_k_truss_test.cpp) + ############################################################################################### # - MG WEAKLY CONNECTED COMPONENTS tests ------------------------------------------------------ ConfigureTestMG(MG_WEAKLY_CONNECTED_COMPONENTS_TEST @@ -793,6 +797,7 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureCTestMG(MG_CAPI_COUNT_MULTI_EDGES c_api/mg_count_multi_edges_test.c) ConfigureCTestMG(MG_CAPI_EGONET_TEST c_api/mg_egonet_test.c) ConfigureCTestMG(MG_CAPI_TWO_HOP_NEIGHBORS_TEST 
c_api/mg_two_hop_neighbors_test.c) + ConfigureCTestMG(MG_CAPI_K_TRUSS c_api/mg_k_truss_test.c) rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing_mg DESTINATION bin/gtests/libcugraph_mg) diff --git a/cpp/tests/c_api/mg_k_truss_test.c b/cpp/tests/c_api/mg_k_truss_test.c new file mode 100644 index 00000000000..e406eb330a7 --- /dev/null +++ b/cpp/tests/c_api/mg_k_truss_test.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mg_test_utils.h" /* RUN_TEST */ + +#include +#include + +#include + +typedef int32_t vertex_t; +typedef int32_t edge_t; +typedef float weight_t; + +/* + * Simple check of creating a graph from a COO on device memory. 
+ */ +int generic_k_truss_test(const cugraph_resource_handle_t* handle, + vertex_t* h_src, + vertex_t* h_dst, + weight_t* h_wgt, + size_t num_edges, + size_t num_results, + size_t k, + bool_t store_transposed, + vertex_t* h_result_src, + vertex_t* h_result_dst, + weight_t* h_result_wgt) +{ + int test_ret_value = 0; + + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error; + + cugraph_graph_t* graph = NULL; + + cugraph_induced_subgraph_result_t* result = NULL; + + data_type_id_t vertex_tid = INT32; + data_type_id_t size_t_tid = SIZE_T; + + ret_code = create_mg_test_graph( + handle, h_src, h_dst, h_wgt, num_edges, store_transposed, TRUE, &graph, &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + + ret_code = cugraph_k_truss_subgraph(handle, graph, k, FALSE, &result, &ret_error); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph_k_truss failed."); + + cugraph_type_erased_device_array_view_t* k_truss_src; + cugraph_type_erased_device_array_view_t* k_truss_dst; + cugraph_type_erased_device_array_view_t* k_truss_wgt; + + k_truss_src = cugraph_induced_subgraph_get_sources(result); + k_truss_dst = cugraph_induced_subgraph_get_destinations(result); + k_truss_wgt = cugraph_induced_subgraph_get_edge_weights(result); + + size_t k_truss_size = cugraph_type_erased_device_array_view_size(k_truss_src); + + vertex_t h_k_truss_src[k_truss_size]; + vertex_t h_k_truss_dst[k_truss_size]; + weight_t h_k_truss_wgt[k_truss_size]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_k_truss_src, k_truss_src, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, 
(byte_t*)h_k_truss_dst, k_truss_dst, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_k_truss_wgt, k_truss_wgt, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + for (size_t i = 0; (i < k_truss_size) && (test_ret_value == 0); ++i) { + bool_t found = FALSE; + for (size_t j = 0; (j < num_results) && !found; ++j) { + if ((h_k_truss_src[i] == h_result_src[j]) && (h_k_truss_dst[i] == h_result_dst[j]) && + (h_k_truss_wgt[i] == h_result_wgt[j])) + found = TRUE; + } + TEST_ASSERT(test_ret_value, found, "k_truss subgraph has an edge that doesn't match"); + } + + cugraph_induced_subgraph_result_free(result); + cugraph_mg_graph_free(graph); + cugraph_error_free(ret_error); + return test_ret_value; +} + +int test_k_truss_subgraph(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 14; + size_t num_vertices = 7; + size_t num_results = 6; + size_t k = 3; + + vertex_t h_src[] = {0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 3, 4, 5, 6}; + vertex_t h_dst[] = {1, 2, 5, 0, 2, 3, 4, 6, 0, 1, 1, 1, 0, 1}; + weight_t h_wgt[] = { + 1.2f, 1.3f, 1.6f, 1.2f, 2.3f, 2.4f, 2.5f, 2.7f, 1.3f, 2.3f, 2.4f, 2.5f, 1.6f, 2.7f}; + + vertex_t h_result_src[] = {0, 2, 2, 1, 1, 0}; + vertex_t h_result_dst[] = {1, 1, 0, 0, 2, 2}; + weight_t h_result_wgt[] = {1.2f, 2.3f, 1.3f, 1.2f, 2.3f, 1.3f}; + + return generic_k_truss_test(handle, + h_src, + h_dst, + h_wgt, + num_edges, + num_results, + k, + FALSE, + h_result_src, + h_result_dst, + h_result_wgt); +} + +/******************************************************************************/ + +int main(int argc, char** argv) +{ + void* raft_handle = create_mg_raft_handle(argc, argv); + cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle); + + int result = 0; + result |= RUN_MG_TEST(test_k_truss_subgraph, handle); + + cugraph_free_resource_handle(handle); 
+ free_mg_raft_handle(raft_handle); + + return result; +} diff --git a/cpp/tests/c_api/mg_similarity_test.c b/cpp/tests/c_api/mg_similarity_test.c index 587acb4d295..486ca34aaca 100644 --- a/cpp/tests/c_api/mg_similarity_test.c +++ b/cpp/tests/c_api/mg_similarity_test.c @@ -26,7 +26,16 @@ typedef int32_t vertex_t; typedef int32_t edge_t; typedef float weight_t; -typedef enum { JACCARD, SORENSEN, OVERLAP } similarity_t; +typedef enum { + JACCARD, + SORENSEN, + OVERLAP, + COSINE, + ALL_PAIRS_JACCARD, + ALL_PAIRS_SORENSEN, + ALL_PAIRS_OVERLAP, + ALL_PAIRS_COSINE +} similarity_t; int generic_similarity_test(const cugraph_resource_handle_t* handle, vertex_t* h_src, @@ -34,10 +43,13 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, weight_t* h_wgt, vertex_t* h_first, vertex_t* h_second, + vertex_t* h_start_vertices, weight_t* h_result, size_t num_vertices, size_t num_edges, size_t num_pairs, + size_t num_start_vertices, + size_t topk, bool_t store_transposed, bool_t use_weight, similarity_t test_type) @@ -48,13 +60,15 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; cugraph_error_t* ret_error; - cugraph_graph_t* graph = NULL; - cugraph_similarity_result_t* result = NULL; - cugraph_vertex_pairs_t* vertex_pairs = NULL; - cugraph_type_erased_device_array_t* v1 = NULL; - cugraph_type_erased_device_array_t* v2 = NULL; - cugraph_type_erased_device_array_view_t* v1_view = NULL; - cugraph_type_erased_device_array_view_t* v2_view = NULL; + cugraph_graph_t* graph = NULL; + cugraph_similarity_result_t* result = NULL; + cugraph_vertex_pairs_t* vertex_pairs = NULL; + cugraph_type_erased_device_array_t* v1 = NULL; + cugraph_type_erased_device_array_t* v2 = NULL; + cugraph_type_erased_device_array_t* start_v = NULL; + cugraph_type_erased_device_array_view_t* v1_view = NULL; + cugraph_type_erased_device_array_view_t* v2_view = NULL; + cugraph_type_erased_device_array_view_t* start_v_view = 
NULL; ret_code = create_test_graph( handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, TRUE, &graph, &ret_error); @@ -62,44 +76,81 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed."); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + if (topk == 0) { topk = SIZE_MAX; } + if (cugraph_resource_handle_get_rank(handle) != 0) { num_pairs = 0; } - ret_code = - cugraph_type_erased_device_array_create(handle, num_pairs, vertex_tid, &v1, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v1 create failed."); + if (h_first != NULL && h_second != NULL) { + ret_code = + cugraph_type_erased_device_array_create(handle, num_pairs, vertex_tid, &v1, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v1 create failed."); + + ret_code = + cugraph_type_erased_device_array_create(handle, num_pairs, vertex_tid, &v2, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v2 create failed."); + + v1_view = cugraph_type_erased_device_array_view(v1); + v2_view = cugraph_type_erased_device_array_view(v2); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, v1_view, (byte_t*)h_first, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_first copy_from_host failed."); - ret_code = - cugraph_type_erased_device_array_create(handle, num_pairs, vertex_tid, &v2, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v2 create failed."); + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, v2_view, (byte_t*)h_second, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_second copy_from_host failed."); - v1_view = cugraph_type_erased_device_array_view(v1); - v2_view = cugraph_type_erased_device_array_view(v2); + ret_code = + cugraph_create_vertex_pairs(handle, graph, 
v1_view, v2_view, &vertex_pairs, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create vertex pairs failed."); + } - ret_code = cugraph_type_erased_device_array_view_copy_from_host( - handle, v1_view, (byte_t*)h_first, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_first copy_from_host failed."); + if (h_start_vertices != NULL) { + ret_code = cugraph_type_erased_device_array_create( + handle, num_start_vertices, vertex_tid, &start_v, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v1 create failed."); + start_v_view = cugraph_type_erased_device_array_view(start_v); - ret_code = cugraph_type_erased_device_array_view_copy_from_host( - handle, v2_view, (byte_t*)h_second, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_second copy_from_host failed."); + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, start_v_view, (byte_t*)h_start_vertices, &ret_error); - ret_code = - cugraph_create_vertex_pairs(handle, graph, v1_view, v2_view, &vertex_pairs, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create vertex pairs failed."); + TEST_ASSERT( + test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_start_vertices copy_from_host failed."); + } switch (test_type) { case JACCARD: ret_code = cugraph_jaccard_coefficients( handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); break; + case ALL_PAIRS_JACCARD: + ret_code = cugraph_all_pairs_jaccard_coefficients( + handle, graph, start_v_view, use_weight, topk, FALSE, &result, &ret_error); + break; case SORENSEN: ret_code = cugraph_sorensen_coefficients( handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); break; + case ALL_PAIRS_SORENSEN: + ret_code = cugraph_all_pairs_sorensen_coefficients( + handle, graph, start_v_view, use_weight, topk, FALSE, &result, &ret_error); + break; case OVERLAP: ret_code = cugraph_overlap_coefficients( handle, graph, 
vertex_pairs, use_weight, FALSE, &result, &ret_error); break; + case ALL_PAIRS_OVERLAP: + ret_code = cugraph_all_pairs_overlap_coefficients( + handle, graph, start_v_view, use_weight, topk, FALSE, &result, &ret_error); + break; + case COSINE: + ret_code = cugraph_cosine_similarity_coefficients( + handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); + break; + case ALL_PAIRS_COSINE: + ret_code = cugraph_all_pairs_cosine_similarity_coefficients( + handle, graph, start_v_view, use_weight, topk, FALSE, &result, &ret_error); + break; } TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); @@ -109,6 +160,21 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, similarity_coefficient = cugraph_similarity_result_get_similarity(result); + switch (test_type) { + case ALL_PAIRS_JACCARD: + num_pairs = cugraph_type_erased_device_array_view_size(similarity_coefficient); + break; + case ALL_PAIRS_SORENSEN: + num_pairs = cugraph_type_erased_device_array_view_size(similarity_coefficient); + break; + case ALL_PAIRS_OVERLAP: + num_pairs = cugraph_type_erased_device_array_view_size(similarity_coefficient); + break; + case ALL_PAIRS_COSINE: + num_pairs = cugraph_type_erased_device_array_view_size(similarity_coefficient); + break; + } + weight_t h_similarity_coefficient[num_pairs]; ret_code = cugraph_type_erased_device_array_view_copy_to_host( @@ -131,15 +197,18 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, int test_jaccard(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; - - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + size_t num_edges = 16; + 
size_t num_vertices = 6; + size_t num_pairs = 10; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t* h_start_vertices = NULL; weight_t h_result[] = {0.2, 0.666667, 0.333333, 0.4, 0.166667, 0.5, 0.2, 0.25, 0.25, 0.666667}; return generic_similarity_test(handle, @@ -148,10 +217,13 @@ int test_jaccard(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, FALSE, JACCARD); @@ -159,18 +231,21 @@ int test_jaccard(const cugraph_resource_handle_t* handle) int test_weighted_jaccard(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 7; - size_t num_pairs = 3; + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 3; + size_t num_start_vertices = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; weight_t h_wgt[] = { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; - vertex_t h_first[] = {0, 0, 1}; - vertex_t h_second[] = {1, 2, 3}; - weight_t h_result[] = {0.357143, 0.208333, 0.0}; + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.357143, 0.208333, 0.0}; return generic_similarity_test(handle, h_src, @@ -178,26 +253,137 @@ int test_weighted_jaccard(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, TRUE, JACCARD); } +int 
test_all_pairs_jaccard(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.2, 0.25, 0.666667, 0.333333, 0.2, 0.4, 0.166667, 0.5, + 0.25, 0.4, 0.2, 0.25, 0.25, 0.666667, 0.166667, 0.2, + 0.666667, 0.3333333, 0.25, 0.666667, 0.5, 0.25}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_JACCARD); +} + +int test_all_pairs_jaccard_with_start_vertices(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = { + 0.2, 0.25, 0.666667, 0.333333, 0.2, 0.4, 0.166667, 0.5, 0.25, 0.4, 0.2, 0.25, 0.25}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_JACCARD); +} + +int test_all_pairs_jaccard_with_topk(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t 
num_start_vertices = 3; + size_t topk = 5; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.666667, 0.666667, 0.666667, 0.666667, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_JACCARD); +} + int test_sorensen(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; - - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t* h_start_vertices = NULL; weight_t h_result[] = {0.333333, 0.8, 0.5, 0.571429, 0.285714, 0.666667, 0.333333, 0.4, 0.4, 0.8}; return generic_similarity_test(handle, @@ -206,10 +392,13 @@ int test_sorensen(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, FALSE, SORENSEN); @@ -217,18 
+406,21 @@ int test_sorensen(const cugraph_resource_handle_t* handle) int test_weighted_sorensen(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 7; - size_t num_pairs = 3; + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 3; + size_t num_start_vertices = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; weight_t h_wgt[] = { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; - vertex_t h_first[] = {0, 0, 1}; - vertex_t h_second[] = {1, 2, 3}; - weight_t h_result[] = {0.526316, 0.344828, 0.000000}; + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.526316, 0.344828, 0.000000}; return generic_similarity_test(handle, h_src, @@ -236,27 +428,81 @@ int test_weighted_sorensen(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, TRUE, SORENSEN); } -int test_overlap(const cugraph_resource_handle_t* handle) +int test_all_pairs_sorensen(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.333333, 0.4, 0.8, 0.5, 0.333333, 0.571429, 0.285714, 0.666667, + 0.4, 0.571429, 0.333333, 0.4, 0.4, 0.8, 0.285714, 0.333333, + 0.8, 0.5, 0.4, 0.8, 0.666667, 0.4}; + + 
return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_SORENSEN); +} - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0.5, 1, 0.5, 0.666667, 0.333333, 1, 0.333333, 0.5, 0.5, 1}; +int test_all_pairs_sorensen_with_start_vertices(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = {0.333333, + 0.4, + 0.8, + 0.5, + 0.333333, + 0.571429, + 0.285714, + 0.666667, + 0.4, + 0.571429, + 0.333333, + 0.4, + 0.4}; return generic_similarity_test(handle, h_src, @@ -264,10 +510,81 @@ int test_overlap(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_SORENSEN); +} + +int test_all_pairs_sorensen_with_topk(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 5; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] 
= {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.8, 0.8, 0.8, 0.8, 0.666667}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_SORENSEN); +} + +int test_overlap(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.5, 1, 0.5, 0.666667, 0.333333, 1, 0.333333, 0.5, 0.5, 1}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, FALSE, FALSE, OVERLAP); @@ -275,18 +592,21 @@ int test_overlap(const cugraph_resource_handle_t* handle) int test_weighted_overlap(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 7; - size_t num_pairs = 3; + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 3; + size_t num_start_vertices = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; weight_t h_wgt[] = { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; - vertex_t h_first[] = {0, 0, 1}; - vertex_t h_second[] = {1, 2, 3}; - weight_t h_result[] = 
{0.714286, 0.416667, 0.000000}; + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.714286, 0.416667, 0.000000}; return generic_similarity_test(handle, h_src, @@ -294,15 +614,298 @@ int test_weighted_overlap(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, TRUE, OVERLAP); } +int test_all_pairs_overlap(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, + 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, + 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_OVERLAP); +} + +int test_all_pairs_overlap_with_start_vertices(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = { + 0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 
0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_OVERLAP); +} + +int test_all_pairs_overlap_with_topk(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 5; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_OVERLAP); +} + +int test_cosine(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + COSINE); +} + +int test_weighted_cosine(const 
cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 2; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0}; + vertex_t h_second[] = {1, 2}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.990830, 0.976187}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + TRUE, + COSINE); +} + +int test_all_pairs_cosine(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, + 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, + 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_COSINE); +} + +int test_all_pairs_cosine_with_start_vertices(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 
5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = { + 0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_COSINE); +} + +int test_all_pairs_cosine_with_topk(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 5; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_COSINE); +} + /******************************************************************************/ int main(int argc, char** argv) @@ -311,12 +914,30 @@ int main(int argc, char** argv) cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle); int result = 0; + result |= RUN_MG_TEST(test_jaccard, handle); + result |= RUN_MG_TEST(test_weighted_jaccard, handle); + result |= RUN_MG_TEST(test_all_pairs_jaccard, handle); + result |= RUN_MG_TEST(test_all_pairs_jaccard_with_start_vertices, handle); + result |= RUN_MG_TEST(test_all_pairs_jaccard_with_topk, handle); + result |= 
RUN_MG_TEST(test_sorensen, handle); + result |= RUN_MG_TEST(test_weighted_sorensen, handle); + result |= RUN_MG_TEST(test_all_pairs_sorensen, handle); + result |= RUN_MG_TEST(test_all_pairs_sorensen_with_start_vertices, handle); + result |= RUN_MG_TEST(test_all_pairs_sorensen_with_topk, handle); + result |= RUN_MG_TEST(test_overlap, handle); - // result |= RUN_MG_TEST(test_weighted_jaccard, handle); - // result |= RUN_MG_TEST(test_weighted_sorensen, handle); - // result |= RUN_MG_TEST(test_weighted_overlap, handle); + result |= RUN_MG_TEST(test_weighted_overlap, handle); + result |= RUN_MG_TEST(test_all_pairs_overlap, handle); + result |= RUN_MG_TEST(test_all_pairs_overlap_with_start_vertices, handle); + result |= RUN_MG_TEST(test_all_pairs_overlap_with_topk, handle); + + result |= RUN_MG_TEST(test_cosine, handle); + result |= RUN_MG_TEST(test_weighted_cosine, handle); + result |= RUN_MG_TEST(test_all_pairs_cosine, handle); + result |= RUN_MG_TEST(test_all_pairs_cosine_with_start_vertices, handle); + result |= RUN_MG_TEST(test_all_pairs_cosine_with_topk, handle); cugraph_free_resource_handle(handle); free_mg_raft_handle(raft_handle); diff --git a/cpp/tests/c_api/similarity_test.c b/cpp/tests/c_api/similarity_test.c index c29af658ce9..70e0cb6fb95 100644 --- a/cpp/tests/c_api/similarity_test.c +++ b/cpp/tests/c_api/similarity_test.c @@ -26,7 +26,7 @@ typedef int32_t vertex_t; typedef int32_t edge_t; typedef float weight_t; -typedef enum { JACCARD, SORENSEN, OVERLAP } similarity_t; +typedef enum { JACCARD, SORENSEN, OVERLAP, COSINE } similarity_t; int generic_similarity_test(vertex_t* h_src, vertex_t* h_dst, @@ -101,6 +101,10 @@ int generic_similarity_test(vertex_t* h_src, ret_code = cugraph_overlap_coefficients( handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); break; + case COSINE: + ret_code = cugraph_cosine_similarity_coefficients( + handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); + break; } TEST_ASSERT(test_ret_value, 
ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); @@ -179,6 +183,10 @@ int generic_all_pairs_similarity_test(vertex_t* h_src, ret_code = cugraph_all_pairs_overlap_coefficients( handle, graph, vertices_view, use_weight, topk, FALSE, &result, &ret_error); break; + case COSINE: + ret_code = cugraph_all_pairs_cosine_similarity_coefficients( + handle, graph, vertices_view, use_weight, topk, FALSE, &result, &ret_error); + break; } TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); @@ -333,7 +341,7 @@ int test_weighted_sorensen() vertex_t h_first[] = {0, 0, 1}; vertex_t h_second[] = {1, 2, 3}; - weight_t h_result[] = {0.526316, 0.344828, 0.000000}; + weight_t h_result[] = {0.526316, 0.344828, 0.0}; return generic_similarity_test(h_src, h_dst, @@ -389,7 +397,7 @@ int test_weighted_overlap() vertex_t h_first[] = {0, 0, 1}; vertex_t h_second[] = {1, 2, 3}; - weight_t h_result[] = {0.714286, 0.416667, 0.000000}; + weight_t h_result[] = {0.714286, 0.416667, 0.0}; return generic_similarity_test(h_src, h_dst, @@ -405,6 +413,62 @@ int test_weighted_overlap() OVERLAP); } +int test_cosine() +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + return generic_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + FALSE, + COSINE); +} + +int test_weighted_cosine() +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 2; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = 
{3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0}; + vertex_t h_second[] = {1, 2}; + weight_t h_result[] = {0.990830, 0.976187}; + + return generic_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + TRUE, + COSINE); +} + int test_all_pairs_jaccard() { size_t num_edges = 16; @@ -631,6 +695,67 @@ int test_weighted_all_pairs_overlap() OVERLAP); } +int test_all_pairs_cosine() +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 22; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5}; + vertex_t h_second[] = {1, 2, 3, 4, 0, 2, 3, 5, 0, 1, 3, 4, 5, 0, 1, 2, 4, 0, 2, 3, 1, 2}; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, + 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, + 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + FALSE, + SIZE_MAX, + COSINE); +} + +int test_weighted_all_pairs_cosine_topk() +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 6; + size_t topk = 6; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 1, 1, 2, 3, 4}; + vertex_t h_second[] = {1, 0, 2, 1, 4, 3}; + weight_t h_result[] = {0.0, 0.0, 1.0, 1.0, 1.0, 1.0}; + + return 
generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + TRUE, + topk, + COSINE); +} + int test_all_pairs_jaccard_topk() { size_t num_edges = 16; @@ -812,28 +937,110 @@ int test_weighted_all_pairs_overlap_topk() OVERLAP); } +int test_all_pairs_cosine_topk() +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t topk = 6; + size_t num_pairs = 6; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 1, 3, 3, 4, 5}; + vertex_t h_second[] = {3, 5, 0, 4, 3, 1}; + weight_t h_result[] = {1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + FALSE, + topk, + COSINE); +} + +int test_weighted_all_pairs_cosine() +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 16; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6}; + vertex_t h_second[] = {1, 2, 0, 2, 0, 1, 4, 5, 6, 3, 5, 6, 3, 4, 3, 4}; + weight_t h_result[] = {0.714286, + 0.416667, + 0.714286, + 1, + 0.416667, + 1, + 1, + 0.166667, + 0.5, + 1, + 0.571429, + 0.75, + 0.166667, + 0.571429, + 0.5, + 0.75}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + TRUE, + SIZE_MAX, + COSINE); +} + /******************************************************************************/ int main(int 
argc, char** argv) { int result = 0; + result |= RUN_TEST(test_jaccard); result |= RUN_TEST(test_sorensen); result |= RUN_TEST(test_overlap); + result |= RUN_TEST(test_cosine); result |= RUN_TEST(test_weighted_jaccard); result |= RUN_TEST(test_weighted_sorensen); result |= RUN_TEST(test_weighted_overlap); + result |= RUN_TEST(test_weighted_cosine); result |= RUN_TEST(test_all_pairs_jaccard); result |= RUN_TEST(test_all_pairs_sorensen); result |= RUN_TEST(test_all_pairs_overlap); + result |= RUN_TEST(test_all_pairs_cosine); result |= RUN_TEST(test_weighted_all_pairs_jaccard); result |= RUN_TEST(test_weighted_all_pairs_sorensen); result |= RUN_TEST(test_weighted_all_pairs_overlap); + result |= RUN_TEST(test_weighted_all_pairs_cosine); result |= RUN_TEST(test_all_pairs_jaccard_topk); result |= RUN_TEST(test_all_pairs_sorensen_topk); result |= RUN_TEST(test_all_pairs_overlap_topk); + result |= RUN_TEST(test_all_pairs_cosine_topk); result |= RUN_TEST(test_weighted_all_pairs_jaccard_topk); result |= RUN_TEST(test_weighted_all_pairs_sorensen_topk); result |= RUN_TEST(test_weighted_all_pairs_overlap_topk); + result |= RUN_TEST(test_weighted_all_pairs_cosine_topk); + return result; } diff --git a/cpp/tests/community/mg_k_truss_test.cpp b/cpp/tests/community/mg_k_truss_test.cpp new file mode 100644 index 00000000000..a1624949007 --- /dev/null +++ b/cpp/tests/community/mg_k_truss_test.cpp @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utilities/base_fixture.hpp" +#include "utilities/conversion_utilities.hpp" +#include "utilities/device_comm_wrapper.hpp" +#include "utilities/mg_utilities.hpp" +#include "utilities/property_generator_utilities.hpp" +#include "utilities/test_graphs.hpp" +#include "utilities/thrust_wrapper.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +#include + +struct KTruss_Usecase { + int32_t k_{3}; + bool test_weighted_{false}; + bool edge_masking_{false}; + bool check_correctness_{true}; +}; + +template +class Tests_MGKTruss + : public ::testing::TestWithParam> { + public: + Tests_MGKTruss() {} + + static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); } + + static void TearDownTestCase() { handle_.reset(); } + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running KTruss on multiple GPUs to that of a single-GPU run + template + void run_current_test(KTruss_Usecase const& k_truss_usecase, input_usecase_t const& input_usecase) + { + using weight_t = float; + + HighResTimer hr_timer{}; + + // 1. 
create MG graph + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG Construct graph"); + } + + auto [mg_graph, edge_weight, mg_renumber_map] = + cugraph::test::construct_graph( + *handle_, input_usecase, k_truss_usecase.test_weighted_, true, false, true); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto mg_graph_view = mg_graph.view(); + + std::optional> edge_mask{std::nullopt}; + if (k_truss_usecase.edge_masking_) { + edge_mask = cugraph::test::generate::edge_property( + *handle_, mg_graph_view, 2); + mg_graph_view.attach_edge_mask((*edge_mask).view()); + } + + // 2. run MG KTruss + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG KTruss"); + } + + auto mg_edge_weight_view = + edge_weight ? std::make_optional((*edge_weight).view()) : std::nullopt; + auto [d_cugraph_srcs, d_cugraph_dsts, d_cugraph_wgts] = + cugraph::k_truss( + *handle_, mg_graph_view, mg_edge_weight_view, k_truss_usecase.k_, false); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + // 3. 
Compare SG & MG results + + if (k_truss_usecase.check_correctness_) { + cugraph::unrenumber_int_vertices( + *handle_, + d_cugraph_srcs.data(), + d_cugraph_srcs.size(), + (*mg_renumber_map).data(), + mg_graph_view.vertex_partition_range_lasts()); + + cugraph::unrenumber_int_vertices( + *handle_, + d_cugraph_dsts.data(), + d_cugraph_dsts.size(), + (*mg_renumber_map).data(), + mg_graph_view.vertex_partition_range_lasts()); + + auto global_d_cugraph_srcs = cugraph::test::device_gatherv( + *handle_, raft::device_span(d_cugraph_srcs.data(), d_cugraph_srcs.size())); + + auto global_d_cugraph_dsts = cugraph::test::device_gatherv( + *handle_, raft::device_span(d_cugraph_dsts.data(), d_cugraph_srcs.size())); + + rmm::device_uvector d_sorted_cugraph_srcs{0, handle_->get_stream()}; + rmm::device_uvector d_sorted_cugraph_dsts{0, handle_->get_stream()}; + rmm::device_uvector d_sorted_cugraph_wgts{0, handle_->get_stream()}; + + if (edge_weight) { + auto global_d_cugraph_wgts = cugraph::test::device_gatherv( + *handle_, + raft::device_span((*d_cugraph_wgts).data(), (*d_cugraph_wgts).size())); + + std::tie(d_sorted_cugraph_srcs, d_sorted_cugraph_dsts, d_sorted_cugraph_wgts) = + cugraph::test::sort_by_key( + *handle_, global_d_cugraph_srcs, global_d_cugraph_dsts, global_d_cugraph_wgts); + + } else { + std::tie(d_sorted_cugraph_srcs, d_sorted_cugraph_dsts) = + cugraph::test::sort(*handle_, global_d_cugraph_srcs, global_d_cugraph_dsts); + } + + // 3-1. Convert to SG graph + auto [sg_graph, sg_edge_weights, sg_edge_ids, sg_number_map] = + cugraph::test::mg_graph_to_sg_graph( + *handle_, + mg_graph_view, + mg_edge_weight_view, + std::optional>{std::nullopt}, + std::make_optional>((*mg_renumber_map).data(), + (*mg_renumber_map).size()), + false); + + auto sg_edge_weight_view = + sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt; + + if (handle_->get_comms().get_rank() == int{0}) { + auto sg_graph_view = sg_graph.view(); + + // 3-2. 
Run SG KTruss + auto [ref_d_cugraph_srcs, ref_d_cugraph_dsts, ref_d_cugraph_wgts] = + cugraph::k_truss( + *handle_, sg_graph_view, sg_edge_weight_view, k_truss_usecase.k_, false); + + rmm::device_uvector d_sorted_ref_cugraph_srcs{0, handle_->get_stream()}; + rmm::device_uvector d_sorted_ref_cugraph_dsts{0, handle_->get_stream()}; + rmm::device_uvector d_sorted_ref_cugraph_wgts{0, handle_->get_stream()}; + + if (edge_weight) { + std::tie( + d_sorted_ref_cugraph_srcs, d_sorted_ref_cugraph_dsts, d_sorted_ref_cugraph_wgts) = + cugraph::test::sort_by_key( + *handle_, ref_d_cugraph_srcs, ref_d_cugraph_dsts, *ref_d_cugraph_wgts); + + } else { + std::tie(d_sorted_ref_cugraph_srcs, d_sorted_ref_cugraph_dsts) = + cugraph::test::sort(*handle_, ref_d_cugraph_srcs, ref_d_cugraph_dsts); + } + + // 3-3. Compare + auto h_cugraph_srcs = cugraph::test::to_host(*handle_, d_sorted_cugraph_srcs); + auto h_cugraph_dsts = cugraph::test::to_host(*handle_, d_sorted_cugraph_dsts); + auto ref_h_cugraph_srcs = cugraph::test::to_host(*handle_, d_sorted_ref_cugraph_srcs); + auto ref_h_cugraph_dsts = cugraph::test::to_host(*handle_, d_sorted_ref_cugraph_dsts); + + ASSERT_TRUE( + std::equal(h_cugraph_srcs.begin(), h_cugraph_srcs.end(), ref_h_cugraph_srcs.begin())); + + ASSERT_TRUE( + std::equal(h_cugraph_dsts.begin(), h_cugraph_dsts.end(), ref_h_cugraph_dsts.begin())); + + if (edge_weight) { + auto ref_h_cugraph_wgts = cugraph::test::to_host(*handle_, d_sorted_ref_cugraph_wgts); + + auto h_cugraph_wgts = cugraph::test::to_host(*handle_, d_sorted_cugraph_wgts); + + ASSERT_TRUE( + std::equal(h_cugraph_wgts.begin(), h_cugraph_wgts.end(), ref_h_cugraph_wgts.begin())); + } + } + } + } + + private: + static std::unique_ptr handle_; +}; + +template +std::unique_ptr Tests_MGKTruss::handle_ = nullptr; + +using Tests_MGKTruss_File = Tests_MGKTruss; +using Tests_MGKTruss_Rmat = Tests_MGKTruss; + +TEST_P(Tests_MGKTruss_File, CheckInt32Int32) +{ + auto param = GetParam(); + 
run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MGKTruss_Rmat, CheckInt32Int32) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_MGKTruss_Rmat, CheckInt32Int64) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_MGKTruss_Rmat, CheckInt64Int64) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +INSTANTIATE_TEST_SUITE_P( + file_tests, + Tests_MGKTruss_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(KTruss_Usecase{4, false, true, true}, KTruss_Usecase{5, true, true, true}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/dolphins.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_tests, + Tests_MGKTruss_Rmat, + ::testing::Combine( + ::testing::Values(KTruss_Usecase{4, false, false, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 16, 0.57, 0.19, 0.19, 0, true, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_MGKTruss_Rmat, + ::testing::Combine( + ::testing::Values(KTruss_Usecase{4, false, false, false}, + KTruss_Usecase{5, false, false, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false)))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/link_prediction/mg_similarity_test.cpp 
b/cpp/tests/link_prediction/mg_similarity_test.cpp index 8f674e6a6de..3bcabb6b6df 100644 --- a/cpp/tests/link_prediction/mg_similarity_test.cpp +++ b/cpp/tests/link_prediction/mg_similarity_test.cpp @@ -106,7 +106,7 @@ class Tests_MGSimilarity auto d_v1 = cugraph::test::to_device(*handle_, h_v1); auto d_v2 = std::move(two_hop_nbrs); - std::tie(d_v1, d_v2, std::ignore, std::ignore, std::ignore) = + std::tie(d_v1, d_v2, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, edge_t, diff --git a/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp b/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp index 192caa5227e..730a3ac8f08 100644 --- a/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp +++ b/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp @@ -108,7 +108,7 @@ class Tests_MGSimilarity auto d_v1 = cugraph::test::to_device(*handle_, h_v1); auto d_v2 = std::move(two_hop_nbrs); - std::tie(d_v1, d_v2, std::ignore, std::ignore, std::ignore) = + std::tie(d_v1, d_v2, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, edge_t, diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu index 681a7d8e6ff..fc6369ec721 100644 --- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu +++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu @@ -149,6 +149,7 @@ class Tests_MGPerVPairTransformDstNbrIntersection std::get<1>(mg_vertex_pair_buffer), std::ignore, std::ignore, + std::ignore, std::ignore) = cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu 
b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu index 48bbc6176d8..06a23880d81 100644 --- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu +++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_weighted_intersection.cu @@ -175,6 +175,7 @@ class Tests_MGPerVPairTransformDstNbrIntersection std::get<1>(mg_vertex_pair_buffer), std::ignore, std::ignore, + std::ignore, std::ignore) = cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp index 3d3d881fb23..b8ad06dd18b 100644 --- a/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp +++ b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp @@ -123,6 +123,7 @@ class Tests_MGHasEdgeAndComputeMultiplicity store_transposed ? d_mg_edge_srcs : d_mg_edge_dsts, std::ignore, std::ignore, + std::ignore, std::ignore) = cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index b6898fbaf78..0a706d1cf80 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -335,6 +335,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { store_transposed ? 
tmp_src_v : tmp_dst_v, tmp_weights_v, std::ignore, + std::ignore, std::ignore) = cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< vertex_t, diff --git a/dependencies.yaml b/dependencies.yaml index 9d0a5478a50..6bb728a2aae 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -388,11 +388,11 @@ dependencies: packages: - c-compiler - cxx-compiler - - libcudf==24.8.*,>=0.0.0a0 - - libcugraphops==24.8.*,>=0.0.0a0 - - libraft-headers==24.8.*,>=0.0.0a0 - - libraft==24.8.*,>=0.0.0a0 - - librmm==24.8.*,>=0.0.0a0 + - libcudf==24.10.*,>=0.0.0a0 + - libcugraphops==24.10.*,>=0.0.0a0 + - libraft-headers==24.10.*,>=0.0.0a0 + - libraft==24.10.*,>=0.0.0a0 + - librmm==24.10.*,>=0.0.0a0 - openmpi # Required for building cpp-mgtests (multi-GPU tests) specific: - output_types: [conda] @@ -482,8 +482,8 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - &dask rapids-dask-dependency==24.8.*,>=0.0.0a0 - - &dask_cuda dask-cuda==24.8.*,>=0.0.0a0 + - &dask rapids-dask-dependency==24.10.*,>=0.0.0a0 + - &dask_cuda dask-cuda==24.10.*,>=0.0.0a0 - &numba numba>=0.57 - &numpy numpy>=1.23,<2.0a0 - output_types: conda @@ -493,7 +493,7 @@ dependencies: - requests - nccl>=2.9.9 - ucx-proc=*=gpu - - &ucx_py ucx-py==0.39.*,>=0.0.0a0 + - &ucx_py_unsuffixed ucx-py==0.40.*,>=0.0.0a0 - output_types: pyproject packages: # cudf uses fsspec but is protocol independent. 
cugraph @@ -504,15 +504,17 @@ dependencies: matrices: - matrix: cuda: "11.*" + cuda_suffixed: "true" packages: - - &ucx_py_cu11 ucx-py-cu11==0.39.*,>=0.0.0a0 + - &ucx_py_cu11 ucx-py-cu11==0.40.*,>=0.0.0a0 - matrix: cuda: "12.*" + cuda_suffixed: "true" packages: - - &ucx_py_cu12 ucx-py-cu12==0.39.*,>=0.0.0a0 + - &ucx_py_cu12 ucx-py-cu12==0.40.*,>=0.0.0a0 - matrix: packages: - - *ucx_py + - *ucx_py_unsuffixed python_run_nx_cugraph: common: - output_types: [conda, pyproject] @@ -530,15 +532,17 @@ dependencies: matrices: - matrix: cuda: "11.*" + cuda_suffixed: "true" packages: - - &cugraph_cu11 cugraph-cu11==24.8.*,>=0.0.0a0 + - &cugraph_cu11 cugraph-cu11==24.10.*,>=0.0.0a0 - matrix: cuda: "12.*" + cuda_suffixed: "true" packages: - - &cugraph_cu12 cugraph-cu12==24.8.*,>=0.0.0a0 + - &cugraph_cu12 cugraph-cu12==24.10.*,>=0.0.0a0 - matrix: packages: - - &cugraph cugraph==24.8.*,>=0.0.0a0 + - &cugraph_unsuffixed cugraph==24.10.*,>=0.0.0a0 python_run_cugraph_pyg: common: - output_types: [conda, pyproject] @@ -550,15 +554,17 @@ dependencies: matrices: - matrix: cuda: "11.*" + cuda_suffixed: "true" packages: - *cugraph_cu11 - matrix: cuda: "12.*" + cuda_suffixed: "true" packages: - *cugraph_cu12 - matrix: packages: - - *cugraph + - *cugraph_unsuffixed python_run_cugraph_service_client: common: - output_types: [conda, pyproject] @@ -575,27 +581,29 @@ dependencies: - *thrift - output_types: conda packages: - - *ucx_py + - *ucx_py_unsuffixed specific: - output_types: pyproject matrices: - matrix: cuda: "11.*" + cuda_suffixed: "true" packages: - *cugraph_cu11 - - cugraph-service-client-cu11==24.8.*,>=0.0.0a0 + - cugraph-service-client-cu11==24.10.*,>=0.0.0a0 - *ucx_py_cu11 - matrix: cuda: "12.*" + cuda_suffixed: "true" packages: - *cugraph_cu12 - - cugraph-service-client-cu12==24.8.*,>=0.0.0a0 + - cugraph-service-client-cu12==24.10.*,>=0.0.0a0 - *ucx_py_cu12 - matrix: packages: - - *cugraph - - cugraph-service-client==24.8.*,>=0.0.0a0 - - *ucx_py + - *cugraph_unsuffixed + - 
cugraph-service-client==24.10.*,>=0.0.0a0 + - *ucx_py_unsuffixed test_cpp: common: - output_types: conda @@ -630,7 +638,7 @@ dependencies: - scikit-learn>=0.23.1 - output_types: [conda] packages: - - &pylibwholegraph_conda pylibwholegraph==24.8.*,>=0.0.0a0 + - &pylibwholegraph_unsuffixed pylibwholegraph==24.10.*,>=0.0.0a0 - *thrift test_python_pylibcugraph: common: @@ -648,25 +656,28 @@ dependencies: common: - output_types: [conda] packages: - - cugraph==24.8.*,>=0.0.0a0 + - *cugraph_unsuffixed - pytorch>=2.0 - pytorch-cuda==11.8 + - &tensordict tensordict>=0.1.2 - dgl>=1.1.0.cu* cugraph_pyg_dev: common: - output_types: [conda] packages: - - cugraph==24.8.*,>=0.0.0a0 + - *cugraph_unsuffixed - pytorch>=2.0 - pytorch-cuda==11.8 - - &tensordict tensordict>=0.1.2 + - *tensordict - pyg>=2.5,<2.6 depends_on_pytorch: common: - output_types: [conda] packages: - - &pytorch_conda pytorch>=2.0,<2.2.0a0 + - &pytorch_unsuffixed pytorch>=2.0,<2.2.0a0 + - torchdata + - pydantic specific: - output_types: [requirements] @@ -694,7 +705,7 @@ dependencies: common: - output_types: conda packages: - - *pylibwholegraph_conda + - *pylibwholegraph_unsuffixed - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -703,19 +714,23 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - pylibwholegraph-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - pylibwholegraph-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - pylibwholegraph-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*pylibwholegraph_conda]} + - pylibwholegraph-cu11==24.10.*,>=0.0.0a0 + - {matrix: null, packages: [*pylibwholegraph_unsuffixed]} depends_on_rmm: common: - output_types: conda packages: - - &rmm_conda rmm==24.8.*,>=0.0.0a0 + - &rmm_unsuffixed rmm==24.10.*,>=0.0.0a0 - output_types: requirements 
packages: # pip recognizes the index as a global option for the requirements.txt file @@ -724,19 +739,23 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - rmm-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - rmm-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - rmm-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*rmm_conda]} + - rmm-cu11==24.10.*,>=0.0.0a0 + - {matrix: null, packages: [*rmm_unsuffixed]} depends_on_cudf: common: - output_types: conda packages: - - &cudf_conda cudf==24.8.*,>=0.0.0a0 + - &cudf_unsuffixed cudf==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -745,19 +764,23 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - cudf-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - cudf-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - cudf-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*cudf_conda]} + - cudf-cu11==24.10.*,>=0.0.0a0 + - {matrix: null, packages: [*cudf_unsuffixed]} depends_on_dask_cudf: common: - output_types: conda packages: - - &dask_cudf_conda dask-cudf==24.8.*,>=0.0.0a0 + - &dask_cudf_unsuffixed dask-cudf==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -766,19 +789,23 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - dask-cudf-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - dask-cudf-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - dask-cudf-cu11==24.8.*,>=0.0.0a0 - 
- {matrix: null, packages: [*dask_cudf_conda]} + - dask-cudf-cu11==24.10.*,>=0.0.0a0 + - {matrix: null, packages: [*dask_cudf_unsuffixed]} depends_on_pylibraft: common: - output_types: conda packages: - - &pylibraft_conda pylibraft==24.8.*,>=0.0.0a0 + - &pylibraft_unsuffixed pylibraft==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -787,19 +814,23 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - pylibraft-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - pylibraft-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - pylibraft-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*pylibraft_conda]} + - pylibraft-cu11==24.10.*,>=0.0.0a0 + - {matrix: null, packages: [*pylibraft_unsuffixed]} depends_on_raft_dask: common: - output_types: conda packages: - - &raft_dask_conda raft-dask==24.8.*,>=0.0.0a0 + - &raft_dask_unsuffixed raft-dask==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -808,19 +839,23 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - raft-dask-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - raft-dask-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - raft-dask-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*raft_dask_conda]} + - raft-dask-cu11==24.10.*,>=0.0.0a0 + - {matrix: null, packages: [*raft_dask_unsuffixed]} depends_on_pylibcugraph: common: - output_types: conda packages: - - &pylibcugraph_conda pylibcugraph==24.8.*,>=0.0.0a0 + - &pylibcugraph_unsuffixed pylibcugraph==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the 
index as a global option for the requirements.txt file @@ -829,19 +864,23 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - pylibcugraph-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - pylibcugraph-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - pylibcugraph-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*pylibcugraph_conda]} + - pylibcugraph-cu11==24.10.*,>=0.0.0a0 + - {matrix: null, packages: [*pylibcugraph_unsuffixed]} depends_on_pylibcugraphops: common: - output_types: conda packages: - - &pylibcugraphops_conda pylibcugraphops==24.8.*,>=0.0.0a0 + - &pylibcugraphops_unsuffixed pylibcugraphops==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -850,19 +889,26 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - pylibcugraphops-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - pylibcugraphops-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - pylibcugraphops-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*pylibcugraphops_conda]} + - pylibcugraphops-cu11==24.10.*,>=0.0.0a0 + - {matrix: null, packages: [*pylibcugraphops_unsuffixed]} depends_on_cupy: common: - output_types: conda packages: - cupy>=12.0.0 + # NOTE: This is intentionally not broken into groups by a 'cuda_suffixed' selector like + # other packages with -cu{nn}x suffixes in this file. + # All RAPIDS wheel builds (including in devcontainers) expect cupy to be suffixed. 
specific: - output_types: [requirements, pyproject] matrices: diff --git a/notebooks/README.md b/notebooks/README.md index bd0acf00d7d..818382f35a7 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -67,13 +67,13 @@ The easiest way to run the notebooks is to get the latest [rapidsai/notebooks](h For example, get the latest (as of writing the document) nightly image (`a` after the version number indicates that an image is nightly) with cuda 12.0 using ```sh -docker pull rapidsai/notebooks:24.08a-cuda12.0-py3.9 +docker pull rapidsai/notebooks:24.10a-cuda12.0-py3.9 ``` And, then run a container based on the image using ```sh -docker run --rm -it --pull always --gpus all --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p 8888:8888 rapidsai/notebooks:24.08a-cuda12.0-py3.9 +docker run --rm -it --pull always --gpus all --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p 8888:8888 rapidsai/notebooks:24.10a-cuda12.0-py3.9 ``` You are all set. Run and edit cugraph notebooks from a browser at url http://127.0.0.1:8888/lab/tree/cugraph/cugraph_benchmarks @@ -89,8 +89,8 @@ ssh -L 127.0.0.1:8888:127.0.0.1:8888 [USER_NAME@][REMOTE_HOST_NAME or REMOTE_HO and then run the container in your remote machine. ```sh -docker pull rapidsai/notebooks:24.08a-cuda12.0-py3.9 -docker run --rm -it --pull always --gpus all --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p 8888:8888 rapidsai/notebooks:24.08a-cuda12.0-py3.9 +docker pull rapidsai/notebooks:24.10a-cuda12.0-py3.9 +docker run --rm -it --pull always --gpus all --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 -p 8888:8888 rapidsai/notebooks:24.10a-cuda12.0-py3.9 ``` You can run and edit cugraph notebooks at url http://127.0.0.1:8888/lab/tree/cugraph/cugraph_benchmarks as if they are running locally. 
diff --git a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml index 63771a75064..ea30b652286 100644 --- a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml +++ b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml @@ -9,11 +9,11 @@ channels: - conda-forge - nvidia dependencies: -- cugraph==24.8.*,>=0.0.0a0 +- cugraph==24.10.*,>=0.0.0a0 - dgl>=1.1.0.cu* - pandas - pre-commit -- pylibcugraphops==24.8.*,>=0.0.0a0 +- pylibcugraphops==24.10.*,>=0.0.0a0 - pytest - pytest-benchmark - pytest-cov @@ -21,4 +21,5 @@ dependencies: - pytorch-cuda==11.8 - pytorch>=2.0 - scipy +- tensordict>=0.1.2 name: cugraph_dgl_dev_cuda-118 diff --git a/python/cugraph-dgl/cugraph_dgl/__init__.py b/python/cugraph-dgl/cugraph_dgl/__init__.py index 03ff50896a4..58850d47fba 100644 --- a/python/cugraph-dgl/cugraph_dgl/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,8 +15,12 @@ # to prevent rapids context being created when importing cugraph_dgl os.environ["RAPIDS_NO_INITIALIZE"] = "1" +from cugraph_dgl.graph import Graph from cugraph_dgl.cugraph_storage import CuGraphStorage -from cugraph_dgl.convert import cugraph_storage_from_heterograph +from cugraph_dgl.convert import ( + cugraph_storage_from_heterograph, + cugraph_dgl_graph_from_heterograph, +) import cugraph_dgl.dataloading import cugraph_dgl.nn diff --git a/python/cugraph-dgl/cugraph_dgl/convert.py b/python/cugraph-dgl/cugraph_dgl/convert.py index 1235f07adf1..ae4b96dd391 100644 --- a/python/cugraph-dgl/cugraph_dgl/convert.py +++ b/python/cugraph-dgl/cugraph_dgl/convert.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,6 +12,8 @@ # limitations under the License. from __future__ import annotations from cugraph.utilities.utils import import_optional + +import cugraph_dgl from cugraph_dgl import CuGraphStorage from cugraph_dgl.utils.cugraph_conversion_utils import ( get_edges_dict_from_dgl_HeteroGraph, @@ -39,3 +41,53 @@ def cugraph_storage_from_heterograph( add_ndata_from_dgl_HeteroGraph(gs, g) add_edata_from_dgl_HeteroGraph(gs, g) return gs + + +def cugraph_dgl_graph_from_heterograph( + input_graph: dgl.DGLGraph, + single_gpu: bool = True, + ndata_storage: str = "torch", + edata_storage: str = "torch", + **kwargs, +) -> cugraph_dgl.Graph: + """ + Converts a DGL Graph to a cuGraph-DGL Graph. + """ + + output_graph = cugraph_dgl.Graph( + is_multi_gpu=(not single_gpu), + ndata_storage=ndata_storage, + edata_storage=edata_storage, + **kwargs, + ) + + # Calling is_homogeneous does not work here + if len(input_graph.ntypes) <= 1: + output_graph.add_nodes( + input_graph.num_nodes(), data=input_graph.ndata, ntype=input_graph.ntypes[0] + ) + else: + for ntype in input_graph.ntypes: + data = { + k: v_dict[ntype] + for k, v_dict in input_graph.ndata.items() + if ntype in v_dict + } + output_graph.add_nodes(input_graph.num_nodes(ntype), data=data, ntype=ntype) + + if len(input_graph.canonical_etypes) <= 1: + can_etype = input_graph.canonical_etypes[0] + src_t, dst_t = input_graph.edges(form="uv", etype=can_etype) + output_graph.add_edges(src_t, dst_t, input_graph.edata, etype=can_etype) + else: + for can_etype in input_graph.canonical_etypes: + data = { + k: v_dict[can_etype] + for k, v_dict in input_graph.edata.items() + if can_etype in v_dict + } + + src_t, dst_t = input_graph.edges(form="uv", etype=can_etype) + output_graph.add_edges(src_t, dst_t, data=data, 
etype=can_etype) + + return output_graph diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py b/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py index 2fd7d29bd49..8a2e9cd954d 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,9 +11,25 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings + from cugraph_dgl.dataloading.dataset import ( HomogenousBulkSamplerDataset, HeterogenousBulkSamplerDataset, ) + +from cugraph_dgl.dataloading.sampler import Sampler from cugraph_dgl.dataloading.neighbor_sampler import NeighborSampler -from cugraph_dgl.dataloading.dataloader import DataLoader + +from cugraph_dgl.dataloading.dask_dataloader import DaskDataLoader +from cugraph_dgl.dataloading.dataloader import DataLoader as FutureDataLoader + + +def DataLoader(*args, **kwargs): + warnings.warn( + "DataLoader has been renamed to DaskDataLoader. " + "In Release 24.10, cugraph_dgl.dataloading.FutureDataLoader " + "will take over the DataLoader name.", + FutureWarning, + ) + return DaskDataLoader(*args, **kwargs) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py new file mode 100644 index 00000000000..e220b93f738 --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py @@ -0,0 +1,321 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations +import os +import shutil +import cugraph_dgl +import cupy as cp +import cudf +from cugraph.utilities.utils import import_optional +from cugraph.gnn import BulkSampler +from dask.distributed import default_client, Event +from cugraph_dgl.dataloading import ( + HomogenousBulkSamplerDataset, + HeterogenousBulkSamplerDataset, +) +from cugraph_dgl.dataloading.utils.extract_graph_helpers import ( + create_cugraph_graph_from_edges_dict, +) + +dgl = import_optional("dgl") +torch = import_optional("torch") + + +class DaskDataLoader(torch.utils.data.DataLoader): + """ + Sampled graph data loader. Wrap a :class:`~cugraph_dgl.CuGraphStorage` and a + :class:`~cugraph_dgl.dataloading.NeighborSampler` into + an iterable over mini-batches of samples. cugraph_dgl's ``DataLoader`` extends + PyTorch's ``DataLoader`` by handling creation and + transmission of graph samples. + """ + + def __init__( + self, + graph: cugraph_dgl.CuGraphStorage, + indices: torch.Tensor, + graph_sampler: cugraph_dgl.dataloading.NeighborSampler, + sampling_output_dir: str, + batches_per_partition: int = 50, + seeds_per_call: int = 200_000, + device: torch.device = None, + use_ddp: bool = False, + ddp_seed: int = 0, + batch_size: int = 1024, + drop_last: bool = False, + shuffle: bool = False, + sparse_format: str = "coo", + **kwargs, + ): + """ + Constructor for DaskDataLoader: + ------------------------------- + graph : CuGraphStorage + The graph. + indices : Tensor or dict[ntype, Tensor] + The set of indices. 
It can either be a tensor of + integer indices or a dictionary of types and indices. + The actual meaning of the indices is defined by the :meth:`sample` method of + :attr:`graph_sampler`. + graph_sampler : cugraph_dgl.dataloading.NeighborSampler + The subgraph sampler. + sampling_output_dir: str + Output directory to share sampling results in + batches_per_partition: int + The number of batches of sampling results to write/read + seeds_per_call: int + The number of seeds to sample at once + device : device context, optional + The device of the generated MFGs in each iteration, which should be a + PyTorch device object (e.g., ``torch.device``). + By default this returns the tensors on device with the current + cuda context + use_ddp : boolean, optional + If True, tells the DataLoader to split the training set for each + participating process appropriately using + :class:`torch.utils.data.distributed.DistributedSampler`. + Overrides the :attr:`sampler` argument of + :class:`torch.utils.data.DataLoader`. + ddp_seed : int, optional + The seed for shuffling the dataset in + :class:`torch.utils.data.distributed.DistributedSampler`. + Only effective when :attr:`use_ddp` is True. + batch_size: int + Batch size. + sparse_format: str, default = "coo" + The sparse format of the emitted sampled graphs. Choose between "csc" + and "coo". When using "csc", the graphs are of type + cugraph_dgl.nn.SparseGraph. + kwargs : dict + Key-word arguments to be passed to the parent PyTorch + :py:class:`torch.utils.data.DataLoader` class. Common arguments are: + - ``batch_size`` (int): The number of indices in each batch. + - ``drop_last`` (bool): Whether to drop the last incomplete + batch. 
+ - ``shuffle`` (bool): Whether to randomly shuffle the + indices at each epoch + Examples + -------- + To train a 3-layer GNN for node classification on a set of nodes + ``train_nid`` on a homogeneous graph where each node takes messages + from 15 neighbors on the first layer, 10 neighbors on the second, and + 5 neighbors on the third: + >>> sampler = cugraph_dgl.dataloading.NeighborSampler([15, 10, 5]) + >>> dataloader = cugraph_dgl.dataloading.DataLoader( + ... g, train_nid, sampler, + ... batch_size=1024, shuffle=True, drop_last=False, num_workers=0) + >>> for input_nodes, output_nodes, blocks in dataloader: + ... train_on(input_nodes, output_nodes, blocks) + **Using with Distributed Data Parallel** + If you are using PyTorch's distributed training (e.g. when using + :mod:`torch.nn.parallel.DistributedDataParallel`), + you can train the model by turning + on the `use_ddp` option: + >>> sampler = cugraph_dgl.dataloading.NeighborSampler([15, 10, 5]) + >>> dataloader = cugraph_dgl.dataloading.DataLoader( + ... g, train_nid, sampler, use_ddp=True, + ... batch_size=1024, shuffle=True, drop_last=False, num_workers=0) + >>> for epoch in range(start_epoch, n_epochs): + ... for input_nodes, output_nodes, blocks in dataloader: + ... + """ + if sparse_format not in ["coo", "csc"]: + raise ValueError( + f"sparse_format must be one of 'coo', 'csc', " + f"but got {sparse_format}." 
+ ) + self.sparse_format = sparse_format + + self.ddp_seed = ddp_seed + self.use_ddp = use_ddp + self.shuffle = shuffle + self.drop_last = drop_last + self.graph_sampler = graph_sampler + worker_init_fn = dgl.dataloading.WorkerInitWrapper( + kwargs.get("worker_init_fn", None) + ) + self.other_storages = {} + self.epoch_number = 0 + self._batch_size = batch_size + self._sampling_output_dir = sampling_output_dir + self._batches_per_partition = batches_per_partition + self._seeds_per_call = seeds_per_call + self._rank = None + + indices = _dgl_idx_to_cugraph_idx(indices, graph) + + self.tensorized_indices_ds = dgl.dataloading.create_tensorized_dataset( + indices, + batch_size, + drop_last, + use_ddp, + ddp_seed, + shuffle, + kwargs.get("persistent_workers", False), + ) + + if len(graph.ntypes) <= 1: + self.cugraph_dgl_dataset = HomogenousBulkSamplerDataset( + total_number_of_nodes=graph.total_number_of_nodes, + edge_dir=self.graph_sampler.edge_dir, + sparse_format=sparse_format, + ) + else: + etype_id_to_etype_str_dict = {v: k for k, v in graph._etype_id_dict.items()} + + self.cugraph_dgl_dataset = HeterogenousBulkSamplerDataset( + num_nodes_dict=graph.num_nodes_dict, + etype_id_dict=etype_id_to_etype_str_dict, + etype_offset_dict=graph._etype_offset_d, + ntype_offset_dict=graph._ntype_offset_d, + edge_dir=self.graph_sampler.edge_dir, + ) + + if use_ddp: + rank = torch.distributed.get_rank() + client = default_client() + self._graph_creation_event = Event("cugraph_dgl_load_mg_graph_event") + if rank == 0: + G = create_cugraph_graph_from_edges_dict( + edges_dict=graph._edges_dict, + etype_id_dict=graph._etype_id_dict, + edge_dir=graph_sampler.edge_dir, + ) + client.publish_dataset(cugraph_dgl_mg_graph_ds=G) + self._graph_creation_event.set() + else: + if self._graph_creation_event.wait(timeout=1000): + G = client.get_dataset("cugraph_dgl_mg_graph_ds") + else: + raise RuntimeError( + f"Fetch cugraph_dgl_mg_graph_ds to worker_id {rank}", + "from worker_id 0 failed", + ) 
+ else: + rank = 0 + G = create_cugraph_graph_from_edges_dict( + edges_dict=graph._edges_dict, + etype_id_dict=graph._etype_id_dict, + edge_dir=graph_sampler.edge_dir, + ) + + self._rank = rank + self._cugraph_graph = G + super().__init__( + self.cugraph_dgl_dataset, + batch_size=None, + worker_init_fn=worker_init_fn, + collate_fn=lambda x: x, # Hack to prevent collating + **kwargs, + ) + + def __iter__(self): + output_dir = os.path.join( + self._sampling_output_dir, "epoch_" + str(self.epoch_number) + ) + kwargs = {} + if isinstance(self.cugraph_dgl_dataset, HomogenousBulkSamplerDataset): + kwargs["deduplicate_sources"] = True + kwargs["prior_sources_behavior"] = "carryover" + kwargs["renumber"] = True + + if self.sparse_format == "csc": + kwargs["compression"] = "CSR" + kwargs["compress_per_hop"] = True + # The following kwargs will be deprecated in uniform sampler. + kwargs["use_legacy_names"] = False + kwargs["include_hop_column"] = False + + else: + kwargs["deduplicate_sources"] = False + kwargs["prior_sources_behavior"] = None + kwargs["renumber"] = False + + bs = BulkSampler( + output_path=output_dir, + batch_size=self._batch_size, + graph=self._cugraph_graph, + batches_per_partition=self._batches_per_partition, + seeds_per_call=self._seeds_per_call, + fanout_vals=self.graph_sampler._reversed_fanout_vals, + with_replacement=self.graph_sampler.replace, + **kwargs, + ) + + if self.shuffle: + self.tensorized_indices_ds.shuffle() + + batch_df = create_batch_df(self.tensorized_indices_ds) + bs.add_batches(batch_df, start_col_name="start", batch_col_name="batch_id") + bs.flush() + self.cugraph_dgl_dataset.set_input_files(input_directory=output_dir) + self.epoch_number = self.epoch_number + 1 + return super().__iter__() + + def __del__(self): + if self.use_ddp: + torch.distributed.barrier() + if self._rank == 0: + if self.use_ddp: + client = default_client() + client.unpublish_dataset("cugraph_dgl_mg_graph_ds") + self._graph_creation_event.clear() + 
_clean_directory(self._sampling_output_dir) + + +def get_batch_id_series(n_output_rows: int, batch_size: int) -> cudf.Series: + num_batches = (n_output_rows + batch_size - 1) // batch_size + print(f"Number of batches = {num_batches}".format(num_batches)) + batch_ar = cp.arange(0, num_batches).repeat(batch_size) + batch_ar = batch_ar[0:n_output_rows].astype(cp.int32) + return cudf.Series(batch_ar) + + +def create_batch_df(dataset: torch.Tensor) -> cudf.DataFrame: + batch_id_ls = [] + indices_ls = [] + for batch_id, b_indices in enumerate(dataset): + if isinstance(b_indices, dict): + b_indices = torch.cat(list(b_indices.values())) + batch_id_ar = cp.full(shape=len(b_indices), fill_value=batch_id, dtype=cp.int32) + batch_id_ls.append(batch_id_ar) + indices_ls.append(b_indices) + + batch_id_ar = cp.concatenate(batch_id_ls) + indices_ar = cp.asarray(torch.concat(indices_ls)) + batches_df = cudf.DataFrame( + { + "start": indices_ar, + "batch_id": batch_id_ar, + } + ) + return batches_df + + +def _dgl_idx_to_cugraph_idx(idx, cugraph_gs): + if not isinstance(idx, dict): + if len(cugraph_gs.ntypes) > 1: + raise dgl.DGLError( + "Must specify node type when the graph is not homogeneous." + ) + return idx + else: + return {k: cugraph_gs.dgl_n_id_to_cugraph_id(n, k) for k, n in idx.items()} + + +def _clean_directory(path): + """param could either be relative or absolute.""" + if os.path.isfile(path): + os.remove(path) # remove the file + elif os.path.isdir(path): + shutil.rmtree(path) # remove dir and all contains diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py index 11139910931..21b70b05f3a 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,151 +10,121 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import annotations -import os -import shutil -import cugraph_dgl -import cupy as cp -import cudf + +import warnings + +from typing import Union, Optional, Dict + from cugraph.utilities.utils import import_optional -from cugraph.gnn import BulkSampler -from dask.distributed import default_client, Event -from cugraph_dgl.dataloading import ( - HomogenousBulkSamplerDataset, - HeterogenousBulkSamplerDataset, -) -from cugraph_dgl.dataloading.utils.extract_graph_helpers import ( - create_cugraph_graph_from_edges_dict, -) + +import cugraph_dgl +from cugraph_dgl.typing import TensorType +from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor dgl = import_optional("dgl") torch = import_optional("torch") -class DataLoader(torch.utils.data.DataLoader): +class DataLoader: """ - Sampled graph data loader. Wrap a :class:`~cugraph_dgl.CuGraphStorage` and a - :class:`~cugraph_dgl.dataloading.NeighborSampler` into - an iterable over mini-batches of samples. cugraph_dgl's ``DataLoader`` extends - PyTorch's ``DataLoader`` by handling creation and - transmission of graph samples. 
+ Duck-typed version of dgl.dataloading.DataLoader """ def __init__( self, - graph: cugraph_dgl.CuGraphStorage, - indices: torch.Tensor, - graph_sampler: cugraph_dgl.dataloading.NeighborSampler, - sampling_output_dir: str, - batches_per_partition: int = 50, - seeds_per_call: int = 200_000, - device: torch.device = None, + graph: "cugraph_dgl.Graph", + indices: TensorType, + graph_sampler: "cugraph_dgl.dataloading.Sampler", + device: Union[int, str, "torch.device"] = None, use_ddp: bool = False, ddp_seed: int = 0, - batch_size: int = 1024, + batch_size: int = 1, drop_last: bool = False, shuffle: bool = False, - sparse_format: str = "coo", + use_prefetch_thread: Optional[bool] = None, + use_alternate_streams: Optional[bool] = None, + pin_prefetcher: Optional[bool] = None, + use_uva=False, + gpu_cache: Dict[str, Dict[str, int]] = None, + output_format: str = "dgl.Block", **kwargs, ): """ - Constructor for CuGraphStorage: - ------------------------------- - graph : CuGraphStorage - The graph. - indices : Tensor or dict[ntype, Tensor] - The set of indices. It can either be a tensor of - integer indices or a dictionary of types and indices. - The actual meaning of the indices is defined by the :meth:`sample` method of - :attr:`graph_sampler`. - graph_sampler : cugraph_dgl.dataloading.NeighborSampler - The subgraph sampler. - sampling_output_dir: str - Output directory to share sampling results in - batches_per_partition: int - The number of batches of sampling results to write/read - seeds_per_call: int - The number of seeds to sample at once - device : device context, optional - The device of the generated MFGs in each iteration, which should be a - PyTorch device object (e.g., ``torch.device``). 
- By default this returns the tenors on device with the current - cuda context - use_ddp : boolean, optional - If True, tells the DataLoader to split the training set for each - participating process appropriately using - :class:`torch.utils.data.distributed.DistributedSampler`. - Overrides the :attr:`sampler` argument of - :class:`torch.utils.data.DataLoader`. - ddp_seed : int, optional - The seed for shuffling the dataset in - :class:`torch.utils.data.distributed.DistributedSampler`. - Only effective when :attr:`use_ddp` is True. - batch_size: int - Batch size. - sparse_format: str, default = "coo" - The sparse format of the emitted sampled graphs. Choose between "csc" - and "coo". When using "csc", the graphs are of type - cugraph_dgl.nn.SparseGraph. - kwargs : dict - Key-word arguments to be passed to the parent PyTorch - :py:class:`torch.utils.data.DataLoader` class. Common arguments are: - - ``batch_size`` (int): The number of indices in each batch. - - ``drop_last`` (bool): Whether to drop the last incomplete - batch. - - ``shuffle`` (bool): Whether to randomly shuffle the - indices at each epoch - Examples - -------- - To train a 3-layer GNN for node classification on a set of nodes - ``train_nid`` on a homogeneous graph where each node takes messages - from 15 neighbors on the first layer, 10 neighbors on the second, and - 5 neighbors on the third: - >>> sampler = cugraph_dgl.dataloading.NeighborSampler([15, 10, 5]) - >>> dataloader = cugraph_dgl.dataloading.DataLoader( - ... g, train_nid, sampler, - ... batch_size=1024, shuffle=True, drop_last=False, num_workers=0) - >>> for input_nodes, output_nodes, blocks in dataloader: - ... train_on(input_nodes, output_nodes, blocks) - **Using with Distributed Data Parallel** - If you are using PyTorch's distributed training (e.g. 
when using - :mod:`torch.nn.parallel.DistributedDataParallel`), - you can train the model by turning - on the `use_ddp` option: - >>> sampler = cugraph_dgl.dataloading.NeighborSampler([15, 10, 5]) - >>> dataloader = cugraph_dgl.dataloading.DataLoader( - ... g, train_nid, sampler, use_ddp=True, - ... batch_size=1024, shuffle=True, drop_last=False, num_workers=0) - >>> for epoch in range(start_epoch, n_epochs): - ... for input_nodes, output_nodes, blocks in dataloader: - ... + Parameters + ---------- + graph: cugraph_dgl.Graph + The graph being sampled. Can be a single-GPU or multi-GPU graph. + indices: TensorType + The seed nodes for sampling. If use_ddp=True, then all seed + nodes should be provided. If use_ddp=False, then only the seed + nodes assigned to this worker should be provided. + graph_sampler: cugraph_dgl.dataloading.Sampler + The sampler responsible for sampling the graph and producing + output minibatches. + device: Union[int, str, torch.device] + Optional. + The device assigned to this loader ('cpu', 'cuda' or device id). + Defaults to the current device. + use_ddp: bool + Optional (default=False). + If true, this argument will assume the entire list of input seed + nodes is being passed to each worker, and will appropriately + split and shuffle the list. + If false, then it is assumed that the list of input seed nodes + is comprised of the union of the lists provided to each worker. + ddp_seed: int + Optional (default=0). + The seed used for dividing and shuffling data if use_ddp=True. + Has no effect if use_ddp=False. + use_uva: bool + Optional (default=False). + Whether to use pinned memory and unified virtual addressing + to perform sampling. + This argument is ignored by cuGraph-DGL. + use_prefetch_thread: bool + Optional (default=False). + Whether to spawn a new thread for feature fetching. + This argument is ignored by cuGraph-DGL. + use_alternate_streams: bool + Optional (default=False). 
+ Whether to perform feature fetching on a separate stream. + This argument is ignored by cuGraph-DGL. + pin_prefetcher: bool + Optional (default=False). + Whether to pin the feature tensors. + This argument is currently ignored by cuGraph-DGL. + gpu_cache: Dict[str, Dict[str, int]] + List of features to cache using HugeCTR. + This argument is not supported by cuGraph-DGL and + will result in an error. + output_format: str + Optional (default="dgl.Block"). + The output format for blocks. + Can be either "dgl.Block" or "cugraph_dgl.nn.SparseGraph". """ - if sparse_format not in ["coo", "csc"]: + + if use_uva: + warnings.warn("The 'use_uva' argument is ignored by cuGraph-DGL.") + if use_prefetch_thread: + warnings.warn( + "The 'use_prefetch_thread' argument is ignored by cuGraph-DGL." + ) + if use_alternate_streams: + warnings.warn( + "The 'use_alternate_streams' argument is ignored by cuGraph-DGL." + ) + if pin_prefetcher: + warnings.warn("The 'pin_prefetcher' argument is ignored by cuGraph-DGL.") + if gpu_cache: raise ValueError( - f"sparse_format must be one of 'coo', 'csc', " - f"but got {sparse_format}." + "HugeCTR is not supported by cuGraph-DGL. " + "Consider using WholeGraph for feature storage" + " in cugraph_dgl.Graph instead." 
) - self.sparse_format = sparse_format - self.ddp_seed = ddp_seed - self.use_ddp = use_ddp - self.shuffle = shuffle - self.drop_last = drop_last - self.graph_sampler = graph_sampler - worker_init_fn = dgl.dataloading.WorkerInitWrapper( - kwargs.get("worker_init_fn", None) - ) - self.other_storages = {} - self.epoch_number = 0 - self._batch_size = batch_size - self._sampling_output_dir = sampling_output_dir - self._batches_per_partition = batches_per_partition - self._seeds_per_call = seeds_per_call - self._rank = None - - indices = _dgl_idx_to_cugraph_idx(indices, graph) + indices = _cast_to_torch_tensor(indices) - self.tensorized_indices_ds = dgl.dataloading.create_tensorized_dataset( + self.__dataset = dgl.dataloading.create_tensorized_dataset( indices, batch_size, drop_last, @@ -164,158 +134,25 @@ def __init__( kwargs.get("persistent_workers", False), ) - if len(graph.ntypes) <= 1: - self.cugraph_dgl_dataset = HomogenousBulkSamplerDataset( - total_number_of_nodes=graph.total_number_of_nodes, - edge_dir=self.graph_sampler.edge_dir, - sparse_format=sparse_format, - ) - else: - etype_id_to_etype_str_dict = {v: k for k, v in graph._etype_id_dict.items()} - - self.cugraph_dgl_dataset = HeterogenousBulkSamplerDataset( - num_nodes_dict=graph.num_nodes_dict, - etype_id_dict=etype_id_to_etype_str_dict, - etype_offset_dict=graph._etype_offset_d, - ntype_offset_dict=graph._ntype_offset_d, - edge_dir=self.graph_sampler.edge_dir, - ) + self.__output_format = output_format + self.__sampler = graph_sampler + self.__batch_size = batch_size + self.__graph = graph + self.__device = device - if use_ddp: - rank = torch.distributed.get_rank() - client = default_client() - self._graph_creation_event = Event("cugraph_dgl_load_mg_graph_event") - if rank == 0: - G = create_cugraph_graph_from_edges_dict( - edges_dict=graph._edges_dict, - etype_id_dict=graph._etype_id_dict, - edge_dir=graph_sampler.edge_dir, - ) - client.publish_dataset(cugraph_dgl_mg_graph_ds=G) - 
self._graph_creation_event.set() - else: - if self._graph_creation_event.wait(timeout=1000): - G = client.get_dataset("cugraph_dgl_mg_graph_ds") - else: - raise RuntimeError( - f"Fetch cugraph_dgl_mg_graph_ds to worker_id {rank}", - "from worker_id 0 failed", - ) - else: - rank = 0 - G = create_cugraph_graph_from_edges_dict( - edges_dict=graph._edges_dict, - etype_id_dict=graph._etype_id_dict, - edge_dir=graph_sampler.edge_dir, - ) - - self._rank = rank - self._cugraph_graph = G - super().__init__( - self.cugraph_dgl_dataset, - batch_size=None, - worker_init_fn=worker_init_fn, - collate_fn=lambda x: x, # Hack to prevent collating - **kwargs, - ) + @property + def dataset( + self, + ) -> Union[ + "dgl.dataloading.dataloader.TensorizedDataset", + "dgl.dataloading.dataloader.DDPTensorizedDataset", + ]: + return self.__dataset def __iter__(self): - output_dir = os.path.join( - self._sampling_output_dir, "epoch_" + str(self.epoch_number) + # TODO move to the correct device (rapidsai/cugraph-gnn#11) + return self.__sampler.sample( + self.__graph, + self.__dataset, + batch_size=self.__batch_size, ) - kwargs = {} - if isinstance(self.cugraph_dgl_dataset, HomogenousBulkSamplerDataset): - kwargs["deduplicate_sources"] = True - kwargs["prior_sources_behavior"] = "carryover" - kwargs["renumber"] = True - - if self.sparse_format == "csc": - kwargs["compression"] = "CSR" - kwargs["compress_per_hop"] = True - # The following kwargs will be deprecated in uniform sampler. 
- kwargs["use_legacy_names"] = False - kwargs["include_hop_column"] = False - - else: - kwargs["deduplicate_sources"] = False - kwargs["prior_sources_behavior"] = None - kwargs["renumber"] = False - - bs = BulkSampler( - output_path=output_dir, - batch_size=self._batch_size, - graph=self._cugraph_graph, - batches_per_partition=self._batches_per_partition, - seeds_per_call=self._seeds_per_call, - fanout_vals=self.graph_sampler._reversed_fanout_vals, - with_replacement=self.graph_sampler.replace, - **kwargs, - ) - - if self.shuffle: - self.tensorized_indices_ds.shuffle() - - batch_df = create_batch_df(self.tensorized_indices_ds) - bs.add_batches(batch_df, start_col_name="start", batch_col_name="batch_id") - bs.flush() - self.cugraph_dgl_dataset.set_input_files(input_directory=output_dir) - self.epoch_number = self.epoch_number + 1 - return super().__iter__() - - def __del__(self): - if self.use_ddp: - torch.distributed.barrier() - if self._rank == 0: - if self.use_ddp: - client = default_client() - client.unpublish_dataset("cugraph_dgl_mg_graph_ds") - self._graph_creation_event.clear() - _clean_directory(self._sampling_output_dir) - - -def get_batch_id_series(n_output_rows: int, batch_size: int): - num_batches = (n_output_rows + batch_size - 1) // batch_size - print(f"Number of batches = {num_batches}".format(num_batches)) - batch_ar = cp.arange(0, num_batches).repeat(batch_size) - batch_ar = batch_ar[0:n_output_rows].astype(cp.int32) - return cudf.Series(batch_ar) - - -def create_batch_df(dataset: torch.Tensor): - batch_id_ls = [] - indices_ls = [] - for batch_id, b_indices in enumerate(dataset): - if isinstance(b_indices, dict): - b_indices = torch.cat(list(b_indices.values())) - batch_id_ar = cp.full(shape=len(b_indices), fill_value=batch_id, dtype=cp.int32) - batch_id_ls.append(batch_id_ar) - indices_ls.append(b_indices) - - batch_id_ar = cp.concatenate(batch_id_ls) - indices_ar = cp.asarray(torch.concat(indices_ls)) - batches_df = cudf.DataFrame( - { - "start": 
indices_ar, - "batch_id": batch_id_ar, - } - ) - return batches_df - - -def _dgl_idx_to_cugraph_idx(idx, cugraph_gs): - if not isinstance(idx, dict): - if len(cugraph_gs.ntypes) > 1: - raise dgl.DGLError( - "Must specify node type when the graph is not homogeneous." - ) - return idx - else: - return {k: cugraph_gs.dgl_n_id_to_cugraph_id(n, k) for k, n in idx.items()} - - -def _clean_directory(path): - """param could either be relative or absolute.""" - if os.path.isfile(path): - os.remove(path) # remove the file - elif os.path.isdir(path): - shutil.rmtree(path) # remove dir and all contains diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py index b61f05f6379..1a35c3ea027 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,11 +10,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ from __future__ import annotations -from typing import Sequence +import warnings +import tempfile + +from typing import Sequence, Optional, Union, List, Tuple, Iterator + +from cugraph.gnn import UniformNeighborSampler, DistSampleWriter +from cugraph.utilities.utils import import_optional + +import cugraph_dgl +from cugraph_dgl.typing import DGLSamplerOutput +from cugraph_dgl.dataloading.sampler import Sampler, HomogeneousSampleReader -class NeighborSampler: +torch = import_optional("torch") + + +class NeighborSampler(Sampler): """Sampler that builds computational dependency of node representations via neighbor sampling for multilayer GNN. This sampler will make every node gather messages from a fixed number of neighbors @@ -50,7 +64,88 @@ def __init__( fanouts_per_layer: Sequence[int], edge_dir: str = "in", replace: bool = False, + prob: Optional[str] = None, + mask: Optional[str] = None, + prefetch_node_feats: Optional[Union[List[str], dict[str, List[str]]]] = None, + prefetch_edge_feats: Optional[ + Union[List[str], dict[Tuple[str, str, str], List[str]]] + ] = None, + prefetch_labels: Optional[Union[List[str], dict[str, List[str]]]] = None, + output_device: Optional[Union["torch.device", int, str]] = None, + fused: Optional[bool] = None, + sparse_format="csc", + output_format="dgl.Block", + **kwargs, ): + """ + Parameters + ---------- + fanouts_per_layer: Sequence[int] + The number of neighbors to sample per layer. + edge_dir: str + Optional (default='in'). + The direction to traverse edges. + replace: bool + Optional (default=False). + Whether to sample with replacement. + prob: str + Optional. + If provided, the probability of each neighbor being + sampled is proportional to the edge feature + with the given name. Mutually exclusive with mask. + Currently unsupported. + mask: str + Optional. + If provided, only neighbors where the edge mask + with the given name is True can be selected. + Mutually exclusive with prob. + Currently unsupported. 
+ prefetch_node_feats: Union[List[str], dict[str, List[str]]] + Optional. + Currently ignored by cuGraph-DGL. + prefetch_edge_feats: Union[List[str], dict[Tuple[str, str, str], List[str]]] + Optional. + Currently ignored by cuGraph-DGL. + prefetch_labels: Union[List[str], dict[str, List[str]]] + Optional. + Currently ignored by cuGraph-DGL. + output_device: Union[torch.device, int, str] + Optional. + Output device for samples. Defaults to the current device. + fused: bool + Optional. + This argument is ignored by cuGraph-DGL. + sparse_format: str + Optional (default = "coo"). + The sparse format of the emitted sampled graphs. + Currently, only "csc" is supported. + output_format: str + Optional (default = "dgl.Block") + The output format of the emitted sampled graphs. + Can be either "dgl.Block" (default), or "cugraph_dgl.nn.SparseGraph". + **kwargs + Keyword arguments for the underlying cuGraph distributed sampler + and writer (directory, batches_per_partition, format, + local_seeds_per_call). 
+ """ + + if mask: + raise NotImplementedError( + "Edge masking is currently unsupported by cuGraph-DGL" + ) + if prob: + raise NotImplementedError( + "Edge masking is currently unsupported by cuGraph-DGL" + ) + if prefetch_edge_feats: + warnings.warn("'prefetch_edge_feats' is ignored by cuGraph-DGL") + if prefetch_node_feats: + warnings.warn("'prefetch_node_feats' is ignored by cuGraph-DGL") + if prefetch_labels: + warnings.warn("'prefetch_labels' is ignored by cuGraph-DGL") + if fused: + warnings.warn("'fused' is ignored by cuGraph-DGL") + self.fanouts = fanouts_per_layer reverse_fanouts = fanouts_per_layer.copy() reverse_fanouts.reverse() @@ -58,3 +153,53 @@ def __init__( self.edge_dir = edge_dir self.replace = replace + self.__kwargs = kwargs + + super().__init__( + sparse_format=sparse_format, + output_format=output_format, + ) + + def sample( + self, + g: "cugraph_dgl.Graph", + indices: Iterator["torch.Tensor"], + batch_size: int = 1, + ) -> Iterator[DGLSamplerOutput]: + kwargs = dict(**self.__kwargs) + + directory = kwargs.pop("directory", None) + if directory is None: + warnings.warn("Setting a directory to store samples is recommended.") + self._tempdir = tempfile.TemporaryDirectory() + directory = self._tempdir.name + + writer = DistSampleWriter( + directory=directory, + batches_per_partition=kwargs.pop("batches_per_partition", 256), + format=kwargs.pop("format", "parquet"), + ) + + ds = UniformNeighborSampler( + g._graph(self.edge_dir), + writer, + compression="CSR", + fanout=self._reversed_fanout_vals, + prior_sources_behavior="carryover", + deduplicate_sources=True, + compress_per_hop=True, + with_replacement=self.replace, + **kwargs, + ) + + if g.is_homogeneous: + indices = torch.concat(list(indices)) + ds.sample_from_nodes(indices, batch_size=batch_size) + return HomogeneousSampleReader( + ds.get_reader(), self.output_format, self.edge_dir + ) + + raise ValueError( + "Sampling heterogeneous graphs is currently" + " unsupported in the non-dask API" + 
) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py new file mode 100644 index 00000000000..731ec1b8d6f --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py @@ -0,0 +1,193 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Iterator, Dict, Tuple, List, Union + +import cugraph_dgl +from cugraph_dgl.nn import SparseGraph +from cugraph_dgl.typing import DGLSamplerOutput +from cugraph_dgl.dataloading.utils.sampling_helpers import ( + create_homogeneous_sampled_graphs_from_tensors_csc, +) + +from cugraph.gnn import DistSampleReader + +from cugraph.utilities.utils import import_optional + +torch = import_optional("torch") +dgl = import_optional("dgl") + + +class SampleReader: + """ + Iterator that processes results from the cuGraph distributed sampler. + """ + + def __init__(self, base_reader: DistSampleReader, output_format: str = "dgl.Block"): + """ + Constructs a new SampleReader. + + Parameters + ---------- + base_reader: DistSampleReader + The reader responsible for loading saved samples produced by + the cuGraph distributed sampler. 
+ """ + self.__output_format = output_format + self.__base_reader = base_reader + self.__num_samples_remaining = 0 + self.__index = 0 + + @property + def output_format(self) -> str: + return self.__output_format + + def __next__(self) -> DGLSamplerOutput: + if self.__num_samples_remaining == 0: + # raw_sample_data is already a dict of tensors + self.__raw_sample_data, start_inclusive, end_inclusive = next( + self.__base_reader + ) + + self.__decoded_samples = self._decode_all(self.__raw_sample_data) + self.__num_samples_remaining = end_inclusive - start_inclusive + 1 + self.__index = 0 + + out = self.__decoded_samples[self.__index] + self.__index += 1 + self.__num_samples_remaining -= 1 + return out + + def _decode_all(self) -> List[DGLSamplerOutput]: + raise NotImplementedError("Must be implemented by subclass") + + def __iter__(self) -> DGLSamplerOutput: + return self + + +class HomogeneousSampleReader(SampleReader): + """ + Subclass of SampleReader that reads DGL homogeneous output samples + produced by the cuGraph distributed sampler. + """ + + def __init__( + self, + base_reader: DistSampleReader, + output_format: str = "dgl.Block", + edge_dir="in", + ): + """ + Constructs a new HomogeneousSampleReader + + Parameters + ---------- + base_reader: DistSampleReader + The reader responsible for loading saved samples produced by + the cuGraph distributed sampler. + output_format: str + The output format for blocks (either "dgl.Block" or + "cugraph_dgl.nn.SparseGraph"). + edge_dir: str + The direction sampling was performed in ("in" or "out"). 
+ """ + + self.__edge_dir = edge_dir + super().__init__(base_reader, output_format=output_format) + + def __decode_csc( + self, raw_sample_data: Dict[str, "torch.Tensor"] + ) -> List[DGLSamplerOutput]: + return create_homogeneous_sampled_graphs_from_tensors_csc( + raw_sample_data, output_format=self.output_format + ) + + def __decode_coo( + self, raw_sample_data: Dict[str, "torch.Tensor"] + ) -> List[DGLSamplerOutput]: + raise NotImplementedError( + "COO format is currently unsupported in the non-dask API" + ) + + def _decode_all( + self, raw_sample_data: Dict[str, "torch.Tensor"] + ) -> List[DGLSamplerOutput]: + if "major_offsets" in raw_sample_data: + return self.__decode_csc(raw_sample_data) + else: + return self.__decode_coo(raw_sample_data) + + +class Sampler: + """ + Base sampler class for all cugraph-DGL samplers. + """ + + def __init__(self, sparse_format: str = "csc", output_format="dgl.Block"): + """ + Parameters + ---------- + sparse_format: str + Optional (default = "coo"). + The sparse format of the emitted sampled graphs. + Currently, only "csc" is supported. + output_format: str + Optional (default = "dgl.Block") + The output format of the emitted sampled graphs. + Can be either "dgl.Block" (default), or "cugraph_dgl.nn.SparseGraph". + """ + + if sparse_format != "csc": + raise ValueError("Only CSC format is supported at this time") + + self.__output_format = output_format + + @property + def output_format(self): + return self.__output_format + + @property + def sparse_format(self): + return self.__sparse_format + + def sample( + self, + g: cugraph_dgl.Graph, + indices: Iterator["torch.Tensor"], + batch_size: int = 1, + ) -> Iterator[ + Tuple["torch.Tensor", "torch.Tensor", List[Union[SparseGraph, "dgl.Block"]]] + ]: + """ + Samples the graph. + + Parameters + ---------- + g: cugraph_dgl.Graph + The graph being sampled. + indices: TensorType + The node ids of seed nodes where sampling will initiate from. 
+ batch_size: int + The number of seed nodes per batch. + + Returns + ------- + Iterator[DGLSamplerOutput] + Iterator over batches. The returned tuples are in standard + DGL format: (input nodes, output nodes, blocks) where input + nodes are the renumbered input nodes, output nodes are + the renumbered output nodes, and blocks are the output graphs + for each hop. + """ + + raise NotImplementedError("Must be implemented by subclass") diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py index 10d851ebade..3b7e4502134 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py @@ -404,21 +404,21 @@ def create_heterogenous_dgl_block_from_tensors_dict( return block -def _process_sampled_df_csc( - df: cudf.DataFrame, +def _process_sampled_tensors_csc( + tensors: Dict["torch.Tensor"], reverse_hop_id: bool = True, ) -> Tuple[ - Dict[int, Dict[int, Dict[str, torch.Tensor]]], - List[torch.Tensor], + Dict[int, Dict[int, Dict[str, "torch.Tensor"]]], + List["torch.Tensor"], List[List[int, int]], ]: """ - Convert a dataframe generated by BulkSampler to a dictionary of tensors, to + Convert tensors generated by BulkSampler to a dictionary of tensors, to facilitate MFG creation. The sampled graphs in the dataframe use CSC-format. Parameters ---------- - df: cudf.DataFrame + tensors: Dict[torch.Tensor] The output from BulkSampler compressed in CSC format. The dataframe should be generated with `compression="CSR"` in BulkSampler, since the sampling routine treats seed nodes as sources. @@ -442,12 +442,12 @@ def _process_sampled_df_csc( k-th hop, mfg_sizes[k] and mfg_sizes[k+1] is the number of sources and destinations, respectively. 
""" - # dropna - major_offsets = cast_to_tensor(df.major_offsets.dropna()) - label_hop_offsets = cast_to_tensor(df.label_hop_offsets.dropna()) - renumber_map_offsets = cast_to_tensor(df.renumber_map_offsets.dropna()) - renumber_map = cast_to_tensor(df["map"].dropna()) - minors = cast_to_tensor(df.minors.dropna()) + + major_offsets = tensors["major_offsets"] + minors = tensors["minors"] + label_hop_offsets = tensors["label_hop_offsets"] + renumber_map = tensors["map"] + renumber_map_offsets = tensors["renumber_map_offsets"] n_batches = len(renumber_map_offsets) - 1 n_hops = int((len(label_hop_offsets) - 1) / n_batches) @@ -511,6 +511,115 @@ def _process_sampled_df_csc( return tensors_dict, renumber_map_list, mfg_sizes.tolist() +def _process_sampled_df_csc( + df: cudf.DataFrame, + reverse_hop_id: bool = True, +): + """ + Convert a dataframe generated by BulkSampler to a dictionary of tensors, to + facilitate MFG creation. The sampled graphs in the dataframe use CSC-format. + + Parameters + ---------- + df: cudf.DataFrame + The output from BulkSampler compressed in CSC format. The dataframe + should be generated with `compression="CSR"` in BulkSampler, + since the sampling routine treats seed nodes as sources. + + reverse_hop_id: bool (default=True) + Reverse hop id. + + Returns + ------- + tensors_dict: dict + A nested dictionary keyed by batch id and hop id. + `tensor_dict[batch_id][hop_id]` holds "minors" and "major_offsets" + values for CSC MFGs. + + renumber_map_list: list + List of renumbering maps for looking up global indices of nodes. One + map for each batch. + + mfg_sizes: list + List of the number of nodes in each message passing layer. For the + k-th hop, mfg_sizes[k] and mfg_sizes[k+1] is the number of sources and + destinations, respectively. 
+ """ + + return _process_sampled_tensors_csc( + { + "major_offsets": cast_to_tensor(df.major_offsets.dropna()), + "label_hop_offsets": cast_to_tensor(df.label_hop_offsets.dropna()), + "renumber_map_offsets": cast_to_tensor(df.renumber_map_offsets.dropna()), + "map": cast_to_tensor(df["map"].dropna()), + "minors": cast_to_tensor(df.minors.dropna()), + }, + reverse_hop_id=reverse_hop_id, + ) + + +def _create_homogeneous_blocks_from_csc( + tensors_dict: Dict[int, Dict[int, Dict[str, torch.Tensor]]], + renumber_map_list: List[torch.Tensor], + mfg_sizes: List[int, int], +): + """Create mini-batches of MFGs in the dgl.Block format. + The input arguments are the outputs of + the function `_process_sampled_df_csc`. + + Returns + ------- + output: list + A list of mini-batches. Each mini-batch is a list that consists of + `input_nodes` tensor, `output_nodes` tensor and a list of MFGs. + """ + n_batches, n_hops = len(mfg_sizes), len(mfg_sizes[0]) - 1 + output = [] + for b_id in range(n_batches): + output_batch = [] + output_batch.append(renumber_map_list[b_id]) + output_batch.append(renumber_map_list[b_id][: mfg_sizes[b_id][-1]]) + + mfgs = [ + SparseGraph( + size=(mfg_sizes[b_id][h_id], mfg_sizes[b_id][h_id + 1]), + src_ids=tensors_dict[b_id][h_id]["minors"], + cdst_ids=tensors_dict[b_id][h_id]["major_offsets"], + formats=["csc", "coo"], + reduce_memory=True, + ) + for h_id in range(n_hops) + ] + + blocks = [] + seednodes_range = None + for mfg in reversed(mfgs): + block_mfg = _create_homogeneous_dgl_block_from_tensor_d( + { + "sources": mfg.src_ids(), + "destinations": mfg.dst_ids(), + "sources_range": mfg._num_src_nodes - 1, + "destinations_range": mfg._num_dst_nodes - 1, + }, + renumber_map=renumber_map_list[b_id], + seednodes_range=seednodes_range, + ) + + seednodes_range = max( + mfg._num_src_nodes - 1, + mfg._num_dst_nodes - 1, + ) + blocks.append(block_mfg) + del mfgs + + blocks.reverse() + + output_batch.append(blocks) + + output.append(output_batch) + return 
output + + def _create_homogeneous_sparse_graphs_from_csc( tensors_dict: Dict[int, Dict[int, Dict[str, torch.Tensor]]], renumber_map_list: List[torch.Tensor], @@ -549,9 +658,35 @@ def _create_homogeneous_sparse_graphs_from_csc( return output -def create_homogeneous_sampled_graphs_from_dataframe_csc(sampled_df: cudf.DataFrame): +def create_homogeneous_sampled_graphs_from_dataframe_csc( + sampled_df: cudf.DataFrame, output_format: str = "cugraph_dgl.nn.SparseGraph" +): + """Public API to create mini-batches of MFGs using a dataframe output by + BulkSampler, where the sampled graph is compressed in CSC format.""" + if output_format == "cugraph_dgl.nn.SparseGraph": + return _create_homogeneous_sparse_graphs_from_csc( + *(_process_sampled_df_csc(sampled_df)), + ) + elif output_format == "dgl.Block": + return _create_homogeneous_blocks_from_csc( + *(_process_sampled_df_csc(sampled_df)), + ) + else: + raise ValueError(f"Invalid output format {output_format}") + + +def create_homogeneous_sampled_graphs_from_tensors_csc( + tensors: Dict["torch.Tensor"], output_format: str = "cugraph_dgl.nn.SparseGraph" +): """Public API to create mini-batches of MFGs using a dataframe output by BulkSampler, where the sampled graph is compressed in CSC format.""" - return _create_homogeneous_sparse_graphs_from_csc( - *(_process_sampled_df_csc(sampled_df)) - ) + if output_format == "cugraph_dgl.nn.SparseGraph": + return _create_homogeneous_sparse_graphs_from_csc( + *(_process_sampled_tensors_csc(tensors)), + ) + elif output_format == "dgl.Block": + return _create_homogeneous_blocks_from_csc( + *(_process_sampled_tensors_csc(tensors)), + ) + else: + raise ValueError(f"Invalid output format {output_format}") diff --git a/python/cugraph-dgl/cugraph_dgl/features.py b/python/cugraph-dgl/cugraph_dgl/features.py new file mode 100644 index 00000000000..9dc009f4127 --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/features.py @@ -0,0 +1,121 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +from cugraph.utilities.utils import import_optional, MissingModule + +torch = import_optional("torch") +dgl = import_optional("dgl") +wgth = import_optional("pylibwholegraph.torch") + + +class WholeFeatureStore( + object if isinstance(dgl, MissingModule) else dgl.storages.base.FeatureStorage +): + """ + Interface for feature storage. + """ + + def __init__( + self, + tensor: "torch.Tensor", + memory_type: str = "distributed", + location: str = "cpu", + ): + """ + Constructs a new WholeFeatureStore object that wraps a WholeGraph wholememory + distributed tensor. + + Parameters + ---------- + t: torch.Tensor + The local slice of the tensor being distributed. These should be in order + by rank (i.e. rank 0 contains elements 0-9, rank 1 contains elements 10-19, + rank 3 contains elements 20-29, etc.) The sizes do not need to be equal. + memory_type: str (optional, default='distributed') + The memory type of this store. Options are + 'distributed', 'chunked', and 'continuous'. + For more information consult the WholeGraph + documentation. + location: str(optional, default='cpu') + The location ('cpu' or 'cuda') where data is stored. 
+ """ + self.__wg_comm = wgth.get_global_communicator() + + if len(tensor.shape) > 2: + raise ValueError("Only 1-D or 2-D tensors are supported by WholeGraph.") + + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + + ld = torch.tensor(tensor.shape[0], device="cuda", dtype=torch.int64) + sizes = torch.empty((world_size,), device="cuda", dtype=torch.int64) + torch.distributed.all_gather_into_tensor(sizes, ld) + + sizes = sizes.cpu() + ld = sizes.sum() + + self.__td = -1 if len(tensor.shape) == 1 else tensor.shape[1] + global_shape = [ + int(ld), + self.__td if self.__td > 0 else 1, + ] + + if self.__td < 0: + tensor = tensor.reshape((tensor.shape[0], 1)) + + wg_tensor = wgth.create_wholememory_tensor( + self.__wg_comm, + memory_type, + location, + global_shape, + tensor.dtype, + [global_shape[1], 1], + ) + + offset = sizes[:rank].sum() if rank > 0 else 0 + + wg_tensor.scatter( + tensor.clone(memory_format=torch.contiguous_format).cuda(), + torch.arange( + offset, offset + tensor.shape[0], dtype=torch.int64, device="cuda" + ).contiguous(), + ) + + self.__wg_comm.barrier() + + self.__wg_tensor = wg_tensor + + def requires_ddp(self) -> bool: + return True + + def fetch( + self, + indices: torch.Tensor, + device: torch.cuda.Device, + pin_memory=False, + **kwargs, + ): + if pin_memory: + warnings.warn("pin_memory has no effect for WholeFeatureStorage.") + + t = self.__wg_tensor.gather( + indices.cuda(), + force_dtype=self.__wg_tensor.dtype, + ) + + if self.__td < 0: + t = t.reshape((t.shape[0],)) + + return t.to(torch.device(device)) diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py new file mode 100644 index 00000000000..2eba13c6958 --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/graph.py @@ -0,0 +1,910 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +from typing import Union, Optional, Dict, Tuple, List + +from cugraph.utilities.utils import import_optional +from cugraph.gnn import cugraph_comms_get_raft_handle + +import cupy +import pylibcugraph + +from cugraph_dgl.typing import TensorType +from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor +from cugraph_dgl.features import WholeFeatureStore +from cugraph_dgl.view import ( + HeteroNodeView, + HeteroNodeDataView, + HeteroEdgeView, + HeteroEdgeDataView, +) + + +# Have to use import_optional even though these are required +# dependencies in order to build properly. +dgl = import_optional("dgl") +torch = import_optional("torch") +tensordict = import_optional("tensordict") + +HOMOGENEOUS_NODE_TYPE = "n" +HOMOGENEOUS_EDGE_TYPE = (HOMOGENEOUS_NODE_TYPE, "e", HOMOGENEOUS_NODE_TYPE) + + +class Graph: + """ + cuGraph-backed duck-typed version of dgl.DGLGraph that distributes + the graph across workers. This object uses lazy graph creation. + Users can repeatedly call add_edges, and the tensors won't + be converted into a cuGraph graph until one is needed + (i.e. when creating a loader). Supports + single-node/single-GPU, single-node/multi-GPU, and + multi-node/multi-GPU graph storage. + + Each worker should have a slice of the graph locally, and + call put_edge_index with its slice. 
+ """ + + def __init__( + self, + is_multi_gpu: bool = False, + ndata_storage="torch", + edata_storage="torch", + **kwargs, + ): + """ + Parameters + ---------- + is_multi_gpu: bool (optional, default=False) + Specifies whether this graph is distributed across GPUs. + ndata_storage: str (optional, default='torch') + Specifies where node data should be stored + (options are 'torch' and 'wholegraph'). + If using PyTorch tensors for storage ('torch') + then data will be replicated across workers and data + for all nodes should be provided when calling add_nodes. + If using WholeGraph wholememory tensors for storage, + then data will be distributed across workers and only + the local slice of the data should be provided when + calling add_nodes. + edata_storage: str (optional, default='torch') + If using PyTorch tensors for storage ('torch') + then data will be replicated across workers and data + for all nodes should be provided when calling add_edge. + If using WholeGraph wholememory tensors for storage, + then data will be distributed across workers and only + the local slice of the data should be provided when + calling add_edges. + kwargs: + Optional kwargs for WholeGraph feature storage. 
+ """ + + if ndata_storage not in ("torch", "wholegraph"): + raise ValueError( + "Invalid node storage type (valid types are 'torch' and 'wholegraph')" + ) + if edata_storage not in ("torch", "wholegraph"): + raise ValueError( + "Invalid edge storage type (valid types are 'torch' and 'wholegraph')" + ) + + self.__num_nodes_dict = {} + self.__num_edges_dict = {} + self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,)) + + self.__graph = None + self.__vertex_offsets = None + self.__handle = None + self.__is_multi_gpu = is_multi_gpu + + self.__ndata_storage_type = ( + WholeFeatureStore + if ndata_storage == "wholegraph" + else dgl.storages.pytorch_tensor.PyTorchTensorStorage + ) + self.__edata_storage_type = ( + WholeFeatureStore + if edata_storage == "wholegraph" + else dgl.storages.pytorch_tensor.PyTorchTensorStorage + ) + self.__ndata_storage = {} + self.__edata_storage = {} + self.__wg_kwargs = kwargs + + @property + def is_multi_gpu(self): + return self.__is_multi_gpu + + def to_canonical_etype( + self, etype: Union[str, Tuple[str, str, str]] + ) -> Tuple[str, str, str]: + if etype is None: + if len(self.canonical_etypes) > 1: + raise ValueError("Edge type is required for heterogeneous graphs.") + return HOMOGENEOUS_EDGE_TYPE + + if isinstance(etype, tuple) and len(etype) == 3: + return etype + + for src_type, rel_type, dst_type in self.__edge_indices.keys( + leaves_only=True, include_nested=True + ): + if etype == rel_type: + return (src_type, rel_type, dst_type) + + raise ValueError("Unknown relation type " + etype) + + def add_nodes( + self, + global_num_nodes: int, + data: Optional[Dict[str, TensorType]] = None, + ntype: Optional[str] = None, + ): + """ + Adds the given number of nodes to this graph. Can only be called once + per node type. The number of nodes specified here refers to the total + number of nodes across all workers (the entire graph). If the backing + feature store is distributed (i.e. 
wholegraph), then only local features + should be passed to the data argument. If the backing feature store is + replicated, then features for all nodes in the graph should be passed to + the data argument, including those for nodes not on the local worker. + + Parameters + ---------- + global_num_nodes: int + The total number of nodes of the given type in this graph. + The same number should be passed to every worker. + data: Dict[str, TensorType] (optional, default=None) + Node feature tensors. + ntype: str (optional, default=None) + The node type being modified. Required for heterogeneous graphs. + """ + if ntype is None: + if len(self.__num_nodes_dict.keys()) > 1: + raise ValueError("Node type is required for heterogeneous graphs.") + ntype = HOMOGENEOUS_NODE_TYPE + + if ntype in self.__num_nodes_dict: + raise ValueError( + "Calling add_nodes multiple types for the same " + "node type is not allowed in cuGraph-DGL" + ) + + if self.is_multi_gpu: + # Ensure all nodes got the same number of nodes passed + world_size = torch.distributed.get_world_size() + local_size = torch.tensor( + [global_num_nodes], device="cuda", dtype=torch.int64 + ) + ns = torch.empty((world_size,), device="cuda", dtype=torch.int64) + torch.distributed.all_gather_into_tensor(ns, local_size) + if not (ns == global_num_nodes).all(): + raise ValueError("The global number of nodes must match on all workers") + + # Ensure the sum of the feature shapes equals the global number of nodes. + if data is not None: + for feature_name, feature_tensor in data.items(): + features_size = torch.tensor( + [int(feature_tensor.shape[0])], device="cuda", dtype=torch.int64 + ) + torch.distributed.all_reduce( + features_size, op=torch.distributed.ReduceOp.SUM + ) + if features_size != global_num_nodes: + raise ValueError( + "The total length of the feature vector across workers must" + " match the global number of nodes but it does not " + f"match for {feature_name}." 
+ ) + + self.__num_nodes_dict[ntype] = global_num_nodes + + if data is not None: + for feature_name, feature_tensor in data.items(): + self.__ndata_storage[ntype, feature_name] = self.__ndata_storage_type( + _cast_to_torch_tensor(feature_tensor), **self.__wg_kwargs + ) + + self.__graph = None + self.__vertex_offsets = None + + def __check_node_ids(self, ntype: str, ids: TensorType): + """ + Ensures all node ids in the provided id tensor are valid. + Raises a ValueError if any are invalid. + + Parameters + ---------- + ntype: str + The node type being validated against. + ids: + The tensor of ids being validated. + """ + if ntype in self.__num_nodes_dict: + if ids.max() + 1 > self.num_nodes(ntype): + raise ValueError( + f"input tensor contains invalid node ids for type {ntype}" + ) + else: + raise ValueError( + f"add_nodes() must be called for type {ntype} before calling num_edges." + ) + + def add_edges( + self, + u: TensorType, + v: TensorType, + data: Optional[Dict[str, TensorType]] = None, + etype: Optional[Union[str, Tuple[str, str, str]]] = None, + ) -> None: + """ + Adds edges to this graph. Must be called after add_nodes + is called for the src/dst node type. If the backing feature + store is distributed (i.e. wholegraph), then only local + features should be passed to the data argument. If the + backing feature store is replicated, then features for + all edges should be passed to the data argument, + including those for edges not on the local worker. + + Parameters + ---------- + u: TensorType + 1d tensor of source node ids (local slice of the distributed edgelist). + v: TensorType + 1d tensor of destination node ids (local slice of the distributed edgelist). + data: Dict[str, TensorType] (optional, default=None) + Dictionary containing edge features for the new edges. + etype: Union[str, Tuple[str, str, str]] + The edge type of the edges being inserted. Not required + for homogeneous graphs, which have only one edge type. 
+ """ + + # Validate all inputs before proceeding + # The number of nodes for the src/dst type needs to be known and there cannot + # be any edges of this type in the graph. + dgl_can_edge_type = self.to_canonical_etype(etype) + src_type, _, dst_type = dgl_can_edge_type + if dgl_can_edge_type in self.__edge_indices.keys( + leaves_only=True, include_nested=True + ): + raise ValueError( + "This cuGraph-DGL graph already contains edges of type" + f" {dgl_can_edge_type}. Calling add_edges multiple times" + " for the same edge type is not supported." + ) + self.__check_node_ids(src_type, u) + self.__check_node_ids(dst_type, v) + + self.__edge_indices[dgl_can_edge_type] = torch.stack( + [ + _cast_to_torch_tensor(u), + _cast_to_torch_tensor(v), + ] + ).to(self.idtype) + + if data is not None: + for attr_name, attr_tensor in data.items(): + self.__edata_storage[ + dgl_can_edge_type, attr_name + ] = self.__edata_storage_type( + _cast_to_torch_tensor(attr_tensor), **self.__wg_kwargs + ) + + num_edges = self.__edge_indices[dgl_can_edge_type].shape[1] + if self.is_multi_gpu: + num_edges = torch.tensor([num_edges], device="cuda", dtype=torch.int64) + torch.distributed.all_reduce(num_edges, op=torch.distributed.ReduceOp.SUM) + + self.__num_edges_dict[dgl_can_edge_type] = int(num_edges) + + self.__graph = None + self.__vertex_offsets = None + + def num_nodes(self, ntype: str = None) -> int: + """ + Returns the number of nodes of ntype, or if ntype is not provided, + the total number of nodes in the graph. + """ + if ntype is None: + return sum(self.__num_nodes_dict.values()) + + return self.__num_nodes_dict[ntype] + + def number_of_nodes(self, ntype: str = None) -> int: + """ + Alias for num_nodes. + """ + return self.num_nodes(ntype=ntype) + + def num_edges(self, etype: Union[str, Tuple[str, str, str]] = None) -> int: + """ + Returns the number of edges of etype, or if etype is not provided, + the total number of edges in the graph. 
+ """ + if etype is None: + return sum(self.__num_edges_dict.values()) + + etype = self.to_canonical_etype(etype) + return self.__num_edges_dict[etype] + + def number_of_edges(self, etype: Union[str, Tuple[str, str, str]] = None) -> int: + """ + Alias for num_edges. + """ + return self.num_edges(etype=etype) + + @property + def ntypes(self) -> List[str]: + """ + Returns the node type names in this graph. + """ + return list(self.__num_nodes_dict.keys()) + + @property + def etypes(self) -> List[str]: + """ + Returns the edge type names in this graph + (the second element of the canonical edge + type tuple). + """ + return [et[1] for et in self.__num_edges_dict.keys()] + + @property + def canonical_etypes(self) -> List[str]: + """ + Returns the canonical edge type names in this + graph. + """ + return list(self.__num_edges_dict.keys()) + + @property + def _vertex_offsets(self) -> Dict[str, int]: + if self.__vertex_offsets is None: + ordered_keys = sorted(list(self.ntypes)) + self.__vertex_offsets = {} + offset = 0 + for vtype in ordered_keys: + self.__vertex_offsets[vtype] = offset + offset += self.num_nodes(vtype) + + return dict(self.__vertex_offsets) + + def __get_edgelist(self) -> Dict[str, "torch.Tensor"]: + """ + This function always returns src/dst labels with respect + to the out direction. + + Returns + ------- + Dict[str, torch.Tensor] with the following keys: + src: source vertices (int64) + Note that src is the 1st element of the DGL edge index. + dst: destination vertices (int64) + Note that dst is the 2nd element of the DGL edge index. + eid: edge ids for each edge (int64) + Note that these start from 0 for each edge type. + etp: edge types for each edge (int32) + Note that these are in lexicographic order. + """ + sorted_keys = sorted( + list(self.__edge_indices.keys(leaves_only=True, include_nested=True)) + ) + + # note that this still follows the DGL convention of (src, rel, dst) + # i.e. 
(author, writes, paper): [[0,1,2],[2,0,1]] is referring to a + # cuGraph graph where (paper 2) -> (author 0), (paper 0) -> (author 1), + # and (paper 1) -> (author 0) + edge_index = torch.concat( + [ + torch.stack( + [ + self.__edge_indices[src_type, rel_type, dst_type][0] + + self._vertex_offsets[src_type], + self.__edge_indices[src_type, rel_type, dst_type][1] + + self._vertex_offsets[dst_type], + ] + ) + for (src_type, rel_type, dst_type) in sorted_keys + ], + axis=1, + ).cuda() + + edge_type_array = torch.arange( + len(sorted_keys), dtype=torch.int32, device="cuda" + ).repeat_interleave( + torch.tensor( + [self.__edge_indices[et].shape[1] for et in sorted_keys], + device="cuda", + dtype=torch.int32, + ) + ) + + if self.is_multi_gpu: + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + + num_edges_t = torch.tensor( + [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda" + ) + num_edges_all_t = torch.empty( + world_size, num_edges_t.numel(), dtype=torch.int64, device="cuda" + ) + torch.distributed.all_gather_into_tensor(num_edges_all_t, num_edges_t) + + if rank > 0: + start_offsets = num_edges_all_t[:rank].T.sum(axis=1) + edge_id_array = torch.concat( + [ + torch.arange( + start_offsets[i], + start_offsets[i] + num_edges_all_t[rank][i], + dtype=torch.int64, + device="cuda", + ) + for i in range(len(sorted_keys)) + ] + ) + else: + edge_id_array = torch.concat( + [ + torch.arange( + self.__edge_indices[et].shape[1], + dtype=torch.int64, + device="cuda", + ) + for et in sorted_keys + ] + ) + + else: + # single GPU + edge_id_array = torch.concat( + [ + torch.arange( + self.__edge_indices[et].shape[1], + dtype=torch.int64, + device="cuda", + ) + for et in sorted_keys + ] + ) + + return { + "src": edge_index[0], + "dst": edge_index[1], + "etp": edge_type_array, + "eid": edge_id_array, + } + + @property + def is_homogeneous(self): + return len(self.__num_edges_dict) <= 1 and len(self.__num_nodes_dict) <= 1 + + 
@property + def idtype(self): + return torch.int64 + + @property + def _resource_handle(self): + if self.__handle is None: + if self.is_multi_gpu: + self.__handle = pylibcugraph.ResourceHandle( + cugraph_comms_get_raft_handle().getHandle() + ) + else: + self.__handle = pylibcugraph.ResourceHandle() + return self.__handle + + def _graph( + self, direction: str + ) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: + """ + Gets the pylibcugraph Graph object with edges pointing in the given direction + (i.e. 'out' is standard, 'in' is reverse). + """ + + if direction not in ["out", "in"]: + raise ValueError(f"Invalid direction {direction} (expected 'in' or 'out').") + + graph_properties = pylibcugraph.GraphProperties( + is_multigraph=True, is_symmetric=False + ) + + if self.__graph is not None and self.__graph[1] != direction: + self.__graph = None + + if self.__graph is None: + src_col, dst_col = ("src", "dst") if direction == "out" else ("dst", "src") + edgelist_dict = self.__get_edgelist() + + if self.is_multi_gpu: + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + + vertices_array = cupy.arange(self.num_nodes(), dtype="int64") + vertices_array = cupy.array_split(vertices_array, world_size)[rank] + + self.__graph = ( + pylibcugraph.MGGraph( + self._resource_handle, + graph_properties, + [cupy.asarray(edgelist_dict[src_col]).astype("int64")], + [cupy.asarray(edgelist_dict[dst_col]).astype("int64")], + vertices_array=[vertices_array], + edge_id_array=[cupy.asarray(edgelist_dict["eid"])], + edge_type_array=[cupy.asarray(edgelist_dict["etp"])], + ), + direction, + ) + else: + self.__graph = ( + pylibcugraph.SGGraph( + self._resource_handle, + graph_properties, + cupy.asarray(edgelist_dict[src_col]).astype("int64"), + cupy.asarray(edgelist_dict[dst_col]).astype("int64"), + vertices_array=cupy.arange(self.num_nodes(), dtype="int64"), + edge_id_array=cupy.asarray(edgelist_dict["eid"]), + 
edge_type_array=cupy.asarray(edgelist_dict["etp"]), + ), + direction, + ) + + return self.__graph[0] + + def _has_n_emb(self, ntype: str, emb_name: str) -> bool: + return (ntype, emb_name) in self.__ndata_storage + + def _get_n_emb( + self, ntype: str, emb_name: str, u: Union[str, TensorType] + ) -> "torch.Tensor": + """ + Gets the embedding of a single node type. + Unlike DGL, this function takes the string node + type name instead of an integer id. + + Parameters + ---------- + ntype: str + The node type to get the embedding of. + emb_name: str + The embedding name of the embedding to get. + u: Union[str, TensorType] + Nodes to get the representation of, or ALL + to get the representation of all nodes of + the given type. + + Returns + ------- + torch.Tensor + The embedding of the given edge type with the given embedding name. + """ + + if ntype is None: + if len(self.ntypes) == 1: + ntype = HOMOGENEOUS_NODE_TYPE + else: + raise ValueError("Must provide the node type for a heterogeneous graph") + + if dgl.base.is_all(u): + u = torch.arange(self.num_nodes(ntype), dtype=self.idtype, device="cpu") + + try: + return self.__ndata_storage[ntype, emb_name].fetch( + _cast_to_torch_tensor(u), "cuda" + ) + except RuntimeError as ex: + warnings.warn( + "Got error accessing data, trying again with index on device: " + + str(ex) + ) + return self.__ndata_storage[ntype, emb_name].fetch( + _cast_to_torch_tensor(u).cuda(), "cuda" + ) + + def _has_e_emb(self, etype: Tuple[str, str, str], emb_name: str) -> bool: + return (etype, emb_name) in self.__edata_storage + + def _get_e_emb( + self, etype: Tuple[str, str, str], emb_name: str, u: Union[str, TensorType] + ) -> "torch.Tensor": + """ + Gets the embedding of a single edge type. + Unlike DGL, this function takes the canonical edge type + instead of an integer id. + + Parameters + ---------- + etype: str + The edge type to get the embedding of. + emb_name: str + The embedding name of the embedding to get. 
+ u: Union[str, TensorType] + Edges to get the representation of, or ALL to + get the representation of all nodes of the + given type. + + Returns + ------- + torch.Tensor + The embedding of the given edge type with the given embedding name. + """ + + etype = self.to_canonical_etype(etype) + + if dgl.base.is_all(u): + u = torch.arange(self.num_edges(etype), dtype=self.idtype, device="cpu") + + try: + return self.__edata_storage[etype, emb_name].fetch( + _cast_to_torch_tensor(u), "cuda" + ) + except RuntimeError as ex: + warnings.warn( + "Got error accessing data, trying again with index on device: " + + str(ex) + ) + return self.__edata_storage[etype, emb_name].fetch( + _cast_to_torch_tensor(u).cuda(), "cuda" + ) + + def _set_n_emb( + self, ntype: str, u: Union[str, TensorType], kv: Dict[str, TensorType] + ) -> None: + """ + Stores or updates the embedding(s) of a single node type. + Unlike DGL, this function takes the string node type name + instead of an integer id. + + The semantics of this function match those of add_nodes + with respect to whether or not the backing feature store + is distributed. + + Parameters + ---------- + ntype: str + The node type to store an embedding of. + u: Union[str, TensorType] + The indices to update, if updating the embedding. + Currently, updating a slice of an embedding is + unsupported, so this should be ALL. + kv: Dict[str, TensorType] + A mapping of embedding names to embedding tensors. + """ + + if not dgl.base.is_all(u): + raise NotImplementedError( + "Updating a slice of an embedding is " + "currently unimplemented in cuGraph-DGL." + ) + + for k, v in kv: + self.__ndata_storage[ntype, k] = self.__ndata_storage_type( + v, + **self.__wg_kwargs, + ) + + def _set_e_emb( + self, etype: str, u: Union[str, TensorType], kv: Dict[str, TensorType] + ) -> None: + """ + Stores or updates the embedding(s) of a single edge type. + Unlike DGL, this function takes the canonical edge type name + instead of an integer id. 
+ + The semantics of this function match those of add_edges + with respect to whether or not the backing feature store + is distributed. + + Parameters + ---------- + etype: str + The edge type to store an embedding of. + u: Union[str, TensorType] + The indices to update, if updating the embedding. + Currently, updating a slice of an embedding is + unsupported, so this should be ALL. + kv: Dict[str, TensorType] + A mapping of embedding names to embedding tensors. + """ + + if not dgl.base.is_all(u): + raise NotImplementedError( + "Updating a slice of an embedding is " + "currently unimplemented in cuGraph-DGL." + ) + + for k, v in kv: + self.__edata_storage[etype, k] = self.__edata_storage_type( + v, + **self.__wg_kwargs, + ) + + def _pop_n_emb(self, ntype: str, key: str) -> "torch.Tensor": + """ + Removes and returns the embedding of the given node + type with the given name. + + Parameters + ---------- + ntype:str + The node type. + key:str + The embedding name. + + Returns + ------- + The removed embedding. + """ + return self.__ndata_storage[ntype, key].pop(key) + + def _pop_e_emb(self, etype: str, key: str) -> "torch.Tensor": + """ + Removes and returns the embedding of the given edge + type with the given name. + + Parameters + ---------- + etype:str + The node type. + key:str + The embedding name. + + Returns + ------- + torch.Tensor + The removed embedding. + """ + return self.__edata_storage[etype, key].pop(key) + + def _get_n_emb_keys(self, ntype: str) -> List[str]: + """ + Gets a list of the embedding names for a given node + type. + + Parameters + ---------- + ntype: str + The node type to get embedding names for. + + Returns + ------- + List[str] + The list of embedding names for the given node type. + """ + return [k for (t, k) in self.__ndata_storage if ntype == t] + + def _get_e_emb_keys(self, etype: str) -> List[str]: + """ + Gets a list of the embedding names for a given edge + type. 
+ + Parameters + ---------- + etype: str + The edge type to get embedding names for. + + Returns + ------- + List[str] + The list of embedding names for the given edge type. + """ + return [k for (t, k) in self.__edata_storage if etype == t] + + def all_edges( + self, + form="uv", + order="eid", + etype: Union[str, Tuple[str, str, str]] = None, + device: Union[str, int, "torch.device"] = "cpu", + ): + """ + Returns all edges with the specified edge type. + cuGraph-DGL currently only supports 'eid' format and + 'eid' order. + + Parameters + ---------- + form: str (optional, default='uv') + The format to return ('uv', 'eid', 'all'). + + order: str (optional, default='eid') + The order to return edges in ('eid', 'srcdst') + cuGraph-DGL currently only supports 'eid'. + etype: Union[str, Tuple[str, str, str]] (optional, default=None) + The edge type to get. Not required if this is + a homogeneous graph. Can be the relation type if the + relation type is unique, or the canonical edge type. + device: Union[str, int, torch.device] (optional, default='cpu') + The device where returned edges should be stored + ('cpu', 'cuda', or device id). + """ + + if order != "eid": + raise NotImplementedError("cugraph-DGL only supports eid order.") + + if etype is None and len(self.canonical_etypes) > 1: + raise ValueError("Edge type is required for heterogeneous graphs.") + + etype = self.to_canonical_etype(etype) + + if form == "eid": + return torch.arange( + 0, + self.__num_edges_dict[etype], + dtype=self.idtype, + device=device, + ) + else: + if self.is_multi_gpu: + # This can't be done because it requires collective communication. + raise ValueError( + "Calling all_edges in a distributed graph with" + " form 'uv' or 'all' is unsupported." 
+ ) + + else: + eix = self.__edge_indices[etype].to(device) + if form == "uv": + return eix[0], eix[1] + elif form == "all": + return ( + eix[0], + eix[1], + torch.arange( + self.__num_edges_dict[etype], + dtype=self.idtype, + device=device, + ), + ) + else: + raise ValueError(f"Invalid form {form}") + + @property + def ndata(self) -> HeteroNodeDataView: + """ + Returns a view of the node data in this graph which can be used to + access or modify node features. + """ + + if len(self.ntypes) == 1: + ntype = self.ntypes[0] + return HeteroNodeDataView(self, ntype, dgl.base.ALL) + + return HeteroNodeDataView(self, self.ntypes, dgl.base.ALL) + + @property + def edata(self) -> HeteroEdgeDataView: + """ + Returns a view of the edge data in this graph which can be used to + access or modify edge features. + """ + if len(self.canonical_etypes) == 1: + return HeteroEdgeDataView(self, None, dgl.base.ALL) + + return HeteroEdgeDataView(self, self.canonical_etypes, dgl.base.ALL) + + @property + def nodes(self) -> HeteroNodeView: + """ + Returns a view of the nodes in this graph. + """ + return HeteroNodeView(self) + + @property + def edges(self) -> HeteroEdgeView: + """ + Returns a view of the edges in this graph. + """ + return HeteroEdgeView(self) diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py index ddd95a76366..d2460f814c9 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,6 +15,8 @@ from cugraph.utilities.utils import import_optional +import cugraph_dgl + torch = import_optional("torch") ops_torch = import_optional("pylibcugraphops.pytorch") dgl = import_optional("dgl") @@ -255,6 +257,27 @@ def __repr__(self) -> str: f"num_edges={self._src_ids.size(0)}, formats={self._formats})" ) + def to(self, device: Union[torch.device, str, int]) -> "cugraph_dgl.nn.SparseGraph": + sg = SparseGraph( + src_ids=None if self._src_ids is None else self._src_ids.to(device), + dst_ids=None if self._dst_ids is None else self._dst_ids.to(device), + csrc_ids=None if self._csrc_ids is None else self._csrc_ids.to(device), + cdst_ids=None if self._cdst_ids is None else self._cdst_ids.to(device), + values=None if self._values is None else self._values.to(device), + is_sorted=self._is_sorted, + formats=self._formats, + reduce_memory=self._reduce_memory, + ) + + sg._perm_coo2csc = ( + None if self._perm_coo2csc is None else self._perm_coo2csc.to(device) + ) + sg._perm_csc2csr = ( + None if self._perm_csc2csr is None else self._perm_csc2csr.to(device) + ) + + return sg + class BaseConv(torch.nn.Module): r"""An abstract base class for cugraph-ops nn module.""" diff --git a/python/cugraph-dgl/tests/__init__.py b/python/cugraph-dgl/cugraph_dgl/tests/__init__.py similarity index 100% rename from python/cugraph-dgl/tests/__init__.py rename to python/cugraph-dgl/cugraph_dgl/tests/__init__.py diff --git a/python/cugraph-dgl/tests/conftest.py b/python/cugraph-dgl/cugraph_dgl/tests/conftest.py similarity index 100% rename from python/cugraph-dgl/tests/conftest.py rename to python/cugraph-dgl/cugraph_dgl/tests/conftest.py diff --git a/python/cugraph-dgl/tests/test_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py similarity index 98% rename from python/cugraph-dgl/tests/test_dataloader.py rename to python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py index 
cc473cd0ad6..e2542657de4 100644 --- a/python/cugraph-dgl/tests/test_dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -52,7 +52,7 @@ def sample_cugraph_dgl_graphs(cugraph_gs, train_nid, fanouts): sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts) tempdir_object = tempfile.TemporaryDirectory() sampling_output_dir = tempdir_object - dataloader = cugraph_dgl.dataloading.DataLoader( + dataloader = cugraph_dgl.dataloading.DaskDataLoader( cugraph_gs, train_nid, sampler, diff --git a/python/cugraph-dgl/tests/mg/test_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py similarity index 97% rename from python/cugraph-dgl/tests/mg/test_dataloader.py rename to python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py index 29b7e1c3412..d49e1293e77 100644 --- a/python/cugraph-dgl/tests/mg/test_dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -51,7 +51,7 @@ def sample_cugraph_dgl_graphs(cugraph_gs, train_nid, fanouts): sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts) tempdir_object = tempfile.TemporaryDirectory() sampling_output_dir = tempdir_object - dataloader = cugraph_dgl.dataloading.DataLoader( + dataloader = cugraph_dgl.dataloading.DaskDataLoader( cugraph_gs, train_nid, sampler, diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py new file mode 100644 index 00000000000..ef47875463d --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py @@ -0,0 +1,128 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import cugraph_dgl.dataloading +import pytest + +import cugraph_dgl + +from cugraph.datasets import karate +from cugraph.utilities.utils import import_optional, MissingModule + +import numpy as np + +torch = import_optional("torch") +dgl = import_optional("dgl") + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") +def test_dataloader_basic_homogeneous(): + graph = cugraph_dgl.Graph(is_multi_gpu=False) + + num_nodes = karate.number_of_nodes() + graph.add_nodes(num_nodes, data={"z": torch.arange(num_nodes)}) + + edf = karate.get_edgelist() + graph.add_edges( + u=edf["src"], v=edf["dst"], data={"q": torch.arange(karate.number_of_edges())} + ) + + sampler = cugraph_dgl.dataloading.NeighborSampler([5, 5, 5]) + loader = cugraph_dgl.dataloading.FutureDataLoader( + graph, torch.arange(num_nodes), sampler, batch_size=2 + ) + + for in_t, out_t, blocks in loader: + assert len(blocks) == 3 + assert len(out_t) <= 2 + + +def sample_dgl_graphs(g, train_nid, fanouts, batch_size=1): + # Single fanout to match cugraph + sampler = dgl.dataloading.NeighborSampler(fanouts) + dataloader = dgl.dataloading.DataLoader( + g, + train_nid, + sampler, + batch_size=batch_size, + shuffle=False, + drop_last=False, + num_workers=0, + ) + + dgl_output = {} + for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader): + dgl_output[batch_id] = { + "input_nodes": input_nodes, + "output_nodes": output_nodes, + "blocks": blocks, + } + return dgl_output + + +def sample_cugraph_dgl_graphs(cugraph_g, train_nid, fanouts, batch_size=1): + sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts) + + dataloader = cugraph_dgl.dataloading.FutureDataLoader( + cugraph_g, + train_nid, + sampler, + batch_size=batch_size, + drop_last=False, + shuffle=False, + ) + + cugraph_dgl_output = {} + for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader): + 
cugraph_dgl_output[batch_id] = { + "input_nodes": input_nodes, + "output_nodes": output_nodes, + "blocks": blocks, + } + return cugraph_dgl_output + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") +@pytest.mark.parametrize("ix", [[1], [1, 0]]) +@pytest.mark.parametrize("batch_size", [1, 2]) +def test_same_homogeneousgraph_results(ix, batch_size): + src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]) + dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1]) + + train_nid = torch.tensor(ix) + # Create a heterograph with 3 node types and 3 edges types. + dgl_g = dgl.graph((src, dst)) + + cugraph_g = cugraph_dgl.Graph(is_multi_gpu=False) + cugraph_g.add_nodes(9) + cugraph_g.add_edges(u=src, v=dst) + + dgl_output = sample_dgl_graphs(dgl_g, train_nid, [2], batch_size=batch_size) + cugraph_output = sample_cugraph_dgl_graphs(cugraph_g, train_nid, [2], batch_size) + + cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy() + dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy() + + np.testing.assert_array_equal( + np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes) + ) + assert ( + dgl_output[0]["blocks"][0].num_dst_nodes() + == cugraph_output[0]["blocks"][0].num_dst_nodes() + ) + assert ( + dgl_output[0]["blocks"][0].num_edges() + == cugraph_output[0]["blocks"][0].num_edges() + ) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py new file mode 100644 index 00000000000..b32233f16a6 --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py @@ -0,0 +1,181 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import numpy as np + +import cugraph_dgl + +from cugraph.datasets import karate +from cugraph.utilities.utils import import_optional, MissingModule + +from cugraph.gnn import ( + cugraph_comms_create_unique_id, + cugraph_comms_shutdown, +) + +from cugraph_dgl.tests.utils import init_pytorch_worker + +torch = import_optional("torch") +dgl = import_optional("dgl") + + +def run_test_dataloader_basic_homogeneous(rank, world_size, uid): + init_pytorch_worker(rank, world_size, uid) + + graph = cugraph_dgl.Graph(is_multi_gpu=True) + + num_nodes = karate.number_of_nodes() + graph.add_nodes( + num_nodes, + ) + + edf = karate.get_edgelist() + graph.add_edges( + u=torch.tensor_split(torch.as_tensor(edf["src"], device="cuda"), world_size)[ + rank + ], + v=torch.tensor_split(torch.as_tensor(edf["dst"], device="cuda"), world_size)[ + rank + ], + ) + + sampler = cugraph_dgl.dataloading.NeighborSampler([5, 5, 5]) + loader = cugraph_dgl.dataloading.FutureDataLoader( + graph, + torch.arange(num_nodes), + sampler, + batch_size=2, + use_ddp=True, + ) + + for in_t, out_t, blocks in loader: + assert len(blocks) == 3 + assert len(out_t) <= 2 + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") +def test_dataloader_basic_homogeneous(): + uid = cugraph_comms_create_unique_id() + # Limit the number of GPUs this rest is run with + world_size = min(torch.cuda.device_count(), 4) + + torch.multiprocessing.spawn( + run_test_dataloader_basic_homogeneous, 
+ args=( + world_size, + uid, + ), + nprocs=world_size, + ) + + +def sample_dgl_graphs(g, train_nid, fanouts, batch_size=1): + # Single fanout to match cugraph + sampler = dgl.dataloading.NeighborSampler(fanouts) + dataloader = dgl.dataloading.DataLoader( + g, + train_nid, + sampler, + batch_size=batch_size, + shuffle=False, + drop_last=False, + num_workers=0, + ) + + dgl_output = {} + for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader): + dgl_output[batch_id] = { + "input_nodes": input_nodes, + "output_nodes": output_nodes, + "blocks": blocks, + } + return dgl_output + + +def sample_cugraph_dgl_graphs(cugraph_g, train_nid, fanouts, batch_size=1): + sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts) + + dataloader = cugraph_dgl.dataloading.FutureDataLoader( + cugraph_g, + train_nid, + sampler, + batch_size=batch_size, + drop_last=False, + shuffle=False, + ) + + cugraph_dgl_output = {} + for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader): + cugraph_dgl_output[batch_id] = { + "input_nodes": input_nodes, + "output_nodes": output_nodes, + "blocks": blocks, + } + return cugraph_dgl_output + + +def run_test_same_homogeneousgraph_results(rank, world_size, uid, ix, batch_size): + init_pytorch_worker(rank, world_size, uid) + + src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]) + dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1]) + + local_src = torch.tensor_split(src, world_size)[rank] + local_dst = torch.tensor_split(dst, world_size)[rank] + + train_nid = torch.tensor(ix) + # Create a heterograph with 3 node types and 3 edges types. 
+ dgl_g = dgl.graph((src, dst)) + + cugraph_g = cugraph_dgl.Graph(is_multi_gpu=True) + cugraph_g.add_nodes(9) + cugraph_g.add_edges(u=local_src, v=local_dst) + + dgl_output = sample_dgl_graphs(dgl_g, train_nid, [2], batch_size=batch_size) + cugraph_output = sample_cugraph_dgl_graphs(cugraph_g, train_nid, [2], batch_size) + + cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy() + dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy() + + np.testing.assert_array_equal( + np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes) + ) + assert ( + dgl_output[0]["blocks"][0].num_dst_nodes() + == cugraph_output[0]["blocks"][0].num_dst_nodes() + ) + assert ( + dgl_output[0]["blocks"][0].num_edges() + == cugraph_output[0]["blocks"][0].num_edges() + ) + + cugraph_comms_shutdown() + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") +@pytest.mark.parametrize("ix", [[1], [1, 0]]) +@pytest.mark.parametrize("batch_size", [1, 2]) +def test_same_homogeneousgraph_results_mg(ix, batch_size): + uid = cugraph_comms_create_unique_id() + # Limit the number of GPUs this rest is run with + world_size = min(torch.cuda.device_count(), 4) + + torch.multiprocessing.spawn( + run_test_same_homogeneousgraph_results, + args=(world_size, uid, ix, batch_size), + nprocs=world_size, + ) diff --git a/python/cugraph-dgl/tests/test_dataset.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataset.py similarity index 100% rename from python/cugraph-dgl/tests/test_dataset.py rename to python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataset.py diff --git a/python/cugraph-dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py similarity index 100% rename from python/cugraph-dgl/tests/nn/test_gatconv.py rename to python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py diff --git 
a/python/cugraph-dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py similarity index 100% rename from python/cugraph-dgl/tests/nn/test_gatv2conv.py rename to python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py diff --git a/python/cugraph-dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py similarity index 100% rename from python/cugraph-dgl/tests/nn/test_relgraphconv.py rename to python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py diff --git a/python/cugraph-dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py similarity index 100% rename from python/cugraph-dgl/tests/nn/test_sageconv.py rename to python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py diff --git a/python/cugraph-dgl/tests/nn/test_sparsegraph.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sparsegraph.py similarity index 100% rename from python/cugraph-dgl/tests/nn/test_sparsegraph.py rename to python/cugraph-dgl/cugraph_dgl/tests/nn/test_sparsegraph.py diff --git a/python/cugraph-dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py similarity index 100% rename from python/cugraph-dgl/tests/nn/test_transformerconv.py rename to python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py diff --git a/python/cugraph-dgl/tests/test_cugraph_storage.py b/python/cugraph-dgl/cugraph_dgl/tests/test_cugraph_storage.py similarity index 100% rename from python/cugraph-dgl/tests/test_cugraph_storage.py rename to python/cugraph-dgl/cugraph_dgl/tests/test_cugraph_storage.py diff --git a/python/cugraph-dgl/tests/test_from_dgl_heterograph.py b/python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py similarity index 83% rename from python/cugraph-dgl/tests/test_from_dgl_heterograph.py rename to python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py index 128d9bfaca5..667a4a2e66d 100644 --- 
a/python/cugraph-dgl/tests/test_from_dgl_heterograph.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -20,7 +20,9 @@ from cugraph.utilities.utils import import_optional from .utils import ( assert_same_edge_feats, + assert_same_edge_feats_daskapi, assert_same_node_feats, + assert_same_node_feats_daskapi, assert_same_num_edges_can_etypes, assert_same_num_edges_etypes, assert_same_num_nodes, @@ -134,7 +136,7 @@ def create_heterograph4(idtype): @pytest.mark.parametrize("idxtype", [th.int32, th.int64]) -def test_heterograph_conversion_nodes(idxtype): +def test_heterograph_conversion_nodes_daskapi(idxtype): graph_fs = [ create_heterograph1, create_heterograph2, @@ -145,6 +147,39 @@ def test_heterograph_conversion_nodes(idxtype): g = graph_f(idxtype) gs = cugraph_dgl.cugraph_storage_from_heterograph(g) + assert_same_num_nodes(gs, g) + assert_same_node_feats_daskapi(gs, g) + + +@pytest.mark.parametrize("idxtype", [th.int32, th.int64]) +def test_heterograph_conversion_edges_daskapi(idxtype): + graph_fs = [ + create_heterograph1, + create_heterograph2, + create_heterograph3, + create_heterograph4, + ] + for graph_f in graph_fs: + g = graph_f(idxtype) + gs = cugraph_dgl.cugraph_storage_from_heterograph(g) + + assert_same_num_edges_can_etypes(gs, g) + assert_same_num_edges_etypes(gs, g) + assert_same_edge_feats_daskapi(gs, g) + + +@pytest.mark.parametrize("idxtype", [th.int32, th.int64]) +def test_heterograph_conversion_nodes(idxtype): + graph_fs = [ + create_heterograph1, + create_heterograph2, + create_heterograph3, + create_heterograph4, + ] + for graph_f in graph_fs: + g = graph_f(idxtype) + gs = cugraph_dgl.cugraph_dgl_graph_from_heterograph(g) + 
assert_same_num_nodes(gs, g) assert_same_node_feats(gs, g) @@ -159,7 +194,7 @@ def test_heterograph_conversion_edges(idxtype): ] for graph_f in graph_fs: g = graph_f(idxtype) - gs = cugraph_dgl.cugraph_storage_from_heterograph(g) + gs = cugraph_dgl.cugraph_dgl_graph_from_heterograph(g) assert_same_num_edges_can_etypes(gs, g) assert_same_num_edges_etypes(gs, g) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py new file mode 100644 index 00000000000..a60db97b8d6 --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py @@ -0,0 +1,217 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import cugraph_dgl +import pylibcugraph +import cupy +import numpy as np + +from cugraph.datasets import karate +from cugraph.utilities.utils import import_optional, MissingModule + +torch = import_optional("torch") +dgl = import_optional("dgl") + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") +@pytest.mark.parametrize("direction", ["out", "in"]) +def test_graph_make_homogeneous_graph(direction): + df = karate.get_edgelist() + df.src = df.src.astype("int64") + df.dst = df.dst.astype("int64") + wgt = np.random.random((len(df),)) + + graph = cugraph_dgl.Graph() + num_nodes = max(df.src.max(), df.dst.max()) + 1 + node_x = np.random.random((num_nodes,)) + + graph.add_nodes( + num_nodes, data={"num": torch.arange(num_nodes, dtype=torch.int64), "x": node_x} + ) + graph.add_edges(df.src, df.dst, {"weight": wgt}) + plc_dgl_graph = graph._graph(direction=direction) + + assert graph.num_nodes() == num_nodes + assert graph.num_edges() == len(df) + assert graph.is_homogeneous + assert not graph.is_multi_gpu + + assert ( + graph.nodes() == torch.arange(num_nodes, dtype=torch.int64, device="cuda") + ).all() + + assert graph.nodes[None]["x"] is not None + assert (graph.nodes[None]["x"] == torch.as_tensor(node_x, device="cuda")).all() + assert ( + graph.nodes[None]["num"] + == torch.arange(num_nodes, dtype=torch.int64, device="cuda") + ).all() + + assert ( + graph.edges("eid", device="cuda") + == torch.arange(len(df), dtype=torch.int64, device="cuda") + ).all() + assert (graph.edges[None]["weight"] == torch.as_tensor(wgt, device="cuda")).all() + + plc_expected_graph = pylibcugraph.SGGraph( + pylibcugraph.ResourceHandle(), + pylibcugraph.GraphProperties(is_multigraph=True, is_symmetric=False), + df.src if direction == "out" else df.dst, + df.dst if direction == "out" else df.src, + vertices_array=cupy.arange(num_nodes, dtype="int64"), + ) + + 
# Do the expensive check to make sure this test fails if an invalid + # graph is constructed. + v_actual, d_in_actual, d_out_actual = pylibcugraph.degrees( + pylibcugraph.ResourceHandle(), + plc_dgl_graph, + source_vertices=cupy.arange(num_nodes, dtype="int64"), + do_expensive_check=True, + ) + + v_exp, d_in_exp, d_out_exp = pylibcugraph.degrees( + pylibcugraph.ResourceHandle(), + plc_expected_graph, + source_vertices=cupy.arange(num_nodes, dtype="int64"), + do_expensive_check=True, + ) + + assert (v_actual == v_exp).all() + assert (d_in_actual == d_in_exp).all() + assert (d_out_actual == d_out_exp).all() + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") +@pytest.mark.parametrize("direction", ["out", "in"]) +def test_graph_make_heterogeneous_graph(direction): + df = karate.get_edgelist() + df.src = df.src.astype("int64") + df.dst = df.dst.astype("int64") + + graph = cugraph_dgl.Graph() + total_num_nodes = max(df.src.max(), df.dst.max()) + 1 + + num_nodes_group_1 = total_num_nodes // 2 + num_nodes_group_2 = total_num_nodes - num_nodes_group_1 + + node_x_1 = np.random.random((num_nodes_group_1,)) + node_x_2 = np.random.random((num_nodes_group_2,)) + + graph.add_nodes(num_nodes_group_1, {"x": node_x_1}, "type1") + graph.add_nodes(num_nodes_group_2, {"x": node_x_2}, "type2") + + edges_11 = df[(df.src < num_nodes_group_1) & (df.dst < num_nodes_group_1)] + edges_12 = df[(df.src < num_nodes_group_1) & (df.dst >= num_nodes_group_1)] + edges_21 = df[(df.src >= num_nodes_group_1) & (df.dst < num_nodes_group_1)] + edges_22 = df[(df.src >= num_nodes_group_1) & (df.dst >= num_nodes_group_1)] + + edges_12.dst -= num_nodes_group_1 + edges_21.src -= num_nodes_group_1 + edges_22.dst -= num_nodes_group_1 + edges_22.src -= num_nodes_group_1 + + graph.add_edges(edges_11.src, edges_11.dst, etype=("type1", "e1", "type1")) + graph.add_edges(edges_12.src, edges_12.dst, 
etype=("type1", "e2", "type2")) + graph.add_edges(edges_21.src, edges_21.dst, etype=("type2", "e3", "type1")) + graph.add_edges(edges_22.src, edges_22.dst, etype=("type2", "e4", "type2")) + + assert not graph.is_homogeneous + assert not graph.is_multi_gpu + + # Verify graph.nodes() + assert ( + graph.nodes() == torch.arange(total_num_nodes, dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.nodes("type1") + == torch.arange(num_nodes_group_1, dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.nodes("type2") + == torch.arange(num_nodes_group_2, dtype=torch.int64, device="cuda") + ).all() + + # Verify graph.edges() + assert ( + graph.edges("eid", etype=("type1", "e1", "type1")) + == torch.arange(len(edges_11), dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.edges("eid", etype=("type1", "e2", "type2")) + == torch.arange(len(edges_12), dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.edges("eid", etype=("type2", "e3", "type1")) + == torch.arange(len(edges_21), dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.edges("eid", etype=("type2", "e4", "type2")) + == torch.arange(len(edges_22), dtype=torch.int64, device="cuda") + ).all() + + # Use sampling call to check graph creation + # This isn't a test of cuGraph sampling with DGL; the options are + # set to verify the graph only. 
+ plc_graph = graph._graph(direction) + sampling_output = pylibcugraph.uniform_neighbor_sample( + pylibcugraph.ResourceHandle(), + plc_graph, + start_list=cupy.arange(total_num_nodes, dtype="int64"), + h_fan_out=np.array([1, 1], dtype="int32"), + with_replacement=False, + do_expensive_check=True, + with_edge_properties=True, + prior_sources_behavior="exclude", + return_dict=True, + ) + + expected_etypes = { + 0: "e1", + 1: "e2", + 2: "e3", + 3: "e4", + } + expected_offsets = { + 0: (0, 0), + 1: (0, num_nodes_group_1), + 2: (num_nodes_group_1, 0), + 3: (num_nodes_group_1, num_nodes_group_1), + } + if direction == "in": + src_col = "minors" + dst_col = "majors" + else: + src_col = "majors" + dst_col = "minors" + + # Looping over the output verifies that all edges are valid + # (and therefore, the graph is valid) + for i, etype in enumerate(sampling_output["edge_type"].tolist()): + eid = int(sampling_output["edge_id"][i]) + + srcs, dsts, eids = graph.edges( + "all", etype=expected_etypes[etype], device="cpu" + ) + + assert eids[eid] == eid + assert ( + srcs[eid] == int(sampling_output[src_col][i]) - expected_offsets[etype][0] + ) + assert ( + dsts[eid] == int(sampling_output[dst_col][i]) - expected_offsets[etype][1] + ) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py new file mode 100644 index 00000000000..eedda664c52 --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py @@ -0,0 +1,310 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest + +import cugraph_dgl +import pylibcugraph +import cupy +import numpy as np + +import cudf + +from cugraph.datasets import karate +from cugraph.utilities.utils import import_optional, MissingModule + +from cugraph.gnn import ( + cugraph_comms_shutdown, + cugraph_comms_create_unique_id, + cugraph_comms_get_raft_handle, +) + +from .utils import init_pytorch_worker + +pylibwholegraph = import_optional("pylibwholegraph") +torch = import_optional("torch") +dgl = import_optional("dgl") + + +def run_test_graph_make_homogeneous_graph_mg(rank, uid, world_size, direction): + init_pytorch_worker(rank, world_size, uid, init_wholegraph=True) + + df = karate.get_edgelist() + df.src = df.src.astype("int64") + df.dst = df.dst.astype("int64") + wgt = np.random.random((len(df),)) + + graph = cugraph_dgl.Graph( + is_multi_gpu=True, ndata_storage="wholegraph", edata_storage="wholegraph" + ) + + # The number of nodes is set globally but features can have + # any distribution across workers as long as they are in order. 
+ global_num_nodes = max(df.src.max(), df.dst.max()) + 1 + node_x = np.array_split(np.arange(global_num_nodes, dtype="int64"), world_size)[ + rank + ] + + # Each worker gets a shuffled, permuted version of the edgelist + df = df.sample(frac=1.0) + df.src = (df.src + rank) % global_num_nodes + df.dst = (df.dst + rank + 1) % global_num_nodes + + graph.add_nodes(global_num_nodes, data={"x": node_x}) + graph.add_edges(df.src, df.dst, {"weight": wgt}) + plc_dgl_graph = graph._graph(direction=direction) + + assert graph.num_nodes() == global_num_nodes + assert graph.num_edges() == len(df) * world_size + assert graph.is_homogeneous + assert graph.is_multi_gpu + + assert ( + graph.nodes() + == torch.arange(global_num_nodes, dtype=torch.int64, device="cuda") + ).all() + ix = torch.arange(len(node_x) * rank, len(node_x) * (rank + 1), dtype=torch.int64) + assert graph.nodes[ix]["x"] is not None + assert (graph.nodes[ix]["x"] == torch.as_tensor(node_x, device="cuda")).all() + + assert ( + graph.edges("eid", device="cuda") + == torch.arange(world_size * len(df), dtype=torch.int64, device="cuda") + ).all() + ix = torch.arange(len(df) * rank, len(df) * (rank + 1), dtype=torch.int64) + assert (graph.edges[ix]["weight"] == torch.as_tensor(wgt, device="cuda")).all() + + plc_handle = pylibcugraph.ResourceHandle( + cugraph_comms_get_raft_handle().getHandle() + ) + + plc_expected_graph = pylibcugraph.MGGraph( + plc_handle, + pylibcugraph.GraphProperties(is_multigraph=True, is_symmetric=False), + [df.src] if direction == "out" else [df.dst], + [df.dst] if direction == "out" else [df.src], + vertices_array=[ + cupy.array_split(cupy.arange(global_num_nodes, dtype="int64"), world_size)[ + rank + ] + ], + ) + + # Do the expensive check to make sure this test fails if an invalid + # graph is constructed. 
+ v_actual, d_in_actual, d_out_actual = pylibcugraph.degrees( + plc_handle, + plc_dgl_graph, + source_vertices=cupy.arange(global_num_nodes, dtype="int64"), + do_expensive_check=True, + ) + + v_exp, d_in_exp, d_out_exp = pylibcugraph.degrees( + plc_handle, + plc_expected_graph, + source_vertices=cupy.arange(global_num_nodes, dtype="int64"), + do_expensive_check=True, + ) + + assert (v_actual == v_exp).all() + assert (d_in_actual == d_in_exp).all() + assert (d_out_actual == d_out_exp).all() + + cugraph_comms_shutdown() + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif( + isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" +) +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") +@pytest.mark.parametrize("direction", ["out", "in"]) +def test_graph_make_homogeneous_graph_mg(direction): + uid = cugraph_comms_create_unique_id() + world_size = torch.cuda.device_count() + + torch.multiprocessing.spawn( + run_test_graph_make_homogeneous_graph_mg, + args=( + uid, + world_size, + direction, + ), + nprocs=world_size, + ) + + +def run_test_graph_make_heterogeneous_graph_mg(rank, uid, world_size, direction): + init_pytorch_worker(rank, world_size, uid) + + df = karate.get_edgelist() + df.src = df.src.astype("int64") + df.dst = df.dst.astype("int64") + + graph = cugraph_dgl.Graph(is_multi_gpu=True) + total_num_nodes = max(df.src.max(), df.dst.max()) + 1 + + # Each worker gets a shuffled, permuted version of the edgelist + df = df.sample(frac=1.0) + df.src = (df.src + rank) % total_num_nodes + df.dst = (df.dst + rank + 1) % total_num_nodes + + num_nodes_group_1 = total_num_nodes // 2 + num_nodes_group_2 = total_num_nodes - num_nodes_group_1 + + node_x_1 = np.array_split(np.random.random((num_nodes_group_1,)), world_size)[rank] + node_x_2 = np.array_split(np.random.random((num_nodes_group_2,)), world_size)[rank] + + graph.add_nodes(num_nodes_group_1, {"x": node_x_1}, 
"type1") + graph.add_nodes(num_nodes_group_2, {"x": node_x_2}, "type2") + + edges_11 = df[(df.src < num_nodes_group_1) & (df.dst < num_nodes_group_1)] + edges_12 = df[(df.src < num_nodes_group_1) & (df.dst >= num_nodes_group_1)] + edges_21 = df[(df.src >= num_nodes_group_1) & (df.dst < num_nodes_group_1)] + edges_22 = df[(df.src >= num_nodes_group_1) & (df.dst >= num_nodes_group_1)] + + edges_12.dst -= num_nodes_group_1 + edges_21.src -= num_nodes_group_1 + edges_22.dst -= num_nodes_group_1 + edges_22.src -= num_nodes_group_1 + + total_edges_11 = torch.tensor(len(edges_11), device="cuda", dtype=torch.int64) + torch.distributed.all_reduce(total_edges_11, torch.distributed.ReduceOp.SUM) + total_edges_12 = torch.tensor(len(edges_12), device="cuda", dtype=torch.int64) + torch.distributed.all_reduce(total_edges_12, torch.distributed.ReduceOp.SUM) + total_edges_21 = torch.tensor(len(edges_21), device="cuda", dtype=torch.int64) + torch.distributed.all_reduce(total_edges_21, torch.distributed.ReduceOp.SUM) + total_edges_22 = torch.tensor(len(edges_22), device="cuda", dtype=torch.int64) + torch.distributed.all_reduce(total_edges_22, torch.distributed.ReduceOp.SUM) + + graph.add_edges(edges_11.src, edges_11.dst, etype=("type1", "e1", "type1")) + graph.add_edges(edges_12.src, edges_12.dst, etype=("type1", "e2", "type2")) + graph.add_edges(edges_21.src, edges_21.dst, etype=("type2", "e3", "type1")) + graph.add_edges(edges_22.src, edges_22.dst, etype=("type2", "e4", "type2")) + + assert not graph.is_homogeneous + assert graph.is_multi_gpu + + # Verify graph.nodes() + assert ( + graph.nodes() == torch.arange(total_num_nodes, dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.nodes("type1") + == torch.arange(num_nodes_group_1, dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.nodes("type2") + == torch.arange(num_nodes_group_2, dtype=torch.int64, device="cuda") + ).all() + + # Verify graph.edges() + assert ( + graph.edges("eid", etype=("type1", "e1", 
"type1")) + == torch.arange(total_edges_11, dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.edges("eid", etype=("type1", "e2", "type2")) + == torch.arange(total_edges_12, dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.edges("eid", etype=("type2", "e3", "type1")) + == torch.arange(total_edges_21, dtype=torch.int64, device="cuda") + ).all() + assert ( + graph.edges("eid", etype=("type2", "e4", "type2")) + == torch.arange(total_edges_22, dtype=torch.int64, device="cuda") + ).all() + + # Use sampling call to check graph creation + # This isn't a test of cuGraph sampling with DGL; the options are + # set to verify the graph only. + plc_graph = graph._graph(direction) + assert isinstance(plc_graph, pylibcugraph.MGGraph) + sampling_output = pylibcugraph.uniform_neighbor_sample( + graph._resource_handle, + plc_graph, + start_list=cupy.arange(total_num_nodes, dtype="int64"), + batch_id_list=cupy.full(total_num_nodes, rank, dtype="int32"), + label_list=cupy.arange(world_size, dtype="int32"), + label_to_output_comm_rank=cupy.arange(world_size, dtype="int32"), + h_fan_out=np.array([-1], dtype="int32"), + with_replacement=False, + do_expensive_check=True, + with_edge_properties=True, + prior_sources_behavior="exclude", + return_dict=True, + ) + + sdf = cudf.DataFrame( + { + "majors": sampling_output["majors"], + "minors": sampling_output["minors"], + "edge_id": sampling_output["edge_id"], + "edge_type": sampling_output["edge_type"], + } + ) + + expected_offsets = { + 0: (0, 0), + 1: (0, num_nodes_group_1), + 2: (num_nodes_group_1, 0), + 3: (num_nodes_group_1, num_nodes_group_1), + } + if direction == "in": + src_col = "minors" + dst_col = "majors" + else: + src_col = "majors" + dst_col = "minors" + + edges_11["etype"] = 0 + edges_12["etype"] = 1 + edges_21["etype"] = 2 + edges_22["etype"] = 3 + + cdf = cudf.concat([edges_11, edges_12, edges_21, edges_22]) + for i in range(len(cdf)): + row = cdf.iloc[i] + etype = row["etype"] + src = row["src"] + 
expected_offsets[etype][0] + dst = row["dst"] + expected_offsets[etype][1] + + f = sdf[ + (sdf[src_col] == src) & (sdf[dst_col] == dst) & (sdf["edge_type"] == etype) + ] + assert len(f) > 0 # may be multiple, some could be on other GPU + + cugraph_comms_shutdown() + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif( + isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" +) +@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") +@pytest.mark.parametrize("direction", ["out", "in"]) +def test_graph_make_heterogeneous_graph_mg(direction): + uid = cugraph_comms_create_unique_id() + world_size = torch.cuda.device_count() + + torch.multiprocessing.spawn( + run_test_graph_make_heterogeneous_graph_mg, + args=( + uid, + world_size, + direction, + ), + nprocs=world_size, + ) diff --git a/python/cugraph-dgl/tests/test_utils.py b/python/cugraph-dgl/cugraph_dgl/tests/test_utils.py similarity index 100% rename from python/cugraph-dgl/tests/test_utils.py rename to python/cugraph-dgl/cugraph_dgl/tests/test_utils.py diff --git a/python/cugraph-dgl/cugraph_dgl/tests/utils.py b/python/cugraph-dgl/cugraph_dgl/tests/utils.py new file mode 100644 index 00000000000..fa4eb05f297 --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/tests/utils.py @@ -0,0 +1,154 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +from cugraph.utilities.utils import import_optional +from cugraph.gnn import cugraph_comms_init + +th = import_optional("torch") + + +def assert_same_node_feats_daskapi(gs, g): + assert set(gs.ndata.keys()) == set(g.ndata.keys()) + + for key in g.ndata.keys(): + for ntype in g.ntypes: + indices = th.arange(0, g.num_nodes(ntype), dtype=g.idtype).cuda() + if len(g.ntypes) <= 1 or ntype in g.ndata[key]: + g_output = g.get_node_storage(key=key, ntype=ntype).fetch( + indices, device="cuda" + ) + gs_output = gs.get_node_storage(key=key, ntype=ntype).fetch(indices) + equal_t = (gs_output != g_output).sum().cpu() + assert equal_t == 0 + + +def assert_same_node_feats(gs, g): + assert set(gs.ndata.keys()) == set(g.ndata.keys()) + assert set(gs.ntypes) == set(g.ntypes) + + for key in g.ndata.keys(): + for ntype in g.ntypes: + if len(g.ntypes) <= 1 or ntype in g.ndata[key]: + indices = th.arange(0, g.num_nodes(ntype), dtype=g.idtype) + + g_output = g.ndata[key] + gs_output = gs.ndata[key] + + if len(g.ntypes) > 1: + g_output = g_output[ntype] + gs_output = gs_output[ntype] + + g_output = g_output[indices] + gs_output = gs_output[indices] + + equal_t = (gs_output != g_output).sum() + assert equal_t == 0 + + +def assert_same_num_nodes(gs, g): + for ntype in g.ntypes: + assert g.num_nodes(ntype) == gs.num_nodes(ntype) + + +def assert_same_num_edges_can_etypes(gs, g): + for can_etype in g.canonical_etypes: + assert g.num_edges(can_etype) == gs.num_edges(can_etype) + + +def assert_same_num_edges_etypes(gs, g): + for etype in g.etypes: + assert g.num_edges(etype) == gs.num_edges(etype) + + +def assert_same_edge_feats_daskapi(gs, g): + assert set(gs.edata.keys()) == set(g.edata.keys()) + for key in g.edata.keys(): + for etype in g.canonical_etypes: + indices = th.arange(0, g.num_edges(etype), dtype=g.idtype).cuda() + if len(g.etypes) <= 1 or etype in g.edata[key]: + g_output = g.get_edge_storage(key=key, etype=etype).fetch( + indices, device="cuda" + ) + gs_output = 
gs.get_edge_storage(key=key, etype=etype).fetch(indices) + equal_t = (gs_output != g_output).sum().cpu() + assert equal_t == 0 + + +def assert_same_edge_feats(gs, g): + assert set(gs.edata.keys()) == set(g.edata.keys()) + assert set(gs.canonical_etypes) == set(g.canonical_etypes) + assert set(gs.etypes) == set(g.etypes) + + for key in g.edata.keys(): + for etype in g.canonical_etypes: + if len(g.etypes) <= 1 or etype in g.edata[key]: + indices = th.arange(0, g.num_edges(etype), dtype=g.idtype).cuda() + g_output = g.edata[key] + gs_output = gs.edata[key] + + if len(g.etypes) > 1: + g_output = g_output[etype] + gs_output = gs_output[etype] + + g_output = g_output[indices] + gs_output = gs_output[indices] + + equal_t = (gs_output != g_output).sum().cpu() + assert equal_t == 0 + + +def assert_same_sampling_len(dgl_g, cugraph_gs, nodes, fanout, edge_dir): + dgl_o = dgl_g.sample_neighbors(nodes, fanout=fanout, edge_dir=edge_dir) + cugraph_o = cugraph_gs.sample_neighbors(nodes, fanout=fanout, edge_dir=edge_dir) + assert cugraph_o.num_edges() == dgl_o.num_edges() + for etype in dgl_o.canonical_etypes: + assert dgl_o.num_edges(etype) == cugraph_o.num_edges(etype) + + +def init_pytorch_worker(rank, world_size, cugraph_id, init_wholegraph=False): + import rmm + + rmm.reinitialize( + devices=rank, + ) + + import cupy + + cupy.cuda.Device(rank).use() + from rmm.allocators.cupy import rmm_cupy_allocator + + cupy.cuda.set_allocator(rmm_cupy_allocator) + + from cugraph.testing.mg_utils import enable_spilling + + enable_spilling() + + th.cuda.set_device(rank) + + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + th.distributed.init_process_group("nccl", rank=rank, world_size=world_size) + + if init_wholegraph: + import pylibwholegraph + + pylibwholegraph.torch.initialize.init( + rank, + world_size, + rank, + world_size, + ) + + cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank) diff --git 
a/python/cugraph-dgl/cugraph_dgl/typing.py b/python/cugraph-dgl/cugraph_dgl/typing.py new file mode 100644 index 00000000000..a68463c3fd9 --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/typing.py @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Union, Tuple +from cugraph.utilities.utils import import_optional + +from cugraph_dgl.nn import SparseGraph + +import pandas +import numpy +import cupy +import cudf + +torch = import_optional("torch") +dgl = import_optional("dgl") + +TensorType = Union[ + "torch.Tensor", + "cupy.ndarray", + "numpy.ndarray", + "cudf.Series", + "pandas.Series", + List[int], +] + +DGLSamplerOutput = Tuple[ + "torch.Tensor", + "torch.Tensor", + List[Union["dgl.Block", SparseGraph]], +] diff --git a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py b/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py index 647dbd38a64..2ba04bd916f 100644 --- a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py +++ b/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,12 +15,15 @@ from __future__ import annotations from typing import Dict, Tuple, Union +from cugraph_dgl.typing import TensorType + import cudf import pandas as pd import dask.dataframe as dd import dask_cudf from dask.distributed import get_client import cupy as cp +import numpy as np from cugraph.utilities.utils import import_optional from cugraph.gnn.dgl_extensions.dgl_uniform_sampler import src_n, dst_n @@ -115,3 +118,13 @@ def add_edata_from_dgl_HeteroGraph(gs, g): gs.edata_storage.add_data( feat_name=feat_name, type_name=etype, feat_obj=feat_t ) + + +def _cast_to_torch_tensor(t: TensorType) -> "torch.Tensor": + if isinstance(t, torch.Tensor): + return t + elif isinstance(t, (cp.ndarray, cudf.Series)): + return torch.as_tensor(t, device="cuda") + elif isinstance(t, (pd.Series, np.ndarray)): + return torch.as_tensor(t, device="cpu") + return torch.as_tensor(t) diff --git a/python/cugraph-dgl/cugraph_dgl/view.py b/python/cugraph-dgl/cugraph_dgl/view.py new file mode 100644 index 00000000000..dbc53e73b6a --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/view.py @@ -0,0 +1,310 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from collections import defaultdict +from collections.abc import MutableMapping +from typing import Union, Dict, List, Tuple + +from cugraph.utilities.utils import import_optional + +import cugraph_dgl +from cugraph_dgl.typing import TensorType + +torch = import_optional("torch") +dgl = import_optional("dgl") + + +class HeteroEdgeDataView(MutableMapping): + """ + Duck-typed version of DGL's HeteroEdgeDataView. + Used for accessing and modifying edge features. + """ + + def __init__( + self, + graph: "cugraph_dgl.Graph", + etype: Union[Tuple[str, str, str], List[Tuple[str, str, str]]], + edges: TensorType, + ): + self.__graph = graph + self.__etype = etype + self.__edges = edges + + @property + def _etype(self) -> Tuple[str, str, str]: + return self.__etype + + @property + def _graph(self) -> "cugraph_dgl.Graph": + return self.__graph + + @property + def _edges(self) -> TensorType: + return self.__edges + + def __getitem__(self, key: str): + if isinstance(self._etype, list): + return { + t: self._graph._get_e_emb(t, key, self._edges) + for t in self._etype + if self._graph._has_e_emb(t, key) + } + + return self._graph._get_e_emb(self._etype, key, self._edges) + + def __setitem__(self, key: str, val: Union[TensorType, Dict[str, TensorType]]): + if isinstance(self._etype, list): + if not isinstance(val, dict): + raise ValueError( + "There are multiple edge types in this view. " + "Expected a dictionary of values." + ) + for t, v in val.items(): + if t not in self._etype: + raise ValueError("Attempted to modify a type out of view.") + self._graph.set_e_emb(t, self._edges, {key: v}) + else: + if isinstance(val, dict): + raise ValueError( + "There is only one edge type in this view. " + "Expected a single tensor." 
+ ) + self._graph.set_e_emb(self._etype, self._edges, {key: val}) + + def __delitem__(self, key: str): + if isinstance(self._etype, list): + for t in self._etype: + self._graph.pop_e_emb(t, key) + else: + self._graph.pop_e_emb(self._etype, key) + + def _transpose(self, fetch_vals=True): + if isinstance(self._etype, list): + tr = defaultdict(dict) + for etype in self._etype: + for key in self._graph._get_e_emb_keys(etype): + tr[key][etype] = ( + self._graph._get_e_emb(etype, key, self._edges) + if fetch_vals + else [] + ) + else: + tr = {} + for key in self._graph._get_e_emb_keys(self._etype): + tr[key] = ( + self._graph._get_e_emb(self._etype, key, self._edges) + if fetch_vals + else [] + ) + + return tr + + def __len__(self): + return len(self._transpose(fetch_vals=False)) + + def __iter__(self): + return iter(self._transpose()) + + def keys(self): + return self._transpose(fetch_vals=False).keys() + + def values(self): + return self._transpose().values() + + def __repr__(self): + return repr(self._transpose(fetch_vals=False)) + + +class HeteroNodeDataView(MutableMapping): + """ + Duck-typed version of DGL's HeteroNodeDataView. + Used for accessing and modifying node features.
+ """ + + def __init__( + self, + graph: "cugraph_dgl.Graph", + ntype: Union[str, List[str]], + nodes: TensorType, + ): + self.__graph = graph + self.__ntype = ntype + self.__nodes = nodes + + @property + def _ntype(self) -> str: + return self.__ntype + + @property + def _graph(self) -> "cugraph_dgl.Graph": + return self.__graph + + @property + def _nodes(self) -> TensorType: + return self.__nodes + + def __getitem__(self, key: str): + if isinstance(self._ntype, list): + return { + t: self._graph._get_n_emb(t, key, self._nodes) + for t in self._ntype + if self._graph._has_n_emb(t, key) + } + else: + return self._graph._get_n_emb(self._ntype, key, self._nodes) + + def __setitem__(self, key: str, val: Union[TensorType, Dict[str, TensorType]]): + if isinstance(self._ntype, list): + if not isinstance(val, dict): + raise ValueError( + "There are multiple node types in this view. " + "Expected a dictionary of values." + ) + for t, v in val.items(): + if t not in self._ntype: + raise ValueError("Attempted to modify a type out of view.") + self._graph._set_n_emb(t, self._nodes, {key: v}) + else: + if isinstance(val, dict): + raise ValueError( + "There is only one node type in this view. " + "Expected a single value tensor." 
+ ) + self._graph._set_n_emb(self._ntype, self._nodes, {key: val}) + + def __delitem__(self, key: str): + if isinstance(self._ntype, list): + for t in self._ntype: + self._graph._pop_n_emb(t, key) + else: + self._graph._pop_n_emb(self._ntype, key) + + def _transpose(self, fetch_vals=True): + if isinstance(self._ntype, list): + tr = defaultdict(dict) + for ntype in self._ntype: + for key in self._graph._get_n_emb_keys(ntype): + tr[key][ntype] = ( + self._graph._get_n_emb(ntype, key, self._nodes) + if fetch_vals + else [] + ) + else: + tr = {} + for key in self._graph._get_n_emb_keys(self._ntype): + tr[key] = ( + self._graph._get_n_emb(self._ntype, key, self._nodes) + if fetch_vals + else [] + ) + + return tr + + def __len__(self): + return len(self._transpose(fetch_vals=False)) + + def __iter__(self): + return iter(self._transpose()) + + def keys(self): + return self._transpose(fetch_vals=False).keys() + + def values(self): + return self._transpose().values() + + def __repr__(self): + return repr(self._transpose(fetch_vals=False)) + + +class HeteroEdgeView: + """ + Duck-typed version of DGL's HeteroEdgeView.
+ """ + + def __init__(self, graph): + self.__graph = graph + + @property + def _graph(self) -> "cugraph_dgl.Graph": + return self.__graph + + def __getitem__(self, key): + if isinstance(key, slice): + if not (key.start is None and key.stop is None and key.step is None): + raise ValueError("Only full slices are supported in DGL.") + edges = dgl.base.ALL + etype = None + elif key is None: + edges = dgl.base.ALL + etype = None + elif isinstance(key, tuple): + if len(key) == 3: + edges = dgl.base.ALL + etype = key + else: + edges = key + etype = None + elif isinstance(key, str): + edges = dgl.base.ALL + etype = key + else: + edges = key + etype = None + + return HeteroEdgeDataView( + graph=self.__graph, + etype=etype, + edges=edges, + ) + + def __call__(self, *args, **kwargs): + if "device" in kwargs: + return self.__graph.all_edges(*args, **kwargs) + + return self.__graph.all_edges(*args, **kwargs, device="cuda") + + +class HeteroNodeView: + """ + Duck-typed version of DGL's HeteroNodeView.
+ """ + + def __init__(self, graph: "cugraph_dgl.Graph"): + self.__graph = graph + + @property + def _graph(self) -> "cugraph_dgl.Graph": + return self.__graph + + def __getitem__(self, key): + if isinstance(key, slice): + if not (key.start is None and key.stop is None and key.step is None): + raise ValueError("Only full slices are supported in DGL.") + nodes = dgl.base.ALL + ntype = None + elif isinstance(key, tuple): + nodes, ntype = key + elif key is None or isinstance(key, str): + nodes = dgl.base.ALL + ntype = key + else: + nodes = key + ntype = None + + return HeteroNodeDataView(graph=self.__graph, ntype=ntype, nodes=nodes) + + def __call__(self, ntype=None): + return torch.arange( + 0, self.__graph.num_nodes(ntype), dtype=self.__graph.idtype, device="cuda" + ) diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml index 1762b1e6d8e..ba2bb4bc170 100644 --- a/python/cugraph-dgl/pyproject.toml +++ b/python/cugraph-dgl/pyproject.toml @@ -24,16 +24,16 @@ classifiers = [ "Programming Language :: Python", ] dependencies = [ - "cugraph==24.8.*,>=0.0.0a0", + "cugraph==24.10.*,>=0.0.0a0", "numba>=0.57", "numpy>=1.23,<2.0a0", - "pylibcugraphops==24.8.*,>=0.0.0a0", + "pylibcugraphops==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
[project.optional-dependencies] test = [ "pandas", - "pylibwholegraph==24.8.*,>=0.0.0a0", + "pylibwholegraph==24.10.*,>=0.0.0a0", "pytest", "pytest-benchmark", "pytest-cov", @@ -61,3 +61,4 @@ include = [ [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" diff --git a/python/cugraph-dgl/tests/utils.py b/python/cugraph-dgl/tests/utils.py deleted file mode 100644 index d6a90840b72..00000000000 --- a/python/cugraph-dgl/tests/utils.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from cugraph.utilities.utils import import_optional - -th = import_optional("torch") - - -def assert_same_node_feats(gs, g): - set(gs.ndata.keys()) == set(g.ndata.keys()) - - for key in g.ndata.keys(): - for ntype in g.ntypes: - indices = th.arange(0, g.num_nodes(ntype), dtype=g.idtype).cuda() - if len(g.ntypes) <= 1 or ntype in g.ndata[key]: - g_output = g.get_node_storage(key=key, ntype=ntype).fetch( - indices, device="cuda" - ) - gs_output = gs.get_node_storage(key=key, ntype=ntype).fetch(indices) - equal_t = (gs_output != g_output).sum().cpu() - assert equal_t == 0 - - -def assert_same_num_nodes(gs, g): - for ntype in g.ntypes: - assert g.num_nodes(ntype) == gs.num_nodes(ntype) - - -def assert_same_num_edges_can_etypes(gs, g): - for can_etype in g.canonical_etypes: - assert g.num_edges(can_etype) == gs.num_edges(can_etype) - - -def assert_same_num_edges_etypes(gs, g): - for etype in g.etypes: - assert g.num_edges(etype) == gs.num_edges(etype) - - -def assert_same_edge_feats(gs, g): - set(gs.edata.keys()) == set(g.edata.keys()) - for key in g.edata.keys(): - for etype in g.canonical_etypes: - indices = th.arange(0, g.num_edges(etype), dtype=g.idtype).cuda() - if len(g.etypes) <= 1 or etype in g.edata[key]: - g_output = g.get_edge_storage(key=key, etype=etype).fetch( - indices, device="cuda" - ) - gs_output = gs.get_edge_storage(key=key, etype=etype).fetch(indices) - equal_t = (gs_output != g_output).sum().cpu() - assert equal_t == 0 - - -def assert_same_sampling_len(dgl_g, cugraph_gs, nodes, fanout, edge_dir): - dgl_o = dgl_g.sample_neighbors(nodes, fanout=fanout, edge_dir=edge_dir) - cugraph_o = cugraph_gs.sample_neighbors(nodes, fanout=fanout, edge_dir=edge_dir) - assert cugraph_o.num_edges() == dgl_o.num_edges() - for etype in dgl_o.canonical_etypes: - assert dgl_o.num_edges(etype) == cugraph_o.num_edges(etype) diff --git a/python/cugraph-equivariant/pyproject.toml b/python/cugraph-equivariant/pyproject.toml index f9c992e3fb9..e4a8d290d9e 100644 --- 
a/python/cugraph-equivariant/pyproject.toml +++ b/python/cugraph-equivariant/pyproject.toml @@ -37,7 +37,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] dependencies = [ - "pylibcugraphops==24.8.*,>=0.0.0a0", + "pylibcugraphops==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] @@ -69,3 +69,4 @@ include = [ [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml index 7e59dcced76..bd1ca33af70 100644 --- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml +++ b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml @@ -9,11 +9,11 @@ channels: - conda-forge - nvidia dependencies: -- cugraph==24.8.*,>=0.0.0a0 +- cugraph==24.10.*,>=0.0.0a0 - pandas - pre-commit - pyg>=2.5,<2.6 -- pylibcugraphops==24.8.*,>=0.0.0a0 +- pylibcugraphops==24.10.*,>=0.0.0a0 - pytest - pytest-benchmark - pytest-cov diff --git a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py index a3715d3ddf4..b6450e7b192 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py @@ -169,7 +169,7 @@ def __init__(self, memory_type="distributed", location="cpu"): self.__features = {} - self.__wg_comm = wgth.get_local_node_communicator() + self.__wg_comm = wgth.get_global_communicator() self.__wg_type = memory_type self.__wg_location = location diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py index 622b68d37e2..e086bf07b1f 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py @@ 
-271,7 +271,7 @@ def __get_edgelist(self): torch.tensor( [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda", - dtype=torch.int32, + dtype=torch.int64, ) ) diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml index 8ccd305a6bd..b29c108e3f4 100644 --- a/python/cugraph-pyg/pyproject.toml +++ b/python/cugraph-pyg/pyproject.toml @@ -29,10 +29,10 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] dependencies = [ - "cugraph==24.8.*,>=0.0.0a0", + "cugraph==24.10.*,>=0.0.0a0", "numba>=0.57", "numpy>=1.23,<2.0a0", - "pylibcugraphops==24.8.*,>=0.0.0a0", + "pylibcugraphops==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] @@ -42,7 +42,7 @@ Documentation = "https://docs.rapids.ai/api/cugraph/stable/" [project.optional-dependencies] test = [ "pandas", - "pylibwholegraph==24.8.*,>=0.0.0a0", + "pylibwholegraph==24.10.*,>=0.0.0a0", "pytest", "pytest-benchmark", "pytest-cov", @@ -67,3 +67,4 @@ include = [ [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" diff --git a/python/cugraph-service/client/pyproject.toml b/python/cugraph-service/client/pyproject.toml index 53170e888ba..75deea88e2e 100644 --- a/python/cugraph-service/client/pyproject.toml +++ b/python/cugraph-service/client/pyproject.toml @@ -49,3 +49,4 @@ include = [ build-backend = "setuptools.build_meta" dependencies-file = "../../../dependencies.yaml" disable-cuda = true +matrix-entry = "cuda_suffixed=true" diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml index 4a79e13e532..2ae40911821 100644 --- a/python/cugraph-service/server/pyproject.toml +++ b/python/cugraph-service/server/pyproject.toml @@ -20,18 +20,18 @@ authors = [ license = { text = "Apache 2.0" } 
requires-python = ">=3.9" dependencies = [ - "cudf==24.8.*,>=0.0.0a0", - "cugraph-service-client==24.8.*,>=0.0.0a0", - "cugraph==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", + "cugraph-service-client==24.10.*,>=0.0.0a0", + "cugraph==24.10.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", - "dask-cuda==24.8.*,>=0.0.0a0", - "dask-cudf==24.8.*,>=0.0.0a0", + "dask-cuda==24.10.*,>=0.0.0a0", + "dask-cudf==24.10.*,>=0.0.0a0", "numba>=0.57", "numpy>=1.23,<2.0a0", - "rapids-dask-dependency==24.8.*,>=0.0.0a0", - "rmm==24.8.*,>=0.0.0a0", + "rapids-dask-dependency==24.10.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", "thriftpy2!=0.5.0,!=0.5.1", - "ucx-py==0.39.*,>=0.0.0a0", + "ucx-py==0.40.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -77,3 +77,4 @@ include = [ [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py index ba7e23df800..ada1fec74cb 100644 --- a/python/cugraph/cugraph/__init__.py +++ b/python/cugraph/cugraph/__init__.py @@ -76,10 +76,16 @@ from cugraph.link_prediction import ( jaccard, jaccard_coefficient, + all_pairs_jaccard, overlap, overlap_coefficient, + all_pairs_overlap, sorensen, sorensen_coefficient, + all_pairs_sorensen, + cosine, + cosine_coefficient, + all_pairs_cosine, ) from cugraph.traversal import ( diff --git a/python/cugraph/cugraph/community/ktruss_subgraph.py b/python/cugraph/cugraph/community/ktruss_subgraph.py index 1799c50252f..bcf8527e17b 100644 --- a/python/cugraph/cugraph/community/ktruss_subgraph.py +++ b/python/cugraph/cugraph/community/ktruss_subgraph.py @@ -11,19 +11,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from cugraph.structure.graph_classes import Graph from typing import Union + +import cudf +from pylibcugraph import k_truss_subgraph as pylibcugraph_k_truss_subgraph +from pylibcugraph import ResourceHandle +from cugraph.structure.graph_classes import Graph from cugraph.utilities import ( ensure_cugraph_obj_for_nx, cugraph_to_nx, ) - -from pylibcugraph import k_truss_subgraph as pylibcugraph_k_truss_subgraph -from pylibcugraph import ResourceHandle -import warnings - -from numba import cuda -import cudf from cugraph.utilities.utils import import_optional # FIXME: the networkx.Graph type used in the type annotation for @@ -34,37 +31,17 @@ networkx = import_optional("networkx") -# FIXME: special case for ktruss on CUDA 11.4: an 11.4 bug causes ktruss to -# crash in that environment. Allow ktruss to import on non-11.4 systems, but -# raise an exception if ktruss is directly imported on 11.4. -def _ensure_compatible_cuda_version(): - try: - cuda_version = cuda.runtime.get_version() - except cuda.cudadrv.runtime.CudaRuntimeAPIError: - cuda_version = "n/a" - - unsupported_cuda_version = (11, 4) - - if cuda_version == unsupported_cuda_version: - ver_string = ".".join([str(n) for n in unsupported_cuda_version]) - raise NotImplementedError( - "k_truss is not currently supported in CUDA" f" {ver_string} environments." - ) - - def k_truss( G: Union[Graph, "networkx.Graph"], k: int ) -> Union[Graph, "networkx.Graph"]: """ Returns the K-Truss subgraph of a graph for a specific k. - NOTE: this function is currently not available on CUDA 11.4 systems. - - The k-truss of a graph is a subgraph where each edge is part of at least - (k−2) triangles. K-trusses are used for finding tighlty knit groups of - vertices in a graph. A k-truss is a relaxation of a k-clique in the graph - and was define in [1]. Finding cliques is computationally demanding and - finding the maximal k-clique is known to be NP-Hard. 
+ The k-truss of a graph is a subgraph where each edge is incident to at + least (k−2) triangles. K-trusses are used for finding tightly knit groups + of vertices in a graph. A k-truss is a relaxation of a k-clique in the graph. + Finding cliques is computationally demanding and finding the maximal + k-clique is known to be NP-Hard. Parameters ---------- @@ -89,9 +66,6 @@ def k_truss( >>> k_subgraph = cugraph.k_truss(G, 3) """ - - _ensure_compatible_cuda_version() - G, isNx = ensure_cugraph_obj_for_nx(G) if isNx is True: @@ -159,12 +133,6 @@ def ktruss_subgraph( k : int The desired k to be used for extracting the k-truss subgraph. - use_weights : bool, optional (default=True) - Whether the output should contain the edge weights if G has them. - - Deprecated: If 'weights' were passed at the graph creation, they will - be used. - Returns ------- G_truss : cuGraph.Graph @@ -177,20 +145,10 @@ def ktruss_subgraph( >>> k_subgraph = cugraph.ktruss_subgraph(G, 3, use_weights=False) """ - _ensure_compatible_cuda_version() - KTrussSubgraph = Graph() if G.is_directed(): raise ValueError("input graph must be undirected") - if use_weights: - warning_msg = ( - "The use_weights flag is deprecated " - "and will be removed in the next release. if weights " - "were passed at the graph creation, they will be used." 
- ) - warnings.warn(warning_msg, FutureWarning) - sources, destinations, edge_weights, _ = pylibcugraph_k_truss_subgraph( resource_handle=ResourceHandle(), graph=G._plc_graph, diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py index a76f1460575..b1588008bc6 100644 --- a/python/cugraph/cugraph/dask/__init__.py +++ b/python/cugraph/cugraph/dask/__init__.py @@ -23,6 +23,7 @@ from .community.triangle_count import triangle_count from .community.egonet import ego_graph from .community.induced_subgraph import induced_subgraph +from .community.ktruss_subgraph import ktruss_subgraph from .centrality.katz_centrality import katz_centrality from .components.connectivity import weakly_connected_components from .sampling.uniform_neighbor_sample import uniform_neighbor_sample @@ -33,8 +34,13 @@ from .centrality.betweenness_centrality import edge_betweenness_centrality from .cores.k_core import k_core from .link_prediction.jaccard import jaccard +from .link_prediction.jaccard import all_pairs_jaccard from .link_prediction.sorensen import sorensen +from .link_prediction.sorensen import all_pairs_sorensen from .link_prediction.overlap import overlap +from .link_prediction.overlap import all_pairs_overlap +from .link_prediction.cosine import cosine +from .link_prediction.cosine import all_pairs_cosine from .community.leiden import leiden # Avoid "p2p" shuffling in dask for now diff --git a/python/cugraph/cugraph/dask/centrality/katz_centrality.py b/python/cugraph/cugraph/dask/centrality/katz_centrality.py index a11be3b6870..6616670c6b6 100644 --- a/python/cugraph/cugraph/dask/centrality/katz_centrality.py +++ b/python/cugraph/cugraph/dask/centrality/katz_centrality.py @@ -162,7 +162,7 @@ def katz_centrality( do_expensive_check = False initial_hubs_guess_values = None - if nstart: + if nstart is not None: if input_graph.renumbered: if len(input_graph.renumber_map.implementation.col_names) > 1: cols = nstart.columns[:-1].to_list() diff --git 
a/python/cugraph/cugraph/dask/comms/comms.py b/python/cugraph/cugraph/dask/comms/comms.py index 5499b13af03..1e1c28fbbee 100644 --- a/python/cugraph/cugraph/dask/comms/comms.py +++ b/python/cugraph/cugraph/dask/comms/comms.py @@ -146,8 +146,6 @@ def initialize(comms=None, p2p=False, prows=None, pcols=None, partition_type=1): __default_handle = None if comms is None: # Initialize communicator - if not p2p: - raise Exception("Set p2p to True for running mnmg algorithms") __instance = raftComms(comms_p2p=p2p) __instance.init() # Initialize subcommunicator diff --git a/python/cugraph/cugraph/dask/community/__init__.py b/python/cugraph/cugraph/dask/community/__init__.py index 657d9df101b..9b5301d0e42 100644 --- a/python/cugraph/cugraph/dask/community/__init__.py +++ b/python/cugraph/cugraph/dask/community/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,3 +15,4 @@ from .triangle_count import triangle_count from .induced_subgraph import induced_subgraph from .leiden import leiden +from .ktruss_subgraph import ktruss_subgraph diff --git a/python/cugraph/cugraph/dask/community/ktruss_subgraph.py b/python/cugraph/cugraph/dask/community/ktruss_subgraph.py new file mode 100644 index 00000000000..2ecca069ea5 --- /dev/null +++ b/python/cugraph/cugraph/dask/community/ktruss_subgraph.py @@ -0,0 +1,119 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Tuple + +import cudf +import cupy as cp +from dask.distributed import wait, default_client +import dask_cudf + +from pylibcugraph import ( + ResourceHandle, + k_truss_subgraph as pylibcugraph_k_truss_subgraph, +) +import cugraph.dask.comms.comms as Comms + + +def _call_k_truss_subgraph( + sID: bytes, + mg_graph_x, + k: int, + do_expensive_check: bool, +) -> Tuple[cp.ndarray, cp.ndarray, cp.ndarray]: + + return pylibcugraph_k_truss_subgraph( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + k=k, + do_expensive_check=do_expensive_check, + ) + + +def convert_to_cudf(cp_arrays: cp.ndarray) -> cudf.DataFrame: + cp_src, cp_dst, cp_weight, _ = cp_arrays + + df = cudf.DataFrame() + if cp_src is not None: + df["src"] = cp_src + df["dst"] = cp_dst + if cp_weight is not None: + df["weight"] = cp_weight + + return df + + +def ktruss_subgraph(input_graph, k: int) -> dask_cudf.DataFrame: + """ + Returns the K-Truss subgraph of a graph for a specific k. + + The k-truss of a graph is a subgraph where each edge is incident to at + least (k−2) triangles. K-trusses are used for finding tightly knit groups + of vertices in a graph. A k-truss is a relaxation of a k-clique in the graph. + Finding cliques is computationally demanding and finding the maximal + k-clique is known to be NP-Hard. + + Parameters + ---------- + input_graph : cugraph.Graph + Graph or matrix object, which should contain the connectivity + information.
Edge weights, if present, should be single or double + precision floating point values + + k : int + The desired k to be used for extracting the k-truss subgraph. + + + Returns + ------- + k_truss_edge_lists : dask_cudf.DataFrame + Distributed GPU data frame containing all source identifiers, + destination identifiers, and edge weights belonging to the truss. + """ + if input_graph.is_directed(): + raise ValueError("input graph must be undirected") + # Initialize dask client + client = default_client() + + do_expensive_check = False + + result = [ + client.submit( + _call_k_truss_subgraph, + Comms.get_session_id(), + input_graph._plc_graph[w], + k, + do_expensive_check, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] + wait(result) + + cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + # Wait until the inactive futures are released + wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)]) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "src") + ddf = input_graph.unrenumber(ddf, "dst") + + return ddf diff --git a/python/cugraph/cugraph/dask/link_prediction/cosine.py b/python/cugraph/cugraph/dask/link_prediction/cosine.py new file mode 100644 index 00000000000..e4007ad96d5 --- /dev/null +++ b/python/cugraph/cugraph/dask/link_prediction/cosine.py @@ -0,0 +1,323 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from dask.distributed import wait, default_client +import cugraph.dask.comms.comms as Comms +import dask_cudf +import cudf +from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.dask import get_n_workers +from cugraph.utilities import renumber_vertex_pair +from cugraph.dask.common.part_utils import ( + get_persisted_df_worker_map, + persist_dask_df_equal_parts_per_worker, +) + + +from pylibcugraph import ( + cosine_coefficients as pylibcugraph_cosine_coefficients, + all_pairs_cosine_coefficients as pylibcugraph_all_pairs_cosine_coefficients, +) +from pylibcugraph import ResourceHandle + + +def convert_to_cudf(cp_arrays): + """ + Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper + """ + + cupy_first, cupy_second, cupy_similarity = cp_arrays + + df = cudf.DataFrame() + df["first"] = cupy_first + df["second"] = cupy_second + df["cosine_coeff"] = cupy_similarity + + return df + + +def _call_plc_all_pairs_cosine( + sID, mg_graph_x, vertices, use_weight, topk, do_expensive_check +): + + return pylibcugraph_all_pairs_cosine_coefficients( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=do_expensive_check, + ) + + +def _call_plc_cosine( + sID, mg_graph_x, vertex_pair, use_weight, do_expensive_check, vertex_pair_col_name +): + + first = vertex_pair[vertex_pair_col_name[0]] + second = vertex_pair[vertex_pair_col_name[1]] + + return pylibcugraph_cosine_coefficients( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=do_expensive_check, + ) + + +def cosine(input_graph, vertex_pair=None, use_weight=False): + """ + Compute the Cosine similarity between each pair of vertices connected by + an 
edge, or between arbitrary pairs of vertices specified by the user. + Cosine similarity is defined between two sets as the ratio of their + intersection's volume over the square root of volume's product. + In the context of graphs, the neighborhood of a vertex is seen as a set. + The Cosine similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.dask.cosine, in the absence of a specified vertex pair list, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the cosine coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list (edge weights are not supported yet for this algorithm). The + graph should be undirected where an undirected edge is represented by a + directed edge in both direction. The adjacency list will be computed if + not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertex_pair : cudf.DataFrame, optional (default=None) + A GPU dataframe consisting of two columns representing pairs of + vertices. If provided, the cosine coefficient is computed for the + given vertex pairs. If the vertex_pair is not provided then the + current implementation computes the cosine coefficient for all + adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted cosine (if use_weight==True) + or un-weighted cosine (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. 
+ + Returns + ------- + result : dask_cudf.DataFrame + GPU distributed data frame containing 3 dask_cudf.Series + + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['cosine_coeff']: dask_cudf.Series + The computed cosine coefficient between the first and the second + vertex ID. + """ + + if input_graph.is_directed(): + raise ValueError("input graph must be undirected") + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = input_graph.get_two_hop_neighbors() + + vertex_pair_col_name = vertex_pair.columns + + if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)): + vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + + elif vertex_pair is not None: + raise ValueError("vertex_pair must be a dask_cudf or cudf dataframe") + + if not isinstance(vertex_pair, (dask_cudf.DataFrame)): + vertex_pair = dask_cudf.from_cudf( + vertex_pair, npartitions=len(Comms.get_workers()) + ) + vertex_pair = get_distributed_data(vertex_pair) + wait(vertex_pair) + vertex_pair = vertex_pair.worker_to_parts + + # Initialize dask client + client = default_client() + + do_expensive_check = False + + result = [ + client.submit( + _call_plc_cosine, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertex_pair[w][0], + use_weight, + do_expensive_check, + vertex_pair_col_name, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] + + wait(result) + + cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + + # Wait until the inactive futures are released + wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)]) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "first") + ddf = 
input_graph.unrenumber(ddf, "second") + + return ddf + + +def all_pairs_cosine( + input_graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): + """ + Compute the All Pairs Cosine similarity between all pairs of vertices specified. + All pairs Cosine similarity is defined between two sets as the ratio of their + intersection's volume over the square root of their volume's product. + In the context of graphs, the neighborhood of a vertex is seen as a set. The Cosine + similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.all_pairs_cosine, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the cosine coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list (edge weights are not supported yet for this algorithm). The + graph should be undirected where an undirected edge is represented by a + directed edge in both direction. The adjacency list will be computed if + not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series, dask_cudf.Series, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the cosine coefficient for + all adjacent vertices in the graph. 
+ + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted cosine (if use_weight==True) + or un-weighted cosine (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + result : dask_cudf.DataFrame + GPU distributed data frame containing 3 dask_cudf.Series + + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['cosine_coeff']: dask_cudf.Series + The computed cosine coefficient between the first and the second + vertex ID. + """ + + if input_graph.is_directed(): + raise ValueError("input graph must be undirected") + + # Initialize dask client + client = default_client() + + if vertices is not None: + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[ + input_graph.renumber_map.renumbered_src_col_name + ].dtype, + ) + + if not isinstance(vertices, (dask_cudf.Series)): + vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers()) + + if input_graph.renumbered: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + n_workers = get_n_workers() + vertices = vertices.repartition(npartitions=n_workers) + vertices = persist_dask_df_equal_parts_per_worker(vertices, client) + vertices = get_persisted_df_worker_map(vertices, client) + + do_expensive_check = False + + result = [ + client.submit( + _call_plc_all_pairs_cosine, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertices[w][0] if vertices is not None else None, + use_weight, + topk, + do_expensive_check, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] + + 
wait(result) + + cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + + # Wait until the inactive futures are released + wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)]) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "first") + ddf = input_graph.unrenumber(ddf, "second") + + return ddf diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index 3b8edc8daa5..f72122048f9 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -18,10 +18,17 @@ import dask_cudf import cudf from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.dask import get_n_workers from cugraph.utilities import renumber_vertex_pair +from cugraph.dask.common.part_utils import ( + get_persisted_df_worker_map, + persist_dask_df_equal_parts_per_worker, +) + from pylibcugraph import ( jaccard_coefficients as pylibcugraph_jaccard_coefficients, + all_pairs_jaccard_coefficients as pylibcugraph_all_pairs_jaccard_coefficients, ) from pylibcugraph import ResourceHandle @@ -41,6 +48,20 @@ def convert_to_cudf(cp_arrays): return df +def _call_plc_all_pairs_jaccard( + sID, mg_graph_x, vertices, use_weight, topk, do_expensive_check +): + + return pylibcugraph_all_pairs_jaccard_coefficients( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=do_expensive_check, + ) + + def _call_plc_jaccard( sID, mg_graph_x, vertex_pair, use_weight, do_expensive_check, vertex_pair_col_name ): @@ -63,7 +84,7 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): Compute the Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of 
vertices specified by the user. Jaccard similarity is defined between two sets as the ratio of the volume - of their intersection divided by the volume of their union. In the context + of their intersection over the volume of their union. In the context of graphs, the neighborhood of a vertex is seen as a set. The Jaccard similarity weight of each edge represents the strength of connection between vertices based on the relative similarity of their neighbors. @@ -83,7 +104,7 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): directed edge in both direction. The adjacency list will be computed if not already present. - This implementation only supports undirected, unweighted Graph. + This implementation only supports undirected, non-multi Graphs. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -100,7 +121,7 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair (will be identical to first if specified). 
@@ -140,21 +161,148 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): do_expensive_check = False - if vertex_pair is not None: - result = [ - client.submit( - _call_plc_jaccard, - Comms.get_session_id(), - input_graph._plc_graph[w], - vertex_pair[w][0], - use_weight, - do_expensive_check, - vertex_pair_col_name, - workers=[w], - allow_other_workers=False, + result = [ + client.submit( + _call_plc_jaccard, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertex_pair[w][0], + use_weight, + do_expensive_check, + vertex_pair_col_name, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] + + wait(result) + + cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + + # Wait until the inactive futures are released + wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)]) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "first") + ddf = input_graph.unrenumber(ddf, "second") + + return ddf + + +def all_pairs_jaccard( + input_graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): + """ + Compute the All Pairs Jaccard similarity between all pairs of vertices specified. + All pairs Jaccard similarity is defined between two sets as the ratio of the volume + of their intersection over the volume of their union. In the context + of graphs, the neighborhood of a vertex is seen as a set. The Jaccard + similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.all_pairs_jaccard, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the jaccard coefficient for all the vertex pairs in the graph. 
+ This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list (edge weights are not supported yet for this algorithm). The + graph should be undirected where an undirected edge is represented by a + directed edge in both direction. The adjacency list will be computed if + not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series, dask_cudf.Series, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the jaccard coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted jaccard (if use_weight==True) + or un-weighted jaccard (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + result : dask_cudf.DataFrame + GPU distributed data frame containing 3 dask_cudf.Series + + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['jaccard_coeff']: dask_cudf.Series + The computed jaccard coefficient between the first and the second + vertex ID. 
+ """ + + if input_graph.is_directed(): + raise ValueError("input graph must be undirected") + + # Initialize dask client + client = default_client() + + if vertices is not None: + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[ + input_graph.renumber_map.renumbered_src_col_name + ].dtype, ) - for w in Comms.get_workers() - ] + + if not isinstance(vertices, (dask_cudf.Series)): + vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers()) + + if input_graph.renumbered: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + n_workers = get_n_workers() + vertices = vertices.repartition(npartitions=n_workers) + vertices = persist_dask_df_equal_parts_per_worker(vertices, client) + vertices = get_persisted_df_worker_map(vertices, client) + + do_expensive_check = False + + result = [ + client.submit( + _call_plc_all_pairs_jaccard, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertices[w][0] if vertices is not None else None, + use_weight, + topk, + do_expensive_check, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] wait(result) diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index 4bda05e3c95..e1a3285ee60 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -19,9 +19,15 @@ import cudf from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair +from cugraph.dask import get_n_workers +from cugraph.dask.common.part_utils import ( + get_persisted_df_worker_map, + persist_dask_df_equal_parts_per_worker, +) from pylibcugraph import ( overlap_coefficients as pylibcugraph_overlap_coefficients, + all_pairs_overlap_coefficients as pylibcugraph_all_pairs_overlap_coefficients, ) from pylibcugraph import ResourceHandle @@ -41,6 +47,20 @@ def convert_to_cudf(cp_arrays): return df +def _call_plc_all_pairs_overlap( + sID, mg_graph_x, vertices, use_weight, topk, do_expensive_check +): + + return pylibcugraph_all_pairs_overlap_coefficients( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=do_expensive_check, + ) + + def _call_plc_overlap( sID, mg_graph_x, vertex_pair, use_weight, do_expensive_check, vertex_pair_col_name ): @@ -63,7 +83,7 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): Compute the Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Overlap Coefficient is defined between two sets as the ratio of the volume - of their intersection divided by the smaller of their two volumes. In the + of their intersection over the smaller of their two volumes. In the context of graphs, the neighborhood of a vertex is seen as a set. The Overlap Coefficient weight of each edge represents the strength of connection between vertices based on the relative similarity of their @@ -86,7 +106,7 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): directed edge in both direction. The adjacency list will be computed if not already present. - This implementation only supports undirected, unweighted Graph. 
+ This implementation only supports undirected, non-multi Graphs. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -103,7 +123,7 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair(will be identical to first if specified). @@ -143,21 +163,148 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): do_expensive_check = False - if vertex_pair is not None: - result = [ - client.submit( - _call_plc_overlap, - Comms.get_session_id(), - input_graph._plc_graph[w], - vertex_pair[w][0], - use_weight, - do_expensive_check, - vertex_pair_col_name, - workers=[w], - allow_other_workers=False, + result = [ + client.submit( + _call_plc_overlap, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertex_pair[w][0], + use_weight, + do_expensive_check, + vertex_pair_col_name, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] + + wait(result) + + cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + + # Wait until the inactive futures are released + wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)]) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "first") + ddf = input_graph.unrenumber(ddf, "second") + + return ddf + + +def all_pairs_overlap( + input_graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): + """ + Compute the All Pairs Overlap similarity between all pairs of vertices specified. 
+ All pairs Overlap Coefficient is defined between two sets as the ratio of the volume + of their intersection over the smaller of their two volumes. In the context + of graphs, the neighborhood of a vertex is seen as a set. The Overlap + similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.all_pairs_overlap, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the overlap coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list (edge weights are not supported yet for this algorithm). The + graph should be undirected where an undirected edge is represented by a + directed edge in both direction. The adjacency list will be computed if + not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series, dask_cudf.Series, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the overlap coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted overlap (if use_weight==True) + or un-weighted overlap (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. 
+ + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + result : dask_cudf.DataFrame + GPU distributed data frame containing 3 dask_cudf.Series + + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['overlap_coeff']: dask_cudf.Series + The computed overlap coefficient between the first and the second + vertex ID. + """ + + if input_graph.is_directed(): + raise ValueError("input graph must be undirected") + + # Initialize dask client + client = default_client() + + if vertices is not None: + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[ + input_graph.renumber_map.renumbered_src_col_name + ].dtype, ) - for w in Comms.get_workers() - ] + + if not isinstance(vertices, (dask_cudf.Series)): + vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers()) + + if input_graph.renumbered: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + n_workers = get_n_workers() + vertices = vertices.repartition(npartitions=n_workers) + vertices = persist_dask_df_equal_parts_per_worker(vertices, client) + vertices = get_persisted_df_worker_map(vertices, client) + + do_expensive_check = False + + result = [ + client.submit( + _call_plc_all_pairs_overlap, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertices[w][0] if vertices is not None else None, + use_weight, + topk, + do_expensive_check, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] wait(result) diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index 163b0d0dc16..3697385e8f8 100644 --- 
a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,10 +18,16 @@ import dask_cudf import cudf from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.dask import get_n_workers from cugraph.utilities import renumber_vertex_pair +from cugraph.dask.common.part_utils import ( + get_persisted_df_worker_map, + persist_dask_df_equal_parts_per_worker, +) from pylibcugraph import ( sorensen_coefficients as pylibcugraph_sorensen_coefficients, + all_pairs_sorensen_coefficients as pylibcugraph_all_pairs_sorensen_coefficients, ) from pylibcugraph import ResourceHandle @@ -58,12 +64,26 @@ def _call_plc_sorensen( ) + +def _call_plc_all_pairs_sorensen( + sID, mg_graph_x, vertices, use_weight, topk, do_expensive_check +): + + return pylibcugraph_all_pairs_sorensen_coefficients( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=do_expensive_check, + ) + + def sorensen(input_graph, vertex_pair=None, use_weight=False): """ Compute the Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Sorensen coefficient is defined between two sets as the ratio of twice the - volume of their intersection divided by the volume of each set. + volume of their intersection over the sum of their volumes. If first is specified but second is not, or vice versa, an exception will be thrown. @@ -82,7 +102,7 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): directed edge in both direction. The adjacency list will be computed if not already present. 
- This implementation only supports undirected, unweighted Graph. + This implementation only supports undirected, non-multi Graphs. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -99,7 +119,7 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair(will be identical to first if specified). @@ -139,21 +159,148 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): do_expensive_check = False - if vertex_pair is not None: - result = [ - client.submit( - _call_plc_sorensen, - Comms.get_session_id(), - input_graph._plc_graph[w], - vertex_pair[w][0], - use_weight, - do_expensive_check, - vertex_pair_col_name, - workers=[w], - allow_other_workers=False, + result = [ + client.submit( + _call_plc_sorensen, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertex_pair[w][0], + use_weight, + do_expensive_check, + vertex_pair_col_name, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] + + wait(result) + + cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + + # Wait until the inactive futures are released + wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)]) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "first") + ddf = input_graph.unrenumber(ddf, "second") + + return ddf + + +def all_pairs_sorensen( + input_graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): + """ + Compute the All Pairs Sorensen similarity between all pairs of vertices specified. 
+ All pairs Sorensen coefficient is defined between two sets as the ratio of twice the + volume of their intersection over the sum of their volumes. In the context + of graphs, the neighborhood of a vertex is seen as a set. The Sorensen + similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.all_pairs_sorensen, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the sorensen coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list (edge weights are not supported yet for this algorithm). The + graph should be undirected where an undirected edge is represented by a + directed edge in both direction. The adjacency list will be computed if + not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series, dask_cudf.Series, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the sorensen coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted sorensen (if use_weight==True) + or un-weighted sorensen (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. 
+ + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + result : dask_cudf.DataFrame + GPU distributed data frame containing 3 dask_cudf.Series + + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['sorensen_coeff']: dask_cudf.Series + The computed sorensen coefficient between the first and the second + vertex ID. + """ + + if input_graph.is_directed(): + raise ValueError("input graph must be undirected") + + # Initialize dask client + client = default_client() + + if vertices is not None: + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[ + input_graph.renumber_map.renumbered_src_col_name + ].dtype, ) - for w in Comms.get_workers() - ] + + if not isinstance(vertices, (dask_cudf.Series)): + vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers()) + + if input_graph.renumbered: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + n_workers = get_n_workers() + vertices = vertices.repartition(npartitions=n_workers) + vertices = persist_dask_df_equal_parts_per_worker(vertices, client) + vertices = get_persisted_df_worker_map(vertices, client) + + do_expensive_check = False + + result = [ + client.submit( + _call_plc_all_pairs_sorensen, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertices[w][0] if vertices is not None else None, + use_weight, + topk, + do_expensive_check, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] wait(result) diff --git a/python/cugraph/cugraph/link_prediction/__init__.py b/python/cugraph/cugraph/link_prediction/__init__.py index 38c8b9a2d3b..f511b95c34c 100644 --- 
a/python/cugraph/cugraph/link_prediction/__init__.py +++ b/python/cugraph/cugraph/link_prediction/__init__.py @@ -13,7 +13,13 @@ from cugraph.link_prediction.jaccard import jaccard from cugraph.link_prediction.jaccard import jaccard_coefficient +from cugraph.link_prediction.jaccard import all_pairs_jaccard from cugraph.link_prediction.sorensen import sorensen from cugraph.link_prediction.sorensen import sorensen_coefficient +from cugraph.link_prediction.sorensen import all_pairs_sorensen from cugraph.link_prediction.overlap import overlap from cugraph.link_prediction.overlap import overlap_coefficient +from cugraph.link_prediction.overlap import all_pairs_overlap +from cugraph.link_prediction.cosine import cosine +from cugraph.link_prediction.cosine import cosine_coefficient +from cugraph.link_prediction.cosine import all_pairs_cosine diff --git a/python/cugraph/cugraph/link_prediction/cosine.py b/python/cugraph/cugraph/link_prediction/cosine.py new file mode 100644 index 00000000000..9dce0e96f8c --- /dev/null +++ b/python/cugraph/cugraph/link_prediction/cosine.py @@ -0,0 +1,359 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from cugraph.utilities import ( + ensure_cugraph_obj_for_nx, + df_edge_score_to_dictionary, + renumber_vertex_pair, +) +import cudf +import warnings +from typing import Union, Iterable + +from pylibcugraph import ( + cosine_coefficients as pylibcugraph_cosine_coefficients, + all_pairs_cosine_coefficients as pylibcugraph_all_pairs_cosine_coefficients, +) +from pylibcugraph import ResourceHandle + +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes.iloc[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if ( + vertex_pair_dtypes.iloc[0] != vertex_dtype + or vertex_pair_dtypes.iloc[1] != vertex_dtype + ): + warning_msg = ( + "Cosine requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair + + +def cosine( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + use_weight: bool = False, +): + """ + Compute the Cosine similarity between each pair of vertices connected by + an edge, or between arbitrary pairs of vertices specified by the user. + The Cosine similarity is defined between two sets as the ratio of their + intersection's volume over the square root of their volume's product. + In the context of graphs, the neighborhood of a vertex is seen as a set. 
+ The Cosine similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.cosine, in the absence of a specified vertex pair list, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the cosine coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both direction.The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertex_pair : cudf.DataFrame, optional (default=None) + A GPU dataframe consisting of two columns representing pairs of + vertices. If provided, the cosine coefficient is computed for the + given vertex pairs. If the vertex_pair is not provided then the + current implementation computes the cosine coefficient for all + adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted cosine (if use_weight==True) + or un-weighted cosine (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Cosine weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). 
+ df['cosine_coeff'] : cudf.Series + The computed Cosine coefficient between the first and the second + vertex ID. + + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import cosine + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = cosine(input_graph) + + """ + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = input_graph.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): + vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + + elif vertex_pair is not None: + raise ValueError("vertex_pair must be a cudf Dataframe") + + first, second, cosine_coeff = pylibcugraph_cosine_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) + + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) + + df = vertex_pair + df["cosine_coeff"] = cudf.Series(cosine_coeff) + + return df + + +def cosine_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, +): + """ + Note: No NetworkX equivalent. 
+ + Parameters + ---------- + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) + A GPU dataframe consisting of two columns representing pairs of + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Cosine coefficient is computed for the given vertex + pairs. Otherwise, the current implementation computes the cosine + coefficient for all adjacent vertices in the graph. + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Cosine weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + the second vertex ID of each pair (will be identical to second if + specified). + df['cosine_coeff'] : cudf.Series + The computed Cosine coefficient between the first and the second + vertex ID. 
+ + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import cosine_coefficient + >>> G = karate.get_graph(download=True) + >>> df = cosine_coefficient(G) + + """ + vertex_pair = None + + G, isNx = ensure_cugraph_obj_for_nx(G) + + if isNx is True and ebunch is not None: + vertex_pair = cudf.DataFrame(ebunch) + + df = cosine(G, vertex_pair) + + if isNx is True: + df = df_edge_score_to_dictionary( + df, k="cosine_coeff", src="first", dst="second" + ) + + return df + + + def all_pairs_cosine( + input_graph: Graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, + ): + """ + Compute the All Pairs Cosine similarity between all pairs of vertices specified. + Cosine similarity compares the neighborhoods of two vertices, yielding + higher scores for vertices whose neighbor sets overlap more strongly. + The All Pairs Cosine similarity is defined between two sets as the ratio of their + intersection's volume over the square root of their volume's product. + In the context of graphs, the neighborhood of a vertex is seen as a set. + The Cosine similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.all_pairs_cosine, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the cosine coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list. 
The graph should be undirected where an undirected + edge is represented by a directed edge in both direction.The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series or cudf.DataFrame, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the cosine coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted cosine (if use_weight==True) + or un-weighted cosine (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Cosine weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + df['cosine_coeff'] : cudf.Series + The computed Cosine coefficient between the first and the second + vertex ID. 
+ + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import all_pairs_cosine + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = all_pairs_cosine(input_graph) + + """ + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertices is not None: + + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype, + ) + + if input_graph.renumbered is True: + if isinstance(vertices, cudf.DataFrame): + vertices = input_graph.lookup_internal_vertex_id( + vertices, vertices.columns + ) + else: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + first, second, cosine_coeff = pylibcugraph_all_pairs_cosine_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=False, + ) + vertex_pair = cudf.DataFrame() + vertex_pair["first"] = first + vertex_pair["second"] = second + + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber(vertex_pair, "first", preserve_order=True) + vertex_pair = input_graph.unrenumber(vertex_pair, "second", preserve_order=True) + + df = vertex_pair + df["cosine_coeff"] = cudf.Series(cosine_coeff) + + return df diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index 06644a7e1b7..214d92a1be5 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -22,6 +22,7 @@ from pylibcugraph import ( jaccard_coefficients as pylibcugraph_jaccard_coefficients, + all_pairs_jaccard_coefficients as pylibcugraph_all_pairs_jaccard_coefficients, ) from pylibcugraph import ResourceHandle @@ -65,7 +66,7 @@ def jaccard( Compute the Jaccard similarity between each pair of vertices connected by 
def all_pairs_jaccard(
    input_graph: Graph,
    vertices: cudf.Series = None,
    use_weight: bool = False,
    topk: int = None,
):
    """
    Compute the Jaccard similarity between all pairs of adjacent vertices.

    Jaccard similarity is defined between two sets as the ratio of the
    volume of their intersection over the volume of their union.  In the
    context of graphs, the neighborhood of a vertex is seen as a set, and
    the Jaccard similarity weight of each edge represents the strength of
    connection between vertices based on the relative similarity of their
    neighbors.

    cugraph.all_pairs_jaccard, in the absence of specified vertices, will
    compute the two_hop_neighbors of the entire graph to construct a vertex
    pair list and will return the jaccard coefficient for all the vertex
    pairs in the graph.  This is not advisable as the vertex pairs can grow
    exponentially with respect to the size of the dataset.

    If the topk parameter is specified then the result will only contain
    the top k highest scoring results.

    Parameters
    ----------
    input_graph : cugraph.Graph
        cuGraph Graph instance, should contain the connectivity information
        as an edge list. The graph should be undirected where an undirected
        edge is represented by a directed edge in both directions. The
        adjacency list will be computed if not already present.

        This implementation only supports undirected, non-multi Graphs.

    vertices : int or list or cudf.Series or cudf.DataFrame, optional (default=None)
        Vertices for which to compute coefficients. If not provided, the
        jaccard coefficient is computed for all adjacent vertices in the
        graph.

    use_weight : bool, optional (default=False)
        Flag to indicate whether to compute weighted jaccard (if
        use_weight==True) or un-weighted jaccard (if use_weight==False).
        'input_graph' must be weighted if 'use_weight=True'.

    topk : int, optional (default=None)
        Specify the number of answers to return otherwise returns the
        entire solution.

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame containing one row per scored vertex pair.

        df['first'] : cudf.Series
            The first vertex ID of each pair.
        df['second'] : cudf.Series
            The second vertex ID of each pair.
        df['jaccard_coeff'] : cudf.Series
            The computed Jaccard coefficient between the first and the
            second vertex ID.

    Examples
    --------
    >>> from cugraph.datasets import karate
    >>> from cugraph import all_pairs_jaccard
    >>> input_graph = karate.get_graph(download=True, ignore_weights=True)
    >>> df = all_pairs_jaccard(input_graph)

    """
    if input_graph.is_directed():
        raise ValueError("Input must be an undirected Graph.")

    if vertices is not None:
        # Normalize scalar/list inputs to a cudf.Series whose dtype matches
        # the graph's vertex column so the renumber lookup below succeeds.
        if isinstance(vertices, int):
            vertices = [vertices]

        if isinstance(vertices, list):
            vertices = cudf.Series(
                vertices,
                dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype,
            )

        if input_graph.renumbered:
            if isinstance(vertices, cudf.DataFrame):
                vertices = input_graph.lookup_internal_vertex_id(
                    vertices, vertices.columns
                )
            else:
                vertices = input_graph.lookup_internal_vertex_id(vertices)

    first, second, jaccard_coeff = pylibcugraph_all_pairs_jaccard_coefficients(
        resource_handle=ResourceHandle(),
        graph=input_graph._plc_graph,
        vertices=vertices,
        use_weight=use_weight,
        topk=topk,
        do_expensive_check=False,
    )

    vertex_pair = cudf.DataFrame()
    vertex_pair["first"] = first
    vertex_pair["second"] = second

    if input_graph.renumbered:
        # Map internal vertex IDs back to the caller's external IDs,
        # keeping row order aligned with the coefficient array.
        vertex_pair = input_graph.unrenumber(vertex_pair, "first", preserve_order=True)
        vertex_pair = input_graph.unrenumber(vertex_pair, "second", preserve_order=True)

    df = vertex_pair
    df["jaccard_coeff"] = cudf.Series(jaccard_coeff)

    return df
def all_pairs_overlap(
    input_graph: Graph,
    vertices: cudf.Series = None,
    use_weight: bool = False,
    topk: int = None,
):
    """
    Compute the Overlap Coefficient between all pairs of adjacent vertices.

    Overlap Coefficient is defined between two sets as the ratio of the
    volume of their intersection over the smaller of their two volumes.
    In the context of graphs, the neighborhood of a vertex is seen as a
    set, and the Overlap Coefficient weight of each edge represents the
    strength of connection between vertices based on the relative
    similarity of their neighbors.

    cugraph.all_pairs_overlap, in the absence of specified vertices, will
    compute the two_hop_neighbors of the entire graph to construct a vertex
    pair list and will return the overlap coefficient for all the vertex
    pairs in the graph.  This is not advisable as the vertex pairs can grow
    exponentially with respect to the size of the dataset.

    If the topk parameter is specified then the result will only contain
    the top k highest scoring results.

    Parameters
    ----------
    input_graph : cugraph.Graph
        cuGraph Graph instance, should contain the connectivity information
        as an edge list. The graph should be undirected where an undirected
        edge is represented by a directed edge in both directions. The
        adjacency list will be computed if not already present.

        This implementation only supports undirected, non-multi Graphs.

    vertices : int or list or cudf.Series or cudf.DataFrame, optional (default=None)
        Vertices for which to compute coefficients. If not provided, the
        overlap coefficient is computed for all adjacent vertices in the
        graph.

    use_weight : bool, optional (default=False)
        Flag to indicate whether to compute weighted overlap (if
        use_weight==True) or un-weighted overlap (if use_weight==False).
        'input_graph' must be weighted if 'use_weight=True'.

    topk : int, optional (default=None)
        Specify the number of answers to return otherwise returns the
        entire solution.

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame containing one row per scored vertex pair.

        df['first'] : cudf.Series
            The first vertex ID of each pair.
        df['second'] : cudf.Series
            The second vertex ID of each pair.
        df['overlap_coeff'] : cudf.Series
            The computed Overlap coefficient between the first and the
            second vertex ID.

    Examples
    --------
    >>> from cugraph.datasets import karate
    >>> from cugraph import all_pairs_overlap
    >>> input_graph = karate.get_graph(download=True, ignore_weights=True)
    >>> df = all_pairs_overlap(input_graph)

    """
    if input_graph.is_directed():
        raise ValueError("Input must be an undirected Graph.")

    if vertices is not None:
        # Normalize scalar/list inputs to a cudf.Series whose dtype matches
        # the graph's vertex column so the renumber lookup below succeeds.
        if isinstance(vertices, int):
            vertices = [vertices]

        if isinstance(vertices, list):
            vertices = cudf.Series(
                vertices,
                dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype,
            )

        if input_graph.renumbered:
            if isinstance(vertices, cudf.DataFrame):
                vertices = input_graph.lookup_internal_vertex_id(
                    vertices, vertices.columns
                )
            else:
                vertices = input_graph.lookup_internal_vertex_id(vertices)

    first, second, overlap_coeff = pylibcugraph_all_pairs_overlap_coefficients(
        resource_handle=ResourceHandle(),
        graph=input_graph._plc_graph,
        vertices=vertices,
        use_weight=use_weight,
        topk=topk,
        do_expensive_check=False,
    )

    vertex_pair = cudf.DataFrame()
    vertex_pair["first"] = first
    vertex_pair["second"] = second

    if input_graph.renumbered:
        # Map internal vertex IDs back to the caller's external IDs,
        # keeping row order aligned with the coefficient array.
        vertex_pair = input_graph.unrenumber(vertex_pair, "first", preserve_order=True)
        vertex_pair = input_graph.unrenumber(vertex_pair, "second", preserve_order=True)

    df = vertex_pair
    df["overlap_coeff"] = cudf.Series(overlap_coeff)

    return df
def all_pairs_sorensen(
    input_graph: Graph,
    vertices: cudf.Series = None,
    use_weight: bool = False,
    topk: int = None,
):
    """
    Compute the Sorensen coefficient between all pairs of adjacent
    vertices.

    Sorensen coefficient is defined between two sets as the ratio of twice
    the volume of their intersection over the sum of their volumes.  In
    the context of graphs, the neighborhood of a vertex is seen as a set,
    and the Sorensen coefficient weight of each edge represents the
    strength of connection between vertices based on the relative
    similarity of their neighbors.

    cugraph.all_pairs_sorensen, in the absence of specified vertices, will
    compute the two_hop_neighbors of the entire graph to construct a vertex
    pair list and will return the sorensen coefficient for all the vertex
    pairs in the graph.  This is not advisable as the vertex pairs can grow
    exponentially with respect to the size of the dataset.

    If the topk parameter is specified then the result will only contain
    the top k highest scoring results.

    Parameters
    ----------
    input_graph : cugraph.Graph
        cuGraph Graph instance, should contain the connectivity information
        as an edge list. The graph should be undirected where an undirected
        edge is represented by a directed edge in both directions. The
        adjacency list will be computed if not already present.

        This implementation only supports undirected, non-multi Graphs.

    vertices : int or list or cudf.Series or cudf.DataFrame, optional (default=None)
        Vertices for which to compute coefficients. If not provided, the
        sorensen coefficient is computed for all adjacent vertices in the
        graph.

    use_weight : bool, optional (default=False)
        Flag to indicate whether to compute weighted sorensen (if
        use_weight==True) or un-weighted sorensen (if use_weight==False).
        'input_graph' must be weighted if 'use_weight=True'.

    topk : int, optional (default=None)
        Specify the number of answers to return otherwise returns the
        entire solution.

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame containing one row per scored vertex pair.

        df['first'] : cudf.Series
            The first vertex ID of each pair.
        df['second'] : cudf.Series
            The second vertex ID of each pair.
        df['sorensen_coeff'] : cudf.Series
            The computed Sorensen coefficient between the first and the
            second vertex ID.

    Examples
    --------
    >>> from cugraph.datasets import karate
    >>> from cugraph import all_pairs_sorensen
    >>> input_graph = karate.get_graph(download=True, ignore_weights=True)
    >>> df = all_pairs_sorensen(input_graph)

    """
    if input_graph.is_directed():
        raise ValueError("Input must be an undirected Graph.")

    if vertices is not None:
        # Normalize scalar/list inputs to a cudf.Series whose dtype matches
        # the graph's vertex column so the renumber lookup below succeeds.
        if isinstance(vertices, int):
            vertices = [vertices]

        if isinstance(vertices, list):
            vertices = cudf.Series(
                vertices,
                dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype,
            )

        if input_graph.renumbered:
            if isinstance(vertices, cudf.DataFrame):
                vertices = input_graph.lookup_internal_vertex_id(
                    vertices, vertices.columns
                )
            else:
                vertices = input_graph.lookup_internal_vertex_id(vertices)

    first, second, sorensen_coeff = pylibcugraph_all_pairs_sorensen_coefficients(
        resource_handle=ResourceHandle(),
        graph=input_graph._plc_graph,
        vertices=vertices,
        use_weight=use_weight,
        topk=topk,
        do_expensive_check=False,
    )

    vertex_pair = cudf.DataFrame()
    vertex_pair["first"] = first
    vertex_pair["second"] = second

    if input_graph.renumbered:
        # Map internal vertex IDs back to the caller's external IDs,
        # keeping row order aligned with the coefficient array.
        vertex_pair = input_graph.unrenumber(vertex_pair, "first", preserve_order=True)
        vertex_pair = input_graph.unrenumber(vertex_pair, "second", preserve_order=True)

    df = vertex_pair
    df["sorensen_coeff"] = cudf.Series(sorensen_coeff)

    return df
# You may obtain a copy of the License at @@ -35,6 +35,7 @@ def start_dask_client( jit_unspill=False, worker_class=None, device_memory_limit=0.8, + p2p=True, ): """ Creates a new dask client, and possibly also a cluster, and returns them as @@ -95,6 +96,9 @@ def start_dask_client( dask_cuda.LocalCUDACluster for details. This parameter is ignored if the env var SCHEDULER_FILE is set which implies the dask cluster has already been created. + + p2p : bool, optional (default=True) + Initialize UCX endpoints if True. """ dask_scheduler_file = os.environ.get("SCHEDULER_FILE") dask_local_directory = os.getenv("DASK_LOCAL_DIRECTORY") @@ -164,7 +168,7 @@ def start_dask_client( # FIXME: use proper logging, INFO or DEBUG level print("\nDask client/cluster created using LocalCUDACluster") - Comms.initialize(p2p=True) + Comms.initialize(p2p=p2p) return (client, cluster) diff --git a/python/cugraph/cugraph/tests/community/test_k_truss_subgraph.py b/python/cugraph/cugraph/tests/community/test_k_truss_subgraph.py index 063d7fc735f..bbd2866b5df 100644 --- a/python/cugraph/cugraph/tests/community/test_k_truss_subgraph.py +++ b/python/cugraph/cugraph/tests/community/test_k_truss_subgraph.py @@ -20,7 +20,6 @@ import cugraph from cugraph.testing import utils from cugraph.datasets import polbooks, karate_asymmetric -from numba import cuda # ============================================================================= @@ -67,32 +66,7 @@ def compare_k_truss(k_truss_cugraph, k, ground_truth_file): return True -__cuda_version = cuda.runtime.get_version() -__unsupported_cuda_version = (11, 4) - - -# FIXME: remove when ktruss is supported on CUDA 11.4 -@pytest.mark.sg -def test_unsupported_cuda_version(): - """ - Ensures the proper exception is raised when ktruss is called in an - unsupported env, and not when called in a supported env. 
- """ - k = 5 - - G = polbooks.get_graph(download=True) - if __cuda_version == __unsupported_cuda_version: - with pytest.raises(NotImplementedError): - cugraph.k_truss(G, k) - else: - cugraph.k_truss(G, k) - - @pytest.mark.sg -@pytest.mark.skipif( - (__cuda_version == __unsupported_cuda_version), - reason="skipping on unsupported CUDA " f"{__unsupported_cuda_version} environment.", -) @pytest.mark.parametrize("_, nx_ground_truth", utils.DATASETS_KTRUSS) def test_ktruss_subgraph_Graph(_, nx_ground_truth): @@ -104,10 +78,6 @@ def test_ktruss_subgraph_Graph(_, nx_ground_truth): @pytest.mark.sg -@pytest.mark.skipif( - (__cuda_version == __unsupported_cuda_version), - reason="skipping on unsupported CUDA " f"{__unsupported_cuda_version} environment.", -) def test_ktruss_subgraph_Graph_nx(): k = 5 dataset_path = polbooks.get_path() @@ -122,10 +92,6 @@ def test_ktruss_subgraph_Graph_nx(): @pytest.mark.sg -@pytest.mark.skipif( - (__cuda_version == __unsupported_cuda_version), - reason="skipping on unsupported CUDA " f"{__unsupported_cuda_version} environment.", -) def test_ktruss_subgraph_directed_Graph(): k = 5 edgevals = True diff --git a/python/cugraph/cugraph/tests/community/test_k_truss_subgraph_mg.py b/python/cugraph/cugraph/tests/community/test_k_truss_subgraph_mg.py new file mode 100644 index 00000000000..12e5146c2de --- /dev/null +++ b/python/cugraph/cugraph/tests/community/test_k_truss_subgraph_mg.py @@ -0,0 +1,105 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.

import gc

import pytest

import cugraph
import cugraph.dask as dcg
from cudf.testing.testing import assert_frame_equal
from cugraph.datasets import karate, dolphins, netscience


# =============================================================================
# Pytest Setup / Teardown - called for each test function
# =============================================================================


def setup_function():
    # Reclaim GPU/host memory between parameterized runs.
    gc.collect()


# =============================================================================
# Parameters
# =============================================================================


DATASETS = [karate, dolphins, netscience]
IS_DIRECTED = [True, False]
K_VALUE = [4, 6, 8]


# =============================================================================
# Helper functions
# =============================================================================


def get_sg_graph(dataset, directed):
    # Single-GPU reference graph used to validate the MG result.
    G = dataset.get_graph(create_using=cugraph.Graph(directed=directed))

    return G


def get_mg_graph(dataset, directed):
    # Multi-GPU graph built from the dataset's dask edge list.
    ddf = dataset.get_dask_edgelist()
    dg = cugraph.Graph(directed=directed)
    dg.from_dask_cudf_edgelist(
        ddf,
        source="src",
        destination="dst",
        edge_attr="wgt",
        renumber=True,
        store_transposed=True,
    )

    return dg


# =============================================================================
# Tests
# =============================================================================


@pytest.mark.mg
@pytest.mark.parametrize("dataset", DATASETS)
@pytest.mark.parametrize("is_directed", IS_DIRECTED)
@pytest.mark.parametrize("k", K_VALUE)
def test_mg_ktruss_subgraph(dask_client, benchmark, dataset, is_directed, k):
    # Create SG and MG Graphs
    g = get_sg_graph(dataset, is_directed)
    dg = get_mg_graph(dataset, is_directed)

    if is_directed:
        # K-Truss only supports undirected graphs: the MG call must raise.
        # Return immediately afterwards — the exception is swallowed by
        # pytest.raises, so no result variable is bound and the comparison
        # code below would otherwise hit an UnboundLocalError.
        with pytest.raises(ValueError):
            benchmark(dcg.ktruss_subgraph, dg, k)
        return

    sg_ktruss_subgraph = cugraph.ktruss_subgraph(g, k=k)
    result_ktruss_subgraph = benchmark(dcg.ktruss_subgraph, dg, k)

    mg_df = result_ktruss_subgraph

    if len(mg_df) != 0 and len(sg_ktruss_subgraph.input_df) != 0:
        # FIXME: 'edges()' or 'view_edgelist()' takes half the edges out if
        # 'directed=False'.
        sg_result = sg_ktruss_subgraph.input_df

        sg_df = sg_result.sort_values(["src", "dst"]).reset_index(drop=True)
        mg_df = mg_df.compute().sort_values(["src", "dst"]).reset_index(drop=True)

        assert_frame_equal(sg_df, mg_df, check_dtype=False, check_like=True)

    else:
        # There is no edge left when extracting the K-Truss
        assert len(sg_ktruss_subgraph.input_df) == 0
        assert len(mg_df) == 0
== count_int64 @pytest.mark.sg def test_triangles_no_weights(input_combo): G_weighted = input_combo["Gnx"] - count_legacy = ( - cugraph.triangle_count(G_weighted) - .sort_values("vertex") - .reset_index(drop=True) - .rename(columns={"counts": "exp_cugraph_counts"}) - ) + count_triangles_nx_graph = cugraph.triangle_count(G_weighted)["counts"].sum() graph_file = input_combo["graph_file"] G = graph_file.get_graph(ignore_weights=True) assert G.is_weighted() is False - triangle_count = ( - cugraph.triangle_count(G) - .sort_values("vertex") - .reset_index(drop=True) - .rename(columns={"counts": "exp_cugraph_counts"}) - ) - cugraph_exp_triangle_results = triangle_count["exp_cugraph_counts"].sum() - assert cugraph_exp_triangle_results == count_legacy + count_triangles = cugraph.triangle_count(G)["counts"].sum() + + assert count_triangles_nx_graph == count_triangles @pytest.mark.sg diff --git a/python/cugraph/cugraph/tests/conftest.py b/python/cugraph/cugraph/tests/conftest.py index cb5755128eb..d31c2968afe 100644 --- a/python/cugraph/cugraph/tests/conftest.py +++ b/python/cugraph/cugraph/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -52,6 +52,21 @@ def dask_client(): stop_dask_client(dask_client, dask_cluster) +# FIXME: Add tests leveraging this fixture +@pytest.fixture(scope="module") +def dask_client_non_p2p(): + # start_dask_client will check for the SCHEDULER_FILE and + # DASK_WORKER_DEVICES env vars and use them when creating a client if + # set. start_dask_client will also initialize the Comms singleton. 
+ dask_client, dask_cluster = start_dask_client( + worker_class=IncreasedCloseTimeoutNanny, p2p=False + ) + + yield dask_client + + stop_dask_client(dask_client, dask_cluster) + + @pytest.fixture(scope="module") def scratch_dir(): # This should always be set if doing MG testing, since temporary diff --git a/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py b/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py index 1892e8a85a6..30336490312 100644 --- a/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py +++ b/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,6 +13,7 @@ import pytest import numpy as np +import os from cugraph.gnn import FeatureStore @@ -21,18 +22,23 @@ pylibwholegraph = import_optional("pylibwholegraph") wmb = import_optional("pylibwholegraph.binding.wholememory_binding") torch = import_optional("torch") +wgth = import_optional("pylibwholegraph.torch") -def runtest(world_rank: int, world_size: int): - from pylibwholegraph.torch.initialize import init_torch_env_and_create_wm_comm +def runtest(rank: int, world_size: int): + torch.cuda.set_device(rank) - wm_comm, _ = init_torch_env_and_create_wm_comm( - world_rank, + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size) + + pylibwholegraph.torch.initialize.init( + rank, world_size, - world_rank, + rank, world_size, ) - wm_comm = wm_comm.wmb_comm + wm_comm = wgth.get_global_communicator() generator = np.random.default_rng(62) arr = ( @@ -52,7 +58,7 @@ def runtest(world_rank: int, world_size: int): expected = 
arr[indices_to_fetch] np.testing.assert_array_equal(output_fs.cpu().numpy(), expected) - wmb.finalize() + pylibwholegraph.torch.initialize.finalize() @pytest.mark.sg @@ -61,13 +67,13 @@ def runtest(world_rank: int, world_size: int): isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" ) def test_feature_storage_wholegraph_backend(): - from pylibwholegraph.utils.multiprocess import multiprocess_run + world_size = torch.cuda.device_count() + print("gpu count:", world_size) + assert world_size > 0 - gpu_count = wmb.fork_get_gpu_count() - print("gpu count:", gpu_count) - assert gpu_count > 0 + print("ignoring gpu count and running on 1 GPU only") - multiprocess_run(1, runtest) + torch.multiprocessing.spawn(runtest, args=(1,), nprocs=1) @pytest.mark.mg @@ -76,10 +82,8 @@ def test_feature_storage_wholegraph_backend(): isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" ) def test_feature_storage_wholegraph_backend_mg(): - from pylibwholegraph.utils.multiprocess import multiprocess_run - - gpu_count = wmb.fork_get_gpu_count() - print("gpu count:", gpu_count) - assert gpu_count > 0 + world_size = torch.cuda.device_count() + print("gpu count:", world_size) + assert world_size > 0 - multiprocess_run(gpu_count, runtest) + torch.multiprocessing.spawn(runtest, args=(world_size,), nprocs=world_size) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py new file mode 100644 index 00000000000..f85508cb089 --- /dev/null +++ b/python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py @@ -0,0 +1,292 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random + +import pytest + +import dask_cudf +import cugraph +import cugraph.dask as dcg +from cugraph.testing import utils +from pylibcugraph.testing import gen_fixture_params_product + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= + + +def setup_function(): + gc.collect() + + +IS_DIRECTED = [False] +HAS_VERTEX_PAIR = [False, True] +HAS_VERTICES = [False, True] +HAS_TOPK = [False, True] +IS_WEIGHTED = [False, True] + + +# ============================================================================= +# Pytest fixtures +# ============================================================================= + +datasets = utils.DATASETS_UNDIRECTED + [ + utils.RAPIDS_DATASET_ROOT_DIR_PATH / "email-Eu-core.csv" +] + +fixture_params = gen_fixture_params_product( + (datasets, "graph_file"), + (IS_DIRECTED, "directed"), + (HAS_VERTEX_PAIR, "has_vertex_pair"), + (HAS_VERTICES, "has_vertices"), + (HAS_TOPK, "has_topk"), + (IS_WEIGHTED, "is_weighted"), +) + + +@pytest.fixture(scope="module", params=fixture_params) +def input_combo(request): + """ + Simply return the current combination of params as a dictionary for use in + tests or other parameterized fixtures. 
+ """ + parameters = dict( + zip( + ( + "graph_file", + "directed", + "has_vertex_pair", + "has_vertices", + "has_topk", + "is_weighted", + ), + request.param, + ) + ) + + return parameters + + +@pytest.fixture(scope="module") +def input_expected_output(input_combo): + """ + This fixture returns the inputs and expected results from the Cosine algo. + (based on cuGraph Cosine) which can be used for validation. + """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertex_pair = input_combo["has_vertex_pair"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + if has_vertex_pair: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + seeds = random.sample(range(G.number_of_vertices()), k) + + vertex_pair = G.get_two_hop_neighbors(start_vertices=seeds) + else: + vertex_pair = None + + input_combo["vertex_pair"] = vertex_pair + sg_cugraph_cosine = cugraph.cosine( + G, input_combo["vertex_pair"], use_weight=is_weighted + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. 
+ + input_combo["sg_cugraph_results"] = sg_cugraph_cosine + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + +@pytest.fixture(scope="module") +def input_expected_output_all_pairs(input_combo): + """ + This fixture returns the inputs and expected results from the Cosine algo. + (based on cuGraph Cosine) which can be used for validation. + """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertices = input_combo["has_vertices"] + has_topk = input_combo["has_topk"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + if has_vertices: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + vertices = random.sample(range(G.number_of_vertices()), k) + + else: + vertices = None + + if has_topk: + topk = 5 + else: + topk = None + + input_combo["vertices"] = vertices + print("vertices ", vertices, " is_weighted = ", is_weighted) + input_combo["topk"] = topk + sg_cugraph_all_pairs_cosine = cugraph.all_pairs_cosine( + G, + vertices=input_combo["vertices"], + topk=input_combo["topk"], + use_weight=is_weighted, + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. 
+ + input_combo["sg_cugraph_results"] = sg_cugraph_all_pairs_cosine + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + +# ============================================================================= +# Tests +# ============================================================================= + + +@pytest.mark.mg +def test_dask_mg_cosine(dask_client, benchmark, input_expected_output): + + dg = input_expected_output["MGGraph"] + use_weight = input_expected_output["is_weighted"] + + result_cosine = benchmark( + dcg.cosine, dg, input_expected_output["vertex_pair"], use_weight=use_weight + ) + + result_cosine = ( + result_cosine.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"cosine_coeff": "mg_cugraph_cosine_coeff"}) + ) + + expected_output = ( + input_expected_output["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph Cosine results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_cosine["sg_cugraph_cosine_coeff"] = expected_output["cosine_coeff"] + + cosine_coeff_diffs1 = result_cosine.query( + "mg_cugraph_cosine_coeff - sg_cugraph_cosine_coeff > 0.00001" + ) + cosine_coeff_diffs2 = result_cosine.query( + "mg_cugraph_cosine_coeff - sg_cugraph_cosine_coeff < -0.00001" + ) + + assert len(cosine_coeff_diffs1) == 0 + assert len(cosine_coeff_diffs2) == 0 + + +@pytest.mark.mg +def test_dask_mg_all_pairs_cosine( + dask_client, benchmark, input_expected_output_all_pairs +): + + dg = input_expected_output_all_pairs["MGGraph"] + + use_weight = input_expected_output_all_pairs["is_weighted"] + + result_cosine = benchmark( + dcg.all_pairs_cosine, + dg, + vertices=input_expected_output_all_pairs["vertices"], + topk=input_expected_output_all_pairs["topk"], + use_weight=use_weight, + ) + + result_cosine = ( + result_cosine.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"cosine_coeff": "mg_cugraph_cosine_coeff"}) + ) + + expected_output = ( + input_expected_output_all_pairs["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph Cosine results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_cosine["sg_cugraph_cosine_coeff"] = expected_output["cosine_coeff"] + + cosine_coeff_diffs1 = result_cosine.query( + "mg_cugraph_cosine_coeff - sg_cugraph_cosine_coeff > 0.00001" + ) + cosine_coeff_diffs2 = result_cosine.query( + "mg_cugraph_cosine_coeff - sg_cugraph_cosine_coeff < -0.00001" + ) + + assert len(cosine_coeff_diffs1) == 0 + assert len(cosine_coeff_diffs2) == 0 diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index 3691ad5a8c9..34ee72e799b 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -22,7 +22,7 @@ import cugraph from cugraph.datasets import netscience from cugraph.testing import utils, UNDIRECTED_DATASETS -from cudf.testing import assert_series_equal +from cudf.testing import assert_series_equal, assert_frame_equal SRC_COL = "0" DST_COL = "1" @@ -341,3 +341,90 @@ def test_weighted_jaccard(): G = karate.get_graph(ignore_weights=True) with pytest.raises(ValueError): cugraph.jaccard(G, use_weight=True) + + +@pytest.mark.sg +def test_all_pairs_jaccard(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Jaccard + jaccard_results = cugraph.jaccard(G) + + # Remove self loop + jaccard_results = jaccard_results[ + jaccard_results["first"] != jaccard_results["second"] + ].reset_index(drop=True) + + all_pairs_jaccard_results = cugraph.all_pairs_jaccard(G) + + assert_frame_equal( + jaccard_results.head(), + all_pairs_jaccard_results.head(), + check_dtype=False, + check_like=True, + ) + + +# FIXME +@pytest.mark.sg +@pytest.mark.skip(reason="Inaccurate results returned by all-pairs similarity") +def test_all_pairs_jaccard_with_vertices(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Jaccard + jaccard_results = cugraph.jaccard(G) + + # Remove self loop + jaccard_results = jaccard_results[ + 
jaccard_results["first"] != jaccard_results["second"] + ].reset_index(drop=True) + + vertices = [0, 1, 2] + + mask_first = jaccard_results["first"].isin(vertices) + mask_second = jaccard_results["second"].isin(vertices) + # mask = [v in vertices for v in (jaccard_results['first'].to_pandas() + # or jaccard_results['second'].to_pandas())] + mask = [f or s for (f, s) in zip(mask_first.to_pandas(), mask_second.to_pandas())] + + jaccard_results = jaccard_results[mask].reset_index(drop=True) + + # Call all-pairs Jaccard + all_pairs_jaccard_results = cugraph.all_pairs_jaccard( + G, vertices=cudf.Series(vertices, dtype="int32") + ) + + assert_frame_equal( + jaccard_results, all_pairs_jaccard_results, check_dtype=False, check_like=True + ) + + +@pytest.mark.sg +def test_all_pairs_jaccard_with_topk(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Jaccard + jaccard_results = cugraph.jaccard(G) + + topk = 4 + + # Remove self loop + jaccard_results = ( + jaccard_results[jaccard_results["first"] != jaccard_results["second"]] + .sort_values(["jaccard_coeff", "first", "second"], ascending=False) + .reset_index(drop=True)[:topk] + ) + + # Call all-pairs Jaccard + all_pairs_jaccard_results = ( + cugraph.all_pairs_jaccard(G, topk=topk) + .sort_values(["first", "second"], ascending=False) + .reset_index(drop=True) + ) + + assert_frame_equal( + jaccard_results, all_pairs_jaccard_results, check_dtype=False, check_like=True + ) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py index 98f64906564..244718ce927 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py @@ -33,8 +33,10 @@ def setup_function(): IS_DIRECTED = [False] -HAS_VERTEX_PAIR = [True, False] -IS_WEIGHTED = [True, False] +HAS_VERTEX_PAIR = [False, True] +HAS_VERTICES = [False, True] +HAS_TOPK = 
[False, True] +IS_WEIGHTED = [False, True] # ============================================================================= @@ -49,6 +51,8 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (HAS_VERTICES, "has_vertices"), + (HAS_TOPK, "has_topk"), (IS_WEIGHTED, "is_weighted"), ) @@ -60,7 +64,17 @@ def input_combo(request): tests or other parameterized fixtures. """ parameters = dict( - zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + zip( + ( + "graph_file", + "directed", + "has_vertex_pair", + "has_vertices", + "has_topk", + "is_weighted", + ), + request.param, + ) ) return parameters @@ -123,6 +137,76 @@ def input_expected_output(input_combo): return input_combo +@pytest.fixture(scope="module") +def input_expected_output_all_pairs(input_combo): + """ + This fixture returns the inputs and expected results from the Jaccard algo. + (based on cuGraph Jaccard) which can be used for validation. 
+ """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertices = input_combo["has_vertices"] + has_topk = input_combo["has_topk"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + + if has_topk: + topk = 5 + else: + topk = None + + if has_vertices: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + vertices = random.sample(range(G.number_of_vertices()), k) + + else: + vertices = None + # If no start_vertices are passed, all_pairs similarity runs OOM + topk = 10 + + input_combo["vertices"] = vertices + input_combo["topk"] = topk + print("vertices ", vertices) + sg_cugraph_all_pairs_jaccard = cugraph.all_pairs_jaccard( + G, + vertices=input_combo["vertices"], + topk=input_combo["topk"], + use_weight=is_weighted, + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. 
+ + input_combo["sg_cugraph_results"] = sg_cugraph_all_pairs_jaccard + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + # ============================================================================= # Tests # ============================================================================= @@ -164,3 +248,48 @@ def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output): assert len(jaccard_coeff_diffs1) == 0 assert len(jaccard_coeff_diffs2) == 0 + + +@pytest.mark.mg +def test_dask_mg_all_pairs_jaccard( + dask_client, benchmark, input_expected_output_all_pairs +): + + dg = input_expected_output_all_pairs["MGGraph"] + + use_weight = input_expected_output_all_pairs["is_weighted"] + + result_jaccard = benchmark( + dcg.all_pairs_jaccard, + dg, + vertices=input_expected_output_all_pairs["vertices"], + topk=input_expected_output_all_pairs["topk"], + use_weight=use_weight, + ) + + result_jaccard = ( + result_jaccard.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"jaccard_coeff": "mg_cugraph_jaccard_coeff"}) + ) + + expected_output = ( + input_expected_output_all_pairs["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph Jaccard results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_jaccard["sg_cugraph_jaccard_coeff"] = expected_output["jaccard_coeff"] + + jaccard_coeff_diffs1 = result_jaccard.query( + "mg_cugraph_jaccard_coeff - sg_cugraph_jaccard_coeff > 0.00001" + ) + jaccard_coeff_diffs2 = result_jaccard.query( + "mg_cugraph_jaccard_coeff - sg_cugraph_jaccard_coeff < -0.00001" + ) + + assert len(jaccard_coeff_diffs1) == 0 + assert len(jaccard_coeff_diffs2) == 0 diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index 4b00330b6c9..f87fe06f691 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -20,7 +20,8 @@ import cudf import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS -from cudf.testing import assert_series_equal +from cudf.testing import assert_series_equal, assert_frame_equal +import pandas as pd SRC_COL = "0" DST_COL = "1" @@ -114,6 +115,50 @@ def cpu_call(M, first, second): return result +def compare(src1, dst1, val1, src2, dst2, val2): + # + # We will do comparison computations by using dataframe + # merge functions (essentially doing fast joins). We + # start by making two data frames + # + df1 = cudf.DataFrame() + df1["src1"] = src1 + df1["dst1"] = dst1 + if val1 is not None: + df1["val1"] = val1 + + df2 = cudf.DataFrame() + df2["src2"] = src2 + df2["dst2"] = dst2 + if val2 is not None: + df2["val2"] = val2 + + # + # Check to see if all pairs in the original data frame + # still exist in the new data frame. If we join (merge) + # the data frames where (src1[i]=src2[i]) and (dst1[i]=dst2[i]) + # then we should get exactly the same number of entries in + # the data frame if we did not lose any data. 
+ # + join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["src2", "dst2"]) + + if len(df1) != len(join): + join2 = df1.merge( + df2, how="left", left_on=["src1", "dst1"], right_on=["src2", "dst2"] + ) + pd.set_option("display.max_rows", 500) + print("df1 = \n", df1.sort_values(["src1", "dst1"])) + print("df2 = \n", df2.sort_values(["src2", "dst2"])) + print( + "join2 = \n", + join2.sort_values(["src1", "dst1"]) + .to_pandas() + .query("src2.isnull()", engine="python"), + ) + + assert len(df1) == len(join) + + # ============================================================================= # Pytest Fixtures # ============================================================================= @@ -242,3 +287,106 @@ def test_weighted_overlap(): G = karate.get_graph(ignore_weights=True) with pytest.raises(ValueError): cugraph.overlap(G, use_weight=True) + + +@pytest.mark.sg +def test_all_pairs_overlap(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Overlap + overlap_results = cugraph.overlap(G) + + # Remove self loop + overlap_results = overlap_results[ + overlap_results["first"] != overlap_results["second"] + ].reset_index(drop=True) + + all_pairs_overlap_results = cugraph.all_pairs_overlap(G) + + assert_frame_equal( + overlap_results.head(), + all_pairs_overlap_results.head(), + check_dtype=False, + check_like=True, + ) + + +# FIXME +@pytest.mark.sg +@pytest.mark.skip(reason="Inaccurate results returned by all-pairs similarity") +def test_all_pairs_overlap_with_vertices(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Overlap + overlap_results = cugraph.overlap(G) + + # Remove self loop + overlap_results = overlap_results[ + overlap_results["first"] != overlap_results["second"] + ].reset_index(drop=True) + + vertices = [0, 1, 2] + + mask_first = overlap_results["first"].isin(vertices) + mask_second = overlap_results["second"].isin(vertices) + # mask = [v in vertices for v in 
(overlap_results['first'].to_pandas() + # or overlap_results['second'].to_pandas())] + mask = [f or s for (f, s) in zip(mask_first.to_pandas(), mask_second.to_pandas())] + + overlap_results = overlap_results[mask].reset_index(drop=True) + + # Call all-pairs Overlap + all_pairs_overlap_results = cugraph.all_pairs_overlap( + G, vertices=cudf.Series(vertices, dtype="int32") + ) + + assert_frame_equal( + overlap_results, all_pairs_overlap_results, check_dtype=False, check_like=True + ) + + +@pytest.mark.sg +def test_all_pairs_overlap_with_topk(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Overlap + overlap_results = cugraph.overlap(G) + + topk = 10 + + # Remove self loop + overlap_results = ( + overlap_results[overlap_results["first"] != overlap_results["second"]] + .sort_values(["overlap_coeff", "first", "second"], ascending=False) + .reset_index(drop=True) # [:topk] + ) + print("overlap_results = \n", overlap_results) + + # Call all-pairs overlap + all_pairs_overlap_results = ( + cugraph.all_pairs_overlap(G, topk=topk) + .sort_values(["first", "second"], ascending=False) + .reset_index(drop=True) + ) + + # 1. All pair similarity might return different top pairs k pairs + # which are still valid hence, ensure the pairs returned by all-pairs + # exists. + + compare( + all_pairs_overlap_results["first"], + all_pairs_overlap_results["second"], + all_pairs_overlap_results["overlap_coeff"], + overlap_results["first"], + overlap_results["second"], + overlap_results["overlap_coeff"], + ) + + # 2. 
Ensure the coefficient scores are still the highest + assert_series_equal( + all_pairs_overlap_results["overlap_coeff"], + overlap_results["overlap_coeff"][:topk], + ) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py index 9afe7dd842f..aa238f6a6de 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py @@ -33,8 +33,10 @@ def setup_function(): IS_DIRECTED = [False] -HAS_VERTEX_PAIR = [True, False] -IS_WEIGHTED = [True, False] +HAS_VERTEX_PAIR = [False, True] +HAS_VERTICES = [False, True] +HAS_TOPK = [False, True] +IS_WEIGHTED = [False, True] # ============================================================================= @@ -49,6 +51,8 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (HAS_VERTICES, "has_vertices"), + (HAS_TOPK, "has_topk"), (IS_WEIGHTED, "is_weighted"), ) @@ -60,7 +64,17 @@ def input_combo(request): tests or other parameterized fixtures. """ parameters = dict( - zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + zip( + ( + "graph_file", + "directed", + "has_vertex_pair", + "has_vertices", + "has_topk", + "is_weighted", + ), + request.param, + ) ) return parameters @@ -123,6 +137,76 @@ def input_expected_output(input_combo): return input_combo +@pytest.fixture(scope="module") +def input_expected_output_all_pairs(input_combo): + """ + This fixture returns the inputs and expected results from the Overlap algo. + (based on cuGraph Overlap) which can be used for validation. 
+ """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertices = input_combo["has_vertices"] + has_topk = input_combo["has_topk"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + + if has_topk: + topk = 5 + else: + topk = None + + if has_vertices: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + vertices = random.sample(range(G.number_of_vertices()), k) + + else: + vertices = None + # If no start_vertices are passed, all_pairs similarity runs OOM + topk = 10 + + input_combo["vertices"] = vertices + input_combo["topk"] = topk + print("vertices ", vertices) + sg_cugraph_all_pairs_overlap = cugraph.all_pairs_overlap( + G, + vertices=input_combo["vertices"], + topk=input_combo["topk"], + use_weight=is_weighted, + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. 
+ + input_combo["sg_cugraph_results"] = sg_cugraph_all_pairs_overlap + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + # ============================================================================= # Tests # ============================================================================= @@ -167,3 +251,48 @@ def test_dask_mg_overlap(dask_client, benchmark, input_expected_output): assert len(overlap_coeff_diffs1) == 0 assert len(overlap_coeff_diffs2) == 0 + + +@pytest.mark.mg +def test_dask_mg_all_pairs_overlap( + dask_client, benchmark, input_expected_output_all_pairs +): + + dg = input_expected_output_all_pairs["MGGraph"] + + use_weight = input_expected_output_all_pairs["is_weighted"] + + result_overlap = benchmark( + dcg.all_pairs_overlap, + dg, + vertices=input_expected_output_all_pairs["vertices"], + topk=input_expected_output_all_pairs["topk"], + use_weight=use_weight, + ) + + result_overlap = ( + result_overlap.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"overlap_coeff": "mg_cugraph_overlap_coeff"}) + ) + + expected_output = ( + input_expected_output_all_pairs["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph Overlap results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_overlap["sg_cugraph_overlap_coeff"] = expected_output["overlap_coeff"] + + overlap_coeff_diffs1 = result_overlap.query( + "mg_cugraph_overlap_coeff - sg_cugraph_overlap_coeff > 0.00001" + ) + overlap_coeff_diffs2 = result_overlap.query( + "mg_cugraph_overlap_coeff - sg_cugraph_overlap_coeff < -0.00001" + ) + + assert len(overlap_coeff_diffs1) == 0 + assert len(overlap_coeff_diffs2) == 0 diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 6345187a376..4c30f149ea5 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -15,12 +15,13 @@ import pytest import networkx as nx +import pandas as pd import cudf import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS from cugraph.datasets import netscience -from cudf.testing import assert_series_equal +from cudf.testing import assert_series_equal, assert_frame_equal SRC_COL = "0" DST_COL = "1" @@ -156,6 +157,50 @@ def networkx_call(M, benchmark_callable=None): return src, dst, coeff +def compare(src1, dst1, val1, src2, dst2, val2): + # + # We will do comparison computations by using dataframe + # merge functions (essentially doing fast joins). We + # start by making two data frames + # + df1 = cudf.DataFrame() + df1["src1"] = src1 + df1["dst1"] = dst1 + if val1 is not None: + df1["val1"] = val1 + + df2 = cudf.DataFrame() + df2["src2"] = src2 + df2["dst2"] = dst2 + if val2 is not None: + df2["val2"] = val2 + + # + # Check to see if all pairs in the original data frame + # still exist in the new data frame. If we join (merge) + # the data frames where (src1[i]=src2[i]) and (dst1[i]=dst2[i]) + # then we should get exactly the same number of entries in + # the data frame if we did not lose any data. 
+ # + join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["src2", "dst2"]) + + if len(df1) != len(join): + join2 = df1.merge( + df2, how="left", left_on=["src1", "dst1"], right_on=["src2", "dst2"] + ) + pd.set_option("display.max_rows", 500) + print("df1 = \n", df1.sort_values(["src1", "dst1"])) + print("df2 = \n", df2.sort_values(["src2", "dst2"])) + print( + "join2 = \n", + join2.sort_values(["src1", "dst1"]) + .to_pandas() + .query("src2.isnull()", engine="python"), + ) + + assert len(df1) == len(join) + + # ============================================================================= # Pytest Fixtures # ============================================================================= @@ -337,3 +382,105 @@ def test_weighted_sorensen(): G = karate.get_graph(ignore_weights=True) with pytest.raises(ValueError): cugraph.sorensen(G, use_weight=True) + + +@pytest.mark.sg +def test_all_pairs_sorensen(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Sorensen + sorensen_results = cugraph.sorensen(G) + + # Remove self loop + sorensen_results = sorensen_results[ + sorensen_results["first"] != sorensen_results["second"] + ].reset_index(drop=True) + + all_pairs_sorensen_results = cugraph.all_pairs_sorensen(G) + + assert_frame_equal( + sorensen_results.head(), + all_pairs_sorensen_results.head(), + check_dtype=False, + check_like=True, + ) + + +# FIXME +@pytest.mark.sg +@pytest.mark.skip(reason="Inaccurate results returned by all-pairs similarity") +def test_all_pairs_sorensen_with_vertices(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Sorensen + sorensen_results = cugraph.sorensen(G) + + # Remove self loop + sorensen_results = sorensen_results[ + sorensen_results["first"] != sorensen_results["second"] + ].reset_index(drop=True) + + vertices = [0, 1, 2] + + mask_first = sorensen_results["first"].isin(vertices) + mask_second = sorensen_results["second"].isin(vertices) + # mask = [v in 
vertices for v in (sorensen_results['first'].to_pandas() + # or sorensen_results['second'].to_pandas())] + mask = [f or s for (f, s) in zip(mask_first.to_pandas(), mask_second.to_pandas())] + + sorensen_results = sorensen_results[mask].reset_index(drop=True) + + # Call all-pairs Sorensen + all_pairs_sorensen_results = cugraph.all_pairs_sorensen( + G, vertices=cudf.Series(vertices, dtype="int32") + ) + + assert_frame_equal( + sorensen_results, all_pairs_sorensen_results, check_dtype=False, check_like=True + ) + + +@pytest.mark.sg +def test_all_pairs_sorensen_with_topk(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Sorensen + sorensen_results = cugraph.sorensen(G) + + topk = 4 + + # Remove self loop + sorensen_results = ( + sorensen_results[sorensen_results["first"] != sorensen_results["second"]] + .sort_values(["sorensen_coeff", "first", "second"], ascending=False) + .reset_index(drop=True)[:topk] + ) + + # Call all-pairs sorensen + all_pairs_sorensen_results = ( + cugraph.all_pairs_sorensen(G, topk=topk) + .sort_values(["first", "second"], ascending=False) + .reset_index(drop=True) + ) + + # 1. All pair similarity might return different top pairs k pairs + # which are still valid hence, ensure the pairs returned by all-pairs + # exists. + + compare( + all_pairs_sorensen_results["first"], + all_pairs_sorensen_results["second"], + all_pairs_sorensen_results["sorensen_coeff"], + sorensen_results["first"], + sorensen_results["second"], + sorensen_results["sorensen_coeff"], + ) + + # 2. 
Ensure the coefficient scores are still the highest + assert_series_equal( + all_pairs_sorensen_results["sorensen_coeff"], + sorensen_results["sorensen_coeff"][:topk], + ) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py index 6c24fa5af13..e41daa64fb8 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py @@ -34,8 +34,10 @@ def setup_function(): IS_DIRECTED = [False] -HAS_VERTEX_PAIR = [True, False] -IS_WEIGHTED = [True, False] +HAS_VERTEX_PAIR = [False, True] +HAS_VERTICES = [False, True] +HAS_TOPK = [False, True] +IS_WEIGHTED = [False, True] # ============================================================================= @@ -50,6 +52,8 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (HAS_VERTICES, "has_vertices"), + (HAS_TOPK, "has_topk"), (IS_WEIGHTED, "is_weighted"), ) @@ -61,7 +65,17 @@ def input_combo(request): tests or other parameterized fixtures. """ parameters = dict( - zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + zip( + ( + "graph_file", + "directed", + "has_vertex_pair", + "has_vertices", + "has_topk", + "is_weighted", + ), + request.param, + ) ) return parameters @@ -124,6 +138,76 @@ def input_expected_output(input_combo): return input_combo +@pytest.fixture(scope="module") +def input_expected_output_all_pairs(input_combo): + """ + This fixture returns the inputs and expected results from the Sorensen algo. + (based on cuGraph Sorensen) which can be used for validation. 
+ """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertices = input_combo["has_vertices"] + has_topk = input_combo["has_topk"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + + if has_topk: + topk = 5 + else: + topk = None + + if has_vertices: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + vertices = random.sample(range(G.number_of_vertices()), k) + + else: + vertices = None + # If no start_vertices are passed, all_pairs similarity runs OOM + topk = 10 + + input_combo["vertices"] = vertices + print("vertices ", vertices, " is_weighted = ", is_weighted) + input_combo["topk"] = topk + sg_cugraph_all_pairs_sorensen = cugraph.all_pairs_sorensen( + G, + vertices=input_combo["vertices"], + topk=input_combo["topk"], + use_weight=is_weighted, + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. 
+ + input_combo["sg_cugraph_results"] = sg_cugraph_all_pairs_sorensen + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + # ============================================================================= # Tests # ============================================================================= @@ -166,3 +250,48 @@ def test_dask_mg_sorensen(dask_client, benchmark, input_expected_output): assert len(sorensen_coeff_diffs1) == 0 assert len(sorensen_coeff_diffs2) == 0 + + +@pytest.mark.mg +def test_dask_mg_all_pairs_sorensen( + dask_client, benchmark, input_expected_output_all_pairs +): + + dg = input_expected_output_all_pairs["MGGraph"] + + use_weight = input_expected_output_all_pairs["is_weighted"] + + result_sorensen = benchmark( + dcg.all_pairs_sorensen, + dg, + vertices=input_expected_output_all_pairs["vertices"], + topk=input_expected_output_all_pairs["topk"], + use_weight=use_weight, + ) + + result_sorensen = ( + result_sorensen.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"sorensen_coeff": "mg_cugraph_sorensen_coeff"}) + ) + + expected_output = ( + input_expected_output_all_pairs["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph sorensen results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_sorensen["sg_cugraph_sorensen_coeff"] = expected_output["sorensen_coeff"] + + sorensen_coeff_diffs1 = result_sorensen.query( + "mg_cugraph_sorensen_coeff - sg_cugraph_sorensen_coeff > 0.00001" + ) + sorensen_coeff_diffs2 = result_sorensen.query( + "mg_cugraph_sorensen_coeff - sg_cugraph_sorensen_coeff < -0.00001" + ) + + assert len(sorensen_coeff_diffs1) == 0 + assert len(sorensen_coeff_diffs2) == 0 diff --git a/python/cugraph/cugraph/tests/traversal/test_paths.py b/python/cugraph/cugraph/tests/traversal/test_paths.py index 5ee22874f4a..4ef10da593c 100644 --- a/python/cugraph/cugraph/tests/traversal/test_paths.py +++ b/python/cugraph/cugraph/tests/traversal/test_paths.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -22,6 +22,7 @@ import cupy import cugraph from cugraph.testing import get_resultset, load_resultset +from cudf.testing.testing import assert_series_equal from cupyx.scipy.sparse import coo_matrix as cupy_coo_matrix @@ -204,7 +205,11 @@ def test_shortest_path_length_no_path(graphs): def test_shortest_path_length_no_target(graphs, load_traversal_results): cugraph_G, cupy_df = graphs - cugraph_path_1_to_all = cugraph.shortest_path_length(cugraph_G, 1) + cugraph_path_1_to_all = ( + cugraph.shortest_path_length(cugraph_G, 1) + .sort_values("vertex") + .reset_index(drop=True) + ) golden_path_1_to_all = get_resultset( resultset_name="traversal", algo="shortest_path_length", @@ -217,7 +222,12 @@ def test_shortest_path_length_no_target(graphs, load_traversal_results): # Cast networkx graph on cugraph vertex column type from str to int. 
# SSSP preserves vertex type, convert for comparison - assert cugraph_path_1_to_all == cupy_path_1_to_all + assert_series_equal( + cugraph_path_1_to_all["distance"], + cupy_path_1_to_all["distance"], + check_names=False, + check_dtype=False, + ) # results for vertex 8 and 9 are not returned assert cugraph_path_1_to_all.shape[0] == len(golden_path_1_to_all) + 2 diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml index 218868da000..bbb89b03697 100644 --- a/python/cugraph/pyproject.toml +++ b/python/cugraph/pyproject.toml @@ -23,18 +23,18 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", - "dask-cuda==24.8.*,>=0.0.0a0", - "dask-cudf==24.8.*,>=0.0.0a0", + "dask-cuda==24.10.*,>=0.0.0a0", + "dask-cudf==24.10.*,>=0.0.0a0", "fsspec[http]>=0.6.0", "numba>=0.57", "numpy>=1.23,<2.0a0", - "pylibcugraph==24.8.*,>=0.0.0a0", - "raft-dask==24.8.*,>=0.0.0a0", - "rapids-dask-dependency==24.8.*,>=0.0.0a0", - "rmm==24.8.*,>=0.0.0a0", - "ucx-py==0.39.*,>=0.0.0a0", + "pylibcugraph==24.10.*,>=0.0.0a0", + "raft-dask==24.10.*,>=0.0.0a0", + "rapids-dask-dependency==24.10.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", + "ucx-py==0.40.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", @@ -49,7 +49,7 @@ test = [ "networkx>=2.5.1", "numpy>=1.23,<2.0a0", "pandas", - "pylibwholegraph==24.8.*,>=0.0.0a0", + "pylibwholegraph==24.10.*,>=0.0.0a0", "pytest", "pytest-benchmark", "pytest-cov", @@ -81,8 +81,9 @@ build-backend = "scikit_build_core.build" requires = [ "cmake>=3.26.4,!=3.30.0", "ninja", - "pylibcugraph==24.8.*,>=0.0.0a0", - "pylibraft==24.8.*,>=0.0.0a0", - "rmm==24.8.*,>=0.0.0a0", + "pylibcugraph==24.10.*,>=0.0.0a0", + "pylibraft==24.10.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" diff --git a/python/nx-cugraph/.flake8 b/python/nx-cugraph/.flake8 index c5874e54f7e..cdda8d1080f 100644 --- a/python/nx-cugraph/.flake8 +++ b/python/nx-cugraph/.flake8 @@ -1,9 +1,10 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. [flake8] max-line-length = 88 inline-quotes = " extend-ignore = + B020, E203, SIM105, SIM401, diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md index 27825585c28..458421e2b6e 100644 --- a/python/nx-cugraph/README.md +++ b/python/nx-cugraph/README.md @@ -173,11 +173,19 @@ Below is the list of algorithms that are currently supported in nx-cugraph. 
└─ weighted ├─ all_pairs_bellman_ford_path ├─ all_pairs_bellman_ford_path_length + ├─ all_pairs_dijkstra + ├─ all_pairs_dijkstra_path + ├─ all_pairs_dijkstra_path_length ├─ bellman_ford_path ├─ bellman_ford_path_length + ├─ dijkstra_path + ├─ dijkstra_path_length ├─ single_source_bellman_ford ├─ single_source_bellman_ford_path - └─ single_source_bellman_ford_path_length + ├─ single_source_bellman_ford_path_length + ├─ single_source_dijkstra + ├─ single_source_dijkstra_path + └─ single_source_dijkstra_path_length traversal └─ breadth_first_search ├─ bfs_edges @@ -253,9 +261,15 @@ Below is the list of algorithms that are currently supported in nx-cugraph. classes └─ function └─ is_negatively_weighted +convert + ├─ from_dict_of_lists + └─ to_dict_of_lists convert_matrix ├─ from_pandas_edgelist └─ from_scipy_sparse_array +relabel + ├─ convert_node_labels_to_integers + └─ relabel_nodes To request nx-cugraph backend support for a NetworkX API that is not listed diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py index f57b90eb402..f58a6e2293b 100644 --- a/python/nx-cugraph/_nx_cugraph/__init__.py +++ b/python/nx-cugraph/_nx_cugraph/__init__.py @@ -42,6 +42,9 @@ # BEGIN: functions "all_pairs_bellman_ford_path", "all_pairs_bellman_ford_path_length", + "all_pairs_dijkstra", + "all_pairs_dijkstra_path", + "all_pairs_dijkstra_path_length", "all_pairs_shortest_path", "all_pairs_shortest_path_length", "ancestors", @@ -66,6 +69,7 @@ "complete_graph", "complete_multipartite_graph", "connected_components", + "convert_node_labels_to_integers", "core_number", "cubical_graph", "cycle_graph", @@ -75,12 +79,15 @@ "descendants", "descendants_at_distance", "diamond_graph", + "dijkstra_path", + "dijkstra_path_length", "dodecahedral_graph", "edge_betweenness_centrality", "ego_graph", "eigenvector_centrality", "empty_graph", "florentine_families_graph", + "from_dict_of_lists", "from_pandas_edgelist", "from_scipy_sparse_array", "frucht_graph", 
@@ -124,6 +131,7 @@ "path_graph", "petersen_graph", "reciprocity", + "relabel_nodes", "reverse", "sedgewick_maze_graph", "shortest_path", @@ -131,6 +139,9 @@ "single_source_bellman_ford", "single_source_bellman_ford_path", "single_source_bellman_ford_path_length", + "single_source_dijkstra", + "single_source_dijkstra_path", + "single_source_dijkstra_path_length", "single_source_shortest_path", "single_source_shortest_path_length", "single_target_shortest_path", @@ -138,6 +149,7 @@ "star_graph", "tadpole_graph", "tetrahedral_graph", + "to_dict_of_lists", "transitivity", "triangles", "trivial_graph", @@ -171,8 +183,8 @@ "katz_centrality": "`nstart` isn't used (but is checked), and `normalized=False` is not supported.", "louvain_communities": "`seed` parameter is currently ignored, and self-loops are not yet supported.", "pagerank": "`dangling` parameter is not supported, but it is checked for validity.", - "shortest_path": "Negative weights are not yet supported, and method is ununsed.", - "shortest_path_length": "Negative weights are not yet supported, and method is ununsed.", + "shortest_path": "Negative weights are not yet supported.", + "shortest_path_length": "Negative weights are not yet supported.", "single_source_bellman_ford": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.", "single_source_bellman_ford_path": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. Also, callable ``weight`` argument is not supported.", "single_source_bellman_ford_path_length": "Negative cycles are not yet supported. ``NotImplementedError`` will be raised if there are negative edge weights. We plan to support negative edge weights soon. 
Also, callable ``weight`` argument is not supported.", @@ -187,12 +199,27 @@ "all_pairs_bellman_ford_path_length": { "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", }, + "all_pairs_dijkstra": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, + "all_pairs_dijkstra_path": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, + "all_pairs_dijkstra_path_length": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, "bellman_ford_path": { "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", }, "bellman_ford_path_length": { "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", }, + "dijkstra_path": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, + "dijkstra_path_length": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, "ego_graph": { "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. 
If None, then dtype is determined by the edge values.", }, @@ -227,6 +254,15 @@ "single_source_bellman_ford_path_length": { "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", }, + "single_source_dijkstra": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, + "single_source_dijkstra_path": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, + "single_source_dijkstra_path_length": { + "dtype : dtype or None, optional": "The data type (np.float32, np.float64, or None) to use for the edge weights in the algorithm. If None, then dtype is determined by the edge values.", + }, # END: additional_parameters }, } @@ -260,6 +296,27 @@ def get_info(): return d +def _check_networkx_version(): + import warnings + + import networkx as nx + + version_major, version_minor = nx.__version__.split(".")[:2] + if version_major != "3": + warnings.warn( + f"nx-cugraph version {__version__} is only known to work with networkx " + f"versions 3.x, but networkx {nx.__version__} is installed. " + "Perhaps try upgrading your Python environment.", + UserWarning, + stacklevel=2, + ) + if len(version_minor) > 1: + raise RuntimeWarning( + f"nx-cugraph version {__version__} does not work with networkx version " + f"{nx.__version__}. Please upgrade (or fix) your Python environment." 
+ ) + + if __name__ == "__main__": from pathlib import Path diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml index 317d5b8d481..ce46360e234 100644 --- a/python/nx-cugraph/lint.yaml +++ b/python/nx-cugraph/lint.yaml @@ -50,7 +50,7 @@ repos: - id: black # - id: black-jupyter - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.1 + rev: v0.5.4 hooks: - id: ruff args: [--fix-only, --show-fixes] # --unsafe-fixes] @@ -58,7 +58,7 @@ repos: rev: 7.1.0 hooks: - id: flake8 - args: ['--per-file-ignores=_nx_cugraph/__init__.py:E501', '--extend-ignore=SIM105'] # Why is this necessary? + args: ['--per-file-ignores=_nx_cugraph/__init__.py:E501', '--extend-ignore=B020,SIM105'] # Why is this necessary? additional_dependencies: &flake8_dependencies # These versions need updated manually - flake8==7.1.0 @@ -77,7 +77,7 @@ repos: additional_dependencies: [tomli] files: ^(nx_cugraph|docs)/ - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.1 + rev: v0.5.4 hooks: - id: ruff - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/python/nx-cugraph/nx_cugraph/__init__.py b/python/nx-cugraph/nx_cugraph/__init__.py index 2c54da87898..542256fa781 100644 --- a/python/nx-cugraph/nx_cugraph/__init__.py +++ b/python/nx-cugraph/nx_cugraph/__init__.py @@ -23,6 +23,9 @@ from . import convert_matrix from .convert_matrix import * +from . import relabel +from .relabel import * + from . 
import generators from .generators import * @@ -30,3 +33,6 @@ from .algorithms import * from _nx_cugraph._version import __git_commit__, __version__ +from _nx_cugraph import _check_networkx_version + +_check_networkx_version() diff --git a/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py b/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py index 08abc9f2872..f53b3458949 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py @@ -51,5 +51,10 @@ def reverse(G, copy=True): if not G.is_directed(): raise nx.NetworkXError("Cannot reverse an undirected graph.") if isinstance(G, nx.Graph): + if not copy: + raise RuntimeError( + "Using `copy=False` is invalid when using a NetworkX graph " + "as input to `nx_cugraph.reverse`" + ) G = nxcg.from_networkx(G, preserve_all_attrs=True) return G.reverse(copy=copy) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py index 68dbbace93d..7d6d77f34a4 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/generic.py @@ -43,7 +43,7 @@ def has_path(G, source, target): def shortest_path( G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None ): - """Negative weights are not yet supported, and method is ununsed.""" + """Negative weights are not yet supported.""" if method not in {"dijkstra", "bellman-ford"}: raise ValueError(f"method not supported: {method}") if weight is None: @@ -53,9 +53,9 @@ def shortest_path( # All pairs if method == "unweighted": paths = nxcg.all_pairs_shortest_path(G) - else: - # method == "dijkstra": - # method == 'bellman-ford': + elif method == "dijkstra": + paths = nxcg.all_pairs_dijkstra_path(G, weight=weight, dtype=dtype) + else: # method == 'bellman-ford': paths = nxcg.all_pairs_bellman_ford_path(G, weight=weight, 
dtype=dtype) if nx.__version__[:3] <= "3.4": paths = dict(paths) @@ -75,9 +75,11 @@ def shortest_path( # From source if method == "unweighted": paths = nxcg.single_source_shortest_path(G, source) - else: - # method == "dijkstra": - # method == 'bellman-ford': + elif method == "dijkstra": + paths = nxcg.single_source_dijkstra_path( + G, source, weight=weight, dtype=dtype + ) + else: # method == 'bellman-ford': paths = nxcg.single_source_bellman_ford_path( G, source, weight=weight, dtype=dtype ) @@ -106,7 +108,7 @@ def _(G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None def shortest_path_length( G, source=None, target=None, weight=None, method="dijkstra", *, dtype=None ): - """Negative weights are not yet supported, and method is ununsed.""" + """Negative weights are not yet supported.""" if method not in {"dijkstra", "bellman-ford"}: raise ValueError(f"method not supported: {method}") if weight is None: @@ -116,9 +118,11 @@ def shortest_path_length( # All pairs if method == "unweighted": lengths = nxcg.all_pairs_shortest_path_length(G) - else: - # method == "dijkstra": - # method == 'bellman-ford': + elif method == "dijkstra": + lengths = nxcg.all_pairs_dijkstra_path_length( + G, weight=weight, dtype=dtype + ) + else: # method == 'bellman-ford': lengths = nxcg.all_pairs_bellman_ford_path_length( G, weight=weight, dtype=dtype ) @@ -127,9 +131,11 @@ def shortest_path_length( lengths = nxcg.single_target_shortest_path_length(G, target) if nx.__version__[:3] <= "3.4": lengths = dict(lengths) - else: - # method == "dijkstra": - # method == 'bellman-ford': + elif method == "dijkstra": + lengths = nxcg.single_source_dijkstra_path_length( + G, target, weight=weight, dtype=dtype + ) + else: # method == 'bellman-ford': lengths = nxcg.single_source_bellman_ford_path_length( G, target, weight=weight, dtype=dtype ) @@ -137,21 +143,21 @@ def shortest_path_length( # From source if method == "unweighted": lengths = nxcg.single_source_shortest_path_length(G, 
source) - else: - # method == "dijkstra": - # method == 'bellman-ford': - lengths = dict( - nxcg.single_source_bellman_ford_path_length( - G, source, weight=weight, dtype=dtype - ) + elif method == "dijkstra": + lengths = nxcg.single_source_dijkstra_path_length( + G, source, weight=weight, dtype=dtype + ) + else: # method == 'bellman-ford': + lengths = nxcg.single_source_bellman_ford_path_length( + G, source, weight=weight, dtype=dtype ) # From source to target elif method == "unweighted": G = _to_graph(G) lengths = _bfs(G, source, None, "Source", return_type="length", target=target) - else: - # method == "dijkstra": - # method == 'bellman-ford': + elif method == "dijkstra": + lengths = nxcg.dijkstra_path_length(G, source, target, weight, dtype=dtype) + else: # method == 'bellman-ford': lengths = nxcg.bellman_ford_path_length(G, source, target, weight, dtype=dtype) return lengths diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py index 714289c5b4b..0e98c366e4a 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py @@ -61,7 +61,12 @@ def bidirectional_shortest_path(G, source, target): # TODO PERF: do bidirectional traversal in core G = _to_graph(G) if source not in G or target not in G: - raise nx.NodeNotFound(f"Either source {source} or target {target} is not in G") + if nx.__version__[:3] <= "3.3": + raise nx.NodeNotFound( + f"Either source {source} or target {target} is not in G" + ) + missing = f"Source {source}" if source not in G else f"Target {target}" + raise nx.NodeNotFound(f"{missing} is not in G") return _bfs(G, source, None, "Source", return_type="path", target=target) @@ -131,7 +136,7 @@ def _bfs( # return_type == "length-path" return {source: 0}, {source: [source]} - if cutoff is None: + if cutoff is None or np.isinf(cutoff): cutoff = -1 src_index = 
source if G.key_to_id is None else G.key_to_id[source] distances, predecessors, node_ids = plc.bfs( diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py index 32323dd45f3..032ef2c7fdf 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/weighted.py @@ -25,6 +25,14 @@ from .unweighted import _bfs __all__ = [ + "dijkstra_path", + "dijkstra_path_length", + "single_source_dijkstra", + "single_source_dijkstra_path", + "single_source_dijkstra_path_length", + "all_pairs_dijkstra", + "all_pairs_dijkstra_path", + "all_pairs_dijkstra_path_length", "bellman_ford_path", "bellman_ford_path_length", "single_source_bellman_ford", @@ -44,14 +52,24 @@ def _add_doc(func): return func -@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") -@_add_doc -def bellman_ford_path(G, source, target, weight="weight", *, dtype=None): +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def dijkstra_path(G, source, target, weight="weight", *, dtype=None): G = _to_graph(G, weight, 1, np.float32) dtype = _get_float_dtype(dtype, graph=G, weight=weight) return _sssp(G, source, weight, target, return_type="path", dtype=dtype) +@dijkstra_path._can_run +def _(G, source, target, weight="weight", *, dtype=None): + return not callable(weight) + + +@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") +@_add_doc +def bellman_ford_path(G, source, target, weight="weight", *, dtype=None): + return dijkstra_path(G, source, target, weight=weight, dtype=dtype) + + @bellman_ford_path._can_run def _(G, source, target, weight="weight", *, dtype=None): return ( @@ -61,14 +79,24 @@ def _(G, source, target, weight="weight", *, dtype=None): ) -@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") -@_add_doc -def 
bellman_ford_path_length(G, source, target, weight="weight", *, dtype=None): +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def dijkstra_path_length(G, source, target, weight="weight", *, dtype=None): G = _to_graph(G, weight, 1, np.float32) dtype = _get_float_dtype(dtype, graph=G, weight=weight) return _sssp(G, source, weight, target, return_type="length", dtype=dtype) +@dijkstra_path._can_run +def _(G, source, target, weight="weight", *, dtype=None): + return not callable(weight) + + +@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") +@_add_doc +def bellman_ford_path_length(G, source, target, weight="weight", *, dtype=None): + return dijkstra_path_length(G, source, target, weight=weight, dtype=dtype) + + @bellman_ford_path_length._can_run def _(G, source, target, weight="weight", *, dtype=None): return ( @@ -78,12 +106,22 @@ def _(G, source, target, weight="weight", *, dtype=None): ) +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def single_source_dijkstra_path(G, source, cutoff=None, weight="weight", *, dtype=None): + G = _to_graph(G, weight, 1, np.float32) + dtype = _get_float_dtype(dtype, graph=G, weight=weight) + return _sssp(G, source, weight, return_type="path", dtype=dtype, cutoff=cutoff) + + +@single_source_dijkstra_path._can_run +def _(G, source, cutoff=None, weight="weight", *, dtype=None): + return not callable(weight) + + @networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") @_add_doc def single_source_bellman_ford_path(G, source, weight="weight", *, dtype=None): - G = _to_graph(G, weight, 1, np.float32) - dtype = _get_float_dtype(dtype, graph=G, weight=weight) - return _sssp(G, source, weight, return_type="path", dtype=dtype) + return single_source_dijkstra_path(G, source, weight=weight, dtype=dtype) @single_source_bellman_ford_path._can_run @@ -95,12 +133,24 @@ def _(G, source, weight="weight", *, dtype=None): ) 
+@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def single_source_dijkstra_path_length( + G, source, cutoff=None, weight="weight", *, dtype=None +): + G = _to_graph(G, weight, 1, np.float32) + dtype = _get_float_dtype(dtype, graph=G, weight=weight) + return _sssp(G, source, weight, return_type="length", dtype=dtype, cutoff=cutoff) + + +@single_source_dijkstra_path_length._can_run +def _(G, source, cutoff=None, weight="weight", *, dtype=None): + return not callable(weight) + + @networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") @_add_doc def single_source_bellman_ford_path_length(G, source, weight="weight", *, dtype=None): - G = _to_graph(G, weight, 1, np.float32) - dtype = _get_float_dtype(dtype, graph=G, weight=weight) - return _sssp(G, source, weight, return_type="length", dtype=dtype) + return single_source_dijkstra_path_length(G, source, weight=weight, dtype=dtype) @single_source_bellman_ford_path_length._can_run @@ -112,12 +162,26 @@ def _(G, source, weight="weight", *, dtype=None): ) +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def single_source_dijkstra( + G, source, target=None, cutoff=None, weight="weight", *, dtype=None +): + G = _to_graph(G, weight, 1, np.float32) + dtype = _get_float_dtype(dtype, graph=G, weight=weight) + return _sssp( + G, source, weight, target, return_type="length-path", dtype=dtype, cutoff=cutoff + ) + + +@single_source_dijkstra._can_run +def _(G, source, target=None, cutoff=None, weight="weight", *, dtype=None): + return not callable(weight) + + @networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") @_add_doc def single_source_bellman_ford(G, source, target=None, weight="weight", *, dtype=None): - G = _to_graph(G, weight, 1, np.float32) - dtype = _get_float_dtype(dtype, graph=G, weight=weight) - return _sssp(G, source, weight, target, return_type="length-path", dtype=dtype) + return 
single_source_dijkstra(G, source, target=target, weight=weight, dtype=dtype) @single_source_bellman_ford._can_run @@ -129,14 +193,41 @@ def _(G, source, target=None, weight="weight", *, dtype=None): ) -@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") -@_add_doc -def all_pairs_bellman_ford_path_length(G, weight="weight", *, dtype=None): +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def all_pairs_dijkstra(G, cutoff=None, weight="weight", *, dtype=None): + # TODO PERF: batched bfs to compute many at once + G = _to_graph(G, weight, 1, np.float32) + dtype = _get_float_dtype(dtype, graph=G, weight=weight) + for n in G: + yield ( + n, + _sssp(G, n, weight, return_type="length-path", dtype=dtype, cutoff=cutoff), + ) + + +@all_pairs_dijkstra._can_run +def _(G, cutoff=None, weight="weight", *, dtype=None): + return not callable(weight) + + +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def all_pairs_dijkstra_path_length(G, cutoff=None, weight="weight", *, dtype=None): # TODO PERF: batched bfs to compute many at once G = _to_graph(G, weight, 1, np.float32) dtype = _get_float_dtype(dtype, graph=G, weight=weight) for n in G: - yield (n, _sssp(G, n, weight, return_type="length", dtype=dtype)) + yield (n, _sssp(G, n, weight, return_type="length", dtype=dtype, cutoff=cutoff)) + + +@all_pairs_dijkstra_path_length._can_run +def _(G, cutoff=None, weight="weight", *, dtype=None): + return not callable(weight) + + +@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") +@_add_doc +def all_pairs_bellman_ford_path_length(G, weight="weight", *, dtype=None): + return all_pairs_dijkstra_path_length(G, weight=weight, dtype=None) @all_pairs_bellman_ford_path_length._can_run @@ -148,14 +239,24 @@ def _(G, weight="weight", *, dtype=None): ) -@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") -@_add_doc -def 
all_pairs_bellman_ford_path(G, weight="weight", *, dtype=None): +@networkx_algorithm(extra_params=_dtype_param, version_added="24.08", _plc="sssp") +def all_pairs_dijkstra_path(G, cutoff=None, weight="weight", *, dtype=None): # TODO PERF: batched bfs to compute many at once G = _to_graph(G, weight, 1, np.float32) dtype = _get_float_dtype(dtype, graph=G, weight=weight) for n in G: - yield (n, _sssp(G, n, weight, return_type="path", dtype=dtype)) + yield (n, _sssp(G, n, weight, return_type="path", dtype=dtype, cutoff=cutoff)) + + +@all_pairs_dijkstra_path._can_run +def _(G, cutoff=None, weight="weight", *, dtype=None): + return not callable(weight) + + +@networkx_algorithm(extra_params=_dtype_param, version_added="24.04", _plc="sssp") +@_add_doc +def all_pairs_bellman_ford_path(G, weight="weight", *, dtype=None): + return all_pairs_dijkstra_path(G, weight=weight, dtype=None) @all_pairs_bellman_ford_path._can_run @@ -167,7 +268,17 @@ def _(G, weight="weight", *, dtype=None): ) -def _sssp(G, source, weight, target=None, *, return_type, dtype, reverse_path=False): +def _sssp( + G, + source, + weight, + target=None, + *, + return_type, + dtype, + reverse_path=False, + cutoff=None, +): """SSSP for weighted shortest paths. 
Parameters @@ -201,7 +312,7 @@ def _sssp(G, source, weight, target=None, *, return_type, dtype, reverse_path=Fa if weight not in G.edge_values: # No edge values, so use BFS instead - return _bfs(G, source, None, "Source", return_type=return_type, target=target) + return _bfs(G, source, cutoff, "Source", return_type=return_type, target=target) # Check for negative values since we don't support negative cycles edge_vals = G.edge_values[weight] @@ -217,7 +328,7 @@ def _sssp(G, source, weight, target=None, *, return_type, dtype, reverse_path=Fa return _bfs( G, source, - None, + None if cutoff is None else cutoff / edge_val, "Source", return_type=return_type, target=target, @@ -226,11 +337,16 @@ def _sssp(G, source, weight, target=None, *, return_type, dtype, reverse_path=Fa ) src_index = source if G.key_to_id is None else G.key_to_id[source] + if cutoff is None: + cutoff = np.inf + else: + cutoff = np.nextafter(cutoff, np.inf, dtype=np.float64) + node_ids, distances, predecessors = plc.sssp( resource_handle=plc.ResourceHandle(), graph=G._get_plc_graph(weight, 1, dtype), source=src_index, - cutoff=np.inf, + cutoff=cutoff, compute_predecessors=True, # TODO: False is not yet supported # compute_predecessors=return_type != "length", do_expensive_check=False, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py b/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py index f5d5e2a995d..5e4466d7d33 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py @@ -57,23 +57,41 @@ def _bfs(G, source, *, depth_limit=None, reverse=False): return distances[mask], predecessors[mask], node_ids[mask] -@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") -def generic_bfs_edges(G, source, neighbors=None, depth_limit=None, sort_neighbors=None): - """`neighbors` and `sort_neighbors` parameters are not yet 
supported.""" - if neighbors is not None: - raise NotImplementedError( - "neighbors argument in generic_bfs_edges is not currently supported" - ) - if sort_neighbors is not None: - raise NotImplementedError( - "sort_neighbors argument in generic_bfs_edges is not currently supported" - ) - return bfs_edges(G, source, depth_limit=depth_limit) - - -@generic_bfs_edges._can_run -def _(G, source, neighbors=None, depth_limit=None, sort_neighbors=None): - return neighbors is None and sort_neighbors is None +if nx.__version__[:3] <= "3.3": + + @networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") + def generic_bfs_edges( + G, source, neighbors=None, depth_limit=None, sort_neighbors=None + ): + """`neighbors` and `sort_neighbors` parameters are not yet supported.""" + if neighbors is not None: + raise NotImplementedError( + "neighbors argument in generic_bfs_edges is not currently supported" + ) + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in generic_bfs_edges is not supported" + ) + return bfs_edges(G, source, depth_limit=depth_limit) + + @generic_bfs_edges._can_run + def _(G, source, neighbors=None, depth_limit=None, sort_neighbors=None): + return neighbors is None and sort_neighbors is None + +else: + + @networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") + def generic_bfs_edges(G, source, neighbors=None, depth_limit=None): + """`neighbors` parameter is not yet supported.""" + if neighbors is not None: + raise NotImplementedError( + "neighbors argument in generic_bfs_edges is not currently supported" + ) + return bfs_edges(G, source, depth_limit=depth_limit) + + @generic_bfs_edges._can_run + def _(G, source, neighbors=None, depth_limit=None): + return neighbors is None @networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") diff --git a/python/nx-cugraph/nx_cugraph/classes/graph.py b/python/nx-cugraph/nx_cugraph/classes/graph.py index ad7cf319139..7425eacb2b4 100644 
--- a/python/nx-cugraph/nx_cugraph/classes/graph.py +++ b/python/nx-cugraph/nx_cugraph/classes/graph.py @@ -50,6 +50,10 @@ class Graph: __networkx_backend__: ClassVar[str] = "cugraph" # nx >=3.2 __networkx_plugin__: ClassVar[str] = "cugraph" # nx <3.2 + # Allow networkx dispatch machinery to cache conversions. + # This means we should clear the cache if we ever mutate the object! + __networkx_cache__: dict | None + # networkx properties graph: dict graph_attr_dict_factory: ClassVar[type] = dict @@ -108,6 +112,7 @@ def from_coo( **attr, ) -> Graph: new_graph = object.__new__(cls) + new_graph.__networkx_cache__ = {} new_graph.src_indices = src_indices new_graph.dst_indices = dst_indices new_graph.edge_values = {} if edge_values is None else dict(edge_values) @@ -420,6 +425,8 @@ def clear(self) -> None: self._node_ids = None self.key_to_id = None self._id_to_key = None + if cache := self.__networkx_cache__: + cache.clear() @networkx_api def clear_edges(self) -> None: @@ -427,6 +434,8 @@ def clear_edges(self) -> None: self.edge_masks.clear() self.src_indices = cp.empty(0, self.src_indices.dtype) self.dst_indices = cp.empty(0, self.dst_indices.dtype) + if cache := self.__networkx_cache__: + cache.clear() @networkx_api def copy(self, as_view: bool = False) -> Graph: @@ -553,6 +562,12 @@ def _copy(self, as_view: bool, cls: type[Graph], reverse: bool = False): node_masks = self.node_masks key_to_id = self.key_to_id id_to_key = None if key_to_id is None else self._id_to_key + if self.__networkx_cache__ is None: + __networkx_cache__ = None + elif not reverse and cls is self.__class__: + __networkx_cache__ = self.__networkx_cache__ + else: + __networkx_cache__ = {} if not as_view: src_indices = src_indices.copy() dst_indices = dst_indices.copy() @@ -564,6 +579,8 @@ def _copy(self, as_view: bool, cls: type[Graph], reverse: bool = False): key_to_id = key_to_id.copy() if id_to_key is not None: id_to_key = id_to_key.copy() + if __networkx_cache__ is not None: + __networkx_cache__ 
= __networkx_cache__.copy() if reverse: src_indices, dst_indices = dst_indices, src_indices rv = cls.from_coo( @@ -581,6 +598,7 @@ def _copy(self, as_view: bool, cls: type[Graph], reverse: bool = False): rv.graph = self.graph else: rv.graph.update(deepcopy(self.graph)) + rv.__networkx_cache__ = __networkx_cache__ return rv def _get_plc_graph( @@ -719,18 +737,26 @@ def _become(self, other: Graph): edge_masks = self.edge_masks node_values = self.node_values node_masks = self.node_masks + __networkx_cache__ = self.__networkx_cache__ graph = self.graph edge_values.update(other.edge_values) edge_masks.update(other.edge_masks) node_values.update(other.node_values) node_masks.update(other.node_masks) graph.update(other.graph) + if other.__networkx_cache__ is None: + __networkx_cache__ = None + else: + if __networkx_cache__ is None: + __networkx_cache__ = {} + __networkx_cache__.update(other.__networkx_cache__) self.__dict__.update(other.__dict__) self.edge_values = edge_values self.edge_masks = edge_masks self.node_values = node_values self.node_masks = node_masks self.graph = graph + self.__networkx_cache__ = __networkx_cache__ return self def _degrees_array(self, *, ignore_selfloops=False): diff --git a/python/nx-cugraph/nx_cugraph/classes/multigraph.py b/python/nx-cugraph/nx_cugraph/classes/multigraph.py index de58474de70..23d9faa8734 100644 --- a/python/nx-cugraph/nx_cugraph/classes/multigraph.py +++ b/python/nx-cugraph/nx_cugraph/classes/multigraph.py @@ -415,6 +415,12 @@ def _copy(self, as_view: bool, cls: type[Graph], reverse: bool = False): key_to_id = self.key_to_id id_to_key = None if key_to_id is None else self._id_to_key edge_keys = self.edge_keys + if self.__networkx_cache__ is None: + __networkx_cache__ = None + elif not reverse and cls is self.__class__: + __networkx_cache__ = self.__networkx_cache__ + else: + __networkx_cache__ = {} if not as_view: src_indices = src_indices.copy() dst_indices = dst_indices.copy() @@ -429,6 +435,8 @@ def _copy(self, 
as_view: bool, cls: type[Graph], reverse: bool = False): id_to_key = id_to_key.copy() if edge_keys is not None: edge_keys = edge_keys.copy() + if __networkx_cache__ is not None: + __networkx_cache__ = __networkx_cache__.copy() if reverse: src_indices, dst_indices = dst_indices, src_indices rv = cls.from_coo( @@ -448,6 +456,7 @@ def _copy(self, as_view: bool, cls: type[Graph], reverse: bool = False): rv.graph = self.graph else: rv.graph.update(deepcopy(self.graph)) + rv.__networkx_cache__ = __networkx_cache__ return rv def _sort_edge_indices(self, primary="src"): diff --git a/python/nx-cugraph/nx_cugraph/convert.py b/python/nx-cugraph/nx_cugraph/convert.py index b34245d5031..56d16d837d7 100644 --- a/python/nx-cugraph/nx_cugraph/convert.py +++ b/python/nx-cugraph/nx_cugraph/convert.py @@ -14,7 +14,7 @@ import itertools import operator as op -from collections import Counter +from collections import Counter, defaultdict from collections.abc import Mapping from typing import TYPE_CHECKING @@ -24,7 +24,8 @@ import nx_cugraph as nxcg -from .utils import index_dtype +from .utils import index_dtype, networkx_algorithm +from .utils.misc import pairwise if TYPE_CHECKING: # pragma: no cover from nx_cugraph.typing import AttrKey, Dtype, EdgeValue, NodeValue, any_ndarray @@ -32,6 +33,8 @@ __all__ = [ "from_networkx", "to_networkx", + "from_dict_of_lists", + "to_dict_of_lists", ] concat = itertools.chain.from_iterable @@ -408,13 +411,21 @@ def func(it, edge_attr=edge_attr, dtype=dtype): # Node values may be numpy or cupy arrays (useful for str, object, etc). # Someday we'll let the user choose np or cp, and support edge values. node_mask = np.fromiter(iter_mask, bool) - node_value = np.array(vals, dtype) try: - node_value = cp.array(node_value) + node_value = np.array(vals, dtype) except ValueError: - pass + # Handle e.g. 
list elements + if dtype is None or dtype == object: + node_value = np.fromiter(vals, object) + else: + raise else: - node_mask = cp.array(node_mask) + try: + node_value = cp.array(node_value) + except ValueError: + pass + else: + node_mask = cp.array(node_mask) node_values[node_attr] = node_value node_masks[node_attr] = node_mask # if vals.ndim > 1: ... @@ -428,7 +439,12 @@ def func(it, edge_attr=edge_attr, dtype=dtype): # Node values may be numpy or cupy arrays (useful for str, object, etc). # Someday we'll let the user choose np or cp, and support edge values. if dtype is None: - node_value = np.array(list(iter_values)) + vals = list(iter_values) + try: + node_value = np.array(vals) + except ValueError: + # Handle e.g. list elements + node_value = np.fromiter(vals, object) else: node_value = np.fromiter(iter_values, dtype) try: @@ -474,6 +490,23 @@ def func(it, edge_attr=edge_attr, dtype=dtype): return rv +def _to_tuples(ndim, L): + if ndim > 2: + L = list(map(_to_tuples.__get__(ndim - 1), L)) + return list(map(tuple, L)) + + +def _array_to_tuples(a): + """Like ``a.tolist()``, but nested structures are tuples instead of lists. + + This is only different from ``a.tolist()`` if ``a.ndim > 1``. It is used to + try to return tuples instead of lists for e.g. node values. 
+ """ + if a.ndim > 1: + return _to_tuples(a.ndim, a.tolist()) + return a.tolist() + + def _iter_attr_dicts( values: dict[AttrKey, any_ndarray[EdgeValue | NodeValue]], masks: dict[AttrKey, any_ndarray[bool]], @@ -482,7 +515,7 @@ def _iter_attr_dicts( if full_attrs: full_dicts = ( dict(zip(full_attrs, vals)) - for vals in zip(*(values[attr].tolist() for attr in full_attrs)) + for vals in zip(*(_array_to_tuples(values[attr]) for attr in full_attrs)) ) partial_attrs = list(values.keys() & masks.keys()) if partial_attrs: @@ -653,3 +686,98 @@ def _to_undirected_graph( ) # TODO: handle cugraph.Graph raise TypeError + + +@networkx_algorithm(version_added="24.08") +def from_dict_of_lists(d, create_using=None): + from .generators._utils import _create_using_class + + graph_class, inplace = _create_using_class(create_using) + key_to_id = defaultdict(itertools.count().__next__) + src_indices = cp.array( + # cp.repeat is slow to use here, so use numpy instead + np.repeat( + np.fromiter(map(key_to_id.__getitem__, d), index_dtype), + np.fromiter(map(len, d.values()), index_dtype), + ) + ) + dst_indices = cp.fromiter( + map(key_to_id.__getitem__, concat(d.values())), index_dtype + ) + # Initialize as directed first them symmetrize if undirected. + G = graph_class.to_directed_class().from_coo( + len(key_to_id), + src_indices, + dst_indices, + key_to_id=key_to_id, + ) + if not graph_class.is_directed(): + G = G.to_undirected() + if inplace: + return create_using._become(G) + return G + + +@networkx_algorithm(version_added="24.08") +def to_dict_of_lists(G, nodelist=None): + G = _to_graph(G) + src_indices = G.src_indices + dst_indices = G.dst_indices + if nodelist is not None: + try: + node_ids = G._nodekeys_to_nodearray(nodelist) + except KeyError as exc: + gname = "digraph" if G.is_directed() else "graph" + raise nx.NetworkXError( + f"The node {exc.args[0]} is not in the {gname}." 
+ ) from exc + mask = cp.isin(src_indices, node_ids) & cp.isin(dst_indices, node_ids) + src_indices = src_indices[mask] + dst_indices = dst_indices[mask] + # Sort indices so we can use `cp.unique` to determine boundaries. + # This is like exporting to DCSR. + if G.is_multigraph(): + stacked = cp.unique(cp.vstack((src_indices, dst_indices)), axis=1) + src_indices = stacked[0] + dst_indices = stacked[1] + else: + stacked = cp.vstack((dst_indices, src_indices)) + indices = cp.lexsort(stacked) + src_indices = src_indices[indices] + dst_indices = dst_indices[indices] + compressed_srcs, left_bounds = cp.unique(src_indices, return_index=True) + # Ensure we include isolate nodes in the result (and in proper order) + rv = None + if nodelist is not None: + if compressed_srcs.size != len(nodelist): + if G.key_to_id is None: + # `G._nodekeys_to_nodearray` does not check for valid node keys. + container = range(G._N) + for key in nodelist: + if key not in container: + gname = "digraph" if G.is_directed() else "graph" + raise nx.NetworkXError(f"The node {key} is not in the {gname}.") + rv = {key: [] for key in nodelist} + elif compressed_srcs.size != G._N: + rv = {key: [] for key in G} + # We use `boundaries` like this in `_groupby` too + boundaries = pairwise(itertools.chain(left_bounds.tolist(), [src_indices.size])) + dst_indices = dst_indices.tolist() + if G.key_to_id is None: + it = zip(compressed_srcs.tolist(), boundaries) + if rv is None: + return {src: dst_indices[start:end] for src, (start, end) in it} + rv.update((src, dst_indices[start:end]) for src, (start, end) in it) + return rv + to_key = G.id_to_key.__getitem__ + it = zip(compressed_srcs.tolist(), boundaries) + if rv is None: + return { + to_key(src): list(map(to_key, dst_indices[start:end])) + for src, (start, end) in it + } + rv.update( + (to_key(src), list(map(to_key, dst_indices[start:end]))) + for src, (start, end) in it + ) + return rv diff --git a/python/nx-cugraph/nx_cugraph/convert_matrix.py 
b/python/nx-cugraph/nx_cugraph/convert_matrix.py index 67f6386987b..38139b913cf 100644 --- a/python/nx-cugraph/nx_cugraph/convert_matrix.py +++ b/python/nx-cugraph/nx_cugraph/convert_matrix.py @@ -15,7 +15,7 @@ import numpy as np from .generators._utils import _create_using_class -from .utils import index_dtype, networkx_algorithm +from .utils import _cp_iscopied_asarray, index_dtype, networkx_algorithm __all__ = [ "from_pandas_edgelist", @@ -34,16 +34,30 @@ def from_pandas_edgelist( edge_key=None, ): """cudf.DataFrame inputs also supported; value columns with str is unsuppported.""" + # This function never shares ownership of the underlying arrays of the DataFrame + # columns. We will perform a copy if necessary even if given e.g. a cudf.DataFrame. graph_class, inplace = _create_using_class(create_using) # Try to be optimal whether using pandas, cudf, or cudf.pandas - src_array = df[source].to_numpy() - dst_array = df[target].to_numpy() + src_series = df[source] + dst_series = df[target] try: # Optimistically try to use cupy, but fall back to numpy if necessary - src_array = cp.asarray(src_array) - dst_array = cp.asarray(dst_array) + src_array = src_series.to_cupy() + dst_array = dst_series.to_cupy() + except (AttributeError, TypeError, ValueError, NotImplementedError): + src_array = src_series.to_numpy() + dst_array = dst_series.to_numpy() + try: + # Minimize unnecessary data copies by tracking whether we copy or not + is_src_copied, src_array = _cp_iscopied_asarray( + src_array, orig_object=src_series + ) + is_dst_copied, dst_array = _cp_iscopied_asarray( + dst_array, orig_object=dst_series + ) np_or_cp = cp except ValueError: + is_src_copied = is_dst_copied = False src_array = np.asarray(src_array) dst_array = np.asarray(dst_array) np_or_cp = np @@ -65,8 +79,15 @@ def from_pandas_edgelist( src_indices = cp.asarray(np_or_cp.searchsorted(nodes, src_array), index_dtype) dst_indices = cp.asarray(np_or_cp.searchsorted(nodes, dst_array), index_dtype) else: - 
src_indices = cp.array(src_array) - dst_indices = cp.array(dst_array) + # Copy if necessary so we don't share ownership of input arrays. + if is_src_copied: + src_indices = src_array + else: + src_indices = cp.array(src_array) + if is_dst_copied: + dst_indices = dst_array + else: + dst_indices = cp.array(dst_array) if not graph_class.is_directed(): # Symmetrize the edges @@ -111,19 +132,28 @@ def from_pandas_edgelist( } kwargs["edge_values"] = edge_values - if graph_class.is_multigraph() and edge_key is not None: - try: - edge_keys = df[edge_key].to_list() - except (KeyError, TypeError) as exc: - raise nx.NetworkXError( - f"Invalid edge_key argument: {edge_key}" - ) from exc - if not graph_class.is_directed(): - # Symmetrize the edges - edge_keys = cp.hstack( - (edge_keys, edge_keys[mask] if mask is not None else edge_keys) - ) - kwargs["edge_keys"] = edge_keys + if ( + graph_class.is_multigraph() + and edge_key is not None + and ( + # In nx <= 3.3, `edge_key` was ignored if `edge_attr` is None + edge_attr is not None + or nx.__version__[:3] > "3.3" + ) + ): + try: + edge_keys = df[edge_key].to_list() + except (KeyError, TypeError) as exc: + raise nx.NetworkXError(f"Invalid edge_key argument: {edge_key}") from exc + if not graph_class.is_directed(): + # Symmetrize the edges; remember, `edge_keys` is a list! 
+ if mask is None: + edge_keys *= 2 + else: + edge_keys += [ + key for keep, key in zip(mask.tolist(), edge_keys) if keep + ] + kwargs["edge_keys"] = edge_keys G = graph_class.from_coo(N, src_indices, dst_indices, **kwargs) if inplace: diff --git a/python/nx-cugraph/nx_cugraph/interface.py b/python/nx-cugraph/nx_cugraph/interface.py index 8569bbf40b9..4007230efa9 100644 --- a/python/nx-cugraph/nx_cugraph/interface.py +++ b/python/nx-cugraph/nx_cugraph/interface.py @@ -68,6 +68,13 @@ def key(testpath): louvain_different = "Louvain may be different due to RNG" no_string_dtype = "string edge values not currently supported" sssp_path_different = "sssp may choose a different valid path" + no_object_dtype_for_edges = ( + "Edges don't support object dtype (lists, strings, etc.)" + ) + tuple_elements_preferred = "elements are tuples instead of lists" + nx_cugraph_in_test_setup = ( + "nx-cugraph Graph is incompatible in test setup in nx versions < 3.3" + ) xfail = { # This is removed while strongly_connected_components() is not @@ -91,6 +98,81 @@ def key(testpath): "test_cycles.py:TestMinimumCycleBasis." 
"test_gh6787_and_edge_attribute_names" ): sssp_path_different, + key( + "test_graph_hashing.py:test_isomorphic_edge_attr" + ): no_object_dtype_for_edges, + key( + "test_graph_hashing.py:test_isomorphic_edge_attr_and_node_attr" + ): no_object_dtype_for_edges, + key( + "test_graph_hashing.py:test_isomorphic_edge_attr_subgraph_hash" + ): no_object_dtype_for_edges, + key( + "test_graph_hashing.py:" + "test_isomorphic_edge_attr_and_node_attr_subgraph_hash" + ): no_object_dtype_for_edges, + key( + "test_summarization.py:TestSNAPNoEdgeTypes.test_summary_graph" + ): no_object_dtype_for_edges, + key( + "test_summarization.py:TestSNAPUndirected.test_summary_graph" + ): no_object_dtype_for_edges, + key( + "test_summarization.py:TestSNAPDirected.test_summary_graph" + ): no_object_dtype_for_edges, + key("test_gexf.py:TestGEXF.test_relabel"): no_object_dtype_for_edges, + key( + "test_gml.py:TestGraph.test_parse_gml_cytoscape_bug" + ): no_object_dtype_for_edges, + key("test_gml.py:TestGraph.test_parse_gml"): no_object_dtype_for_edges, + key("test_gml.py:TestGraph.test_read_gml"): no_object_dtype_for_edges, + key("test_gml.py:TestGraph.test_data_types"): no_object_dtype_for_edges, + key( + "test_gml.py:TestPropertyLists.test_reading_graph_with_list_property" + ): no_object_dtype_for_edges, + key( + "test_relabel.py:" + "test_relabel_preserve_node_order_partial_mapping_with_copy_false" + ): "Node order is preserved when relabeling with partial mapping", + key( + "test_gml.py:" + "TestPropertyLists.test_reading_graph_with_single_element_list_property" + ): tuple_elements_preferred, + key( + "test_relabel.py:" + "TestRelabel.test_relabel_multidigraph_inout_merge_nodes" + ): no_string_dtype, + key( + "test_relabel.py:TestRelabel.test_relabel_multigraph_merge_inplace" + ): no_string_dtype, + key( + "test_relabel.py:TestRelabel.test_relabel_multidigraph_merge_inplace" + ): no_string_dtype, + key( + "test_relabel.py:TestRelabel.test_relabel_multidigraph_inout_copy" + ): no_string_dtype, 
+ key( + "test_relabel.py:TestRelabel.test_relabel_multigraph_merge_copy" + ): no_string_dtype, + key( + "test_relabel.py:TestRelabel.test_relabel_multidigraph_merge_copy" + ): no_string_dtype, + key( + "test_relabel.py:TestRelabel.test_relabel_multigraph_nonnumeric_key" + ): no_string_dtype, + key("test_contraction.py:test_multigraph_path"): no_object_dtype_for_edges, + key( + "test_contraction.py:test_directed_multigraph_path" + ): no_object_dtype_for_edges, + key( + "test_contraction.py:test_multigraph_blockmodel" + ): no_object_dtype_for_edges, + key( + "test_summarization.py:TestSNAPUndirectedMulti.test_summary_graph" + ): no_string_dtype, + key( + "test_summarization.py:TestSNAPDirectedMulti.test_summary_graph" + ): no_string_dtype, } from packaging.version import parse @@ -118,6 +200,19 @@ def key(testpath): "test_strongly_connected.py:" "TestStronglyConnected.test_connected_raise" ): "test is incompatible with pytest>=8", + # NetworkX 3.3 introduced logic around functions that return graphs + key( + "test_vf2pp_helpers.py:TestGraphTinoutUpdating.test_updating" + ): nx_cugraph_in_test_setup, + key( + "test_vf2pp_helpers.py:TestGraphTinoutUpdating.test_restoring" + ): nx_cugraph_in_test_setup, + key( + "test_vf2pp_helpers.py:TestDiGraphTinoutUpdating.test_updating" + ): nx_cugraph_in_test_setup, + key( + "test_vf2pp_helpers.py:TestDiGraphTinoutUpdating.test_restoring" + ): nx_cugraph_in_test_setup, } ) diff --git a/python/nx-cugraph/nx_cugraph/relabel.py b/python/nx-cugraph/nx_cugraph/relabel.py new file mode 100644 index 00000000000..20d1337a99c --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/relabel.py @@ -0,0 +1,282 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import itertools +from collections import defaultdict + +import cupy as cp +import networkx as nx +import numpy as np + +import nx_cugraph as nxcg + +from .utils import _get_int_dtype, _groupby, index_dtype, networkx_algorithm + +__all__ = [ + "convert_node_labels_to_integers", + "relabel_nodes", +] + + +@networkx_algorithm(version_added="24.08") +def relabel_nodes(G, mapping, copy=True): + if isinstance(G, nx.Graph): + if not copy: + raise RuntimeError( + "Using `copy=False` is invalid when using a NetworkX graph " + "as input to `nx_cugraph.relabel_nodes`" + ) + G = nxcg.from_networkx(G, preserve_all_attrs=True) + it = range(G._N) if G.key_to_id is None else G.id_to_key + if callable(mapping): + previd_to_key = [mapping(node) for node in it] + else: + previd_to_key = [mapping.get(node, node) for node in it] + if not copy: + # Our implementation does not need to raise here, but do so to match networkx. + it = range(G._N) if G.key_to_id is None else G.id_to_key + D = nx.DiGraph([(x, y) for x, y in zip(it, previd_to_key) if x != y]) + if nx.algorithms.dag.has_cycle(D): + raise nx.NetworkXUnfeasible( + "The node label sets are overlapping and no ordering can " + "resolve the mapping. Use copy=True." 
+ ) + key_to_previd = {val: i for i, val in enumerate(previd_to_key)} + newid_to_key = list(key_to_previd) + key_to_newid = dict(zip(newid_to_key, range(len(newid_to_key)))) + + src_indices = G.src_indices + dst_indices = G.dst_indices + edge_values = G.edge_values + edge_masks = G.edge_masks + node_values = G.node_values + node_masks = G.node_masks + if G.is_multigraph(): + edge_indices = G.edge_indices + edge_keys = G.edge_keys + if len(key_to_previd) != G._N: + # Some nodes were combined. + # Node data doesn't get merged, so use the data from the last shared index + int_dtype = _get_int_dtype(G._N - 1) + node_indices = cp.fromiter(key_to_previd.values(), int_dtype) + node_indices_np = node_indices.get() # Node data may be cupy or numpy arrays + node_values = {key: val[node_indices_np] for key, val in node_values.items()} + node_masks = {key: val[node_indices_np] for key, val in node_masks.items()} + + # Renumber, but will have duplicates + translations = cp.fromiter( + (key_to_newid[key] for key in previd_to_key), index_dtype + ) + src_indices_dup = translations[src_indices] + dst_indices_dup = translations[dst_indices] + + if G.is_multigraph(): + # No merging necessary for multigraphs. 
+ if G.is_directed(): + src_indices = src_indices_dup + dst_indices = dst_indices_dup + else: + # New self-edges should have one edge entry, not two + mask = ( + # Not self-edges, no need to deduplicate + (src_indices_dup != dst_indices_dup) + # == : already self-edges; no need to deduplicate + # < : if new self-edges, keep where src < dst + | (src_indices <= dst_indices) + ) + if mask.all(): + src_indices = src_indices_dup + dst_indices = dst_indices_dup + else: + src_indices = src_indices_dup[mask] + dst_indices = dst_indices_dup[mask] + if edge_values: + edge_values = { + key: val[mask] for key, val in edge_values.items() + } + edge_masks = {key: val[mask] for key, val in edge_masks.items()} + if edge_keys is not None: + edge_keys = [ + key for keep, key in zip(mask.tolist(), edge_keys) if keep + ] + if edge_indices is not None: + edge_indices = edge_indices[mask] + # Handling of `edge_keys` and `edge_indices` is pure Python to match nx. + # This may be slower than we'd like; if it's way too slow, should we + # direct users to use the defaults of None? + if edge_keys is not None: + seen = set() + new_edge_keys = [] + for key in zip(src_indices.tolist(), dst_indices.tolist(), edge_keys): + if key in seen: + src, dst, edge_key = key + if not isinstance(edge_key, (int, float)): + edge_key = 0 + for edge_key in itertools.count(edge_key): + if (src, dst, edge_key) not in seen: + seen.add((src, dst, edge_key)) + break + else: + seen.add(key) + edge_key = key[2] + new_edge_keys.append(edge_key) + edge_keys = new_edge_keys + if edge_indices is not None: + # PERF: can we do this using cupy? 
+ seen = set() + new_edge_indices = [] + for key in zip( + src_indices.tolist(), dst_indices.tolist(), edge_indices.tolist() + ): + if key in seen: + src, dst, edge_index = key + for edge_index in itertools.count(edge_index): + if (src, dst, edge_index) not in seen: + seen.add((src, dst, edge_index)) + break + else: + seen.add(key) + edge_index = key[2] + new_edge_indices.append(edge_index) + edge_indices = cp.array(new_edge_indices, index_dtype) + else: + stacked_dup = cp.vstack((src_indices_dup, dst_indices_dup)) + if not edge_values: + # Drop duplicates + stacked = cp.unique(stacked_dup, axis=1) + else: + # Drop duplicates. This relies heavily on `_groupby`. + # It has not been compared to alternative implementations. + # I wonder if there are ways to use assignment using duplicate indices. + (stacked, ind, inv) = cp.unique( + stacked_dup, axis=1, return_index=True, return_inverse=True + ) + if ind.dtype != int_dtype: + ind = ind.astype(int_dtype) + if inv.dtype != int_dtype: + inv = inv.astype(int_dtype) + + # We need to merge edge data + mask = cp.ones(src_indices.size, dtype=bool) + mask[ind] = False + edge_data = [val[mask] for val in edge_values.values()] + edge_data.extend(val[mask] for val in edge_masks.values()) + groups = _groupby(inv[mask], edge_data) + + edge_values = {key: val[ind] for key, val in edge_values.items()} + edge_masks = {key: val[ind] for key, val in edge_masks.items()} + + value_keys = list(edge_values.keys()) + mask_keys = list(edge_masks.keys()) + + values_to_update = defaultdict(list) + masks_to_update = defaultdict(list) + for k, v in groups.items(): + it = iter(v) + vals = dict(zip(value_keys, it)) # zip(strict=False) + masks = dict(zip(mask_keys, it)) # zip(strict=True) + for key, val in vals.items(): + if key in masks: + val = val[masks[key]] + if val.size > 0: + values_to_update[key].append((k, val[-1])) + masks_to_update[key].append((k, True)) + else: + values_to_update[key].append((k, val[-1])) + if key in edge_masks: + 
masks_to_update[key].append((k, True)) + + int_dtype = _get_int_dtype(src_indices.size - 1) + for k, v in values_to_update.items(): + ii, jj = zip(*v) + edge_val = edge_values[k] + edge_val[cp.array(ii, dtype=int_dtype)] = cp.array( + jj, dtype=edge_val.dtype + ) + for k, v in masks_to_update.items(): + ii, jj = zip(*v) + edge_masks[k][cp.array(ii, dtype=int_dtype)] = cp.array( + jj, dtype=bool + ) + src_indices = stacked[0] + dst_indices = stacked[1] + + if G.is_multigraph(): + # `edge_keys` and `edge_indices` are preserved for free if no nodes were merged + extra_kwargs = {"edge_keys": edge_keys, "edge_indices": edge_indices} + else: + extra_kwargs = {} + rv = G.__class__.from_coo( + len(key_to_previd), + src_indices, + dst_indices, + edge_values=edge_values, + edge_masks=edge_masks, + node_values=node_values, + node_masks=node_masks, + id_to_key=newid_to_key, + key_to_id=key_to_newid, + **extra_kwargs, + ) + rv.graph.update(G.graph) + if not copy: + G._become(rv) + return G + return rv + + +@networkx_algorithm(version_added="24.08") +def convert_node_labels_to_integers( + G, first_label=0, ordering="default", label_attribute=None +): + if ordering not in {"default", "sorted", "increasing degree", "decreasing degree"}: + raise nx.NetworkXError(f"Unknown node ordering: {ordering}") + if isinstance(G, nx.Graph): + G = nxcg.from_networkx(G, preserve_all_attrs=True) + G = G.copy() + if label_attribute is not None: + prev_vals = G.id_to_key + if prev_vals is None: + prev_vals = cp.arange(G._N, dtype=_get_int_dtype(G._N - 1)) + else: + try: + prev_vals = np.array(prev_vals) + except ValueError: + prev_vals = np.fromiter(prev_vals, object) + else: + try: + prev_vals = cp.array(prev_vals) + except ValueError: + pass + G.node_values[label_attribute] = prev_vals + G.node_masks.pop(label_attribute, None) + id_to_key = None + if ordering == "default" or ordering == "sorted" and G.key_to_id is None: + if first_label == 0: + G.key_to_id = None + else: + id_to_key = 
list(range(first_label, first_label + G._N)) + G.key_to_id = dict(zip(id_to_key, range(G._N))) + elif ordering == "sorted": + key_to_id = G.key_to_id + G.key_to_id = { + i: key_to_id[n] for i, n in enumerate(sorted(key_to_id), first_label) + } + else: + pairs = sorted( + ((d, n) for (n, d) in G._nodearray_to_dict(G._degrees_array()).items()), + reverse=ordering == "decreasing degree", + ) + key_to_id = G.key_to_id + G.key_to_id = {i: key_to_id[n] for i, (d, n) in enumerate(pairs, first_label)} + G._id_to_key = id_to_key + return G diff --git a/python/nx-cugraph/nx_cugraph/tests/test_convert.py b/python/nx-cugraph/nx_cugraph/tests/test_convert.py index 1a71b796861..634b28e961c 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_convert.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_convert.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -13,10 +13,13 @@ import cupy as cp import networkx as nx import pytest +from packaging.version import parse import nx_cugraph as nxcg from nx_cugraph import interface +nxver = parse(nx.__version__) + @pytest.mark.parametrize( "graph_class", [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph] @@ -224,3 +227,48 @@ def test_multigraph(graph_class): H = nxcg.to_networkx(Gcg) assert type(G) is type(H) assert nx.utils.graphs_equal(G, H) + + +def test_to_dict_of_lists(): + G = nx.MultiGraph() + G.add_edge("a", "b") + G.add_edge("a", "c") + G.add_edge("a", "b") + expected = nx.to_dict_of_lists(G) + result = nxcg.to_dict_of_lists(G) + assert expected == result + expected = nx.to_dict_of_lists(G, nodelist=["a", "b"]) + result = nxcg.to_dict_of_lists(G, nodelist=["a", "b"]) + assert expected == result + with pytest.raises(nx.NetworkXError, match="The node d is not in the graph"): + nx.to_dict_of_lists(G, nodelist=["a", "d"]) + with pytest.raises(nx.NetworkXError, match="The node d is not in the graph"): + nxcg.to_dict_of_lists(G, nodelist=["a", "d"]) + G.add_node("d") # No edges + expected = nx.to_dict_of_lists(G) + result = nxcg.to_dict_of_lists(G) + assert expected == result + expected = nx.to_dict_of_lists(G, nodelist=["a", "d"]) + result = nxcg.to_dict_of_lists(G, nodelist=["a", "d"]) + assert expected == result + # Now try with default node ids + G = nx.DiGraph() + G.add_edge(0, 1) + G.add_edge(0, 2) + expected = nx.to_dict_of_lists(G) + result = nxcg.to_dict_of_lists(G) + assert expected == result + expected = nx.to_dict_of_lists(G, nodelist=[0, 1]) + result = nxcg.to_dict_of_lists(G, nodelist=[0, 1]) + assert expected == result + with pytest.raises(nx.NetworkXError, match="The node 3 is not in the digraph"): + nx.to_dict_of_lists(G, nodelist=[0, 3]) + with pytest.raises(nx.NetworkXError, match="The node 3 is not in the digraph"): + nxcg.to_dict_of_lists(G, nodelist=[0, 3]) + G.add_node(3) # No edges + expected = 
nx.to_dict_of_lists(G) + result = nxcg.to_dict_of_lists(G) + assert expected == result + expected = nx.to_dict_of_lists(G, nodelist=[0, 3]) + result = nxcg.to_dict_of_lists(G, nodelist=[0, 3]) + assert expected == result diff --git a/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py b/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py new file mode 100644 index 00000000000..0a9cc087ce0 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py @@ -0,0 +1,86 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import networkx as nx +import pandas as pd +import pytest + +import nx_cugraph as nxcg +from nx_cugraph.utils import _cp_iscopied_asarray + +try: + import cudf +except ModuleNotFoundError: + cudf = None + + +DATA = [ + {"source": [0, 1], "target": [1, 2]}, # nodes are 0, 1, 2 + {"source": [0, 1], "target": [1, 3]}, # nodes are 0, 1, 3 (need renumbered!) 
+ {"source": ["a", "b"], "target": ["b", "c"]}, # nodes are 'a', 'b', 'c' +] +CREATE_USING = [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph] + + +@pytest.mark.skipif("not cudf") +@pytest.mark.parametrize("data", DATA) +@pytest.mark.parametrize("create_using", CREATE_USING) +def test_from_cudf_edgelist(data, create_using): + df = cudf.DataFrame(data) + nxcg.from_pandas_edgelist(df, create_using=create_using) # Basic smoke test + source = df["source"] + if source.dtype == int: + is_copied, src_array = _cp_iscopied_asarray(source) + assert is_copied is False + is_copied, src_array = _cp_iscopied_asarray(source.to_cupy()) + assert is_copied is False + is_copied, src_array = _cp_iscopied_asarray(source, orig_object=source) + assert is_copied is False + is_copied, src_array = _cp_iscopied_asarray( + source.to_cupy(), orig_object=source + ) + assert is_copied is False + # to numpy + is_copied, src_array = _cp_iscopied_asarray(source.to_numpy()) + assert is_copied is True + is_copied, src_array = _cp_iscopied_asarray( + source.to_numpy(), orig_object=source + ) + assert is_copied is True + else: + with pytest.raises(TypeError): + _cp_iscopied_asarray(source) + with pytest.raises(TypeError): + _cp_iscopied_asarray(source.to_cupy()) + with pytest.raises(ValueError, match="Unsupported dtype"): + _cp_iscopied_asarray(source.to_numpy()) + with pytest.raises(ValueError, match="Unsupported dtype"): + _cp_iscopied_asarray(source.to_numpy(), orig_object=source) + + +@pytest.mark.parametrize("data", DATA) +@pytest.mark.parametrize("create_using", CREATE_USING) +def test_from_pandas_edgelist(data, create_using): + df = pd.DataFrame(data) + nxcg.from_pandas_edgelist(df, create_using=create_using) # Basic smoke test + source = df["source"] + if source.dtype == int: + is_copied, src_array = _cp_iscopied_asarray(source) + assert is_copied is True + is_copied, src_array = _cp_iscopied_asarray(source, orig_object=source) + assert is_copied is True + is_copied, src_array = 
_cp_iscopied_asarray(source.to_numpy()) + assert is_copied is True + is_copied, src_array = _cp_iscopied_asarray( + source.to_numpy(), orig_object=source + ) + assert is_copied is True diff --git a/python/nx-cugraph/nx_cugraph/tests/test_match_api.py b/python/nx-cugraph/nx_cugraph/tests/test_match_api.py index d784d8c13cb..176b531a6e7 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_match_api.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_match_api.py @@ -48,7 +48,7 @@ def test_match_signature_and_names(): orig_sig = inspect.signature(orig_func) func_sig = inspect.signature(func) if not func.extra_params: - assert orig_sig == func_sig + assert orig_sig == func_sig, name else: # Ignore extra parameters added to nx-cugraph algorithm # The key of func.extra_params may be like "max_level : int, optional", @@ -60,14 +60,14 @@ def test_match_signature_and_names(): for name, p in func_sig.parameters.items() if name not in extra_params ] - ) + ), name if func.can_run is not nxcg.utils.decorators._default_can_run: - assert func_sig == inspect.signature(func.can_run) + assert func_sig == inspect.signature(func.can_run), name if func.should_run is not nxcg.utils.decorators._default_should_run: - assert func_sig == inspect.signature(func.should_run) + assert func_sig == inspect.signature(func.should_run), name # Matching function names? - assert func.__name__ == dispatchable_func.__name__ == orig_func.__name__ + assert func.__name__ == dispatchable_func.__name__ == orig_func.__name__, name # Matching dispatch names? # nx version >=3.2 uses name, version >=3.0,<3.2 uses dispatchname @@ -75,14 +75,14 @@ def test_match_signature_and_names(): dispatchname = dispatchable_func.dispatchname else: dispatchname = dispatchable_func.name - assert func.name == dispatchname + assert func.name == dispatchname, name # Matching modules (i.e., where function defined)? assert ( "networkx." 
+ func.__module__.split(".", 1)[1] == dispatchable_func.__module__ == orig_func.__module__ - ) + ), name # Matching package layout (i.e., which modules have the function)? nxcg_path = func.__module__ diff --git a/python/nx-cugraph/nx_cugraph/tests/test_relabel.py b/python/nx-cugraph/nx_cugraph/tests/test_relabel.py new file mode 100644 index 00000000000..40bf851d376 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/tests/test_relabel.py @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import networkx as nx +import pytest + +import nx_cugraph as nxcg + +from .testing_utils import assert_graphs_equal + + +@pytest.mark.parametrize( + "create_using", [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph] +) +def test_relabel(create_using): + G = nx.complete_graph(3, create_using=create_using) + Hnx = nx.relabel_nodes(G, {2: 1}) + Hcg = nxcg.relabel_nodes(G, {2: 1}) + assert_graphs_equal(Hnx, Hcg) + + G.add_edge(0, 2, a=11) + G.add_edge(1, 2, b=22) + Hnx = nx.relabel_nodes(G, {2: 10, 1: 10}) + Hcg = nxcg.relabel_nodes(G, {2: 10, 1: 10}) + assert_graphs_equal(Hnx, Hcg) + + G = nx.path_graph(3, create_using=create_using) + Hnx = nx.relabel_nodes(G, {2: 0}) + Hcg = nxcg.relabel_nodes(G, {2: 0}) + assert_graphs_equal(Hnx, Hcg) + + +@pytest.mark.parametrize("create_using", [nx.MultiGraph, nx.MultiDiGraph]) +def test_relabel_multigraph(create_using): + G = nx.empty_graph(create_using=create_using) + G.add_edge(0, 1, "x", a=11) + G.add_edge(0, 2, "y", a=10, b=6) + G.add_edge(0, 0, c=7) + G.add_edge(0, 0, "x", a=-1, b=-1, c=-1) + Hnx = nx.relabel_nodes(G, {0: 1, 2: 1}) + Hcg = nxcg.relabel_nodes(G, {0: 1, 2: 1}) + assert_graphs_equal(Hnx, Hcg) + Hnx = nx.relabel_nodes(G, {2: 3, 1: 3, 0: 3}) + Hcg = nxcg.relabel_nodes(G, {2: 3, 1: 3, 0: 3}) + assert_graphs_equal(Hnx, Hcg) + + +def test_relabel_nx_input(): + G = nx.complete_graph(3) + with pytest.raises(RuntimeError, match="Using `copy=False` is invalid"): + nxcg.relabel_nodes(G, {0: 1}, copy=False) + Hnx = nx.relabel_nodes(G, {0: 1}, copy=True) + Hcg = nxcg.relabel_nodes(G, {0: 1}, copy=True) + assert_graphs_equal(Hnx, Hcg) diff --git a/python/nx-cugraph/nx_cugraph/tests/test_utils.py b/python/nx-cugraph/nx_cugraph/tests/test_utils.py index fdd0c91995c..d38a286fa5d 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_utils.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,10 +10,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import cupy as cp import numpy as np import pytest -from nx_cugraph.utils import _get_int_dtype +from nx_cugraph.utils import _cp_iscopied_asarray, _get_int_dtype def test_get_int_dtype(): @@ -85,3 +86,20 @@ def test_get_int_dtype(): _get_int_dtype(7, signed=True, unsigned=True) assert _get_int_dtype(7, signed=True, unsigned=False) == np.int8 assert _get_int_dtype(7, signed=False, unsigned=True) == np.uint8 + + +def test_cp_iscopied_asarray(): + # We don't yet run doctest, so do simple copy/paste test here. + # + # >>> is_copied, a = _cp_iscopied_asarray([1, 2, 3]) + # >>> is_copied + # True + # >>> a + # array([1, 2, 3]) + # >>> _cp_iscopied_asarray(a) + # (False, array([1, 2, 3])) + is_copied, a = _cp_iscopied_asarray([1, 2, 3]) + assert is_copied is True + assert isinstance(a, cp.ndarray) + assert repr(a) == "array([1, 2, 3])" + assert _cp_iscopied_asarray(a)[0] is False diff --git a/python/nx-cugraph/nx_cugraph/tests/testing_utils.py b/python/nx-cugraph/nx_cugraph/tests/testing_utils.py index 6d4741c9ca6..529a96efd81 100644 --- a/python/nx-cugraph/nx_cugraph/tests/testing_utils.py +++ b/python/nx-cugraph/nx_cugraph/tests/testing_utils.py @@ -18,10 +18,10 @@ def assert_graphs_equal(Gnx, Gcg): assert isinstance(Gnx, nx.Graph) assert isinstance(Gcg, nxcg.Graph) - assert Gnx.number_of_nodes() == Gcg.number_of_nodes() - assert Gnx.number_of_edges() == Gcg.number_of_edges() - assert Gnx.is_directed() == Gcg.is_directed() - assert Gnx.is_multigraph() == Gcg.is_multigraph() + assert (a := Gnx.number_of_nodes()) == (b := Gcg.number_of_nodes()), (a, b) + assert (a := Gnx.number_of_edges()) == (b := 
Gcg.number_of_edges()), (a, b) + assert (a := Gnx.is_directed()) == (b := Gcg.is_directed()), (a, b) + assert (a := Gnx.is_multigraph()) == (b := Gcg.is_multigraph()), (a, b) G = nxcg.to_networkx(Gcg) rv = nx.utils.graphs_equal(G, Gnx) if not rv: diff --git a/python/nx-cugraph/nx_cugraph/utils/misc.py b/python/nx-cugraph/nx_cugraph/utils/misc.py index eab4b42c2cc..8526524f1de 100644 --- a/python/nx-cugraph/nx_cugraph/utils/misc.py +++ b/python/nx-cugraph/nx_cugraph/utils/misc.py @@ -45,6 +45,7 @@ def pairwise(it): "_get_int_dtype", "_get_float_dtype", "_dtype_param", + "_cp_iscopied_asarray", ] # This may switch to np.uint32 at some point @@ -206,3 +207,34 @@ def _get_float_dtype( f"Dtype {dtype} cannot be safely promoted to float32 or float64" ) return rv + + +def _cp_iscopied_asarray(a, *args, orig_object=None, **kwargs): + """Like ``cp.asarray``, but also returns whether the input was copied. + + Use this to avoid unnecessary copies. If given, ``orig_object`` will + also be inspected to determine if it was copied. + + >>> is_copied, a = _cp_iscopied_asarray([1, 2, 3]) + >>> is_copied + True + >>> a + array([1, 2, 3]) + >>> _cp_iscopied_asarray(a) + (False, array([1, 2, 3])) + """ + arr = cp.asarray(a, *args, **kwargs) + ptr = arr.__cuda_array_interface__["data"][0] + if ( + hasattr(a, "__cuda_array_interface__") + and a.__cuda_array_interface__["data"][0] == ptr + and ( + orig_object is None + or hasattr(orig_object, "__cuda_array_interface__") + and orig_object.__cuda_array_interface__["data"][0] == ptr + ) + # Should we also check device_id? 
+ # and getattr(getattr(a, "data", None), "device_id", None) == arr.data.device_id + ): + return False, arr + return True, arr diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml index 50881d5db90..847444f9dd1 100644 --- a/python/nx-cugraph/pyproject.toml +++ b/python/nx-cugraph/pyproject.toml @@ -35,7 +35,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "networkx>=3.0", "numpy>=1.23,<2.0a0", - "pylibcugraph==24.8.*,>=0.0.0a0", + "pylibcugraph==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] @@ -86,6 +86,7 @@ include = [ build-backend = "setuptools.build_meta" commit-files = ["_nx_cugraph/GIT_COMMIT"] dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" [tool.black] line-length = 88 @@ -180,6 +181,7 @@ ignore = [ # "SIM300", # Yoda conditions are discouraged, use ... instead (Note: we're not this picky) # "SIM401", # Use dict.get ... 
instead of if-else-block (Note: if-else better for coverage and sometimes clearer) # "TRY004", # Prefer `TypeError` exception for invalid type (Note: good advice, but not worth the nuisance) + "B020", # Found for loop that reassigns the iterable it is iterating with each iterable value (too strict) "B904", # Bare `raise` inside exception clause (like TRY200; sometimes okay) "S310", # Audit URL open for permitted schemes (Note: we don't download URLs in normal usage) @@ -206,6 +208,7 @@ ignore = [ "RET502", # Do not implicitly `return None` in function able to return non-`None` value "RET503", # Missing explicit `return` at the end of function able to return non-`None` value "RET504", # Unnecessary variable assignment before `return` statement + "RUF018", # Avoid assignment expressions in `assert` statements "S110", # `try`-`except`-`pass` detected, consider logging the exception (Note: good advice, but we don't log) "S112", # `try`-`except`-`continue` detected, consider logging the exception (Note: good advice, but we don't log) "SIM102", # Use a single `if` statement instead of nested `if` statements (Note: often necessary) @@ -241,6 +244,7 @@ ignore = [ "nx_cugraph/algorithms/**/*py" = ["D205", "D401"] # Allow flexible docstrings for algorithms "nx_cugraph/generators/**/*py" = ["D205", "D401"] # Allow flexible docstrings for generators "nx_cugraph/interface.py" = ["D401"] # Flexible docstrings +"nx_cugraph/convert.py" = ["E721"] # Allow `dtype == object` "scripts/update_readme.py" = ["INP001"] # Not part of a package [tool.ruff.lint.flake8-annotations] diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt index 7cc90145949..90fce23282e 100644 --- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt @@ -39,6 +39,7 @@ set(cython_sources jaccard_coefficients.pyx sorensen_coefficients.pyx overlap_coefficients.pyx + cosine_coefficients.pyx katz_centrality.pyx 
leiden.pyx louvain.pyx @@ -58,6 +59,10 @@ set(cython_sources weakly_connected_components.pyx replicate_edgelist.pyx degrees.pyx + all_pairs_jaccard_coefficients.pyx + all_pairs_sorensen_coefficients.pyx + all_pairs_overlap_coefficients.pyx + all_pairs_cosine_coefficients.pyx ) set(linked_libraries cugraph::cugraph;cugraph::cugraph_c) diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index dcdef05e106..b67acc8bbfc 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -95,6 +95,16 @@ from pylibcugraph.sorensen_coefficients import sorensen_coefficients +from pylibcugraph.cosine_coefficients import cosine_coefficients + +from pylibcugraph.all_pairs_jaccard_coefficients import all_pairs_jaccard_coefficients + +from pylibcugraph.all_pairs_overlap_coefficients import all_pairs_overlap_coefficients + +from pylibcugraph.all_pairs_sorensen_coefficients import all_pairs_sorensen_coefficients + +from pylibcugraph.all_pairs_cosine_coefficients import all_pairs_cosine_coefficients + from pylibcugraph.degrees import in_degrees, out_degrees, degrees diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd index 406094f18d5..71d094a6058 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd @@ -35,11 +35,14 @@ from pylibcugraph._cugraph_c.graph_functions cimport ( cdef extern from "cugraph_c/similarity_algorithms.h": + ########################################################################### - #""" ctypedef struct cugraph_similarity_result_t: pass - #""" + + cdef cugraph_vertex_pairs_t* \ + cugraph_similarity_result_get_vertex_pairs( + cugraph_similarity_result_t* result); cdef cugraph_type_erased_device_array_view_t* \ cugraph_similarity_result_get_similarity( @@ -64,6 
+67,20 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_error_t** error ) + ########################################################################### + # all-pairs jaccard coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_jaccard_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) + ########################################################################### # sorensen coefficients cdef cugraph_error_code_t \ @@ -77,6 +94,20 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_error_t** error ) + ########################################################################### + # all-pairs sorensen coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_sorensen_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) + ########################################################################### # overlap coefficients cdef cugraph_error_code_t \ @@ -89,3 +120,44 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) + + ########################################################################### + # all-pairs overlap coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_overlap_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) + + 
########################################################################### + # cosine coefficients + cdef cugraph_error_code_t \ + cugraph_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_vertex_pairs_t* vertex_pairs, + bool_t use_weight, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) + + ########################################################################### + # all-pairs cosine coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx new file mode 100644 index 00000000000..b600dd48567 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx @@ -0,0 +1,164 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_all_pairs_cosine_similarity_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_get_vertex_pairs, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj, + SIZE_MAX +) + + +def all_pairs_cosine_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + vertices, + bool_t use_weight, + topk, + bool_t do_expensive_check): + """ + Perform All-Pairs Cosine similarity computation. + + Note that Cosine similarity must run on a symmetric graph. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + vertices : cudf.Series or None + Vertex list to compute all-pairs. If None, then compute based + on all vertices in the graph. 
+ + use_weight : bool, optional + If set to True, then compute weighted cosine_coefficients( + the input graph must be weighted in that case). + Otherwise, compute non-weighted cosine_coefficients + + topk : size_t + Specify the number of answers to return otherwise will return all values. + + + do_expensive_check : bool + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays containing the vertex pairs with + their corresponding Cosine coefficient scores. + + Examples + -------- + # FIXME: No example yet + + """ + + if topk is None: + topk = SIZE_MAX + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef cugraph_similarity_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef cugraph_type_erased_device_array_view_t* \ + vertices_view_ptr = \ + create_cugraph_type_erased_device_array_view_from_py_obj( + vertices) + + error_code = cugraph_all_pairs_cosine_similarity_coefficients(c_resource_handle_ptr, + c_graph_ptr, + vertices_view_ptr, + use_weight, + topk, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_all_pairs_cosine_similarity_coefficients") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
+ cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ + cugraph_similarity_result_get_vertex_pairs(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_view_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_view_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_view_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(vertices_view_ptr) + # No need to free 'first_view_ptr' and 'second_view_ptr' as their memory + # are already deallocated when freeing 'result_ptr' + + return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx new file mode 100644 index 00000000000..b65905b6850 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx @@ -0,0 +1,164 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_all_pairs_jaccard_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_get_vertex_pairs, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj, + SIZE_MAX +) + + +def all_pairs_jaccard_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + vertices, + bool_t use_weight, + topk, + bool_t do_expensive_check): + """ + Perform All-Pairs Jaccard similarity computation. + + Note that Jaccard similarity must run on a symmetric graph. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + vertices : cudf.Series or None + Vertex list to compute all-pairs. If None, then compute based + on all vertices in the graph. 
+ + use_weight : bool, optional + If set to True, then compute weighted jaccard_coefficients( + the input graph must be weighted in that case). + Otherwise, compute non-weighted jaccard_coefficients + + topk : size_t + Specify the number of answers to return otherwise will return all values. + + + do_expensive_check : bool + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays containing the vertex pairs with + their corresponding Jaccard coefficient scores. + + Examples + -------- + # FIXME: No example yet + + """ + + if topk is None: + topk = SIZE_MAX + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef cugraph_similarity_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef cugraph_type_erased_device_array_view_t* \ + vertices_view_ptr = \ + create_cugraph_type_erased_device_array_view_from_py_obj( + vertices) + + error_code = cugraph_all_pairs_jaccard_coefficients(c_resource_handle_ptr, + c_graph_ptr, + vertices_view_ptr, + use_weight, + topk, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_all_pairs_jaccard_coefficients") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
+ cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ + cugraph_similarity_result_get_vertex_pairs(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_view_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_view_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_view_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(vertices_view_ptr) + # No need to free 'first_view_ptr' and 'second_view_ptr' as their memory + # are already deallocated when freeing 'result_ptr' + + return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx new file mode 100644 index 00000000000..74f3bc06a94 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx @@ -0,0 +1,164 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_all_pairs_overlap_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_get_vertex_pairs, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj, + SIZE_MAX +) + + +def all_pairs_overlap_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + vertices, + bool_t use_weight, + topk, + bool_t do_expensive_check): + """ + Perform All-Pairs Overlap similarity computation. + + Note that Overlap similarity must run on a symmetric graph. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + vertices : cudf.Series or None + Vertex list to compute all-pairs. If None, then compute based + on all vertices in the graph. 
+
+    use_weight : bool, optional
+        If set to True, then compute weighted overlap_coefficients(
+        the input graph must be weighted in that case).
+        Otherwise, compute non-weighted overlap_coefficients
+
+    topk : size_t
+        Specify the number of answers to return otherwise will return all values.
+
+
+    do_expensive_check : bool
+        If True, performs more extensive tests on the inputs to ensure
+        validity, at the expense of increased run time.
+
+    Returns
+    -------
+    A tuple of device arrays containing the vertex pairs with
+    their corresponding Overlap coefficient scores.
+
+    Examples
+    --------
+    # FIXME: No example yet
+
+    """
+
+    if topk is None:
+        topk = SIZE_MAX
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_similarity_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        vertices_view_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                vertices)
+
+    error_code = cugraph_all_pairs_overlap_coefficients(c_resource_handle_ptr,
+                                                        c_graph_ptr,
+                                                        vertices_view_ptr,
+                                                        use_weight,
+                                                        topk,
+                                                        do_expensive_check,
+                                                        &result_ptr,
+                                                        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_all_pairs_overlap_coefficients")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+ cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ + cugraph_similarity_result_get_vertex_pairs(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_view_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_view_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_view_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(vertices_view_ptr) + # No need to free 'first_view_ptr' and 'second_view_ptr' as their memory + # are already deallocated when freeing 'result_ptr' + + return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx new file mode 100644 index 00000000000..5e3fc24a4b4 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx @@ -0,0 +1,164 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_all_pairs_sorensen_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_get_vertex_pairs, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj, + SIZE_MAX +) + + +def all_pairs_sorensen_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + vertices, + bool_t use_weight, + topk, + bool_t do_expensive_check): + """ + Perform All-Pairs Sorensen similarity computation. + + Note that Sorensen similarity must run on a symmetric graph. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + vertices : cudf.Series or None + Vertex list to compute all-pairs. If None, then compute based + on all vertices in the graph. 
+
+    use_weight : bool, optional
+        If set to True, then compute weighted sorensen_coefficients(
+        the input graph must be weighted in that case).
+        Otherwise, compute non-weighted sorensen_coefficients
+
+    topk : size_t
+        Specify the number of answers to return otherwise will return all values.
+
+
+    do_expensive_check : bool
+        If True, performs more extensive tests on the inputs to ensure
+        validity, at the expense of increased run time.
+
+    Returns
+    -------
+    A tuple of device arrays containing the vertex pairs with
+    their corresponding Sorensen coefficient scores.
+
+    Examples
+    --------
+    # FIXME: No example yet
+
+    """
+
+    if topk is None:
+        topk = SIZE_MAX
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_similarity_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    cdef cugraph_type_erased_device_array_view_t* \
+        vertices_view_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                vertices)
+
+    error_code = cugraph_all_pairs_sorensen_coefficients(c_resource_handle_ptr,
+                                                         c_graph_ptr,
+                                                         vertices_view_ptr,
+                                                         use_weight,
+                                                         topk,
+                                                         do_expensive_check,
+                                                         &result_ptr,
+                                                         &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_all_pairs_sorensen_coefficients")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+ cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ + cugraph_similarity_result_get_vertex_pairs(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_view_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_view_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_view_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(vertices_view_ptr) + # No need to free 'first_view_ptr' and 'second_view_ptr' as their memory + # are already deallocated when freeing 'result_ptr' + + return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx b/python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx new file mode 100644 index 00000000000..df194fe364e --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx @@ -0,0 +1,171 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf +from cython.operator cimport dereference + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, + cugraph_create_vertex_pairs +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_cosine_similarity_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj +) + + +def cosine_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + first, + second, + bool_t use_weight, + bool_t do_expensive_check): + """ + Compute the Cosine coefficients for the specified vertex_pairs. + + Note that Cosine similarity must run on a symmetric graph. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + first : + Source of the vertex pair. + + second : + Destination of the vertex pair. 
+
+    use_weight : bool, optional
+        If set to True, then compute weighted cosine_coefficients(
+        the input graph must be weighted in that case).
+        Otherwise, compute un-weighted cosine_coefficients
+
+    do_expensive_check : bool
+        If True, performs more extensive tests on the inputs to ensure
+        validity, at the expense of increased run time.
+
+    Returns
+    -------
+    A tuple of device arrays containing the vertex pairs with
+    their corresponding Cosine coefficient scores.
+
+    Examples
+    --------
+    # FIXME: No example yet
+
+    """
+
+    cdef cugraph_vertex_pairs_t* vertex_pairs_ptr
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_similarity_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    # 'first' is a required parameter
+    cdef cugraph_type_erased_device_array_view_t* \
+        first_view_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                first)
+
+    # 'second' is a required parameter
+    cdef cugraph_type_erased_device_array_view_t* \
+        second_view_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                second)
+
+    error_code = cugraph_create_vertex_pairs(c_resource_handle_ptr,
+                                             c_graph_ptr,
+                                             first_view_ptr,
+                                             second_view_ptr,
+                                             &vertex_pairs_ptr,
+                                             &error_ptr)
+    assert_success(error_code, error_ptr, "vertex_pairs")
+
+    error_code = cugraph_cosine_similarity_coefficients(c_resource_handle_ptr,
+                                                        c_graph_ptr,
+                                                        vertex_pairs_ptr,
+                                                        use_weight,
+                                                        do_expensive_check,
+                                                        &result_ptr,
+                                                        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_cosine_similarity_coefficients")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+ cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(first_view_ptr) + cugraph_type_erased_device_array_view_free(second_view_ptr) + + return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/k_truss_subgraph.pyx b/python/pylibcugraph/pylibcugraph/k_truss_subgraph.pyx index 6e4cd2e282a..9ea533c9f28 100644 --- a/python/pylibcugraph/pylibcugraph/k_truss_subgraph.pyx +++ b/python/pylibcugraph/pylibcugraph/k_truss_subgraph.pyx @@ -65,7 +65,7 @@ def k_truss_subgraph(ResourceHandle resource_handle, Handle to the underlying device resources needed for referencing data and running algorithms. - graph : SGGraph + graph : SGGraph or MGGraph The input graph. k: size_t diff --git a/python/pylibcugraph/pylibcugraph/utils.pxd b/python/pylibcugraph/pylibcugraph/utils.pxd index 7fc140e9aed..21ab49a1f1e 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pxd +++ b/python/pylibcugraph/pylibcugraph/utils.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -57,3 +57,6 @@ cdef cugraph_type_erased_device_array_view_t* \ cdef create_cupy_array_view_for_device_ptr( cugraph_type_erased_device_array_view_t* device_array_view_ptr, owning_py_object) + +cdef extern from "stdint.h": + size_t SIZE_MAX diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml index 984e1d140f2..4dd513a4902 100644 --- a/python/pylibcugraph/pyproject.toml +++ b/python/pylibcugraph/pyproject.toml @@ -23,8 +23,8 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "pylibraft==24.8.*,>=0.0.0a0", - "rmm==24.8.*,>=0.0.0a0", + "pylibraft==24.10.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -36,7 +36,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "cudf==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", "numpy>=1.23,<2.0a0", "pandas", "pytest", @@ -69,6 +69,7 @@ dependencies-file = "../../dependencies.yaml" requires = [ "cmake>=3.26.4,!=3.30.0", "ninja", - "pylibraft==24.8.*,>=0.0.0a0", - "rmm==24.8.*,>=0.0.0a0", + "pylibraft==24.10.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +matrix-entry = "cuda_suffixed=true"