diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt index 2ef0d7c7..df9dd473 100644 --- a/EXAMPLE/CMakeLists.txt +++ b/EXAMPLE/CMakeLists.txt @@ -67,7 +67,12 @@ if(enable_double) set(DEXM3D pddrive3d.c dcreate_matrix.c dcreate_matrix3d.c) add_executable(pddrive3d ${DEXM3D}) target_link_libraries(pddrive3d ${all_link_libs}) - install(TARGETS pddrive3d RUNTIME DESTINATION "${INSTALL_LIB_DIR}/EXAMPLE") + install(TARGETS pddrive3d RUNTIME DESTINATION "${INSTALL_LIB_DIR}/EXAMPLE") + + set(DEXM3D pddrive3d_vbatch.c dcreate_matrix.c dcreate_matrix3d.c) + add_executable(pddrive3d_vbatch ${DEXM3D}) + target_link_libraries(pddrive3d_vbatch ${all_link_libs}) + install(TARGETS pddrive3d_vbatch RUNTIME DESTINATION "${INSTALL_LIB_DIR}/EXAMPLE") set(DEXM3D pddrive3d_block_diag.c dcreate_matrix.c dcreate_matrix3d.c) add_executable(pddrive3d_block_diag ${DEXM3D}) diff --git a/EXAMPLE/dcreate_matrix3d.c b/EXAMPLE/dcreate_matrix3d.c index 03b35b59..83e40a9e 100644 --- a/EXAMPLE/dcreate_matrix3d.c +++ b/EXAMPLE/dcreate_matrix3d.c @@ -710,3 +710,105 @@ int dcreate_batch_systems(handle_t *SparseMatrix_handles, int batchCount, #endif return 0; } /* end dcreate_batch_systems */ + +int dcreate_batch_systems_multiple(handle_t *SparseMatrix_handles, int batchCount, + int nrhs, double **RHSptr, int *ldRHS, double **xtrue, int *ldX, + FILE **fp, char * postfix, gridinfo3d_t *grid3d) +{ + int_t *rowind_d, *colptr_d; /* metadata for one diagonal block */ + double *nzval_d; /* global */ + int_t m, n, nnz; + int row, col, i, j, relpos; + int iam; + char trans[1]; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) +CHECK_MALLOC(iam, "Enter dcreate_batch_systems()"); +#endif + + /* Allocate storage for CSC containing all the matrices */ + SuperMatrix **A = SUPERLU_MALLOC( batchCount * sizeof(SuperMatrix *) ); + int d = 0; + for (d = 0; d < batchCount; ++d) { + int_t *rowind, *colptr; + double *nzval; + + FILE *fpd = fp[d]; + if ( !iam ) + { + double t = SuperLU_timer_(); + + if 
(!strcmp(postfix, "rua"))
+	{
+	    /* Read the matrix stored on disk in Harwell-Boeing format. */
+	    dreadhb_dist(iam, fpd, &m, &n, &nnz, &nzval, &rowind, &colptr);
+	}
+	else if (!strcmp(postfix, "mtx"))
+	{
+	    /* Read the matrix stored on disk in Matrix Market format. */
+	    dreadMM_dist(fpd, &m, &n, &nnz, &nzval, &rowind, &colptr);
+	}
+	else if (!strcmp(postfix, "rb"))
+	{
+	    /* Read the matrix stored on disk in Rutherford-Boeing format. */
+	    dreadrb_dist(iam, fpd, &m, &n, &nnz, &nzval, &rowind, &colptr);
+	}
+	else if (!strcmp(postfix, "dat"))
+	{
+	    /* Read the matrix stored on disk in triplet format. */
+	    dreadtriple_dist(fpd, &m, &n, &nnz, &nzval, &rowind, &colptr);
+	}
+	else if (!strcmp(postfix, "datnh"))
+	{
+	    /* Read the matrix stored on disk in triplet format (without header). */
+	    dreadtriple_noheader(fpd, &m, &n, &nnz, &nzval, &rowind, &colptr);
+	}
+	else if (!strcmp(postfix, "bin"))
+	{
+	    /* Read the matrix stored on disk in binary format. */
+	    dread_binary(fpd, &m, &n, &nnz, &nzval, &rowind, &colptr);
+	}
+	else
+	{
+	    ABORT("File format not known");
+	}
+
+	printf("Time to read and distribute matrix %.2f\n",
+	       SuperLU_timer_() - t); fflush(stdout);
+    }
+
+	int_t *rowind_d, *colptr_d; /* each block; NOTE(review): shadows the outer rowind_d/colptr_d declared at function scope -- the outer pair is dead */
+
+	/* Allocate storage for compressed column representation. */
+	dallocateA_dist(n, nnz, &nzval_d, &rowind_d, &colptr_d);
+
+	/* Copy the CSC arrays. NOTE(review): m, n, nnz, nzval, rowind, colptr are set only on rank 0 (the read above is guarded by "if (!iam)"); non-root ranks use them uninitialized here -- broadcast them (or read on every rank) before this copy. TODO confirm against dcreate_batch_systems(). */
+	for (j = 0; j < n+1; ++j) colptr_d[j] = colptr[j];
+	for (i = 0; i < nnz; ++i) {
+	    rowind_d[i] = rowind[i];
+	    nzval_d[i] = nzval[i];
+	}
+
+	/* Create compressed column matrix. */
+	A[d] = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
+	dCreate_CompCol_Matrix_dist(A[d], m, n, nnz, nzval_d, rowind_d, colptr_d,
+				    SLU_NC, SLU_D, SLU_GE);
+	SparseMatrix_handles[d] = (handle_t) A[d];
+
+	/* Generate the exact solutions and compute the right-hand sides. 
*/
+	RHSptr[d] = doubleMalloc_dist( m * nrhs );
+	xtrue[d] = doubleMalloc_dist( n * nrhs );
+	ldRHS[d] = m;
+	ldX[d] = n;
+	*trans = 'N';
+	dGenXtrue_dist(n, nrhs, xtrue[d], n);
+	dFillRHS_dist(trans, nrhs, xtrue[d], n, A[d], RHSptr[d], m);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit dcreate_batch_systems_multiple()");
+#endif
+    return 0;
+} /* end dcreate_batch_systems_multiple */
\ No newline at end of file
diff --git a/EXAMPLE/pddrive3d_block_diag_vbatch.c b/EXAMPLE/pddrive3d_block_diag_vbatch.c
new file mode 100755
index 00000000..7887da69
--- /dev/null
+++ b/EXAMPLE/pddrive3d_block_diag_vbatch.c
@@ -0,0 +1,498 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Driver program for PDGSSVX3D example
+ *
+ *
+ * -- Distributed SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab, Georgia Institute of Technology, + * Oak Ridge National Lab + * May 12, 2021 + * August 27, 2022 Add batch option + * + */ +#include "superlu_ddefs.h" + +/*! \brief + * + *+ * Purpose + * ======= + * + * The driver program PDDRIVE3D. + * + * This example illustrates how to use PDGSSVX3D with the full + * (default) options to solve a linear system. + * + * Five basic steps are required: + * 1. Initialize the MPI environment and the SuperLU process grid + * 2. Set up the input matrix and the right-hand side + * 3. Set the options argument + * 4. Call pdgssvx + * 5. Release the process grid and terminate the MPI environment + * + * The program may be run by typing + * mpiexec -np+ */ + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) +{ + for(int j=0; jpddrive3d -r
-c \ + * -d + * NOTE: total number of processes p = r * c * d + * d must be a power-of-two, e.g., 1, 2, 4, ... + * + * nnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *xtrue; + int_t m, n; + int nprow, npcol, npdep; + int lookahead, colperm, rowperm, ir; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level, batchCount = 0; + int* usermap; /* The following variables are used for batch solves */ + float result_min[2]; + result_min[0]=1e10; + result_min[1]=1e10; + float result_max[2]; + result_max[0]=0.0; + result_max[1]=0.0; + MPI_Comm SubComm; + int myrank, p; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + lookahead = -1; + colperm = -1; + rowperm = -1; + ir = -1; + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + case 'l': lookahead = atoi(*cpp); + break; + case 'p': rowperm = atoi(*cpp); + break; + case 'q': colperm = atoi(*cpp); + break; + case 'i': ir = atoi(*cpp); + break; + case 'b': batchCount = atoi(*cpp); + break; + case 's': nrhs = atoi(*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. 
+ ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); +#ifdef GPU_ACC + int superlu_acc_offload = get_acc_offload(); + if (superlu_acc_offload) { + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + double t1 = SuperLU_timer_(); + gpuFree(0); + double t2 = SuperLU_timer_(); + if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1); + gpublasHandle_t hb; + gpublasCreate(&hb); + if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2); + gpublasDestroy(hb); + } +#endif + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + fflush(stdout); + } + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+ ------------------------------------------------------------ */ + for (ii = 0; ii 0 ) { + printf("batchCount %d\n", batchCount); + dcreate_block_diag_3d(&A, batchCount, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &grid); + } else { + dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb, + &xtrue, &ldx, fp, suffix, &(grid)); + } + + //printf("ldx %d, ldb %d\n", ldx, ldb); + +#if 0 // following code is only for checking *Gather* routine + NRformat_loc *Astore, *Astore0; + double* B2d; + NRformat_loc Atmp = dGatherNRformat_loc( + (NRformat_loc *) A.Store, + b, ldb, nrhs, &B2d, + &grid); + Astore = &Atmp; + SuperMatrix Aref; + double *bref, *xtrueref; + if ( grid.zscp.Iam == 0 ) // only in process layer 0 + { + dcreate_matrix_postfix(&Aref, nrhs, &bref, &ldb, + &xtrueref, &ldx, fp0, + suffix, &(grid.grid2d)); + Astore0 = (NRformat_loc *) Aref.Store; + + /* + if ( (grid.grid2d).iam == 0 ) { + printf(" iam %d\n", 0); + checkNRFMT(Astore, Astore0); + } else if ((grid.grid2d).iam == 1 ) { + printf(" iam %d\n", 1); + checkNRFMT(Astore, Astore0); + } + */ + + // bref, xtrueref are created on 2D + matCheck(Astore->m_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); + #endif +#endif + + if (!(berr = doubleMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. 
+ ------------------------------------------------------------ */ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = SLU_DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); + options.Algo3d = YES; + options.ReplaceTinyPivot = YES; + options.IterRefine = NOREFINE; + options.DiagInv = YES; + // options.ParSymbFact = YES; + // options.ColPerm = PARMETIS; + options.Algo3d = YES; + options.DiagInv = YES; + options.ReplaceTinyPivot = YES; +#if 0 + options.ReplaceTinyPivot = YES; + options.RowPerm = NOROWPERM; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (rowperm != -1) options.RowPerm = rowperm; + if (colperm != -1) options.ColPerm = colperm; + if (lookahead != -1) options.num_lookaheads = lookahead; + if (ir != -1) options.IterRefine = ir; + + if ( batchCount > 0 ) + options.batchCount = batchCount; + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + +#ifdef NRFRMT // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; +#else + if ( grid.zscp.Iam == 0 ) // Process layer 0 + { + m = A.nrow; + n = A.ncol; + } + // broadcast m, n to all the process layers; + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm); +#endif + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m, n, &ScalePermstruct); + dLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. 
*/ + pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------ */ + + // dDestroy_LU (n, &(grid.grid2d), &LUstruct); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } + dSolveFinalize (&options, &SOLVEstruct); + dDestroy_LU (n, &(grid.grid2d), &LUstruct); + + dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + if ( batchCount ) { + result_min[0] = stat.utime[FACT]; + result_min[1] = stat.utime[SOLVE]; + result_max[0] = stat.utime[FACT]; + result_max[1] = stat.utime[SOLVE]; + MPI_Allreduce(MPI_IN_PLACE, result_min, 2, MPI_FLOAT,MPI_MIN, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, result_max, 2, MPI_FLOAT,MPI_MAX, MPI_COMM_WORLD); + if (!myrank) { + printf("Batch solves returning data:\n"); + printf(" Factor time over all grids. Min: %8.4f Max: %8.4f\n",result_min[0], result_max[0]); + printf(" Solve time over all grids. 
Min: %8.4f Max: %8.4f\n",result_min[1], result_max[1]); + printf("**************************************************\n"); + fflush(stdout); + } + } + + superlu_gridexit3d (&grid); + if ( iam != -1 )PStatFree (&stat); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pddrive3d_vbatch.c b/EXAMPLE/pddrive3d_vbatch.c new file mode 100755 index 00000000..c0b46c12 --- /dev/null +++ b/EXAMPLE/pddrive3d_vbatch.c @@ -0,0 +1,650 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PDGSSVX3D example + * + * + * -- Distributed SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab, Georgia Institute of Technology, + * Oak Ridge National Lab + * May 12, 2021 + * August 27, 2022 Add batch option + * January 7, 2024 Complete the batch interface + * + */ +#include "superlu_ddefs.h" + +/*! 
\brief + * + *+ */ +int +dequil_vbatch( + superlu_dist_options_t *options, /* options for algorithm choices and algorithm parameters */ + int batchCount, /* number of matrices in the batch */ + int *m, /* matrix row dimension */ + int *n, /* matrix column dimension */ + handle_t *SparseMatrix_handles, /* array of sparse matrix handles, of size 'batchCount', + * each pointing to the actual storage + */ + double **ReqPtr, /* array of pointers to diagonal row scaling vectors, + each of size M */ + double **CeqPtr, /* array of pointers to diagonal column scaling vectors, + each of size N */ + DiagScale_t *DiagScale /* How equilibration is done for each matrix. */ + // DeviceContext context /* device context including queues, events, dependencies */ + ) +{ + int i, j, irow, icol, info = 0; + fact_t Fact = options->Fact; + int factored = (Fact == FACTORED); + int Equil = (!factored && options->Equil == YES); + int notran = (options->Trans == NOTRANS); + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(0, "Enter dequil_batch()"); +#endif + /* Decipher the input matrices */ + SuperMatrix **A = SUPERLU_MALLOC(batchCount * sizeof(SuperMatrix *)); + for (i = 0; i < batchCount; ++i) { + A[i] = (SuperMatrix *) SparseMatrix_handles[i]; + } + + /* Loop through each matrix in the batch */ + for (int k = 0; k < batchCount; ++k) { + + NCformat *Astore = (NCformat *) A[k]->Store; + double *a = (double *) Astore->nzval; + int_t *colptr = Astore->colptr; + int_t *rowind = Astore->rowind; + + /* Assuming each matrix is in CSC format + * Otherwise, convert to CSC first -- use the code in dvpivot_batch.c + * ... + */ + + /* The following arrays are replicated on all processes. */ + double *R = ReqPtr[k]; + double *C = CeqPtr[k]; + + /* Allocate stoage if not factored & ask for equilibration */ + if (Equil && Fact != SamePattern_SameRowPerm) { + /* Allocate storage if not done so before. 
*/ + //switch (ScalePermstruct->DiagScale) + switch ( DiagScale[k] ) { + case NOEQUIL: + if (!(R = (double *)doubleMalloc_dist(m[k]))) ABORT("Malloc fails for R[]."); + if (!(C = (double *)doubleMalloc_dist(n[k]))) ABORT("Malloc fails for C[]."); + ReqPtr[k] = R; + CeqPtr[k] = C; + break; + case ROW: /* R[] was already allocated before */ + if (!(C = (double *)doubleMalloc_dist(n[k]))) ABORT("Malloc fails for C[]."); + CeqPtr[k] = C; + break; + case COL: /* C[] was already allocated before */ + if (!(R = (double *)doubleMalloc_dist(m[k]))) ABORT("Malloc fails for R[]."); + ReqPtr[k] = R; + break; + default: + break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------ */ + if ( Equil ) { + if (Fact == SamePattern_SameRowPerm) { + /* Reuse R and C. */ + switch ( DiagScale[k] ) { + case NOEQUIL: break; + case ROW: + for (j = 0; j < n[k]; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + a[i] *= R[irow]; /* Scale rows. */ + } + } + break; + case COL: + for (j = 0; j < n[k]; ++j) { + double cj = C[j]; + for (i = colptr[j]; i < colptr[j+1]; ++i) { + a[i] *= cj; /* Scale columns. */ + } + } + break; + case BOTH: + for (j = 0; j < n[k]; ++j) { + double cj = C[j]; + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + a[i] *= R[irow] * cj; /* Scale rows and cols. */ + + } + } + break; + } /* end switch DiagScale[k] ... */ + + } else { /* Compute R[] & C[] from scratch */ + + int iinfo; + char equed[1]; + double amax, anorm, colcnd, rowcnd; + + /* Compute the row and column scalings. 
*/ + + dgsequ_dist(A[k], R, C, &rowcnd, &colcnd, &amax, &iinfo); + + if (iinfo > 0) { + if (iinfo <= m[k]) { +#if (PRNTlevel >= 1) + fprintf(stderr, "Matrix %d: the %d-th row of A is exactly zero\n", k, (int)iinfo); +#endif + } else { +#if (PRNTlevel >= 1) + fprintf(stderr, "Matrix %d: the %d-th column of A is exactly zero\n", k, (int)iinfo - n[k]); +#endif + } + } else if (iinfo < 0) { + if ( info==0 ) info = iinfo; + } + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + dlaqgs_dist(A[k], R, C, rowcnd, colcnd, amax, equed); + + if (strncmp(equed, "R", 1) == 0) { + DiagScale[k] = ROW; + } else if (strncmp(equed, "C", 1) == 0) { + DiagScale[k] = COL; + } else if (strncmp(equed, "B", 1) == 0) { + DiagScale[k] = BOTH; + } else { + DiagScale[k] = NOEQUIL; + } +#if (PRNTlevel >= 1) + printf(".. equilibrated? *equed = %c, DiagScale[k] %d\n", *equed, DiagScale[k]); + fflush(stdout); +#endif + } /* end if-else Fact ... */ + + } /* end if Equil ... LAPACK style, not involving MC64 */ + + } /* end for k ... batchCount */ + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(0, "Exit dequil_batch()"); +#endif + return info; +} /* end dequil_batch */ diff --git a/SRC/double/dpivot_vbatch.c b/SRC/double/dpivot_vbatch.c new file mode 100644 index 00000000..1c34b1bb --- /dev/null +++ b/SRC/double/dpivot_vbatch.c @@ -0,0 +1,274 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/* + * -- Distributed SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab + * November 5, 2023 + * Last update: + */ +#include "superlu_ddefs.h" + +/*! 
\brief Compute row pivotings for each matrix, for numerical stability + *+ * Purpose + * ======= + * + * The driver program PDDRIVE3D. + * + * This example illustrates how to use PDGSSVX3D with the full + * (default) options to solve a linear system. + * + * Five basic steps are required: + * 1. Initialize the MPI environment and the SuperLU process grid + * 2. Set up the input matrix and the right-hand side + * 3. Set the options argument + * 4. Call pdgssvx + * 5. Release the process grid and terminate the MPI environment + * + * The program may be run by typing + * mpiexec -np+ */ + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) +{ + for(int j=0; jpddrive3d -r
-c \ + * -d + * NOTE: total number of processes p = r * c * d + * d must be a power-of-two, e.g., 1, 2, 4, ... + * + * nnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *xtrue; + int_t m, n; + int nprow, npcol, npdep; + int equil,colperm, rowperm, ir, lookahead; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level, batchCount = 0; + int* usermap; /* The following variables are used for batch solves */ + float result_min[2]; + result_min[0]=1e10; + result_min[1]=1e10; + float result_max[2]; + result_max[0]=0.0; + result_max[1]=0.0; + MPI_Comm SubComm; + int myrank, p; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + equil = -1; + colperm = -1; + rowperm = -1; + ir = -1; + lookahead = -1; + + int dir = 0; + char postfix[10]; + FILE **fparray; + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + case 'b': batchCount = atoi(*cpp); + break; + case 'e': equil = atoi(*cpp); + break; + case 'p': rowperm = atoi(*cpp); + break; + case 'q': colperm = atoi(*cpp); + break; + case 'i': ir = atoi(*cpp); + break; + case 's': nrhs = atoi(*cpp); + break; + case 'l': lookahead = atoi(*cpp); + break; + case 'f': strcpy(postfix, *cpp); dir = 1; + break; + } + } + else + { /* Last arg is considered a filename or a directory name*/ + if (dir) { + fparray = malloc(sizeof(FILE*) * batchCount); + for (int bc = 0; bc < batchCount; bc++) { + char buf[sizeof(int)*8+1]; + int lenbuf = snprintf(buf, sizeof(int)*8+1, "%d", bc); + + char name[100]; + strcpy(name, *cpp); + strcat(name, buf); + strcat(name, postfix); + + if (!(fparray[bc] = fopen (name, "r"))) + { + ABORT ("File does not exist"); + } + } + break; + } + else { + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. 
+ ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); +#ifdef GPU_ACC + int superlu_acc_offload = get_acc_offload(&options); + if (superlu_acc_offload) { + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + double t1 = SuperLU_timer_(); + gpuFree(0); + double t2 = SuperLU_timer_(); + if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1); + gpublasHandle_t hb; + gpublasCreate(&hb); + if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2); + gpublasDestroy(hb); + } +#endif + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + fflush(stdout); + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = SLU_DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); + options.Algo3d = YES; + options.IterRefine = NOREFINE; + // options.ParSymbFact = YES; + // options.ColPerm = PARMETIS; +#if 0 + options.DiagInv = YES; // only if SLU_HAVE_LAPACK + options.ReplaceTinyPivot = YES; + options.RowPerm = NOROWPERM; + options.ColPerm = NATURAL; + options.ReplaceTinyPivot = YES; +#endif + + if ( batchCount > 0 ) + options.batchCount = batchCount; + + if (equil != -1) options.Equil = equil; + if (rowperm != -1) options.RowPerm = rowperm; + if (colperm != -1) options.ColPerm = colperm; + if (ir != -1) options.IterRefine = ir; + if (lookahead != -1) options.num_lookaheads = lookahead; + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND 
SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + if (dir) { + suffix = &(postfix[1]); + } + else { + for (ii = 0; ii 0 ) { + /* ------------------------------------------------------------ + SOLVE THE BATCH LINEAR SYSTEM. + ------------------------------------------------------------ */ + printf("batchCount %d\n", batchCount); + // dcreate_block_diag_3d(&A, batchCount, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &grid); + + handle_t *F; + double **RHSptr; + int *ldRHS, *md, *nd, *nnzd; + double **ReqPtr; + double **CeqPtr; + DiagScale_t *DiagScale; + int **RpivPtr; + int **CpivPtr; + double **Xptr; + int *ldX; + double **Berrs, **xtrues; + + handle_t *SparseMatrix_handles = SUPERLU_MALLOC( batchCount * sizeof(handle_t) ); + RHSptr = (double **) SUPERLU_MALLOC( batchCount * sizeof(double *) ); + ldRHS = int32Malloc_dist(batchCount); + xtrues = (double **) SUPERLU_MALLOC( batchCount * sizeof(double *) ); + ldX = int32Malloc_dist(batchCount); + + if (dir) { + dcreate_batch_systems_multiple(SparseMatrix_handles, batchCount, nrhs, RHSptr, ldRHS, + xtrues, ldX, fparray, suffix, &grid); + } + else { + dcreate_batch_systems(SparseMatrix_handles, batchCount, nrhs, RHSptr, ldRHS, + xtrues, ldX, fp, suffix, &grid); + } + + md = int32Malloc_dist(batchCount); + nd = int32Malloc_dist(batchCount); + nnzd = int32Malloc_dist(batchCount); + for (int d = 0; d < batchCount; ++d) { + SuperMatrix *Ad = (SuperMatrix *) SparseMatrix_handles[d]; + NCformat *Adstore = Ad->Store; + double *ad = Adstore->nzval; + md[d] = Ad->nrow; + nd[d] = Ad->ncol; + nnzd[d] = Adstore->nnz; + } + + // SuperMatrix *A = (SuperMatrix *) SparseMatrix_handles[0]; + // NCformat *Astore = A->Store; + // double *a = Astore->nzval; + // m = A->nrow; + // n = A->ncol; + + ReqPtr = (double **) SUPERLU_MALLOC( batchCount * sizeof(double *) ); + CeqPtr = (double **) SUPERLU_MALLOC( batchCount * sizeof(double *) ); + RpivPtr = (int **) SUPERLU_MALLOC( batchCount * sizeof(int *) 
); + CpivPtr = (int **) SUPERLU_MALLOC( batchCount * sizeof(int *) ); + DiagScale = (DiagScale_t *) SUPERLU_MALLOC( batchCount * sizeof(DiagScale_t) ); + Xptr = (double **) SUPERLU_MALLOC( batchCount * sizeof(double*) ); + Berrs = (double **) SUPERLU_MALLOC( batchCount * sizeof(double*) ); + for (int d = 0; d < batchCount; ++d) { + DiagScale[d] = NOEQUIL; + RpivPtr[d] = int32Malloc_dist(md[d]); + CpivPtr[d] = int32Malloc_dist(nd[d]); + Xptr[d] = doubleMalloc_dist( nd[d] * nrhs ); + Berrs[d] = doubleMalloc_dist( nrhs ); + } + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call batch solver */ + pdgssvx3d_csc_vbatch(&options, batchCount, + md, nd, nnzd, nrhs, SparseMatrix_handles, + RHSptr, ldRHS, ReqPtr, CeqPtr, RpivPtr, CpivPtr, + DiagScale, F, Xptr, ldX, Berrs, &grid, &stat, &info); + + printf("**** Backward errors ****\n"); + for (int d = 0; d < batchCount; ++d) { + printf("\tSystem %d: Berr = %e\n", d, Berrs[d][0]); + //printf("\t\tDiagScale[%d] %d\n", d, DiagScale[d]); + } + + /* Free matrices pointed to by the handles, and ReqPtr[], etc. 
*/ + for (int d = 0; d < batchCount; ++d) { + if ( DiagScale[d] == ROW || DiagScale[d] == BOTH ) + SUPERLU_FREE(ReqPtr[d]); + if ( DiagScale[d] == COL || DiagScale[d] == BOTH ) + SUPERLU_FREE(CeqPtr[d]); + SUPERLU_FREE(RpivPtr[d]); + SUPERLU_FREE(CpivPtr[d]); + SUPERLU_FREE(Xptr[d]); + SUPERLU_FREE(Berrs[d]); + // A = (SuperMatrix *) SparseMatrix_handles[d]; + // Destroy_CompRowLoc_Matrix_dist (A); + } + SUPERLU_FREE(SparseMatrix_handles); + SUPERLU_FREE(RHSptr); + SUPERLU_FREE(ldRHS); + SUPERLU_FREE(md); + SUPERLU_FREE(nd); + SUPERLU_FREE(nnzd); + SUPERLU_FREE(xtrues); + SUPERLU_FREE(ldX); + SUPERLU_FREE(ReqPtr); + SUPERLU_FREE(CeqPtr); + SUPERLU_FREE(RpivPtr); + SUPERLU_FREE(CpivPtr); + SUPERLU_FREE(DiagScale); + SUPERLU_FREE(Xptr); + SUPERLU_FREE(Berrs); + + goto out; + + } else { + +#define NRFRMT +#ifndef NRFRMT + if ( grid.zscp.Iam == 0 ) // only in process layer 0 + dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &(grid.grid2d)); + +#else + dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb, + &xtrue, &ldx, fp, suffix, &(grid)); + // dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb, + // &xtrue, &ldx, fp, suffix, &(grid)); + } + +#if 0 // following code is only for checking *Gather* routine + NRformat_loc *Astore, *Astore0; + double* B2d; + NRformat_loc Atmp = dGatherNRformat_loc( + (NRformat_loc *) A.Store, + b, ldb, nrhs, &B2d, + &grid); + Astore = &Atmp; + SuperMatrix Aref; + double *bref, *xtrueref; + if ( grid.zscp.Iam == 0 ) // only in process layer 0 + { + dcreate_matrix_postfix(&Aref, nrhs, &bref, &ldb, + &xtrueref, &ldx, fp0, + suffix, &(grid.grid2d)); + Astore0 = (NRformat_loc *) Aref.Store; + + /* + if ( (grid.grid2d).iam == 0 ) { + printf(" iam %d\n", 0); + checkNRFMT(Astore, Astore0); + } else if ((grid.grid2d).iam == 1 ) { + printf(" iam %d\n", 1); + checkNRFMT(Astore, Astore0); + } + */ + + // bref, xtrueref are created on 2D + matCheck(Astore->m_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); + #endif 
+#endif + + if (!(berr = doubleMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. + ------------------------------------------------------------ */ +#ifdef NRFRMT // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; +#else + if ( grid.zscp.Iam == 0 ) // Process layer 0 + { + m = A.nrow; + n = A.ncol; + } + // broadcast m, n to all the process layers; + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm); +#endif + // m = A.nrow; + // n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m, n, &ScalePermstruct); + dLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } + dDestroy_LU (n, &(grid.grid2d), &LUstruct); + dSolveFinalize (&options, &SOLVEstruct); + + dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + + if (dir) { + for (int bc = 0; bc < batchCount; bc++) { + fclose(fparray[bc]); + } + free(fparray); + } + else { + fclose(fp); + } + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: +#if 0 // the following makes sense only for coarse-grain parallel model + if ( batchCount ) { + result_min[0] = stat.utime[FACT]; + result_min[1] = stat.utime[SOLVE]; + result_max[0] = stat.utime[FACT]; + result_max[1] = stat.utime[SOLVE]; + MPI_Allreduce(MPI_IN_PLACE, result_min, 2, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, result_max, 2, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD); + if (!myrank) { + printf("Batch solves returning data:\n"); + printf(" Factor time over all grids. Min: %8.4f Max: %8.4f\n",result_min[0], result_max[0]); + printf(" Solve time over all grids. Min: %8.4f Max: %8.4f\n",result_min[1], result_max[1]); + printf("**************************************************\n"); + fflush(stdout); + } + } +#endif + + superlu_gridexit3d (&grid); + if ( iam != -1 )PStatFree (&stat); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 9483a61c..302479a0 100755 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -81,6 +81,7 @@ set(sources prec-independent/treeFactorization.c prec-independent/sec_structs.c prec-independent/get_perm_c_batch.c + prec-independent/get_perm_c_vbatch.c ) # Add all the BLAS routines: CplusplusFactor/supelu_blas.hpp needs all @@ -164,8 +165,11 @@ if(enable_double) double/dtrfCommWrapper.c double/dsuperlu_blas.c double/pdgssvx3d_csc_batch.c # batch in CSC format + double/pdgssvx3d_csc_vbatch.c double/dequil_batch.c # batch in CSC format + double/dequil_vbatch.c double/dpivot_batch.c + double/dpivot_vbatch.c ) if (TPL_ENABLE_CUDALIB) diff --git a/SRC/double/dequil_vbatch.c b/SRC/double/dequil_vbatch.c new file mode 100644 index 00000000..6fbd9ed1 --- /dev/null +++ b/SRC/double/dequil_vbatch.c @@ -0,0 +1,204 @@ +/*! @file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/* + * -- Distributed SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab + * November 5, 2023 + * Last update: + */ +#include "superlu_ddefs.h" + +/*! 
\brief Equilibrate the systems using the LAPACK-style algorithm + * + * @param[in] options solver options + * @param[in] batchCount number of matrices in the batch + * @param[in] m row dimension of the matrices + * @param[in] n column dimension of the matrices + * @param[in, out] SparseMatrix_handles pointers to the matrices in the batch, each pointing to the actual stoage in CSC format + * On entry, the original matrices + * On exit, each matrix may be overwritten by diag(R)*A*diag(C) + * @param[out] ReqPtr pointers to row scaling vectors (allocated internally) + * @param[out] CeqPtr pointers to column scaling vectors (allocated internally) + * @param[in, out] DiagScale arrays indicating how each system is equilibrated: {ROW, COL, BOTH} + * + * Return value i: + * = 0: successful exit + * > 0: indicates the first matrix in the batch has zero row or column + * if i <= m: the i-th row of A is exactly zero + * if i > m: the (i-m)-th column of A is exactly zero + * + * + * @param[in] options solver options + * @param[in] batchCount number of matrices in the batch + * @param[in] m row dimension of the matrices + * @param[in] n column dimension of the matrices + * @param[in, out] SparseMatrix_handles pointers to the matrices in the batch, each pointing to the actual stoage in CSC format + * On entry, the original matrices, may be overwritten by A1 <- diag(R)*A*diag(C) from dequil_batch() + * On exit, each matrix may be A2 <- Pr*A1 + * @param[in,out] ReqPtr pointers to row scaling vectors, maybe overwritten by scaling from MC64 + * @param[in,out] CeqPtr pointers to column scaling vectors, maybe overwritten by scaling from MC64 + * @param[in,out] DiagScale array indicating how each system is equilibrated: {ROW, COL, BOTH} + * @param[in,out] RpivPtr pointers to row permutation vectors for each matrix, each of size m + * On exit, each RpivPtr[] is applied to each matrix + * + * Return value: + * 0, success + * -1, invalid RowPerm option; an Identity perm_r[] is returned + * 
d, indicates that the d-th matrix is the first one in the batch encountering error + *+ */ +int +dpivot_vbatch( + superlu_dist_options_t *options, /* options for algorithm choices and algorithm parameters */ + int batchCount, /* number of matrices in the batch */ + int *m, /* matrix row dimension */ + int *n, /* matrix column dimension */ + handle_t *SparseMatrix_handles, /* array of sparse matrix handles, + * of size 'batchCount', each pointing to the actual storage + */ + double **ReqPtr, /* array of pointers to diagonal row scaling vectors, + each of size M */ + double **CeqPtr, /* array of pointers to diagonal column scaling vectors, + each of size N */ + DiagScale_t *DiagScale, /* How equilibration is done for each matrix. */ + int **RpivPtr /* array of pointers to row permutation vectors , each of size M */ + // DeviceContext context /* device context including queues, events, dependencies */ + ) +{ + int i, j, irow, iinfo, rowequ, colequ, info = 0; + fact_t Fact = options->Fact; + int factored = (Fact == FACTORED); + int Equil = (!factored && options->Equil == YES); + int notran = (options->Trans == NOTRANS); + int job = 5; + + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(0, "Enter dpivot_batch()"); +#endif + + /* Decipher the input matrices */ + SuperMatrix **A; + A = SUPERLU_MALLOC(batchCount * sizeof(SuperMatrix *)); + for (i = 0; i < batchCount; ++i) { + A[i] = (SuperMatrix *) SparseMatrix_handles[i]; + } + + int_t *colptr; + int_t *rowind; + double *a, *at; + int_t nnz; + + /* Loop through each matrix in the batch */ + for (int d = 0; d < batchCount; ++d) { + + double *R1, *C1; + if (job == 5) { + /* Allocate storage for scaling factors. 
*/ + if (!(R1 = doubleMalloc_dist(m[d]))) + ABORT("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = doubleMalloc_dist(n[d]))) + ABORT("SUPERLU_MALLOC fails for C1[]"); + } + + rowequ = ( DiagScale[d] == ROW || DiagScale[d] == BOTH ); + colequ = ( DiagScale[d] == COL || DiagScale[d] == BOTH ); + + /* If the matrix type is SLU_NR (CSR), then need to convert to CSC first */ + if ( A[d]->Stype == SLU_NR ) { /* CSR format */ + NRformat *Astore = (NRformat *) A[d]->Store; + a = (double *)Astore->nzval; + nnz = Astore->nnz; /* must be set before the conversion call below reads it */ + + dCompRow_to_CompCol_dist(m[d], n[d], nnz, a, + Astore->colind, Astore->rowptr, + &at, &rowind, &colptr); + + a = at; // now a[] points to at[], stored in CSC format. + } else { /* CSC format */ + NCformat *Astore = (NCformat *) A[d]->Store; + a = (double *)Astore->nzval; + colptr = Astore->colptr; + rowind = Astore->rowind; + nnz = Astore->nnz; + } + + /* Row and column scaling factors. */ + double *R = ReqPtr[d]; + double *C = CeqPtr[d]; + + if ( !factored ) { /* Skip this if already factored. */ + + int *perm_r = RpivPtr[d]; + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + if (options->RowPerm != NO) { + + if (Fact != SamePattern_SameRowPerm) { + if (options->RowPerm == MY_PERMR) { /* Use user's perm_r. */ + /* Permute the matrix A for symbfact() */ + for (i = 0; i < colptr[n[d]]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if (options->RowPerm == LargeDiag_MC64) { + /* Finds a row permutation (serial) */ + iinfo = dldperm_dist(job, m[d], nnz, colptr, rowind, a, + perm_r, R1, C1); + + if ( iinfo ) { /* Error */ + printf(".. 
Matrix %d: LDPERM ERROR %d\n", d, iinfo); + if ( info==0 ) info = d+1 ; + } +#if (PRNTlevel >= 2) + dmin = damch_dist("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if (iinfo == 0) { + if (job == 5) { + if (Equil) { + for (i = 0; i < n[d]; ++i) { + R1[i] = exp(R1[i]); + C1[i] = exp(C1[i]); + } + + /* Scale the matrix further. + A <-- diag(R1)*A*diag(C1) */ + double cj; + for (j = 0; j < n[d]; ++j) { + cj = C1[j]; + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + a[i] *= R1[irow] * cj; + + } + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. */ + if (rowequ) + for (i = 0; i < m[d]; ++i) R[i] *= R1[i]; + else + for (i = 0; i < m[d]; ++i) R[i] = R1[i]; + if (colequ) + for (i = 0; i < n[d]; ++i) C[i] *= C1[i]; + else + for (i = 0; i < n[d]; ++i) C[i] = C1[i]; + + DiagScale[d] = BOTH; + rowequ = colequ = 1; + + } /* end if Equil */ + + /* Now permute rows of A to prepare for symbfact() */ + for (j = 0; j < n[d]; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; +#if (PRNTlevel >= 2) + dprod *= fabs(a[i]); +#endif + } + } + } else { /* job = 2,3,4 */ + for (j = 0; j < n[d]; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + +#if (PRNTlevel >= 2) + /* New diagonal */ + if (job == 2 || job == 3) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if (job == 4) + dsum += fabs(a[i]); +#endif + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + + } else { /* if iinfo != 0 ... MC64 returns error */ + for (i = 0; i < m[d]; ++i) perm_r[i] = i; + } +#if (PRNTlevel >= 2) + if (job == 2 || job == 3) { + if (!iam) printf("\tsmallest diagonal %e\n", dmin); + } else if (job == 4) { + if (!iam) printf("\tsum of diagonal %e\n", dsum); + } else if (job == 5) { + if (!iam) printf("\t product of diagonal %e\n", dprod); + } +#endif + } else { + printf(".. 
LDPERM invalid RowPerm option %d\n", options->RowPerm); + info = -1; + for (i = 0; i < m[d]; ++i) perm_r[i] = i; + } /* end if-else options->RowPerm ... */ + +#if (PRNTlevel >= 1) + printf(".. LDPERM job %d\n", (int) job); + fflush(stdout); +#endif + } /* end if Fact not SamePattern_SameRowPerm ... */ + + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + + for (i = 0; i < m[d]; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=1 ) + check_perm_dist("perm_r", m[d], perm_r); + PrintInt32("perm_r", m[d], perm_r); +#endif + } /* end if (!factored) */ + + if ( A[d]->Stype == SLU_NR ) { + SUPERLU_FREE(at); + SUPERLU_FREE(rowind); + SUPERLU_FREE(colptr); + } + + if (job == 5) { + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } + + } /* end for d ... batchCount */ + + /* Deallocate storage */ + SUPERLU_FREE(A); + + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(0, "Exit dpivot_batch()"); +#endif + return info; + +} /* end dpivot_batch */ diff --git a/SRC/double/pdgssvx3d_csc_vbatch.c b/SRC/double/pdgssvx3d_csc_vbatch.c new file mode 100755 index 00000000..b52d6dd6 --- /dev/null +++ b/SRC/double/pdgssvx3d_csc_vbatch.c @@ -0,0 +1,506 @@ +/*! @file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. 
+*/ + + + +/* + * -- Distributed SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab + * January 13, 2024 + * Last update: + */ +#include "superlu_ddefs.h" +#include "superlu_defs.h" +#include "superlu_upacked.h" +#include <math.h> + +int file_dPrint_CompRowLoc_to_Triples(SuperMatrix *A) +{ + NRformat_loc *Astore = A->Store; + int nnz, m, n, i, j; + double *dp; + FILE *fp = fopen("CSR.txt", "w"); + + m = A->nrow; + n = A->ncol; + nnz = Astore->nnz_loc; + dp = Astore->nzval; + + printf("print to triples: m %d, n %d, nnz %d\n", m, n, nnz); + for (i = 0; i < m; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + fprintf(fp, "%8d %8d %16.8e\n", i, (int) Astore->colind[j], dp[j]); + } + } + fclose(fp); + return 0; +} + +/*! \brief Solve a batch of linear systems Ai * Xi = Bi with direct method, + * computing the LU factorization of each matrix Ai;
+ * This is the fixed-size interface: all the input matrices have the same sparsity structure + * + *+ * @param[in] options solver options + * @param[in] batchCount number of matrices in the batch + * @param[in] m row dimension of the matrices + * @param[in] n column dimension of the matrices + * @param[in] nnz number of non-zero entries in each matrix + * @param[in] nrhs number of right-hand-sides + * @param[in,out] SparseMatrix_handles array of sparse matrix handles, of size 'batchCount', each pointing to the actual storage in CSC format, see 'NCformat' in SuperMatix structure + * Each A is overwritten by row/col scaling R*A*C + * @param[in,out] RHSptr array of pointers to dense storage of right-hand sides B + * Each B is overwritten by row/col scaling R*B*C + * @param[in] ldRHS array of leading dimensions of RHS + * @param[in,out] ReqPtr array of pointers to diagonal row scaling vectors R, each of size m + * ReqPtr[] are allocated internally if equilibration is asked for + * @param[in,out] CeqPtr array of pointers to diagonal colum scaling vectors C, each of size n + * CeqPtr[] are allocated internally if equilibration is asked for + * @param[in,out] RpivPtr array of pointers to row permutation vectors, each of size m + * @param[in,out] CpivPtr array of pointers to column permutation vectors, each of size n + * @param[in,out] DiagScale array of indicators how equilibration is done for each matrix + * @param[out] F array of handles pointing to the factored matrices + * @param[out] Xptr array of pointers to dense storage of solution + * @param[in] ldX array of leading dimensions of X + * @param[out] Berrs array of poiniters to backward errors + * @param[in]] grid3d contains MPI communicator + * @param[out] stat records algorithms statistics such as runtime, memory usage, etc. 
+ * @param[out] info flags the errors on return + * + *+ */ +int +pdgssvx3d_csc_vbatch( + superlu_dist_options_t *options, /* options for algorithm choices and algorithm parameters */ + int batchCount, /* number of matrices in the batch */ + int *m, /* array of matrix row dimensions, size batchCount */ + int *n, /* array of matrix column dimension, size batchCount */ + int *nnz, /* array of number of non-zero entries, size batchCount */ + int nrhs, /* number of right-hand-sides */ + handle_t *SparseMatrix_handles, /* array of sparse matrix handles, + * of size 'batchCount', + * each pointing to the actual storage + */ + double **RHSptr, // array of pointers to dense RHS storage + int *ldRHS, // array of leading dimensions of RHS + double **ReqPtr, /* array of pointers to diagonal row scaling vectors, + each of size M */ + double **CeqPtr, /* array of pointers to diagonal column scaling vectors, + each of size N */ + int **RpivPtr, /* array of pointers to row permutation vectors , each of size M */ + int **CpivPtr, /* array of pointers to column permutation vectors , each of size N */ + DiagScale_t *DiagScale, /* indicate how equilibration is done for each matrix */ + handle_t *F, /* array of handles pointing to the factored matrices */ + double **Xptr, // array of pointers to dense solution storage + int *ldX, // array of leading dimensions of X + double **Berrs, /* array of poiniters to backward errors */ + gridinfo3d_t *grid3d, + SuperLUStat_t *stat, + int *info + //DeviceContext context /* device context including queues, events, dependencies */ + ) +{ + /* Steps in this routine + 1. Loop through all matrices A_i, perform preprocessing (all in serial format) + 1.1 equilibration + 1.2 numerical pivoting (e.g., MC64) + 1.3 sparsity reordering + + 2. Copy the matrices into block diagonal form: A_big + (can be merged into Step 3, then no need to have a big copy) + + 3. 
Factorize A_big -> LU_big + 3.1 symbolic factoization (not batched, can loop through each individual one serial) + 3.2 numerical factorization + + 4. Split LU_big into individual LU factors, store them in the handle array to return + (this may be difficult, and the users may not need them.) + + 5. Solve (2 designs) + 5.1 using LU_big -- requires RHS to be in contiguous memory + compute level set. Leverage B-to-X with an internal copy. + (OR) 5.2 loop through individual LU -- may lose some data-parallel opportunity + */ + + /* Test the options choices. */ + *info = 0; + SuperMatrix *A0 = (SuperMatrix *) SparseMatrix_handles[0]; + fact_t Fact = options->Fact; + + if (Fact < 0 || Fact > FACTORED) + *info = -1; + else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) + *info = -1; + else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) + *info = -1; + else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) + *info = -1; + else if (options->IterRefine == SLU_EXTRA) + { + *info = -1; + fprintf(stderr, + "Extra precise iterative refinement yet to support."); + } + else if (batchCount < 0) *info = -2; + /* Need to check M, N, NNZ */ + else if (A0->nrow != A0->ncol || A0->nrow < 0 || A0->Stype != SLU_NC || A0->Dtype != SLU_D || A0->Mtype != SLU_GE) + *info = -7; + else if (nrhs < 0) + { + *info = -6; + } + if (*info) { + pxerr_dist("pdgssvx3d_csc_vbatch", &(grid3d->grid2d), -(*info)); + return -1; + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid3d->iam, "Enter pdgssvx3d_csc_batch()"); +#endif + + int colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, k, irow; + double *C, *R; //*C1, *R1, amax, anorm, colcnd, rowcnd; + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; + int d; /* index into each matrix in the batch */ + + double t = SuperLU_timer_(); + + /**** equilibration (LAPACK style) ****/ + /* 
ReqPtr[] and CeqPtr[] are allocated internally */ + /* Each A may be overwritten by R*A*C */ + dequil_vbatch(options, batchCount, m, n, SparseMatrix_handles, + ReqPtr, CeqPtr, DiagScale); + + stat->utime[EQUIL] = SuperLU_timer_() - t; + t = SuperLU_timer_(); + + /**** numerical pivoting (e.g., MC64) ****/ + /* If MC64(job=5 is invoked, further equilibration is done, + * DiagScale[] will be BOTH, and each A is modified, + * perm_r[]'s are applied to each matrix. + */ + /* no internal malloc */ + dpivot_vbatch(options, batchCount, m, n, SparseMatrix_handles, + ReqPtr, CeqPtr, DiagScale, RpivPtr); + + stat->utime[ROWPERM] = SuperLU_timer_() - t; + +#if 0 + for (d = 0; d < batchCount; ++d) { + printf("DiagScale[%d] %d\n", d, DiagScale[d]); + if ( DiagScale[d] ) { + Printdouble5("ReqPtr[d]", m, ReqPtr[d]); + Printdouble5("CeqPtr[d]", m, CeqPtr[d]); + } + PrintInt32("RpivPtr[d]", m, RpivPtr[d]); + } +#endif + + /**** sparsity reordering ****/ + /* col perms are computed for each matrix; may be different due to different row perm. + * A may be overwritten as Pr*R*A*C from previous steps, but is not modified in this routine. + */ + t = SuperLU_timer_(); + + get_perm_c_vbatch(options, batchCount, SparseMatrix_handles, CpivPtr); + + stat->utime[COLPERM] = SuperLU_timer_() - t; +#if 0 + for (d = 0; d < batchCount; ++d) { + PrintInt32("CpivPtr[d]", m, CpivPtr[d]); + } +#endif + +#if (PRNTlevel >= 1) + printf("<---- END PREPROCESSING ----\n"); +#endif + + /*--------------------- + **** Stack the matrices into block diagonal form: A_big, and RHS B_big + ----------------------*/ + + /* Count total dimension and number of nonzeros. 
*/ + SuperMatrix *A; + int m_big = 0, n_big = 0, nnz_big = 0; + for (d = 0; d < batchCount; ++d) { + m_big += m[d]; + n_big += n[d]; + A = (SuperMatrix *) SparseMatrix_handles[d]; + NCformat *Astore = (NCformat *) A->Store; + nnz_big += Astore->nnz; + } + + /* Allocate storage in CSR containing all matrices in the batch */ + // TO-DELETE: dallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + double *a_big = (double *) doubleMalloc_dist(nnz_big); + int_t *colind = (int_t *) intMalloc_dist(nnz_big); + int_t *rowptr = (int_t *) intMalloc_dist(n_big + 1); + double *nzval_d; /* each diagonal block */ + int_t *colind_d; + int_t *rowptr_d; + int_t nnz_d, col, row, offset_m; + int *perm_c, *perm_r; + + /* B_big */ + double *b; + if ( !(b = doubleMalloc_dist(m_big * nrhs)) ) ABORT("Malloc fails for b[:,nrhs]"); + + j = 0; /* running sum of total nnz */ + row = 0; + col = 0; + double alpha = -1.0, beta = 1.0; + offset_m = 0; + + for (d = 0; d < batchCount; ++d) { + + A = (SuperMatrix *) SparseMatrix_handles[d]; + NCformat *Astore = (NCformat *) A->Store; + nnz_d = Astore->nnz; + perm_r = RpivPtr[d]; + perm_c = CpivPtr[d]; + + /* Apply perm_c[] to row of A to preserve diagonal: A <= Pc*A */ + for (i = 0; i < nnz_d; ++i) + Astore->rowind[i] = perm_c[Astore->rowind[i]]; + + /* Convert to CSR format. */ + dCompCol_to_CompRow_dist(m[d], n[d], Astore->nnz, Astore->nzval, Astore->colptr, + Astore->rowind, &nzval_d, &rowptr_d, &colind_d); + + //PrintInt32("rowptr_d", m+1, rowptr_d); + + /* Copy this CSR matrix to a diagonal block of A_big. + Apply each perm_c[] to each matrix by column. 
+ Now, diagonal block is permuted by Pc*A*Pc' + */ + + /* Apply perm_c[] to columns of A (out-of-place) */ + for (i = 0; i < m[d]; ++i) { + rowptr[row++] = j; + //irow = iperm_c[i]; // old irow + //for (k = rowptr_d[irow]; k < rowptr_d[irow+1]; ++k) { + for (k = rowptr_d[i]; k < rowptr_d[i+1]; ++k) { + colind[j] = perm_c[colind_d[k]] + col; // add the *col* shift + a_big[j] = nzval_d[k]; + ++j; + } + } + + /* move to next block */ + col += n[d]; + + SUPERLU_FREE(nzval_d); /* TODO: remove repeated malloc/free */ + SUPERLU_FREE(colind_d); + SUPERLU_FREE(rowptr_d); + + /* Transform the right-hand side: RHS overwritten by B <= R*B */ + double *rhs; + + // NEED TO SAVE A COPY OF RHS ?? + + rowequ = ( DiagScale[d] == ROW || DiagScale[d] == BOTH ); + //printf(" before transform RHS: rowequ %d\n", rowequ); + if ( rowequ ) { /* Scale RHS by R[] */ + R = ReqPtr[d]; + rhs = RHSptr[d]; // first RHS + for (k = 0; k < nrhs; ++k) { + for (i = 0; i < m[d]; ++i) rhs[i] *= R[i]; + rhs += ldRHS[d]; /* move to next RHS */ + } + } + +#if ( DEBUGlevel>=1 ) + printf("System %d, next row %d, next col %d, next j %d\n", d, row, col, j); + //Printdouble5("big-RHS", m, RHSptr[d]); +#endif + + rhs = RHSptr[d]; // first RHS + for (k = 0; k < nrhs; ++k) { + for (i = 0; i < m[d]; ++i) /* permute RHS by Pc*Pr (out-of-place) */ + b[k * m_big + offset_m + perm_c[perm_r[i]]] = rhs[i]; + rhs += ldRHS[d]; /* move to next RHS */ + } + offset_m += m[d]; + + //Printdouble5("big-RHS-permuted", m, &b[(k-1) * m_big + d * m]); + + } /* end for d ... 
batchCount */ + + // assert(j == nnz_big); + // assert(row == m_big); + rowptr[row] = nnz_big; /* +1 as an end marker */ + + /**** By now: each A transformed to Pc*Pr*R*A*C + **** each B transformed to R*B + **** Need to solve (Pc*Pr*R*A*C*Pc')*(Pc*C^{-1}*X) = (Pc*Pr*R)*B + ****/ + + /* Set up A_big in NR_loc format */ + SuperMatrix A_big; + dCreate_CompRowLoc_Matrix_dist(&A_big, m_big, n_big, nnz_big, m_big, 0, + a_big, colind, rowptr, SLU_NR_loc, SLU_D, SLU_GE); + + //file_dPrint_CompRowLoc_to_Triples(&A_big); + + /* Modify the input options. + * Turn off preprocessing options for the big system. + */ + superlu_dist_options_t options_big; + set_default_options_dist(&options_big); + options_big.Equil = NO; + options_big.ColPerm = NATURAL; + options_big.RowPerm = NOROWPERM; + options_big.ParSymbFact = NO; + options_big.batchCount = batchCount; + + /* Copy the other options */ + options_big.Fact = options->Fact; + options_big.ReplaceTinyPivot = options->ReplaceTinyPivot; + options_big.IterRefine = options->IterRefine; + options_big.Trans = options->Trans; + options_big.SolveInitialized = options->SolveInitialized; + options_big.RefineInitialized = options->RefineInitialized; + options_big.PrintStat = options->PrintStat; + + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + MPI_Comm comm = grid3d->comm; + + /* Need to create a grid of size 1 */ + int nprow = 1, npcol = 1, npdep = 1; + superlu_gridinit3d (comm, nprow, npcol, npdep, &grid); + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m_big, n_big, &ScalePermstruct); + dLUstructInit (n_big, &LUstruct); + + //printf("\tbefore pdgssvx3d: m_big %d, n_big %d, nrhs %d\n", m_big, n_big, nrhs); + //dPrint_CompRowLoc_Matrix_dist(&A_big); + + if (!(berr = doubleCalloc_dist (nrhs))) ABORT ("Malloc fails for berr[]."); + + /*--------------------- + **** Call the linear equation solver + ----------------------*/ + + /*!!!! 
CHECK SETTING: TO BE SURE TO USE GPU VERSIONS !!!! + gpu3dVersion + superlu_acc_offload + */ + /* perm_c_big may not be Identity due to etree postordering, however, + * since b[] is transormed back to the solution of the original BIG system, + * we do not need to consider perm_c_big outside pdgssvx3d(). + */ + pdgssvx3d (&options_big, &A_big, &ScalePermstruct, b, m_big, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, stat, info); + +#if (PRNTlevel >= 1) + printf("\tBIG system: berr[0] %e\n", berr[0]); + printf("after pdgssvx3d: DiagScale %d\n", ScalePermstruct.DiagScale); + //PrintInt32("after pdgssvx3d: ScalePermstruct.perm_c", m_big, ScalePermstruct.perm_c); + //Printdouble5("big-B-solution", m_big, b); +#endif + + if ( *info ) { /* Something is wrong */ + if ( grid3d->iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", *info); + fflush(stdout); + } + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------ */ + + dDestroy_LU (n_big, &(grid.grid2d), &LUstruct); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (options, stat, &(grid3d->grid2d)); /* Print 2D statistics.*/ + } + + dSolveFinalize (&options_big, &SOLVEstruct); + + Destroy_CompRowLoc_Matrix_dist (&A_big); + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + + /* Copy the big solution into individual ones, and compute B'errs */ + double bn, rn; // inf-norm of B and R + double *x; + offset_m = 0; + for (d = 0; d < batchCount; ++d) { + + A = (SuperMatrix *) SparseMatrix_handles[d]; + perm_c = CpivPtr[d]; + perm_r = RpivPtr[d]; + + /* Permute the solution matrix z <= Pc'*y */ + //PrintInt32("prepare Pc'*y: perm_c", n, perm_c); + x = Xptr[d]; + for (k = 0; k < nrhs; ++k) { + for (i = 0; i < n[d]; ++i) + x[i] = b[k* m_big + offset_m + perm_c[i]]; + x += ldX[d]; /* move to next x */ + } + + //Printdouble5("Permuted-solution after iperm_c", n, Xptr[d]); + + /* 
Compute residual: Pc*Pr*(R*b) - (Pc*Pr*R*A*C)*z + * Now x = Pc'*y, where y is computed from pdgssvx3d() + */ + x = Xptr[d]; + for (k = 0; k < nrhs; ++k) { // Sherry: can call sp_dgemm_dist() !!!! + bn = 0.; // norm of B + rn = 0.; // norm of R + for (i = 0; i < m[d]; ++i) { + bn = SUPERLU_MAX( bn, fabs(RHSptr[d][k*m[d] + i]) ); + + /* permute RHS by Pc*Pr, use b[] as temporary storage */ + b[k*m_big + offset_m + perm_c[perm_r[i]]] = RHSptr[d][k*ldRHS[d] + i]; + } + + sp_dgemv_dist("N", alpha, A, x, 1, beta, &b[k*m_big + offset_m], 1); + + for (i = 0; i < m[d]; ++i) rn = SUPERLU_MAX( rn, fabs(b[k*m_big + offset_m + i]) ); + Berrs[d][k] = rn / bn; + x += ldX[d]; /* move to next x */ + } /* end for k ... */ + offset_m += m[d]; + + /* Transform the solution matrix X to the solution of the + * original system before equilibration: x <= C*z + */ + colequ = ( DiagScale[d] == COL || DiagScale[d] == BOTH ); + if ( colequ ) { + C = CeqPtr[d]; + x = Xptr[d]; + for (k = 0; k < nrhs; ++k) { + for (i = 0; i < n[d]; ++i) x[i] *= C[i]; + x += ldX[d]; /* move to next x */ + } + } + + } /* end for d ... 
batchCount */ + + SUPERLU_FREE (b); + SUPERLU_FREE (berr); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid3d->iam, "Exit pdgssvx3d_csc_batch()"); +#endif + + return 0; +} /* end pdgssvx3d_csc_vbatch */ diff --git a/SRC/include/superlu_ddefs.h b/SRC/include/superlu_ddefs.h index 30ff43c6..97c784ec 100755 --- a/SRC/include/superlu_ddefs.h +++ b/SRC/include/superlu_ddefs.h @@ -1104,6 +1104,9 @@ extern int dcreate_block_diag_3d(SuperMatrix *A, int batchCount, int nrhs, doubl extern int dcreate_batch_systems(handle_t *SparseMatrix_handles, int batchCount, int nrhs, double **rhs, int *ldb, double **x, int *ldx, FILE *fp, char * postfix, gridinfo3d_t *grid3d); +extern int dcreate_batch_systems_multiple(handle_t *SparseMatrix_handles, int batchCount, + int nrhs, double **RHSptr, int *ldRHS, double **xtrue, int *ldX, + FILE **fp, char * postfix, gridinfo3d_t *grid3d); /* Matrix distributed in NRformat_loc in 3D process grid. It converts it to a NRformat_loc distributed in 2D grid in grid-0 */ @@ -1597,16 +1600,35 @@ extern int pdgssvx3d_csc_batch( gridinfo3d_t *grid3d, SuperLUStat_t *stat, int *info //DeviceContext context /* device context including queues, events, dependencies */ ); +extern int pdgssvx3d_csc_vbatch( + superlu_dist_options_t *, int batchCount, int *m, int *n, int *nnz, + int nrhs, handle_t *, double **RHSptr, int *ldRHS, + double **ReqPtr, double **CeqPtr, + int **RpivPtr, int **CpivPtr, DiagScale_t *DiagScale, + handle_t *F, double **Xptr, int *ldX, double **Berrs, + gridinfo3d_t *grid3d, SuperLUStat_t *stat, int *info + //DeviceContext context /* device context including queues, events, dependencies */ + ); extern int dequil_batch( superlu_dist_options_t *, int batchCount, int m, int n, handle_t *, double **ReqPtr, double **CeqPtr, DiagScale_t * // DeviceContext context /* device context including queues, events, dependencies */ ); +extern int dequil_vbatch( + superlu_dist_options_t *, int batchCount, int *m, int *n, handle_t *, + double **ReqPtr, double 
**CeqPtr, DiagScale_t * + // DeviceContext context /* device context including queues, events, dependencies */ + ); extern int dpivot_batch( superlu_dist_options_t *, int batchCount, int m, int n, handle_t *, double **ReqPtr, double **CeqPtr, DiagScale_t *, int **RpivPtr // DeviceContext context /* device context including queues, events, dependencies */ ); +extern int dpivot_vbatch( + superlu_dist_options_t *, int batchCount, int *m, int *n, handle_t *, + double **ReqPtr, double **CeqPtr, DiagScale_t *, int **RpivPtr + // DeviceContext context /* device context including queues, events, dependencies */ + ); extern int dwriteLUtoDisk(int nsupers, int_t *xsup, dLUstruct_t *LUstruct); extern int dcheckArr(double *A, double *B, int n); diff --git a/SRC/include/superlu_defs.h b/SRC/include/superlu_defs.h index b3ce43fb..9c0e09f2 100755 --- a/SRC/include/superlu_defs.h +++ b/SRC/include/superlu_defs.h @@ -1060,6 +1060,8 @@ extern int sp_coletree_dist (int_t *, int_t *, int_t *, int_t, int_t, int_t * extern void get_perm_c_dist(int, int, SuperMatrix *, int *); extern void get_perm_c_batch(superlu_dist_options_t *options, int batchCount, handle_t *SparseMatrix_handles, int **CpivPtr); +extern void get_perm_c_vbatch(superlu_dist_options_t *options, int batchCount, + handle_t *SparseMatrix_handles, int **CpivPtr); extern void at_plus_a_dist(const int_t, const int_t, int_t *, int_t *, int_t *, int_t **, int_t **); extern void getata_dist(const int_t m, const int_t n, const int_t nz, int_t *colptr, int_t *rowind, diff --git a/SRC/include/superlu_dist_config.h b/SRC/include/superlu_dist_config.h new file mode 100644 index 00000000..32f820fa --- /dev/null +++ b/SRC/include/superlu_dist_config.h @@ -0,0 +1,32 @@ +/* superlu_dist_config.h.in */ + +/* Enable CUDA */ +#define HAVE_CUDA TRUE + +/* Enable NVSHMEM */ +#define HAVE_NVSHMEM TRUE + +/* Enable HIP */ +/* #undef HAVE_HIP */ + +/* Enable parmetis */ +#define HAVE_PARMETIS TRUE + +/* Enable colamd */ +/* #undef HAVE_COLAMD 
*/ + +/* Enable LAPACK */ +#define SLU_HAVE_LAPACK TRUE + +/* Enable CombBLAS */ +/* #undef HAVE_COMBBLAS */ + +/* Enable MAGMA */ +#define HAVE_MAGMA TRUE + +/* enable 64bit index mode */ +/* #undef XSDK_INDEX_SIZE */ + +#if defined(XSDK_INDEX_SIZE) && (XSDK_INDEX_SIZE == 64) +#define _LONGINT 1 +#endif diff --git a/SRC/prec-independent/get_perm_c_vbatch.c b/SRC/prec-independent/get_perm_c_vbatch.c new file mode 100755 index 00000000..384c5325 --- /dev/null +++ b/SRC/prec-independent/get_perm_c_vbatch.c @@ -0,0 +1,201 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/* + * -- Distributed SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab + * November 5, 2023 + * Last update: + */ + +#include "superlu_dist_config.h" +#include "superlu_ddefs.h" + +#ifdef HAVE_COLAMD +#include "colamd.h" +#endif + +/*! 
\brief Gets sparsity permutations for a batch of matrices + *+ * + * @param[in] options solver options + * @param[in] batchCount number of matrices in the batch + * @param[in] SparseMatrix_handles pointers to the matrices in the batch, each pointing to the actual storage in CSC format + * On entry, the original matrices, may be overwritten by A1 <- Pr*diag(R)*A*diag(C) from dequil_batch() and dpivot_batch() + * @param[out] CpivPtr pointers to column permutation vectors for each matrix, each of size n + * + *+ */ +void +get_perm_c_vbatch( + superlu_dist_options_t *options, /* options for algorithm choices and algorithm parameters */ + int batchCount, /* number of matrices in the batch */ + handle_t *SparseMatrix_handles, /* array of sparse matrix handles, + * of size 'batchCount', + * each pointing to the actual storage + */ + int **CpivPtr /* array of pointers to column permutation vectors , each of size N */ + ) +{ +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Enter get_perm_c_batch()"); +#endif + + /* Decipher the input matrices */ + SuperMatrix **A; + A = SUPERLU_MALLOC(batchCount * sizeof(SuperMatrix *)); + for (int d = 0; d < batchCount; ++d) { + A[d] = (SuperMatrix *) SparseMatrix_handles[d]; + } + + int_t delta, maxint; + int_t *dhead, *qsize, *llist, *marker; + int *invp; + int m, n; + // int m = A[0]->nrow; + // int n = A[0]->ncol; + + if ( options->ColPerm == MMD_AT_PLUS_A || options->ColPerm == MMD_ATA ) { + + /* These arrays can be reused by multiple matrices */ + + delta = 0; /* DELTA is a parameter to allow the choice of nodes + whose degree <= min-degree + DELTA. 
*/ + maxint = 2147483647; /* 2**31 - 1 */ + invp = (int *) int32Malloc_dist(n+delta); + if ( !invp ) ABORT("SUPERLU_MALLOC fails for invp."); + dhead = (int_t *) SUPERLU_MALLOC((n+delta)*sizeof(int_t)); + if ( !dhead ) ABORT("SUPERLU_MALLOC fails for dhead."); + qsize = (int_t *) SUPERLU_MALLOC((n+delta)*sizeof(int_t)); + if ( !qsize ) ABORT("SUPERLU_MALLOC fails for qsize."); + llist = (int_t *) SUPERLU_MALLOC(n*sizeof(int_t)); + if ( !llist ) ABORT("SUPERLU_MALLOC fails for llist."); + marker = (int_t *) SUPERLU_MALLOC(n*sizeof(int_t)); + if ( !marker ) ABORT("SUPERLU_MALLOC fails for marker."); + } + + int_t bnz; + int_t *b_colptr, *b_rowind; /* allocated in at_plus_a() or getata() */ + int_t nofsub; + int i, j; + double t; + + /* Loop through each matrix in the batch */ + for (int d = 0; d < batchCount; ++d) { + + m = A[d]->nrow; + n = A[d]->ncol; + + NCformat *Astore = (NCformat *) A[d]->Store; + int *perm_c = CpivPtr[d]; + + t = SuperLU_timer_(); + bnz = 0; + + switch ( options->ColPerm ) { + + case NATURAL: /* Natural ordering */ + for (i = 0; i < n; ++i) perm_c[i] = i; + break; + + case MMD_AT_PLUS_A: /* Minimum degree ordering on A'+A */ + if ( m != n ) ABORT("Matrix is not square"); + at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); + t = SuperLU_timer_() - t; + /*printf("Form A'+A time = %8.3f\n", t);*/ + + break; + + case MMD_ATA: /* Minimum degree ordering on A'*A */ + getata_dist(m, n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); + t = SuperLU_timer_() - t; + /*printf("Form A'*A time = %8.3f\n", t);*/ + + break; + + case (COLAMD): /* Approximate minimum degree column ordering. 
*/ + get_colamd_dist(m, n, Astore->nnz, Astore->colptr, Astore->rowind, + perm_c); + break; +#ifdef HAVE_PARMETIS + case METIS_AT_PLUS_A: /* METIS ordering on A'+A */ + if ( m != n ) ABORT("Matrix is not square"); + at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); + + if ( bnz ) { /* non-empty adjacency structure */ + get_metis_dist(n, bnz, b_colptr, b_rowind, perm_c); + } else { /* e.g., diagonal matrix */ + for (i = 0; i < n; ++i) perm_c[i] = i; + SUPERLU_FREE(b_colptr); + /* b_rowind is not allocated in this case */ + } + break; +#endif + default: + ABORT("Invalid options->Colperm"); + + } /* end switch (options->Colperm) */ + + if ( options->ColPerm == MMD_AT_PLUS_A || options->ColPerm == MMD_ATA ) { + if ( bnz ) { + t = SuperLU_timer_(); + + /* Initialize and allocate storage for GENMMD. */ + delta = 0; /* DELTA is a parameter to allow the choice of nodes + whose degree <= min-degree + DELTA. */ + /* Transform adjacency list into 1-based indexing required by GENMMD.*/ + for (i = 0; i <= n; ++i) ++b_colptr[i]; + for (i = 0; i < bnz; ++i) ++b_rowind[i]; + + int_t ln = n; + genmmd_dist_(&ln, b_colptr, b_rowind, perm_c, invp, &delta, dhead, + qsize, llist, marker, &maxint, &nofsub); + + /* Transform perm_c into 0-based indexing. */ + for (i = 0; i < n; ++i) --perm_c[i]; + + t = SuperLU_timer_() - t; + /* printf("call GENMMD time = %8.3f\n", t);*/ + + SUPERLU_FREE(b_colptr); /* TODO: repeated malloc/free for the batch */ + SUPERLU_FREE(b_rowind); + //printf("\tafter free b_rowind bnz %lld\n", (long long)bnz); + + } else { /* Empty adjacency structure */ + for (i = 0; i < n; ++i) perm_c[i] = i; + } + } /* end if MMD */ + +#if ( DEBUGlevel>=1 ) + check_perm_dist("perm_c", n, perm_c); +#endif + } /* end for d = ... 
batchCount */ + + if ( options->ColPerm == MMD_AT_PLUS_A || options->ColPerm == MMD_ATA ) { + SUPERLU_FREE(invp); + SUPERLU_FREE(dhead); + SUPERLU_FREE(qsize); + SUPERLU_FREE(llist); + SUPERLU_FREE(marker); + } + + SUPERLU_FREE(A); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Exit get_perm_c_batch()"); +#endif + + return; + +} /* end get_perm_c_batch */ diff --git a/example_scripts/batch_script_mpi_runit_perlmutter_3dsolve_gcc_nvshmem_vbatch.sh b/example_scripts/batch_script_mpi_runit_perlmutter_3dsolve_gcc_nvshmem_vbatch.sh new file mode 100755 index 00000000..4553a706 --- /dev/null +++ b/example_scripts/batch_script_mpi_runit_perlmutter_3dsolve_gcc_nvshmem_vbatch.sh @@ -0,0 +1,299 @@ +#!/bin/bash +# +#modules: +# module load cpe/23.03 +# module load PrgEnv-gnu +# module load gcc/11.2.0 +module load cmake +# module load cudatoolkit/11.7 +# avoid bug in cray-libsci/21.08.1.2 +# module load cray-libsci/22.11.1.2 +# module load cray-libsci/23.02.1.1 +ulimit -s unlimited +#MPI settings: +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=nvidia80 +echo MPICH_GPU_SUPPORT_ENABLED=$MPICH_GPU_SUPPORT_ENABLED +export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:$LD_LIBRARY_PATH +#SUPERLU settings: + + +export SUPERLU_LBS=GD +export SUPERLU_ACC_OFFLOAD=1 # this can be 0 to do CPU tests on GPU nodes +export GPU3DVERSION=1 +export ANC25D=0 +export NEW3DSOLVE=1 +export NEW3DSOLVETREECOMM=1 +export SUPERLU_BIND_MPI_GPU=1 # assign GPU based on the MPI rank, assuming one MPI per GPU +export SUPERLU_ACC_SOLVE=1 + +export SUPERLU_MAXSUP=256 # max supernode size +export SUPERLU_RELAX=64 # upper bound for relaxed supernode size +export SUPERLU_MAX_BUFFER_SIZE=10000000 ## 500000000 # buffer size in words on GPU +export SUPERLU_NUM_LOOKAHEADS=2 ##4, must be at least 2, see 'lookahead winSize' +export SUPERLU_NUM_GPU_STREAMS=1 +export SUPERLU_N_GEMM=6000 # FLOPS threshold divide workload between CPU and GPU +nmpipergpu=1 +export SUPERLU_MPI_PROCESS_PER_GPU=$nmpipergpu # 2: this can 
better saturate GPU + +##NVSHMEM settings: +NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/ +export NVSHMEM_USE_GDRCOPY=1 +export NVSHMEM_MPI_SUPPORT=1 +export MPI_HOME=${MPICH_DIR} +export NVSHMEM_LIBFABRIC_SUPPORT=1 +export LIBFABRIC_HOME=/opt/cray/libfabric/1.15.2.0 +export LD_LIBRARY_PATH=$NVSHMEM_HOME/lib:$LD_LIBRARY_PATH +export NVSHMEM_DISABLE_CUDA_VMM=1 +export FI_CXI_OPTIMIZED_MRS=false +export NVSHMEM_BOOTSTRAP_TWO_STAGE=1 +export NVSHMEM_BOOTSTRAP=MPI +export NVSHMEM_REMOTE_TRANSPORT=libfabric + +#export NVSHMEM_DEBUG=TRACE +#export NVSHMEM_DEBUG_SUBSYS=ALL +#export NVSHMEM_DEBUG_FILE=nvdebug_success + +if [[ $NERSC_HOST == edison ]]; then + CORES_PER_NODE=24 + THREADS_PER_NODE=48 +elif [[ $NERSC_HOST == cori ]]; then + CORES_PER_NODE=32 + THREADS_PER_NODE=64 + # This does not take hyperthreading into account +elif [[ $NERSC_HOST == perlmutter ]]; then + CORES_PER_NODE=64 + THREADS_PER_NODE=128 + GPUS_PER_NODE=4 +else + # Host unknown; exiting + exit $EXIT_HOST +fi + +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 ../EXAMPLE/g20.rua +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 $CFS/m2957/tianyi/matrix/Landau120mat.dat +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 -f .rua $CFS/m2957/tianyi/matrix/repg20/g20 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 64 -f .dat $CFS/m2957/tianyi/matrix/Landau/Landau +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 512 -f .dat $CFS/m2957/tianyi/matrix/Landau_small/Landau +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/Landau_small/Landau +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -e 0 -p 0 -b 72 -f .dat $CFS/m2957/tianyi/matrix/isooctane/isooctane | tee $CFS/m2957/tianyi/superlu_results/SLU.o_isooctane_batch_72_rowperm_0_equil_0 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -e 0 -p 1 -b 72 -f .dat $CFS/m2957/tianyi/matrix/isooctane/isooctane | tee $CFS/m2957/tianyi/superlu_results/SLU.o_isooctane_batch_72_rowperm_1_equil_0 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -p 0 -b 72 -f 
.dat $CFS/m2957/tianyi/matrix/isooctane/isooctane | tee $CFS/m2957/tianyi/superlu_results/SLU.o_isooctane_batch_72_rowperm_0 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -p 1 -b 72 -f .dat $CFS/m2957/tianyi/matrix/isooctane/isooctane | tee $CFS/m2957/tianyi/superlu_results/SLU.o_isooctane_batch_72_rowperm_1 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/isooctane/isooctane +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 -f .dat $CFS/m2957/tianyi/matrix/isooctane_test/isooctane +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 78 -f .dat $CFS/m2957/tianyi/matrix/dodecane/dodecane +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/dodecane/dodecane +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 67 -f .dat $CFS/m2957/tianyi/matrix/drm19/drm19 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/drm19/drm19 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 73 -f .dat $CFS/m2957/tianyi/matrix/gri12/gri12 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/gri12/gri12 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 90 -f .dat $CFS/m2957/tianyi/matrix/gri30/gri30 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/gri30/gri30 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 100 -f .dat $CFS/m2957/tianyi/matrix/lidryer/lidryer +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/lidryer/lidryer + +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -e 0 -p 0 -b 10 -f .mtx $CFS/m2957/tianyi/matrix/collision_matrices/A_ | tee $CFS/m2957/tianyi/superlu_results/SLU.o_collision_batch_10_rowperm_0_equil_0 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -e 0 -p 1 -b 10 -f .mtx $CFS/m2957/tianyi/matrix/collision_matrices/A_ | tee $CFS/m2957/tianyi/superlu_results/SLU.o_collision_batch_10_rowperm_1_equil_0 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -p 0 -b 10 -f .mtx $CFS/m2957/tianyi/matrix/collision_matrices/A_ | tee 
$CFS/m2957/tianyi/superlu_results/SLU.o_collision_batch_10_rowperm_0 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -p 1 -b 10 -f .mtx $CFS/m2957/tianyi/matrix/collision_matrices/A_ | tee $CFS/m2957/tianyi/superlu_results/SLU.o_collision_batch_10_rowperm_1 >/dev/null + +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 30 $CFS/m2957/tianyi/matrix/Landau120mat.dat | tee $CFS/m2957/tianyi/superlu_results/SLU.o_L120_batch_30 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 30 -f .dat $CFS/m2957/tianyi/matrix/Landau120perturbed/L120perturbed | tee $CFS/m2957/tianyi/superlu_results/SLU.o_L120perturbed_batch_30 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 30 -f .dat $CFS/m2957/tianyi/matrix/Landau120rcm_perturbed/L120rcm_perturbed | tee $CFS/m2957/tianyi/superlu_results/SLU.o_L120rcm_perturbed_batch_30 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d $CFS/m2957/tianyi/matrix/Landau120perturbed/L120perturbed29.dat + +# for ((batch = 1; batch < 21; batch++)); do +# echo "Landau 3D 120 cells matrix with $batch batches, size 3403-by-3403 and nnz 381187" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch -i 1 -f .dat $CFS/m2957/tianyi/matrix/Landau120perturbed/L120perturbed | tee $CFS/m2957/tianyi/superlu_results/Landau120perturbed/SLU.o_L120perturbed_batch_${batch} >/dev/null +# done + +# for ((i = 1; i < 11; i++)); do +# batch=`expr $i \* 10` +# echo "Landau 3D 120 cells matrix with $batch batches, size 3403-by-3403 and nnz 381187" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch -i 1 -f .dat $CFS/m2957/tianyi/matrix/Landau120perturbed/L120perturbed | tee $CFS/m2957/tianyi/superlu_results/Landau120perturbed/SLU.o_L120perturbed_largerbatch_${batch} >/dev/null +# done + +# for ((batch = 1; batch < 21; batch++)); do +# echo "XGC matrix with $batch batches, size 1640-by-1640 and nnz 14278" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch -i 1 -f .dat $CFS/m2957/tianyi/matrix/xgcrun/xgc | tee $CFS/m2957/tianyi/superlu_results/xgc/SLU.o_xgc_batch_${batch} >/dev/null +# done + +# for 
((i = 1; i < 12; i++)); do +# batch=`expr $i \* 10` +# echo "XGC matrix with $batch batches, size 1640-by-1640 and nnz 14278" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch -i 1 -f .dat $CFS/m2957/tianyi/matrix/xgcrun/xgc | tee $CFS/m2957/tianyi/superlu_results/xgc/SLU.o_xgc_largerbatch_${batch} >/dev/null +# done + +# for ((batch = 1; batch < 21; batch++)); do +# echo "Repeat Landau 3D 120 cells matrix with $batch batches, size 3403-by-3403 and nnz 381187" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch $CFS/m2957/tianyi/matrix/Landau120mat.dat | tee $CFS/m2957/tianyi/superlu_results/Landau120repeated/SLU.o_Landau120_batch_${batch} >/dev/null +# done + +# for ((i = 1; i < 11; i++)); do +# batch=`expr $i \* 10` +# echo "Repeat Landau 3D 120 cells matrix with $batch batches, size 3403-by-3403 and nnz 381187" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch $CFS/m2957/tianyi/matrix/Landau120mat.dat | tee $CFS/m2957/tianyi/superlu_results/Landau120repeated/SLU.o_Landau120_largerbatch_${batch} >/dev/null +# done + +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 200 -f .dat $CFS/m2957/tianyi/matrix/xgcrun/xgc | tee $CFS/m2957/tianyi/superlu_results/SLU.o_xgc_batch_200 >/dev/null + +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 1 $CFS/m2957/tianyi/matrix/L120debug/L120debug1.dat +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 0 $CFS/m2957/tianyi/matrix/L120debug/L120debug1.dat +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 1 -i 1 $CFS/m2957/tianyi/matrix/L120debug/L120debug1.dat +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 -i 1 -f .dat $CFS/m2957/tianyi/matrix/L120debug/L120debug | tee $CFS/m2957/tianyi/superlu_results/SLU.o_L120debug_batch_2_ir >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 -f .dat $CFS/m2957/tianyi/matrix/L120debug/L120debug | tee $CFS/m2957/tianyi/superlu_results/SLU.o_L120debug_batch_2 >/dev/null + +# echo "Repeat g20 twice with 2 batches" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 ../EXAMPLE/g20.rua | tee $CFS/m2957/tianyi/superlu_results/SLU.o_g20_batch_2 >/dev/null + 
+# echo "Repeat Landau 3D 120 cells matrix with 3 batches, size 3403-by-3403 and nnz 381187" +# # Todo: try larger batch count, probably up to 1000 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 $CFS/m2957/tianyi/matrix/Landau120mat.dat | tee $CFS/m2957/tianyi/superlu_results/SLU.o_Landau120_batch_3 + +# echo "Original Landau matrices, size 193-by-193 and nnz 4417, batch size 512" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 512 -f .dat $CFS/m2957/tianyi/matrix/Landau_small/Landau | tee $CFS/m2957/tianyi/superlu_results/SLU.o_Landau_batch_512 +# # Looking closely, the matrices are also just a repetition of one single matrix + +# echo "Isooctane matrices from the Pele example, size 144-by-144 and nnz 6135, batch size 72" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 72 -f .dat $CFS/m2957/tianyi/matrix/isooctane/isooctane | tee $CFS/m2957/tianyi/superlu_results/SLU.o_isooctane_batch_72 + +# echo "First three isooctane matrices from the Pele example, size 144-by-144 and nnz 6135, this example shows different diagonal scaling" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/isooctane/isooctane | tee $CFS/m2957/tianyi/superlu_results/SLU.o_isooctane_batch_3 + +# echo "Lidryer matrices from the Pele example, size 10-by-10 and nnz 91, batch size 100" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 100 -f .dat $CFS/m2957/tianyi/matrix/lidryer/lidryer | tee $CFS/m2957/tianyi/superlu_results/SLU.o_lidryer_batch_100 +# # These are really small and almost dense matrices, however, their importance manifest in the next example + +# echo "First three lidryer matrices from the Pele example, size 10-by-10 and nnz 91" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/lidryer/lidryer | tee $CFS/m2957/tianyi/superlu_results/SLU.o_lidryer_batch_3 + +# echo "Variable size batch example" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 2 -f .dat $CFS/m2957/tianyi/matrix/vbatch_test/test | tee $CFS/m2957/tianyi/superlu_results/SLU.o_vbatch_test_2 + +echo 
"Variable size batch EMT example" +srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun1/EMTrun1_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun1_total +srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 2 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun1/EMTrun1_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun1_2 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun1/EMTrun1_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun1_total +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 3 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun2/EMTrun2_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun2_3 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun2/EMTrun2_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun2_total +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 2 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun3/EMTrun3_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun3_2 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun3/EMTrun3_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun3_total +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 2 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun4/EMTrun4_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun4_2 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun4/EMTrun4_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun4_total +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 2 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun1/EMTrun1_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun1_2_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun1/EMTrun1_total.dat | 
tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun1_total_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 3 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun2/EMTrun2_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun2_3_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun2/EMTrun2_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun2_total_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 2 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun3/EMTrun3_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun3_2_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun3/EMTrun3_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun3_total_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 2 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun4/EMTrun4_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun4_2_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun4/EMTrun4_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun4_total_p_0 + +# batchCount=(1 10 100 500 1000) +# # batchCount=(1000) +# for ((i = 0; i < ${#batchCount[@]}; i++)); do +# batch=${batchCount[i]} + +# echo "Repeat Landau 3D 120 cells matrix with $batch batches, size 3403-by-3403 and nnz 381187" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch $CFS/m2957/tianyi/matrix/Landau120mat.dat | tee $CFS/m2957/tianyi/superlu_results/SLU.o_Landau120_batch_${batch} >/dev/null +# done + + +# nprows=(1) +# npcols=(1) +# npz=(1) +# nrhs=(1) +# NTH=1 +# NREP=1 +# # NODE_VAL_TOT=1 + +# for ((i = 0; i < ${#npcols[@]}; i++)); do +# NROW=${nprows[i]} +# NCOL=${npcols[i]} +# NPZ=${npz[i]} +# for ((s = 0; s < ${#nrhs[@]}; s++)); do +# NRHS=${nrhs[s]} +# CORE_VAL2D=`expr $NCOL \* $NROW` +# NODE_VAL2D=`expr $CORE_VAL2D / 
$GPUS_PER_NODE` +# MOD_VAL=`expr $CORE_VAL2D % $GPUS_PER_NODE` +# if [[ $MOD_VAL -ne 0 ]] +# then +# NODE_VAL2D=`expr $NODE_VAL2D + 1` +# fi + +# CORE_VAL=`expr $NCOL \* $NROW \* $NPZ` +# NODE_VAL=`expr $CORE_VAL / $GPUS_PER_NODE` +# MOD_VAL=`expr $CORE_VAL % $GPUS_PER_NODE` +# if [[ $MOD_VAL -ne 0 ]] +# then +# NODE_VAL=`expr $NODE_VAL + 1` +# fi + +# # NODE_VAL=2 +# # NCORE_VAL_TOT=`expr $NODE_VAL_TOT \* $CORES_PER_NODE / $NTH` +# batch=0 # whether to do batched test +# NCORE_VAL_TOT=`expr $NROW \* $NCOL \* $NPZ ` +# NCORE_VAL_TOT2D=`expr $NROW \* $NCOL ` + +# OMP_NUM_THREADS=$NTH +# TH_PER_RANK=`expr $NTH \* 2` + +# export OMP_NUM_THREADS=$NTH +# export OMP_PLACES=threads +# export OMP_PROC_BIND=spread +# export SLURM_CPU_BIND="cores" +# export MPICH_MAX_THREAD_SAFETY=multiple + +# # srun -n 1 ./EXAMPLE/pddrive -r 1 -c 1 ../EXAMPLE/g20.rua + +# # export NSUP=256 +# # export NREL=256 +# # for MAT in big.rua +# # for MAT in Geo_1438.bin +# # for MAT in g20.rua +# # for MAT in s1_mat_0_253872.bin s2D9pt2048.rua +# # for MAT in dielFilterV3real.bin +# for MAT in rma10.mtx +# # for MAT in raefsky3.mtx +# # for MAT in s2D9pt2048.rua raefsky3.mtx rma10.mtx +# # for MAT in s1_mat_0_126936.bin # for MAT in s1_mat_0_126936.bin +# # for MAT in s2D9pt2048.rua +# # for MAT in nlpkkt80.bin dielFilterV3real.bin Ga19As19H42.bin +# # for MAT in dielFilterV3real.bin +# # for MAT in s2D9pt1536.rua +# # for MAT in s1_mat_0_126936.bin s1_mat_0_253872.bin s1_mat_0_507744.bin +# # for MAT in matrix_ACTIVSg70k_AC_00.mtx matrix_ACTIVSg10k_AC_00.mtx +# # for MAT in temp_13k.mtx temp_25k.mtx temp_75k.mtx +# # for MAT in temp_13k.mtx +# do +# mkdir -p $MAT +# for ii in `seq 1 $NREP` +# do + +# # SUPERLU_ACC_OFFLOAD=0 +# # srun -n $NCORE_VAL_TOT2D -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive -c $NCOL -r $NROW -b $batch $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_${NTH}_1rhs_2d_gpu_${SUPERLU_ACC_OFFLOAD} + +# # SUPERLU_ACC_OFFLOAD=1 +# # srun -n 
$NCORE_VAL_TOT2D -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive -c $NCOL -r $NROW -b $batch $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_${NTH}_1rhs_2d_gpu_${SUPERLU_ACC_OFFLOAD}_nmpipergpu${nmpipergpu} + +# SUPERLU_ACC_OFFLOAD=1 +# export GPU3DVERSION=0 +# echo "srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}" +# srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}_nmpipergpu${nmpipergpu} + +# SUPERLU_ACC_OFFLOAD=1 +# export GPU3DVERSION=1 +# echo "srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}" +# srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}_nmpipergpu${nmpipergpu} + + +# # export SUPERLU_ACC_SOLVE=1 +# # srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores valgrind --leak-check=yes ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT 
| tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS} + + +# done + +# done +# done +# done diff --git a/example_scripts/run_cmake_build_perlmutter_gcc_nvshmem.sh b/example_scripts/run_cmake_build_perlmutter_gcc_nvshmem.sh index 1a1c23ae..93f7b991 100755 --- a/example_scripts/run_cmake_build_perlmutter_gcc_nvshmem.sh +++ b/example_scripts/run_cmake_build_perlmutter_gcc_nvshmem.sh @@ -33,7 +33,7 @@ module load cmake module load cudatoolkit # avoid bug in cray-libsci/21.08.1.2 # module load cray-libsci/22.11.1.2 -module load cray-libsci +module load cray-libsci/24.07.0 # module use /global/common/software/nersc/pe/modulefiles/latest # module load nvshmem/2.11.0 export MAGMA_ROOT=/global/cfs/cdirs/m2957/lib/magma_master @@ -44,7 +44,7 @@ export MAGMA_ROOT=/global/cfs/cdirs/m2957/lib/magma_master NVSHMEM_HOME=/global/cfs/cdirs/m2957/lib/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/ #NVSHMEM_HOME=${CRAY_NVIDIA_PREFIX}/comm_libs/nvshmem/ cmake .. \ - -DCMAKE_C_FLAGS="-O2 -std=c11 -DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 -DAdd_" \ + -DCMAKE_C_FLAGS="-O2 -std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=0 -DAdd_" \ -DCMAKE_CXX_FLAGS="-O2 -std=c++11" \ -DCMAKE_Fortran_FLAGS="-O2" \ -DCMAKE_CXX_COMPILER=CC \ @@ -53,14 +53,14 @@ cmake .. \ -DXSDK_ENABLE_Fortran=ON \ -DTPL_ENABLE_INTERNAL_BLASLIB=OFF \ -DTPL_ENABLE_LAPACKLIB=ON \ - -DBUILD_SHARED_LIBS=ON \ + -DBUILD_SHARED_LIBS=OFF \ -DTPL_ENABLE_CUDALIB=ON \ -DCMAKE_CUDA_FLAGS="-I${NVSHMEM_HOME}/include -I${MPICH_DIR}/include -ccbin=CC" \ -DCMAKE_CUDA_ARCHITECTURES=80 \ -DCMAKE_INSTALL_PREFIX=. \ -DCMAKE_INSTALL_LIBDIR=./lib \ -DCMAKE_BUILD_TYPE=Debug \ - -DTPL_ENABLE_MAGMALIB=OFF \ + -DTPL_ENABLE_MAGMALIB=ON \ -DTPL_MAGMA_INCLUDE_DIRS="${MAGMA_ROOT}/include" \ -DTPL_MAGMA_LIBRARIES="${MAGMA_ROOT}/lib/libmagma.so" \ -DTPL_BLAS_LIBRARIES=$CRAY_LIBSCI_PREFIX/lib/libsci_gnu_mp.so \ @@ -69,18 +69,19 @@ cmake .. 
\ -DTPL_PARMETIS_LIBRARIES="/global/cfs/cdirs/m2957/lib/lib/PrgEnv-gnu/parmetis-4.0.3/build/Linux-x86_64/libparmetis/libparmetis.so;/global/cfs/cdirs/m2957/lib/lib/PrgEnv-gnu/parmetis-4.0.3/build/Linux-x86_64/libmetis/libmetis.so" \ -DTPL_ENABLE_COMBBLASLIB=OFF \ -DTPL_ENABLE_NVSHMEM=ON \ - -DTPL_NVSHMEM_LIBRARIES="-L${CUDA_HOME}/lib64/stubs/ -lnvidia-ml -L/usr/lib64 -lgdrapi -lstdc++ -L/opt/cray/libfabric/1.20.1/lib64 -lfabric -L${NVSHMEM_HOME}/lib -lnvshmem" \ + -DTPL_NVSHMEM_LIBRARIES="-L${CUDA_HOME}/lib64/stubs/ -lnvidia-ml -L/usr/lib64 -lgdrapi -lstdc++ -L/opt/cray/libfabric/1.22.0/lib64 -lfabric -L${NVSHMEM_HOME}/lib -lnvshmem" \ -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -DMPIEXEC_NUMPROC_FLAG=-n \ -DMPIEXEC_EXECUTABLE=/usr/bin/srun \ -DMPIEXEC_MAX_NUMPROCS=16 -make pddrive -j16 -make pddrive3d -j16 -make pzdrive3d -j16 -make pzdrive -make f_pddrive -make pzdrive3d_qcd +# make pddrive -j16 +# make pddrive3d -j16 +make pddrive3d_vbatch -j16 +# make pzdrive3d -j16 +# make pzdrive +# make f_pddrive +# make pzdrive3d_qcd ## -DTPL_BLAS_LIBRARIES=/global/cfs/cdirs/m3894/ptlin/tpl/amd_blis/install/amd_blis-20211021-n9-gcc9.3.0/lib/libblis.a \