diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt index 2ef0d7c7..df9dd473 100644 --- a/EXAMPLE/CMakeLists.txt +++ b/EXAMPLE/CMakeLists.txt @@ -67,7 +67,12 @@ if(enable_double) set(DEXM3D pddrive3d.c dcreate_matrix.c dcreate_matrix3d.c) add_executable(pddrive3d ${DEXM3D}) target_link_libraries(pddrive3d ${all_link_libs}) - install(TARGETS pddrive3d RUNTIME DESTINATION "${INSTALL_LIB_DIR}/EXAMPLE") + install(TARGETS pddrive3d RUNTIME DESTINATION "${INSTALL_LIB_DIR}/EXAMPLE") + + set(DEXM3D pddrive3d_vbatch.c dcreate_matrix.c dcreate_matrix3d.c) + add_executable(pddrive3d_vbatch ${DEXM3D}) + target_link_libraries(pddrive3d_vbatch ${all_link_libs}) + install(TARGETS pddrive3d_vbatch RUNTIME DESTINATION "${INSTALL_LIB_DIR}/EXAMPLE") set(DEXM3D pddrive3d_block_diag.c dcreate_matrix.c dcreate_matrix3d.c) add_executable(pddrive3d_block_diag ${DEXM3D}) diff --git a/EXAMPLE/dcreate_matrix3d.c b/EXAMPLE/dcreate_matrix3d.c index 03b35b59..83e40a9e 100644 --- a/EXAMPLE/dcreate_matrix3d.c +++ b/EXAMPLE/dcreate_matrix3d.c @@ -710,3 +710,105 @@ int dcreate_batch_systems(handle_t *SparseMatrix_handles, int batchCount, #endif return 0; } /* end dcreate_batch_systems */ + +int dcreate_batch_systems_multiple(handle_t *SparseMatrix_handles, int batchCount, + int nrhs, double **RHSptr, int *ldRHS, double **xtrue, int *ldX, + FILE **fp, char * postfix, gridinfo3d_t *grid3d) +{ + int_t *rowind_d, *colptr_d; /* metadata for one diagonal block */ + double *nzval_d; /* global */ + int_t m, n, nnz; + int row, col, i, j, relpos; + int iam; + char trans[1]; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) +CHECK_MALLOC(iam, "Enter dcreate_batch_systems()"); +#endif + + /* Allocate storage for CSC containing all the matrices */ + SuperMatrix **A = SUPERLU_MALLOC( batchCount * sizeof(SuperMatrix *) ); + int d = 0; + for (d = 0; d < batchCount; ++d) { + int_t *rowind, *colptr; + double *nzval; + + FILE *fpd = fp[d]; + if ( !iam ) + { + double t = SuperLU_timer_(); + + if 
(!strcmp(postfix, "rua"))
+	{
+	    /* Read the matrix stored on disk in Harwell-Boeing format. */
+	    dreadhb_dist(iam, fpd, &m, &n, &nnz, &nzval, &rowind, &colptr);
+	}
+	else if (!strcmp(postfix, "mtx"))
+	{
+	    /* Read the matrix stored on disk in Matrix Market format. */
+	    dreadMM_dist(fpd, &m, &n, &nnz, &nzval, &rowind, &colptr);
+	}
+	else if (!strcmp(postfix, "rb"))
+	{
+	    /* Read the matrix stored on disk in Rutherford-Boeing format. */
+	    dreadrb_dist(iam, fpd, &m, &n, &nnz, &nzval, &rowind, &colptr);
+	}
+	else if (!strcmp(postfix, "dat"))
+	{
+	    /* Read the matrix stored on disk in triplet format. */
+	    dreadtriple_dist(fpd, &m, &n, &nnz, &nzval, &rowind, &colptr);
+	}
+	else if (!strcmp(postfix, "datnh"))
+	{
+	    /* Read the matrix stored on disk in triplet format (without header). */
+	    dreadtriple_noheader(fpd, &m, &n, &nnz, &nzval, &rowind, &colptr);
+	}
+	else if (!strcmp(postfix, "bin"))
+	{
+	    /* Read the matrix stored on disk in binary format. */
+	    dread_binary(fpd, &m, &n, &nnz, &nzval, &rowind, &colptr);
+	}
+	else
+	{
+	    ABORT("File format not known");
+	}
+
+	printf("Time to read and distribute matrix %.2f\n",
+	       SuperLU_timer_() - t); fflush(stdout);
+    }
+
+	int_t *rowind_d, *colptr_d; /* each block; NOTE(review): shadows the outer rowind_d/colptr_d declared at function scope -- the outer pair is dead */
+
+	/* Allocate storage for compressed column representation. */
+	dallocateA_dist(n, nnz, &nzval_d, &rowind_d, &colptr_d);
+
+	/* Copy the CSC arrays. NOTE(review): m, n, nnz, nzval, rowind, colptr are set only on rank 0 (the read above is guarded by "if (!iam)"); non-root ranks use them uninitialized here -- broadcast them (or read on every rank) before this copy. TODO confirm against dcreate_batch_systems(). */
+	for (j = 0; j < n+1; ++j) colptr_d[j] = colptr[j];
+	for (i = 0; i < nnz; ++i) {
+	    rowind_d[i] = rowind[i];
+	    nzval_d[i] = nzval[i];
+	}
+
+	/* Create compressed column matrix. */
+	A[d] = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
+	dCreate_CompCol_Matrix_dist(A[d], m, n, nnz, nzval_d, rowind_d, colptr_d,
+				    SLU_NC, SLU_D, SLU_GE);
+	SparseMatrix_handles[d] = (handle_t) A[d];
+
+	/* Generate the exact solutions and compute the right-hand sides. 
*/
+	RHSptr[d] = doubleMalloc_dist( m * nrhs );
+	xtrue[d] = doubleMalloc_dist( n * nrhs );
+	ldRHS[d] = m;
+	ldX[d] = n;
+	*trans = 'N';
+	dGenXtrue_dist(n, nrhs, xtrue[d], n);
+	dFillRHS_dist(trans, nrhs, xtrue[d], n, A[d], RHSptr[d], m);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit dcreate_batch_systems_multiple()");
+#endif
+    return 0;
+} /* end dcreate_batch_systems_multiple */
\ No newline at end of file
diff --git a/EXAMPLE/pddrive3d_block_diag_vbatch.c b/EXAMPLE/pddrive3d_block_diag_vbatch.c
new file mode 100755
index 00000000..7887da69
--- /dev/null
+++ b/EXAMPLE/pddrive3d_block_diag_vbatch.c
@@ -0,0 +1,498 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Driver program for PDGSSVX3D example
+ *
+ *
+ * -- Distributed SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab, Georgia Institute of Technology, + * Oak Ridge National Lab + * May 12, 2021 + * August 27, 2022 Add batch option + * + */ +#include "superlu_ddefs.h" + +/*! \brief + * + *+ * Purpose + * ======= + * + * The driver program PDDRIVE3D. + * + * This example illustrates how to use PDGSSVX3D with the full + * (default) options to solve a linear system. + * + * Five basic steps are required: + * 1. Initialize the MPI environment and the SuperLU process grid + * 2. Set up the input matrix and the right-hand side + * 3. Set the options argument + * 4. Call pdgssvx + * 5. Release the process grid and terminate the MPI environment + * + * The program may be run by typing + * mpiexec -np+ */ + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) +{ + for(int j=0; jpddrive3d -r
-c \ + * -d + * NOTE: total number of processes p = r * c * d + * d must be a power-of-two, e.g., 1, 2, 4, ... + * + * nnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *xtrue; + int_t m, n; + int nprow, npcol, npdep; + int lookahead, colperm, rowperm, ir; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level, batchCount = 0; + int* usermap; /* The following variables are used for batch solves */ + float result_min[2]; + result_min[0]=1e10; + result_min[1]=1e10; + float result_max[2]; + result_max[0]=0.0; + result_max[1]=0.0; + MPI_Comm SubComm; + int myrank, p; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + lookahead = -1; + colperm = -1; + rowperm = -1; + ir = -1; + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + case 'l': lookahead = atoi(*cpp); + break; + case 'p': rowperm = atoi(*cpp); + break; + case 'q': colperm = atoi(*cpp); + break; + case 'i': ir = atoi(*cpp); + break; + case 'b': batchCount = atoi(*cpp); + break; + case 's': nrhs = atoi(*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. 
+ ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); +#ifdef GPU_ACC + int superlu_acc_offload = get_acc_offload(); + if (superlu_acc_offload) { + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + double t1 = SuperLU_timer_(); + gpuFree(0); + double t2 = SuperLU_timer_(); + if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1); + gpublasHandle_t hb; + gpublasCreate(&hb); + if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2); + gpublasDestroy(hb); + } +#endif + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + fflush(stdout); + } + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+ ------------------------------------------------------------ */ + for (ii = 0; ii 0 ) { + printf("batchCount %d\n", batchCount); + dcreate_block_diag_3d(&A, batchCount, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &grid); + } else { + dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb, + &xtrue, &ldx, fp, suffix, &(grid)); + } + + //printf("ldx %d, ldb %d\n", ldx, ldb); + +#if 0 // following code is only for checking *Gather* routine + NRformat_loc *Astore, *Astore0; + double* B2d; + NRformat_loc Atmp = dGatherNRformat_loc( + (NRformat_loc *) A.Store, + b, ldb, nrhs, &B2d, + &grid); + Astore = &Atmp; + SuperMatrix Aref; + double *bref, *xtrueref; + if ( grid.zscp.Iam == 0 ) // only in process layer 0 + { + dcreate_matrix_postfix(&Aref, nrhs, &bref, &ldb, + &xtrueref, &ldx, fp0, + suffix, &(grid.grid2d)); + Astore0 = (NRformat_loc *) Aref.Store; + + /* + if ( (grid.grid2d).iam == 0 ) { + printf(" iam %d\n", 0); + checkNRFMT(Astore, Astore0); + } else if ((grid.grid2d).iam == 1 ) { + printf(" iam %d\n", 1); + checkNRFMT(Astore, Astore0); + } + */ + + // bref, xtrueref are created on 2D + matCheck(Astore->m_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); + #endif +#endif + + if (!(berr = doubleMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. 
+ ------------------------------------------------------------ */ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = SLU_DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); + options.Algo3d = YES; + options.ReplaceTinyPivot = YES; + options.IterRefine = NOREFINE; + options.DiagInv = YES; + // options.ParSymbFact = YES; + // options.ColPerm = PARMETIS; + options.Algo3d = YES; + options.DiagInv = YES; + options.ReplaceTinyPivot = YES; +#if 0 + options.ReplaceTinyPivot = YES; + options.RowPerm = NOROWPERM; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = YES; +#endif + + if (rowperm != -1) options.RowPerm = rowperm; + if (colperm != -1) options.ColPerm = colperm; + if (lookahead != -1) options.num_lookaheads = lookahead; + if (ir != -1) options.IterRefine = ir; + + if ( batchCount > 0 ) + options.batchCount = batchCount; + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + +#ifdef NRFRMT // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; +#else + if ( grid.zscp.Iam == 0 ) // Process layer 0 + { + m = A.nrow; + n = A.ncol; + } + // broadcast m, n to all the process layers; + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm); +#endif + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m, n, &ScalePermstruct); + dLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. 
*/ + pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------ */ + + // dDestroy_LU (n, &(grid.grid2d), &LUstruct); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } + dSolveFinalize (&options, &SOLVEstruct); + dDestroy_LU (n, &(grid.grid2d), &LUstruct); + + dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + if ( batchCount ) { + result_min[0] = stat.utime[FACT]; + result_min[1] = stat.utime[SOLVE]; + result_max[0] = stat.utime[FACT]; + result_max[1] = stat.utime[SOLVE]; + MPI_Allreduce(MPI_IN_PLACE, result_min, 2, MPI_FLOAT,MPI_MIN, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, result_max, 2, MPI_FLOAT,MPI_MAX, MPI_COMM_WORLD); + if (!myrank) { + printf("Batch solves returning data:\n"); + printf(" Factor time over all grids. Min: %8.4f Max: %8.4f\n",result_min[0], result_max[0]); + printf(" Solve time over all grids. 
Min: %8.4f Max: %8.4f\n",result_min[1], result_max[1]); + printf("**************************************************\n"); + fflush(stdout); + } + } + + superlu_gridexit3d (&grid); + if ( iam != -1 )PStatFree (&stat); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pddrive3d_vbatch.c b/EXAMPLE/pddrive3d_vbatch.c new file mode 100755 index 00000000..c0b46c12 --- /dev/null +++ b/EXAMPLE/pddrive3d_vbatch.c @@ -0,0 +1,650 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PDGSSVX3D example + * + * + * -- Distributed SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab, Georgia Institute of Technology, + * Oak Ridge National Lab + * May 12, 2021 + * August 27, 2022 Add batch option + * January 7, 2024 Complete the batch interface + * + */ +#include "superlu_ddefs.h" + +/*! 
\brief + * + *+ */ +int +dequil_vbatch( + superlu_dist_options_t *options, /* options for algorithm choices and algorithm parameters */ + int batchCount, /* number of matrices in the batch */ + int *m, /* matrix row dimension */ + int *n, /* matrix column dimension */ + handle_t *SparseMatrix_handles, /* array of sparse matrix handles, of size 'batchCount', + * each pointing to the actual storage + */ + double **ReqPtr, /* array of pointers to diagonal row scaling vectors, + each of size M */ + double **CeqPtr, /* array of pointers to diagonal column scaling vectors, + each of size N */ + DiagScale_t *DiagScale /* How equilibration is done for each matrix. */ + // DeviceContext context /* device context including queues, events, dependencies */ + ) +{ + int i, j, irow, icol, info = 0; + fact_t Fact = options->Fact; + int factored = (Fact == FACTORED); + int Equil = (!factored && options->Equil == YES); + int notran = (options->Trans == NOTRANS); + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(0, "Enter dequil_batch()"); +#endif + /* Decipher the input matrices */ + SuperMatrix **A = SUPERLU_MALLOC(batchCount * sizeof(SuperMatrix *)); + for (i = 0; i < batchCount; ++i) { + A[i] = (SuperMatrix *) SparseMatrix_handles[i]; + } + + /* Loop through each matrix in the batch */ + for (int k = 0; k < batchCount; ++k) { + + NCformat *Astore = (NCformat *) A[k]->Store; + double *a = (double *) Astore->nzval; + int_t *colptr = Astore->colptr; + int_t *rowind = Astore->rowind; + + /* Assuming each matrix is in CSC format + * Otherwise, convert to CSC first -- use the code in dvpivot_batch.c + * ... + */ + + /* The following arrays are replicated on all processes. */ + double *R = ReqPtr[k]; + double *C = CeqPtr[k]; + + /* Allocate stoage if not factored & ask for equilibration */ + if (Equil && Fact != SamePattern_SameRowPerm) { + /* Allocate storage if not done so before. 
*/ + //switch (ScalePermstruct->DiagScale) + switch ( DiagScale[k] ) { + case NOEQUIL: + if (!(R = (double *)doubleMalloc_dist(m[k]))) ABORT("Malloc fails for R[]."); + if (!(C = (double *)doubleMalloc_dist(n[k]))) ABORT("Malloc fails for C[]."); + ReqPtr[k] = R; + CeqPtr[k] = C; + break; + case ROW: /* R[] was already allocated before */ + if (!(C = (double *)doubleMalloc_dist(n[k]))) ABORT("Malloc fails for C[]."); + CeqPtr[k] = C; + break; + case COL: /* C[] was already allocated before */ + if (!(R = (double *)doubleMalloc_dist(m[k]))) ABORT("Malloc fails for R[]."); + ReqPtr[k] = R; + break; + default: + break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------ */ + if ( Equil ) { + if (Fact == SamePattern_SameRowPerm) { + /* Reuse R and C. */ + switch ( DiagScale[k] ) { + case NOEQUIL: break; + case ROW: + for (j = 0; j < n[k]; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + a[i] *= R[irow]; /* Scale rows. */ + } + } + break; + case COL: + for (j = 0; j < n[k]; ++j) { + double cj = C[j]; + for (i = colptr[j]; i < colptr[j+1]; ++i) { + a[i] *= cj; /* Scale columns. */ + } + } + break; + case BOTH: + for (j = 0; j < n[k]; ++j) { + double cj = C[j]; + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + a[i] *= R[irow] * cj; /* Scale rows and cols. */ + + } + } + break; + } /* end switch DiagScale[k] ... */ + + } else { /* Compute R[] & C[] from scratch */ + + int iinfo; + char equed[1]; + double amax, anorm, colcnd, rowcnd; + + /* Compute the row and column scalings. 
*/ + + dgsequ_dist(A[k], R, C, &rowcnd, &colcnd, &amax, &iinfo); + + if (iinfo > 0) { + if (iinfo <= m[k]) { +#if (PRNTlevel >= 1) + fprintf(stderr, "Matrix %d: the %d-th row of A is exactly zero\n", k, (int)iinfo); +#endif + } else { +#if (PRNTlevel >= 1) + fprintf(stderr, "Matrix %d: the %d-th column of A is exactly zero\n", k, (int)iinfo - n[k]); +#endif + } + } else if (iinfo < 0) { + if ( info==0 ) info = iinfo; + } + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + dlaqgs_dist(A[k], R, C, rowcnd, colcnd, amax, equed); + + if (strncmp(equed, "R", 1) == 0) { + DiagScale[k] = ROW; + } else if (strncmp(equed, "C", 1) == 0) { + DiagScale[k] = COL; + } else if (strncmp(equed, "B", 1) == 0) { + DiagScale[k] = BOTH; + } else { + DiagScale[k] = NOEQUIL; + } +#if (PRNTlevel >= 1) + printf(".. equilibrated? *equed = %c, DiagScale[k] %d\n", *equed, DiagScale[k]); + fflush(stdout); +#endif + } /* end if-else Fact ... */ + + } /* end if Equil ... LAPACK style, not involving MC64 */ + + } /* end for k ... batchCount */ + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(0, "Exit dequil_batch()"); +#endif + return info; +} /* end dequil_batch */ diff --git a/SRC/double/dpivot_vbatch.c b/SRC/double/dpivot_vbatch.c new file mode 100644 index 00000000..1c34b1bb --- /dev/null +++ b/SRC/double/dpivot_vbatch.c @@ -0,0 +1,274 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/* + * -- Distributed SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab + * November 5, 2023 + * Last update: + */ +#include "superlu_ddefs.h" + +/*! 
\brief Compute row pivotings for each matrix, for numerical stability + *+ * Purpose + * ======= + * + * The driver program PDDRIVE3D. + * + * This example illustrates how to use PDGSSVX3D with the full + * (default) options to solve a linear system. + * + * Five basic steps are required: + * 1. Initialize the MPI environment and the SuperLU process grid + * 2. Set up the input matrix and the right-hand side + * 3. Set the options argument + * 4. Call pdgssvx + * 5. Release the process grid and terminate the MPI environment + * + * The program may be run by typing + * mpiexec -np+ */ + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) +{ + for(int j=0; jpddrive3d -r
-c \ + * -d + * NOTE: total number of processes p = r * c * d + * d must be a power-of-two, e.g., 1, 2, 4, ... + * + * nnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *xtrue; + int_t m, n; + int nprow, npcol, npdep; + int equil,colperm, rowperm, ir, lookahead; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level, batchCount = 0; + int* usermap; /* The following variables are used for batch solves */ + float result_min[2]; + result_min[0]=1e10; + result_min[1]=1e10; + float result_max[2]; + result_max[0]=0.0; + result_max[1]=0.0; + MPI_Comm SubComm; + int myrank, p; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + equil = -1; + colperm = -1; + rowperm = -1; + ir = -1; + lookahead = -1; + + int dir = 0; + char postfix[10]; + FILE **fparray; + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. 
+ ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + case 'b': batchCount = atoi(*cpp); + break; + case 'e': equil = atoi(*cpp); + break; + case 'p': rowperm = atoi(*cpp); + break; + case 'q': colperm = atoi(*cpp); + break; + case 'i': ir = atoi(*cpp); + break; + case 's': nrhs = atoi(*cpp); + break; + case 'l': lookahead = atoi(*cpp); + break; + case 'f': strcpy(postfix, *cpp); dir = 1; + break; + } + } + else + { /* Last arg is considered a filename or a directory name*/ + if (dir) { + fparray = malloc(sizeof(FILE*) * batchCount); + for (int bc = 0; bc < batchCount; bc++) { + char buf[sizeof(int)*8+1]; + int lenbuf = snprintf(buf, sizeof(int)*8+1, "%d", bc); + + char name[100]; + strcpy(name, *cpp); + strcat(name, buf); + strcat(name, postfix); + + if (!(fparray[bc] = fopen (name, "r"))) + { + ABORT ("File does not exist"); + } + } + break; + } + else { + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. 
+ ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); +#ifdef GPU_ACC + int superlu_acc_offload = get_acc_offload(&options); + if (superlu_acc_offload) { + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + double t1 = SuperLU_timer_(); + gpuFree(0); + double t2 = SuperLU_timer_(); + if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1); + gpublasHandle_t hb; + gpublasCreate(&hb); + if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2); + gpublasDestroy(hb); + } +#endif + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + fflush(stdout); + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = SLU_DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); + options.Algo3d = YES; + options.IterRefine = NOREFINE; + // options.ParSymbFact = YES; + // options.ColPerm = PARMETIS; +#if 0 + options.DiagInv = YES; // only if SLU_HAVE_LAPACK + options.ReplaceTinyPivot = YES; + options.RowPerm = NOROWPERM; + options.ColPerm = NATURAL; + options.ReplaceTinyPivot = YES; +#endif + + if ( batchCount > 0 ) + options.batchCount = batchCount; + + if (equil != -1) options.Equil = equil; + if (rowperm != -1) options.RowPerm = rowperm; + if (colperm != -1) options.ColPerm = colperm; + if (ir != -1) options.IterRefine = ir; + if (lookahead != -1) options.num_lookaheads = lookahead; + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND 
SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + if (dir) { + suffix = &(postfix[1]); + } + else { + for (ii = 0; ii 0 ) { + /* ------------------------------------------------------------ + SOLVE THE BATCH LINEAR SYSTEM. + ------------------------------------------------------------ */ + printf("batchCount %d\n", batchCount); + // dcreate_block_diag_3d(&A, batchCount, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &grid); + + handle_t *F; + double **RHSptr; + int *ldRHS, *md, *nd, *nnzd; + double **ReqPtr; + double **CeqPtr; + DiagScale_t *DiagScale; + int **RpivPtr; + int **CpivPtr; + double **Xptr; + int *ldX; + double **Berrs, **xtrues; + + handle_t *SparseMatrix_handles = SUPERLU_MALLOC( batchCount * sizeof(handle_t) ); + RHSptr = (double **) SUPERLU_MALLOC( batchCount * sizeof(double *) ); + ldRHS = int32Malloc_dist(batchCount); + xtrues = (double **) SUPERLU_MALLOC( batchCount * sizeof(double *) ); + ldX = int32Malloc_dist(batchCount); + + if (dir) { + dcreate_batch_systems_multiple(SparseMatrix_handles, batchCount, nrhs, RHSptr, ldRHS, + xtrues, ldX, fparray, suffix, &grid); + } + else { + dcreate_batch_systems(SparseMatrix_handles, batchCount, nrhs, RHSptr, ldRHS, + xtrues, ldX, fp, suffix, &grid); + } + + md = int32Malloc_dist(batchCount); + nd = int32Malloc_dist(batchCount); + nnzd = int32Malloc_dist(batchCount); + for (int d = 0; d < batchCount; ++d) { + SuperMatrix *Ad = (SuperMatrix *) SparseMatrix_handles[d]; + NCformat *Adstore = Ad->Store; + double *ad = Adstore->nzval; + md[d] = Ad->nrow; + nd[d] = Ad->ncol; + nnzd[d] = Adstore->nnz; + } + + // SuperMatrix *A = (SuperMatrix *) SparseMatrix_handles[0]; + // NCformat *Astore = A->Store; + // double *a = Astore->nzval; + // m = A->nrow; + // n = A->ncol; + + ReqPtr = (double **) SUPERLU_MALLOC( batchCount * sizeof(double *) ); + CeqPtr = (double **) SUPERLU_MALLOC( batchCount * sizeof(double *) ); + RpivPtr = (int **) SUPERLU_MALLOC( batchCount * sizeof(int *) 
); + CpivPtr = (int **) SUPERLU_MALLOC( batchCount * sizeof(int *) ); + DiagScale = (DiagScale_t *) SUPERLU_MALLOC( batchCount * sizeof(DiagScale_t) ); + Xptr = (double **) SUPERLU_MALLOC( batchCount * sizeof(double*) ); + Berrs = (double **) SUPERLU_MALLOC( batchCount * sizeof(double*) ); + for (int d = 0; d < batchCount; ++d) { + DiagScale[d] = NOEQUIL; + RpivPtr[d] = int32Malloc_dist(md[d]); + CpivPtr[d] = int32Malloc_dist(nd[d]); + Xptr[d] = doubleMalloc_dist( nd[d] * nrhs ); + Berrs[d] = doubleMalloc_dist( nrhs ); + } + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call batch solver */ + pdgssvx3d_csc_vbatch(&options, batchCount, + md, nd, nnzd, nrhs, SparseMatrix_handles, + RHSptr, ldRHS, ReqPtr, CeqPtr, RpivPtr, CpivPtr, + DiagScale, F, Xptr, ldX, Berrs, &grid, &stat, &info); + + printf("**** Backward errors ****\n"); + for (int d = 0; d < batchCount; ++d) { + printf("\tSystem %d: Berr = %e\n", d, Berrs[d][0]); + //printf("\t\tDiagScale[%d] %d\n", d, DiagScale[d]); + } + + /* Free matrices pointed to by the handles, and ReqPtr[], etc. 
*/ + for (int d = 0; d < batchCount; ++d) { + if ( DiagScale[d] == ROW || DiagScale[d] == BOTH ) + SUPERLU_FREE(ReqPtr[d]); + if ( DiagScale[d] == COL || DiagScale[d] == BOTH ) + SUPERLU_FREE(CeqPtr[d]); + SUPERLU_FREE(RpivPtr[d]); + SUPERLU_FREE(CpivPtr[d]); + SUPERLU_FREE(Xptr[d]); + SUPERLU_FREE(Berrs[d]); + // A = (SuperMatrix *) SparseMatrix_handles[d]; + // Destroy_CompRowLoc_Matrix_dist (A); + } + SUPERLU_FREE(SparseMatrix_handles); + SUPERLU_FREE(RHSptr); + SUPERLU_FREE(ldRHS); + SUPERLU_FREE(md); + SUPERLU_FREE(nd); + SUPERLU_FREE(nnzd); + SUPERLU_FREE(xtrues); + SUPERLU_FREE(ldX); + SUPERLU_FREE(ReqPtr); + SUPERLU_FREE(CeqPtr); + SUPERLU_FREE(RpivPtr); + SUPERLU_FREE(CpivPtr); + SUPERLU_FREE(DiagScale); + SUPERLU_FREE(Xptr); + SUPERLU_FREE(Berrs); + + goto out; + + } else { + +#define NRFRMT +#ifndef NRFRMT + if ( grid.zscp.Iam == 0 ) // only in process layer 0 + dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &(grid.grid2d)); + +#else + dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb, + &xtrue, &ldx, fp, suffix, &(grid)); + // dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb, + // &xtrue, &ldx, fp, suffix, &(grid)); + } + +#if 0 // following code is only for checking *Gather* routine + NRformat_loc *Astore, *Astore0; + double* B2d; + NRformat_loc Atmp = dGatherNRformat_loc( + (NRformat_loc *) A.Store, + b, ldb, nrhs, &B2d, + &grid); + Astore = &Atmp; + SuperMatrix Aref; + double *bref, *xtrueref; + if ( grid.zscp.Iam == 0 ) // only in process layer 0 + { + dcreate_matrix_postfix(&Aref, nrhs, &bref, &ldb, + &xtrueref, &ldx, fp0, + suffix, &(grid.grid2d)); + Astore0 = (NRformat_loc *) Aref.Store; + + /* + if ( (grid.grid2d).iam == 0 ) { + printf(" iam %d\n", 0); + checkNRFMT(Astore, Astore0); + } else if ((grid.grid2d).iam == 1 ) { + printf(" iam %d\n", 1); + checkNRFMT(Astore, Astore0); + } + */ + + // bref, xtrueref are created on 2D + matCheck(Astore->m_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); + #endif 
+#endif + + if (!(berr = doubleMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. + ------------------------------------------------------------ */ +#ifdef NRFRMT // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; +#else + if ( grid.zscp.Iam == 0 ) // Process layer 0 + { + m = A.nrow; + n = A.ncol; + } + // broadcast m, n to all the process layers; + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm); +#endif + // m = A.nrow; + // n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m, n, &ScalePermstruct); + dLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } + dDestroy_LU (n, &(grid.grid2d), &LUstruct); + dSolveFinalize (&options, &SOLVEstruct); + + dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + + if (dir) { + for (int bc = 0; bc < batchCount; bc++) { + fclose(fparray[bc]); + } + free(fparray); + } + else { + fclose(fp); + } + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: +#if 0 // the following makes sense only for coarse-grain parallel model + if ( batchCount ) { + result_min[0] = stat.utime[FACT]; + result_min[1] = stat.utime[SOLVE]; + result_max[0] = stat.utime[FACT]; + result_max[1] = stat.utime[SOLVE]; + MPI_Allreduce(MPI_IN_PLACE, result_min, 2, MPI_FLOAT, MPI_MIN, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, result_max, 2, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD); + if (!myrank) { + printf("Batch solves returning data:\n"); + printf(" Factor time over all grids. Min: %8.4f Max: %8.4f\n",result_min[0], result_max[0]); + printf(" Solve time over all grids. Min: %8.4f Max: %8.4f\n",result_min[1], result_max[1]); + printf("**************************************************\n"); + fflush(stdout); + } + } +#endif + + superlu_gridexit3d (&grid); + if ( iam != -1 )PStatFree (&stat); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 9483a61c..302479a0 100755 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -81,6 +81,7 @@ set(sources prec-independent/treeFactorization.c prec-independent/sec_structs.c prec-independent/get_perm_c_batch.c + prec-independent/get_perm_c_vbatch.c ) # Add all the BLAS routines: CplusplusFactor/supelu_blas.hpp needs all @@ -164,8 +165,11 @@ if(enable_double) double/dtrfCommWrapper.c double/dsuperlu_blas.c double/pdgssvx3d_csc_batch.c # batch in CSC format + double/pdgssvx3d_csc_vbatch.c double/dequil_batch.c # batch in CSC format + double/dequil_vbatch.c double/dpivot_batch.c + double/dpivot_vbatch.c ) if (TPL_ENABLE_CUDALIB) diff --git a/SRC/double/dequil_vbatch.c b/SRC/double/dequil_vbatch.c new file mode 100644 index 00000000..6fbd9ed1 --- /dev/null +++ b/SRC/double/dequil_vbatch.c @@ -0,0 +1,204 @@ +/*! @file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/* + * -- Distributed SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab + * November 5, 2023 + * Last update: + */ +#include "superlu_ddefs.h" + +/*! 
\brief Equilibrate the systems using the LAPACK-style algorithm + * + * @param[in] options solver options + * @param[in] batchCount number of matrices in the batch + * @param[in] m row dimension of the matrices + * @param[in] n column dimension of the matrices + * @param[in, out] SparseMatrix_handles pointers to the matrices in the batch, each pointing to the actual stoage in CSC format + * On entry, the original matrices + * On exit, each matrix may be overwritten by diag(R)*A*diag(C) + * @param[out] ReqPtr pointers to row scaling vectors (allocated internally) + * @param[out] CeqPtr pointers to column scaling vectors (allocated internally) + * @param[in, out] DiagScale arrays indicating how each system is equilibrated: {ROW, COL, BOTH} + * + * Return value i: + * = 0: successful exit + * > 0: indicates the first matrix in the batch has zero row or column + * if i <= m: the i-th row of A is exactly zero + * if i > m: the (i-m)-th column of A is exactly zero + * + * + * @param[in] options solver options + * @param[in] batchCount number of matrices in the batch + * @param[in] m row dimension of the matrices + * @param[in] n column dimension of the matrices + * @param[in, out] SparseMatrix_handles pointers to the matrices in the batch, each pointing to the actual stoage in CSC format + * On entry, the original matrices, may be overwritten by A1 <- diag(R)*A*diag(C) from dequil_batch() + * On exit, each matrix may be A2 <- Pr*A1 + * @param[in,out] ReqPtr pointers to row scaling vectors, maybe overwritten by scaling from MC64 + * @param[in,out] CeqPtr pointers to column scaling vectors, maybe overwritten by scaling from MC64 + * @param[in,out] DiagScale array indicating how each system is equilibrated: {ROW, COL, BOTH} + * @param[in,out] RpivPtr pointers to row permutation vectors for each matrix, each of size m + * On exit, each RpivPtr[] is applied to each matrix + * + * Return value: + * 0, success + * -1, invalid RowPerm option; an Identity perm_r[] is returned + * 
d, indicates that the d-th matrix is the first one in the batch encountering error + *+ */ +int +dpivot_vbatch( + superlu_dist_options_t *options, /* options for algorithm choices and algorithm parameters */ + int batchCount, /* number of matrices in the batch */ + int *m, /* matrix row dimension */ + int *n, /* matrix column dimension */ + handle_t *SparseMatrix_handles, /* array of sparse matrix handles, + * of size 'batchCount', each pointing to the actual storage + */ + double **ReqPtr, /* array of pointers to diagonal row scaling vectors, + each of size M */ + double **CeqPtr, /* array of pointers to diagonal column scaling vectors, + each of size N */ + DiagScale_t *DiagScale, /* How equilibration is done for each matrix. */ + int **RpivPtr /* array of pointers to row permutation vectors , each of size M */ + // DeviceContext context /* device context including queues, events, dependencies */ + ) +{ + int i, j, irow, iinfo, rowequ, colequ, info = 0; + fact_t Fact = options->Fact; + int factored = (Fact == FACTORED); + int Equil = (!factored && options->Equil == YES); + int notran = (options->Trans == NOTRANS); + int job = 5; + + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(0, "Enter dpivot_batch()"); +#endif + + /* Decipher the input matrices */ + SuperMatrix **A; + A = SUPERLU_MALLOC(batchCount * sizeof(SuperMatrix *)); + for (i = 0; i < batchCount; ++i) { + A[i] = (SuperMatrix *) SparseMatrix_handles[i]; + } + + int_t *colptr; + int_t *rowind; + double *a, *at; + int_t nnz; + + /* Loop through each matrix in the batch */ + for (int d = 0; d < batchCount; ++d) { + + double *R1, *C1; + if (job == 5) { + /* Allocate storage for scaling factors. 
*/ + if (!(R1 = doubleMalloc_dist(m[d]))) + ABORT("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = doubleMalloc_dist(n[d]))) + ABORT("SUPERLU_MALLOC fails for C1[]"); + } + + rowequ = ( DiagScale[d] == ROW || DiagScale[d] == BOTH ); + colequ = ( DiagScale[d] == COL || DiagScale[d] == BOTH ); + + /* If the matrix type is SLU_NR (CSR), then need to convert to CSC first */ + if ( A[d]->Stype == SLU_NR ) { /* CSR format */ + NRformat *Astore = (NRformat *) A[d]->Store; + a = (double *)Astore->nzval; + nnz = Astore->nnz; /* must be set before the conversion call below reads it */ + + dCompRow_to_CompCol_dist(m[d], n[d], nnz, a, + Astore->colind, Astore->rowptr, + &at, &rowind, &colptr); + + a = at; // now a[] points to at[], stored in CSC format. + } else { /* CSC format */ + NCformat *Astore = (NCformat *) A[d]->Store; + a = (double *)Astore->nzval; + colptr = Astore->colptr; + rowind = Astore->rowind; + nnz = Astore->nnz; + } + + /* Row and column scaling factors. */ + double *R = ReqPtr[d]; + double *C = CeqPtr[d]; + + if ( !factored ) { /* Skip this if already factored. */ + + int *perm_r = RpivPtr[d]; + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + if (options->RowPerm != NO) { + + if (Fact != SamePattern_SameRowPerm) { + if (options->RowPerm == MY_PERMR) { /* Use user's perm_r. */ + /* Permute the matrix A for symbfact() */ + for (i = 0; i < colptr[n[d]]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if (options->RowPerm == LargeDiag_MC64) { + /* Finds a row permutation (serial) */ + iinfo = dldperm_dist(job, m[d], nnz, colptr, rowind, a, + perm_r, R1, C1); + + if ( iinfo ) { /* Error */ + printf(".. 
Matrix %d: LDPERM ERROR %d\n", d, iinfo); + if ( info==0 ) info = d+1 ; + } +#if (PRNTlevel >= 2) + dmin = damch_dist("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if (iinfo == 0) { + if (job == 5) { + if (Equil) { + for (i = 0; i < n[d]; ++i) { + R1[i] = exp(R1[i]); + C1[i] = exp(C1[i]); + } + + /* Scale the matrix further. + A <-- diag(R1)*A*diag(C1) */ + double cj; + for (j = 0; j < n[d]; ++j) { + cj = C1[j]; + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + a[i] *= R1[irow] * cj; + + } + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. */ + if (rowequ) + for (i = 0; i < m[d]; ++i) R[i] *= R1[i]; + else + for (i = 0; i < m[d]; ++i) R[i] = R1[i]; + if (colequ) + for (i = 0; i < n[d]; ++i) C[i] *= C1[i]; + else + for (i = 0; i < n[d]; ++i) C[i] = C1[i]; + + DiagScale[d] = BOTH; + rowequ = colequ = 1; + + } /* end if Equil */ + + /* Now permute rows of A to prepare for symbfact() */ + for (j = 0; j < n[d]; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; +#if (PRNTlevel >= 2) + dprod *= fabs(a[i]); +#endif + } + } + } else { /* job = 2,3,4 */ + for (j = 0; j < n[d]; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + +#if (PRNTlevel >= 2) + /* New diagonal */ + if (job == 2 || job == 3) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if (job == 4) + dsum += fabs(a[i]); +#endif + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + + } else { /* if iinfo != 0 ... MC64 returns error */ + for (i = 0; i < m[d]; ++i) perm_r[i] = i; + } +#if (PRNTlevel >= 2) + if (job == 2 || job == 3) { + if (!iam) printf("\tsmallest diagonal %e\n", dmin); + } else if (job == 4) { + if (!iam) printf("\tsum of diagonal %e\n", dsum); + } else if (job == 5) { + if (!iam) printf("\t product of diagonal %e\n", dprod); + } +#endif + } else { + printf(".. 
LDPERM invalid RowPerm option %d\n", options->RowPerm); + info = -1; + for (i = 0; i < m[d]; ++i) perm_r[i] = i; + } /* end if-else options->RowPerm ... */ + +#if (PRNTlevel >= 1) + printf(".. LDPERM job %d\n", (int) job); + fflush(stdout); +#endif + } /* end if Fact not SamePattern_SameRowPerm ... */ + + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + + for (i = 0; i < m[d]; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=1 ) + check_perm_dist("perm_r", m[d], perm_r); + PrintInt32("perm_r", m[d], perm_r); +#endif + } /* end if (!factored) */ + + if ( A[d]->Stype == SLU_NR ) { + SUPERLU_FREE(at); + SUPERLU_FREE(rowind); + SUPERLU_FREE(colptr); + } + + if (job == 5) { + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } + + } /* end for d ... batchCount */ + + /* Deallocate storage */ + SUPERLU_FREE(A); + + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(0, "Exit dpivot_batch()"); +#endif + return info; + +} /* end dpivot_batch */ diff --git a/SRC/double/pdgssvx3d_csc_vbatch.c b/SRC/double/pdgssvx3d_csc_vbatch.c new file mode 100755 index 00000000..b52d6dd6 --- /dev/null +++ b/SRC/double/pdgssvx3d_csc_vbatch.c @@ -0,0 +1,506 @@ +/*! @file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. 
+*/ + + + +/* + * -- Distributed SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab + * January 13, 2024 + * Last update: + */ +#include "superlu_ddefs.h" +#include "superlu_defs.h" +#include "superlu_upacked.h" +#include <math.h> + +int file_dPrint_CompRowLoc_to_Triples(SuperMatrix *A) +{ + NRformat_loc *Astore = A->Store; + int nnz, m, n, i, j; + double *dp; + FILE *fp = fopen("CSR.txt", "w"); + + m = A->nrow; + n = A->ncol; + nnz = Astore->nnz_loc; + dp = Astore->nzval; + + printf("print to triples: m %d, n %d, nnz %d\n", m, n, nnz); + for (i = 0; i < m; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + fprintf(fp, "%8d %8d %16.8e\n", i, (int) Astore->colind[j], dp[j]); + } + } + fclose(fp); + return 0; +} + +/*! \brief Solve a batch of linear systems Ai * Xi = Bi with direct method, + * computing the LU factorization of each matrix Ai;
+ * This is the fixed-size interface: all the input matrices have the same sparsity structure + * + *+ * @param[in] options solver options + * @param[in] batchCount number of matrices in the batch + * @param[in] m row dimension of the matrices + * @param[in] n column dimension of the matrices + * @param[in] nnz number of non-zero entries in each matrix + * @param[in] nrhs number of right-hand-sides + * @param[in,out] SparseMatrix_handles array of sparse matrix handles, of size 'batchCount', each pointing to the actual storage in CSC format, see 'NCformat' in SuperMatix structure + * Each A is overwritten by row/col scaling R*A*C + * @param[in,out] RHSptr array of pointers to dense storage of right-hand sides B + * Each B is overwritten by row/col scaling R*B*C + * @param[in] ldRHS array of leading dimensions of RHS + * @param[in,out] ReqPtr array of pointers to diagonal row scaling vectors R, each of size m + * ReqPtr[] are allocated internally if equilibration is asked for + * @param[in,out] CeqPtr array of pointers to diagonal colum scaling vectors C, each of size n + * CeqPtr[] are allocated internally if equilibration is asked for + * @param[in,out] RpivPtr array of pointers to row permutation vectors, each of size m + * @param[in,out] CpivPtr array of pointers to column permutation vectors, each of size n + * @param[in,out] DiagScale array of indicators how equilibration is done for each matrix + * @param[out] F array of handles pointing to the factored matrices + * @param[out] Xptr array of pointers to dense storage of solution + * @param[in] ldX array of leading dimensions of X + * @param[out] Berrs array of poiniters to backward errors + * @param[in]] grid3d contains MPI communicator + * @param[out] stat records algorithms statistics such as runtime, memory usage, etc. 
+ * @param[out] info flags the errors on return + * + *+ */ +int +pdgssvx3d_csc_vbatch( + superlu_dist_options_t *options, /* options for algorithm choices and algorithm parameters */ + int batchCount, /* number of matrices in the batch */ + int *m, /* array of matrix row dimensions, size batchCount */ + int *n, /* array of matrix column dimension, size batchCount */ + int *nnz, /* array of number of non-zero entries, size batchCount */ + int nrhs, /* number of right-hand-sides */ + handle_t *SparseMatrix_handles, /* array of sparse matrix handles, + * of size 'batchCount', + * each pointing to the actual storage + */ + double **RHSptr, // array of pointers to dense RHS storage + int *ldRHS, // array of leading dimensions of RHS + double **ReqPtr, /* array of pointers to diagonal row scaling vectors, + each of size M */ + double **CeqPtr, /* array of pointers to diagonal column scaling vectors, + each of size N */ + int **RpivPtr, /* array of pointers to row permutation vectors , each of size M */ + int **CpivPtr, /* array of pointers to column permutation vectors , each of size N */ + DiagScale_t *DiagScale, /* indicate how equilibration is done for each matrix */ + handle_t *F, /* array of handles pointing to the factored matrices */ + double **Xptr, // array of pointers to dense solution storage + int *ldX, // array of leading dimensions of X + double **Berrs, /* array of poiniters to backward errors */ + gridinfo3d_t *grid3d, + SuperLUStat_t *stat, + int *info + //DeviceContext context /* device context including queues, events, dependencies */ + ) +{ + /* Steps in this routine + 1. Loop through all matrices A_i, perform preprocessing (all in serial format) + 1.1 equilibration + 1.2 numerical pivoting (e.g., MC64) + 1.3 sparsity reordering + + 2. Copy the matrices into block diagonal form: A_big + (can be merged into Step 3, then no need to have a big copy) + + 3. 
Factorize A_big -> LU_big + 3.1 symbolic factoization (not batched, can loop through each individual one serial) + 3.2 numerical factorization + + 4. Split LU_big into individual LU factors, store them in the handle array to return + (this may be difficult, and the users may not need them.) + + 5. Solve (2 designs) + 5.1 using LU_big -- requires RHS to be in contiguous memory + compute level set. Leverage B-to-X with an internal copy. + (OR) 5.2 loop through individual LU -- may lose some data-parallel opportunity + */ + + /* Test the options choices. */ + *info = 0; + SuperMatrix *A0 = (SuperMatrix *) SparseMatrix_handles[0]; + fact_t Fact = options->Fact; + + if (Fact < 0 || Fact > FACTORED) + *info = -1; + else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) + *info = -1; + else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) + *info = -1; + else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) + *info = -1; + else if (options->IterRefine == SLU_EXTRA) + { + *info = -1; + fprintf(stderr, + "Extra precise iterative refinement yet to support."); + } + else if (batchCount < 0) *info = -2; + /* Need to check M, N, NNZ */ + else if (A0->nrow != A0->ncol || A0->nrow < 0 || A0->Stype != SLU_NC || A0->Dtype != SLU_D || A0->Mtype != SLU_GE) + *info = -7; + else if (nrhs < 0) + { + *info = -6; + } + if (*info) { + pxerr_dist("pdgssvx3d_csc_vbatch", &(grid3d->grid2d), -(*info)); + return -1; + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid3d->iam, "Enter pdgssvx3d_csc_batch()"); +#endif + + int colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, k, irow; + double *C, *R; //*C1, *R1, amax, anorm, colcnd, rowcnd; + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; + int d; /* index into each matrix in the batch */ + + double t = SuperLU_timer_(); + + /**** equilibration (LAPACK style) ****/ + /* 
ReqPtr[] and CeqPtr[] are allocated internally */ + /* Each A may be overwritten by R*A*C */ + dequil_vbatch(options, batchCount, m, n, SparseMatrix_handles, + ReqPtr, CeqPtr, DiagScale); + + stat->utime[EQUIL] = SuperLU_timer_() - t; + t = SuperLU_timer_(); + + /**** numerical pivoting (e.g., MC64) ****/ + /* If MC64(job=5 is invoked, further equilibration is done, + * DiagScale[] will be BOTH, and each A is modified, + * perm_r[]'s are applied to each matrix. + */ + /* no internal malloc */ + dpivot_vbatch(options, batchCount, m, n, SparseMatrix_handles, + ReqPtr, CeqPtr, DiagScale, RpivPtr); + + stat->utime[ROWPERM] = SuperLU_timer_() - t; + +#if 0 + for (d = 0; d < batchCount; ++d) { + printf("DiagScale[%d] %d\n", d, DiagScale[d]); + if ( DiagScale[d] ) { + Printdouble5("ReqPtr[d]", m, ReqPtr[d]); + Printdouble5("CeqPtr[d]", m, CeqPtr[d]); + } + PrintInt32("RpivPtr[d]", m, RpivPtr[d]); + } +#endif + + /**** sparsity reordering ****/ + /* col perms are computed for each matrix; may be different due to different row perm. + * A may be overwritten as Pr*R*A*C from previous steps, but is not modified in this routine. + */ + t = SuperLU_timer_(); + + get_perm_c_vbatch(options, batchCount, SparseMatrix_handles, CpivPtr); + + stat->utime[COLPERM] = SuperLU_timer_() - t; +#if 0 + for (d = 0; d < batchCount; ++d) { + PrintInt32("CpivPtr[d]", m, CpivPtr[d]); + } +#endif + +#if (PRNTlevel >= 1) + printf("<---- END PREPROCESSING ----\n"); +#endif + + /*--------------------- + **** Stack the matrices into block diagonal form: A_big, and RHS B_big + ----------------------*/ + + /* Count total dimension and number of nonzeros. 
*/ + SuperMatrix *A; + int m_big = 0, n_big = 0, nnz_big = 0; + for (d = 0; d < batchCount; ++d) { + m_big += m[d]; + n_big += n[d]; + A = (SuperMatrix *) SparseMatrix_handles[d]; + NCformat *Astore = (NCformat *) A->Store; + nnz_big += Astore->nnz; + } + + /* Allocate storage in CSR containing all matrices in the batch */ + // TO-DELETE: dallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + double *a_big = (double *) doubleMalloc_dist(nnz_big); + int_t *colind = (int_t *) intMalloc_dist(nnz_big); + int_t *rowptr = (int_t *) intMalloc_dist(n_big + 1); + double *nzval_d; /* each diagonal block */ + int_t *colind_d; + int_t *rowptr_d; + int_t nnz_d, col, row, offset_m; + int *perm_c, *perm_r; + + /* B_big */ + double *b; + if ( !(b = doubleMalloc_dist(m_big * nrhs)) ) ABORT("Malloc fails for b[:,nrhs]"); + + j = 0; /* running sum of total nnz */ + row = 0; + col = 0; + double alpha = -1.0, beta = 1.0; + offset_m = 0; + + for (d = 0; d < batchCount; ++d) { + + A = (SuperMatrix *) SparseMatrix_handles[d]; + NCformat *Astore = (NCformat *) A->Store; + nnz_d = Astore->nnz; + perm_r = RpivPtr[d]; + perm_c = CpivPtr[d]; + + /* Apply perm_c[] to row of A to preserve diagonal: A <= Pc*A */ + for (i = 0; i < nnz_d; ++i) + Astore->rowind[i] = perm_c[Astore->rowind[i]]; + + /* Convert to CSR format. */ + dCompCol_to_CompRow_dist(m[d], n[d], Astore->nnz, Astore->nzval, Astore->colptr, + Astore->rowind, &nzval_d, &rowptr_d, &colind_d); + + //PrintInt32("rowptr_d", m+1, rowptr_d); + + /* Copy this CSR matrix to a diagonal block of A_big. + Apply each perm_c[] to each matrix by column. 
+ Now, diagonal block is permuted by Pc*A*Pc' + */ + + /* Apply perm_c[] to columns of A (out-of-place) */ + for (i = 0; i < m[d]; ++i) { + rowptr[row++] = j; + //irow = iperm_c[i]; // old irow + //for (k = rowptr_d[irow]; k < rowptr_d[irow+1]; ++k) { + for (k = rowptr_d[i]; k < rowptr_d[i+1]; ++k) { + colind[j] = perm_c[colind_d[k]] + col; // add the *col* shift + a_big[j] = nzval_d[k]; + ++j; + } + } + + /* move to next block */ + col += n[d]; + + SUPERLU_FREE(nzval_d); /* TODO: remove repeated malloc/free */ + SUPERLU_FREE(colind_d); + SUPERLU_FREE(rowptr_d); + + /* Transform the right-hand side: RHS overwritten by B <= R*B */ + double *rhs; + + // NEED TO SAVE A COPY OF RHS ?? + + rowequ = ( DiagScale[d] == ROW || DiagScale[d] == BOTH ); + //printf(" before transform RHS: rowequ %d\n", rowequ); + if ( rowequ ) { /* Scale RHS by R[] */ + R = ReqPtr[d]; + rhs = RHSptr[d]; // first RHS + for (k = 0; k < nrhs; ++k) { + for (i = 0; i < m[d]; ++i) rhs[i] *= R[i]; + rhs += ldRHS[d]; /* move to next RHS */ + } + } + +#if ( DEBUGlevel>=1 ) + printf("System %d, next row %d, next col %d, next j %d\n", d, row, col, j); + //Printdouble5("big-RHS", m, RHSptr[d]); +#endif + + rhs = RHSptr[d]; // first RHS + for (k = 0; k < nrhs; ++k) { + for (i = 0; i < m[d]; ++i) /* permute RHS by Pc*Pr (out-of-place) */ + b[k * m_big + offset_m + perm_c[perm_r[i]]] = rhs[i]; + rhs += ldRHS[d]; /* move to next RHS */ + } + offset_m += m[d]; + + //Printdouble5("big-RHS-permuted", m, &b[(k-1) * m_big + d * m]); + + } /* end for d ... 
batchCount */ + + // assert(j == nnz_big); + // assert(row == m_big); + rowptr[row] = nnz_big; /* +1 as an end marker */ + + /**** By now: each A transformed to Pc*Pr*R*A*C + **** each B transformed to R*B + **** Need to solve (Pc*Pr*R*A*C*Pc')*(Pc*C^{-1}*X) = (Pc*Pr*R)*B + ****/ + + /* Set up A_big in NR_loc format */ + SuperMatrix A_big; + dCreate_CompRowLoc_Matrix_dist(&A_big, m_big, n_big, nnz_big, m_big, 0, + a_big, colind, rowptr, SLU_NR_loc, SLU_D, SLU_GE); + + //file_dPrint_CompRowLoc_to_Triples(&A_big); + + /* Modify the input options. + * Turn off preprocessing options for the big system. + */ + superlu_dist_options_t options_big; + set_default_options_dist(&options_big); + options_big.Equil = NO; + options_big.ColPerm = NATURAL; + options_big.RowPerm = NOROWPERM; + options_big.ParSymbFact = NO; + options_big.batchCount = batchCount; + + /* Copy the other options */ + options_big.Fact = options->Fact; + options_big.ReplaceTinyPivot = options->ReplaceTinyPivot; + options_big.IterRefine = options->IterRefine; + options_big.Trans = options->Trans; + options_big.SolveInitialized = options->SolveInitialized; + options_big.RefineInitialized = options->RefineInitialized; + options_big.PrintStat = options->PrintStat; + + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + MPI_Comm comm = grid3d->comm; + + /* Need to create a grid of size 1 */ + int nprow = 1, npcol = 1, npdep = 1; + superlu_gridinit3d (comm, nprow, npcol, npdep, &grid); + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m_big, n_big, &ScalePermstruct); + dLUstructInit (n_big, &LUstruct); + + //printf("\tbefore pdgssvx3d: m_big %d, n_big %d, nrhs %d\n", m_big, n_big, nrhs); + //dPrint_CompRowLoc_Matrix_dist(&A_big); + + if (!(berr = doubleCalloc_dist (nrhs))) ABORT ("Malloc fails for berr[]."); + + /*--------------------- + **** Call the linear equation solver + ----------------------*/ + + /*!!!! 
CHECK SETTING: TO BE SURE TO USE GPU VERSIONS !!!! + gpu3dVersion + superlu_acc_offload + */ + /* perm_c_big may not be Identity due to etree postordering, however, + * since b[] is transormed back to the solution of the original BIG system, + * we do not need to consider perm_c_big outside pdgssvx3d(). + */ + pdgssvx3d (&options_big, &A_big, &ScalePermstruct, b, m_big, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, stat, info); + +#if (PRNTlevel >= 1) + printf("\tBIG system: berr[0] %e\n", berr[0]); + printf("after pdgssvx3d: DiagScale %d\n", ScalePermstruct.DiagScale); + //PrintInt32("after pdgssvx3d: ScalePermstruct.perm_c", m_big, ScalePermstruct.perm_c); + //Printdouble5("big-B-solution", m_big, b); +#endif + + if ( *info ) { /* Something is wrong */ + if ( grid3d->iam==0 ) { + printf("ERROR: INFO = %d returned from pdgssvx3d()\n", *info); + fflush(stdout); + } + } + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------ */ + + dDestroy_LU (n_big, &(grid.grid2d), &LUstruct); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (options, stat, &(grid3d->grid2d)); /* Print 2D statistics.*/ + } + + dSolveFinalize (&options_big, &SOLVEstruct); + + Destroy_CompRowLoc_Matrix_dist (&A_big); + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + + /* Copy the big solution into individual ones, and compute B'errs */ + double bn, rn; // inf-norm of B and R + double *x; + offset_m = 0; + for (d = 0; d < batchCount; ++d) { + + A = (SuperMatrix *) SparseMatrix_handles[d]; + perm_c = CpivPtr[d]; + perm_r = RpivPtr[d]; + + /* Permute the solution matrix z <= Pc'*y */ + //PrintInt32("prepare Pc'*y: perm_c", n, perm_c); + x = Xptr[d]; + for (k = 0; k < nrhs; ++k) { + for (i = 0; i < n[d]; ++i) + x[i] = b[k* m_big + offset_m + perm_c[i]]; + x += ldX[d]; /* move to next x */ + } + + //Printdouble5("Permuted-solution after iperm_c", n, Xptr[d]); + + /* 
Compute residual: Pc*Pr*(R*b) - (Pc*Pr*R*A*C)*z + * Now x = Pc'*y, where y is computed from pdgssvx3d() + */ + x = Xptr[d]; + for (k = 0; k < nrhs; ++k) { // Sherry: can call sp_dgemm_dist() !!!! + bn = 0.; // norm of B + rn = 0.; // norm of R + for (i = 0; i < m[d]; ++i) { + bn = SUPERLU_MAX( bn, fabs(RHSptr[d][k*m[d] + i]) ); + + /* permute RHS by Pc*Pr, use b[] as temporary storage */ + b[k*m_big + offset_m + perm_c[perm_r[i]]] = RHSptr[d][k*ldRHS[d] + i]; + } + + sp_dgemv_dist("N", alpha, A, x, 1, beta, &b[k*m_big + offset_m], 1); + + for (i = 0; i < m[d]; ++i) rn = SUPERLU_MAX( rn, fabs(b[k*m_big + offset_m + i]) ); + Berrs[d][k] = rn / bn; + x += ldX[d]; /* move to next x */ + } /* end for k ... */ + offset_m += m[d]; + + /* Transform the solution matrix X to the solution of the + * original system before equilibration: x <= C*z + */ + colequ = ( DiagScale[d] == COL || DiagScale[d] == BOTH ); + if ( colequ ) { + C = CeqPtr[d]; + x = Xptr[d]; + for (k = 0; k < nrhs; ++k) { + for (i = 0; i < n[d]; ++i) x[i] *= C[i]; + x += ldX[d]; /* move to next x */ + } + } + + } /* end for d ... 
batchCount */ + + SUPERLU_FREE (b); + SUPERLU_FREE (berr); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid3d->iam, "Exit pdgssvx3d_csc_batch()"); +#endif + + return 0; +} /* end pdgssvx3d_csc_vbatch */ diff --git a/SRC/include/superlu_ddefs.h b/SRC/include/superlu_ddefs.h index 30ff43c6..97c784ec 100755 --- a/SRC/include/superlu_ddefs.h +++ b/SRC/include/superlu_ddefs.h @@ -1104,6 +1104,9 @@ extern int dcreate_block_diag_3d(SuperMatrix *A, int batchCount, int nrhs, doubl extern int dcreate_batch_systems(handle_t *SparseMatrix_handles, int batchCount, int nrhs, double **rhs, int *ldb, double **x, int *ldx, FILE *fp, char * postfix, gridinfo3d_t *grid3d); +extern int dcreate_batch_systems_multiple(handle_t *SparseMatrix_handles, int batchCount, + int nrhs, double **RHSptr, int *ldRHS, double **xtrue, int *ldX, + FILE **fp, char * postfix, gridinfo3d_t *grid3d); /* Matrix distributed in NRformat_loc in 3D process grid. It converts it to a NRformat_loc distributed in 2D grid in grid-0 */ @@ -1597,16 +1600,35 @@ extern int pdgssvx3d_csc_batch( gridinfo3d_t *grid3d, SuperLUStat_t *stat, int *info //DeviceContext context /* device context including queues, events, dependencies */ ); +extern int pdgssvx3d_csc_vbatch( + superlu_dist_options_t *, int batchCount, int *m, int *n, int *nnz, + int nrhs, handle_t *, double **RHSptr, int *ldRHS, + double **ReqPtr, double **CeqPtr, + int **RpivPtr, int **CpivPtr, DiagScale_t *DiagScale, + handle_t *F, double **Xptr, int *ldX, double **Berrs, + gridinfo3d_t *grid3d, SuperLUStat_t *stat, int *info + //DeviceContext context /* device context including queues, events, dependencies */ + ); extern int dequil_batch( superlu_dist_options_t *, int batchCount, int m, int n, handle_t *, double **ReqPtr, double **CeqPtr, DiagScale_t * // DeviceContext context /* device context including queues, events, dependencies */ ); +extern int dequil_vbatch( + superlu_dist_options_t *, int batchCount, int *m, int *n, handle_t *, + double **ReqPtr, double 
**CeqPtr, DiagScale_t * + // DeviceContext context /* device context including queues, events, dependencies */ + ); extern int dpivot_batch( superlu_dist_options_t *, int batchCount, int m, int n, handle_t *, double **ReqPtr, double **CeqPtr, DiagScale_t *, int **RpivPtr // DeviceContext context /* device context including queues, events, dependencies */ ); +extern int dpivot_vbatch( + superlu_dist_options_t *, int batchCount, int *m, int *n, handle_t *, + double **ReqPtr, double **CeqPtr, DiagScale_t *, int **RpivPtr + // DeviceContext context /* device context including queues, events, dependencies */ + ); extern int dwriteLUtoDisk(int nsupers, int_t *xsup, dLUstruct_t *LUstruct); extern int dcheckArr(double *A, double *B, int n); diff --git a/SRC/include/superlu_defs.h b/SRC/include/superlu_defs.h index b3ce43fb..9c0e09f2 100755 --- a/SRC/include/superlu_defs.h +++ b/SRC/include/superlu_defs.h @@ -1060,6 +1060,8 @@ extern int sp_coletree_dist (int_t *, int_t *, int_t *, int_t, int_t, int_t * extern void get_perm_c_dist(int, int, SuperMatrix *, int *); extern void get_perm_c_batch(superlu_dist_options_t *options, int batchCount, handle_t *SparseMatrix_handles, int **CpivPtr); +extern void get_perm_c_vbatch(superlu_dist_options_t *options, int batchCount, + handle_t *SparseMatrix_handles, int **CpivPtr); extern void at_plus_a_dist(const int_t, const int_t, int_t *, int_t *, int_t *, int_t **, int_t **); extern void getata_dist(const int_t m, const int_t n, const int_t nz, int_t *colptr, int_t *rowind, diff --git a/SRC/include/superlu_dist_config.h b/SRC/include/superlu_dist_config.h new file mode 100644 index 00000000..32f820fa --- /dev/null +++ b/SRC/include/superlu_dist_config.h @@ -0,0 +1,32 @@ +/* superlu_dist_config.h.in */ + +/* Enable CUDA */ +#define HAVE_CUDA TRUE + +/* Enable NVSHMEM */ +#define HAVE_NVSHMEM TRUE + +/* Enable HIP */ +/* #undef HAVE_HIP */ + +/* Enable parmetis */ +#define HAVE_PARMETIS TRUE + +/* Enable colamd */ +/* #undef HAVE_COLAMD 
*/ + +/* Enable LAPACK */ +#define SLU_HAVE_LAPACK TRUE + +/* Enable CombBLAS */ +/* #undef HAVE_COMBBLAS */ + +/* Enable MAGMA */ +#define HAVE_MAGMA TRUE + +/* enable 64bit index mode */ +/* #undef XSDK_INDEX_SIZE */ + +#if defined(XSDK_INDEX_SIZE) && (XSDK_INDEX_SIZE == 64) +#define _LONGINT 1 +#endif diff --git a/SRC/prec-independent/get_perm_c_vbatch.c b/SRC/prec-independent/get_perm_c_vbatch.c new file mode 100755 index 00000000..384c5325 --- /dev/null +++ b/SRC/prec-independent/get_perm_c_vbatch.c @@ -0,0 +1,201 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/* + * -- Distributed SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab + * November 5, 2023 + * Last update: + */ + +#include "superlu_dist_config.h" +#include "superlu_ddefs.h" + +#ifdef HAVE_COLAMD +#include "colamd.h" +#endif + +/*! 
\brief Gets sparsity permutations for a batch of matrices + *+ * + * @param[in] options solver options + * @param[in] batchCount number of matrices in the batch + * @param[in] SparseMatrix_handles pointers to the matrices in the batch, each pointing to the actual storage in CSC format + * On entry, the original matrices, may be overwritten by A1 <- Pr*diag(R)*A*diag(C) from dequil_batch() and dpivot_batch() + * @param[out] CpivPtr pointers to column permutation vectors for each matrix, each of size n + * + *+ */ +void +get_perm_c_vbatch( + superlu_dist_options_t *options, /* options for algorithm choices and algorithm parameters */ + int batchCount, /* number of matrices in the batch */ + handle_t *SparseMatrix_handles, /* array of sparse matrix handles, + * of size 'batchCount', + * each pointing to the actual storage + */ + int **CpivPtr /* array of pointers to column permutation vectors , each of size N */ + ) +{ +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Enter get_perm_c_batch()"); +#endif + + /* Decipher the input matrices */ + SuperMatrix **A; + A = SUPERLU_MALLOC(batchCount * sizeof(SuperMatrix *)); + for (int d = 0; d < batchCount; ++d) { + A[d] = (SuperMatrix *) SparseMatrix_handles[d]; + } + + int_t delta, maxint; + int_t *dhead, *qsize, *llist, *marker; + int *invp; + int m, n; + // int m = A[0]->nrow; + // int n = A[0]->ncol; + + if ( options->ColPerm == MMD_AT_PLUS_A || options->ColPerm == MMD_ATA ) { + + /* These arrays can be reused by multiple matrices */ + + delta = 0; /* DELTA is a parameter to allow the choice of nodes + whose degree <= min-degree + DELTA. 
*/ + maxint = 2147483647; /* 2**31 - 1 */ + invp = (int *) int32Malloc_dist(n+delta); + if ( !invp ) ABORT("SUPERLU_MALLOC fails for invp."); + dhead = (int_t *) SUPERLU_MALLOC((n+delta)*sizeof(int_t)); + if ( !dhead ) ABORT("SUPERLU_MALLOC fails for dhead."); + qsize = (int_t *) SUPERLU_MALLOC((n+delta)*sizeof(int_t)); + if ( !qsize ) ABORT("SUPERLU_MALLOC fails for qsize."); + llist = (int_t *) SUPERLU_MALLOC(n*sizeof(int_t)); + if ( !llist ) ABORT("SUPERLU_MALLOC fails for llist."); + marker = (int_t *) SUPERLU_MALLOC(n*sizeof(int_t)); + if ( !marker ) ABORT("SUPERLU_MALLOC fails for marker."); + } + + int_t bnz; + int_t *b_colptr, *b_rowind; /* allocated in at_plus_a() or getata() */ + int_t nofsub; + int i, j; + double t; + + /* Loop through each matrix in the batch */ + for (int d = 0; d < batchCount; ++d) { + + m = A[d]->nrow; + n = A[d]->ncol; + + NCformat *Astore = (NCformat *) A[d]->Store; + int *perm_c = CpivPtr[d]; + + t = SuperLU_timer_(); + bnz = 0; + + switch ( options->ColPerm ) { + + case NATURAL: /* Natural ordering */ + for (i = 0; i < n; ++i) perm_c[i] = i; + break; + + case MMD_AT_PLUS_A: /* Minimum degree ordering on A'+A */ + if ( m != n ) ABORT("Matrix is not square"); + at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); + t = SuperLU_timer_() - t; + /*printf("Form A'+A time = %8.3f\n", t);*/ + + break; + + case MMD_ATA: /* Minimum degree ordering on A'*A */ + getata_dist(m, n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); + t = SuperLU_timer_() - t; + /*printf("Form A'*A time = %8.3f\n", t);*/ + + break; + + case (COLAMD): /* Approximate minimum degree column ordering. 
*/ + get_colamd_dist(m, n, Astore->nnz, Astore->colptr, Astore->rowind, + perm_c); + break; +#ifdef HAVE_PARMETIS + case METIS_AT_PLUS_A: /* METIS ordering on A'+A */ + if ( m != n ) ABORT("Matrix is not square"); + at_plus_a_dist(n, Astore->nnz, Astore->colptr, Astore->rowind, + &bnz, &b_colptr, &b_rowind); + + if ( bnz ) { /* non-empty adjacency structure */ + get_metis_dist(n, bnz, b_colptr, b_rowind, perm_c); + } else { /* e.g., diagonal matrix */ + for (i = 0; i < n; ++i) perm_c[i] = i; + SUPERLU_FREE(b_colptr); + /* b_rowind is not allocated in this case */ + } + break; +#endif + default: + ABORT("Invalid options->Colperm"); + + } /* end switch (options->Colperm) */ + + if ( options->ColPerm == MMD_AT_PLUS_A || options->ColPerm == MMD_ATA ) { + if ( bnz ) { + t = SuperLU_timer_(); + + /* Initialize and allocate storage for GENMMD. */ + delta = 0; /* DELTA is a parameter to allow the choice of nodes + whose degree <= min-degree + DELTA. */ + /* Transform adjacency list into 1-based indexing required by GENMMD.*/ + for (i = 0; i <= n; ++i) ++b_colptr[i]; + for (i = 0; i < bnz; ++i) ++b_rowind[i]; + + int_t ln = n; + genmmd_dist_(&ln, b_colptr, b_rowind, perm_c, invp, &delta, dhead, + qsize, llist, marker, &maxint, &nofsub); + + /* Transform perm_c into 0-based indexing. */ + for (i = 0; i < n; ++i) --perm_c[i]; + + t = SuperLU_timer_() - t; + /* printf("call GENMMD time = %8.3f\n", t);*/ + + SUPERLU_FREE(b_colptr); /* TODO: repeated malloc/free for the batch */ + SUPERLU_FREE(b_rowind); + //printf("\tafter free b_rowind bnz %lld\n", (long long)bnz); + + } else { /* Empty adjacency structure */ + for (i = 0; i < n; ++i) perm_c[i] = i; + } + } /* end if MMD */ + +#if ( DEBUGlevel>=1 ) + check_perm_dist("perm_c", n, perm_c); +#endif + } /* end for d = ... 
batchCount */ + + if ( options->ColPerm == MMD_AT_PLUS_A || options->ColPerm == MMD_ATA ) { + SUPERLU_FREE(invp); + SUPERLU_FREE(dhead); + SUPERLU_FREE(qsize); + SUPERLU_FREE(llist); + SUPERLU_FREE(marker); + } + + SUPERLU_FREE(A); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Exit get_perm_c_batch()"); +#endif + + return; + +} /* end get_perm_c_batch */ diff --git a/example_scripts/batch_script_mpi_runit_perlmutter_3dsolve_gcc_nvshmem_vbatch.sh b/example_scripts/batch_script_mpi_runit_perlmutter_3dsolve_gcc_nvshmem_vbatch.sh new file mode 100755 index 00000000..4553a706 --- /dev/null +++ b/example_scripts/batch_script_mpi_runit_perlmutter_3dsolve_gcc_nvshmem_vbatch.sh @@ -0,0 +1,299 @@ +#!/bin/bash +# +#modules: +# module load cpe/23.03 +# module load PrgEnv-gnu +# module load gcc/11.2.0 +module load cmake +# module load cudatoolkit/11.7 +# avoid bug in cray-libsci/21.08.1.2 +# module load cray-libsci/22.11.1.2 +# module load cray-libsci/23.02.1.1 +ulimit -s unlimited +#MPI settings: +export MPICH_GPU_SUPPORT_ENABLED=1 +export CRAY_ACCEL_TARGET=nvidia80 +echo MPICH_GPU_SUPPORT_ENABLED=$MPICH_GPU_SUPPORT_ENABLED +export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:$LD_LIBRARY_PATH +#SUPERLU settings: + + +export SUPERLU_LBS=GD +export SUPERLU_ACC_OFFLOAD=1 # this can be 0 to do CPU tests on GPU nodes +export GPU3DVERSION=1 +export ANC25D=0 +export NEW3DSOLVE=1 +export NEW3DSOLVETREECOMM=1 +export SUPERLU_BIND_MPI_GPU=1 # assign GPU based on the MPI rank, assuming one MPI per GPU +export SUPERLU_ACC_SOLVE=1 + +export SUPERLU_MAXSUP=256 # max supernode size +export SUPERLU_RELAX=64 # upper bound for relaxed supernode size +export SUPERLU_MAX_BUFFER_SIZE=10000000 ## 500000000 # buffer size in words on GPU +export SUPERLU_NUM_LOOKAHEADS=2 ##4, must be at least 2, see 'lookahead winSize' +export SUPERLU_NUM_GPU_STREAMS=1 +export SUPERLU_N_GEMM=6000 # FLOPS threshold divide workload between CPU and GPU +nmpipergpu=1 +export SUPERLU_MPI_PROCESS_PER_GPU=$nmpipergpu # 2: this can 
better saturate GPU + +##NVSHMEM settings: +NVSHMEM_HOME=/global/cfs/cdirs/m3894/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/ +export NVSHMEM_USE_GDRCOPY=1 +export NVSHMEM_MPI_SUPPORT=1 +export MPI_HOME=${MPICH_DIR} +export NVSHMEM_LIBFABRIC_SUPPORT=1 +export LIBFABRIC_HOME=/opt/cray/libfabric/1.15.2.0 +export LD_LIBRARY_PATH=$NVSHMEM_HOME/lib:$LD_LIBRARY_PATH +export NVSHMEM_DISABLE_CUDA_VMM=1 +export FI_CXI_OPTIMIZED_MRS=false +export NVSHMEM_BOOTSTRAP_TWO_STAGE=1 +export NVSHMEM_BOOTSTRAP=MPI +export NVSHMEM_REMOTE_TRANSPORT=libfabric + +#export NVSHMEM_DEBUG=TRACE +#export NVSHMEM_DEBUG_SUBSYS=ALL +#export NVSHMEM_DEBUG_FILE=nvdebug_success + +if [[ $NERSC_HOST == edison ]]; then + CORES_PER_NODE=24 + THREADS_PER_NODE=48 +elif [[ $NERSC_HOST == cori ]]; then + CORES_PER_NODE=32 + THREADS_PER_NODE=64 + # This does not take hyperthreading into account +elif [[ $NERSC_HOST == perlmutter ]]; then + CORES_PER_NODE=64 + THREADS_PER_NODE=128 + GPUS_PER_NODE=4 +else + # Host unknown; exiting + exit $EXIT_HOST +fi + +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 ../EXAMPLE/g20.rua +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 $CFS/m2957/tianyi/matrix/Landau120mat.dat +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 -f .rua $CFS/m2957/tianyi/matrix/repg20/g20 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 64 -f .dat $CFS/m2957/tianyi/matrix/Landau/Landau +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 512 -f .dat $CFS/m2957/tianyi/matrix/Landau_small/Landau +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/Landau_small/Landau +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -e 0 -p 0 -b 72 -f .dat $CFS/m2957/tianyi/matrix/isooctane/isooctane | tee $CFS/m2957/tianyi/superlu_results/SLU.o_isooctane_batch_72_rowperm_0_equil_0 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -e 0 -p 1 -b 72 -f .dat $CFS/m2957/tianyi/matrix/isooctane/isooctane | tee $CFS/m2957/tianyi/superlu_results/SLU.o_isooctane_batch_72_rowperm_1_equil_0 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -p 0 -b 72 -f 
.dat $CFS/m2957/tianyi/matrix/isooctane/isooctane | tee $CFS/m2957/tianyi/superlu_results/SLU.o_isooctane_batch_72_rowperm_0 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -p 1 -b 72 -f .dat $CFS/m2957/tianyi/matrix/isooctane/isooctane | tee $CFS/m2957/tianyi/superlu_results/SLU.o_isooctane_batch_72_rowperm_1 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/isooctane/isooctane +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 -f .dat $CFS/m2957/tianyi/matrix/isooctane_test/isooctane +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 78 -f .dat $CFS/m2957/tianyi/matrix/dodecane/dodecane +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/dodecane/dodecane +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 67 -f .dat $CFS/m2957/tianyi/matrix/drm19/drm19 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/drm19/drm19 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 73 -f .dat $CFS/m2957/tianyi/matrix/gri12/gri12 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/gri12/gri12 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 90 -f .dat $CFS/m2957/tianyi/matrix/gri30/gri30 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/gri30/gri30 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 100 -f .dat $CFS/m2957/tianyi/matrix/lidryer/lidryer +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/lidryer/lidryer + +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -e 0 -p 0 -b 10 -f .mtx $CFS/m2957/tianyi/matrix/collision_matrices/A_ | tee $CFS/m2957/tianyi/superlu_results/SLU.o_collision_batch_10_rowperm_0_equil_0 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -e 0 -p 1 -b 10 -f .mtx $CFS/m2957/tianyi/matrix/collision_matrices/A_ | tee $CFS/m2957/tianyi/superlu_results/SLU.o_collision_batch_10_rowperm_1_equil_0 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -p 0 -b 10 -f .mtx $CFS/m2957/tianyi/matrix/collision_matrices/A_ | tee 
$CFS/m2957/tianyi/superlu_results/SLU.o_collision_batch_10_rowperm_0 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -p 1 -b 10 -f .mtx $CFS/m2957/tianyi/matrix/collision_matrices/A_ | tee $CFS/m2957/tianyi/superlu_results/SLU.o_collision_batch_10_rowperm_1 >/dev/null + +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 30 $CFS/m2957/tianyi/matrix/Landau120mat.dat | tee $CFS/m2957/tianyi/superlu_results/SLU.o_L120_batch_30 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 30 -f .dat $CFS/m2957/tianyi/matrix/Landau120perturbed/L120perturbed | tee $CFS/m2957/tianyi/superlu_results/SLU.o_L120perturbed_batch_30 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 30 -f .dat $CFS/m2957/tianyi/matrix/Landau120rcm_perturbed/L120rcm_perturbed | tee $CFS/m2957/tianyi/superlu_results/SLU.o_L120rcm_perturbed_batch_30 >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d $CFS/m2957/tianyi/matrix/Landau120perturbed/L120perturbed29.dat + +# for ((batch = 1; batch < 21; batch++)); do +# echo "Landau 3D 120 cells matrix with $batch batches, size 3403-by-3403 and nnz 381187" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch -i 1 -f .dat $CFS/m2957/tianyi/matrix/Landau120perturbed/L120perturbed | tee $CFS/m2957/tianyi/superlu_results/Landau120perturbed/SLU.o_L120perturbed_batch_${batch} >/dev/null +# done + +# for ((i = 1; i < 11; i++)); do +# batch=`expr $i \* 10` +# echo "Landau 3D 120 cells matrix with $batch batches, size 3403-by-3403 and nnz 381187" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch -i 1 -f .dat $CFS/m2957/tianyi/matrix/Landau120perturbed/L120perturbed | tee $CFS/m2957/tianyi/superlu_results/Landau120perturbed/SLU.o_L120perturbed_largerbatch_${batch} >/dev/null +# done + +# for ((batch = 1; batch < 21; batch++)); do +# echo "XGC matrix with $batch batches, size 1640-by-1640 and nnz 14278" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch -i 1 -f .dat $CFS/m2957/tianyi/matrix/xgcrun/xgc | tee $CFS/m2957/tianyi/superlu_results/xgc/SLU.o_xgc_batch_${batch} >/dev/null +# done + +# for 
((i = 1; i < 12; i++)); do +# batch=`expr $i \* 10` +# echo "XGC matrix with $batch batches, size 1640-by-1640 and nnz 14278" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch -i 1 -f .dat $CFS/m2957/tianyi/matrix/xgcrun/xgc | tee $CFS/m2957/tianyi/superlu_results/xgc/SLU.o_xgc_largerbatch_${batch} >/dev/null +# done + +# for ((batch = 1; batch < 21; batch++)); do +# echo "Repeat Landau 3D 120 cells matrix with $batch batches, size 3403-by-3403 and nnz 381187" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch $CFS/m2957/tianyi/matrix/Landau120mat.dat | tee $CFS/m2957/tianyi/superlu_results/Landau120repeated/SLU.o_Landau120_batch_${batch} >/dev/null +# done + +# for ((i = 1; i < 11; i++)); do +# batch=`expr $i \* 10` +# echo "Repeat Landau 3D 120 cells matrix with $batch batches, size 3403-by-3403 and nnz 381187" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch $CFS/m2957/tianyi/matrix/Landau120mat.dat | tee $CFS/m2957/tianyi/superlu_results/Landau120repeated/SLU.o_Landau120_largerbatch_${batch} >/dev/null +# done + +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 200 -f .dat $CFS/m2957/tianyi/matrix/xgcrun/xgc | tee $CFS/m2957/tianyi/superlu_results/SLU.o_xgc_batch_200 >/dev/null + +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 1 $CFS/m2957/tianyi/matrix/L120debug/L120debug1.dat +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 0 $CFS/m2957/tianyi/matrix/L120debug/L120debug1.dat +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 1 -i 1 $CFS/m2957/tianyi/matrix/L120debug/L120debug1.dat +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 -i 1 -f .dat $CFS/m2957/tianyi/matrix/L120debug/L120debug | tee $CFS/m2957/tianyi/superlu_results/SLU.o_L120debug_batch_2_ir >/dev/null +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 -f .dat $CFS/m2957/tianyi/matrix/L120debug/L120debug | tee $CFS/m2957/tianyi/superlu_results/SLU.o_L120debug_batch_2 >/dev/null + +# echo "Repeat g20 twice with 2 batches" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 2 ../EXAMPLE/g20.rua | tee $CFS/m2957/tianyi/superlu_results/SLU.o_g20_batch_2 >/dev/null + 
+# echo "Repeat Landau 3D 120 cells matrix with 3 batches, size 3403-by-3403 and nnz 381187" +# # Todo: try larger batch count, probably up to 1000 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 $CFS/m2957/tianyi/matrix/Landau120mat.dat | tee $CFS/m2957/tianyi/superlu_results/SLU.o_Landau120_batch_3 + +# echo "Original Landau matrices, size 193-by-193 and nnz 4417, batch size 512" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 512 -f .dat $CFS/m2957/tianyi/matrix/Landau_small/Landau | tee $CFS/m2957/tianyi/superlu_results/SLU.o_Landau_batch_512 +# # Looking closely, the matrices are also just a repetition of one single matrix + +# echo "Isooctane matrices from the Pele example, size 144-by-144 and nnz 6135, batch size 72" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 72 -f .dat $CFS/m2957/tianyi/matrix/isooctane/isooctane | tee $CFS/m2957/tianyi/superlu_results/SLU.o_isooctane_batch_72 + +# echo "First three isooctane matrices from the Pele example, size 144-by-144 and nnz 6135, this example shows different diagonal scaling" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/isooctane/isooctane | tee $CFS/m2957/tianyi/superlu_results/SLU.o_isooctane_batch_3 + +# echo "Lidryer matrices from the Pele example, size 10-by-10 and nnz 91, batch size 100" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 100 -f .dat $CFS/m2957/tianyi/matrix/lidryer/lidryer | tee $CFS/m2957/tianyi/superlu_results/SLU.o_lidryer_batch_100 +# # These are really small and almost dense matrices, however, their importance manifest in the next example + +# echo "First three lidryer matrices from the Pele example, size 10-by-10 and nnz 91" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b 3 -f .dat $CFS/m2957/tianyi/matrix/lidryer/lidryer | tee $CFS/m2957/tianyi/superlu_results/SLU.o_lidryer_batch_3 + +# echo "Variable size batch example" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 2 -f .dat $CFS/m2957/tianyi/matrix/vbatch_test/test | tee $CFS/m2957/tianyi/superlu_results/SLU.o_vbatch_test_2 + +echo 
"Variable size batch EMT example" +srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun1/EMTrun1_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun1_total +srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 2 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun1/EMTrun1_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun1_2 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun1/EMTrun1_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun1_total +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 3 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun2/EMTrun2_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun2_3 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun2/EMTrun2_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun2_total +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 2 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun3/EMTrun3_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun3_2 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun3/EMTrun3_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun3_total +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 2 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun4/EMTrun4_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun4_2 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun4/EMTrun4_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun4_total +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 2 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun1/EMTrun1_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun1_2_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun1/EMTrun1_total.dat | 
tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun1_total_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 3 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun2/EMTrun2_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun2_3_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun2/EMTrun2_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun2_total_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 2 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun3/EMTrun3_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun3_2_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun3/EMTrun3_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun3_total_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 2 -f .dat $CFS/m2957/tianyi/matrix/EMTrun/EMTrun4/EMTrun4_ | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun4_2_p_0 +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d_vbatch -p 0 -b 1 $CFS/m2957/tianyi/matrix/EMTrun/EMTrun4/EMTrun4_total.dat | tee $CFS/m2957/tianyi/superlu_results/EMT_results/SLU.o_vbatch_EMTrun4_total_p_0 + +# batchCount=(1 10 100 500 1000) +# # batchCount=(1000) +# for ((i = 0; i < ${#batchCount[@]}; i++)); do +# batch=${batchCount[i]} + +# echo "Repeat Landau 3D 120 cells matrix with $batch batches, size 3403-by-3403 and nnz 381187" +# srun -n 1 -N 1 ./EXAMPLE/pddrive3d -b $batch $CFS/m2957/tianyi/matrix/Landau120mat.dat | tee $CFS/m2957/tianyi/superlu_results/SLU.o_Landau120_batch_${batch} >/dev/null +# done + + +# nprows=(1) +# npcols=(1) +# npz=(1) +# nrhs=(1) +# NTH=1 +# NREP=1 +# # NODE_VAL_TOT=1 + +# for ((i = 0; i < ${#npcols[@]}; i++)); do +# NROW=${nprows[i]} +# NCOL=${npcols[i]} +# NPZ=${npz[i]} +# for ((s = 0; s < ${#nrhs[@]}; s++)); do +# NRHS=${nrhs[s]} +# CORE_VAL2D=`expr $NCOL \* $NROW` +# NODE_VAL2D=`expr $CORE_VAL2D / 
$GPUS_PER_NODE` +# MOD_VAL=`expr $CORE_VAL2D % $GPUS_PER_NODE` +# if [[ $MOD_VAL -ne 0 ]] +# then +# NODE_VAL2D=`expr $NODE_VAL2D + 1` +# fi + +# CORE_VAL=`expr $NCOL \* $NROW \* $NPZ` +# NODE_VAL=`expr $CORE_VAL / $GPUS_PER_NODE` +# MOD_VAL=`expr $CORE_VAL % $GPUS_PER_NODE` +# if [[ $MOD_VAL -ne 0 ]] +# then +# NODE_VAL=`expr $NODE_VAL + 1` +# fi + +# # NODE_VAL=2 +# # NCORE_VAL_TOT=`expr $NODE_VAL_TOT \* $CORES_PER_NODE / $NTH` +# batch=0 # whether to do batched test +# NCORE_VAL_TOT=`expr $NROW \* $NCOL \* $NPZ ` +# NCORE_VAL_TOT2D=`expr $NROW \* $NCOL ` + +# OMP_NUM_THREADS=$NTH +# TH_PER_RANK=`expr $NTH \* 2` + +# export OMP_NUM_THREADS=$NTH +# export OMP_PLACES=threads +# export OMP_PROC_BIND=spread +# export SLURM_CPU_BIND="cores" +# export MPICH_MAX_THREAD_SAFETY=multiple + +# # srun -n 1 ./EXAMPLE/pddrive -r 1 -c 1 ../EXAMPLE/g20.rua + +# # export NSUP=256 +# # export NREL=256 +# # for MAT in big.rua +# # for MAT in Geo_1438.bin +# # for MAT in g20.rua +# # for MAT in s1_mat_0_253872.bin s2D9pt2048.rua +# # for MAT in dielFilterV3real.bin +# for MAT in rma10.mtx +# # for MAT in raefsky3.mtx +# # for MAT in s2D9pt2048.rua raefsky3.mtx rma10.mtx +# # for MAT in s1_mat_0_126936.bin # for MAT in s1_mat_0_126936.bin +# # for MAT in s2D9pt2048.rua +# # for MAT in nlpkkt80.bin dielFilterV3real.bin Ga19As19H42.bin +# # for MAT in dielFilterV3real.bin +# # for MAT in s2D9pt1536.rua +# # for MAT in s1_mat_0_126936.bin s1_mat_0_253872.bin s1_mat_0_507744.bin +# # for MAT in matrix_ACTIVSg70k_AC_00.mtx matrix_ACTIVSg10k_AC_00.mtx +# # for MAT in temp_13k.mtx temp_25k.mtx temp_75k.mtx +# # for MAT in temp_13k.mtx +# do +# mkdir -p $MAT +# for ii in `seq 1 $NREP` +# do + +# # SUPERLU_ACC_OFFLOAD=0 +# # srun -n $NCORE_VAL_TOT2D -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive -c $NCOL -r $NROW -b $batch $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_${NTH}_1rhs_2d_gpu_${SUPERLU_ACC_OFFLOAD} + +# # SUPERLU_ACC_OFFLOAD=1 +# # srun -n 
$NCORE_VAL_TOT2D -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive -c $NCOL -r $NROW -b $batch $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}_${NTH}_1rhs_2d_gpu_${SUPERLU_ACC_OFFLOAD}_nmpipergpu${nmpipergpu} + +# SUPERLU_ACC_OFFLOAD=1 +# export GPU3DVERSION=0 +# echo "srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}" +# srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}_nmpipergpu${nmpipergpu} + +# SUPERLU_ACC_OFFLOAD=1 +# export GPU3DVERSION=1 +# echo "srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}" +# srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT | tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS}_gpu_${SUPERLU_ACC_OFFLOAD}_cpp_${GPU3DVERSION}_nmpipergpu${nmpipergpu} + + +# # export SUPERLU_ACC_SOLVE=1 +# # srun -n $NCORE_VAL_TOT -c $TH_PER_RANK --cpu_bind=cores valgrind --leak-check=yes ./EXAMPLE/pddrive3d -c $NCOL -r $NROW -d $NPZ -b $batch -i 0 -s $NRHS $CFS/m2957/liuyangz/my_research/matrix/$MAT 
| tee ./$MAT/SLU.o_mpi_${NROW}x${NCOL}x${NPZ}_${OMP_NUM_THREADS}_3d_newest_gpusolve_${SUPERLU_ACC_SOLVE}_nrhs_${NRHS} + + +# done + +# done +# done +# done diff --git a/example_scripts/run_cmake_build_perlmutter_gcc_nvshmem.sh b/example_scripts/run_cmake_build_perlmutter_gcc_nvshmem.sh index 1a1c23ae..93f7b991 100755 --- a/example_scripts/run_cmake_build_perlmutter_gcc_nvshmem.sh +++ b/example_scripts/run_cmake_build_perlmutter_gcc_nvshmem.sh @@ -33,7 +33,7 @@ module load cmake module load cudatoolkit # avoid bug in cray-libsci/21.08.1.2 # module load cray-libsci/22.11.1.2 -module load cray-libsci +module load cray-libsci/24.07.0 # module use /global/common/software/nersc/pe/modulefiles/latest # module load nvshmem/2.11.0 export MAGMA_ROOT=/global/cfs/cdirs/m2957/lib/magma_master @@ -44,7 +44,7 @@ export MAGMA_ROOT=/global/cfs/cdirs/m2957/lib/magma_master NVSHMEM_HOME=/global/cfs/cdirs/m2957/lib/lib/PrgEnv-gnu/nvshmem_src_2.8.0-3/build/ #NVSHMEM_HOME=${CRAY_NVIDIA_PREFIX}/comm_libs/nvshmem/ cmake .. \ - -DCMAKE_C_FLAGS="-O2 -std=c11 -DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 -DAdd_" \ + -DCMAKE_C_FLAGS="-O2 -std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=0 -DAdd_" \ -DCMAKE_CXX_FLAGS="-O2 -std=c++11" \ -DCMAKE_Fortran_FLAGS="-O2" \ -DCMAKE_CXX_COMPILER=CC \ @@ -53,14 +53,14 @@ cmake .. \ -DXSDK_ENABLE_Fortran=ON \ -DTPL_ENABLE_INTERNAL_BLASLIB=OFF \ -DTPL_ENABLE_LAPACKLIB=ON \ - -DBUILD_SHARED_LIBS=ON \ + -DBUILD_SHARED_LIBS=OFF \ -DTPL_ENABLE_CUDALIB=ON \ -DCMAKE_CUDA_FLAGS="-I${NVSHMEM_HOME}/include -I${MPICH_DIR}/include -ccbin=CC" \ -DCMAKE_CUDA_ARCHITECTURES=80 \ -DCMAKE_INSTALL_PREFIX=. \ -DCMAKE_INSTALL_LIBDIR=./lib \ -DCMAKE_BUILD_TYPE=Debug \ - -DTPL_ENABLE_MAGMALIB=OFF \ + -DTPL_ENABLE_MAGMALIB=ON \ -DTPL_MAGMA_INCLUDE_DIRS="${MAGMA_ROOT}/include" \ -DTPL_MAGMA_LIBRARIES="${MAGMA_ROOT}/lib/libmagma.so" \ -DTPL_BLAS_LIBRARIES=$CRAY_LIBSCI_PREFIX/lib/libsci_gnu_mp.so \ @@ -69,18 +69,19 @@ cmake .. 
\ -DTPL_PARMETIS_LIBRARIES="/global/cfs/cdirs/m2957/lib/lib/PrgEnv-gnu/parmetis-4.0.3/build/Linux-x86_64/libparmetis/libparmetis.so;/global/cfs/cdirs/m2957/lib/lib/PrgEnv-gnu/parmetis-4.0.3/build/Linux-x86_64/libmetis/libmetis.so" \ -DTPL_ENABLE_COMBBLASLIB=OFF \ -DTPL_ENABLE_NVSHMEM=ON \ - -DTPL_NVSHMEM_LIBRARIES="-L${CUDA_HOME}/lib64/stubs/ -lnvidia-ml -L/usr/lib64 -lgdrapi -lstdc++ -L/opt/cray/libfabric/1.20.1/lib64 -lfabric -L${NVSHMEM_HOME}/lib -lnvshmem" \ + -DTPL_NVSHMEM_LIBRARIES="-L${CUDA_HOME}/lib64/stubs/ -lnvidia-ml -L/usr/lib64 -lgdrapi -lstdc++ -L/opt/cray/libfabric/1.22.0/lib64 -lfabric -L${NVSHMEM_HOME}/lib -lnvshmem" \ -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -DMPIEXEC_NUMPROC_FLAG=-n \ -DMPIEXEC_EXECUTABLE=/usr/bin/srun \ -DMPIEXEC_MAX_NUMPROCS=16 -make pddrive -j16 -make pddrive3d -j16 -make pzdrive3d -j16 -make pzdrive -make f_pddrive -make pzdrive3d_qcd +# make pddrive -j16 +# make pddrive3d -j16 +make pddrive3d_vbatch -j16 +# make pzdrive3d -j16 +# make pzdrive +# make f_pddrive +# make pzdrive3d_qcd ## -DTPL_BLAS_LIBRARIES=/global/cfs/cdirs/m3894/ptlin/tpl/amd_blis/install/amd_blis-20211021-n9-gcc9.3.0/lib/libblis.a \