When built for CLASS D or CLASS E, the program crashes with a segmentation fault on an A100-PCIe-40GB using CUDA 12.8. The CG build and run below (CLASS=E) reproduce the problem.
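The likely root cause is that array sizes and index arithmetic in the original implementation are computed entirely in 32-bit `int`. At these problem sizes the values overflow: for CG CLASS E, `NZ = 9000000*(26+1)*(26+1) = 6,561,000,000`, which exceeds `INT_MAX` (2,147,483,647) and wraps to `-2028934592`, as the compiler warnings below show. That negative value is then converted to `size_t` in `malloc(sizeof(int)*(NZ))` and in the device buffer sizes, producing an impossible allocation request (e.g. 18446744065593813248 bytes), so the allocation fails and later accesses hit invalid memory.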
❯ cd CUDA
❯ make CG CLASS=E
===========================================
= NAS PARALLEL BENCHMARKS 4.1 =
= CUDA Versions =
= C++ =
===========================================
cd CG; ../config/get_gpu_arch.sh; make CLASS=E
Updated COMPUTE_CAPABILITY (GPU of id 0) in ../config/make.def to -gencode arch=compute_80,code=sm_80
make[1]: Entering directory '/archive/share/zyx/NPB-GPU/CUDA/CG'
make[2]: Entering directory '/archive/share/zyx/NPB-GPU/CUDA/sys'
cc -o setparams setparams.cpp
setparams.cpp:139:1: warning: ISO C++ forbids declaration of ‘main’ with no type [-Wreturn-type]
139 | main(int argc, char *argv[]){
| ^~~~
setparams.cpp: In function ‘char* read_nvcc_cuda_version()’:
setparams.cpp:216:14: warning: ‘char* fgets(char*, int, FILE*)’ writing 1024 bytes into a region of size 64 overflows the destination [-Wstringop-overflow=]
216 | fgets(result, 1024 , file);
| ~~~~~^~~~~~~~~~~~~~~~~~~~~
setparams.cpp:212:14: note: destination object ‘result’ of size 64
212 | char result[64];
| ^~~~~~
In file included from /usr/include/c++/13/cstdio:42,
from setparams.cpp:82:
/usr/include/stdio.h:654:14: note: in a call to function ‘char* fgets(char*, int, FILE*)’ declared with attribute ‘access (write_only, 1, 2)’
654 | extern char *fgets (char *__restrict __s, int __n, FILE *__restrict __stream)
| ^~~~~
make[2]: Leaving directory '/archive/share/zyx/NPB-GPU/CUDA/sys'
../sys/setparams cg E
make[1]: Warning: File 'npbparams.hpp' has modification time 0.0037 s in the future
nvcc -gencode arch=compute_80,code=sm_80 -Xcompiler -fopenmp -lgomp -use_fast_math -Xptxas --preserve-relocs -Xptxas -O3 -c -I../common -O3 cg.cu
cg.cu(111): warning #61-D: integer operation result is out of range
static int (*colidx)=(int*)malloc(sizeof(int)*((9000000*(26 +1)*(26 +1))));
^
Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"
cg.cu(111): warning #68-D: integer conversion resulted in a change of sign
static int (*colidx)=(int*)malloc(sizeof(int)*((9000000*(26 +1)*(26 +1))));
^
cg.cu(117): warning #61-D: integer operation result is out of range
static double (*a)=(double*)malloc(sizeof(double)*((9000000*(26 +1)*(26 +1))));
^
cg.cu(117): warning #68-D: integer conversion resulted in a change of sign
static double (*a)=(double*)malloc(sizeof(double)*((9000000*(26 +1)*(26 +1))));
^
cg.cu(403): warning #61-D: integer operation result is out of range
nzz = (9000000*(26 +1)*(26 +1));
^
cg.cu(1530): warning #61-D: integer operation result is out of range
size_colidx_device=(9000000*(26 +1)*(26 +1))*sizeof(int);
^
cg.cu(1530): warning #68-D: integer conversion resulted in a change of sign
size_colidx_device=(9000000*(26 +1)*(26 +1))*sizeof(int);
^
cg.cu(1536): warning #61-D: integer operation result is out of range
size_a_device=(9000000*(26 +1)*(26 +1))*sizeof(double);
^
cg.cu(1536): warning #68-D: integer conversion resulted in a change of sign
size_a_device=(9000000*(26 +1)*(26 +1))*sizeof(double);
^
cg.cu(111): warning #61-D: integer operation result is out of range
static int (*colidx)=(int*)malloc(sizeof(int)*((9000000*(26 +1)*(26 +1))));
^
Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"
cg.cu(111): warning #68-D: integer conversion resulted in a change of sign
static int (*colidx)=(int*)malloc(sizeof(int)*((9000000*(26 +1)*(26 +1))));
^
cg.cu(117): warning #61-D: integer operation result is out of range
static double (*a)=(double*)malloc(sizeof(double)*((9000000*(26 +1)*(26 +1))));
^
cg.cu(117): warning #68-D: integer conversion resulted in a change of sign
static double (*a)=(double*)malloc(sizeof(double)*((9000000*(26 +1)*(26 +1))));
^
cg.cu(403): warning #61-D: integer operation result is out of range
nzz = (9000000*(26 +1)*(26 +1));
^
cg.cu(1530): warning #61-D: integer operation result is out of range
size_colidx_device=(9000000*(26 +1)*(26 +1))*sizeof(int);
^
cg.cu(1530): warning #68-D: integer conversion resulted in a change of sign
size_colidx_device=(9000000*(26 +1)*(26 +1))*sizeof(int);
^
cg.cu(1536): warning #61-D: integer operation result is out of range
size_a_device=(9000000*(26 +1)*(26 +1))*sizeof(double);
^
cg.cu(1536): warning #68-D: integer conversion resulted in a change of sign
size_a_device=(9000000*(26 +1)*(26 +1))*sizeof(double);
^
cg.cu:111:72: warning: integer overflow in expression of type ‘int’ results in ‘-2028934592’ [-Woverflow]
111 | static int (*colidx)=(int*)malloc(sizeof(int)*(NZ));
| ~~ ^
cg.cu:117:76: warning: integer overflow in expression of type ‘int’ results in ‘-2028934592’ [-Woverflow]
117 | static double (*a)=(double*)malloc(sizeof(double)*(NZ));
| ~~ ^
cg.cu: In function ‘int main(int, char**)’:
cg.cu:403:36: warning: integer overflow in expression of type ‘int’ results in ‘-2028934592’ [-Woverflow]
403 | nzz = NZ;
| ~~~ ^
cg.cu: In function ‘void setup_gpu()’:
cg.cu:1530:52: warning: integer overflow in expression of type ‘int’ results in ‘-2028934592’ [-Woverflow]
1530 | size_colidx_device=NZ*sizeof(int);
| ~~~~~~~~~~~~ ^
cg.cu:1536:47: warning: integer overflow in expression of type ‘int’ results in ‘-2028934592’ [-Woverflow]
1536 | size_a_device=NZ*sizeof(double);
| ~~~~~~~~~~~~~~~ ^
In function ‘void __static_initialization_and_destruction_0()’,
inlined from ‘(static initializers for /tmp/tmpxft_00140463_00000000-6_cg.cudafe1.stub.c)’ at /tmp/tmpxft_00140463_00000000-6_cg.cudafe1.stub.c:4:27:
cg.cu:111:35: warning: argument 1 value ‘18446744065593813248’ exceeds maximum object size 9223372036854775807 [-Walloc-size-larger-than=]
111 | static int (*colidx)=(int*)malloc(sizeof(int)*(NZ));
| ~~~~~~^~~~~~~~~~~~~~~~~~
/usr/include/stdlib.h: In function ‘(static initializers for /tmp/tmpxft_00140463_00000000-6_cg.cudafe1.stub.c)’:
/usr/include/stdlib.h:672:14: note: in a call to allocation function ‘void* malloc(size_t)’ declared here
672 | extern void *malloc (size_t __size) __THROW __attribute_malloc__
| ^~~~~~
In function ‘void __static_initialization_and_destruction_0()’,
inlined from ‘(static initializers for /tmp/tmpxft_00140463_00000000-6_cg.cudafe1.stub.c)’ at /tmp/tmpxft_00140463_00000000-6_cg.cudafe1.stub.c:4:27:
cg.cu:117:36: warning: argument 1 value ‘18446744057478074880’ exceeds maximum object size 9223372036854775807 [-Walloc-size-larger-than=]
117 | static double (*a)=(double*)malloc(sizeof(double)*(NZ));
| ~~~~~~^~~~~~~~~~~~~~~~~~~~~
/usr/include/stdlib.h: In function ‘(static initializers for /tmp/tmpxft_00140463_00000000-6_cg.cudafe1.stub.c)’:
/usr/include/stdlib.h:672:14: note: in a call to allocation function ‘void* malloc(size_t)’ declared here
672 | extern void *malloc (size_t __size) __THROW __attribute_malloc__
| ^~~~~~
cd ../common; nvcc -gencode arch=compute_80,code=sm_80 -Xcompiler -fopenmp -lgomp -use_fast_math -Xptxas --preserve-relocs -Xptxas -O3 -c -I../common -O3 c_print_results.cpp
cd ../common; nvcc -gencode arch=compute_80,code=sm_80 -Xcompiler -fopenmp -lgomp -use_fast_math -Xptxas --preserve-relocs -Xptxas -O3 -c -I../common -O3 c_randdp.cpp
cd ../common; nvcc -gencode arch=compute_80,code=sm_80 -Xcompiler -fopenmp -lgomp -use_fast_math -Xptxas --preserve-relocs -Xptxas -O3 -c -I../common -O3 c_timers.cu
cd ../common; nvcc -gencode arch=compute_80,code=sm_80 -Xcompiler -fopenmp -lgomp -use_fast_math -Xptxas --preserve-relocs -Xptxas -O3 -c -I../common -O3 -o c_wtime.o ../common/wtime.cpp
nvcc -gencode arch=compute_80,code=sm_80 -Xcompiler -fopenmp -lgomp -use_fast_math -Xptxas --preserve-relocs -Xptxas -O3 -O3 -o ../bin/cg.E cg.o ../common/c_print_results.o ../common/c_randdp.o ../common/c_timers.o ../common/c_wtime.o -lm
make[1]: warning: Clock skew detected. Your build may be incomplete.
make[1]: Leaving directory '/archive/share/zyx/NPB-GPU/CUDA/CG'
❯ ./bin/cg.E
NAS Parallel Benchmarks 4.1 CUDA C++ version - CG Benchmark
Size: 9000000
Iterations: 100
[1] 1311990 segmentation fault (core dumped) ./bin/cg.E
Summary: when built for CLASS D or CLASS E, the program crashes with a segmentation fault on an A100-PCIe-40GB using CUDA 12.8.
The likely root cause is that memory allocation sizes and array indices in the original implementation are computed entirely in 32-bit `int`. At these problem sizes the computed values overflow (for CG CLASS E, `NZ` evaluates to `-2028934592` instead of 6,561,000,000), resulting in failed allocations and invalid memory accesses.