Skip to content

Integer overflow in memory indexing for CLASS D/E leads to segmentation fault #17

@yuxuan-z19

Description

@yuxuan-z19

When built for CLASS D or E, the benchmark crashes with a segmentation fault at startup on an A100-PCIe-40GB using CUDA 12.8.

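The root cause might be that memory allocation and address indexing in the original implementation are entirely based on 32-bit integers. At these problem sizes, the computed sizes and indices overflow: for CLASS E, NZ = 9000000*(26+1)*(26+1) = 6,561,000,000 exceeds INT_MAX (2,147,483,647) and wraps to -2028934592 (see the compiler warnings below), so malloc is asked for an impossibly large size and the subsequent memory accesses are invalid.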

cd CUDA
❯ make CG CLASS=E
   ===========================================
   =      NAS PARALLEL BENCHMARKS 4.1        =
   =      CUDA Versions                      =
   =      C++                                =
   ===========================================

cd CG; ../config/get_gpu_arch.sh; make CLASS=E
Updated COMPUTE_CAPABILITY (GPU of id 0) in ../config/make.def to -gencode arch=compute_80,code=sm_80
make[1]: Entering directory '/archive/share/zyx/NPB-GPU/CUDA/CG'
make[2]: Entering directory '/archive/share/zyx/NPB-GPU/CUDA/sys'
cc -o setparams setparams.cpp
setparams.cpp:139:1: warning: ISO C++ forbids declaration of ‘main’ with no type [-Wreturn-type]
  139 | main(int argc, char *argv[]){
      | ^~~~
setparams.cpp: In function ‘char* read_nvcc_cuda_version()’:
setparams.cpp:216:14: warning: ‘char* fgets(char*, int, FILE*)’ writing 1024 bytes into a region of size 64 overflows the destination [-Wstringop-overflow=]
  216 |         fgets(result, 1024 , file);
      |         ~~~~~^~~~~~~~~~~~~~~~~~~~~
setparams.cpp:212:14: note: destination object ‘result’ of size 64
  212 |         char result[64];
      |              ^~~~~~
In file included from /usr/include/c++/13/cstdio:42,
                 from setparams.cpp:82:
/usr/include/stdio.h:654:14: note: in a call to function ‘char* fgets(char*, int, FILE*)’ declared with attribute ‘access (write_only, 1, 2)’
  654 | extern char *fgets (char *__restrict __s, int __n, FILE *__restrict __stream)
      |              ^~~~~
make[2]: Leaving directory '/archive/share/zyx/NPB-GPU/CUDA/sys'
../sys/setparams cg E
make[1]: Warning: File 'npbparams.hpp' has modification time 0.0037 s in the future
nvcc -gencode arch=compute_80,code=sm_80 -Xcompiler -fopenmp -lgomp -use_fast_math -Xptxas --preserve-relocs -Xptxas -O3   -c -I../common  -O3 cg.cu
cg.cu(111): warning #61-D: integer operation result is out of range
  static int (*colidx)=(int*)malloc(sizeof(int)*((9000000*(26 +1)*(26 +1))));
                                                                 ^

Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"

cg.cu(111): warning #68-D: integer conversion resulted in a change of sign
  static int (*colidx)=(int*)malloc(sizeof(int)*((9000000*(26 +1)*(26 +1))));
                                                ^

cg.cu(117): warning #61-D: integer operation result is out of range
  static double (*a)=(double*)malloc(sizeof(double)*((9000000*(26 +1)*(26 +1))));
                                                                     ^

cg.cu(117): warning #68-D: integer conversion resulted in a change of sign
  static double (*a)=(double*)malloc(sizeof(double)*((9000000*(26 +1)*(26 +1))));
                                                    ^

cg.cu(403): warning #61-D: integer operation result is out of range
   nzz = (9000000*(26 +1)*(26 +1));
                         ^

cg.cu(1530): warning #61-D: integer operation result is out of range
   size_colidx_device=(9000000*(26 +1)*(26 +1))*sizeof(int);
                                      ^

cg.cu(1530): warning #68-D: integer conversion resulted in a change of sign
   size_colidx_device=(9000000*(26 +1)*(26 +1))*sizeof(int);
                      ^

cg.cu(1536): warning #61-D: integer operation result is out of range
   size_a_device=(9000000*(26 +1)*(26 +1))*sizeof(double);
                                 ^

cg.cu(1536): warning #68-D: integer conversion resulted in a change of sign
   size_a_device=(9000000*(26 +1)*(26 +1))*sizeof(double);
                 ^

cg.cu(111): warning #61-D: integer operation result is out of range
  static int (*colidx)=(int*)malloc(sizeof(int)*((9000000*(26 +1)*(26 +1))));
                                                                 ^

Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"

cg.cu(111): warning #68-D: integer conversion resulted in a change of sign
  static int (*colidx)=(int*)malloc(sizeof(int)*((9000000*(26 +1)*(26 +1))));
                                                ^

cg.cu(117): warning #61-D: integer operation result is out of range
  static double (*a)=(double*)malloc(sizeof(double)*((9000000*(26 +1)*(26 +1))));
                                                                     ^

cg.cu(117): warning #68-D: integer conversion resulted in a change of sign
  static double (*a)=(double*)malloc(sizeof(double)*((9000000*(26 +1)*(26 +1))));
                                                    ^

cg.cu(403): warning #61-D: integer operation result is out of range
   nzz = (9000000*(26 +1)*(26 +1));
                         ^

cg.cu(1530): warning #61-D: integer operation result is out of range
   size_colidx_device=(9000000*(26 +1)*(26 +1))*sizeof(int);
                                      ^

cg.cu(1530): warning #68-D: integer conversion resulted in a change of sign
   size_colidx_device=(9000000*(26 +1)*(26 +1))*sizeof(int);
                      ^

cg.cu(1536): warning #61-D: integer operation result is out of range
   size_a_device=(9000000*(26 +1)*(26 +1))*sizeof(double);
                                 ^

cg.cu(1536): warning #68-D: integer conversion resulted in a change of sign
   size_a_device=(9000000*(26 +1)*(26 +1))*sizeof(double);
                 ^

cg.cu:111:72: warning: integer overflow in expression of type ‘int’ results in ‘-2028934592’ [-Woverflow]
  111 | static int (*colidx)=(int*)malloc(sizeof(int)*(NZ));
      |                                                   ~~                   ^         
cg.cu:117:76: warning: integer overflow in expression of type ‘int’ results in ‘-2028934592’ [-Woverflow]
  117 | static double (*a)=(double*)malloc(sizeof(double)*(NZ));
      |                                                       ~~                   ^         
cg.cu: In function ‘int main(int, char**)’:
cg.cu:403:36: warning: integer overflow in expression of type ‘int’ results in ‘-2028934592’ [-Woverflow]
  403 |         nzz = NZ;
      |               ~~~                  ^         
cg.cu: In function ‘void setup_gpu()’:
cg.cu:1530:52: warning: integer overflow in expression of type ‘int’ results in ‘-2028934592’ [-Woverflow]
 1530 |         size_colidx_device=NZ*sizeof(int);
      |                               ~~~~~~~~~~~~         ^         
cg.cu:1536:47: warning: integer overflow in expression of type ‘int’ results in ‘-2028934592’ [-Woverflow]
 1536 |         size_a_device=NZ*sizeof(double);
      |                          ~~~~~~~~~~~~~~~      ^         
In function ‘void __static_initialization_and_destruction_0()’,
    inlined from ‘(static initializers for /tmp/tmpxft_00140463_00000000-6_cg.cudafe1.stub.c)’ at /tmp/tmpxft_00140463_00000000-6_cg.cudafe1.stub.c:4:27:
cg.cu:111:35: warning: argument 1 value ‘18446744065593813248’ exceeds maximum object size 9223372036854775807 [-Walloc-size-larger-than=]
  111 | static int (*colidx)=(int*)malloc(sizeof(int)*(NZ));
      |                             ~~~~~~^~~~~~~~~~~~~~~~~~                               
/usr/include/stdlib.h: In function ‘(static initializers for /tmp/tmpxft_00140463_00000000-6_cg.cudafe1.stub.c)’:
/usr/include/stdlib.h:672:14: note: in a call to allocation function ‘void* malloc(size_t)’ declared here
  672 | extern void *malloc (size_t __size) __THROW __attribute_malloc__
      |              ^~~~~~
In function ‘void __static_initialization_and_destruction_0()’,
    inlined from ‘(static initializers for /tmp/tmpxft_00140463_00000000-6_cg.cudafe1.stub.c)’ at /tmp/tmpxft_00140463_00000000-6_cg.cudafe1.stub.c:4:27:
cg.cu:117:36: warning: argument 1 value ‘18446744057478074880’ exceeds maximum object size 9223372036854775807 [-Walloc-size-larger-than=]
  117 | static double (*a)=(double*)malloc(sizeof(double)*(NZ));
      |                              ~~~~~~^~~~~~~~~~~~~~~~~~~~~                               
/usr/include/stdlib.h: In function ‘(static initializers for /tmp/tmpxft_00140463_00000000-6_cg.cudafe1.stub.c)’:
/usr/include/stdlib.h:672:14: note: in a call to allocation function ‘void* malloc(size_t)’ declared here
  672 | extern void *malloc (size_t __size) __THROW __attribute_malloc__
      |              ^~~~~~
cd ../common; nvcc -gencode arch=compute_80,code=sm_80 -Xcompiler -fopenmp -lgomp -use_fast_math -Xptxas --preserve-relocs -Xptxas -O3   -c -I../common  -O3 c_print_results.cpp
cd ../common; nvcc -gencode arch=compute_80,code=sm_80 -Xcompiler -fopenmp -lgomp -use_fast_math -Xptxas --preserve-relocs -Xptxas -O3   -c -I../common  -O3 c_randdp.cpp
cd ../common; nvcc -gencode arch=compute_80,code=sm_80 -Xcompiler -fopenmp -lgomp -use_fast_math -Xptxas --preserve-relocs -Xptxas -O3   -c -I../common  -O3 c_timers.cu
cd ../common; nvcc -gencode arch=compute_80,code=sm_80 -Xcompiler -fopenmp -lgomp -use_fast_math -Xptxas --preserve-relocs -Xptxas -O3   -c -I../common  -O3 -o c_wtime.o ../common/wtime.cpp
nvcc -gencode arch=compute_80,code=sm_80 -Xcompiler -fopenmp -lgomp -use_fast_math -Xptxas --preserve-relocs -Xptxas -O3  -O3 -o ../bin/cg.E cg.o ../common/c_print_results.o ../common/c_randdp.o ../common/c_timers.o ../common/c_wtime.o -lm 
make[1]: warning:  Clock skew detected.  Your build may be incomplete.
make[1]: Leaving directory '/archive/share/zyx/NPB-GPU/CUDA/CG'
❯ ./bin/cg.E


 NAS Parallel Benchmarks 4.1 CUDA C++ version - CG Benchmark

 Size:     9000000
 Iterations:   100
[1]    1311990 segmentation fault (core dumped)  ./bin/cg.E

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions