contraction errors when using C8 types on AMD gpus #22

@ajaypanyala

Description

I see error messages when contracting tensors of type complex double (C8) on AMD GPUs.

#MESSAGE: Printing TAL-SH task info:
 Device kind -1: Error 106
#END OF MESSAGE

I consistently see this error with ROCm versions 4.5.0, 4.5.2, and 5.1.0.

Below is a slimmed-down version of test.cpp that runs only the test_talsh_c routine, with the R8 occurrences changed to C8 to reproduce the error. The call to gpu_tensor_block_contract_dlf appears to be where things go wrong: it returns a task error code greater than 0 whenever the tensor type is C8. (A host-side control run, sketched after the listing, can help confirm the failure is specific to the GPU path.)
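For reference, a hypothetical compile-time toggle like the one below (the USE_C8 macro is illustrative, not part of the reproducer) makes it easy to flip between the working real-double case and the failing complex-double case, since R8 and C8 are the TAL-SH element-type constants used throughout:

#ifdef USE_C8
 const int ELEM_TYPE = C8; //complex double: triggers the error on AMD GPUs
#else
 const int ELEM_TYPE = R8; //real double: works as expected
#endif

The full reproducer follows: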

#include "talshxx.hpp"
#include "talsh.h"
#include "device_algebra.hip.h"

#include <iostream>
#include <memory>
#include <string>
#include <complex> 

#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <cassert>

void test_talsh_c(int * ierr)
{
 const int VDIM_SIZE=30; //virtual
 const int ODIM_SIZE=20; //occupied
 int errc;
 //size_t host_buffer_size=TALSH_NO_HOST_BUFFER;
 size_t host_buffer_size = 1024*1024*1024; //bytes
 int gpu_list[MAX_GPUS_PER_NODE];

 *ierr=0;

//Query the total number of GPUs on the node (the HIP build of TAL-SH appears to enumerate AMD GPUs under the DEV_NVIDIA_GPU device kind):
 int ngpu;
 errc=talshDeviceCount(DEV_NVIDIA_GPU,&ngpu); if(errc){*ierr=1; return;};
 printf(" Number of NVIDIA GPU found on node = %d\n",ngpu);

//Initialize TAL-SH with a 1 GiB Host buffer (the tensor blocks below are allocated inside this buffer):
 int host_arg_max;
 for(int i=0; i<ngpu; ++i) gpu_list[i]=i; //list of NVIDIA GPU devices to use in this process
 errc=talshInit(&host_buffer_size,&host_arg_max,ngpu,gpu_list,0,NULL,0,NULL);
 printf(" TAL-SH has been initialized: Status %d: Host buffer size = %lu\n",errc,host_buffer_size); if(errc){*ierr=2; return;};

//Construct three tensor blocks in the TAL-SH Host buffer (the commented-out alternatives register external application memory instead):
 //Tensor block 0 (destination D):
 int trank0 = 4; //tensor block rank
 const int dims0[] = {VDIM_SIZE,VDIM_SIZE,ODIM_SIZE,ODIM_SIZE}; //tensor block dimension extents
 //Tensor block 1 (left argument L):
 int trank1 = 4; //tensor block rank
 const int dims1[] = {VDIM_SIZE,VDIM_SIZE,VDIM_SIZE,VDIM_SIZE}; //tensor block dimension extents
 //Tensor block 2 (right argument R):
 int trank2 = 4; //tensor block rank
 const int dims2[] = {ODIM_SIZE,VDIM_SIZE,ODIM_SIZE,VDIM_SIZE}; //tensor block dimension extents

 talsh_tens_t tens0; //declare a TAL-SH tensor block
 errc = talshTensorClean(&tens0); if(errc){*ierr=3; return;}; //clean TAL-SH tensor block object (default ctor)
 errc = talshTensorConstruct(&tens0,C8,trank0,dims0,talshFlatDevId(DEV_HOST,0),NULL,-1,NULL,0.0); //construct tensor block in Host buffer
 //errc = talshTensorConstruct(&tens0,C8,trank0,dims0,talshFlatDevId(DEV_HOST,0),(void*)tblock0); //register tensor block with external memory
 if(errc){*ierr=4; return;};
 size_t vol0 = talshTensorVolume(&tens0);
 //Tensor block 1:
 talsh_tens_t tens1; //declare a TAL-SH tensor block
 errc = talshTensorClean(&tens1); if(errc){*ierr=5; return;}; //clean TAL-SH tensor block object (default ctor)
 errc = talshTensorConstruct(&tens1,C8,trank1,dims1,talshFlatDevId(DEV_HOST,0),NULL,-1,NULL,0.001); //construct tensor block in Host buffer
 //errc = talshTensorConstruct(&tens1,C8,trank1,dims1,talshFlatDevId(DEV_HOST,0),(void*)tblock1); //register tensor block with external memory
 if(errc){*ierr=6; return;};
 size_t vol1 = talshTensorVolume(&tens1);
 //Tensor block 2:
 talsh_tens_t tens2; //declare a TAL-SH tensor block
 errc = talshTensorClean(&tens2); if(errc){*ierr=7; return;}; //clean TAL-SH tensor block object (default ctor)
 errc = talshTensorConstruct(&tens2,C8,trank2,dims2,talshFlatDevId(DEV_HOST,0),NULL,-1,NULL,0.01); //construct tensor block in Host buffer
 //errc=talshTensorConstruct(&tens2,C8,trank2,dims2,talshFlatDevId(DEV_HOST,0),(void*)tblock2); //register tensor block with external memory
 if(errc){*ierr=8; return;};
 size_t vol2 = talshTensorVolume(&tens2);
 double gflops = (sqrt(((double)(vol0))*((double)(vol1))*((double)(vol2)))*2.0)/1e9; //total number of floating point operations (GFlops)
 double theor_norm1 = gflops * 0.01 * 0.001 * 1e9;
 printf(" Three TAL-SH tensor blocks have been constructed: Volumes: %lu, %lu, %lu: GFlops = %f\n",vol0,vol1,vol2,gflops);

//Declare a TAL-SH task handle:
 talsh_task_t task0; //declare a TAL-SH task handle
 errc=talshTaskClean(&task0); //clean TAL-SH task handle object to an empty state
 if(errc){*ierr=9; return;};

//Execute a tensor contraction either on CPU (synchronously) or GPU (asynchronously):
#ifndef NO_GPU
 int dev_kind = DEV_NVIDIA_GPU; //GPU devices (the HIP build uses this kind for AMD GPUs as well)
 int dev_num = 0; //specific device number (any from gpu_list[])
#else
 int dev_kind = DEV_HOST; //CPU Host (multicore)
 int dev_num = 0; //CPU Host is always a single device (but multicore)
#endif
 //Schedule:
 clock_t tms = clock();
 errc=talshTensorContract("D(a,b,i,j)+=L(c,b,d,a)*R(j,d,i,c)",&tens0,&tens1,&tens2,2.0,0.0,dev_num,dev_kind,COPY_MTT,YEP,&task0);
 printf(" Tensor contraction has been scheduled for execution: Status %d\n",errc); if(errc){*ierr=10; return;};
 //Test for completion: 
 int sts,done=NOPE;
 while(done != YEP && errc == TALSH_SUCCESS){done=talshTaskComplete(&task0,&sts,&errc);}
 double tm = ((double)(clock() - tms))/CLOCKS_PER_SEC;
 if(errc == TALSH_SUCCESS){
  printf(" Tensor contraction has completed successfully: Status %d: Time %f sec\n",sts,tm);
 }else{
  printf(" Tensor contraction has failed: Status %d: Error %d\n",sts,errc);
  *ierr=11; return;
 }
 //Timing:
 double total_time;
 errc=talshTaskTime(&task0,&total_time); if(errc){*ierr=12; return;};
 printf(" Tensor contraction total time = %f: GFlop/s = %f\n",total_time,gflops/total_time);
 //Destruct the task handle:
 errc=talshTaskDestruct(&task0); if(errc){*ierr=13; return;};
#ifndef NO_GPU
 //If executed on GPU, COPY_MTT parameter in the tensor contraction call above means that the
 //destination tensor image was moved to GPU device (letter M means MOVE).
 //So, let's move it back to Host (to a user-specified memory location):
 errc=talshTensorPlace(&tens0,0,DEV_HOST,NULL,COPY_M); //this will move the resulting tensor block back to Host (letter M means MOVE)
 if(errc){*ierr=14; return;};
#endif
 printf(" Tensor result was moved back to Host: Norm1 = %E: Correct = %E\n",talshTensorImageNorm1_cpu(&tens0),theor_norm1);

//Unregister tensor blocks with TAL-SH:
 errc=talshTensorDestruct(&tens2); if(errc){*ierr=15; return;};
 errc=talshTensorDestruct(&tens1); if(errc){*ierr=16; return;};
 errc=talshTensorDestruct(&tens0); if(errc){*ierr=17; return;};
 printf(" Three external tensor blocks have been unregistered with TAL-SH\n");

//Shutdown TAL-SH:
 errc=talshShutdown();
 printf(" TAL-SH has been shut down: Status %d\n",errc); if(errc){*ierr=18; return;};

 return;
}

int main(int argc, char* argv[]) {
  int ierr=0;
  test_talsh_c(&ierr);
  if(ierr != 0) printf(" test_talsh_c failed: Error %d\n",ierr);
  return ierr;
}
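
For comparison, here is a sketch of a host-side control run of the same contraction. It reuses only calls already present in the reproducer and is meant to be spliced into test_talsh_c after the GPU contraction completes (tens0, tens1, tens2, errc, and ierr are assumed in scope as above). If this completes with TALSH_SUCCESS for C8, the failure is specific to the GPU path:

 //Host-side control run (sketch): same C8 contraction, executed on the multicore Host:
 talsh_task_t task1;
 errc=talshTaskClean(&task1); if(errc){*ierr=20; return;};
 errc=talshTensorContract("D(a,b,i,j)+=L(c,b,d,a)*R(j,d,i,c)",&tens0,&tens1,&tens2,2.0,0.0,0,DEV_HOST,COPY_MTT,YEP,&task1);
 if(errc){*ierr=21; return;};
 int sts1,done1=NOPE;
 while(done1 != YEP && errc == TALSH_SUCCESS){done1=talshTaskComplete(&task1,&sts1,&errc);}
 printf(" Host control run: Status %d: Error %d\n",sts1,errc);
 errc=talshTaskDestruct(&task1); if(errc){*ierr=22; return;};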
