quantumlib · 95-martin-orion · Nov 9, 2023 · Aug 9, 2023 · Aug 9, 2023 · Aug 9, 2023
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -2,7 +2,13 @@ cmake_minimum_required(VERSION 3.11)
 
 execute_process(COMMAND which nvcc OUTPUT_VARIABLE has_nvcc)
 if(has_nvcc STREQUAL "")
-    project(qsim)
+    execute_process(COMMAND which hipcc OUTPUT_VARIABLE has_hipcc)
+    if(has_hipcc STREQUAL "")
+        project(qsim)
+    else()
+        project(qsim LANGUAGES CXX HIP)
+        ADD_SUBDIRECTORY(pybind_interface/hip)
+    endif()
 else()
     project(qsim LANGUAGES CXX CUDA)
     ADD_SUBDIRECTORY(pybind_interface/cuda)

diff --git a/Makefile b/Makefile
@@ -6,10 +6,12 @@ TESTS = run-cxx-tests
 
 CXX=g++
 NVCC=nvcc
+HIPCC=hipcc
 
 CXXFLAGS = -O3 -fopenmp
 ARCHFLAGS = -march=native
 NVCCFLAGS = -O3
+HIPCCFLAGS = -O3
 
 # CUQUANTUM_ROOT should be set.
 CUSTATEVECFLAGS = -I$(CUQUANTUM_ROOT)/include -L${CUQUANTUM_ROOT}/lib -L$(CUQUANTUM_ROOT)/lib64 -lcustatevec -lcublas
@@ -22,6 +24,8 @@ export ARCHFLAGS
 export NVCC
 export NVCCFLAGS
 export CUSTATEVECFLAGS
+export HIPCC
+export HIPCCFLAGS
 
 ifeq ($(PYBIND11), true)
   TARGETS += pybind
@@ -43,6 +47,10 @@ qsim-cuda:
 qsim-custatevec:
 	$(MAKE) -C apps/ qsim-custatevec
 
+.PHONY: qsim-hip
+qsim-hip:
+	$(MAKE) -C apps/ qsim-hip
+
 .PHONY: pybind
 pybind:
 	$(MAKE) -C pybind_interface/ pybind
@@ -59,6 +67,10 @@ cuda-tests:
 custatevec-tests:
 	$(MAKE) -C tests/ custatevec-tests
 
+.PHONY: hip-tests
+hip-tests:
+	$(MAKE) -C tests/ hip-tests
+
 .PHONY: run-cxx-tests
 run-cxx-tests: cxx-tests
 	$(MAKE) -C tests/ run-cxx-tests
@@ -71,6 +83,10 @@ run-cuda-tests: cuda-tests
 run-custatevec-tests: custatevec-tests
 	$(MAKE) -C tests/ run-custatevec-tests
 
+.PHONY: run-hip-tests
+run-hip-tests: hip-tests
+	$(MAKE) -C tests/ run-hip-tests
+
 PYTESTS = $(shell find qsimcirq_tests/ -name '*_test.py')
 
 .PHONY: run-py-tests

diff --git a/apps/Makefile b/apps/Makefile
@@ -7,6 +7,9 @@ CUDA_TARGETS := $(CUDA_TARGETS:%cuda.cu=%cuda.x)
 CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec.cu")
 CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevec.x)
 
+HIP_TARGETS = $(shell find . -maxdepth 1 -name '*cuda.cu')
+HIP_TARGETS := $(HIP_TARGETS:%cuda.cu=%hip.x)
+
 .PHONY: qsim
 qsim: $(CXX_TARGETS)
 
@@ -16,6 +19,9 @@ qsim-cuda: $(CUDA_TARGETS)
 .PHONY: qsim-custatevec
 qsim-custatevec: $(CUSTATEVEC_TARGETS)
 
+.PHONY: qsim-hip
+qsim-hip: $(HIP_TARGETS)
+
 %.x: %.cc
 	$(CXX) -o ./$@ $< $(CXXFLAGS) $(ARCHFLAGS)
 
@@ -25,6 +31,9 @@ qsim-custatevec: $(CUSTATEVEC_TARGETS)
 %custatevec.x: %custatevec.cu
 	$(NVCC) -o ./$@ $< $(NVCCFLAGS) $(CUSTATEVECFLAGS)
 
+%hip.x: %cuda.cu
+	$(HIPCC) -o ./$@ $< $(HIPCCFLAGS)
+
 .PHONY: clean
 clean:
 	-rm -f ./*.x ./*.a ./*.so ./*.mod
diff --git a/apps/make.sh b/apps/make.sh
@@ -23,9 +23,15 @@ g++ -O3 -march=native -fopenmp -o qsim_amplitudes.x qsim_amplitudes.cc
 g++ -O3 -march=native -fopenmp -o qsimh_base.x qsimh_base.cc
 g++ -O3 -march=native -fopenmp -o qsimh_amplitudes.x qsimh_amplitudes.cc
 
-nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
-nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu
+if command -v nvcc >/dev/null 2>&1; then  # CUDA apps, only if nvcc is on PATH
+    nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
+    nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu
 
-# CUQUANTUM_ROOT should be set.
-CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
-nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
+    if [ -n "$CUQUANTUM_ROOT" ]; then  # custatevec app needs CUQUANTUM_ROOT
+        CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
+        nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
+    fi
+elif command -v hipcc >/dev/null 2>&1; then  # no nvcc: build the same .cu with hipcc
+    hipcc -O3 -o qsim_base_hip.x qsim_base_cuda.cu
+    hipcc -O3 -o qsim_qtrajectory_hip.x qsim_qtrajectory_cuda.cu
+fi
diff --git a/lib/cuda2hip.h b/lib/cuda2hip.h
@@ -0,0 +1,74 @@
+// Copyright 2023 Advanced Micro Devices, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_CUDA2HIP_H_
+#define SIMULATOR_CUDA2HIP_H_
+
+// Maps the CUDA runtime, cuBLAS and cuComplex identifiers used by the
+// *_cuda*.h headers onto their HIP / hipBLAS counterparts, so that the same
+// sources build with hipcc. Include this header only when compiling with HIP,
+// never together with the real CUDA headers.
+
+// Needed by the __shfl_down_sync shim below (__device__, warpSize,
+// __shfl_down); keeps this header self-contained.
+#include <hip/hip_runtime.h>
+
+#define cublasCaxpy              hipblasCaxpy
+#define cublasCdotc              hipblasCdotc
+#define cublasCreate             hipblasCreate
+#define cublasCscal              hipblasCscal
+#define cublasCsscal             hipblasCsscal
+#define cublasDestroy            hipblasDestroy
+#define cublasDznrm2             hipblasDznrm2
+#define cublasHandle_t           hipblasHandle_t
+#define cublasScnrm2             hipblasScnrm2
+#define CUBLAS_STATUS_SUCCESS    HIPBLAS_STATUS_SUCCESS
+#define cublasStatus_t           hipblasStatus_t
+#define cublasZaxpy              hipblasZaxpy
+#define cublasZdotc              hipblasZdotc
+#define cublasZdscal             hipblasZdscal
+#define cublasZscal              hipblasZscal
+#define cuCimagf                 hipCimagf
+#define cuCimag                  hipCimag
+#define cuComplex                hipComplex
+#define cuCrealf                 hipCrealf
+#define cuCreal                  hipCreal
+#define CUDA_C_32F               HIPBLAS_C_32F
+#define CUDA_C_64F               HIPBLAS_C_64F
+#define cudaDeviceSynchronize    hipDeviceSynchronize
+#define cudaError_t              hipError_t
+#define cudaFree                 hipFree
+#define cudaGetErrorString       hipGetErrorString
+#define cudaMalloc               hipMalloc
+#define cudaMemcpyAsync          hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost   hipMemcpyDeviceToHost
+#define cudaMemcpy               hipMemcpy
+#define cudaMemcpyHostToDevice   hipMemcpyHostToDevice
+#define cudaMemset               hipMemset
+#define cudaPeekAtLastError      hipPeekAtLastError
+#define cudaSuccess              hipSuccess
+#define cuDoubleComplex          hipDoubleComplex
+
+// Stand-in for CUDA's __shfl_down_sync: forwards to __shfl_down. The `mask`
+// argument is accepted for source compatibility only and is ignored.
+// NOTE(review): newer ROCm releases may ship their own __shfl_down_sync;
+// confirm this definition does not clash with it on the targeted versions.
+template <typename T>
+__device__ __forceinline__ T __shfl_down_sync(
+    unsigned mask, T var, unsigned int delta, int width = warpSize) {
+  return __shfl_down(var, delta, width);
+}
+
+#endif  // SIMULATOR_CUDA2HIP_H_
diff --git a/lib/fuser_mqubit.h b/lib/fuser_mqubit.h
@@ -561,8 +561,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {
   static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat,
                                 std::vector<GateF*>& orphaned_gates,
                                 std::vector<GateFused>& fused_gates) {
-    unsigned count = 0;
-
     for (std::size_t i = 0; i < orphaned_gates.size(); ++i) {
       auto ogate1 = orphaned_gates[i];
 
@@ -575,8 +573,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {
 
         if (ogate2->visited == kFinal) continue;
 
-        ++count;
-
         unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size();
 
         if (cur_size <= max_fused_size) {

diff --git a/lib/simulator_cuda_kernels.h b/lib/simulator_cuda_kernels.h
@@ -15,10 +15,15 @@
 #ifndef SIMULATOR_CUDA_KERNELS_H_
 #define SIMULATOR_CUDA_KERNELS_H_
 
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-#include "util_cuda.h"
+#ifdef __NVCC__  // CUDA toolchain
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+  // NOTE(review): util_cuda.h is skipped on the HIP path; confirm intended.
+  #include "util_cuda.h"
+#elif defined(__HIP__)  // HIP toolchain
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"  // CUDA -> HIP identifier mapping
+#endif
 
 namespace qsim {
 

diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h
@@ -15,7 +15,12 @@
 #ifndef STATESPACE_CUDA_H_
 #define STATESPACE_CUDA_H_
 
-#include <cuda.h>
+#ifdef __NVCC__  // CUDA toolchain
+  #include <cuda.h>
+#elif defined(__HIP__)  // HIP toolchain
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"  // CUDA -> HIP identifier mapping
+#endif
 
 #include <algorithm>
 #include <complex>
@@ -102,7 +107,7 @@ class StateSpaceCUDA :
   }
 
   void SetAllZeros(State& state) const {
-    cudaMemset(state.get(), 0, MinSize(state.num_qubits()) * sizeof(fp_type));
+    ErrorCheck(cudaMemset(state.get(), 0, MinSize(state.num_qubits()) * sizeof(fp_type)));
   }
 
   // Uniform superposition.

diff --git a/lib/statespace_cuda_kernels.h b/lib/statespace_cuda_kernels.h
@@ -15,7 +15,12 @@
 #ifndef STATESPACE_CUDA_KERNELS_H_
 #define STATESPACE_CUDA_KERNELS_H_
 
-#include <cuda.h>
+#ifdef __NVCC__  // CUDA toolchain
+  #include <cuda.h>
+#elif defined(__HIP__)  // HIP toolchain
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"  // CUDA -> HIP identifier mapping
+#endif
 
 #include "util_cuda.h"
 

diff --git a/lib/util_cuda.h b/lib/util_cuda.h
@@ -15,7 +15,11 @@
 #ifndef UTIL_CUDA_H_
 #define UTIL_CUDA_H_
 
-#include <cuda.h>
+#ifdef __NVCC__  // CUDA toolchain
+  #include <cuda.h>
+#elif defined(__HIP__)  // HIP toolchain
+  #include <hip/hip_runtime.h>
+#endif
 
 #include <cstdlib>
 

diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h
@@ -15,8 +15,13 @@
 #ifndef VECTORSPACE_CUDA_H_
 #define VECTORSPACE_CUDA_H_
 
-#include <cuda.h>
-#include <cuda_runtime.h>
+#ifdef __NVCC__  // CUDA toolchain
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+#elif defined(__HIP__)  // HIP toolchain
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"  // CUDA -> HIP identifier mapping
+#endif
 
 #include <memory>
 #include <utility>
@@ -28,7 +33,7 @@ namespace detail {
 inline void do_not_free(void*) {}
 
 inline void free(void* ptr) {
-  cudaFree(ptr);
+  ErrorCheck(cudaFree(ptr));
 }
 
 }  // namespace detail
@@ -114,29 +119,32 @@ class VectorSpaceCUDA {
       return false;
     }
 
-    cudaMemcpy(dest.get(), src.get(),
-               sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
-               cudaMemcpyDeviceToDevice);
+    ErrorCheck(
+      cudaMemcpy(dest.get(), src.get(),
+                 sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
+                 cudaMemcpyDeviceToDevice));
 
     return true;
   }
 
   // It is the client's responsibility to make sure that dest has at least
   // Impl::MinSize(src.num_qubits()) elements.
   bool Copy(const Vector& src, fp_type* dest) const {
-    cudaMemcpy(dest, src.get(),
-               sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
-               cudaMemcpyDeviceToHost);
+    ErrorCheck(
+      cudaMemcpy(dest, src.get(),
+                 sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
+                 cudaMemcpyDeviceToHost));
 
     return true;
   }
 
   // It is the client's responsibility to make sure that src has at least
   // Impl::MinSize(dest.num_qubits()) elements.
   bool Copy(const fp_type* src, Vector& dest) const {
-    cudaMemcpy(dest.get(), src,
-               sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
-               cudaMemcpyHostToDevice);
+    ErrorCheck(
+      cudaMemcpy(dest.get(), src,
+                 sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
+                 cudaMemcpyHostToDevice));
 
     return true;
   }
@@ -145,12 +153,13 @@ class VectorSpaceCUDA {
   // min(size, Impl::MinSize(dest.num_qubits())) elements.
   bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
     size = std::min(size, Impl::MinSize(dest.num_qubits()));
-    cudaMemcpy(dest.get(), src, sizeof(fp_type) * size, cudaMemcpyHostToDevice);
+    ErrorCheck(
+      cudaMemcpy(dest.get(), src, sizeof(fp_type) * size, cudaMemcpyHostToDevice));
     return true;
   }
 
   void DeviceSync() {
-    cudaDeviceSynchronize();
+    ErrorCheck(cudaDeviceSynchronize());
   }
 
  protected: