Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for AMD GPUs #619

Merged
merged 17 commits into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@ cmake_minimum_required(VERSION 3.11)

execute_process(COMMAND which nvcc OUTPUT_VARIABLE has_nvcc)
if(has_nvcc STREQUAL "")
project(qsim)
execute_process(COMMAND which hipcc OUTPUT_VARIABLE has_hipcc)
if(has_hipcc STREQUAL "")
project(qsim)
else()
project(qsim LANGUAGES CXX HIP)
ADD_SUBDIRECTORY(pybind_interface/hip)
endif()
else()
project(qsim LANGUAGES CXX CUDA)
ADD_SUBDIRECTORY(pybind_interface/cuda)
Expand Down
16 changes: 16 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ TESTS = run-cxx-tests

CXX=g++
NVCC=nvcc
HIPCC=hipcc

CXXFLAGS = -O3 -fopenmp
ARCHFLAGS = -march=native
NVCCFLAGS = -O3
HIPCCFLAGS = -O3

# CUQUANTUM_ROOT should be set.
CUSTATEVECFLAGS = -I$(CUQUANTUM_ROOT)/include -L${CUQUANTUM_ROOT}/lib -L$(CUQUANTUM_ROOT)/lib64 -lcustatevec -lcublas
Expand All @@ -22,6 +24,8 @@ export ARCHFLAGS
export NVCC
export NVCCFLAGS
export CUSTATEVECFLAGS
export HIPCC
export HIPCCFLAGS

ifeq ($(PYBIND11), true)
TARGETS += pybind
Expand All @@ -43,6 +47,10 @@ qsim-cuda:
qsim-custatevec:
$(MAKE) -C apps/ qsim-custatevec

.PHONY: qsim-hip
qsim-hip:
$(MAKE) -C apps/ qsim-hip

.PHONY: pybind
pybind:
$(MAKE) -C pybind_interface/ pybind
Expand All @@ -59,6 +67,10 @@ cuda-tests:
custatevec-tests:
$(MAKE) -C tests/ custatevec-tests

.PHONY: hip-tests
hip-tests:
$(MAKE) -C tests/ hip-tests

.PHONY: run-cxx-tests
run-cxx-tests: cxx-tests
$(MAKE) -C tests/ run-cxx-tests
Expand All @@ -71,6 +83,10 @@ run-cuda-tests: cuda-tests
run-custatevec-tests: custatevec-tests
$(MAKE) -C tests/ run-custatevec-tests

.PHONY: run-hip-tests
run-hip-tests: hip-tests
$(MAKE) -C tests/ run-hip-tests

PYTESTS = $(shell find qsimcirq_tests/ -name '*_test.py')

.PHONY: run-py-tests
Expand Down
9 changes: 9 additions & 0 deletions apps/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ CUDA_TARGETS := $(CUDA_TARGETS:%cuda.cu=%cuda.x)
CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec.cu")
CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevec.x)

HIP_TARGETS = $(shell find . -maxdepth 1 -name '*cuda.cu')
HIP_TARGETS := $(HIP_TARGETS:%cuda.cu=%hip.x)

.PHONY: qsim
qsim: $(CXX_TARGETS)

Expand All @@ -16,6 +19,9 @@ qsim-cuda: $(CUDA_TARGETS)
.PHONY: qsim-custatevec
qsim-custatevec: $(CUSTATEVEC_TARGETS)

.PHONY: qsim-hip
qsim-hip: $(HIP_TARGETS)

%.x: %.cc
$(CXX) -o ./$@ $< $(CXXFLAGS) $(ARCHFLAGS)

Expand All @@ -25,6 +31,9 @@ qsim-custatevec: $(CUSTATEVEC_TARGETS)
%custatevec.x: %custatevec.cu
$(NVCC) -o ./$@ $< $(NVCCFLAGS) $(CUSTATEVECFLAGS)

%hip.x: %cuda.cu
$(HIPCC) -o ./$@ $< $(HIPCCFLAGS)

.PHONY: clean
clean:
-rm -f ./*.x ./*.a ./*.so ./*.mod
16 changes: 11 additions & 5 deletions apps/make.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,15 @@ g++ -O3 -march=native -fopenmp -o qsim_amplitudes.x qsim_amplitudes.cc
g++ -O3 -march=native -fopenmp -o qsimh_base.x qsimh_base.cc
g++ -O3 -march=native -fopenmp -o qsimh_amplitudes.x qsimh_amplitudes.cc

nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu
if command -v nvcc &>/dev/null; then
nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu

# CUQUANTUM_ROOT should be set.
CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
if [ -n "$CUQUANTUM_ROOT" ]; then
CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
fi
elif command -v hipcc &>/dev/null; then
hipcc -O3 -o qsim_base_hip.x qsim_base_cuda.cu
hipcc -O3 -o qsim_qtrajectory_hip.x qsim_qtrajectory_cuda.cu
fi
61 changes: 61 additions & 0 deletions lib/cuda2hip.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright 2023 Advanced Micro Devices, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef SIMULATOR_CUDA2HIP_H_
#define SIMULATOR_CUDA2HIP_H_

#define cublasCaxpy hipblasCaxpy
#define cublasCdotc hipblasCdotc
#define cublasCreate hipblasCreate
#define cublasCscal hipblasCscal
#define cublasCsscal hipblasCsscal
#define cublasDestroy hipblasDestroy
#define cublasDznrm2 hipblasDznrm2
#define cublasHandle_t hipblasHandle_t
#define cublasScnrm2 hipblasScnrm2
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define cublasStatus_t hipblasStatus_t
#define cublasZaxpy hipblasZaxpy
#define cublasZdotc hipblasZdotc
#define cublasZdscal hipblasZdscal
#define cublasZscal hipblasZscal
#define cuCimagf hipCimagf
#define cuCimag hipCimag
#define cuComplex hipComplex
#define cuCrealf hipCrealf
#define cuCreal hipCreal
#define CUDA_C_32F HIPBLAS_C_32F
#define CUDA_C_64F HIPBLAS_C_64F
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaFree hipFree
#define cudaGetErrorString hipGetErrorString
#define cudaMalloc hipMalloc
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpy hipMemcpy
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemset hipMemset
#define cudaPeekAtLastError hipPeekAtLastError
#define cudaSuccess hipSuccess
#define cuDoubleComplex hipDoubleComplex

template <typename T>
__device__ __forceinline__ T __shfl_down_sync(
unsigned mask, T var, unsigned int delta, int width = warpSize) {
return __shfl_down(var, delta, width);
}

#endif // SIMULATOR_CUDA2HIP_H_
4 changes: 0 additions & 4 deletions lib/fuser_mqubit.h
Original file line number Diff line number Diff line change
Expand Up @@ -561,8 +561,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {
static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat,
std::vector<GateF*>& orphaned_gates,
std::vector<GateFused>& fused_gates) {
unsigned count = 0;

for (std::size_t i = 0; i < orphaned_gates.size(); ++i) {
auto ogate1 = orphaned_gates[i];

Expand All @@ -575,8 +573,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {

if (ogate2->visited == kFinal) continue;

++count;

unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size();

if (cur_size <= max_fused_size) {
Expand Down
13 changes: 9 additions & 4 deletions lib/simulator_cuda_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,15 @@
#ifndef SIMULATOR_CUDA_KERNELS_H_
#define SIMULATOR_CUDA_KERNELS_H_

#include <cuda.h>
#include <cuda_runtime.h>

#include "util_cuda.h"
#ifdef __NVCC__
#include <cuda.h>
#include <cuda_runtime.h>

#include "util_cuda.h"
#elif __HIP__
#include <hip/hip_runtime.h>
#include "cuda2hip.h"
#endif

namespace qsim {

Expand Down
9 changes: 7 additions & 2 deletions lib/statespace_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
#ifndef STATESPACE_CUDA_H_
#define STATESPACE_CUDA_H_

#include <cuda.h>
#ifdef __NVCC__
#include <cuda.h>
#elif __HIP__
#include <hip/hip_runtime.h>
#include "cuda2hip.h"
#endif

#include <algorithm>
#include <complex>
Expand Down Expand Up @@ -102,7 +107,7 @@ class StateSpaceCUDA :
}

void SetAllZeros(State& state) const {
cudaMemset(state.get(), 0, MinSize(state.num_qubits()) * sizeof(fp_type));
ErrorCheck(cudaMemset(state.get(), 0, MinSize(state.num_qubits()) * sizeof(fp_type)));
sergeisakov marked this conversation as resolved.
Show resolved Hide resolved
}

// Uniform superposition.
Expand Down
7 changes: 6 additions & 1 deletion lib/statespace_cuda_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
#ifndef STATESPACE_CUDA_KERNELS_H_
#define STATESPACE_CUDA_KERNELS_H_

#include <cuda.h>
#ifdef __NVCC__
#include <cuda.h>
#elif __HIP__
#include <hip/hip_runtime.h>
#include "cuda2hip.h"
#endif

#include "util_cuda.h"

Expand Down
6 changes: 5 additions & 1 deletion lib/util_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@
#ifndef UTIL_CUDA_H_
#define UTIL_CUDA_H_

#include <cuda.h>
#ifdef __NVCC__
#include <cuda.h>
#elif __HIP__
#include <hip/hip_runtime.h>
#endif

#include <cstdlib>

Expand Down
37 changes: 23 additions & 14 deletions lib/vectorspace_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,13 @@
#ifndef VECTORSPACE_CUDA_H_
#define VECTORSPACE_CUDA_H_

#include <cuda.h>
#include <cuda_runtime.h>
#ifdef __NVCC__
#include <cuda.h>
#include <cuda_runtime.h>
#elif __HIP__
#include <hip/hip_runtime.h>
#include "cuda2hip.h"
#endif

#include <memory>
#include <utility>
Expand All @@ -28,7 +33,7 @@ namespace detail {
inline void do_not_free(void*) {}

inline void free(void* ptr) {
cudaFree(ptr);
ErrorCheck(cudaFree(ptr));
}

} // namespace detail
Expand Down Expand Up @@ -114,29 +119,32 @@ class VectorSpaceCUDA {
return false;
}

cudaMemcpy(dest.get(), src.get(),
sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
cudaMemcpyDeviceToDevice);
ErrorCheck(
cudaMemcpy(dest.get(), src.get(),
sergeisakov marked this conversation as resolved.
Show resolved Hide resolved
sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
cudaMemcpyDeviceToDevice));

return true;
}

// It is the client's responsibility to make sure that dest has at least
// Impl::MinSize(src.num_qubits()) elements.
bool Copy(const Vector& src, fp_type* dest) const {
cudaMemcpy(dest, src.get(),
sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
cudaMemcpyDeviceToHost);
ErrorCheck(
cudaMemcpy(dest, src.get(),
sergeisakov marked this conversation as resolved.
Show resolved Hide resolved
sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
cudaMemcpyDeviceToHost));

return true;
}

// It is the client's responsibility to make sure that src has at least
// Impl::MinSize(dest.num_qubits()) elements.
bool Copy(const fp_type* src, Vector& dest) const {
cudaMemcpy(dest.get(), src,
sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
cudaMemcpyHostToDevice);
ErrorCheck(
cudaMemcpy(dest.get(), src,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be a four space indent if the line breaks after (.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
cudaMemcpyHostToDevice));

return true;
}
Expand All @@ -145,12 +153,13 @@ class VectorSpaceCUDA {
// min(size, Impl::MinSize(dest.num_qubits())) elements.
bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
size = std::min(size, Impl::MinSize(dest.num_qubits()));
cudaMemcpy(dest.get(), src, sizeof(fp_type) * size, cudaMemcpyHostToDevice);
ErrorCheck(
cudaMemcpy(dest.get(), src, sizeof(fp_type) * size, cudaMemcpyHostToDevice));
sergeisakov marked this conversation as resolved.
Show resolved Hide resolved
return true;
}

void DeviceSync() {
cudaDeviceSynchronize();
ErrorCheck(cudaDeviceSynchronize());
}

protected:
Expand Down
Loading