From aa76b30773a98764976488895c359ef0f3032b22 Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Wed, 9 Aug 2023 13:45:36 -0400
Subject: [PATCH 01/15] Remove unused lambda captures

---
 pybind_interface/pybind_main.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/pybind_interface/pybind_main.cpp b/pybind_interface/pybind_main.cpp
index c5615ebc..63a03e4a 100644
--- a/pybind_interface/pybind_main.cpp
+++ b/pybind_interface/pybind_main.cpp
@@ -377,7 +377,7 @@ std::vector<std::complex<float>> qsim_simulate(const py::dict &options) {
   std::vector<std::complex<float>> amplitudes;
   amplitudes.reserve(bitstrings.size());
 
-  auto measure = [&bitstrings, &circuit, &amplitudes](
+  auto measure = [&bitstrings, &amplitudes](
                      unsigned k, const StateSpace &state_space,
                      const State &state) {
     for (const auto &b : bitstrings) {
@@ -481,7 +481,7 @@ std::vector<std::complex<float>> qtrajectory_simulate(const py::dict &options) {
   Simulator simulator = factory.CreateSimulator();
   StateSpace state_space = factory.CreateStateSpace();
 
-  auto measure = [&bitstrings, &ncircuit, &amplitudes, &state_space](
+  auto measure = [&bitstrings, &amplitudes, &state_space](
                   unsigned k, const State &state, Runner::Stat& stat) {
     for (const auto &b : bitstrings) {
       amplitudes.push_back(state_space.GetAmpl(state, b));
@@ -1082,7 +1082,7 @@ std::vector<unsigned> qtrajectory_sample(const py::dict &options) {
 
   std::vector<std::vector<unsigned>> results;
 
-  auto measure = [&results, &ncircuit, &state_space](
+  auto measure = [&results, &ncircuit](
                   unsigned k, const State& state, Runner::Stat& stat) {
     // Converts stat (which matches the MeasurementResult 'bits' field) into
     // bitstrings matching the MeasurementResult 'bitstring' field.

From 036654dfa75e450551b6d9a0db41055566e2efce Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Wed, 9 Aug 2023 13:48:30 -0400
Subject: [PATCH 02/15] Remove unused variable

---
 lib/fuser_mqubit.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/lib/fuser_mqubit.h b/lib/fuser_mqubit.h
index aa4e6305..fdbfddb0 100644
--- a/lib/fuser_mqubit.h
+++ b/lib/fuser_mqubit.h
@@ -561,8 +561,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {
   static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat,
                                 std::vector<GateF*>& orphaned_gates,
                                 std::vector<GateFused>& fused_gates) {
-    unsigned count = 0;
-
     for (std::size_t i = 0; i < orphaned_gates.size(); ++i) {
       auto ogate1 = orphaned_gates[i];
 
@@ -575,8 +573,6 @@ class MultiQubitGateFuser final : public Fuser<IO, Gate> {
 
         if (ogate2->visited == kFinal) continue;
 
-        ++count;
-
         unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size();
 
         if (cur_size <= max_fused_size) {

From 61c9d09d22924edd242e3e2e1ee16244501d9ab7 Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Wed, 9 Aug 2023 13:53:55 -0400
Subject: [PATCH 03/15] Add missing CUDA error checks

---
 lib/statespace_cuda.h  |  2 +-
 lib/vectorspace_cuda.h | 28 ++++++++++++++++------------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h
index 82f5128b..8f3b2055 100644
--- a/lib/statespace_cuda.h
+++ b/lib/statespace_cuda.h
@@ -102,7 +102,7 @@ class StateSpaceCUDA :
   }
 
   void SetAllZeros(State& state) const {
-    cudaMemset(state.get(), 0, MinSize(state.num_qubits()) * sizeof(fp_type));
+    ErrorCheck(cudaMemset(state.get(), 0, MinSize(state.num_qubits()) * sizeof(fp_type)));
   }
 
   // Uniform superposition.
diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h
index 0c1d6167..de210734 100644
--- a/lib/vectorspace_cuda.h
+++ b/lib/vectorspace_cuda.h
@@ -28,7 +28,7 @@ namespace detail {
 inline void do_not_free(void*) {}
 
 inline void free(void* ptr) {
-  cudaFree(ptr);
+  ErrorCheck(cudaFree(ptr));
 }
 
 }  // namespace detail
@@ -114,9 +114,10 @@ class VectorSpaceCUDA {
       return false;
     }
 
-    cudaMemcpy(dest.get(), src.get(),
-               sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
-               cudaMemcpyDeviceToDevice);
+    ErrorCheck(
+      cudaMemcpy(dest.get(), src.get(),
+                 sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
+                 cudaMemcpyDeviceToDevice));
 
     return true;
   }
@@ -124,9 +125,10 @@ class VectorSpaceCUDA {
   // It is the client's responsibility to make sure that dest has at least
   // Impl::MinSize(src.num_qubits()) elements.
   bool Copy(const Vector& src, fp_type* dest) const {
-    cudaMemcpy(dest, src.get(),
-               sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
-               cudaMemcpyDeviceToHost);
+    ErrorCheck(
+      cudaMemcpy(dest, src.get(),
+                 sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
+                 cudaMemcpyDeviceToHost));
 
     return true;
   }
@@ -134,9 +136,10 @@ class VectorSpaceCUDA {
   // It is the client's responsibility to make sure that src has at least
   // Impl::MinSize(dest.num_qubits()) elements.
   bool Copy(const fp_type* src, Vector& dest) const {
-    cudaMemcpy(dest.get(), src,
-               sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
-               cudaMemcpyHostToDevice);
+    ErrorCheck(
+      cudaMemcpy(dest.get(), src,
+                 sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
+                 cudaMemcpyHostToDevice));
 
     return true;
   }
@@ -145,12 +148,13 @@ class VectorSpaceCUDA {
   // min(size, Impl::MinSize(dest.num_qubits())) elements.
   bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
     size = std::min(size, Impl::MinSize(dest.num_qubits()));
-    cudaMemcpy(dest.get(), src, sizeof(fp_type) * size, cudaMemcpyHostToDevice);
+    ErrorCheck(
+      cudaMemcpy(dest.get(), src, sizeof(fp_type) * size, cudaMemcpyHostToDevice));
     return true;
   }
 
   void DeviceSync() {
-    cudaDeviceSynchronize();
+    ErrorCheck(cudaDeviceSynchronize());
   }
 
  protected:

From caa5463d6e0f6ee52719d6d84594b06774218594 Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Wed, 16 Aug 2023 15:13:35 -0400
Subject: [PATCH 04/15] Add CUDA to HIP translation

---
 lib/cuda2hip.h                | 61 +++++++++++++++++++++++++++++++++++
 lib/simulator_cuda_kernels.h  | 13 +++++---
 lib/statespace_cuda.h         |  7 +++-
 lib/statespace_cuda_kernels.h |  7 +++-
 lib/util_cuda.h               |  6 +++-
 lib/vectorspace_cuda.h        |  9 ++++--
 6 files changed, 94 insertions(+), 9 deletions(-)
 create mode 100644 lib/cuda2hip.h

diff --git a/lib/cuda2hip.h b/lib/cuda2hip.h
new file mode 100644
index 00000000..da2d074c
--- /dev/null
+++ b/lib/cuda2hip.h
@@ -0,0 +1,61 @@
+// Copyright 2023 Advanced Micro Devices, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_CUDA2HIP_H_
+#define SIMULATOR_CUDA2HIP_H_
+
+#define cublasCaxpy              hipblasCaxpy
+#define cublasCdotc              hipblasCdotc
+#define cublasCreate             hipblasCreate
+#define cublasCscal              hipblasCscal
+#define cublasCsscal             hipblasCsscal
+#define cublasDestroy            hipblasDestroy
+#define cublasDznrm2             hipblasDznrm2
+#define cublasHandle_t           hipblasHandle_t
+#define cublasScnrm2             hipblasScnrm2
+#define CUBLAS_STATUS_SUCCESS    HIPBLAS_STATUS_SUCCESS
+#define cublasStatus_t           hipblasStatus_t
+#define cublasZaxpy              hipblasZaxpy
+#define cublasZdotc              hipblasZdotc
+#define cublasZdscal             hipblasZdscal
+#define cublasZscal              hipblasZscal
+#define cuCimagf                 hipCimagf
+#define cuCimag                  hipCimag
+#define cuComplex                hipComplex
+#define cuCrealf                 hipCrealf
+#define cuCreal                  hipCreal
+#define CUDA_C_32F               HIPBLAS_C_32F
+#define CUDA_C_64F               HIPBLAS_C_64F
+#define cudaDeviceSynchronize    hipDeviceSynchronize
+#define cudaError_t              hipError_t
+#define cudaFree                 hipFree
+#define cudaGetErrorString       hipGetErrorString
+#define cudaMalloc               hipMalloc
+#define cudaMemcpyAsync          hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost   hipMemcpyDeviceToHost
+#define cudaMemcpy               hipMemcpy
+#define cudaMemcpyHostToDevice   hipMemcpyHostToDevice
+#define cudaMemset               hipMemset
+#define cudaPeekAtLastError      hipPeekAtLastError
+#define cudaSuccess              hipSuccess
+#define cuDoubleComplex          hipDoubleComplex
+
+template <typename T>
+__device__ __forceinline__ T __shfl_down_sync(
+    unsigned mask, T var, unsigned int delta, int width = warpSize) {
+  return __shfl_down(var, delta, width);
+}
+
+#endif  // SIMULATOR_CUDA2HIP_H_
diff --git a/lib/simulator_cuda_kernels.h b/lib/simulator_cuda_kernels.h
index 6510fadf..e21a9d62 100644
--- a/lib/simulator_cuda_kernels.h
+++ b/lib/simulator_cuda_kernels.h
@@ -15,10 +15,15 @@
 #ifndef SIMULATOR_CUDA_KERNELS_H_
 #define SIMULATOR_CUDA_KERNELS_H_
 
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-#include "util_cuda.h"
+#ifdef __NVCC__
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+
+  #include "util_cuda.h"
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
 
 namespace qsim {
 
diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h
index 8f3b2055..81780150 100644
--- a/lib/statespace_cuda.h
+++ b/lib/statespace_cuda.h
@@ -15,7 +15,12 @@
 #ifndef STATESPACE_CUDA_H_
 #define STATESPACE_CUDA_H_
 
-#include <cuda.h>
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
 
 #include <algorithm>
 #include <complex>
diff --git a/lib/statespace_cuda_kernels.h b/lib/statespace_cuda_kernels.h
index bcb7fd25..b54ebca9 100644
--- a/lib/statespace_cuda_kernels.h
+++ b/lib/statespace_cuda_kernels.h
@@ -15,7 +15,12 @@
 #ifndef STATESPACE_CUDA_KERNELS_H_
 #define STATESPACE_CUDA_KERNELS_H_
 
-#include <cuda.h>
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
 
 #include "util_cuda.h"
 
diff --git a/lib/util_cuda.h b/lib/util_cuda.h
index 591d852d..5d8cb5df 100644
--- a/lib/util_cuda.h
+++ b/lib/util_cuda.h
@@ -15,7 +15,11 @@
 #ifndef UTIL_CUDA_H_
 #define UTIL_CUDA_H_
 
-#include <cuda.h>
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+#endif
 
 #include <cstdlib>
 
diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h
index de210734..ebf21095 100644
--- a/lib/vectorspace_cuda.h
+++ b/lib/vectorspace_cuda.h
@@ -15,8 +15,13 @@
 #ifndef VECTORSPACE_CUDA_H_
 #define VECTORSPACE_CUDA_H_
 
-#include <cuda.h>
-#include <cuda_runtime.h>
+#ifdef __NVCC__
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
 
 #include <memory>
 #include <utility>

From 6dd69e2095393697083d449b755e1eaf7d2675bd Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Wed, 16 Aug 2023 15:42:57 -0400
Subject: [PATCH 05/15] Add HIP to pybind11

---
 pybind_interface/decide/decide.cpp       |  4 +-
 pybind_interface/hip/CMakeLists.txt      | 28 ++++++++++++++
 pybind_interface/hip/pybind_main_hip.cpp | 47 ++++++++++++++++++++++++
 pybind_interface/hip/pybind_main_hip.h   | 17 +++++++++
 qsimcirq/__init__.py                     |  2 +
 setup.py                                 |  1 +
 6 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 pybind_interface/hip/CMakeLists.txt
 create mode 100644 pybind_interface/hip/pybind_main_hip.cpp
 create mode 100644 pybind_interface/hip/pybind_main_hip.h

diff --git a/pybind_interface/decide/decide.cpp b/pybind_interface/decide/decide.cpp
index 6355ad3b..5e9a63f1 100644
--- a/pybind_interface/decide/decide.cpp
+++ b/pybind_interface/decide/decide.cpp
@@ -60,7 +60,7 @@ int detect_instructions() {
 }
 
 enum GPUCapabilities {
-    CUDA = 0, CUSTATEVEC = 1, NO_GPU = 10, NO_CUSTATEVEC = 11 };
+    CUDA = 0, CUSTATEVEC = 1, HIP = 2, NO_GPU = 10, NO_CUSTATEVEC = 11 };
 
 // For now, GPU detection is performed at compile time, as our wheels are
 // generated on Github Actions runners which do not have GPU support.
@@ -70,6 +70,8 @@ enum GPUCapabilities {
 int detect_gpu() {
   #ifdef __NVCC__
   GPUCapabilities gpu = CUDA;
+  #elif __HIP__
+  GPUCapabilities gpu = HIP;
   #else
   GPUCapabilities gpu = NO_GPU;
   #endif
diff --git a/pybind_interface/hip/CMakeLists.txt b/pybind_interface/hip/CMakeLists.txt
new file mode 100644
index 00000000..3e89113d
--- /dev/null
+++ b/pybind_interface/hip/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.18)
+project(qsim LANGUAGES CXX HIP)
+
+IF (WIN32)
+    set(CMAKE_CXX_FLAGS "/O2 /openmp")
+ELSE()
+    set(CMAKE_CXX_FLAGS "-O3 -fopenmp")
+ENDIF()
+
+if(APPLE)
+    set(CMAKE_CXX_STANDARD 14)
+    include_directories("/usr/local/include" "/usr/local/opt/llvm/include")
+    link_directories("/usr/local/lib" "/usr/local/opt/llvm/lib")
+endif()
+
+INCLUDE(../GetPybind11.cmake)
+find_package(PythonLibs 3.7 REQUIRED)
+find_package(HIP REQUIRED)
+
+include_directories(${PYTHON_INCLUDE_DIRS} ${pybind11_SOURCE_DIR}/include)
+
+hip_add_library(qsim_hip MODULE pybind_main_hip.cpp)
+
+set_target_properties(qsim_hip PROPERTIES
+       PREFIX "${PYTHON_MODULE_PREFIX}"
+       SUFFIX "${PYTHON_MODULE_EXTENSION}"
+)
+set_source_files_properties(pybind_main_hip.cpp PROPERTIES LANGUAGE HIP)
diff --git a/pybind_interface/hip/pybind_main_hip.cpp b/pybind_interface/hip/pybind_main_hip.cpp
new file mode 100644
index 00000000..d6ee181f
--- /dev/null
+++ b/pybind_interface/hip/pybind_main_hip.cpp
@@ -0,0 +1,47 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "pybind_main_hip.h"
+
+#include "../../lib/simulator_cuda.h"
+
+namespace qsim {
+  using Simulator = SimulatorCUDA<float>;
+
+  struct Factory {
+    using Simulator = qsim::Simulator;
+    using StateSpace = Simulator::StateSpace;
+
+    Factory(
+      unsigned num_sim_threads,
+      unsigned num_state_threads,
+      unsigned num_dblocks
+    ) : ss_params{num_state_threads, num_dblocks} {}
+
+    StateSpace CreateStateSpace() const {
+      return StateSpace(ss_params);
+    }
+
+    Simulator CreateSimulator() const {
+      return Simulator();
+    }
+
+    StateSpace::Parameter ss_params;
+  };
+
+  inline void SetFlushToZeroAndDenormalsAreZeros() {}
+  inline void ClearFlushToZeroAndDenormalsAreZeros() {}
+}
+
+#include "../pybind_main.cpp"
diff --git a/pybind_interface/hip/pybind_main_hip.h b/pybind_interface/hip/pybind_main_hip.h
new file mode 100644
index 00000000..55a672d5
--- /dev/null
+++ b/pybind_interface/hip/pybind_main_hip.h
@@ -0,0 +1,17 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "../pybind_main.h"
+
+PYBIND11_MODULE(qsim_hip, m) { GPU_MODULE_BINDINGS }
diff --git a/qsimcirq/__init__.py b/qsimcirq/__init__.py
index 0f022a1a..d9c5504d 100644
--- a/qsimcirq/__init__.py
+++ b/qsimcirq/__init__.py
@@ -34,6 +34,8 @@ def _load_qsim_gpu():
     instr = qsim_decide.detect_gpu()
     if instr == 0:
         qsim_gpu = importlib.import_module("qsimcirq.qsim_cuda")
+    elif instr == 2:
+        qsim_gpu = importlib.import_module("qsimcirq.qsim_hip")
     else:
         qsim_gpu = None
     return qsim_gpu
diff --git a/setup.py b/setup.py
index c57a37a9..eafa4387 100644
--- a/setup.py
+++ b/setup.py
@@ -111,6 +111,7 @@ def build_extension(self, ext):
         CMakeExtension("qsimcirq/qsim_cuda"),
         CMakeExtension("qsimcirq/qsim_custatevec"),
         CMakeExtension("qsimcirq/qsim_decide"),
+        CMakeExtension("qsimcirq/qsim_hip"),
     ],
     cmdclass=dict(build_ext=CMakeBuild),
     zip_safe=False,

From 5edafd6893f910733d1e5441e9796d587faa20f9 Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Wed, 16 Aug 2023 15:44:17 -0400
Subject: [PATCH 06/15] Add HIP to make and cmake files

---
 CMakeLists.txt                         |  8 ++++++-
 Makefile                               | 16 +++++++++++++
 apps/Makefile                          |  9 ++++++++
 apps/make.sh                           | 16 +++++++++----
 pybind_interface/Makefile              | 32 ++++++++++++++++++++------
 pybind_interface/decide/CMakeLists.txt | 24 +++++++++++++++++--
 tests/Makefile                         | 13 +++++++++++
 tests/make.sh                          | 28 ++++++++++++++--------
 8 files changed, 121 insertions(+), 25 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 615a33db..f902a421 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,13 @@ cmake_minimum_required(VERSION 3.11)
 
 execute_process(COMMAND which nvcc OUTPUT_VARIABLE has_nvcc)
 if(has_nvcc STREQUAL "")
-    project(qsim)
+    execute_process(COMMAND which hipcc OUTPUT_VARIABLE has_hipcc)
+    if(has_hipcc STREQUAL "")
+        project(qsim)
+    else()
+        project(qsim LANGUAGES CXX HIP)
+        ADD_SUBDIRECTORY(pybind_interface/hip)
+    endif()
 else()
     project(qsim LANGUAGES CXX CUDA)
     ADD_SUBDIRECTORY(pybind_interface/cuda)
diff --git a/Makefile b/Makefile
index 9ef0afdd..227ca603 100644
--- a/Makefile
+++ b/Makefile
@@ -6,10 +6,12 @@ TESTS = run-cxx-tests
 
 CXX=g++
 NVCC=nvcc
+HIPCC=hipcc
 
 CXXFLAGS = -O3 -fopenmp
 ARCHFLAGS = -march=native
 NVCCFLAGS = -O3
+HIPCCFLAGS = -O3
 
 # CUQUANTUM_ROOT should be set.
 CUSTATEVECFLAGS = -I$(CUQUANTUM_ROOT)/include -L${CUQUANTUM_ROOT}/lib -L$(CUQUANTUM_ROOT)/lib64 -lcustatevec -lcublas
@@ -22,6 +24,8 @@ export ARCHFLAGS
 export NVCC
 export NVCCFLAGS
 export CUSTATEVECFLAGS
+export HIPCC
+export HIPCCFLAGS
 
 ifeq ($(PYBIND11), true)
   TARGETS += pybind
@@ -43,6 +47,10 @@ qsim-cuda:
 qsim-custatevec:
 	$(MAKE) -C apps/ qsim-custatevec
 
+.PHONY: qsim-hip
+qsim-hip:
+	$(MAKE) -C apps/ qsim-hip
+
 .PHONY: pybind
 pybind:
 	$(MAKE) -C pybind_interface/ pybind
@@ -59,6 +67,10 @@ cuda-tests:
 custatevec-tests:
 	$(MAKE) -C tests/ custatevec-tests
 
+.PHONY: hip-tests
+hip-tests:
+	$(MAKE) -C tests/ hip-tests
+
 .PHONY: run-cxx-tests
 run-cxx-tests: cxx-tests
 	$(MAKE) -C tests/ run-cxx-tests
@@ -71,6 +83,10 @@ run-cuda-tests: cuda-tests
 run-custatevec-tests: custatevec-tests
 	$(MAKE) -C tests/ run-custatevec-tests
 
+.PHONY: run-hip-tests
+run-hip-tests: hip-tests
+	$(MAKE) -C tests/ run-hip-tests
+
 PYTESTS = $(shell find qsimcirq_tests/ -name '*_test.py')
 
 .PHONY: run-py-tests
diff --git a/apps/Makefile b/apps/Makefile
index 41fb81e5..48b25cab 100644
--- a/apps/Makefile
+++ b/apps/Makefile
@@ -7,6 +7,9 @@ CUDA_TARGETS := $(CUDA_TARGETS:%cuda.cu=%cuda.x)
 CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec.cu")
 CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec.cu=%custatevec.x)
 
+HIP_TARGETS = $(shell find . -maxdepth 1 -name '*cuda.cu')
+HIP_TARGETS := $(HIP_TARGETS:%cuda.cu=%hip.x)
+
 .PHONY: qsim
 qsim: $(CXX_TARGETS)
 
@@ -16,6 +19,9 @@ qsim-cuda: $(CUDA_TARGETS)
 .PHONY: qsim-custatevec
 qsim-custatevec: $(CUSTATEVEC_TARGETS)
 
+.PHONY: qsim-hip
+qsim-hip: $(HIP_TARGETS)
+
 %.x: %.cc
 	$(CXX) -o ./$@ $< $(CXXFLAGS) $(ARCHFLAGS)
 
@@ -25,6 +31,9 @@ qsim-custatevec: $(CUSTATEVEC_TARGETS)
 %custatevec.x: %custatevec.cu
 	$(NVCC) -o ./$@ $< $(NVCCFLAGS) $(CUSTATEVECFLAGS)
 
+%hip.x: %cuda.cu
+	$(HIPCC) -o ./$@ $< $(HIPCCFLAGS)
+
 .PHONY: clean
 clean:
 	-rm -f ./*.x ./*.a ./*.so ./*.mod
diff --git a/apps/make.sh b/apps/make.sh
index f2e777e5..5ddd4d6b 100755
--- a/apps/make.sh
+++ b/apps/make.sh
@@ -23,9 +23,15 @@ g++ -O3 -march=native -fopenmp -o qsim_amplitudes.x qsim_amplitudes.cc
 g++ -O3 -march=native -fopenmp -o qsimh_base.x qsimh_base.cc
 g++ -O3 -march=native -fopenmp -o qsimh_amplitudes.x qsimh_amplitudes.cc
 
-nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
-nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu
+if command -v nvcc &>/dev/null; then
+    nvcc -O3 -o qsim_base_cuda.x qsim_base_cuda.cu
+    nvcc -O3 -o qsim_qtrajectory_cuda.x qsim_qtrajectory_cuda.cu
 
-# CUQUANTUM_ROOT should be set.
-CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
-nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
+    if [ -n "$CUQUANTUM_ROOT" ]; then
+        CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
+        nvcc -O3 $CUSTATEVECFLAGS -o qsim_base_custatevec.x qsim_base_custatevec.cu
+    fi
+elif command -v hipcc &>/dev/null; then
+    hipcc -O3 -o qsim_base_hip.x qsim_base_cuda.cu
+    hipcc -O3 -o qsim_qtrajectory_hip.x qsim_qtrajectory_cuda.cu
+fi
diff --git a/pybind_interface/Makefile b/pybind_interface/Makefile
index c7c70860..60e7cc99 100644
--- a/pybind_interface/Makefile
+++ b/pybind_interface/Makefile
@@ -5,6 +5,7 @@ QSIMLIB_AVX2 = ../qsimcirq/qsim_avx2`python3-config --extension-suffix`
 QSIMLIB_AVX512 = ../qsimcirq/qsim_avx512`python3-config --extension-suffix`
 QSIMLIB_CUDA = ../qsimcirq/qsim_cuda`python3-config --extension-suffix`
 QSIMLIB_CUSTATEVEC = ../qsimcirq/qsim_custatevec`python3-config --extension-suffix`
+QSIMLIB_HIP = ../qsimcirq/qsim_hip`python3-config --extension-suffix`
 QSIMLIB_DECIDE = ../qsimcirq/qsim_decide`python3-config --extension-suffix`
 
 # The flags for the compilation of the simd-specific Pybind11 interfaces
@@ -13,21 +14,29 @@ PYBINDFLAGS_SSE = -msse4.1 -Wall -shared -std=c++17 -fPIC `python3 -m pybind11 -
 PYBINDFLAGS_AVX2 = -mavx2 -mfma -Wall -shared -std=c++17 -fPIC `python3 -m pybind11 --includes`
 PYBINDFLAGS_AVX512 = -mavx512f -mbmi2 -Wall -shared -std=c++17 -fPIC `python3 -m pybind11 --includes`
 
-# The flags for the compilation of GPU-specific Pybind11 interfaces
+# The flags for the compilation of CUDA-specific Pybind11 interfaces
 PYBINDFLAGS_CUDA = -std=c++17 -x cu -Xcompiler "-Wall -shared -fPIC `python3 -m pybind11 --includes`"
 
 # The flags for the compilation of cuStateVec-specific Pybind11 interfaces
 PYBINDFLAGS_CUSTATEVEC = $(CUSTATEVECFLAGS) $(PYBINDFLAGS_CUDA)
 
+# The flags for the compilation of HIP-specific Pybind11 interfaces
+PYBINDFLAGS_HIP = -std=c++17 -Wall -shared -fPIC `python3 -m pybind11 --includes`
+
 # Check for nvcc to decide compilation mode.
 ifeq ($(shell which $(NVCC)),)
+# Check for hipcc to decide compilation mode.
+ifeq ($(shell which $(HIPCC)),)
 pybind: pybind-cpu decide-cpu
 else
+pybind: pybind-cpu pybind-hip decide-hip
+endif
+else
 # Check for the cuStateVec library.
 ifeq ($(CUQUANTUM_ROOT),)
-pybind: pybind-cpu pybind-gpu decide-gpu
+pybind: pybind-cpu pybind-cuda decide-cuda
 else
-pybind: pybind-cpu pybind-gpu pybind-custatevec decide-custatevec
+pybind: pybind-cpu pybind-cuda pybind-custatevec decide-custatevec
 endif
 endif
 
@@ -40,14 +49,15 @@ pybind-cpu:
 
 .PHONY: decide-cpu
 decide-cpu:
+	echo "building decide-cpu"
 	$(CXX) decide/decide.cpp -o $(QSIMLIB_DECIDE) $(CXXFLAGS) $(PYBINDFLAGS_BASIC)
 
-.PHONY: pybind-gpu
-pybind-gpu:
+.PHONY: pybind-cuda
+pybind-cuda:
 	$(NVCC) cuda/pybind_main_cuda.cpp -o $(QSIMLIB_CUDA) $(NVCCFLAGS) $(PYBINDFLAGS_CUDA)
 
-.PHONY: decide-gpu
-decide-gpu:
+.PHONY: decide-cuda
+decide-cuda:
 	$(NVCC) decide/decide.cpp -o $(QSIMLIB_DECIDE) $(NVCCFLAGS) $(PYBINDFLAGS_CUDA)
 
 .PHONY: pybind-custatevec
@@ -58,6 +68,14 @@ pybind-custatevec:
 decide-custatevec:
 	$(NVCC) decide/decide.cpp -D__CUSTATEVEC__ -o $(QSIMLIB_DECIDE) $(NVCCFLAGS) $(PYBINDFLAGS_CUDA)
 
+.PHONY: pybind-hip
+pybind-hip:
+	$(HIPCC) hip/pybind_main_hip.cpp -o $(QSIMLIB_HIP) $(HIPCCFLAGS) $(PYBINDFLAGS_HIP)
+
+.PHONY: decide-hip
+decide-hip:
+	$(HIPCC) decide/decide.cpp -o $(QSIMLIB_DECIDE) $(HIPCCFLAGS) $(PYBINDFLAGS_HIP)
+
 .PHONY: clean
 clean:
 	-rm -f ./basic/*.x ./basic/*.a ./basic/*.so ./basic/*.mod $(QSIMLIB_BASIC)
diff --git a/pybind_interface/decide/CMakeLists.txt b/pybind_interface/decide/CMakeLists.txt
index 441f3d66..2b28ca96 100644
--- a/pybind_interface/decide/CMakeLists.txt
+++ b/pybind_interface/decide/CMakeLists.txt
@@ -2,7 +2,12 @@ cmake_minimum_required(VERSION 3.11)
 
 execute_process(COMMAND which nvcc OUTPUT_VARIABLE has_nvcc)
 if(has_nvcc STREQUAL "")
-  project(qsim)
+  execute_process(COMMAND which hipcc OUTPUT_VARIABLE has_hipcc)
+  if(has_hipcc STREQUAL "")
+    project(qsim)
+  else()
+    project(qsim LANGUAGES CXX HIP)
+  endif()
 else()
   project(qsim LANGUAGES CXX CUDA)
 endif()
@@ -22,7 +27,22 @@ endif()
 INCLUDE(../GetPybind11.cmake)
 
 if(has_nvcc STREQUAL "")
-  pybind11_add_module(qsim_decide decide.cpp)
+  if(has_hipcc STREQUAL "")
+    pybind11_add_module(qsim_decide decide.cpp)
+  else()
+    find_package(HIP REQUIRED)
+    find_package(PythonLibs 3.7 REQUIRED)
+
+    include_directories(${PYTHON_INCLUDE_DIRS} ${pybind11_SOURCE_DIR}/include)
+
+    hip_add_library(qsim_decide MODULE decide.cpp)
+
+    set_target_properties(qsim_decide PROPERTIES
+        PREFIX "${PYTHON_MODULE_PREFIX}"
+        SUFFIX "${PYTHON_MODULE_EXTENSION}"
+    )
+    set_source_files_properties(decide.cpp PROPERTIES LANGUAGE HIP)
+  endif()
 else()
   find_package(PythonLibs 3.7 REQUIRED)
   find_package(CUDA REQUIRED)
diff --git a/tests/Makefile b/tests/Makefile
index f4a37278..6a81e70b 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -14,6 +14,9 @@ CUDA_TARGETS := $(CUDA_TARGETS:%cuda_test.cu=%cuda_test.x)
 CUSTATEVEC_TARGETS = $(shell find . -maxdepth 1 -name "*custatevec_test.cu")
 CUSTATEVEC_TARGETS := $(CUSTATEVEC_TARGETS:%custatevec_test.cu=%custatevec_test.x)
 
+HIP_TARGETS = $(shell find . -maxdepth 1 -name "*cuda_test.cu")
+HIP_TARGETS := $(HIP_TARGETS:%cuda_test.cu=%hip_test.x)
+
 GTEST_DIR = $(CURDIR)/googletest/googletest
 GMOCK_DIR = $(CURDIR)/googletest/googlemock
 
@@ -30,6 +33,9 @@ cuda-tests: $(CUDA_TARGETS)
 .PHONY: custatevec-tests
 custatevec-tests: $(CUSTATEVEC_TARGETS)
 
+.PHONY: hip-tests
+hip-tests: $(HIP_TARGETS)
+
 .PHONY: run-cxx-tests
 run-cxx-tests: cxx-tests
 	for exe in $(CXX_TARGETS); do if ! ./$$exe; then exit 1; fi; done
@@ -42,6 +48,10 @@ run-cuda-tests: cuda-tests
 run-custatevec-tests: custatevec-tests
 	for exe in $(CUSTATEVEC_TARGETS); do if ! ./$$exe; then exit 1; fi; done
 
+.PHONY: run-hip-tests
+run-hip-tests: hip-tests
+	for exe in $(HIP_TARGETS); do if ! ./$$exe; then exit 1; fi; done
+
 $(GTEST_DIR)/make:
 	-git submodule update --init --recursive googletest
 	mkdir -p $(GTEST_DIR)/make
@@ -56,6 +66,9 @@ $(GTEST_DIR)/make:
 %custatevec_test.x: %custatevec_test.cu $(GTEST_DIR)/make
 	$(NVCC) -o ./$@ $< $(TESTFLAGS) $(NVCCFLAGS) $(CUSTATEVECFLAGS)
 
+%hip_test.x: %cuda_test.cu $(GTEST_DIR)/make
+	$(HIPCC) -o ./$@ $< $(TESTFLAGS) $(HIPCCFLAGS)
+
 .PHONY: clean
 clean:
 	-rm -f ./*.x ./*.a ./*.so ./*.mod
diff --git a/tests/make.sh b/tests/make.sh
index 890663da..b38742df 100755
--- a/tests/make.sh
+++ b/tests/make.sh
@@ -55,14 +55,22 @@ g++ -O3 -I$path_to_include -L$path_to_lib -fopenmp -o unitaryspace_basic_test.x
 g++ -O3 -I$path_to_include -L$path_to_lib -msse4 -fopenmp -o unitaryspace_sse_test.x unitaryspace_sse_test.cc -lgtest -lpthread
 g++ -O3 -I$path_to_include -L$path_to_lib -o vectorspace_test.x vectorspace_test.cc -lgtest -lpthread
 
-nvcc -O3 -I$path_to_include -L$path_to_lib -o hybrid_cuda_test.x hybrid_cuda_test.cu -lgtest -lpthread
-nvcc -O3 -I$path_to_include -L$path_to_lib -o qtrajectory_cuda_test.x qtrajectory_cuda_test.cu -lgtest -lpthread
-nvcc -O3 -I$path_to_include -L$path_to_lib -o simulator_cuda_test.x simulator_cuda_test.cu -lgtest -lpthread
-nvcc -O3 -I$path_to_include -L$path_to_lib -o statespace_cuda_test.x statespace_cuda_test.cu -lgtest -lpthread
+if command -v nvcc &>/dev/null; then
+    nvcc -O3 -I$path_to_include -L$path_to_lib -o hybrid_cuda_test.x hybrid_cuda_test.cu -lgtest -lpthread
+    nvcc -O3 -I$path_to_include -L$path_to_lib -o qtrajectory_cuda_test.x qtrajectory_cuda_test.cu -lgtest -lpthread
+    nvcc -O3 -I$path_to_include -L$path_to_lib -o simulator_cuda_test.x simulator_cuda_test.cu -lgtest -lpthread
+    nvcc -O3 -I$path_to_include -L$path_to_lib -o statespace_cuda_test.x statespace_cuda_test.cu -lgtest -lpthread
 
-# CUQUANTUM_ROOT should be set.
-CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
-nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o hybrid_custatevec_test.x hybrid_custatevec_test.cu -lgtest -lpthread
-nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o qtrajectory_custatevec_test.x qtrajectory_custatevec_test.cu -lgtest -lpthread
-nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o simulator_custatevec_test.x simulator_custatevec_test.cu -lgtest -lpthread
-nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o statespace_custatevec_test.x statespace_custatevec_test.cu -lgtest -lpthread
+    if [ -n "$CUQUANTUM_ROOT" ]; then
+        CUSTATEVECFLAGS="-I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUQUANTUM_ROOT}/lib64 -lcustatevec -lcublas"
+        nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o hybrid_custatevec_test.x hybrid_custatevec_test.cu -lgtest -lpthread
+        nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o qtrajectory_custatevec_test.x qtrajectory_custatevec_test.cu -lgtest -lpthread
+        nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o simulator_custatevec_test.x simulator_custatevec_test.cu -lgtest -lpthread
+        nvcc -O3 $CUSTATEVECFLAGS -I$path_to_include -L$path_to_lib -o statespace_custatevec_test.x statespace_custatevec_test.cu -lgtest -lpthread
+    fi
+elif command -v hipcc &>/dev/null; then
+    hipcc -O3 -I$path_to_include -L$path_to_lib -o hybrid_hip_test.x hybrid_cuda_test.cu -lgtest -lpthread
+    hipcc -O3 -I$path_to_include -L$path_to_lib -o qtrajectory_hip_test.x qtrajectory_cuda_test.cu -lgtest -lpthread
+    hipcc -O3 -I$path_to_include -L$path_to_lib -o simulator_hip_test.x simulator_cuda_test.cu -lgtest -lpthread
+    hipcc -O3 -I$path_to_include -L$path_to_lib -o statespace_hip_test.x statespace_cuda_test.cu -lgtest -lpthread
+fi

From c80f91d0140ba47e10fbbeeb55d65521eb9a2cb6 Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Fri, 18 Aug 2023 12:56:29 -0400
Subject: [PATCH 07/15] Add HIP cleanup in pybind_interface

---
 pybind_interface/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pybind_interface/Makefile b/pybind_interface/Makefile
index 60e7cc99..253d3f85 100644
--- a/pybind_interface/Makefile
+++ b/pybind_interface/Makefile
@@ -83,5 +83,6 @@ clean:
 	-rm -f ./avx2/*.x ./avx2/*.a ./avx2/*.so ./avx2/*.mod $(QSIMLIB_AVX2)
 	-rm -f ./avx512/*.x ./avx512/*.a ./avx512/*.so ./avx512/*.mod $(QSIMLIB_AVX512)
 	-rm -f ./cuda/*.x ./cuda/*.a ./cuda/*.so ./cuda/*.mod $(QSIMLIB_CUDA)
+	-rm -f ./hip/*.x ./hip/*.a ./hip/*.so ./hip/*.mod $(QSIMLIB_HIP)
 	-rm -f ./custatevec/*.x ./custatevec/*.a ./custatevec/*.so ./custatevec/*.mod $(QSIMLIB_CUSTATEVEC)
 	-rm -f ./decide/*.x ./decide/*.a ./decide/*.so ./decide/*.mod $(QSIMLIB_DECIDE)

From 09611cfd2185bdf376f9575459201556c2aafdeb Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Fri, 18 Aug 2023 14:19:29 -0400
Subject: [PATCH 08/15] Remove APPLE from pybind for HIP

---
 pybind_interface/hip/CMakeLists.txt | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/pybind_interface/hip/CMakeLists.txt b/pybind_interface/hip/CMakeLists.txt
index 3e89113d..982579fb 100644
--- a/pybind_interface/hip/CMakeLists.txt
+++ b/pybind_interface/hip/CMakeLists.txt
@@ -7,12 +7,6 @@ ELSE()
     set(CMAKE_CXX_FLAGS "-O3 -fopenmp")
 ENDIF()
 
-if(APPLE)
-    set(CMAKE_CXX_STANDARD 14)
-    include_directories("/usr/local/include" "/usr/local/opt/llvm/include")
-    link_directories("/usr/local/lib" "/usr/local/opt/llvm/lib")
-endif()
-
 INCLUDE(../GetPybind11.cmake)
 find_package(PythonLibs 3.7 REQUIRED)
 find_package(HIP REQUIRED)

From d732ea83b5ad321c63191a7f25b1664e70dfd05d Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Fri, 18 Aug 2023 14:20:52 -0400
Subject: [PATCH 09/15] Use HIPCC for CPU and GPU pybind files

---
 pybind_interface/Makefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pybind_interface/Makefile b/pybind_interface/Makefile
index 253d3f85..95f64b32 100644
--- a/pybind_interface/Makefile
+++ b/pybind_interface/Makefile
@@ -29,7 +29,7 @@ ifeq ($(shell which $(NVCC)),)
 ifeq ($(shell which $(HIPCC)),)
 pybind: pybind-cpu decide-cpu
 else
-pybind: pybind-cpu pybind-hip decide-hip
+pybind: pybind-hip decide-hip
 endif
 else
 # Check for the cuStateVec library.
@@ -70,6 +70,10 @@ decide-custatevec:
 
 .PHONY: pybind-hip
 pybind-hip:
+	$(HIPCC) basic/pybind_main_basic.cpp -o $(QSIMLIB_BASIC) $(CXXFLAGS) $(PYBINDFLAGS_BASIC)
+	$(HIPCC) sse/pybind_main_sse.cpp -o $(QSIMLIB_SSE) $(CXXFLAGS) $(PYBINDFLAGS_SSE)
+	$(HIPCC) avx2/pybind_main_avx2.cpp -o $(QSIMLIB_AVX2) $(CXXFLAGS) $(PYBINDFLAGS_AVX2)
+	$(HIPCC) avx512/pybind_main_avx512.cpp -o $(QSIMLIB_AVX512) $(CXXFLAGS) $(PYBINDFLAGS_AVX512)
 	$(HIPCC) hip/pybind_main_hip.cpp -o $(QSIMLIB_HIP) $(HIPCCFLAGS) $(PYBINDFLAGS_HIP)
 
 .PHONY: decide-hip

From 31fff3bd04d3933b892f2ee5e5924e83f7dae86f Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Fri, 18 Aug 2023 14:23:01 -0400
Subject: [PATCH 10/15] Add CMAKE_MODULE_PATH for pybind with HIP

---
 pybind_interface/decide/CMakeLists.txt | 1 +
 pybind_interface/hip/CMakeLists.txt    | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/pybind_interface/decide/CMakeLists.txt b/pybind_interface/decide/CMakeLists.txt
index 2b28ca96..b94b0ade 100644
--- a/pybind_interface/decide/CMakeLists.txt
+++ b/pybind_interface/decide/CMakeLists.txt
@@ -30,6 +30,7 @@ if(has_nvcc STREQUAL "")
   if(has_hipcc STREQUAL "")
     pybind11_add_module(qsim_decide decide.cpp)
   else()
+    list(APPEND CMAKE_MODULE_PATH "/opt/rocm/lib/cmake/hip")
     find_package(HIP REQUIRED)
     find_package(PythonLibs 3.7 REQUIRED)
 
diff --git a/pybind_interface/hip/CMakeLists.txt b/pybind_interface/hip/CMakeLists.txt
index 982579fb..fe4b1c54 100644
--- a/pybind_interface/hip/CMakeLists.txt
+++ b/pybind_interface/hip/CMakeLists.txt
@@ -9,6 +9,8 @@ ENDIF()
 
 INCLUDE(../GetPybind11.cmake)
 find_package(PythonLibs 3.7 REQUIRED)
+
+list(APPEND CMAKE_MODULE_PATH "/opt/rocm/lib/cmake/hip")
 find_package(HIP REQUIRED)
 
 include_directories(${PYTHON_INCLUDE_DIRS} ${pybind11_SOURCE_DIR}/include)

From 9e12b68ca77ceab07e656b93721cf17bbe4b2937 Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Fri, 18 Aug 2023 14:24:13 -0400
Subject: [PATCH 11/15] Add hipcc detection in setup.py

---
 setup.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/setup.py b/setup.py
index eafa4387..593620a6 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,7 @@
 import os
 import re
 import sys
+import shutil
 import platform
 import subprocess
 
@@ -63,6 +64,12 @@ def build_extension(self, ext):
                 "-DCMAKE_CXX_COMPILER=/usr/local/opt/llvm/bin/clang++",
             ]
 
+        if shutil.which("hipcc") is not None:
+            cmake_args += [
+                "-DCMAKE_C_COMPILER=hipcc",
+                "-DCMAKE_CXX_COMPILER=hipcc",
+            ]
+
         env = os.environ.copy()
         env["CXXFLAGS"] = '{} -DVERSION_INFO=\\"{}\\"'.format(
             env.get("CXXFLAGS", ""), self.distribution.get_version()

From d25454331acdd3f31d8049f45b99b6a0637cbde2 Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Mon, 16 Oct 2023 14:43:59 -0400
Subject: [PATCH 12/15] Fix style

---
 lib/statespace_cuda.h  |  3 ++-
 lib/vectorspace_cuda.h | 22 ++++++++++++----------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h
index 81780150..660db074 100644
--- a/lib/statespace_cuda.h
+++ b/lib/statespace_cuda.h
@@ -107,7 +107,8 @@ class StateSpaceCUDA :
   }
 
   void SetAllZeros(State& state) const {
-    ErrorCheck(cudaMemset(state.get(), 0, MinSize(state.num_qubits()) * sizeof(fp_type)));
+    ErrorCheck(cudaMemset(state.get(), 0,
+               MinSize(state.num_qubits()) * sizeof(fp_type)));
   }
 
   // Uniform superposition.
diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h
index ebf21095..fd91553d 100644
--- a/lib/vectorspace_cuda.h
+++ b/lib/vectorspace_cuda.h
@@ -120,9 +120,9 @@ class VectorSpaceCUDA {
     }
 
     ErrorCheck(
-      cudaMemcpy(dest.get(), src.get(),
-                 sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
-                 cudaMemcpyDeviceToDevice));
+        cudaMemcpy(dest.get(), src.get(),
+                   sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
+                   cudaMemcpyDeviceToDevice));
 
     return true;
   }
@@ -131,9 +131,9 @@ class VectorSpaceCUDA {
   // Impl::MinSize(src.num_qubits()) elements.
   bool Copy(const Vector& src, fp_type* dest) const {
     ErrorCheck(
-      cudaMemcpy(dest, src.get(),
-                 sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
-                 cudaMemcpyDeviceToHost));
+        cudaMemcpy(dest, src.get(),
+                   sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
+                   cudaMemcpyDeviceToHost));
 
     return true;
   }
@@ -142,9 +142,9 @@ class VectorSpaceCUDA {
   // Impl::MinSize(dest.num_qubits()) elements.
   bool Copy(const fp_type* src, Vector& dest) const {
     ErrorCheck(
-      cudaMemcpy(dest.get(), src,
-                 sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
-                 cudaMemcpyHostToDevice));
+        cudaMemcpy(dest.get(), src,
+                   sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
+                   cudaMemcpyHostToDevice));
 
     return true;
   }
@@ -154,7 +154,9 @@ class VectorSpaceCUDA {
   bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
     size = std::min(size, Impl::MinSize(dest.num_qubits()));
     ErrorCheck(
-      cudaMemcpy(dest.get(), src, sizeof(fp_type) * size, cudaMemcpyHostToDevice));
+        cudaMemcpy(dest.get(), src,
+                   sizeof(fp_type) * size,
+                   cudaMemcpyHostToDevice));
     return true;
   }
 

From a1ab2e2700fc3f948da72f1355755ce79ebd03eb Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Tue, 7 Nov 2023 14:14:54 -0500
Subject: [PATCH 13/15] Add documentation for AMD GPU support

---
 docs/_book.yaml           |  2 +
 docs/tutorials/amd_gpu.md | 82 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)
 create mode 100644 docs/tutorials/amd_gpu.md

diff --git a/docs/_book.yaml b/docs/_book.yaml
index 2fe9a2c8..05c862b3 100644
--- a/docs/_book.yaml
+++ b/docs/_book.yaml
@@ -26,6 +26,8 @@ upper_tabs:
          path: /qsim/tutorials/q32d14
        - title: "Simulate noise"
          path: /qsim/tutorials/noisy_qsimcirq
+       - title: "AMD GPU support"
+         path: /qsim/tutorials/amd_gpu
 
    - name: "Guides"
      contents:
diff --git a/docs/tutorials/amd_gpu.md b/docs/tutorials/amd_gpu.md
new file mode 100644
index 00000000..dcd29500
--- /dev/null
+++ b/docs/tutorials/amd_gpu.md
@@ -0,0 +1,82 @@
+# Support for AMD Instinct™ MI Series Accelerators
+
+Qsim provides support for AMD Instinct accelerators.
+The implementation covers the native GPU support in Qsim
+by utilizing [HIP](https://rocm.docs.amd.com/projects/HIP)
+(Heterogeneous-Compute Interface for Portability).
+The cuQuantum implementation is currently not covered.
+
+## Building
+
+To enable support for AMD Instinct GPUs, Qsim needs to be built from sources.
+This can be done as follows:
+
+```
+conda env list
+conda create -y -n CirqDevEnv python=3
+conda activate CirqDevEnv
+pip install pybind11
+
+git clone https://github.com/quantumlib/qsim.git
+cd qsim
+
+make -j qsim      # to build CPU qsim
+make -j qsim-hip  # to build HIP qsim
+make -j pybind    # to build Python bindings
+make -j cxx-tests # to build CPU tests
+make -j hip-tests # to build HIP tests
+
+pip install .
+```
+
+Note: To avoid problems when building Qsim with support for AMD GPUs,
+make sure to use the latest version of CMake.
+
+## Testing
+
+### Simulator
+
+To test the Qsim simulator:
+
+```
+make run-cxx-tests # to run CPU tests
+make run-hip-tests # to run HIP tests
+```
+
+or
+
+```
+cd tests
+for file in *.x; do ./"$file"; done          # to run all tests
+for file in *_hip_test.x; do ./"$file"; done # to run HIP tests only
+```
+
+### Python Bindings
+
+To test the Python bindings:
+
+```
+make run-py-tests
+```
+
+or
+
+```
+cd qsimcirq_tests
+python3 -m pytest -v qsimcirq_test.py
+```
+
+## Using
+
+Using Qsim on AMD Instinct GPUs is identical to using it on NVIDIA GPUs.
+I.e., it is done by passing `use_gpu=True` and `gpu_mode=0` as `qsimcirq.QSimOptions`:
+
+```
+simulator = qsimcirq.QSimSimulator(qsim_options=qsimcirq.QSimOptions(
+        use_gpu=True,
+        gpu_mode=0,
+        ...
+    ))
+```
+
+Note: `gpu_mode` has to be set to zero for AMD GPUs, as cuStateVec is not supported.

From 56c27c60db5a5725c078b96818fa272d95ca82de Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Wed, 8 Nov 2023 12:13:00 -0500
Subject: [PATCH 14/15] Replace Qsim with qsim in amd_gpu.md

---
 docs/tutorials/amd_gpu.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/tutorials/amd_gpu.md b/docs/tutorials/amd_gpu.md
index dcd29500..11965c2d 100644
--- a/docs/tutorials/amd_gpu.md
+++ b/docs/tutorials/amd_gpu.md
@@ -1,14 +1,14 @@
 # Support for AMD Instinct™ MI Series Accelerators
 
-Qsim provides support for AMD Instinct accelerators.
-The implementation covers the native GPU support in Qsim
+qsim provides support for AMD Instinct accelerators.
+The implementation covers the native GPU support in qsim
 by utilizing [HIP](https://rocm.docs.amd.com/projects/HIP)
 (Heterogeneous-Compute Interface for Portability).
 The cuQuantum implementation is currently not covered.
 
 ## Building
 
-To enable support for AMD Instinct GPUs, Qsim needs to be built from sources.
+To enable support for AMD Instinct GPUs, qsim needs to be built from sources.
 This can be done as follows:
 
 ```
@@ -29,14 +29,14 @@ make -j hip-tests # to build HIP tests
 pip install .
 ```
 
-Note: To avoid problems when building Qsim with support for AMD GPUs,
+Note: To avoid problems when building qsim with support for AMD GPUs,
 make sure to use the latest version of CMake.
 
 ## Testing
 
 ### Simulator
 
-To test the Qsim simulator:
+To test the qsim simulator:
 
 ```
 make run-cxx-tests # to run CPU tests
@@ -68,7 +68,7 @@ python3 -m pytest -v qsimcirq_test.py
 
 ## Using
 
-Using Qsim on AMD Instinct GPUs is identical to using it on NVIDIA GPUs.
+Using qsim on AMD Instinct GPUs is identical to using it on NVIDIA GPUs.
 I.e., it is done by passing `use_gpu=True` and `gpu_mode=0` as `qsimcirq.QSimOptions`:
 
 ```

From 62d64c4198ae8701b9bac7337bc28959fd855053 Mon Sep 17 00:00:00 2001
From: Jakub Kurzak <jakurzak@amd.com>
Date: Wed, 8 Nov 2023 12:27:33 -0500
Subject: [PATCH 15/15] Add pointers to AMD ROCm Platform to amd_gpu.md

---
 docs/tutorials/amd_gpu.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/tutorials/amd_gpu.md b/docs/tutorials/amd_gpu.md
index 11965c2d..e7c5f613 100644
--- a/docs/tutorials/amd_gpu.md
+++ b/docs/tutorials/amd_gpu.md
@@ -2,13 +2,17 @@
 
 qsim provides support for AMD Instinct accelerators.
 The implementation covers the native GPU support in qsim
-by utilizing [HIP](https://rocm.docs.amd.com/projects/HIP)
+by utilizing [AMD HIP SDK](https://rocm.docs.amd.com/projects/HIP)
 (Heterogeneous-Compute Interface for Portability).
 The cuQuantum implementation is currently not covered.
 
 ## Building
 
-To enable support for AMD Instinct GPUs, qsim needs to be built from sources.
+Building qsim with support for AMD Instinct accelerators requires installation of
+[AMD ROCm™ Open Software Platform](https://www.amd.com/en/developer/resources/rocm-hub.html).
+Instructions for installing ROCm are available at https://rocm.docs.amd.com/.
+
+To enable support for AMD GPUs, qsim needs to be built from sources.
 This can be done as follows:
 
 ```