-
Notifications
You must be signed in to change notification settings - Fork 14
Description
Hi.
I use CUDA 12.8, PyTorch 2.8.0, and Triton 3.4 on Windows. I tried:
import torch
from flashrnn import flashrnn
device = torch.device('cuda')
dtype = torch.bfloat16
B = 8 # batch size
T = 1024 # sequence length
N = 3 # number of heads
D = 256 # head dimension
G = 4 # number of gates / pre-activations for LSTM example
S = 2 # number of states
Wx = torch.randn([B, T, G, N, D], device=device, dtype=dtype, requires_grad=True)
R = torch.randn([G, N, D, D], device=device, dtype=dtype, requires_grad=True)
b = torch.randn([G, N, D], device=device, dtype=dtype, requires_grad=True)
states_initial = torch.randn([S, B, 1, N, D], device=device, dtype=dtype, requires_grad=True)
Available functions: lstm, gru, elman, slstm
Available backends: cuda_fused, cuda, triton, and vanilla
states, last_states = flashrnn(Wx, R, b, states=states_initial, function="lstm", backend="cuda_fused")
I am getting the following error:
states, last_states = flashrnn(Wx, R, b, states=states_initial, function="lstm", backend="cuda_fused")
Traceback (most recent call last):
File C:\ProgramData\anaconda3\Lib\site-packages\torch\utils\cpp_extension.py:2595 in _run_ninja_build
subprocess.run(
File C:\ProgramData\anaconda3\Lib\subprocess.py:571 in run
raise CalledProcessError(retcode, process.args,
CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
Cell In[17], line 1
states, last_states = flashrnn(Wx, R, b, states=states_initial, function="lstm", backend="cuda_fused")
File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\flashrnn.py:1134 in flashrnn
h, last_h = kernel(Wx, states, R, b)
File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\flashrnn.py:1049 in fn
states = FlashRNNFuncGeneratorFused(
File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\flashrnn.py:680 in FlashRNNFuncGeneratorFused
flashrnn_cuda = _FlashRNNCUDAFused.instance(config)
File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\flashrnn.py:394 in instance
gpu_info = get_gpu_info(device_id=torch.cuda.current_device())
File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\gpu_info\gpu_info.py:40 in get_gpu_info
gpu_info_cuda = _GPUInfoCUDA.instance()
File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\gpu_info\gpu_info.py:27 in instance
cls.mod = load(
File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\cuda_init.py:132 in load
mod = _load(name + suffix, sources, **myargs)
File C:\ProgramData\anaconda3\Lib\site-packages\torch\utils\cpp_extension.py:1681 in load
return _jit_compile(
File C:\ProgramData\anaconda3\Lib\site-packages\torch\utils\cpp_extension.py:2138 in _jit_compile
_write_ninja_file_and_build_library(
File C:\ProgramData\anaconda3\Lib\site-packages\torch\utils\cpp_extension.py:2290 in _write_ninja_file_and_build_library
_run_ninja_build(
File C:\ProgramData\anaconda3\Lib\site-packages\torch\utils\cpp_extension.py:2612 in _run_ninja_build
raise RuntimeError(message) from e
RuntimeError: Error building extension 'gpu_info2'
b'[1/3] C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\nvcc --generate-dependencies-with-compile --dependency-output gpu_info.cuda.o.d -Xcudafe --diag_suppress=dll_interface_conflict_dllexport_assumed -Xcudafe --diag_suppress=dll_interface_conflict_none_assumed -Xcudafe --diag_suppress=field_without_dll_interface -Xcudafe --diag_suppress=base_class_has_different_dll_interface -Xcompiler /EHsc -Xcompiler /wd4068 -Xcompiler /wd4067 -Xcompiler /wd4624 -Xcompiler /wd4190 -Xcompiler /wd4018 -Xcompiler /wd4275 -Xcompiler /wd4267 -Xcompiler /wd4244 -Xcompiler /wd4251 -Xcompiler /wd4819 -Xcompiler /MD -DTORCH_EXTENSION_NAME=gpu_info2 -DTORCH_API_INCLUDE_EXTENSION_H -IC:\ProgramData\anaconda3\Lib\site-packages\torch\include -IC:\ProgramData\anaconda3\Lib\site-packages\torch\include\torch\csrc\api\include "-IC:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include" -IC:\ProgramData\anaconda3\Include -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -std=c++17 -Xptxas="-v" -gencode arch=compute_80,code=compute_80 -res-usage --use_fast_math -O3 "-Xptxas -O3" --extra-device-vectorization -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT162_CONVERSIONS__ -IC:\ProgramData\anaconda3\include -IC:\ProgramData\anaconda3\include -c C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\gpu_info\gpu_info.cu -o gpu_info.cuda.o \r\n\x1b[31mFAILED: \x1b[0mgpu_info.cuda.o \r\nC:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\nvcc --generate-dependencies-with-compile --dependency-output gpu_info.cuda.o.d -Xcudafe --diag_suppress=dll_interface_conflict_dllexport_assumed -Xcudafe 
--diag_suppress=dll_interface_conflict_none_assumed -Xcudafe --diag_suppress=field_without_dll_interface -Xcudafe --diag_suppress=base_class_has_different_dll_interface -Xcompiler /EHsc -Xcompiler /wd4068 -Xcompiler /wd4067 -Xcompiler /wd4624 -Xcompiler /wd4190 -Xcompiler /wd4018 -Xcompiler /wd4275 -Xcompiler /wd4267 -Xcompiler /wd4244 -Xcompiler /wd4251 -Xcompiler /wd4819 -Xcompiler /MD -DTORCH_EXTENSION_NAME=gpu_info2 -DTORCH_API_INCLUDE_EXTENSION_H -IC:\ProgramData\anaconda3\Lib\site-packages\torch\include -IC:\ProgramData\anaconda3\Lib\site-packages\torch\include\torch\csrc\api\include "-IC:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include" -IC:\ProgramData\anaconda3\Include -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -std=c++17 -Xptxas="-v" -gencode arch=compute_80,code=compute_80 -res-usage --use_fast_math -O3 "-Xptxas -O3" --extra-device-vectorization -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT162_CONVERSIONS__ -IC:\ProgramData\anaconda3\include -IC:\ProgramData\anaconda3\include -c C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\gpu_info\gpu_info.cu -o gpu_info.cuda.o \r\nnvcc fatal : Unknown option '-Xptxas -O3'\r\r\n[2/3] cl /showIncludes -DTORCH_EXTENSION_NAME=gpu_info2 -DTORCH_API_INCLUDE_EXTENSION_H -IC:\ProgramData\anaconda3\Lib\site-packages\torch\include -IC:\ProgramData\anaconda3\Lib\site-packages\torch\include\torch\csrc\api\include "-IC:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include" -IC:\ProgramData\anaconda3\Include /std:c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ 
-U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT162_CONVERSIONS__ -IC:\ProgramData\anaconda3\include /MD /wd4819 /wd4251 /wd4244 /wd4267 /wd4275 /wd4018 /wd4190 /wd4624 /wd4067 /wd4068 /EHsc -c C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\gpu_info\gpu_info.cc /Fogpu_info.o \r\nMicrosoft (R) C/C++ Optimizing Compiler Version 19.41.34120 for x64\r\nCopyright (C) Microsoft Corporation. All rights reserved.\r\n\r\nninja: build stopped: subcommand failed.\r\n'