Skip to content

flashrnn outputs error around 'gpu_info2' #6

@roman8ivanov

Description

@roman8ivanov

Hi.

I use CUDA 12.8, pytorch 2.8.0, triton 3.4 on Windows. I tried:

import torch
from flashrnn import flashrnn

device = torch.device('cuda')
dtype = torch.bfloat16
B = 8 # batch size
T = 1024 # sequence length
N = 3 # number of heads
D = 256 # head dimension
G = 4 # number of gates / pre-activations for LSTM example
S = 2 # number of states

Wx = torch.randn([B, T, G, N, D], device=device, dtype=dtype, requires_grad=True)
R = torch.randn([G, N, D, D], device=device, dtype=dtype, requires_grad=True)
b = torch.randn([G, N, D], device=device, dtype=dtype, requires_grad=True)
states_initial = torch.randn([S, B, 1, N, D], device=device, dtype=dtype, requires_grad=True)

Available functions:

lstm, gru, elman, slstm

Available backends:

cuda_fused, cuda, triton, and vanilla

states, last_states = flashrnn(Wx, R, b, states=states_initial, function="lstm", backend="cuda_fused")

I am getting the following error when running:
states, last_states = flashrnn(Wx, R, b, states=states_initial, function="lstm", backend="cuda_fused")
Traceback (most recent call last):

File C:\ProgramData\anaconda3\Lib\site-packages\torch\utils\cpp_extension.py:2595 in _run_ninja_build
subprocess.run(

File C:\ProgramData\anaconda3\Lib\subprocess.py:571 in run
raise CalledProcessError(retcode, process.args,

CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):

Cell In[17], line 1
states, last_states = flashrnn(Wx, R, b, states=states_initial, function="lstm", backend="cuda_fused")

File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\flashrnn.py:1134 in flashrnn
h, last_h = kernel(Wx, states, R, b)

File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\flashrnn.py:1049 in fn
states = FlashRNNFuncGeneratorFused(

File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\flashrnn.py:680 in FlashRNNFuncGeneratorFused
flashrnn_cuda = _FlashRNNCUDAFused.instance(config)

File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\flashrnn.py:394 in instance
gpu_info = get_gpu_info(device_id=torch.cuda.current_device())

File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\gpu_info\gpu_info.py:40 in get_gpu_info
gpu_info_cuda = _GPUInfoCUDA.instance()

File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\gpu_info\gpu_info.py:27 in instance
cls.mod = load(

File C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\cuda_init.py:132 in load
mod = _load(name + suffix, sources, **myargs)

File C:\ProgramData\anaconda3\Lib\site-packages\torch\utils\cpp_extension.py:1681 in load
return _jit_compile(

File C:\ProgramData\anaconda3\Lib\site-packages\torch\utils\cpp_extension.py:2138 in _jit_compile
_write_ninja_file_and_build_library(

File C:\ProgramData\anaconda3\Lib\site-packages\torch\utils\cpp_extension.py:2290 in _write_ninja_file_and_build_library
_run_ninja_build(

File C:\ProgramData\anaconda3\Lib\site-packages\torch\utils\cpp_extension.py:2612 in _run_ninja_build
raise RuntimeError(message) from e

RuntimeError: Error building extension 'gpu_info2'

b'[1/3] C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\nvcc --generate-dependencies-with-compile --dependency-output gpu_info.cuda.o.d -Xcudafe --diag_suppress=dll_interface_conflict_dllexport_assumed -Xcudafe --diag_suppress=dll_interface_conflict_none_assumed -Xcudafe --diag_suppress=field_without_dll_interface -Xcudafe --diag_suppress=base_class_has_different_dll_interface -Xcompiler /EHsc -Xcompiler /wd4068 -Xcompiler /wd4067 -Xcompiler /wd4624 -Xcompiler /wd4190 -Xcompiler /wd4018 -Xcompiler /wd4275 -Xcompiler /wd4267 -Xcompiler /wd4244 -Xcompiler /wd4251 -Xcompiler /wd4819 -Xcompiler /MD -DTORCH_EXTENSION_NAME=gpu_info2 -DTORCH_API_INCLUDE_EXTENSION_H -IC:\ProgramData\anaconda3\Lib\site-packages\torch\include -IC:\ProgramData\anaconda3\Lib\site-packages\torch\include\torch\csrc\api\include "-IC:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include" -IC:\ProgramData\anaconda3\Include -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -std=c++17 -Xptxas="-v" -gencode arch=compute_80,code=compute_80 -res-usage --use_fast_math -O3 "-Xptxas -O3" --extra-device-vectorization -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT162_CONVERSIONS__ -IC:\ProgramData\anaconda3\include -IC:\ProgramData\anaconda3\include -c C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\gpu_info\gpu_info.cu -o gpu_info.cuda.o \r\n\x1b[31mFAILED: \x1b[0mgpu_info.cuda.o \r\nC:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\nvcc --generate-dependencies-with-compile --dependency-output gpu_info.cuda.o.d -Xcudafe --diag_suppress=dll_interface_conflict_dllexport_assumed -Xcudafe 
--diag_suppress=dll_interface_conflict_none_assumed -Xcudafe --diag_suppress=field_without_dll_interface -Xcudafe --diag_suppress=base_class_has_different_dll_interface -Xcompiler /EHsc -Xcompiler /wd4068 -Xcompiler /wd4067 -Xcompiler /wd4624 -Xcompiler /wd4190 -Xcompiler /wd4018 -Xcompiler /wd4275 -Xcompiler /wd4267 -Xcompiler /wd4244 -Xcompiler /wd4251 -Xcompiler /wd4819 -Xcompiler /MD -DTORCH_EXTENSION_NAME=gpu_info2 -DTORCH_API_INCLUDE_EXTENSION_H -IC:\ProgramData\anaconda3\Lib\site-packages\torch\include -IC:\ProgramData\anaconda3\Lib\site-packages\torch\include\torch\csrc\api\include "-IC:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include" -IC:\ProgramData\anaconda3\Include -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90 -std=c++17 -Xptxas="-v" -gencode arch=compute_80,code=compute_80 -res-usage --use_fast_math -O3 "-Xptxas -O3" --extra-device-vectorization -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT162_CONVERSIONS__ -IC:\ProgramData\anaconda3\include -IC:\ProgramData\anaconda3\include -c C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\gpu_info\gpu_info.cu -o gpu_info.cuda.o \r\nnvcc fatal : Unknown option '-Xptxas -O3'\r\r\n[2/3] cl /showIncludes -DTORCH_EXTENSION_NAME=gpu_info2 -DTORCH_API_INCLUDE_EXTENSION_H -IC:\ProgramData\anaconda3\Lib\site-packages\torch\include -IC:\ProgramData\anaconda3\Lib\site-packages\torch\include\torch\csrc\api\include "-IC:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include" -IC:\ProgramData\anaconda3\Include /std:c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ 
-U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT162_CONVERSIONS__ -IC:\ProgramData\anaconda3\include /MD /wd4819 /wd4251 /wd4244 /wd4267 /wd4275 /wd4018 /wd4190 /wd4624 /wd4067 /wd4068 /EHsc -c C:\ProgramData\anaconda3\Lib\site-packages\flashrnn\flashrnn\gpu_info\gpu_info.cc /Fogpu_info.o \r\nMicrosoft (R) C/C++ Optimizing Compiler Version 19.41.34120 for x64\r\nCopyright (C) Microsoft Corporation. All rights reserved.\r\n\r\nninja: build stopped: subcommand failed.\r\n'

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions