Skip to content
This repository was archived by the owner on Feb 24, 2026. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
branch = tilelang_codebase
[submodule "3rdparty/tilelang"]
path = 3rdparty/tilelang
url = https://github.com/tile-ai/tilelang
branch = bitblas
url = https://github.com/tile-ai/tilelang.git
[submodule "3rdparty/cutlass"]
path = 3rdparty/cutlass
url = https://github.com/tile-ai/cutlass
branch = tldev
url = https://github.com/NVIDIA/cutlass
2 changes: 1 addition & 1 deletion 3rdparty/cutlass
Submodule cutlass updated 2480 files
2 changes: 1 addition & 1 deletion 3rdparty/tilelang
Submodule tilelang updated 489 files
3 changes: 2 additions & 1 deletion bitblas/ops/operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import re

logger = logging.getLogger(__name__)
logger.setLevel(level=logging.DEBUG)

APPLY_SCHEDULE_FAILED_MESSAGE = ("Failed to apply default schedule for operator {} "
"With target {} and hint {}. \n"
Expand Down Expand Up @@ -193,7 +194,7 @@ def tvm_callback_hip_postproc(code, _):
rt_mod = tvm.build(self.scheduled_ir_module, target=target)
elif self.is_tilelang_backend():
rt_mod = tilelang.lower(
self.scheduled_ir_module, target=target, runtime_only=True)
self.prim_func, target=target, runtime_only=True, enable_host_codegen=True).rt_mod
else:
raise ValueError(f"Unsupported backend: {self.backend}")
except Exception as build_runtime_error: # noqa: F841
Expand Down
116 changes: 112 additions & 4 deletions bitblas/tl/tuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,13 +122,13 @@ def tvm_callback_cuda_postproc(code, _):
"tir.disable_cse_tir": True,
**(config.pass_context if config.pass_context else {})
}):
rt_mod = tilelang.lower(tl_prim_func, arch.target, runtime_only=True)
rt_mod = tilelang.lower(tl_prim_func, arch.target, runtime_only=True, enable_host_codegen=True)

from tvm.contrib.tar import tar # Import the tar module

artifact_path = os.path.join(tempfile.mkdtemp(), "tvm_tmp_mod." + tar.output_format)
code = rt_mod.imported_modules[0].get_source()
rt_mod.export_library(artifact_path, fcompile=tar)
code = rt_mod.rt_mod.imported_modules[0].get_source()
rt_mod.rt_mod.export_library(artifact_path, fcompile=tar)
return idx, code, artifact_path

# Use ThreadPoolExecutor for parallel execution
Expand Down Expand Up @@ -189,6 +189,114 @@ def tvm_callback_cuda_postproc(code, _):

return cpresults, best

def apply_and_build_serial(scheduler,
                           configs,
                           arch,
                           num_repeats=3,
                           timeout=60,
                           max_workers=1,
                           data_distribution="uniform") -> Tuple[List[CompileResult], CompileResult]:
    """Apply each tuning config to ``scheduler``, build, load and profile, all serially.

    For every config: (1) apply it to produce a scheduled IR module,
    (2) lower/build it via tilelang and export a tar artifact,
    (3) reload the artifact as a runtime module and time it, then return
    every successful result plus the fastest one.

    Args:
        scheduler: The schedulable object passed to ``_apply_config``.
        configs: Iterable of tuning configs; each may carry a ``pass_context``
            dict merged into the build's ``PassContext``.
        arch: Target architecture; ``arch.target`` is used for lowering and
            ``arch.device`` for timing.
        num_repeats: Number of repeats for the TVM time evaluator.
        timeout: Unused here.
            NOTE(review): accepted only for signature parity with the
            parallel variant — confirm before relying on it.
        max_workers: Unused here (serial execution).
            NOTE(review): kept for signature parity — confirm.
        data_distribution: Forwarded to ``CompileResult.profile``.

    Returns:
        ``(cpresults, best)`` where ``cpresults`` is the list of successfully
        built ``CompileResult`` objects and ``best`` is the one with the
        lowest measured latency (``None`` if nothing profiled successfully).
    """
    cpresults = []

    # Process each config serially
    _scheduled_ir_modules: List[Schedule] = []

    # Apply one config; on failure log at debug level and record None so
    # list positions stay aligned with `configs` indices.
    def _submit_config(f, c, a):
        try:
            scheduled_ir_module = _apply_config(f, c, a)
        except Exception as apply_schedule_error:
            logger.debug("Apply schedule failed: {}".format(apply_schedule_error))
            scheduled_ir_module = None
        return scheduled_ir_module

    # Apply config one by one in serial
    for config in configs:
        _scheduled_ir_modules.append(_submit_config(scheduler, config, arch))

    # Build in serial
    def _build(context):
        # context is (index into configs, scheduled module or None, arch)
        idx, mod, arch = context
        if mod is None:
            # Scheduling failed earlier; propagate the index with no artifact.
            return idx, None, None

        config = configs[idx]
        assert config is not None

        # Globally override TVM's CUDA post-processing hook so generated
        # source gets the dp4a / make_int4 / make_int2 rewrites applied.
        @tvm.register_func(func_name="tvm_callback_cuda_postproc", override=True)
        def tvm_callback_cuda_postproc(code, _):
            code = tensor_replace_dp4a(code)
            code = tensor_remove_make_int4(code)
            code = tensor_remove_make_int2(code)
            return code

        # Check only have one function in the module
        if len(mod.functions) > 1:
            raise ValueError("Only support one function in the module")

        tl_prim_func = list(mod.functions.values())[0]

        # Build inside a PassContext that enables async copy, disables TIR
        # CSE, and layers on any per-config pass options.
        with tvm.transform.PassContext(
                config={
                    "tir.use_async_copy": True,
                    "tir.disable_cse_tir": True,
                    **(config.pass_context if config.pass_context else {})
                }):
            rt_mod = tilelang.lower(
                tl_prim_func, arch.target, runtime_only=True, enable_host_codegen=True)

        from tvm.contrib.tar import tar  # Import the tar module

        # Export the built module to a temp tar so it can be reloaded below.
        artifact_path = os.path.join(tempfile.mkdtemp(), "tvm_tmp_mod." + tar.output_format)
        code = rt_mod.kernel_source
        rt_mod.rt_mod.export_library(artifact_path, fcompile=tar)
        return idx, code, artifact_path

    # Build each module one by one in serial
    for idx, mod in enumerate(_scheduled_ir_modules):
        try:
            idx, code, artifact_path = _build((idx, mod, arch))
            ir_module = _scheduled_ir_modules[idx]
            config = configs[idx]

            if artifact_path is None:
                # Scheduling or build produced nothing for this config; skip it.
                ARTIFACT_NOT_FOUND = f"Apply config {config} failed, artifact path is None"
                logger.error(ARTIFACT_NOT_FOUND)
                continue

            # Reload the exported artifact and attach a time evaluator for
            # later profiling.
            rt_mod = tvm.runtime.load_module(artifact_path)
            cpresult = CompileResult(config, tvm.tir.Schedule(ir_module), rt_mod)
            timer_cuda_mod = rt_mod.time_evaluator(
                rt_mod.entry_name, arch.device, number=num_repeats)
            cpresult.time_evaluator = timer_cuda_mod
            cpresult.code = code
            cpresults.append(cpresult)

        except Exception as e:
            # Truncate very long build errors to keep logs readable.
            local_build_error = str(e)
            if len(local_build_error) > MAX_ERROR_MESSAGE_LENGTH:
                local_build_error = (
                    local_build_error[:MAX_ERROR_MESSAGE_LENGTH] + "\t...\t" +
                    local_build_error[-MAX_ERROR_MESSAGE_LENGTH:])
            logger.error(f"An exception occurred for hint {config}: {local_build_error}")

    # Profile every successful build and track the lowest latency.
    best = None
    best_latency = 1e9
    for cpresult in cpresults:
        config = cpresult.config
        try:
            latency = cpresult.profile(data_distribution=data_distribution)
        except Exception as e_mesg:
            # A config may build but still fail at runtime; skip it.
            logger.debug(f"Evaluation with config failed {e_mesg}")
            continue
        logger.info("Evaluation with config {}".format(config))
        logger.info("Time cost of this config: {:.3f} ms".format(latency))

        cpresult.latency = latency
        if latency < best_latency:
            best_latency = latency
            best = cpresult

    return cpresults, best


def apply_and_build(
scheduler,
Expand All @@ -198,5 +306,5 @@ def apply_and_build(
data_distribution="uniform",
) -> Tuple[List[CompileResult], CompileResult]:
max_workers = 10 if parallel_build else 1
return apply_and_build_parallel(
return apply_and_build_serial(
scheduler, configs, arch, max_workers=max_workers, data_distribution=data_distribution)
10 changes: 7 additions & 3 deletions install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,12 @@ if [ $? -ne 0 ]; then
exit 1
fi

# Use roughly 75% of the available cores for parallel make, but never
# fewer than 1: on a single-core machine the integer division
# CORES * 75 / 100 truncates to 0 and `make -j0` is rejected by GNU make.
CORES=$(nproc)
MAKE_JOBS=$(( CORES * 75 / 100 ))
if [ "$MAKE_JOBS" -lt 1 ]; then
    MAKE_JOBS=1
fi
echo "Using $MAKE_JOBS jobs for make..."

echo "Building TVM with make..."
make -j${MAKE_JOBS}
if [ $? -ne 0 ]; then
echo "Error: TVM build failed."
exit 1
Expand All @@ -134,7 +138,7 @@ if [ $? -ne 0 ]; then
exit 1
fi

make -j
make -j${MAKE_JOBS}
if [ $? -ne 0 ]; then
echo "Error: TileLang build failed."
exit 1
Expand Down Expand Up @@ -185,4 +189,4 @@ else
fi

# Reload ~/.bashrc to apply the changes
source ~/.bashrc
source ~/.bashrc
6 changes: 4 additions & 2 deletions install_amd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ cp cmake/config.cmake build
cd build
echo "set(USE_LLVM llvm-config-16)" >> config.cmake && echo "set(USE_ROCM /opt/rocm)" >> config.cmake

cmake .. && make -j && cd ../../..
# Use roughly 75% of the available cores for parallel make, clamped to a
# minimum of 1 so that single-core machines don't produce `make -j0`,
# which GNU make rejects as a non-positive job count.
CORES=$(nproc)
MAKE_JOBS=$(( CORES * 75 / 100 ))
if [ "$MAKE_JOBS" -lt 1 ]; then
    MAKE_JOBS=1
fi
cmake .. && make -j${MAKE_JOBS} && cd ../../..

TVM_PREBUILD_PATH=$(realpath .)

Expand All @@ -77,7 +79,7 @@ if [ $? -ne 0 ]; then
exit 1
fi

make -j
make -j${MAKE_JOBS}
if [ $? -ne 0 ]; then
echo "Error: TileLang build failed."
exit 1
Expand Down
Loading