Skip to content
This repository was archived by the owner on Feb 24, 2026. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
branch = tilelang_codebase
[submodule "3rdparty/tilelang"]
path = 3rdparty/tilelang
url = https://github.com/tile-ai/tilelang
branch = bitblas
url = https://github.com/tile-ai/tilelang.git
[submodule "3rdparty/cutlass"]
path = 3rdparty/cutlass
url = https://github.com/tile-ai/cutlass
branch = tldev
url = https://github.com/NVIDIA/cutlass
2 changes: 1 addition & 1 deletion 3rdparty/cutlass
Submodule cutlass updated 2480 files
2 changes: 1 addition & 1 deletion 3rdparty/tilelang
Submodule tilelang updated 489 files
3 changes: 2 additions & 1 deletion bitblas/ops/operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import re

logger = logging.getLogger(__name__)
logger.setLevel(level=logging.DEBUG)

APPLY_SCHEDULE_FAILED_MESSAGE = ("Failed to apply default schedule for operator {} "
"With target {} and hint {}. \n"
Expand Down Expand Up @@ -193,7 +194,7 @@ def tvm_callback_hip_postproc(code, _):
rt_mod = tvm.build(self.scheduled_ir_module, target=target)
elif self.is_tilelang_backend():
rt_mod = tilelang.lower(
self.scheduled_ir_module, target=target, runtime_only=True)
self.prim_func, target=target, runtime_only=True, enable_host_codegen=True).rt_mod
else:
raise ValueError(f"Unsupported backend: {self.backend}")
except Exception as build_runtime_error: # noqa: F841
Expand Down
116 changes: 112 additions & 4 deletions bitblas/tl/tuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,13 +122,13 @@ def tvm_callback_cuda_postproc(code, _):
"tir.disable_cse_tir": True,
**(config.pass_context if config.pass_context else {})
}):
rt_mod = tilelang.lower(tl_prim_func, arch.target, runtime_only=True)
rt_mod = tilelang.lower(tl_prim_func, arch.target, runtime_only=True, enable_host_codegen=True)

from tvm.contrib.tar import tar # Import the tar module

artifact_path = os.path.join(tempfile.mkdtemp(), "tvm_tmp_mod." + tar.output_format)
code = rt_mod.imported_modules[0].get_source()
rt_mod.export_library(artifact_path, fcompile=tar)
code = rt_mod.rt_mod.imported_modules[0].get_source()
rt_mod.rt_mod.export_library(artifact_path, fcompile=tar)
return idx, code, artifact_path

# Use ThreadPoolExecutor for parallel execution
Expand Down Expand Up @@ -189,6 +189,114 @@ def tvm_callback_cuda_postproc(code, _):

return cpresults, best

def apply_and_build_serial(scheduler,
                           configs,
                           arch,
                           num_repeats=3,
                           timeout=60,
                           max_workers=1,
                           data_distribution="uniform") -> Tuple[List[CompileResult], CompileResult]:
    """Apply each tuning config to ``scheduler``, build, load and profile, all serially.

    For every config: (1) apply it to produce a scheduled IR module,
    (2) lower/build it via tilelang and export a tar artifact,
    (3) reload the artifact as a runtime module and time it, then return
    every successful result plus the fastest one.

    Args:
        scheduler: The schedulable object passed to ``_apply_config``.
        configs: Iterable of tuning configs; each may carry a ``pass_context``
            dict merged into the build's ``PassContext``.
        arch: Target architecture; ``arch.target`` is used for lowering and
            ``arch.device`` for timing.
        num_repeats: Number of repeats for the TVM time evaluator.
        timeout: Unused here.
            NOTE(review): accepted only for signature parity with the
            parallel variant — confirm before relying on it.
        max_workers: Unused here (serial execution).
            NOTE(review): kept for signature parity — confirm.
        data_distribution: Forwarded to ``CompileResult.profile``.

    Returns:
        ``(cpresults, best)`` where ``cpresults`` is the list of successfully
        built ``CompileResult`` objects and ``best`` is the one with the
        lowest measured latency (``None`` if nothing profiled successfully).
    """
    cpresults = []

    # Process each config serially
    _scheduled_ir_modules: List[Schedule] = []

    # Apply one config; on failure log at debug level and record None so
    # list positions stay aligned with `configs` indices.
    def _submit_config(f, c, a):
        try:
            scheduled_ir_module = _apply_config(f, c, a)
        except Exception as apply_schedule_error:
            logger.debug("Apply schedule failed: {}".format(apply_schedule_error))
            scheduled_ir_module = None
        return scheduled_ir_module

    # Apply config one by one in serial
    for config in configs:
        _scheduled_ir_modules.append(_submit_config(scheduler, config, arch))

    # Build in serial
    def _build(context):
        # context is (index into configs, scheduled module or None, arch)
        idx, mod, arch = context
        if mod is None:
            # Scheduling failed earlier; propagate the index with no artifact.
            return idx, None, None

        config = configs[idx]
        assert config is not None

        # Globally override TVM's CUDA post-processing hook so generated
        # source gets the dp4a / make_int4 / make_int2 rewrites applied.
        @tvm.register_func(func_name="tvm_callback_cuda_postproc", override=True)
        def tvm_callback_cuda_postproc(code, _):
            code = tensor_replace_dp4a(code)
            code = tensor_remove_make_int4(code)
            code = tensor_remove_make_int2(code)
            return code

        # Check only have one function in the module
        if len(mod.functions) > 1:
            raise ValueError("Only support one function in the module")

        tl_prim_func = list(mod.functions.values())[0]

        # Build inside a PassContext that enables async copy, disables TIR
        # CSE, and layers on any per-config pass options.
        with tvm.transform.PassContext(
                config={
                    "tir.use_async_copy": True,
                    "tir.disable_cse_tir": True,
                    **(config.pass_context if config.pass_context else {})
                }):
            rt_mod = tilelang.lower(
                tl_prim_func, arch.target, runtime_only=True, enable_host_codegen=True)

        from tvm.contrib.tar import tar  # Import the tar module

        # Export the built module to a temp tar so it can be reloaded below.
        artifact_path = os.path.join(tempfile.mkdtemp(), "tvm_tmp_mod." + tar.output_format)
        code = rt_mod.kernel_source
        rt_mod.rt_mod.export_library(artifact_path, fcompile=tar)
        return idx, code, artifact_path

    # Build each module one by one in serial
    for idx, mod in enumerate(_scheduled_ir_modules):
        try:
            idx, code, artifact_path = _build((idx, mod, arch))
            ir_module = _scheduled_ir_modules[idx]
            config = configs[idx]

            if artifact_path is None:
                # Scheduling or build produced nothing for this config; skip it.
                ARTIFACT_NOT_FOUND = f"Apply config {config} failed, artifact path is None"
                logger.error(ARTIFACT_NOT_FOUND)
                continue

            # Reload the exported artifact and attach a time evaluator for
            # later profiling.
            rt_mod = tvm.runtime.load_module(artifact_path)
            cpresult = CompileResult(config, tvm.tir.Schedule(ir_module), rt_mod)
            timer_cuda_mod = rt_mod.time_evaluator(
                rt_mod.entry_name, arch.device, number=num_repeats)
            cpresult.time_evaluator = timer_cuda_mod
            cpresult.code = code
            cpresults.append(cpresult)

        except Exception as e:
            # Truncate very long build errors to keep logs readable.
            local_build_error = str(e)
            if len(local_build_error) > MAX_ERROR_MESSAGE_LENGTH:
                local_build_error = (
                    local_build_error[:MAX_ERROR_MESSAGE_LENGTH] + "\t...\t" +
                    local_build_error[-MAX_ERROR_MESSAGE_LENGTH:])
            logger.error(f"An exception occurred for hint {config}: {local_build_error}")

    # Profile every successful build and track the lowest latency.
    best = None
    best_latency = 1e9
    for cpresult in cpresults:
        config = cpresult.config
        try:
            latency = cpresult.profile(data_distribution=data_distribution)
        except Exception as e_mesg:
            # A config may build but still fail at runtime; skip it.
            logger.debug(f"Evaluation with config failed {e_mesg}")
            continue
        logger.info("Evaluation with config {}".format(config))
        logger.info("Time cost of this config: {:.3f} ms".format(latency))

        cpresult.latency = latency
        if latency < best_latency:
            best_latency = latency
            best = cpresult

    return cpresults, best


def apply_and_build(
scheduler,
Expand All @@ -198,5 +306,5 @@ def apply_and_build(
data_distribution="uniform",
) -> Tuple[List[CompileResult], CompileResult]:
max_workers = 10 if parallel_build else 1
return apply_and_build_parallel(
return apply_and_build_serial(
scheduler, configs, arch, max_workers=max_workers, data_distribution=data_distribution)
10 changes: 7 additions & 3 deletions install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,12 @@ if [ $? -ne 0 ]; then
exit 1
fi

# Use roughly 75% of the available cores for parallel make, but never
# fewer than 1: on a single-core machine the integer division
# CORES * 75 / 100 truncates to 0 and `make -j0` is rejected by GNU make.
CORES=$(nproc)
MAKE_JOBS=$(( CORES * 75 / 100 ))
if [ "$MAKE_JOBS" -lt 1 ]; then
    MAKE_JOBS=1
fi
echo "Using $MAKE_JOBS jobs for make..."

echo "Building TVM with make..."
make -j${MAKE_JOBS}
if [ $? -ne 0 ]; then
echo "Error: TVM build failed."
exit 1
Expand All @@ -134,7 +138,7 @@ if [ $? -ne 0 ]; then
exit 1
fi

make -j
make -j${MAKE_JOBS}
if [ $? -ne 0 ]; then
echo "Error: TileLang build failed."
exit 1
Expand Down Expand Up @@ -185,4 +189,4 @@ else
fi

# Reload ~/.bashrc to apply the changes
source ~/.bashrc
source ~/.bashrc
6 changes: 4 additions & 2 deletions install_amd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ cp cmake/config.cmake build
cd build
echo "set(USE_LLVM llvm-config-16)" >> config.cmake && echo "set(USE_ROCM /opt/rocm)" >> config.cmake

cmake .. && make -j && cd ../../..
# Use roughly 75% of the available cores for parallel make, clamped to a
# minimum of 1 so that single-core machines don't produce `make -j0`,
# which GNU make rejects as a non-positive job count.
CORES=$(nproc)
MAKE_JOBS=$(( CORES * 75 / 100 ))
if [ "$MAKE_JOBS" -lt 1 ]; then
    MAKE_JOBS=1
fi
cmake .. && make -j${MAKE_JOBS} && cd ../../..

TVM_PREBUILD_PATH=$(realpath .)

Expand All @@ -77,7 +79,7 @@ if [ $? -ne 0 ]; then
exit 1
fi

make -j
make -j${MAKE_JOBS}
if [ $? -ne 0 ]; then
echo "Error: TileLang build failed."
exit 1
Expand Down
Loading