Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions models/deepseek/v4/attention_swa.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,13 +524,6 @@ def init_wo_b():
specs=build_tensor_specs(),
golden_fn=golden_attention_swa,
config=RunConfig(
# qkv_proj_rope and sparse_attn both use W8A8/BF16 stages; the
# random KV-cache fixture exercises a less diluted attention output
# than the previous all-zero cache.
# x_out uses ratio_allclose with the standard W8A8 attention
# tolerance (atol=1e-4, rtol=1/128, 0.5% outlier allowance);
# the RunConfig defaults below only apply to any other outputs
# that lack a custom comparator.
rtol=1e-2,
atol=1e-2,
compare_fn={
Expand Down
9 changes: 7 additions & 2 deletions models/deepseek/v4/compressor_ratio128.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ def init_kv_cache():

if __name__ == "__main__":
import argparse
from golden import RunConfig, bf16_allclose_or_ulp, run_jit
from golden import RunConfig, bf16_allclose_or_ulp, ratio_allclose, run_jit

parser = argparse.ArgumentParser()
parser.add_argument("-p", "--platform", type=str, default="a2a3",
Expand All @@ -420,7 +420,12 @@ def init_kv_cache():
device_id=args.device,
enable_l2_swimlane=args.enable_l2_swimlane,
),
compare_fn={"kv_cache": bf16_allclose_or_ulp()},
compare_fn={
"kv": ratio_allclose(atol=1e-4, rtol=1.0 / 128, max_error_ratio=0.0),
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The PR summary states that the kv outputs for ratio 4/128 fall just outside strict allclose (~0.085% and ~0.39% bad points) but are within the 0.5% outlier allowance. However, the code here explicitly sets max_error_ratio=0.0, which enforces strict allclose and will cause the validation to fail for these kernels. You should remove the max_error_ratio=0.0 argument for the kv output to allow the default 0.5% outlier allowance.

Suggested change
"kv": ratio_allclose(atol=1e-4, rtol=1.0 / 128, max_error_ratio=0.0),
"kv": ratio_allclose(atol=1e-4, rtol=1.0 / 128),

"kv_state": ratio_allclose(atol=1e-3, rtol=1e-3, max_error_ratio=0.0),
"score_state": ratio_allclose(atol=1e-3, rtol=1e-3, max_error_ratio=0.0),
"kv_cache": bf16_allclose_or_ulp(),
},
),
)
if not result.passed:
Expand Down
9 changes: 7 additions & 2 deletions models/deepseek/v4/compressor_ratio4.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ def init_kv_cache():

if __name__ == "__main__":
import argparse
from golden import RunConfig, bf16_allclose_or_ulp, run_jit
from golden import RunConfig, bf16_allclose_or_ulp, ratio_allclose, run_jit

parser = argparse.ArgumentParser()
parser.add_argument("-p", "--platform", type=str, default="a2a3",
Expand All @@ -449,7 +449,12 @@ def init_kv_cache():
device_id=args.device,
enable_l2_swimlane=args.enable_l2_swimlane,
),
compare_fn={"kv_cache": bf16_allclose_or_ulp()},
compare_fn={
"kv": ratio_allclose(atol=1e-4, rtol=1.0 / 128, max_error_ratio=0.0),
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The PR summary states that the kv outputs for ratio 4/128 fall just outside strict allclose (~0.085% and ~0.39% bad points) but are within the 0.5% outlier allowance. However, the code here explicitly sets max_error_ratio=0.0, which enforces strict allclose and will cause the validation to fail for these kernels. You should remove the max_error_ratio=0.0 argument for the kv output to allow the default 0.5% outlier allowance.

Suggested change
"kv": ratio_allclose(atol=1e-4, rtol=1.0 / 128, max_error_ratio=0.0),
"kv": ratio_allclose(atol=1e-4, rtol=1.0 / 128),

"kv_state": ratio_allclose(atol=1e-3, rtol=1e-3, max_error_ratio=0.0),
"score_state": ratio_allclose(atol=1e-3, rtol=1e-3, max_error_ratio=0.0),
"kv_cache": bf16_allclose_or_ulp(),
},
),
)
if not result.passed:
Expand Down
7 changes: 6 additions & 1 deletion models/deepseek/v4/hc_pre.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ def init_hc_base():

if __name__ == "__main__":
import argparse
from golden import RunConfig, run_jit
from golden import RunConfig, ratio_allclose, run_jit

parser = argparse.ArgumentParser()
parser.add_argument("-p", "--platform", type=str, default="a2a3",
Expand All @@ -366,6 +366,11 @@ def init_hc_base():
config=RunConfig(
rtol=1e-3,
atol=1e-3,
compare_fn={
"x_mixed": ratio_allclose(atol=1e-4, rtol=1.0 / 128),
"post": ratio_allclose(atol=2.5e-5, rtol=5e-3),
"comb": ratio_allclose(atol=2.5e-5, rtol=5e-3),
},
compile=dict(dump_passes=True),
runtime=dict(
platform=args.platform,
Expand Down
5 changes: 3 additions & 2 deletions models/deepseek/v4/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,7 +614,7 @@ def init_idx_kv_cache():

if __name__ == "__main__":
import argparse
from golden import RunConfig, run_jit, bf16_allclose_or_ulp, topk_pair_compare
from golden import RunConfig, bf16_allclose_or_ulp, ratio_allclose, run_jit, topk_pair_compare

parser = argparse.ArgumentParser()
parser.add_argument("-p", "--platform", type=str, default="a2a3",
Expand All @@ -632,7 +632,8 @@ def init_idx_kv_cache():
atol=1e-3,
compile=dict(dump_passes=True),
compare_fn={
"topk_idxs": topk_pair_compare("score"),
"score": ratio_allclose(atol=1e-4, rtol=1.0 / 128),
"topk_idxs": topk_pair_compare("score"),
"idx_kv_cache": bf16_allclose_or_ulp(),
},
runtime=dict(
Expand Down
9 changes: 7 additions & 2 deletions models/deepseek/v4/indexer_compressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ def init_kv_cache():

if __name__ == "__main__":
import argparse
from golden import RunConfig, bf16_allclose_or_ulp, run_jit
from golden import RunConfig, bf16_allclose_or_ulp, ratio_allclose, run_jit

parser = argparse.ArgumentParser()
parser.add_argument("-p", "--platform", type=str, default="a2a3",
Expand All @@ -449,7 +449,12 @@ def init_kv_cache():
device_id=args.device,
enable_l2_swimlane=args.enable_l2_swimlane,
),
compare_fn={"kv_cache": bf16_allclose_or_ulp()},
compare_fn={
"kv": ratio_allclose(atol=1e-4, rtol=1.0 / 128, max_error_ratio=0.0),
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The PR summary states that the kv outputs for ratio 4/128 fall just outside strict allclose (~0.085% and ~0.39% bad points) but are within the 0.5% outlier allowance. However, the code here explicitly sets max_error_ratio=0.0, which enforces strict allclose and will cause the validation to fail for these kernels. You should remove the max_error_ratio=0.0 argument for the kv output to allow the default 0.5% outlier allowance.

Suggested change
"kv": ratio_allclose(atol=1e-4, rtol=1.0 / 128, max_error_ratio=0.0),
"kv": ratio_allclose(atol=1e-4, rtol=1.0 / 128),

"kv_state": ratio_allclose(atol=1e-3, rtol=1e-3, max_error_ratio=0.0),
"score_state": ratio_allclose(atol=1e-3, rtol=1e-3, max_error_ratio=0.0),
"kv_cache": bf16_allclose_or_ulp(),
},
),
)
if not result.passed:
Expand Down
17 changes: 7 additions & 10 deletions models/deepseek/v4/qkv_proj_rope.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,15 +502,7 @@ def init_gamma_ckv():

if __name__ == "__main__":
import argparse
from golden import RunConfig, run_jit

def int8_lsb_compare(actual, expected, actual_outputs, expected_outputs, inputs, rtol, atol):
import torch

diff = torch.abs(actual.to(torch.int16) - expected.to(torch.int16))
if torch.max(diff) <= 1:
return True, ""
return False, "max INT8 diff > 1"
from golden import RunConfig, ratio_allclose, run_jit

parser = argparse.ArgumentParser()
parser.add_argument("-p", "--platform", type=str, default="a2a3",
Expand All @@ -527,7 +519,12 @@ def int8_lsb_compare(actual, expected, actual_outputs, expected_outputs, inputs,
# W8A8C16 q_proj adds INT8 quant/dequant round-off before per-head RMSNorm.
rtol=5e-3,
atol=5e-3,
compare_fn={"qr": int8_lsb_compare},
compare_fn={
"q": ratio_allclose(atol=1e-4, rtol=1.0 / 128),
"kv": ratio_allclose(atol=1e-4, rtol=1.0 / 128),
"qr": ratio_allclose(atol=1, rtol=0, max_error_ratio=0),
"qr_scale": ratio_allclose(atol=2.5e-5, rtol=5e-3),
},
compile=dict(dump_passes=True),
runtime=dict(
platform=args.platform,
Expand Down
5 changes: 4 additions & 1 deletion models/deepseek/v4/sparse_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ def init_wo_b_scale():

if __name__ == "__main__":
import argparse
from golden import RunConfig, run_jit
from golden import RunConfig, ratio_allclose, run_jit

parser = argparse.ArgumentParser()
parser.add_argument("-p", "--platform", type=str, default="a2a3",
Expand All @@ -652,6 +652,9 @@ def init_wo_b_scale():
config=RunConfig(
rtol=1e-3,
atol=1e-3,
compare_fn={
"attn_out": ratio_allclose(atol=1e-4, rtol=1.0 / 128),
},
compile=dict(dump_passes=True),
runtime=dict(
platform=args.platform,
Expand Down
Loading