
Commit 9664332: Fix comments 3

1 parent 589d9ee commit 9664332

2 files changed: +11 -6 lines


Diff for: samples/python/text_generation/limit_checker.py (+6 -5)
@@ -43,7 +43,7 @@ def retry_request(func, retries=5):
         "ServiceUnavailable",
         "InternalServerError"
     ]
-
+
     for attempt in range(retries):
         try:
             return func()
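The hunk above touches only whitespace inside `retry_request`, which retries a callable when the error looks transient. A minimal sketch of that pattern, assuming a name-matching policy against the `"ServiceUnavailable"`/`"InternalServerError"` list shown in the context lines (the `delay` parameter and `TransientError`-style matching by class name are illustrative assumptions, not the sample's exact logic):

```python
import time

# Exception class names treated as transient, per the list visible in the diff.
TRANSIENT_ERRORS = [
    "ServiceUnavailable",
    "InternalServerError",
]

def retry_request(func, retries=5, delay=0.0):
    """Call func(), retrying up to `retries` times on transient errors.

    `delay` is a hypothetical backoff knob added for illustration.
    """
    last_exc = None
    for attempt in range(retries):
        try:
            return func()
        except Exception as exc:
            # Treat the error as transient only if its class name is listed;
            # anything else propagates immediately.
            if type(exc).__name__ not in TRANSIENT_ERRORS:
                raise
            last_exc = exc
            time.sleep(delay)
    # All attempts failed with transient errors: surface the last one.
    raise last_exc
```

A non-transient exception is re-raised on the first attempt, so only the listed server-side failures consume retries.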
@@ -126,10 +126,10 @@ def run_and_write_metrics(model, prompt, generation_config, report_file):
     print(f"result length: {result_length}")
     print()

-    if args.report is not None:
-        with open(args.report, 'a') as f:
+    if report_file is not None:
+        with open(report_file, 'a') as f:
             csv_writer = csv.writer(f)
-            csv_writer.writerow([generation_length, result_length, pipeline_opt_metrics.avg_cache_usage, pipeline_opt_metrics.max_cache_usage, rss_usage_gb])
+            csv_writer.writerow([generation_config.max_new_tokens - 1, result_length, pipeline_opt_metrics.avg_cache_usage, pipeline_opt_metrics.max_cache_usage, rss_usage_gb])
     return pipeline_opt_metrics.max_cache_usage


@@ -194,6 +194,8 @@ def run_and_write_metrics(model, prompt, generation_config, report_file):
             break

         generation_length *= 2
+
+        del data_dict
     elif args.mode == "gen_throughput":
         dataset = load_samsum_dataset(args.data)
         prompt_throughput = 1
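The loop this hunk edits doubles `generation_length` each iteration until a limit check trips, which approximates the largest workload that fits. A sketch of that doubling search under stated assumptions (`fits` is a hypothetical predicate standing in for the sample's cache/memory checks; it is not a function from the sample):

```python
def find_limit(fits, start=1):
    """Double the workload until `fits` fails; return the last passing size.

    Returns None if even `start` does not fit.
    """
    length = start
    last_ok = None
    while fits(length):
        last_ok = length
        length *= 2
    return last_ok
```

The added `del data_dict` inside the loop (moved here from the end of the script in the second hunk below) releases the per-iteration dataset before the next, larger attempt allocates its own.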
@@ -236,5 +238,4 @@ def run_and_write_metrics(model, prompt, generation_config, report_file):
 
 
     print(f"Approximate highest throughput: {prompt_throughput} prompts")
-    del data_dict
 

Diff for: site/docs/concepts/optimization-techniques/kvcache-eviction-algorithm.md (+5 -1)
@@ -15,7 +15,7 @@ The KV cache for each sequence is divided into three logical areas:
 
 * Start Area: Initial tokens that are never evicted
 * Evictable Area: Tokens that can be evicted based on importance scores
-* Recent Area: Most recent tokens that are preserved (never evicted)
+* Recent Area: Most recent tokens that are preserved (not evicted while in this area, but naturally migrating toward the evictable area as the text generation goes on)
 
 The sizes of all three areas can be configured by modifying corresponding fields in a `CacheEvictionConfig` struct, which itself is a part of the pipeline-wide `SchedulerConfig`.
 As the generation starts, the blocks in respective logical areas are filled token-by-token, and once at least one block past the "recent" area is filled, eviction may take place.
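The three-area layout in the doc text above can be modeled with a small stand-in structure. This is a sketch only: the dataclass below is hypothetical and its field names follow the prose, not the actual `CacheEvictionConfig` fields in OpenVINO GenAI:

```python
from dataclasses import dataclass

@dataclass
class CacheEvictionConfigSketch:
    """Hypothetical model of the three logical KV-cache areas, in tokens."""
    start_size: int       # initial tokens, never evicted
    evictable_size: int   # tokens scored and evicted by importance
    recent_size: int      # newest tokens, preserved while in this area
    apply_rotation: bool = False  # re-rotate RoPE phases after eviction

    def max_cache_size(self) -> int:
        # Total logical KV cache budget per sequence is the sum of the areas.
        return self.start_size + self.evictable_size + self.recent_size

cfg = CacheEvictionConfigSketch(start_size=32, evictable_size=512,
                                recent_size=128)
```

As generation proceeds, tokens conceptually enter through the recent area and migrate into the evictable area, so only `evictable_size` worth of tokens is ever subject to importance-based eviction.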
@@ -55,4 +55,8 @@ This may impact the ability of the model to correctly recognize the relative pos
 Cache rotation seeks to alleviate this by "re-rotating" corresponding blocks so that the blocks that remain after each eviction are once again "continuous" in terms of the effective RoPE embedding.
 It can be enabled by setting the `CacheEvictionConfig.apply_rotation` field to `true` (default is `false`).
 
+## Current limitations
 
+* Cache rotation is only targeted for the regular, linear LLaMa-like RoPE application and may degrade accuracy on models that use other RoPE schemes.
+
+* Cache rotation is currently only supported for the models with uniform V embedding sizes across the layers.
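The "re-rotation" idea added to the doc can be illustrated on a single RoPE frequency. This is a toy, not the implementation: real cache rotation operates on per-head key embeddings across many frequencies, while the sketch below rotates one 2-D pair and assumes a linear phase `theta = position * freq`, matching the linear LLaMa-like RoPE the limitations section targets:

```python
import math

def rope(vec, pos, freq=0.1):
    """Apply a linear RoPE rotation for one frequency at position `pos`."""
    c, s = math.cos(pos * freq), math.sin(pos * freq)
    x, y = vec
    return (x * c - y * s, x * s + y * c)

def rerotate(vec, old_pos, new_pos, freq=0.1):
    """Shift an already-rotated vector from `old_pos` to `new_pos`.

    Because the phase is linear in position, undoing the old rotation and
    applying the new one collapses into a single rotation by the delta.
    """
    delta = (new_pos - old_pos) * freq
    c, s = math.cos(delta), math.sin(delta)
    x, y = vec
    return (x * c - y * s, x * s + y * c)
```

After eviction removes earlier tokens, a surviving token originally encoded at position 7 can be re-rotated to behave as position 3, restoring a continuous effective position sequence. The linearity assumption is also why non-linear RoPE schemes fall outside this approach, per the limitations above.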
