
No instruct dp #20

Open · wants to merge 8 commits into base: main

99 changes: 74 additions & 25 deletions cik_benchmark/baselines/direct_prompt.py
@@ -2,6 +2,7 @@
Direct prompt method

"""

import inspect
import logging
import numpy as np
@@ -96,23 +97,35 @@ def huggingface_instruct_model_client(

def constrained_decoding_regex(required_timestamps):
"""
Generates a regular expression to force the model output
to satisfy the required format and provide values for
all required timestamps

Generates a regex pattern for constrained decoding such that:
- <forecast> occurs at start (on its own line).
- For each required timestamp ts, the model must produce
(ts,NUMBER)
with NO extra whitespace, exactly as shown:
open paren, timestamp literal, comma, numeric value, close paren
- Then </forecast> at the end (on its own line).
"""
timestamp_regex = "".join(
[
r"\(\s*{}\s*,\s*[-+]?\d+(\.\d+)?\)\n".format(re.escape(ts))
for ts in required_timestamps
]
)
return r"<forecast>\n{}<\/forecast>".format(timestamp_regex)

# Build one pattern line per required timestamp:
# (YYYY-MM-DD HH:MM:SS,[-+]?\d{1,20}(?:\.\d{0,20})?)
# No spaces allowed anywhere, so everything is literally "fixed"
# except for the numeric portion.
lines = [
rf"\({re.escape(ts)},[-+]?\d{1,20}(?:\.\d{0,20})?\)"
for ts in required_timestamps
]

# Join lines with exactly one "\n".
body = r"\n".join(lines)

# Return the full pattern, ensuring a single newline
# after <forecast> and before </forecast>.
return rf"<forecast>\n{body}\n</forecast>"
Comment on lines +108 to +123

Collaborator:
Is there a need to replace the original function? This might break reproducibility.

Collaborator (Author):
The original function allowed for returns such as "\n \n \n \n \n \n \n ...", so I fixed it to this while working with Hymba. I think it would only break reproducibility if this code somehow fed into the RNG for the LLM or for the tasks, right?

Collaborator (Author):
@marcotet what are your thoughts?

Collaborator:
Hmm, all the results in our paper were produced with the previous code, so I guess it was fine; we didn't get any errors.

If you think it's better, it might be good to reproduce a model's results on CiK to confirm this doesn't break reproducibility. What do you think?

Collaborator:
Does mamba need this change?
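
To make the thread above concrete, here is a small, self-contained check (not part of the PR) comparing the two patterns on two hypothetical timestamps. It only assumes Python's standard re module; the timestamps and sample outputs are made up for illustration.

# Illustrative check only, not part of the diff.
import re

timestamps = ["2024-01-01 00:00:00", "2024-01-02 00:00:00"]

# Old pattern: the \s* groups tolerate arbitrary whitespace (including newlines)
# inside each (timestamp,value) tuple.
old_body = "".join(
    r"\(\s*{}\s*,\s*[-+]?\d+(\.\d+)?\)\n".format(re.escape(ts)) for ts in timestamps
)
old_pattern = r"<forecast>\n{}<\/forecast>".format(old_body)

# New pattern: everything is literal except the numeric value.
new_body = r"\n".join(
    rf"\({re.escape(ts)},[-+]?\d{{1,20}}(?:\.\d{{0,20}})?\)" for ts in timestamps
)
new_pattern = rf"<forecast>\n{new_body}\n</forecast>"

clean = "<forecast>\n(2024-01-01 00:00:00,3.5)\n(2024-01-02 00:00:00,-4)\n</forecast>"
padded = "<forecast>\n(\n2024-01-01 00:00:00 ,\n3.5)\n(2024-01-02 00:00:00,-4)\n</forecast>"

assert re.fullmatch(old_pattern, clean) and re.fullmatch(new_pattern, clean)
assert re.fullmatch(old_pattern, padded)          # old pattern accepts the padded output
assert re.fullmatch(new_pattern, padded) is None  # new pattern rejects it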


# Make generation pipeline
pipe = pipeline(
task="text-generation",
model=llm,
model=llm.cuda().to(torch.bfloat16),
tokenizer=tokenizer,
device_map="auto",
)
@@ -124,19 +137,55 @@ def constrained_decoding_regex(required_timestamps):
)
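
The prefix_function passed to the pipeline here is what enforces the regex during generation. As a rough sketch (not part of the diff), it could be built with lm-format-enforcer, which is listed in requirements.txt; the exact construction in the collapsed lines above may differ, and the call below assumes that library's transformers integration.

# Sketch only, assuming lm-format-enforcer's transformers integration;
# `tokenizer` and `required_timestamps` are taken from the surrounding code.
from lmformatenforcer import RegexParser
from lmformatenforcer.integrations.transformers import (
    build_transformers_prefix_allowed_tokens_fn,
)

parser = RegexParser(constrained_decoding_regex(required_timestamps))
prefix_function = build_transformers_prefix_allowed_tokens_fn(tokenizer, parser)
# At each decoding step, prefix_allowed_tokens_fn restricts sampling to tokens
# that can still complete the forecast pattern.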

# Now extract the assistant's reply
choices = []
for response in pipe(
[messages] * n,
max_length=max_tokens,
temperature=temperature,
prefix_allowed_tokens_fn=prefix_function,
batch_size=n,
):
# Create a message object
message = SimpleNamespace(content=response[0]["generated_text"][-1]["content"])
# Create a choice object
choice = SimpleNamespace(message=message)
choices.append(choice)

# TODO: verify
# If the pipeline's tokenizer has no chat template, concatenate the message
# contents into a plain prompt and complete that prompt with the model.
if getattr(pipe.tokenizer, "chat_template", None) is not None:
start_time = time.time()
choices = []
for response in pipe(
[messages] * n,
max_length=max_tokens,
temperature=temperature,
prefix_allowed_tokens_fn=prefix_function,
batch_size=n,
):
# Create a message object
message = SimpleNamespace(
content=response[0]["generated_text"][-1]["content"]
)
# Create a choice object
choice = SimpleNamespace(message=message)
choices.append(choice)
print(f"Time taken for completion: {time.time() - start_time}")

else:
# Get the context from the messages
context = ""
for message in messages:
context += message["content"] + " " # directly concatenate the context

# Generate completions
choices = []
start_time = time.time()
responses = pipe(
[context] * n,
temperature=temperature,
prefix_allowed_tokens_fn=prefix_function,
batch_size=n,
max_new_tokens=max_tokens,
)
print(f"Time taken for completion: {time.time() - start_time}")

for response in responses:
# Create a message object
message = SimpleNamespace(
content=response[0]["generated_text"][len(context) :].strip()
)
# Create a choice object
choice = SimpleNamespace(message=message)
choices.append(choice)

# Create a usage object (we can estimate tokens)
usage = SimpleNamespace(
67 changes: 67 additions & 0 deletions cik_benchmark/baselines/hf_utils/dp_hf_api.py
@@ -5,6 +5,10 @@
AutoTokenizer,
LlamaTokenizerFast,
MistralForCausalLM,
# MambaConfig,
# MambaForCausalLM,
# Mamba2Config,
# Mamba2ForCausalLM,
)
import torch
import gc
@@ -35,6 +39,29 @@
"Mistral-7B-Instruct-v0.3": "mistralai/Mistral-7B-Instruct-v0.3",
"falcon-7b-instruct": "tiiuae/falcon-7b-instruct",
"falcon-40b-instruct": "tiiuae/falcon-40b-instruct",
# SSMs
# "Hymba-1.5B-Base": "nvidia/Hymba-1.5B-Base", # needs FlexAttention installation
# "Hymba-1.5B-Instruct": "nvidia/Hymba-1.5B-Instruct", # needs FlexAttention installation
# "mamba-2.8B": "state-spaces/mamba-2.8b-hf",
# "mamba-1.4B": "state-spaces/mamba-1.4b-hf",
# "mamba-790m": "state-spaces/mamba-790m-hf",
# "mamba-370m": "state-spaces/mamba-370m-hf",
# "mamba-130m": "state-spaces/mamba-130m-hf",
# "mamba2-2.7B": "state-spaces/mamba2-2.7b",
# "mamba2-1.3B": "state-spaces/mamba2-1.3b",
# "mamba2-780m": "state-spaces/mamba2-780m",
# "mamba2-370m": "state-spaces/mamba2-370m",
# "mamba2-130m": "state-spaces/mamba2-130m",
# "Zamba-7B-v1": "Zyphra/Zamba-7B-v1",
# "Zamba2-7B": "Zyphra/Zamba2-7B",
# "Zamba2-2.7B": "Zyphra/Zamba2-2.7B",
# "Zamba2-1.2B": "Zyphra/Zamba2-1.2B",
# "Zamba2-7B-Instruct": "Zyphra/Zamba2-7B-Instruct",
# "Zamba2-2.7B-Instruct": "Zyphra/Zamba2-2.7B-Instruct",
# "Zamba2-1.2B-Instruct": "Zyphra/Zamba2-1.2B-Instruct",
# "Falcon3-Mamba-7B-Base": "tiiuae/Falcon3-Mamba-7B-Base",
# "Falcon3-Mamba-7B-Instruct": "tiiuae/Falcon3-Mamba-7B-Instruct",
# "Bamba-9B": "ibm-fms/Bamba-9B",
}


@@ -72,6 +99,16 @@ def get_tokenizer(llm_path, llm_type):
tokenizer = AutoTokenizer.from_pretrained(llm_path)
elif "falcon" in llm_type:
tokenizer = AutoTokenizer.from_pretrained(llm_path)
# elif "mamba-" in llm_type:
# tokenizer = AutoTokenizer.from_pretrained(llm_path)
# elif "Zamba2-" in llm_type:
# tokenizer = AutoTokenizer.from_pretrained(llm_path)
# elif "Zamba-" in llm_type:
# tokenizer = AutoTokenizer.from_pretrained(llm_path)
# elif "Hymba-" in llm_type:
# tokenizer = AutoTokenizer.from_pretrained(
# llm_path, trust_remote_code=True, parallelism="none"
# )
else:
assert False

@@ -133,6 +170,36 @@ def get_model_and_tokenizer(llm_path, llm_type):
model = MistralForCausalLM.from_pretrained(
llm_path, torch_dtype=torch.bfloat16, device_map="auto"
)
# elif "mamba-" in llm_type:
# model = MambaForCausalLM.from_pretrained(
# llm_path,
# device_map="auto",
# torch_dtype=torch.float16,
# )
# elif "mamba2-" in llm_type:
# model = Mamba2ForCausalLM.from_pretrained(
# llm_path,
# device_map="auto",
# torch_dtype=torch.float16,
# )
# elif "Zamba2-" in llm_type:
# model = AutoModelForCausalLM.from_pretrained(
# llm_path,
# device_map="auto",
# torch_dtype=torch.bfloat16,
# )
# elif "Zamba-" in llm_type:
# model = AutoModelForCausalLM.from_pretrained(
# llm_path,
# device_map="auto",
# torch_dtype=torch.bfloat16,
# )
# elif "Hymba-" in llm_type:
# model = AutoModelForCausalLM.from_pretrained(
# llm_path,
# trust_remote_code=True,
# )
# model = model.cuda().to(torch.bfloat16)
else:
assert False

14 changes: 11 additions & 3 deletions experiments/direct-prompt-models/qwen_7b_instruct_ctx_g2.json
@@ -1,4 +1,12 @@
[
{"label": "CC-Qwen-2.5-7B-Instruct (ctx)", "method": "directprompt", "llm": "qwen2.5-7B-Instruct", "use_context": true, "temperature": 1.0,
"batch_size_on_retry":10, "batch_size":10, "n_retries": 10}
]
{
"label": "CC-Qwen-2.5-7B-Instruct (ctx)",
"method": "directprompt",
"llm": "qwen2.5-7B-Instruct",
"use_context": true,
"temperature": 1.0,
"batch_size_on_retry": 10,
"batch_size": 10,
"n_retries": 10
}
]
7 changes: 5 additions & 2 deletions experiments/statistical-models/statsmodels_c40.json
@@ -1,3 +1,6 @@
[
{"label": "Statsmodels", "method": "statsmodels"}
]
{
"label": "Statsmodels",
"method": "statsmodels"
}
]
1 change: 1 addition & 0 deletions requirements.txt
@@ -19,5 +19,6 @@ termcolor
tenacity
h5py
transformers>4.4.1
tokenizers

Collaborator:
Was this necessary for mamba or something?

Collaborator (Author):
Yeah, I can remove it, though, since I didn't add the mamba models.

Collaborator:
Yeah, that'd be great cause it might break the already-fragile requirements otherwise :D Unless you tested that it didn't :)

sentencepiece
lm-format-enforcer
6 changes: 5 additions & 1 deletion run_baselines.py
@@ -10,8 +10,10 @@
import numpy as np
import pandas as pd


from collections import defaultdict
from pathlib import Path

from cik_benchmark.baselines.direct_prompt import DirectPrompt
from cik_benchmark.baselines.lag_llama import lag_llama
from cik_benchmark.baselines.chronos import ChronosForecaster
@@ -207,7 +209,9 @@ def experiment_directprompt(
dp_forecaster = DirectPrompt(
model=llm,
use_context=use_context,
token_cost=openai_costs[llm] if llm in openai_costs else {"input": 0.0, "output": 0.0}, # Cost only used for OpenAI models
token_cost=(
openai_costs[llm] if llm in openai_costs else {"input": 0.0, "output": 0.0}
), # Cost only used for OpenAI models
batch_size=batch_size,
batch_size_on_retry=batch_size_on_retry,
n_retries=n_retries,