berkeley-function-call-leaderboard/README.md (5 changes: 4 additions & 1 deletion)

@@ -150,6 +150,9 @@ You can provide multiple models or test categories by separating them with commas:

```bash
bfcl generate --model claude-3-5-sonnet-20241022-FC,gpt-4o-2024-11-20-FC --test-category simple,parallel,multiple,multi_turn
+
+
+bfcl generate --model moonshotai/Kimi-K2-Instruct-FC --num-threads 6
```

#### Selecting Specific Test Cases with `--run-ids`
@@ -242,7 +245,7 @@ VLLM_PORT=1053
For those who prefer using script execution instead of the CLI, you can run the following command:

```bash
-python -m bfcl_eval.openfunctions_evaluation --model MODEL_NAME --test-category TEST_CATEGORY
+python -m bfcl_eval.openfunctions_evaluation --model moonshotai/Kimi-K2-Instruct-FC --test-category TEST_CATEGORY
```

When specifying multiple models or test categories, separate them with **spaces**, not commas. All other flags mentioned earlier are compatible with the script execution method as well.
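For example, a sketch of a space-separated invocation (reusing model and category names that appear elsewhere in this PR; the exact pairing is illustrative):

```bash
python -m bfcl_eval.openfunctions_evaluation --model moonshotai/Kimi-K2-Instruct-FC gpt-4o-2024-11-20-FC --test-category simple parallel
```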
berkeley-function-call-leaderboard/bfcl_eval/.env.example (5 changes: 4 additions & 1 deletion)

@@ -32,10 +32,13 @@ NOVITA_API_KEY=sk-XXXXXX
# We use the API key from Alipay for inference with Bailing (Ling) models (see https://zxb.alipay.com/llm/landing)
LING_API_KEY=sk-XXXXXX

+# [OPTIONAL] For inference via the Together AI endpoint
+TOGETHER_API_KEY=
+
# [OPTIONAL] For local vllm/sglang server configuration
# Defaults to localhost port 1053 if not provided
VLLM_ENDPOINT=localhost
VLLM_PORT=1053

# [OPTIONAL] For WandB logging of the generated .csv, in the format 'entity:project'
-WANDB_BFCL_PROJECT=ENTITY:PROJECT
\ No newline at end of file
+WANDB_BFCL_PROJECT=ENTITY:PROJECT
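The new key is read with `os.getenv("TOGETHER_API_KEY")` in the `TogetherHandler` added later in this diff, so it can also be supplied from the shell instead of `.env`; a minimal sketch with a placeholder value:

```bash
export TOGETHER_API_KEY=sk-XXXXXX  # placeholder, same format as the other keys in this file
bfcl generate --model moonshotai/Kimi-K2-Instruct-FC --test-category simple
```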
berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py (path inferred)

@@ -7,6 +7,7 @@
from bfcl_eval.model_handler.api_inference.deepseek import DeepSeekAPIHandler
from bfcl_eval.model_handler.api_inference.dm_cito import DMCitoHandler
from bfcl_eval.model_handler.api_inference.fireworks import FireworksHandler
+from bfcl_eval.model_handler.api_inference.together import TogetherHandler
from bfcl_eval.model_handler.api_inference.functionary import FunctionaryHandler
from bfcl_eval.model_handler.api_inference.gemini import GeminiHandler
from bfcl_eval.model_handler.api_inference.gogoagent import GoGoAgentHandler
@@ -2034,6 +2035,18 @@ class ModelConfig:
        is_fc_model=True,
        underscore_to_dot=False,
    ),
+    "moonshotai/Kimi-K2-Instruct-FC": ModelConfig(
+        model_name="moonshotai/Kimi-K2-Instruct",
+        display_name="Kimi-K2-Instruct",
+        url="https://huggingface.co/moonshotai/Kimi-K2-Instruct",
+        org="moonshotai",
+        license="Modified MIT",
+        model_handler=TogetherHandler,
+        input_price=None,
+        output_price=None,
+        is_fc_model=True,
+        underscore_to_dot=False,
+    ),
}


(supported-models list; file path not captured)

@@ -167,4 +167,6 @@
"katanemo/Arch-Agent-3B",
"katanemo/Arch-Agent-7B",
"katanemo/Arch-Agent-32B"
"katanemo/Arch-Agent-32B",
"moonshotai/Kimi-K2-Instruct-FC"
]
berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/together.py (new file; path matches the TogetherHandler import above)

@@ -0,0 +1,51 @@
import os

from bfcl_eval.model_handler.api_inference.openai_completion import OpenAICompletionsHandler
from bfcl_eval.model_handler.model_style import ModelStyle
from openai import OpenAI


class TogetherHandler(OpenAICompletionsHandler):
    def __init__(self, model_name, temperature) -> None:
        super().__init__(model_name, temperature)
        self.model_style = ModelStyle.TOGETHER_AI
        # Together AI exposes an OpenAI-compatible API, so the stock OpenAI
        # client is simply pointed at the Together endpoint.
        self.client = OpenAI(
            base_url="https://api.together.xyz/v1",
            api_key=os.getenv("TOGETHER_API_KEY"),
        )

    #### FC methods ####

    def _query_FC(self, inference_data: dict):
        message: list[dict] = inference_data["message"]
        tools = inference_data["tools"]
        inference_data["inference_input_log"] = {
            "message": repr(message),
            "tools": tools,
        }

        # The -FC / -together suffixes are leaderboard bookkeeping, not part of
        # the model name Together expects, so they are stripped before the call.
        if len(tools) > 0:
            return self.generate_with_backoff(
                messages=message,
                model=self.model_name.replace("-FC", "").replace("-together", ""),
                temperature=self.temperature,
                tools=tools,
            )
        else:
            return self.generate_with_backoff(
                messages=message,
                model=self.model_name.replace("-FC", "").replace("-together", ""),
                temperature=self.temperature,
            )

    #### Prompting methods ####

    def _query_prompting(self, inference_data: dict):
        inference_data["inference_input_log"] = {"message": repr(inference_data["message"])}

        return self.generate_with_backoff(
            messages=inference_data["message"],
            model=self.model_name.replace("-together", ""),
            temperature=self.temperature,
        )
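As a quick sanity check on the naming convention, a hypothetical snippet (BFCL normally constructs handlers from the ModelConfig registry rather than directly, and the temperature value here is only illustrative):

```python
import os

from bfcl_eval.model_handler.api_inference.together import TogetherHandler

os.environ.setdefault("TOGETHER_API_KEY", "sk-XXXXXX")  # placeholder key

handler = TogetherHandler("moonshotai/Kimi-K2-Instruct-FC", temperature=0.001)

# The leaderboard key carries an -FC suffix; the query methods strip it
# before calling Together's OpenAI-compatible endpoint.
assert handler.model_name.replace("-FC", "") == "moonshotai/Kimi-K2-Instruct"
```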
berkeley-function-call-leaderboard/bfcl_eval/model_handler/model_style.py

@@ -11,6 +11,7 @@ class ModelStyle(Enum):
GOOGLE = "google"
AMAZON = "amazon"
FIREWORK_AI = "firework_ai"
TOGETHER_AI = "together_ai"
NEXUS = "nexus"
OSSMODEL = "ossmodel"
COHERE = "cohere"
berkeley-function-call-leaderboard/bfcl_eval/model_handler/utils.py (path inferred)

@@ -163,6 +163,7 @@ def convert_to_tool(functions, mapping, model_style):
        ModelStyle.WRITER,
        ModelStyle.AMAZON,
        ModelStyle.NOVITA_AI,
+        ModelStyle.TOGETHER_AI,
    ]:
        item[
            "description"

@@ -189,6 +190,7 @@
        ModelStyle.FIREWORK_AI,
        ModelStyle.WRITER,
        ModelStyle.NOVITA_AI,
+        ModelStyle.TOGETHER_AI,
    ]:
        oai_tool.append({"type": "function", "function": item})
    elif model_style == ModelStyle.AMAZON:
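With `TOGETHER_AI` added to both branches, Together-served models take the OpenAI-style tool path in `convert_to_tool`; a sketch of the resulting tool shape (the `get_weather` function and its schema are invented for illustration, not from this PR):

```python
# Illustrative element of oai_tool after the OpenAI-style branch runs;
# the function name and parameters are made up for this example.
example_tool = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Return the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}
```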