fix: OpenAI base_url default and reasoning effort model option.
#271
Changes from 8 commits
```diff
@@ -72,7 +72,7 @@ class OpenAIBackend(FormatterBackend, AdapterMixin):
     def __init__(
         self,
-        model_id: str | ModelIdentifier = model_ids.IBM_GRANITE_4_MICRO_3B,
+        model_id: str | ModelIdentifier = model_ids.OPENAI_GPT_5_1,
         formatter: Formatter | None = None,
         base_url: str | None = None,
         model_options: dict | None = None,
```
```diff
@@ -152,16 +152,21 @@ def __init__(
         )
         self._hf_model_id = model_id.hf_model_name
 
-        if base_url is None:
-            self._base_url = "http://localhost:11434/v1"  # ollama
+        if api_key is None:
+            FancyLogger.get_logger().warning(
+                "You are using an OpenAI backend with no api_key. Because no API key was provided, mellea assumes you intend to use the openai-compatible interface to your local ollama instance. If you intend to use OpenAI's platform you must specify your API key when instantiating your Mellea session/backend object."
+            )
+            self._base_url: str | None = "http://localhost:11434/v1"  # ollama
+            self._api_key = "ollama"
         else:
             self._base_url = base_url
+            self._api_key = api_key
 
-        self._server_type = _server_type(self._base_url)
+        self._server_type: _ServerType = (
+            _server_type(self._base_url)
+            if self._base_url is not None
+            else _ServerType.OPENAI
+        )  # type: ignore
 
         self._openai_client_kwargs = self.filter_openai_client_kwargs(**kwargs)
```
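With this change, omitting api_key routes the backend to a local Ollama endpoint, while a supplied key with no base_url resolves to the OpenAI platform. A minimal usage sketch under that reading; the import path and the no-argument constructor call are assumptions, not taken from the diff:

```python
from mellea.backends.openai import OpenAIBackend  # import path assumed

# No api_key: falls back to the local Ollama OpenAI-compatible server
# (base_url "http://localhost:11434/v1", api_key "ollama") and logs a warning.
local_backend = OpenAIBackend()

# api_key provided but no base_url: _base_url stays None, the server type
# resolves to _ServerType.OPENAI, and the OpenAI platform is targeted.
hosted_backend = OpenAIBackend(api_key="sk-...")  # placeholder key
```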
```diff
@@ -598,14 +603,38 @@ async def _generate_from_chat_context_standard(
         extra_params: dict[str, Any] = {}
         if _format is not None:
-            extra_params["response_format"] = {
-                "type": "json_schema",
-                "json_schema": {
-                    "name": _format.__name__,
-                    "schema": _format.model_json_schema(),
-                    "strict": True,
-                },
-            }
+            if self._server_type == _ServerType.OPENAI:
+                # The OpenAI platform requires that additionalProperties=False on all response_format schemas.
+                # However, not all schemas generated by Mellea include additionalProperties.
+                # GenerativeSlot, in particular, does not add this property.
+                # The easiest way to address this disparity between OpenAI and other inference providers is to
+                # monkey-patch the response format exactly when we are actually using the OpenAI server.
+                #
+                # This only addresses the additionalProperties=False constraint.
+                # Other constraints we should be checking/patching are described here:
+                # https://platform.openai.com/docs/guides/structured-outputs?api-mode=chat
+                monkey_patched_response_schema = _format.model_json_schema()
+                monkey_patched_response_schema["additionalProperties"] = False
+                extra_params["response_format"] = {
+                    "type": "json_schema",
+                    "json_schema": {
+                        "name": _format.__name__,
+                        "schema": monkey_patched_response_schema,
+                        "strict": True,
+                    },
+                }
+            else:
+                FancyLogger.get_logger().warning(
+                    "Mellea assumes you are NOT using the OpenAI platform, and that other model providers have less strict requirements on the JSON schemas passed into `format=`. If you encounter a server-side error following this message, then you found an exception to this assumption. Please open an issue at github.com/generative_computing/mellea with this stack trace and your inference engine / model provider."
+                )
+                extra_params["response_format"] = {
+                    "type": "json_schema",
+                    "json_schema": {
+                        "name": _format.__name__,
+                        "schema": _format.model_json_schema(),
+                        "strict": True,
+                    },
+                }
 
         # Append tool call information if applicable.
         tools: dict[str, Callable] = dict()
```
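The same constraint can be reproduced outside Mellea with a plain Pydantic model: model_json_schema() does not emit additionalProperties, while OpenAI's strict structured-output mode requires it to be false. A standalone sketch of the patch applied above (the Sentiment model is just an illustrative stand-in):

```python
from pydantic import BaseModel


class Sentiment(BaseModel):
    label: str
    score: float


schema = Sentiment.model_json_schema()
# Pydantic omits additionalProperties by default; OpenAI's strict json_schema
# response_format rejects object schemas that leave it unset.
schema["additionalProperties"] = False

response_format = {
    "type": "json_schema",
    "json_schema": {"name": Sentiment.__name__, "schema": schema, "strict": True},
}
```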
```diff
@@ -631,15 +660,20 @@ async def _generate_from_chat_context_standard(
         formatted_tools = convert_tools_to_json(tools)
         use_tools = len(formatted_tools) > 0
 
+        # Build optional reasoning parameters
+        reasoning_params = {}
+        if thinking is not None:
+            reasoning_params["reasoning_effort"] = thinking
```
Comment on lines 671 to 676

Contributor: Please also add a comment noting that OpenAI doesn't like it when non-reasoning models get reasoning parameters.

Contributor (Author): done!
```diff
         chat_response: Coroutine[
             Any, Any, ChatCompletion | openai.AsyncStream[ChatCompletionChunk]
         ] = self._async_client.chat.completions.create(
             model=self._hf_model_id,
             messages=conversation,  # type: ignore
-            reasoning_effort=thinking,  # type: ignore
             tools=formatted_tools if use_tools else None,  # type: ignore
             # parallel_tool_calls=False, # We only support calling one tool per turn. But we do the choosing on our side so we leave this False.
             **extra_params,
+            **reasoning_params,  # type: ignore
             **self._make_backend_specific_and_remove(
                 model_opts, is_chat_context=ctx.is_chat_context
             ),
```
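Spreading reasoning_params (rather than always passing reasoning_effort=None) matters because, per the review note above, OpenAI rejects reasoning parameters sent to non-reasoning models. A rough sketch of the same pattern against the raw OpenAI SDK; the model name, key, and the source of the thinking value are placeholders:

```python
import openai

client = openai.OpenAI(api_key="sk-...")  # placeholder key

thinking = "low"  # e.g. pulled from the caller's model options; may be None
reasoning_params: dict = {}
if thinking is not None:
    # Only send reasoning_effort when explicitly requested; models without
    # reasoning support reject requests that include the parameter at all.
    reasoning_params["reasoning_effort"] = thinking

response = client.chat.completions.create(
    model="gpt-5.1",  # placeholder model name
    messages=[{"role": "user", "content": "Hello"}],
    **reasoning_params,
)
```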
```diff
@@ -976,6 +1010,9 @@ def apply_chat_template(self, chat: list[dict[str, str]]):
         from transformers import AutoTokenizer
 
         if not hasattr(self, "_tokenizer"):
+            assert self._base_url, (
+                "The OpenAI Platform does not support adapters. You must specify a base_url when using adapters."
+            )
             match _server_type(self._base_url):
                 case _ServerType.LOCALHOST:
                     self._tokenizer: "PreTrainedTokenizer" = (  # noqa: UP037
```
I think we are close to defaults that make sense here. If the user specifies a base_url, we should always use that base_url (even if no api_key is set). I also wonder if we should default the api_key to `ollama` in those situations. Otherwise, we have no way to target arbitrary localhost ports that don't require an api_key.

For example (and this isn't the best example since it uses LiteLLM, and we have a separate backend for that), LiteLLM has a proxy that you can run locally. The proxy stores the api_key information itself, so you can target an arbitrary localhost port without an api_key.

My proposed solution would be to set the parameter defaults to the Ollama values (i.e. `api_key="ollama"` and `base_url="http://localhost:11434/v1"`). Then users can override these values. I think this would also allow users to explicitly set api_key / base_url to `None` and have the underlying OpenAI SDK still automatically pick up their env vars (without the risk of users accidentally incurring expenses).
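A sketch of that proposal (not what this PR implements; parameters other than base_url and api_key are omitted for brevity):

```python
# Reviewer's proposed defaults: target the local Ollama server out of the box
# and let callers override either value, including setting them to None so the
# OpenAI SDK falls back to OPENAI_BASE_URL / OPENAI_API_KEY from the environment.
def __init__(
    self,
    base_url: str | None = "http://localhost:11434/v1",  # ollama default
    api_key: str | None = "ollama",
    **kwargs,
):
    self._base_url = base_url
    self._api_key = api_key
```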
Consensus: just pass the args through to the OpenAI SDK. Don't do argument handling such as this.
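A sketch of that consensus approach (an illustration of how pass-through could look, not code from this PR):

```python
import openai


def make_clients(base_url: str | None, api_key: str | None, **client_kwargs):
    # Forward the values unchanged; the OpenAI SDK resolves None on its own by
    # reading OPENAI_BASE_URL / OPENAI_API_KEY from the environment.
    sync_client = openai.OpenAI(base_url=base_url, api_key=api_key, **client_kwargs)
    async_client = openai.AsyncOpenAI(base_url=base_url, api_key=api_key, **client_kwargs)
    return sync_client, async_client
```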