Support /v1/responses for OpenAI models. #9795
* Support /v1/responses for OpenAI models. #9795 * Address CodeRabbit review feedback on OpenAI provider. - Preserve exception chains with 'raise ... from e' in all exception handlers for better debugging tracebacks. - Use f-string !s conversion instead of str() calls. - Extract duplicated max_tokens error handling into a shared _raise_max_tokens_error() helper method. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Validate api_url and use incomplete_details from Responses API. - Strip known endpoint suffixes (/chat/completions, /responses) from api_url in __init__ to prevent doubled paths if a user provides a full endpoint URL instead of a base URL. - Use incomplete_details.reason from the Responses API to properly distinguish between max_output_tokens and content_filter when the response status is 'incomplete', in both the non-streaming and streaming parsers. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>pull/9767/head^2
parent
2228575564
commit
9bb96360dd
|
|
@ -47,7 +47,9 @@ class OpenAIClient(LLMClient):
|
|||
OpenAI GPT API client.
|
||||
|
||||
Implements the LLMClient interface for OpenAI's GPT models
|
||||
and any OpenAI-compatible API endpoint.
|
||||
and any OpenAI-compatible API endpoint. Supports both the
|
||||
Chat Completions API (/v1/chat/completions) and the Responses
|
||||
API (/v1/responses) for newer models that require it.
|
||||
"""
|
||||
|
||||
def __init__(self, api_key: Optional[str] = None,
|
||||
|
|
@ -67,7 +69,20 @@ class OpenAIClient(LLMClient):
|
|||
self._api_key = api_key or ''
|
||||
self._model = model or DEFAULT_MODEL
|
||||
base_url = (api_url or DEFAULT_API_BASE_URL).rstrip('/')
|
||||
self._api_url = f'{base_url}/chat/completions'
|
||||
# Strip known endpoint suffixes in case the user provided a full URL
|
||||
for suffix in ('/chat/completions', '/responses'):
|
||||
if base_url.endswith(suffix):
|
||||
base_url = base_url[:-len(suffix)].rstrip('/')
|
||||
break
|
||||
self._base_url = base_url
|
||||
self._use_responses_api = False
|
||||
|
||||
@property
def _api_url(self) -> str:
    """Return the endpoint URL matching the API currently in use.

    Switches between the Responses endpoint and the Chat
    Completions endpoint based on the fallback flag set when a
    model rejects /chat/completions.
    """
    suffix = '/responses' if self._use_responses_api else '/chat/completions'
    return f'{self._base_url}{suffix}'
|
||||
|
||||
@property
|
||||
def provider_name(self) -> str:
|
||||
|
|
@ -81,7 +96,9 @@ class OpenAIClient(LLMClient):
|
|||
"""Check if the client is properly configured."""
|
||||
# API key is required for the default OpenAI endpoint, but optional
|
||||
# for custom endpoints (e.g., local LLM servers).
|
||||
if self._api_url.startswith(DEFAULT_API_BASE_URL):
|
||||
if self._base_url.rstrip('/').startswith(
|
||||
DEFAULT_API_BASE_URL.rstrip('/')
|
||||
):
|
||||
return bool(self._api_key)
|
||||
return True
|
||||
|
||||
|
|
@ -109,10 +126,72 @@ class OpenAIClient(LLMClient):
|
|||
Raises:
|
||||
LLMClientError: If the request fails.
|
||||
"""
|
||||
# Build the request payload
|
||||
if self._use_responses_api:
|
||||
return self._chat_responses(
|
||||
messages, tools, system_prompt, max_tokens
|
||||
)
|
||||
|
||||
# Try Chat Completions API first
|
||||
payload = self._build_chat_payload(
|
||||
messages, tools, system_prompt, max_tokens
|
||||
)
|
||||
|
||||
try:
|
||||
response_data = self._make_request(payload)
|
||||
return self._parse_response(response_data)
|
||||
except LLMClientError as e:
|
||||
if self._should_use_responses_api(e):
|
||||
self._use_responses_api = True
|
||||
return self._chat_responses(
|
||||
messages, tools, system_prompt, max_tokens
|
||||
)
|
||||
raise
|
||||
except Exception as e:
|
||||
raise LLMClientError(LLMError(
|
||||
message=f"Request failed: {e!s}",
|
||||
provider=self.provider_name
|
||||
)) from e
|
||||
|
||||
def _chat_responses(
    self,
    messages: list[Message],
    tools: Optional[list[Tool]] = None,
    system_prompt: Optional[str] = None,
    max_tokens: int = 4096,
) -> LLMResponse:
    """Send a chat request using the Responses API.

    Builds a Responses API payload from the conversation, performs
    the HTTP request, and parses the result. LLMClientError passes
    through untouched; any other failure is wrapped with the
    original exception chained as the cause.
    """
    request_body = self._build_responses_payload(
        messages, tools, system_prompt, max_tokens
    )

    try:
        raw = self._make_request(request_body)
        return self._parse_responses_response(raw)
    except LLMClientError:
        # Already carries provider/code context -- propagate as-is.
        raise
    except Exception as e:
        raise LLMClientError(LLMError(
            message=f"Request failed: {e!s}",
            provider=self.provider_name
        )) from e
|
||||
|
||||
def _should_use_responses_api(self, error: LLMClientError) -> bool:
    """Check if the error indicates we should use the Responses API.

    Matches the error text against phrases OpenAI returns when a
    model only works through /v1/responses.
    """
    text = str(error).lower()
    markers = (
        'v1/responses',
        'not supported in the v1/chat/completions',
        'not a chat model',
    )
    return any(marker in text for marker in markers)
|
||||
|
||||
def _build_chat_payload(
|
||||
self,
|
||||
messages: list[Message],
|
||||
tools: Optional[list[Tool]],
|
||||
system_prompt: Optional[str],
|
||||
max_tokens: int
|
||||
) -> dict:
|
||||
"""Build payload for the Chat Completions API."""
|
||||
converted_messages = self._convert_messages(messages)
|
||||
|
||||
# Add system prompt at the beginning if provided
|
||||
if system_prompt:
|
||||
converted_messages.insert(0, {
|
||||
'role': 'system',
|
||||
|
|
@ -129,20 +208,35 @@ class OpenAIClient(LLMClient):
|
|||
payload['tools'] = self._convert_tools(tools)
|
||||
payload['tool_choice'] = 'auto'
|
||||
|
||||
# Make the API request
|
||||
try:
|
||||
response_data = self._make_request(payload)
|
||||
return self._parse_response(response_data)
|
||||
except LLMClientError:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise LLMClientError(LLMError(
|
||||
message=f"Request failed: {str(e)}",
|
||||
provider=self.provider_name
|
||||
))
|
||||
return payload
|
||||
|
||||
def _build_responses_payload(
    self,
    messages: list[Message],
    tools: Optional[list[Tool]],
    system_prompt: Optional[str],
    max_tokens: int
) -> dict:
    """Build payload for the Responses API.

    Messages become 'input' items, the system prompt maps to the
    top-level 'instructions' field, and tools (when present) use
    the flat Responses tool schema with automatic tool choice.
    """
    body: dict = {
        'model': self._model,
        'input': self._convert_messages_responses(messages),
        'max_output_tokens': max_tokens,
    }

    if system_prompt:
        body['instructions'] = system_prompt

    if tools:
        body['tools'] = self._convert_tools_responses(tools)
        body['tool_choice'] = 'auto'

    return body
|
||||
|
||||
def _convert_messages(self, messages: list[Message]) -> list[dict]:
|
||||
"""Convert Message objects to OpenAI API format."""
|
||||
"""Convert Message objects to OpenAI Chat Completions API format."""
|
||||
result = []
|
||||
|
||||
for msg in messages:
|
||||
|
|
@ -191,8 +285,53 @@ class OpenAIClient(LLMClient):
|
|||
|
||||
return result
|
||||
|
||||
def _convert_messages_responses(
    self, messages: list[Message]
) -> list[dict]:
    """Convert Message objects to OpenAI Responses API format.

    System text maps to the 'developer' role, assistant tool calls
    and tool results become standalone 'function_call' /
    'function_call_output' items as the Responses API requires.
    """
    items: list[dict] = []

    for message in messages:
        role = message.role

        if role == Role.SYSTEM:
            # Responses API uses 'developer' for system-level text.
            items.append({
                'role': 'developer',
                'content': message.content
            })

        elif role == Role.USER:
            items.append({
                'role': 'user',
                'content': message.content
            })

        elif role == Role.ASSISTANT:
            if message.content:
                items.append({
                    'role': 'assistant',
                    'content': message.content
                })
            # Tool calls are separate items in the Responses API,
            # with arguments serialized back to a JSON string.
            for tc in (message.tool_calls or []):
                items.append({
                    'type': 'function_call',
                    'call_id': tc.id,
                    'name': tc.name,
                    'arguments': json.dumps(tc.arguments)
                })

        elif role == Role.TOOL:
            for tr in message.tool_results:
                items.append({
                    'type': 'function_call_output',
                    'call_id': tr.tool_call_id,
                    'output': tr.content
                })

    return items
|
||||
|
||||
def _convert_tools(self, tools: list[Tool]) -> list[dict]:
|
||||
"""Convert Tool objects to OpenAI API format."""
|
||||
"""Convert Tool objects to Chat Completions API format."""
|
||||
return [
|
||||
{
|
||||
'type': 'function',
|
||||
|
|
@ -205,6 +344,18 @@ class OpenAIClient(LLMClient):
|
|||
for tool in tools
|
||||
]
|
||||
|
||||
def _convert_tools_responses(self, tools: list[Tool]) -> list[dict]:
    """Convert Tool objects to Responses API format.

    Unlike the Chat Completions schema, the Responses schema is
    flat: name/description/parameters sit directly beside 'type'
    rather than under a nested 'function' object.
    """
    converted = []
    for tool in tools:
        converted.append({
            'type': 'function',
            'name': tool.name,
            'description': tool.description,
            'parameters': tool.parameters
        })
    return converted
|
||||
|
||||
def _make_request(self, payload: dict) -> dict:
|
||||
"""Make an HTTP request to the OpenAI API."""
|
||||
headers = {
|
||||
|
|
@ -255,8 +406,24 @@ class OpenAIClient(LLMClient):
|
|||
retryable=True
|
||||
))
|
||||
|
||||
def _raise_max_tokens_error(self, input_tokens: int):
    """Raise an error when a response is truncated due to token limit.

    Shared helper so the Chat Completions and Responses parsers
    report token-limit truncation with an identical message.

    Args:
        input_tokens: Prompt token count reported by the API,
            included in the message so the user can gauge how
            oversized the request was.

    Raises:
        LLMClientError: Always. Marked non-retryable because
            resending the same oversized request cannot succeed.
    """
    raise LLMClientError(LLMError(
        message=f'Response truncated due to token limit '
                f'(input: {input_tokens} tokens). '
                f'The request is too large for model '
                f'{self._model}. '
                f'Try using a model with a larger context '
                f'window, or analyze a smaller scope (e.g., a '
                f'specific schema instead of the entire '
                f'database).',
        code='max_tokens',
        provider=self.provider_name,
        retryable=False
    ))
|
||||
|
||||
def _parse_response(self, data: dict) -> LLMResponse:
|
||||
"""Parse the OpenAI API response into an LLMResponse."""
|
||||
"""Parse the Chat Completions API response into an LLMResponse."""
|
||||
# Check for API-level errors in the response
|
||||
if 'error' in data:
|
||||
error_info = data['error']
|
||||
|
|
@ -325,20 +492,7 @@ class OpenAIClient(LLMClient):
|
|||
# Check for problematic responses
|
||||
if not content and not tool_calls:
|
||||
if stop_reason == StopReason.MAX_TOKENS:
|
||||
input_tokens = usage.input_tokens
|
||||
raise LLMClientError(LLMError(
|
||||
message=f'Response truncated due to token limit '
|
||||
f'(input: {input_tokens} tokens). '
|
||||
f'The request is too large for model '
|
||||
f'{self._model}. '
|
||||
f'Try using a model with a larger context '
|
||||
f'window, or analyze a smaller scope (e.g., a '
|
||||
f'specific schema instead of the entire '
|
||||
f'database).',
|
||||
code='max_tokens',
|
||||
provider=self.provider_name,
|
||||
retryable=False
|
||||
))
|
||||
self._raise_max_tokens_error(usage.input_tokens)
|
||||
elif finish_reason and finish_reason not in ('stop', 'tool_calls'):
|
||||
raise LLMClientError(LLMError(
|
||||
message=(f'Empty response with finish reason: '
|
||||
|
|
@ -357,6 +511,92 @@ class OpenAIClient(LLMClient):
|
|||
raw_response=data
|
||||
)
|
||||
|
||||
def _parse_responses_response(self, data: dict) -> LLMResponse:
    """Parse the Responses API response into an LLMResponse.

    Args:
        data: Decoded JSON body returned by the /v1/responses
            endpoint.

    Returns:
        LLMResponse with the concatenated output text, parsed tool
        calls, a stop reason derived from the response status, and
        token usage.

    Raises:
        LLMClientError: If the body carries an API error, the
            response was truncated by the token limit, or it was
            blocked by the content filter with nothing to return.
    """
    # Check for API-level errors embedded in the body
    if 'error' in data:
        error_info = data['error']
        raise LLMClientError(LLMError(
            message=error_info.get('message', 'Unknown API error'),
            code=error_info.get('code', 'unknown'),
            provider=self.provider_name,
            retryable=False
        ))

    output = data.get('output', [])
    content = ''
    tool_calls = []

    for item in output:
        item_type = item.get('type', '')

        if item_type == 'message':
            # Text lives in 'output_text' parts of message items.
            for part in item.get('content', []):
                if part.get('type') == 'output_text':
                    content += part.get('text', '')

        elif item_type == 'function_call':
            # Arguments arrive as a JSON string; tolerate malformed
            # JSON rather than failing the whole response.
            try:
                arguments = json.loads(
                    item.get('arguments', '{}')
                )
            except json.JSONDecodeError:
                arguments = {}

            tool_calls.append(ToolCall(
                id=item.get('call_id', str(uuid.uuid4())),
                name=item.get('name', ''),
                arguments=arguments
            ))

    # Determine stop reason from status and incomplete_details
    status = data.get('status', '')
    if tool_calls:
        stop_reason = StopReason.TOOL_USE
    elif status == 'completed':
        stop_reason = StopReason.END_TURN
    elif status == 'incomplete':
        # 'incomplete_details' may be absent or JSON null -- guard
        # both, matching the guard in the streaming parser.
        details = data.get('incomplete_details') or {}
        if details.get('reason', '') == 'content_filter':
            stop_reason = StopReason.STOP_SEQUENCE
        else:
            # 'max_output_tokens' and any unknown reason both map
            # to the token-limit stop reason.
            stop_reason = StopReason.MAX_TOKENS
    else:
        stop_reason = StopReason.UNKNOWN

    # Parse usage information ('usage' may be null on failures)
    usage_data = data.get('usage') or {}
    usage = Usage(
        input_tokens=usage_data.get('input_tokens', 0),
        output_tokens=usage_data.get('output_tokens', 0),
        total_tokens=usage_data.get('total_tokens', 0)
    )

    # An empty response is only acceptable for benign stop reasons;
    # surface truncation and filtering as explicit errors.
    if not content and not tool_calls:
        if stop_reason == StopReason.MAX_TOKENS:
            self._raise_max_tokens_error(usage.input_tokens)
        elif stop_reason == StopReason.STOP_SEQUENCE:
            raise LLMClientError(LLMError(
                message='Response blocked by content filter.',
                code='content_filter',
                provider=self.provider_name,
                retryable=False
            ))

    return LLMResponse(
        content=content,
        tool_calls=tool_calls,
        stop_reason=stop_reason,
        model=data.get('model', self._model),
        usage=usage,
        raw_response=data
    )
|
||||
|
||||
def chat_stream(
|
||||
self,
|
||||
messages: list[Message],
|
||||
|
|
@ -367,35 +607,46 @@ class OpenAIClient(LLMClient):
|
|||
**kwargs
|
||||
) -> Generator[Union[str, LLMResponse], None, None]:
|
||||
"""Stream a chat response from OpenAI."""
|
||||
converted_messages = self._convert_messages(messages)
|
||||
if self._use_responses_api:
|
||||
payload = self._build_responses_payload(
|
||||
messages, tools, system_prompt, max_tokens
|
||||
)
|
||||
payload['stream'] = True
|
||||
try:
|
||||
yield from self._process_stream(payload)
|
||||
except LLMClientError:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise LLMClientError(LLMError(
|
||||
message=f"Streaming request failed: {e!s}",
|
||||
provider=self.provider_name
|
||||
)) from e
|
||||
return
|
||||
|
||||
if system_prompt:
|
||||
converted_messages.insert(0, {
|
||||
'role': 'system',
|
||||
'content': system_prompt
|
||||
})
|
||||
|
||||
payload = {
|
||||
'model': self._model,
|
||||
'messages': converted_messages,
|
||||
'max_completion_tokens': max_tokens,
|
||||
'stream': True,
|
||||
'stream_options': {'include_usage': True}
|
||||
}
|
||||
|
||||
if tools:
|
||||
payload['tools'] = self._convert_tools(tools)
|
||||
payload['tool_choice'] = 'auto'
|
||||
# Try Chat Completions API first
|
||||
payload = self._build_chat_payload(
|
||||
messages, tools, system_prompt, max_tokens
|
||||
)
|
||||
payload['stream'] = True
|
||||
payload['stream_options'] = {'include_usage': True}
|
||||
|
||||
try:
|
||||
yield from self._process_stream(payload)
|
||||
except LLMClientError:
|
||||
raise
|
||||
except LLMClientError as e:
|
||||
if self._should_use_responses_api(e):
|
||||
self._use_responses_api = True
|
||||
payload = self._build_responses_payload(
|
||||
messages, tools, system_prompt, max_tokens
|
||||
)
|
||||
payload['stream'] = True
|
||||
yield from self._process_stream(payload)
|
||||
else:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise LLMClientError(LLMError(
|
||||
message=f"Streaming request failed: {str(e)}",
|
||||
message=f"Streaming request failed: {e!s}",
|
||||
provider=self.provider_name
|
||||
))
|
||||
)) from e
|
||||
|
||||
def _process_stream(
|
||||
self, payload: dict
|
||||
|
|
@ -449,16 +700,19 @@ class OpenAIClient(LLMClient):
|
|||
))
|
||||
|
||||
try:
|
||||
yield from self._read_openai_stream(response)
|
||||
if self._use_responses_api:
|
||||
yield from self._read_responses_stream(response)
|
||||
else:
|
||||
yield from self._read_openai_stream(response)
|
||||
finally:
|
||||
response.close()
|
||||
|
||||
def _read_openai_stream(
|
||||
self, response
|
||||
) -> Generator[Union[str, LLMResponse], None, None]:
|
||||
"""Read and parse an OpenAI-format SSE stream.
|
||||
"""Read and parse an OpenAI Chat Completions SSE stream.
|
||||
|
||||
Uses readline() for incremental reading — it returns as soon
|
||||
Uses readline() for incremental reading -- it returns as soon
|
||||
as a complete line arrives from the server, unlike read()
|
||||
which blocks until a buffer fills up.
|
||||
"""
|
||||
|
|
@ -574,3 +828,126 @@ class OpenAIClient(LLMClient):
|
|||
model=model_name,
|
||||
usage=usage
|
||||
)
|
||||
|
||||
def _read_responses_stream(
    self, response
) -> Generator[Union[str, LLMResponse], None, None]:
    """Read and parse an OpenAI Responses API SSE stream.

    The Responses API uses named events with types like
    response.output_text.delta for text streaming and a terminal
    response.completed / response.incomplete / response.failed
    event carrying the final response object.

    Yields:
        Text deltas (str) as they arrive, then one final
        LLMResponse assembled from the accumulated stream.

    Raises:
        LLMClientError: If the stream ends with neither text
            content nor tool calls.
    """
    content_parts = []
    # tool_calls_data: {call_id: {'name': ..., 'arguments': ...}}
    tool_calls_data = {}
    # Argument-delta events reference items by item_id, not
    # call_id, so remember the mapping from output_item.added.
    item_to_call = {}
    model_name = self._model
    usage = Usage()
    resp_status = ''
    resp_incomplete = {}

    while True:
        line_bytes = response.readline()
        if not line_bytes:
            break

        line = line_bytes.decode('utf-8', errors='replace').strip()

        if not line or line.startswith(':'):
            continue

        # Skip event type lines - we identify events by data type field
        if line.startswith('event: '):
            continue

        if not line.startswith('data: '):
            continue

        try:
            data = json.loads(line[6:])
        except json.JSONDecodeError:
            continue

        event_type = data.get('type', '')

        if event_type == 'response.output_text.delta':
            delta = data.get('delta', '')
            if delta:
                content_parts.append(delta)
                yield delta

        elif event_type == 'response.output_item.added':
            item = data.get('item', {})
            if item.get('type') == 'function_call':
                call_id = item.get('call_id', '')
                if item.get('id'):
                    item_to_call[item['id']] = call_id
                tool_calls_data[call_id] = {
                    'name': item.get('name', ''),
                    'arguments': ''
                }

        elif event_type == 'response.function_call_arguments.delta':
            # These events carry item_id (call_id only on some
            # servers); resolve through the mapping when needed.
            call_id = data.get('call_id', '') or item_to_call.get(
                data.get('item_id', ''), ''
            )
            if call_id not in tool_calls_data:
                tool_calls_data[call_id] = {
                    'name': '', 'arguments': ''
                }
            tool_calls_data[call_id]['arguments'] += data.get(
                'delta', ''
            )

        elif event_type == 'response.output_item.done':
            # Carries the completed item with final arguments --
            # authoritative if any deltas were missed.
            item = data.get('item', {})
            if item.get('type') == 'function_call':
                call_id = item.get('call_id', '')
                entry = tool_calls_data.setdefault(
                    call_id, {'name': '', 'arguments': ''}
                )
                if item.get('name'):
                    entry['name'] = item['name']
                if item.get('arguments'):
                    entry['arguments'] = item['arguments']

        elif event_type in (
            'response.completed',
            'response.incomplete',
            'response.failed',
        ):
            # Terminal event: capture usage, model, and final
            # status (previously only 'completed' was handled, so
            # incomplete streams lost their status entirely).
            resp = data.get('response', {})
            u = resp.get('usage') or {}
            usage = Usage(
                input_tokens=u.get('input_tokens', 0),
                output_tokens=u.get('output_tokens', 0),
                total_tokens=u.get('total_tokens', 0)
            )
            model_name = resp.get('model', model_name)
            resp_status = resp.get('status', '')
            resp_incomplete = resp.get('incomplete_details') or {}

    # Build final response
    content = ''.join(content_parts)
    tool_calls = []
    for call_id, tc in tool_calls_data.items():
        if not tc['name'] and not tc['arguments']:
            # Placeholder left by unresolved delta events.
            continue
        try:
            arguments = json.loads(tc['arguments']) \
                if tc['arguments'] else {}
        except json.JSONDecodeError:
            arguments = {}
        tool_calls.append(ToolCall(
            id=call_id or str(uuid.uuid4()),
            name=tc['name'],
            arguments=arguments
        ))

    # Determine stop reason from final response status
    if tool_calls:
        stop_reason = StopReason.TOOL_USE
    elif resp_status == 'incomplete':
        reason = resp_incomplete.get('reason', '') \
            if resp_incomplete else ''
        if reason == 'content_filter':
            stop_reason = StopReason.STOP_SEQUENCE
        else:
            stop_reason = StopReason.MAX_TOKENS
    elif content:
        stop_reason = StopReason.END_TURN
    else:
        stop_reason = StopReason.UNKNOWN

    if not content and not tool_calls:
        raise LLMClientError(LLMError(
            message='No response content returned from API',
            provider=self.provider_name,
            retryable=False
        ))

    yield LLMResponse(
        content=content,
        tool_calls=tool_calls,
        stop_reason=stop_reason,
        model=model_name,
        usage=usage
    )
|
||||
|
|
|
|||
Loading…
Reference in New Issue