Support /v1/responses for OpenAI models. #9795

* Support /v1/responses for OpenAI models. #9795

* Address CodeRabbit review feedback on OpenAI provider.

- Preserve exception chains with 'raise ... from e' in all
  exception handlers for better debugging tracebacks.
- Use f-string !s conversion instead of str() calls.
- Extract duplicated max_tokens error handling into a shared
  _raise_max_tokens_error() helper method.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Validate api_url and use incomplete_details from Responses API.

- Strip known endpoint suffixes (/chat/completions, /responses) from
  api_url in __init__ to prevent doubled paths if a user provides a
  full endpoint URL instead of a base URL.
- Use incomplete_details.reason from the Responses API to properly
  distinguish between max_output_tokens and content_filter when the
  response status is 'incomplete', in both the non-streaming and
  streaming parsers.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
pull/9767/head^2
Dave Page 2026-03-30 09:46:22 +01:00 committed by GitHub
parent 2228575564
commit 9bb96360dd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 435 additions and 58 deletions

View File

@ -47,7 +47,9 @@ class OpenAIClient(LLMClient):
OpenAI GPT API client.
Implements the LLMClient interface for OpenAI's GPT models
and any OpenAI-compatible API endpoint.
and any OpenAI-compatible API endpoint. Supports both the
Chat Completions API (/v1/chat/completions) and the Responses
API (/v1/responses) for newer models that require it.
"""
def __init__(self, api_key: Optional[str] = None,
@ -67,7 +69,20 @@ class OpenAIClient(LLMClient):
self._api_key = api_key or ''
self._model = model or DEFAULT_MODEL
base_url = (api_url or DEFAULT_API_BASE_URL).rstrip('/')
self._api_url = f'{base_url}/chat/completions'
# Strip known endpoint suffixes in case the user provided a full URL
for suffix in ('/chat/completions', '/responses'):
if base_url.endswith(suffix):
base_url = base_url[:-len(suffix)].rstrip('/')
break
self._base_url = base_url
self._use_responses_api = False
@property
def _api_url(self) -> str:
"""Return the appropriate API endpoint URL."""
if self._use_responses_api:
return f'{self._base_url}/responses'
return f'{self._base_url}/chat/completions'
@property
def provider_name(self) -> str:
@ -81,7 +96,9 @@ class OpenAIClient(LLMClient):
"""Check if the client is properly configured."""
# API key is required for the default OpenAI endpoint, but optional
# for custom endpoints (e.g., local LLM servers).
if self._api_url.startswith(DEFAULT_API_BASE_URL):
if self._base_url.rstrip('/').startswith(
DEFAULT_API_BASE_URL.rstrip('/')
):
return bool(self._api_key)
return True
@ -109,10 +126,72 @@ class OpenAIClient(LLMClient):
Raises:
LLMClientError: If the request fails.
"""
# Build the request payload
if self._use_responses_api:
return self._chat_responses(
messages, tools, system_prompt, max_tokens
)
# Try Chat Completions API first
payload = self._build_chat_payload(
messages, tools, system_prompt, max_tokens
)
try:
response_data = self._make_request(payload)
return self._parse_response(response_data)
except LLMClientError as e:
if self._should_use_responses_api(e):
self._use_responses_api = True
return self._chat_responses(
messages, tools, system_prompt, max_tokens
)
raise
except Exception as e:
raise LLMClientError(LLMError(
message=f"Request failed: {e!s}",
provider=self.provider_name
)) from e
def _chat_responses(
    self,
    messages: list[Message],
    tools: Optional[list[Tool]] = None,
    system_prompt: Optional[str] = None,
    max_tokens: int = 4096,
) -> LLMResponse:
    """Send a chat request via the OpenAI Responses API endpoint.

    Builds a Responses API payload, issues the HTTP request and parses
    the result. LLMClientError instances propagate unchanged; any other
    failure is wrapped in an LLMClientError with the original exception
    chained for debugging.
    """
    request_body = self._build_responses_payload(
        messages, tools, system_prompt, max_tokens
    )
    try:
        raw = self._make_request(request_body)
        return self._parse_responses_response(raw)
    except LLMClientError:
        # Already a well-formed client error -- re-raise as-is.
        raise
    except Exception as e:
        raise LLMClientError(LLMError(
            message=f"Request failed: {e!s}",
            provider=self.provider_name
        )) from e
def _should_use_responses_api(self, error: LLMClientError) -> bool:
    """Check if the error indicates we should use the Responses API.

    Matches (case-insensitively) the error phrasings OpenAI returns
    when a model only works with /v1/responses.
    """
    text = str(error).lower()
    markers = (
        'v1/responses',
        'not supported in the v1/chat/completions',
        'not a chat model',
    )
    return any(marker in text for marker in markers)
def _build_chat_payload(
self,
messages: list[Message],
tools: Optional[list[Tool]],
system_prompt: Optional[str],
max_tokens: int
) -> dict:
"""Build payload for the Chat Completions API."""
converted_messages = self._convert_messages(messages)
# Add system prompt at the beginning if provided
if system_prompt:
converted_messages.insert(0, {
'role': 'system',
@ -129,20 +208,35 @@ class OpenAIClient(LLMClient):
payload['tools'] = self._convert_tools(tools)
payload['tool_choice'] = 'auto'
# Make the API request
try:
response_data = self._make_request(payload)
return self._parse_response(response_data)
except LLMClientError:
raise
except Exception as e:
raise LLMClientError(LLMError(
message=f"Request failed: {str(e)}",
provider=self.provider_name
))
return payload
def _build_responses_payload(
    self,
    messages: list[Message],
    tools: Optional[list[Tool]],
    system_prompt: Optional[str],
    max_tokens: int
) -> dict:
    """Build payload for the Responses API.

    The system prompt maps to 'instructions' and the token cap to
    'max_output_tokens', per the Responses API schema.
    """
    body = {
        'model': self._model,
        'input': self._convert_messages_responses(messages),
        'max_output_tokens': max_tokens,
    }
    if system_prompt:
        body['instructions'] = system_prompt
    if tools:
        body['tools'] = self._convert_tools_responses(tools)
        body['tool_choice'] = 'auto'
    return body
def _convert_messages(self, messages: list[Message]) -> list[dict]:
"""Convert Message objects to OpenAI API format."""
"""Convert Message objects to OpenAI Chat Completions API format."""
result = []
for msg in messages:
@ -191,8 +285,53 @@ class OpenAIClient(LLMClient):
return result
def _convert_messages_responses(
    self, messages: list[Message]
) -> list[dict]:
    """Convert Message objects to OpenAI Responses API input items.

    System messages map to the 'developer' role, assistant tool calls
    become standalone 'function_call' items, and tool results become
    'function_call_output' items, as the Responses API expects.
    """
    items: list[dict] = []
    for msg in messages:
        if msg.role == Role.SYSTEM:
            items.append({'role': 'developer', 'content': msg.content})
        elif msg.role == Role.USER:
            items.append({'role': 'user', 'content': msg.content})
        elif msg.role == Role.ASSISTANT:
            if msg.content:
                items.append(
                    {'role': 'assistant', 'content': msg.content}
                )
            # Each tool call is its own top-level item in the
            # Responses API, separate from the assistant text.
            for tc in (msg.tool_calls or []):
                items.append({
                    'type': 'function_call',
                    'call_id': tc.id,
                    'name': tc.name,
                    'arguments': json.dumps(tc.arguments)
                })
        elif msg.role == Role.TOOL:
            items.extend(
                {
                    'type': 'function_call_output',
                    'call_id': tr.tool_call_id,
                    'output': tr.content
                }
                for tr in msg.tool_results
            )
    return items
def _convert_tools(self, tools: list[Tool]) -> list[dict]:
"""Convert Tool objects to OpenAI API format."""
"""Convert Tool objects to Chat Completions API format."""
return [
{
'type': 'function',
@ -205,6 +344,18 @@ class OpenAIClient(LLMClient):
for tool in tools
]
def _convert_tools_responses(self, tools: list[Tool]) -> list[dict]:
    """Convert Tool objects to Responses API format.

    Unlike the Chat Completions format, the Responses API expects the
    function fields ('name', 'description', 'parameters') at the top
    level of each tool entry rather than nested under 'function'.
    """
    converted = []
    for tool in tools:
        converted.append({
            'type': 'function',
            'name': tool.name,
            'description': tool.description,
            'parameters': tool.parameters
        })
    return converted
def _make_request(self, payload: dict) -> dict:
"""Make an HTTP request to the OpenAI API."""
headers = {
@ -255,8 +406,24 @@ class OpenAIClient(LLMClient):
retryable=True
))
def _raise_max_tokens_error(self, input_tokens: int):
    """Raise an LLMClientError for a token-limit truncated response.

    Args:
        input_tokens: Input token count reported by the API, included
            in the message so the user can gauge the request size.

    Raises:
        LLMClientError: Always; code='max_tokens', not retryable.
    """
    detail = (
        f'Response truncated due to token limit '
        f'(input: {input_tokens} tokens). '
        f'The request is too large for model '
        f'{self._model}. '
        f'Try using a model with a larger context '
        f'window, or analyze a smaller scope (e.g., a '
        f'specific schema instead of the entire '
        f'database).'
    )
    raise LLMClientError(LLMError(
        message=detail,
        code='max_tokens',
        provider=self.provider_name,
        retryable=False
    ))
def _parse_response(self, data: dict) -> LLMResponse:
"""Parse the OpenAI API response into an LLMResponse."""
"""Parse the Chat Completions API response into an LLMResponse."""
# Check for API-level errors in the response
if 'error' in data:
error_info = data['error']
@ -325,20 +492,7 @@ class OpenAIClient(LLMClient):
# Check for problematic responses
if not content and not tool_calls:
if stop_reason == StopReason.MAX_TOKENS:
input_tokens = usage.input_tokens
raise LLMClientError(LLMError(
message=f'Response truncated due to token limit '
f'(input: {input_tokens} tokens). '
f'The request is too large for model '
f'{self._model}. '
f'Try using a model with a larger context '
f'window, or analyze a smaller scope (e.g., a '
f'specific schema instead of the entire '
f'database).',
code='max_tokens',
provider=self.provider_name,
retryable=False
))
self._raise_max_tokens_error(usage.input_tokens)
elif finish_reason and finish_reason not in ('stop', 'tool_calls'):
raise LLMClientError(LLMError(
message=(f'Empty response with finish reason: '
@ -357,6 +511,92 @@ class OpenAIClient(LLMClient):
raw_response=data
)
def _parse_responses_response(self, data: dict) -> LLMResponse:
    """Parse a Responses API payload into an LLMResponse.

    Raises:
        LLMClientError: If the payload carries an API-level error, or
            if the response is empty due to a token-limit truncation
            or a content filter.
    """
    # Surface API-level errors before attempting to parse output.
    if 'error' in data:
        info = data['error']
        raise LLMClientError(LLMError(
            message=info.get('message', 'Unknown API error'),
            code=info.get('code', 'unknown'),
            provider=self.provider_name,
            retryable=False
        ))

    text_parts: list[str] = []
    tool_calls: list[ToolCall] = []
    for item in data.get('output', []):
        kind = item.get('type', '')
        if kind == 'message':
            # Collect only output_text parts of the message content.
            text_parts.extend(
                part.get('text', '')
                for part in item.get('content', [])
                if part.get('type') == 'output_text'
            )
        elif kind == 'function_call':
            try:
                parsed_args = json.loads(item.get('arguments', '{}'))
            except json.JSONDecodeError:
                parsed_args = {}
            tool_calls.append(ToolCall(
                id=item.get('call_id', str(uuid.uuid4())),
                name=item.get('name', ''),
                arguments=parsed_args
            ))
    content = ''.join(text_parts)

    # Map status (plus incomplete_details.reason) onto a StopReason.
    status = data.get('status', '')
    if tool_calls:
        stop_reason = StopReason.TOOL_USE
    elif status == 'completed':
        stop_reason = StopReason.END_TURN
    elif status == 'incomplete':
        reason = data.get('incomplete_details', {}).get('reason', '')
        # Anything other than an explicit content filter is treated
        # as a token-limit truncation.
        if reason == 'content_filter':
            stop_reason = StopReason.STOP_SEQUENCE
        else:
            stop_reason = StopReason.MAX_TOKENS
    else:
        stop_reason = StopReason.UNKNOWN

    usage_data = data.get('usage', {})
    usage = Usage(
        input_tokens=usage_data.get('input_tokens', 0),
        output_tokens=usage_data.get('output_tokens', 0),
        total_tokens=usage_data.get('total_tokens', 0)
    )

    # A response with neither text nor tool calls is an error state.
    if not content and not tool_calls:
        if stop_reason == StopReason.MAX_TOKENS:
            self._raise_max_tokens_error(usage.input_tokens)
        elif stop_reason == StopReason.STOP_SEQUENCE:
            raise LLMClientError(LLMError(
                message='Response blocked by content filter.',
                code='content_filter',
                provider=self.provider_name,
                retryable=False
            ))

    return LLMResponse(
        content=content,
        tool_calls=tool_calls,
        stop_reason=stop_reason,
        model=data.get('model', self._model),
        usage=usage,
        raw_response=data
    )
def chat_stream(
self,
messages: list[Message],
@ -367,35 +607,46 @@ class OpenAIClient(LLMClient):
**kwargs
) -> Generator[Union[str, LLMResponse], None, None]:
"""Stream a chat response from OpenAI."""
converted_messages = self._convert_messages(messages)
if self._use_responses_api:
payload = self._build_responses_payload(
messages, tools, system_prompt, max_tokens
)
payload['stream'] = True
try:
yield from self._process_stream(payload)
except LLMClientError:
raise
except Exception as e:
raise LLMClientError(LLMError(
message=f"Streaming request failed: {e!s}",
provider=self.provider_name
)) from e
return
if system_prompt:
converted_messages.insert(0, {
'role': 'system',
'content': system_prompt
})
payload = {
'model': self._model,
'messages': converted_messages,
'max_completion_tokens': max_tokens,
'stream': True,
'stream_options': {'include_usage': True}
}
if tools:
payload['tools'] = self._convert_tools(tools)
payload['tool_choice'] = 'auto'
# Try Chat Completions API first
payload = self._build_chat_payload(
messages, tools, system_prompt, max_tokens
)
payload['stream'] = True
payload['stream_options'] = {'include_usage': True}
try:
yield from self._process_stream(payload)
except LLMClientError:
raise
except LLMClientError as e:
if self._should_use_responses_api(e):
self._use_responses_api = True
payload = self._build_responses_payload(
messages, tools, system_prompt, max_tokens
)
payload['stream'] = True
yield from self._process_stream(payload)
else:
raise
except Exception as e:
raise LLMClientError(LLMError(
message=f"Streaming request failed: {str(e)}",
message=f"Streaming request failed: {e!s}",
provider=self.provider_name
))
)) from e
def _process_stream(
self, payload: dict
@ -449,16 +700,19 @@ class OpenAIClient(LLMClient):
))
try:
yield from self._read_openai_stream(response)
if self._use_responses_api:
yield from self._read_responses_stream(response)
else:
yield from self._read_openai_stream(response)
finally:
response.close()
def _read_openai_stream(
self, response
) -> Generator[Union[str, LLMResponse], None, None]:
"""Read and parse an OpenAI-format SSE stream.
"""Read and parse an OpenAI Chat Completions SSE stream.
Uses readline() for incremental reading it returns as soon
Uses readline() for incremental reading -- it returns as soon
as a complete line arrives from the server, unlike read()
which blocks until a buffer fills up.
"""
@ -574,3 +828,126 @@ class OpenAIClient(LLMClient):
model=model_name,
usage=usage
)
def _read_responses_stream(
    self, response
) -> Generator[Union[str, LLMResponse], None, None]:
    """Read and parse an OpenAI Responses API SSE stream.

    The Responses API uses named events with types like
    response.output_text.delta for text streaming and
    response.completed for the final response.

    Yields each text delta as a str as it arrives, then a final
    LLMResponse assembled from the accumulated stream state.

    Raises:
        LLMClientError: If the stream ends with no text content and
            no tool calls.
    """
    content_parts = []
    # tool_calls_data: {call_id: {name, arguments}}
    tool_calls_data = {}
    model_name = self._model
    usage = Usage()
    resp_status = ''
    resp_incomplete = {}
    while True:
        # readline() returns as soon as a full line arrives; an
        # empty read signals the server closed the stream.
        line_bytes = response.readline()
        if not line_bytes:
            break
        line = line_bytes.decode('utf-8', errors='replace').strip()
        if not line or line.startswith(':'):
            continue
        # Skip event type lines - we identify events by data type field
        if line.startswith('event: '):
            continue
        if not line.startswith('data: '):
            continue
        try:
            data = json.loads(line[6:])
        except json.JSONDecodeError:
            # Ignore malformed SSE payloads rather than aborting.
            continue
        event_type = data.get('type', '')
        if event_type == 'response.output_text.delta':
            delta = data.get('delta', '')
            if delta:
                content_parts.append(delta)
                yield delta
        elif event_type == 'response.output_item.added':
            # A new output item; only function_call items need
            # tracking so later argument deltas can be appended.
            item = data.get('item', {})
            if item.get('type') == 'function_call':
                call_id = item.get('call_id', '')
                tool_calls_data[call_id] = {
                    'name': item.get('name', ''),
                    'arguments': ''
                }
        elif event_type == 'response.function_call_arguments.delta':
            call_id = data.get('call_id', '')
            # Defensive: create an entry if the 'added' event for
            # this call_id was never seen.
            if call_id not in tool_calls_data:
                tool_calls_data[call_id] = {
                    'name': '', 'arguments': ''
                }
            tool_calls_data[call_id]['arguments'] += data.get(
                'delta', ''
            )
        elif event_type == 'response.completed':
            # Terminal event: carries usage, model name and the
            # completion status used for the stop reason below.
            resp = data.get('response', {})
            u = resp.get('usage', {})
            usage = Usage(
                input_tokens=u.get('input_tokens', 0),
                output_tokens=u.get('output_tokens', 0),
                total_tokens=u.get('total_tokens', 0)
            )
            model_name = resp.get('model', model_name)
            resp_status = resp.get('status', '')
            resp_incomplete = resp.get('incomplete_details', {})
    # Build final response
    content = ''.join(content_parts)
    tool_calls = []
    for call_id, tc in tool_calls_data.items():
        try:
            arguments = json.loads(tc['arguments']) \
                if tc['arguments'] else {}
        except json.JSONDecodeError:
            arguments = {}
        tool_calls.append(ToolCall(
            id=call_id or str(uuid.uuid4()),
            name=tc['name'],
            arguments=arguments
        ))
    # Determine stop reason from final response status
    if tool_calls:
        stop_reason = StopReason.TOOL_USE
    elif resp_status == 'incomplete':
        # incomplete_details.reason distinguishes a content filter
        # from a max_output_tokens truncation.
        reason = resp_incomplete.get('reason', '') \
            if resp_incomplete else ''
        if reason == 'content_filter':
            stop_reason = StopReason.STOP_SEQUENCE
        else:
            stop_reason = StopReason.MAX_TOKENS
    elif content:
        stop_reason = StopReason.END_TURN
    else:
        stop_reason = StopReason.UNKNOWN
    if not content and not tool_calls:
        raise LLMClientError(LLMError(
            message='No response content returned from API',
            provider=self.provider_name,
            retryable=False
        ))
    yield LLMResponse(
        content=content,
        tool_calls=tool_calls,
        stop_reason=stop_reason,
        model=model_name,
        usage=usage
    )