core/homeassistant/components/voip/voip.py

348 lines
12 KiB
Python

"""Voice over IP (VoIP) implementation."""
from __future__ import annotations
import asyncio
from collections import deque
from collections.abc import AsyncIterable, MutableSequence, Sequence
from functools import partial
import logging
from pathlib import Path
import time
from typing import TYPE_CHECKING
import async_timeout
from voip_utils import CallInfo, RtpDatagramProtocol, SdpInfo, VoipDatagramProtocol
from homeassistant.components import stt, tts
from homeassistant.components.assist_pipeline import (
Pipeline,
PipelineEvent,
PipelineEventType,
async_pipeline_from_audio_stream,
select as pipeline_select,
)
from homeassistant.components.assist_pipeline.vad import VoiceCommandSegmenter
from homeassistant.const import __version__
from homeassistant.core import Context, HomeAssistant
from homeassistant.util.ulid import ulid
from .const import DOMAIN
if TYPE_CHECKING:
from .devices import VoIPDevice, VoIPDevices
_BUFFERED_CHUNKS_BEFORE_SPEECH = 100 # ~2 seconds
_TONE_DELAY = 0.2 # seconds before playing tone
_MESSAGE_DELAY = 1.0 # seconds before playing "not configured" message
_LOOP_DELAY = 2.0 # seconds before replaying not-configured message
_RTP_AUDIO_SETTINGS = {"rate": 16000, "width": 2, "channels": 1, "sleep_ratio": 1.01}
_LOGGER = logging.getLogger(__name__)
class HassVoipDatagramProtocol(VoipDatagramProtocol):
"""HA UDP server for Voice over IP (VoIP)."""
def __init__(self, hass: HomeAssistant, devices: VoIPDevices) -> None:
"""Set up VoIP call handler."""
super().__init__(
sdp_info=SdpInfo(
username="homeassistant",
id=time.monotonic_ns(),
session_name="voip_hass",
version=__version__,
),
valid_protocol_factory=lambda call_info: PipelineRtpDatagramProtocol(
hass,
hass.config.language,
devices.async_get_or_create(call_info),
),
invalid_protocol_factory=lambda call_info: NotConfiguredRtpDatagramProtocol(
hass,
),
)
self.hass = hass
self.devices = devices
def is_valid_call(self, call_info: CallInfo) -> bool:
"""Filter calls."""
device = self.devices.async_get_or_create(call_info)
return device.async_allow_call(self.hass)
class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
"""Run a voice assistant pipeline in a loop for a VoIP call."""
def __init__(
self,
hass: HomeAssistant,
language: str,
voip_device: VoIPDevice,
pipeline_timeout: float = 30.0,
audio_timeout: float = 2.0,
listening_tone_enabled: bool = True,
) -> None:
"""Set up pipeline RTP server."""
# STT expects 16Khz mono with 16-bit samples
super().__init__(rate=16000, width=2, channels=1)
self.hass = hass
self.language = language
self.voip_device = voip_device
self.pipeline: Pipeline | None = None
self.pipeline_timeout = pipeline_timeout
self.audio_timeout = audio_timeout
self.listening_tone_enabled = listening_tone_enabled
self._audio_queue: asyncio.Queue[bytes] = asyncio.Queue()
self._context = Context()
self._conversation_id: str | None = None
self._pipeline_task: asyncio.Task | None = None
self._session_id: str | None = None
self._tone_bytes: bytes | None = None
def connection_made(self, transport):
"""Server is ready."""
super().connection_made(transport)
self.voip_device.set_is_active(True)
def connection_lost(self, exc):
"""Handle connection is lost or closed."""
super().connection_lost(exc)
self.voip_device.set_is_active(False)
def on_chunk(self, audio_bytes: bytes) -> None:
"""Handle raw audio chunk."""
if self._pipeline_task is None:
self._clear_audio_queue()
# Run pipeline until voice command finishes, then start over
self._pipeline_task = self.hass.async_create_background_task(
self._run_pipeline(),
"voip_pipeline_run",
)
self._audio_queue.put_nowait(audio_bytes)
async def _run_pipeline(
self,
) -> None:
"""Forward audio to pipeline STT and handle TTS."""
if self._session_id is None:
self._session_id = ulid()
if self.listening_tone_enabled:
await self._play_listening_tone()
try:
# Wait for speech before starting pipeline
segmenter = VoiceCommandSegmenter()
chunk_buffer: deque[bytes] = deque(
maxlen=_BUFFERED_CHUNKS_BEFORE_SPEECH,
)
speech_detected = await self._wait_for_speech(
segmenter,
chunk_buffer,
)
if not speech_detected:
_LOGGER.debug("No speech detected")
return
_LOGGER.debug("Starting pipeline")
async def stt_stream():
try:
async for chunk in self._segment_audio(
segmenter,
chunk_buffer,
):
yield chunk
except asyncio.TimeoutError:
# Expected after caller hangs up
_LOGGER.debug("Audio timeout")
self._session_id = None
self.disconnect()
finally:
self._clear_audio_queue()
# Run pipeline with a timeout
async with async_timeout.timeout(self.pipeline_timeout):
await async_pipeline_from_audio_stream(
self.hass,
context=self._context,
event_callback=self._event_callback,
stt_metadata=stt.SpeechMetadata(
language="", # set in async_pipeline_from_audio_stream
format=stt.AudioFormats.WAV,
codec=stt.AudioCodecs.PCM,
bit_rate=stt.AudioBitRates.BITRATE_16,
sample_rate=stt.AudioSampleRates.SAMPLERATE_16000,
channel=stt.AudioChannels.CHANNEL_MONO,
),
stt_stream=stt_stream(),
pipeline_id=pipeline_select.get_chosen_pipeline(
self.hass, DOMAIN, self.voip_device.voip_id
),
conversation_id=self._conversation_id,
tts_audio_output="raw",
)
except asyncio.TimeoutError:
# Expected after caller hangs up
_LOGGER.debug("Pipeline timeout")
self._session_id = None
self.disconnect()
finally:
# Allow pipeline to run again
self._pipeline_task = None
async def _wait_for_speech(
self,
segmenter: VoiceCommandSegmenter,
chunk_buffer: MutableSequence[bytes],
):
"""Buffer audio chunks until speech is detected.
Returns True if speech was detected, False otherwise.
"""
# Timeout if no audio comes in for a while.
# This means the caller hung up.
async with async_timeout.timeout(self.audio_timeout):
chunk = await self._audio_queue.get()
while chunk:
segmenter.process(chunk)
if segmenter.in_command:
return True
# Buffer until command starts
chunk_buffer.append(chunk)
async with async_timeout.timeout(self.audio_timeout):
chunk = await self._audio_queue.get()
return False
async def _segment_audio(
self,
segmenter: VoiceCommandSegmenter,
chunk_buffer: Sequence[bytes],
) -> AsyncIterable[bytes]:
"""Yield audio chunks until voice command has finished."""
# Buffered chunks first
for buffered_chunk in chunk_buffer:
yield buffered_chunk
# Timeout if no audio comes in for a while.
# This means the caller hung up.
async with async_timeout.timeout(self.audio_timeout):
chunk = await self._audio_queue.get()
while chunk:
if not segmenter.process(chunk):
# Voice command is finished
break
yield chunk
async with async_timeout.timeout(self.audio_timeout):
chunk = await self._audio_queue.get()
def _clear_audio_queue(self) -> None:
while not self._audio_queue.empty():
self._audio_queue.get_nowait()
def _event_callback(self, event: PipelineEvent):
if not event.data:
return
if event.type == PipelineEventType.INTENT_END:
# Capture conversation id
self._conversation_id = event.data["intent_output"]["conversation_id"]
elif event.type == PipelineEventType.TTS_END:
# Send TTS audio to caller over RTP
media_id = event.data["tts_output"]["media_id"]
self.hass.async_create_background_task(
self._send_media(media_id),
"voip_pipeline_tts",
)
async def _send_media(self, media_id: str) -> None:
"""Send TTS audio to caller via RTP."""
if self.transport is None:
return
_extension, audio_bytes = await tts.async_get_media_source_audio(
self.hass,
media_id,
)
_LOGGER.debug("Sending %s byte(s) of audio", len(audio_bytes))
# Assume TTS audio is 16Khz 16-bit mono
await self.hass.async_add_executor_job(
partial(self.send_audio, audio_bytes, **_RTP_AUDIO_SETTINGS)
)
async def _play_listening_tone(self) -> None:
"""Play a tone to indicate that Home Assistant is listening."""
if self._tone_bytes is None:
# Do I/O in executor
self._tone_bytes = await self.hass.async_add_executor_job(
self._load_tone,
)
await self.hass.async_add_executor_job(
partial(
self.send_audio,
self._tone_bytes,
silence_before=_TONE_DELAY,
**_RTP_AUDIO_SETTINGS,
)
)
def _load_tone(self) -> bytes:
"""Load raw tone audio (16Khz, 16-bit mono)."""
return (Path(__file__).parent / "tone.pcm").read_bytes()
class NotConfiguredRtpDatagramProtocol(RtpDatagramProtocol):
"""Plays audio on a loop to inform the user to configure the phone in Home Assistant."""
def __init__(self, hass: HomeAssistant) -> None:
"""Set up RTP server."""
super().__init__(rate=16000, width=2, channels=1)
self.hass = hass
self._audio_task: asyncio.Task | None = None
self._audio_bytes: bytes | None = None
def on_chunk(self, audio_bytes: bytes) -> None:
"""Handle raw audio chunk."""
if self.transport is None:
return
if self._audio_bytes is None:
# 16Khz, 16-bit mono audio message
self._audio_bytes = (
Path(__file__).parent / "not_configured.pcm"
).read_bytes()
if self._audio_task is None:
self._audio_task = self.hass.async_create_background_task(
self._play_message(),
"voip_not_connected",
)
async def _play_message(self) -> None:
await self.hass.async_add_executor_job(
partial(
self.send_audio,
self._audio_bytes,
silence_before=_MESSAGE_DELAY,
**_RTP_AUDIO_SETTINGS,
)
)
await asyncio.sleep(_LOOP_DELAY)
# Allow message to play again
self._audio_task = None