"""Voice over IP (VoIP) implementation."""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from collections import deque
|
|
from collections.abc import AsyncIterable, MutableSequence, Sequence
|
|
from functools import partial
|
|
import logging
|
|
from pathlib import Path
|
|
import time
|
|
from typing import TYPE_CHECKING
|
|
|
|
import async_timeout
|
|
from voip_utils import CallInfo, RtpDatagramProtocol, SdpInfo, VoipDatagramProtocol
|
|
|
|
from homeassistant.components import stt, tts
|
|
from homeassistant.components.assist_pipeline import (
|
|
Pipeline,
|
|
PipelineEvent,
|
|
PipelineEventType,
|
|
async_pipeline_from_audio_stream,
|
|
select as pipeline_select,
|
|
)
|
|
from homeassistant.components.assist_pipeline.vad import VoiceCommandSegmenter
|
|
from homeassistant.const import __version__
|
|
from homeassistant.core import Context, HomeAssistant
|
|
from homeassistant.util.ulid import ulid
|
|
|
|
from .const import DOMAIN
|
|
|
|
if TYPE_CHECKING:
|
|
from .devices import VoIPDevice, VoIPDevices
|
|
|
|
_BUFFERED_CHUNKS_BEFORE_SPEECH = 100 # ~2 seconds
|
|
_TONE_DELAY = 0.2 # seconds before playing tone
|
|
_MESSAGE_DELAY = 1.0 # seconds before playing "not configured" message
|
|
_LOOP_DELAY = 2.0 # seconds before replaying not-configured message
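# Keyword arguments below are forwarded to send_audio() when tones and TTS audio
# are streamed back to the caller. The exact effect of sleep_ratio is an assumption
# here: it appears to scale the delay voip_utils sleeps between outgoing RTP packets.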
_RTP_AUDIO_SETTINGS = {"rate": 16000, "width": 2, "channels": 1, "sleep_ratio": 1.01}

_LOGGER = logging.getLogger(__name__)

class HassVoipDatagramProtocol(VoipDatagramProtocol):
    """HA UDP server for Voice over IP (VoIP)."""

    def __init__(self, hass: HomeAssistant, devices: VoIPDevices) -> None:
        """Set up VoIP call handler."""
        super().__init__(
            sdp_info=SdpInfo(
                username="homeassistant",
                id=time.monotonic_ns(),
                session_name="voip_hass",
                version=__version__,
            ),
            valid_protocol_factory=lambda call_info: PipelineRtpDatagramProtocol(
                hass,
                hass.config.language,
                devices.async_get_or_create(call_info),
            ),
            invalid_protocol_factory=lambda call_info: NotConfiguredRtpDatagramProtocol(
                hass,
            ),
        )
        self.hass = hass
        self.devices = devices

    def is_valid_call(self, call_info: CallInfo) -> bool:
        """Filter calls."""
        device = self.devices.async_get_or_create(call_info)
        return device.async_allow_call(self.hass)

class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
    """Run a voice assistant pipeline in a loop for a VoIP call."""

    def __init__(
        self,
        hass: HomeAssistant,
        language: str,
        voip_device: VoIPDevice,
        pipeline_timeout: float = 30.0,
        audio_timeout: float = 2.0,
        listening_tone_enabled: bool = True,
    ) -> None:
        """Set up pipeline RTP server."""
        # STT expects 16 kHz mono audio with 16-bit samples
        super().__init__(rate=16000, width=2, channels=1)

        self.hass = hass
        self.language = language
        self.voip_device = voip_device
        self.pipeline: Pipeline | None = None
        self.pipeline_timeout = pipeline_timeout
        self.audio_timeout = audio_timeout
        self.listening_tone_enabled = listening_tone_enabled

        self._audio_queue: asyncio.Queue[bytes] = asyncio.Queue()
        self._context = Context()
        self._conversation_id: str | None = None
        self._pipeline_task: asyncio.Task | None = None
        self._session_id: str | None = None
        self._tone_bytes: bytes | None = None

    def connection_made(self, transport):
        """Server is ready."""
        super().connection_made(transport)
        self.voip_device.set_is_active(True)

    def connection_lost(self, exc):
        """Handle a lost or closed connection."""
        super().connection_lost(exc)
        self.voip_device.set_is_active(False)
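    # Incoming RTP audio is pushed onto _audio_queue and consumed by a single
    # background pipeline task; when that task completes (one voice command),
    # the next chunk to arrive starts a new one.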
    def on_chunk(self, audio_bytes: bytes) -> None:
        """Handle raw audio chunk."""
        if self._pipeline_task is None:
            self._clear_audio_queue()

            # Run pipeline until voice command finishes, then start over
            self._pipeline_task = self.hass.async_create_background_task(
                self._run_pipeline(),
                "voip_pipeline_run",
            )

        self._audio_queue.put_nowait(audio_bytes)

    async def _run_pipeline(
        self,
    ) -> None:
        """Forward audio to pipeline STT and handle TTS."""
        if self._session_id is None:
            self._session_id = ulid()
            if self.listening_tone_enabled:
                await self._play_listening_tone()

        try:
            # Wait for speech before starting pipeline
            segmenter = VoiceCommandSegmenter()
            chunk_buffer: deque[bytes] = deque(
                maxlen=_BUFFERED_CHUNKS_BEFORE_SPEECH,
            )
            speech_detected = await self._wait_for_speech(
                segmenter,
                chunk_buffer,
            )
            if not speech_detected:
                _LOGGER.debug("No speech detected")
                return

            _LOGGER.debug("Starting pipeline")

            async def stt_stream():
                try:
                    async for chunk in self._segment_audio(
                        segmenter,
                        chunk_buffer,
                    ):
                        yield chunk
                except asyncio.TimeoutError:
                    # Expected after caller hangs up
                    _LOGGER.debug("Audio timeout")
                    self._session_id = None
                    self.disconnect()
                finally:
                    self._clear_audio_queue()

            # Run pipeline with a timeout
            async with async_timeout.timeout(self.pipeline_timeout):
                await async_pipeline_from_audio_stream(
                    self.hass,
                    context=self._context,
                    event_callback=self._event_callback,
                    stt_metadata=stt.SpeechMetadata(
                        language="",  # set in async_pipeline_from_audio_stream
                        format=stt.AudioFormats.WAV,
                        codec=stt.AudioCodecs.PCM,
                        bit_rate=stt.AudioBitRates.BITRATE_16,
                        sample_rate=stt.AudioSampleRates.SAMPLERATE_16000,
                        channel=stt.AudioChannels.CHANNEL_MONO,
                    ),
                    stt_stream=stt_stream(),
                    pipeline_id=pipeline_select.get_chosen_pipeline(
                        self.hass, DOMAIN, self.voip_device.voip_id
                    ),
                    conversation_id=self._conversation_id,
                    tts_audio_output="raw",
                )

        except asyncio.TimeoutError:
            # Expected after caller hangs up
            _LOGGER.debug("Pipeline timeout")
            self._session_id = None
            self.disconnect()
        finally:
            # Allow pipeline to run again
            self._pipeline_task = None

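    # Audio received before the command starts is kept in chunk_buffer so that
    # _segment_audio() can replay it to STT; otherwise the start of the command
    # could be lost while the segmenter is still detecting speech.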
    async def _wait_for_speech(
        self,
        segmenter: VoiceCommandSegmenter,
        chunk_buffer: MutableSequence[bytes],
    ) -> bool:
        """Buffer audio chunks until speech is detected.

        Returns True if speech was detected, False otherwise.
        """
        # Timeout if no audio comes in for a while.
        # This means the caller hung up.
        async with async_timeout.timeout(self.audio_timeout):
            chunk = await self._audio_queue.get()

        while chunk:
            segmenter.process(chunk)
            if segmenter.in_command:
                return True

            # Buffer until command starts
            chunk_buffer.append(chunk)

            async with async_timeout.timeout(self.audio_timeout):
                chunk = await self._audio_queue.get()

        return False

    async def _segment_audio(
        self,
        segmenter: VoiceCommandSegmenter,
        chunk_buffer: Sequence[bytes],
    ) -> AsyncIterable[bytes]:
        """Yield audio chunks until voice command has finished."""
        # Buffered chunks first
        for buffered_chunk in chunk_buffer:
            yield buffered_chunk

        # Timeout if no audio comes in for a while.
        # This means the caller hung up.
        async with async_timeout.timeout(self.audio_timeout):
            chunk = await self._audio_queue.get()

        while chunk:
            if not segmenter.process(chunk):
                # Voice command is finished
                break

            yield chunk

            async with async_timeout.timeout(self.audio_timeout):
                chunk = await self._audio_queue.get()

    def _clear_audio_queue(self) -> None:
        """Drop any audio chunks still waiting in the queue."""
        while not self._audio_queue.empty():
            self._audio_queue.get_nowait()

    def _event_callback(self, event: PipelineEvent):
        """Handle pipeline events."""
        if not event.data:
            return

        if event.type == PipelineEventType.INTENT_END:
            # Capture conversation id
            self._conversation_id = event.data["intent_output"]["conversation_id"]
        elif event.type == PipelineEventType.TTS_END:
            # Send TTS audio to caller over RTP
            media_id = event.data["tts_output"]["media_id"]
            self.hass.async_create_background_task(
                self._send_media(media_id),
                "voip_pipeline_tts",
            )

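    # The pipeline was asked for raw TTS output (tts_audio_output="raw"), which is
    # streamed back to the caller over RTP below. send_audio() runs in an executor
    # because it is a blocking call.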
    async def _send_media(self, media_id: str) -> None:
        """Send TTS audio to caller via RTP."""
        if self.transport is None:
            return

        _extension, audio_bytes = await tts.async_get_media_source_audio(
            self.hass,
            media_id,
        )

        _LOGGER.debug("Sending %s byte(s) of audio", len(audio_bytes))

        # Assume TTS audio is 16 kHz, 16-bit mono
        await self.hass.async_add_executor_job(
            partial(self.send_audio, audio_bytes, **_RTP_AUDIO_SETTINGS)
        )

    async def _play_listening_tone(self) -> None:
        """Play a tone to indicate that Home Assistant is listening."""
        if self._tone_bytes is None:
            # Do I/O in executor
            self._tone_bytes = await self.hass.async_add_executor_job(
                self._load_tone,
            )

        await self.hass.async_add_executor_job(
            partial(
                self.send_audio,
                self._tone_bytes,
                silence_before=_TONE_DELAY,
                **_RTP_AUDIO_SETTINGS,
            )
        )

    def _load_tone(self) -> bytes:
        """Load raw tone audio (16 kHz, 16-bit mono)."""
        return (Path(__file__).parent / "tone.pcm").read_bytes()

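# Wired up as the invalid_protocol_factory in HassVoipDatagramProtocol above;
# presumably selected by voip_utils when is_valid_call() rejects the caller, so
# unconfigured phones hear a looping prompt instead of the assist pipeline.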
class NotConfiguredRtpDatagramProtocol(RtpDatagramProtocol):
    """Play audio on a loop to inform the user to configure the phone in Home Assistant."""

    def __init__(self, hass: HomeAssistant) -> None:
        """Set up RTP server."""
        super().__init__(rate=16000, width=2, channels=1)
        self.hass = hass
        self._audio_task: asyncio.Task | None = None
        self._audio_bytes: bytes | None = None

    def on_chunk(self, audio_bytes: bytes) -> None:
        """Handle raw audio chunk."""
        if self.transport is None:
            return

        if self._audio_bytes is None:
            # 16 kHz, 16-bit mono audio message
            self._audio_bytes = (
                Path(__file__).parent / "not_configured.pcm"
            ).read_bytes()

        if self._audio_task is None:
            self._audio_task = self.hass.async_create_background_task(
                self._play_message(),
                "voip_not_connected",
            )

    async def _play_message(self) -> None:
        """Play the not-configured message, then wait before allowing a replay."""
        await self.hass.async_add_executor_job(
            partial(
                self.send_audio,
                self._audio_bytes,
                silence_before=_MESSAGE_DELAY,
                **_RTP_AUDIO_SETTINGS,
            )
        )

        await asyncio.sleep(_LOOP_DELAY)

        # Allow message to play again
        self._audio_task = None