559 lines
19 KiB
Python
559 lines
19 KiB
Python
"""Voice over IP (VoIP) implementation."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from collections import deque
|
|
from collections.abc import AsyncIterable, MutableSequence, Sequence
|
|
from functools import partial
|
|
import io
|
|
import logging
|
|
from pathlib import Path
|
|
import time
|
|
from typing import TYPE_CHECKING
|
|
import wave
|
|
|
|
from voip_utils import (
|
|
CallInfo,
|
|
RtcpState,
|
|
RtpDatagramProtocol,
|
|
SdpInfo,
|
|
VoipDatagramProtocol,
|
|
)
|
|
|
|
from homeassistant.components import stt, tts
|
|
from homeassistant.components.assist_pipeline import (
|
|
Pipeline,
|
|
PipelineEvent,
|
|
PipelineEventType,
|
|
PipelineNotFound,
|
|
async_get_pipeline,
|
|
async_pipeline_from_audio_stream,
|
|
select as pipeline_select,
|
|
)
|
|
from homeassistant.components.assist_pipeline.vad import (
|
|
AudioBuffer,
|
|
VadSensitivity,
|
|
VoiceActivityDetector,
|
|
VoiceCommandSegmenter,
|
|
WebRtcVad,
|
|
)
|
|
from homeassistant.const import __version__
|
|
from homeassistant.core import Context, HomeAssistant
|
|
from homeassistant.util.ulid import ulid_now
|
|
|
|
from .const import CHANNELS, DOMAIN, RATE, RTP_AUDIO_SETTINGS, WIDTH
|
|
|
|
if TYPE_CHECKING:
|
|
from .devices import VoIPDevice, VoIPDevices
|
|
|
|
_LOGGER = logging.getLogger(__name__)
|
|
|
|
|
|
def make_protocol(
    hass: HomeAssistant,
    devices: VoIPDevices,
    call_info: CallInfo,
    rtcp_state: RtcpState | None = None,
) -> VoipDatagramProtocol:
    """Plays a pre-recorded message if pipeline is misconfigured."""
    voip_device = devices.async_get_or_create(call_info)

    # Look up the pipeline the user chose for this VoIP device.
    pipeline_id = pipeline_select.get_chosen_pipeline(
        hass,
        DOMAIN,
        voip_device.voip_id,
    )
    try:
        pipeline: Pipeline | None = async_get_pipeline(hass, pipeline_id)
    except PipelineNotFound:
        pipeline = None

    # A usable pipeline needs both STT and TTS engines; otherwise fall back
    # to a pre-recorded "problem" message instead of failing the call.
    misconfigured = (
        pipeline is None
        or pipeline.stt_engine is None
        or pipeline.tts_engine is None
    )
    if misconfigured:
        return PreRecordMessageProtocol(
            hass,
            "problem.pcm",
            opus_payload_type=call_info.opus_payload_type,
            rtcp_state=rtcp_state,
        )

    vad_sensitivity = pipeline_select.get_vad_sensitivity(
        hass,
        DOMAIN,
        voip_device.voip_id,
    )

    # Pipeline is properly configured: run the full assist pipeline per call.
    return PipelineRtpDatagramProtocol(
        hass,
        hass.config.language,
        voip_device,
        Context(user_id=devices.config_entry.data["user"]),
        opus_payload_type=call_info.opus_payload_type,
        silence_seconds=VadSensitivity.to_seconds(vad_sensitivity),
        rtcp_state=rtcp_state,
    )
|
|
|
|
|
|
class HassVoipDatagramProtocol(VoipDatagramProtocol):
    """HA UDP server for Voice over IP (VoIP)."""

    def __init__(self, hass: HomeAssistant, devices: VoIPDevices) -> None:
        """Set up VoIP call handler."""

        def _valid_factory(
            call_info: CallInfo, rtcp_state: RtcpState | None
        ) -> VoipDatagramProtocol:
            # Calls from allowed devices get the full pipeline protocol.
            return make_protocol(hass, devices, call_info, rtcp_state)

        def _invalid_factory(
            call_info: CallInfo, rtcp_state: RtcpState | None
        ) -> VoipDatagramProtocol:
            # Calls from unknown/blocked devices hear a canned message.
            return PreRecordMessageProtocol(
                hass,
                "not_configured.pcm",
                opus_payload_type=call_info.opus_payload_type,
                rtcp_state=rtcp_state,
            )

        super().__init__(
            sdp_info=SdpInfo(
                username="homeassistant",
                id=time.monotonic_ns(),
                session_name="voip_hass",
                version=__version__,
            ),
            valid_protocol_factory=_valid_factory,
            invalid_protocol_factory=_invalid_factory,
        )
        self.hass = hass
        self.devices = devices
        self._closed_event = asyncio.Event()

    def is_valid_call(self, call_info: CallInfo) -> bool:
        """Filter calls."""
        device = self.devices.async_get_or_create(call_info)
        return device.async_allow_call(self.hass)

    def connection_lost(self, exc):
        """Signal wait_closed when transport is completely closed."""
        # May be called from a non-event-loop thread, hence threadsafe.
        self.hass.loop.call_soon_threadsafe(self._closed_event.set)

    async def wait_closed(self) -> None:
        """Wait for connection_lost to be called."""
        await self._closed_event.wait()
|
|
|
|
|
|
class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
    """Run a voice assistant pipeline in a loop for a VoIP call."""

    def __init__(
        self,
        hass: HomeAssistant,
        language: str,
        voip_device: VoIPDevice,
        context: Context,
        opus_payload_type: int,
        pipeline_timeout: float = 30.0,
        audio_timeout: float = 2.0,
        buffered_chunks_before_speech: int = 100,
        listening_tone_enabled: bool = True,
        processing_tone_enabled: bool = True,
        error_tone_enabled: bool = True,
        tone_delay: float = 0.2,
        tts_extra_timeout: float = 1.0,
        silence_seconds: float = 1.0,
        rtcp_state: RtcpState | None = None,
    ) -> None:
        """Set up pipeline RTP server.

        pipeline_timeout: max seconds for a single pipeline run.
        audio_timeout: max seconds to wait for the next RTP chunk
            (expiry means the caller hung up).
        buffered_chunks_before_speech: chunks kept before speech is
            detected so the start of a command is not lost.
        tone_delay: seconds of silence before the listening tone.
        tts_extra_timeout: grace period past the TTS audio length.
        silence_seconds: trailing silence that ends a voice command.
        """
        super().__init__(
            rate=RATE,
            width=WIDTH,
            channels=CHANNELS,
            opus_payload_type=opus_payload_type,
            rtcp_state=rtcp_state,
        )

        self.hass = hass
        self.language = language
        self.voip_device = voip_device
        self.pipeline: Pipeline | None = None
        self.pipeline_timeout = pipeline_timeout
        self.audio_timeout = audio_timeout
        self.buffered_chunks_before_speech = buffered_chunks_before_speech
        self.listening_tone_enabled = listening_tone_enabled
        self.processing_tone_enabled = processing_tone_enabled
        self.error_tone_enabled = error_tone_enabled
        self.tone_delay = tone_delay
        self.tts_extra_timeout = tts_extra_timeout
        self.silence_seconds = silence_seconds

        # Incoming RTP audio chunks, fed by on_chunk
        self._audio_queue: asyncio.Queue[bytes] = asyncio.Queue()
        self._context = context
        # Carried across pipeline runs so the call stays one conversation
        self._conversation_id: str | None = None
        self._pipeline_task: asyncio.Task | None = None
        # Set when TTS playback has finished (or there is no TTS output)
        self._tts_done = asyncio.Event()
        self._session_id: str | None = None
        # Lazily-loaded PCM tone audio (see _load_pcm)
        self._tone_bytes: bytes | None = None
        self._processing_bytes: bytes | None = None
        self._error_bytes: bytes | None = None
        # Set by _event_callback on a pipeline ERROR event
        self._pipeline_error: bool = False

    def connection_made(self, transport):
        """Server is ready."""
        super().connection_made(transport)
        self.voip_device.set_is_active(True)

    def connection_lost(self, exc):
        """Handle connection is lost or closed."""
        super().connection_lost(exc)
        self.voip_device.set_is_active(False)

    def on_chunk(self, audio_bytes: bytes) -> None:
        """Handle raw audio chunk."""
        if self._pipeline_task is None:
            # Drop any stale audio from the previous run
            self._clear_audio_queue()

            # Run pipeline until voice command finishes, then start over
            self._pipeline_task = self.hass.async_create_background_task(
                self._run_pipeline(),
                "voip_pipeline_run",
            )

        self._audio_queue.put_nowait(audio_bytes)

    async def _run_pipeline(
        self,
    ) -> None:
        """Forward audio to pipeline STT and handle TTS."""
        if self._session_id is None:
            self._session_id = ulid_now()

        # Play listening tone at the start of each cycle
        if self.listening_tone_enabled:
            await self._play_listening_tone()

        try:
            # Wait for speech before starting pipeline
            segmenter = VoiceCommandSegmenter(silence_seconds=self.silence_seconds)
            vad = WebRtcVad()
            chunk_buffer: deque[bytes] = deque(
                maxlen=self.buffered_chunks_before_speech,
            )
            speech_detected = await self._wait_for_speech(
                segmenter,
                vad,
                chunk_buffer,
            )
            if not speech_detected:
                _LOGGER.debug("No speech detected")
                return

            _LOGGER.debug("Starting pipeline")
            self._tts_done.clear()

            async def stt_stream():
                try:
                    async for chunk in self._segment_audio(
                        segmenter,
                        vad,
                        chunk_buffer,
                    ):
                        yield chunk

                    if self.processing_tone_enabled:
                        await self._play_processing_tone()
                except TimeoutError:
                    # Expected after caller hangs up
                    _LOGGER.debug("Audio timeout")
                    self._session_id = None
                    self.disconnect()
                finally:
                    self._clear_audio_queue()

            # Run pipeline with a timeout
            async with asyncio.timeout(self.pipeline_timeout):
                await async_pipeline_from_audio_stream(
                    self.hass,
                    context=self._context,
                    event_callback=self._event_callback,
                    stt_metadata=stt.SpeechMetadata(
                        language="",  # set in async_pipeline_from_audio_stream
                        format=stt.AudioFormats.WAV,
                        codec=stt.AudioCodecs.PCM,
                        bit_rate=stt.AudioBitRates.BITRATE_16,
                        sample_rate=stt.AudioSampleRates.SAMPLERATE_16000,
                        channel=stt.AudioChannels.CHANNEL_MONO,
                    ),
                    stt_stream=stt_stream(),
                    pipeline_id=pipeline_select.get_chosen_pipeline(
                        self.hass, DOMAIN, self.voip_device.voip_id
                    ),
                    conversation_id=self._conversation_id,
                    device_id=self.voip_device.device_id,
                    tts_audio_output="wav",
                )

                if self._pipeline_error:
                    self._pipeline_error = False
                    if self.error_tone_enabled:
                        await self._play_error_tone()
                else:
                    # Block until TTS is done speaking.
                    #
                    # This is set in _send_tts and has a timeout that's based on the
                    # length of the TTS audio.
                    await self._tts_done.wait()

            _LOGGER.debug("Pipeline finished")
        except PipelineNotFound:
            _LOGGER.warning("Pipeline not found")
        except TimeoutError:
            # Expected after caller hangs up
            _LOGGER.debug("Pipeline timeout")
            self._session_id = None
            self.disconnect()
        finally:
            # Allow pipeline to run again
            self._pipeline_task = None

    async def _wait_for_speech(
        self,
        segmenter: VoiceCommandSegmenter,
        vad: VoiceActivityDetector,
        chunk_buffer: MutableSequence[bytes],
    ):
        """Buffer audio chunks until speech is detected.

        Returns True if speech was detected, False otherwise.
        """
        # Timeout if no audio comes in for a while.
        # This means the caller hung up.
        async with asyncio.timeout(self.audio_timeout):
            chunk = await self._audio_queue.get()

        assert vad.samples_per_chunk is not None
        vad_buffer = AudioBuffer(vad.samples_per_chunk * WIDTH)

        while chunk:
            chunk_buffer.append(chunk)

            segmenter.process_with_vad(chunk, vad, vad_buffer)
            if segmenter.in_command:
                # Buffer until command starts
                if len(vad_buffer) > 0:
                    chunk_buffer.append(vad_buffer.bytes())

                return True

            async with asyncio.timeout(self.audio_timeout):
                chunk = await self._audio_queue.get()

        return False

    async def _segment_audio(
        self,
        segmenter: VoiceCommandSegmenter,
        vad: VoiceActivityDetector,
        chunk_buffer: Sequence[bytes],
    ) -> AsyncIterable[bytes]:
        """Yield audio chunks until voice command has finished."""
        # Buffered chunks first
        for buffered_chunk in chunk_buffer:
            yield buffered_chunk

        # Timeout if no audio comes in for a while.
        # This means the caller hung up.
        async with asyncio.timeout(self.audio_timeout):
            chunk = await self._audio_queue.get()

        assert vad.samples_per_chunk is not None
        vad_buffer = AudioBuffer(vad.samples_per_chunk * WIDTH)

        while chunk:
            if not segmenter.process_with_vad(chunk, vad, vad_buffer):
                # Voice command is finished
                break

            yield chunk

            async with asyncio.timeout(self.audio_timeout):
                chunk = await self._audio_queue.get()

    def _clear_audio_queue(self) -> None:
        """Discard all queued audio chunks."""
        while not self._audio_queue.empty():
            self._audio_queue.get_nowait()

    def _event_callback(self, event: PipelineEvent):
        """Handle pipeline events (intent end, TTS end, error)."""
        if not event.data:
            return

        if event.type == PipelineEventType.INTENT_END:
            # Capture conversation id
            self._conversation_id = event.data["intent_output"]["conversation_id"]
        elif event.type == PipelineEventType.TTS_END:
            # Send TTS audio to caller over RTP
            tts_output = event.data["tts_output"]
            if tts_output:
                media_id = tts_output["media_id"]
                self.hass.async_create_background_task(
                    self._send_tts(media_id),
                    "voip_pipeline_tts",
                )
            else:
                # Empty TTS response
                self._tts_done.set()
        elif event.type == PipelineEventType.ERROR:
            # Play error tone instead of wait for TTS
            self._pipeline_error = True

    async def _send_tts(self, media_id: str) -> None:
        """Send TTS audio to caller via RTP.

        Raises ValueError if the TTS media is not 16Khz 16-bit mono WAV.
        """
        try:
            if self.transport is None:
                return

            extension, data = await tts.async_get_media_source_audio(
                self.hass,
                media_id,
            )

            if extension != "wav":
                raise ValueError(f"Only WAV audio can be streamed, got {extension}")

            with io.BytesIO(data) as wav_io:
                with wave.open(wav_io, "rb") as wav_file:
                    sample_rate = wav_file.getframerate()
                    sample_width = wav_file.getsampwidth()
                    sample_channels = wav_file.getnchannels()

                    if (
                        (sample_rate != 16000)
                        or (sample_width != 2)
                        or (sample_channels != 1)
                    ):
                        # BUG FIX: this message was a plain (non-f) string with
                        # a stray doubled brace, so it printed the literal
                        # placeholder text instead of the actual values.
                        raise ValueError(
                            "Expected rate/width/channels as 16000/2/1,"
                            f" got {sample_rate}/{sample_width}/{sample_channels}"
                        )

                    audio_bytes = wav_file.readframes(wav_file.getnframes())

            _LOGGER.debug("Sending %s byte(s) of audio", len(audio_bytes))

            # Time out 1 second after TTS audio should be finished
            tts_samples = len(audio_bytes) / (WIDTH * CHANNELS)
            tts_seconds = tts_samples / RATE

            async with asyncio.timeout(tts_seconds + self.tts_extra_timeout):
                # TTS audio is 16Khz 16-bit mono
                await self._async_send_audio(audio_bytes)
        except TimeoutError:
            _LOGGER.warning("TTS timeout")
            raise
        finally:
            # Signal pipeline to restart
            self._tts_done.set()

    async def _async_send_audio(self, audio_bytes: bytes, **kwargs):
        """Send audio in executor."""
        await self.hass.async_add_executor_job(
            partial(self.send_audio, audio_bytes, **RTP_AUDIO_SETTINGS, **kwargs)
        )

    async def _play_listening_tone(self) -> None:
        """Play a tone to indicate that Home Assistant is listening."""
        if self._tone_bytes is None:
            # Do I/O in executor
            self._tone_bytes = await self.hass.async_add_executor_job(
                self._load_pcm,
                "tone.pcm",
            )

        await self._async_send_audio(
            self._tone_bytes,
            silence_before=self.tone_delay,
        )

    async def _play_processing_tone(self) -> None:
        """Play a tone to indicate that Home Assistant is processing the voice command."""
        if self._processing_bytes is None:
            # Do I/O in executor
            self._processing_bytes = await self.hass.async_add_executor_job(
                self._load_pcm,
                "processing.pcm",
            )

        await self._async_send_audio(self._processing_bytes)

    async def _play_error_tone(self) -> None:
        """Play a tone to indicate a pipeline error occurred."""
        if self._error_bytes is None:
            # Do I/O in executor
            self._error_bytes = await self.hass.async_add_executor_job(
                self._load_pcm,
                "error.pcm",
            )

        await self._async_send_audio(self._error_bytes)

    def _load_pcm(self, file_name: str) -> bytes:
        """Load raw audio (16Khz, 16-bit mono)."""
        return (Path(__file__).parent / file_name).read_bytes()
|
|
|
|
|
|
class PreRecordMessageProtocol(RtpDatagramProtocol):
    """Plays a pre-recorded message on a loop."""

    def __init__(
        self,
        hass: HomeAssistant,
        file_name: str,
        opus_payload_type: int,
        message_delay: float = 1.0,
        loop_delay: float = 2.0,
        rtcp_state: RtcpState | None = None,
    ) -> None:
        """Set up RTP server."""
        super().__init__(
            rate=RATE,
            width=WIDTH,
            channels=CHANNELS,
            opus_payload_type=opus_payload_type,
            rtcp_state=rtcp_state,
        )
        self.hass = hass
        self.file_name = file_name
        self.message_delay = message_delay
        self.loop_delay = loop_delay
        # Non-None while the message is being played
        self._audio_task: asyncio.Task | None = None
        # Raw PCM loaded lazily on the first incoming chunk
        self._audio_bytes: bytes | None = None

    def on_chunk(self, audio_bytes: bytes) -> None:
        """Handle raw audio chunk."""
        if self.transport is None:
            return

        if self._audio_bytes is None:
            # 16Khz, 16-bit mono audio message
            message_path = Path(__file__).parent / self.file_name
            self._audio_bytes = message_path.read_bytes()

        if self._audio_task is not None:
            # Message already playing
            return

        self._audio_task = self.hass.async_create_background_task(
            self._play_message(),
            "voip_not_connected",
        )

    async def _play_message(self) -> None:
        """Send the message audio, pause, then allow a replay."""
        send = partial(
            self.send_audio,
            self._audio_bytes,
            silence_before=self.message_delay,
            **RTP_AUDIO_SETTINGS,
        )
        await self.hass.async_add_executor_job(send)

        await asyncio.sleep(self.loop_delay)

        # Allow message to play again
        self._audio_task = None
|