VoIP listening tone and "not configured" message (#91762)

* Play tone when starting a VoIP call

* Play audio message when call is rejected

* Add option to disable tone for tests

* Send RTP audio in executor to reduce jitter

* Don't start pipeline until speech

* Bump voip-utils to 0.0.5
Branch: pull/91783/head
Michael Hansen, 2023-04-20 19:30:51 -05:00 (committed by GitHub)
Parent: f4f3962ee9
Commit: 5080654776
7 changed files with 180 additions and 45 deletions
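Before the diffs, one bullet above is worth unpacking: "Send RTP audio in executor to reduce jitter" suggests that send_audio() blocks while pacing RTP packets out, so awaiting it directly on the event loop would delay incoming-audio handling. The voip.py diff below wraps it in hass.async_add_executor_job(partial(...)). A minimal, self-contained sketch of the same pattern in plain asyncio, with a made-up blocking sender standing in for voip-utils' send_audio():

# Sketch only: a hypothetical blocking sender (not the voip-utils API) moved
# off the event loop the same way the diff below moves send_audio().
import asyncio
import time
from functools import partial


def send_audio_blocking(audio: bytes, chunk_size: int = 640) -> None:
    """Hypothetical paced sender: 20 ms of 16 kHz, 16-bit mono audio per chunk."""
    for i in range(0, len(audio), chunk_size):
        # ... write audio[i : i + chunk_size] to the RTP transport here ...
        time.sleep(0.02)  # real-time pacing blocks the calling thread


async def main() -> None:
    audio = bytes(640 * 50)  # ~1 second of silence
    loop = asyncio.get_running_loop()
    # Equivalent of hass.async_add_executor_job(partial(self.send_audio, ...)):
    await loop.run_in_executor(None, partial(send_audio_blocking, audio))


asyncio.run(main())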

homeassistant/components/voip/manifest.json

@@ -7,5 +7,5 @@
   "documentation": "https://www.home-assistant.io/integrations/voip",
   "iot_class": "local_push",
   "quality_scale": "internal",
-  "requirements": ["voip-utils==0.0.2"]
+  "requirements": ["voip-utils==0.0.5"]
 }

homeassistant/components/voip/not_configured.raw (binary file, not shown)

homeassistant/components/voip/tone.raw (binary file, not shown)
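Both binary assets are headerless raw PCM: _load_tone() and NotConfiguredRtpDatagramProtocol in the voip.py diff below read them with read_bytes() and hand them to send_audio() as 16 kHz, 16-bit mono. A hypothetical sketch of producing a file in that format (frequency, duration, and amplitude are illustrative, not the values behind the shipped tone.raw):

# Sketch: write headerless 16 kHz, 16-bit little-endian mono PCM, the format
# the integration expects for tone.raw / not_configured.raw. All constants
# below are assumptions for illustration.
import math
import struct

RATE = 16000      # samples per second
FREQ = 440.0      # tone frequency in Hz (arbitrary)
DURATION = 0.5    # seconds
AMPLITUDE = 0.3   # fraction of full scale

with open("tone.raw", "wb") as tone_file:
    for n in range(int(RATE * DURATION)):
        sample = int(AMPLITUDE * 32767 * math.sin(2 * math.pi * FREQ * n / RATE))
        tone_file.write(struct.pack("<h", sample))  # signed 16-bit, little-endian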

homeassistant/components/voip/voip.py

@@ -3,8 +3,10 @@ from __future__ import annotations

 import asyncio
 from collections import deque
-from collections.abc import AsyncIterable
+from collections.abc import AsyncIterable, MutableSequence, Sequence
+from functools import partial
 import logging
+from pathlib import Path
 import time
 from typing import TYPE_CHECKING
@@ -22,6 +24,7 @@ from homeassistant.components.assist_pipeline import (
 from homeassistant.components.assist_pipeline.vad import VoiceCommandSegmenter
 from homeassistant.const import __version__
 from homeassistant.core import Context, HomeAssistant
+from homeassistant.util.ulid import ulid

 from .const import DOMAIN
@@ -29,6 +32,9 @@ if TYPE_CHECKING:
     from .devices import VoIPDevice, VoIPDevices

 _BUFFERED_CHUNKS_BEFORE_SPEECH = 100  # ~2 seconds
+_TONE_DELAY = 0.2  # seconds before playing tone
+_MESSAGE_DELAY = 1.0  # seconds before playing "not configured" message
+_LOOP_DELAY = 2.0  # seconds before replaying not-configured message

 _LOGGER = logging.getLogger(__name__)
@@ -44,11 +50,14 @@ class HassVoipDatagramProtocol(VoipDatagramProtocol):
                 session_name="voip_hass",
                 version=__version__,
             ),
-            protocol_factory=lambda call_info: PipelineRtpDatagramProtocol(
+            valid_protocol_factory=lambda call_info: PipelineRtpDatagramProtocol(
                 hass,
                 hass.config.language,
                 devices.async_get_or_create(call_info),
             ),
+            invalid_protocol_factory=lambda call_info: NotConfiguredRtpDatagramProtocol(
+                hass,
+            ),
         )
         self.hass = hass
         self.devices = devices
@@ -69,6 +78,7 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
         voip_device: VoIPDevice,
         pipeline_timeout: float = 30.0,
         audio_timeout: float = 2.0,
+        listening_tone_enabled: bool = True,
     ) -> None:
         """Set up pipeline RTP server."""
         # STT expects 16Khz mono with 16-bit samples
@@ -80,11 +90,14 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
         self.pipeline: Pipeline | None = None
         self.pipeline_timeout = pipeline_timeout
         self.audio_timeout = audio_timeout
+        self.listening_tone_enabled = listening_tone_enabled

         self._audio_queue: asyncio.Queue[bytes] = asyncio.Queue()
         self._context = Context()
         self._conversation_id: str | None = None
         self._pipeline_task: asyncio.Task | None = None
+        self._session_id: str | None = None
+        self._tone_bytes: bytes | None = None

     def connection_made(self, transport):
         """Server is ready."""
@@ -113,23 +126,42 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
         self,
     ) -> None:
         """Forward audio to pipeline STT and handle TTS."""
-        _LOGGER.debug("Starting pipeline")
-
-        async def stt_stream():
-            try:
-                async for chunk in self._segment_audio():
-                    yield chunk
-            except asyncio.TimeoutError:
-                # Expected after caller hangs up
-                _LOGGER.debug("Audio timeout")
-
-                if self.transport is not None:
-                    self.transport.close()
-                    self.transport = None
-            finally:
-                self._clear_audio_queue()
+        if self._session_id is None:
+            self._session_id = ulid()
+            if self.listening_tone_enabled:
+                await self._play_listening_tone()

         try:
+            # Wait for speech before starting pipeline
+            segmenter = VoiceCommandSegmenter()
+            chunk_buffer: deque[bytes] = deque(
+                maxlen=_BUFFERED_CHUNKS_BEFORE_SPEECH,
+            )
+            speech_detected = await self._wait_for_speech(
+                segmenter,
+                chunk_buffer,
+            )
+            if not speech_detected:
+                _LOGGER.debug("No speech detected")
+                return
+
+            _LOGGER.debug("Starting pipeline")
+
+            async def stt_stream():
+                try:
+                    async for chunk in self._segment_audio(
+                        segmenter,
+                        chunk_buffer,
+                    ):
+                        yield chunk
+                except asyncio.TimeoutError:
+                    # Expected after caller hangs up
+                    _LOGGER.debug("Audio timeout")
+                    self._session_id = None
+                    self.disconnect()
+                finally:
+                    self._clear_audio_queue()
+
             # Run pipeline with a timeout
             async with async_timeout.timeout(self.pipeline_timeout):
                 await async_pipeline_from_audio_stream(
@@ -155,17 +187,48 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
         except asyncio.TimeoutError:
             # Expected after caller hangs up
             _LOGGER.debug("Pipeline timeout")
-
-            if self.transport is not None:
-                self.transport.close()
-                self.transport = None
+            self._session_id = None
+            self.disconnect()
         finally:
             # Allow pipeline to run again
             self._pipeline_task = None

-    async def _segment_audio(self) -> AsyncIterable[bytes]:
-        segmenter = VoiceCommandSegmenter()
-        chunk_buffer: deque[bytes] = deque(maxlen=_BUFFERED_CHUNKS_BEFORE_SPEECH)
+    async def _wait_for_speech(
+        self,
+        segmenter: VoiceCommandSegmenter,
+        chunk_buffer: MutableSequence[bytes],
+    ):
+        """Buffer audio chunks until speech is detected.
+
+        Returns True if speech was detected, False otherwise.
+        """
+        # Timeout if no audio comes in for a while.
+        # This means the caller hung up.
+        async with async_timeout.timeout(self.audio_timeout):
+            chunk = await self._audio_queue.get()
+
+        while chunk:
+            segmenter.process(chunk)
+            if segmenter.in_command:
+                return True
+
+            # Buffer until command starts
+            chunk_buffer.append(chunk)
+
+            async with async_timeout.timeout(self.audio_timeout):
+                chunk = await self._audio_queue.get()
+
+        return False
+
+    async def _segment_audio(
+        self,
+        segmenter: VoiceCommandSegmenter,
+        chunk_buffer: Sequence[bytes],
+    ) -> AsyncIterable[bytes]:
+        """Yield audio chunks until voice command has finished."""
+        # Buffered chunks first
+        for buffered_chunk in chunk_buffer:
+            yield buffered_chunk

         # Timeout if no audio comes in for a while.
         # This means the caller hung up.
@@ -177,18 +240,7 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
                 # Voice command is finished
                 break

-            if segmenter.in_command:
-                if chunk_buffer:
-                    # Release audio in buffer first
-                    for buffered_chunk in chunk_buffer:
-                        yield buffered_chunk
-
-                    chunk_buffer.clear()
-
-                yield chunk
-            else:
-                # Buffer until command starts
-                chunk_buffer.append(chunk)
+            yield chunk

             async with async_timeout.timeout(self.audio_timeout):
                 chunk = await self._audio_queue.get()
@@ -225,4 +277,74 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
         _LOGGER.debug("Sending %s byte(s) of audio", len(audio_bytes))

         # Assume TTS audio is 16Khz 16-bit mono
-        await self.send_audio(audio_bytes, rate=16000, width=2, channels=1)
+        await self.hass.async_add_executor_job(
+            partial(self.send_audio, audio_bytes, rate=16000, width=2, channels=1)
+        )
+
+    async def _play_listening_tone(self) -> None:
+        """Play a tone to indicate that Home Assistant is listening."""
+        if self._tone_bytes is None:
+            # Do I/O in executor
+            self._tone_bytes = await self.hass.async_add_executor_job(
+                self._load_tone,
+            )
+
+        await self.hass.async_add_executor_job(
+            partial(
+                self.send_audio,
+                self._tone_bytes,
+                rate=16000,
+                width=2,
+                channels=1,
+                silence_before=_TONE_DELAY,
+            )
+        )
+
+    def _load_tone(self) -> bytes:
+        """Load raw tone audio (16Khz, 16-bit mono)."""
+        return (Path(__file__).parent / "tone.raw").read_bytes()
+
+
+class NotConfiguredRtpDatagramProtocol(RtpDatagramProtocol):
+    """Plays audio on a loop to inform the user to configure the phone in Home Assistant."""
+
+    def __init__(self, hass: HomeAssistant) -> None:
+        """Set up RTP server."""
+        super().__init__(rate=16000, width=2, channels=1)
+        self.hass = hass
+        self._audio_task: asyncio.Task | None = None
+        self._audio_bytes: bytes | None = None
+
+    def on_chunk(self, audio_bytes: bytes) -> None:
+        """Handle raw audio chunk."""
+        if self.transport is None:
+            return
+
+        if self._audio_bytes is None:
+            # 16Khz, 16-bit mono audio message
+            self._audio_bytes = (
+                Path(__file__).parent / "not_configured.raw"
+            ).read_bytes()
+
+        if self._audio_task is None:
+            self._audio_task = self.hass.async_create_background_task(
+                self._play_message(),
+                "voip_not_connected",
+            )
+
+    async def _play_message(self) -> None:
+        await self.hass.async_add_executor_job(
+            partial(
+                self.send_audio,
+                self._audio_bytes,
+                16000,
+                2,
+                1,
+                silence_before=_MESSAGE_DELAY,
+            )
+        )
+
+        await asyncio.sleep(_LOOP_DELAY)
+
+        # Allow message to play again
+        self._audio_task = None
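To recap the gating logic added above: _wait_for_speech() buffers up to _BUFFERED_CHUNKS_BEFORE_SPEECH chunks until the segmenter reports a command, and _segment_audio() then replays that buffer into the STT stream ahead of the live audio, so the start of the utterance is not lost even though the pipeline starts late. A standalone sketch of the same pattern, with a toy EnergySegmenter standing in for Home Assistant's VAD-based VoiceCommandSegmenter and a simplified end-of-stream condition (an empty chunk) instead of the real command-end detection and audio_timeout handling:

# Standalone sketch of the buffer-until-speech pattern; EnergySegmenter is a
# hypothetical stand-in for VoiceCommandSegmenter, not the real class.
import array
import asyncio
from collections import deque
from collections.abc import AsyncIterable


class EnergySegmenter:
    """Toy segmenter: report "in command" once peak amplitude crosses a threshold."""

    def __init__(self, threshold: int = 1000) -> None:
        self.threshold = threshold
        self.in_command = False

    def process(self, chunk: bytes) -> None:
        samples = array.array("h")  # signed 16-bit samples
        samples.frombytes(chunk)
        if samples and max(abs(s) for s in samples) >= self.threshold:
            self.in_command = True


async def gated_stream(
    audio_queue: asyncio.Queue[bytes],
    max_buffered: int = 100,
) -> AsyncIterable[bytes]:
    """Yield nothing until speech is detected, then buffered audio, then live audio."""
    segmenter = EnergySegmenter()
    buffer: deque[bytes] = deque(maxlen=max_buffered)

    # Phase 1 (like _wait_for_speech): keep a bounded buffer of pre-speech audio
    while True:
        chunk = await audio_queue.get()
        segmenter.process(chunk)
        if segmenter.in_command:
            break
        buffer.append(chunk)

    # Phase 2 (like _segment_audio): replay the buffer, then stream live chunks
    for buffered in buffer:
        yield buffered
    yield chunk
    while chunk := await audio_queue.get():  # simplified: empty chunk ends the stream
        yield chunk


async def _demo() -> None:
    queue: asyncio.Queue[bytes] = asyncio.Queue()
    silence = (0).to_bytes(2, "little", signed=True) * 320     # 20 ms of silence
    speech = (8000).to_bytes(2, "little", signed=True) * 320   # 20 ms of loud "speech"
    for chunk in (silence, silence, speech, speech, b""):
        queue.put_nowait(chunk)
    async for out in gated_stream(queue):
        print(f"yielded {len(out)} byte(s)")


asyncio.run(_demo())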

requirements_all.txt

@@ -2591,7 +2591,7 @@ venstarcolortouch==0.19
 vilfo-api-client==0.3.2

 # homeassistant.components.voip
-voip-utils==0.0.2
+voip-utils==0.0.5

 # homeassistant.components.volkszaehler
 volkszaehler==0.4.0

requirements_test_all.txt

@@ -1867,7 +1867,7 @@ venstarcolortouch==0.19
 vilfo-api-client==0.3.2

 # homeassistant.components.voip
-voip-utils==0.0.2
+voip-utils==0.0.5

 # homeassistant.components.volvooncall
 volvooncall==0.10.2

tests/components/voip/test_voip.py

@@ -35,7 +35,6 @@ async def test_pipeline(
         async for _chunk in stt_stream:
             # Stream will end when VAD detects end of "speech"
             assert _chunk != bad_chunk
-            pass

         # Test empty data
         event_callback(
@@ -84,14 +83,17 @@ async def test_pipeline(
         new=async_get_media_source_audio,
     ):
         rtp_protocol = voip.voip.PipelineRtpDatagramProtocol(
-            hass, hass.config.language, voip_device
+            hass,
+            hass.config.language,
+            voip_device,
+            listening_tone_enabled=False,
         )
         rtp_protocol.transport = Mock()

         # Ensure audio queue is cleared before pipeline starts
         rtp_protocol._audio_queue.put_nowait(bad_chunk)

-        async def send_audio(*args, **kwargs):
+        def send_audio(*args, **kwargs):
             # Test finished successfully
             done.set()
@@ -123,9 +125,16 @@ async def test_pipeline_timeout(hass: HomeAssistant, voip_device: VoIPDevice) ->
     with patch(
         "homeassistant.components.voip.voip.async_pipeline_from_audio_stream",
         new=async_pipeline_from_audio_stream,
+    ), patch(
+        "homeassistant.components.voip.voip.PipelineRtpDatagramProtocol._wait_for_speech",
+        return_value=True,
     ):
         rtp_protocol = voip.voip.PipelineRtpDatagramProtocol(
-            hass, hass.config.language, voip_device, pipeline_timeout=0.001
+            hass,
+            hass.config.language,
+            voip_device,
+            pipeline_timeout=0.001,
+            listening_tone_enabled=False,
         )
         transport = Mock(spec=["close"])
         rtp_protocol.connection_made(transport)
@@ -158,7 +167,11 @@ async def test_stt_stream_timeout(hass: HomeAssistant, voip_device: VoIPDevice)
         new=async_pipeline_from_audio_stream,
     ):
         rtp_protocol = voip.voip.PipelineRtpDatagramProtocol(
-            hass, hass.config.language, voip_device, audio_timeout=0.001
+            hass,
+            hass.config.language,
+            voip_device,
+            audio_timeout=0.001,
+            listening_tone_enabled=False,
        )
        transport = Mock(spec=["close"])
        rtp_protocol.connection_made(transport)