VoIP listening tone and "not configured" message (#91762)

* Play tone when starting a VoIP call
* Play audio message when call is rejected
* Add option to disable tone for tests
* Send RTP audio in executor to reduce jitter
* Don't start pipeline until speech
* Bump voip utils
2023-04-20 19:30:51 -05:00 · 2023-04-20 19:30:51 -05:00 · 5080654776
parent f4f3962ee9
commit 5080654776
7 changed files with 180 additions and 45 deletions
--- a/homeassistant/components/voip/manifest.json
+++ b/homeassistant/components/voip/manifest.json
@ -7,5 +7,5 @@
  "documentation": "https://www.home-assistant.io/integrations/voip",
  "iot_class": "local_push",
  "quality_scale": "internal",
-  "requirements": ["voip-utils==0.0.2"]
+  "requirements": ["voip-utils==0.0.5"]
 }
--- a/homeassistant/components/voip/not_configured.raw
+++ b/homeassistant/components/voip/not_configured.raw
--- a/homeassistant/components/voip/tone.raw
+++ b/homeassistant/components/voip/tone.raw
--- a/homeassistant/components/voip/voip.py
+++ b/homeassistant/components/voip/voip.py
@ -3,8 +3,10 @@ from __future__ import annotations
 import asyncio
 from collections import deque
-from collections.abc import AsyncIterable
+from collections.abc import AsyncIterable, MutableSequence, Sequence
 from functools import partial
 import logging
 from pathlib import Path
 import time
 from typing import TYPE_CHECKING
@ -22,6 +24,7 @@ from homeassistant.components.assist_pipeline import (
 from homeassistant.components.assist_pipeline.vad import VoiceCommandSegmenter
 from homeassistant.const import __version__
 from homeassistant.core import Context, HomeAssistant
 from homeassistant.util.ulid import ulid
 from .const import DOMAIN
@ -29,6 +32,9 @@ if TYPE_CHECKING:
    from .devices import VoIPDevice, VoIPDevices
 _BUFFERED_CHUNKS_BEFORE_SPEECH = 100  # ~2 seconds
 _TONE_DELAY = 0.2  # seconds before playing tone
 _MESSAGE_DELAY = 1.0  # seconds before playing "not configured" message
 _LOOP_DELAY = 2.0  # seconds before replaying not-configured message
 _LOGGER = logging.getLogger(__name__)
@ -44,11 +50,14 @@ class HassVoipDatagramProtocol(VoipDatagramProtocol):
                session_name="voip_hass",
                version=__version__,
            ),
-            protocol_factory=lambda call_info: PipelineRtpDatagramProtocol(
+            valid_protocol_factory=lambda call_info: PipelineRtpDatagramProtocol(
                hass,
                hass.config.language,
                devices.async_get_or_create(call_info),
            ),
            invalid_protocol_factory=lambda call_info: NotConfiguredRtpDatagramProtocol(
                hass,
            ),
        )
        self.hass = hass
        self.devices = devices
@ -69,6 +78,7 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
        voip_device: VoIPDevice,
        pipeline_timeout: float = 30.0,
        audio_timeout: float = 2.0,
        listening_tone_enabled: bool = True,
    ) -> None:
        """Set up pipeline RTP server."""
        # STT expects 16Khz mono with 16-bit samples
@ -80,11 +90,14 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
        self.pipeline: Pipeline | None = None
        self.pipeline_timeout = pipeline_timeout
        self.audio_timeout = audio_timeout
        self.listening_tone_enabled = listening_tone_enabled
        self._audio_queue: asyncio.Queue[bytes] = asyncio.Queue()
        self._context = Context()
        self._conversation_id: str | None = None
        self._pipeline_task: asyncio.Task | None = None
        self._session_id: str | None = None
        self._tone_bytes: bytes | None = None
    def connection_made(self, transport):
        """Server is ready."""
@ -113,23 +126,42 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
        self,
    ) -> None:
        """Forward audio to pipeline STT and handle TTS."""
        if self._session_id is None:
            self._session_id = ulid()
            if self.listening_tone_enabled:
                await self._play_listening_tone()
        try:
            # Wait for speech before starting pipeline
            segmenter = VoiceCommandSegmenter()
            chunk_buffer: deque[bytes] = deque(
                maxlen=_BUFFERED_CHUNKS_BEFORE_SPEECH,
            )
            speech_detected = await self._wait_for_speech(
                segmenter,
                chunk_buffer,
            )
            if not speech_detected:
                _LOGGER.debug("No speech detected")
                return
            _LOGGER.debug("Starting pipeline")
            async def stt_stream():
                try:
-                async for chunk in self._segment_audio():
+                    async for chunk in self._segment_audio(
                        segmenter,
                        chunk_buffer,
                    ):
                        yield chunk
                except asyncio.TimeoutError:
                    # Expected after caller hangs up
                    _LOGGER.debug("Audio timeout")
-
+                    self._session_id = None
-                if self.transport is not None:
+                    self.disconnect()
                    self.transport.close()
                    self.transport = None
                finally:
                    self._clear_audio_queue()
        try:
            # Run pipeline with a timeout
            async with async_timeout.timeout(self.pipeline_timeout):
                await async_pipeline_from_audio_stream(
@ -155,17 +187,48 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
        except asyncio.TimeoutError:
            # Expected after caller hangs up
            _LOGGER.debug("Pipeline timeout")
-
+            self._session_id = None
-            if self.transport is not None:
+            self.disconnect()
                self.transport.close()
                self.transport = None
        finally:
            # Allow pipeline to run again
            self._pipeline_task = None
-    async def _segment_audio(self) -> AsyncIterable[bytes]:
+    async def _wait_for_speech(
-        segmenter = VoiceCommandSegmenter()
+        self,
-        chunk_buffer: deque[bytes] = deque(maxlen=_BUFFERED_CHUNKS_BEFORE_SPEECH)
+        segmenter: VoiceCommandSegmenter,
        chunk_buffer: MutableSequence[bytes],
    ):
        """Buffer audio chunks until speech is detected.
        Returns True if speech was detected, False otherwise.
        """
        # Timeout if no audio comes in for a while.
        # This means the caller hung up.
        async with async_timeout.timeout(self.audio_timeout):
            chunk = await self._audio_queue.get()
        while chunk:
            segmenter.process(chunk)
            if segmenter.in_command:
                return True
            # Buffer until command starts
            chunk_buffer.append(chunk)
            async with async_timeout.timeout(self.audio_timeout):
                chunk = await self._audio_queue.get()
        return False
    async def _segment_audio(
        self,
        segmenter: VoiceCommandSegmenter,
        chunk_buffer: Sequence[bytes],
    ) -> AsyncIterable[bytes]:
        """Yield audio chunks until voice command has finished."""
        # Buffered chunks first
        for buffered_chunk in chunk_buffer:
            yield buffered_chunk
        # Timeout if no audio comes in for a while.
        # This means the caller hung up.
@ -177,18 +240,7 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
                # Voice command is finished
                break
            if segmenter.in_command:
                if chunk_buffer:
                    # Release audio in buffer first
                    for buffered_chunk in chunk_buffer:
                        yield buffered_chunk
                    chunk_buffer.clear()
            yield chunk
            else:
                # Buffer until command starts
                chunk_buffer.append(chunk)
            async with async_timeout.timeout(self.audio_timeout):
                chunk = await self._audio_queue.get()
@ -225,4 +277,74 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
        _LOGGER.debug("Sending %s byte(s) of audio", len(audio_bytes))
        # Assume TTS audio is 16Khz 16-bit mono
-        await self.send_audio(audio_bytes, rate=16000, width=2, channels=1)
+        await self.hass.async_add_executor_job(
            partial(self.send_audio, audio_bytes, rate=16000, width=2, channels=1)
        )
    async def _play_listening_tone(self) -> None:
        """Send the listening tone to the caller.

        The raw tone is read on first use and cached on the instance. Both
        the file read and the audio send are run through the executor.
        """
        if self._tone_bytes is None:
            # Reading the file blocks, so keep it off the event loop
            load_job = self.hass.async_add_executor_job(self._load_tone)
            self._tone_bytes = await load_job

        # Tone is sent as 16Khz, 16-bit mono audio after a short silence
        send_tone = partial(
            self.send_audio,
            self._tone_bytes,
            rate=16000,
            width=2,
            channels=1,
            silence_before=_TONE_DELAY,
        )
        await self.hass.async_add_executor_job(send_tone)
    def _load_tone(self) -> bytes:
        """Read the tone file next to this module (raw 16Khz, 16-bit mono)."""
        tone_path = Path(__file__).parent / "tone.raw"
        return tone_path.read_bytes()
 class NotConfiguredRtpDatagramProtocol(RtpDatagramProtocol):
    """Plays audio on a loop to inform the user to configure the phone in Home Assistant."""

    def __init__(self, hass: HomeAssistant) -> None:
        """Set up RTP server."""
        super().__init__(rate=16000, width=2, channels=1)
        self.hass = hass
        # Background task that is currently playing the message, if any
        self._audio_task: asyncio.Task | None = None
        # Cached contents of the message file, loaded on first playback
        self._audio_bytes: bytes | None = None

    def on_chunk(self, audio_bytes: bytes) -> None:
        """Handle raw audio chunk.

        Incoming audio is ignored; a chunk only triggers playback of the
        message when no playback task is currently active.
        """
        if self.transport is None:
            return

        if self._audio_task is None:
            self._audio_task = self.hass.async_create_background_task(
                self._play_message(),
                "voip_not_connected",
            )

    async def _play_message(self) -> None:
        """Play the message once, then wait before allowing a replay."""
        if self._audio_bytes is None:
            # Do I/O in executor so the file read does not block the event loop
            self._audio_bytes = await self.hass.async_add_executor_job(
                self._load_message,
            )

        await self.hass.async_add_executor_job(
            partial(
                self.send_audio,
                self._audio_bytes,
                rate=16000,
                width=2,
                channels=1,
                silence_before=_MESSAGE_DELAY,
            )
        )

        await asyncio.sleep(_LOOP_DELAY)

        # Allow message to play again
        self._audio_task = None

    def _load_message(self) -> bytes:
        """Load raw message audio (16Khz, 16-bit mono)."""
        return (Path(__file__).parent / "not_configured.raw").read_bytes()
--- a/requirements_all.txt
+++ b/requirements_all.txt
@ -2591,7 +2591,7 @@ venstarcolortouch==0.19
 vilfo-api-client==0.3.2
 # homeassistant.components.voip
-voip-utils==0.0.2
+voip-utils==0.0.5
 # homeassistant.components.volkszaehler
 volkszaehler==0.4.0
--- a/requirements_test_all.txt
+++ b/requirements_test_all.txt
@ -1867,7 +1867,7 @@ venstarcolortouch==0.19
 vilfo-api-client==0.3.2
 # homeassistant.components.voip
-voip-utils==0.0.2
+voip-utils==0.0.5
 # homeassistant.components.volvooncall
 volvooncall==0.10.2
--- a/tests/components/voip/test_voip.py
+++ b/tests/components/voip/test_voip.py
@ -35,7 +35,6 @@ async def test_pipeline(
        async for _chunk in stt_stream:
            # Stream will end when VAD detects end of "speech"
            assert _chunk != bad_chunk
            pass
        # Test empty data
        event_callback(
@ -84,14 +83,17 @@ async def test_pipeline(
        new=async_get_media_source_audio,
    ):
        rtp_protocol = voip.voip.PipelineRtpDatagramProtocol(
-            hass, hass.config.language, voip_device
+            hass,
            hass.config.language,
            voip_device,
            listening_tone_enabled=False,
        )
        rtp_protocol.transport = Mock()
        # Ensure audio queue is cleared before pipeline starts
        rtp_protocol._audio_queue.put_nowait(bad_chunk)
-        async def send_audio(*args, **kwargs):
+        def send_audio(*args, **kwargs):
            # Test finished successfully
            done.set()
@ -123,9 +125,16 @@ async def test_pipeline_timeout(hass: HomeAssistant, voip_device: VoIPDevice) ->
    with patch(
        "homeassistant.components.voip.voip.async_pipeline_from_audio_stream",
        new=async_pipeline_from_audio_stream,
    ), patch(
        "homeassistant.components.voip.voip.PipelineRtpDatagramProtocol._wait_for_speech",
        return_value=True,
    ):
        rtp_protocol = voip.voip.PipelineRtpDatagramProtocol(
-            hass, hass.config.language, voip_device, pipeline_timeout=0.001
+            hass,
            hass.config.language,
            voip_device,
            pipeline_timeout=0.001,
            listening_tone_enabled=False,
        )
        transport = Mock(spec=["close"])
        rtp_protocol.connection_made(transport)
@ -158,7 +167,11 @@ async def test_stt_stream_timeout(hass: HomeAssistant, voip_device: VoIPDevice)
        new=async_pipeline_from_audio_stream,
    ):
        rtp_protocol = voip.voip.PipelineRtpDatagramProtocol(
-            hass, hass.config.language, voip_device, audio_timeout=0.001
+            hass,
            hass.config.language,
            voip_device,
            audio_timeout=0.001,
            listening_tone_enabled=False,
        )
        transport = Mock(spec=["close"])
        rtp_protocol.connection_made(transport)