VoIP listening tone and "not configured" message (#91762)

* Play tone when starting a VoIP call
* Play audio message when call is rejected
* Add option to disable tone for tests
* Send RTP audio in executor to reduce jitter
* Don't start pipeline until speech
* Bump voip utils
2023-04-20 19:30:51 -05:00 · 2023-04-20 19:30:51 -05:00 · 5080654776
parent f4f3962ee9
commit 5080654776
7 changed files with 180 additions and 45 deletions
--- a/homeassistant/components/voip/manifest.json
+++ b/homeassistant/components/voip/manifest.json
@ -7,5 +7,5 @@
  "documentation": "https://www.home-assistant.io/integrations/voip",
  "iot_class": "local_push",
  "quality_scale": "internal",
-  "requirements": ["voip-utils==0.0.2"]
+  "requirements": ["voip-utils==0.0.5"]
 }
--- a/homeassistant/components/voip/not_configured.raw
+++ b/homeassistant/components/voip/not_configured.raw
--- a/homeassistant/components/voip/tone.raw
+++ b/homeassistant/components/voip/tone.raw
--- a/homeassistant/components/voip/voip.py
+++ b/homeassistant/components/voip/voip.py
@ -3,8 +3,10 @@ from __future__ import annotations

 import asyncio
 from collections import deque
-from collections.abc import AsyncIterable
+from collections.abc import AsyncIterable, MutableSequence, Sequence
+from functools import partial
 import logging
+from pathlib import Path
 import time
 from typing import TYPE_CHECKING

@ -22,6 +24,7 @@ from homeassistant.components.assist_pipeline import (
 from homeassistant.components.assist_pipeline.vad import VoiceCommandSegmenter
 from homeassistant.const import __version__
 from homeassistant.core import Context, HomeAssistant
+from homeassistant.util.ulid import ulid

 from .const import DOMAIN

@ -29,6 +32,9 @@ if TYPE_CHECKING:
    from .devices import VoIPDevice, VoIPDevices

 _BUFFERED_CHUNKS_BEFORE_SPEECH = 100  # ~2 seconds
+_TONE_DELAY = 0.2  # seconds before playing tone
+_MESSAGE_DELAY = 1.0  # seconds before playing "not configured" message
+_LOOP_DELAY = 2.0  # seconds before replaying not-configured message
 _LOGGER = logging.getLogger(__name__)


@ -44,11 +50,14 @@ class HassVoipDatagramProtocol(VoipDatagramProtocol):
                session_name="voip_hass",
                version=__version__,
            ),
-            protocol_factory=lambda call_info: PipelineRtpDatagramProtocol(
+            valid_protocol_factory=lambda call_info: PipelineRtpDatagramProtocol(
                hass,
                hass.config.language,
                devices.async_get_or_create(call_info),
            ),
+            invalid_protocol_factory=lambda call_info: NotConfiguredRtpDatagramProtocol(
+                hass,
+            ),
        )
        self.hass = hass
        self.devices = devices
@ -69,6 +78,7 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
        voip_device: VoIPDevice,
        pipeline_timeout: float = 30.0,
        audio_timeout: float = 2.0,
+        listening_tone_enabled: bool = True,
    ) -> None:
        """Set up pipeline RTP server."""
        # STT expects 16Khz mono with 16-bit samples
@ -80,11 +90,14 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
        self.pipeline: Pipeline | None = None
        self.pipeline_timeout = pipeline_timeout
        self.audio_timeout = audio_timeout
+        self.listening_tone_enabled = listening_tone_enabled

        self._audio_queue: asyncio.Queue[bytes] = asyncio.Queue()
        self._context = Context()
        self._conversation_id: str | None = None
        self._pipeline_task: asyncio.Task | None = None
+        self._session_id: str | None = None
+        self._tone_bytes: bytes | None = None

    def connection_made(self, transport):
        """Server is ready."""
@ -113,23 +126,42 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
        self,
    ) -> None:
        """Forward audio to pipeline STT and handle TTS."""
-        _LOGGER.debug("Starting pipeline")
-
-        async def stt_stream():
-            try:
-                async for chunk in self._segment_audio():
-                    yield chunk
-            except asyncio.TimeoutError:
-                # Expected after caller hangs up
-                _LOGGER.debug("Audio timeout")
-
-                if self.transport is not None:
-                    self.transport.close()
-                    self.transport = None
-            finally:
-                self._clear_audio_queue()
+        if self._session_id is None:
+            self._session_id = ulid()
+            if self.listening_tone_enabled:
+                await self._play_listening_tone()

        try:
+            # Wait for speech before starting pipeline
+            segmenter = VoiceCommandSegmenter()
+            chunk_buffer: deque[bytes] = deque(
+                maxlen=_BUFFERED_CHUNKS_BEFORE_SPEECH,
+            )
+            speech_detected = await self._wait_for_speech(
+                segmenter,
+                chunk_buffer,
+            )
+            if not speech_detected:
+                _LOGGER.debug("No speech detected")
+                return
+
+            _LOGGER.debug("Starting pipeline")
+
+            async def stt_stream():
+                try:
+                    async for chunk in self._segment_audio(
+                        segmenter,
+                        chunk_buffer,
+                    ):
+                        yield chunk
+                except asyncio.TimeoutError:
+                    # Expected after caller hangs up
+                    _LOGGER.debug("Audio timeout")
+                    self._session_id = None
+                    self.disconnect()
+                finally:
+                    self._clear_audio_queue()
+
            # Run pipeline with a timeout
            async with async_timeout.timeout(self.pipeline_timeout):
                await async_pipeline_from_audio_stream(
@ -155,17 +187,48 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
        except asyncio.TimeoutError:
            # Expected after caller hangs up
            _LOGGER.debug("Pipeline timeout")
-
-            if self.transport is not None:
-                self.transport.close()
-                self.transport = None
+            self._session_id = None
+            self.disconnect()
        finally:
            # Allow pipeline to run again
            self._pipeline_task = None

-    async def _segment_audio(self) -> AsyncIterable[bytes]:
-        segmenter = VoiceCommandSegmenter()
-        chunk_buffer: deque[bytes] = deque(maxlen=_BUFFERED_CHUNKS_BEFORE_SPEECH)
+    async def _wait_for_speech(
+        self,
+        segmenter: VoiceCommandSegmenter,
+        chunk_buffer: MutableSequence[bytes],
+    ) -> bool:
+        """Buffer audio chunks until speech is detected.
+
+        Every chunk taken from the audio queue is appended to chunk_buffer,
+        including the chunk on which the segmenter reports the start of a
+        command, so that chunk is still available when the buffer is replayed.
+
+        Returns True if speech was detected, False if an empty chunk was
+        received first. An asyncio.TimeoutError propagates to the caller if
+        no audio arrives within audio_timeout seconds.
+        """
+        # Timeout if no audio comes in for a while.
+        # This means the caller hung up.
+        async with async_timeout.timeout(self.audio_timeout):
+            chunk = await self._audio_queue.get()
+
+        while chunk:
+            # Buffer before checking the segmenter: the chunk that starts the
+            # command has already left the queue and would otherwise be lost.
+            chunk_buffer.append(chunk)
+
+            segmenter.process(chunk)
+            if segmenter.in_command:
+                return True
+
+            async with async_timeout.timeout(self.audio_timeout):
+                chunk = await self._audio_queue.get()
+
+        return False
+
+    async def _segment_audio(
+        self,
+        segmenter: VoiceCommandSegmenter,
+        chunk_buffer: Sequence[bytes],
+    ) -> AsyncIterable[bytes]:
+        """Yield audio chunks until voice command has finished."""
+        # Buffered chunks first
+        for buffered_chunk in chunk_buffer:
+            yield buffered_chunk

        # Timeout if no audio comes in for a while.
        # This means the caller hung up.
@ -177,18 +240,7 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
                # Voice command is finished
                break

-            if segmenter.in_command:
-                if chunk_buffer:
-                    # Release audio in buffer first
-                    for buffered_chunk in chunk_buffer:
-                        yield buffered_chunk
-
-                    chunk_buffer.clear()
-
-                yield chunk
-            else:
-                # Buffer until command starts
-                chunk_buffer.append(chunk)
+            yield chunk

            async with async_timeout.timeout(self.audio_timeout):
                chunk = await self._audio_queue.get()
@ -225,4 +277,74 @@ class PipelineRtpDatagramProtocol(RtpDatagramProtocol):
        _LOGGER.debug("Sending %s byte(s) of audio", len(audio_bytes))

        # Assume TTS audio is 16Khz 16-bit mono
-        await self.send_audio(audio_bytes, rate=16000, width=2, channels=1)
+        await self.hass.async_add_executor_job(
+            partial(self.send_audio, audio_bytes, rate=16000, width=2, channels=1)
+        )
+
+    async def _play_listening_tone(self) -> None:
+        """Send the "listening" tone to the caller.
+
+        The tone audio is read from disk on first use and cached on the
+        instance. Both the file read and the audio send run in the executor.
+        """
+        tone = self._tone_bytes
+        if tone is None:
+            # Do I/O in executor
+            tone = await self.hass.async_add_executor_job(self._load_tone)
+            self._tone_bytes = tone
+
+        # Tone is 16Khz 16-bit mono, preceded by a short silence
+        play_tone = partial(
+            self.send_audio,
+            tone,
+            rate=16000,
+            width=2,
+            channels=1,
+            silence_before=_TONE_DELAY,
+        )
+        await self.hass.async_add_executor_job(play_tone)
+
+    def _load_tone(self) -> bytes:
+        """Read tone.raw (raw 16Khz, 16-bit mono audio) from this module's directory."""
+        module_dir = Path(__file__).parent
+        tone_path = module_dir / "tone.raw"
+        return tone_path.read_bytes()
+
+
+class NotConfiguredRtpDatagramProtocol(RtpDatagramProtocol):
+    """Plays a looping audio message asking the user to configure the phone.
+
+    The message is started by the first incoming audio chunk and may start
+    again once the previous playback and its trailing delay have finished.
+    """
+
+    def __init__(self, hass: HomeAssistant) -> None:
+        """Set up RTP server."""
+        super().__init__(rate=16000, width=2, channels=1)
+        self.hass = hass
+        # Background task for the playback in progress, if any
+        self._audio_task: asyncio.Task | None = None
+        # Cached message audio, loaded on first playback
+        self._audio_bytes: bytes | None = None
+
+    def on_chunk(self, audio_bytes: bytes) -> None:
+        """Handle raw audio chunk.
+
+        The incoming audio is ignored; a chunk only triggers playback of the
+        message when no playback is already running.
+        """
+        if self.transport is None:
+            return
+
+        if self._audio_task is None:
+            self._audio_task = self.hass.async_create_background_task(
+                self._play_message(),
+                "voip_not_connected",
+            )
+
+    async def _play_message(self) -> None:
+        """Play the message once, then wait before allowing a replay."""
+        if self._audio_bytes is None:
+            # Do I/O in executor instead of blocking the event loop
+            self._audio_bytes = await self.hass.async_add_executor_job(
+                self._load_message,
+            )
+
+        await self.hass.async_add_executor_job(
+            partial(
+                self.send_audio,
+                self._audio_bytes,
+                rate=16000,
+                width=2,
+                channels=1,
+                silence_before=_MESSAGE_DELAY,
+            )
+        )
+
+        await asyncio.sleep(_LOOP_DELAY)
+
+        # Allow message to play again
+        self._audio_task = None
+
+    def _load_message(self) -> bytes:
+        """Load raw message audio (16Khz, 16-bit mono)."""
+        return (Path(__file__).parent / "not_configured.raw").read_bytes()
--- a/requirements_all.txt
+++ b/requirements_all.txt
@ -2591,7 +2591,7 @@ venstarcolortouch==0.19
 vilfo-api-client==0.3.2

 # homeassistant.components.voip
-voip-utils==0.0.2
+voip-utils==0.0.5

 # homeassistant.components.volkszaehler
 volkszaehler==0.4.0
--- a/requirements_test_all.txt
+++ b/requirements_test_all.txt
@ -1867,7 +1867,7 @@ venstarcolortouch==0.19
 vilfo-api-client==0.3.2

 # homeassistant.components.voip
-voip-utils==0.0.2
+voip-utils==0.0.5

 # homeassistant.components.volvooncall
 volvooncall==0.10.2
--- a/tests/components/voip/test_voip.py
+++ b/tests/components/voip/test_voip.py
@ -35,7 +35,6 @@ async def test_pipeline(
        async for _chunk in stt_stream:
            # Stream will end when VAD detects end of "speech"
            assert _chunk != bad_chunk
-            pass

        # Test empty data
        event_callback(
@ -84,14 +83,17 @@ async def test_pipeline(
        new=async_get_media_source_audio,
    ):
        rtp_protocol = voip.voip.PipelineRtpDatagramProtocol(
-            hass, hass.config.language, voip_device
+            hass,
+            hass.config.language,
+            voip_device,
+            listening_tone_enabled=False,
        )
        rtp_protocol.transport = Mock()

        # Ensure audio queue is cleared before pipeline starts
        rtp_protocol._audio_queue.put_nowait(bad_chunk)

-        async def send_audio(*args, **kwargs):
+        def send_audio(*args, **kwargs):
            # Test finished successfully
            done.set()

@ -123,9 +125,16 @@ async def test_pipeline_timeout(hass: HomeAssistant, voip_device: VoIPDevice) ->
    with patch(
        "homeassistant.components.voip.voip.async_pipeline_from_audio_stream",
        new=async_pipeline_from_audio_stream,
+    ), patch(
+        "homeassistant.components.voip.voip.PipelineRtpDatagramProtocol._wait_for_speech",
+        return_value=True,
    ):
        rtp_protocol = voip.voip.PipelineRtpDatagramProtocol(
-            hass, hass.config.language, voip_device, pipeline_timeout=0.001
+            hass,
+            hass.config.language,
+            voip_device,
+            pipeline_timeout=0.001,
+            listening_tone_enabled=False,
        )
        transport = Mock(spec=["close"])
        rtp_protocol.connection_made(transport)
@ -158,7 +167,11 @@ async def test_stt_stream_timeout(hass: HomeAssistant, voip_device: VoIPDevice)
        new=async_pipeline_from_audio_stream,
    ):
        rtp_protocol = voip.voip.PipelineRtpDatagramProtocol(
-            hass, hass.config.language, voip_device, audio_timeout=0.001
+            hass,
+            hass.config.language,
+            voip_device,
+            audio_timeout=0.001,
+            listening_tone_enabled=False,
        )
        transport = Mock(spec=["close"])
        rtp_protocol.connection_made(transport)