# core/homeassistant/components/voip/assist_satellite.py

"""Assist satellite entity for VoIP integration."""

from __future__ import annotations

import asyncio
from datetime import timedelta
from enum import IntFlag
from functools import partial
import io
import logging
from pathlib import Path
import socket
import time
from typing import TYPE_CHECKING, Any, Final
import wave

from voip_utils import SIP_PORT, RtpDatagramProtocol
from voip_utils.sip import SipDatagramProtocol, SipEndpoint, get_sip_endpoint

from homeassistant.components import intent, tts
from homeassistant.components.assist_pipeline import PipelineEvent, PipelineEventType
from homeassistant.components.assist_satellite import (
    AssistSatelliteAnnouncement,
    AssistSatelliteConfiguration,
    AssistSatelliteEntity,
    AssistSatelliteEntityDescription,
    AssistSatelliteEntityFeature,
)
from homeassistant.components.intent import TimerEventType, TimerInfo
from homeassistant.components.network import async_get_source_ip
from homeassistant.config_entries import ConfigEntry
from homeassistant.core import Context, HomeAssistant, callback
from homeassistant.exceptions import HomeAssistantError
from homeassistant.helpers.entity_platform import AddConfigEntryEntitiesCallback

from .const import (
    CHANNELS,
    CONF_SIP_PORT,
    CONF_SIP_USER,
    DOMAIN,
    RATE,
    RTP_AUDIO_SETTINGS,
    WIDTH,
)
from .devices import VoIPDevice
from .entity import VoIPEntity

if TYPE_CHECKING:
    from . import DomainData

_LOGGER = logging.getLogger(__name__)

# NOTE(review): not referenced by the code visible in this module — confirm
# whether it is still needed.
_PIPELINE_TIMEOUT_SEC: Final = 30
# Seconds without an incoming audio chunk after which the caller is presumed
# to have hung up. The watcher loops sleep for half of this between checks.
_HANGUP_SEC: Final = 0.5
# Seconds to sleep before an announcement's audio is sent.
_ANNOUNCEMENT_BEFORE_DELAY: Final = 0.5
# Seconds to sleep after an announcement before it can be played again.
_ANNOUNCEMENT_AFTER_DELAY: Final = 1.0
# Seconds an outgoing call may go without any incoming audio before it is
# treated as not picked up.
_ANNOUNCEMENT_RING_TIMEOUT: Final = 30


class Tones(IntFlag):
    """Bit flags selecting which feedback tones are played to the caller."""

    # One bit per tone so that tones can be combined with `|`.
    LISTENING = 1 << 0
    PROCESSING = 1 << 1
    ERROR = 1 << 2


# Maps each tone flag to the name of its raw PCM file. _load_pcm resolves the
# name relative to the directory containing this module.
_TONE_FILENAMES: dict[Tones, str] = {
    Tones.LISTENING: "tone.pcm",
    Tones.PROCESSING: "processing.pcm",
    Tones.ERROR: "error.pcm",
}


async def async_setup_entry(
    hass: HomeAssistant,
    config_entry: ConfigEntry,
    async_add_entities: AddConfigEntryEntitiesCallback,
) -> None:
    """Create an Assist satellite entity for each known and each new VoIP device."""
    domain_data: DomainData = hass.data[DOMAIN]

    def _build_satellite(device: VoIPDevice) -> VoipAssistSatellite:
        """Wrap a VoIP device in its satellite entity."""
        return VoipAssistSatellite(hass, device, config_entry)

    @callback
    def _handle_new_device(device: VoIPDevice) -> None:
        """Add a satellite for a device that shows up after setup."""
        async_add_entities([_build_satellite(device)])

    # Subscribe to new devices first, then add the devices already present.
    domain_data.devices.async_add_new_device_listener(_handle_new_device)

    existing_satellites: list[VoIPEntity] = [
        _build_satellite(device) for device in domain_data.devices
    ]
    async_add_entities(existing_satellites)


class VoipAssistSatellite(VoIPEntity, AssistSatelliteEntity, RtpDatagramProtocol):
    """Assist satellite for VoIP devices.

    The entity is also the RTP protocol object of its device: it registers
    itself as `voip_device.protocol`, receives caller audio in `on_chunk` and
    sends tones and TTS audio back with `send_audio`.
    """

    entity_description = AssistSatelliteEntityDescription(key="assist_satellite")
    _attr_translation_key = "assist_satellite"
    _attr_name = None
    _attr_icon = "mdi:phone-classic"
    # Both features are served by the same outgoing-call code path
    # (async_announce and async_start_conversation call _do_announce).
    _attr_supported_features = (
        AssistSatelliteEntityFeature.ANNOUNCE
        | AssistSatelliteEntityFeature.START_CONVERSATION
    )

    def __init__(
        self,
        hass: HomeAssistant,
        voip_device: VoIPDevice,
        config_entry: ConfigEntry,
        tones: Tones = Tones.LISTENING | Tones.PROCESSING | Tones.ERROR,
    ) -> None:
        """Initialize an Assist satellite.

        `tones` selects which feedback tones may be played; all three are
        enabled by default. `hass` is accepted but not stored by this method.
        """
        VoIPEntity.__init__(self, voip_device)
        AssistSatelliteEntity.__init__(self)
        RtpDatagramProtocol.__init__(self)

        self.config_entry = config_entry

        # Caller audio waiting to be consumed by the STT stream; None ends the stream
        self._audio_queue: asyncio.Queue[bytes | None] = asyncio.Queue()
        # Seconds the STT stream waits for the next chunk before timing out
        self._audio_chunk_timeout: float = 2.0
        # Task running either the voice pipeline or the announcement playback
        self._run_pipeline_task: asyncio.Task | None = None
        # Set when the pipeline reports an ERROR event
        self._pipeline_had_error: bool = False
        # Set once TTS audio has been sent (or there was none to send)
        self._tts_done = asyncio.Event()
        # Seconds added to the TTS audio length when computing the send timeout
        self._tts_extra_timeout: float = 1.0
        # Cache of tone audio loaded from disk
        self._tone_bytes: dict[Tones, bytes] = {}
        self._tones = tones
        # Set after the processing tone has been sent
        self._processing_tone_done = asyncio.Event()

        # State of the current outgoing announcement call
        self._announcement: AssistSatelliteAnnouncement | None = None
        self._announcment_start_time: float = 0.0
        self._check_announcement_pickup_task: asyncio.Task | None = None
        self._check_hangup_task: asyncio.Task | None = None
        # Completed when the call ends; fails with TimeoutError on ring timeout
        self._call_end_future: asyncio.Future[Any] = asyncio.Future()
        # time.monotonic() value of the most recent incoming audio chunk
        self._last_chunk_time: float | None = None
        # Local RTP port, chosen on the first outgoing call
        self._rtp_port: int | None = None
        self._run_pipeline_after_announce: bool = False

    @property
    def pipeline_entity_id(self) -> str | None:
        """Return the entity ID of the pipeline to use for the next conversation.

        The lookup is delegated to the VoIP device.
        """
        return self.voip_device.get_pipeline_entity_id(self.hass)

    @property
    def vad_sensitivity_entity_id(self) -> str | None:
        """Return the entity ID of the VAD sensitivity to use for the next conversation.

        The lookup is delegated to the VoIP device.
        """
        return self.voip_device.get_vad_sensitivity_entity_id(self.hass)

    @property
    def tts_options(self) -> dict[str, Any] | None:
        """Text-to-speech options: WAV output at 16000 Hz, 1 channel, 2 bytes per sample."""
        options: dict[str, Any] = {tts.ATTR_PREFERRED_FORMAT: "wav"}
        options[tts.ATTR_PREFERRED_SAMPLE_RATE] = 16000
        options[tts.ATTR_PREFERRED_SAMPLE_CHANNELS] = 1
        options[tts.ATTR_PREFERRED_SAMPLE_BYTES] = 2
        return options

    async def async_added_to_hass(self) -> None:
        """Register this entity as the device's protocol and as its timer handler."""
        await super().async_added_to_hass()

        # The device hands its call over to whatever protocol object is set here.
        self.voip_device.protocol = self

        assert self.device_entry is not None
        unregister_timer_handler = intent.async_register_timer_handler(
            self.hass, self.device_entry.id, self.async_handle_timer_event
        )
        # Unregister the timer handler again when the entity is removed.
        self.async_on_remove(unregister_timer_handler)

    async def async_will_remove_from_hass(self) -> None:
        """Detach this entity from its VoIP device before removal."""
        await super().async_will_remove_from_hass()

        device = self.voip_device
        # This entity is expected to still be the device's protocol.
        assert device.protocol == self
        device.protocol = None

    @callback
    def async_get_configuration(
        self,
    ) -> AssistSatelliteConfiguration:
        """Get the current satellite configuration.

        Not implemented for VoIP satellites: always raises NotImplementedError.
        """
        raise NotImplementedError

    @callback
    def async_handle_timer_event(
        self,
        event_type: TimerEventType,
        timer_info: TimerInfo,
    ) -> None:
        """Announce a finished timer on the device; all other timer events are ignored."""
        if event_type != TimerEventType.FINISHED:
            return

        # Use the timer's name when it has one, otherwise its original duration.
        spoken_text = (
            f"{timer_info.name} finished"
            if timer_info.name
            else f"{timedelta(seconds=timer_info.created_seconds)} timer finished"
        )

        async def _announce_finished_timer():
            """Resolve the text into an announcement and play it."""
            resolved = await self._resolve_announcement_media_id(spoken_text, None)
            await self.async_announce(resolved)

        # The announcement waits for the call to end, so it runs in the background.
        self.config_entry.async_create_background_task(
            self.hass, _announce_finished_timer(), "voip_announce_timer"
        )

    async def async_set_configuration(
        self, config: AssistSatelliteConfiguration
    ) -> None:
        """Set the current satellite configuration.

        Not implemented for VoIP satellites: always raises NotImplementedError.
        """
        raise NotImplementedError

    async def async_announce(self, announcement: AssistSatelliteAnnouncement) -> None:
        """Announce media on the satellite.

        Plays announcement in a loop, blocking until the caller hangs up.
        Delegates to `_do_announce` without running a pipeline afterwards.
        """
        await self._do_announce(announcement, run_pipeline_after=False)

    async def _do_announce(
        self, announcement: AssistSatelliteAnnouncement, run_pipeline_after: bool
    ) -> None:
        """Announce media on the satellite.

        Places an outgoing SIP call to the device and waits until the call has
        ended. The announcement audio itself is started from `on_chunk` once
        audio arrives from the device.

        Optionally run a voice pipeline after the announcement has finished
        (`run_pipeline_after=True`).

        Raises HomeAssistantError if the announcement media is not from TTS,
        and TimeoutError if the call is not picked up before the ring timeout.
        """
        if announcement.media_id_source != "tts":
            raise HomeAssistantError(
                translation_domain=DOMAIN,
                translation_key="non_tts_announcement",
            )

        self._call_end_future = asyncio.Future()
        self._run_pipeline_after_announce = run_pipeline_after

        if self._rtp_port is None:
            # Choose random port for RTP: bind to port 0 and keep the port
            # number that was assigned. The context manager closes the socket
            # even if bind() or getsockname() raises.
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
                sock.setblocking(False)
                sock.bind(("", 0))
                self._rtp_port = sock.getsockname()[1]

        # HA SIP server
        source_ip = await async_get_source_ip(self.hass)
        sip_port = self.config_entry.options.get(CONF_SIP_PORT, SIP_PORT)
        sip_user = self.config_entry.options.get(CONF_SIP_USER)
        source_endpoint = get_sip_endpoint(
            host=source_ip, port=sip_port, username=sip_user
        )

        try:
            # VoIP ID is SIP header
            destination_endpoint = SipEndpoint(self.voip_device.voip_id)
        except ValueError:
            # VoIP ID is IP address
            destination_endpoint = get_sip_endpoint(
                host=self.voip_device.voip_id, port=SIP_PORT
            )

        # Reset state so we can time out if needed
        self._last_chunk_time = None
        self._announcment_start_time = time.monotonic()
        self._announcement = announcement

        # Make the call
        sip_protocol: SipDatagramProtocol = self.hass.data[DOMAIN].protocol
        call_info = sip_protocol.outgoing_call(
            source=source_endpoint,
            destination=destination_endpoint,
            rtp_port=self._rtp_port,
        )

        # Check if caller didn't pick up
        self._check_announcement_pickup_task = (
            self.config_entry.async_create_background_task(
                self.hass,
                self._check_announcement_pickup(),
                "voip_announcement_pickup",
            )
        )

        try:
            # Resolved on hang-up, or failed with TimeoutError on ring timeout
            await self._call_end_future
        except TimeoutError:
            # Stop ringing
            _LOGGER.debug("Caller did not pick up in time")
            sip_protocol.cancel_call(call_info)
            raise

    async def _check_announcement_pickup(self) -> None:
        """Continuously checks if an audio chunk was received within a time limit.

        If not, the caller is presumed to have not picked up the phone and the
        announcement is ended by failing the call-end future with TimeoutError.
        The loop also stops as soon as the first audio chunk has been received.
        """
        while True:
            current_time = time.monotonic()
            if (self._last_chunk_time is None) and (
                (current_time - self._announcment_start_time)
                > _ANNOUNCEMENT_RING_TIMEOUT
            ):
                # Ring timeout
                _LOGGER.debug("Ring timeout")
                self._announcement = None
                self._check_announcement_pickup_task = None
                # The future may already be done (for example cancelled along
                # with the task awaiting it); setting an exception on a done
                # future raises InvalidStateError.
                if not self._call_end_future.done():
                    self._call_end_future.set_exception(
                        TimeoutError("User did not pick up in time")
                    )
                _LOGGER.debug("Timed out waiting for the user to pick up the phone")
                break
            if self._last_chunk_time is not None:
                _LOGGER.debug("Picked up the phone")
                self._check_announcement_pickup_task = None
                break

            # Sleep for half the hang-up threshold between checks
            await asyncio.sleep(_HANGUP_SEC / 2)

    async def _check_hangup(self) -> None:
        """Continuously checks if an audio chunk was received within a time limit.

        If not, the caller is presumed to have hung up and the call is ended:
        the pending announcement is dropped, a running pipeline or announcement
        task is cancelled, the call-end future is resolved and `disconnect()`
        is called.
        """
        try:
            while True:
                current_time = time.monotonic()
                if (self._last_chunk_time is not None) and (
                    (current_time - self._last_chunk_time) > _HANGUP_SEC
                ):
                    # Caller hung up
                    _LOGGER.debug("Hang up")
                    self._announcement = None
                    if self._run_pipeline_task is not None:
                        _LOGGER.debug("Cancelling running pipeline")
                        self._run_pipeline_task.cancel()
                    # Only resolve the future if nothing else completed it first
                    if not self._call_end_future.done():
                        self._call_end_future.set_result(None)
                    # disconnect() also cancels and clears self._check_hangup_task
                    self.disconnect()
                    break

                # Sleep for half the hang-up threshold between checks
                await asyncio.sleep(_HANGUP_SEC / 2)
        except asyncio.CancelledError:
            # Don't swallow cancellation
            if (current_task := asyncio.current_task()) and current_task.cancelling():
                raise
            _LOGGER.debug("Check hangup cancelled")

    async def async_start_conversation(
        self, start_announcement: AssistSatelliteAnnouncement
    ) -> None:
        """Start a conversation from the satellite.

        Delegates to `_do_announce`, asking for a voice pipeline to run once
        the announcement has been played.
        """
        await self._do_announce(start_announcement, run_pipeline_after=True)

    # -------------------------------------------------------------------------
    # VoIP
    # -------------------------------------------------------------------------

    def disconnect(self):
        """Disconnect through the base protocol and stop the hang-up watcher."""
        super().disconnect()

        # Forget the watcher task and cancel it if one was running.
        watcher, self._check_hangup_task = self._check_hangup_task, None
        if watcher is not None:
            watcher.cancel()

    def connection_made(self, transport):
        """Record the connection time and start watching for a hang-up."""
        super().connection_made(transport)

        # Count the connection itself as the first sign of life from the caller.
        self._last_chunk_time = time.monotonic()

        hangup_watcher = self._check_hangup()
        self._check_hangup_task = self.config_entry.async_create_background_task(
            self.hass, hangup_watcher, "voip_hangup"
        )

    def on_chunk(self, audio_bytes: bytes) -> None:
        """Process one chunk of raw audio received from the caller.

        While an announcement is pending the chunk is not queued; it only
        triggers playback when no task is running. Otherwise the chunk is queued
        for speech-to-text, starting a new pipeline run first if none is active.
        """
        self._last_chunk_time = time.monotonic()

        pending_announcement = self._announcement
        is_idle = self._run_pipeline_task is None

        if pending_announcement is not None:
            if is_idle:
                # Play the announcement; it is played again on a later chunk
                # for as long as it stays pending.
                self._run_pipeline_task = (
                    self.config_entry.async_create_background_task(
                        self.hass,
                        self._play_announcement(pending_announcement),
                        "voip_play_announcement",
                    )
                )
            return

        if is_idle:
            # Start a fresh pipeline run with an empty queue.
            self._clear_audio_queue()
            self._tts_done.clear()
            self._run_pipeline_task = self.config_entry.async_create_background_task(
                self.hass,
                self._run_pipeline(),
                "voip_pipeline_run",
            )

        self._audio_queue.put_nowait(audio_bytes)

    async def _run_pipeline(self) -> None:
        """Run a pipeline with STT input and TTS output.

        Plays the listening tone, feeds queued caller audio into the pipeline,
        then either plays the error tone (if the pipeline reported an error) or
        waits until the TTS response has been sent. On exit the audio stream is
        ended, the device is marked inactive and `_run_pipeline_task` is reset
        so that the next incoming chunk starts a new run.
        """
        _LOGGER.debug("Starting pipeline")

        self.async_set_context(Context(user_id=self.config_entry.data["user"]))
        self.voip_device.set_is_active(True)

        async def stt_stream():
            # Yields queued audio chunks until an empty/None chunk arrives or
            # the queue times out twice. `retry` is never reset, so only one
            # timeout is tolerated over the lifetime of the stream.
            retry: bool = True
            while True:
                try:
                    async with asyncio.timeout(self._audio_chunk_timeout):
                        chunk = await self._audio_queue.get()
                        if not chunk:
                            _LOGGER.debug("STT stream got None")
                            break

                    yield chunk
                except TimeoutError:
                    _LOGGER.debug("STT Stream timed out")
                    if not retry:
                        _LOGGER.debug("No more retries, ending STT stream")
                        break
                    retry = False

        # Play listening tone at the start of each cycle
        await self._play_tone(Tones.LISTENING, silence_before=0.2)

        try:
            await self.async_accept_pipeline_from_satellite(
                audio_stream=stt_stream(),
            )

            if self._pipeline_had_error:
                _LOGGER.debug("Pipeline error")
                # Reset the flag so the next run starts clean
                self._pipeline_had_error = False
                await self._play_tone(Tones.ERROR)
            else:
                # Block until TTS is done speaking.
                #
                # This is set in _send_tts and has a timeout that's based on the
                # length of the TTS audio.
                await self._tts_done.wait()
        except TimeoutError:
            # This shouldn't happen anymore, we are detecting hang ups with a separate task
            _LOGGER.exception("Timeout error")
            self.disconnect()  # caller hung up
        except asyncio.CancelledError:
            _LOGGER.debug("Pipeline cancelled")
            # Don't swallow cancellation
            if (current_task := asyncio.current_task()) and current_task.cancelling():
                raise
        finally:
            # Stop audio stream
            await self._audio_queue.put(None)

            self.voip_device.set_is_active(False)
            # Allow on_chunk to start the next pipeline run
            self._run_pipeline_task = None
            _LOGGER.debug("Pipeline finished")

    async def _play_announcement(
        self, announcement: AssistSatelliteAnnouncement
    ) -> None:
        """Play an announcement once.

        Repetition is not handled here: `on_chunk` starts this coroutine again
        while the announcement is still pending and no task is running.
        """
        _LOGGER.debug("Playing announcement")

        # NOTE: the two early returns below happen before the try block, so the
        # cleanup in `finally` (resetting `_run_pipeline_task`) is skipped.
        if announcement.tts_token is None:
            _LOGGER.error("Only TTS announcements are supported")
            return

        await asyncio.sleep(_ANNOUNCEMENT_BEFORE_DELAY)
        stream = tts.async_get_stream(self.hass, announcement.tts_token)
        if stream is None:
            _LOGGER.error("TTS stream no longer available")
            return

        try:
            # Announcements are not preceded by a processing tone
            await self._send_tts(stream, wait_for_tone=False)
            if not self._run_pipeline_after_announce:
                # Delay before looping announcement
                await asyncio.sleep(_ANNOUNCEMENT_AFTER_DELAY)
        except Exception:
            _LOGGER.exception("Unexpected error while playing announcement")
            raise
        finally:
            # Allow on_chunk to start the next task
            self._run_pipeline_task = None
            _LOGGER.debug("Announcement finished")

            if self._run_pipeline_after_announce:
                # Clear announcement to allow pipeline to run
                _LOGGER.debug("Clearing announcement")
                self._announcement = None

    def _clear_audio_queue(self) -> None:
        """Discard every chunk currently waiting in the audio queue."""
        pending = self._audio_queue
        for _ in range(pending.qsize()):
            pending.get_nowait()

    def on_pipeline_event(self, event: PipelineEvent) -> None:
        """React to pipeline events: processing tone, TTS playback, error flag."""
        event_type = event.type

        if event_type == PipelineEventType.STT_END:
            if (self._tones & Tones.PROCESSING) != Tones.PROCESSING:
                return
            # Speech has been captured: play the processing tone in the background.
            self._processing_tone_done.clear()
            self.config_entry.async_create_background_task(
                self.hass, self._play_tone(Tones.PROCESSING), "voip_process_tone"
            )
            return

        if event_type == PipelineEventType.ERROR:
            # The error tone is played by _run_pipeline once the pipeline has
            # finished, instead of waiting for TTS.
            self._pipeline_had_error = True
            _LOGGER.warning(event)
            return

        if event_type != PipelineEventType.TTS_END:
            return

        # Look up the TTS stream to send to the caller over RTP.
        stream = None
        if event.data and (tts_output := event.data["tts_output"]):
            stream = tts.async_get_stream(self.hass, tts_output["token"])

        if not stream:
            # Nothing to play; release the pipeline immediately.
            _LOGGER.debug("Empty TTS response")
            self._tts_done.set()
            return

        self.config_entry.async_create_background_task(
            self.hass,
            self._send_tts(tts_stream=stream),
            "voip_pipeline_tts",
        )

    async def _send_tts(
        self,
        tts_stream: tts.ResultStream,
        wait_for_tone: bool = True,
    ) -> None:
        """Send TTS audio to caller via RTP.

        Collects the whole TTS result, checks that it is WAV audio in the
        expected RATE/WIDTH/CHANNELS format and sends its frames to the caller.
        With `wait_for_tone` set and the processing tone enabled, sending waits
        until that tone has finished. Does nothing if there is no transport.

        Whatever happens, the satellite is told the response has finished and
        `_tts_done` is set so a waiting pipeline run can complete.

        Raises ValueError if the audio is not WAV or has an unexpected format,
        and TimeoutError if sending takes longer than the audio length plus
        `_tts_extra_timeout`.
        """
        try:
            if self.transport is None:
                return  # not connected

            data = b"".join([chunk async for chunk in tts_stream.async_stream_result()])

            if tts_stream.extension != "wav":
                raise ValueError(
                    f"Only TTS WAV audio can be streamed, got {tts_stream.extension}"
                )

            if wait_for_tone and ((self._tones & Tones.PROCESSING) == Tones.PROCESSING):
                # Don't overlap TTS and processing beep
                _LOGGER.debug("Waiting for processing tone")
                await self._processing_tone_done.wait()

            with io.BytesIO(data) as wav_io, wave.open(wav_io, "rb") as wav_file:
                sample_rate = wav_file.getframerate()
                sample_width = wav_file.getsampwidth()
                sample_channels = wav_file.getnchannels()

                if (
                    (sample_rate != RATE)
                    or (sample_width != WIDTH)
                    or (sample_channels != CHANNELS)
                ):
                    raise ValueError(
                        f"Expected rate/width/channels as {RATE}/{WIDTH}/{CHANNELS},"
                        f" got {sample_rate}/{sample_width}/{sample_channels}"
                    )

                # Read the frames while the WAV reader is still open
                audio_bytes = wav_file.readframes(wav_file.getnframes())

            _LOGGER.debug("Sending %s byte(s) of audio", len(audio_bytes))

            # Time out 1 second after TTS audio should be finished
            tts_samples = len(audio_bytes) / (WIDTH * CHANNELS)
            tts_seconds = tts_samples / RATE

            async with asyncio.timeout(tts_seconds + self._tts_extra_timeout):
                # TTS audio is 16Khz 16-bit mono
                await self._async_send_audio(audio_bytes)
        except TimeoutError:
            _LOGGER.warning("TTS timeout")
            raise
        finally:
            # Update satellite state
            self.tts_response_finished()

            # Signal pipeline to restart
            self._tts_done.set()

    async def _async_send_audio(self, audio_bytes: bytes, **kwargs):
        """Call `send_audio` in the executor.

        RTP_AUDIO_SETTINGS and any extra keyword arguments are passed along.
        """
        send_job = partial(self.send_audio, audio_bytes, **RTP_AUDIO_SETTINGS, **kwargs)
        await self.hass.async_add_executor_job(send_job)

    async def _play_tone(self, tone: Tones, silence_before: float = 0.0) -> None:
        """Play a tone as feedback to the user if it's enabled.

        Tone audio is loaded from disk on first use and cached afterwards.
        `silence_before` is passed through to the audio sender. For the
        processing tone, `_processing_tone_done` is set afterwards even if
        loading or sending failed, so TTS playback waiting on it cannot hang.
        """
        if (self._tones & tone) != tone:
            return  # not enabled

        try:
            if tone not in self._tone_bytes:
                # Do I/O in executor
                self._tone_bytes[tone] = await self.hass.async_add_executor_job(
                    self._load_pcm,
                    _TONE_FILENAMES[tone],
                )

            await self._async_send_audio(
                self._tone_bytes[tone],
                silence_before=silence_before,
            )
        finally:
            # Always release _send_tts, which waits for this event before
            # sending the TTS audio.
            if tone == Tones.PROCESSING:
                self._processing_tone_done.set()

    def _load_pcm(self, file_name: str) -> bytes:
        """Read a raw audio file (16Khz, 16-bit mono) stored next to this module."""
        module_dir = Path(__file__).parent
        pcm_path = module_dir / file_name
        return pcm_path.read_bytes()