Improve websocket message coalescing to handle thundering herds better (#118268)
* Increase websocket peak messages to match max expected entities
During startup the websocket would frequently disconnect if more than
4096 entities were added back to back. Some MQTT setups have more than
10000 entities, so match the websocket peak value to the maximum expected
number of entities.
* coalesce more
* delay more if the backlog gets large
* wait to send if the queue is building rapidly
* tweak
* tweak for chrome since it works great in firefox but chrome cannot handle it
* Revert "tweak for chrome since it works great in firefox but chrome cannot handle it"
This reverts commit 439e2d76b1.
* adjust for chrome
* lower number
* remove code
* fixes
* fast path for bytes
* compact
* adjust test since we see the close right away now on overload
* simplify check
* reduce loop
* tweak
* handle ready right away
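
The commits above boil down to one technique: queue outgoing JSON payloads and, when the client and backlog allow it, send them as a single JSON array frame instead of one websocket write per message. A minimal standalone sketch of that idea (illustrative names only, not the handler's actual API):

# Queued JSON payloads are drained in one go and sent as one JSON array frame,
# so a burst of state changes costs a single websocket write instead of thousands.
from collections import deque


def coalesce_pending(queue: deque[bytes]) -> bytes:
    """Join every queued JSON payload into one JSON array frame."""
    frame = b"".join((b"[", b",".join(queue), b"]"))
    queue.clear()
    return frame


pending: deque[bytes] = deque([b'{"id":1}', b'{"id":2}', b'{"id":3}'])
assert coalesce_pending(pending) == b'[{"id":1},{"id":2},{"id":3}]'

With 10000 entities firing state changes at startup, this turns thousands of small writes into a handful of large ones, which is what keeps slow clients connected.
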
@@ -125,6 +125,7 @@ as part of a config flow.
 from __future__ import annotations
 
+import asyncio
 from collections.abc import Callable
 from datetime import datetime, timedelta
 from http import HTTPStatus
@@ -168,6 +169,8 @@ type RetrieveResultType = Callable[[str, str], Credentials | None]
 
 CONFIG_SCHEMA = cv.empty_config_schema(DOMAIN)
 
+DELETE_CURRENT_TOKEN_DELAY = 2
+
 
 @bind_hass
 def create_auth_code(
@@ -644,11 +647,34 @@ def websocket_delete_all_refresh_tokens(
     else:
         connection.send_result(msg["id"], {})
 
+    async def _delete_current_token_soon() -> None:
+        """Delete the current token after a delay.
+
+        We do not want to delete the current token immediately as it will
+        close the connection.
+
+        This is implemented as a tracked task to ensure the token
+        is still deleted if Home Assistant is shut down during
+        the delay.
+
+        It should not be refactored to use a call_later as that
+        would not be tracked and the token would not be deleted
+        if Home Assistant was shut down during the delay.
+        """
+        try:
+            await asyncio.sleep(DELETE_CURRENT_TOKEN_DELAY)
+        finally:
+            # If the task is cancelled because we are shutting down, delete
+            # the token right away.
+            hass.auth.async_remove_refresh_token(current_refresh_token)
+
     if delete_current_token and (
         not limit_token_types or current_refresh_token.token_type == token_type
     ):
-        # This will close the connection so we need to send the result first.
-        hass.loop.call_soon(hass.auth.async_remove_refresh_token, current_refresh_token)
+        # Deleting the token will close the connection so we need
+        # to do it with a delay in a tracked task to ensure it still
+        # happens if Home Assistant is shutting down.
+        hass.async_create_task(_delete_current_token_soon())
 
 
 @websocket_api.websocket_command(
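
The docstring above explains why the delayed deletion runs in a tracked task rather than loop.call_later: a tracked task is cancelled (and therefore finalized) at shutdown, and the try/finally guarantees the token is still removed in that case. A standalone sketch of the pattern using only stdlib asyncio (hypothetical names, not Home Assistant's task-tracking API):

import asyncio

DELAY = 2  # stand-in for DELETE_CURRENT_TOKEN_DELAY


async def _remove_token_soon(remove_token) -> None:
    try:
        await asyncio.sleep(DELAY)
    finally:
        # Runs on normal expiry *and* when the task is cancelled at shutdown.
        remove_token()


async def main() -> None:
    task = asyncio.create_task(_remove_token_soon(lambda: print("token removed")))
    await asyncio.sleep(0)  # let the task start sleeping
    task.cancel()           # simulate shutdown cancelling tracked tasks
    try:
        await task
    except asyncio.CancelledError:
        pass                # the cleanup already ran in the finally block


asyncio.run(main())
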
@@ -25,8 +25,15 @@ PENDING_MSG_PEAK_TIME: Final = 5
 # Maximum number of messages that can be pending at any given time.
 # This is effectively the upper limit of the number of entities
 # that can fire state changes within ~1 second.
+# Ideally we would use homeassistant.const.MAX_EXPECTED_ENTITY_IDS
+# but since chrome will lock up with too many messages we need to
+# limit it to a lower number.
 MAX_PENDING_MSG: Final = 4096
 
+# Maximum number of messages that are pending before we force
+# resolve the ready future.
+PENDING_MSG_MAX_FORCE_READY: Final = 256
+
 ERR_ID_REUSE: Final = "id_reuse"
 ERR_INVALID_FORMAT: Final = "invalid_format"
 ERR_NOT_ALLOWED: Final = "not_allowed"
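
Rough interplay of these limits, as a hedged illustration rather than code from the change: the queue may grow to MAX_PENDING_MSG before the client is considered too slow and disconnected, while PENDING_MSG_MAX_FORCE_READY caps how long the writer keeps waiting for more messages to coalesce.

# Illustrative helper only; the real handler spreads this logic across methods.
MAX_PENDING_MSG = 4096
PENDING_MSG_MAX_FORCE_READY = 256


def queue_action(pending: int, grew_since_last_check: bool) -> str:
    """Decide what the handler does for a queue of `pending` messages."""
    if pending >= MAX_PENDING_MSG:
        return "disconnect client"          # it cannot keep up
    if grew_since_last_check and pending < PENDING_MSG_MAX_FORCE_READY:
        return "wait and coalesce more"     # reschedule the ready check
    return "flush coalesced frame"          # release the ready future


assert queue_action(10, grew_since_last_check=True) == "wait and coalesce more"
assert queue_action(300, grew_since_last_check=True) == "flush coalesced frame"
assert queue_action(5000, grew_since_last_check=False) == "disconnect client"
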
@@ -24,6 +24,7 @@ from .auth import AUTH_REQUIRED_MESSAGE, AuthPhase
 from .const import (
     DATA_CONNECTIONS,
     MAX_PENDING_MSG,
+    PENDING_MSG_MAX_FORCE_READY,
     PENDING_MSG_PEAK,
     PENDING_MSG_PEAK_TIME,
     SIGNAL_WEBSOCKET_CONNECTED,
@@ -67,6 +68,7 @@ class WebSocketHandler:
 
     __slots__ = (
         "_hass",
+        "_loop",
         "_request",
         "_wsock",
         "_handle_task",
@@ -78,11 +80,13 @@ class WebSocketHandler:
         "_connection",
         "_message_queue",
         "_ready_future",
+        "_release_ready_queue_size",
     )
 
     def __init__(self, hass: HomeAssistant, request: web.Request) -> None:
         """Initialize an active connection."""
         self._hass = hass
+        self._loop = hass.loop
         self._request: web.Request = request
         self._wsock = web.WebSocketResponse(heartbeat=55)
         self._handle_task: asyncio.Task | None = None
@@ -97,8 +101,9 @@ class WebSocketHandler:
         # to where messages are queued. This allows the implementation
         # to use a deque and an asyncio.Future to avoid the overhead of
         # an asyncio.Queue.
-        self._message_queue: deque[bytes | None] = deque()
-        self._ready_future: asyncio.Future[None] | None = None
+        self._message_queue: deque[bytes] = deque()
+        self._ready_future: asyncio.Future[int] | None = None
+        self._release_ready_queue_size: int = 0
 
     def __repr__(self) -> str:
         """Return the representation."""
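
The comment above this hunk describes swapping asyncio.Queue for a bare deque plus an asyncio.Future; with this change the future now resolves to the number of ready messages, hence Future[int]. A standalone sketch of that hand-off (illustrative, not the handler itself):

# The producer appends and resolves the pending future with the ready count; the
# consumer only awaits a future when the deque is empty, avoiding asyncio.Queue's
# per-item synchronization overhead.
import asyncio
from collections import deque

queue: deque[bytes] = deque()
ready: asyncio.Future[int] | None = None


def produce(payload: bytes) -> None:
    queue.append(payload)
    if ready is not None and not ready.done():
        ready.set_result(len(queue))  # wake the consumer with the ready count


async def consume() -> None:
    global ready
    loop = asyncio.get_running_loop()
    for _ in range(2):
        if not queue:
            ready = loop.create_future()
            await ready  # sleep until produce() resolves the future
        print(b",".join(queue))
        queue.clear()


async def main() -> None:
    consumer = asyncio.create_task(consume())
    await asyncio.sleep(0)    # let the consumer start waiting
    produce(b'{"id":1}')
    produce(b'{"id":2}')      # second message piggybacks on the same wake-up
    await asyncio.sleep(0)
    produce(b'{"id":3}')
    await consumer


asyncio.run(main())
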
@@ -126,45 +131,35 @@ class WebSocketHandler:
         message_queue = self._message_queue
         logger = self._logger
         wsock = self._wsock
-        loop = self._hass.loop
+        loop = self._loop
+        is_debug_log_enabled = partial(logger.isEnabledFor, logging.DEBUG)
         debug = logger.debug
-        is_enabled_for = logger.isEnabledFor
-        logging_debug = logging.DEBUG
+        can_coalesce = self._connection and self._connection.can_coalesce
+        ready_message_count = len(message_queue)
         # Exceptions if Socket disconnected or cancelled by connection handler
         try:
             while not wsock.closed:
-                if (messages_remaining := len(message_queue)) == 0:
+                if not message_queue:
                     self._ready_future = loop.create_future()
-                    await self._ready_future
-                    messages_remaining = len(message_queue)
+                    ready_message_count = await self._ready_future
 
-                # A None message is used to signal the end of the connection
-                if (message := message_queue.popleft()) is None:
+                if self._closing:
                     return
 
-                debug_enabled = is_enabled_for(logging_debug)
-                messages_remaining -= 1
+                if not can_coalesce:
+                    # coalesce may be enabled later in the connection
+                    can_coalesce = self._connection and self._connection.can_coalesce
 
-                if (
-                    not messages_remaining
-                    or not (connection := self._connection)
-                    or not connection.can_coalesce
-                ):
-                    if debug_enabled:
+                if not can_coalesce or ready_message_count == 1:
+                    message = message_queue.popleft()
+                    if is_debug_log_enabled():
                         debug("%s: Sending %s", self.description, message)
                     await send_bytes_text(message)
                     continue
 
-                messages: list[bytes] = [message]
-                while messages_remaining:
-                    # A None message is used to signal the end of the connection
-                    if (message := message_queue.popleft()) is None:
-                        return
-                    messages.append(message)
-                    messages_remaining -= 1
-
-                coalesced_messages = b"".join((b"[", b",".join(messages), b"]"))
-                if debug_enabled:
+                coalesced_messages = b"".join((b"[", b",".join(message_queue), b"]"))
+                message_queue.clear()
+                if is_debug_log_enabled():
                     debug("%s: Sending %s", self.description, coalesced_messages)
                 await send_bytes_text(coalesced_messages)
         except asyncio.CancelledError:
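
A side note on the logging fast path introduced here: binding the level check once with functools.partial means each queued message costs a single cheap call instead of repeated attribute lookups. A small standalone illustration, not code from the change:

import logging
from functools import partial

logger = logging.getLogger("websocket_api.sketch")
is_debug_log_enabled = partial(logger.isEnabledFor, logging.DEBUG)

for payload in (b'{"id":1}', b'{"id":2}'):
    if is_debug_log_enabled():          # one cheap call per message
        logger.debug("Sending %s", payload)
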
@@ -197,14 +192,15 @@ class WebSocketHandler:
             # max pending messages.
             return
 
-        if isinstance(message, dict):
-            message = message_to_json_bytes(message)
-        elif isinstance(message, str):
-            message = message.encode("utf-8")
+        if type(message) is not bytes:  # noqa: E721
+            if isinstance(message, dict):
+                message = message_to_json_bytes(message)
+            elif isinstance(message, str):
+                message = message.encode("utf-8")
 
         message_queue = self._message_queue
-        queue_size_before_add = len(message_queue)
-        if queue_size_before_add >= MAX_PENDING_MSG:
+        message_queue.append(message)
+        if (queue_size_after_add := len(message_queue)) >= MAX_PENDING_MSG:
            self._logger.error(
                (
                    "%s: Client unable to keep up with pending messages. Reached %s pending"
@@ -218,14 +214,14 @@ class WebSocketHandler:
             self._cancel()
             return
 
-        message_queue.append(message)
-        ready_future = self._ready_future
-        if ready_future and not ready_future.done():
-            ready_future.set_result(None)
+        if self._release_ready_queue_size == 0:
+            # Try to coalesce more messages to reduce the number of writes
+            self._release_ready_queue_size = queue_size_after_add
+            self._loop.call_soon(self._release_ready_future_or_reschedule)
 
         peak_checker_active = self._peak_checker_unsub is not None
 
-        if queue_size_before_add <= PENDING_MSG_PEAK:
+        if queue_size_after_add <= PENDING_MSG_PEAK:
             if peak_checker_active:
                 self._cancel_peak_checker()
             return
@@ -235,6 +231,32 @@ class WebSocketHandler:
             self._hass, PENDING_MSG_PEAK_TIME, self._check_write_peak
         )
 
+    @callback
+    def _release_ready_future_or_reschedule(self) -> None:
+        """Release the ready future or reschedule.
+
+        We will release the ready future if the queue did not grow since the
+        last time we tried to release the ready future.
+
+        If we reach PENDING_MSG_MAX_FORCE_READY, we will release the ready future
+        immediately to avoid the coalesced messages from growing too large.
+        """
+        if not (ready_future := self._ready_future) or not (
+            queue_size := len(self._message_queue)
+        ):
+            self._release_ready_queue_size = 0
+            return
+        # If we are below the max pending to force ready, and there are new messages
+        # in the queue since the last time we tried to release the ready future, we
+        # try again later so we can coalesce more messages.
+        if queue_size > self._release_ready_queue_size < PENDING_MSG_MAX_FORCE_READY:
+            self._release_ready_queue_size = queue_size
+            self._loop.call_soon(self._release_ready_future_or_reschedule)
+            return
+        self._release_ready_queue_size = 0
+        if not ready_future.done():
+            ready_future.set_result(queue_size)
+
     @callback
     def _check_write_peak(self, _utc_time: dt.datetime) -> None:
         """Check that we are no longer above the write peak."""
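
The chained comparison in the new method packs two checks into one expression: the queue grew since the last look, and the last observed size was still below the force-ready threshold, so it is worth waiting one more loop iteration to coalesce. A hedged, event-loop-free restatement:

PENDING_MSG_MAX_FORCE_READY = 256


def should_reschedule(queue_size: int, last_seen: int) -> bool:
    # Same as the chained form: queue_size > last_seen < PENDING_MSG_MAX_FORCE_READY
    return queue_size > last_seen and last_seen < PENDING_MSG_MAX_FORCE_READY


assert should_reschedule(40, 10)        # still growing: keep coalescing
assert not should_reschedule(40, 40)    # no growth since last check: release now
assert not should_reschedule(500, 400)  # past the force threshold: release now
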
@@ -440,10 +462,8 @@ class WebSocketHandler:
             connection.async_handle_close()
 
         self._closing = True
-
-        self._message_queue.append(None)
         if self._ready_future and not self._ready_future.done():
-            self._ready_future.set_result(None)
+            self._ready_future.set_result(len(self._message_queue))
 
         # If the writer gets canceled we still need to close the websocket
         # so we have another finally block to make sure we close the websocket
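
This hunk changes how shutdown is signalled to the writer: instead of pushing a None sentinel through the queue, closing sets the _closing flag and resolves the ready future so the sleeping writer wakes, sees the flag, and exits. A standalone sketch of that signalling (illustrative names only):

import asyncio


async def writer(ready: asyncio.Future[int], state: dict) -> None:
    await ready                     # sleeping because the queue was empty
    if state["closing"]:
        return                      # exit without touching the queue


async def main() -> None:
    loop = asyncio.get_running_loop()
    ready: asyncio.Future[int] = loop.create_future()
    state = {"closing": False}
    task = asyncio.create_task(writer(ready, state))
    await asyncio.sleep(0)          # let the writer start waiting
    state["closing"] = True
    if not ready.done():
        ready.set_result(0)         # wake the writer so it observes the flag
    await task


asyncio.run(main())
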
@@ -546,20 +546,21 @@ async def test_ws_delete_all_refresh_tokens_error(
 
     tokens = result["result"]
 
-    await ws_client.send_json(
-        {
-            "id": 6,
-            "type": "auth/delete_all_refresh_tokens",
-        }
-    )
+    with patch("homeassistant.components.auth.DELETE_CURRENT_TOKEN_DELAY", 0.001):
+        await ws_client.send_json(
+            {
+                "id": 6,
+                "type": "auth/delete_all_refresh_tokens",
+            }
+        )
 
     caplog.clear()
     result = await ws_client.receive_json()
     assert result, result["success"] is False
     assert result["error"] == {
         "code": "token_removing_error",
         "message": "During removal, an error was raised.",
     }
 
     records = [
         record
@@ -571,6 +572,7 @@ async def test_ws_delete_all_refresh_tokens_error(
     assert records[0].exc_info and str(records[0].exc_info[1]) == "I'm bad"
     assert records[0].name == "homeassistant.components.auth"
 
+    await hass.async_block_till_done()
     for token in tokens:
         refresh_token = hass.auth.async_get_refresh_token(token["id"])
         assert refresh_token is None
@@ -629,18 +631,20 @@ async def test_ws_delete_all_refresh_tokens(
     result = await ws_client.receive_json()
     assert result["success"], result
 
-    await ws_client.send_json(
-        {
-            "id": 6,
-            "type": "auth/delete_all_refresh_tokens",
-            **delete_token_type,
-            **delete_current_token,
-        }
-    )
+    with patch("homeassistant.components.auth.DELETE_CURRENT_TOKEN_DELAY", 0.001):
+        await ws_client.send_json(
+            {
+                "id": 6,
+                "type": "auth/delete_all_refresh_tokens",
+                **delete_token_type,
+                **delete_current_token,
+            }
+        )
 
     result = await ws_client.receive_json()
     assert result, result["success"]
 
+    await hass.async_block_till_done()
     # We need to enumerate the user since we may remove the token
     # that is used to authenticate the user which will prevent the websocket
     # connection from working
@@ -294,8 +294,6 @@ async def test_pending_msg_peak_recovery(
         instance._send_message({})
     instance._handle_task.cancel()
 
-    msg = await websocket_client.receive()
-    assert msg.type == WSMsgType.TEXT
     msg = await websocket_client.receive()
     assert msg.type is WSMsgType.CLOSE
     assert "Client unable to keep up with pending messages" not in caplog.text