Fix memory churn in state templates (#90685)

* Fix memory churn in state templates The LRU for state templates was limited to 512 states. As soon as it was exhausted, system performance would tank, as each template that iterated all states would have to create and garbage-collect a TemplateState for every state beyond the first 512 * does it scale? * avoid copy on all * comment * preen * cover * cover * comments * comments * comments * preen * preen
2023-04-02 14:51:25 -10:00 · 2023-04-02 14:51:25 -10:00 · e10e3ee7cc
parent 83b7018be2
commit e10e3ee7cc
3 changed files with 128 additions and 11 deletions
--- a/homeassistant/bootstrap.py
+++ b/homeassistant/bootstrap.py
@ -239,6 +239,7 @@ async def load_registries(hass: core.HomeAssistant) -> None:

    # Load the registries and cache the result of platform.uname().processor
    entity.async_setup(hass)
+    template.async_setup(hass)
    await asyncio.gather(
        area_registry.async_load(hass),
        device_registry.async_load(hass),
--- a/homeassistant/helpers/template.py
+++ b/homeassistant/helpers/template.py
@ -5,7 +5,7 @@ from ast import literal_eval
 import asyncio
 import base64
 import collections.abc
-from collections.abc import Callable, Collection, Generator, Iterable
+from collections.abc import Callable, Collection, Generator, Iterable, MutableMapping
 from contextlib import contextmanager, suppress
 from contextvars import ContextVar
 from datetime import datetime, timedelta
@ -41,6 +41,7 @@ from jinja2 import pass_context, pass_environment, pass_eval_context
 from jinja2.runtime import AsyncLoopContext, LoopContext
 from jinja2.sandbox import ImmutableSandboxedEnvironment
 from jinja2.utils import Namespace
+from lru import LRU  # pylint: disable=no-name-in-module
 import voluptuous as vol

 from homeassistant.const import (
@ -49,6 +50,8 @@ from homeassistant.const import (
    ATTR_LONGITUDE,
    ATTR_PERSONS,
    ATTR_UNIT_OF_MEASUREMENT,
+    EVENT_HOMEASSISTANT_START,
+    EVENT_HOMEASSISTANT_STOP,
    STATE_UNAVAILABLE,
    STATE_UNKNOWN,
    UnitOfLength,
@ -121,11 +124,77 @@ template_cv: ContextVar[tuple[str, str] | None] = ContextVar(
    "template_cv", default=None
 )

+#
+# CACHED_TEMPLATE_STATES is a rough estimate of the number of entities
+# on a typical system. It is used as the initial size of the LRU cache
+# for TemplateState objects.
+#
+# If the cache is too small we will end up creating and destroying
+# TemplateState objects too often which will cause a lot of GC activity
+# and slow down the system. For systems with a lot of entities and
+# templates, this can reach 100000s of object creations and destructions
+# per minute.
+#
+# Since entity counts may grow over time, we will increase
+# the size if the number of entities grows via _async_adjust_lru_sizes
+# at the start of the system and every 10 minutes if needed.
+#
 CACHED_TEMPLATE_STATES = 512
 EVAL_CACHE_SIZE = 512

 MAX_CUSTOM_TEMPLATE_SIZE = 5 * 1024 * 1024

+CACHED_TEMPLATE_LRU: MutableMapping[State, TemplateState] = LRU(CACHED_TEMPLATE_STATES)
+CACHED_TEMPLATE_NO_COLLECT_LRU: MutableMapping[State, TemplateState] = LRU(
+    CACHED_TEMPLATE_STATES
+)
+ENTITY_COUNT_GROWTH_FACTOR = 1.2
+
+
+def _template_state_no_collect(hass: HomeAssistant, state: State) -> TemplateState:
+    """Return a TemplateState for a state without collecting."""
+    if template_state := CACHED_TEMPLATE_NO_COLLECT_LRU.get(state):
+        return template_state
+    template_state = _create_template_state_no_collect(hass, state)
+    CACHED_TEMPLATE_NO_COLLECT_LRU[state] = template_state
+    return template_state
+
+
+def _template_state(hass: HomeAssistant, state: State) -> TemplateState:
+    """Return a TemplateState for a state that collects."""
+    if template_state := CACHED_TEMPLATE_LRU.get(state):
+        return template_state
+    template_state = TemplateState(hass, state)
+    CACHED_TEMPLATE_LRU[state] = template_state
+    return template_state
+
+
+def async_setup(hass: HomeAssistant) -> bool:
+    """Set up tracking the template LRUs."""
+
+    @callback
+    def _async_adjust_lru_sizes(_: Any) -> None:
+        """Adjust the lru cache sizes."""
+        new_size = int(
+            round(hass.states.async_entity_ids_count() * ENTITY_COUNT_GROWTH_FACTOR)
+        )
+        for lru in (CACHED_TEMPLATE_LRU, CACHED_TEMPLATE_NO_COLLECT_LRU):
+            # There is no typing for LRU
+            current_size = lru.get_size()  # type: ignore[attr-defined]
+            if new_size > current_size:
+                lru.set_size(new_size)  # type: ignore[attr-defined]
+
+    from .event import (  # pylint: disable=import-outside-toplevel
+        async_track_time_interval,
+    )
+
+    cancel = async_track_time_interval(
+        hass, _async_adjust_lru_sizes, timedelta(minutes=10)
+    )
+    hass.bus.async_listen_once(EVENT_HOMEASSISTANT_START, _async_adjust_lru_sizes)
+    hass.bus.async_listen_once(EVENT_HOMEASSISTANT_STOP, callback(lambda _: cancel()))
+    return True
+

@bind_hass
 def attach(hass: HomeAssistant, obj: Any) -> None:
@ -969,21 +1038,33 @@ class TemplateStateFromEntityId(TemplateStateBase):
        return f"<template TemplateStateFromEntityId({self._entity_id})>"


+_create_template_state_no_collect = partial(TemplateState, collect=False)
+
+
 def _collect_state(hass: HomeAssistant, entity_id: str) -> None:
    if (entity_collect := hass.data.get(_RENDER_INFO)) is not None:
        entity_collect.entities.add(entity_id)


-_template_state_no_collect = lru_cache(maxsize=CACHED_TEMPLATE_STATES)(
-    partial(TemplateState, collect=False)
-)
-
-
 def _state_generator(
    hass: HomeAssistant, domain: str | None
 ) -> Generator[TemplateState, None, None]:
    """State generator for a domain or all states."""
-    for state in hass.states.async_all(domain):
+    states = hass.states
+    # If domain is None, we want to iterate over all states, but making
+    # a copy of the dict is expensive. So we iterate over the protected
+    # _states dict instead. This is safe because we're not modifying it
+    # and everything is happening in the same thread (MainThread).
+    #
+    # We do not want to expose this method in the public API though to
+    # ensure it does not get misused.
+    #
+    container: Iterable[State]
+    if domain is None:
+        container = states._states.values()  # pylint: disable=protected-access
+    else:
+        container = states.async_all(domain)
+    for state in container:
        yield _template_state_no_collect(hass, state)


@ -998,9 +1079,6 @@ def _get_state(hass: HomeAssistant, entity_id: str) -> TemplateState | None:
    return _get_template_state_from_state(hass, entity_id, hass.states.get(entity_id))


-_template_state = lru_cache(maxsize=CACHED_TEMPLATE_STATES)(TemplateState)
-
-
 def _get_template_state_from_state(
    hass: HomeAssistant, entity_id: str, state: State | None
 ) -> TemplateState | None:
--- a/tests/helpers/test_template.py
+++ b/tests/helpers/test_template.py
@ -43,7 +43,7 @@ from homeassistant.setup import async_setup_component
 import homeassistant.util.dt as dt_util
 from homeassistant.util.unit_system import UnitSystem

-from tests.common import MockConfigEntry
+from tests.common import MockConfigEntry, async_fire_time_changed


 def _set_up_units(hass: HomeAssistant) -> None:
@ -4497,3 +4497,41 @@ async def test_render_to_info_with_exception(hass: HomeAssistant) -> None:

    assert info.all_states is False
    assert info.entities == {"test_domain.object"}
+
+
+async def test_lru_increases_with_many_entities(hass: HomeAssistant) -> None:
+    """Test that the template internal LRU cache increases with many entities."""
+    # We do not actually want to record 4096 entities so we mock the entity count
+    mock_entity_count = 4096
+
+    assert template.CACHED_TEMPLATE_LRU.get_size() == template.CACHED_TEMPLATE_STATES
+    assert (
+        template.CACHED_TEMPLATE_NO_COLLECT_LRU.get_size()
+        == template.CACHED_TEMPLATE_STATES
+    )
+
+    template.async_setup(hass)
+    with patch.object(
+        hass.states, "async_entity_ids_count", return_value=mock_entity_count
+    ):
+        async_fire_time_changed(hass, dt_util.utcnow() + timedelta(minutes=10))
+        await hass.async_block_till_done()
+
+    assert template.CACHED_TEMPLATE_LRU.get_size() == int(
+        round(mock_entity_count * template.ENTITY_COUNT_GROWTH_FACTOR)
+    )
+    assert template.CACHED_TEMPLATE_NO_COLLECT_LRU.get_size() == int(
+        round(mock_entity_count * template.ENTITY_COUNT_GROWTH_FACTOR)
+    )
+
+    await hass.async_stop()
+    with patch.object(hass.states, "async_entity_ids_count", return_value=8192):
+        async_fire_time_changed(hass, dt_util.utcnow() + timedelta(minutes=20))
+        await hass.async_block_till_done()
+
+    assert template.CACHED_TEMPLATE_LRU.get_size() == int(
+        round(mock_entity_count * template.ENTITY_COUNT_GROWTH_FACTOR)
+    )
+    assert template.CACHED_TEMPLATE_NO_COLLECT_LRU.get_size() == int(
+        round(mock_entity_count * template.ENTITY_COUNT_GROWTH_FACTOR)
+    )