Fix memory churn in state templates (#90685)

* Fix memory churn in state templates

The LRU for state templates was limited to 512 states. As soon
as it was exhausted, system performance would tank, as each template
that iterated all states would have to create and GC a TemplateState
for every state beyond the first 512.

* does it scale?

* avoid copy on all

* comment

* preen

* cover

* cover

* comments

* comments

* comments

* preen

* preen
pull/90855/head
J. Nick Koston 2023-04-02 14:51:25 -10:00 committed by Paulus Schoutsen
parent 83b7018be2
commit e10e3ee7cc
3 changed files with 128 additions and 11 deletions

View File

@ -239,6 +239,7 @@ async def load_registries(hass: core.HomeAssistant) -> None:
# Load the registries and cache the result of platform.uname().processor
entity.async_setup(hass)
template.async_setup(hass)
await asyncio.gather(
area_registry.async_load(hass),
device_registry.async_load(hass),

View File

@ -5,7 +5,7 @@ from ast import literal_eval
import asyncio
import base64
import collections.abc
from collections.abc import Callable, Collection, Generator, Iterable
from collections.abc import Callable, Collection, Generator, Iterable, MutableMapping
from contextlib import contextmanager, suppress
from contextvars import ContextVar
from datetime import datetime, timedelta
@ -41,6 +41,7 @@ from jinja2 import pass_context, pass_environment, pass_eval_context
from jinja2.runtime import AsyncLoopContext, LoopContext
from jinja2.sandbox import ImmutableSandboxedEnvironment
from jinja2.utils import Namespace
from lru import LRU # pylint: disable=no-name-in-module
import voluptuous as vol
from homeassistant.const import (
@ -49,6 +50,8 @@ from homeassistant.const import (
ATTR_LONGITUDE,
ATTR_PERSONS,
ATTR_UNIT_OF_MEASUREMENT,
EVENT_HOMEASSISTANT_START,
EVENT_HOMEASSISTANT_STOP,
STATE_UNAVAILABLE,
STATE_UNKNOWN,
UnitOfLength,
@ -121,11 +124,77 @@ template_cv: ContextVar[tuple[str, str] | None] = ContextVar(
"template_cv", default=None
)
#
# CACHED_TEMPLATE_STATES is a rough estimate of the number of entities
# on a typical system. It is used as the initial size of the LRU caches
# for TemplateState objects.
#
# If a cache is too small we will end up creating and destroying
# TemplateState objects too often, which will cause a lot of GC activity
# and slow down the system. For systems with a lot of entities and
# templates, this can reach 100000s of object creations and destructions
# per minute.
#
# Since entity counts may grow over time, _async_adjust_lru_sizes grows
# the caches when the number of entities requires it: once at the start
# of the system and then every 10 minutes.
#
CACHED_TEMPLATE_STATES = 512
EVAL_CACHE_SIZE = 512
MAX_CUSTOM_TEMPLATE_SIZE = 5 * 1024 * 1024
# Cache of collecting TemplateState wrappers, keyed by the wrapped State.
CACHED_TEMPLATE_LRU: MutableMapping[State, TemplateState] = LRU(CACHED_TEMPLATE_STATES)
# Cache of non-collecting (collect=False) TemplateState wrappers, keyed by
# the wrapped State.
CACHED_TEMPLATE_NO_COLLECT_LRU: MutableMapping[State, TemplateState] = LRU(
CACHED_TEMPLATE_STATES
)
# Multiplier applied to the current entity count when resizing the caches,
# so they end up somewhat larger than the number of entities.
ENTITY_COUNT_GROWTH_FACTOR = 1.2
def _template_state_no_collect(hass: HomeAssistant, state: State) -> TemplateState:
    """Look up, or build and cache, the non-collecting TemplateState for a state.

    The wrapper is stored in CACHED_TEMPLATE_NO_COLLECT_LRU keyed by ``state``
    so repeated lookups for the same State reuse one object.
    """
    cached = CACHED_TEMPLATE_NO_COLLECT_LRU.get(state)
    if not cached:
        # Cache miss: build the wrapper once and remember it.
        cached = _create_template_state_no_collect(hass, state)
        CACHED_TEMPLATE_NO_COLLECT_LRU[state] = cached
    return cached
def _template_state(hass: HomeAssistant, state: State) -> TemplateState:
    """Look up, or build and cache, the collecting TemplateState for a state.

    The wrapper is stored in CACHED_TEMPLATE_LRU keyed by ``state`` so
    repeated lookups for the same State reuse one object.
    """
    wrapper = CACHED_TEMPLATE_LRU.get(state)
    if wrapper:
        return wrapper
    # Cache miss: create the wrapper and store it in the LRU in one step.
    wrapper = CACHED_TEMPLATE_LRU[state] = TemplateState(hass, state)
    return wrapper
def async_setup(hass: HomeAssistant) -> bool:
    """Keep the TemplateState LRU caches sized to the number of entities.

    The cache sizes are checked once when Home Assistant starts and then
    every 10 minutes; the periodic check is cancelled when Home Assistant
    stops. Always returns True.
    """
    from .event import (  # pylint: disable=import-outside-toplevel
        async_track_time_interval,
    )

    @callback
    def _async_adjust_lru_sizes(_: Any) -> None:
        """Grow each LRU that is smaller than the scaled entity count."""
        entity_count = hass.states.async_entity_ids_count()
        target_size = int(round(entity_count * ENTITY_COUNT_GROWTH_FACTOR))
        for cache in (CACHED_TEMPLATE_LRU, CACHED_TEMPLATE_NO_COLLECT_LRU):
            # There is no typing for LRU
            if cache.get_size() >= target_size:  # type: ignore[attr-defined]
                # Already large enough; the caches are never shrunk here.
                continue
            cache.set_size(target_size)  # type: ignore[attr-defined]

    cancel_interval = async_track_time_interval(
        hass, _async_adjust_lru_sizes, timedelta(minutes=10)
    )
    stop_listener = callback(lambda _: cancel_interval())

    hass.bus.async_listen_once(EVENT_HOMEASSISTANT_START, _async_adjust_lru_sizes)
    hass.bus.async_listen_once(EVENT_HOMEASSISTANT_STOP, stop_listener)
    return True
@bind_hass
def attach(hass: HomeAssistant, obj: Any) -> None:
@ -969,21 +1038,33 @@ class TemplateStateFromEntityId(TemplateStateBase):
return f"<template TemplateStateFromEntityId({self._entity_id})>"
# Factory for TemplateState objects created with collect=False; used by
# _template_state_no_collect to build a wrapper on a cache miss.
_create_template_state_no_collect = partial(TemplateState, collect=False)
def _collect_state(hass: HomeAssistant, entity_id: str) -> None:
    """Add entity_id to the entities of the render info stored in hass.data.

    Does nothing when hass.data holds no value (or None) under _RENDER_INFO.
    """
    render_info = hass.data.get(_RENDER_INFO)
    if render_info is None:
        return
    render_info.entities.add(entity_id)
# NOTE(review): this looks like the pre-change, lru_cache-based definition
# from the removed side of the diff. If it is kept, it rebinds
# _template_state_no_collect and replaces the LRU-backed function of the
# same name defined earlier -- confirm it is meant to be deleted.
_template_state_no_collect = lru_cache(maxsize=CACHED_TEMPLATE_STATES)(
partial(TemplateState, collect=False)
)
def _state_generator(
    hass: HomeAssistant, domain: str | None
) -> Generator[TemplateState, None, None]:
    """State generator for a domain or all states.

    Args:
        hass: The Home Assistant instance whose state machine is read.
        domain: Only yield states of this domain; ``None`` yields every state.

    Yields:
        The non-collecting TemplateState wrapper for each state.
    """
    states = hass.states
    # If domain is None, we want to iterate over all states, but making
    # a copy of the dict is expensive. So we iterate over the protected
    # _states dict instead. This is safe because we're not modifying it
    # and everything is happening in the same thread (MainThread).
    #
    # We do not want to expose this method in the public API though to
    # ensure it does not get misused.
    #
    container: Iterable[State]
    if domain is None:
        container = states._states.values()  # pylint: disable=protected-access
    else:
        container = states.async_all(domain)
    for state in container:
        yield _template_state_no_collect(hass, state)
@ -998,9 +1079,6 @@ def _get_state(hass: HomeAssistant, entity_id: str) -> TemplateState | None:
return _get_template_state_from_state(hass, entity_id, hass.states.get(entity_id))
# NOTE(review): this looks like the pre-change, lru_cache-based definition
# from the removed side of the diff. If it is kept, it rebinds
# _template_state and replaces the LRU-backed function of the same name
# defined earlier -- confirm it is meant to be deleted.
_template_state = lru_cache(maxsize=CACHED_TEMPLATE_STATES)(TemplateState)
def _get_template_state_from_state(
hass: HomeAssistant, entity_id: str, state: State | None
) -> TemplateState | None:

View File

@ -43,7 +43,7 @@ from homeassistant.setup import async_setup_component
import homeassistant.util.dt as dt_util
from homeassistant.util.unit_system import UnitSystem
from tests.common import MockConfigEntry
from tests.common import MockConfigEntry, async_fire_time_changed
def _set_up_units(hass: HomeAssistant) -> None:
@ -4497,3 +4497,41 @@ async def test_render_to_info_with_exception(hass: HomeAssistant) -> None:
assert info.all_states is False
assert info.entities == {"test_domain.object"}
async def test_lru_increases_with_many_entities(hass: HomeAssistant) -> None:
    """Check the template LRU caches grow with the entity count until stop."""
    # We do not actually want to record 4096 entities so we mock the entity count
    mock_entity_count = 4096
    grown_size = int(round(mock_entity_count * template.ENTITY_COUNT_GROWTH_FACTOR))
    caches = (template.CACHED_TEMPLATE_LRU, template.CACHED_TEMPLATE_NO_COLLECT_LRU)

    # Both caches start out at the default size.
    for cache in caches:
        assert cache.get_size() == template.CACHED_TEMPLATE_STATES

    template.async_setup(hass)

    # The first periodic check sees the mocked entity count and grows the caches.
    with patch.object(
        hass.states, "async_entity_ids_count", return_value=mock_entity_count
    ):
        async_fire_time_changed(hass, dt_util.utcnow() + timedelta(minutes=10))
        await hass.async_block_till_done()

    for cache in caches:
        assert cache.get_size() == grown_size

    await hass.async_stop()

    # After stop, a larger entity count must no longer change the cache sizes.
    with patch.object(hass.states, "async_entity_ids_count", return_value=8192):
        async_fire_time_changed(hass, dt_util.utcnow() + timedelta(minutes=20))
        await hass.async_block_till_done()

    for cache in caches:
        assert cache.get_size() == grown_size