Improve purge performance for PostgreSQL with large databases (#133699)

pull/133707/head
J. Nick Koston 2024-12-20 23:53:15 -10:00 committed by GitHub
parent 02785a4ded
commit 43fab48d4e
5 changed files with 17 additions and 3 deletions

View File

@@ -32,4 +32,8 @@ class DatabaseOptimizer:
     #
     # https://jira.mariadb.org/browse/MDEV-25020
     #
+    # PostgreSQL does not support a skip/loose index scan so it's
+    # also slow for large distinct queries:
+    # https://wiki.postgresql.org/wiki/Loose_indexscan
+    # https://github.com/home-assistant/core/issues/126084
     slow_range_in_select: bool

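A note on the flag this hunk documents: a loose (skip) index scan lets a database answer SELECT DISTINCT over an indexed column by jumping between distinct key values instead of reading every index entry. PostgreSQL and older MariaDB cannot do this, so the recorder marks them with slow_range_in_select. Below is a minimal sketch of how such a flag can steer query strategy; DatabaseOptimizer matches the hunk above, but pick_purge_strategy is a hypothetical consumer, not recorder code:

    from dataclasses import dataclass


    @dataclass
    class DatabaseOptimizer:
        """Capabilities of the connected database engine."""

        # True when large DISTINCT/range selects are slow because the
        # engine has no skip/loose index scan (older MariaDB, PostgreSQL).
        slow_range_in_select: bool


    def pick_purge_strategy(optimizer: DatabaseOptimizer) -> str:
        # Hypothetical: chunked IN (...) probes avoid one huge DISTINCT
        # over the whole index on engines that cannot skip-scan.
        return "chunked_in" if optimizer.slow_range_in_select else "fast_distinct"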
View File

@@ -346,6 +346,10 @@ def _select_unused_attributes_ids(
     # We now break the query into groups of 100 and use a lambda_stmt to ensure
     # that the query is only cached once.
     #
+    # PostgreSQL also suffers from the same issue as older MariaDB with the distinct query
+    # when the database gets large because it doesn't support a skip/loose index scan.
+    # https://wiki.postgresql.org/wiki/Loose_indexscan
+    # https://github.com/home-assistant/core/issues/126084
     groups = [iter(attributes_ids)] * 100
     for attr_ids in zip_longest(*groups, fillvalue=None):
         seen_ids |= {

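The chunking idiom above is worth spelling out: [iter(attributes_ids)] * 100 shares one iterator across 100 slots, so zip_longest emits fixed-width tuples padded with None, and every generated IN (...) clause has the same shape, which lets the lambda_stmt-compiled statement be cached once. A self-contained sketch of the pattern, using a toy table rather than the recorder schema (select_existing_ids and iter_fixed_chunks are illustrative names):

    from itertools import zip_longest

    from sqlalchemy import Column, Integer, lambda_stmt, select
    from sqlalchemy.orm import declarative_base
    from sqlalchemy.sql.lambdas import StatementLambdaElement

    Base = declarative_base()


    class StateAttributes(Base):  # toy stand-in for the real model
        __tablename__ = "state_attributes"
        attributes_id = Column(Integer, primary_key=True)


    def select_existing_ids(attr_ids: tuple[int | None, ...]) -> StatementLambdaElement:
        # lambda_stmt compiles and caches the statement once; later
        # calls only rebind the parameters of the IN (...) expression.
        return lambda_stmt(
            lambda: select(StateAttributes.attributes_id).where(
                StateAttributes.attributes_id.in_(attr_ids)
            )
        )


    def iter_fixed_chunks(ids, size=100):
        # One shared iterator across all slots yields fixed-width
        # tuples, padded with None once the ids run out.
        groups = [iter(ids)] * size
        yield from zip_longest(*groups, fillvalue=None)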
View File

@@ -78,7 +78,7 @@ def find_states_metadata_ids(entity_ids: Iterable[str]) -> StatementLambdaElement:
 
 def _state_attrs_exist(attr: int | None) -> Select:
     """Check if a state attributes id exists in the states table."""
-    return select(func.min(States.attributes_id)).where(States.attributes_id == attr)
+    return select(States.attributes_id).where(States.attributes_id == attr).limit(1)
 
 
 def attributes_ids_exist_in_states_with_fast_in_distinct(
@@ -315,7 +315,7 @@ def data_ids_exist_in_events_with_fast_in_distinct(
 
 def _event_data_id_exist(data_id: int | None) -> Select:
     """Check if an event data id exists in the events table."""
-    return select(func.min(Events.data_id)).where(Events.data_id == data_id)
+    return select(Events.data_id).where(Events.data_id == data_id).limit(1)
 
 
 def data_ids_exist_in_events(

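Both hunks in this file swap an aggregate existence probe for a LIMIT 1 probe: SELECT MIN(col) ... WHERE col = x asks the database to evaluate an aggregate over every matching row, while SELECT col ... WHERE col = x LIMIT 1 may stop at the first hit. A sketch of the two shapes against a toy States table (not the recorder schema); the generated SQL is shown in the comments:

    from sqlalchemy import Column, Integer, func, select
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()


    class States(Base):  # toy stand-in for the recorder's States table
        __tablename__ = "states"
        state_id = Column(Integer, primary_key=True)
        attributes_id = Column(Integer)


    def exists_via_min(attr: int | None):
        # SELECT min(states.attributes_id) FROM states
        # WHERE states.attributes_id = :attr
        return select(func.min(States.attributes_id)).where(States.attributes_id == attr)


    def exists_via_limit(attr: int | None):
        # SELECT states.attributes_id FROM states
        # WHERE states.attributes_id = :attr LIMIT 1
        return select(States.attributes_id).where(States.attributes_id == attr).limit(1)

With an index on attributes_id both forms are cheap for a single probe, but the LIMIT 1 form gives the planner an explicit stop-at-first-row guarantee, which adds up when the probe runs once per candidate id during a purge.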
View File

@@ -600,6 +600,12 @@ def setup_connection_for_dialect(
         execute_on_connection(dbapi_connection, "SET time_zone = '+00:00'")
     elif dialect_name == SupportedDialect.POSTGRESQL:
         max_bind_vars = DEFAULT_MAX_BIND_VARS
+        # PostgreSQL does not support a skip/loose index scan so it's
+        # also slow for large distinct queries:
+        # https://wiki.postgresql.org/wiki/Loose_indexscan
+        # https://github.com/home-assistant/core/issues/126084
+        # so we set slow_range_in_select to True
+        slow_range_in_select = True
         if first_connection:
             # server_version_num was added in 2006
             result = query_on_connection(dbapi_connection, "SHOW server_version")

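For PostgreSQL the flag is unconditional: unlike the MariaDB branch, no server version fixes the missing skip/loose index scan, so no version check is needed. A simplified, hypothetical condensation of the dialect decision (pick_slow_range_in_select is not a real function; the real setup_connection_for_dialect also configures time zones, pragmas, and version checks):

    def pick_slow_range_in_select(dialect_name: str, mariadb_affected: bool = False) -> bool:
        """Return True when large DISTINCT range selects should be avoided."""
        if dialect_name == "postgresql":
            # No skip/loose index scan in any PostgreSQL version:
            # https://wiki.postgresql.org/wiki/Loose_indexscan
            return True
        if dialect_name == "mysql":
            # Older MariaDB shares the weakness (MDEV-25020); the real
            # code derives mariadb_affected from the server version.
            return mariadb_affected
        return False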
View File

@@ -502,7 +502,7 @@ def test_supported_pgsql(caplog: pytest.LogCaptureFixture, pgsql_version) -> None:
     assert "minimum supported version" not in caplog.text
     assert database_engine is not None
-    assert database_engine.optimizer.slow_range_in_select is False
+    assert database_engine.optimizer.slow_range_in_select is True
 
 
 @pytest.mark.parametrize(
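The flipped assertion is the observable contract of the whole change: connecting to any supported PostgreSQL server must now yield a pessimistic optimizer. A standalone sketch of that expectation, with the dialect logic inlined as a simplified stand-in so it runs without the recorder's connection fixtures:

    def slow_range_in_select_for(dialect_name: str) -> bool:
        # Simplified stand-in; the real test exercises
        # setup_connection_for_dialect with a mocked connection.
        return dialect_name == "postgresql"


    def test_pgsql_optimizer_flag_is_true() -> None:
        assert slow_range_in_select_for("postgresql") is True
        assert slow_range_in_select_for("sqlite") is False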