Improve purge performance for PostgreSQL with large databases (#133699)

pull/133707/head
J. Nick Koston 2024-12-20 23:53:15 -10:00 committed by GitHub
parent 02785a4ded
commit 43fab48d4e
5 changed files with 17 additions and 3 deletions

View File

@@ -32,4 +32,8 @@ class DatabaseOptimizer:
     #
     # https://jira.mariadb.org/browse/MDEV-25020
     #
+    # PostgreSQL does not support a skip/loose index scan so it's
+    # also slow for large distinct queries:
+    # https://wiki.postgresql.org/wiki/Loose_indexscan
+    # https://github.com/home-assistant/core/issues/126084
     slow_range_in_select: bool

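A note on the flag this hunk documents: a loose (skip) index scan lets a database answer SELECT DISTINCT over an indexed column by jumping between distinct key values instead of reading every index entry. PostgreSQL and older MariaDB cannot do this, so the recorder marks them with slow_range_in_select. Below is a minimal sketch of how such a flag can steer query strategy; DatabaseOptimizer matches the hunk above, but pick_purge_strategy is a hypothetical consumer, not recorder code:

    from dataclasses import dataclass


    @dataclass
    class DatabaseOptimizer:
        """Capabilities of the connected database engine."""

        # True when large DISTINCT/range selects are slow because the
        # engine has no skip/loose index scan (older MariaDB, PostgreSQL).
        slow_range_in_select: bool


    def pick_purge_strategy(optimizer: DatabaseOptimizer) -> str:
        # Hypothetical: chunked IN (...) probes avoid one huge DISTINCT
        # over the whole index on engines that cannot skip-scan.
        return "chunked_in" if optimizer.slow_range_in_select else "fast_distinct"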
View File

@@ -346,6 +346,10 @@ def _select_unused_attributes_ids(
     # We now break the query into groups of 100 and use a lambda_stmt to ensure
     # that the query is only cached once.
     #
+    # PostgreSQL also suffers from the same issue as older MariaDB with the distinct query
+    # when the database gets large because it doesn't support a skip/loose index scan.
+    # https://wiki.postgresql.org/wiki/Loose_indexscan
+    # https://github.com/home-assistant/core/issues/126084
     groups = [iter(attributes_ids)] * 100
     for attr_ids in zip_longest(*groups, fillvalue=None):
         seen_ids |= {

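The chunking idiom above is worth spelling out: [iter(attributes_ids)] * 100 shares one iterator across 100 slots, so zip_longest emits fixed-width tuples padded with None, and every generated IN (...) clause has the same shape, which lets the lambda_stmt-compiled statement be cached once. A self-contained sketch of the pattern, using a toy table rather than the recorder schema (select_existing_ids and iter_fixed_chunks are illustrative names):

    from itertools import zip_longest

    from sqlalchemy import Column, Integer, lambda_stmt, select
    from sqlalchemy.orm import declarative_base
    from sqlalchemy.sql.lambdas import StatementLambdaElement

    Base = declarative_base()


    class StateAttributes(Base):  # toy stand-in for the real model
        __tablename__ = "state_attributes"
        attributes_id = Column(Integer, primary_key=True)


    def select_existing_ids(attr_ids: tuple[int | None, ...]) -> StatementLambdaElement:
        # lambda_stmt compiles and caches the statement once; later
        # calls only rebind the parameters of the IN (...) expression.
        return lambda_stmt(
            lambda: select(StateAttributes.attributes_id).where(
                StateAttributes.attributes_id.in_(attr_ids)
            )
        )


    def iter_fixed_chunks(ids, size=100):
        # One shared iterator across all slots yields fixed-width
        # tuples, padded with None once the ids run out.
        groups = [iter(ids)] * size
        yield from zip_longest(*groups, fillvalue=None)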
View File

@@ -78,7 +78,7 @@ def find_states_metadata_ids(entity_ids: Iterable[str]) -> StatementLambdaElement:
 
 def _state_attrs_exist(attr: int | None) -> Select:
     """Check if a state attributes id exists in the states table."""
-    return select(func.min(States.attributes_id)).where(States.attributes_id == attr)
+    return select(States.attributes_id).where(States.attributes_id == attr).limit(1)
 
 
 def attributes_ids_exist_in_states_with_fast_in_distinct(
@@ -315,7 +315,7 @@ def data_ids_exist_in_events_with_fast_in_distinct(
 
 def _event_data_id_exist(data_id: int | None) -> Select:
     """Check if an event data id exists in the events table."""
-    return select(func.min(Events.data_id)).where(Events.data_id == data_id)
+    return select(Events.data_id).where(Events.data_id == data_id).limit(1)
 
 
 def data_ids_exist_in_events(

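Both hunks in this file swap an aggregate existence probe for a LIMIT 1 probe: SELECT MIN(col) ... WHERE col = x asks the database to evaluate an aggregate over every matching row, while SELECT col ... WHERE col = x LIMIT 1 may stop at the first hit. A sketch of the two shapes against a toy States table (not the recorder schema); the generated SQL is shown in the comments:

    from sqlalchemy import Column, Integer, func, select
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()


    class States(Base):  # toy stand-in for the recorder's States table
        __tablename__ = "states"
        state_id = Column(Integer, primary_key=True)
        attributes_id = Column(Integer)


    def exists_via_min(attr: int | None):
        # SELECT min(states.attributes_id) FROM states
        # WHERE states.attributes_id = :attr
        return select(func.min(States.attributes_id)).where(States.attributes_id == attr)


    def exists_via_limit(attr: int | None):
        # SELECT states.attributes_id FROM states
        # WHERE states.attributes_id = :attr LIMIT 1
        return select(States.attributes_id).where(States.attributes_id == attr).limit(1)

With an index on attributes_id both forms are cheap for a single probe, but the LIMIT 1 form gives the planner an explicit stop-at-first-row guarantee, which adds up when the probe runs once per candidate id during a purge.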
View File

@@ -600,6 +600,12 @@ def setup_connection_for_dialect(
         execute_on_connection(dbapi_connection, "SET time_zone = '+00:00'")
     elif dialect_name == SupportedDialect.POSTGRESQL:
         max_bind_vars = DEFAULT_MAX_BIND_VARS
+        # PostgreSQL does not support a skip/loose index scan so it's
+        # also slow for large distinct queries:
+        # https://wiki.postgresql.org/wiki/Loose_indexscan
+        # https://github.com/home-assistant/core/issues/126084
+        # so we set slow_range_in_select to True
+        slow_range_in_select = True
         if first_connection:
             # server_version_num was added in 2006
             result = query_on_connection(dbapi_connection, "SHOW server_version")

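For PostgreSQL the flag is unconditional: unlike the MariaDB branch, no server version fixes the missing skip/loose index scan, so no version check is needed. A simplified, hypothetical condensation of the dialect decision (pick_slow_range_in_select is not a real function; the real setup_connection_for_dialect also configures time zones, pragmas, and version checks):

    def pick_slow_range_in_select(dialect_name: str, mariadb_affected: bool = False) -> bool:
        """Return True when large DISTINCT range selects should be avoided."""
        if dialect_name == "postgresql":
            # No skip/loose index scan in any PostgreSQL version:
            # https://wiki.postgresql.org/wiki/Loose_indexscan
            return True
        if dialect_name == "mysql":
            # Older MariaDB shares the weakness (MDEV-25020); the real
            # code derives mariadb_affected from the server version.
            return mariadb_affected
        return False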
View File

@@ -502,7 +502,7 @@ def test_supported_pgsql(caplog: pytest.LogCaptureFixture, pgsql_version) -> None:
     assert "minimum supported version" not in caplog.text
     assert database_engine is not None
-    assert database_engine.optimizer.slow_range_in_select is False
+    assert database_engine.optimizer.slow_range_in_select is True
 
 
 @pytest.mark.parametrize(
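The flipped assertion is the observable contract of the whole change: connecting to any supported PostgreSQL server must now yield a pessimistic optimizer. A standalone sketch of that expectation, with the dialect logic inlined as a simplified stand-in so it runs without the recorder's connection fixtures:

    def slow_range_in_select_for(dialect_name: str) -> bool:
        # Simplified stand-in; the real test exercises
        # setup_connection_for_dialect with a mocked connection.
        return dialect_name == "postgresql"


    def test_pgsql_optimizer_flag_is_true() -> None:
        assert slow_range_in_select_for("postgresql") is True
        assert slow_range_in_select_for("sqlite") is False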