"""Schema migration helpers."""
import logging
import os

from .util import session_scope

_LOGGER = logging.getLogger(__name__)

PROGRESS_FILE = '.migration_progress'


def migrate_schema(instance):
    """Check if the schema needs to be upgraded."""
    from .models import SchemaChanges, SCHEMA_VERSION

    progress_path = instance.hass.config.path(PROGRESS_FILE)

    with session_scope(session=instance.get_session()) as session:
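        # The newest schema_changes row records the schema version the
        # database is currently on.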
        res = session.query(SchemaChanges).order_by(
            SchemaChanges.change_id.desc()).first()
        current_version = getattr(res, 'schema_version', None)

        if current_version == SCHEMA_VERSION:
            # Clean up if old migration left file
            if os.path.isfile(progress_path):
                _LOGGER.warning("Found existing migration file, cleaning up")
                os.remove(instance.hass.config.path(PROGRESS_FILE))

            return
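
        # Create an empty progress file to mark that a migration is running;
        # it is removed in the finally block once the upgrade finishes.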
        with open(progress_path, 'w'):
            pass

        _LOGGER.warning("Database requires upgrade. Schema version: %s",
                        current_version)

        if current_version is None:
            current_version = _inspect_schema_version(
                instance.engine, session)
            _LOGGER.debug("No schema version found. Inspected version: %s",
                          current_version)
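
        # Apply the updates one schema version at a time, recording each
        # completed version in the schema_changes table.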
        try:
            for version in range(current_version, SCHEMA_VERSION):
                new_version = version + 1
                _LOGGER.info("Upgrading recorder db schema to version %s",
                             new_version)
                _apply_update(instance.engine, new_version, current_version)
                session.add(SchemaChanges(schema_version=new_version))

                _LOGGER.info("Upgrade to version %s done", new_version)
        finally:
            os.remove(instance.hass.config.path(PROGRESS_FILE))


def _create_index(engine, table_name, index_name):
    """Create an index for the specified table.

    The index name should match the name given for the index
    within the table definition described in the models.
    """
    from sqlalchemy import Table
    from . import models

    table = Table(table_name, models.Base.metadata)
    _LOGGER.debug("Looking up index for table %s", table_name)
    # Look up the index object by name from the table in the models
    index = next(idx for idx in table.indexes if idx.name == index_name)
    _LOGGER.debug("Creating %s index", index_name)
    _LOGGER.info("Adding index `%s` to database. Note: this can take several "
                 "minutes on large databases and slow computers. Please "
                 "be patient!", index_name)
    index.create(engine)
    _LOGGER.debug("Finished creating %s", index_name)


def _drop_index(engine, table_name, index_name):
    """Drop an index from a specified table.

    There is no universal way to do something like `DROP INDEX IF EXISTS`,
    so we will simply execute the DROP command and ignore any exceptions.

    WARNING: Due to some engines (MySQL at least) being unable to use bind
    parameters in a DROP INDEX statement (at least via SQLAlchemy), the query
    string here is generated from the method parameters without sanitizing.
    DO NOT USE THIS FUNCTION IN ANY OPERATION THAT TAKES USER INPUT.
    """
    from sqlalchemy import text
    from sqlalchemy.exc import SQLAlchemyError

    _LOGGER.debug("Dropping index %s from table %s", index_name, table_name)
    success = False
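
    # Try each engine-specific DROP INDEX syntax in turn and stop at the
    # first variant the database accepts.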
    # Engines like DB2/Oracle
    try:
        engine.execute(text("DROP INDEX {index}".format(
            index=index_name)))
    except SQLAlchemyError:
        pass
    else:
        success = True

    # Engines like SQLite, SQL Server
    if not success:
        try:
            engine.execute(text("DROP INDEX {table}.{index}".format(
                index=index_name,
                table=table_name)))
        except SQLAlchemyError:
            pass
        else:
            success = True

    if not success:
        # Engines like MySQL, MS Access
        try:
            engine.execute(text("DROP INDEX {index} ON {table}".format(
                index=index_name,
                table=table_name)))
        except SQLAlchemyError:
            pass
        else:
            success = True

    if success:
        _LOGGER.debug("Finished dropping index %s from table %s",
                      index_name, table_name)
    else:
        _LOGGER.warning("Failed to drop index %s from table %s. Schema "
                        "Migration will continue; this is not a "
                        "critical operation.", index_name, table_name)


def _add_columns(engine, table_name, columns_def):
    """Add columns to a table."""
    from sqlalchemy import text
    from sqlalchemy.exc import OperationalError

    _LOGGER.info("Adding columns %s to table %s. Note: this can take several "
                 "minutes on large databases and slow computers. Please "
                 "be patient!",
                 ', '.join(column.split(' ')[0] for column in columns_def),
                 table_name)
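
    # Prefix each column definition with ADD so they can be joined into a
    # single ALTER TABLE statement below.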
    columns_def = ['ADD {}'.format(col_def) for col_def in columns_def]

    try:
        engine.execute(text("ALTER TABLE {table} {columns_def}".format(
            table=table_name,
            columns_def=', '.join(columns_def))))
        return
    except OperationalError:
        # Some engines support adding all columns at once,
        # this error is when they don't
        _LOGGER.info('Unable to use quick column add. Adding 1 by 1.')
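
    # Fall back to adding the columns one at a time, skipping any that
    # already exist.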
    for column_def in columns_def:
        try:
            engine.execute(text("ALTER TABLE {table} {column_def}".format(
                table=table_name,
                column_def=column_def)))
        except OperationalError as err:
            if 'duplicate' not in str(err).lower():
                raise

            _LOGGER.warning('Column %s already exists on %s, continuing',
                            column_def.split(' ')[1], table_name)


def _apply_update(engine, new_version, old_version):
    """Perform operations to bring schema up to date."""
    if new_version == 1:
        _create_index(engine, "events", "ix_events_time_fired")
    elif new_version == 2:
        # Create compound start/end index for recorder_runs
        _create_index(engine, "recorder_runs", "ix_recorder_runs_start_end")
        # Create indexes for states
        _create_index(engine, "states", "ix_states_last_updated")
    elif new_version == 3:
        # There used to be a new index here, but it was removed in version 4.
        pass
    elif new_version == 4:
        # Queries were rewritten in this schema release. Most indexes from
        # earlier versions of the schema are no longer needed.

        if old_version == 3:
            # Remove index that was added in version 3
            _drop_index(engine, "states", "ix_states_created_domain")
        if old_version == 2:
            # Remove index that was added in version 2
            _drop_index(engine, "states", "ix_states_entity_id_created")

        # Remove indexes that were added in version 0
        _drop_index(engine, "states", "states__state_changes")
        _drop_index(engine, "states", "states__significant_changes")
        _drop_index(engine, "states", "ix_states_entity_id_created")

        _create_index(engine, "states", "ix_states_entity_id_last_updated")
    elif new_version == 5:
        # Create supporting index for States.event_id foreign key
        _create_index(engine, "states", "ix_states_event_id")
    elif new_version == 6:
        _add_columns(engine, "events", [
            'context_id CHARACTER(36)',
            'context_user_id CHARACTER(36)',
        ])
        _create_index(engine, "events", "ix_events_context_id")
        _create_index(engine, "events", "ix_events_context_user_id")
        _add_columns(engine, "states", [
            'context_id CHARACTER(36)',
            'context_user_id CHARACTER(36)',
        ])
        _create_index(engine, "states", "ix_states_context_id")
        _create_index(engine, "states", "ix_states_context_user_id")
    else:
        raise ValueError("No schema migration defined for version {}"
                         .format(new_version))


def _inspect_schema_version(engine, session):
    """Determine the schema version by inspecting the db structure.

    When the schema version is not present in the db, either db was just
    created with the correct schema, or this is a db created before schema
    versions were tracked. For now, we'll test if the changes for schema
    version 1 are present to make the determination. Eventually this logic
    can be removed and we can assume a new db is being created.
    """
    from sqlalchemy.engine import reflection
    from .models import SchemaChanges, SCHEMA_VERSION
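
    # Reflect the indexes that exist on the events table; the index on
    # time_fired was introduced by schema version 1.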
    inspector = reflection.Inspector.from_engine(engine)
    indexes = inspector.get_indexes("events")

    for index in indexes:
        if index['column_names'] == ["time_fired"]:
            # Schema addition from version 1 detected. New DB.
            session.add(SchemaChanges(
                schema_version=SCHEMA_VERSION))
            return SCHEMA_VERSION

    # Version 1 schema changes not found, this db needs to be migrated.
    current_version = SchemaChanges(schema_version=0)
    session.add(current_version)
    return current_version.schema_version