# ha-core/homeassistant/components/recorder/migration.py

"""Schema migration helpers."""
import contextlib
from datetime import timedelta
import logging

import sqlalchemy
from sqlalchemy import ForeignKeyConstraint, MetaData, Table, func, text
from sqlalchemy.exc import (
    InternalError,
    OperationalError,
    ProgrammingError,
    SQLAlchemyError,
)
from sqlalchemy.schema import AddConstraint, DropConstraint
from sqlalchemy.sql.expression import true

from .models import (
    SCHEMA_VERSION,
    TABLE_STATES,
    Base,
    SchemaChanges,
    Statistics,
    StatisticsMeta,
    StatisticsRuns,
    StatisticsShortTerm,
    process_timestamp,
)
from .statistics import delete_duplicates, get_start_time
from .util import session_scope

_LOGGER = logging.getLogger(__name__)


def raise_if_exception_missing_str(ex, match_substrs):
    """Re-raise ``ex`` unless it, or its cause, mentions one of the substrings.

    The message of ``ex`` and, when one is set, the message of
    ``ex.__cause__`` are lower-cased and searched for every entry of
    ``match_substrs``. The entries are compared as given, so they are
    expected to be lower case already.

    Args:
        ex: The exception to inspect, and to re-raise when nothing matches.
        match_substrs: Iterable of substrings that identify an expected error.

    Returns:
        None when at least one substring is found.

    Raises:
        The very same ``ex`` object when no substring is found.
    """
    candidates = [str(ex).lower()]
    # Only look at the cause when there is one: str(None) would otherwise add
    # the literal text "none" to the search and could match by accident.
    if ex.__cause__ is not None:
        candidates.append(str(ex.__cause__).lower())

    for str_sub in match_substrs:
        if any(exc_str and str_sub in exc_str for exc_str in candidates):
            return

    raise ex


def get_schema_version(instance):
    """Return the schema version of the database behind ``instance``.

    The newest ``SchemaChanges`` row (highest ``change_id``) is consulted
    first. When there is no row, or it carries no version, the version is
    determined by inspecting the database structure instead.
    """
    with session_scope(session=instance.get_session()) as session:
        latest_change = (
            session.query(SchemaChanges)
            .order_by(SchemaChanges.change_id.desc())
            .first()
        )
        recorded_version = getattr(latest_change, "schema_version", None)
        if recorded_version is not None:
            return recorded_version

        # Nothing recorded yet: fall back to looking at the db structure.
        inspected_version = _inspect_schema_version(instance.engine, session)
        _LOGGER.debug(
            "No schema version found. Inspected version: %s", inspected_version
        )
        return inspected_version


def schema_is_current(current_version):
    """Return True when ``current_version`` equals ``SCHEMA_VERSION``."""
    is_current = current_version == SCHEMA_VERSION
    return is_current


def migrate_schema(instance, current_version):
    """Upgrade the database schema one version at a time.

    Every version after ``current_version`` up to and including
    ``SCHEMA_VERSION`` is applied in ascending order. Each applied version is
    recorded as a ``SchemaChanges`` row in the same session.
    """
    with session_scope(session=instance.get_session()) as session:
        _LOGGER.warning(
            "Database is about to upgrade. Schema version: %s", current_version
        )
        for target_version in range(current_version + 1, SCHEMA_VERSION + 1):
            _LOGGER.info("Upgrading recorder db schema to version %s", target_version)
            # The starting version is passed along unchanged for every step.
            _apply_update(instance, session, target_version, current_version)
            session.add(SchemaChanges(schema_version=target_version))

            _LOGGER.info("Upgrade to version %s done", target_version)


def _create_index(connection, table_name, index_name):
    """Create the named index on the given table.

    ``index_name`` has to be the name of an index declared on the table in
    the models. If the models no longer declare such an index, nothing is
    created. An index that already exists in the database is logged and
    otherwise ignored.
    """
    table = Table(table_name, Base.metadata)
    _LOGGER.debug("Looking up index %s for table %s", index_name, table_name)
    # Find the index object by name on the table declared in the models
    index = next((idx for idx in table.indexes if idx.name == index_name), None)
    if index is None:
        _LOGGER.debug("The index %s no longer exists", index_name)
        return

    _LOGGER.debug("Creating %s index", index_name)
    _LOGGER.warning(
        "Adding index `%s` to database. Note: this can take several "
        "minutes on large databases and slow computers. Please "
        "be patient!",
        index_name,
    )
    try:
        index.create(connection)
    except (InternalError, ProgrammingError, OperationalError) as err:
        # Anything other than an "index is already there" error is re-raised.
        raise_if_exception_missing_str(err, ["already exists", "duplicate"])
        _LOGGER.warning(
            "Index %s already exists on %s, continuing", index_name, table_name
        )

    _LOGGER.debug("Finished creating %s", index_name)


def _drop_index(connection, table_name, index_name):
    """Drop an index from a specified table.

    There is no universal equivalent of `DROP INDEX IF EXISTS`, so the known
    spellings of the DROP command are executed one after another until one of
    them succeeds; errors from the others are ignored.

    WARNING: Due to some engines (MySQL at least) being unable to use bind
    parameters in a DROP INDEX statement (at least via SQLAlchemy), the query
    string here is generated from the method parameters without sanitizing.
    DO NOT USE THIS FUNCTION IN ANY OPERATION THAT TAKES USER INPUT.
    """
    _LOGGER.debug("Dropping index %s from table %s", index_name, table_name)

    # Statement variants, listed in the order in which they are attempted.
    statements = (
        # Engines like DB2/Oracle
        f"DROP INDEX {index_name}",
        # Engines like SQLite, SQL Server
        f"DROP INDEX {table_name}.{index_name}",
        # Engines like MySQL, MS Access
        f"DROP INDEX {index_name} ON {table_name}",
    )

    for statement in statements:
        try:
            connection.execute(text(statement))
        except SQLAlchemyError:
            continue

        _LOGGER.debug(
            "Finished dropping index %s from table %s", index_name, table_name
        )
        return

    if index_name == "ix_states_context_parent_id":
        # Was only there on nightly so we do not want
        # to generate log noise or issues about it.
        return

    _LOGGER.warning(
        "Failed to drop index %s from table %s. Schema "
        "Migration will continue; this is not a "
        "critical operation",
        index_name,
        table_name,
    )


def _add_columns(connection, table_name, columns_def):
    """Add columns to a table.

    Each entry of ``columns_def`` is a "<name> <type>" column definition.
    All columns are first added with a single ALTER TABLE statement; if that
    fails they are added one at a time, and a column that already exists is
    logged and skipped.
    """
    column_names = ", ".join(column.split(" ")[0] for column in columns_def)
    _LOGGER.warning(
        "Adding columns %s to table %s. Note: this can take several "
        "minutes on large databases and slow computers. Please "
        "be patient!",
        column_names,
        table_name,
    )

    add_clauses = [f"ADD {col_def}" for col_def in columns_def]
    combined_clauses = ", ".join(add_clauses)

    try:
        connection.execute(text(f"ALTER TABLE {table_name} {combined_clauses}"))
    except (InternalError, OperationalError):
        # Some engines support adding all columns at once,
        # this error is when they don't
        _LOGGER.info("Unable to use quick column add. Adding 1 by 1")
    else:
        return

    for add_clause in add_clauses:
        try:
            connection.execute(text(f"ALTER TABLE {table_name} {add_clause}"))
        except (InternalError, OperationalError) as err:
            raise_if_exception_missing_str(err, ["already exists", "duplicate"])
            _LOGGER.warning(
                "Column %s already exists on %s, continuing",
                # The clause reads "ADD <name> <type>", so the name is word 2.
                add_clause.split(" ")[1],
                table_name,
            )


def _modify_columns(connection, engine, table_name, columns_def):
    """Modify columns in a table.

    Each entry of ``columns_def`` is a "<name> <type>" column definition.
    Nothing is done on SQLite. On other dialects all columns are first
    modified with a single ALTER TABLE statement; if that fails they are
    modified one at a time and failures are logged.
    """
    dialect_name = engine.dialect.name
    column_names = ", ".join(column.split(" ")[0] for column in columns_def)

    if dialect_name == "sqlite":
        _LOGGER.debug(
            "Skipping to modify columns %s in table %s; "
            "Modifying column length in SQLite is unnecessary, "
            "it does not impose any length restrictions",
            column_names,
            table_name,
        )
        return

    _LOGGER.warning(
        "Modifying columns %s in table %s. Note: this can take several "
        "minutes on large databases and slow computers. Please "
        "be patient!",
        column_names,
        table_name,
    )

    # Build the dialect specific clause for every column definition.
    if dialect_name == "postgresql":
        modify_clauses = []
        for col_def in columns_def:
            parts = dict(zip(["column", "type"], col_def.split(" ", 1)))
            modify_clauses.append("ALTER {column} TYPE {type}".format(**parts))
    else:
        keyword = "ALTER COLUMN" if dialect_name == "mssql" else "MODIFY"
        modify_clauses = [f"{keyword} {col_def}" for col_def in columns_def]

    combined_clauses = ", ".join(modify_clauses)
    try:
        connection.execute(text(f"ALTER TABLE {table_name} {combined_clauses}"))
    except (InternalError, OperationalError):
        _LOGGER.info("Unable to use quick column modify. Modifying 1 by 1")
    else:
        return

    for modify_clause in modify_clauses:
        try:
            connection.execute(text(f"ALTER TABLE {table_name} {modify_clause}"))
        except (InternalError, OperationalError):
            _LOGGER.exception(
                "Could not modify column %s in table %s", modify_clause, table_name
            )


def _update_states_table_with_foreign_key_options(connection, engine):
    """Add the options to foreign key constraints.

    Named foreign keys of the states table that currently have no
    ``ondelete`` option are dropped and re-added using the matching
    constraint definition from the models.
    """
    inspector = sqlalchemy.inspect(engine)
    # Pairs of (placeholder constraint used for the drop, constrained columns)
    alters = []
    for fk_info in inspector.get_foreign_keys(TABLE_STATES):
        if not fk_info["name"]:
            continue
        options = fk_info.get("options")
        # MySQL/MariaDB will have empty options,
        # Postgres will have ondelete set to None
        if options and options.get("ondelete") is not None:
            continue
        placeholder = ForeignKeyConstraint((), (), name=fk_info["name"])
        alters.append((placeholder, fk_info["constrained_columns"]))

    if not alters:
        return

    states_key_constraints = Base.metadata.tables[TABLE_STATES].foreign_key_constraints
    # Bind the placeholder ForeignKeyConstraints to the table
    old_states_table = Table(  # noqa: F841 pylint: disable=unused-variable
        TABLE_STATES, MetaData(), *(old_fk for old_fk, _ in alters)
    )

    for old_fk, constrained_columns in alters:
        try:
            connection.execute(DropConstraint(old_fk))
            for fkc in states_key_constraints:
                if fkc.column_keys == constrained_columns:
                    connection.execute(AddConstraint(fkc))
        except (InternalError, OperationalError):
            _LOGGER.exception(
                "Could not update foreign options in %s table", TABLE_STATES
            )


def _drop_foreign_key_constraints(connection, engine, table, columns):
    """Drop foreign key constraints for a table on specific columns.

    Only named constraints that have an ``ondelete`` option and whose
    constrained columns equal ``columns`` are dropped. A failing drop is
    logged and does not stop the remaining drops.

    Args:
        connection: Connection used to execute the DROP CONSTRAINT statements.
        engine: Engine that is inspected for the existing foreign keys.
        table: Name of the table whose constraints are dropped.
        columns: List of constrained column names a constraint must match.
    """
    inspector = sqlalchemy.inspect(engine)
    drops = []
    for foreign_key in inspector.get_foreign_keys(table):
        if (
            foreign_key["name"]
            and foreign_key.get("options", {}).get("ondelete")
            and foreign_key["constrained_columns"] == columns
        ):
            drops.append(ForeignKeyConstraint((), (), name=foreign_key["name"]))

    # Bind the ForeignKeyConstraints to the table
    old_table = Table(  # noqa: F841 pylint: disable=unused-variable
        table, MetaData(), *drops
    )

    for drop in drops:
        try:
            connection.execute(DropConstraint(drop))
        except (InternalError, OperationalError):
            # Report the table that was actually passed in, not a fixed name.
            _LOGGER.exception(
                "Could not drop foreign constraints in %s table on %s",
                table,
                columns,
            )


def _apply_update(instance, session, new_version, old_version):  # noqa: C901
    """Perform operations to bring schema up to date.

    Applies the single migration step that leads to ``new_version``.

    Args:
        instance: Recorder instance; its ``engine`` is used and it is handed
            to ``delete_duplicates`` in step 24.
        session: Session whose connection executes the schema statements and
            to which new rows are added.
        new_version: The schema version to migrate to in this call.
        old_version: The schema version the migration started from; only
            consulted by step 4.

    Raises:
        ValueError: If no migration step is defined for ``new_version``.
    """
    engine = instance.engine
    connection = session.connection()
    if new_version == 1:
        _create_index(connection, "events", "ix_events_time_fired")
    elif new_version == 2:
        # Create compound start/end index for recorder_runs
        _create_index(connection, "recorder_runs", "ix_recorder_runs_start_end")
        # Create indexes for states
        _create_index(connection, "states", "ix_states_last_updated")
    elif new_version == 3:
        # There used to be a new index here, but it was removed in version 4.
        pass
    elif new_version == 4:
        # Queries were rewritten in this schema release. Most indexes from
        # earlier versions of the schema are no longer needed.

        if old_version == 3:
            # Remove index that was added in version 3
            _drop_index(connection, "states", "ix_states_created_domain")
        if old_version == 2:
            # Remove index that was added in version 2
            _drop_index(connection, "states", "ix_states_entity_id_created")

        # Remove indexes that were added in version 0
        _drop_index(connection, "states", "states__state_changes")
        _drop_index(connection, "states", "states__significant_changes")
        _drop_index(connection, "states", "ix_states_entity_id_created")

        _create_index(connection, "states", "ix_states_entity_id_last_updated")
    elif new_version == 5:
        # Create supporting index for States.event_id foreign key
        _create_index(connection, "states", "ix_states_event_id")
    elif new_version == 6:
        # NOTE(review): `session` is passed here while the second call below
        # (and every other helper call) passes `connection` - confirm that
        # this is intentional.
        _add_columns(
            session,
            "events",
            ["context_id CHARACTER(36)", "context_user_id CHARACTER(36)"],
        )
        _create_index(connection, "events", "ix_events_context_id")
        _create_index(connection, "events", "ix_events_context_user_id")
        _add_columns(
            connection,
            "states",
            ["context_id CHARACTER(36)", "context_user_id CHARACTER(36)"],
        )
        _create_index(connection, "states", "ix_states_context_id")
        _create_index(connection, "states", "ix_states_context_user_id")
    elif new_version == 7:
        _create_index(connection, "states", "ix_states_entity_id")
    elif new_version == 8:
        _add_columns(connection, "events", ["context_parent_id CHARACTER(36)"])
        _add_columns(connection, "states", ["old_state_id INTEGER"])
        _create_index(connection, "events", "ix_events_context_parent_id")
    elif new_version == 9:
        # We now get the context from events with a join
        # since its always there on state_changed events
        #
        # Ideally we would drop the columns from the states
        # table as well but sqlite doesn't support that
        # and we would have to move to something like
        # sqlalchemy alembic to make that work
        #
        _drop_index(connection, "states", "ix_states_context_id")
        _drop_index(connection, "states", "ix_states_context_user_id")
        # This index won't be there if they were not running
        # nightly but we don't treat that as a critical issue
        _drop_index(connection, "states", "ix_states_context_parent_id")
        # Redundant keys on composite index:
        # We already have ix_states_entity_id_last_updated
        _drop_index(connection, "states", "ix_states_entity_id")
        _create_index(connection, "events", "ix_events_event_type_time_fired")
        _drop_index(connection, "events", "ix_events_event_type")
    elif new_version == 10:
        # Now done in step 11
        pass
    elif new_version == 11:
        _create_index(connection, "states", "ix_states_old_state_id")
        _update_states_table_with_foreign_key_options(connection, engine)
    elif new_version == 12:
        if engine.dialect.name == "mysql":
            _modify_columns(connection, engine, "events", ["event_data LONGTEXT"])
            _modify_columns(connection, engine, "states", ["attributes LONGTEXT"])
    elif new_version == 13:
        if engine.dialect.name == "mysql":
            _modify_columns(
                connection,
                engine,
                "events",
                ["time_fired DATETIME(6)", "created DATETIME(6)"],
            )
            _modify_columns(
                connection,
                engine,
                "states",
                [
                    "last_changed DATETIME(6)",
                    "last_updated DATETIME(6)",
                    "created DATETIME(6)",
                ],
            )
    elif new_version == 14:
        _modify_columns(connection, engine, "events", ["event_type VARCHAR(64)"])
    elif new_version == 15:
        # This dropped the statistics table, done again in version 18.
        pass
    elif new_version == 16:
        _drop_foreign_key_constraints(
            connection, engine, TABLE_STATES, ["old_state_id"]
        )
    elif new_version == 17:
        # This dropped the statistics table, done again in version 18.
        pass
    elif new_version == 18:
        # Recreate the statistics and statistics meta tables.
        #
        # Order matters! Statistics and StatisticsShortTerm have a relation with
        # StatisticsMeta, so statistics need to be deleted before meta (or in pair
        # depending on the SQL backend); and meta needs to be created before statistics.
        Base.metadata.drop_all(
            bind=engine,
            tables=[
                StatisticsShortTerm.__table__,
                Statistics.__table__,
                StatisticsMeta.__table__,
            ],
        )

        StatisticsMeta.__table__.create(engine)
        StatisticsShortTerm.__table__.create(engine)
        Statistics.__table__.create(engine)
    elif new_version == 19:
        # This adds the statistic runs table, insert a fake run to prevent duplicating
        # statistics.
        session.add(StatisticsRuns(start=get_start_time()))
    elif new_version == 20:
        # This changed the precision of statistics from float to double
        if engine.dialect.name in ["mysql", "postgresql"]:
            _modify_columns(
                connection,
                engine,
                "statistics",
                [
                    "mean DOUBLE PRECISION",
                    "min DOUBLE PRECISION",
                    "max DOUBLE PRECISION",
                    "state DOUBLE PRECISION",
                    "sum DOUBLE PRECISION",
                ],
            )
    elif new_version == 21:
        # Try to change the character set of the events, states and
        # statistics_meta tables
        if engine.dialect.name == "mysql":
            for table in ("events", "states", "statistics_meta"):
                _LOGGER.warning(
                    "Updating character set and collation of table %s to utf8mb4. "
                    "Note: this can take several minutes on large databases and slow "
                    "computers. Please be patient!",
                    table,
                )
                # Best effort: any SQLAlchemy error from the conversion is ignored.
                with contextlib.suppress(SQLAlchemyError):
                    connection.execute(
                        # Using LOCK=EXCLUSIVE to prevent the database from corrupting
                        # https://github.com/home-assistant/core/issues/56104
                        text(
                            f"ALTER TABLE {table} CONVERT TO "
                            "CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci LOCK=EXCLUSIVE"
                        )
                    )
    elif new_version == 22:
        # Recreate all the statistics tables for Oracle DB with Identity columns
        #
        # Order matters! Statistics has a relation with StatisticsMeta,
        # so statistics need to be deleted before meta (or in pair depending
        # on the SQL backend); and meta needs to be created before statistics.
        if engine.dialect.name == "oracle":
            Base.metadata.drop_all(
                bind=engine,
                tables=[
                    StatisticsShortTerm.__table__,
                    Statistics.__table__,
                    StatisticsMeta.__table__,
                    StatisticsRuns.__table__,
                ],
            )

            StatisticsRuns.__table__.create(engine)
            StatisticsMeta.__table__.create(engine)
            StatisticsShortTerm.__table__.create(engine)
            Statistics.__table__.create(engine)

        # Block 5-minute statistics for one hour from the last run, or it will overlap
        # with existing hourly statistics. Don't block on a database with no existing
        # statistics.
        if session.query(Statistics.id).count() and (
            last_run_string := session.query(func.max(StatisticsRuns.start)).scalar()
        ):
            last_run_start_time = process_timestamp(last_run_string)
            if last_run_start_time:
                # Add one fake run every 5 minutes, from 5 up to 55 minutes
                # after the last run.
                fake_start_time = last_run_start_time + timedelta(minutes=5)
                while fake_start_time < last_run_start_time + timedelta(hours=1):
                    session.add(StatisticsRuns(start=fake_start_time))
                    fake_start_time += timedelta(minutes=5)

        # When querying the database, be careful to only explicitly query for columns
        # which were present in schema version 21. If querying the table, SQLAlchemy
        # will refer to future columns.
        for sum_statistic in session.query(StatisticsMeta.id).filter_by(has_sum=true()):
            # Newest long-term statistics row for this metadata id, if any.
            last_statistic = (
                session.query(
                    Statistics.start,
                    Statistics.last_reset,
                    Statistics.state,
                    Statistics.sum,
                )
                .filter_by(metadata_id=sum_statistic.id)
                .order_by(Statistics.start.desc())
                .first()
            )
            if last_statistic:
                # Copy that row into the short term statistics table.
                session.add(
                    StatisticsShortTerm(
                        metadata_id=sum_statistic.id,
                        start=last_statistic.start,
                        last_reset=last_statistic.last_reset,
                        state=last_statistic.state,
                        sum=last_statistic.sum,
                    )
                )
    elif new_version == 23:
        # Add name column to StatisticsMeta
        # NOTE(review): `session` is passed here while most other helper calls
        # pass `connection` - confirm that this is intentional.
        _add_columns(session, "statistics_meta", ["name VARCHAR(255)"])
    elif new_version == 24:
        # Delete duplicated statistics
        delete_duplicates(instance, session)
        # Recreate statistics indices to block duplicated statistics
        _drop_index(connection, "statistics", "ix_statistics_statistic_id_start")
        _create_index(connection, "statistics", "ix_statistics_statistic_id_start")
        _drop_index(
            connection,
            "statistics_short_term",
            "ix_statistics_short_term_statistic_id_start",
        )
        _create_index(
            connection,
            "statistics_short_term",
            "ix_statistics_short_term_statistic_id_start",
        )

    else:
        raise ValueError(f"No schema migration defined for version {new_version}")


def _inspect_schema_version(engine, session):
    """Determine the schema version by inspecting the db structure.

    A db without a recorded schema version is either freshly created with the
    current schema, or predates schema version tracking. The two cases are
    told apart by testing for the change made in schema version 1: an index
    on the ``time_fired`` column of the events table. Eventually this logic
    can be removed and we can assume a new db is being created.

    The determined version is also added to the session as a
    ``SchemaChanges`` row.
    """
    inspector = sqlalchemy.inspect(engine)
    has_version_1_index = any(
        index["column_names"] == ["time_fired"]
        for index in inspector.get_indexes("events")
    )

    if not has_version_1_index:
        # Version 1 schema changes not found, this db needs to be migrated.
        initial_change = SchemaChanges(schema_version=0)
        session.add(initial_change)
        return initial_change.schema_version

    # Schema addition from version 1 detected. New DB.
    session.add(StatisticsRuns(start=get_start_time()))
    session.add(SchemaChanges(schema_version=SCHEMA_VERSION))
    return SCHEMA_VERSION