Improve handling of unrecoverable storage corruption (#96712)

* Improve handling of unrecoverable storage corruption

fixes #96574

If something in storage gets corrupted core can boot loop
or if its integration specific, the integration will fail to
start.

We now complainly loudly in the log, move away the corrupt data
and start fresh to allow startup to proceed so the user can
get to the UI and restore from backup without having to attach
a console (or otherwise login to the OS and manually modify files).

* test for corruption

* ensure OSError is still fatal

* one more case

* create an issue for corrupt storage

* fix key

* persist

* feedback

* feedback

* better to give the full path

* tweaks

* grammar

* add time

* feedback

* adjust

* try to get issue_domain from storage key

* coverage

* tweak wording some more
pull/96890/head
J. Nick Koston 2023-07-19 02:23:12 -05:00 committed by GitHub
parent 3e58e1987c
commit 01e66d6fb2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 236 additions and 7 deletions

View File

@ -27,6 +27,17 @@
"no_platform_setup": {
"title": "Unused YAML configuration for the {platform} integration",
"description": "It's not possible to configure {platform} {domain} by adding `{platform_key}` to the {domain} configuration. Please check the documentation for more information on how to set up this integration.\n\nTo resolve this:\n1. Remove `{platform_key}` occurences from the `{domain}:` configuration in your YAML configuration file.\n2. Restart Home Assistant.\n\nExample that should be removed:\n{yaml_example}\n"
},
"storage_corruption": {
"title": "Storage corruption detected for `{storage_key}`",
"fix_flow": {
"step": {
"confirm": {
"title": "[%key:component::homeassistant::issues::storage_corruption::title%]",
"description": "The `{storage_key}` storage could not be parsed and has been renamed to `{corrupt_path}` to allow Home Assistant to continue.\n\nA default `{storage_key}` may have been created automatically.\n\nIf you made manual edits to the storage file, fix any syntax errors in `{corrupt_path}`, restore the file to the original path `{original_path}`, and restart Home Assistant. Otherwise, restore the system from a backup.\n\nClick SUBMIT below to confirm you have repaired the file or restored from a backup.\n\nThe exact error was: {error}"
}
}
}
}
},
"system_health": {

View File

@ -6,15 +6,24 @@ from collections.abc import Callable, Mapping, Sequence
from contextlib import suppress
from copy import deepcopy
import inspect
from json import JSONEncoder
from json import JSONDecodeError, JSONEncoder
import logging
import os
from typing import Any, Generic, TypeVar
from homeassistant.const import EVENT_HOMEASSISTANT_FINAL_WRITE
from homeassistant.core import CALLBACK_TYPE, CoreState, Event, HomeAssistant, callback
from homeassistant.core import (
CALLBACK_TYPE,
DOMAIN as HOMEASSISTANT_DOMAIN,
CoreState,
Event,
HomeAssistant,
callback,
)
from homeassistant.exceptions import HomeAssistantError
from homeassistant.loader import MAX_LOAD_CONCURRENTLY, bind_hass
from homeassistant.util import json as json_util
import homeassistant.util.dt as dt_util
from homeassistant.util.file import WriteError
from . import json as json_helper
@ -146,9 +155,63 @@ class Store(Generic[_T]):
# and we don't want that to mess with what we're trying to store.
data = deepcopy(data)
else:
data = await self.hass.async_add_executor_job(
json_util.load_json, self.path
)
try:
data = await self.hass.async_add_executor_job(
json_util.load_json, self.path
)
except HomeAssistantError as err:
if isinstance(err.__cause__, JSONDecodeError):
# If we have a JSONDecodeError, it means the file is corrupt.
# We can't recover from this, so we'll log an error, rename the file and
# return None so that we can start with a clean slate which will
# allow startup to continue so they can restore from a backup.
isotime = dt_util.utcnow().isoformat()
corrupt_postfix = f".corrupt.{isotime}"
corrupt_path = f"{self.path}{corrupt_postfix}"
await self.hass.async_add_executor_job(
os.rename, self.path, corrupt_path
)
storage_key = self.key
_LOGGER.error(
"Unrecoverable error decoding storage %s at %s; "
"This may indicate an unclean shutdown, invalid syntax "
"from manual edits, or disk corruption; "
"The corrupt file has been saved as %s; "
"It is recommended to restore from backup: %s",
storage_key,
self.path,
corrupt_path,
err,
)
from .issue_registry import ( # pylint: disable=import-outside-toplevel
IssueSeverity,
async_create_issue,
)
issue_domain = HOMEASSISTANT_DOMAIN
if (
domain := (storage_key.partition(".")[0])
) and domain in self.hass.config.components:
issue_domain = domain
async_create_issue(
self.hass,
HOMEASSISTANT_DOMAIN,
f"storage_corruption_{storage_key}_{isotime}",
is_fixable=True,
issue_domain=issue_domain,
translation_key="storage_corruption",
is_persistent=True,
severity=IssueSeverity.CRITICAL,
translation_placeholders={
"storage_key": storage_key,
"original_path": self.path,
"corrupt_path": corrupt_path,
"error": str(err),
},
)
return None
raise
if data == {}:
return None

View File

@ -2,6 +2,7 @@
import asyncio
from datetime import timedelta
import json
import os
from typing import Any, NamedTuple
from unittest.mock import Mock, patch
@ -12,8 +13,9 @@ from homeassistant.const import (
EVENT_HOMEASSISTANT_FINAL_WRITE,
EVENT_HOMEASSISTANT_STOP,
)
from homeassistant.core import CoreState, HomeAssistant
from homeassistant.helpers import storage
from homeassistant.core import DOMAIN as HOMEASSISTANT_DOMAIN, CoreState, HomeAssistant
from homeassistant.exceptions import HomeAssistantError
from homeassistant.helpers import issue_registry as ir, storage
from homeassistant.util import dt as dt_util
from homeassistant.util.color import RGBColor
@ -548,3 +550,156 @@ async def test_saving_load_round_trip(tmpdir: py.path.local) -> None:
}
await hass.async_stop(force=True)
async def test_loading_corrupt_core_file(
tmpdir: py.path.local, caplog: pytest.LogCaptureFixture
) -> None:
"""Test we handle unrecoverable corruption in a core file."""
loop = asyncio.get_running_loop()
hass = await async_test_home_assistant(loop)
tmp_storage = await hass.async_add_executor_job(tmpdir.mkdir, "temp_storage")
hass.config.config_dir = tmp_storage
storage_key = "core.anything"
store = storage.Store(
hass, MOCK_VERSION_2, storage_key, minor_version=MOCK_MINOR_VERSION_1
)
await store.async_save({"hello": "world"})
storage_path = os.path.join(tmp_storage, ".storage")
store_file = os.path.join(storage_path, store.key)
data = await store.async_load()
assert data == {"hello": "world"}
def _corrupt_store():
with open(store_file, "w") as f:
f.write("corrupt")
await hass.async_add_executor_job(_corrupt_store)
data = await store.async_load()
assert data is None
assert "Unrecoverable error decoding storage" in caplog.text
issue_registry = ir.async_get(hass)
found_issue = None
issue_entry = None
for (domain, issue), entry in issue_registry.issues.items():
if domain == HOMEASSISTANT_DOMAIN and issue.startswith(
f"storage_corruption_{storage_key}_"
):
found_issue = issue
issue_entry = entry
break
assert found_issue is not None
assert issue_entry is not None
assert issue_entry.is_fixable is True
assert issue_entry.translation_placeholders["storage_key"] == storage_key
assert issue_entry.issue_domain == HOMEASSISTANT_DOMAIN
assert (
issue_entry.translation_placeholders["error"]
== "unexpected character: line 1 column 1 (char 0)"
)
files = await hass.async_add_executor_job(
os.listdir, os.path.join(tmp_storage, ".storage")
)
assert ".corrupt" in files[0]
await hass.async_stop(force=True)
async def test_loading_corrupt_file_known_domain(
tmpdir: py.path.local, caplog: pytest.LogCaptureFixture
) -> None:
"""Test we handle unrecoverable corruption for a known domain."""
loop = asyncio.get_running_loop()
hass = await async_test_home_assistant(loop)
hass.config.components.add("testdomain")
storage_key = "testdomain.testkey"
tmp_storage = await hass.async_add_executor_job(tmpdir.mkdir, "temp_storage")
hass.config.config_dir = tmp_storage
store = storage.Store(
hass, MOCK_VERSION_2, storage_key, minor_version=MOCK_MINOR_VERSION_1
)
await store.async_save({"hello": "world"})
storage_path = os.path.join(tmp_storage, ".storage")
store_file = os.path.join(storage_path, store.key)
data = await store.async_load()
assert data == {"hello": "world"}
def _corrupt_store():
with open(store_file, "w") as f:
f.write('{"valid":"json"}..with..corrupt')
await hass.async_add_executor_job(_corrupt_store)
data = await store.async_load()
assert data is None
assert "Unrecoverable error decoding storage" in caplog.text
issue_registry = ir.async_get(hass)
found_issue = None
issue_entry = None
for (domain, issue), entry in issue_registry.issues.items():
if domain == HOMEASSISTANT_DOMAIN and issue.startswith(
f"storage_corruption_{storage_key}_"
):
found_issue = issue
issue_entry = entry
break
assert found_issue is not None
assert issue_entry is not None
assert issue_entry.is_fixable is True
assert issue_entry.translation_placeholders["storage_key"] == storage_key
assert issue_entry.issue_domain == "testdomain"
assert (
issue_entry.translation_placeholders["error"]
== "unexpected content after document: line 1 column 17 (char 16)"
)
files = await hass.async_add_executor_job(
os.listdir, os.path.join(tmp_storage, ".storage")
)
assert ".corrupt" in files[0]
await hass.async_stop(force=True)
async def test_os_error_is_fatal(tmpdir: py.path.local) -> None:
"""Test OSError during load is fatal."""
loop = asyncio.get_running_loop()
hass = await async_test_home_assistant(loop)
tmp_storage = await hass.async_add_executor_job(tmpdir.mkdir, "temp_storage")
hass.config.config_dir = tmp_storage
store = storage.Store(
hass, MOCK_VERSION_2, MOCK_KEY, minor_version=MOCK_MINOR_VERSION_1
)
await store.async_save({"hello": "world"})
with pytest.raises(OSError), patch(
"homeassistant.helpers.storage.json_util.load_json", side_effect=OSError
):
await store.async_load()
base_os_error = OSError()
base_os_error.errno = 30
home_assistant_error = HomeAssistantError()
home_assistant_error.__cause__ = base_os_error
with pytest.raises(HomeAssistantError), patch(
"homeassistant.helpers.storage.json_util.load_json",
side_effect=home_assistant_error,
):
await store.async_load()
await hass.async_stop(force=True)