Improve handling of unrecoverable storage corruption (#96712)
* Improve handling of unrecoverable storage corruption fixes #96574 If something in storage gets corrupted core can boot loop or if its integration specific, the integration will fail to start. We now complainly loudly in the log, move away the corrupt data and start fresh to allow startup to proceed so the user can get to the UI and restore from backup without having to attach a console (or otherwise login to the OS and manually modify files). * test for corruption * ensure OSError is still fatal * one more case * create an issue for corrupt storage * fix key * persist * feedback * feedback * better to give the full path * tweaks * grammar * add time * feedback * adjust * try to get issue_domain from storage key * coverage * tweak wording some morepull/96890/head
parent
3e58e1987c
commit
01e66d6fb2
|
@ -27,6 +27,17 @@
|
|||
"no_platform_setup": {
|
||||
"title": "Unused YAML configuration for the {platform} integration",
|
||||
"description": "It's not possible to configure {platform} {domain} by adding `{platform_key}` to the {domain} configuration. Please check the documentation for more information on how to set up this integration.\n\nTo resolve this:\n1. Remove `{platform_key}` occurences from the `{domain}:` configuration in your YAML configuration file.\n2. Restart Home Assistant.\n\nExample that should be removed:\n{yaml_example}\n"
|
||||
},
|
||||
"storage_corruption": {
|
||||
"title": "Storage corruption detected for `{storage_key}`",
|
||||
"fix_flow": {
|
||||
"step": {
|
||||
"confirm": {
|
||||
"title": "[%key:component::homeassistant::issues::storage_corruption::title%]",
|
||||
"description": "The `{storage_key}` storage could not be parsed and has been renamed to `{corrupt_path}` to allow Home Assistant to continue.\n\nA default `{storage_key}` may have been created automatically.\n\nIf you made manual edits to the storage file, fix any syntax errors in `{corrupt_path}`, restore the file to the original path `{original_path}`, and restart Home Assistant. Otherwise, restore the system from a backup.\n\nClick SUBMIT below to confirm you have repaired the file or restored from a backup.\n\nThe exact error was: {error}"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"system_health": {
|
||||
|
|
|
@ -6,15 +6,24 @@ from collections.abc import Callable, Mapping, Sequence
|
|||
from contextlib import suppress
|
||||
from copy import deepcopy
|
||||
import inspect
|
||||
from json import JSONEncoder
|
||||
from json import JSONDecodeError, JSONEncoder
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Generic, TypeVar
|
||||
|
||||
from homeassistant.const import EVENT_HOMEASSISTANT_FINAL_WRITE
|
||||
from homeassistant.core import CALLBACK_TYPE, CoreState, Event, HomeAssistant, callback
|
||||
from homeassistant.core import (
|
||||
CALLBACK_TYPE,
|
||||
DOMAIN as HOMEASSISTANT_DOMAIN,
|
||||
CoreState,
|
||||
Event,
|
||||
HomeAssistant,
|
||||
callback,
|
||||
)
|
||||
from homeassistant.exceptions import HomeAssistantError
|
||||
from homeassistant.loader import MAX_LOAD_CONCURRENTLY, bind_hass
|
||||
from homeassistant.util import json as json_util
|
||||
import homeassistant.util.dt as dt_util
|
||||
from homeassistant.util.file import WriteError
|
||||
|
||||
from . import json as json_helper
|
||||
|
@ -146,9 +155,63 @@ class Store(Generic[_T]):
|
|||
# and we don't want that to mess with what we're trying to store.
|
||||
data = deepcopy(data)
|
||||
else:
|
||||
data = await self.hass.async_add_executor_job(
|
||||
json_util.load_json, self.path
|
||||
)
|
||||
try:
|
||||
data = await self.hass.async_add_executor_job(
|
||||
json_util.load_json, self.path
|
||||
)
|
||||
except HomeAssistantError as err:
|
||||
if isinstance(err.__cause__, JSONDecodeError):
|
||||
# If we have a JSONDecodeError, it means the file is corrupt.
|
||||
# We can't recover from this, so we'll log an error, rename the file and
|
||||
# return None so that we can start with a clean slate which will
|
||||
# allow startup to continue so they can restore from a backup.
|
||||
isotime = dt_util.utcnow().isoformat()
|
||||
corrupt_postfix = f".corrupt.{isotime}"
|
||||
corrupt_path = f"{self.path}{corrupt_postfix}"
|
||||
await self.hass.async_add_executor_job(
|
||||
os.rename, self.path, corrupt_path
|
||||
)
|
||||
storage_key = self.key
|
||||
_LOGGER.error(
|
||||
"Unrecoverable error decoding storage %s at %s; "
|
||||
"This may indicate an unclean shutdown, invalid syntax "
|
||||
"from manual edits, or disk corruption; "
|
||||
"The corrupt file has been saved as %s; "
|
||||
"It is recommended to restore from backup: %s",
|
||||
storage_key,
|
||||
self.path,
|
||||
corrupt_path,
|
||||
err,
|
||||
)
|
||||
from .issue_registry import ( # pylint: disable=import-outside-toplevel
|
||||
IssueSeverity,
|
||||
async_create_issue,
|
||||
)
|
||||
|
||||
issue_domain = HOMEASSISTANT_DOMAIN
|
||||
if (
|
||||
domain := (storage_key.partition(".")[0])
|
||||
) and domain in self.hass.config.components:
|
||||
issue_domain = domain
|
||||
|
||||
async_create_issue(
|
||||
self.hass,
|
||||
HOMEASSISTANT_DOMAIN,
|
||||
f"storage_corruption_{storage_key}_{isotime}",
|
||||
is_fixable=True,
|
||||
issue_domain=issue_domain,
|
||||
translation_key="storage_corruption",
|
||||
is_persistent=True,
|
||||
severity=IssueSeverity.CRITICAL,
|
||||
translation_placeholders={
|
||||
"storage_key": storage_key,
|
||||
"original_path": self.path,
|
||||
"corrupt_path": corrupt_path,
|
||||
"error": str(err),
|
||||
},
|
||||
)
|
||||
return None
|
||||
raise
|
||||
|
||||
if data == {}:
|
||||
return None
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
import asyncio
|
||||
from datetime import timedelta
|
||||
import json
|
||||
import os
|
||||
from typing import Any, NamedTuple
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
|
@ -12,8 +13,9 @@ from homeassistant.const import (
|
|||
EVENT_HOMEASSISTANT_FINAL_WRITE,
|
||||
EVENT_HOMEASSISTANT_STOP,
|
||||
)
|
||||
from homeassistant.core import CoreState, HomeAssistant
|
||||
from homeassistant.helpers import storage
|
||||
from homeassistant.core import DOMAIN as HOMEASSISTANT_DOMAIN, CoreState, HomeAssistant
|
||||
from homeassistant.exceptions import HomeAssistantError
|
||||
from homeassistant.helpers import issue_registry as ir, storage
|
||||
from homeassistant.util import dt as dt_util
|
||||
from homeassistant.util.color import RGBColor
|
||||
|
||||
|
@ -548,3 +550,156 @@ async def test_saving_load_round_trip(tmpdir: py.path.local) -> None:
|
|||
}
|
||||
|
||||
await hass.async_stop(force=True)
|
||||
|
||||
|
||||
async def test_loading_corrupt_core_file(
|
||||
tmpdir: py.path.local, caplog: pytest.LogCaptureFixture
|
||||
) -> None:
|
||||
"""Test we handle unrecoverable corruption in a core file."""
|
||||
loop = asyncio.get_running_loop()
|
||||
hass = await async_test_home_assistant(loop)
|
||||
|
||||
tmp_storage = await hass.async_add_executor_job(tmpdir.mkdir, "temp_storage")
|
||||
hass.config.config_dir = tmp_storage
|
||||
|
||||
storage_key = "core.anything"
|
||||
store = storage.Store(
|
||||
hass, MOCK_VERSION_2, storage_key, minor_version=MOCK_MINOR_VERSION_1
|
||||
)
|
||||
await store.async_save({"hello": "world"})
|
||||
storage_path = os.path.join(tmp_storage, ".storage")
|
||||
store_file = os.path.join(storage_path, store.key)
|
||||
|
||||
data = await store.async_load()
|
||||
assert data == {"hello": "world"}
|
||||
|
||||
def _corrupt_store():
|
||||
with open(store_file, "w") as f:
|
||||
f.write("corrupt")
|
||||
|
||||
await hass.async_add_executor_job(_corrupt_store)
|
||||
|
||||
data = await store.async_load()
|
||||
assert data is None
|
||||
assert "Unrecoverable error decoding storage" in caplog.text
|
||||
|
||||
issue_registry = ir.async_get(hass)
|
||||
found_issue = None
|
||||
issue_entry = None
|
||||
for (domain, issue), entry in issue_registry.issues.items():
|
||||
if domain == HOMEASSISTANT_DOMAIN and issue.startswith(
|
||||
f"storage_corruption_{storage_key}_"
|
||||
):
|
||||
found_issue = issue
|
||||
issue_entry = entry
|
||||
break
|
||||
|
||||
assert found_issue is not None
|
||||
assert issue_entry is not None
|
||||
assert issue_entry.is_fixable is True
|
||||
assert issue_entry.translation_placeholders["storage_key"] == storage_key
|
||||
assert issue_entry.issue_domain == HOMEASSISTANT_DOMAIN
|
||||
assert (
|
||||
issue_entry.translation_placeholders["error"]
|
||||
== "unexpected character: line 1 column 1 (char 0)"
|
||||
)
|
||||
|
||||
files = await hass.async_add_executor_job(
|
||||
os.listdir, os.path.join(tmp_storage, ".storage")
|
||||
)
|
||||
assert ".corrupt" in files[0]
|
||||
|
||||
await hass.async_stop(force=True)
|
||||
|
||||
|
||||
async def test_loading_corrupt_file_known_domain(
|
||||
tmpdir: py.path.local, caplog: pytest.LogCaptureFixture
|
||||
) -> None:
|
||||
"""Test we handle unrecoverable corruption for a known domain."""
|
||||
loop = asyncio.get_running_loop()
|
||||
hass = await async_test_home_assistant(loop)
|
||||
hass.config.components.add("testdomain")
|
||||
storage_key = "testdomain.testkey"
|
||||
|
||||
tmp_storage = await hass.async_add_executor_job(tmpdir.mkdir, "temp_storage")
|
||||
hass.config.config_dir = tmp_storage
|
||||
|
||||
store = storage.Store(
|
||||
hass, MOCK_VERSION_2, storage_key, minor_version=MOCK_MINOR_VERSION_1
|
||||
)
|
||||
await store.async_save({"hello": "world"})
|
||||
storage_path = os.path.join(tmp_storage, ".storage")
|
||||
store_file = os.path.join(storage_path, store.key)
|
||||
|
||||
data = await store.async_load()
|
||||
assert data == {"hello": "world"}
|
||||
|
||||
def _corrupt_store():
|
||||
with open(store_file, "w") as f:
|
||||
f.write('{"valid":"json"}..with..corrupt')
|
||||
|
||||
await hass.async_add_executor_job(_corrupt_store)
|
||||
|
||||
data = await store.async_load()
|
||||
assert data is None
|
||||
assert "Unrecoverable error decoding storage" in caplog.text
|
||||
|
||||
issue_registry = ir.async_get(hass)
|
||||
found_issue = None
|
||||
issue_entry = None
|
||||
for (domain, issue), entry in issue_registry.issues.items():
|
||||
if domain == HOMEASSISTANT_DOMAIN and issue.startswith(
|
||||
f"storage_corruption_{storage_key}_"
|
||||
):
|
||||
found_issue = issue
|
||||
issue_entry = entry
|
||||
break
|
||||
|
||||
assert found_issue is not None
|
||||
assert issue_entry is not None
|
||||
assert issue_entry.is_fixable is True
|
||||
assert issue_entry.translation_placeholders["storage_key"] == storage_key
|
||||
assert issue_entry.issue_domain == "testdomain"
|
||||
assert (
|
||||
issue_entry.translation_placeholders["error"]
|
||||
== "unexpected content after document: line 1 column 17 (char 16)"
|
||||
)
|
||||
|
||||
files = await hass.async_add_executor_job(
|
||||
os.listdir, os.path.join(tmp_storage, ".storage")
|
||||
)
|
||||
assert ".corrupt" in files[0]
|
||||
|
||||
await hass.async_stop(force=True)
|
||||
|
||||
|
||||
async def test_os_error_is_fatal(tmpdir: py.path.local) -> None:
|
||||
"""Test OSError during load is fatal."""
|
||||
loop = asyncio.get_running_loop()
|
||||
hass = await async_test_home_assistant(loop)
|
||||
|
||||
tmp_storage = await hass.async_add_executor_job(tmpdir.mkdir, "temp_storage")
|
||||
hass.config.config_dir = tmp_storage
|
||||
|
||||
store = storage.Store(
|
||||
hass, MOCK_VERSION_2, MOCK_KEY, minor_version=MOCK_MINOR_VERSION_1
|
||||
)
|
||||
await store.async_save({"hello": "world"})
|
||||
|
||||
with pytest.raises(OSError), patch(
|
||||
"homeassistant.helpers.storage.json_util.load_json", side_effect=OSError
|
||||
):
|
||||
await store.async_load()
|
||||
|
||||
base_os_error = OSError()
|
||||
base_os_error.errno = 30
|
||||
home_assistant_error = HomeAssistantError()
|
||||
home_assistant_error.__cause__ = base_os_error
|
||||
|
||||
with pytest.raises(HomeAssistantError), patch(
|
||||
"homeassistant.helpers.storage.json_util.load_json",
|
||||
side_effect=home_assistant_error,
|
||||
):
|
||||
await store.async_load()
|
||||
|
||||
await hass.async_stop(force=True)
|
||||
|
|
Loading…
Reference in New Issue