# core/homeassistant/components/profiler/__init__.py

"""The profiler integration."""
import asyncio
from contextlib import suppress
from datetime import timedelta
from functools import _lru_cache_wrapper
import logging
import reprlib
import sys
import threading
import time
import traceback
from typing import Any, cast
from lru import LRU
import voluptuous as vol
from homeassistant.components import persistent_notification
from homeassistant.config_entries import ConfigEntry
from homeassistant.const import CONF_SCAN_INTERVAL, CONF_TYPE
from homeassistant.core import HomeAssistant, ServiceCall, callback
from homeassistant.exceptions import HomeAssistantError
import homeassistant.helpers.config_validation as cv
from homeassistant.helpers.event import async_track_time_interval
from homeassistant.helpers.service import async_register_admin_service
from .const import DOMAIN
SERVICE_START = "start"
SERVICE_MEMORY = "memory"
SERVICE_START_LOG_OBJECTS = "start_log_objects"
SERVICE_STOP_LOG_OBJECTS = "stop_log_objects"
SERVICE_START_LOG_OBJECT_SOURCES = "start_log_object_sources"
SERVICE_STOP_LOG_OBJECT_SOURCES = "stop_log_object_sources"
SERVICE_DUMP_LOG_OBJECTS = "dump_log_objects"
SERVICE_LRU_STATS = "lru_stats"
SERVICE_LOG_THREAD_FRAMES = "log_thread_frames"
SERVICE_LOG_EVENT_LOOP_SCHEDULED = "log_event_loop_scheduled"

_LRU_CACHE_WRAPPER_OBJECT = _lru_cache_wrapper.__name__
_SQLALCHEMY_LRU_OBJECT = "LRUCache"

_KNOWN_LRU_CLASSES = (
    "EventDataManager",
    "EventTypeManager",
    "StatesMetaManager",
    "StateAttributesManager",
    "StatisticsMetaManager",
    "IntegrationMatcher",
)

# All services registered in async_setup_entry must be listed here so
# async_unload_entry removes them; the object-source services were missing.
SERVICES = (
    SERVICE_START,
    SERVICE_MEMORY,
    SERVICE_START_LOG_OBJECTS,
    SERVICE_STOP_LOG_OBJECTS,
    SERVICE_START_LOG_OBJECT_SOURCES,
    SERVICE_STOP_LOG_OBJECT_SOURCES,
    SERVICE_DUMP_LOG_OBJECTS,
    SERVICE_LRU_STATS,
    SERVICE_LOG_THREAD_FRAMES,
    SERVICE_LOG_EVENT_LOOP_SCHEDULED,
)

DEFAULT_SCAN_INTERVAL = timedelta(seconds=30)
DEFAULT_MAX_OBJECTS = 5

CONF_SECONDS = "seconds"
CONF_MAX_OBJECTS = "max_objects"

LOG_INTERVAL_SUB = "log_interval_subscription"

_LOGGER = logging.getLogger(__name__)


async def async_setup_entry(  # noqa: C901
    hass: HomeAssistant, entry: ConfigEntry
) -> bool:
"""Set up Profiler from a config entry."""
lock = asyncio.Lock()
domain_data = hass.data[DOMAIN] = {}

    async def _async_run_profile(call: ServiceCall) -> None:
async with lock:
await _async_generate_profile(hass, call)

    async def _async_run_memory_profile(call: ServiceCall) -> None:
async with lock:
await _async_generate_memory_profile(hass, call)

    async def _async_start_log_objects(call: ServiceCall) -> None:
if LOG_INTERVAL_SUB in domain_data:
raise HomeAssistantError("Object logging already started")
persistent_notification.async_create(
hass,
(
"Object growth logging has started. See [the logs](/config/logs) to"
" track the growth of new objects."
),
title="Object growth logging started",
notification_id="profile_object_logging",
)
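        # Take an initial snapshot in an executor (walking all objects
        # blocks), then log growth on every scan interval.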
await hass.async_add_executor_job(_log_objects)
domain_data[LOG_INTERVAL_SUB] = async_track_time_interval(
hass, _log_objects, call.data[CONF_SCAN_INTERVAL]
)

    async def _async_stop_log_objects(call: ServiceCall) -> None:
if LOG_INTERVAL_SUB not in domain_data:
raise HomeAssistantError("Object logging not running")
persistent_notification.async_dismiss(hass, "profile_object_logging")
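        # Pop the stored cancel callback and invoke it to stop the timer.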
domain_data.pop(LOG_INTERVAL_SUB)()

    async def _async_start_object_sources(call: ServiceCall) -> None:
if LOG_INTERVAL_SUB in domain_data:
raise HomeAssistantError("Object logging already started")
persistent_notification.async_create(
hass,
(
"Object source logging has started. See [the logs](/config/logs) to"
" track the growth of new objects."
),
title="Object source logging started",
notification_id="profile_object_source_logging",
)
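        # State carried between scans: object ids and per-type counts
        # seen on the previous pass.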
last_ids: set[int] = set()
last_stats: dict[str, int] = {}

        async def _log_object_sources_with_max(*_: Any) -> None:
await hass.async_add_executor_job(
_log_object_sources, call.data[CONF_MAX_OBJECTS], last_ids, last_stats
)

        await _log_object_sources_with_max()
cancel_track = async_track_time_interval(
hass, _log_object_sources_with_max, call.data[CONF_SCAN_INTERVAL]
)

        @callback
def _cancel():
cancel_track()
last_ids.clear()
last_stats.clear()

        domain_data[LOG_INTERVAL_SUB] = _cancel

    @callback
def _async_stop_object_sources(call: ServiceCall) -> None:
if LOG_INTERVAL_SUB not in domain_data:
raise HomeAssistantError("Object logging not running")
persistent_notification.async_dismiss(hass, "profile_object_source_logging")
domain_data.pop(LOG_INTERVAL_SUB)()

    def _dump_log_objects(call: ServiceCall) -> None:
# Imports deferred to avoid loading modules
# in memory since usually only one part of this
# integration is used at a time
import objgraph # pylint: disable=import-outside-toplevel
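
        # objgraph.by_type() returns every live object whose type name
        # matches the requested string.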
obj_type = call.data[CONF_TYPE]
for obj in objgraph.by_type(obj_type):
_LOGGER.critical(
"%s object in memory: %s",
obj_type,
_safe_repr(obj),
)
persistent_notification.create(
hass,
(
f"Objects with type {obj_type} have been dumped to the log. See [the"
" logs](/config/logs) to review the repr of the objects."
),
title="Object dump completed",
notification_id="profile_object_dump",
)

    def _lru_stats(call: ServiceCall) -> None:
"""Log the stats of all lru caches."""
# Imports deferred to avoid loading modules
# in memory since usually only one part of this
# integration is used at a time
import objgraph # pylint: disable=import-outside-toplevel
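
        # Three cache flavors are inspected: functools.lru_cache wrappers,
        # lru-dict LRU instances hanging off known classes, and SQLAlchemy
        # LRUCache objects.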
for lru in objgraph.by_type(_LRU_CACHE_WRAPPER_OBJECT):
lru = cast(_lru_cache_wrapper, lru)
_LOGGER.critical(
"Cache stats for lru_cache %s at %s: %s",
lru.__wrapped__,
_get_function_absfile(lru.__wrapped__) or "unknown",
lru.cache_info(),
)
for _class in _KNOWN_LRU_CLASSES:
for class_with_lru_attr in objgraph.by_type(_class):
for maybe_lru in class_with_lru_attr.__dict__.values():
if isinstance(maybe_lru, LRU):
_LOGGER.critical(
"Cache stats for LRU %s at %s: %s",
type(class_with_lru_attr),
_get_function_absfile(class_with_lru_attr) or "unknown",
maybe_lru.get_stats(),
)
for lru in objgraph.by_type(_SQLALCHEMY_LRU_OBJECT):
if (data := getattr(lru, "_data", None)) and isinstance(data, dict):
for key, value in dict(data).items():
_LOGGER.critical(
"Cache data for sqlalchemy LRUCache %s: %s: %s", lru, key, value
)
persistent_notification.create(
hass,
(
"LRU cache states have been dumped to the log. See [the"
" logs](/config/logs) to review the stats."
),
title="LRU stats completed",
notification_id="profile_lru_stats",
)

    async def _async_dump_thread_frames(call: ServiceCall) -> None:
"""Log all thread frames."""
frames = sys._current_frames() # pylint: disable=protected-access
main_thread = threading.main_thread()
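        # Skip the main thread; it is the one executing this service call.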
for thread in threading.enumerate():
if thread == main_thread:
continue
ident = cast(int, thread.ident)
_LOGGER.critical(
"Thread [%s]: %s",
thread.name,
"".join(traceback.format_stack(frames.get(ident))).strip(),
)

    async def _async_dump_scheduled(call: ServiceCall) -> None:
        """Log all handles currently scheduled in the event loop."""
arepr = reprlib.aRepr
original_maxstring = arepr.maxstring
original_maxother = arepr.maxother
arepr.maxstring = 300
arepr.maxother = 300
handle: asyncio.Handle
try:
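            # loop._scheduled is a private list of TimerHandle objects
            # waiting for their deadline.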
for handle in getattr(hass.loop, "_scheduled"):
if not handle.cancelled():
_LOGGER.critical("Scheduled: %s", handle)
finally:
arepr.maxstring = original_maxstring
arepr.maxother = original_maxother

    async_register_admin_service(
hass,
DOMAIN,
SERVICE_START,
_async_run_profile,
schema=vol.Schema(
{vol.Optional(CONF_SECONDS, default=60.0): vol.Coerce(float)}
),
)
async_register_admin_service(
hass,
DOMAIN,
SERVICE_MEMORY,
_async_run_memory_profile,
schema=vol.Schema(
{vol.Optional(CONF_SECONDS, default=60.0): vol.Coerce(float)}
),
)
async_register_admin_service(
hass,
DOMAIN,
SERVICE_START_LOG_OBJECTS,
_async_start_log_objects,
schema=vol.Schema(
{
vol.Optional(
CONF_SCAN_INTERVAL, default=DEFAULT_SCAN_INTERVAL
): cv.time_period
}
),
)
async_register_admin_service(
hass,
DOMAIN,
SERVICE_STOP_LOG_OBJECTS,
_async_stop_log_objects,
)
async_register_admin_service(
hass,
DOMAIN,
SERVICE_START_LOG_OBJECT_SOURCES,
_async_start_object_sources,
schema=vol.Schema(
{
vol.Optional(
CONF_SCAN_INTERVAL, default=DEFAULT_SCAN_INTERVAL
): cv.time_period,
vol.Optional(CONF_MAX_OBJECTS, default=DEFAULT_MAX_OBJECTS): vol.Range(
min=1, max=1024
),
}
),
)
async_register_admin_service(
hass,
DOMAIN,
SERVICE_STOP_LOG_OBJECT_SOURCES,
_async_stop_object_sources,
)
async_register_admin_service(
hass,
DOMAIN,
SERVICE_DUMP_LOG_OBJECTS,
_dump_log_objects,
schema=vol.Schema({vol.Required(CONF_TYPE): str}),
)
async_register_admin_service(
hass,
DOMAIN,
SERVICE_LRU_STATS,
_lru_stats,
)
async_register_admin_service(
hass,
DOMAIN,
SERVICE_LOG_THREAD_FRAMES,
_async_dump_thread_frames,
)
async_register_admin_service(
hass,
DOMAIN,
SERVICE_LOG_EVENT_LOOP_SCHEDULED,
_async_dump_scheduled,
)
return True


async def async_unload_entry(hass: HomeAssistant, entry: ConfigEntry) -> bool:
"""Unload a config entry."""
for service in SERVICES:
hass.services.async_remove(domain=DOMAIN, service=service)
if LOG_INTERVAL_SUB in hass.data[DOMAIN]:
hass.data[DOMAIN][LOG_INTERVAL_SUB]()
hass.data.pop(DOMAIN)
return True


async def _async_generate_profile(hass: HomeAssistant, call: ServiceCall):
# Imports deferred to avoid loading modules
# in memory since usually only one part of this
# integration is used at a time
import cProfile # pylint: disable=import-outside-toplevel
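
    # A microsecond timestamp keeps the notification id and the output
    # filenames unique for each run.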
start_time = int(time.time() * 1000000)
persistent_notification.async_create(
hass,
(
"The profile has started. This notification will be updated when it is"
" complete."
),
title="Profile Started",
notification_id=f"profiler_{start_time}",
)
profiler = cProfile.Profile()
profiler.enable()
await asyncio.sleep(float(call.data[CONF_SECONDS]))
profiler.disable()
cprofile_path = hass.config.path(f"profile.{start_time}.cprof")
callgrind_path = hass.config.path(f"callgrind.out.{start_time}")
await hass.async_add_executor_job(
_write_profile, profiler, cprofile_path, callgrind_path
)
persistent_notification.async_create(
hass,
(
f"Wrote cProfile data to {cprofile_path} and callgrind data to"
f" {callgrind_path}"
),
title="Profile Complete",
notification_id=f"profiler_{start_time}",
)


async def _async_generate_memory_profile(hass: HomeAssistant, call: ServiceCall):
# Imports deferred to avoid loading modules
# in memory since usually only one part of this
# integration is used at a time
from guppy import hpy # pylint: disable=import-outside-toplevel
start_time = int(time.time() * 1000000)
persistent_notification.async_create(
hass,
(
"The memory profile has started. This notification will be updated when it"
" is complete."
),
title="Profile Started",
notification_id=f"memory_profiler_{start_time}",
)
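    # setref() marks the current heap as the baseline so heap() below only
    # reports objects allocated during the sample window.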
heap_profiler = hpy()
heap_profiler.setref()
await asyncio.sleep(float(call.data[CONF_SECONDS]))
heap = heap_profiler.heap()
heap_path = hass.config.path(f"heap_profile.{start_time}.hpy")
await hass.async_add_executor_job(_write_memory_profile, heap, heap_path)
persistent_notification.async_create(
hass,
f"Wrote heapy memory profile to {heap_path}",
title="Profile Complete",
notification_id=f"memory_profiler_{start_time}",
)


def _write_profile(profiler, cprofile_path, callgrind_path):
# Imports deferred to avoid loading modules
# in memory since usually only one part of this
# integration is used at a time
from pyprof2calltree import convert # pylint: disable=import-outside-toplevel
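
    # Dump the raw cProfile stats and also convert them to callgrind
    # format so tools like kcachegrind can open them.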
profiler.create_stats()
profiler.dump_stats(cprofile_path)
convert(profiler.getstats(), callgrind_path)


def _write_memory_profile(heap, heap_path):
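    # byrcs partitions the heap by referrer classification, showing what
    # is keeping memory alive.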
heap.byrcs.dump(heap_path)


def _log_objects(*_):
# Imports deferred to avoid loading modules
# in memory since usually only one part of this
# integration is used at a time
import objgraph # pylint: disable=import-outside-toplevel
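
    # objgraph.growth() diffs per-type object counts against the peak
    # counts it recorded on earlier calls.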
_LOGGER.critical("Memory Growth: %s", objgraph.growth(limit=1000))


def _get_function_absfile(func: Any) -> str | None:
"""Get the absolute file path of a function."""
import inspect # pylint: disable=import-outside-toplevel
abs_file: str | None = None
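    # Many objects are not introspectable; swallow any failure and keep None.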
with suppress(Exception):
abs_file = inspect.getabsfile(func)
return abs_file


def _safe_repr(obj: Any) -> str:
"""Get the repr of an object but keep going if there is an exception.
We wrap repr to ensure if one object cannot be serialized, we can
still get the rest.
"""
try:
return repr(obj)
except Exception: # pylint: disable=broad-except
return f"Failed to serialize {type(obj)}"


def _find_backrefs_not_to_self(_object: Any) -> list[str]:
import objgraph # pylint: disable=import-outside-toplevel
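
    # Walk a short back-reference chain so callers can see what refers to
    # the object when its source file cannot be determined.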
return [
_safe_repr(backref)
for backref in objgraph.find_backref_chain(
_object, lambda obj: obj is not _object
)
]


def _log_object_sources(
max_objects: int, last_ids: set[int], last_stats: dict[str, int]
) -> None:
# Imports deferred to avoid loading modules
# in memory since usually only one part of this
# integration is used at a time
import gc # pylint: disable=import-outside-toplevel
gc.collect()
objects = gc.get_objects()
new_objects: list[object] = []
new_objects_overflow: dict[str, int] = {}
current_ids = set()
new_stats: dict[str, int] = {}
had_new_object_growth = False
try:
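        # First pass: count live objects per type. The second pass below
        # reports objects whose id was not seen on the previous scan and
        # whose per-type count grew, capped at max_objects.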
for _object in objects:
object_type = type(_object).__name__
new_stats[object_type] = new_stats.get(object_type, 0) + 1
for _object in objects:
id_ = id(_object)
current_ids.add(id_)
if id_ in last_ids:
continue
object_type = type(_object).__name__
if last_stats.get(object_type, 0) < new_stats[object_type]:
if len(new_objects) < max_objects:
new_objects.append(_object)
else:
new_objects_overflow.setdefault(object_type, 0)
new_objects_overflow[object_type] += 1
for _object in new_objects:
had_new_object_growth = True
object_type = type(_object).__name__
_LOGGER.critical(
"New object %s (%s/%s) at %s: %s",
object_type,
last_stats.get(object_type, 0),
new_stats[object_type],
_get_function_absfile(_object) or _find_backrefs_not_to_self(_object),
_safe_repr(_object),
)
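        # Carry forward previous peaks so a type is only re-reported when
        # it exceeds its historical maximum, not when counts fluctuate.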
for object_type, count in last_stats.items():
new_stats[object_type] = max(new_stats.get(object_type, 0), count)
finally:
# Break reference cycles
del objects
del new_objects
last_ids.clear()
last_ids.update(current_ids)
last_stats.clear()
last_stats.update(new_stats)
del new_stats
del current_ids
if new_objects_overflow:
_LOGGER.critical("New objects overflowed by %s", new_objects_overflow)
elif not had_new_object_growth:
_LOGGER.critical("No new object growth found")