From 190894010c3e7b2b7c8377ecd48fccc4fa407cc6 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Wed, 31 Jan 2024 17:41:21 +0100 Subject: [PATCH] Reset failed API call counter on successful API call (#4862) * Reset failed API call counter on successful API call Make sure to reset the failed API call counter after a successful API call. While at it also update the log messages a bit to make it clearer what the problem is exactly. * Address pytest changes --- supervisor/misc/tasks.py | 23 ++++++++++++++--------- tests/misc/test_tasks.py | 18 ++++++++++++------ 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/supervisor/misc/tasks.py b/supervisor/misc/tasks.py index f2a17204a..af8558b75 100644 --- a/supervisor/misc/tasks.py +++ b/supervisor/misc/tasks.py @@ -16,8 +16,9 @@ from ..utils.sentry import capture_exception _LOGGER: logging.Logger = logging.getLogger(__name__) -HASS_WATCHDOG_API = "HASS_WATCHDOG_API" +HASS_WATCHDOG_API_FAILURES = "HASS_WATCHDOG_API_FAILURES" HASS_WATCHDOG_REANIMATE_FAILURES = "HASS_WATCHDOG_REANIMATE_FAILURES" +HASS_WATCHDOG_MAX_API_ATTEMPTS = 2 HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS = 5 RUN_UPDATE_SUPERVISOR = 29100 @@ -169,6 +170,7 @@ class Tasks(CoreSysAttributes): if await self.sys_homeassistant.api.check_api_state(): # Home Assistant is running properly self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = 0 + self._cache[HASS_WATCHDOG_API_FAILURES] = 0 return # Give up after 5 reanimation failures in a row. Supervisor cannot fix this issue. @@ -176,23 +178,26 @@ class Tasks(CoreSysAttributes): if reanimate_fails >= HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS: if reanimate_fails == HASS_WATCHDOG_MAX_REANIMATE_ATTEMPTS: _LOGGER.critical( - "Watchdog cannot reanimate Home Assistant, failed all %s attempts.", + "Watchdog cannot reanimate Home Assistant Core, failed all %s attempts.", reanimate_fails, ) self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] += 1 return # Init cache data - retry_scan = self._cache.get(HASS_WATCHDOG_API, 0) + api_fails = self._cache.get(HASS_WATCHDOG_API_FAILURES, 0) # Look like we run into a problem - retry_scan += 1 - if retry_scan == 1: - self._cache[HASS_WATCHDOG_API] = retry_scan - _LOGGER.warning("Watchdog miss API response from Home Assistant") + api_fails += 1 + if api_fails < HASS_WATCHDOG_MAX_API_ATTEMPTS: + self._cache[HASS_WATCHDOG_API_FAILURES] = api_fails + _LOGGER.warning("Watchdog missed an Home Assistant Core API response.") return - _LOGGER.error("Watchdog found a problem with Home Assistant API!") + _LOGGER.error( + "Watchdog missed %s Home Assistant Core API responses in a row. Restarting Home Assistant Core API!", + HASS_WATCHDOG_MAX_API_ATTEMPTS, + ) try: await self.sys_homeassistant.core.restart() except HomeAssistantError as err: @@ -203,7 +208,7 @@ class Tasks(CoreSysAttributes): else: self._cache[HASS_WATCHDOG_REANIMATE_FAILURES] = 0 finally: - self._cache[HASS_WATCHDOG_API] = 0 + self._cache[HASS_WATCHDOG_API_FAILURES] = 0 @Job(name="tasks_update_cli", conditions=PLUGIN_AUTO_UPDATE_CONDITIONS) async def _update_cli(self): diff --git a/tests/misc/test_tasks.py b/tests/misc/test_tasks.py index feff83403..c29d2e617 100644 --- a/tests/misc/test_tasks.py +++ b/tests/misc/test_tasks.py @@ -34,15 +34,21 @@ async def test_watchdog_homeassistant_api( await tasks._watchdog_homeassistant_api() restart.assert_not_called() - assert "Watchdog miss API response from Home Assistant" in caplog.text - assert "Watchdog found a problem with Home Assistant API!" not in caplog.text + assert "Watchdog missed an Home Assistant Core API response." in caplog.text + assert ( + "Watchdog missed 2 Home Assistant Core API responses in a row. Restarting Home Assistant Core API!" + not in caplog.text + ) caplog.clear() await tasks._watchdog_homeassistant_api() restart.assert_called_once() - assert "Watchdog miss API response from Home Assistant" not in caplog.text - assert "Watchdog found a problem with Home Assistant API!" in caplog.text + assert "Watchdog missed an Home Assistant Core API response." not in caplog.text + assert ( + "Watchdog missed 2 Home Assistant Core API responses in a row. Restarting Home Assistant Core API!" + in caplog.text + ) async def test_watchdog_homeassistant_api_off(tasks: Tasks, coresys: CoreSys): @@ -120,10 +126,10 @@ async def test_watchdog_homeassistant_api_reanimation_limit( await tasks._watchdog_homeassistant_api() restart.assert_not_called() - assert "Watchdog miss API response from Home Assistant" not in caplog.text + assert "Watchdog missed an Home Assistant Core API response." not in caplog.text assert "Watchdog found a problem with Home Assistant API!" not in caplog.text assert ( - "Watchdog cannot reanimate Home Assistant, failed all 5 attempts." + "Watchdog cannot reanimate Home Assistant Core, failed all 5 attempts." in caplog.text )