core/homeassistant/components/feedreader/coordinator.py

"""Data update coordinator for RSS/Atom feeds."""
from __future__ import annotations
from calendar import timegm
from datetime import datetime
import html
from logging import getLogger
from time import gmtime, struct_time
from typing import TYPE_CHECKING
from urllib.error import URLError
import feedparser
from homeassistant.config_entries import ConfigEntry
from homeassistant.const import CONF_URL
from homeassistant.core import HomeAssistant, callback
from homeassistant.exceptions import ConfigEntryNotReady
from homeassistant.helpers.storage import Store
from homeassistant.helpers.update_coordinator import DataUpdateCoordinator, UpdateFailed
from homeassistant.util import dt as dt_util
from .const import CONF_MAX_ENTRIES, DEFAULT_SCAN_INTERVAL, DOMAIN, EVENT_FEEDREADER
DELAY_SAVE = 30
STORAGE_VERSION = 1
_LOGGER = getLogger(__name__)
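
# Typed alias: config entries created by this integration carry their
# FeedReaderCoordinator in `entry.runtime_data`.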
type FeedReaderConfigEntry = ConfigEntry[FeedReaderCoordinator]


class FeedReaderCoordinator(
    DataUpdateCoordinator[list[feedparser.FeedParserDict] | None]
):
    """Abstraction over Feedparser module."""

    config_entry: FeedReaderConfigEntry

    def __init__(
        self,
        hass: HomeAssistant,
        config_entry: FeedReaderConfigEntry,
        storage: StoredData,
    ) -> None:
        """Initialize the FeedManager object, poll as per scan interval."""
        self.url = config_entry.data[CONF_URL]
        self.feed_author: str | None = None
        self.feed_version: str | None = None
        self._max_entries = config_entry.options[CONF_MAX_ENTRIES]
        self._storage = storage
        self._last_entry_timestamp: struct_time | None = None
        self._event_type = EVENT_FEEDREADER
        self._feed: feedparser.FeedParserDict | None = None
        self._feed_id = self.url
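        # The base DataUpdateCoordinator drives the polling: it schedules a
        # refresh at update_interval, and each refresh runs _async_update_data.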
        super().__init__(
            hass=hass,
            logger=_LOGGER,
            config_entry=config_entry,
            name=f"{DOMAIN} {self.url}",
            update_interval=DEFAULT_SCAN_INTERVAL,
        )

    @callback
    def _log_no_entries(self) -> None:
        """Send no entries log at debug level."""
        _LOGGER.debug("No new entries to be published in feed %s", self.url)

    async def _async_fetch_feed(self) -> feedparser.FeedParserDict:
        """Fetch the feed data."""
        _LOGGER.debug("Fetching new data from feed %s", self.url)
        def _parse_feed() -> feedparser.FeedParserDict:
            return feedparser.parse(
                self.url,
                etag=None if not self._feed else self._feed.get("etag"),
                modified=None if not self._feed else self._feed.get("modified"),
            )

        feed = await self.hass.async_add_executor_job(_parse_feed)
        if not feed:
            raise UpdateFailed(f"Error fetching feed data from {self.url}")

        # The 'bozo' flag really only indicates that there was an issue
        # during the initial parsing of the XML, but it doesn't indicate
        # whether this is an unrecoverable error. In this case the
        # feedparser lib is trying a less strict parsing approach.
        # If an error is detected here, log a warning message but continue
        # processing the feed entries if present.
        if feed.bozo != 0:
            if isinstance(feed.bozo_exception, URLError):
                raise UpdateFailed(
                    f"Error fetching feed data from {self.url} : {feed.bozo_exception}"
                )
            # no connection issue, but parsing issue
            _LOGGER.warning(
                "Possible issue parsing feed %s: %s",
                self.url,
                feed.bozo_exception,
            )
        return feed

    async def async_setup(self) -> None:
        """Set up the feed manager."""
        try:
            feed = await self._async_fetch_feed()
        except UpdateFailed as err:
            raise ConfigEntryNotReady from err

        self.logger.debug("Feed data fetched from %s : %s", self.url, feed["feed"])
        if feed_author := feed["feed"].get("author"):
            self.feed_author = html.unescape(feed_author)
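        # SUPPORTED_VERSIONS maps feedparser's short version code
        # (e.g. "rss20") to a human readable name.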
        self.feed_version = feedparser.api.SUPPORTED_VERSIONS.get(feed["version"])
        self._feed = feed

    async def _async_update_data(self) -> list[feedparser.FeedParserDict] | None:
        """Update the feed and publish new entries to the event bus."""
        assert self._feed is not None
        # _last_entry_timestamp is not set during async_setup, but we have
        # already fetched data there, so we can reuse it instead of fetching again.
        if self._last_entry_timestamp:
            self._feed = await self._async_fetch_feed()

        # Using etag and modified, if there's no new data available,
        # the entries list will be empty
        _LOGGER.debug(
            "%s entri(es) available in feed %s",
            len(self._feed.entries),
            self.url,
        )
        if not self._feed.entries:
            self._log_no_entries()
            return None

        if TYPE_CHECKING:
            assert isinstance(self._feed.entries, list)

        self._filter_entries()
        self._publish_new_entries()
        _LOGGER.debug("Fetch from feed %s completed", self.url)
        if self._last_entry_timestamp:
            self._storage.async_put_timestamp(self._feed_id, self._last_entry_timestamp)
        return self._feed.entries

    @callback
    def _filter_entries(self) -> None:
        """Filter the entries provided and return the ones to keep."""
        assert self._feed is not None
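        # Feeds normally list the newest entries first, so truncating to the
        # head of the list keeps the most recent items.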
        if len(self._feed.entries) > self._max_entries:
            _LOGGER.debug(
                "Processing only the first %s entries in feed %s",
                self._max_entries,
                self.url,
            )
            self._feed.entries = self._feed.entries[0 : self._max_entries]

    @callback
    def _update_and_fire_entry(self, entry: feedparser.FeedParserDict) -> None:
        """Update last_entry_timestamp and fire entry."""
        # Check if the entry has an updated or published date.
        # Start from the updated date because generally `updated` > `published`.
        if time_stamp := entry.get("updated_parsed") or entry.get("published_parsed"):
            self._last_entry_timestamp = time_stamp
        else:
            _LOGGER.debug(
                "No updated_parsed or published_parsed info available for entry %s",
                entry,
            )
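        # Tag the event payload with the source URL so consumers listening for
        # feedreader events can tell feeds apart.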
entry["feed_url"] = self.url
self.hass.bus.async_fire(self._event_type, entry)
_LOGGER.debug("New event fired for entry %s", entry.get("link"))

    @callback
    def _publish_new_entries(self) -> None:
        """Publish new entries to the event bus."""
        assert self._feed is not None
        new_entry_count = 0
        firstrun = False
        self._last_entry_timestamp = self._storage.get_timestamp(self._feed_id)
        if not self._last_entry_timestamp:
            firstrun = True
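            # On the very first run there is no stored timestamp, so every
            # entry currently in the feed is published once.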
            # Set last entry timestamp as epoch time if not available
            self._last_entry_timestamp = dt_util.utc_from_timestamp(0).timetuple()
        # Locally cache self._last_entry_timestamp so that entries published at
        # identical times can be processed.
        last_entry_timestamp = self._last_entry_timestamp
        for entry in self._feed.entries:
            if firstrun or (
                (
                    time_stamp := entry.get("updated_parsed")
                    or entry.get("published_parsed")
                )
                and time_stamp > last_entry_timestamp
            ):
                self._update_and_fire_entry(entry)
                new_entry_count += 1
            else:
                _LOGGER.debug("Already processed entry %s", entry.get("link"))
        if new_entry_count == 0:
            self._log_no_entries()
        else:
            _LOGGER.debug("%d entries published in feed %s", new_entry_count, self.url)


class StoredData:
    """Represent a data storage."""

    def __init__(self, hass: HomeAssistant) -> None:
        """Initialize data storage."""
        self._data: dict[str, struct_time] = {}
        self.hass = hass
        self._store: Store[dict[str, str]] = Store(hass, STORAGE_VERSION, DOMAIN)
        self.is_initialized = False

    async def async_setup(self) -> None:
        """Set up storage."""
        if (store_data := await self._store.async_load()) is not None:
            # Make sure tm_isdst is set to 0 by using gmtime() on the timestamp.
            self._data = {
                feed_id: gmtime(datetime.fromisoformat(timestamp_string).timestamp())
                for feed_id, timestamp_string in store_data.items()
            }
        self.is_initialized = True

    def get_timestamp(self, feed_id: str) -> struct_time | None:
        """Return stored timestamp for given feed id."""
        return self._data.get(feed_id)

    @callback
    def async_put_timestamp(self, feed_id: str, timestamp: struct_time) -> None:
        """Update timestamp for given feed id."""
        self._data[feed_id] = timestamp
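        # Delay the write so bursts of updates collapse into a single disk save.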
        self._store.async_delay_save(self._async_save_data, DELAY_SAVE)

    @callback
    def _async_save_data(self) -> dict[str, str]:
        """Save feed data to storage."""
        return {
            feed_id: dt_util.utc_from_timestamp(timegm(struct_utc)).isoformat()
            for feed_id, struct_utc in self._data.items()
        }