core/homeassistant/components/feedreader/__init__.py

"""Support for RSS/Atom feeds."""
from __future__ import annotations

from calendar import timegm
from datetime import datetime, timedelta
from logging import getLogger
import os
import pickle
from time import gmtime, struct_time

import feedparser
import voluptuous as vol

from homeassistant.const import CONF_SCAN_INTERVAL, EVENT_HOMEASSISTANT_START
from homeassistant.core import Event, HomeAssistant, callback
import homeassistant.helpers.config_validation as cv
from homeassistant.helpers.event import async_track_time_interval
from homeassistant.helpers.storage import Store
from homeassistant.helpers.typing import ConfigType
from homeassistant.util import dt as dt_util

_LOGGER = getLogger(__name__)

CONF_URLS = "urls"
CONF_MAX_ENTRIES = "max_entries"
DEFAULT_MAX_ENTRIES = 20
DEFAULT_SCAN_INTERVAL = timedelta(hours=1)
DELAY_SAVE = 30  # seconds
DOMAIN = "feedreader"
EVENT_FEEDREADER = "feedreader"
STORAGE_VERSION = 1

CONFIG_SCHEMA = vol.Schema(
{
DOMAIN: {
vol.Required(CONF_URLS): vol.All(cv.ensure_list, [cv.url]),
vol.Optional(
CONF_SCAN_INTERVAL, default=DEFAULT_SCAN_INTERVAL
): cv.time_period,
vol.Optional(
CONF_MAX_ENTRIES, default=DEFAULT_MAX_ENTRIES
): cv.positive_int,
}
},
extra=vol.ALLOW_EXTRA,
)
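

# A minimal configuration.yaml sketch matching CONFIG_SCHEMA above; the URL
# and values are illustrative placeholders, and scan_interval uses the
# mapping form accepted by cv.time_period:
#
# feedreader:
#   urls:
#     - https://www.example.com/feed.xml
#   scan_interval:
#     minutes: 30
#   max_entries: 5
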
async def async_setup(hass: HomeAssistant, config: ConfigType) -> bool:
"""Set up the Feedreader component."""
urls: list[str] = config[DOMAIN][CONF_URLS]
if not urls:
return False
scan_interval: timedelta = config[DOMAIN][CONF_SCAN_INTERVAL]
max_entries: int = config[DOMAIN][CONF_MAX_ENTRIES]
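    # Last-seen entry timestamps survive restarts; StoredData also migrates
    # the legacy pickle file into a JSON Store the first time it loads.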
old_data_file = hass.config.path(f"{DOMAIN}.pickle")
storage = StoredData(hass, old_data_file)
await storage.async_setup()
feeds = [
FeedManager(hass, url, scan_interval, max_entries, storage) for url in urls
]
for feed in feeds:
feed.async_setup()
    return True


class FeedManager:
    """Abstraction over Feedparser module."""

def __init__(
self,
hass: HomeAssistant,
url: str,
scan_interval: timedelta,
max_entries: int,
storage: StoredData,
) -> None:
"""Initialize the FeedManager object, poll as per scan interval."""
self._hass = hass
self._url = url
self._scan_interval = scan_interval
self._max_entries = max_entries
self._feed: feedparser.FeedParserDict | None = None
self._firstrun = True
self._storage = storage
self._last_entry_timestamp: struct_time | None = None
self._has_published_parsed = False
self._has_updated_parsed = False
self._event_type = EVENT_FEEDREADER
        self._feed_id = url

    @callback
def async_setup(self) -> None:
"""Set up the feed manager."""
self._hass.bus.async_listen_once(EVENT_HOMEASSISTANT_START, self._async_update)
async_track_time_interval(
self._hass, self._async_update, self._scan_interval, cancel_on_shutdown=True
        )

    def _log_no_entries(self) -> None:
"""Send no entries log at debug level."""
        _LOGGER.debug("No new entries to be published in feed %s", self._url)

    async def _async_update(self, _: datetime | Event) -> None:
        """Update the feed and publish new entries to the event bus."""
last_entry_timestamp = await self._hass.async_add_executor_job(self._update)
if last_entry_timestamp:
            self._storage.async_put_timestamp(self._feed_id, last_entry_timestamp)

    def _update(self) -> struct_time | None:
        """Fetch the feed, publish new entries, and return the newest timestamp."""
_LOGGER.debug("Fetching new data from feed %s", self._url)
self._feed: feedparser.FeedParserDict = feedparser.parse( # type: ignore[no-redef]
self._url,
etag=None if not self._feed else self._feed.get("etag"),
modified=None if not self._feed else self._feed.get("modified"),
)
if not self._feed:
_LOGGER.error("Error fetching feed data from %s", self._url)
return None
# The 'bozo' flag really only indicates that there was an issue
# during the initial parsing of the XML, but it doesn't indicate
# whether this is an unrecoverable error. In this case the
# feedparser lib is trying a less strict parsing approach.
# If an error is detected here, log warning message but continue
# processing the feed entries if present.
if self._feed.bozo != 0:
_LOGGER.warning(
"Possible issue parsing feed %s: %s",
self._url,
self._feed.bozo_exception,
)
# Using etag and modified, if there's no new data available,
# the entries list will be empty
_LOGGER.debug(
"%s entri(es) available in feed %s",
len(self._feed.entries),
self._url,
)
if not self._feed.entries:
self._log_no_entries()
return None
self._filter_entries()
self._publish_new_entries()
_LOGGER.debug("Fetch from feed %s completed", self._url)
if (
self._has_published_parsed or self._has_updated_parsed
) and self._last_entry_timestamp:
return self._last_entry_timestamp
        return None

    def _filter_entries(self) -> None:
        """Trim the entry list in place to at most the configured maximum."""
assert self._feed is not None
if len(self._feed.entries) > self._max_entries:
_LOGGER.debug(
"Processing only the first %s entries in feed %s",
self._max_entries,
self._url,
)
        self._feed.entries = self._feed.entries[0 : self._max_entries]

    def _update_and_fire_entry(self, entry: feedparser.FeedParserDict) -> None:
        """Update last_entry_timestamp and fire entry."""
        # Check if the entry has an updated or published date.
        # Start from the updated date because generally `updated` > `published`.
if "updated_parsed" in entry and entry.updated_parsed:
# We are lucky, `updated_parsed` data available, let's make use of
# it to publish only new available entries since the last run
self._has_updated_parsed = True
self._last_entry_timestamp = max(
entry.updated_parsed, self._last_entry_timestamp
)
elif "published_parsed" in entry and entry.published_parsed:
# We are lucky, `published_parsed` data available, let's make use of
# it to publish only new available entries since the last run
self._has_published_parsed = True
self._last_entry_timestamp = max(
entry.published_parsed, self._last_entry_timestamp
)
else:
self._has_updated_parsed = False
self._has_published_parsed = False
_LOGGER.debug(
"No updated_parsed or published_parsed info available for entry %s",
entry,
)
entry.update({"feed_url": self._url})
        self._hass.bus.fire(self._event_type, entry)

    def _publish_new_entries(self) -> None:
        """Publish new entries to the event bus."""
assert self._feed is not None
new_entries = False
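        # Without a stored timestamp this counts as a first run, in which
        # every entry currently in the feed is fired once.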
self._last_entry_timestamp = self._storage.get_timestamp(self._feed_id)
if self._last_entry_timestamp:
self._firstrun = False
else:
# Set last entry timestamp as epoch time if not available
self._last_entry_timestamp = dt_util.utc_from_timestamp(0).timetuple()
for entry in self._feed.entries:
            # Guard against feeds where a *_parsed field is present but None
            # before comparing against the last seen timestamp.
            if (
                self._firstrun
                or (
                    "published_parsed" in entry
                    and entry.published_parsed
                    and entry.published_parsed > self._last_entry_timestamp
                )
                or (
                    "updated_parsed" in entry
                    and entry.updated_parsed
                    and entry.updated_parsed > self._last_entry_timestamp
                )
            ):
self._update_and_fire_entry(entry)
new_entries = True
else:
_LOGGER.debug("Entry %s already processed", entry)
if not new_entries:
self._log_no_entries()
        self._firstrun = False


class StoredData:
    """Represent a data storage."""

    def __init__(self, hass: HomeAssistant, legacy_data_file: str) -> None:
"""Initialize data storage."""
self._legacy_data_file = legacy_data_file
self._data: dict[str, struct_time] = {}
self._hass = hass
        self._store: Store[dict[str, str]] = Store(hass, STORAGE_VERSION, DOMAIN)

    async def async_setup(self) -> None:
"""Set up storage."""
if not os.path.exists(self._store.path):
# Remove the legacy store loading after deprecation period.
data = await self._hass.async_add_executor_job(self._legacy_fetch_data)
else:
if (store_data := await self._store.async_load()) is None:
return
            # Use gmtime() so the resulting struct_time is UTC with tm_isdst 0.
data = {
feed_id: gmtime(datetime.fromisoformat(timestamp_string).timestamp())
for feed_id, timestamp_string in store_data.items()
}
        self._data = data

    def _legacy_fetch_data(self) -> dict[str, struct_time]:
"""Fetch data stored in pickle file."""
_LOGGER.debug("Fetching data from legacy file %s", self._legacy_data_file)
try:
with open(self._legacy_data_file, "rb") as myfile:
return pickle.load(myfile) or {}
except FileNotFoundError:
pass
except (OSError, pickle.PickleError) as err:
_LOGGER.error(
"Error loading data from pickled file %s: %s",
self._legacy_data_file,
err,
)
        return {}

    def get_timestamp(self, feed_id: str) -> struct_time | None:
"""Return stored timestamp for given feed id."""
        return self._data.get(feed_id)

    @callback
def async_put_timestamp(self, feed_id: str, timestamp: struct_time) -> None:
"""Update timestamp for given feed id."""
self._data[feed_id] = timestamp
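        # Debounce disk writes: repeated updates within DELAY_SAVE seconds
        # are coalesced into a single save.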
        self._store.async_delay_save(self._async_save_data, DELAY_SAVE)

    @callback
def _async_save_data(self) -> dict[str, str]:
"""Save feed data to storage."""
return {
feed_id: dt_util.utc_from_timestamp(timegm(struct_utc)).isoformat()
for feed_id, struct_utc in self._data.items()
}
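

# A hedged usage sketch: new entries are fired on the event bus as
# "feedreader" events with "feed_url" added (see _update_and_fire_entry), so
# automations can react to them. The notify service and the "title" field
# below are assumptions that depend on the setup and the individual feed:
#
# automation:
#   - trigger:
#       - platform: event
#         event_type: feedreader
#     action:
#       - service: notify.notify
#         data:
#           message: "New feed entry: {{ trigger.event.data.title }}"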