feat: Save previous entries in feed settings

selfhoster selfhoster 2025-04-15 16:36:09 +02:00
parent 90f29bd2a4
commit 6bb67a8129
3 changed files with 92 additions and 11 deletions

View File

@@ -31,6 +31,12 @@ pub struct FeedStoreFeedInfo {
     ///
     /// Used to let the server know whether we need a new entry or not.
     pub fetch_data: Option<FetchData>,
+    /// A cache of already downloaded articles.
+    ///
+    /// Kept for historical purposes, but also to inject the retrieval date
+    /// when no published date was exposed in an article.
+    #[serde(default)]
+    pub entries: BTreeMap<Url, FeedStoreEntry>,
 }
 
 impl FeedStoreFeedInfo {
@@ -38,6 +44,7 @@ impl FeedStoreFeedInfo {
         Self {
             added: Utc::now(),
             fetch_data: None,
+            entries: BTreeMap::new(),
         }
     }
 }
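Two details of the new cache field are worth spelling out. A BTreeMap (rather than a HashMap) keeps keys sorted, so the serialized settings file stays deterministic across saves. And #[serde(default)] is what keeps settings files written before this commit loadable, since they contain no entries table yet. A minimal sketch of that second point, with an illustrative Info stand-in rather than the real FeedStoreFeedInfo:

use std::collections::BTreeMap;

use serde::Deserialize;

// Illustrative stand-in; the real struct also carries fetch_data and
// uses Url/FeedStoreEntry instead of plain strings.
#[derive(Debug, Deserialize)]
struct Info {
    added: String,
    // Old settings files have no `entries` table at all; `default` makes
    // the field optional instead of failing deserialization.
    #[serde(default)]
    entries: BTreeMap<String, String>,
}

fn main() {
    // A pre-upgrade settings file, with no `entries` key.
    let old = r#"added = "2025-04-15T16:36:09+02:00""#;
    let info: Info = toml::from_str(old).expect("old settings still parse");
    assert!(info.entries.is_empty());
}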
@@ -147,7 +154,7 @@ impl FeedStoreFeed {
         debug!("Storing fetchdata for {}", self.url);
         self.info.fetch_data = Some(fetchdata);
-        Self::write(&self.path_settings, toml::to_string(&self.info)?)?;
+        self.save_info()?;
 
         if !self.has_changed(&feed)? {
             return Ok(false);
@@ -158,10 +165,16 @@ impl FeedStoreFeed {
             &self.path_feed_ron,
             to_string_pretty(&feed, PrettyConfig::default())?,
         )?;
-        Self::write(&self.path_feed, body)?;
+        Self::write(&self.path_feed, &body)?;
+        self.raw_feed = Some(String::from_utf8_lossy(&body).to_string());
         Ok(true)
     }
 
+    pub fn save_info(&self) -> Result<()> {
+        Self::write(&self.path_settings, toml::to_string(&self.info)?)?;
+        Ok(())
+    }
+
     /// refresh in hours
     pub fn fetch(&mut self, fetcher: &super::Fetcher, refresh: usize) -> Result<bool> {
         let mut builder = fetcher
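Caching the fetched body in raw_feed goes through String::from_utf8_lossy, so a server responding with invalid UTF-8 cannot make this path error out; bad bytes degrade to U+FFFD instead. A quick self-contained illustration:

fn main() {
    // Well-formed UTF-8 passes through unchanged...
    let ok = String::from_utf8_lossy(b"plain feed xml");
    assert_eq!(ok, "plain feed xml");

    // ...while invalid bytes degrade to U+FFFD instead of erroring, so
    // caching the body in raw_feed can't fail on odd server output.
    let bad = String::from_utf8_lossy(&[0x66, 0xFF, 0x6F]);
    assert_eq!(bad, "f\u{FFFD}o");
}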
@@ -247,14 +260,14 @@ impl FeedStore {
         Self { _dir: dir, feeds }
     }
 
-    pub fn collect(&mut self, max_entries: usize) -> (HashMap<String, Feed>, Vec<Entry>) {
+    pub fn collect(&mut self, max_entries: usize) -> (HashMap<String, Feed>, Vec<FeedStoreEntry>) {
         debug!("Collecting feeds");
         let mut feeds = HashMap::new();
         let mut entries = Vec::new();
 
         for (feed_url, feed_store_feed) in self.feeds.iter_mut() {
             debug!("Collecting {feed_url}");
-            let mut feed = match feed_store_feed.load_feed(true) {
+            let feed = match feed_store_feed.load_feed(true) {
                 Ok(feed) => feed,
                 Err(e) => {
                     warn!("Problem parsing feed file for feed {}: {}", feed_url, e);
@@ -262,17 +275,48 @@ impl FeedStore {
                 }
             };
 
-            for entry in &mut feed.entries {
-                entry.source = Some(feed_url.to_string());
+            let mut changed_info = false;
+            for entry in &feed.entries {
+                // If we already have the entry in store, don't parse it again.
+                // Simply change the raw entry attribute.
+                let entry_link = Url::parse(&entry.links.first().cloned().unwrap().href).unwrap();
+                if let Some(archived_entry) = feed_store_feed.info.entries.get_mut(&entry_link) {
+                    if &archived_entry.entry != entry {
+                        changed_info = true;
+                        archived_entry.entry = entry.clone();
+                    }
+                    entries.push(archived_entry.clone());
+                } else {
+                    // TODO: should this be done here? or earlier in the fetching process?
+                    let enhanced_entry = FeedStoreEntry::from_entry(
+                        entry.clone(),
+                        entry_link.clone(),
+                        feed_url.clone(),
+                    );
+                    feed_store_feed
+                        .info
+                        .entries
+                        .insert(entry_link.clone(), enhanced_entry.clone());
+                    entries.push(enhanced_entry);
+                }
             }
 
-            entries.extend(feed.entries.clone());
             if entries.len() > 4 * max_entries {
                 entries = trim_entries(entries, max_entries);
             }
+
             feeds.insert(feed_url.to_string(), feed.clone());
+
+            // If some info from an entry was changed, save feed info
+            if changed_info {
+                feed_store_feed
+                    .save_info()
+                    .expect("Failed to save feed info");
+            }
         }
 
         (feeds, trim_entries(entries, max_entries))
     }
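The loop body is an upsert against the per-feed cache: a hit refreshes the stored copy only when the freshly parsed entry differs (marking the settings file dirty), a miss mints a new FeedStoreEntry. The shape of that pattern, reduced to a self-contained sketch with a hypothetical Cached type and String keys:

use std::collections::BTreeMap;

#[derive(Clone, PartialEq)]
struct Cached {
    raw: String,
}

/// Upsert `raw` under `key`, returning the canonical cached copy plus a
/// dirty flag. As in the commit, only a cache hit whose content changed
/// marks the store as needing a save.
fn upsert(cache: &mut BTreeMap<String, Cached>, key: String, raw: String) -> (Cached, bool) {
    match cache.get_mut(&key) {
        Some(stored) => {
            let changed = stored.raw != raw;
            if changed {
                stored.raw = raw;
            }
            (stored.clone(), changed)
        }
        None => {
            let fresh = Cached { raw };
            cache.insert(key, fresh.clone());
            (fresh, false)
        }
    }
}

fn main() {
    let mut cache = BTreeMap::new();
    let (_, dirty) = upsert(&mut cache, "a".into(), "v1".into());
    assert!(!dirty); // first sighting: inserted, not flagged
    let (_, dirty) = upsert(&mut cache, "a".into(), "v2".into());
    assert!(dirty); // upstream edit: flagged for save_info()
}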
@@ -286,8 +330,9 @@ impl FeedStore {
     }
 }
 
-fn trim_entries(mut entries: Vec<Entry>, max_entries: usize) -> Vec<Entry> {
-    entries.sort_by_key(|e| std::cmp::Reverse(e.updated.or(e.published).unwrap_or_default()));
+fn trim_entries(mut entries: Vec<FeedStoreEntry>, max_entries: usize) -> Vec<FeedStoreEntry> {
+    entries.sort_by(|a, b| a.mars_date.cmp(&b.mars_date));
+    entries.reverse();
     entries.truncate(max_entries);
     entries
 }
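trim_entries now orders purely by mars_date, newest first, then keeps the top max_entries. Sorting ascending and reversing yields the same order as the old Reverse-keyed sort for distinct dates (only ties come out in a different relative order, since the reverse flips the stable sort). A tiny check of the equivalence:

use std::cmp::Reverse;

fn main() {
    let mut a = vec![3, 1, 2];
    let mut b = a.clone();

    // What trim_entries does now: ascending sort, then reverse.
    a.sort_by(|x, y| x.cmp(y));
    a.reverse();

    // The old single-pass form with a Reverse key; identical for
    // distinct keys.
    b.sort_by_key(|&x| Reverse(x));
    assert_eq!(a, b);

    // truncate then keeps only the newest items.
    a.truncate(2);
    assert_eq!(a, vec![3, 2]);
}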
@@ -298,3 +343,34 @@ fn hv(headers: &HeaderMap, key: &str) -> String {
         _ => "".to_string(),
     }
 }
+
+/// A single article in the [FeedStore].
+///
+/// This transformation allows injecting the retrieval date when the
+/// published date is not available in the feed.
+///
+/// Used for ordering by date and accessing source information.
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)]
+pub struct FeedStoreEntry {
+    /// Usually extracted from the feed, but when not present,
+    /// the first retrieval time is used.
+    pub mars_date: DateTime<Utc>,
+    /// [FeedStoreFeed::url]
+    pub mars_source: Url,
+    /// Canonical URL
+    pub mars_url: Url,
+    /// Usual RSS feed entry
+    #[serde(flatten)]
+    pub entry: Entry,
+}
+
+impl FeedStoreEntry {
+    pub fn from_entry(entry: Entry, url: Url, source: Url) -> Self {
+        Self {
+            mars_source: source,
+            mars_url: url,
+            mars_date: entry.published.unwrap_or(Utc::now()),
+            entry,
+        }
+    }
+}
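The #[serde(flatten)] on entry means the wrapped feed entry's fields are inlined next to the mars_* ones when an entry is written into the settings TOML, rather than being nested under an entry table. A reduced sketch (illustrative shapes; the real Entry presumably comes from the feed parsing crate):

use serde::{Deserialize, Serialize};

// Reduced stand-in for the real feed entry type.
#[derive(Serialize, Deserialize)]
struct Entry {
    title: String,
}

#[derive(Serialize, Deserialize)]
struct StoreEntry {
    mars_date: String,
    // flatten inlines Entry's fields beside mars_date instead of
    // nesting them under an `entry` key.
    #[serde(flatten)]
    entry: Entry,
}

fn main() {
    let e = StoreEntry {
        mars_date: "2025-04-15T16:36:09+02:00".into(),
        entry: Entry { title: "hello".into() },
    };
    let toml = toml::to_string(&e).unwrap();
    assert!(toml.contains("title = \"hello\""));
    assert!(!toml.contains("[entry]"));
}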

View File

@@ -54,7 +54,7 @@ struct Args {
 }
 
 /// Config to be parsed from toml file given as cmdline option
-#[derive(Deserialize)]
+#[derive(Deserialize, Serialize)]
 struct Config {
     /// to be used as part of the fetchers username header
     bot_name: String,
@@ -101,8 +101,12 @@ pub fn to_checked_pathbuf(dir: &Utf8Path) -> Utf8PathBuf {
 ///
 /// This is a separate struct in case one wants to configure additional
 /// information in the future.
-#[derive(Deserialize)]
+#[derive(Debug, Deserialize, Serialize)]
 struct FeedConfig {
+    /// short name for the feed
+    name: String,
+    /// homepage URL for the website
+    homepage: Url,
     /// url of an ATOM, RSS or Json feed
     url: String,
 }
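With the two new fields, every configured feed now needs a short name and a homepage in addition to its feed URL. A hedged sketch of what a single feed's TOML might look like, parsed through the same derive (the surrounding config layout, such as whether feeds sit in an array of tables, is not shown in this diff; Url deserialization assumes the url crate's serde feature):

use serde::Deserialize;
use url::Url;

#[derive(Debug, Deserialize)]
struct FeedConfig {
    name: String,
    homepage: Url,
    url: String,
}

fn main() {
    // Hypothetical values; only the three field names come from the diff.
    let snippet = r#"
        name = "example"
        homepage = "https://example.org/"
        url = "https://example.org/feed.xml"
    "#;
    let feed: FeedConfig = toml::from_str(snippet).expect("feed entry parses");
    assert_eq!(feed.name, "example");
}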

View File

@@ -18,6 +18,7 @@ pub fn build(config: &Config, feed_store: &mut FeedStore) -> Result<()> {
     let mut context = tera::Context::new();
     let (feeds, entries): (HashMap<String, Feed>, _) = feed_store.collect(config.max_entries);
     context.insert("config", config);
+    context.insert("feeds", &feeds);
     context.insert("entries", &entries);
     context.insert("lang", &config.lang);