feat: Save previous entries in feed settings

This commit is contained in:
selfhoster selfhoster 2025-04-15 16:36:09 +02:00
parent 90f29bd2a4
commit 6bb67a8129
3 changed files with 92 additions and 11 deletions

View File

@ -31,6 +31,12 @@ pub struct FeedStoreFeedInfo {
/// ///
/// Used to let the server know whether we need a new entry or not. /// Used to let the server know whether we need a new entry or not.
pub fetch_data: Option<FetchData>, pub fetch_data: Option<FetchData>,
/// A cache of already downloaded articles.
///
/// Kept for historical purposes, but also to inject retrieval date
/// when published date was not exposed in an article.
#[serde(default)]
pub entries: BTreeMap<Url, FeedStoreEntry>,
} }
impl FeedStoreFeedInfo { impl FeedStoreFeedInfo {
@ -38,6 +44,7 @@ impl FeedStoreFeedInfo {
Self { Self {
added: Utc::now(), added: Utc::now(),
fetch_data: None, fetch_data: None,
entries: BTreeMap::new(),
} }
} }
} }
@ -147,7 +154,7 @@ impl FeedStoreFeed {
debug!("Storing fetchdata for {}", self.url); debug!("Storing fetchdata for {}", self.url);
self.info.fetch_data = Some(fetchdata); self.info.fetch_data = Some(fetchdata);
Self::write(&self.path_settings, toml::to_string(&self.info)?)?; self.save_info()?;
if !self.has_changed(&feed)? { if !self.has_changed(&feed)? {
return Ok(false); return Ok(false);
@ -158,10 +165,16 @@ impl FeedStoreFeed {
&self.path_feed_ron, &self.path_feed_ron,
to_string_pretty(&feed, PrettyConfig::default())?, to_string_pretty(&feed, PrettyConfig::default())?,
)?; )?;
Self::write(&self.path_feed, body)?; Self::write(&self.path_feed, &body)?;
self.raw_feed = Some(String::from_utf8_lossy(&body).to_string());
Ok(true) Ok(true)
} }
/// Persist the feed info to the settings file.
///
/// The info is serialized to TOML first, so a serialization failure is
/// reported before anything is written.
pub fn save_info(&self) -> Result<()> {
    let serialized_info = toml::to_string(&self.info)?;
    Self::write(&self.path_settings, serialized_info)?;
    Ok(())
}
/// refresh in hours /// refresh in hours
pub fn fetch(&mut self, fetcher: &super::Fetcher, refresh: usize) -> Result<bool> { pub fn fetch(&mut self, fetcher: &super::Fetcher, refresh: usize) -> Result<bool> {
let mut builder = fetcher let mut builder = fetcher
@ -247,14 +260,14 @@ impl FeedStore {
Self { _dir: dir, feeds } Self { _dir: dir, feeds }
} }
pub fn collect(&mut self, max_entries: usize) -> (HashMap<String, Feed>, Vec<Entry>) { pub fn collect(&mut self, max_entries: usize) -> (HashMap<String, Feed>, Vec<FeedStoreEntry>) {
debug!("Collecting feeds"); debug!("Collecting feeds");
let mut feeds = HashMap::new(); let mut feeds = HashMap::new();
let mut entries = Vec::new(); let mut entries = Vec::new();
for (feed_url, feed_store_feed) in self.feeds.iter_mut() { for (feed_url, feed_store_feed) in self.feeds.iter_mut() {
debug!("Collecting {feed_url}"); debug!("Collecting {feed_url}");
let mut feed = match feed_store_feed.load_feed(true) { let feed = match feed_store_feed.load_feed(true) {
Ok(feed) => feed, Ok(feed) => feed,
Err(e) => { Err(e) => {
warn!("Problem parsing feed file for feed {}: {}", feed_url, e); warn!("Problem parsing feed file for feed {}: {}", feed_url, e);
@ -262,17 +275,48 @@ impl FeedStore {
} }
}; };
for entry in &mut feed.entries { let mut changed_info = false;
entry.source = Some(feed_url.to_string());
for entry in &feed.entries {
// If we already have the entry in store, don't parse it again.
// Simply change the raw entry attribute.
let entry_link = Url::parse(&entry.links.first().cloned().unwrap().href).unwrap();
if let Some(archived_entry) = feed_store_feed.info.entries.get_mut(&entry_link) {
if &archived_entry.entry != entry {
changed_info = true;
archived_entry.entry = entry.clone();
}
entries.push(archived_entry.clone());
} else {
// TODO: should this be done here? or earlier in the fetching process?
let enhanced_entry = FeedStoreEntry::from_entry(
entry.clone(),
entry_link.clone(),
feed_url.clone(),
);
feed_store_feed
.info
.entries
.insert(entry_link.clone(), enhanced_entry.clone());
entries.push(enhanced_entry);
}
} }
entries.extend(feed.entries.clone());
if entries.len() > 4 * max_entries { if entries.len() > 4 * max_entries {
entries = trim_entries(entries, max_entries); entries = trim_entries(entries, max_entries);
} }
feeds.insert(feed_url.to_string(), feed.clone()); feeds.insert(feed_url.to_string(), feed.clone());
// If some info from an entry was changed, save feed info
if changed_info {
feed_store_feed
.save_info()
.expect("Failed to save feed info");
}
} }
(feeds, trim_entries(entries, max_entries)) (feeds, trim_entries(entries, max_entries))
} }
@ -286,8 +330,9 @@ impl FeedStore {
} }
} }
fn trim_entries(mut entries: Vec<Entry>, max_entries: usize) -> Vec<Entry> { fn trim_entries(mut entries: Vec<FeedStoreEntry>, max_entries: usize) -> Vec<FeedStoreEntry> {
entries.sort_by_key(|e| std::cmp::Reverse(e.updated.or(e.published).unwrap_or_default())); entries.sort_by(|a, b| a.mars_date.cmp(&b.mars_date));
entries.reverse();
entries.truncate(max_entries); entries.truncate(max_entries);
entries entries
} }
@ -298,3 +343,34 @@ fn hv(headers: &HeaderMap, key: &str) -> String {
_ => "".to_string(), _ => "".to_string(),
} }
} }
/// A single article in the [FeedStore].
///
/// Wraps a feed [Entry] so that a retrieval date can be injected when the
/// published date is not available in the feed.
///
/// Used for ordering by date and accessing source information.
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)]
pub struct FeedStoreEntry {
/// Date used to order entries. Usually extracted from the feed, but when
/// not present, the first retrieval time is used.
pub mars_date: DateTime<Utc>,
/// URL of the feed this entry came from, see [FeedStoreFeed::url].
pub mars_source: Url,
/// Canonical URL of the article.
pub mars_url: Url,
/// Usual RSS feed entry. Flattened, so its fields are (de)serialized at
/// the same level as the `mars_*` fields.
#[serde(flatten)]
pub entry: Entry,
}
impl FeedStoreEntry {
    /// Build a [FeedStoreEntry] from a parsed feed entry.
    ///
    /// * `entry` - the entry as found in the feed.
    /// * `url` - stored as [FeedStoreEntry::mars_url].
    /// * `source` - stored as [FeedStoreEntry::mars_source].
    ///
    /// [FeedStoreEntry::mars_date] is set to the entry's published date, or
    /// to the current time when the entry has no published date.
    pub fn from_entry(entry: Entry, url: Url, source: Url) -> Self {
        // Only read the clock when the feed did not provide a published date.
        // The date is computed before `entry` is moved into the struct.
        let mars_date = entry.published.unwrap_or_else(Utc::now);
        Self {
            mars_source: source,
            mars_url: url,
            mars_date,
            entry,
        }
    }
}

View File

@ -54,7 +54,7 @@ struct Args {
} }
/// Config to be parsed from toml file given as cmdline option /// Config to be parsed from toml file given as cmdline option
#[derive(Deserialize)] #[derive(Deserialize, Serialize)]
struct Config { struct Config {
/// to be used as part of the fetchers username header /// to be used as part of the fetchers username header
bot_name: String, bot_name: String,
@ -101,8 +101,12 @@ pub fn to_checked_pathbuf(dir: &Utf8Path) -> Utf8PathBuf {
/// ///
/// This is a separate struct in case one wants to configure additional /// This is a separate struct in case one wants to configure additional
/// information in the future. /// information in the future.
#[derive(Deserialize)] #[derive(Debug, Deserialize, Serialize)]
struct FeedConfig { struct FeedConfig {
/// short name for the feed
name: String,
/// homepage URL for the website
homepage: Url,
/// url of an ATOM, RSS or Json feed /// url of an ATOM, RSS or Json feed
url: String, url: String,
} }

View File

@ -18,6 +18,7 @@ pub fn build(config: &Config, feed_store: &mut FeedStore) -> Result<()> {
let mut context = tera::Context::new(); let mut context = tera::Context::new();
let (feeds, entries): (HashMap<String, Feed>, _) = feed_store.collect(config.max_entries); let (feeds, entries): (HashMap<String, Feed>, _) = feed_store.collect(config.max_entries);
context.insert("config", config);
context.insert("feeds", &feeds); context.insert("feeds", &feeds);
context.insert("entries", &entries); context.insert("entries", &entries);
context.insert("lang", &config.lang); context.insert("lang", &config.lang);