diff --git a/src/feed_store.rs b/src/feed_store.rs
index 0bde5d4..1eec4c7 100644
--- a/src/feed_store.rs
+++ b/src/feed_store.rs
@@ -31,6 +31,12 @@ pub struct FeedStoreFeedInfo {
     ///
     /// Used to let the server know whether we need a new entry or not.
     pub fetch_data: Option<FetchData>,
+    /// A cache of already downloaded articles.
+    ///
+    /// Kept for historical purposes, but also to inject the retrieval date
+    /// when the published date was not exposed in an article.
+    #[serde(default)]
+    pub entries: BTreeMap<Url, FeedStoreEntry>,
 }
 
 impl FeedStoreFeedInfo {
@@ -38,6 +44,7 @@ impl FeedStoreFeedInfo {
         Self {
             added: Utc::now(),
             fetch_data: None,
+            entries: BTreeMap::new(),
         }
     }
 }
@@ -147,7 +154,7 @@ impl FeedStoreFeed {
 
         debug!("Storing fetchdata for {}", self.url);
         self.info.fetch_data = Some(fetchdata);
-        Self::write(&self.path_settings, toml::to_string(&self.info)?)?;
+        self.save_info()?;
 
         if !self.has_changed(&feed)? {
             return Ok(false);
@@ -158,10 +165,16 @@
             &self.path_feed_ron,
             to_string_pretty(&feed, PrettyConfig::default())?,
         )?;
-        Self::write(&self.path_feed, body)?;
+        Self::write(&self.path_feed, &body)?;
+        self.raw_feed = Some(String::from_utf8_lossy(&body).to_string());
 
         Ok(true)
     }
+    pub fn save_info(&self) -> Result<()> {
+        Self::write(&self.path_settings, toml::to_string(&self.info)?)?;
+        Ok(())
+    }
+
     /// refresh in hours
     pub fn fetch(&mut self, fetcher: &super::Fetcher, refresh: usize) -> Result<bool> {
         let mut builder = fetcher
@@ -247,14 +260,14 @@ impl FeedStore {
         Self { _dir: dir, feeds }
     }
 
-    pub fn collect(&mut self, max_entries: usize) -> (HashMap<String, Feed>, Vec<Entry>) {
+    pub fn collect(&mut self, max_entries: usize) -> (HashMap<String, Feed>, Vec<FeedStoreEntry>) {
         debug!("Collecting feeds");
         let mut feeds = HashMap::new();
         let mut entries = Vec::new();
 
         for (feed_url, feed_store_feed) in self.feeds.iter_mut() {
             debug!("Collecting {feed_url}");
-            let mut feed = match feed_store_feed.load_feed(true) {
+            let feed = match feed_store_feed.load_feed(true) {
                 Ok(feed) => feed,
                 Err(e) => {
                     warn!("Problem parsing feed file for feed {}: {}", feed_url, e);
@@ -262,17 +275,48 @@
                 }
             };
 
-            for entry in &mut feed.entries {
-                entry.source = Some(feed_url.to_string());
+            let mut changed_info = false;
+
+            for entry in &feed.entries {
+                // If we already have the entry in the store, don't parse it again.
+                // Simply update the raw entry attribute.
+                let entry_link = Url::parse(&entry.links.first().cloned().unwrap().href).unwrap();
+                if let Some(archived_entry) = feed_store_feed.info.entries.get_mut(&entry_link) {
+                    if &archived_entry.entry != entry {
+                        changed_info = true;
+                        archived_entry.entry = entry.clone();
+                    }
+
+                    entries.push(archived_entry.clone());
+                } else {
+                    // TODO: should this be done here? or earlier in the fetching process?
+                    let enhanced_entry = FeedStoreEntry::from_entry(
+                        entry.clone(),
+                        entry_link.clone(),
+                        feed_url.clone(),
+                    );
+                    feed_store_feed
+                        .info
+                        .entries
+                        .insert(entry_link.clone(), enhanced_entry.clone());
+                    entries.push(enhanced_entry);
+                }
             }
-            entries.extend(feed.entries.clone());
 
             if entries.len() > 4 * max_entries {
                 entries = trim_entries(entries, max_entries);
             }
             feeds.insert(feed_url.to_string(), feed.clone());
+
+            // If some info from an entry was changed, save the feed info.
+            if changed_info {
+                feed_store_feed
+                    .save_info()
+                    .expect("Failed to save feed info");
+            }
         }
+
         (feeds, trim_entries(entries, max_entries))
     }
 
@@ -286,8 +330,9 @@ impl FeedStore {
     }
 }
 
-fn trim_entries(mut entries: Vec<Entry>, max_entries: usize) -> Vec<Entry> {
-    entries.sort_by_key(|e| std::cmp::Reverse(e.updated.or(e.published).unwrap_or_default()));
+fn trim_entries(mut entries: Vec<FeedStoreEntry>, max_entries: usize) -> Vec<FeedStoreEntry> {
+    entries.sort_by(|a, b| a.mars_date.cmp(&b.mars_date));
+    entries.reverse();
     entries.truncate(max_entries);
     entries
 }
@@ -298,3 +343,34 @@ fn hv(headers: &HeaderMap, key: &str) -> String {
         _ => "".to_string(),
     }
 }
+
+/// A single article in the [FeedStore].
+///
+/// This transformation allows injecting the retrieval date when the
+/// published date is not available in the feed.
+///
+/// Used for ordering by date and accessing source information.
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)]
+pub struct FeedStoreEntry {
+    /// Usually extracted from the feed, but when not present,
+    /// the first retrieval time is used.
+    pub mars_date: DateTime<Utc>,
+    /// [FeedStoreFeed::url]
+    pub mars_source: Url,
+    /// Canonical URL
+    pub mars_url: Url,
+    /// Usual RSS feed entry
+    #[serde(flatten)]
+    pub entry: Entry,
+}
+
+impl FeedStoreEntry {
+    pub fn from_entry(entry: Entry, url: Url, source: Url) -> Self {
+        Self {
+            mars_source: source,
+            mars_url: url,
+            mars_date: entry.published.unwrap_or(Utc::now()),
+            entry,
+        }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 3779967..65da312 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -54,7 +54,7 @@ struct Args {
 }
 
 /// Config to be parsed from toml file given as cmdline option
-#[derive(Deserialize)]
+#[derive(Deserialize, Serialize)]
 struct Config {
     /// to be used as part of the fetchers username header
     bot_name: String,
@@ -101,8 +101,12 @@ pub fn to_checked_pathbuf(dir: &Utf8Path) -> Utf8PathBuf {
 ///
 /// This is a separate struct in case one wants to configure additional
 /// information in the future.
-#[derive(Deserialize)]
+#[derive(Debug, Deserialize, Serialize)]
 struct FeedConfig {
+    /// short name for the feed
+    name: String,
+    /// homepage URL for the website
+    homepage: Url,
     /// url of an ATOM, RSS or Json feed
     url: String,
 }
diff --git a/src/template_engine.rs b/src/template_engine.rs
index f015ee2..06ab50e 100644
--- a/src/template_engine.rs
+++ b/src/template_engine.rs
@@ -18,6 +18,7 @@ pub fn build(config: &Config, feed_store: &mut FeedStore) -> Result<()> {
     let mut context = tera::Context::new();
 
     let (feeds, entries): (HashMap<String, Feed>, _) = feed_store.collect(config.max_entries);
+    context.insert("config", config);
     context.insert("feeds", &feeds);
     context.insert("entries", &entries);
     context.insert("lang", &config.lang);
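
Note on the new ordering and date fallback (illustration, not part of the patch): trim_entries now sorts ascending by mars_date and reverses, so truncation keeps the newest entries, and FeedStoreEntry::from_entry falls back to the retrieval time when a feed omits the published date. A minimal, self-contained sketch of that contract; DatedEntry and the mars_date helper are hypothetical stand-ins, not code from this repository:

    use chrono::{DateTime, TimeZone, Utc};

    // Hypothetical stand-in for FeedStoreEntry, reduced to the ordering field.
    #[derive(Debug)]
    struct DatedEntry {
        mars_date: DateTime<Utc>,
    }

    // Mirrors the new trim_entries: ascending sort by mars_date, then reverse,
    // so the newest entries survive the truncation.
    fn trim_entries(mut entries: Vec<DatedEntry>, max_entries: usize) -> Vec<DatedEntry> {
        entries.sort_by(|a, b| a.mars_date.cmp(&b.mars_date));
        entries.reverse();
        entries.truncate(max_entries);
        entries
    }

    // Mirrors the fallback in FeedStoreEntry::from_entry: use the feed's
    // published date when present, otherwise the first retrieval time ("now"),
    // which the entries cache then keeps stable across runs.
    fn mars_date(published: Option<DateTime<Utc>>) -> DateTime<Utc> {
        published.unwrap_or(Utc::now())
    }

    fn main() {
        let month = |m| Utc.with_ymd_and_hms(2000, m, 1, 0, 0, 0).unwrap();
        let entries = vec![
            DatedEntry { mars_date: month(1) },
            DatedEntry { mars_date: month(3) },
            DatedEntry { mars_date: month(2) },
        ];

        let trimmed = trim_entries(entries, 2);
        assert_eq!(trimmed.len(), 2);
        assert!(trimmed[0].mars_date > trimmed[1].mars_date); // newest first

        assert_eq!(mars_date(Some(month(3))), month(3));
        assert!(mars_date(None) > month(3)); // fallback is the retrieval time
    }

Equivalent, slightly tighter formulations would be sort_by(|a, b| b.mars_date.cmp(&a.mars_date)) without the reverse, and unwrap_or_else(Utc::now) to avoid evaluating Utc::now() when a published date exists; both are cosmetic.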
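Note on #[serde(default)] on the new entries field (illustration, not part of the patch): per-feed settings files written before this change contain no entries table, and the attribute makes them deserialize to an empty map instead of failing. A sketch under simplified assumptions; Info is a hypothetical stand-in for FeedStoreFeedInfo, with String in place of the real DateTime/Url/FeedStoreEntry types:

    use std::collections::BTreeMap;

    use serde::Deserialize;

    // Hypothetical, simplified stand-in for FeedStoreFeedInfo.
    #[derive(Deserialize)]
    struct Info {
        added: String,
        // Missing in pre-upgrade files -> empty map instead of a parse error.
        #[serde(default)]
        entries: BTreeMap<String, String>,
    }

    fn main() {
        // A settings file written before this change has no entries table.
        let old = r#"added = "2024-01-01T00:00:00Z""#;
        let info: Info = toml::from_str(old).expect("old settings files still parse");
        assert_eq!(info.added, "2024-01-01T00:00:00Z");
        assert!(info.entries.is_empty());
    }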