From a15d380bc901a97f53ad71e96af216c0960ffce9 Mon Sep 17 00:00:00 2001 From: selfhoster1312 Date: Mon, 14 Apr 2025 22:33:42 +0200 Subject: [PATCH] meta: start refactoring --- Cargo.lock | 23 ++- Cargo.toml | 2 + src/feed_store.rs | 311 +++++++++++++++++++++++++++-------------- src/fetcher.rs | 46 +----- src/main.rs | 32 ++--- src/template_engine.rs | 9 +- 6 files changed, 244 insertions(+), 179 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3cfb214..13646af 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -182,6 +182,15 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" +[[package]] +name = "camino" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b96ec4966b5813e2c0507c1f86115c8c5abaadc3980879c3424042a02fd1ad3" +dependencies = [ + "serde", +] + [[package]] name = "cc" version = "1.2.9" @@ -199,9 +208,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.39" +version = "0.4.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" +checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c" dependencies = [ "android-tzdata", "iana-time-zone", @@ -209,7 +218,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets", + "windows-link", ] [[package]] @@ -1109,6 +1118,8 @@ name = "planet-mars" version = "0.1.1" dependencies = [ "anyhow", + "camino", + "chrono", "clap", "env_logger", "feed-rs", @@ -1887,6 +1898,12 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-link" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38" + [[package]] name = "windows-sys" version = "0.52.0" diff --git a/Cargo.toml b/Cargo.toml index 400ac83..63919e0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,8 @@ categories = ["web-programming"] [dependencies] anyhow = "1" +camino = { version = "1.1.9", features = ["serde", "serde1"] } +chrono = { version = "0.4.40", features = ["now", "serde"] } clap = { version = "4", features = ["derive"] } env_logger = "0" feed-rs = "2" diff --git a/src/feed_store.rs b/src/feed_store.rs index ac05723..196ec5f 100644 --- a/src/feed_store.rs +++ b/src/feed_store.rs @@ -1,84 +1,204 @@ -use anyhow::bail; use anyhow::Result; +use camino::{Utf8Path, Utf8PathBuf}; +use chrono::{DateTime, Utc}; use feed_rs::model::Entry; use feed_rs::model::Feed; use ron::ser::{to_string_pretty, PrettyConfig}; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::convert::AsRef; use std::fs; -use std::io::BufReader; -use std::path::PathBuf; +use std::time::Instant; use ureq::http::HeaderMap; use ureq::http::Response; use ureq::Body; use url::Url; -#[derive(Deserialize, Serialize, Default)] -pub struct FetchData { - pub etag: String, - pub last_modified: String, +pub fn slugify_url(url: &Url) -> String { + let domain = url.domain().unwrap(); + let query = url.query().unwrap_or(""); + slug::slugify(format!("{domain}{}{query}", url.path())) } -pub struct FeedStore { - pub dir: PathBuf, +/// Stored settings/info about a feed. +#[derive(Debug, Deserialize, Serialize)] +pub struct FeedStoreFeedInfo { + /// First time we added this feed. 
+    ///
+    /// Used for historical purposes only.
+    pub added: DateTime<Utc>,
+    /// Cache data from the last successful fetch, if any.
+    ///
+    /// Used to let the server know whether we need a new copy or not.
+    pub fetch_data: Option<FetchData>,
 }
 
-impl FeedStore {
-    pub fn new(dir: &str) -> Self {
+impl FeedStoreFeedInfo {
+    pub fn new() -> Self {
         Self {
-            dir: super::to_checked_pathbuf(dir),
+            added: Utc::now(),
+            fetch_data: None,
         }
     }
+}
 
-    fn slugify_url(url: &Url) -> Result<String> {
-        let Some(domain) = url.domain() else {
-            bail!("Url has no domain: '{url}'.")
+/// Storage for a single feed.
+///
+/// Holds the feed's cached raw XML, its fetch metadata ([FeedStoreFeedInfo]),
+/// and the on-disk paths where both are kept.
+#[derive(Debug, Deserialize, Serialize)]
+pub struct FeedStoreFeed {
+    /// The feed URL.
+    pub url: Url,
+    /// Where it's stored; should be inside [FeedStore::_dir].
+    pub dir: Utf8PathBuf,
+    /// Raw feed path.
+    pub path_feed: Utf8PathBuf,
+    /// Raw feed RON path.
+    pub path_feed_ron: Utf8PathBuf,
+    /// Settings path.
+    pub path_settings: Utf8PathBuf,
+    /// Detailed settings/info about a feed.
+    pub info: FeedStoreFeedInfo,
+    /// Stored copy of the raw XML feed (if any).
+    pub raw_feed: Option<String>,
+}
+
+impl FeedStoreFeed {
+    pub fn new(basedir: &Utf8Path, url: &Url) -> Self {
+        let dir = basedir.join(slugify_url(url));
+        if !dir.is_dir() {
+            std::fs::create_dir_all(&dir).unwrap();
+        }
+
+        let path_settings = dir.join("settings.toml");
+        let info: FeedStoreFeedInfo = match std::fs::read_to_string(&path_settings) {
+            Ok(s) => toml::from_str(&s).unwrap(),
+            Err(_e) => {
+                // Assume the file has not been created yet; initialize it.
+                let info = FeedStoreFeedInfo::new();
+                std::fs::write(&path_settings, toml::to_string(&info).unwrap()).unwrap();
+                info
+            }
         };
-        let query = url.query().unwrap_or("");
-        Ok(slug::slugify(format!("{domain}{}{query}", url.path())))
-    }
 
-    fn generic_path(&self, url: &Url, ext: &str) -> Result<String> {
-        Ok(format!(
-            "{}/{}{ext}",
-            self.dir.display(),
-            Self::slugify_url(url)?
-        ))
-    }
+        let raw_feed: Option<String> = std::fs::read_to_string(dir.join("feed.xml")).ok();
 
-    fn feed_path(&self, url: &Url) -> Result<String> {
-        self.generic_path(url, "")
-    }
-
-    fn fetchdata_path(&self, url: &Url) -> Result<String> {
-        self.generic_path(url, ".toml")
-    }
-
-    pub fn load_fetchdata(&self, url: &Url) -> Result<FetchData> {
-        let path = self.fetchdata_path(url)?;
-        if !fs::exists(path.clone())? {
-            return Ok(FetchData::default());
+        Self {
+            dir: dir.clone(),
+            path_feed: dir.join("feed.xml"),
+            path_feed_ron: dir.join("feed.ron"),
+            path_settings: dir.join("settings.toml"),
+            url: url.clone(),
+            info,
+            raw_feed,
         }
-        Ok(toml::from_str(&fs::read_to_string(path)?)?)
     }
 
-    fn has_changed(&self, url: &Url, new_feed: &Feed) -> Result<bool> {
-        let Some(old_feed) = self.load_feed(url, false)? else {
-            return Ok(true);
+    pub fn load_fetchdata(&self) -> Option<&FetchData> {
+        self.info.fetch_data.as_ref()
+    }
+
+    pub fn load_feed(&self, sanitize: bool) -> Option<Feed> {
+        if let Some(raw_feed) = &self.raw_feed {
+            let parser = feed_rs::parser::Builder::new()
+                .sanitize_content(sanitize)
+                .build();
+            Some(parser.parse(raw_feed.as_bytes()).unwrap())
+        } else {
+            None
+        }
+    }
+
+    pub fn has_changed(&self, new_feed: &Feed) -> bool {
+        let Some(old_feed) = self.load_feed(false) else {
+            return true;
         };
         let mut old_iter = old_feed.entries.iter();
         for new in &new_feed.entries {
             let Some(old) = old_iter.next() else {
-                return Ok(true);
+                return true;
             };
             if old != new {
-                return Ok(true);
+                return true;
             }
         }
         // ignoring any entries left in old_iter
-        Ok(false)
+        false
+    }
+
+    pub fn store(&mut self, mut response: Response<Body>) -> Result<bool> {
+        let headers = response.headers();
+        let fetchdata = FetchData {
+            etag: hv(headers, "etag"),
+            last_modified: hv(headers, "last-modified"), // HTTP header name uses a hyphen
+            when: Utc::now(),
+        };
+
+        let body = response.body_mut().with_config().read_to_vec()?;
+        let feed = match feed_rs::parser::parse(body.as_slice()) {
+            Ok(f) => f,
+            Err(e) => {
+                warn!("Error when parsing feed for {}: {e:?}", self.url);
+                return Ok(false);
+            }
+        };
+        if !self.has_changed(&feed) {
+            return Ok(false);
+        }
+        debug!("Storing feed for {}.", self.url);
+        // todo don't serialize to string but to writer
+        Self::write(
+            &self.path_feed_ron,
+            to_string_pretty(&feed, PrettyConfig::default())?,
+        )?;
+        Self::write(&self.path_feed, body)?;
+        // Persist the new fetch metadata.
+        self.info.fetch_data = Some(fetchdata);
+        Self::write(&self.path_settings, toml::to_string(&self.info)?)?;
+        Ok(true)
+    }
+
+    pub fn fetch(&mut self, fetcher: &super::Fetcher) -> Result<bool> {
+        let mut builder = fetcher
+            .agent
+            .get(self.url.to_string())
+            .header("FROM", fetcher.from.clone());
+
+        if let Some(fetchdata) = self.load_fetchdata() {
+            if !fetchdata.etag.is_empty() {
+                builder = builder.header("If-None-Match", fetchdata.etag.clone());
+            }
+            if !fetchdata.last_modified.is_empty() {
+                builder = builder.header("If-Modified-Since", fetchdata.last_modified.clone());
+            }
+        }
+
+        let start_instant = Instant::now();
+        let result = builder.call();
+        let duration = start_instant.elapsed();
+
+        let response = result?;
+        debug!(
+            "fetched with status {} in {} ms: {}",
+            response.status(),
+            duration.as_millis(),
+            self.url,
+        );
+        let status = response.status();
+        match status.as_u16() {
+            304 => Ok(false), // Not Modified -> nothing to do
+            200 => self.store(response),
+            _ => {
+                warn!(
+                    "HTTP Status {} not implemented for {}",
+                    response.status(),
+                    self.url,
+                );
+                Ok(false)
+            }
+        }
     }
 
     fn write<P: AsRef<std::path::Path> + std::fmt::Display, C: AsRef<[u8]>>(
@@ -90,85 +210,66 @@ impl FeedStore {
         }
         fs::write(path, contents)
     }
+}
 
-    pub fn store(&self, url: &Url, mut response: Response<Body>) -> Result<bool> {
-        let headers = response.headers();
-        let fetchdata = FetchData {
-            etag: hv(headers, "etag"),
-            last_modified: hv(headers, "last_modified"),
-        };
+#[derive(Clone, Debug, Deserialize, Serialize, Default)]
+pub struct FetchData {
+    pub when: DateTime<Utc>,
+    pub etag: String,
+    pub last_modified: String,
+}
 
-        let body = response.body_mut().with_config().read_to_vec()?;
-        let feed = match feed_rs::parser::parse(body.as_slice()) {
-            Ok(f) => f,
-            Err(e) => {
-                warn!("Error when parsing feed for {url}: {e:?}");
-                return Ok(false);
-            }
-        };
-        if !self.has_changed(url, &feed)? {
-            return Ok(false);
+#[derive(Debug)]
+pub struct FeedStore {
+    pub _dir: Utf8PathBuf,
+    pub feeds: BTreeMap<Url, FeedStoreFeed>,
+}
+
+impl FeedStore {
+    pub fn new(dir: &str, feedlist: &Vec<super::FeedConfig>) -> Self {
+        let dir = super::to_checked_pathbuf(dir);
+        let mut feeds: BTreeMap<Url, FeedStoreFeed> = BTreeMap::new();
+
+        for feed_config in feedlist {
+            let feed_url = Url::parse(&feed_config.url).unwrap();
+            feeds.insert(feed_url.clone(), FeedStoreFeed::new(&dir, &feed_url));
         }
-        debug!("Storing feed for {url}.");
-        // todo don't serialize to string but to writer
-        Self::write(
-            self.generic_path(url, ".ron")?,
-            to_string_pretty(&feed, PrettyConfig::default())?,
-        )?;
-        Self::write(self.feed_path(url)?, body)?;
-        Self::write(self.fetchdata_path(url)?, toml::to_string(&fetchdata)?)?;
-        Ok(true)
+
+        Self { _dir: dir, feeds }
     }
 
-    fn load_feed(&self, url: &Url, sanitize: bool) -> Result<Option<Feed>> {
-        let parser = feed_rs::parser::Builder::new()
-            .sanitize_content(sanitize)
-            .build();
-
-        let path = self.feed_path(url)?;
-        if !fs::exists(path.clone())? {
-            return Ok(None);
-        }
-        let file = fs::File::open(path)?;
-        Ok(Some(parser.parse(BufReader::new(file))?))
-    }
-
-    pub fn collect(
-        &self,
-        feed_configs: &Vec<super::FeedConfig>,
-        max_entries: usize,
-    ) -> (HashMap<String, Feed>, Vec<Entry>) {
+    pub fn collect(&mut self, max_entries: usize) -> (HashMap<String, Feed>, Vec<Entry>) {
         let mut feeds = HashMap::new();
         let mut entries = Vec::new();
 
-        for feed_config in feed_configs {
-            let mut feed = match (|| {
-                let url = Url::parse(&feed_config.url)?;
-                self.load_feed(&url, true)
-            })() {
-                Err(e) => {
-                    warn!(
-                        "Problem parsing feed file for feed {}: {e:?}",
-                        feed_config.url
-                    );
-                    continue;
-                }
-                Ok(None) => continue,
-                Ok(Some(f)) => f,
+        for (feed_url, feed_store_feed) in self.feeds.iter_mut() {
+            let Some(mut feed) = feed_store_feed.load_feed(true) else {
+                warn!("Problem parsing feed file for feed {}", feed_url);
+                continue;
             };
+
             for entry in &mut feed.entries {
-                entry.source = Some(feed_config.url.clone());
+                entry.source = Some(feed_url.to_string());
             }
             entries.append(&mut std::mem::take(&mut feed.entries));
-            feeds.insert(feed_config.url.clone(), feed);
-
             // optimization to reduce memory usage
             if entries.len() > 4 * max_entries {
                 entries = trim_entries(entries, max_entries);
             }
+
+            feeds.insert(feed_url.to_string(), feed.clone());
         }
         (feeds, trim_entries(entries, max_entries))
     }
+
+    pub fn fetch(&mut self, fetcher: &super::Fetcher) -> Result<bool> {
+        let mut rebuild = false;
+        for (_url, feed) in self.feeds.iter_mut() {
+            rebuild |= feed.fetch(fetcher)?;
+        }
+
+        Ok(rebuild)
+    }
 }
 
 fn trim_entries(mut entries: Vec<Entry>, max_entries: usize) -> Vec<Entry> {
diff --git a/src/fetcher.rs b/src/fetcher.rs
index 0cf7f26..4445083 100644
--- a/src/fetcher.rs
+++ b/src/fetcher.rs
@@ -1,15 +1,10 @@
-use anyhow::Result;
-use std::time::Instant;
 use ureq::tls::{TlsConfig, TlsProvider};
 use ureq::Agent;
-use url::Url;
-
-use crate::FeedStore;
 
 pub struct Fetcher {
-    agent: Agent,
+    pub agent: Agent,
     /// FROM header for requests
-    from: String,
+    pub from: String,
 }
 
 impl Fetcher {
@@ -36,41 +31,4 @@ impl Fetcher {
             from: from.to_string(),
         }
     }
-
-    pub fn fetch(&self, url: Url, feed_store: &FeedStore) -> Result<bool> {
-        let fetchdata = feed_store.load_fetchdata(&url)?;
-        let mut builder = self
-            .agent
-            .get(url.to_string())
-            .header("FROM", self.from.clone());
-        if !fetchdata.etag.is_empty() {
-            builder = builder.header("If-None-Match", fetchdata.etag);
-        }
-        if !fetchdata.last_modified.is_empty() {
-            builder = builder.header("If-Modified-Since", fetchdata.last_modified);
-        }
-
-        let start_instant = Instant::now();
-        let result = builder.call();
-        let duration = start_instant.elapsed();
-
-        let response = result?;
-        debug!(
-            "fetched with status {} in {} ms: {url}",
-            response.status(),
-            duration.as_millis()
-        );
-        let status = response.status();
-        match status.as_u16() {
-            304 => Ok(false), // Not Modified -> nothing to do
-            200 => feed_store.store(&url, response),
-            _ => {
-                warn!(
-                    "HTTP Status {} not implemented for {url}",
-                    response.status()
-                );
-                Ok(false)
-            }
-        }
-    }
 }
diff --git a/src/main.rs b/src/main.rs
index 42ce8c3..d7d817c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -26,11 +26,10 @@ extern crate log;
 use crate::feed_store::FeedStore;
 use crate::fetcher::Fetcher;
 use anyhow::Result;
+use camino::Utf8PathBuf;
 use clap::Parser;
 use serde::Deserialize;
 use std::fs;
-use std::path::PathBuf;
-use url::Url;
 
 //mod atom_serializer;
 mod feed_store;
@@ -70,13 +69,13 @@ struct Config {
     max_entries: usize,
 }
 
-pub fn to_checked_pathbuf(dir: &str) -> PathBuf {
-    let dir: PathBuf = PathBuf::from(dir);
+pub fn to_checked_pathbuf(dir: &str) -> Utf8PathBuf {
+    let dir = Utf8PathBuf::from(dir);
 
     let m = dir
         .metadata()
-        .unwrap_or_else(|_| panic!("Could not get metadata of dir: {}", dir.display()));
-    assert!(m.is_dir(), "Not a dir: {}", dir.display());
+        .unwrap_or_else(|_| panic!("Could not get metadata of dir: {dir}"));
+    assert!(m.is_dir(), "Not a dir: {dir}");
     dir
 }
 
@@ -90,20 +89,9 @@ struct FeedConfig {
     url: String,
 }
 
-fn fetch(config: &Config, feed_store: &FeedStore) -> Result<bool> {
+fn fetch(config: &Config, feed_store: &mut FeedStore) -> Result<bool> {
     let fetcher = Fetcher::new(&config.bot_name, &config.from);
-    let mut rebuild = false;
-    for feed in &config.feeds {
-        let url = match Url::parse(&feed.url) {
-            Ok(x) => x,
-            Err(e) => {
-                error!("Error parsing url '{}': {e:?}", feed.url);
-                continue;
-            }
-        };
-        rebuild |= fetcher.fetch(url, feed_store)?;
-    }
-    info!("Done fetching. Rebuild needed: {rebuild}");
+    let rebuild = feed_store.fetch(&fetcher)?;
     Ok(rebuild)
 }
 
@@ -122,15 +110,15 @@ fn main() -> Result<()> {
     let _ = to_checked_pathbuf(&config.templates_dir);
     let _ = to_checked_pathbuf(&config.out_dir);
 
-    let feed_store = FeedStore::new(&config.feed_dir);
+    let mut feed_store = FeedStore::new(&config.feed_dir, &config.feeds);
 
     let should_build = if args.no_fetch {
         true
     } else {
-        fetch(&config, &feed_store)?
+        fetch(&config, &mut feed_store)?
     };
 
     if should_build {
-        template_engine::build(&config, &feed_store)?;
+        template_engine::build(&config, &mut feed_store)?;
     }
     Ok(())
 }
diff --git a/src/template_engine.rs b/src/template_engine.rs
index aecead9..50ff93c 100644
--- a/src/template_engine.rs
+++ b/src/template_engine.rs
@@ -7,13 +7,12 @@ use std::collections::HashMap;
 use std::fs::File;
 use tera::{from_value, Tera};
 
-pub fn build(config: &Config, feed_store: &FeedStore) -> Result<()> {
+pub fn build(config: &Config, feed_store: &mut FeedStore) -> Result<()> {
     let mut tera = create_tera(&config.templates_dir)?;
     let out_dir = to_checked_pathbuf(&config.out_dir);
 
     let mut context = tera::Context::new();
-    let (feeds, entries): (HashMap<String, Feed>, _) =
-        feed_store.collect(&config.feeds, config.max_entries);
+    let (feeds, entries): (HashMap<String, Feed>, _) = feed_store.collect(config.max_entries);
     context.insert("feeds", &feeds);
     context.insert("entries", &entries);
     context.insert("PKG_AUTHORS", env!("CARGO_PKG_AUTHORS"));
@@ -24,7 +23,7 @@ pub fn build(config: &Config, feed_store: &FeedStore) -> Result<()> {
 
     for name in tera.get_template_names() {
         debug!("Processing template {name}");
-        let file = File::create(format!("{}/{name}", out_dir.display()))?;
+        let file = File::create(format!("{out_dir}/{name}"))?;
         tera.render_to(name, &context, file)?;
     }
     Ok(())
@@ -32,7 +31,7 @@ pub fn build(config: &Config, feed_store: &FeedStore) -> Result<()> {
 
 fn create_tera(templates_dir: &str) -> Result<Tera> {
     let dir = to_checked_pathbuf(templates_dir);
-    let mut tera = tera::Tera::new(&format!("{}/*", &dir.display()))?;
+    let mut tera = tera::Tera::new(&format!("{dir}/*"))?;
     // disable autoescape as this would corrupt urls or the entry contents. todo check this!
     tera.autoescape_on(vec![]);
     Ok(tera)
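
Reviewer sketch (not part of the patch): the whole update cycle after this
refactor, condensed into one hypothetical helper mirroring src/main.rs above.
`Config` is this crate's config struct; error handling matches main().

    use anyhow::Result;

    fn update(config: &Config) -> Result<()> {
        // One FeedStoreFeed per configured URL, keyed by Url in a BTreeMap.
        let mut feed_store = FeedStore::new(&config.feed_dir, &config.feeds);
        let fetcher = Fetcher::new(&config.bot_name, &config.from);

        // Each feed replays its stored ETag / Last-Modified values as
        // If-None-Match / If-Modified-Since; a 304 reply leaves the store
        // untouched. `rebuild` is true iff at least one feed body changed.
        let rebuild = feed_store.fetch(&fetcher)?;
        if rebuild {
            template_engine::build(config, &mut feed_store)?;
        }
        Ok(())
    }

The per-feed state (FetchData: `when`, `etag`, `last_modified`) round-trips
through settings.toml, so conditional requests keep working across runs.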
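On the new on-disk layout: each feed gets its own directory under feed_dir,
named by slugify_url(), holding feed.xml (the raw fetched body), feed.ron
(the parsed feed as pretty-printed RON) and settings.toml (the serialized
FeedStoreFeedInfo). A minimal test sketch of the slug scheme, assuming the
slug crate's default hyphenation; the URL is a made-up example:

    #[cfg(test)]
    mod tests {
        use super::slugify_url;
        use url::Url;

        #[test]
        fn feed_dir_name_slugifies_domain_path_and_query() {
            // Domain, path and query are concatenated, then slugified
            // into a filesystem-safe directory name.
            let url = Url::parse("https://example.org/blog/feed.xml").unwrap();
            assert_eq!(slugify_url(&url), "example-org-blog-feed-xml");
        }
    }

Note that slugify_url() unwraps url.domain(), so a feed URL whose host is a
bare IP address panics here, where the removed version bailed with an error.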