Compare commits

..

No commits in common. "5271c4c9aaf567afa3bd207a79cf6cae4b3bc363" and "5c629bce7af392f6b910c82ff0ed37dd614a8dc6" have entirely different histories.

11 changed files with 219 additions and 325 deletions

2
.gitignore vendored
View File

@ -1,4 +1,2 @@
/target /target
/mars.toml /mars.toml
/out
/feeds

67
Cargo.lock generated
View File

@ -128,9 +128,9 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
version = "2.7.0" version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1be3f42a67d6d345ecd59f675f3f012d6974981560836e938c22b424b85ce1be" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
dependencies = [ dependencies = [
"serde", "serde",
] ]
@ -182,20 +182,11 @@ version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b"
[[package]]
name = "camino"
version = "1.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b96ec4966b5813e2c0507c1f86115c8c5abaadc3980879c3424042a02fd1ad3"
dependencies = [
"serde",
]
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.2.9" version = "1.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8293772165d9345bdaaa39b45b2109591e63fe5e6fbc23c6ff930a048aa310b" checksum = "a012a0df96dd6d06ba9a1b29d6402d1a5d77c6befd2566afdc26e10603dc93d7"
dependencies = [ dependencies = [
"shlex", "shlex",
] ]
@ -208,9 +199,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]] [[package]]
name = "chrono" name = "chrono"
version = "0.4.40" version = "0.4.39"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c" checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825"
dependencies = [ dependencies = [
"android-tzdata", "android-tzdata",
"iana-time-zone", "iana-time-zone",
@ -218,7 +209,7 @@ dependencies = [
"num-traits", "num-traits",
"serde", "serde",
"wasm-bindgen", "wasm-bindgen",
"windows-link", "windows-targets",
] ]
[[package]] [[package]]
@ -245,9 +236,9 @@ dependencies = [
[[package]] [[package]]
name = "clap" name = "clap"
version = "4.5.26" version = "4.5.25"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8eb5e908ef3a6efbe1ed62520fb7287959888c88485abe072543190ecc66783" checksum = "b95dca1b68188a08ca6af9d96a6576150f598824bdb528c1190460c2940a0b48"
dependencies = [ dependencies = [
"clap_builder", "clap_builder",
"clap_derive", "clap_derive",
@ -255,9 +246,9 @@ dependencies = [
[[package]] [[package]]
name = "clap_builder" name = "clap_builder"
version = "4.5.26" version = "4.5.25"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96b01801b5fc6a0a232407abc821660c9c6d25a1cafc0d4f85f29fb8d9afc121" checksum = "9ab52925392148efd3f7562f2136a81ffb778076bcc85727c6e020d6dd57cf15"
dependencies = [ dependencies = [
"anstream", "anstream",
"anstyle", "anstyle",
@ -1115,15 +1106,14 @@ checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
[[package]] [[package]]
name = "planet-mars" name = "planet-mars"
version = "0.1.1" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"camino",
"chrono",
"clap", "clap",
"env_logger", "env_logger",
"feed-rs", "feed-rs",
"log", "log",
"quick-xml",
"ron", "ron",
"serde", "serde",
"slug", "slug",
@ -1150,9 +1140,9 @@ dependencies = [
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.93" version = "1.0.92"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
dependencies = [ dependencies = [
"unicode-ident", "unicode-ident",
] ]
@ -1165,6 +1155,7 @@ checksum = "165859e9e55f79d67b96c5d96f4e88b6f2695a1972849c15a6a3f5c59fc2c003"
dependencies = [ dependencies = [
"encoding_rs", "encoding_rs",
"memchr", "memchr",
"serde",
] ]
[[package]] [[package]]
@ -1277,9 +1268,9 @@ dependencies = [
[[package]] [[package]]
name = "rustls" name = "rustls"
version = "0.23.21" version = "0.23.20"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f287924602bf649d949c63dc8ac8b235fa5387d394020705b80c4eb597ce5b8" checksum = "5065c3f250cbd332cd894be57c40fa52387247659b14a2d6041d121547903b1b"
dependencies = [ dependencies = [
"log", "log",
"once_cell", "once_cell",
@ -1469,9 +1460,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]] [[package]]
name = "syn" name = "syn"
version = "2.0.96" version = "2.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" checksum = "46f71c0377baf4ef1cc3e3402ded576dccc315800fbc62dfc7fe04b009773b4a"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -1527,18 +1518,18 @@ dependencies = [
[[package]] [[package]]
name = "thiserror" name = "thiserror"
version = "2.0.11" version = "2.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" checksum = "a3ac7f54ca534db81081ef1c1e7f6ea8a3ef428d2fc069097c079443d24124d3"
dependencies = [ dependencies = [
"thiserror-impl", "thiserror-impl",
] ]
[[package]] [[package]]
name = "thiserror-impl" name = "thiserror-impl"
version = "2.0.11" version = "2.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" checksum = "9e9465d30713b56a37ede7185763c3492a91be2f5fa68d958c44e41ab9248beb"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -1898,12 +1889,6 @@ dependencies = [
"windows-targets", "windows-targets",
] ]
[[package]]
name = "windows-link"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38"
[[package]] [[package]]
name = "windows-sys" name = "windows-sys"
version = "0.52.0" version = "0.52.0"
@ -1988,9 +1973,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]] [[package]]
name = "winnow" name = "winnow"
version = "0.6.24" version = "0.6.22"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a" checksum = "39281189af81c07ec09db316b302a3e67bf9bd7cbf6c820b50e35fee9c2fa980"
dependencies = [ dependencies = [
"memchr", "memchr",
] ]

View File

@ -1,6 +1,6 @@
[package] [package]
name = "planet-mars" name = "planet-mars"
version = "0.1.1" version = "0.1.0"
edition = "2021" edition = "2021"
authors = ["Thomas Koch <thomas@koch.ro>"] authors = ["Thomas Koch <thomas@koch.ro>"]
description = "Feed aggregation planet like Planet Venus, produces static HTML and ATOM feed from fetched feeds." description = "Feed aggregation planet like Planet Venus, produces static HTML and ATOM feed from fetched feeds."
@ -10,18 +10,16 @@ keywords = ["atom", "rss", "planet", "feed", "blogging"]
categories = ["web-programming"] categories = ["web-programming"]
[dependencies] [dependencies]
anyhow = "1" anyhow = "*"
camino = { version = "1.1.9", features = ["serde", "serde1"] } clap = { version = "*", features = ["derive"] }
chrono = { version = "0.4.40", features = ["now", "serde"] } env_logger = "*"
clap = { version = "4", features = ["derive"] } feed-rs = "*"
env_logger = "0" log = "*"
feed-rs = "2" ron = "*" # todo for development, to check atom-rs internal representation of feeds
log = "0" serde = { version = "*", features = ["derive"] }
ron = "0" slug = "*"
serde = { version = "1", features = ["derive"] } tera = "*"
slug = "0" toml = "*"
tera = "1"
toml = "0"
ureq = { version = "3.0.0-rc5", features = ["brotli", "charset", "gzip", "native-tls"]} ureq = { version = "3.0.0-rc5", features = ["brotli", "charset", "gzip", "native-tls"]}
url = "2" url = "*"
quick-xml = { version = "*", features = ["serialize"] }

1
OWNERS
View File

@ -1 +0,0 @@
thk

View File

@ -1,24 +1,13 @@
Simple successor to Planet Venus but in Rust and maintained. Simple planet like Planet Venus but in Rust and maintained.
Please see the rustdoc of main.rs for further information. Please see the rustdoc of main.rs for further information.
## Todo ## todo
* find and use a nice lib to process the config file * use a nice lib to process the config file
* should check whether dirs exist and are writable * should check whether dirs exist and are writable
* should check whether feed urls can be parsed * should check whether feed urls can be parsed
## Planet Venus
Planet Venus is used by many planets on the internet. However, its code has not
been maintained since ~2011 and it uses Python 2.
Planet Mars should be a lightweight successor to Planet Venus.
Still, the Planet Venus documentation contains some useful information on
[Etiquette](https://intertwingly.net/code/venus/docs/etiquette.html) for
Planet hosters.
## Credits ## Credits
While writing this, I read and also copied code from: While writing this, I read and also copied code from:

View File

@ -1,17 +0,0 @@
{ depot, pkgs, ... }:
pkgs.rustPlatform.buildRustPackage {
name = "planet-mars";
src = depot.third_party.gitignoreSource ./.;
cargoLock.lockFile = ./Cargo.lock;
nativeBuildInputs = [ pkgs.pkg-config ];
buildInputs = [ pkgs.openssl ];
# planet-mars is mirrored to GitHub.
passthru.meta.ci.extraSteps.github = depot.tools.releases.filteredGitPush {
filter = ":/web/planet-mars";
remote = "git@github.com:thkoch2001/planet-mars.git";
ref = "refs/heads/master";
};
}

View File

@ -3,7 +3,6 @@ feed_dir = "/var/lib/planet-mars/feeds"
from = "thomas@koch.ro" from = "thomas@koch.ro"
out_dir = "/var/lib/planet-mars/out" out_dir = "/var/lib/planet-mars/out"
templates_dir = "/var/lib/planet-mars/templates" templates_dir = "/var/lib/planet-mars/templates"
max_entries = 50
[[feeds]] [[feeds]]
url = "https://blog.fefe.de/rss.xml" url = "https://blog.fefe.de/rss.xml"

View File

@ -1,213 +1,84 @@
use anyhow::bail;
use anyhow::Result; use anyhow::Result;
use camino::{Utf8Path, Utf8PathBuf};
use chrono::{DateTime, Duration, Utc};
use feed_rs::model::Entry; use feed_rs::model::Entry;
use feed_rs::model::Feed; use feed_rs::model::Feed;
use ron::ser::{to_string_pretty, PrettyConfig}; use ron::ser::{to_string_pretty, PrettyConfig};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::{BTreeMap, HashMap}; use std::collections::HashMap;
use std::convert::AsRef; use std::convert::AsRef;
use std::fs; use std::fs;
use std::time::Instant; use std::io::BufReader;
use std::path::PathBuf;
use ureq::http::HeaderMap; use ureq::http::HeaderMap;
use ureq::http::Response; use ureq::http::Response;
use ureq::Body; use ureq::Body;
use url::Url; use url::Url;
pub fn slugify_url(url: &Url) -> String { #[derive(Deserialize, Serialize, Default)]
let domain = url.domain().unwrap(); pub struct FetchData {
let query = url.query().unwrap_or(""); pub etag: String,
slug::slugify(format!("{domain}{}{query}", url.path())) pub last_modified: String,
} }
/// Stored settings/info about a feed. pub struct FeedStore {
#[derive(Debug, Deserialize, Serialize)] pub dir: PathBuf,
pub struct FeedStoreFeedInfo {
/// First time we added this feed.
///
/// Used for historical purposes only.
pub added: DateTime<Utc>,
/// Last known cached entry, if any.
///
/// Used to let the server know whether we need a new entry or not.
pub fetch_data: Option<FetchData>,
} }
impl FeedStoreFeedInfo { impl FeedStore {
pub fn new() -> Self { pub fn new(dir: &str) -> Self {
Self { Self {
added: Utc::now(), dir: super::to_checked_pathbuf(dir),
fetch_data: None,
} }
} }
}
/// Storage for a single feed. fn slugify_url(url: &Url) -> Result<String> {
/// let Some(domain) = url.domain() else {
/// Contains one [FeedStoreVersion] for every time the feed has been successfully fetched, bail!("Url has no domain: '{url}'.")
/// and one [FeedStoreEntry] for each article referenced throughout the entries.
#[derive(Debug, Deserialize, Serialize)]
pub struct FeedStoreFeed {
/// The feed URL
pub url: Url,
/// Where it's stored, should be inside the [FeedStore::dir].
pub dir: Utf8PathBuf,
/// Raw feed path
pub path_feed: Utf8PathBuf,
/// Raw feed RON path
pub path_feed_ron: Utf8PathBuf,
/// Settings path
pub path_settings: Utf8PathBuf,
/// Detailed settings/info about a feed.
pub info: FeedStoreFeedInfo,
/// Stored copy of the raw XML feed (if any)
pub raw_feed: Option<String>,
}
impl FeedStoreFeed {
pub fn new(basedir: &Utf8Path, url: &Url) -> Self {
let dir = basedir.join(slugify_url(url));
if !dir.is_dir() {
std::fs::create_dir_all(&dir).unwrap();
}
let path_settings = dir.join("settings.toml");
let info: FeedStoreFeedInfo = match std::fs::read_to_string(&path_settings) {
Ok(s) => toml::from_str(&s).unwrap(),
Err(_e) => {
// Assume file has not been created yet. Initialize
let info = FeedStoreFeedInfo::new();
std::fs::write(&path_settings, toml::to_string(&info).unwrap()).unwrap();
info
}
}; };
let query = url.query().unwrap_or("");
Ok(slug::slugify(format!("{domain}{}{query}", url.path())))
}
let raw_feed: Option<String> = std::fs::read_to_string(dir.join("feed.xml")).ok(); fn generic_path(&self, url: &Url, ext: &str) -> Result<String> {
Ok(format!(
"{}/{}{ext}",
self.dir.display(),
Self::slugify_url(url)?
))
}
Self { fn feed_path(&self, url: &Url) -> Result<String> {
dir: dir.clone(), self.generic_path(url, "")
path_feed: dir.join("feed.xml"), }
path_feed_ron: dir.join("feed.ron"),
path_settings: dir.join("settings.toml"), fn fetchdata_path(&self, url: &Url) -> Result<String> {
url: url.clone(), self.generic_path(url, ".toml")
info, }
raw_feed,
pub fn load_fetchdata(&self, url: &Url) -> Result<FetchData> {
let path = self.fetchdata_path(url)?;
if !fs::exists(path.clone())? {
return Ok(FetchData::default());
} }
Ok(toml::from_str(&fs::read_to_string(path)?)?)
} }
pub fn load_fetchdata(&self) -> Option<&FetchData> { fn has_changed(&self, url: &Url, new_feed: &Feed) -> Result<bool> {
self.info.fetch_data.as_ref() let Some(old_feed) = self.load_feed(url, false)? else {
} return Ok(true);
pub fn load_feed(&self, sanitize: bool) -> Option<Feed> {
if let Some(raw_feed) = &self.raw_feed {
let parser = feed_rs::parser::Builder::new()
.sanitize_content(sanitize)
.build();
Some(parser.parse(raw_feed.as_bytes()).unwrap())
} else {
None
}
}
pub fn has_changed(&self, new_feed: &Feed) -> bool {
let Some(old_feed) = self.load_feed(false) else {
return true;
}; };
let mut old_iter = old_feed.entries.iter(); let mut old_iter = old_feed.entries.iter();
for new in &new_feed.entries { for new in &new_feed.entries {
let Some(old) = old_iter.next() else { let Some(old) = old_iter.next() else {
return true; return Ok(true);
}; };
if old != new { if old != new {
return true; return Ok(true);
} }
} }
// ignoring any entries left in old_iter // ignoring any entries left in old_iter
false Ok(false)
}
pub fn store(&mut self, mut response: Response<Body>) -> Result<bool> {
let headers = response.headers();
let fetchdata = FetchData {
etag: hv(headers, "etag"),
last_modified: hv(headers, "last_modified"),
when: Utc::now(),
};
let body = response.body_mut().with_config().read_to_vec()?;
let feed = match feed_rs::parser::parse(body.as_slice()) {
Ok(f) => f,
Err(e) => {
warn!("Error when parsing feed for {}: {e:?}", self.url);
return Ok(false);
}
};
debug!("Storing fetchdata for {}", self.url);
self.info.fetch_data = Some(fetchdata);
Self::write(&self.path_settings, toml::to_string(&self.info)?)?;
if !self.has_changed(&feed) {
return Ok(false);
}
debug!("Storing feed for {}.", self.url);
// todo don't serialize to string but to writer
Self::write(
&self.path_feed_ron,
to_string_pretty(&feed, PrettyConfig::default())?,
)?;
Self::write(&self.path_feed, body)?;
Ok(true)
}
/// refresh in hours
pub fn fetch(&mut self, fetcher: &super::Fetcher, refresh: usize) -> Result<bool> {
let mut builder = fetcher
.agent
.get(self.url.to_string())
.header("FROM", fetcher.from.clone());
if let Some(fetchdata) = self.load_fetchdata() {
if !fetchdata.etag.is_empty() {
builder = builder.header("If-None-Match", fetchdata.etag.clone());
}
if !fetchdata.last_modified.is_empty() {
builder = builder.header("If-Modified-Since", fetchdata.last_modified.clone());
}
// Check if we have hit time for refresh
if fetchdata.when + Duration::try_hours(refresh as i64).unwrap() >= Utc::now() {
// No need to rebuild, check again later
return Ok(false);
}
}
let start_instant = Instant::now();
let result = builder.call();
let duration = start_instant.elapsed();
let response = result?;
debug!(
"fetched with status {} in {} ms: {}",
response.status(),
duration.as_millis(),
self.url,
);
let status = response.status();
match status.as_u16() {
304 => Ok(false), // Not Modified -> nothing to do
200 => self.store(response),
_ => {
warn!(
"HTTP Status {} not implemented for {}",
response.status(),
self.url,
);
Ok(false)
}
}
} }
fn write<P: AsRef<std::path::Path> + std::fmt::Display, C: AsRef<[u8]>>( fn write<P: AsRef<std::path::Path> + std::fmt::Display, C: AsRef<[u8]>>(
@ -219,66 +90,85 @@ impl FeedStoreFeed {
} }
fs::write(path, contents) fs::write(path, contents)
} }
}
#[derive(Clone, Debug, Deserialize, Serialize, Default)] pub fn store(&self, url: &Url, mut response: Response<Body>) -> Result<bool> {
pub struct FetchData { let headers = response.headers();
pub when: DateTime<Utc>, let fetchdata = FetchData {
pub etag: String, etag: hv(headers, "etag"),
pub last_modified: String, last_modified: hv(headers, "last_modified"),
} };
#[derive(Debug)] let body = response.body_mut().with_config().read_to_vec()?;
pub struct FeedStore { let feed = match feed_rs::parser::parse(body.as_slice()) {
pub _dir: Utf8PathBuf, Ok(f) => f,
pub feeds: BTreeMap<Url, FeedStoreFeed>, Err(e) => {
} warn!("Error when parsing feed for {url}: {e:?}");
return Ok(false);
impl FeedStore { }
pub fn new(dir: &str, feedlist: &Vec<super::FeedConfig>) -> Self { };
let dir = super::to_checked_pathbuf(dir); if !self.has_changed(url, &feed)? {
let mut feeds: BTreeMap<Url, FeedStoreFeed> = BTreeMap::new(); return Ok(false);
for feed_config in feedlist {
let feed_url = Url::parse(&feed_config.url).unwrap();
feeds.insert(feed_url.clone(), FeedStoreFeed::new(&dir, &feed_url));
} }
debug!("Storing feed for {url}.");
Self { _dir: dir, feeds } // todo don't serialize to string but to writer
Self::write(
self.generic_path(url, ".ron")?,
to_string_pretty(&feed, PrettyConfig::default())?,
)?;
Self::write(self.feed_path(url)?, body)?;
Self::write(self.fetchdata_path(url)?, toml::to_string(&fetchdata)?)?;
Ok(true)
} }
pub fn collect(&mut self, max_entries: usize) -> (HashMap<String, Feed>, Vec<Entry>) { fn load_feed(&self, url: &Url, sanitize: bool) -> Result<Option<Feed>> {
let parser = feed_rs::parser::Builder::new()
.sanitize_content(sanitize)
.build();
let path = self.feed_path(url)?;
if !fs::exists(path.clone())? {
return Ok(None);
}
let file = fs::File::open(path)?;
Ok(Some(parser.parse(BufReader::new(file))?))
}
pub fn collect(
&self,
feed_configs: &Vec<super::FeedConfig>,
max_entries: usize,
) -> (HashMap<String, Feed>, Vec<Entry>) {
let mut feeds = HashMap::new(); let mut feeds = HashMap::new();
let mut entries = Vec::new(); let mut entries = Vec::new();
for (feed_url, feed_store_feed) in self.feeds.iter_mut() { for feed_config in feed_configs {
let Some(mut feed) = feed_store_feed.load_feed(true) else { let mut feed = match (|| {
warn!("Problem parsing feed file for feed {}", feed_url); let url = Url::parse(&feed_config.url)?;
continue; self.load_feed(&url, true)
})() {
Err(e) => {
warn!(
"Problem parsing feed file for feed {}: {e:?}",
feed_config.url
);
continue;
}
Ok(None) => continue,
Ok(Some(f)) => f,
}; };
for entry in &mut feed.entries { for entry in &mut feed.entries {
entry.source = Some(feed_url.to_string()); entry.source = Some(feed_config.url.clone());
} }
entries.append(&mut std::mem::take(&mut feed.entries)); entries.append(&mut std::mem::take(&mut feed.entries));
feeds.insert(feed_config.url.clone(), feed);
// optimization to reduce memory usage
if entries.len() > 4 * max_entries { if entries.len() > 4 * max_entries {
entries = trim_entries(entries, max_entries); entries = trim_entries(entries, max_entries);
} }
feeds.insert(feed_url.to_string(), feed.clone());
} }
(feeds, trim_entries(entries, max_entries)) (feeds, trim_entries(entries, max_entries))
} }
pub fn fetch(&mut self, fetcher: &super::Fetcher, refresh: usize) -> Result<bool> {
let mut rebuild = false;
for (_url, feed) in self.feeds.iter_mut() {
rebuild |= feed.fetch(fetcher, refresh)?;
}
Ok(rebuild)
}
} }
fn trim_entries(mut entries: Vec<Entry>, max_entries: usize) -> Vec<Entry> { fn trim_entries(mut entries: Vec<Entry>, max_entries: usize) -> Vec<Entry> {

View File

@ -1,10 +1,15 @@
use anyhow::Result;
use std::time::Instant;
use ureq::tls::{TlsConfig, TlsProvider}; use ureq::tls::{TlsConfig, TlsProvider};
use ureq::Agent; use ureq::Agent;
use url::Url;
use crate::FeedStore;
pub struct Fetcher { pub struct Fetcher {
pub agent: Agent, agent: Agent,
/// FROM header for requests /// FROM header for requests
pub from: String, from: String,
} }
impl Fetcher { impl Fetcher {
@ -31,4 +36,41 @@ impl Fetcher {
from: from.to_string(), from: from.to_string(),
} }
} }
pub fn fetch(&self, url: Url, feed_store: &FeedStore) -> Result<bool> {
let fetchdata = feed_store.load_fetchdata(&url)?;
let mut builder = self
.agent
.get(url.to_string())
.header("FROM", self.from.clone());
if !fetchdata.etag.is_empty() {
builder = builder.header("If-None-Match", fetchdata.etag);
}
if !fetchdata.last_modified.is_empty() {
builder = builder.header("If-Modified-Since", fetchdata.last_modified);
}
let start_instant = Instant::now();
let result = builder.call();
let duration = start_instant.elapsed();
let response = result?;
debug!(
"fetched with status {} in {} ms: {url}",
response.status(),
duration.as_millis()
);
let status = response.status();
match status.as_u16() {
304 => Ok(false), // Not Modified -> nothing to do
200 => feed_store.store(&url, response),
_ => {
warn!(
"HTTP Status {} not implemented for {url}",
response.status()
);
Ok(false)
}
}
}
} }

View File

@ -26,10 +26,11 @@ extern crate log;
use crate::feed_store::FeedStore; use crate::feed_store::FeedStore;
use crate::fetcher::Fetcher; use crate::fetcher::Fetcher;
use anyhow::Result; use anyhow::Result;
use camino::Utf8PathBuf;
use clap::Parser; use clap::Parser;
use serde::Deserialize; use serde::Deserialize;
use std::fs; use std::fs;
use std::path::PathBuf;
use url::Url;
//mod atom_serializer; //mod atom_serializer;
mod feed_store; mod feed_store;
@ -67,17 +68,15 @@ struct Config {
templates_dir: String, templates_dir: String,
/// How many feed entries should be included in the planet /// How many feed entries should be included in the planet
max_entries: usize, max_entries: usize,
/// How soon to refresh, in hours
refresh: usize,
} }
pub fn to_checked_pathbuf(dir: &str) -> Utf8PathBuf { pub fn to_checked_pathbuf(dir: &str) -> PathBuf {
let dir = Utf8PathBuf::from(dir); let dir: PathBuf = PathBuf::from(dir);
let m = dir let m = dir
.metadata() .metadata()
.unwrap_or_else(|_| panic!("Could not get metadata of dir: {dir}")); .unwrap_or_else(|_| panic!("Could not get metadata of dir: {}", dir.display()));
assert!(m.is_dir(), "Not a dir: {dir}"); assert!(m.is_dir(), "Not a dir: {}", dir.display());
dir dir
} }
@ -91,9 +90,20 @@ struct FeedConfig {
url: String, url: String,
} }
fn fetch(config: &Config, feed_store: &mut FeedStore) -> Result<bool> { fn fetch(config: &Config, feed_store: &FeedStore) -> Result<bool> {
let fetcher = Fetcher::new(&config.bot_name, &config.from); let fetcher = Fetcher::new(&config.bot_name, &config.from);
let rebuild = feed_store.fetch(&fetcher, config.refresh)?; let mut rebuild = false;
for feed in &config.feeds {
let url = match Url::parse(&feed.url) {
Ok(x) => x,
Err(e) => {
error!("Error parsing url '{}': {e:?}", feed.url);
continue;
}
};
rebuild |= fetcher.fetch(url, feed_store)?;
}
info!("Done fetching. Rebuild needed: {rebuild}");
Ok(rebuild) Ok(rebuild)
} }
@ -112,15 +122,15 @@ fn main() -> Result<()> {
let _ = to_checked_pathbuf(&config.templates_dir); let _ = to_checked_pathbuf(&config.templates_dir);
let _ = to_checked_pathbuf(&config.out_dir); let _ = to_checked_pathbuf(&config.out_dir);
let mut feed_store = FeedStore::new(&config.feed_dir, &config.feeds); let feed_store = FeedStore::new(&config.feed_dir);
let should_build = if args.no_fetch { let should_build = if args.no_fetch {
true true
} else { } else {
fetch(&config, &mut feed_store)? fetch(&config, &feed_store)?
}; };
if should_build { if should_build {
template_engine::build(&config, &mut feed_store)?; template_engine::build(&config, &feed_store)?;
} }
Ok(()) Ok(())
} }

View File

@ -7,12 +7,13 @@ use std::collections::HashMap;
use std::fs::File; use std::fs::File;
use tera::{from_value, Tera}; use tera::{from_value, Tera};
pub fn build(config: &Config, feed_store: &mut FeedStore) -> Result<()> { pub fn build(config: &Config, feed_store: &FeedStore) -> Result<()> {
let mut tera = create_tera(&config.templates_dir)?; let mut tera = create_tera(&config.templates_dir)?;
let out_dir = to_checked_pathbuf(&config.out_dir); let out_dir = to_checked_pathbuf(&config.out_dir);
let mut context = tera::Context::new(); let mut context = tera::Context::new();
let (feeds, entries): (HashMap<String, Feed>, _) = feed_store.collect(config.max_entries); let (feeds, entries): (HashMap<String, Feed>, _) =
feed_store.collect(&config.feeds, config.max_entries);
context.insert("feeds", &feeds); context.insert("feeds", &feeds);
context.insert("entries", &entries); context.insert("entries", &entries);
context.insert("PKG_AUTHORS", env!("CARGO_PKG_AUTHORS")); context.insert("PKG_AUTHORS", env!("CARGO_PKG_AUTHORS"));
@ -23,7 +24,7 @@ pub fn build(config: &Config, feed_store: &mut FeedStore) -> Result<()> {
for name in tera.get_template_names() { for name in tera.get_template_names() {
debug!("Processing template {name}"); debug!("Processing template {name}");
let file = File::create(format!("{out_dir}/{name}"))?; let file = File::create(format!("{}/{name}", out_dir.display()))?;
tera.render_to(name, &context, file)?; tera.render_to(name, &context, file)?;
} }
Ok(()) Ok(())
@ -31,7 +32,7 @@ pub fn build(config: &Config, feed_store: &mut FeedStore) -> Result<()> {
fn create_tera(templates_dir: &str) -> Result<Tera> { fn create_tera(templates_dir: &str) -> Result<Tera> {
let dir = to_checked_pathbuf(templates_dir); let dir = to_checked_pathbuf(templates_dir);
let mut tera = tera::Tera::new(&format!("{dir}/*"))?; let mut tera = tera::Tera::new(&format!("{}/*", &dir.display()))?;
// disable autoescape as this would corrupt urls or the entry contents. todo check this! // disable autoescape as this would corrupt urls or the entry contents. todo check this!
tera.autoescape_on(vec![]); tera.autoescape_on(vec![]);
Ok(tera) Ok(tera)