Compare commits

..

10 Commits

Author SHA1 Message Date
5271c4c9aa feat: Add refresh config to prevent hammering servers 2025-04-14 22:44:00 +02:00
a15d380bc9 meta: start refactoring 2025-04-14 22:33:42 +02:00
eb09729f38 meta: add feeds/ and out/ to .gitignore 2025-04-14 22:32:59 +02:00
3d3bbc3709 fix: add max_entries in default config file 2025-04-14 22:32:15 +02:00
Vincent Ambo
d41e2bdd1c feat(planet-mars): add CI configuration
Adds CI configuration that builds the Rust package, and exports the package back
to Github after submits to canon.

Change-Id: I2f8dcff2a614898c55115f44510543ff25d46b55
Reviewed-on: https://cl.tvl.fyi/c/depot/+/12996
Autosubmit: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Reviewed-by: thk <thomas@koch.ro>
2025-01-13 09:06:40 +00:00
Vincent Ambo
50f87e0ea0 subtree(web/planet-mars): import project from previous upstream
This project is moving into the depot. Upstream is
github/thkoch2001/planet-mars.

This commit does not yet add a Nix build, only imports the code and matches
formatting requirements.

The import has been josh-filtered, which will allow us to continue publishing
the history to the previous repo.

Change-Id: I9cb184b5af3f74a0b4079bac499b4db039b7939b
2025-01-13 11:48:33 +03:00
Thomas Koch
32d314a357 also commit Cargo.lock 2025-01-13 09:27:39 +02:00
Thomas Koch
f333fe2b22 update README 2025-01-13 09:26:36 +02:00
Thomas Koch
917aed0247 fix previous commit :-( 2025-01-12 21:15:26 +02:00
Thomas Koch
f8c5668506 set explicit versions for deps and update 2025-01-12 21:14:37 +02:00
11 changed files with 326 additions and 220 deletions

4
.gitignore vendored
View File

@ -1,2 +1,4 @@
/target /target
/mars.toml /mars.toml
/out
/feeds

69
Cargo.lock generated
View File

@ -128,9 +128,9 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
version = "2.6.0" version = "2.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" checksum = "1be3f42a67d6d345ecd59f675f3f012d6974981560836e938c22b424b85ce1be"
dependencies = [ dependencies = [
"serde", "serde",
] ]
@ -183,10 +183,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b"
[[package]] [[package]]
name = "cc" name = "camino"
version = "1.2.7" version = "1.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a012a0df96dd6d06ba9a1b29d6402d1a5d77c6befd2566afdc26e10603dc93d7" checksum = "8b96ec4966b5813e2c0507c1f86115c8c5abaadc3980879c3424042a02fd1ad3"
dependencies = [
"serde",
]
[[package]]
name = "cc"
version = "1.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8293772165d9345bdaaa39b45b2109591e63fe5e6fbc23c6ff930a048aa310b"
dependencies = [ dependencies = [
"shlex", "shlex",
] ]
@ -199,9 +208,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]] [[package]]
name = "chrono" name = "chrono"
version = "0.4.39" version = "0.4.40"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c"
dependencies = [ dependencies = [
"android-tzdata", "android-tzdata",
"iana-time-zone", "iana-time-zone",
@ -209,7 +218,7 @@ dependencies = [
"num-traits", "num-traits",
"serde", "serde",
"wasm-bindgen", "wasm-bindgen",
"windows-targets", "windows-link",
] ]
[[package]] [[package]]
@ -236,9 +245,9 @@ dependencies = [
[[package]] [[package]]
name = "clap" name = "clap"
version = "4.5.25" version = "4.5.26"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b95dca1b68188a08ca6af9d96a6576150f598824bdb528c1190460c2940a0b48" checksum = "a8eb5e908ef3a6efbe1ed62520fb7287959888c88485abe072543190ecc66783"
dependencies = [ dependencies = [
"clap_builder", "clap_builder",
"clap_derive", "clap_derive",
@ -246,9 +255,9 @@ dependencies = [
[[package]] [[package]]
name = "clap_builder" name = "clap_builder"
version = "4.5.25" version = "4.5.26"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ab52925392148efd3f7562f2136a81ffb778076bcc85727c6e020d6dd57cf15" checksum = "96b01801b5fc6a0a232407abc821660c9c6d25a1cafc0d4f85f29fb8d9afc121"
dependencies = [ dependencies = [
"anstream", "anstream",
"anstyle", "anstyle",
@ -1106,14 +1115,15 @@ checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
[[package]] [[package]]
name = "planet-mars" name = "planet-mars"
version = "0.1.0" version = "0.1.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"camino",
"chrono",
"clap", "clap",
"env_logger", "env_logger",
"feed-rs", "feed-rs",
"log", "log",
"quick-xml",
"ron", "ron",
"serde", "serde",
"slug", "slug",
@ -1140,9 +1150,9 @@ dependencies = [
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.92" version = "1.0.93"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99"
dependencies = [ dependencies = [
"unicode-ident", "unicode-ident",
] ]
@ -1155,7 +1165,6 @@ checksum = "165859e9e55f79d67b96c5d96f4e88b6f2695a1972849c15a6a3f5c59fc2c003"
dependencies = [ dependencies = [
"encoding_rs", "encoding_rs",
"memchr", "memchr",
"serde",
] ]
[[package]] [[package]]
@ -1268,9 +1277,9 @@ dependencies = [
[[package]] [[package]]
name = "rustls" name = "rustls"
version = "0.23.20" version = "0.23.21"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5065c3f250cbd332cd894be57c40fa52387247659b14a2d6041d121547903b1b" checksum = "8f287924602bf649d949c63dc8ac8b235fa5387d394020705b80c4eb597ce5b8"
dependencies = [ dependencies = [
"log", "log",
"once_cell", "once_cell",
@ -1460,9 +1469,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]] [[package]]
name = "syn" name = "syn"
version = "2.0.95" version = "2.0.96"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46f71c0377baf4ef1cc3e3402ded576dccc315800fbc62dfc7fe04b009773b4a" checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -1518,18 +1527,18 @@ dependencies = [
[[package]] [[package]]
name = "thiserror" name = "thiserror"
version = "2.0.10" version = "2.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3ac7f54ca534db81081ef1c1e7f6ea8a3ef428d2fc069097c079443d24124d3" checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc"
dependencies = [ dependencies = [
"thiserror-impl", "thiserror-impl",
] ]
[[package]] [[package]]
name = "thiserror-impl" name = "thiserror-impl"
version = "2.0.10" version = "2.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e9465d30713b56a37ede7185763c3492a91be2f5fa68d958c44e41ab9248beb" checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",
@ -1889,6 +1898,12 @@ dependencies = [
"windows-targets", "windows-targets",
] ]
[[package]]
name = "windows-link"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38"
[[package]] [[package]]
name = "windows-sys" name = "windows-sys"
version = "0.52.0" version = "0.52.0"
@ -1973,9 +1988,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]] [[package]]
name = "winnow" name = "winnow"
version = "0.6.22" version = "0.6.24"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39281189af81c07ec09db316b302a3e67bf9bd7cbf6c820b50e35fee9c2fa980" checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a"
dependencies = [ dependencies = [
"memchr", "memchr",
] ]

View File

@ -1,6 +1,6 @@
[package] [package]
name = "planet-mars" name = "planet-mars"
version = "0.1.0" version = "0.1.1"
edition = "2021" edition = "2021"
authors = ["Thomas Koch <thomas@koch.ro>"] authors = ["Thomas Koch <thomas@koch.ro>"]
description = "Feed aggregation planet like Planet Venus, produces static HTML and ATOM feed from fetched feeds." description = "Feed aggregation planet like Planet Venus, produces static HTML and ATOM feed from fetched feeds."
@ -10,16 +10,18 @@ keywords = ["atom", "rss", "planet", "feed", "blogging"]
categories = ["web-programming"] categories = ["web-programming"]
[dependencies] [dependencies]
anyhow = "*" anyhow = "1"
clap = { version = "*", features = ["derive"] } camino = { version = "1.1.9", features = ["serde", "serde1"] }
env_logger = "*" chrono = { version = "0.4.40", features = ["now", "serde"] }
feed-rs = "*" clap = { version = "4", features = ["derive"] }
log = "*" env_logger = "0"
ron = "*" # todo for development, to check atom-rs internal representation of feeds feed-rs = "2"
serde = { version = "*", features = ["derive"] } log = "0"
slug = "*" ron = "0"
tera = "*" serde = { version = "1", features = ["derive"] }
toml = "*" slug = "0"
tera = "1"
toml = "0"
ureq = { version = "3.0.0-rc5", features = ["brotli", "charset", "gzip", "native-tls"]} ureq = { version = "3.0.0-rc5", features = ["brotli", "charset", "gzip", "native-tls"]}
url = "*" url = "2"
quick-xml = { version = "*", features = ["serialize"] }

1
OWNERS Normal file
View File

@ -0,0 +1 @@
thk

View File

@ -1,13 +1,24 @@
Simple planet like planet venus but in rust and maintained. Simple successor to Planet Venus but in Rust and maintained.
Please see the rustdoc of main.rs for further information. Please see the rustdoc of main.rs for further information.
## todo ## Todo
* use a nice lib to process the config file * find and use a nice lib to process the config file
* should check whether dirs exists and are writeable * should check whether dirs exists and are writeable
* should check whether feed urls can be parsed * should check whether feed urls can be parsed
## Planet Venus
Planet Venus is used by many planets on the internet. However, its code has not
been maintained since ~2011 and it uses Python 2.
Planet Mars should be a lightweight successor to Planet Venus.
Still, the Planet Venus documentation contains some useful information on
[Etiquette](https://intertwingly.net/code/venus/docs/etiquette.html) for
planet hosters.
## Credits ## Credits
While writing this, I read and also copied code from: While writing this, I read and also copied code from:

17
default.nix Normal file
View File

@ -0,0 +1,17 @@
# Nix build expression for planet-mars inside the depot monorepo.
# Takes the depot tree and nixpkgs as arguments and builds the crate with
# nixpkgs' `rustPlatform.buildRustPackage`.
{ depot, pkgs, ... }:
pkgs.rustPlatform.buildRustPackage {
name = "planet-mars";
# NOTE(review): presumably filters the source tree by .gitignore rules — confirm
# against depot.third_party.gitignoreSource.
src = depot.third_party.gitignoreSource ./.;
# Dependency versions are taken from the committed Cargo.lock.
cargoLock.lockFile = ./Cargo.lock;
# NOTE(review): pkg-config and openssl are presumably required because ureq is
# built with its "native-tls" feature (see Cargo.toml) — confirm.
nativeBuildInputs = [ pkgs.pkg-config ];
buildInputs = [ pkgs.openssl ];
# planet-mars is mirrored to Github.
# Extra CI step: pushes the filtered history of this subtree to the given
# remote and ref.
passthru.meta.ci.extraSteps.github = depot.tools.releases.filteredGitPush {
filter = ":/web/planet-mars";
remote = "git@github.com:thkoch2001/planet-mars.git";
ref = "refs/heads/master";
};
}

View File

@ -3,6 +3,7 @@ feed_dir = "/var/lib/planet-mars/feeds"
from = "thomas@koch.ro" from = "thomas@koch.ro"
out_dir = "/var/lib/planet-mars/out" out_dir = "/var/lib/planet-mars/out"
templates_dir = "/var/lib/planet-mars/templates" templates_dir = "/var/lib/planet-mars/templates"
max_entries = 50
[[feeds]] [[feeds]]
url = "https://blog.fefe.de/rss.xml" url = "https://blog.fefe.de/rss.xml"

View File

@ -1,84 +1,213 @@
use anyhow::bail;
use anyhow::Result; use anyhow::Result;
use camino::{Utf8Path, Utf8PathBuf};
use chrono::{DateTime, Duration, Utc};
use feed_rs::model::Entry; use feed_rs::model::Entry;
use feed_rs::model::Feed; use feed_rs::model::Feed;
use ron::ser::{to_string_pretty, PrettyConfig}; use ron::ser::{to_string_pretty, PrettyConfig};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::HashMap; use std::collections::{BTreeMap, HashMap};
use std::convert::AsRef; use std::convert::AsRef;
use std::fs; use std::fs;
use std::io::BufReader; use std::time::Instant;
use std::path::PathBuf;
use ureq::http::HeaderMap; use ureq::http::HeaderMap;
use ureq::http::Response; use ureq::http::Response;
use ureq::Body; use ureq::Body;
use url::Url; use url::Url;
#[derive(Deserialize, Serialize, Default)] pub fn slugify_url(url: &Url) -> String {
pub struct FetchData { let domain = url.domain().unwrap();
pub etag: String, let query = url.query().unwrap_or("");
pub last_modified: String, slug::slugify(format!("{domain}{}{query}", url.path()))
} }
pub struct FeedStore { /// Stored settings/info about a feed.
pub dir: PathBuf, #[derive(Debug, Deserialize, Serialize)]
pub struct FeedStoreFeedInfo {
/// First time we added this feed.
///
/// Used for historical purposes only.
pub added: DateTime<Utc>,
/// Last known cached entry, if any.
///
/// Used to let the server know whether we need a new entry or not.
pub fetch_data: Option<FetchData>,
} }
impl FeedStore { impl FeedStoreFeedInfo {
pub fn new(dir: &str) -> Self { pub fn new() -> Self {
Self { Self {
dir: super::to_checked_pathbuf(dir), added: Utc::now(),
fetch_data: None,
} }
} }
}
fn slugify_url(url: &Url) -> Result<String> { /// Storage for a single feed.
let Some(domain) = url.domain() else { ///
bail!("Url has no domain: '{url}'.") /// Contains one [FeedStoreVersion] for every time the feed has been successfully fetched,
/// and one [FeedStoreEntry] for each article referenced throughout the entries.
#[derive(Debug, Deserialize, Serialize)]
pub struct FeedStoreFeed {
/// The feed URL
pub url: Url,
/// Where it's stored, should be inside the [FeedStore::dir].
pub dir: Utf8PathBuf,
/// Raw feed path
pub path_feed: Utf8PathBuf,
/// Raw feed RON path
pub path_feed_ron: Utf8PathBuf,
/// Settings path
pub path_settings: Utf8PathBuf,
/// Detailed settings/info about a feed.
pub info: FeedStoreFeedInfo,
/// Stored copy of the raw XML feed (if any)
pub raw_feed: Option<String>,
}
impl FeedStoreFeed {
pub fn new(basedir: &Utf8Path, url: &Url) -> Self {
let dir = basedir.join(slugify_url(url));
if !dir.is_dir() {
std::fs::create_dir_all(&dir).unwrap();
}
let path_settings = dir.join("settings.toml");
let info: FeedStoreFeedInfo = match std::fs::read_to_string(&path_settings) {
Ok(s) => toml::from_str(&s).unwrap(),
Err(_e) => {
// Assume file has not been created yet. Initialize
let info = FeedStoreFeedInfo::new();
std::fs::write(&path_settings, toml::to_string(&info).unwrap()).unwrap();
info
}
}; };
let query = url.query().unwrap_or("");
Ok(slug::slugify(format!("{domain}{}{query}", url.path())))
}
fn generic_path(&self, url: &Url, ext: &str) -> Result<String> { let raw_feed: Option<String> = std::fs::read_to_string(dir.join("feed.xml")).ok();
Ok(format!(
"{}/{}{ext}",
self.dir.display(),
Self::slugify_url(url)?
))
}
fn feed_path(&self, url: &Url) -> Result<String> { Self {
self.generic_path(url, "") dir: dir.clone(),
} path_feed: dir.join("feed.xml"),
path_feed_ron: dir.join("feed.ron"),
fn fetchdata_path(&self, url: &Url) -> Result<String> { path_settings: dir.join("settings.toml"),
self.generic_path(url, ".toml") url: url.clone(),
} info,
raw_feed,
pub fn load_fetchdata(&self, url: &Url) -> Result<FetchData> {
let path = self.fetchdata_path(url)?;
if !fs::exists(path.clone())? {
return Ok(FetchData::default());
} }
Ok(toml::from_str(&fs::read_to_string(path)?)?)
} }
fn has_changed(&self, url: &Url, new_feed: &Feed) -> Result<bool> { pub fn load_fetchdata(&self) -> Option<&FetchData> {
let Some(old_feed) = self.load_feed(url, false)? else { self.info.fetch_data.as_ref()
return Ok(true); }
pub fn load_feed(&self, sanitize: bool) -> Option<Feed> {
if let Some(raw_feed) = &self.raw_feed {
let parser = feed_rs::parser::Builder::new()
.sanitize_content(sanitize)
.build();
Some(parser.parse(raw_feed.as_bytes()).unwrap())
} else {
None
}
}
pub fn has_changed(&self, new_feed: &Feed) -> bool {
let Some(old_feed) = self.load_feed(false) else {
return true;
}; };
let mut old_iter = old_feed.entries.iter(); let mut old_iter = old_feed.entries.iter();
for new in &new_feed.entries { for new in &new_feed.entries {
let Some(old) = old_iter.next() else { let Some(old) = old_iter.next() else {
return Ok(true); return true;
}; };
if old != new { if old != new {
return Ok(true); return true;
} }
} }
// ignoring any entries left in old_iter // ignoring any entries left in old_iter
Ok(false) false
}
/// Persists the result of a successful (HTTP 200) fetch.
///
/// Steps, in order:
/// 1. Capture the cache validators (`ETag`, `Last-Modified`) and the fetch time.
/// 2. Read and parse the body; an unparsable body is logged and ignored
///    (`Ok(false)`), leaving all stored data untouched.
/// 3. Write the new fetch data to the settings file, even when the feed
///    content turns out to be unchanged.
/// 4. If the entries differ from the stored copy, write the RON dump and the
///    raw feed, and refresh the in-memory raw copy.
///
/// Returns `Ok(true)` only when new feed content was written.
///
/// # Errors
///
/// Fails when the body cannot be read, serialization fails, or a file cannot
/// be written.
pub fn store(&mut self, mut response: Response<Body>) -> Result<bool> {
    let headers = response.headers();
    let fetchdata = FetchData {
        etag: hv(headers, "etag"),
        // The HTTP header is `Last-Modified` (hyphen). Looking up
        // "last_modified" never matches, so the validator was always empty.
        last_modified: hv(headers, "last-modified"),
        when: Utc::now(),
    };

    let body = response.body_mut().with_config().read_to_vec()?;
    let feed = match feed_rs::parser::parse(body.as_slice()) {
        Ok(f) => f,
        Err(e) => {
            warn!("Error when parsing feed for {}: {e:?}", self.url);
            return Ok(false);
        }
    };

    debug!("Storing fetchdata for {}", self.url);
    self.info.fetch_data = Some(fetchdata);
    Self::write(&self.path_settings, toml::to_string(&self.info)?)?;

    // Compared against the previously stored copy (still in `self.raw_feed`).
    if !self.has_changed(&feed) {
        return Ok(false);
    }

    debug!("Storing feed for {}.", self.url);
    // todo don't serialize to string but to writer
    Self::write(
        &self.path_feed_ron,
        to_string_pretty(&feed, PrettyConfig::default())?,
    )?;
    Self::write(&self.path_feed, &body)?;

    // Keep the in-memory copy in sync with what was just written; otherwise
    // `load_feed` keeps returning the stale (or missing) feed for the rest of
    // this run. Mirrors `new`, which yields `None` when the file is not valid
    // UTF-8.
    self.raw_feed = String::from_utf8(body).ok();
    Ok(true)
}
/// refresh in hours
pub fn fetch(&mut self, fetcher: &super::Fetcher, refresh: usize) -> Result<bool> {
let mut builder = fetcher
.agent
.get(self.url.to_string())
.header("FROM", fetcher.from.clone());
if let Some(fetchdata) = self.load_fetchdata() {
if !fetchdata.etag.is_empty() {
builder = builder.header("If-None-Match", fetchdata.etag.clone());
}
if !fetchdata.last_modified.is_empty() {
builder = builder.header("If-Modified-Since", fetchdata.last_modified.clone());
}
// Check if we have hit time for refresh
if fetchdata.when + Duration::try_hours(refresh as i64).unwrap() >= Utc::now() {
// No need to rebuild, check again later
return Ok(false);
}
}
let start_instant = Instant::now();
let result = builder.call();
let duration = start_instant.elapsed();
let response = result?;
debug!(
"fetched with status {} in {} ms: {}",
response.status(),
duration.as_millis(),
self.url,
);
let status = response.status();
match status.as_u16() {
304 => Ok(false), // Not Modified -> nothing to do
200 => self.store(response),
_ => {
warn!(
"HTTP Status {} not implemented for {}",
response.status(),
self.url,
);
Ok(false)
}
}
} }
fn write<P: AsRef<std::path::Path> + std::fmt::Display, C: AsRef<[u8]>>( fn write<P: AsRef<std::path::Path> + std::fmt::Display, C: AsRef<[u8]>>(
@ -90,85 +219,66 @@ impl FeedStore {
} }
fs::write(path, contents) fs::write(path, contents)
} }
}
pub fn store(&self, url: &Url, mut response: Response<Body>) -> Result<bool> { #[derive(Clone, Debug, Deserialize, Serialize, Default)]
let headers = response.headers(); pub struct FetchData {
let fetchdata = FetchData { pub when: DateTime<Utc>,
etag: hv(headers, "etag"), pub etag: String,
last_modified: hv(headers, "last_modified"), pub last_modified: String,
}; }
let body = response.body_mut().with_config().read_to_vec()?; #[derive(Debug)]
let feed = match feed_rs::parser::parse(body.as_slice()) { pub struct FeedStore {
Ok(f) => f, pub _dir: Utf8PathBuf,
Err(e) => { pub feeds: BTreeMap<Url, FeedStoreFeed>,
warn!("Error when parsing feed for {url}: {e:?}"); }
return Ok(false);
} impl FeedStore {
}; pub fn new(dir: &str, feedlist: &Vec<super::FeedConfig>) -> Self {
if !self.has_changed(url, &feed)? { let dir = super::to_checked_pathbuf(dir);
return Ok(false); let mut feeds: BTreeMap<Url, FeedStoreFeed> = BTreeMap::new();
for feed_config in feedlist {
let feed_url = Url::parse(&feed_config.url).unwrap();
feeds.insert(feed_url.clone(), FeedStoreFeed::new(&dir, &feed_url));
} }
debug!("Storing feed for {url}.");
// todo don't serialize to string but to writer Self { _dir: dir, feeds }
Self::write(
self.generic_path(url, ".ron")?,
to_string_pretty(&feed, PrettyConfig::default())?,
)?;
Self::write(self.feed_path(url)?, body)?;
Self::write(self.fetchdata_path(url)?, toml::to_string(&fetchdata)?)?;
Ok(true)
} }
fn load_feed(&self, url: &Url, sanitize: bool) -> Result<Option<Feed>> { pub fn collect(&mut self, max_entries: usize) -> (HashMap<String, Feed>, Vec<Entry>) {
let parser = feed_rs::parser::Builder::new()
.sanitize_content(sanitize)
.build();
let path = self.feed_path(url)?;
if !fs::exists(path.clone())? {
return Ok(None);
}
let file = fs::File::open(path)?;
Ok(Some(parser.parse(BufReader::new(file))?))
}
pub fn collect(
&self,
feed_configs: &Vec<super::FeedConfig>,
max_entries: usize,
) -> (HashMap<String, Feed>, Vec<Entry>) {
let mut feeds = HashMap::new(); let mut feeds = HashMap::new();
let mut entries = Vec::new(); let mut entries = Vec::new();
for feed_config in feed_configs { for (feed_url, feed_store_feed) in self.feeds.iter_mut() {
let mut feed = match (|| { let Some(mut feed) = feed_store_feed.load_feed(true) else {
let url = Url::parse(&feed_config.url)?; warn!("Problem parsing feed file for feed {}", feed_url);
self.load_feed(&url, true) continue;
})() {
Err(e) => {
warn!(
"Problem parsing feed file for feed {}: {e:?}",
feed_config.url
);
continue;
}
Ok(None) => continue,
Ok(Some(f)) => f,
}; };
for entry in &mut feed.entries { for entry in &mut feed.entries {
entry.source = Some(feed_config.url.clone()); entry.source = Some(feed_url.to_string());
} }
entries.append(&mut std::mem::take(&mut feed.entries)); entries.append(&mut std::mem::take(&mut feed.entries));
feeds.insert(feed_config.url.clone(), feed);
// optimization to reduce memory usage
if entries.len() > 4 * max_entries { if entries.len() > 4 * max_entries {
entries = trim_entries(entries, max_entries); entries = trim_entries(entries, max_entries);
} }
feeds.insert(feed_url.to_string(), feed.clone());
} }
(feeds, trim_entries(entries, max_entries)) (feeds, trim_entries(entries, max_entries))
} }
/// Fetches every feed in the store.
///
/// `refresh` is forwarded unchanged to each feed's own `fetch`. Returns
/// `Ok(true)` if at least one feed reported `true`. The first error returned
/// by a feed aborts the loop and is propagated.
pub fn fetch(&mut self, fetcher: &super::Fetcher, refresh: usize) -> Result<bool> {
    let mut needs_rebuild = false;
    for store_feed in self.feeds.values_mut() {
        // Non-short-circuiting: every feed is fetched even once a rebuild is
        // already known to be needed.
        let changed = store_feed.fetch(fetcher, refresh)?;
        needs_rebuild = needs_rebuild | changed;
    }
    Ok(needs_rebuild)
}
} }
fn trim_entries(mut entries: Vec<Entry>, max_entries: usize) -> Vec<Entry> { fn trim_entries(mut entries: Vec<Entry>, max_entries: usize) -> Vec<Entry> {

View File

@ -1,15 +1,10 @@
use anyhow::Result;
use std::time::Instant;
use ureq::tls::{TlsConfig, TlsProvider}; use ureq::tls::{TlsConfig, TlsProvider};
use ureq::Agent; use ureq::Agent;
use url::Url;
use crate::FeedStore;
pub struct Fetcher { pub struct Fetcher {
agent: Agent, pub agent: Agent,
/// FROM header for requests /// FROM header for requests
from: String, pub from: String,
} }
impl Fetcher { impl Fetcher {
@ -36,41 +31,4 @@ impl Fetcher {
from: from.to_string(), from: from.to_string(),
} }
} }
pub fn fetch(&self, url: Url, feed_store: &FeedStore) -> Result<bool> {
let fetchdata = feed_store.load_fetchdata(&url)?;
let mut builder = self
.agent
.get(url.to_string())
.header("FROM", self.from.clone());
if !fetchdata.etag.is_empty() {
builder = builder.header("If-None-Match", fetchdata.etag);
}
if !fetchdata.last_modified.is_empty() {
builder = builder.header("If-Modified-Since", fetchdata.last_modified);
}
let start_instant = Instant::now();
let result = builder.call();
let duration = start_instant.elapsed();
let response = result?;
debug!(
"fetched with status {} in {} ms: {url}",
response.status(),
duration.as_millis()
);
let status = response.status();
match status.as_u16() {
304 => Ok(false), // Not Modified -> nothing to do
200 => feed_store.store(&url, response),
_ => {
warn!(
"HTTP Status {} not implemented for {url}",
response.status()
);
Ok(false)
}
}
}
} }

View File

@ -26,11 +26,10 @@ extern crate log;
use crate::feed_store::FeedStore; use crate::feed_store::FeedStore;
use crate::fetcher::Fetcher; use crate::fetcher::Fetcher;
use anyhow::Result; use anyhow::Result;
use camino::Utf8PathBuf;
use clap::Parser; use clap::Parser;
use serde::Deserialize; use serde::Deserialize;
use std::fs; use std::fs;
use std::path::PathBuf;
use url::Url;
//mod atom_serializer; //mod atom_serializer;
mod feed_store; mod feed_store;
@ -68,15 +67,17 @@ struct Config {
templates_dir: String, templates_dir: String,
/// How many feed entries should be included in the planet /// How many feed entries should be included in the planet
max_entries: usize, max_entries: usize,
/// How soon to refresh, in hours
refresh: usize,
} }
pub fn to_checked_pathbuf(dir: &str) -> PathBuf { pub fn to_checked_pathbuf(dir: &str) -> Utf8PathBuf {
let dir: PathBuf = PathBuf::from(dir); let dir = Utf8PathBuf::from(dir);
let m = dir let m = dir
.metadata() .metadata()
.unwrap_or_else(|_| panic!("Could not get metadata of dir: {}", dir.display())); .unwrap_or_else(|_| panic!("Could not get metadata of dir: {dir}"));
assert!(m.is_dir(), "Not a dir: {}", dir.display()); assert!(m.is_dir(), "Not a dir: {dir}");
dir dir
} }
@ -90,20 +91,9 @@ struct FeedConfig {
url: String, url: String,
} }
fn fetch(config: &Config, feed_store: &FeedStore) -> Result<bool> { fn fetch(config: &Config, feed_store: &mut FeedStore) -> Result<bool> {
let fetcher = Fetcher::new(&config.bot_name, &config.from); let fetcher = Fetcher::new(&config.bot_name, &config.from);
let mut rebuild = false; let rebuild = feed_store.fetch(&fetcher, config.refresh)?;
for feed in &config.feeds {
let url = match Url::parse(&feed.url) {
Ok(x) => x,
Err(e) => {
error!("Error parsing url '{}': {e:?}", feed.url);
continue;
}
};
rebuild |= fetcher.fetch(url, feed_store)?;
}
info!("Done fetching. Rebuild needed: {rebuild}");
Ok(rebuild) Ok(rebuild)
} }
@ -122,15 +112,15 @@ fn main() -> Result<()> {
let _ = to_checked_pathbuf(&config.templates_dir); let _ = to_checked_pathbuf(&config.templates_dir);
let _ = to_checked_pathbuf(&config.out_dir); let _ = to_checked_pathbuf(&config.out_dir);
let feed_store = FeedStore::new(&config.feed_dir); let mut feed_store = FeedStore::new(&config.feed_dir, &config.feeds);
let should_build = if args.no_fetch { let should_build = if args.no_fetch {
true true
} else { } else {
fetch(&config, &feed_store)? fetch(&config, &mut feed_store)?
}; };
if should_build { if should_build {
template_engine::build(&config, &feed_store)?; template_engine::build(&config, &mut feed_store)?;
} }
Ok(()) Ok(())
} }

View File

@ -7,13 +7,12 @@ use std::collections::HashMap;
use std::fs::File; use std::fs::File;
use tera::{from_value, Tera}; use tera::{from_value, Tera};
pub fn build(config: &Config, feed_store: &FeedStore) -> Result<()> { pub fn build(config: &Config, feed_store: &mut FeedStore) -> Result<()> {
let mut tera = create_tera(&config.templates_dir)?; let mut tera = create_tera(&config.templates_dir)?;
let out_dir = to_checked_pathbuf(&config.out_dir); let out_dir = to_checked_pathbuf(&config.out_dir);
let mut context = tera::Context::new(); let mut context = tera::Context::new();
let (feeds, entries): (HashMap<String, Feed>, _) = let (feeds, entries): (HashMap<String, Feed>, _) = feed_store.collect(config.max_entries);
feed_store.collect(&config.feeds, config.max_entries);
context.insert("feeds", &feeds); context.insert("feeds", &feeds);
context.insert("entries", &entries); context.insert("entries", &entries);
context.insert("PKG_AUTHORS", env!("CARGO_PKG_AUTHORS")); context.insert("PKG_AUTHORS", env!("CARGO_PKG_AUTHORS"));
@ -24,7 +23,7 @@ pub fn build(config: &Config, feed_store: &FeedStore) -> Result<()> {
for name in tera.get_template_names() { for name in tera.get_template_names() {
debug!("Processing template {name}"); debug!("Processing template {name}");
let file = File::create(format!("{}/{name}", out_dir.display()))?; let file = File::create(format!("{out_dir}/{name}"))?;
tera.render_to(name, &context, file)?; tera.render_to(name, &context, file)?;
} }
Ok(()) Ok(())
@ -32,7 +31,7 @@ pub fn build(config: &Config, feed_store: &FeedStore) -> Result<()> {
fn create_tera(templates_dir: &str) -> Result<Tera> { fn create_tera(templates_dir: &str) -> Result<Tera> {
let dir = to_checked_pathbuf(templates_dir); let dir = to_checked_pathbuf(templates_dir);
let mut tera = tera::Tera::new(&format!("{}/*", &dir.display()))?; let mut tera = tera::Tera::new(&format!("{dir}/*"))?;
// disable autoescape as this would corrupt urls or the entry contents. todo check this! // disable autoescape as this would corrupt urls or the entry contents. todo check this!
tera.autoescape_on(vec![]); tera.autoescape_on(vec![]);
Ok(tera) Ok(tera)