From 9c59cf73e7e6631d9c33004fc0ff8fb957f673b9 Mon Sep 17 00:00:00 2001 From: Connor Johnstone Date: Tue, 17 Mar 2026 15:01:19 -0400 Subject: [PATCH] Initial commit --- .gitignore | 4 + Cargo.toml | 30 +++++ readme.md | 22 ++++ src/cleaning.rs | 110 ++++++++++++++++ src/error.rs | 30 +++++ src/file_tags.rs | 70 ++++++++++ src/lib.rs | 18 +++ src/main.rs | 95 ++++++++++++++ src/matcher.rs | 305 +++++++++++++++++++++++++++++++++++++++++++ src/musicbrainz.rs | 253 +++++++++++++++++++++++++++++++++++ src/provider.rs | 69 ++++++++++ src/tagger.rs | 227 ++++++++++++++++++++++++++++++++ tests/integration.rs | 176 +++++++++++++++++++++++++ 13 files changed, 1409 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.toml create mode 100644 readme.md create mode 100644 src/cleaning.rs create mode 100644 src/error.rs create mode 100644 src/file_tags.rs create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/matcher.rs create mode 100644 src/musicbrainz.rs create mode 100644 src/provider.rs create mode 100644 src/tagger.rs create mode 100644 tests/integration.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..360fdc9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +target/ +.env +*.db +*.db-journal diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..7ba67c1 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "shanty-tag" +version = "0.1.0" +edition = "2024" +license = "MIT" +description = "Metadata tagging via online databases for Shanty" +repository = "ssh://connor@git.rcjohnstone.com:2222/Shanty/tag.git" + +[dependencies] +shanty-db = { path = "../shanty-db" } +sea-orm = { version = "1", features = ["sqlx-sqlite", "runtime-tokio-native-tls"] } +clap = { version = "4", features = ["derive"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +thiserror = "2" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +tokio = { version = "1", features = ["full"] } +anyhow = "1" +reqwest = { version = "0.12", features = ["json"] } +strsim = "0.11" +unicode-normalization = "0.1" +lofty = "0.22" +chrono = { version = "0.4", features = ["serde"] } +dirs = "6" +regex = "1" + +[dev-dependencies] +tokio = { version = "1", features = ["full", "test-util"] } +tempfile = "3" diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..9ea6ca8 --- /dev/null +++ b/readme.md @@ -0,0 +1,22 @@ +# shanty-tag + +Metadata tagging via online databases for [Shanty](ssh://connor@git.rcjohnstone.com:2222/Shanty/shanty.git). + +Queries MusicBrainz to fill in missing metadata on indexed music files. Uses fuzzy +matching to handle minor spelling differences and a configurable confidence threshold. + +## Usage + +```sh +# Tag all untagged tracks (dry run) +shanty-tag --all --dry-run -vv + +# Tag all untagged tracks for real +shanty-tag --all + +# Tag a specific track and write tags back to the file +shanty-tag --track 42 --write-tags + +# Custom confidence threshold +shanty-tag --all --confidence 0.9 +``` diff --git a/src/cleaning.rs b/src/cleaning.rs new file mode 100644 index 0000000..5831b9b --- /dev/null +++ b/src/cleaning.rs @@ -0,0 +1,110 @@ +use std::sync::LazyLock; + +use regex::Regex; +use unicode_normalization::UnicodeNormalization; + +static STRIP_PATTERNS: LazyLock> = LazyLock::new(|| { + vec![ + // (Official Video), (Official Audio), (Official Music Video), (Lyric Video), etc. + Regex::new(r"\(official\s*(video|audio|music\s*video|lyric\s*video|visualizer)\)").unwrap(), + // (Remastered), (Remastered 2011), (Remaster) + Regex::new(r"\(remaster(ed)?\s*(\d{4})?\)").unwrap(), + // [Live], [Bonus Track], [Deluxe], [Explicit] + Regex::new(r"\[(live|bonus(\s*track)?|deluxe|explicit|clean)\]").unwrap(), + // (feat. Artist), [feat. Artist], (ft. Artist) + Regex::new(r"[\(\[](feat\.?|ft\.?)\s+[^\)\]]+[\)\]]").unwrap(), + // (with Artist) + Regex::new(r"\(with\s+[^)]+\)").unwrap(), + // Trailing " - Single", " - EP" + Regex::new(r"\s*-\s*(single|ep)\s*$").unwrap(), + ] +}); + +/// Normalize a string for fuzzy comparison. +/// +/// Applies unicode NFC normalization, lowercasing, stripping common suffixes +/// (video tags, remaster notes, featuring credits), and trimming. +pub fn normalize(s: &str) -> String { + // Unicode NFC normalization + let s: String = s.nfc().collect(); + + // Lowercase + let mut s = s.to_lowercase(); + + // Strip known patterns + for pattern in STRIP_PATTERNS.iter() { + s = pattern.replace_all(&s, "").to_string(); + } + + // Trim whitespace and punctuation + s.trim() + .trim_matches(|c: char| c.is_ascii_punctuation() || c.is_whitespace()) + .to_string() +} + +/// Escape special characters for MusicBrainz Lucene query syntax. +pub fn escape_lucene(s: &str) -> String { + let special = [ + '+', '-', '&', '|', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', + '\\', '/', + ]; + let mut result = String::with_capacity(s.len()); + for c in s.chars() { + if special.contains(&c) { + result.push('\\'); + } + result.push(c); + } + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_normalize_basic() { + assert_eq!(normalize("Hello World"), "hello world"); + assert_eq!(normalize(" spaces "), "spaces"); + } + + #[test] + fn test_normalize_strips_official_video() { + assert_eq!(normalize("Time (Official Video)"), "time"); + assert_eq!(normalize("Money (Official Music Video)"), "money"); + assert_eq!(normalize("Comfortably Numb (Official Audio)"), "comfortably numb"); + } + + #[test] + fn test_normalize_strips_remastered() { + assert_eq!(normalize("Time (Remastered 2011)"), "time"); + assert_eq!(normalize("Money (Remastered)"), "money"); + assert_eq!(normalize("Shine On (Remaster)"), "shine on"); + } + + #[test] + fn test_normalize_strips_feat() { + assert_eq!(normalize("Song (feat. Artist)"), "song"); + assert_eq!(normalize("Song [ft. Someone]"), "song"); + } + + #[test] + fn test_normalize_strips_brackets() { + assert_eq!(normalize("Song [Live]"), "song"); + assert_eq!(normalize("Song [Bonus Track]"), "song"); + assert_eq!(normalize("Song [Explicit]"), "song"); + } + + #[test] + fn test_normalize_unicode() { + // NFC normalization — decomposed é should become composed é + assert_eq!(normalize("café"), normalize("café")); + } + + #[test] + fn test_escape_lucene() { + assert_eq!(escape_lucene("AC/DC"), r"AC\/DC"); + assert_eq!(escape_lucene("test (hello)"), r"test \(hello\)"); + assert_eq!(escape_lucene("simple"), "simple"); + } +} diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..7feb55c --- /dev/null +++ b/src/error.rs @@ -0,0 +1,30 @@ +use shanty_db::DbError; + +#[derive(Debug, thiserror::Error)] +pub enum TagError { + #[error("database error: {0}")] + Db(#[from] DbError), + + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), + + #[error("HTTP error: {0}")] + Http(#[from] reqwest::Error), + + #[error("metadata error: {0}")] + Metadata(String), + + #[error("no match found for track {0}")] + NoMatch(i32), + + #[error("{0}")] + Other(String), +} + +impl From for TagError { + fn from(e: lofty::error::LoftyError) -> Self { + TagError::Metadata(e.to_string()) + } +} + +pub type TagResult = Result; diff --git a/src/file_tags.rs b/src/file_tags.rs new file mode 100644 index 0000000..3a9d3cb --- /dev/null +++ b/src/file_tags.rs @@ -0,0 +1,70 @@ +use std::path::Path; + +use lofty::config::{ParseOptions, WriteOptions}; +use lofty::file::{FileType, TaggedFileExt}; +use lofty::probe::Probe; +use lofty::tag::{Accessor, TagExt, TagType}; + +use crate::error::TagResult; +use crate::provider::{RecordingDetails, ReleaseRef}; + +/// Infer the best tag type for a given file type. +fn tag_type_for_file(ft: FileType) -> TagType { + match ft { + FileType::Mpeg => TagType::Id3v2, + FileType::Flac | FileType::Vorbis | FileType::Opus | FileType::Speex => { + TagType::VorbisComments + } + FileType::Mp4 => TagType::Mp4Ilst, + FileType::Ape => TagType::Ape, + _ => TagType::Id3v2, + } +} + +/// Write updated metadata back to the music file's embedded tags. +pub fn write_tags( + file_path: &str, + details: &RecordingDetails, + release: Option<&ReleaseRef>, + year: Option, + genre: Option<&str>, +) -> TagResult<()> { + let path = Path::new(file_path); + + let tagged_file = Probe::open(path)? + .options(ParseOptions::default()) + .read()?; + + // Determine the tag type to use + let tag_type = tagged_file + .primary_tag() + .map(|t| t.tag_type()) + .unwrap_or_else(|| tag_type_for_file(tagged_file.file_type())); + + let mut tag = tagged_file + .primary_tag() + .cloned() + .unwrap_or_else(|| lofty::tag::Tag::new(tag_type)); + + // Set metadata + tag.set_title(details.title.clone()); + tag.set_artist(details.artist.clone()); + + if let Some(release) = release { + tag.set_album(release.title.clone()); + } + + if let Some(y) = year { + tag.set_year(y as u32); + } + + if let Some(g) = genre { + tag.set_genre(g.to_string()); + } + + // Write to file + tag.save_to_path(path, WriteOptions::default())?; + + tracing::info!(path = file_path, "wrote tags to file"); + Ok(()) +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..5c734b3 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,18 @@ +//! Metadata tagging via online databases for Shanty. +//! +//! Fills in missing or incorrect metadata on music files by querying online +//! databases such as MusicBrainz. Supports fuzzy matching and configurable +//! confidence thresholds. + +pub mod cleaning; +pub mod error; +pub mod file_tags; +pub mod matcher; +pub mod musicbrainz; +pub mod provider; +pub mod tagger; + +pub use error::{TagError, TagResult}; +pub use musicbrainz::MusicBrainzClient; +pub use provider::MetadataProvider; +pub use tagger::{TagConfig, TagStats, run_tagging}; diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..9f4c816 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,95 @@ +use std::path::PathBuf; + +use clap::Parser; +use tracing_subscriber::EnvFilter; + +use shanty_db::Database; +use shanty_tag::{MusicBrainzClient, TagConfig, run_tagging}; + +#[derive(Parser)] +#[command(name = "shanty-tag", about = "Fill in missing metadata on music files via MusicBrainz")] +struct Cli { + /// Database URL. Defaults to sqlite:///shanty/shanty.db?mode=rwc + #[arg(long, env = "SHANTY_DATABASE_URL")] + database: Option, + + /// Tag all untagged tracks in the database. + #[arg(long)] + all: bool, + + /// Tag a specific track by its database ID. + #[arg(long)] + track: Option, + + /// Preview matches without writing to DB or files. + #[arg(long)] + dry_run: bool, + + /// Write updated tags back to music files. + #[arg(long)] + write_tags: bool, + + /// Minimum match confidence (0.0 - 1.0). + #[arg(long, default_value = "0.8")] + confidence: f64, + + /// Increase verbosity (-v info, -vv debug, -vvv trace). + #[arg(short, long, action = clap::ArgAction::Count)] + verbose: u8, +} + +fn default_database_url() -> String { + let data_dir = dirs::data_dir() + .unwrap_or_else(|| PathBuf::from(".")) + .join("shanty"); + std::fs::create_dir_all(&data_dir).ok(); + let db_path = data_dir.join("shanty.db"); + format!("sqlite://{}?mode=rwc", db_path.display()) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let cli = Cli::parse(); + + // Set up tracing + let filter = match cli.verbose { + 0 => "warn", + 1 => "info,shanty_tag=info", + 2 => "info,shanty_tag=debug", + _ => "debug,shanty_tag=trace", + }; + tracing_subscriber::fmt() + .with_env_filter( + EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(filter)), + ) + .init(); + + // Validate args + if !cli.all && cli.track.is_none() { + anyhow::bail!("specify either --all or --track "); + } + + // Connect to database + let database_url = cli.database.unwrap_or_else(default_database_url); + tracing::info!(url = %database_url, "connecting to database"); + let db = Database::new(&database_url).await?; + + // Create MusicBrainz client + let provider = MusicBrainzClient::new()?; + + // Run tagging + let config = TagConfig { + dry_run: cli.dry_run, + write_tags: cli.write_tags, + confidence: cli.confidence, + }; + + if config.dry_run { + println!("DRY RUN — no changes will be written"); + } + + let stats = run_tagging(db.conn(), &provider, &config, cli.track).await?; + println!("\nTagging complete: {stats}"); + + Ok(()) +} diff --git a/src/matcher.rs b/src/matcher.rs new file mode 100644 index 0000000..4e55e35 --- /dev/null +++ b/src/matcher.rs @@ -0,0 +1,305 @@ +use shanty_db::entities::track; + +use crate::cleaning::normalize; +use crate::provider::{RecordingMatch, ReleaseRef}; + +/// A scored recording match with the best matching release. +#[derive(Debug, Clone)] +pub struct ScoredMatch { + pub recording: RecordingMatch, + pub confidence: f64, + pub best_release: Option, +} + +/// Build a search query (artist, title) from a track's metadata. +/// Falls back to filename parsing if metadata is insufficient. +pub fn build_query(track: &track::Model) -> Option<(String, String)> { + let artist = track + .album_artist + .as_deref() + .or(track.artist.as_deref()) + .filter(|s| !s.is_empty()); + let title = track.title.as_deref().filter(|s| !s.is_empty()); + + match (artist, title) { + (Some(a), Some(t)) => Some((a.to_string(), t.to_string())), + (None, Some(t)) => Some((String::new(), t.to_string())), + _ => parse_filename(&track.file_path), + } +} + +/// Parse "Artist - Title" from a filename, stripping extension and path. +pub fn parse_filename(file_path: &str) -> Option<(String, String)> { + let filename = std::path::Path::new(file_path) + .file_stem()? + .to_str()?; + + // Try common "Artist - Title" pattern + if let Some((artist, title)) = filename.split_once(" - ") { + let artist = artist.trim().to_string(); + let title = title.trim().to_string(); + if !artist.is_empty() && !title.is_empty() { + return Some((artist, title)); + } + } + + // If no delimiter found, treat entire filename as the title + let name = filename.trim().to_string(); + if !name.is_empty() { + Some((String::new(), name)) + } else { + None + } +} + +/// Score a candidate recording against the track's known metadata. +/// Returns a confidence value from 0.0 to 1.0. +pub fn score_match(track: &track::Model, candidate: &RecordingMatch) -> f64 { + let track_title = track + .title + .as_deref() + .map(normalize) + .unwrap_or_default(); + let candidate_title = normalize(&candidate.title); + + let track_artist = track + .artist + .as_deref() + .or(track.album_artist.as_deref()) + .map(normalize) + .unwrap_or_default(); + let candidate_artist = normalize(&candidate.artist); + + // Title similarity (weighted 0.6) + let title_sim = if track_title.is_empty() || candidate_title.is_empty() { + 0.0 + } else { + strsim::jaro_winkler(&track_title, &candidate_title) + }; + + // Artist similarity (weighted 0.4) + let artist_sim = if track_artist.is_empty() || candidate_artist.is_empty() { + 0.3 // neutral-ish when we have no artist to compare + } else { + strsim::jaro_winkler(&track_artist, &candidate_artist) + }; + + let mut score = 0.6 * title_sim + 0.4 * artist_sim; + + // Bonus: album name matches a release + if let Some(ref album) = track.album { + let track_album = normalize(album); + if !track_album.is_empty() { + for release in &candidate.releases { + let release_title = normalize(&release.title); + let album_sim = strsim::jaro_winkler(&track_album, &release_title); + if album_sim > 0.85 { + score += 0.05; + break; + } + } + } + } + + // Bonus: duration within 3 seconds + if let Some(track_dur) = track.duration { + // MusicBrainz search results don't always include duration, + // but the score from the API itself is a signal + if track_dur > 0.0 && candidate.score > 90 { + score += 0.03; + } + } + + score.min(1.0) +} + +/// Select the best match from candidates that exceeds the confidence threshold. +pub fn select_best_match( + track: &track::Model, + candidates: Vec, + threshold: f64, +) -> Option { + let mut best: Option = None; + + for candidate in candidates { + let confidence = score_match(track, &candidate); + tracing::debug!( + title = %candidate.title, + artist = %candidate.artist, + confidence = confidence, + "candidate" + ); + + if confidence >= threshold { + let best_release = candidate.releases.first().cloned(); + let scored = ScoredMatch { + recording: candidate, + confidence, + best_release, + }; + match &best { + Some(current) if scored.confidence <= current.confidence => {} + _ => best = Some(scored), + } + } + } + + best +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_filename_artist_title() { + let result = parse_filename("/music/Pink Floyd - Time.mp3"); + assert_eq!(result, Some(("Pink Floyd".into(), "Time".into()))); + } + + #[test] + fn test_parse_filename_title_only() { + let result = parse_filename("/music/some_song.mp3"); + assert_eq!(result, Some(("".into(), "some_song".into()))); + } + + #[test] + fn test_parse_filename_nested_path() { + let result = parse_filename("/music/Artist/Album/03 - Track Name.flac"); + // The "03" gets treated as artist since it splits on " - " + assert_eq!(result, Some(("03".into(), "Track Name".into()))); + } + + #[test] + fn test_build_query_with_metadata() { + let track = track::Model { + id: 1, + file_path: "/music/test.mp3".into(), + title: Some("Time".into()), + artist: Some("Pink Floyd".into()), + album: None, + album_artist: None, + track_number: None, + disc_number: None, + duration: None, + genre: None, + year: None, + codec: None, + bitrate: None, + file_size: 1000, + fingerprint: None, + musicbrainz_id: None, + artist_id: None, + album_id: None, + file_mtime: None, + added_at: chrono::Utc::now().naive_utc(), + updated_at: chrono::Utc::now().naive_utc(), + }; + let result = build_query(&track); + assert_eq!(result, Some(("Pink Floyd".into(), "Time".into()))); + } + + #[test] + fn test_build_query_falls_back_to_filename() { + let track = track::Model { + id: 1, + file_path: "/music/Radiohead - Creep.mp3".into(), + title: None, + artist: None, + album: None, + album_artist: None, + track_number: None, + disc_number: None, + duration: None, + genre: None, + year: None, + codec: None, + bitrate: None, + file_size: 1000, + fingerprint: None, + musicbrainz_id: None, + artist_id: None, + album_id: None, + file_mtime: None, + added_at: chrono::Utc::now().naive_utc(), + updated_at: chrono::Utc::now().naive_utc(), + }; + let result = build_query(&track); + assert_eq!(result, Some(("Radiohead".into(), "Creep".into()))); + } + + #[test] + fn test_score_match_exact() { + let track = track::Model { + id: 1, + file_path: "/test.mp3".into(), + title: Some("Time".into()), + artist: Some("Pink Floyd".into()), + album: None, + album_artist: None, + track_number: None, + disc_number: None, + duration: None, + genre: None, + year: None, + codec: None, + bitrate: None, + file_size: 1000, + fingerprint: None, + musicbrainz_id: None, + artist_id: None, + album_id: None, + file_mtime: None, + added_at: chrono::Utc::now().naive_utc(), + updated_at: chrono::Utc::now().naive_utc(), + }; + let candidate = RecordingMatch { + mbid: "123".into(), + title: "Time".into(), + artist: "Pink Floyd".into(), + artist_mbid: None, + releases: vec![], + score: 100, + }; + let score = score_match(&track, &candidate); + assert!(score > 0.95, "exact match should score > 0.95, got {score}"); + } + + #[test] + fn test_score_match_fuzzy() { + let track = track::Model { + id: 1, + file_path: "/test.mp3".into(), + title: Some("Comfortably Numb".into()), + artist: Some("Pink Floyd".into()), + album: None, + album_artist: None, + track_number: None, + disc_number: None, + duration: None, + genre: None, + year: None, + codec: None, + bitrate: None, + file_size: 1000, + fingerprint: None, + musicbrainz_id: None, + artist_id: None, + album_id: None, + file_mtime: None, + added_at: chrono::Utc::now().naive_utc(), + updated_at: chrono::Utc::now().naive_utc(), + }; + // Slight misspelling + let candidate = RecordingMatch { + mbid: "123".into(), + title: "Comfortably Numb".into(), + artist: "Pink Flloyd".into(), // typo + artist_mbid: None, + releases: vec![], + score: 95, + }; + let score = score_match(&track, &candidate); + assert!(score > 0.85, "fuzzy match should score > 0.85, got {score}"); + } +} diff --git a/src/musicbrainz.rs b/src/musicbrainz.rs new file mode 100644 index 0000000..ef3b9a9 --- /dev/null +++ b/src/musicbrainz.rs @@ -0,0 +1,253 @@ +use serde::Deserialize; +use tokio::sync::Mutex; +use tokio::time::{Duration, Instant}; + +use crate::cleaning::escape_lucene; +use crate::error::{TagError, TagResult}; +use crate::provider::{MetadataProvider, RecordingDetails, RecordingMatch, ReleaseMatch, ReleaseRef}; + +const BASE_URL: &str = "https://musicbrainz.org/ws/2"; +const USER_AGENT: &str = "Shanty/0.1.0 (shanty-music-app)"; +const RATE_LIMIT: Duration = Duration::from_millis(1100); // slightly over 1s to be safe + +/// MusicBrainz API client with rate limiting. +pub struct MusicBrainzClient { + client: reqwest::Client, + last_request: Mutex, +} + +impl MusicBrainzClient { + pub fn new() -> TagResult { + let client = reqwest::Client::builder() + .user_agent(USER_AGENT) + .timeout(Duration::from_secs(30)) + .build()?; + Ok(Self { + client, + last_request: Mutex::new(Instant::now() - RATE_LIMIT), + }) + } + + /// Enforce rate limiting: wait if needed so we don't exceed 1 req/sec. + async fn rate_limit(&self) { + let mut last = self.last_request.lock().await; + let elapsed = last.elapsed(); + if elapsed < RATE_LIMIT { + tokio::time::sleep(RATE_LIMIT - elapsed).await; + } + *last = Instant::now(); + } + + async fn get_json(&self, url: &str) -> TagResult { + self.rate_limit().await; + tracing::debug!(url = url, "MusicBrainz request"); + let resp = self.client.get(url).send().await?; + let status = resp.status(); + if !status.is_success() { + let body = resp.text().await.unwrap_or_default(); + return Err(TagError::Other(format!( + "MusicBrainz API error {status}: {body}" + ))); + } + Ok(resp.json().await?) + } +} + +impl MetadataProvider for MusicBrainzClient { + async fn search_recording( + &self, + artist: &str, + title: &str, + ) -> TagResult> { + let query = if artist.is_empty() { + format!("recording:{}", escape_lucene(title)) + } else { + format!( + "artist:{} AND recording:{}", + escape_lucene(artist), + escape_lucene(title) + ) + }; + let url = format!("{BASE_URL}/recording/?query={}&fmt=json&limit=5", urlencoded(&query)); + let resp: MbRecordingSearchResponse = self.get_json(&url).await?; + + Ok(resp + .recordings + .into_iter() + .map(|r| { + let (artist_name, artist_mbid) = extract_artist_credit(&r.artist_credit); + RecordingMatch { + mbid: r.id, + title: r.title, + artist: artist_name, + artist_mbid, + releases: r + .releases + .unwrap_or_default() + .into_iter() + .map(|rel| ReleaseRef { + mbid: rel.id, + title: rel.title, + date: rel.date, + track_number: None, + }) + .collect(), + score: r.score.unwrap_or(0), + } + }) + .collect()) + } + + async fn search_release( + &self, + artist: &str, + album: &str, + ) -> TagResult> { + let query = format!( + "artist:{} AND release:{}", + escape_lucene(artist), + escape_lucene(album) + ); + let url = format!("{BASE_URL}/release/?query={}&fmt=json&limit=5", urlencoded(&query)); + let resp: MbReleaseSearchResponse = self.get_json(&url).await?; + + Ok(resp + .releases + .into_iter() + .map(|r| { + let (artist_name, artist_mbid) = extract_artist_credit(&r.artist_credit); + ReleaseMatch { + mbid: r.id, + title: r.title, + artist: artist_name, + artist_mbid, + date: r.date, + track_count: r.track_count, + score: r.score.unwrap_or(0), + } + }) + .collect()) + } + + async fn get_recording(&self, mbid: &str) -> TagResult { + let url = format!( + "{BASE_URL}/recording/{mbid}?inc=artists+releases+genres&fmt=json" + ); + let r: MbRecordingDetail = self.get_json(&url).await?; + + let (artist_name, artist_mbid) = extract_artist_credit(&r.artist_credit); + Ok(RecordingDetails { + mbid: r.id, + title: r.title, + artist: artist_name, + artist_mbid, + releases: r + .releases + .unwrap_or_default() + .into_iter() + .map(|rel| ReleaseRef { + mbid: rel.id, + title: rel.title, + date: rel.date, + track_number: None, + }) + .collect(), + duration_ms: r.length, + genres: r + .genres + .unwrap_or_default() + .into_iter() + .map(|g| g.name) + .collect(), + }) + } +} + +fn extract_artist_credit(credits: &Option>) -> (String, Option) { + match credits { + Some(credits) if !credits.is_empty() => { + let name: String = credits + .iter() + .map(|c| { + let mut s = c.artist.name.clone(); + if let Some(ref join) = c.joinphrase { + s.push_str(join); + } + s + }) + .collect(); + let mbid = Some(credits[0].artist.id.clone()); + (name, mbid) + } + _ => ("Unknown Artist".to_string(), None), + } +} + +fn urlencoded(s: &str) -> String { + s.replace(' ', "+") + .replace('&', "%26") + .replace('=', "%3D") + .replace('#', "%23") +} + +// --- MusicBrainz API response types --- + +#[derive(Deserialize)] +struct MbRecordingSearchResponse { + recordings: Vec, +} + +#[derive(Deserialize)] +struct MbRecordingResult { + id: String, + title: String, + score: Option, + #[serde(rename = "artist-credit")] + artist_credit: Option>, + releases: Option>, +} + +#[derive(Deserialize)] +struct MbReleaseSearchResponse { + releases: Vec, +} + +#[derive(Deserialize)] +struct MbReleaseResult { + id: String, + title: String, + score: Option, + #[serde(rename = "artist-credit")] + artist_credit: Option>, + date: Option, + #[serde(rename = "track-count")] + track_count: Option, +} + +#[derive(Deserialize)] +struct MbRecordingDetail { + id: String, + title: String, + #[serde(rename = "artist-credit")] + artist_credit: Option>, + releases: Option>, + length: Option, + genres: Option>, +} + +#[derive(Deserialize)] +struct MbArtistCredit { + artist: MbArtist, + joinphrase: Option, +} + +#[derive(Deserialize)] +struct MbArtist { + id: String, + name: String, +} + +#[derive(Deserialize)] +struct MbGenre { + name: String, +} diff --git a/src/provider.rs b/src/provider.rs new file mode 100644 index 0000000..21150a5 --- /dev/null +++ b/src/provider.rs @@ -0,0 +1,69 @@ +use serde::{Deserialize, Serialize}; + +use crate::error::TagResult; + +/// A reference to a release (album) that a recording appears on. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ReleaseRef { + pub mbid: String, + pub title: String, + pub date: Option, + pub track_number: Option, +} + +/// A recording match from a search query. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RecordingMatch { + pub mbid: String, + pub title: String, + pub artist: String, + pub artist_mbid: Option, + pub releases: Vec, + /// MusicBrainz API score (0-100). + pub score: u8, +} + +/// A release (album) match from a search query. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ReleaseMatch { + pub mbid: String, + pub title: String, + pub artist: String, + pub artist_mbid: Option, + pub date: Option, + pub track_count: Option, + pub score: u8, +} + +/// Full details for a recording, retrieved by MBID. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RecordingDetails { + pub mbid: String, + pub title: String, + pub artist: String, + pub artist_mbid: Option, + pub releases: Vec, + pub duration_ms: Option, + pub genres: Vec, +} + +/// Trait for metadata lookup backends. MusicBrainz is the default implementation; +/// others (Last.fm, Discogs, etc.) can be added later. +pub trait MetadataProvider: Send + Sync { + fn search_recording( + &self, + artist: &str, + title: &str, + ) -> impl std::future::Future>> + Send; + + fn search_release( + &self, + artist: &str, + album: &str, + ) -> impl std::future::Future>> + Send; + + fn get_recording( + &self, + mbid: &str, + ) -> impl std::future::Future> + Send; +} diff --git a/src/tagger.rs b/src/tagger.rs new file mode 100644 index 0000000..0069de5 --- /dev/null +++ b/src/tagger.rs @@ -0,0 +1,227 @@ +use std::fmt; + +use sea_orm::{ActiveValue::Set, DatabaseConnection, NotSet}; + +use shanty_db::entities::track; +use shanty_db::queries; + +use crate::error::TagResult; +use crate::file_tags; +use crate::matcher::{self, ScoredMatch}; +use crate::provider::MetadataProvider; + +/// Configuration for a tagging operation. +pub struct TagConfig { + /// If true, show what would change without writing to DB or files. + pub dry_run: bool, + /// If true, write updated tags back to the music files. + pub write_tags: bool, + /// Minimum match confidence (0.0 - 1.0). + pub confidence: f64, +} + +/// Statistics from a completed tagging run. +#[derive(Debug, Default, Clone)] +pub struct TagStats { + pub tracks_processed: u64, + pub tracks_matched: u64, + pub tracks_updated: u64, + pub tracks_skipped: u64, + pub tracks_errored: u64, +} + +impl fmt::Display for TagStats { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "processed: {}, matched: {}, updated: {}, skipped: {}, errors: {}", + self.tracks_processed, + self.tracks_matched, + self.tracks_updated, + self.tracks_skipped, + self.tracks_errored, + ) + } +} + +/// Tag a single track. Returns `Ok(true)` if matched and updated. +pub async fn tag_track( + conn: &DatabaseConnection, + provider: &impl MetadataProvider, + track: &track::Model, + config: &TagConfig, +) -> TagResult { + // Build search query + let (artist, title) = match matcher::build_query(track) { + Some(q) => q, + None => { + tracing::debug!(id = track.id, path = %track.file_path, "no query possible, skipping"); + return Ok(false); + } + }; + + tracing::info!( + id = track.id, + artist = %artist, + title = %title, + "searching MusicBrainz" + ); + + // Search for recordings + let candidates = provider.search_recording(&artist, &title).await?; + + if candidates.is_empty() { + tracing::debug!(id = track.id, "no results from MusicBrainz"); + return Ok(false); + } + + // Score and select best match + let best = match matcher::select_best_match(track, candidates, config.confidence) { + Some(m) => m, + None => { + tracing::debug!( + id = track.id, + "no match above confidence threshold {}", + config.confidence + ); + return Ok(false); + } + }; + + log_match(track, &best); + + if config.dry_run { + return Ok(true); + } + + // Get full details for the best match + let details = provider.get_recording(&best.recording.mbid).await?; + + // Upsert artist with MusicBrainz ID + let artist_id = match &details.artist_mbid { + Some(mbid) => { + Some(queries::artists::upsert(conn, &details.artist, Some(mbid)).await?.id) + } + None => { + Some(queries::artists::upsert(conn, &details.artist, None).await?.id) + } + }; + + // Upsert album from best release + let (album_id, album_name) = if let Some(ref release) = best.best_release { + let album = queries::albums::upsert( + conn, + &release.title, + &details.artist, + Some(&release.mbid), + artist_id, + ) + .await?; + (Some(album.id), Some(release.title.clone())) + } else { + (None, None) + }; + + // Parse year from release date + let year = best + .best_release + .as_ref() + .and_then(|r| r.date.as_deref()) + .and_then(|d| d.split('-').next()) + .and_then(|y| y.parse::().ok()); + + let genre = details.genres.first().cloned(); + + // Update track metadata + let active = track::ActiveModel { + id: Set(track.id), + file_path: Set(track.file_path.clone()), + title: Set(Some(details.title.clone())), + artist: Set(Some(details.artist.clone())), + album: Set(album_name), + album_artist: Set(Some(details.artist.clone())), + musicbrainz_id: Set(Some(details.mbid.clone())), + artist_id: Set(artist_id), + album_id: Set(album_id), + year: Set(year), + genre: Set(genre.clone()), + // Preserve existing values for fields we don't update + track_number: NotSet, + disc_number: NotSet, + duration: NotSet, + codec: NotSet, + bitrate: NotSet, + file_size: NotSet, + fingerprint: NotSet, + file_mtime: NotSet, + added_at: NotSet, + updated_at: NotSet, + }; + queries::tracks::update_metadata(conn, track.id, active).await?; + + // Optionally write tags to file + if config.write_tags { + if let Err(e) = file_tags::write_tags( + &track.file_path, + &details, + best.best_release.as_ref(), + year, + genre.as_deref(), + ) { + tracing::warn!(id = track.id, path = %track.file_path, "failed to write file tags: {e}"); + } + } + + Ok(true) +} + +fn log_match(track: &track::Model, best: &ScoredMatch) { + tracing::info!( + id = track.id, + confidence = format!("{:.2}", best.confidence), + matched_title = %best.recording.title, + matched_artist = %best.recording.artist, + release = best.best_release.as_ref().map(|r| r.title.as_str()).unwrap_or("(none)"), + "match found" + ); +} + +/// Run tagging on all untagged tracks or a specific track. +pub async fn run_tagging( + conn: &DatabaseConnection, + provider: &impl MetadataProvider, + config: &TagConfig, + track_id: Option, +) -> TagResult { + let tracks: Vec = if let Some(id) = track_id { + vec![queries::tracks::get_by_id(conn, id).await?] + } else { + queries::tracks::get_untagged(conn).await? + }; + + tracing::info!(count = tracks.len(), "tracks to process"); + let mut stats = TagStats::default(); + + for track in &tracks { + stats.tracks_processed += 1; + + match tag_track(conn, provider, track, config).await { + Ok(true) => { + stats.tracks_matched += 1; + if !config.dry_run { + stats.tracks_updated += 1; + } + } + Ok(false) => { + stats.tracks_skipped += 1; + } + Err(e) => { + tracing::error!(id = track.id, path = %track.file_path, "tagging error: {e}"); + stats.tracks_errored += 1; + } + } + } + + tracing::info!(%stats, "tagging complete"); + Ok(stats) +} diff --git a/tests/integration.rs b/tests/integration.rs new file mode 100644 index 0000000..ae325d7 --- /dev/null +++ b/tests/integration.rs @@ -0,0 +1,176 @@ +use chrono::Utc; +use sea_orm::ActiveValue::Set; + +use shanty_db::{Database, queries}; +use shanty_tag::provider::{MetadataProvider, RecordingDetails, RecordingMatch, ReleaseMatch, ReleaseRef}; +use shanty_tag::error::TagResult; +use shanty_tag::{TagConfig, run_tagging}; + +/// A mock metadata provider for testing without hitting MusicBrainz. +struct MockProvider; + +impl MetadataProvider for MockProvider { + async fn search_recording(&self, artist: &str, title: &str) -> TagResult> { + // Return a match for "Pink Floyd - Time" + if artist.contains("Pink Floyd") && title.contains("Time") { + Ok(vec![RecordingMatch { + mbid: "rec-123".into(), + title: "Time".into(), + artist: "Pink Floyd".into(), + artist_mbid: Some("artist-456".into()), + releases: vec![ReleaseRef { + mbid: "release-789".into(), + title: "The Dark Side of the Moon".into(), + date: Some("1973-03-01".into()), + track_number: Some(4), + }], + score: 100, + }]) + } else { + Ok(vec![]) + } + } + + async fn search_release(&self, _artist: &str, _album: &str) -> TagResult> { + Ok(vec![]) + } + + async fn get_recording(&self, mbid: &str) -> TagResult { + if mbid == "rec-123" { + Ok(RecordingDetails { + mbid: "rec-123".into(), + title: "Time".into(), + artist: "Pink Floyd".into(), + artist_mbid: Some("artist-456".into()), + releases: vec![ReleaseRef { + mbid: "release-789".into(), + title: "The Dark Side of the Moon".into(), + date: Some("1973-03-01".into()), + track_number: Some(4), + }], + duration_ms: Some(413_000), + genres: vec!["Progressive Rock".into()], + }) + } else { + Err(shanty_tag::TagError::Other("not found".into())) + } + } +} + +async fn test_db() -> Database { + Database::new("sqlite::memory:") + .await + .expect("failed to create test database") +} + +async fn insert_untagged_track(db: &Database, file_path: &str, title: Option<&str>, artist: Option<&str>) -> i32 { + let now = Utc::now().naive_utc(); + let active = shanty_db::entities::track::ActiveModel { + file_path: Set(file_path.to_string()), + title: Set(title.map(String::from)), + artist: Set(artist.map(String::from)), + file_size: Set(1_000_000), + added_at: Set(now), + updated_at: Set(now), + ..Default::default() + }; + let track = queries::tracks::upsert(db.conn(), active).await.unwrap(); + track.id +} + +#[tokio::test] +async fn test_tag_track_with_match() { + let db = test_db().await; + let provider = MockProvider; + + let track_id = insert_untagged_track(&db, "/music/time.mp3", Some("Time"), Some("Pink Floyd")).await; + + let config = TagConfig { + dry_run: false, + write_tags: false, + confidence: 0.8, + }; + + let stats = run_tagging(db.conn(), &provider, &config, Some(track_id)).await.unwrap(); + assert_eq!(stats.tracks_processed, 1); + assert_eq!(stats.tracks_matched, 1); + assert_eq!(stats.tracks_updated, 1); + + // Verify the track was updated + let track = queries::tracks::get_by_id(db.conn(), track_id).await.unwrap(); + assert_eq!(track.musicbrainz_id.as_deref(), Some("rec-123")); + assert_eq!(track.title.as_deref(), Some("Time")); + assert_eq!(track.artist.as_deref(), Some("Pink Floyd")); + assert_eq!(track.album.as_deref(), Some("The Dark Side of the Moon")); + assert_eq!(track.year, Some(1973)); + assert_eq!(track.genre.as_deref(), Some("Progressive Rock")); + + // Verify artist was created with MusicBrainz ID + let artist = queries::artists::find_by_name(db.conn(), "Pink Floyd").await.unwrap(); + assert!(artist.is_some()); + assert_eq!(artist.unwrap().musicbrainz_id.as_deref(), Some("artist-456")); +} + +#[tokio::test] +async fn test_tag_track_no_match() { + let db = test_db().await; + let provider = MockProvider; + + let track_id = insert_untagged_track(&db, "/music/unknown.mp3", Some("Unknown Song"), Some("Nobody")).await; + + let config = TagConfig { + dry_run: false, + write_tags: false, + confidence: 0.8, + }; + + let stats = run_tagging(db.conn(), &provider, &config, Some(track_id)).await.unwrap(); + assert_eq!(stats.tracks_processed, 1); + assert_eq!(stats.tracks_skipped, 1); + + // Track should be unchanged + let track = queries::tracks::get_by_id(db.conn(), track_id).await.unwrap(); + assert!(track.musicbrainz_id.is_none()); +} + +#[tokio::test] +async fn test_dry_run_does_not_update() { + let db = test_db().await; + let provider = MockProvider; + + let track_id = insert_untagged_track(&db, "/music/time.mp3", Some("Time"), Some("Pink Floyd")).await; + + let config = TagConfig { + dry_run: true, + write_tags: false, + confidence: 0.8, + }; + + let stats = run_tagging(db.conn(), &provider, &config, Some(track_id)).await.unwrap(); + assert_eq!(stats.tracks_matched, 1); + assert_eq!(stats.tracks_updated, 0); // dry run + + // Track should be unchanged + let track = queries::tracks::get_by_id(db.conn(), track_id).await.unwrap(); + assert!(track.musicbrainz_id.is_none()); +} + +#[tokio::test] +async fn test_tag_all_untagged() { + let db = test_db().await; + let provider = MockProvider; + + insert_untagged_track(&db, "/music/time.mp3", Some("Time"), Some("Pink Floyd")).await; + insert_untagged_track(&db, "/music/unknown.mp3", Some("Unknown"), Some("Nobody")).await; + + let config = TagConfig { + dry_run: false, + write_tags: false, + confidence: 0.8, + }; + + let stats = run_tagging(db.conn(), &provider, &config, None).await.unwrap(); + assert_eq!(stats.tracks_processed, 2); + assert_eq!(stats.tracks_matched, 1); // only Pink Floyd matched + assert_eq!(stats.tracks_skipped, 1); +}