Initial commit

This commit is contained in:
Connor Johnstone
2026-03-17 15:01:19 -04:00
commit 9c59cf73e7
13 changed files with 1409 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
target/
.env
*.db
*.db-journal

30
Cargo.toml Normal file
View File

@@ -0,0 +1,30 @@
[package]
name = "shanty-tag"
version = "0.1.0"
edition = "2024"
license = "MIT"
description = "Metadata tagging via online databases for Shanty"
repository = "ssh://connor@git.rcjohnstone.com:2222/Shanty/tag.git"
[dependencies]
shanty-db = { path = "../shanty-db" }
sea-orm = { version = "1", features = ["sqlx-sqlite", "runtime-tokio-native-tls"] }
clap = { version = "4", features = ["derive"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
thiserror = "2"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
tokio = { version = "1", features = ["full"] }
anyhow = "1"
reqwest = { version = "0.12", features = ["json"] }
strsim = "0.11"
unicode-normalization = "0.1"
lofty = "0.22"
chrono = { version = "0.4", features = ["serde"] }
dirs = "6"
regex = "1"
[dev-dependencies]
tokio = { version = "1", features = ["full", "test-util"] }
tempfile = "3"

22
readme.md Normal file
View File

@@ -0,0 +1,22 @@
# shanty-tag
Metadata tagging via online databases for [Shanty](ssh://connor@git.rcjohnstone.com:2222/Shanty/shanty.git).
Queries MusicBrainz to fill in missing metadata on indexed music files. Uses fuzzy
matching to tolerate minor spelling differences, and only accepts a match whose score meets a configurable confidence threshold.
## Usage
```sh
# Tag all untagged tracks (dry run)
shanty-tag --all --dry-run -vv
# Tag all untagged tracks for real
shanty-tag --all
# Tag a specific track and write tags back to the file
shanty-tag --track 42 --write-tags
# Custom confidence threshold
shanty-tag --all --confidence 0.9
```

110
src/cleaning.rs Normal file
View File

@@ -0,0 +1,110 @@
use std::sync::LazyLock;
use regex::Regex;
use unicode_normalization::UnicodeNormalization;
/// Regexes matching decorations that are removed from a string before comparison.
///
/// Every pattern is written with lowercase literals and no case-insensitive flag,
/// so it only matches text that has already been lowercased (as `normalize` does
/// before applying them). The list is compiled once, on first use.
static STRIP_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
vec![
// The `unwrap()` calls below can only fail if one of these literal patterns is
// malformed, i.e. a programming error rather than a runtime condition.
// (Official Video), (Official Audio), (Official Music Video), (Lyric Video), etc.
Regex::new(r"\(official\s*(video|audio|music\s*video|lyric\s*video|visualizer)\)").unwrap(),
// (Remastered), (Remastered 2011), (Remaster)
Regex::new(r"\(remaster(ed)?\s*(\d{4})?\)").unwrap(),
// [Live], [Bonus], [Bonus Track], [Deluxe], [Explicit], [Clean]
Regex::new(r"\[(live|bonus(\s*track)?|deluxe|explicit|clean)\]").unwrap(),
// (feat. Artist), [feat. Artist], (ft. Artist)
Regex::new(r"[\(\[](feat\.?|ft\.?)\s+[^\)\]]+[\)\]]").unwrap(),
// (with Artist)
Regex::new(r"\(with\s+[^)]+\)").unwrap(),
// Trailing " - Single", " - EP" (anchored to the end of the string)
Regex::new(r"\s*-\s*(single|ep)\s*$").unwrap(),
]
});
/// Produce a canonical form of `s` suitable for fuzzy comparison.
///
/// Steps, in order: Unicode NFC composition, lowercasing, removal of every
/// `STRIP_PATTERNS` match (video tags, remaster notes, featuring credits, ...),
/// and finally trimming whitespace and ASCII punctuation from both ends.
pub fn normalize(s: &str) -> String {
    let composed: String = s.nfc().collect();

    // Patterns are applied one after another, each on the previous result.
    let stripped = STRIP_PATTERNS
        .iter()
        .fold(composed.to_lowercase(), |text, re| {
            re.replace_all(&text, "").into_owned()
        });

    // The predicate covers all whitespace, so a separate `trim()` is unnecessary.
    stripped
        .trim_matches(|c: char| c.is_whitespace() || c.is_ascii_punctuation())
        .to_string()
}
/// Backslash-escape every character that is special in MusicBrainz's Lucene query syntax.
///
/// All other characters (including spaces) are copied through unchanged.
pub fn escape_lucene(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        let is_special = matches!(
            ch,
            '+' | '-' | '&' | '|' | '!' | '(' | ')' | '{' | '}' | '[' | ']' | '^' | '"' | '~'
                | '*' | '?' | ':' | '\\' | '/'
        );
        if is_special {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_basic() {
        assert_eq!(normalize("Hello World"), "hello world");
        assert_eq!(normalize(" spaces "), "spaces");
    }

    #[test]
    fn test_normalize_strips_official_video() {
        assert_eq!(normalize("Time (Official Video)"), "time");
        assert_eq!(normalize("Money (Official Music Video)"), "money");
        assert_eq!(normalize("Comfortably Numb (Official Audio)"), "comfortably numb");
    }

    #[test]
    fn test_normalize_strips_remastered() {
        assert_eq!(normalize("Time (Remastered 2011)"), "time");
        assert_eq!(normalize("Money (Remastered)"), "money");
        assert_eq!(normalize("Shine On (Remaster)"), "shine on");
    }

    #[test]
    fn test_normalize_strips_feat() {
        assert_eq!(normalize("Song (feat. Artist)"), "song");
        assert_eq!(normalize("Song [ft. Someone]"), "song");
    }

    #[test]
    fn test_normalize_strips_brackets() {
        assert_eq!(normalize("Song [Live]"), "song");
        assert_eq!(normalize("Song [Bonus Track]"), "song");
        assert_eq!(normalize("Song [Explicit]"), "song");
    }

    #[test]
    fn test_normalize_unicode() {
        // NFC normalization: decomposed "e" + U+0301 (combining acute) must compare
        // equal to the precomposed U+00E9. Escapes are used so the two forms cannot
        // be silently unified by an editor or by the VCS web view.
        let decomposed = "cafe\u{301}";
        let composed = "caf\u{e9}";
        assert_ne!(decomposed, composed);
        assert_eq!(normalize(decomposed), normalize(composed));
    }

    #[test]
    fn test_escape_lucene() {
        assert_eq!(escape_lucene("AC/DC"), r"AC\/DC");
        assert_eq!(escape_lucene("test (hello)"), r"test \(hello\)");
        assert_eq!(escape_lucene("simple"), "simple");
    }
}

30
src/error.rs Normal file
View File

@@ -0,0 +1,30 @@
use shanty_db::DbError;
/// Errors returned by the tagging code in this crate.
///
/// The `#[from]` variants allow `?` to convert the underlying error automatically.
#[derive(Debug, thiserror::Error)]
pub enum TagError {
/// A `shanty_db` operation failed.
#[error("database error: {0}")]
Db(#[from] DbError),
/// A filesystem operation failed.
#[error("I/O error: {0}")]
Io(#[from] std::io::Error),
/// An HTTP request made through `reqwest` failed.
#[error("HTTP error: {0}")]
Http(#[from] reqwest::Error),
/// Reading or writing embedded file tags failed; carries the message text only.
#[error("metadata error: {0}")]
Metadata(String),
/// No acceptable match was found; the payload is presumably the track's
/// database ID (TODO confirm: no visible code constructs this variant).
#[error("no match found for track {0}")]
NoMatch(i32),
/// Any other failure, described by a free-form message.
#[error("{0}")]
Other(String),
}
impl From<lofty::error::LoftyError> for TagError {
fn from(e: lofty::error::LoftyError) -> Self {
TagError::Metadata(e.to_string())
}
}
/// Shorthand for a `Result` whose error type is [`TagError`].
pub type TagResult<T> = Result<T, TagError>;

70
src/file_tags.rs Normal file
View File

@@ -0,0 +1,70 @@
use std::path::Path;
use lofty::config::{ParseOptions, WriteOptions};
use lofty::file::{FileType, TaggedFileExt};
use lofty::probe::Probe;
use lofty::tag::{Accessor, TagExt, TagType};
use crate::error::TagResult;
use crate::provider::{RecordingDetails, ReleaseRef};
/// Choose the tag format to create for a file that has no tag yet.
///
/// MP4 maps to iTunes-style `ilst`, APE to APE tags, the Xiph family
/// (FLAC / Vorbis / Opus / Speex) to Vorbis comments, and everything else,
/// MPEG included, to ID3v2.
fn tag_type_for_file(ft: FileType) -> TagType {
    match ft {
        FileType::Mp4 => TagType::Mp4Ilst,
        FileType::Ape => TagType::Ape,
        FileType::Flac | FileType::Vorbis | FileType::Opus | FileType::Speex => {
            TagType::VorbisComments
        }
        // `FileType::Mpeg` and every remaining container.
        _ => TagType::Id3v2,
    }
}
/// Write updated metadata back to the music file's embedded tags.
///
/// The file's existing primary tag is cloned and modified so that fields not
/// touched here are preserved. If the file has no primary tag, a new one of the
/// type chosen by `tag_type_for_file` is created.
///
/// * `file_path` - path of the audio file to modify.
/// * `details` - supplies the title and artist that are always written.
/// * `release` - when present, its title is written as the album.
/// * `year` - when present and non-negative, written as the year.
/// * `genre` - when present, written as the genre.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or parsed, or if saving the
/// tag fails.
pub fn write_tags(
    file_path: &str,
    details: &RecordingDetails,
    release: Option<&ReleaseRef>,
    year: Option<i32>,
    genre: Option<&str>,
) -> TagResult<()> {
    let path = Path::new(file_path);
    let tagged_file = Probe::open(path)?
        .options(ParseOptions::default())
        .read()?;

    // Start from the existing primary tag when there is one.
    let mut tag = match tagged_file.primary_tag() {
        Some(existing) => existing.clone(),
        None => lofty::tag::Tag::new(tag_type_for_file(tagged_file.file_type())),
    };

    tag.set_title(details.title.clone());
    tag.set_artist(details.artist.clone());
    if let Some(release) = release {
        tag.set_album(release.title.clone());
    }
    if let Some(y) = year {
        // A plain `as u32` cast would wrap a negative year into a huge value.
        match u32::try_from(y) {
            Ok(y) => tag.set_year(y),
            Err(_) => {
                tracing::warn!(path = file_path, year = y, "ignoring out-of-range year");
            }
        }
    }
    if let Some(g) = genre {
        tag.set_genre(g.to_string());
    }

    tag.save_to_path(path, WriteOptions::default())?;
    tracing::info!(path = file_path, "wrote tags to file");
    Ok(())
}

18
src/lib.rs Normal file
View File

@@ -0,0 +1,18 @@
//! Metadata tagging via online databases for Shanty.
//!
//! Fills in missing or incorrect metadata on music files by querying online
//! databases such as MusicBrainz. Supports fuzzy matching and configurable
//! confidence thresholds.
pub mod cleaning;
pub mod error;
pub mod file_tags;
pub mod matcher;
pub mod musicbrainz;
pub mod provider;
pub mod tagger;
pub use error::{TagError, TagResult};
pub use musicbrainz::MusicBrainzClient;
pub use provider::MetadataProvider;
pub use tagger::{TagConfig, TagStats, run_tagging};

95
src/main.rs Normal file
View File

@@ -0,0 +1,95 @@
use std::path::PathBuf;
use clap::Parser;
use tracing_subscriber::EnvFilter;
use shanty_db::Database;
use shanty_tag::{MusicBrainzClient, TagConfig, run_tagging};
// Command-line arguments, parsed by clap's derive macro.
//
// The `///` comments on the fields are rendered by clap as `--help` text, so
// editing them changes user-visible output.
//
// `--all` and `--track` are not declared mutually exclusive here; `main` only
// rejects the case where neither is given.
#[derive(Parser)]
#[command(name = "shanty-tag", about = "Fill in missing metadata on music files via MusicBrainz")]
struct Cli {
/// Database URL. Defaults to sqlite://<XDG_DATA_HOME>/shanty/shanty.db?mode=rwc
#[arg(long, env = "SHANTY_DATABASE_URL")]
database: Option<String>,
/// Tag all untagged tracks in the database.
#[arg(long)]
all: bool,
/// Tag a specific track by its database ID.
#[arg(long)]
track: Option<i32>,
/// Preview matches without writing to DB or files.
#[arg(long)]
dry_run: bool,
/// Write updated tags back to music files.
#[arg(long)]
write_tags: bool,
/// Minimum match confidence (0.0 - 1.0).
#[arg(long, default_value = "0.8")]
confidence: f64,
/// Increase verbosity (-v info, -vv debug, -vvv trace).
#[arg(short, long, action = clap::ArgAction::Count)]
verbose: u8,
}
fn default_database_url() -> String {
let data_dir = dirs::data_dir()
.unwrap_or_else(|| PathBuf::from("."))
.join("shanty");
std::fs::create_dir_all(&data_dir).ok();
let db_path = data_dir.join("shanty.db");
format!("sqlite://{}?mode=rwc", db_path.display())
}
/// Entry point: parse arguments, set up logging, connect to the database and
/// run the tagger, printing a summary at the end.
///
/// # Errors
///
/// Fails when neither `--all` nor `--track` is given, when `--confidence` is
/// outside `0.0..=1.0`, or when the database, HTTP client or tagging run fails.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();

    // Verbosity flags choose the default filter; `RUST_LOG` (the default env
    // filter) takes precedence when it is set and valid.
    let filter = match cli.verbose {
        0 => "warn",
        1 => "info,shanty_tag=info",
        2 => "info,shanty_tag=debug",
        _ => "debug,shanty_tag=trace",
    };
    tracing_subscriber::fmt()
        .with_env_filter(
            EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(filter)),
        )
        .init();

    // Validate args
    if !cli.all && cli.track.is_none() {
        anyhow::bail!("specify either --all or --track <id>");
    }
    // A threshold above 1.0 can never be met and one below 0.0 accepts anything;
    // NaN also fails this range check.
    if !(0.0..=1.0).contains(&cli.confidence) {
        anyhow::bail!(
            "--confidence must be between 0.0 and 1.0 (got {})",
            cli.confidence
        );
    }

    // Connect to database
    let database_url = cli.database.unwrap_or_else(default_database_url);
    tracing::info!(url = %database_url, "connecting to database");
    let db = Database::new(&database_url).await?;

    // Create MusicBrainz client
    let provider = MusicBrainzClient::new()?;

    // Run tagging
    let config = TagConfig {
        dry_run: cli.dry_run,
        write_tags: cli.write_tags,
        confidence: cli.confidence,
    };
    if config.dry_run {
        println!("DRY RUN — no changes will be written");
    }
    let stats = run_tagging(db.conn(), &provider, &config, cli.track).await?;
    println!("\nTagging complete: {stats}");
    Ok(())
}

305
src/matcher.rs Normal file
View File

@@ -0,0 +1,305 @@
use shanty_db::entities::track;
use crate::cleaning::normalize;
use crate::provider::{RecordingMatch, ReleaseRef};
/// A scored recording match with the best matching release.
#[derive(Debug, Clone)]
pub struct ScoredMatch {
/// The candidate recording that was scored.
pub recording: RecordingMatch,
/// Score assigned by `score_match`, capped at 1.0.
pub confidence: f64,
/// Release chosen for the recording. `select_best_match` currently takes the
/// first entry of the candidate's release list, or `None` if the list is empty.
pub best_release: Option<ReleaseRef>,
}
/// Build a search query `(artist, title)` from a track's metadata.
///
/// The artist is the album artist when present, otherwise the track artist.
/// Empty strings are treated as missing, so an empty album artist no longer
/// hides a usable track artist. When the title is missing, both parts are
/// taken from the file name via [`parse_filename`] instead.
///
/// Returns `None` only when there is no title and the file name yields nothing.
/// The artist component may be an empty string.
pub fn build_query(track: &track::Model) -> Option<(String, String)> {
    fn non_empty(s: &&str) -> bool {
        !s.is_empty()
    }

    // Filter each source *before* falling back, so `Some("")` counts as missing.
    let artist = track
        .album_artist
        .as_deref()
        .filter(non_empty)
        .or_else(|| track.artist.as_deref().filter(non_empty));
    let title = track.title.as_deref().filter(non_empty);

    match (artist, title) {
        (Some(a), Some(t)) => Some((a.to_string(), t.to_string())),
        (None, Some(t)) => Some((String::new(), t.to_string())),
        (_, None) => parse_filename(&track.file_path),
    }
}
/// Derive `(artist, title)` from a file name, ignoring directories and extension.
///
/// A stem of the form `Artist - Title` is split at the first `" - "`, provided
/// both halves are non-empty after trimming. Otherwise the whole trimmed stem
/// becomes the title, with an empty artist. Returns `None` when the path has no
/// UTF-8 file stem or the stem is blank.
pub fn parse_filename(file_path: &str) -> Option<(String, String)> {
    let stem = std::path::Path::new(file_path).file_stem()?.to_str()?;

    let split = stem
        .split_once(" - ")
        .map(|(left, right)| (left.trim(), right.trim()))
        .filter(|(left, right)| !left.is_empty() && !right.is_empty());
    if let Some((artist, title)) = split {
        return Some((artist.to_owned(), title.to_owned()));
    }

    // No usable delimiter: the entire stem is the title.
    let whole = stem.trim();
    (!whole.is_empty()).then(|| (String::new(), whole.to_owned()))
}
/// Score a candidate recording against the track's known metadata.
/// Returns a confidence value from 0.0 to 1.0.
pub fn score_match(track: &track::Model, candidate: &RecordingMatch) -> f64 {
let track_title = track
.title
.as_deref()
.map(normalize)
.unwrap_or_default();
let candidate_title = normalize(&candidate.title);
let track_artist = track
.artist
.as_deref()
.or(track.album_artist.as_deref())
.map(normalize)
.unwrap_or_default();
let candidate_artist = normalize(&candidate.artist);
// Title similarity (weighted 0.6)
let title_sim = if track_title.is_empty() || candidate_title.is_empty() {
0.0
} else {
strsim::jaro_winkler(&track_title, &candidate_title)
};
// Artist similarity (weighted 0.4)
let artist_sim = if track_artist.is_empty() || candidate_artist.is_empty() {
0.3 // neutral-ish when we have no artist to compare
} else {
strsim::jaro_winkler(&track_artist, &candidate_artist)
};
let mut score = 0.6 * title_sim + 0.4 * artist_sim;
// Bonus: album name matches a release
if let Some(ref album) = track.album {
let track_album = normalize(album);
if !track_album.is_empty() {
for release in &candidate.releases {
let release_title = normalize(&release.title);
let album_sim = strsim::jaro_winkler(&track_album, &release_title);
if album_sim > 0.85 {
score += 0.05;
break;
}
}
}
}
// Bonus: duration within 3 seconds
if let Some(track_dur) = track.duration {
// MusicBrainz search results don't always include duration,
// but the score from the API itself is a signal
if track_dur > 0.0 && candidate.score > 90 {
score += 0.03;
}
}
score.min(1.0)
}
/// Pick the highest-scoring candidate whose confidence is at least `threshold`.
///
/// Every candidate is scored and logged at debug level. On equal confidence
/// the earlier candidate wins. The chosen match carries the first release
/// listed for that recording, if any. Returns `None` when nothing qualifies.
pub fn select_best_match(
    track: &track::Model,
    candidates: Vec<RecordingMatch>,
    threshold: f64,
) -> Option<ScoredMatch> {
    candidates
        .into_iter()
        .fold(None, |leader: Option<ScoredMatch>, candidate| {
            let confidence = score_match(track, &candidate);
            tracing::debug!(
                title = %candidate.title,
                artist = %candidate.artist,
                confidence = confidence,
                "candidate"
            );

            let qualifies = confidence >= threshold;
            if !qualifies {
                return leader;
            }
            // Strictly greater: a tie keeps the candidate seen first.
            let improves = match &leader {
                Some(current) => confidence > current.confidence,
                None => true,
            };
            if !improves {
                return leader;
            }

            let best_release = candidate.releases.first().cloned();
            Some(ScoredMatch {
                recording: candidate,
                confidence,
                best_release,
            })
        })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a track with the given path, title and artist; every other
    /// optional field is `None`.
    fn bare_track(file_path: &str, title: Option<&str>, artist: Option<&str>) -> track::Model {
        track::Model {
            id: 1,
            file_path: file_path.into(),
            title: title.map(str::to_owned),
            artist: artist.map(str::to_owned),
            album: None,
            album_artist: None,
            track_number: None,
            disc_number: None,
            duration: None,
            genre: None,
            year: None,
            codec: None,
            bitrate: None,
            file_size: 1000,
            fingerprint: None,
            musicbrainz_id: None,
            artist_id: None,
            album_id: None,
            file_mtime: None,
            added_at: chrono::Utc::now().naive_utc(),
            updated_at: chrono::Utc::now().naive_utc(),
        }
    }

    /// Build a search candidate with no releases and no artist MBID.
    fn candidate(title: &str, artist: &str, score: u8) -> RecordingMatch {
        RecordingMatch {
            mbid: "123".into(),
            title: title.into(),
            artist: artist.into(),
            artist_mbid: None,
            releases: vec![],
            score,
        }
    }

    #[test]
    fn test_parse_filename_artist_title() {
        let parsed = parse_filename("/music/Pink Floyd - Time.mp3");
        assert_eq!(parsed, Some(("Pink Floyd".into(), "Time".into())));
    }

    #[test]
    fn test_parse_filename_title_only() {
        let parsed = parse_filename("/music/some_song.mp3");
        assert_eq!(parsed, Some(("".into(), "some_song".into())));
    }

    #[test]
    fn test_parse_filename_nested_path() {
        let parsed = parse_filename("/music/Artist/Album/03 - Track Name.flac");
        // The "03" gets treated as artist since it splits on " - "
        assert_eq!(parsed, Some(("03".into(), "Track Name".into())));
    }

    #[test]
    fn test_build_query_with_metadata() {
        let track = bare_track("/music/test.mp3", Some("Time"), Some("Pink Floyd"));
        assert_eq!(
            build_query(&track),
            Some(("Pink Floyd".into(), "Time".into()))
        );
    }

    #[test]
    fn test_build_query_falls_back_to_filename() {
        let track = bare_track("/music/Radiohead - Creep.mp3", None, None);
        assert_eq!(
            build_query(&track),
            Some(("Radiohead".into(), "Creep".into()))
        );
    }

    #[test]
    fn test_score_match_exact() {
        let track = bare_track("/test.mp3", Some("Time"), Some("Pink Floyd"));
        let score = score_match(&track, &candidate("Time", "Pink Floyd", 100));
        assert!(score > 0.95, "exact match should score > 0.95, got {score}");
    }

    #[test]
    fn test_score_match_fuzzy() {
        let track = bare_track("/test.mp3", Some("Comfortably Numb"), Some("Pink Floyd"));
        // Slight misspelling of the artist on the candidate side.
        let score = score_match(&track, &candidate("Comfortably Numb", "Pink Flloyd", 95));
        assert!(score > 0.85, "fuzzy match should score > 0.85, got {score}");
    }
}

253
src/musicbrainz.rs Normal file
View File

@@ -0,0 +1,253 @@
use serde::Deserialize;
use tokio::sync::Mutex;
use tokio::time::{Duration, Instant};
use crate::cleaning::escape_lucene;
use crate::error::{TagError, TagResult};
use crate::provider::{MetadataProvider, RecordingDetails, RecordingMatch, ReleaseMatch, ReleaseRef};
/// Root of the MusicBrainz web service; endpoint paths are appended to it.
const BASE_URL: &str = "https://musicbrainz.org/ws/2";
/// `User-Agent` header value configured on the HTTP client.
const USER_AGENT: &str = "Shanty/0.1.0 (shanty-music-app)";
/// Minimum spacing enforced between the starts of two consecutive requests.
const RATE_LIMIT: Duration = Duration::from_millis(1100); // slightly over 1s to be safe
/// MusicBrainz API client with rate limiting.
pub struct MusicBrainzClient {
/// HTTP client carrying the user agent and request timeout.
client: reqwest::Client,
/// When the most recent request was allowed to proceed; guarded by an async
/// mutex so the lock can be held while sleeping out the rate-limit interval.
last_request: Mutex<Instant>,
}
impl MusicBrainzClient {
    /// Create a client with the crate's user agent and a 30-second request timeout.
    ///
    /// # Errors
    ///
    /// Returns an error if the underlying HTTP client cannot be built.
    pub fn new() -> TagResult<Self> {
        let client = reqwest::Client::builder()
            .user_agent(USER_AGENT)
            .timeout(Duration::from_secs(30))
            .build()?;
        // Back-date the last request so the first real one is not delayed.
        // `Instant - Duration` panics if the result is not representable, so
        // fall back to "now" (costing one rate-limit interval) in that case.
        let now = Instant::now();
        let last_request = now.checked_sub(RATE_LIMIT).unwrap_or(now);
        Ok(Self {
            client,
            last_request: Mutex::new(last_request),
        })
    }

    /// Enforce rate limiting: wait if needed so we don't exceed 1 req/sec.
    ///
    /// The lock is held while sleeping, so concurrent callers are released one
    /// at a time, each at least `RATE_LIMIT` after the previous.
    async fn rate_limit(&self) {
        let mut last = self.last_request.lock().await;
        let elapsed = last.elapsed();
        if elapsed < RATE_LIMIT {
            tokio::time::sleep(RATE_LIMIT - elapsed).await;
        }
        *last = Instant::now();
    }

    /// Perform a rate-limited GET of `url` and deserialize the JSON body into `T`.
    ///
    /// # Errors
    ///
    /// Returns [`TagError::Http`] on transport or decoding failures, and
    /// [`TagError::Other`] (including status and body) on a non-success status.
    async fn get_json<T: serde::de::DeserializeOwned>(&self, url: &str) -> TagResult<T> {
        self.rate_limit().await;
        tracing::debug!(url = url, "MusicBrainz request");
        let resp = self.client.get(url).send().await?;
        let status = resp.status();
        if !status.is_success() {
            // The body is only used for the error message; ignore read failures.
            let body = resp.text().await.unwrap_or_default();
            return Err(TagError::Other(format!(
                "MusicBrainz API error {status}: {body}"
            )));
        }
        Ok(resp.json().await?)
    }
}
impl MetadataProvider for MusicBrainzClient {
async fn search_recording(
&self,
artist: &str,
title: &str,
) -> TagResult<Vec<RecordingMatch>> {
let query = if artist.is_empty() {
format!("recording:{}", escape_lucene(title))
} else {
format!(
"artist:{} AND recording:{}",
escape_lucene(artist),
escape_lucene(title)
)
};
let url = format!("{BASE_URL}/recording/?query={}&fmt=json&limit=5", urlencoded(&query));
let resp: MbRecordingSearchResponse = self.get_json(&url).await?;
Ok(resp
.recordings
.into_iter()
.map(|r| {
let (artist_name, artist_mbid) = extract_artist_credit(&r.artist_credit);
RecordingMatch {
mbid: r.id,
title: r.title,
artist: artist_name,
artist_mbid,
releases: r
.releases
.unwrap_or_default()
.into_iter()
.map(|rel| ReleaseRef {
mbid: rel.id,
title: rel.title,
date: rel.date,
track_number: None,
})
.collect(),
score: r.score.unwrap_or(0),
}
})
.collect())
}
async fn search_release(
&self,
artist: &str,
album: &str,
) -> TagResult<Vec<ReleaseMatch>> {
let query = format!(
"artist:{} AND release:{}",
escape_lucene(artist),
escape_lucene(album)
);
let url = format!("{BASE_URL}/release/?query={}&fmt=json&limit=5", urlencoded(&query));
let resp: MbReleaseSearchResponse = self.get_json(&url).await?;
Ok(resp
.releases
.into_iter()
.map(|r| {
let (artist_name, artist_mbid) = extract_artist_credit(&r.artist_credit);
ReleaseMatch {
mbid: r.id,
title: r.title,
artist: artist_name,
artist_mbid,
date: r.date,
track_count: r.track_count,
score: r.score.unwrap_or(0),
}
})
.collect())
}
async fn get_recording(&self, mbid: &str) -> TagResult<RecordingDetails> {
let url = format!(
"{BASE_URL}/recording/{mbid}?inc=artists+releases+genres&fmt=json"
);
let r: MbRecordingDetail = self.get_json(&url).await?;
let (artist_name, artist_mbid) = extract_artist_credit(&r.artist_credit);
Ok(RecordingDetails {
mbid: r.id,
title: r.title,
artist: artist_name,
artist_mbid,
releases: r
.releases
.unwrap_or_default()
.into_iter()
.map(|rel| ReleaseRef {
mbid: rel.id,
title: rel.title,
date: rel.date,
track_number: None,
})
.collect(),
duration_ms: r.length,
genres: r
.genres
.unwrap_or_default()
.into_iter()
.map(|g| g.name)
.collect(),
})
}
}
/// Flatten an artist-credit list into a display name and the first artist's MBID.
///
/// The name is each artist's name followed by its join phrase (when present),
/// concatenated in order. A missing or empty list yields
/// `("Unknown Artist", None)`.
fn extract_artist_credit(credits: &Option<Vec<MbArtistCredit>>) -> (String, Option<String>) {
    let list = credits.as_deref().unwrap_or_default();
    let Some(first) = list.first() else {
        return ("Unknown Artist".to_string(), None);
    };

    let mut display_name = String::new();
    for credit in list {
        display_name.push_str(&credit.artist.name);
        if let Some(join) = credit.joinphrase.as_deref() {
            display_name.push_str(join);
        }
    }
    (display_name, Some(first.artist.id.clone()))
}
/// Encode `s` for use as a URL query-parameter value.
///
/// Spaces become `+`; ASCII letters, digits and `-`, `.`, `_`, `~` are kept;
/// every other byte of the UTF-8 encoding is percent-encoded. In particular a
/// literal `+` is emitted as `%2B` (so it is not decoded as a space) and `%`
/// as `%25` (so it cannot form a bogus escape sequence).
fn urlencoded(s: &str) -> String {
    const HEX: &[u8; 16] = b"0123456789ABCDEF";

    let mut out = String::with_capacity(s.len());
    for &byte in s.as_bytes() {
        match byte {
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~' => {
                out.push(char::from(byte));
            }
            b' ' => out.push('+'),
            _ => {
                out.push('%');
                out.push(char::from(HEX[usize::from(byte >> 4)]));
                out.push(char::from(HEX[usize::from(byte & 0x0F)]));
            }
        }
    }
    out
}
// --- MusicBrainz API response types ---
/// Top-level body of a recording search response.
#[derive(Deserialize)]
struct MbRecordingSearchResponse {
recordings: Vec<MbRecordingResult>,
}
/// One recording entry in a search response.
#[derive(Deserialize)]
struct MbRecordingResult {
id: String,
title: String,
// Optional in the response; callers substitute 0 when it is absent.
score: Option<u8>,
#[serde(rename = "artist-credit")]
artist_credit: Option<Vec<MbArtistCredit>>,
releases: Option<Vec<MbReleaseResult>>,
}
/// Top-level body of a release search response.
#[derive(Deserialize)]
struct MbReleaseSearchResponse {
releases: Vec<MbReleaseResult>,
}
/// A release entry; used both for release search results and for the release
/// lists nested inside recordings.
#[derive(Deserialize)]
struct MbReleaseResult {
id: String,
title: String,
score: Option<u8>,
#[serde(rename = "artist-credit")]
artist_credit: Option<Vec<MbArtistCredit>>,
// Kept as the raw string from the API; not parsed here.
date: Option<String>,
#[serde(rename = "track-count")]
track_count: Option<i32>,
}
/// Body of a recording lookup by MBID.
#[derive(Deserialize)]
struct MbRecordingDetail {
id: String,
title: String,
#[serde(rename = "artist-credit")]
artist_credit: Option<Vec<MbArtistCredit>>,
releases: Option<Vec<MbReleaseResult>>,
// Exposed to callers as `duration_ms`; presumably milliseconds (confirm against the API docs).
length: Option<u64>,
genres: Option<Vec<MbGenre>>,
}
/// One element of an artist credit: an artist plus the optional text that
/// joins it to the next credited artist.
#[derive(Deserialize)]
struct MbArtistCredit {
artist: MbArtist,
joinphrase: Option<String>,
}
/// Artist identity within an artist credit.
#[derive(Deserialize)]
struct MbArtist {
id: String,
name: String,
}
/// A genre attached to a recording; only the name is used.
#[derive(Deserialize)]
struct MbGenre {
name: String,
}

69
src/provider.rs Normal file
View File

@@ -0,0 +1,69 @@
use serde::{Deserialize, Serialize};
use crate::error::TagResult;
/// A reference to a release (album) that a recording appears on.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReleaseRef {
/// Provider identifier of the release.
pub mbid: String,
/// Release title.
pub title: String,
/// Release date as provided; the tagger reads the year from the text before the first `-`.
pub date: Option<String>,
/// Position of the recording on the release, when the provider supplies it.
pub track_number: Option<i32>,
}
/// A recording match from a search query.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecordingMatch {
/// Provider identifier of the recording.
pub mbid: String,
/// Recording title.
pub title: String,
/// Display name of the credited artist(s).
pub artist: String,
/// Provider identifier of the primary artist, if known.
pub artist_mbid: Option<String>,
/// Releases the recording appears on; may be empty.
pub releases: Vec<ReleaseRef>,
/// MusicBrainz API score (0-100).
pub score: u8,
}
/// A release (album) match from a search query.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReleaseMatch {
/// Provider identifier of the release.
pub mbid: String,
/// Release title.
pub title: String,
/// Display name of the credited artist(s).
pub artist: String,
/// Provider identifier of the primary artist, if known.
pub artist_mbid: Option<String>,
/// Release date as provided.
pub date: Option<String>,
/// Number of tracks on the release, if reported.
pub track_count: Option<i32>,
/// Provider search score; presumably 0-100 like `RecordingMatch::score` (confirm).
pub score: u8,
}
/// Full details for a recording, retrieved by MBID.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RecordingDetails {
/// Provider identifier of the recording.
pub mbid: String,
/// Recording title.
pub title: String,
/// Display name of the credited artist(s).
pub artist: String,
/// Provider identifier of the primary artist, if known.
pub artist_mbid: Option<String>,
/// Releases the recording appears on; may be empty.
pub releases: Vec<ReleaseRef>,
/// Recording length; milliseconds going by the field name.
pub duration_ms: Option<u64>,
/// Genre names; the tagger uses the first entry.
pub genres: Vec<String>,
}
/// Trait for metadata lookup backends. MusicBrainz is the default implementation;
/// others (Last.fm, Discogs, etc.) can be added later.
///
/// Methods return `impl Future + Send` rather than being declared `async fn`,
/// so the returned futures are required to be `Send`. Implementations may still
/// write the methods as `async fn`.
pub trait MetadataProvider: Send + Sync {
/// Search for recordings matching `artist` and `title`. `artist` may be empty.
fn search_recording(
&self,
artist: &str,
title: &str,
) -> impl std::future::Future<Output = TagResult<Vec<RecordingMatch>>> + Send;
/// Search for releases (albums) matching `artist` and `album`.
fn search_release(
&self,
artist: &str,
album: &str,
) -> impl std::future::Future<Output = TagResult<Vec<ReleaseMatch>>> + Send;
/// Fetch full details for the recording identified by `mbid`.
fn get_recording(
&self,
mbid: &str,
) -> impl std::future::Future<Output = TagResult<RecordingDetails>> + Send;
}

227
src/tagger.rs Normal file
View File

@@ -0,0 +1,227 @@
use std::fmt;
use sea_orm::{ActiveValue::Set, DatabaseConnection, NotSet};
use shanty_db::entities::track;
use shanty_db::queries;
use crate::error::TagResult;
use crate::file_tags;
use crate::matcher::{self, ScoredMatch};
use crate::provider::MetadataProvider;
/// Configuration for a tagging operation.
pub struct TagConfig {
/// If true, show what would change without writing to DB or files.
/// In this mode the tagger returns right after a match is selected, before
/// fetching full recording details.
pub dry_run: bool,
/// If true, write updated tags back to the music files.
/// A failure to write file tags is logged as a warning and does not fail the track.
pub write_tags: bool,
/// Minimum match confidence (0.0 - 1.0).
/// A candidate is accepted when its score is greater than or equal to this value.
pub confidence: f64,
}
/// Statistics from a completed tagging run.
#[derive(Debug, Default, Clone)]
pub struct TagStats {
/// Tracks attempted, whatever the outcome.
pub tracks_processed: u64,
/// Tracks for which an acceptable match was found (counted in dry runs too).
pub tracks_matched: u64,
/// Matched tracks whose database row was updated; stays 0 in a dry run.
pub tracks_updated: u64,
/// Tracks with no usable query, no search results, or no match above the threshold.
pub tracks_skipped: u64,
/// Tracks whose processing returned an error.
pub tracks_errored: u64,
}
/// One-line, human-readable summary of the counters.
impl fmt::Display for TagStats {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Destructure so a newly added field causes a compile error here.
        let Self {
            tracks_processed,
            tracks_matched,
            tracks_updated,
            tracks_skipped,
            tracks_errored,
        } = self;
        write!(
            f,
            "processed: {tracks_processed}, matched: {tracks_matched}, updated: {tracks_updated}, skipped: {tracks_skipped}, errors: {tracks_errored}"
        )
    }
}
/// Tag a single track.
///
/// Builds a query from the track, searches the provider, selects the best
/// candidate at or above `config.confidence`, and, unless `config.dry_run` is
/// set, fetches full details, upserts the artist and album, updates the track
/// row, and optionally writes the tags into the file.
///
/// Album, year and genre that the provider cannot supply fall back to the
/// track's existing values rather than being cleared.
///
/// Returns `Ok(true)` if a match was found (and, outside dry-run mode, the
/// track was updated). Returns `Ok(false)` if the track was skipped: no usable
/// query, no results, or no candidate above the threshold.
///
/// # Errors
///
/// Propagates provider and database errors. A failure to write file tags is
/// only logged as a warning.
pub async fn tag_track(
    conn: &DatabaseConnection,
    provider: &impl MetadataProvider,
    track: &track::Model,
    config: &TagConfig,
) -> TagResult<bool> {
    // Build search query
    let Some((artist, title)) = matcher::build_query(track) else {
        tracing::debug!(id = track.id, path = %track.file_path, "no query possible, skipping");
        return Ok(false);
    };
    tracing::info!(
        id = track.id,
        artist = %artist,
        title = %title,
        "searching MusicBrainz"
    );

    // Search for recordings
    let candidates = provider.search_recording(&artist, &title).await?;
    if candidates.is_empty() {
        tracing::debug!(id = track.id, "no results from MusicBrainz");
        return Ok(false);
    }

    // Score and select best match
    let Some(best) = matcher::select_best_match(track, candidates, config.confidence) else {
        tracing::debug!(
            id = track.id,
            "no match above confidence threshold {}",
            config.confidence
        );
        return Ok(false);
    };
    log_match(track, &best);
    if config.dry_run {
        return Ok(true);
    }

    // Get full details for the best match
    let details = provider.get_recording(&best.recording.mbid).await?;

    // Upsert artist, attaching the MusicBrainz ID when we have one
    let artist_id = match &details.artist_mbid {
        Some(mbid) => {
            Some(queries::artists::upsert(conn, &details.artist, Some(mbid)).await?.id)
        }
        None => {
            Some(queries::artists::upsert(conn, &details.artist, None).await?.id)
        }
    };

    // Upsert album from best release
    let (album_id, album_name) = if let Some(ref release) = best.best_release {
        let album = queries::albums::upsert(
            conn,
            &release.title,
            &details.artist,
            Some(&release.mbid),
            artist_id,
        )
        .await?;
        (Some(album.id), Some(release.title.clone()))
    } else {
        // The provider listed no release: keep whatever album the track
        // already has instead of erasing it.
        (track.album_id, track.album.clone())
    };

    // Year is the text before the first '-' of the release date. Fall back to
    // the existing year when there is no date or it does not parse.
    let year = best
        .best_release
        .as_ref()
        .and_then(|r| r.date.as_deref())
        .and_then(|d| d.split('-').next())
        .and_then(|y| y.parse::<i32>().ok())
        .or(track.year);

    // First genre reported by the provider, else the existing genre.
    let genre = details
        .genres
        .first()
        .cloned()
        .or_else(|| track.genre.clone());

    // Update track metadata
    let active = track::ActiveModel {
        id: Set(track.id),
        file_path: Set(track.file_path.clone()),
        title: Set(Some(details.title.clone())),
        artist: Set(Some(details.artist.clone())),
        album: Set(album_name),
        album_artist: Set(Some(details.artist.clone())),
        musicbrainz_id: Set(Some(details.mbid.clone())),
        artist_id: Set(artist_id),
        album_id: Set(album_id),
        year: Set(year),
        genre: Set(genre.clone()),
        // Preserve existing values for fields we don't update
        track_number: NotSet,
        disc_number: NotSet,
        duration: NotSet,
        codec: NotSet,
        bitrate: NotSet,
        file_size: NotSet,
        fingerprint: NotSet,
        file_mtime: NotSet,
        added_at: NotSet,
        updated_at: NotSet,
    };
    queries::tracks::update_metadata(conn, track.id, active).await?;

    // Optionally write tags to file. Best effort: the database update above
    // has already succeeded, so a file error is only logged.
    if config.write_tags {
        if let Err(e) = file_tags::write_tags(
            &track.file_path,
            &details,
            best.best_release.as_ref(),
            year,
            genre.as_deref(),
        ) {
            tracing::warn!(id = track.id, path = %track.file_path, "failed to write file tags: {e}");
        }
    }
    Ok(true)
}
/// Emit an info-level log line describing the match selected for `track`.
fn log_match(track: &track::Model, best: &ScoredMatch) {
    let release_title = best
        .best_release
        .as_ref()
        .map(|r| r.title.as_str())
        .unwrap_or("(none)");
    let recording = &best.recording;
    tracing::info!(
        id = track.id,
        confidence = format!("{:.2}", best.confidence),
        matched_title = %recording.title,
        matched_artist = %recording.artist,
        release = release_title,
        "match found"
    );
}
/// Tag either the single track `track_id` or, when it is `None`, every
/// untagged track in the database.
///
/// Per-track failures are logged and counted but do not abort the run. Only
/// the initial track lookup can make this function return an error.
pub async fn run_tagging(
    conn: &DatabaseConnection,
    provider: &impl MetadataProvider,
    config: &TagConfig,
    track_id: Option<i32>,
) -> TagResult<TagStats> {
    let tracks: Vec<track::Model> = match track_id {
        Some(id) => vec![queries::tracks::get_by_id(conn, id).await?],
        None => queries::tracks::get_untagged(conn).await?,
    };
    tracing::info!(count = tracks.len(), "tracks to process");

    let mut stats = TagStats::default();
    for track in &tracks {
        stats.tracks_processed += 1;
        let outcome = tag_track(conn, provider, track, config).await;
        match outcome {
            Err(e) => {
                tracing::error!(id = track.id, path = %track.file_path, "tagging error: {e}");
                stats.tracks_errored += 1;
            }
            Ok(false) => stats.tracks_skipped += 1,
            Ok(true) => {
                stats.tracks_matched += 1;
                // A dry run finds matches but writes nothing.
                if !config.dry_run {
                    stats.tracks_updated += 1;
                }
            }
        }
    }

    tracing::info!(%stats, "tagging complete");
    Ok(stats)
}

176
tests/integration.rs Normal file
View File

@@ -0,0 +1,176 @@
use chrono::Utc;
use sea_orm::ActiveValue::Set;
use shanty_db::{Database, queries};
use shanty_tag::provider::{MetadataProvider, RecordingDetails, RecordingMatch, ReleaseMatch, ReleaseRef};
use shanty_tag::error::TagResult;
use shanty_tag::{TagConfig, run_tagging};
/// A mock metadata provider for testing without hitting MusicBrainz.
struct MockProvider;
impl MetadataProvider for MockProvider {
async fn search_recording(&self, artist: &str, title: &str) -> TagResult<Vec<RecordingMatch>> {
// Return a match for "Pink Floyd - Time"
if artist.contains("Pink Floyd") && title.contains("Time") {
Ok(vec![RecordingMatch {
mbid: "rec-123".into(),
title: "Time".into(),
artist: "Pink Floyd".into(),
artist_mbid: Some("artist-456".into()),
releases: vec![ReleaseRef {
mbid: "release-789".into(),
title: "The Dark Side of the Moon".into(),
date: Some("1973-03-01".into()),
track_number: Some(4),
}],
score: 100,
}])
} else {
Ok(vec![])
}
}
async fn search_release(&self, _artist: &str, _album: &str) -> TagResult<Vec<ReleaseMatch>> {
Ok(vec![])
}
async fn get_recording(&self, mbid: &str) -> TagResult<RecordingDetails> {
if mbid == "rec-123" {
Ok(RecordingDetails {
mbid: "rec-123".into(),
title: "Time".into(),
artist: "Pink Floyd".into(),
artist_mbid: Some("artist-456".into()),
releases: vec![ReleaseRef {
mbid: "release-789".into(),
title: "The Dark Side of the Moon".into(),
date: Some("1973-03-01".into()),
track_number: Some(4),
}],
duration_ms: Some(413_000),
genres: vec!["Progressive Rock".into()],
})
} else {
Err(shanty_tag::TagError::Other("not found".into()))
}
}
}
/// Open a fresh in-memory SQLite database, panicking if that fails.
async fn test_db() -> Database {
    let opened = Database::new("sqlite::memory:").await;
    opened.expect("failed to create test database")
}
/// Insert a track with only path, title, artist, size and timestamps set,
/// and return the ID of the stored row.
async fn insert_untagged_track(
    db: &Database,
    file_path: &str,
    title: Option<&str>,
    artist: Option<&str>,
) -> i32 {
    let timestamp = Utc::now().naive_utc();
    let row = shanty_db::entities::track::ActiveModel {
        file_path: Set(file_path.to_string()),
        title: Set(title.map(String::from)),
        artist: Set(artist.map(String::from)),
        file_size: Set(1_000_000),
        added_at: Set(timestamp),
        updated_at: Set(timestamp),
        ..Default::default()
    };
    queries::tracks::upsert(db.conn(), row).await.unwrap().id
}
/// A track the mock provider recognises is matched, and its row plus the
/// artist record are updated with the provider's data.
#[tokio::test]
async fn test_tag_track_with_match() {
let db = test_db().await;
let provider = MockProvider;
let track_id = insert_untagged_track(&db, "/music/time.mp3", Some("Time"), Some("Pink Floyd")).await;
let config = TagConfig {
dry_run: false,
write_tags: false,
confidence: 0.8,
};
let stats = run_tagging(db.conn(), &provider, &config, Some(track_id)).await.unwrap();
assert_eq!(stats.tracks_processed, 1);
assert_eq!(stats.tracks_matched, 1);
assert_eq!(stats.tracks_updated, 1);
// Verify the track was updated
let track = queries::tracks::get_by_id(db.conn(), track_id).await.unwrap();
assert_eq!(track.musicbrainz_id.as_deref(), Some("rec-123"));
assert_eq!(track.title.as_deref(), Some("Time"));
assert_eq!(track.artist.as_deref(), Some("Pink Floyd"));
assert_eq!(track.album.as_deref(), Some("The Dark Side of the Moon"));
// Year comes from the leading component of the release date "1973-03-01".
assert_eq!(track.year, Some(1973));
assert_eq!(track.genre.as_deref(), Some("Progressive Rock"));
// Verify artist was created with MusicBrainz ID
let artist = queries::artists::find_by_name(db.conn(), "Pink Floyd").await.unwrap();
assert!(artist.is_some());
assert_eq!(artist.unwrap().musicbrainz_id.as_deref(), Some("artist-456"));
}
/// A track the mock provider returns no results for is counted as skipped
/// and left untouched.
#[tokio::test]
async fn test_tag_track_no_match() {
let db = test_db().await;
let provider = MockProvider;
let track_id = insert_untagged_track(&db, "/music/unknown.mp3", Some("Unknown Song"), Some("Nobody")).await;
let config = TagConfig {
dry_run: false,
write_tags: false,
confidence: 0.8,
};
let stats = run_tagging(db.conn(), &provider, &config, Some(track_id)).await.unwrap();
assert_eq!(stats.tracks_processed, 1);
assert_eq!(stats.tracks_skipped, 1);
// Track should be unchanged
let track = queries::tracks::get_by_id(db.conn(), track_id).await.unwrap();
assert!(track.musicbrainz_id.is_none());
}
/// In dry-run mode a match is still counted, but nothing is written to the database.
#[tokio::test]
async fn test_dry_run_does_not_update() {
let db = test_db().await;
let provider = MockProvider;
let track_id = insert_untagged_track(&db, "/music/time.mp3", Some("Time"), Some("Pink Floyd")).await;
let config = TagConfig {
dry_run: true,
write_tags: false,
confidence: 0.8,
};
let stats = run_tagging(db.conn(), &provider, &config, Some(track_id)).await.unwrap();
assert_eq!(stats.tracks_matched, 1);
assert_eq!(stats.tracks_updated, 0); // dry run
// Track should be unchanged
let track = queries::tracks::get_by_id(db.conn(), track_id).await.unwrap();
assert!(track.musicbrainz_id.is_none());
}
/// With no track ID given, every untagged track is processed: one matches and one is skipped.
#[tokio::test]
async fn test_tag_all_untagged() {
let db = test_db().await;
let provider = MockProvider;
insert_untagged_track(&db, "/music/time.mp3", Some("Time"), Some("Pink Floyd")).await;
insert_untagged_track(&db, "/music/unknown.mp3", Some("Unknown"), Some("Nobody")).await;
let config = TagConfig {
dry_run: false,
write_tags: false,
confidence: 0.8,
};
// `None` selects all untagged tracks rather than a single ID.
let stats = run_tagging(db.conn(), &provider, &config, None).await.unwrap();
assert_eq!(stats.tracks_processed, 2);
assert_eq!(stats.tracks_matched, 1); // only Pink Floyd matched
assert_eq!(stats.tracks_skipped, 1);
}