From 9c59cf73e7e6631d9c33004fc0ff8fb957f673b9 Mon Sep 17 00:00:00 2001
From: Connor Johnstone <connor.johnstone@arcfield.com>
Date: Tue, 17 Mar 2026 15:01:19 -0400
Subject: [PATCH] Initial commit

---
 .gitignore           |   4 +
 Cargo.toml           |  30 +++++
 readme.md            |  22 ++++
 src/cleaning.rs      | 110 ++++++++++++++++
 src/error.rs         |  30 +++++
 src/file_tags.rs     |  70 ++++++++++
 src/lib.rs           |  18 +++
 src/main.rs          |  95 ++++++++++++++
 src/matcher.rs       | 305 +++++++++++++++++++++++++++++++++++++++++++
 src/musicbrainz.rs   | 253 +++++++++++++++++++++++++++++++++++
 src/provider.rs      |  69 ++++++++++
 src/tagger.rs        | 227 ++++++++++++++++++++++++++++++++
 tests/integration.rs | 176 +++++++++++++++++++++++++
 13 files changed, 1409 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.toml
 create mode 100644 readme.md
 create mode 100644 src/cleaning.rs
 create mode 100644 src/error.rs
 create mode 100644 src/file_tags.rs
 create mode 100644 src/lib.rs
 create mode 100644 src/main.rs
 create mode 100644 src/matcher.rs
 create mode 100644 src/musicbrainz.rs
 create mode 100644 src/provider.rs
 create mode 100644 src/tagger.rs
 create mode 100644 tests/integration.rs
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..360fdc9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+target/
+.env
+*.db
+*.db-journal
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..7ba67c1
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+name = "shanty-tag"
+version = "0.1.0"
+edition = "2024"
+license = "MIT"
+description = "Metadata tagging via online databases for Shanty"
+repository = "ssh://connor@git.rcjohnstone.com:2222/Shanty/tag.git"
+
+[dependencies]
+shanty-db = { path = "../shanty-db" }
+sea-orm = { version = "1", features = ["sqlx-sqlite", "runtime-tokio-native-tls"] }
+clap = { version = "4", features = ["derive"] }
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
+thiserror = "2"
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
+tokio = { version = "1", features = ["full"] }
+anyhow = "1"
+reqwest = { version = "0.12", features = ["json"] }
+strsim = "0.11"
+unicode-normalization = "0.1"
+lofty = "0.22"
+chrono = { version = "0.4", features = ["serde"] }
+dirs = "6"
+regex = "1"
+
+[dev-dependencies]
+tokio = { version = "1", features = ["full", "test-util"] }
+tempfile = "3"
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..9ea6ca8
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,22 @@
+# shanty-tag
+
+Metadata tagging via online databases for [Shanty](ssh://connor@git.rcjohnstone.com:2222/Shanty/shanty.git).
+
+Queries MusicBrainz to fill in missing metadata on indexed music files. Fuzzy matching
+tolerates minor spelling differences; matches below a configurable confidence threshold are rejected.
+
+## Usage
+
+```sh
+# Tag all untagged tracks (dry run)
+shanty-tag --all --dry-run -vv
+
+# Tag all untagged tracks for real
+shanty-tag --all
+
+# Tag a specific track and write tags back to the file
+shanty-tag --track 42 --write-tags
+
+# Custom confidence threshold
+shanty-tag --all --confidence 0.9
+```
diff --git a/src/cleaning.rs b/src/cleaning.rs
new file mode 100644
index 0000000..5831b9b
--- /dev/null
+++ b/src/cleaning.rs
@@ -0,0 +1,110 @@
+use std::sync::LazyLock;
+
+use regex::Regex;
+use unicode_normalization::UnicodeNormalization;
+
+static STRIP_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
+    vec![ // no case-insensitive flag is set, so these only match lowercase text
+        // "(official X)" where X is video, audio, music video, lyric video or visualizer
+        Regex::new(r"\(official\s*(video|audio|music\s*video|lyric\s*video|visualizer)\)").unwrap(),
+        // (remaster) or (remastered), optionally followed by a four-digit year
+        Regex::new(r"\(remaster(ed)?\s*(\d{4})?\)").unwrap(),
+        // [live], [bonus], [bonus track], [deluxe], [explicit], [clean]
+        Regex::new(r"\[(live|bonus(\s*track)?|deluxe|explicit|clean)\]").unwrap(),
+        // (feat. artist) / (ft. artist) in round or square brackets; the dot is optional
+        Regex::new(r"[\(\[](feat\.?|ft\.?)\s+[^\)\]]+[\)\]]").unwrap(),
+        // (with artist), round brackets only
+        Regex::new(r"\(with\s+[^)]+\)").unwrap(),
+        // "- single" or "- ep" at the very end (spaces around the dash are optional)
+        Regex::new(r"\s*-\s*(single|ep)\s*$").unwrap(),
+    ]
+});
+
+/// Canonicalize a title or artist string so two spellings can be compared fuzzily.
+///
+/// In order: Unicode NFC composition, lowercasing, removal of every
+/// `STRIP_PATTERNS` match, then trimming of outer whitespace and ASCII punctuation.
+pub fn normalize(s: &str) -> String {
+    // Compose to NFC first so equivalent code-point sequences agree
+    let composed = s.nfc().collect::<String>();
+
+    // Case-fold; the strip patterns are written in lowercase
+    let lowered = composed.to_lowercase();
+
+    // Apply each noise pattern in declaration order
+    let stripped = STRIP_PATTERNS
+        .iter()
+        .fold(lowered, |acc, re| re.replace_all(&acc, "").into_owned());
+
+    // Only the outer edges are trimmed; interior characters are left alone
+    let is_edge_noise = |c: char| c.is_ascii_punctuation() || c.is_whitespace();
+    let trimmed = stripped.trim().trim_matches(is_edge_noise);
+    trimmed.to_string()
+}
+
+/// Backslash-escape every character of `s` that Lucene query syntax treats
+/// as special, for use in MusicBrainz search queries.
+pub fn escape_lucene(s: &str) -> String {
+    // Characters with special meaning in Lucene query syntax.
+    const SPECIAL: &str = "+-&|!(){}[]^\"~*?:\\/";
+
+    s.chars()
+        .fold(String::with_capacity(s.len()), |mut escaped, ch| {
+            if SPECIAL.contains(ch) {
+                escaped.push('\\');
+            }
+            escaped.push(ch);
+            escaped
+        })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_normalize_basic() {
+        assert_eq!(normalize("Hello World"), "hello world");
+        assert_eq!(normalize("  spaces  "), "spaces");
+    }
+
+    #[test]
+    fn test_normalize_strips_official_video() {
+        assert_eq!(normalize("Time (Official Video)"), "time");
+        assert_eq!(normalize("Money (Official Music Video)"), "money");
+        assert_eq!(normalize("Comfortably Numb (Official Audio)"), "comfortably numb");
+    }
+
+    #[test]
+    fn test_normalize_strips_remastered() {
+        assert_eq!(normalize("Time (Remastered 2011)"), "time");
+        assert_eq!(normalize("Money (Remastered)"), "money");
+        assert_eq!(normalize("Shine On (Remaster)"), "shine on");
+    }
+
+    #[test]
+    fn test_normalize_strips_feat() {
+        assert_eq!(normalize("Song (feat. Artist)"), "song");
+        assert_eq!(normalize("Song [ft. Someone]"), "song");
+    }
+
+    #[test]
+    fn test_normalize_strips_brackets() {
+        assert_eq!(normalize("Song [Live]"), "song");
+        assert_eq!(normalize("Song [Bonus Track]"), "song");
+        assert_eq!(normalize("Song [Explicit]"), "song");
+    }
+
+    #[test]
+    fn test_normalize_unicode() {
+        // NFC normalization — decomposed é should become composed é
+        assert_eq!(normalize("café"), normalize("café"));
+    }
+
+    #[test]
+    fn test_escape_lucene() {
+        assert_eq!(escape_lucene("AC/DC"), r"AC\/DC");
+        assert_eq!(escape_lucene("test (hello)"), r"test \(hello\)");
+        assert_eq!(escape_lucene("simple"), "simple");
+    }
+}
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..7feb55c
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,30 @@
+use shanty_db::DbError;
+
+#[derive(Debug, thiserror::Error)]
+pub enum TagError { // crate-wide error type; `TagResult<T>` aliases `Result<T, TagError>`
+    #[error("database error: {0}")]
+    Db(#[from] DbError),
+    /// Wraps a `std::io::Error` (converted automatically via `#[from]`).
+    #[error("I/O error: {0}")]
+    Io(#[from] std::io::Error),
+    /// Wraps a `reqwest::Error` (converted automatically via `#[from]`).
+    #[error("HTTP error: {0}")]
+    Http(#[from] reqwest::Error),
+    /// Tag read/write failure as text; `lofty` errors are converted into this variant.
+    #[error("metadata error: {0}")]
+    Metadata(String),
+    /// No match for a track; the payload is presumably the track's database ID (confirm at call site).
+    #[error("no match found for track {0}")]
+    NoMatch(i32),
+    /// Any other failure, carried as its message.
+    #[error("{0}")]
+    Other(String),
+}
+
+impl From<lofty::error::LoftyError> for TagError {
+    fn from(e: lofty::error::LoftyError) -> Self {
+        TagError::Metadata(e.to_string()) // only the message text is kept, not the error value
+    }
+}
+
+pub type TagResult<T> = Result<T, TagError>; // shorthand for results whose error is `TagError`
diff --git a/src/file_tags.rs b/src/file_tags.rs
new file mode 100644
index 0000000..3a9d3cb
--- /dev/null
+++ b/src/file_tags.rs
@@ -0,0 +1,70 @@
+use std::path::Path;
+
+use lofty::config::{ParseOptions, WriteOptions};
+use lofty::file::{FileType, TaggedFileExt};
+use lofty::probe::Probe;
+use lofty::tag::{Accessor, TagExt, TagType};
+
+use crate::error::TagResult;
+use crate::provider::{RecordingDetails, ReleaseRef};
+
+/// Infer the tag type to create for a file type when the file has no primary tag.
+fn tag_type_for_file(ft: FileType) -> TagType {
+    match ft {
+        FileType::Mpeg => TagType::Id3v2,
+        FileType::Flac | FileType::Vorbis | FileType::Opus | FileType::Speex => {
+            TagType::VorbisComments
+        }
+        FileType::Mp4 => TagType::Mp4Ilst,
+        FileType::Ape | FileType::Mpc | FileType::WavPack => TagType::Ape, // APE-tagged formats
+        _ => TagType::Id3v2,
+    }
+}
+
+/// Write `details` plus the optional album, year and genre into the file's embedded tag.
+pub fn write_tags(
+    file_path: &str,
+    details: &RecordingDetails,
+    release: Option<&ReleaseRef>,
+    year: Option<i32>,
+    genre: Option<&str>,
+) -> TagResult<()> {
+    let path = Path::new(file_path);
+
+    let tagged_file = Probe::open(path)?
+        .options(ParseOptions::default())
+        .read()?;
+
+    // Start from a copy of the existing primary tag; create an empty one only if absent
+    let tag_type = tagged_file
+        .primary_tag()
+        .map(|t| t.tag_type())
+        .unwrap_or_else(|| tag_type_for_file(tagged_file.file_type()));
+
+    let mut tag = tagged_file
+        .primary_tag()
+        .cloned()
+        .unwrap_or_else(|| lofty::tag::Tag::new(tag_type));
+
+    // Title and artist are always overwritten; the rest only when provided
+    tag.set_title(details.title.clone());
+    tag.set_artist(details.artist.clone());
+
+    if let Some(release) = release {
+        tag.set_album(release.title.clone());
+    }
+    // A year that does not fit in `u32` (negative) is skipped instead of wrapping around
+    if let Some(y) = year.and_then(|y| u32::try_from(y).ok()) {
+        tag.set_year(y);
+    }
+
+    if let Some(g) = genre {
+        tag.set_genre(g.to_string());
+    }
+
+    // Write to file
+    tag.save_to_path(path, WriteOptions::default())?;
+
+    tracing::info!(path = file_path, "wrote tags to file");
+    Ok(())
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..5c734b3
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,18 @@
+//! Metadata tagging via online databases for Shanty.
+//!
+//! Fills in missing or incorrect metadata on music files by querying online
+//! databases such as MusicBrainz. Supports fuzzy matching and configurable
+//! confidence thresholds.
+
+pub mod cleaning;
+pub mod error;
+pub mod file_tags;
+pub mod matcher;
+pub mod musicbrainz;
+pub mod provider;
+pub mod tagger;
+
+pub use error::{TagError, TagResult};
+pub use musicbrainz::MusicBrainzClient;
+pub use provider::MetadataProvider;
+pub use tagger::{TagConfig, TagStats, run_tagging};
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..9f4c816
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,95 @@
+use std::path::PathBuf;
+
+use clap::Parser;
+use tracing_subscriber::EnvFilter;
+
+use shanty_db::Database;
+use shanty_tag::{MusicBrainzClient, TagConfig, run_tagging};
+
+#[derive(Parser)]
+#[command(name = "shanty-tag", about = "Fill in missing metadata on music files via MusicBrainz")]
+struct Cli {
+    /// Database URL. Defaults to sqlite://<XDG_DATA_HOME>/shanty/shanty.db?mode=rwc
+    #[arg(long, env = "SHANTY_DATABASE_URL")]
+    database: Option<String>,
+
+    /// Tag all untagged tracks in the database.
+    #[arg(long)]
+    all: bool, // main() bails out unless this or --track is given
+
+    /// Tag a specific track by its database ID.
+    #[arg(long)]
+    track: Option<i32>,
+
+    /// Preview matches without writing to DB or files.
+    #[arg(long)]
+    dry_run: bool,
+
+    /// Write updated tags back to music files.
+    #[arg(long)]
+    write_tags: bool,
+
+    /// Minimum match confidence (0.0 - 1.0).
+    #[arg(long, default_value = "0.8")]
+    confidence: f64, // NOTE(review): no check in this file enforces the 0.0 - 1.0 range
+
+    /// Increase verbosity (-v info, -vv debug, -vvv trace).
+    #[arg(short, long, action = clap::ArgAction::Count)]
+    verbose: u8,
+}
+
+fn default_database_url() -> String {
+    // Fall back to the working directory when no per-user data dir is known
+    let base = dirs::data_dir().unwrap_or_else(|| PathBuf::from("."));
+    let shanty_dir = base.join("shanty");
+    // Best effort: a creation failure is ignored here
+    let _ = std::fs::create_dir_all(&shanty_dir);
+    format!("sqlite://{}?mode=rwc", shanty_dir.join("shanty.db").display())
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let cli = Cli::parse();
+
+    // Set up tracing: the -v count picks a fallback filter, used only when no valid env filter exists
+    let filter = match cli.verbose {
+        0 => "warn",
+        1 => "info,shanty_tag=info",
+        2 => "info,shanty_tag=debug",
+        _ => "debug,shanty_tag=trace",
+    };
+    tracing_subscriber::fmt()
+        .with_env_filter(
+            EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(filter)),
+        )
+        .init();
+
+    // Validate args: at least one of --all / --track is required (both together is accepted)
+    if !cli.all && cli.track.is_none() {
+        anyhow::bail!("specify either --all or --track <id>");
+    }
+
+    // Connect to database: explicit --database / SHANTY_DATABASE_URL value, else the default URL
+    let database_url = cli.database.unwrap_or_else(default_database_url);
+    tracing::info!(url = %database_url, "connecting to database");
+    let db = Database::new(&database_url).await?;
+
+    // Create MusicBrainz client
+    let provider = MusicBrainzClient::new()?;
+
+    // Run tagging; `cli.track` is passed through as the optional single-track selector
+    let config = TagConfig {
+        dry_run: cli.dry_run,
+        write_tags: cli.write_tags,
+        confidence: cli.confidence,
+    };
+
+    if config.dry_run {
+        println!("DRY RUN — no changes will be written");
+    }
+
+    let stats = run_tagging(db.conn(), &provider, &config, cli.track).await?;
+    println!("\nTagging complete: {stats}");
+
+    Ok(())
+}
diff --git a/src/matcher.rs b/src/matcher.rs
new file mode 100644
index 0000000..4e55e35
--- /dev/null
+++ b/src/matcher.rs
@@ -0,0 +1,305 @@
+use shanty_db::entities::track;
+
+use crate::cleaning::normalize;
+use crate::provider::{RecordingMatch, ReleaseRef};
+
+/// A candidate recording together with its confidence score and a chosen release.
+#[derive(Debug, Clone)]
+pub struct ScoredMatch {
+    pub recording: RecordingMatch, // the candidate that was scored
+    pub confidence: f64, // value computed by `score_match` for this candidate
+    pub best_release: Option<ReleaseRef>, // first entry of the candidate's `releases`, if any
+}
+
+/// Build a search query (artist, title) from a track's metadata, preferring the
+/// album artist over the track artist; falls back to filename parsing without a title.
+pub fn build_query(track: &track::Model) -> Option<(String, String)> {
+    // Drop empty strings *before* falling back, so an empty album artist
+    // cannot mask a usable track artist.
+    let non_empty = |s: &&str| !s.is_empty();
+    let album_artist = track.album_artist.as_deref().filter(non_empty);
+    let artist = album_artist.or_else(|| track.artist.as_deref().filter(non_empty));
+    let title = track.title.as_deref().filter(non_empty);
+
+    match (artist, title) {
+        (Some(a), Some(t)) => Some((a.to_string(), t.to_string())),
+        (None, Some(t)) => Some((String::new(), t.to_string())),
+        _ => parse_filename(&track.file_path),
+    }
+}
+
+/// Derive an (artist, title) pair from a path's file stem.
+/// The stem is split at its first " - "; with no usable split the whole stem
+/// becomes the title and the artist is returned empty.
+pub fn parse_filename(file_path: &str) -> Option<(String, String)> {
+    let stem = std::path::Path::new(file_path)
+        .file_stem()?
+        .to_str()?;
+
+    // First " - " separates artist from title; both halves must be non-blank
+    let split = stem
+        .split_once(" - ")
+        .map(|(left, right)| (left.trim(), right.trim()))
+        .filter(|(left, right)| !left.is_empty() && !right.is_empty());
+    if let Some((artist, title)) = split {
+        return Some((artist.to_string(), title.to_string()));
+    }
+
+    // Otherwise the whole stem is the title and the artist is left empty
+    match stem.trim() {
+        "" => None,
+        whole => Some((String::new(), whole.to_string())),
+    }
+}
+
+/// Score a candidate recording against the track's known metadata.
+/// Returns a confidence value from 0.0 to 1.0.
+pub fn score_match(track: &track::Model, candidate: &RecordingMatch) -> f64 {
+    let track_title = track
+        .title
+        .as_deref()
+        .map(normalize)
+        .unwrap_or_default();
+    let candidate_title = normalize(&candidate.title);
+
+    let track_artist = track
+        .artist
+        .as_deref()
+        .filter(|s| !s.is_empty()) // an empty artist must not mask the album artist
+        .or(track.album_artist.as_deref())
+        .map(normalize)
+        .unwrap_or_default();
+    let candidate_artist = normalize(&candidate.artist);
+
+    // Title similarity (weighted 0.6)
+    let title_sim = if track_title.is_empty() || candidate_title.is_empty() {
+        0.0
+    } else {
+        strsim::jaro_winkler(&track_title, &candidate_title)
+    };
+
+    // Artist similarity (weighted 0.4)
+    let artist_sim = if track_artist.is_empty() || candidate_artist.is_empty() {
+        0.3 // neutral-ish when we have no artist to compare
+    } else {
+        strsim::jaro_winkler(&track_artist, &candidate_artist)
+    };
+
+    let mut score = 0.6 * title_sim + 0.4 * artist_sim;
+
+    // Bonus: album name matches a release
+    if let Some(ref album) = track.album {
+        let track_album = normalize(album);
+        if !track_album.is_empty() {
+            for release in &candidate.releases {
+                let release_title = normalize(&release.title);
+                let album_sim = strsim::jaro_winkler(&track_album, &release_title);
+                if album_sim > 0.85 {
+                    score += 0.05;
+                    break;
+                }
+            }
+        }
+    }
+
+    // Bonus: the track has a known duration and the API scored the candidate above 90.
+    // No durations are compared here; `RecordingMatch` does not carry one.
+    if let Some(track_dur) = track.duration {
+        if track_dur > 0.0 && candidate.score > 90 {
+            score += 0.03;
+        }
+    }
+
+    score.min(1.0)
+}
+
+/// Pick the highest-confidence candidate whose score reaches `threshold`, if any.
+pub fn select_best_match(
+    track: &track::Model,
+    candidates: Vec<RecordingMatch>,
+    threshold: f64,
+) -> Option<ScoredMatch> {
+    let mut winner: Option<ScoredMatch> = None;
+
+    for candidate in candidates {
+        let confidence = score_match(track, &candidate);
+        tracing::debug!(
+            title = %candidate.title,
+            artist = %candidate.artist,
+            confidence = confidence,
+            "candidate"
+        );
+
+        if confidence >= threshold {
+            // The current winner stays unless the new score is strictly higher,
+            // so the earliest candidate keeps a tie.
+            let keeps_current = winner.as_ref().is_some_and(|w| confidence <= w.confidence);
+            if !keeps_current {
+                winner = Some(ScoredMatch {
+                    best_release: candidate.releases.first().cloned(),
+                    recording: candidate,
+                    confidence,
+                });
+            }
+        }
+    }
+
+    winner
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_filename_artist_title() {
+        let result = parse_filename("/music/Pink Floyd - Time.mp3");
+        assert_eq!(result, Some(("Pink Floyd".into(), "Time".into())));
+    }
+
+    #[test]
+    fn test_parse_filename_title_only() {
+        let result = parse_filename("/music/some_song.mp3");
+        assert_eq!(result, Some(("".into(), "some_song".into())));
+    }
+
+    #[test]
+    fn test_parse_filename_nested_path() {
+        let result = parse_filename("/music/Artist/Album/03 - Track Name.flac");
+        // Known limitation: the "03" prefix lands in the artist slot because the stem is split on " - "
+        assert_eq!(result, Some(("03".into(), "Track Name".into())));
+    }
+
+    #[test]
+    fn test_build_query_with_metadata() {
+        let track = track::Model {
+            id: 1,
+            file_path: "/music/test.mp3".into(),
+            title: Some("Time".into()),
+            artist: Some("Pink Floyd".into()),
+            album: None,
+            album_artist: None,
+            track_number: None,
+            disc_number: None,
+            duration: None,
+            genre: None,
+            year: None,
+            codec: None,
+            bitrate: None,
+            file_size: 1000,
+            fingerprint: None,
+            musicbrainz_id: None,
+            artist_id: None,
+            album_id: None,
+            file_mtime: None,
+            added_at: chrono::Utc::now().naive_utc(),
+            updated_at: chrono::Utc::now().naive_utc(),
+        };
+        let result = build_query(&track);
+        assert_eq!(result, Some(("Pink Floyd".into(), "Time".into())));
+    }
+
+    #[test]
+    fn test_build_query_falls_back_to_filename() {
+        let track = track::Model {
+            id: 1,
+            file_path: "/music/Radiohead - Creep.mp3".into(),
+            title: None,
+            artist: None,
+            album: None,
+            album_artist: None,
+            track_number: None,
+            disc_number: None,
+            duration: None,
+            genre: None,
+            year: None,
+            codec: None,
+            bitrate: None,
+            file_size: 1000,
+            fingerprint: None,
+            musicbrainz_id: None,
+            artist_id: None,
+            album_id: None,
+            file_mtime: None,
+            added_at: chrono::Utc::now().naive_utc(),
+            updated_at: chrono::Utc::now().naive_utc(),
+        };
+        let result = build_query(&track);
+        assert_eq!(result, Some(("Radiohead".into(), "Creep".into())));
+    }
+
+    #[test]
+    fn test_score_match_exact() {
+        let track = track::Model {
+            id: 1,
+            file_path: "/test.mp3".into(),
+            title: Some("Time".into()),
+            artist: Some("Pink Floyd".into()),
+            album: None,
+            album_artist: None,
+            track_number: None,
+            disc_number: None,
+            duration: None,
+            genre: None,
+            year: None,
+            codec: None,
+            bitrate: None,
+            file_size: 1000,
+            fingerprint: None,
+            musicbrainz_id: None,
+            artist_id: None,
+            album_id: None,
+            file_mtime: None,
+            added_at: chrono::Utc::now().naive_utc(),
+            updated_at: chrono::Utc::now().naive_utc(),
+        };
+        let candidate = RecordingMatch {
+            mbid: "123".into(),
+            title: "Time".into(),
+            artist: "Pink Floyd".into(),
+            artist_mbid: None,
+            releases: vec![],
+            score: 100,
+        };
+        let score = score_match(&track, &candidate);
+        assert!(score > 0.95, "exact match should score > 0.95, got {score}");
+    }
+
+    #[test]
+    fn test_score_match_fuzzy() {
+        let track = track::Model {
+            id: 1,
+            file_path: "/test.mp3".into(),
+            title: Some("Comfortably Numb".into()),
+            artist: Some("Pink Floyd".into()),
+            album: None,
+            album_artist: None,
+            track_number: None,
+            disc_number: None,
+            duration: None,
+            genre: None,
+            year: None,
+            codec: None,
+            bitrate: None,
+            file_size: 1000,
+            fingerprint: None,
+            musicbrainz_id: None,
+            artist_id: None,
+            album_id: None,
+            file_mtime: None,
+            added_at: chrono::Utc::now().naive_utc(),
+            updated_at: chrono::Utc::now().naive_utc(),
+        };
+        // Slight misspelling
+        let candidate = RecordingMatch {
+            mbid: "123".into(),
+            title: "Comfortably Numb".into(),
+            artist: "Pink Flloyd".into(), // typo
+            artist_mbid: None,
+            releases: vec![],
+            score: 95,
+        };
+        let score = score_match(&track, &candidate);
+        assert!(score > 0.85, "fuzzy match should score > 0.85, got {score}");
+    }
+}
diff --git a/src/musicbrainz.rs b/src/musicbrainz.rs
new file mode 100644
index 0000000..ef3b9a9
--- /dev/null
+++ b/src/musicbrainz.rs
@@ -0,0 +1,253 @@
+use serde::Deserialize;
+use tokio::sync::Mutex;
+use tokio::time::{Duration, Instant};
+
+use crate::cleaning::escape_lucene;
+use crate::error::{TagError, TagResult};
+use crate::provider::{MetadataProvider, RecordingDetails, RecordingMatch, ReleaseMatch, ReleaseRef};
+
+const BASE_URL: &str = "https://musicbrainz.org/ws/2"; // prefix of every request URL in this file
+const USER_AGENT: &str = "Shanty/0.1.0 (shanty-music-app)"; // NOTE(review): MusicBrainz asks for a contact URL/email here — confirm
+const RATE_LIMIT: Duration = Duration::from_millis(1100); // minimum request spacing; slightly over 1s to be safe
+
+/// MusicBrainz API client with rate limiting.
+pub struct MusicBrainzClient {
+    client: reqwest::Client, // built in `new` with the user agent and a 30s timeout
+    last_request: Mutex<Instant>, // when the limiter last let a request through
+}
+
+impl MusicBrainzClient {
+    /// Build a client whose first request is not delayed by the rate limiter.
+    pub fn new() -> TagResult<Self> {
+        let client = reqwest::Client::builder()
+            .user_agent(USER_AGENT)
+            .timeout(Duration::from_secs(30))
+            .build()?;
+        // `checked_sub`: plain `Instant - Duration` panics if the result is unrepresentable
+        let backdated = Instant::now().checked_sub(RATE_LIMIT).unwrap_or_else(Instant::now);
+        Ok(Self { client, last_request: Mutex::new(backdated) })
+    }
+
+    /// Enforce rate limiting: wait if needed so requests are at least `RATE_LIMIT` apart.
+    async fn rate_limit(&self) {
+        let mut last = self.last_request.lock().await; // guard is held through the sleep below
+        let elapsed = last.elapsed();
+        if elapsed < RATE_LIMIT {
+            tokio::time::sleep(RATE_LIMIT - elapsed).await; // wait out the rest of the window
+        }
+        *last = Instant::now(); // stamped before the caller actually sends its request
+    }
+
+    async fn get_json<T: serde::de::DeserializeOwned>(&self, url: &str) -> TagResult<T> {
+        self.rate_limit().await; // every request first passes through the limiter
+        tracing::debug!(url = url, "MusicBrainz request");
+        let resp = self.client.get(url).send().await?;
+        let status = resp.status();
+        if !status.is_success() {
+            let body = resp.text().await.unwrap_or_default(); // best-effort context for the error
+            return Err(TagError::Other(format!(
+                "MusicBrainz API error {status}: {body}"
+            )));
+        }
+        Ok(resp.json().await?) // decode the successful body as JSON into `T`
+    }
+}
+
+impl MetadataProvider for MusicBrainzClient {
+    /// Search recordings by title and, when non-empty, artist (at most 5 results).
+    async fn search_recording(
+        &self,
+        artist: &str,
+        title: &str,
+    ) -> TagResult<Vec<RecordingMatch>> {
+        // Parenthesised groups keep every word of a value in its field; without them
+        // Lucene applies `artist:` / `recording:` to the first word only.
+        let recording = format!("recording:({})", escape_lucene(title));
+        let query = if artist.is_empty() {
+            recording
+        } else {
+            format!("artist:({}) AND {recording}", escape_lucene(artist))
+        };
+        let url = format!("{BASE_URL}/recording/?query={}&fmt=json&limit=5", urlencoded(&query));
+        let resp: MbRecordingSearchResponse = self.get_json(&url).await?;
+
+        Ok(resp
+            .recordings
+            .into_iter()
+            .map(|r| {
+                let (artist_name, artist_mbid) = extract_artist_credit(&r.artist_credit);
+                RecordingMatch {
+                    mbid: r.id,
+                    title: r.title,
+                    artist: artist_name,
+                    artist_mbid,
+                    releases: r
+                        .releases
+                        .unwrap_or_default()
+                        .into_iter()
+                        .map(|rel| ReleaseRef {
+                            mbid: rel.id,
+                            title: rel.title,
+                            date: rel.date,
+                            track_number: None,
+                        })
+                        .collect(),
+                    score: r.score.unwrap_or(0),
+                }
+            })
+            .collect())
+    }
+
+    async fn search_release(
+        &self,
+        artist: &str,
+        album: &str,
+    ) -> TagResult<Vec<ReleaseMatch>> {
+        // Group multi-word values per field; skip the artist clause when it is empty
+        let mut query = format!("release:({})", escape_lucene(album));
+        if !artist.is_empty() {
+            query = format!("artist:({}) AND {query}", escape_lucene(artist));
+        }
+        let url = format!("{BASE_URL}/release/?query={}&fmt=json&limit=5", urlencoded(&query));
+        let resp: MbReleaseSearchResponse = self.get_json(&url).await?;
+
+        Ok(resp
+            .releases
+            .into_iter()
+            .map(|r| {
+                let (artist_name, artist_mbid) = extract_artist_credit(&r.artist_credit);
+                ReleaseMatch {
+                    mbid: r.id,
+                    title: r.title,
+                    artist: artist_name,
+                    artist_mbid,
+                    date: r.date,
+                    track_count: r.track_count,
+                    score: r.score.unwrap_or(0),
+                }
+            })
+            .collect())
+    }
+
+    async fn get_recording(&self, mbid: &str) -> TagResult<RecordingDetails> {
+        let url = format!( // `mbid` is interpolated into the path as-is
+            "{BASE_URL}/recording/{mbid}?inc=artists+releases+genres&fmt=json"
+        );
+        let r: MbRecordingDetail = self.get_json(&url).await?;
+        // A missing release or genre list in the response becomes an empty Vec below
+        let (artist_name, artist_mbid) = extract_artist_credit(&r.artist_credit);
+        Ok(RecordingDetails {
+            mbid: r.id,
+            title: r.title,
+            artist: artist_name,
+            artist_mbid,
+            releases: r
+                .releases
+                .unwrap_or_default()
+                .into_iter()
+                .map(|rel| ReleaseRef {
+                    mbid: rel.id,
+                    title: rel.title,
+                    date: rel.date,
+                    track_number: None, // not filled in from this response
+                })
+                .collect(),
+            duration_ms: r.length, // copied unchanged from the response's `length`
+            genres: r
+                .genres
+                .unwrap_or_default()
+                .into_iter()
+                .map(|g| g.name)
+                .collect(),
+        })
+    }
+}
+
+/// Join all credited artist names (with their join phrases) into one display string
+/// and return it with the first credited artist's MBID.
+fn extract_artist_credit(credits: &Option<Vec<MbArtistCredit>>) -> (String, Option<String>) {
+    // Missing or empty credit list: placeholder name, no MBID
+    let list = credits.as_deref().unwrap_or_default();
+    let Some(lead) = list.first() else {
+        return ("Unknown Artist".to_string(), None);
+    };
+
+    let mut display = String::new();
+    for credit in list {
+        display.push_str(&credit.artist.name);
+        if let Some(join) = credit.joinphrase.as_deref() {
+            display.push_str(join);
+        }
+    }
+
+    (display, Some(lead.artist.id.clone()))
+}
+
+fn urlencoded(s: &str) -> String {
+    // Percent-encode every byte outside RFC 3986 "unreserved"; a space still becomes `+`.
+    let keep = |b: u8| b.is_ascii_alphanumeric() || b"-._~ ".contains(&b);
+    let enc = |b: u8| if keep(b) { char::from(b).to_string() } else { format!("%{b:02X}") };
+    s.bytes().map(enc).collect::<String>().replace(' ', "+")
+}
+
+// --- MusicBrainz API response types ---
+
+#[derive(Deserialize)]
+struct MbRecordingSearchResponse { // response body deserialized by `search_recording`
+    recordings: Vec<MbRecordingResult>,
+}
+
+#[derive(Deserialize)]
+struct MbRecordingResult { // one entry of a recording search response
+    id: String,
+    title: String,
+    score: Option<u8>, // a missing score is mapped to 0 by `search_recording`
+    #[serde(rename = "artist-credit")]
+    artist_credit: Option<Vec<MbArtistCredit>>,
+    releases: Option<Vec<MbReleaseResult>>, // a missing list is treated as empty
+}
+
+#[derive(Deserialize)]
+struct MbReleaseSearchResponse { // response body deserialized by `search_release`
+    releases: Vec<MbReleaseResult>,
+}
+
+#[derive(Deserialize)]
+struct MbReleaseResult { // also used for the release lists nested in recording results/details
+    id: String,
+    title: String,
+    score: Option<u8>, // a missing score is mapped to 0 by `search_release`
+    #[serde(rename = "artist-credit")]
+    artist_credit: Option<Vec<MbArtistCredit>>,
+    date: Option<String>, // passed on as a string; never parsed in this file
+    #[serde(rename = "track-count")]
+    track_count: Option<i32>,
+}
+
+#[derive(Deserialize)]
+struct MbRecordingDetail { // response body deserialized by `get_recording`
+    id: String,
+    title: String,
+    #[serde(rename = "artist-credit")]
+    artist_credit: Option<Vec<MbArtistCredit>>,
+    releases: Option<Vec<MbReleaseResult>>,
+    length: Option<u64>, // copied into `RecordingDetails::duration_ms`
+    genres: Option<Vec<MbGenre>>,
+}
+
+#[derive(Deserialize)]
+struct MbArtistCredit { // one entry of an "artist-credit" list
+    artist: MbArtist,
+    joinphrase: Option<String>, // appended after this artist's name in the display string
+}
+
+#[derive(Deserialize)]
+struct MbArtist {
+    id: String, // returned as the artist MBID for the first credit
+    name: String, // the name used when building the display string
+}
+
+#[derive(Deserialize)]
+struct MbGenre { // only the genre name is read from each entry
+    name: String,
+}
diff --git a/src/provider.rs b/src/provider.rs
new file mode 100644
index 0000000..21150a5
--- /dev/null
+++ b/src/provider.rs
@@ -0,0 +1,69 @@
+use serde::{Deserialize, Serialize};
+
+use crate::error::TagResult;
+
+/// A reference to a release (album) that a recording appears on.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ReleaseRef {
+    pub mbid: String, // release identifier; the tagger passes it to the album upsert
+    pub title: String, // becomes the track's album name when this release is chosen
+    pub date: Option<String>, // the tagger parses the text before the first `-` as the year
+    pub track_number: Option<i32>,
+}
+
+/// A recording match from a search query (see `MetadataProvider::search_recording`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RecordingMatch {
+    pub mbid: String, // recording identifier; the tagger hands it to `get_recording`
+    pub title: String,
+    pub artist: String,
+    pub artist_mbid: Option<String>, // `None` when the provider has no artist identifier
+    pub releases: Vec<ReleaseRef>,
+    /// MusicBrainz API score (0-100).
+    pub score: u8,
+}
+
+/// A release (album) match from a search query (see `MetadataProvider::search_release`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ReleaseMatch {
+    pub mbid: String,
+    pub title: String,
+    pub artist: String,
+    pub artist_mbid: Option<String>, // `None` when the provider has no artist identifier
+    pub date: Option<String>, // raw date string as supplied by the provider
+    pub track_count: Option<i32>,
+    pub score: u8, // presumably the same 0-100 scale as `RecordingMatch::score`; confirm
+}
+
+/// Full details for a recording, retrieved by MBID (see `MetadataProvider::get_recording`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RecordingDetails {
+    pub mbid: String, // written to the track's `musicbrainz_id` by the tagger
+    pub title: String,
+    pub artist: String, // the tagger stores it as both `artist` and `album_artist`
+    pub artist_mbid: Option<String>, // forwarded to the artist upsert when present
+    pub releases: Vec<ReleaseRef>,
+    pub duration_ms: Option<u64>,
+    pub genres: Vec<String>, // the tagger keeps only the first entry
+}
+
+/// Trait for metadata lookup backends. MusicBrainz is the default implementation;
+/// others (Last.fm, Discogs, etc.) can be added later. Every method returns a `Send` future.
+pub trait MetadataProvider: Send + Sync {
+    fn search_recording( // candidate recordings for an artist/title pair
+        &self,
+        artist: &str,
+        title: &str,
+    ) -> impl std::future::Future<Output = TagResult<Vec<RecordingMatch>>> + Send;
+
+    fn search_release( // candidate releases (albums) for an artist/album pair
+        &self,
+        artist: &str,
+        album: &str,
+    ) -> impl std::future::Future<Output = TagResult<Vec<ReleaseMatch>>> + Send;
+
+    fn get_recording( // full details for one recording, looked up by its MBID
+        &self,
+        mbid: &str,
+    ) -> impl std::future::Future<Output = TagResult<RecordingDetails>> + Send;
+}
diff --git a/src/tagger.rs b/src/tagger.rs
new file mode 100644
index 0000000..0069de5
--- /dev/null
+++ b/src/tagger.rs
@@ -0,0 +1,227 @@
+use std::fmt;
+
+use sea_orm::{ActiveValue::Set, DatabaseConnection, NotSet};
+
+use shanty_db::entities::track;
+use shanty_db::queries;
+
+use crate::error::TagResult;
+use crate::file_tags;
+use crate::matcher::{self, ScoredMatch};
+use crate::provider::MetadataProvider;
+
+/// Configuration for a tagging operation.
+pub struct TagConfig {
+    /// If true, stop after logging the best match: no detail lookup, DB write or file write.
+    pub dry_run: bool,
+    /// If true, write updated tags back to the music files (failures are logged, not returned).
+    pub write_tags: bool,
+    /// Minimum match confidence (0.0 - 1.0), handed to `matcher::select_best_match`.
+    pub confidence: f64,
+}
+
+/// Per-run counters reported by `run_tagging`; all of them start at zero through `Default`.
+#[derive(Debug, Default, Clone)]
+pub struct TagStats {
+    pub tracks_processed: u64, // every track the run looked at
+    pub tracks_matched: u64, // tracks for which tagging returned `Ok(true)`
+    pub tracks_updated: u64, // matched tracks, counted only when not a dry run
+    pub tracks_skipped: u64, // tracks for which tagging returned `Ok(false)`
+    pub tracks_errored: u64, // tracks for which tagging returned an error
+}
+
+impl fmt::Display for TagStats {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Gather the counters in print order; note the last label is `errors`, not `errored`.
+        let counts = [
+            self.tracks_processed, self.tracks_matched, self.tracks_updated,
+            self.tracks_skipped, self.tracks_errored,
+        ];
+        f.write_fmt(format_args!(
+            "processed: {}, matched: {}, updated: {}, skipped: {}, errors: {}",
+            counts[0], counts[1], counts[2], counts[3], counts[4],
+        ))
+    }
+}
+
+/// Tag a single track. Returns `Ok(true)` on a match (written unless `dry_run`), `Ok(false)` if skipped.
+pub async fn tag_track(
+    conn: &DatabaseConnection,
+    provider: &impl MetadataProvider,
+    track: &track::Model,
+    config: &TagConfig,
+) -> TagResult<bool> {
+    // Build the (artist, title) search query; a track that yields none is skipped, not errored
+    let (artist, title) = match matcher::build_query(track) {
+        Some(q) => q,
+        None => {
+            tracing::debug!(id = track.id, path = %track.file_path, "no query possible, skipping");
+            return Ok(false);
+        }
+    };
+
+    tracing::info!(
+        id = track.id,
+        artist = %artist,
+        title = %title,
+        "searching MusicBrainz"
+    );
+
+    // Search for recordings; a provider error propagates to the caller through `?`
+    let candidates = provider.search_recording(&artist, &title).await?;
+
+    if candidates.is_empty() {
+        tracing::debug!(id = track.id, "no results from MusicBrainz");
+        return Ok(false);
+    }
+
+    // Score and select the best match, with `config.confidence` as the threshold
+    let best = match matcher::select_best_match(track, candidates, config.confidence) {
+        Some(m) => m,
+        None => {
+            tracing::debug!(
+                id = track.id,
+                "no match above confidence threshold {}",
+                config.confidence
+            );
+            return Ok(false);
+        }
+    };
+
+    log_match(track, &best);
+
+    if config.dry_run {
+        return Ok(true); // a dry run stops here: nothing below is fetched or written
+    }
+
+    // Get full details for the best match (a second provider call, by recording MBID)
+    let details = provider.get_recording(&best.recording.mbid).await?;
+
+    // Upsert the artist; both arms run the same upsert and differ only in the MBID argument
+    let artist_id = match &details.artist_mbid {
+        Some(mbid) => {
+            Some(queries::artists::upsert(conn, &details.artist, Some(mbid)).await?.id)
+        }
+        None => {
+            Some(queries::artists::upsert(conn, &details.artist, None).await?.id)
+        }
+    };
+
+    // Upsert the album from the best release, if the matcher picked one
+    let (album_id, album_name) = if let Some(ref release) = best.best_release {
+        let album = queries::albums::upsert(
+            conn,
+            &release.title,
+            &details.artist,
+            Some(&release.mbid),
+            artist_id,
+        )
+        .await?;
+        (Some(album.id), Some(release.title.clone()))
+    } else {
+        (None, None)
+    };
+
+    // Parse the year as the text before the first `-` of the release date; `None` if unparsable
+    let year = best
+        .best_release
+        .as_ref()
+        .and_then(|r| r.date.as_deref())
+        .and_then(|d| d.split('-').next())
+        .and_then(|y| y.parse::<i32>().ok());
+
+    let genre = details.genres.first().cloned(); // only the first genre is kept
+
+    // Update track metadata. NOTE(review): `Set(None)` fields may clear stored values; confirm
+    let active = track::ActiveModel {
+        id: Set(track.id),
+        file_path: Set(track.file_path.clone()),
+        title: Set(Some(details.title.clone())),
+        artist: Set(Some(details.artist.clone())),
+        album: Set(album_name),
+        album_artist: Set(Some(details.artist.clone())),
+        musicbrainz_id: Set(Some(details.mbid.clone())),
+        artist_id: Set(artist_id),
+        album_id: Set(album_id),
+        year: Set(year),
+        genre: Set(genre.clone()),
+        // Preserve existing values for fields we don't update
+        track_number: NotSet,
+        disc_number: NotSet,
+        duration: NotSet,
+        codec: NotSet,
+        bitrate: NotSet,
+        file_size: NotSet,
+        fingerprint: NotSet,
+        file_mtime: NotSet,
+        added_at: NotSet,
+        updated_at: NotSet,
+    };
+    queries::tracks::update_metadata(conn, track.id, active).await?;
+
+    // Optionally write tags to the file; a failure is only logged and the result stays `Ok(true)`
+    if config.write_tags {
+        if let Err(e) = file_tags::write_tags(
+            &track.file_path,
+            &details,
+            best.best_release.as_ref(),
+            year,
+            genre.as_deref(),
+        ) {
+            tracing::warn!(id = track.id, path = %track.file_path, "failed to write file tags: {e}");
+        }
+    }
+
+    Ok(true)
+}
+
+/// Emits the info-level "match found" event describing the candidate chosen for `track`.
+fn log_match(track: &track::Model, best: &ScoredMatch) {
+    let (recording, chosen_release) = (&best.recording, best.best_release.as_ref());
+    tracing::info!(
+        id = track.id, confidence = format!("{:.2}", best.confidence),
+        matched_title = %recording.title, matched_artist = %recording.artist,
+        release = chosen_release.map_or("(none)", |rel| rel.title.as_str()),
+        "match found"
+    );
+}
+
+/// Tags one track (`track_id = Some(id)`) or, for `None`, every track from `get_untagged`.
+///
+/// Per-track failures are logged and counted in the returned stats instead of being propagated.
+pub async fn run_tagging(
+    conn: &DatabaseConnection,
+    provider: &impl MetadataProvider,
+    config: &TagConfig,
+    track_id: Option<i32>,
+) -> TagResult<TagStats> {
+    // Only this lookup can fail the whole run (`?`); everything after it is tallied.
+    let tracks: Vec<track::Model> = match track_id {
+        Some(id) => vec![queries::tracks::get_by_id(conn, id).await?],
+        None => queries::tracks::get_untagged(conn).await?,
+    };
+    tracing::info!(count = tracks.len(), "tracks to process");
+
+    let mut stats = TagStats::default();
+    for track in tracks.iter() {
+        stats.tracks_processed += 1;
+
+        // Exactly one of matched / skipped / errored is bumped per track.
+        match tag_track(conn, provider, track, config).await {
+            Ok(false) => stats.tracks_skipped += 1,
+            // A dry run reports matches but writes nothing, so it never counts as an update.
+            Ok(true) if config.dry_run => stats.tracks_matched += 1,
+            Ok(true) => {
+                stats.tracks_matched += 1;
+                stats.tracks_updated += 1;
+            }
+            Err(e) => {
+                tracing::error!(id = track.id, path = %track.file_path, "tagging error: {e}");
+                stats.tracks_errored += 1;
+            }
+        }
+    }
+
+    tracing::info!(%stats, "tagging complete");
+    Ok(stats)
+}
diff --git a/tests/integration.rs b/tests/integration.rs
new file mode 100644
index 0000000..ae325d7
--- /dev/null
+++ b/tests/integration.rs
@@ -0,0 +1,176 @@
+use chrono::Utc;
+use sea_orm::ActiveValue::Set;
+
+use shanty_db::{Database, queries};
+use shanty_tag::provider::{MetadataProvider, RecordingDetails, RecordingMatch, ReleaseMatch, ReleaseRef};
+use shanty_tag::error::TagResult;
+use shanty_tag::{TagConfig, run_tagging};
+
+/// Canned provider for tests: it knows exactly one recording and never calls MusicBrainz.
+struct MockProvider;
+
+impl MetadataProvider for MockProvider {
+    async fn search_recording(&self, artist: &str, title: &str) -> TagResult<Vec<RecordingMatch>> {
+        // "Pink Floyd - Time" is the only query that yields a candidate.
+        if !(artist.contains("Pink Floyd") && title.contains("Time")) {
+            return Ok(vec![]);
+        }
+        let release = ReleaseRef {
+            mbid: "release-789".into(),
+            title: "The Dark Side of the Moon".into(),
+            date: Some("1973-03-01".into()),
+            track_number: Some(4),
+        };
+        Ok(vec![RecordingMatch {
+            mbid: "rec-123".into(),
+            title: "Time".into(),
+            artist: "Pink Floyd".into(),
+            artist_mbid: Some("artist-456".into()),
+            releases: vec![release],
+            score: 100,
+        }])
+    }
+
+    async fn search_release(&self, _artist: &str, _album: &str) -> TagResult<Vec<ReleaseMatch>> {
+        Ok(Vec::new())
+    }
+
+    async fn get_recording(&self, mbid: &str) -> TagResult<RecordingDetails> {
+        if mbid != "rec-123" {
+            return Err(shanty_tag::TagError::Other("not found".into()));
+        }
+        let release = ReleaseRef {
+            mbid: "release-789".into(),
+            title: "The Dark Side of the Moon".into(),
+            date: Some("1973-03-01".into()),
+            track_number: Some(4),
+        };
+        Ok(RecordingDetails {
+            mbid: "rec-123".into(),
+            title: "Time".into(),
+            artist: "Pink Floyd".into(),
+            artist_mbid: Some("artist-456".into()),
+            releases: vec![release],
+            duration_ms: Some(413_000),
+            genres: vec!["Progressive Rock".into()],
+        })
+    }
+}
+
+/// Opens the `sqlite::memory:` database that each test uses as its scratch store.
+async fn test_db() -> Database {
+    let opened = Database::new("sqlite::memory:").await;
+    opened.expect("failed to create test database")
+}
+
+/// Inserts a track with only path, optional title/artist, size and timestamps set; returns its id.
+async fn insert_untagged_track(db: &Database, file_path: &str, title: Option<&str>, artist: Option<&str>) -> i32 {
+    let timestamp = Utc::now().naive_utc();
+    let row = shanty_db::entities::track::ActiveModel {
+        file_path: Set(file_path.to_owned()),
+        title: Set(title.map(str::to_owned)),
+        artist: Set(artist.map(str::to_owned)),
+        file_size: Set(1_000_000),
+        added_at: Set(timestamp),
+        updated_at: Set(timestamp),
+        ..Default::default()
+    };
+    queries::tracks::upsert(db.conn(), row).await.unwrap().id
+}
+
+#[tokio::test]
+async fn test_tag_track_with_match() {
+    let db = test_db().await;
+    let mock = MockProvider;
+    let id = insert_untagged_track(&db, "/music/time.mp3", Some("Time"), Some("Pink Floyd")).await;
+
+    // Real run: neither dry-run nor file writes, with the threshold at 0.8.
+    let config = TagConfig {
+        confidence: 0.8,
+        dry_run: false,
+        write_tags: false,
+    };
+
+    let stats = run_tagging(db.conn(), &mock, &config, Some(id)).await.unwrap();
+    assert_eq!(stats.tracks_processed, 1);
+    assert_eq!(stats.tracks_matched, 1);
+    assert_eq!(stats.tracks_updated, 1);
+
+    // The stored row now carries the metadata supplied by the provider.
+    let tagged = queries::tracks::get_by_id(db.conn(), id).await.unwrap();
+    assert_eq!(tagged.musicbrainz_id.as_deref(), Some("rec-123"));
+    assert_eq!(tagged.title.as_deref(), Some("Time"));
+    assert_eq!(tagged.artist.as_deref(), Some("Pink Floyd"));
+    assert_eq!(tagged.album.as_deref(), Some("The Dark Side of the Moon"));
+    assert_eq!(tagged.year, Some(1973));
+    assert_eq!(tagged.genre.as_deref(), Some("Progressive Rock"));
+
+    // The artist row exists and is linked to its MusicBrainz ID.
+    let artist = queries::artists::find_by_name(db.conn(), "Pink Floyd").await.unwrap();
+    assert!(artist.is_some());
+    assert_eq!(artist.unwrap().musicbrainz_id.as_deref(), Some("artist-456"));
+}
+
+#[tokio::test]
+async fn test_tag_track_no_match() {
+    let db = test_db().await;
+    let mock = MockProvider;
+    // The mock has no candidates for this artist/title pair.
+    let id = insert_untagged_track(&db, "/music/unknown.mp3", Some("Unknown Song"), Some("Nobody")).await;
+
+    let config = TagConfig {
+        confidence: 0.8,
+        dry_run: false,
+        write_tags: false,
+    };
+
+    let stats = run_tagging(db.conn(), &mock, &config, Some(id)).await.unwrap();
+    assert_eq!(stats.tracks_processed, 1);
+    assert_eq!(stats.tracks_skipped, 1);
+
+    // With nothing matched, the row must not have gained a MusicBrainz ID.
+    let untouched = queries::tracks::get_by_id(db.conn(), id).await.unwrap();
+    assert!(untouched.musicbrainz_id.is_none());
+}
+
+#[tokio::test]
+async fn test_dry_run_does_not_update() {
+    let db = test_db().await;
+    let mock = MockProvider;
+    let id = insert_untagged_track(&db, "/music/time.mp3", Some("Time"), Some("Pink Floyd")).await;
+
+    // Same matching track as the happy path, but with `dry_run` switched on.
+    let config = TagConfig {
+        confidence: 0.8,
+        dry_run: true,
+        write_tags: false,
+    };
+
+    let stats = run_tagging(db.conn(), &mock, &config, Some(id)).await.unwrap();
+    assert_eq!(stats.tracks_matched, 1);
+    assert_eq!(stats.tracks_updated, 0); // dry run
+
+    // A dry run reports the match but leaves the stored row alone.
+    let stored = queries::tracks::get_by_id(db.conn(), id).await.unwrap();
+    assert!(stored.musicbrainz_id.is_none());
+}
+
+#[tokio::test]
+async fn test_tag_all_untagged() {
+    let db = test_db().await;
+    let mock = MockProvider;
+    // One track the mock recognises and one it does not.
+    insert_untagged_track(&db, "/music/time.mp3", Some("Time"), Some("Pink Floyd")).await;
+    insert_untagged_track(&db, "/music/unknown.mp3", Some("Unknown"), Some("Nobody")).await;
+
+    let config = TagConfig {
+        confidence: 0.8,
+        dry_run: false,
+        write_tags: false,
+    };
+    // `None` selects every untagged track rather than a single id.
+    let stats = run_tagging(db.conn(), &mock, &config, None).await.unwrap();
+    assert_eq!(stats.tracks_processed, 2);
+    assert_eq!(stats.tracks_matched, 1); // only Pink Floyd matched
+    assert_eq!(stats.tracks_skipped, 1);
+}