Files
tag/src/matcher.rs
T
2026-03-20 14:52:16 -04:00

300 lines
9.3 KiB
Rust

use shanty_db::entities::track;
use crate::cleaning::normalize;
use shanty_data::{RecordingMatch, ReleaseRef};
/// A scored recording match with the best matching release.
///
/// Returned by `select_best_match` for the candidate that wins the comparison.
#[derive(Debug, Clone)]
pub struct ScoredMatch {
/// The candidate recording that was scored.
pub recording: RecordingMatch,
/// Confidence assigned to `recording` by `score_match` (capped at 1.0).
pub confidence: f64,
/// Release attached to the match. `select_best_match` fills this with the
/// first entry of the recording's `releases` list, or `None` when that list
/// is empty; no per-release ranking is performed there.
pub best_release: Option<ReleaseRef>,
}
/// Build a search query `(artist, title)` from a track's metadata.
///
/// The artist is taken from `album_artist`, falling back to `artist`. Empty
/// tags are treated as missing *before* the fallback is applied, so an empty
/// `album_artist` no longer hides a populated `artist`.
///
/// * Title present: returns `(artist, title)`, with an empty artist string
///   when neither artist tag is usable.
/// * Title missing or empty: falls back to parsing the file name via
///   `parse_filename`, which may itself return `None`.
pub fn build_query(track: &track::Model) -> Option<(String, String)> {
    // Filter each tag on its own: `a.or(b).filter(..)` would pick an empty
    // `a` and then discard it without ever looking at `b`.
    let artist = track
        .album_artist
        .as_deref()
        .filter(|s| !s.is_empty())
        .or_else(|| track.artist.as_deref().filter(|s| !s.is_empty()));
    let title = track.title.as_deref().filter(|s| !s.is_empty());

    match title {
        // A title alone is enough to search; the artist may be blank.
        Some(title) => Some((artist.unwrap_or_default().to_owned(), title.to_owned())),
        // Without a title the tags are not usable, so try the file name.
        None => parse_filename(&track.file_path),
    }
}
/// Derive an `(artist, title)` pair from a file path.
///
/// Only the file stem is considered (directories and the extension are
/// dropped). If the stem contains `" - "`, the text before the first
/// occurrence becomes the artist and the rest becomes the title, provided both
/// are non-empty after trimming. Otherwise the whole trimmed stem is returned
/// as the title with an empty artist.
///
/// Returns `None` when the path has no file stem, the stem is not valid
/// UTF-8, or the stem is blank.
pub fn parse_filename(file_path: &str) -> Option<(String, String)> {
    let stem = std::path::Path::new(file_path).file_stem()?.to_str()?;

    // Common "Artist - Title" layout; both halves must carry text.
    let halves = stem
        .split_once(" - ")
        .map(|(left, right)| (left.trim(), right.trim()))
        .filter(|(left, right)| !left.is_empty() && !right.is_empty());
    if let Some((artist, title)) = halves {
        return Some((artist.to_owned(), title.to_owned()));
    }

    // No usable delimiter: the entire stem is the title.
    let whole = stem.trim();
    (!whole.is_empty()).then(|| (String::new(), whole.to_owned()))
}
/// Score a candidate recording against the track's known metadata.
///
/// The base score is `0.6 * title_similarity + 0.4 * artist_similarity`, where
/// each similarity is the Jaro-Winkler similarity of the normalized strings:
///
/// * a missing/empty title on either side contributes `0.0`;
/// * a missing/empty artist on either side contributes a fixed `0.3`.
///
/// The track artist is read from `artist`, falling back to `album_artist`;
/// empty tags are skipped before the fallback so an empty `artist` does not
/// hide a usable `album_artist`.
///
/// Two small bonuses may then be added:
///
/// * `+0.05` if the track's album is similar (> 0.85) to any candidate release;
/// * `+0.03` if the track has a positive duration and the candidate's search
///   score is above 90.
///
/// Returns a confidence value from 0.0 to 1.0 (the sum is capped at 1.0).
pub fn score_match(track: &track::Model, candidate: &RecordingMatch) -> f64 {
    let track_title = track.title.as_deref().map(normalize).unwrap_or_default();
    let candidate_title = normalize(&candidate.title);

    // Filter each tag on its own: `a.or(b)` alone would select an empty `a`
    // and never consult `b`.
    let track_artist = track
        .artist
        .as_deref()
        .filter(|s| !s.is_empty())
        .or_else(|| track.album_artist.as_deref().filter(|s| !s.is_empty()))
        .map(normalize)
        .unwrap_or_default();
    let candidate_artist = normalize(&candidate.artist);

    // Title similarity (weighted 0.6)
    let title_sim = if track_title.is_empty() || candidate_title.is_empty() {
        0.0
    } else {
        strsim::jaro_winkler(&track_title, &candidate_title)
    };

    // Artist similarity (weighted 0.4)
    let artist_sim = if track_artist.is_empty() || candidate_artist.is_empty() {
        0.3 // neutral-ish when we have no artist to compare
    } else {
        strsim::jaro_winkler(&track_artist, &candidate_artist)
    };

    // Bonus: album name closely matches at least one of the candidate's releases.
    let album_matches = track.album.as_ref().map_or(false, |album| {
        let track_album = normalize(album);
        !track_album.is_empty()
            && candidate.releases.iter().any(|release| {
                strsim::jaro_winkler(&track_album, &normalize(&release.title)) > 0.85
            })
    });

    // Bonus: high relevance score reported by the search API.
    // NOTE: no duration comparison happens here. MusicBrainz search results
    // don't always include a duration, so the API score is used as the signal;
    // the track's own duration only gates the bonus (it must be known and > 0).
    let api_confident = track.duration.map_or(false, |dur| dur > 0.0) && candidate.score > 90;

    let mut score = 0.6 * title_sim + 0.4 * artist_sim;
    if album_matches {
        score += 0.05;
    }
    if api_confident {
        score += 0.03;
    }
    score.min(1.0)
}
/// Pick the highest-scoring candidate whose confidence reaches `threshold`.
///
/// Every candidate is scored with `score_match` and logged at debug level.
/// Candidates below `threshold` are discarded. Among the rest the highest
/// confidence wins; on a tie the earlier candidate is kept. The winner's
/// `best_release` is the first entry of its `releases` list, if any.
///
/// Returns `None` when no candidate reaches the threshold.
pub fn select_best_match(
    track: &track::Model,
    candidates: Vec<RecordingMatch>,
    threshold: f64,
) -> Option<ScoredMatch> {
    candidates
        .into_iter()
        .filter_map(|recording| {
            let confidence = score_match(track, &recording);
            tracing::debug!(
                title = %recording.title,
                artist = %recording.artist,
                confidence = confidence,
                "candidate"
            );
            (confidence >= threshold).then(|| (confidence, recording))
        })
        .fold(
            None,
            |leader: Option<ScoredMatch>, (confidence, recording)| match leader {
                // Not strictly better than the current leader: keep the leader.
                Some(current) if confidence <= current.confidence => Some(current),
                // First qualifying candidate, or a strictly better one.
                _ => {
                    let best_release = recording.releases.first().cloned();
                    Some(ScoredMatch {
                        recording,
                        confidence,
                        best_release,
                    })
                }
            },
        )
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a minimal track fixture. Only the path, title and artist vary
    /// between tests; every other field is left at a fixed placeholder value.
    fn fixture(file_path: &str, title: Option<&str>, artist: Option<&str>) -> track::Model {
        track::Model {
            id: 1,
            file_path: file_path.into(),
            title: title.map(Into::into),
            artist: artist.map(Into::into),
            album: None,
            album_artist: None,
            track_number: None,
            disc_number: None,
            duration: None,
            genre: None,
            year: None,
            codec: None,
            bitrate: None,
            file_size: 1000,
            fingerprint: None,
            musicbrainz_id: None,
            artist_id: None,
            album_id: None,
            file_mtime: None,
            added_at: chrono::Utc::now().naive_utc(),
            updated_at: chrono::Utc::now().naive_utc(),
        }
    }

    #[test]
    fn test_parse_filename_artist_title() {
        let parsed = parse_filename("/music/Pink Floyd - Time.mp3");
        assert_eq!(parsed, Some(("Pink Floyd".into(), "Time".into())));
    }

    #[test]
    fn test_parse_filename_title_only() {
        let parsed = parse_filename("/music/some_song.mp3");
        assert_eq!(parsed, Some(("".into(), "some_song".into())));
    }

    #[test]
    fn test_parse_filename_nested_path() {
        let parsed = parse_filename("/music/Artist/Album/03 - Track Name.flac");
        // The split is purely on " - ", so the leading "03" ends up as the artist.
        assert_eq!(parsed, Some(("03".into(), "Track Name".into())));
    }

    #[test]
    fn test_build_query_with_metadata() {
        let track = fixture("/music/test.mp3", Some("Time"), Some("Pink Floyd"));
        let query = build_query(&track);
        assert_eq!(query, Some(("Pink Floyd".into(), "Time".into())));
    }

    #[test]
    fn test_build_query_falls_back_to_filename() {
        let track = fixture("/music/Radiohead - Creep.mp3", None, None);
        let query = build_query(&track);
        assert_eq!(query, Some(("Radiohead".into(), "Creep".into())));
    }

    #[test]
    fn test_score_match_exact() {
        let track = fixture("/test.mp3", Some("Time"), Some("Pink Floyd"));
        let candidate = RecordingMatch {
            mbid: "123".into(),
            title: "Time".into(),
            artist: "Pink Floyd".into(),
            artist_mbid: None,
            releases: vec![],
            score: 100,
        };
        let score = score_match(&track, &candidate);
        assert!(score > 0.95, "exact match should score > 0.95, got {score}");
    }

    #[test]
    fn test_score_match_fuzzy() {
        let track = fixture("/test.mp3", Some("Comfortably Numb"), Some("Pink Floyd"));
        // Candidate artist carries a slight misspelling.
        let candidate = RecordingMatch {
            mbid: "123".into(),
            title: "Comfortably Numb".into(),
            artist: "Pink Flloyd".into(), // typo
            artist_mbid: None,
            releases: vec![],
            score: 95,
        };
        let score = score_match(&track, &candidate);
        assert!(score > 0.85, "fuzzy match should score > 0.85, got {score}");
    }
}