Files
tag/src/cleaning.rs
Connor Johnstone 5957d69e7d Formatting
2026-03-18 15:36:54 -04:00

114 lines
3.6 KiB
Rust

use std::sync::LazyLock;
use regex::Regex;
use unicode_normalization::UnicodeNormalization;
/// Regexes, compiled once on first use, for title decorations that
/// `normalize` deletes.
///
/// Every pattern is written in lowercase only; `normalize` lowercases its
/// input before applying them.
static STRIP_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
    let sources = [
        // "(official video)", "(official audio)", "(official music video)",
        // "(official lyric video)", "(official visualizer)"
        r"\(official\s*(video|audio|music\s*video|lyric\s*video|visualizer)\)",
        // "(remaster)" / "(remastered)", optionally followed by a four-digit year
        r"\(remaster(ed)?\s*(\d{4})?\)",
        // "[live]", "[bonus]", "[bonus track]", "[deluxe]", "[explicit]", "[clean]"
        r"\[(live|bonus(\s*track)?|deluxe|explicit|clean)\]",
        // Featuring credits in parentheses or square brackets: "(feat. x)", "[ft. x]"
        r"[\(\[](feat\.?|ft\.?)\s+[^\)\]]+[\)\]]",
        // "(with x)"
        r"\(with\s+[^)]+\)",
        // Trailing " - single" or " - ep" at the very end of the string
        r"\s*-\s*(single|ep)\s*$",
    ];

    // The sources are fixed literals, so a compile failure here is a
    // programming error rather than a runtime condition.
    sources.iter().map(|src| Regex::new(src).unwrap()).collect()
});
/// Produce a canonical form of `s` suitable for fuzzy comparison.
///
/// Steps, in order:
/// 1. Unicode NFC normalization.
/// 2. Lowercasing.
/// 3. Removal of every match of each `STRIP_PATTERNS` entry (video tags,
///    remaster notes, bracketed edition tags, featuring/with credits,
///    trailing " - single" / " - ep").
/// 4. Trimming of leading and trailing whitespace and ASCII punctuation.
pub fn normalize(s: &str) -> String {
    // The strip patterns only contain lowercase letters, so lowercase first.
    let lowered = s.nfc().collect::<String>().to_lowercase();

    // Apply the patterns one after another, each on the previous result.
    let stripped = STRIP_PATTERNS
        .iter()
        .fold(lowered, |acc, re| re.replace_all(&acc, "").into_owned());

    let is_edge_noise = |c: char| c.is_whitespace() || c.is_ascii_punctuation();
    stripped.trim().trim_matches(is_edge_noise).to_string()
}
/// Backslash-escape the characters that are special in the Lucene query
/// syntax used by MusicBrainz.
///
/// Each occurrence of one of `+ - & | ! ( ) { } [ ] ^ " ~ * ? : \ /` is
/// prefixed with a single `\`; every other character is copied unchanged.
pub fn escape_lucene(s: &str) -> String {
    s.chars()
        .fold(String::with_capacity(s.len()), |mut escaped, ch| {
            let needs_escape = matches!(
                ch,
                '+' | '-'
                    | '&'
                    | '|'
                    | '!'
                    | '('
                    | ')'
                    | '{'
                    | '}'
                    | '['
                    | ']'
                    | '^'
                    | '"'
                    | '~'
                    | '*'
                    | '?'
                    | ':'
                    | '\\'
                    | '/'
            );
            if needs_escape {
                escaped.push('\\');
            }
            escaped.push(ch);
            escaped
        })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain input is lowercased and surrounding whitespace is trimmed.
    #[test]
    fn test_normalize_basic() {
        assert_eq!(normalize("Hello World"), "hello world");
        assert_eq!(normalize(" spaces "), "spaces");
    }

    /// "(Official …)" tags are removed.
    #[test]
    fn test_normalize_strips_official_video() {
        assert_eq!(normalize("Time (Official Video)"), "time");
        assert_eq!(normalize("Money (Official Music Video)"), "money");
        assert_eq!(
            normalize("Comfortably Numb (Official Audio)"),
            "comfortably numb"
        );
    }

    /// Remaster notes, with or without a year, are removed.
    #[test]
    fn test_normalize_strips_remastered() {
        assert_eq!(normalize("Time (Remastered 2011)"), "time");
        assert_eq!(normalize("Money (Remastered)"), "money");
        assert_eq!(normalize("Shine On (Remaster)"), "shine on");
    }

    /// Featuring credits in either bracket style are removed.
    #[test]
    fn test_normalize_strips_feat() {
        assert_eq!(normalize("Song (feat. Artist)"), "song");
        assert_eq!(normalize("Song [ft. Someone]"), "song");
    }

    /// Square-bracketed edition tags are removed.
    #[test]
    fn test_normalize_strips_brackets() {
        assert_eq!(normalize("Song [Live]"), "song");
        assert_eq!(normalize("Song [Bonus Track]"), "song");
        assert_eq!(normalize("Song [Explicit]"), "song");
    }

    /// NFC normalization: decomposed "e" + U+0301 must become precomposed U+00E9.
    #[test]
    fn test_normalize_unicode() {
        // Spell both forms with escapes so an editor or copy/paste step cannot
        // silently compose the literal and turn this into a vacuous comparison.
        let decomposed = "cafe\u{0301}";
        let composed = "caf\u{00e9}";
        assert_ne!(decomposed, composed);
        assert_eq!(normalize(decomposed), normalize(composed));
        assert_eq!(normalize(decomposed), composed);
    }

    /// Lucene special characters gain a backslash; other text is untouched.
    #[test]
    fn test_escape_lucene() {
        assert_eq!(escape_lucene("AC/DC"), r"AC\/DC");
        assert_eq!(escape_lucene("test (hello)"), r"test \(hello\)");
        assert_eq!(escape_lucene("simple"), "simple");
    }
}