use std::sync::LazyLock; use regex::Regex; use unicode_normalization::UnicodeNormalization; static STRIP_PATTERNS: LazyLock> = LazyLock::new(|| { vec![ // (Official Video), (Official Audio), (Official Music Video), (Lyric Video), etc. Regex::new(r"\(official\s*(video|audio|music\s*video|lyric\s*video|visualizer)\)").unwrap(), // (Remastered), (Remastered 2011), (Remaster) Regex::new(r"\(remaster(ed)?\s*(\d{4})?\)").unwrap(), // [Live], [Bonus Track], [Deluxe], [Explicit] Regex::new(r"\[(live|bonus(\s*track)?|deluxe|explicit|clean)\]").unwrap(), // (feat. Artist), [feat. Artist], (ft. Artist) Regex::new(r"[\(\[](feat\.?|ft\.?)\s+[^\)\]]+[\)\]]").unwrap(), // (with Artist) Regex::new(r"\(with\s+[^)]+\)").unwrap(), // Trailing " - Single", " - EP" Regex::new(r"\s*-\s*(single|ep)\s*$").unwrap(), ] }); /// Normalize a string for fuzzy comparison. /// /// Applies unicode NFC normalization, lowercasing, stripping common suffixes /// (video tags, remaster notes, featuring credits), and trimming. pub fn normalize(s: &str) -> String { // Unicode NFC normalization let s: String = s.nfc().collect(); // Lowercase let mut s = s.to_lowercase(); // Strip known patterns for pattern in STRIP_PATTERNS.iter() { s = pattern.replace_all(&s, "").to_string(); } // Trim whitespace and punctuation s.trim() .trim_matches(|c: char| c.is_ascii_punctuation() || c.is_whitespace()) .to_string() } /// Escape special characters for MusicBrainz Lucene query syntax. 
///
/// Each of the characters `+ - & | ! ( ) { } [ ] ^ " ~ * ? : \ /` is prefixed
/// with a backslash; every other character is copied through unchanged.
///
/// Returns the escaped text as a new `String`.
pub fn escape_lucene(s: &str) -> String {
    // Escaping only ever adds characters, so the input length is a lower bound.
    let mut escaped = String::with_capacity(s.len());
    for c in s.chars() {
        let needs_escape = matches!(
            c,
            '+' | '-'
                | '&'
                | '|'
                | '!'
                | '('
                | ')'
                | '{'
                | '}'
                | '['
                | ']'
                | '^'
                | '"'
                | '~'
                | '*'
                | '?'
                | ':'
                | '\\'
                | '/'
        );
        if needs_escape {
            escaped.push('\\');
        }
        escaped.push(c);
    }
    escaped
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_basic() {
        assert_eq!(normalize("Hello World"), "hello world");
        assert_eq!(normalize(" spaces "), "spaces");
    }

    #[test]
    fn test_normalize_strips_official_video() {
        assert_eq!(normalize("Time (Official Video)"), "time");
        assert_eq!(normalize("Money (Official Music Video)"), "money");
        assert_eq!(
            normalize("Comfortably Numb (Official Audio)"),
            "comfortably numb"
        );
    }

    #[test]
    fn test_normalize_strips_remastered() {
        assert_eq!(normalize("Time (Remastered 2011)"), "time");
        assert_eq!(normalize("Money (Remastered)"), "money");
        assert_eq!(normalize("Shine On (Remaster)"), "shine on");
    }

    #[test]
    fn test_normalize_strips_feat() {
        assert_eq!(normalize("Song (feat. Artist)"), "song");
        assert_eq!(normalize("Song [ft. Someone]"), "song");
    }

    #[test]
    fn test_normalize_strips_brackets() {
        assert_eq!(normalize("Song [Live]"), "song");
        assert_eq!(normalize("Song [Bonus Track]"), "song");
        assert_eq!(normalize("Song [Explicit]"), "song");
    }

    #[test]
    fn test_normalize_unicode() {
        // Both forms are spelled with escapes so that an editor or tool that
        // re-normalizes the file cannot silently make the two literals equal.
        let composed = "caf\u{00e9}"; // é as a single code point
        let decomposed = "cafe\u{0301}"; // e followed by a combining acute accent
        assert_ne!(composed, decomposed);

        // NFC normalization — decomposed é should become composed é
        assert_eq!(normalize(composed), normalize(decomposed));
        assert_eq!(normalize(decomposed), composed);
    }

    #[test]
    fn test_escape_lucene() {
        assert_eq!(escape_lucene("AC/DC"), r"AC\/DC");
        assert_eq!(escape_lucene("test (hello)"), r"test \(hello\)");
        assert_eq!(escape_lucene("simple"), "simple");
    }

    #[test]
    fn test_escape_lucene_edge_cases() {
        assert_eq!(escape_lucene(""), "");
        assert_eq!(escape_lucene(r"a\b"), r"a\\b");
        assert_eq!(escape_lucene("\"hi\""), r#"\"hi\""#);
        // Non-ASCII text is not special and passes through untouched.
        assert_eq!(escape_lucene("Bj\u{00f6}rk"), "Bj\u{00f6}rk");
    }
}