114 lines
3.6 KiB
Rust
114 lines
3.6 KiB
Rust
use std::sync::LazyLock;
|
|
|
|
use regex::Regex;
|
|
use unicode_normalization::UnicodeNormalization;
|
|
|
|
static STRIP_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
|
|
vec![
|
|
// (Official Video), (Official Audio), (Official Music Video), (Lyric Video), etc.
|
|
Regex::new(r"\(official\s*(video|audio|music\s*video|lyric\s*video|visualizer)\)").unwrap(),
|
|
// (Remastered), (Remastered 2011), (Remaster)
|
|
Regex::new(r"\(remaster(ed)?\s*(\d{4})?\)").unwrap(),
|
|
// [Live], [Bonus Track], [Deluxe], [Explicit]
|
|
Regex::new(r"\[(live|bonus(\s*track)?|deluxe|explicit|clean)\]").unwrap(),
|
|
// (feat. Artist), [feat. Artist], (ft. Artist)
|
|
Regex::new(r"[\(\[](feat\.?|ft\.?)\s+[^\)\]]+[\)\]]").unwrap(),
|
|
// (with Artist)
|
|
Regex::new(r"\(with\s+[^)]+\)").unwrap(),
|
|
// Trailing " - Single", " - EP"
|
|
Regex::new(r"\s*-\s*(single|ep)\s*$").unwrap(),
|
|
]
|
|
});
|
|
|
|
/// Normalize a string for fuzzy comparison.
|
|
///
|
|
/// Applies unicode NFC normalization, lowercasing, stripping common suffixes
|
|
/// (video tags, remaster notes, featuring credits), and trimming.
|
|
pub fn normalize(s: &str) -> String {
|
|
// Unicode NFC normalization
|
|
let s: String = s.nfc().collect();
|
|
|
|
// Lowercase
|
|
let mut s = s.to_lowercase();
|
|
|
|
// Strip known patterns
|
|
for pattern in STRIP_PATTERNS.iter() {
|
|
s = pattern.replace_all(&s, "").to_string();
|
|
}
|
|
|
|
// Trim whitespace and punctuation
|
|
s.trim()
|
|
.trim_matches(|c: char| c.is_ascii_punctuation() || c.is_whitespace())
|
|
.to_string()
|
|
}
|
|
|
|
/// Escape special characters for MusicBrainz Lucene query syntax.
///
/// Each occurrence of one of the characters
/// `+ - & | ! ( ) { } [ ] ^ " ~ * ? : \ /`
/// is prefixed with a backslash; every other character is copied unchanged.
pub fn escape_lucene(s: &str) -> String {
    // The output is at least as long as the input, so start from that capacity.
    s.chars()
        .fold(String::with_capacity(s.len()), |mut escaped, ch| {
            let needs_escape = matches!(
                ch,
                '+' | '-'
                    | '&'
                    | '|'
                    | '!'
                    | '('
                    | ')'
                    | '{'
                    | '}'
                    | '['
                    | ']'
                    | '^'
                    | '"'
                    | '~'
                    | '*'
                    | '?'
                    | ':'
                    | '\\'
                    | '/'
            );
            if needs_escape {
                escaped.push('\\');
            }
            escaped.push(ch);
            escaped
        })
}
|
|
|
|
// Unit tests for `normalize` and `escape_lucene`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_basic() {
        assert_eq!(normalize("Hello World"), "hello world");
        assert_eq!(normalize(" spaces "), "spaces");
    }

    #[test]
    fn test_normalize_strips_official_video() {
        assert_eq!(normalize("Time (Official Video)"), "time");
        assert_eq!(normalize("Money (Official Music Video)"), "money");
        assert_eq!(
            normalize("Comfortably Numb (Official Audio)"),
            "comfortably numb"
        );
    }

    #[test]
    fn test_normalize_strips_remastered() {
        assert_eq!(normalize("Time (Remastered 2011)"), "time");
        assert_eq!(normalize("Money (Remastered)"), "money");
        assert_eq!(normalize("Shine On (Remaster)"), "shine on");
    }

    #[test]
    fn test_normalize_strips_feat() {
        assert_eq!(normalize("Song (feat. Artist)"), "song");
        assert_eq!(normalize("Song [ft. Someone]"), "song");
    }

    #[test]
    fn test_normalize_strips_brackets() {
        assert_eq!(normalize("Song [Live]"), "song");
        assert_eq!(normalize("Song [Bonus Track]"), "song");
        assert_eq!(normalize("Song [Explicit]"), "song");
    }

    #[test]
    fn test_normalize_strips_with_and_release_suffix() {
        // "(with Artist)" credits and trailing " - Single" / " - EP" markers.
        assert_eq!(normalize("Song (with Artist)"), "song");
        assert_eq!(normalize("Album - Single"), "album");
        assert_eq!(normalize("Album - EP"), "album");
    }

    #[test]
    fn test_normalize_unicode() {
        // Both spellings are written with escapes so that an editor or tool that
        // normalizes the source file cannot silently turn this into a comparison
        // of two identical literals.
        let decomposed = "cafe\u{0301}"; // 'e' followed by U+0301 COMBINING ACUTE ACCENT
        let composed = "caf\u{00e9}"; // precomposed U+00E9
        assert_ne!(decomposed, composed);

        // NFC normalization: the decomposed form must become the composed one.
        assert_eq!(normalize(decomposed), normalize(composed));
        assert_eq!(normalize(decomposed), composed);
    }

    #[test]
    fn test_escape_lucene() {
        assert_eq!(escape_lucene("AC/DC"), r"AC\/DC");
        assert_eq!(escape_lucene("test (hello)"), r"test \(hello\)");
        assert_eq!(escape_lucene("simple"), "simple");
    }
}
|