461 lines
16 KiB
Rust
461 lines
16 KiB
Rust
//! Timbral feature extraction module.
|
|
//!
|
|
//! Contains functions to extract & summarize the zero-crossing rate,
|
|
//! spectral centroid, spectral flatness and spectral roll-off of
|
|
//! a given Song.
|
|
|
|
use bliss_audio_aubio_rs::vec::CVec;
|
|
use bliss_audio_aubio_rs::{bin_to_freq, PVoc, SpecDesc, SpecShape};
|
|
use ndarray::{arr1, Axis};
|
|
|
|
use super::utils::{geometric_mean, mean, number_crossings, Normalize};
|
|
use crate::{BlissError, BlissResult, SAMPLE_RATE};
|
|
|
|
/**
|
|
* General object holding all the spectral descriptor.
|
|
*
|
|
* Holds 3 spectral descriptors together. It would be better conceptually
|
|
* to have 3 different spectral descriptor objects, but this avoids re-computing
|
|
* the same FFT three times.
|
|
*
|
|
* Current spectral descriptors are spectral centroid, spectral rolloff and
|
|
* spectral flatness (see `values_object` for a further description of the
|
|
* object.
|
|
*
|
|
* All descriptors are currently summarized by their mean only.
|
|
*/
|
|
pub(crate) struct SpectralDesc {
|
|
phase_vocoder: PVoc,
|
|
sample_rate: u32,
|
|
|
|
centroid_aubio_desc: SpecDesc,
|
|
rolloff_aubio_desc: SpecDesc,
|
|
values_centroid: Vec<f32>,
|
|
values_rolloff: Vec<f32>,
|
|
values_flatness: Vec<f32>,
|
|
}
|
|
|
|
impl SpectralDesc {
|
|
pub const WINDOW_SIZE: usize = 512;
|
|
pub const HOP_SIZE: usize = SpectralDesc::WINDOW_SIZE / 4;
|
|
|
|
/**
|
|
* Compute score related to the
|
|
* [spectral centroid](https://en.wikipedia.org/wiki/Spectral_centroid) values,
|
|
* obtained after repeatedly calling `do_` on all of the song's chunks.
|
|
*
|
|
* Spectral centroid is used to determine the "brightness" of a sound, i.e.
|
|
* how much high frequency there is in an audio signal.
|
|
*
|
|
* It of course depends of the instrument used: a piano-only track that makes
|
|
* use of high frequencies will still score less than a song using a lot of
|
|
* percussive sound, because the piano frequency range is lower.
|
|
*
|
|
* The value range is between 0 and `sample_rate / 2`.
|
|
*/
|
|
pub fn get_centroid(&mut self) -> Vec<f32> {
|
|
vec![
|
|
self.normalize(mean(&self.values_centroid)),
|
|
self.normalize(
|
|
arr1(&self.values_centroid)
|
|
.std_axis(Axis(0), 0.)
|
|
.into_scalar(),
|
|
),
|
|
]
|
|
}
|
|
|
|
/**
|
|
* Compute score related to the spectral roll-off values, obtained
|
|
* after repeatedly calling `do_` on all of the song's chunks.
|
|
*
|
|
* Spectral roll-off is the bin frequency number below which a certain
|
|
* percentage of the spectral energy is found, here, 95%.
|
|
*
|
|
* It can be used to distinguish voiced speech (low roll-off) and unvoiced
|
|
* speech (high roll-off). It is also a good indication of the energy
|
|
* repartition of a song.
|
|
*
|
|
* The value range is between 0 and `sample_rate / 2`
|
|
*/
|
|
pub fn get_rolloff(&mut self) -> Vec<f32> {
|
|
vec![
|
|
self.normalize(mean(&self.values_rolloff)),
|
|
self.normalize(
|
|
arr1(&self.values_rolloff)
|
|
.std_axis(Axis(0), 0.)
|
|
.into_scalar(),
|
|
),
|
|
]
|
|
}
|
|
|
|
/**
|
|
* Compute score related to the
|
|
* [spectral flatness](https://en.wikipedia.org/wiki/Spectral_flatness) values,
|
|
* obtained after repeatedly calling `do_` on all of the song's chunks.
|
|
*
|
|
* Spectral flatness is the ratio between the geometric mean of the spectrum
|
|
* and its arithmetic mean.
|
|
*
|
|
* It is used to distinguish between tone-like and noise-like signals.
|
|
* Tone-like audio is f.ex. a piano key, something that has one or more
|
|
* specific frequencies, while (white) noise has an equal distribution
|
|
* of intensity among all frequencies.
|
|
*
|
|
* The value range is between 0 and 1, since the geometric mean is always less
|
|
* than the arithmetic mean.
|
|
*/
|
|
pub fn get_flatness(&mut self) -> Vec<f32> {
|
|
let max_value = 1.;
|
|
let min_value = 0.;
|
|
// Range is different from the other spectral algorithms, so normalizing
|
|
// manually here.
|
|
vec![
|
|
2. * (mean(&self.values_flatness) - min_value) / (max_value - min_value) - 1.,
|
|
2. * (arr1(&self.values_flatness)
|
|
.std_axis(Axis(0), 0.)
|
|
.into_scalar()
|
|
- min_value)
|
|
/ (max_value - min_value)
|
|
- 1.,
|
|
]
|
|
}
|
|
|
|
pub fn new(sample_rate: u32) -> BlissResult<Self> {
|
|
Ok(SpectralDesc {
|
|
centroid_aubio_desc: SpecDesc::new(SpecShape::Centroid, SpectralDesc::WINDOW_SIZE)
|
|
.map_err(|e| {
|
|
BlissError::AnalysisError(format!(
|
|
"error while loading aubio centroid object: {}",
|
|
e
|
|
))
|
|
})?,
|
|
rolloff_aubio_desc: SpecDesc::new(SpecShape::Rolloff, SpectralDesc::WINDOW_SIZE)
|
|
.map_err(|e| {
|
|
BlissError::AnalysisError(format!(
|
|
"error while loading aubio rolloff object: {}",
|
|
e
|
|
))
|
|
})?,
|
|
phase_vocoder: PVoc::new(SpectralDesc::WINDOW_SIZE, SpectralDesc::HOP_SIZE).map_err(
|
|
|e| {
|
|
BlissError::AnalysisError(format!(
|
|
"error while loading aubio pvoc object: {}",
|
|
e
|
|
))
|
|
},
|
|
)?,
|
|
values_centroid: Vec::new(),
|
|
values_rolloff: Vec::new(),
|
|
values_flatness: Vec::new(),
|
|
sample_rate,
|
|
})
|
|
}
|
|
|
|
/**
|
|
* Compute all the descriptors' value for the given chunk.
|
|
*
|
|
* After using this on all the song's chunks, you can call
|
|
* `get_centroid`, `get_flatness` and `get_rolloff` to get the respective
|
|
* descriptors' values.
|
|
*/
|
|
pub fn do_(&mut self, chunk: &[f32]) -> BlissResult<()> {
|
|
let mut fftgrain: Vec<f32> = vec![0.0; SpectralDesc::WINDOW_SIZE];
|
|
self.phase_vocoder
|
|
.do_(chunk, fftgrain.as_mut_slice())
|
|
.map_err(|e| {
|
|
BlissError::AnalysisError(format!("error while processing aubio pv object: {}", e))
|
|
})?;
|
|
|
|
let bin = self
|
|
.centroid_aubio_desc
|
|
.do_result(fftgrain.as_slice())
|
|
.map_err(|e| {
|
|
BlissError::AnalysisError(format!(
|
|
"error while processing aubio centroid object: {}",
|
|
e
|
|
))
|
|
})?;
|
|
|
|
let freq = bin_to_freq(
|
|
bin,
|
|
self.sample_rate as f32,
|
|
SpectralDesc::WINDOW_SIZE as f32,
|
|
);
|
|
self.values_centroid.push(freq);
|
|
|
|
let mut bin = self
|
|
.rolloff_aubio_desc
|
|
.do_result(fftgrain.as_slice())
|
|
.unwrap();
|
|
|
|
// Until https://github.com/aubio/aubio/pull/318 is in
|
|
if bin > SpectralDesc::WINDOW_SIZE as f32 / 2. {
|
|
bin = SpectralDesc::WINDOW_SIZE as f32 / 2.;
|
|
}
|
|
|
|
let freq = bin_to_freq(
|
|
bin,
|
|
self.sample_rate as f32,
|
|
SpectralDesc::WINDOW_SIZE as f32,
|
|
);
|
|
self.values_rolloff.push(freq);
|
|
|
|
let cvec: CVec = fftgrain.as_slice().into();
|
|
let geo_mean = geometric_mean(cvec.norm());
|
|
if geo_mean == 0.0 {
|
|
self.values_flatness.push(0.0);
|
|
return Ok(());
|
|
}
|
|
let flatness = geo_mean / mean(cvec.norm());
|
|
self.values_flatness.push(flatness);
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
impl Normalize for SpectralDesc {
|
|
const MAX_VALUE: f32 = SAMPLE_RATE as f32 / 2.;
|
|
const MIN_VALUE: f32 = 0.;
|
|
}
|
|
|
|
/**
|
|
* [Zero-crossing rate](https://en.wikipedia.org/wiki/Zero-crossing_rate)
|
|
* detection object.
|
|
*
|
|
* Zero-crossing rate is mostly used to detect percussive sounds in an audio
|
|
* signal, as well as whether an audio signal contains speech or not.
|
|
*
|
|
* It is a good metric to differentiate between songs with people speaking clearly,
|
|
* (e.g. slam) and instrumental songs.
|
|
*
|
|
* The value range is between 0 and 1.
|
|
*/
|
|
#[derive(Default)]
|
|
pub(crate) struct ZeroCrossingRateDesc {
|
|
values: Vec<u32>,
|
|
number_samples: usize,
|
|
}
|
|
|
|
impl ZeroCrossingRateDesc {
|
|
#[allow(dead_code)]
|
|
pub fn new(_sample_rate: u32) -> Self {
|
|
ZeroCrossingRateDesc::default()
|
|
}
|
|
|
|
/// Count the number of zero-crossings for the current `chunk`.
|
|
pub fn do_(&mut self, chunk: &[f32]) {
|
|
self.values.push(number_crossings(chunk));
|
|
self.number_samples += chunk.len();
|
|
}
|
|
|
|
/// Sum the number of zero-crossings witnessed and divide by
|
|
/// the total number of samples.
|
|
pub fn get_value(&mut self) -> f32 {
|
|
self.normalize((self.values.iter().sum::<u32>()) as f32 / self.number_samples as f32)
|
|
}
|
|
}
|
|
|
|
impl Normalize for ZeroCrossingRateDesc {
|
|
const MAX_VALUE: f32 = 1.;
|
|
const MIN_VALUE: f32 = 0.;
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
use crate::Song;
|
|
use std::path::Path;
|
|
|
|
#[test]
|
|
fn test_zcr_boundaries() {
|
|
let mut zcr_desc = ZeroCrossingRateDesc::default();
|
|
let chunk = vec![0.; 1024];
|
|
zcr_desc.do_(&chunk);
|
|
assert_eq!(-1., zcr_desc.get_value());
|
|
|
|
let one_chunk = vec![-1., 1.];
|
|
let chunks = std::iter::repeat(one_chunk.iter())
|
|
.take(512)
|
|
.flatten()
|
|
.cloned()
|
|
.collect::<Vec<f32>>();
|
|
let mut zcr_desc = ZeroCrossingRateDesc::default();
|
|
zcr_desc.do_(&chunks);
|
|
assert!(0.001 > (0.9980469 - zcr_desc.get_value()).abs());
|
|
}
|
|
|
|
#[test]
|
|
fn test_zcr() {
|
|
let song = Song::decode(Path::new("data/s16_mono_22_5kHz.flac")).unwrap();
|
|
let mut zcr_desc = ZeroCrossingRateDesc::default();
|
|
for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
|
|
zcr_desc.do_(&chunk);
|
|
}
|
|
assert!(0.001 > (-0.85036 - zcr_desc.get_value()).abs());
|
|
}
|
|
|
|
#[test]
|
|
fn test_spectral_flatness_boundaries() {
|
|
let mut spectral_desc = SpectralDesc::new(10).unwrap();
|
|
let chunk = vec![0.; 1024];
|
|
|
|
let expected_values = vec![-1., -1.];
|
|
spectral_desc.do_(&chunk).unwrap();
|
|
for (expected, actual) in expected_values
|
|
.iter()
|
|
.zip(spectral_desc.get_flatness().iter())
|
|
{
|
|
assert!(0.0000001 > (expected - actual).abs());
|
|
}
|
|
|
|
let song = Song::decode(Path::new("data/white_noise.flac")).unwrap();
|
|
let mut spectral_desc = SpectralDesc::new(22050).unwrap();
|
|
for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
|
|
spectral_desc.do_(&chunk).unwrap();
|
|
}
|
|
// White noise - as close to 1 as possible
|
|
let expected_values = vec![0.6706717, -0.9685736];
|
|
for (expected, actual) in expected_values
|
|
.iter()
|
|
.zip(spectral_desc.get_flatness().iter())
|
|
{
|
|
assert!(0.001 > (expected - actual).abs());
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_spectral_flatness() {
|
|
let song = Song::decode(Path::new("data/s16_mono_22_5kHz.flac")).unwrap();
|
|
let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
|
|
for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
|
|
spectral_desc.do_(&chunk).unwrap();
|
|
}
|
|
// Spectral flatness mean value computed here with phase vocoder before normalization: 0.111949615
|
|
// Essentia value with spectrum / hann window: 0.11197535695207445
|
|
let expected_values = vec![-0.77610075, -0.8148179];
|
|
for (expected, actual) in expected_values
|
|
.iter()
|
|
.zip(spectral_desc.get_flatness().iter())
|
|
{
|
|
assert!(0.01 > (expected - actual).abs());
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_spectral_roll_off_boundaries() {
|
|
let mut spectral_desc = SpectralDesc::new(10).unwrap();
|
|
let chunk = vec![0.; 512];
|
|
|
|
let expected_values = vec![-1., -1.];
|
|
spectral_desc.do_(&chunk).unwrap();
|
|
for (expected, actual) in expected_values
|
|
.iter()
|
|
.zip(spectral_desc.get_rolloff().iter())
|
|
{
|
|
assert!(0.0000001 > (expected - actual).abs());
|
|
}
|
|
|
|
let song = Song::decode(Path::new("data/tone_11080Hz.flac")).unwrap();
|
|
let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
|
|
for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
|
|
spectral_desc.do_(&chunk).unwrap();
|
|
}
|
|
let expected_values = vec![0.9967681, -0.99615175];
|
|
for (expected, actual) in expected_values
|
|
.iter()
|
|
.zip(spectral_desc.get_rolloff().iter())
|
|
{
|
|
assert!(0.0001 > (expected - actual).abs());
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_spectral_roll_off() {
|
|
let song = Song::decode(Path::new("data/s16_mono_22_5kHz.flac")).unwrap();
|
|
let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
|
|
for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
|
|
spectral_desc.do_(&chunk).unwrap();
|
|
}
|
|
let expected_values = vec![-0.6326486, -0.7260933];
|
|
// Roll-off mean value computed here with phase vocoder before normalization: 2026.7644
|
|
// Essentia value with spectrum / hann window: 1979.632683520047
|
|
for (expected, actual) in expected_values
|
|
.iter()
|
|
.zip(spectral_desc.get_rolloff().iter())
|
|
{
|
|
assert!(0.01 > (expected - actual).abs());
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_spectral_centroid() {
|
|
let song = Song::decode(Path::new("data/s16_mono_22_5kHz.flac")).unwrap();
|
|
let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
|
|
for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
|
|
spectral_desc.do_(&chunk).unwrap();
|
|
}
|
|
// Spectral centroid mean value computed here with phase vocoder before normalization: 1354.2273
|
|
// Essentia value with spectrum / hann window: 1351
|
|
let expected_values = vec![-0.75483, -0.87916887];
|
|
for (expected, actual) in expected_values
|
|
.iter()
|
|
.zip(spectral_desc.get_centroid().iter())
|
|
{
|
|
assert!(0.0001 > (expected - actual).abs());
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_spectral_centroid_boundaries() {
|
|
let mut spectral_desc = SpectralDesc::new(10).unwrap();
|
|
let chunk = vec![0.; 512];
|
|
|
|
spectral_desc.do_(&chunk).unwrap();
|
|
let expected_values = vec![-1., -1.];
|
|
for (expected, actual) in expected_values
|
|
.iter()
|
|
.zip(spectral_desc.get_centroid().iter())
|
|
{
|
|
assert!(0.0000001 > (expected - actual).abs());
|
|
}
|
|
let song = Song::decode(Path::new("data/tone_11080Hz.flac")).unwrap();
|
|
let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
|
|
for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
|
|
spectral_desc.do_(&chunk).unwrap();
|
|
}
|
|
let expected_values = vec![0.97266, -0.9609926];
|
|
for (expected, actual) in expected_values
|
|
.iter()
|
|
.zip(spectral_desc.get_centroid().iter())
|
|
{
|
|
assert!(0.00001 > (expected - actual).abs());
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(all(feature = "bench", test))]
|
|
mod bench {
|
|
extern crate test;
|
|
use crate::timbral::{SpectralDesc, ZeroCrossingRateDesc};
|
|
use test::Bencher;
|
|
|
|
#[bench]
|
|
fn bench_spectral_desc(b: &mut Bencher) {
|
|
let mut spectral_desc = SpectralDesc::new(10).unwrap();
|
|
let chunk = vec![0.; 512];
|
|
|
|
b.iter(|| {
|
|
spectral_desc.do_(&chunk).unwrap();
|
|
});
|
|
}
|
|
|
|
#[bench]
|
|
fn bench_zcr_desc(b: &mut Bencher) {
|
|
let mut zcr_desc = ZeroCrossingRateDesc::new(10);
|
|
let chunk = vec![0.; 512];
|
|
|
|
b.iter(|| {
|
|
zcr_desc.do_(&chunk);
|
|
});
|
|
}
|
|
}
|