Source code for calpy.dsp.audio_features

# -*- coding: utf-8 -*-
import numpy
from scipy.fftpack import dct
import scipy.io.wavfile as wf
from .yin import *
from .. import utilities

def _silence_or_sounding(signal, eps=1e-5):
    """Determine silence and sounding of a given (usually a relatively long period of time) audio.

        Implements algorithms of `PAPER`_ .
        
        Args:
            signal (numpy.array(float)): Sound signal in time domain.
            eps (float, optional): The minimum threshold. Defaults to 1e-5 (previous 1e-8).
        
        Returns:
            list: A 0-1 list marking silence (0) and sounding (1).

        .. _paper:
            http://www.iaeng.org/publication/WCE2009/WCE2009_pp801-806.pdf
    """
    
    N = len(signal)
    signal = signal ** 2
    
    e_max, e_min = signal[0] if signal[0]>eps else 2*eps, eps

    marker = list()
    lamda  = (e_max - e_min)/e_max
    threshold  = (1 - lamda) * e_max + lamda * e_min

    marker.append(1 if signal[0]>threshold else 0)

    for i in range(1, N):
        #YYY modified the original equation from engy = signal[i] to the following.
        #YYY doesn't know why, bu it works more acurately than the original.
        #Happy accident
        engy = signal[i] ** 2
        if engy > e_max:
            e_max = engy
        elif engy < e_min:
            e_min = eps if engy <= eps else engy
        #thresholding
        lamda = (e_max - e_min) / e_max
        threshold = (1 - lamda) * e_max + lamda * e_min
        marker.append(1 if engy > threshold else 0)
        
    return marker

def pause_profile(signal, sampling_rate, min_silence_duration=0.01, time_step=0.01, frame_window=0.025):
    """Find pauses in audio.

    The signal is peak-normalised, every sample is classified as silence or
    sounding by ``_silence_or_sounding``, and every run of consecutive silent
    samples lasting at least ``min_silence_duration`` is marked as a pause.
    The per-sample mask (True = sounding, False = pause) is then handed to
    ``utilities.compress_pause_to_time``.

    Args:
        signal (:obj:`numpy.array(float)`): Audio signal.
        sampling_rate (float): Sampling frequency in Hz.
        min_silence_duration (float, optional): The minimum duration in
            seconds to be considered pause. Default to 0.01.
        time_step (float, optional): Forwarded to
            ``utilities.compress_pause_to_time``. Default to 0.01.
        frame_window (float, optional): Forwarded to
            ``utilities.compress_pause_to_time``. Default to 0.025.

    Returns:
        The value returned by ``utilities.compress_pause_to_time`` for the
        per-sample sounding mask. The original documentation describes it as
        a 0-1 1D array with 1s marking sounding.

    Raises:
        ValueError: If ``signal`` is empty (no peak to normalise by).
    """
    # Work in floating point so integer PCM input cannot overflow in abs()
    # (e.g. abs(int16(-32768)) wraps around).
    samples = numpy.asarray(signal, dtype=float)

    # Peak-normalise. An all-zero signal is left untouched instead of being
    # divided by zero; it is entirely silent either way.
    peak = numpy.max(numpy.abs(samples))
    if peak > 0:
        samples = samples / peak

    silent = numpy.array(_silence_or_sounding(samples)) == 0

    # Shortest silent run, in samples, that counts as a pause.
    min_run = min_silence_duration * sampling_rate

    # Pad with "sounding" on both sides so that runs touching either end of
    # the signal still produce a start edge (+1) and an end edge (-1). This
    # also makes a silent run that reaches the very last sample be measured
    # and marked over its full length.
    edges = numpy.diff(numpy.concatenate(([0], silent.astype(numpy.int8), [0])))
    run_starts = numpy.flatnonzero(edges == 1)
    run_ends = numpy.flatnonzero(edges == -1)

    is_pause = numpy.zeros(len(silent), dtype=bool)
    for start, end in zip(run_starts, run_ends):
        if end - start >= min_run:
            is_pause[start:end] = True

    sounding = numpy.logical_not(is_pause)
    return utilities.compress_pause_to_time(sounding, sampling_rate, time_step=time_step, frame_window=frame_window)
def dB_profile(signal, sampling_rate, time_step=0.01, frame_window=0.025):
    """Compute the decibel level of signal amplitude over an entire conversation.

    The signal is cut into frames of ``frame_window`` seconds taken every
    ``time_step`` seconds. Each frame's mean absolute amplitude is expressed
    in decibels relative to the mean absolute amplitude of all non-zero
    samples of the whole signal.

    Args:
        signal (numpy.array(float)): Padded audio signal.
        sampling_rate (float): Sampling frequency in Hz.
        time_step (float, optional): The time interval (in seconds) between
            two dB values. Default to 0.01.
        frame_window (float, optional): The length of speech (in seconds)
            used to estimate dB. Default to 0.025.

    Returns:
        numpy.array(float): The decibels, one value per frame. Frames whose
        mean amplitude is exactly zero are reported as ``-inf``. The array
        is empty when the signal is shorter than one frame.
    """
    # Amplitudes only; the square is avoided on the raw signal to stay clear
    # of overflow, and float conversion keeps abs() safe for integer PCM.
    magnitude = numpy.abs(numpy.asarray(signal, dtype=float))

    hop = int(sampling_rate * time_step)
    # Round the frame length up to a whole number of samples.
    frame_len = int(numpy.ceil(sampling_rate * frame_window))
    # Clamp at zero: a signal shorter than one frame yields no frames.
    n_frames = max(0, (len(magnitude) - frame_len) // hop + 1)
    if not n_frames:
        print("Warning: not enough signal, dB profile will be empty.")
        return numpy.empty(0)

    # Frames advance by the hop and overlap, so that all n_frames frames lie
    # inside the signal.
    levels = numpy.array([
        numpy.mean(magnitude[i * hop:i * hop + frame_len])
        for i in range(n_frames)
    ])

    dB = numpy.full(n_frames, -numpy.inf)
    nonzero = levels != 0
    if nonzero.any():
        # Use the mean over the entire conversation as the reference level.
        reference = numpy.mean(magnitude[magnitude > 0])
        dB[nonzero] = 20 * numpy.log10(levels[nonzero] / reference)
    return dB
def pitch_profile(signal, sampling_rate, time_step=0.01, frame_window=0.025, lower_threshold=75, upper_threshold=255):
    """Compute pitch for a long (usually over an entire conversation) sound signal.

    The signal is cut into frames of ``frame_window`` seconds taken every
    ``time_step`` seconds and ``instantaneous_pitch`` is evaluated on each
    frame. Estimates outside ``[lower_threshold, upper_threshold]`` are
    replaced by 0.

    Args:
        signal (numpy.array(float)): Padded audio signal.
        sampling_rate (float): Sampling frequency in Hz.
        time_step (float, optional): The time interval (in seconds) between
            two pitches. Default to 0.01.
        frame_window (float, optional): The length of speech (in seconds)
            used to estimate pitch. Default to 0.025.
        lower_threshold (int, optional): Estimates below this value are set
            to 0. Defaults to 75.
        upper_threshold (int, optional): Estimates above this value are set
            to 0. Defaults to 255.

    Returns:
        numpy.array(float): Estimated pitch in Hz, one value per frame.
        Empty (with a printed warning) when the signal is shorter than one
        frame.
    """
    hop = int(sampling_rate * time_step)
    # Round the frame length up to a whole number of samples.
    frame_len = int(numpy.ceil(sampling_rate * frame_window))
    # Clamp at zero: for very short signals the formula goes negative, which
    # would otherwise skip the warning and crash numpy.empty.
    n_frames = max(0, (len(signal) - frame_len) // hop + 1)
    if not n_frames:
        print("Warning: not enough signal, pitch profile will be empty.")

    pitches = numpy.empty(n_frames)
    for i in range(n_frames):
        start = i * hop
        pitches[i] = instantaneous_pitch(signal[start:start + frame_len], sampling_rate)

    # Discard estimates outside the accepted range.
    pitches[(pitches > upper_threshold) | (pitches < lower_threshold)] = 0
    return pitches
def mfcc_profile(signal, sampling_rate, time_step=0.01, frame_window=0.025, NFFT=512, nfilt=40, ceps=12):
    """Compute MFCC for a long (usually over an entire conversation) sound signal.

    Reference:
        http://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html

    Args:
        signal (numpy.array(float)): Padded audio signal. It is not modified.
        sampling_rate (float): Sampling frequency in Hz.
        time_step (float, optional): The time interval (in seconds) between
            two MFCC. Default to 0.01.
        frame_window (float, optional): The length of speech (in seconds)
            used to estimate MFCC. Default to 0.025.
        NFFT (int, optional): NFFT-point FFT. Defaults to 512.
        nfilt (int, optional): Number of frequency bands in Mel-scaling.
            Defaults to 40.
        ceps (int, optional): Number of mel frequency cepstral coefficients
            to be retained. Defaults to 12.

    Returns:
        numpy.array(): Calculated Mel-Frequency Cepstral Coefficients matrix
        of shape ``(ceps, number_of_frames)``. It has zero columns (with a
        printed warning) when the signal is shorter than one frame.
    """
    hop = int(sampling_rate * time_step)
    # Round the frame length up to a whole number of samples.
    frame_len = int(numpy.ceil(sampling_rate * frame_window))
    # Clamp at zero: for very short signals the formula goes negative, which
    # would otherwise crash numpy.empty.
    n_frames = max(0, (len(signal) - frame_len) // hop + 1)
    res = numpy.empty((ceps, n_frames))
    if not n_frames:
        print("Warning: not enough signal, mfcc profile will be empty.")
        return res

    # Pre-calculate the triangular Mel filter bank.
    # The upper edge is the Nyquist frequency (sampling_rate / 2) in Mel.
    high_mel = 2595 * numpy.log10(1 + sampling_rate / 1400)
    mel_pts = numpy.linspace(0, high_mel, nfilt + 2)
    # Back to Hz, then to FFT bin indices.
    hz_pts = (10 ** (mel_pts / 2595) - 1) * 700
    bins = numpy.floor((NFFT + 1) * hz_pts / sampling_rate)

    bank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1))))
    for i in range(1, nfilt + 1):
        left, mid, right = int(bins[i - 1]), int(bins[i]), int(bins[i + 1])
        # Rising edge of the triangle.
        for k in range(left, mid):
            bank[i - 1, k] = (k - bins[i - 1]) / (bins[i] - bins[i - 1])
        # Falling edge of the triangle.
        for k in range(mid, right):
            bank[i - 1, k] = (bins[i + 1] - k) / (bins[i + 1] - bins[i])
    bank = bank.T

    # The window is the same for every frame.
    window = numpy.hamming(frame_len)

    for i in range(n_frames):
        start = i * hop
        # Apply a Hamming window before the FFT. The product is a NEW array:
        # multiplying the slice in place would write through the view into
        # the caller's signal and window overlapping frames more than once
        # (and would fail outright on integer input).
        frame = signal[start:start + frame_len] * window
        # Power spectrum. NOTE: numpy.fft.rfft crops the frame to NFFT
        # samples when frame_len > NFFT.
        pow_frame = (1.0 / NFFT) * (numpy.absolute(numpy.fft.rfft(frame, NFFT)) ** 2)
        # Apply the filter bank.
        filter_bank = numpy.dot(pow_frame, bank)
        # Replace exact zeros so the logarithm stays finite.
        filter_bank = numpy.where(filter_bank == 0, numpy.finfo(float).eps, filter_bank)
        # Scale to dB.
        filter_bank = 20 * numpy.log10(filter_bank)
        # Keep coefficients 1..ceps of the DCT (coefficient 0 is dropped).
        res[:, i] = dct(filter_bank, norm='ortho')[1:ceps + 1]
    return res
def remove_long_pauses(inputfilename, outputfilename, long_pause=0.5, min_silence_duration=0.01):
    """Remove long pauses/silence in a wav file.

    Args:
        inputfilename (string): file name of input wav.
        outputfilename (string): file name of output wav.
        long_pause (float, optional): minimum duration of silence to be
            considered a long pause, in seconds. Defaults to 0.5.
        min_silence_duration (float, optional): The minimum duration in
            seconds to be considered pause. Default to 0.01.

    Returns:
        None: writes a wav file to disk.
    """
    fs, sound = wf.read(inputfilename)

    # Threshold expressed as a number of consecutive pause entries.
    # NOTE(review): the count is in units of min_silence_duration, while the
    # entries of `pauses` are converted to samples with `fs // 100` below,
    # i.e. assumed to be 0.01 s apart (pause_profile's default time_step).
    # The two only agree when min_silence_duration == 0.01 -- confirm.
    long_pause = int(long_pause / min_silence_duration)

    # Multi-channel audio: pauses are detected on the first channel only.
    mono = sound[:, 0] if sound.ndim > 1 else sound
    pauses = pause_profile(mono, fs, min_silence_duration=min_silence_duration)
    n_entries = len(pauses)

    # Collect (first, end) index pairs of the runs to cut out. Truthy entries
    # of `pauses` are treated as pause frames here.
    # NOTE(review): verify this polarity against what pause_profile returns.
    cnt = idx0 = 0
    idxs = []
    for idx, pause in enumerate(pauses):
        if pause:
            if cnt == 0:
                idx0 = idx
            cnt += 1
        elif cnt:
            if cnt >= long_pause:
                idxs.append((idx0, idx))
            cnt = 0
    # A run still open at the end is cut regardless of its length (original
    # behaviour, kept as is).
    if cnt:
        idxs.append((idx0, idx))
    # Sentinel so the audio after the last cut is copied as well. Checking
    # for an empty list first avoids an IndexError when nothing was cut.
    if not idxs or idxs[-1][-1] != n_entries:
        idxs.append((n_entries, n_entries))

    # Gather the kept stretches and join them once. Slicing along axis 0
    # works for mono and for any number of channels, keeps the dtype of the
    # input, and does not prepend an artificial all-zero frame.
    pieces = []
    cursor = 0
    for first, end in idxs:
        pieces.append(sound[cursor:first * fs // 100])
        cursor = end * fs // 100
    sounding_sound = numpy.concatenate(pieces, axis=0)

    wf.write(outputfilename, fs, sounding_sound)
def get_pause_length(pauses):
    """Measure every run of consecutive pauses.

    Args:
        pauses (numpy array, bool): True indicates occurrence of pause.

    Returns:
        res (numpy array): The lengths of the runs of consecutive truthy
        entries, in order of appearance.
    """
    run_lengths = []
    current = 0
    for flag in pauses:
        if flag:
            current += 1
            continue
        # A non-pause entry closes the run in progress, if there is one.
        if current:
            run_lengths.append(current)
        current = 0
    # A run that reaches the end of the input is still open here.
    if current:
        run_lengths.append(current)
    return numpy.array(run_lengths)
def pause_length_histogram(pauses, min_silence_duration=0.01, bins=30):
    """Compute the histogram of pause length.

    Args:
        pauses (numpy array, bool): True indicates occurrence of pause.
        min_silence_duration (float, optional): The minimum duration in
            seconds to be considered pause; the bin edges are multiplied by
            it. If not provided, then default to 0.01.
        bins (int, optional): Defines the number of equal-width bins in the
            given range. Defaults to 30.

    Returns:
        hist (numpy array): The values of the histogram.
        bin_edges (numpy array, float): the bin edges (length(hist)+1) in
        seconds.
    """
    assert type(bins) is int, "input to bins must be an integer."
    counts, edges = numpy.histogram(get_pause_length(pauses), bins=bins)
    # Pause lengths are counts of entries; scale the edges to seconds.
    edges_in_seconds = edges * min_silence_duration
    return (counts, edges_in_seconds)