RASTA-PLP 听觉谱应用到语音分类

2023-10-26 11:40:04
python 实现PLP代码和语音分类项目
https://github.com/linhndt/spoken_language_classification/blob/master/feature_extractor/python_speech_features/rasta.py
import librosa
import librosa.filters
import scipy.fftpack as fft
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt
import spectrum


def rastaplp(x, fs=16000, win_time=0.040, hop_time=0.020, dorasta=True, modelorder=8):
    # first compute power spectrum
    p_spectrum, _ = powspec(x, fs, win_time, hop_time)
    # next group to critical bands
    aspectrum = audspec(p_spectrum, fs)
    nbands = aspectrum.shape[0]

    if dorasta:
        # put in log domain
        nl_aspectrum = np.log(aspectrum)
        # next do rasta filtering
        ras_nl_aspectrum = rastafilt(nl_aspectrum)
        # do inverse log
        aspectrum = np.exp(ras_nl_aspectrum)

    postspectrum, _ = postaud(aspectrum, fs / 2)

    lpcas = dolpc(postspectrum, modelorder)
    cepstra = lpc2cep(lpcas, modelorder + 1)

    if modelorder > 0:
        lpcas = dolpc(postspectrum, modelorder)
        cepstra = lpc2cep(lpcas, modelorder + 1)
        spectra, F, pM = lpc2spec(lpcas, nbands)
    else:
        spectra = postspectrum
        cepstra = spec2cep(spectra)

    cepstra = lifter(cepstra, 0.6)

    return cepstra


def powspec(x, fs=16000, window_time=0.040, hop_time=0.020, dither=1):
    win_length = int(np.round(window_time * fs))
    hop_length = int(np.round(hop_time * fs))
    fft_length = int(np.power(2, np.ceil(np.log2(window_time * fs))))

    X = librosa.stft(np.multiply(32768, x.astype(float)), n_fft=fft_length, hop_length=hop_length,
                     win_length=win_length, window='hann', center=False)
    pow_X = np.power(np.abs(X), 2)
    if dither:
        pow_X = np.add(pow_X, win_length)
    e = np.log(np.sum(pow_X, axis=0))
    return pow_X, e


def hz2bark(f):
    z = np.multiply(6, np.arcsinh(np.divide(f, 600)))
    return z


def bark2hz(z):
    hz = np.multiply(600, np.sinh(np.divide(z, 6)))
    return hz


def fft2barkmx(fft_length, fs, nfilts=0, band_width=1, min_freq=0, max_freq=0):
    if max_freq == 0:
        max_freq = fs / 2

    min_bark = hz2bark(min_freq)
    nyqbark = hz2bark(max_freq) - min_bark

    if nfilts == 0:
        nfilts = np.add(np.ceil(nyqbark), 1)

    wts = np.zeros((int(nfilts), int(fft_length)))
    step_barks = np.divide(nyqbark, np.subtract(nfilts, 1))
    binbarks = hz2bark(np.multiply(np.arange(0, np.add(np.divide(fft_length, 2), 1)), np.divide(fs, fft_length)))

    for i in range(int(nfilts)):
        f_bark_mid = min_bark + np.multiply(i, step_barks)
        lof = np.subtract(np.subtract(binbarks, f_bark_mid), 0.5)
        hif = np.add(np.subtract(binbarks, f_bark_mid), 0.5)
        wts[i, 0: int(fft_length / 2) + 1] = np.power(10, np.minimum(0,
                                                                     np.divide(np.minimum(hif, np.multiply(-2.5, lof)),
                                                                               band_width)))
    return wts


def rastafilt(x):
    numer = np.arange(-2, 3)
    numer = np.divide(-numer, np.sum(np.multiply(numer, numer)))
    denom = np.array([1, -0.94])

    zi = signal.lfilter_zi(numer, 1)
    y = np.zeros((x.shape))
    for i in range(x.shape[0]):
        y1, zi = signal.lfilter(numer, 1, x[i, 0:4], axis=0, zi=zi * x[i, 0])
        y1 = y1 * 0
        y2, _ = signal.lfilter(numer, denom, x[i, 4:x.shape[1]], axis=0, zi=zi)
        y[i, :] = np.append(y1, y2)
    return y


def dolpc(x, modelorder=12):
    nbands, nframes = x.shape
    ncorr = 2 * (nbands - 1)
    R = np.zeros((ncorr, nframes))

    R[0:nbands, :] = x
    for i in range(nbands - 1):
        R[i + nbands - 1, :] = x[nbands - (i + 1), :]

    r = fft.ifft(R.T).real.T
    r = r[0:nbands, :]

    y = np.ones((nframes, modelorder + 1))
    e = np.zeros((nframes, 1))

    if modelorder == 0:
        for i in range(nframes):
            _, e_tmp, _ = spectrum.LEVINSON(r[:, i], modelorder, allow_singularity=True)
            e[i, 0] = e_tmp
    else:
        for i in range(nframes):
            y_tmp, e_tmp, _ = spectrum.LEVINSON(r[:, i], modelorder, allow_singularity=True)
            y[i, 1:modelorder + 1] = y_tmp
            e[i, 0] = e_tmp

    y = np.divide(y.T, np.add(np.tile(e.T, (modelorder + 1, 1)), 1e-8))

    return y


def lpc2cep(a, nout=0):
    nin, ncol = a.shape

    order = nin - 1

    if nout == 0:
        nout = order + 1

    cep = np.zeros((nout, ncol))
    cep[0, :] = -np.log(a[0, :])

    norm_a = np.divide(a, np.add(np.tile(a[0, :], (nin, 1)), 1e-8))

    for n in range(1, nout):
        sum = 0
        for m in range(1, n):
            sum = np.add(sum, np.multiply(np.multiply((n - m), norm_a[m, :]), cep[(n - m), :]))

        cep[n, :] = -np.add(norm_a[n, :], np.divide(sum, n))

    return cep


def lifter(x, lift=0.6, invs=False):
    ncep = x.shape[0]

    if lift == 0:
        y = x
    else:
        if lift < 0:
            warnings.warn('HTK liftering does not support yet; default liftering')
            lift = 0.6
        liftwts = np.power(np.arange(1, ncep), lift)
        liftwts = np.append(1, liftwts)

        if (invs):
            liftwts = np.divide(1, liftwts)

        y = np.matmul(np.diag(liftwts), x)

    return y


def melfcc(x, fs=16000, min_freq=50, max_freq=6500, n_mfcc=13, n_bands=40, lifterexp=0.6,
           fbtype='fcmel', dcttype=1, usecmp=True, window_time=0.040, hop_time=0.020,
           preemph=0.97, dither=1, sumpower=1, band_width=1, modelorder=0,
           broaden=0, useenergy=False):
    if preemph != 0:
        b = [1, -preemph]
        a = 1
        x = signal.lfilter(b, a, x)

    pspectrum, logE = powspec(x, fs=fs, window_time=window_time, hop_time=hop_time, dither=dither)
    aspectrum = audspec(pspectrum, fs=fs, nfilts=n_bands, fbtype=fbtype,
                        min_freq=min_freq, max_freq=max_freq)

    if usecmp:
        aspectrum, _ = postaud(aspectrum, fmax=max_freq, fbtype=fbtype)

    if modelorder > 0:
        lpcas = dolpc(aspectrum, modelorder)
        cepstra = lpc2cep(lpcas, nout=n_mfcc)

    else:
        cepstra, _ = spec2cep(aspectrum, ncep=n_mfcc, dcttype=dcttype)

    cepstra = lifter(cepstra, lift=lifterexp)

    if useenergy == True:
        cepstra[0, :] = logE

    return cepstra


def hz2mel(f, htk=False):
    if htk:
        z = np.multiply(2595, np.log10(np.add(1, np.divide(f, 700))))
    else:
        f_0 = 0.0
        f_sp = 200 / 3
        brkfrq = 1000
        brkpt = (brkfrq - f_0) / f_sp
        logstep = np.exp(np.log(6.4) / 27.0)

        f = np.array(f, ndmin=1)
        z = np.zeros((f.shape[0],))

        for i in range(f.shape[0]):
            if f[i] < brkpt:
                z[i] = (f[i] - f_0) / f_sp
            else:
                z[i] = brkpt + (np.log(f[i] / brkfrq) / np.log(logstep))
    return z


def mel2hz(z, htk=False):
    if htk:
        f = np.multiply(700, np.subtract(np.power(10, np.divide(z, 2595)), 1))
    else:
        f_0 = 0
        f_sp = 200 / 3
        brkfrq = 1000
        brkpt = (brkfrq - f_0) / f_sp
        logstep = np.exp(np.log(6.4) / 27.0)

        z = np.array(z, ndmin=1)
        f = np.zeros((z.shape[0],))

        for i in range(z.shape[0]):
            if z[i] < brkpt:
                f[i] = f_0 + f_sp * z[i]
            else:
                f[i] = brkfrq * np.exp(np.log(logstep) * (z[i] - brkpt))
    return f


def fft2melmx(fft_length, fs, nfilts=0, band_width=1, min_freq=0, max_freq=0, htk=False, constamp=False):
    if nfilts == 0:
        nfilts = np.ceil(hz2mel(max_freq, htk) / 2)
    if max_freq == 0:
        max_freq = fs / 2

    wts = np.zeros((int(nfilts), int(fft_length)))
    fftfrqs = np.multiply(np.divide(np.arange(0, fft_length / 2 + 1), fft_length), fs)

    min_mel = hz2mel(min_freq, htk)
    max_mel = hz2mel(max_freq, htk)
    binfrqs = mel2hz(np.add(min_mel, np.multiply(np.arange(0, nfilts + 2),
                                                 (max_mel - min_mel) / (nfilts + 1))), htk)

    for i in range(int(nfilts)):
        fs_tmp = binfrqs[np.add(np.arange(0, 3), i)]
        fs_tmp = np.add(fs_tmp[1], np.multiply(band_width, np.subtract(fs_tmp, fs_tmp[1])))
        loslope = np.divide(np.subtract(fftfrqs, fs_tmp[0]), np.subtract(fs_tmp[1], fs_tmp[0]))
        hislope = np.divide(np.subtract(fs_tmp[2], fftfrqs), np.subtract(fs_tmp[2], fs_tmp[1]))
        wts[i, 0: int(fft_length / 2) + 1] = np.maximum(0, np.minimum(loslope, hislope))

    if constamp == False:
        wts = np.matmul(np.diag(np.divide(2, np.subtract(binfrqs[2: int(nfilts) + 2],
                                                         binfrqs[0: int(nfilts)]))), wts)

    return wts


def audspec(p_spectrum, fs=16000, nfilts=0, fbtype='bark', min_freq=0, max_freq=0, sumpower=1, band_width=1):
    if nfilts == 0:
        np.add(np.ceil(hz2bark(fs / 2)), 1)
    if max_freq == 0:
        max_freq = fs / 2
    nfreqs = p_spectrum.shape[0]
    nfft = (int(nfreqs) - 1) * 2

    if fbtype == 'bark':
        wts = fft2barkmx(nfft, fs, nfilts, band_width, min_freq, max_freq)
    elif fbtype == 'mel':
        wts = fft2melmx(nfft, fs, nfilts, band_width, min_freq, max_freq)
    elif fbtype == 'htkmel':
        wts = fft2melmx(nfft, fs, nfilts, band_width, min_freq, max_freq, htk=True, constamp=True)
    elif fbtype == 'fcmel':
        wts = fft2melmx(nfft, fs, nfilts, band_width, min_freq, max_freq, htk=True, constamp=False)

    wts = wts[:, 0: nfreqs]

    if sumpower:
        aspectrum = np.matmul(wts, p_spectrum)
    else:
        aspectrum = np.power(np.matmul(wts, np.sqrt(p_spectrum)), 2)
    return aspectrum


def postaud(x, fmax, fbtype='bark', broaden=0):
    nbands, nframes = x.shape
    nfpts = int(nbands + 2 * broaden)

    if fbtype == 'bark':
        bandcfhz = bark2hz(np.linspace(0, hz2bark(fmax), nfpts))
    elif fbtype == 'mel':
        bandcfhz = mel2hz(np.linspace(0, hz2mel(fmax), nfpts))
    elif fbtype == 'htkmel' or fbtype == 'fcmel':
        bandcfhz = mel2hz(np.linspace(0, hz2mel(fmax, htk=True), nfpts), htk=True)

    bandcfhz = bandcfhz[broaden: (nfpts - broaden)]

    fsq = np.power(bandcfhz, 2)
    ftmp = np.add(fsq, 1.6e5)
    eql = np.multiply(np.power(np.divide(fsq, ftmp), 2), np.divide(np.add(fsq, 1.44e6), np.add(fsq, 9.61e6)))

    z = np.multiply(np.tile(eql, (nframes, 1)).T, x)
    z = np.power(z, 0.33)

    if broaden:
        y = np.zeros((z.shape[0] + 2, z.shape[1]))
        y[0, :] = z[0, :]
        y[1:nbands + 1, :] = z
        y[nbands + 1, :] = z[z.shape[0] - 1, :]
    else:
        y = np.zeros((z.shape[0], z.shape[1]))
        y[0, :] = z[1, :]
        y[1:nbands - 1, :] = z[1:z.shape[0] - 1, :]
        y[nbands - 1, :] = z[z.shape[0] - 2, :]

    return y, eql


def spec2cep(spec, ncep, dcttype):
    nrow, ncol = spec.shape[0], spec.shape[1]
    dctm = np.zeros((ncep, nrow))

    if dcttype == 2 or dcttype == 3:
        for i in range(ncep):
            dctm[i, :] = np.multiply(
                np.cos(np.multiply(np.divide(np.multiply(i, np.arange(1, 2 * nrow, 2)), (2 * nrow)), np.pi)),
                np.sqrt(2 / nrow))

        if dcttype == 2:
            dctm[0, :] = np.divide(dctm[0, :], np.sqrt(2))

    elif dcttype == 4:
        for i in range(ncep):
            dctm[i, :] = np.multiply(
                np.cos(np.multiply(np.divide(np.multiply(i, np.arange(1, nrow + 1)), (nrow + 1)), np.pi)), 2)
            dctm[i, 0] = np.add(dctm[i, 0], 1)
            dctm[i, int(nrow - 1)] = np.multiply(dctm[i, int(nrow - 1)], np.power(-1, i))
        dctm = np.divide(dctm, 2 * (nrow + 1))

    else:
        for i in range(ncep):
            dctm[i, :] = np.divide(
                np.multiply(np.cos(np.multiply(np.divide(np.multiply(i, np.arange(0, nrow)), (nrow - 1)), np.pi)), 2),
                2 * (nrow - 1))
        dctm[:, 0] = np.divide(dctm[:, 0], 2)
        dctm[:, int(nrow - 1)] = np.divide(dctm[:, int(nrow - 1)], 2)

    cep = np.matmul(dctm, np.log(np.add(spec, 1e-8)))

    return cep, dctm


def lpc2spec(lpcas, nout=17, FMout=False):
    rows, cols = lpcas.shape
    order = rows - 1

    gg = lpcas[0, :]
    aa = np.divide(lpcas, np.tile(gg, (rows, 1)))

    #    Calculate the actual z-plane polyvals: nout points around unit circle
    tmp_1 = np.array(np.arange(0, nout), ndmin=2).T
    tmp_1 = np.divide(np.multiply(-1j, np.multiply(tmp_1, np.pi)), (nout - 1))
    tmp_2 = np.array(np.arange(0, order + 1), ndmin=2)
    zz = np.exp(np.matmul(tmp_1, tmp_2))
    #    Actual polyvals, in power (mag^2)
    features = np.divide(np.power(np.divide(1, np.abs(np.matmul(zz, aa))), 2), np.tile(gg, (nout, 1)))
    F = np.zeros((cols, int(np.ceil(rows / 2))))
    M = F

    if FMout == True:
        for c in range(cols):
            aaa = aa[:, c]
            rr = np.roots(aaa)
            ff_tmp = np.angle(rr)
            ff = np.array(ff_tmp, ndmin=2).T
            zz = np.exp(np.multiply(1j, np.matmul(ff, np.array(np.arange(0, aaa.shape[0]), ndmin=2))))
            mags = np.sqrt(np.divide(np.power(np.divide(1, np.abs(np.matmul(zz, np.array(aaa, ndmin=2).T))), 2), gg[c]))

            ix = np.argsort(ff_tmp)
            dummy = np.sort(ff_tmp)
            mp_F_list = []
            tmp_M_list = []

            for i in range(ff.shape[0]):
                if dummy[i] > 0:
                    tmp_F_list = np.append(tmp_F_list, dummy[i])
                    tmp_M_list = np.append(tmp_M_list, mags[ix[i]])

            M[c, 0: tmp_M_list.shape[0]] = tmp_M_list
            F[c, 0: tmp_F_list.shape[0]] = tmp_F_list

    return features, F, M


def deltas(x, w=9):
    rows, cols = x.shape
    hlen = np.floor(w / 2)
    win = np.arange(hlen, -(hlen + 1), -1, dtype='float32')

    xx = np.append(np.append(np.tile(x[:, 0], (int(hlen), 1)).T, x, axis=1),
                   np.tile(x[:, cols - 1], (int(hlen), 1)).T, axis=1)

    d = signal.lfilter(win, 1, xx, axis=1)
    d = d[:, int(2 * hlen): int(2 * hlen + cols)]
    return d


def cep2spec(cep, nfreq, dcttype=2):
    ncep, ncol = cep.shape

    dctm = np.zeros((ncep, nfreq))
    idctm = np.zeros((nfreq, ncep))

    if dcttype == 2 or dcttype == 3:
        for i in range(ncep):
            dctm[i, :] = np.multiply(np.cos(np.multiply(np.divide(np.multiply(i, np.arange(1, 2 * nfreq, 2)),
                                                                  (2 * nfreq)), np.pi)), np.sqrt(2 / nfreq))

        if dcttype == 2:
            dctm[0, :] = np.divide(dctm[0, :], np.sqrt(2))
        else:
            dctm[0, :] = np.divide(dctm[0, :], 2)

        idctm = dctm.T

    elif dcttype == 4:
        for i in range(ncep):
            idctm[:, i] = np.multiply(
                np.cos(np.multiply(np.divide(np.multiply(i, np.arange(1, nfreq + 1).T), (nfreq + 1)), np.pi)), 2)

        idctm[:, 0:ncep] = np.divide(idctm[:, 0:ncep], 2)

    else:
        for i in range(ncep):
            idctm[:, i] = np.multiply(
                np.cos(np.multiply(np.divide(np.multiply(i, np.arange(0, nfreq).T), (nfreq - 1)), np.pi)), 2)

        idctm[:, [0, -1]] = np.divide(idctm[:, [0, -1]], 2)

    spec = np.exp(np.matmul(idctm, cep))

    return spec, idctm


def invpostaud(y, fmax, fbtype='bark', broaden=0):
    nbands, nframes = y.shape

    if fbtype == 'bark':
        bandcfhz = bark2hz(np.linspace(0, hz2bark(fmax), nbands))
    elif fbtype == 'mel':
        bandcfhz = mel2hz(np.linspace(0, hz2mel(fmax), nbands))
    elif fbtype == 'htkmel' or fbtype == 'fcmel':
        bandcfhz = mel2hz(np.linspace(0, hz2mel(fmax, htk=True), nbands), htk=True)

    bandcfhz = bandcfhz[broaden: (nbands - broaden)]

    fsq = np.power(bandcfhz, 2)
    ftmp = np.add(fsq, 1.6e5)
    eql = np.multiply(np.power(np.divide(fsq, ftmp), 2),
                      np.divide(np.add(fsq, 1.44e6), np.add(fsq, 9.61e6)))

    x = np.power(y, np.divide(1, 0.33))

    if eql[0] == 0:
        eql[0] = eql[1]
        eql[-1] = eql[-2]

    x = np.divide(x[broaden: (nbands - broaden + 1), :], np.add(np.tile(eql.T, (nframes, 1)).T, 1e-8))

    return x, eql


def invpowspec(y, fs, win_time, hop_time, excit=[]):
    nrow, ncol = y.shape
    r = excit

    winpts = int(np.round(np.multiply(win_time, fs)))
    steppts = int(np.round(np.multiply(hop_time, fs)))
    nfft = int(np.power(2, np.ceil(np.divide(np.log(winpts), np.log(2)))))

    # Can't predict librosa stft length...
    tmp = librosa.istft(y, hop_length=steppts, win_length=winpts,
                        window='hann', center=False)
    xlen = len(tmp)
    # xlen = int(np.add(winpts, np.multiply(steppts, np.subtract(ncol, 1))))
    # xlen = int(np.multiply(steppts, np.subtract(ncol, 1)))

    if len(r) == 0:
        r = np.squeeze(np.random.randn(xlen, 1))
    r = r[0:xlen]

    R = librosa.stft(np.divide(r, 32768 * 12), n_fft=nfft, hop_length=steppts,
                     win_length=winpts, window='hann', center=False)

    R = np.multiply(R, np.sqrt(y))
    x = librosa.istft(R, hop_length=steppts, win_length=winpts,
                      window='hann', center=False)

    return x


def invaudspec(aspectrum, fs=16000, nfft=512, fbtype='bark',
               min_freq=0, max_freq=0, sumpower=True, band_width=1):
    if max_freq == 0:
        max_freq = fs / 2
    nfilts, nframes = aspectrum.shape

    if fbtype == 'bark':
        wts = fft2barkmx(nfft, fs, nfilts, band_width, min_freq, max_freq)
    elif fbtype == 'mel':
        wts = fft2melmx(nfft, fs, nfilts, band_width, min_freq, max_freq)
    elif fbtype == 'htkmel':
        wts = fft2melmx(nfft, fs, nfilts, band_width, min_freq, max_freq, htk=True, constamp=True)
    elif fbtype == 'fcmel':
        wts = fft2melmx(nfft, fs, nfilts, band_width, min_freq, max_freq, htk=True, constamp=False)

    wts = wts[:, 0:int(nfft / 2 + 1)]

    ww = np.matmul(wts.T, wts)
    itws = np.divide(wts.T, np.tile(np.maximum(np.divide(np.mean(np.diag(ww)), 100),
                                               np.sum(ww, axis=0)), (nfilts, 1)).T)
    if sumpower == True:
        spec = np.matmul(itws, aspectrum)
    else:
        spec = np.power(np.matmul(itws, np.sqrt(aspectrum)))

    return spec, wts, itws


def invmelfcc(cep, fs, win_time=0.040, hop_time=0.020, lifterexp=0.6, sumpower=True,
              preemph=0.97, max_freq=6500, min_freq=50, n_bands=40, band_width=1,
              dcttype=2, fbtype='mel', usecmp=False, modelorder=0, broaden=0, excitation=[]):
    winpts = int(np.round(np.multiply(win_time, fs)))
    nfft = int(np.power(2, np.ceil(np.divide(np.log(winpts), np.log(2)))))

    cep = lifter(cep, lift=lifterexp, invs=True)

    pspc, _ = cep2spec(cep, nfreq=int(n_bands + 2 * broaden), dcttype=dcttype)

    if usecmp == True:
        aspc, _ = invpostaud(pspc, fmax=max_freq, fbtype=fbtype, broaden=broaden)
    else:
        aspc = pspc

    spec, _, _ = invaudspec(aspc, fs=fs, nfft=nfft, fbtype=fbtype, min_freq=min_freq,
                            max_freq=max_freq, sumpower=sumpower, band_width=band_width)

    x = invpowspec(spec, fs, win_time=win_time, hop_time=hop_time, excit=excitation)

    if preemph != 0:
        b = [1, -preemph]
        a = 1
        x = signal.lfilter(b, a, x)

    return x, aspc, spec, pspc
# reference
# https://labrosa.ee.columbia.edu/matlab/rastamat/
if __name__ == '__main__':
    d,fs=librosa.load("ATC00035.wav",16000)
    plt.figure()
    plt.imshow(rastaplp(d))
    plt.show()
码农公寓

相关文章