[Audio processing] 数据集生成 & 性别年龄分类训练 Python

2023-02-24 22:46:09

1、重命名：Python处理中文路径时容易出现各种错误，所以需要先将所有文件的路径名全都改成不含中文的名字（这里统一改成数字编号）。用的是MAC系统，WIN下的命令行批处理没法用，所以用C++来完成

//  Created by Carl on 16.

//  Copyright (c) 2016年 Carl. All rights reserved.

//

#include <iostream>

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <dirent.h>

#include <unistd.h>

using namespace std;

// Move every regular file found in sourceDir into targetDir, renaming the
// files to consecutive numbers: 0.wav, 1.wav, 2.wav, ...
//
// Directory entries that are not regular files (including "." and "..") are
// skipped. For each file the source and destination paths are printed,
// followed by "ok" or "error" depending on the result of rename().
// The process exits with status 1 if sourceDir cannot be opened.
void getFileList()
{
    const string sourceDir = "/Users/karl/Work/database/rawdata/children_CN/";
    const string targetDir = "/Users/karl/Work/database/rawdata/children/";

    DIR *dir = opendir(sourceDir.c_str());
    if (dir == NULL)
    {
        perror("Open dir error...");
        exit(1);
    }

    int i = 0;  // running index used as the new file name
    struct dirent *ptr;
    while ((ptr = readdir(dir)) != NULL)
    {
        // Skip the current and parent directory entries.
        if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0)
            continue;

        // Only regular files are renamed.
        if (ptr->d_type != DT_REG)
            continue;

        // The index advances even when rename() fails, so a failed file
        // leaves a gap in the numbering instead of shifting later names.
        const string from = sourceDir + ptr->d_name;
        const string to = targetDir + to_string(i++) + ".wav";

        printf("%s %s\n", from.c_str(), to.c_str());
        if (rename(from.c_str(), to.c_str()) < 0)
            cout << "error" << endl;
        else
            cout << "ok" << endl;
    }

    // Release the directory handle once the listing is exhausted.
    closedir(dir);
}

// Program entry point: run the batch rename and report success to the shell.
int main() {
    getFileList();
    return 0;
}

2、然后再使用FFMPEG那篇文章写的Python代码，将所有音频文件转成统一格式

#coding=utf-8

#!/usr/bin/env python

'''CREATED:2016-03-08

Use example of ffmpeg

'''

import argparse

import sys

import os

import string

import subprocess as sp

# Full path of the ffmpeg executable that performs the conversion

FFMPEG_BIN = "/Users/karl/Documents/python/audio/tool/ffmpeg"

# Directory holding the input .wav files (passed to convert() as sourceDir)

sourceDir = "/Users/karl/Work/database/rawdata/male/"

# Directory that receives the converted files (passed to convert() as targetDir)

targetDir = "/Users/karl/Work/database/age/male/"

# Number of output audio channels, forwarded to ffmpeg's -ac option (1 = mono)

ac = 1

# Output sample rate in Hz, forwarded to ffmpeg's -ar option

sf = 16000

# File extension (without the dot) given to every converted file

ext = 'wav'

def convert(sourceDir, targetDir, ac, sf, ext):
    """Convert every ``.wav`` file in ``sourceDir`` to a uniform format.

    Each input file is passed through ffmpeg (``FFMPEG_BIN``) and written to
    ``targetDir`` as ``<index>.<ext>``, where ``index`` counts the processed
    files starting at 0.

    Args:
        sourceDir: Directory containing the input ``.wav`` files.
        targetDir: Output directory; created if it does not exist.
        ac: Number of output channels (ffmpeg ``-ac``).
        sf: Output sample rate in Hz (ffmpeg ``-ar``).
        ext: Extension of the output files, without the leading dot.
    """
    if not os.path.exists(targetDir):
        os.makedirs(targetDir)

    index = 0
    # Sort the listing so the numbering is reproducible between runs;
    # os.listdir() returns entries in arbitrary order.
    for name in sorted(os.listdir(sourceDir)):
        if not name.endswith('.wav'):
            continue

        command = [FFMPEG_BIN,
                   '-i', os.path.join(sourceDir, name),
                   '-ac', str(ac),
                   '-ar', str(sf),
                   os.path.join(targetDir, str(index) + "." + ext)]
        index += 1

        # Call form of print works on both Python 2 and Python 3.
        print(command)

        # Run one conversion at a time and wait for it. Launching every
        # ffmpeg process at once with an unread stdout pipe exhausted
        # resources and let the function return before any file was written.
        status = sp.call(command)
        if status != 0:
            sys.stderr.write("ffmpeg failed (%d) for %s\n" % (status, name))

# Script entry point: run the conversion with the module-level settings.
if __name__ == '__main__':

    convert(sourceDir, targetDir, ac, sf, ext)

3、用时域上RMS去除静音帧(Optional)

#---Cut the silent head and tail of audio

def rmsdemo(y):
    """Return the root-mean-square value of the samples in ``y``.

    The samples are converted to float64 before squaring, so integer PCM
    data (e.g. int16) cannot overflow. An empty input yields 0.0 instead of
    NaN.

    Args:
        y: Array-like of numeric samples.

    Returns:
        The RMS of ``y`` as a float (0.0 when ``y`` is empty).
    """
    samples = np.asarray(y, dtype=np.float64)
    if samples.size == 0:
        return 0.0
    return np.sqrt(np.mean(np.square(samples)))

def cutheadntail(y, winlen, threshold):
    """Trim the silent head and tail of a signal, window by window.

    ``y`` is split into consecutive non-overlapping windows of ``winlen``
    samples; samples after the last full window are ignored. A window counts
    as non-silent when ``rmsdemo`` of it exceeds ``threshold``.

    Args:
        y: 1-D NumPy array of samples (indexed along axis 0).
        winlen: Window length in samples.
        threshold: RMS level a window must exceed to be kept.

    Returns:
        The slice of ``y`` from the first to the last non-silent window,
        inclusive. If no window exceeds the threshold, the last full window
        is returned. If ``y`` is shorter than one window, an empty slice is
        returned.
    """
    # Floor division keeps the window count an int on Python 2 and 3.
    num = y.shape[0] // winlen
    if num == 0:
        return y[:0]

    def is_loud(k):
        # Evaluate the complete k-th window, including its last sample.
        return rmsdemo(y[k * winlen:(k + 1) * winlen]) > threshold

    # Head: first non-silent window, falling back to the last window.
    first = num - 1
    for k in range(num):
        if is_loud(k):
            first = k
            break

    # Tail: scan backwards from the end, never going past the head window.
    last = first
    for k in range(num - 1, first, -1):
        if is_loud(k):
            last = k
            break

    return y[first * winlen:(last + 1) * winlen]

4、用librosa提取特征，包括MFCC、DMFCC

from __future__ import print_function

import argparse

import sys

import os

import pprint

import sklearn as sl

import numpy as np

import librosa

import librosa.feature.spectral as f

import svmutil

#---Feature extraction and store, including MFCC, DMFCC

def mfcclist(data_dir, count=300,
             mfcc_out="TrainFemaleMFCC", dmfcc_out="TrainFemaleDMFCC"):
    """Extract per-file mean MFCC and delta-MFCC vectors and save them.

    Reads ``0.wav`` .. ``<count-1>.wav`` from ``data_dir``, calls
    ``mfccfile`` on each one, and writes the collected vectors as text with
    one file per row.

    Args:
        data_dir: Directory containing the numbered ``.wav`` files.
        count: Number of files to process (default 300).
        mfcc_out: Output path for the mean MFCC vectors.
        dmfcc_out: Output path for the mean delta-MFCC vectors.
    """
    mfcc_rows = []
    dmfcc_rows = []
    for index in range(count):
        filepath = os.path.join(data_dir, str(index) + '.wav')
        print(filepath)
        am, adm = mfccfile(filepath)
        mfcc_rows.append(am)
        dmfcc_rows.append(adm)

    np.savetxt(mfcc_out, mfcc_rows, fmt='%s', newline='\n')
    np.savetxt(dmfcc_out, dmfcc_rows, fmt='%s', newline='\n')

    #print(m)

    #print(dm)

'''

    fout = open(output_file,'w')

    fout.write(str(am) + '\n')

    fout.write(str(adm))

    fout.close()

'''

def mfccfile(input_file):
    """Compute the mean MFCC and mean delta-MFCC vectors of one audio file.

    Args:
        input_file: Path of the audio file to load with ``librosa.load``.

    Returns:
        A tuple ``(am, adm)``: ``am`` is the mean of the 13 MFCC rows taken
        along axis 1, and ``adm`` is the mean of the first-order difference
        between neighbouring columns, also along axis 1.
    """
    print('Loading ', input_file)
    y, sr = librosa.load(input_file)

    # Keyword arguments work on both old and current librosa releases;
    # current releases reject positional arguments here.
    M = f.mfcc(y=y, sr=sr, S=None, n_mfcc=13)

    # First-order difference along axis 1 (same as M[:, 1:] - M[:, :-1]).
    DM = np.diff(M, axis=1)

    am = np.mean(M, axis=1)
    adm = np.mean(DM, axis=1)
    return (am, adm)

#---Loading stored features file

def loadfeatures(features_file):
    """Load whitespace-separated feature vectors from a text file.

    Each non-blank line is parsed as one vector of floats. The parsed
    vectors are printed and also returned.

    Args:
        features_file: Path of the text file to read.

    Returns:
        A list of lists of floats, one inner list per non-blank line.

    Raises:
        ValueError: If a token cannot be converted to float.
    """
    # The context manager closes the file even if parsing fails.
    with open(features_file, 'r') as fin:
        # Build real lists; map() would be a lazy object on Python 3.
        features = [[float(token) for token in line.split()]
                    for line in fin if line.strip()]
    print(features)
    return features

5、用libsvm训练和预测，包括归一化

#---SVM training and predicting process

def svmtraindemo(x, modelname, scalar):
    """Train a libsvm model on scaled features and evaluate it on them.

    The labels are fixed: the first 600 rows are labelled +1 and the next
    600 rows -1. The trained model is saved to ``modelname + "0"``.

    Args:
        x: Feature matrix accepted by ``scalar.transform``.
            NOTE(review): the fixed labels imply 1200 rows - confirm.
        modelname: Path prefix of the saved model file.
        scalar: Fitted scaler exposing ``transform`` (presumably a
            scikit-learn preprocessing scaler - verify against the caller).

    Returns:
        The tuple ``(p_label, p_acc, p_val)`` from ``svm_predict`` on the
        training data.
    """
    x = scalar.transform(x).tolist()
    print(x)

    y = [1.0] * 600 + [-1.0] * 600

    # The file does "import svmutil", so the libsvm helpers must be
    # referenced through the module; the bare names raised NameError.
    model = svmutil.svm_train(y, x, '-b 1')
    svmutil.svm_save_model(modelname + str(0), model)

    p_label, p_acc, p_val = svmutil.svm_predict(y[:1200], x[:1200], model, '-b 1')
    return p_label, p_acc, p_val

def svmpredictdemo(x, modelname, scalar):
    """Predict with a saved libsvm model on scaled features.

    The labels are fixed: the first 200 rows are labelled +1 and the next
    200 rows -1. The model is loaded from ``modelname + "0"``.

    Args:
        x: Feature matrix accepted by ``scalar.transform``.
            NOTE(review): the fixed labels imply 400 rows - confirm.
        modelname: Path prefix of the saved model file.
        scalar: Fitted scaler exposing ``transform`` (presumably the one
            used for training - verify against the caller).

    Returns:
        The tuple ``(p_label, p_acc, p_val)`` from ``svm_predict``.
    """
    x = scalar.transform(x).tolist()
    print(len(x))

    y = [1.0] * 200 + [-1.0] * 200

    # The file does "import svmutil", so the libsvm helpers must be
    # referenced through the module; the bare names raised NameError.
    m = svmutil.svm_load_model(modelname + str(0))
    p_label, p_acc, p_val = svmutil.svm_predict(y[:400], x[:400], m, '-b 1')

    # Print only after the prediction exists; printing first raised
    # UnboundLocalError.
    print(p_label)
    return p_label, p_acc, p_val

附：

1、经过试验，发现用无监督的方式，准确来说是基于规则的方式分辨男、女、小孩的声音还是不太靠谱，频域上的分布还是用有监督的方式自己学习应该更可靠。

2、用有噪音的推无噪音的小孩，准确率80%，无噪音推有噪音的，准确率才60+%，所以训练还是最好用噪音环境的数据集吧，之前想的是训练应该用无噪音的样本还是太天真了。其实混合起来效果还不错。

3、男女的准确率也就80%，样本分布还是比较好，而且均有噪音，估计在实际应用中效果也不会比80%差太远。

码农公寓

相关文章