【语音识别】基于MFCC特征实现说话人语音识别matlab源码

1 模型

采用能够反映人对语音感知特性的Mel频率倒谱系数(MFCC)作为特征参数，并采用矢量量化技术以避免时间规整问题，开发了说话人识别系统。MFCC主要模拟人耳的听觉过程，相对于其它参数，它对语音波形的变化不敏感，更加稳定，系统取得了很好的识别结果。实验表明，系统训练和识别的计算量和存储量都比较低。

【语音识别】基于MFCC特征实现说话人语音识别matlab源码

【语音识别】基于MFCC特征实现说话人语音识别matlab源码

2 部分代码

function varargout = Main(varargin)
% MAIN M-file for Main.fig
%     Entry point of the GUIDE-generated GUI. It builds a state structure
%     describing the GUI and forwards it, together with all inputs, to
%     gui_mainfcn.
%
%     MAIN, by itself, creates a new MAIN or raises the existing
%     singleton*.
%
%     H = MAIN returns the handle to a new MAIN or the handle to
%     the existing singleton*.
%
%     MAIN('CALLBACK',hObject,eventData,handles,...) calls the local
%     function named CALLBACK in MAIN.M with the given input arguments.
%
%     MAIN('Property','Value',...) creates a new MAIN or raises the
%     existing singleton*. Starting from the left, property value pairs are
%     applied to the GUI before Main_OpeningFcn gets called. An
%     unrecognized property name or invalid value makes property application
%     stop. All inputs are passed to Main_OpeningFcn via varargin.
%
%     *See GUI Options on GUIDE's Tools menu. Choose "GUI allows only one
%     instance to run (singleton)".
%
% See also: GUIDE, GUIDATA, GUIHANDLES

% Edit the above text to modify the response to help Main

% Last Modified by GUIDE v2.5 11-Aug-2016 00:35:18

% Begin initialization code - DO NOT EDIT
% gui_Singleton = 1 requests the single-instance behaviour described in the
% header above.
gui_Singleton = 1;
% State structure handed to gui_mainfcn: GUI name, singleton flag, the
% opening/output functions defined in this file, and an initially empty
% callback slot.
gui_State = struct('gui_Name',       mfilename, ...
                  'gui_Singleton',  gui_Singleton, ...
                  'gui_OpeningFcn', @Main_OpeningFcn, ...
                  'gui_OutputFcn',  @Main_OutputFcn, ...
                  'gui_LayoutFcn', [] , ...
                  'gui_Callback',   []);
% When the first input is a character array it is treated as the name of a
% function and converted to a function handle for dispatch.
if nargin && ischar(varargin{1})
   gui_State.gui_Callback = str2func(varargin{1});
end

% Forward to gui_mainfcn, returning its outputs only when the caller asked
% for any.
if nargout
  [varargout{1:nargout}] = gui_mainfcn(gui_State, varargin{:});
else
   gui_mainfcn(gui_State, varargin{:});
end
% End initialization code - DO NOT EDIT


% --- Executes just before Main is made visible.
function Main_OpeningFcn(hObject, eventdata, handles, varargin)
% MAIN_OPENINGFCN  Executes just before Main is made visible.
%   Stores the figure handle as the default command-line output and shows
%   a summary of the stored training data (per-label table and total sample
%   count) in the GUI.
%
%   This function has no output args, see OutputFcn.
%
%   hObject    handle to figure
%   eventdata  reserved - to be defined in a future version of MATLAB
%   handles    structure with handles and user data (see GUIDATA)
%   varargin   command line arguments to Main (see VARARGIN)

% Choose default command line output for Main
handles.output = hObject;

% Update handles structure
guidata(hObject, handles);

% UIWAIT makes Main wait for user response (see UIRESUME)
% uiwait(handles.figure1);

% Load the training data into structs instead of "poofing" variables into
% the workspace. A missing or unreadable .mat file must not prevent the GUI
% from opening, so fall back to an empty training set in that case.
TrainingSet = [];
TrainingLable = [];
try
    setData = load('TrainingSet');
    lableData = load('TrainingLable');
    TrainingSet = setData.TrainingSet;
    TrainingLable = lableData.TrainingLable;
catch
    % No usable training data yet: keep both variables empty.
    TrainingSet = [];
    TrainingLable = [];
end

% One row of TrainingSet per stored sample.
totalSampl = size(TrainingSet, 1);

% Frequency table of the labels, rendered as text (empty when no labels).
if isempty(TrainingLable)
    str = '';
else
    str = num2str(tabulate(TrainingLable));
end

set(handles.totalrecords,'String',str);
% Use bracket concatenation: strcat strips trailing whitespace from char
% inputs, which used to swallow the space after the colon.
set(handles.resultText,'String',['Total Samples: ', num2str(totalSampl)]);


% --- Outputs from this function are returned to the command line.
function varargout = Main_OutputFcn(hObject, eventdata, handles)
% MAIN_OUTPUTFCN  Supplies the values returned to the command line.
%   varargout  cell array of output arguments (see VARARGOUT)
%   hObject    handle to figure
%   eventdata  reserved - to be defined in a future version of MATLAB
%   handles    structure with handles and user data (see GUIDATA)

% The single output is whatever was stored in handles.output.
defaultOutput = handles.output;
varargout = {defaultOutput};



% --- Executes on button press in trainBtn.
function trainBtn_Callback(hObject, eventdata, handles)
% TRAINBTN_CALLBACK  Executes on button press in trainBtn.
%   Records two seconds of mono audio at 8 kHz, applies a pre-emphasis
%   filter, writes the filtered signal to RAW.wav (16 bit) and plays it
%   back. Feature extraction is not done here; the user is told to load the
%   saved file through "Train with Audio".
%
%   hObject    handle to trainBtn (see GCBO)
%   eventdata  reserved - to be defined in a future version of MATLAB
%   handles    structure with handles and user data (see GUIDATA)
clc;
set(handles.statusText,'String','Start Speaking...');
pause(0.001);           % give the GUI a chance to repaint the status text

Fs = 8000;              % Sampling Freq (Hz)
Duration = 2;           % Duration (sec)

% Record with audiorecorder: wavrecord is no longer available in current
% MATLAB releases. getaudiodata returns the samples as a double array.
audio_rec_obj = audiorecorder(Fs, 16, 1);
recordblocking(audio_rec_obj, Duration);
myRecording = getaudiodata(audio_rec_obj);

set(handles.statusText,'String','Saving....');
pause(0.001);

% Pre-emphasis (first-order high-pass) filter: y(n) = x(n) - Prem*x(n-1)
Prem = 0.97;
Filtered_output = filter([1,-Prem],1,myRecording);

% Save as 16-bit WAV and play back synchronously (audiowrite/audioplayer
% replace the removed wavwrite/wavplay).
audiowrite('RAW.wav', Filtered_output, Fs, 'BitsPerSample', 16);
playblocking(audioplayer(Filtered_output, Fs));

% The framing/windowing that used to follow here produced values that were
% never used by this callback, so it has been dropped.

set(handles.statusText,'String','Input Saved in .wav file format');
pause(0.001);

% Wait for the user to confirm on the command window; the typed text is
% not used.
input('Press any key ', 's');

disp('Select saved input through "Train with Audio" for Feature Extraction');
   
  
   


% --- Executes on button press in testBtn.
function testBtn_Callback(hObject, eventdata, handles)
% TESTBTN_CALLBACK  Executes on button press in testBtn.
%   Records two seconds of mono audio at 8 kHz, plots each processing stage
%   (raw signal, pre-emphasised signal, windowed frames, mean cepstral
%   feature) on axes1..axes4, classifies the feature vector with multisvm
%   against the stored training data and shows the result in outputText.
%
%   hObject    handle to testBtn (see GCBO)
%   eventdata  reserved - to be defined in a future version of MATLAB
%   handles    structure with handles and user data (see GUIDATA)
%
%   Relies on trifbank and multisvm, which are defined outside this file,
%   and on TrainingSet.mat / TrainingLable.mat.
%
%   NOTE(review): the numeric feature pipeline below is intentionally kept
%   as it was so that features remain comparable with the stored
%   TrainingSet. Two things look unusual and should be confirmed against
%   the code that produced the training data: frames start at offset
%   i*Frame_step (the first Frame_step samples are skipped), and the FFT is
%   taken along dimension 1 of afterWindow (across frames).
clc;
set(handles.statusText,'String','Start Speaking...');
set(handles.outputText,'String','--');
pause(0.01);            % let the GUI repaint before recording blocks

%% (0) Recording
Fs = 8000;              % Sampling Freq (Hz)
Duration = 2;           % Duration (sec)

% audiorecorder takes (Fs, nBits, nChannels) and returns a recorder object;
% the samples have to be fetched with getaudiodata after recording.
audio_rec_obj = audiorecorder(Fs, 16, 1);
recordblocking(audio_rec_obj, Duration);
myRecording = getaudiodata(audio_rec_obj);

set(handles.statusText,'String','Stop Speaking');
pause(0.001);

% Plot the waveform.
axes(handles.axes1);
plot(myRecording);
grid on;
xlabel('Samples');
ylabel('Magnitude(db)');
set(handles.statusText,'String','Done with Recording...');
pause(0.001);

%% (1) Pre-emphasis (first-order high-pass) and frame blocking
Prem = 0.97;
Filtered_output = filter([1,-Prem],1,myRecording);
sound(Filtered_output, Fs);     % play back at the recording rate

axes(handles.axes2);
plot(Filtered_output);
grid on;
xlabel('Samples');
ylabel('Magnitude(db)');
len = length(Filtered_output);

Frame_size = Fs*32/1000;                    % 256 sample points (32 ms)
Frame_overlap = Fs*16/1000;                 % 128 sample points (16 ms)
Frame_step = Frame_size-Frame_overlap;      % 128 sample points

numFrames = floor(len/Frame_step);

% With fewer than two frames the FFT length below degenerates (K becomes
% fractional), so stop early instead of failing with an indexing error.
if numFrames < 2
    set(handles.statusText,'String','Recording too short');
    return;
end

% Zero-padded copy of the signal so that every frame has Frame_size samples.
paddesSignal = zeros(numFrames*Frame_size, 1);
paddesSignal(1:len) = Filtered_output(:);

% Frame blocking: row i holds Frame_size samples starting after i*Frame_step.
fdata = zeros(numFrames, Frame_size);
for i = 1:numFrames
    firstSample = i*Frame_step + 1;
    fdata(i,:) = paddesSignal(firstSample:firstSample+Frame_size-1).';
end

%% (2) Windowing..
[nbFrames, nbSamples] = size(fdata);

% Hamming window applied to every frame (row).
w = hamming(nbSamples);
afterWindow = fdata .* repmat(w', nbFrames, 1);

axes(handles.axes3);
plot(afterWindow);
grid on;
xlabel('Samples');
ylabel('Magnitude(db)');

%% (3) Cepstral features
R = [ 300 3700 ];  % frequency range to consider
M = 20;            % number of filterbank channels
N = 13;            % number of cepstral coefficients
L = 22;            % cepstral sine lifter parameter
nfft = 2^nextpow2( nbFrames );     % length of FFT analysis
K = nfft/2+1;                      % length of the unique part of the FFT

% Forward and backward mel frequency warping (base e).
hz2mel = @( hz )( 1127*log(1+hz/700) );     % Hertz to mel warping function
mel2hz = @( mel )( 700*exp(mel/1127)-700 ); % mel to Hertz warping function

% Type III DCT matrix routine
dctm = @( N, M )( sqrt(2.0/M) * cos( repmat([0:N-1].',1,M).* repmat(pi*([1:M]-0.5)/M,N,1) ) );

% Cepstral lifter routine
ceplifter = @( N, L )( 1+0.5*L*sin(pi*[0:N-1]/L) );

% Magnitude spectrum (FFT along dimension 1, see NOTE(review) above).
MAG = abs( fft(afterWindow,nfft,1) );

% Triangular filterbank with uniformly spaced filters on mel scale
H = trifbank( M, K, R, Fs, hz2mel, mel2hz ); % size of H is M x K

% Filterbank application to unique part of the magnitude spectrum
FBE = H * MAG(1:K,:);

% Conversion of logFBEs to cepstral coefficients through DCT
CC = dctm( N, M ) * log( FBE );

% Cepstral liftering gives liftered cepstral coefficients
CC = diag( ceplifter( N, L ) ) * CC; % ~ HTK's MFCCs

% Feature vector: column-wise mean of CC (one value per column of CC).
meanMFCC = mean(CC);

axes(handles.axes4);
plot(meanMFCC);
grid on;
xlabel('Samples');
ylabel('Magnitude(db)');
set(handles.statusText,'String','Done');

%% (4) Classification
clc;
testData = meanMFCC;

% Load the stored training data. File names are spelled exactly as in
% Main_OpeningFcn so that this also works on case-sensitive file systems.
setData = load('TrainingSet');
lableData = load('TrainingLable');

classes = multisvm(setData.TrainingSet, lableData.TrainingLable', testData);

% Map the predicted class id to its display name; unknown ids leave '--'.
set(handles.outputText,'String','--');
if (classes==1)
    set(handles.outputText,'String','English');
elseif (classes==2)
    set(handles.outputText,'String','Marathi');
elseif (classes==3)
    set(handles.outputText,'String','Hindi');
end
   



function edit1_Callback(hObject, eventdata, handles)
% EDIT1_CALLBACK  Callback stub for the edit1 control; it contains no
% statements, so it performs no action.
% hObject   handle to edit1 (see GCBO)
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)

% Hints: get(hObject,'String') returns contents of edit1 as text
%       str2double(get(hObject,'String')) returns contents of edit1 as a double


% --- Executes during object creation, after setting all properties.
function edit1_CreateFcn(hObject, eventdata, handles)
% EDIT1_CREATEFCN  Runs during creation of edit1, after its properties are set.
%   hObject    handle to edit1 (see GCBO)
%   eventdata  reserved - to be defined in a future version of MATLAB
%   handles    empty - handles not created until after all CreateFcns called

% Only Windows needs the adjustment; elsewhere leave the control untouched.
if ~ispc
    return;
end

% Switch to a white background only when the control still carries the
% default uicontrol background colour.
currentColor = get(hObject, 'BackgroundColor');
defaultColor = get(0, 'defaultUicontrolBackgroundColor');
if isequal(currentColor, defaultColor)
    set(hObject, 'BackgroundColor', 'white');
end


% --------------------------------------------------------------------
function uipanel1_ButtonDownFcn(hObject, eventdata, handles)
% UIPANEL1_BUTTONDOWNFCN  Button-down stub for uipanel1; it contains no
% statements, so it performs no action.
% hObject   handle to uipanel1 (see GCBO)
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)


% --- Executes on mouse press over figure background.
function figure1_ButtonDownFcn(hObject, eventdata, handles)
% FIGURE1_BUTTONDOWNFCN  Stub for mouse presses over the figure background;
% it contains no statements, so it performs no action.
% hObject   handle to figure1 (see GCBO)
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)


% --- If Enable == 'on', executes on mouse press in 5 pixel border.
% --- Otherwise, executes on mouse press in 5 pixel border or over trainWithFilebtn.
function trainWithFilebtn_ButtonDownFcn(hObject, eventdata, handles)
% TRAINWITHFILEBTN_BUTTONDOWNFCN  Button-down stub for trainWithFilebtn; it
% contains no statements, so it performs no action.
% hObject   handle to trainWithFilebtn (see GCBO)
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)


% --- Executes when figure1 is resized.
function figure1_ResizeFcn(hObject, eventdata, handles)
% FIGURE1_RESIZEFCN  Resize stub for figure1; it contains no statements, so
% it performs no action.
% hObject   handle to figure1 (see GCBO)
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)


% --- Executes on key press with focus on testWithAudioBtn and none of its controls.
function testWithAudioBtn_KeyPressFcn(hObject, eventdata, handles)
% TESTWITHAUDIOBTN_KEYPRESSFCN  Key-press stub for testWithAudioBtn; it
% contains no statements, so it performs no action.
% hObject   handle to testWithAudioBtn (see GCBO)
% eventdata structure with the following fields (see UICONTROL)
%   Key: name of the key that was pressed, in lower case
%   Character: character interpretation of the key(s) that was pressed
%   Modifier: name(s) of the modifier key(s) (i.e., control, shift) pressed
% handles   structure with handles and user data (see GUIDATA)

3 仿真结果

【语音识别】基于MFCC特征实现说话人语音识别matlab源码

4 参考文献

[1]王伟, and 邓辉文. "基于MFCC参数和VQ的说话人识别系统." 第四届全国信息获取与处理学术会议 0.

【语音识别】基于MFCC特征实现说话人语音识别matlab源码

上一篇:python实现文件格式转换(tkinter)


下一篇:[ffmpeg] h264并行解码