2. SGD Training
The SGD weight-update procedure is essentially the same as for binary LR; the difference is that binary LR trains only one weight vector, whereas K-class LR trains K-1 weight vectors. The function interface is as follows:
// train by SGD on the sample file
bool TrainSGDOnSampleFile (
    const char * sFileName, int iClassNum, int iFeatureNum,    // about the samples
    double dLearningRate,                                      // about the learning
    int iMaxLoop, double dMinImproveRatio                      // about the stop criteria
);
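For reference, a minimal hypothetical call might look like the following; the file name "train.samples" and the class/feature counts are made up for illustration:

LogisticRegression LR;
// 3 classes, 1000 features, learning rate 0.05,
// at most 50 passes, stop once the relative cost improvement falls below 1%
bool bTrained = LR.TrainSGDOnSampleFile ("train.samples", 3, 1000, 0.05, 50, 0.01);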
It calls the following private functions:
// initialize the theta matrix with iClassNum and iFeatureNum
bool InitThetaMatrix (int iClassNum, int iFeatureNum);
// calculate the model function output for iClassIndex by feature vector
double CalcFuncOutByFeaVec (vector<FeaValNode> & FeaValNodeVec, int iClassIndex);
// calculate the model function output for all the classes, and return the class index with max probability
int CalcFuncOutByFeaVecForAllClass (vector<FeaValNode> & FeaValNodeVec, vector<double> & ClassProbVec);
// calculate the gradient and update the theta matrix, it returns the cost
double UpdateThetaMatrix (Sample & theSample, vector<double> & ClassProbVec, double dLearningRate);
These functions respectively initialize the weight matrix, compute the (unnormalized) model output for a given class under the current LR parameters, compute the predicted probabilities of all classes, and update the weights.
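The supporting types are not shown in the post; judging from how they are used, they are presumably something like the sketch below. The names FeaValNode, Sample, and ThetaMatrix come from the code above, but the exact member layout is an assumption.

// a (feature id, value) pair for one feature -- layout assumed from usage
struct FeaValNode
{
    int iFeatureId;    // column index into a row of ThetaMatrix
    double dValue;     // the feature's value
};

// one training sample: a class label plus its feature vector
struct Sample
{
    int iClass;                          // class index in [0, iClassNum)
    vector<FeaValNode> FeaValNodeVec;    // the sample's features
};

// assumed members of class LogisticRegression:
// int iClassNum, iFeatureNum;
// vector< vector<double> > ThetaMatrix;    // one weight row per non-default class, iFeatureNum columns each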
The SGD implementation is as follows:
// the sample format: classid feature1_value feature2_value ...
bool LogisticRegression::TrainSGDOnSampleFile (
    const char * sFileName, int iClassNum, int iFeatureNum,    // about the samples
    double dLearningRate = 0.05,                               // about the learning
    int iMaxLoop = 1, double dMinImproveRatio = 0.01           // about the stop criteria
)
{
    ifstream in (sFileName);
    if (!in)
    {
        cerr << "Can not open the file of " << sFileName << endl;
        return false;
    }

    if (!InitThetaMatrix (iClassNum, iFeatureNum))
        return false;

    double dCost = 0.0;
    double dPreCost = 100.0;
    for (int iLoop = 0; iLoop < iMaxLoop; iLoop++)
    {
        int iSampleNum = 0;
        int iErrNum = 0;
        string sLine;
        while (getline (in, sLine))
        {
            Sample theSample;
            if (ReadSampleFrmLine (sLine, theSample))
            {
                vector<double> ClassProbVec;
                int iPredClassIndex = CalcFuncOutByFeaVecForAllClass (theSample.FeaValNodeVec, ClassProbVec);
                if (iPredClassIndex != theSample.iClass)
                    iErrNum++;
                dCost += UpdateThetaMatrix (theSample, ClassProbVec, dLearningRate);
                iSampleNum++;
            }
        }

        dCost /= iSampleNum;
        double dTmpRatio = (dPreCost - dCost) / dPreCost;
        double dTmpErrRate = (double)iErrNum / iSampleNum;

        // show info on screen
        cout << "In loop " << iLoop << ": current cost (" << dCost << ") previous cost ("
             << dPreCost << ") ratio (" << dTmpRatio << ")" << endl;
        cout << "And Error rate : " << dTmpErrRate << endl;

        if (dTmpRatio < dMinImproveRatio)
            break;
        else
        {
            dPreCost = dCost;
            dCost = 0.0;
            // reset the current reading position of file
            in.clear();
            in.seekg (0, ios::beg);
        }
    }

    return true;
}
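The loop depends on ReadSampleFrmLine, which the post does not show. Under the dense format named in the comment above ("classid feature1_value feature2_value ..."), with feature ids assigned by position, a sketch could look like this; the signature and the exact behavior are assumptions:

#include <sstream>

// parse "classid v1 v2 ..." into a Sample; feature ids are the 0-based token positions
bool LogisticRegression::ReadSampleFrmLine (string & sLine, Sample & theSample)
{
    istringstream iss (sLine);
    if (!(iss >> theSample.iClass))    // first token: the class id
        return false;
    double dVal = 0.0;
    int iFeaId = 0;
    while (iss >> dVal)                // remaining tokens: feature values in order
    {
        FeaValNode node;
        node.iFeatureId = iFeaId++;
        node.dValue = dVal;
        theSample.FeaValNodeVec.push_back (node);
    }
    return !theSample.FeaValNodeVec.empty();
}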
The per-class model output is computed as follows:
// returns f(x) = exp (W*X) for a non-default class, and 1.0 for the default class (index iClassNum-1)
double LogisticRegression::CalcFuncOutByFeaVec (vector<FeaValNode> & FeaValNodeVec, int iClassIndex)
{
    if (iClassIndex >= iClassNum || iClassIndex < 0)    // wrong situation
        return 0.0;
    if (iClassIndex == (iClassNum - 1))    // the default class (here the class with max index)
        return 1.0;

    double dX = 0.0;
    vector<FeaValNode>::iterator p = FeaValNodeVec.begin();
    while (p != FeaValNodeVec.end())
    {
        if (p->iFeatureId < (int)ThetaMatrix.at(iClassIndex).size())    // all input is evil
            dX += ThetaMatrix[iClassIndex][p->iFeatureId] * p->dValue;
        p++;
    }
    double dY = exp (dX);
    return dY;
}
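In equation form (this merely restates the comments here and in the next function), with class K as the reference, the model is

$$P(y = k \mid x) = \frac{\exp(w_k \cdot x)}{1 + \sum_{j=1}^{K-1} \exp(w_j \cdot x)}, \qquad k = 1, \dots, K-1,$$

$$P(y = K \mid x) = \frac{1}{1 + \sum_{j=1}^{K-1} \exp(w_j \cdot x)}.$$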
Two things to note: 1. among the K classes, the K-th class (here, the one with the largest index) serves as the default class, whose weight vector is implicitly fixed at zero, so its output is exp(0) = 1.0; 2. the value returned here is not a sigmoid value but an exponential, exp(W*X). The final probabilities are computed in the following code:
// the class probability is calculated by:
//   f(x) = exp (W*X) / {1.0 + sum_exp (W*X)}   for iClassIndex < K
//   f(x) = 1.0 / {1.0 + sum_exp (W*X)}         for iClassIndex == K
int LogisticRegression::CalcFuncOutByFeaVecForAllClass (vector<FeaValNode> & FeaValNodeVec, vector<double> & ClassProbVec)
{
    ClassProbVec.clear();
    ClassProbVec.resize (iClassNum, 0.0);

    double dSum = 0.0;    // the default class contributes the 1.0 term inside the loop
    for (int i = 0; i < iClassNum; i++)
    {
        ClassProbVec.at(i) = CalcFuncOutByFeaVec (FeaValNodeVec, i);
        dSum += ClassProbVec.at(i);
    }

    double dMaxProb = 0.0;
    int iClassMaxProb = -1;
    for (int i = 0; i < iClassNum; i++)
    {
        ClassProbVec.at(i) /= dSum;
        if (ClassProbVec.at(i) > dMaxProb)
        {
            dMaxProb = ClassProbVec.at(i);    // track the running maximum
            iClassMaxProb = i;
        }
    }
    return iClassMaxProb;
}

The probabilities computed here are in fact softmax probabilities. The weight-update function:
// the update formula is: theta_new = theta_old - dLearningRate * (P(i|x) - 1{i == true class}) * Xi
double LogisticRegression::UpdateThetaMatrix (Sample & theSample, vector<double> & ClassProbVec, double dLearningRate)
{
    double dCost = 0.0;
    for (int i = 0; i < iClassNum - 1; i++)
    {
        if (i == theSample.iClass)
        {
            vector<FeaValNode>::iterator p = theSample.FeaValNodeVec.begin();
            while (p != theSample.FeaValNodeVec.end())
            {
                if (p->iFeatureId < (int)ThetaMatrix[i].size())
                {
                    double dGradient = (ClassProbVec[i] - 1.0) * p->dValue;
                    double dDelta = dGradient * dLearningRate;
                    ThetaMatrix[i][p->iFeatureId] -= dDelta;
                }
                p++;
            }
            // cost = -log(dY) when i is the true class of the sample
            dCost -= log (ClassProbVec[i]);
        }
        else
        {
            vector<FeaValNode>::iterator p = theSample.FeaValNodeVec.begin();
            while (p != theSample.FeaValNodeVec.end())
            {
                if (p->iFeatureId < (int)ThetaMatrix[i].size())
                {
                    double dGradient = ClassProbVec[i] * p->dValue;
                    double dDelta = dGradient * dLearningRate;
                    ThetaMatrix[i][p->iFeatureId] -= dDelta;
                }
                p++;
            }
            // cost = -log(1.0 - dY) otherwise
            dCost -= log (1.0 - ClassProbVec[i]);
        }
    }
    return dCost;
}
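Both branches implement the gradient of the negative log-likelihood of the model above: for each non-default class $k < K$,

$$\frac{\partial}{\partial w_k}\bigl(-\log P(y \mid x)\bigr) = \bigl(P(k \mid x) - \mathbf{1}\{y = k\}\bigr)\,x,$$

which is exactly (ClassProbVec[i] - 1.0) * dValue when i is the true class and ClassProbVec[i] * dValue otherwise. Note, however, that the returned cost is not that negative log-likelihood: it also accumulates -log(1 - p_i) for the other classes, a one-vs-rest style cost, so the value printed in the training loop tracks a slightly different quantity than the one the updates minimize.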
Done.
When reposting, please credit the source: http://blog.csdn.net/xceman1997/article/details/18449317