其中第一步的量化与普通量化相同,步骤如下:
代码理解见注释(仅个人理解,欢迎指正):
// Rate-distortion optimised quantisation (RDOQ) of one transform block of component compID in tu.
// Reads the transform coefficients from pSrc, writes the signed quantised levels into
// tu.getCoeffs(compID) and adds the magnitudes of the retained levels to uiAbsSum.
// cQP supplies the per()/rem() QP parts; ctx supplies the fractional bit estimates used as rates.
// Stages, in the order they appear in the body:
//   1. pre-quantisation of every coefficient, walking the coefficient groups (CGs) from last to first;
//   2. per-coefficient level selection through xGetCodedLevel;
//   3. per-CG decision whether setting the whole group to zero is cheaper;
//   4. selection of the last significant position;
//   5. sign data hiding (SDH) adjustment when cctx.signHiding() is set and uiAbsSum >= 2.
void QuantRDOQ::xRateDistOptQuant(TransformUnit &tu, const ComponentID &compID, const CCoeffBuf &pSrc, TCoeff &uiAbsSum, const QpParam &cQP, const Ctx &ctx)
{
const FracBitsAccess& fracBits = ctx.getFracBitsAcess();
const SPS &sps = *tu.cs->sps;
const CompArea &rect = tu.blocks[compID];
const uint32_t uiWidth = rect.width;
const uint32_t uiHeight = rect.height;
const ChannelType chType = toChannelType(compID);
const int channelBitDepth = sps.getBitDepth( chType );
const bool extendedPrecision = sps.getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
const int maxLog2TrDynamicRange = sps.getMaxLog2TrDynamicRange(chType);
const bool useIntraSubPartitions = tu.cu->ispMode && isLuma(compID);
/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
*/
// Represents scaling through forward transform
int iTransformShift = getTransformShift(channelBitDepth, rect.size(), maxLog2TrDynamicRange);
if (tu.mtsIdx[compID] == MTS_SKIP && extendedPrecision)
{
iTransformShift = std::max<int>(0, iTransformShift);
}
// Sum of the "quantise to 0" cost over all visited coefficients
double d64BlockUncodedCost = 0;
const uint32_t uiLog2BlockWidth = floorLog2(uiWidth);
const uint32_t uiLog2BlockHeight = floorLog2(uiHeight);
const uint32_t uiMaxNumCoeff = rect.area();
CHECK(compID >= MAX_NUM_TBLOCKS, "Invalid component ID");
int scalingListType = getScalingListType(tu.cu->predMode, compID);
CHECK(scalingListType >= SCALING_LIST_NUM, "Invalid scaling list");
const TCoeff *plSrcCoeff = pSrc.buf;
TCoeff *piDstCoeff = tu.getCoeffs(compID).buf;
// Local aliases of the member work buffers; all of them are cleared for uiMaxNumCoeff entries below
double *pdCostCoeff = m_pdCostCoeff;
double *pdCostSig = m_pdCostSig;
double *pdCostCoeff0 = m_pdCostCoeff0;
int *rateIncUp = m_rateIncUp;
int *rateIncDown = m_rateIncDown;
int *sigRateDelta = m_sigRateDelta;
TCoeff *deltaU = m_deltaU;
memset(piDstCoeff, 0, sizeof(*piDstCoeff) * uiMaxNumCoeff);
memset( m_pdCostCoeff, 0, sizeof( double ) * uiMaxNumCoeff );
memset( m_pdCostSig, 0, sizeof( double ) * uiMaxNumCoeff );
memset( m_rateIncUp, 0, sizeof( int ) * uiMaxNumCoeff );
memset( m_rateIncDown, 0, sizeof( int ) * uiMaxNumCoeff );
memset( m_sigRateDelta, 0, sizeof( int ) * uiMaxNumCoeff );
memset( m_deltaU, 0, sizeof( TCoeff ) * uiMaxNumCoeff );
const bool needSqrtAdjustment= TU::needsBlockSizeTrafoScale( tu, compID );
const bool isTransformSkip = (tu.mtsIdx[compID] == MTS_SKIP);
const double *const pdErrScale = xGetErrScaleCoeffSL(scalingListType, uiLog2BlockWidth, uiLog2BlockHeight, cQP.rem(isTransformSkip));
const int *const piQCoef = getQuantCoeff(scalingListType, cQP.rem(isTransformSkip), uiLog2BlockWidth, uiLog2BlockHeight);
const bool disableSMForLFNST = tu.cs->slice->getExplicitScalingListUsed() ? tu.cs->slice->getSPS()->getDisableScalingMatrixForLfnstBlks() : false;
const bool isLfnstApplied = tu.cu->lfnstIdx > 0 && (tu.cu->isSepTree() ? true : isLuma(compID));
const bool disableSMForACT = tu.cs->slice->getSPS()->getScalingMatrixForAlternativeColourSpaceDisabledFlag() && (tu.cs->slice->getSPS()->getScalingMatrixDesignatedColourSpaceFlag() == tu.cu->colorTransform);
const bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, isTransformSkip, isLfnstApplied, disableSMForLFNST, disableSMForACT);
// Multiplier and error scale used for every position when scaling lists are not enabled
const int defaultQuantisationCoefficient = g_quantScales[ needSqrtAdjustment ?1:0][cQP.rem(isTransformSkip)];
const double defaultErrorScale = xGetErrScaleCoeffNoScalingList(scalingListType, uiLog2BlockWidth, uiLog2BlockHeight, cQP.rem(isTransformSkip));
const int iQBits = QUANT_SHIFT + cQP.per(isTransformSkip) + iTransformShift + (needSqrtAdjustment?-1:0); // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits
const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
CoeffCodingContext cctx(tu, compID, tu.cs->slice->getSignDataHidingEnabledFlag());
const int iCGSizeM1 = (1 << cctx.log2CGSize()) - 1;
// Index of the CG holding the last significant coefficient (-1 until one is found)
int iCGLastScanPos = -1;
double d64BaseCost = 0;
// Scan position of the last significant coefficient (-1 until one is found)
int iLastScanPos = -1;
int ctxBinSampleRatio = (compID == COMPONENT_Y) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA;
// Budget that is decremented per coded coefficient below; several branches switch behaviour once it drops below 4
int remRegBins = (uiWidth * uiHeight * ctxBinSampleRatio) >> 4;
uint32_t goRiceParam = 0;
double *pdCostCoeffGroupSig = m_pdCostCoeffGroupSig;
memset( pdCostCoeffGroupSig, 0, ( uiMaxNumCoeff >> cctx.log2CGSize() ) * sizeof( double ) );
int iScanPos;
coeffGroupRDStats rdStats;
#if ENABLE_TRACING
DTRACE( g_trace_ctx, D_RDOQ, "%d: %3d, %3d, %dx%d, comp=%d\n", DTRACE_GET_COUNTER( g_trace_ctx, D_RDOQ ), rect.x, rect.y, rect.width, rect.height, compID );
#endif
const uint32_t lfnstIdx = tu.cu->lfnstIdx;
// With LFNST only one CG is processed; otherwise both dimensions are capped at JVET_C0024_ZERO_OUT_TH
const int iCGNum = lfnstIdx > 0 ? 1 : std::min<int>(JVET_C0024_ZERO_OUT_TH, uiWidth) * std::min<int>(JVET_C0024_ZERO_OUT_TH, uiHeight) >> cctx.log2CGSize();
for (int subSetId = iCGNum - 1; subSetId >= 0; subSetId--)
{// iterate over the coefficient groups (CGs), from the last one to the first
cctx.initSubblock( subSetId );
uint32_t maxNonZeroPosInCG = iCGSizeM1;
// With LFNST on a 4x4 block, or on the first CG of an 8x8 block, only scan positions 0..7 are processed
if( lfnstIdx > 0 && ( ( uiWidth == 4 && uiHeight == 4 ) || ( uiWidth == 8 && uiHeight == 8 && cctx.cgPosX() == 0 && cctx.cgPosY() == 0 ) ) )
{
maxNonZeroPosInCG = 7;
}
memset( &rdStats, 0, sizeof (coeffGroupRDStats));
// Positions above maxNonZeroPosInCG are forced to zero
for( int iScanPosinCG = iCGSizeM1; iScanPosinCG > maxNonZeroPosInCG; iScanPosinCG-- )
{
iScanPos = cctx.minSubPos() + iScanPosinCG;
uint32_t blkPos = cctx.blockPos( iScanPos );
piDstCoeff[ blkPos ] = 0;
}
for( int iScanPosinCG = maxNonZeroPosInCG; iScanPosinCG >= 0; iScanPosinCG-- )
{// walk the positions of this CG from the highest scan index down to 0
iScanPos = cctx.minSubPos() + iScanPosinCG;
//===== quantization ===== step 1: pre-quantisation
uint32_t uiBlkPos = cctx.blockPos(iScanPos);
// set coeff
// quantisation multiplier: scaling-list entry if enabled, otherwise the default from g_quantScales
const int quantisationCoefficient = (enableScalingLists) ? piQCoef [uiBlkPos] : defaultQuantisationCoefficient;
const double errorScale = (enableScalingLists) ? pdErrScale[uiBlkPos] : defaultErrorScale;
// |coefficient| * quantisation multiplier
const int64_t tmpLevel = int64_t(abs(plSrcCoeff[ uiBlkPos ])) * quantisationCoefficient;
// lLevelDouble: the same product, clamped so that adding the rounding offset below cannot exceed the Intermediate_Int maximum
const Intermediate_Int lLevelDouble = (Intermediate_Int)std::min<int64_t>(tmpLevel, std::numeric_limits<Intermediate_Int>::max() - (Intermediate_Int(1) << (iQBits - 1)));
// rounded quantised magnitude, capped at entropyCodingMaximum
uint32_t uiMaxAbsLevel = std::min<uint32_t>(uint32_t(entropyCodingMaximum), uint32_t((lLevelDouble + (Intermediate_Int(1) << (iQBits - 1))) >> iQBits));
const double dErr = double( lLevelDouble );
pdCostCoeff0[ iScanPos ] = dErr * dErr * errorScale;// cost of quantising this coefficient to 0
d64BlockUncodedCost += pdCostCoeff0[ iScanPos ];// accumulates the zero-quantisation cost of every visited coefficient
piDstCoeff[ uiBlkPos ] = uiMaxAbsLevel;// store the pre-quantised magnitude in piDstCoeff[ uiBlkPos ]
if ( uiMaxAbsLevel > 0 && iLastScanPos < 0 )
{// first non-zero level met in this reverse scan: record it as the last significant position
iLastScanPos = iScanPos;
iCGLastScanPos = cctx.subSetId();
}
if ( iLastScanPos >= 0 )// a non-zero level has already been seen
{
#if ENABLE_TRACING
// NOTE(review): uiCGPosY is filled from cgPosX() and uiCGPosX from cgPosY(); they are only used in the trace call below - confirm this is intended
uint32_t uiCGPosY = cctx.cgPosX();
uint32_t uiCGPosX = cctx.cgPosY();
uint32_t uiPosY = cctx.posY( iScanPos );
uint32_t uiPosX = cctx.posX( iScanPos );
DTRACE( g_trace_ctx, D_RDOQ, "%d [%d][%d][%2d:%2d][%2d:%2d]", DTRACE_GET_COUNTER( g_trace_ctx, D_RDOQ ), iScanPos, uiBlkPos, uiCGPosX, uiCGPosY, uiPosX, uiPosY );
#endif
//===== coefficient level estimation ===== step 2: choose the best level
unsigned ctxIdSig = 0;
if( iScanPos != iLastScanPos )// not the last significant coefficient
{
ctxIdSig = cctx.sigCtxIdAbs( iScanPos, piDstCoeff, 0 );
}
uint32_t uiLevel;
uint8_t ctxOffset = cctx.ctxOffsetAbs ();
uint32_t uiParCtx = cctx.parityCtxIdAbs ( ctxOffset );
uint32_t uiGt1Ctx = cctx.greater1CtxIdAbs ( ctxOffset );
uint32_t uiGt2Ctx = cctx.greater2CtxIdAbs ( ctxOffset );
uint32_t goRiceZero = 0;
// Once fewer than 4 bins remain, the Rice parameter and goRiceZero are derived from the template sum
if( remRegBins < 4 )
{
unsigned sumAbs = cctx.templateAbsSum( iScanPos, piDstCoeff, 0 );
goRiceParam = g_auiGoRiceParsCoeff [ sumAbs ];
goRiceZero = g_auiGoRicePosCoeff0(0, goRiceParam);
}
const BinFracBits fracBitsPar = fracBits.getFracBitsArray( uiParCtx );
const BinFracBits fracBitsGt1 = fracBits.getFracBitsArray( uiGt1Ctx );
const BinFracBits fracBitsGt2 = fracBits.getFracBitsArray( uiGt2Ctx );
if( iScanPos == iLastScanPos )// last significant coefficient
{
// xGetCodedLevel returns the selected level for this coefficient; no sig-flag bits are passed (nullptr, bLast = 1)
uiLevel = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
lLevelDouble, uiMaxAbsLevel, nullptr, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, iQBits, errorScale, 1, extendedPrecision, maxLog2TrDynamicRange );
}
else// not at the last significant position
{
DTRACE_COND( ( uiMaxAbsLevel != 0 ), g_trace_ctx, D_RDOQ_MORE, " uiCtxSig=%d", ctxIdSig );
const BinFracBits fracBitsSig = fracBits.getFracBitsArray( ctxIdSig );
uiLevel = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
lLevelDouble, uiMaxAbsLevel, &fracBitsSig, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, iQBits, errorScale, 0, extendedPrecision, maxLog2TrDynamicRange );
// rate difference between bin value 1 and bin value 0 of fracBitsSig (forced to 0 when remRegBins < 4)
sigRateDelta[ uiBlkPos ] = ( remRegBins < 4 ? 0 : fracBitsSig.intBits[1] - fracBitsSig.intBits[0] );
}
DTRACE( g_trace_ctx, D_RDOQ, " Lev=%d \n", uiLevel );
DTRACE_COND( ( uiMaxAbsLevel != 0 ), g_trace_ctx, D_RDOQ, " CostC0=%d\n", (int64_t)( pdCostCoeff0[iScanPos] ) );
DTRACE_COND( ( uiMaxAbsLevel != 0 ), g_trace_ctx, D_RDOQ, " CostC =%d\n", (int64_t)( pdCostCoeff[iScanPos] ) );
// residual between the scaled coefficient and the chosen level, kept with 8 fractional bits; read by the SDH stage
deltaU[ uiBlkPos ] = TCoeff((lLevelDouble - (Intermediate_Int(uiLevel) << iQBits)) >> (iQBits-8));
if( uiLevel > 0 )// non-zero level: record the rate change for level+1 and level-1
{
int rateNow = xGetICRate( uiLevel, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, extendedPrecision, maxLog2TrDynamicRange );
rateIncUp [ uiBlkPos ] = xGetICRate( uiLevel+1, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
rateIncDown [ uiBlkPos ] = xGetICRate( uiLevel-1, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
}
else // uiLevel == 0
{
if( remRegBins < 4 )
{// rate change for going from level 0 to level 1, computed with xGetICRate
int rateNow = xGetICRate( uiLevel, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, extendedPrecision, maxLog2TrDynamicRange );
rateIncUp [ uiBlkPos ] = xGetICRate( uiLevel+1, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
}
else
{
rateIncUp [ uiBlkPos ] = fracBitsGt1.intBits[ 0 ];
}
}
piDstCoeff[ uiBlkPos ] = uiLevel;// the level for this position is now uiLevel
d64BaseCost += pdCostCoeff [ iScanPos ];// accumulate the coefficient cost into the running cost of the block
// goRiceParam is reset at scan position 0 of every CG except the first one
if( ( (iScanPos & iCGSizeM1) == 0 ) && ( iScanPos > 0 ) )
{
goRiceParam = 0;
}
else if( remRegBins >= 4 )
{
int sumAll = cctx.templateAbsSum(iScanPos, piDstCoeff, 4);
goRiceParam = g_auiGoRiceParsCoeff[sumAll];
// consume bins: uiLevel for levels 0/1, otherwise 3, plus one unless this is the last significant position
remRegBins -= (uiLevel < 2 ? uiLevel : 3) + (iScanPos != iLastScanPos);
}
}
else// no non-zero level has been seen yet
{
d64BaseCost += pdCostCoeff0[ iScanPos ];// so add the cost of quantising to 0
}
rdStats.d64SigCost += pdCostSig[ iScanPos ];// accumulate the significance cost of this position (pdCostSig)
if (iScanPosinCG == 0 )// scan position 0 of the CG
{
rdStats.d64SigCost_0 = pdCostSig[ iScanPos ];// keep the significance cost of position 0 separately
}
if (piDstCoeff[ uiBlkPos ] )// the current level is non-zero
{
cctx.setSigGroup();
// d64CodedLevelandDist accumulates (coefficient cost - significance cost) of the non-zero levels
rdStats.d64CodedLevelandDist += pdCostCoeff[ iScanPos ] - pdCostSig[ iScanPos ];
rdStats.d64UncodedDist += pdCostCoeff0[ iScanPos ];// d64UncodedDist accumulates the zero-quantisation cost of the non-zero levels
if ( iScanPosinCG != 0 )// non-zero level at a position other than scan position 0
{
rdStats.iNNZbeforePos0++;
}
}
} //end for (iScanPosinCG) - end of the scan inside the current CG
// step 3: decide whether the whole CG should be set to zero
if (iCGLastScanPos >= 0)// a non-zero level has been found in this TU so far
{
if( cctx.subSetId() )
{
if( !cctx.isSigGroup() )
{
// CG without any non-zero level: replace the per-coefficient significance costs by the cost of group flag = 0
const BinFracBits fracBitsSigGroup = fracBits.getFracBitsArray( cctx.sigGroupCtxId() );
d64BaseCost += xGetRateSigCoeffGroup(fracBitsSigGroup, 0) - rdStats.d64SigCost;
pdCostCoeffGroupSig[ cctx.subSetId() ] = xGetRateSigCoeffGroup(fracBitsSigGroup, 0);
}
else
{
// The CG containing the last significant coefficient is skipped here; it is handled in the last-position step below
if (cctx.subSetId() < iCGLastScanPos) //skip the last coefficient group, which will be handled together with last position below.
{
if ( rdStats.iNNZbeforePos0 == 0 )// no non-zero level at any position other than scan position 0
{
d64BaseCost -= rdStats.d64SigCost_0;// remove the significance cost of position 0 from the block cost
rdStats.d64SigCost -= rdStats.d64SigCost_0;// and from the CG significance cost
}
// rd-cost if SigCoeffGroupFlag = 0, initialization
double d64CostZeroCG = d64BaseCost; // start from the block cost accumulated so far
const BinFracBits fracBitsSigGroup = fracBits.getFracBitsArray( cctx.sigGroupCtxId() );
if (cctx.subSetId() < iCGLastScanPos)
{
d64BaseCost += xGetRateSigCoeffGroup(fracBitsSigGroup,1);// cost of group flag = 1 (CG keeps its non-zero levels)
d64CostZeroCG += xGetRateSigCoeffGroup(fracBitsSigGroup,0);// cost of group flag = 0 (CG is all zero)
pdCostCoeffGroupSig[ cctx.subSetId() ] = xGetRateSigCoeffGroup(fracBitsSigGroup,1);// remember the group-flag cost used for this CG
}
// try to convert the current coeff group from non-zero to all-zero
// add the cost of turning the non-zero levels into zeros
d64CostZeroCG += rdStats.d64UncodedDist; // distortion for resetting non-zero levels to zero levels
// remove the cost of keeping the non-zero levels
d64CostZeroCG -= rdStats.d64CodedLevelandDist; // distortion and level cost for keeping all non-zero levels
// remove the significance cost of all coefficients of the CG
d64CostZeroCG -= rdStats.d64SigCost; // sig cost for all coeffs, including zero levels and non-zerl levels
// if we can save cost, change this block to all-zero block
if ( d64CostZeroCG < d64BaseCost )// zeroing the CG is cheaper than keeping its levels
{
cctx.resetSigGroup();
d64BaseCost = d64CostZeroCG;
if (cctx.subSetId() < iCGLastScanPos)
{
pdCostCoeffGroupSig[ cctx.subSetId() ] = xGetRateSigCoeffGroup(fracBitsSigGroup,0);
}
// reset coeffs to 0 in this block
for( int iScanPosinCG = maxNonZeroPosInCG; iScanPosinCG >= 0; iScanPosinCG-- )
{// visit every processed position of the current CG
iScanPos = cctx.minSubPos() + iScanPosinCG;
uint32_t uiBlkPos = cctx.blockPos( iScanPos );
if (piDstCoeff[ uiBlkPos ])// level is non-zero
{
piDstCoeff [ uiBlkPos ] = 0;// set the level to 0
pdCostCoeff[ iScanPos ] = pdCostCoeff0[ iScanPos ];// its cost becomes the zero-quantisation cost
pdCostSig [ iScanPos ] = 0;// and its significance cost becomes 0
}
}
} // end if ( d64CostAllZeros < d64BaseCost )
}
} // end if if (uiSigCoeffGroupFlag[ uiCGBlkPos ] == 0)
}
else
{
// the CG with subSetId() == 0 is always marked as significant
cctx.setSigGroup();
}
}
} //end for (cctx.subSetId) - current CG finished, continue with the next one
//===== estimate last position ===== step 4: choose the last significant position
if ( iLastScanPos < 0 )// every level in the TU is 0 after the scan: nothing more to do
{
return;
}
// d64BestCost starts as the cost of coding nothing (all-zero block); iBestLastIdxP1 == 0 means "no coefficient kept"
double d64BestCost = 0;
int iBestLastIdxP1 = 0;
if( !CU::isIntra( *tu.cu ) && isLuma( compID ) && tu.depth == 0 )
{
const BinFracBits fracBitsQtRootCbf = fracBits.getFracBitsArray( Ctx::QtRootCbf() );
d64BestCost = d64BlockUncodedCost + xGetICost( fracBitsQtRootCbf.intBits[ 0 ] );
d64BaseCost += xGetICost( fracBitsQtRootCbf.intBits[ 1 ] );
}
else
{
bool previousCbf = tu.cbf[COMPONENT_Cb];
bool lastCbfIsInferred = false;
if( useIntraSubPartitions )
{
bool rootCbfSoFar = false;
bool isLastSubPartition = CU::isISPLast(*tu.cu, tu.Y(), compID);
uint32_t nTus = tu.cu->ispMode == HOR_INTRA_SUBPARTITIONS ? tu.cu->lheight() >> floorLog2(tu.lheight()) : tu.cu->lwidth() >> floorLog2(tu.lwidth());
if( isLastSubPartition )
{
// OR together the luma cbf of the first nTus - 1 TUs of the CU
TransformUnit* tuPointer = tu.cu->firstTU;
for( int tuIdx = 0; tuIdx < nTus - 1; tuIdx++ )
{
rootCbfSoFar |= TU::getCbfAtDepth(*tuPointer, COMPONENT_Y, tu.depth);
tuPointer = tuPointer->next;
}
if( !rootCbfSoFar )
{
// none of the earlier sub-partitions has a cbf set, so no cbf cost is charged for this one below
lastCbfIsInferred = true;
}
}
if( !lastCbfIsInferred )
{
previousCbf = TU::getPrevTuCbfAtDepth(tu, compID, tu.depth);
}
}
BinFracBits fracBitsQtCbf = fracBits.getFracBitsArray( Ctx::QtCbf[compID]( DeriveCtx::CtxQtCbf( rect.compID, previousCbf, useIntraSubPartitions ) ) );
if( !lastCbfIsInferred )
{
d64BestCost = d64BlockUncodedCost + xGetICost(fracBitsQtCbf.intBits[0]);
d64BaseCost += xGetICost(fracBitsQtCbf.intBits[1]);
}
else
{
d64BestCost = d64BlockUncodedCost;// d64BlockUncodedCost is the cost of quantising everything to 0
}
}
// Cumulative bit tables for the X and Y parts of the last position, consumed by xGetRateLast
int lastBitsX[LAST_SIGNIFICANT_GROUPS] = { 0 };
int lastBitsY[LAST_SIGNIFICANT_GROUPS] = { 0 };
{
int dim1 = std::min<int>(JVET_C0024_ZERO_OUT_TH, uiWidth);
int dim2 = std::min<int>(JVET_C0024_ZERO_OUT_TH, uiHeight);
int bitsX = 0;
int bitsY = 0;
int ctxId;
//X-coordinate
for ( ctxId = 0; ctxId < g_uiGroupIdx[dim1-1]; ctxId++)
{
const BinFracBits fB = fracBits.getFracBitsArray( cctx.lastXCtxId(ctxId) );
lastBitsX[ ctxId ] = bitsX + fB.intBits[ 0 ];
bitsX += fB.intBits[ 1 ];
}
lastBitsX[ctxId] = bitsX;
//Y-coordinate
for ( ctxId = 0; ctxId < g_uiGroupIdx[dim2-1]; ctxId++)
{
const BinFracBits fB = fracBits.getFracBitsArray( cctx.lastYCtxId(ctxId) );
lastBitsY[ ctxId ] = bitsY + fB.intBits[ 0 ];
bitsY += fB.intBits[ 1 ];
}
lastBitsY[ctxId] = bitsY;
}
bool bFoundLast = false;
for (int iCGScanPos = iCGLastScanPos; iCGScanPos >= 0; iCGScanPos--)
{// iterate over the CGs, starting from the one that holds the last significant coefficient
d64BaseCost -= pdCostCoeffGroupSig [ iCGScanPos ];// first remove the group-flag cost recorded for this CG
if (cctx.isSigGroup( iCGScanPos ) )
{
uint32_t maxNonZeroPosInCG = iCGSizeM1;
// NOTE(review): cctx.initSubblock() is not called inside this loop, so cgPosX()/cgPosY() presumably still refer to the subblock initialised last in the loop above - confirm
if( lfnstIdx > 0 && ( ( uiWidth == 4 && uiHeight == 4 ) || ( uiWidth == 8 && uiHeight == 8 && cctx.cgPosX() == 0 && cctx.cgPosY() == 0 ) ) )
{
maxNonZeroPosInCG = 7;
}
for( int iScanPosinCG = maxNonZeroPosInCG; iScanPosinCG >= 0; iScanPosinCG-- )
{// iterate over the coefficients of the CG
iScanPos = iCGScanPos * (iCGSizeM1 + 1) + iScanPosinCG;
if (iScanPos > iLastScanPos)// still beyond the last significant coefficient found in the first pass: skip
{
continue;
}
uint32_t uiBlkPos = cctx.blockPos( iScanPos );
if( piDstCoeff[ uiBlkPos ] )// non-zero level: evaluate this position as the last significant coefficient
{
uint32_t uiPosY = uiBlkPos >> uiLog2BlockWidth;
uint32_t uiPosX = uiBlkPos - ( uiPosY << uiLog2BlockWidth );
double d64CostLast = xGetRateLast( lastBitsX, lastBitsY, uiPosX, uiPosY );// cost of signalling this position as the last one
// add the last-position cost and remove the significance cost of this coefficient
double totalCost = d64BaseCost + d64CostLast - pdCostSig[ iScanPos ];
if( totalCost < d64BestCost )// better than the best candidate so far (initially the all-zero block)
{
iBestLastIdxP1 = iScanPos + 1;
d64BestCost = totalCost;
}
if( piDstCoeff[ uiBlkPos ] > 1 )// stop the search at the first level greater than 1
{
bFoundLast = true;
break;
}
d64BaseCost -= pdCostCoeff[ iScanPos ];// remove the cost of keeping this level
d64BaseCost += pdCostCoeff0[ iScanPos ];// add its zero-quantisation cost (it is treated as 0 for the remaining candidates)
}
else// level is 0
{
d64BaseCost -= pdCostSig[ iScanPos ];// remove the significance cost of this zero position
}
} //end for
if (bFoundLast)
{
break;
}
} // end if (uiSigCoeffGroupFlag[ uiCGBlkPos ])
DTRACE( g_trace_ctx, D_RDOQ_COST, "%d: %3d, %3d, %dx%d, comp=%d\n", DTRACE_GET_COUNTER( g_trace_ctx, D_RDOQ_COST ), rect.x, rect.y, rect.width, rect.height, compID );
DTRACE( g_trace_ctx, D_RDOQ_COST, "Uncoded=%d\n", (int64_t)( d64BlockUncodedCost ) );
DTRACE( g_trace_ctx, D_RDOQ_COST, "Coded =%d\n", (int64_t)( d64BaseCost ) );
} // end for
for ( int scanPos = 0; scanPos < iBestLastIdxP1; scanPos++ )
{// visit every scan position below the chosen last position
// add the magnitude to uiAbsSum and store the level with the sign of the source coefficient
int blkPos = cctx.blockPos( scanPos );
TCoeff level = piDstCoeff[ blkPos ];
uiAbsSum += level;
piDstCoeff[ blkPos ] = ( plSrcCoeff[ blkPos ] < 0 ) ? -level : level;
}
//===== clean uncoded coefficients ===== zero everything from the chosen last position up to iLastScanPos
for ( int scanPos = iBestLastIdxP1; scanPos <= iLastScanPos; scanPos++ )
{
piDstCoeff[ cctx.blockPos( scanPos ) ] = 0;
}
// step 5: sign data hiding (SDH)
if( cctx.signHiding() && uiAbsSum>=2)// SDH is enabled and the sum of magnitudes is at least 2
{
const double inverseQuantScale = double(g_invQuantScales[0][cQP.rem(isTransformSkip)]);
// factor applied to deltaU so that it can be added to the integer rate deltas below
int64_t rdFactor = (int64_t)(inverseQuantScale * inverseQuantScale * (1 << (2 * cQP.per(isTransformSkip))) / m_dLambda / 16
/ (1 << (2 * DISTORTION_PRECISION_ADJUSTMENT(channelBitDepth)))
+ 0.5);
// lastCG: -1 until the first CG with a non-zero level is met, 1 while processing that CG, 0 afterwards
int lastCG = -1;
int absSum = 0 ;
int n ;
for (int subSet = iCGNum - 1; subSet >= 0; subSet--)
{
int subPos = subSet << cctx.log2CGSize();
int firstNZPosInCG = iCGSizeM1 + 1, lastNZPosInCG = -1;
absSum = 0 ;
// highest scan index in this CG with a non-zero level
for( n = iCGSizeM1; n >= 0; --n )
{
if( piDstCoeff[ cctx.blockPos( n + subPos )] )
{
lastNZPosInCG = n;
break;
}
}
// lowest scan index in this CG with a non-zero level
for( n = 0; n <= iCGSizeM1; n++ )
{
if( piDstCoeff[ cctx.blockPos( n + subPos )] )
{
firstNZPosInCG = n;
break;
}
}
// sum of the signed levels between the two positions; only its parity is used below
for( n = firstNZPosInCG; n <= lastNZPosInCG; n++ )
{
absSum += int(piDstCoeff[ cctx.blockPos( n + subPos )]);
}
if(lastNZPosInCG>=0 && lastCG==-1)
{
lastCG = 1;
}
// hiding is only applied when the first and last non-zero positions are at least SBH_THRESHOLD apart
if( lastNZPosInCG-firstNZPosInCG>=SBH_THRESHOLD )
{
uint32_t signbit = (piDstCoeff[cctx.blockPos(subPos+firstNZPosInCG)]>0?0:1);
if( signbit!=(absSum&0x1) ) // hide but need tune
{
// calculate the cost
int64_t minCostInc = std::numeric_limits<int64_t>::max(), curCost = std::numeric_limits<int64_t>::max();
int minPos = -1, finalChange = 0, curChange = 0;
for( n = (lastCG == 1 ? lastNZPosInCG : iCGSizeM1); n >= 0; --n )
{
uint32_t uiBlkPos = cctx.blockPos( n + subPos );
if(piDstCoeff[ uiBlkPos ] != 0 )
{
// cost of raising / lowering the magnitude of this non-zero level by one
int64_t costUp = rdFactor * ( - deltaU[uiBlkPos] ) + rateIncUp[uiBlkPos];
int64_t costDown = rdFactor * ( deltaU[uiBlkPos] ) + rateIncDown[uiBlkPos]
- ((abs(piDstCoeff[uiBlkPos]) == 1) ? sigRateDelta[uiBlkPos] : 0);
if(lastCG==1 && lastNZPosInCG==n && abs(piDstCoeff[uiBlkPos])==1)
{
costDown -= (4<<SCALE_BITS);
}
if(costUp<costDown)
{
curCost = costUp;
curChange = 1;
}
else
{
curChange = -1;
// lowering a magnitude-1 level at the first non-zero position is excluded
if(n==firstNZPosInCG && abs(piDstCoeff[uiBlkPos])==1)
{
curCost = std::numeric_limits<int64_t>::max();
}
else
{
curCost = costDown;
}
}
}
else
{
// zero level: the only candidate change is raising it to magnitude 1
curCost = rdFactor * ( - (abs(deltaU[uiBlkPos])) ) + (1<<SCALE_BITS) + rateIncUp[uiBlkPos] + sigRateDelta[uiBlkPos] ;
curChange = 1 ;
if(n<firstNZPosInCG)
{
// below the first non-zero position the change is only allowed when the source sign equals signbit
uint32_t thissignbit = (plSrcCoeff[uiBlkPos]>=0?0:1);
if(thissignbit != signbit )
{
curCost = std::numeric_limits<int64_t>::max();
}
}
}
if( curCost<minCostInc)
{
minCostInc = curCost;
finalChange = curChange;
minPos = uiBlkPos;
}
}
// a level already at the entropy coding limit can only have its magnitude lowered
if(piDstCoeff[minPos] == entropyCodingMaximum || piDstCoeff[minPos] == entropyCodingMinimum)
{
finalChange = -1;
}
// apply the magnitude change in the direction of the source coefficient's sign
if(plSrcCoeff[minPos]>=0)
{
piDstCoeff[minPos] += finalChange ;
}
else
{
piDstCoeff[minPos] -= finalChange ;
}
}
}
if(lastCG==1)
{
lastCG=0 ;
}
}
}
}
最后一部分为SDH技术:
具体为:(可见万帅老师的书,新一代高效视频编码H.265HEVC原理、标准与实现,中的265面)
每个非零系数的符号采用语法元素coeff_sign_flag来标识,该语法元素可以通过旁路编码器进行熵编码,其在视频压缩码流中占据了很大的比例(15%~20%)。在H.265/HEVC中,对非零系数符号的编码允许使用一种符号数据隐藏(Sign Data Hiding,SDH)技术[28],来减少编码符号数据的比特数。
SDH技术为:首先计算CG内所有非零系数幅值绝对值之和;然后对和值进行奇偶判断,若和值为偶数,则最后一个非零系数的符号被判为“+”,若和值为奇数,则最后一个非零系数的符号被判为“-”。使用SDH技术,解码端直接判断CG中最后一个非零系数的符号,因此编码端可以省略它的语法元素coeff_sign_flag 的熵编码。然而,若SDH的最终结果与CG中最后一个非零系数的真实符号不一致,需要对CG中的系数进行调整以使其保持一致,可以采用以下两种方法。
一种方法是编码过程中采用率失真优化量化[29](RDOQ)的方法,即编码器允许使用SDH技术,通过调整量化系数,来使SDH 判决结果与CG中最后一个非零系数的真实符号保持一致。具体哪个系数修改以及怎样修改,则根据率失真代价来决定。这种方法是基于RDOQ进行的,无须增加额外的运算量,因此编码复杂度增加不多。
对于不进行RDOQ的编码器,引入下面的方法[30]。在一个CG中,计算原始系数值和反量化系数值之间的差值,对差值最大的量化值进行修正:若差值为正,则量化值加1,若差值为负,则量化值减1。由于差值最大的系数最接近其可行量化值,因此这种量化值的调整所产生的影响较小,且复杂度很低。
是否采用 SDH 技术需要显式标识,图像参数集中的语法元素sign_data_hiding_enabled_flag 置为1表示允许编码器应用SDH技术。具体使用方法规定:当编码器允许使用SDH 技术且当前编码的CG中第一个非零系数和最后一个非零系数之间的间隔大于等于4时[31],则该CG才能省略最后一个非零系数符号的熵编码。
其中进行第二步,确定最优量化值时调用了xGetCodedLevel函数,函数解析见注释。
/**
 * Chooses the quantised magnitude with the lowest rate-distortion cost for one coefficient.
 *
 * Candidates are uiMaxAbsLevel and, when uiMaxAbsLevel > 1, uiMaxAbsLevel - 1. When the coefficient
 * is not the last significant one and uiMaxAbsLevel < 3, level 0 is the initial candidate as well.
 *
 * \param rd64CodedCost      [out] cost of the returned level
 * \param rd64CodedCost0     [in]  cost of quantising the coefficient to 0 (read only in this body)
 * \param rd64CodedCostSig   [out] significance cost belonging to the returned level; written only when
 *                                 level 0 is considered or a candidate is accepted
 * \param lLevelDouble       scaled absolute coefficient before the right shift by iQBits
 * \param uiMaxAbsLevel      pre-quantised magnitude
 * \param fracBitsSig        bit estimates passed to xGetRateSigCoef; dereferenced only when bLast is false
 * \param fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, ui16AbsGoRice
 *                           forwarded unchanged to xGetICRate
 * \param iQBits             shift used to bring a candidate level to the scale of lLevelDouble
 * \param errorScale         weight applied to the squared quantisation error
 * \param bLast              true when this coefficient is the last significant one
 * \param useLimitedPrefixLength  not read in this body; xGetICRate is called with the literal true instead
 * \param maxLog2TrDynamicRange   forwarded unchanged to xGetICRate
 * \return the selected magnitude (0 when no candidate is accepted)
 */
inline uint32_t QuantRDOQ::xGetCodedLevel( double&            rd64CodedCost,
                                           double&            rd64CodedCost0,
                                           double&            rd64CodedCostSig,
                                           Intermediate_Int   lLevelDouble,
                                           uint32_t           uiMaxAbsLevel,
                                           const BinFracBits* fracBitsSig,
                                           const BinFracBits& fracBitsPar,
                                           const BinFracBits& fracBitsGt1,
                                           const BinFracBits& fracBitsGt2,
                                           const int          remRegBins,
                                           unsigned           goRiceZero,
                                           uint16_t           ui16AbsGoRice,
                                           int                iQBits,
                                           double             errorScale,
                                           bool               bLast,
                                           bool               useLimitedPrefixLength,
                                           const int          maxLog2TrDynamicRange
                                         ) const
{
  const bool hasSigFlag = !bLast;

  // Starting point: either the cost of level 0 (small pre-quantised value that carries a sig flag),
  // or "infinite" so that the first evaluated candidate is always accepted.
  if( hasSigFlag && uiMaxAbsLevel < 3 )
  {
    rd64CodedCostSig = xGetRateSigCoef( *fracBitsSig, 0 );
    rd64CodedCost    = rd64CodedCost0 + rd64CodedCostSig;

    if( uiMaxAbsLevel == 0 )
    {
      // Nothing but level 0 can be chosen.
      return 0;
    }
  }
  else
  {
    rd64CodedCost = MAX_DOUBLE;
  }

  // Significance cost added to every non-zero candidate; stays 0 for the last significant coefficient.
  double sigCostNonZero = 0;
  if( hasSigFlag )
  {
    sigCostNonZero = xGetRateSigCoef( *fracBitsSig, 1 );
  }

  uint32_t       bestLevel   = 0;
  const uint32_t lowestLevel = ( uiMaxAbsLevel > 1 ) ? uiMaxAbsLevel - 1 : 1;

  // Evaluate the candidates from uiMaxAbsLevel downwards; lowestLevel is never below 1.
  int candidate = uiMaxAbsLevel;
  while( candidate >= lowestLevel )
  {
    const double quantError = double( lLevelDouble - ( Intermediate_Int( candidate ) << iQBits ) );
    const double levelRate  = xGetICost( xGetICRate( candidate, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, ui16AbsGoRice, true, maxLog2TrDynamicRange ) );

    double candidateCost = quantError * quantError * errorScale + levelRate;
    candidateCost += sigCostNonZero;

    if( candidateCost < rd64CodedCost )
    {
      bestLevel        = candidate;
      rd64CodedCost    = candidateCost;
      rd64CodedCostSig = sigCostNonZero;
    }

    candidate--;
  }

  return bestLevel;
}