x265-1.8版本-encoder/slicetype.cpp注释

118 阅读 0 评论 78 点赞
我是靠谱客的博主微笑牛排，最近开发中收集的这篇文章主要介绍x265-1.8版本-encoder/slicetype.cpp注释，觉得挺不错的，现在分享给大家，希望可以做个参考。
概述

注：问号以及未注释部分会在x265-1.9版本内更新
/*****************************************************************************
* Copyright (C) 2013 x265 project
*
* Authors: Gopu Govindaswamy <gopu@multicorewareinc.com>
*          Steve Borho <steve@borho.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at license @ x265.com.
*****************************************************************************/
#include "common.h"
#include "frame.h"
#include "framedata.h"
#include "picyuv.h"
#include "primitives.h"
#include "lowres.h"
#include "mv.h"
#include "slicetype.h"
#include "motion.h"
#include "ratecontrol.h"
/* Lookahead profiling hook. When DETAILED_CU_STATS is enabled the macro declares
 * a ScopedElapsedTime object named _scope, constructed from `elapsed`, and
 * increments `count`. Otherwise it expands to nothing.
 * NOTE(review): ScopedElapsedTime is declared elsewhere; presumably it adds the
 * lifetime of the enclosing scope to `elapsed` - confirm. */
#if DETAILED_CU_STATS
#define ProfileLookaheadTime(elapsed, count) ScopedElapsedTime _scope(elapsed); count++
#else
#define ProfileLookaheadTime(elapsed, count)
#endif
using namespace X265_NS;
namespace {
/* Why acEnergyVar() yields n times the variance of a block of n pixels x:
 *
 *   E       = sum(x) / n
 *   n * S^2 = sum((x - E)^2)
 *           = sum(x^2 - 2*x*E + E^2)
 *           = sum(x^2) - 2*E*sum(x) + n*E^2
 *           = sum(x^2) - 2*sum(x)^2/n + sum(x)^2/n
 *           = sum(x^2) - sum(x)^2/n
 */

/** Compute variance to derive AC energy of each block.
 *
 * Unpacks the combined sum / sum-of-squares value, adds both halves to the
 * per-plane wp_sum / wp_ssd accumulators of curFrame->m_lowres, and returns
 * sum(x^2) - (sum(x)^2 >> shift), which is n * variance for n == 1 << shift.
 * In this file it is called by acEnergyPlane().
 *
 * @param curFrame frame whose m_lowres.wp_sum[plane] / wp_ssd[plane] are updated
 * @param sum_ssd  low 32 bits: sum of the block's pixels; high 32 bits: sum of
 *                 the squared pixels
 * @param shift    log2 of the number of pixels in the block (the callers pass
 *                 8 for 16x16 and 6 for 8x8)
 * @param plane    accumulator index; the callers pass 0, 1, 2 for Y, U, V
 * @return the AC energy of the block, truncated to 32 bits
 */
inline uint32_t acEnergyVar(Frame *curFrame, uint64_t sum_ssd, int shift, int plane)
{
    const uint32_t pixelSum = (uint32_t)sum_ssd;          // sum(x)
    const uint32_t squareSum = (uint32_t)(sum_ssd >> 32); // sum(x^2)

    // Frame-wide totals, consumed later for weighted prediction analysis.
    curFrame->m_lowres.wp_sum[plane] += pixelSum;
    curFrame->m_lowres.wp_ssd[plane] += squareSum;

    // sum(x)^2 / n, evaluated in 64 bits so the product cannot wrap.
    const uint64_t squaredMeanTerm = ((uint64_t)pixelSum * pixelSum) >> shift;
    return squareSum - squaredMeanTerm;
}
/** Find the energy of each block in Y/Cb/Cr plane.
 *
 * Picks the block size for the plane, obtains the packed sum / sum-of-squares
 * from the matching `var` primitive and hands it to acEnergyVar(), which also
 * accumulates the frame totals in curFrame->m_lowres.
 *
 * As unpacked by acEnergyVar(), primitives.cu[...].var returns a 64-bit value
 * whose low 32 bits hold the sum of the block's pixels and whose high 32 bits
 * hold the sum of their squares. primitives.cu[BLOCK_8x8].copy_pp copies the
 * 8x8 source block into a local buffer with a stride of 8.
 *
 * @param curFrame    frame being analysed
 * @param src         address of the block's top-left pixel inside the plane
 * @param srcStride   stride of that plane
 * @param plane       0, 1, 2 for Y, U, V
 * @param colorFormat colour space identifier (X265_CSP_*)
 * @return n * variance of the block
 */
inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int plane, int colorFormat)
{
    // Chroma planes of every format other than 4:4:4 are measured as 8x8 blocks.
    const bool bSmallChromaBlock = plane && (colorFormat != X265_CSP_I444);

    if (!bSmallChromaBlock)
    {
        // 16x16 block: shift 8 because 2^8 = 256 = 16 * 16 samples
        return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, plane);
    }

    ALIGN_VAR_8(pixel, pix[8 * 8]);
    primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride); // 8 is the destination stride

    // 8x8 block: shift 6 because 2^6 = 64 = 8 * 8 samples
    return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, plane);
}
} // end anonymous namespace
/** Find the total AC energy of each block in all planes.
 *
 * Adds up the n * variance value of the luma block at (blockX, blockY) and of
 * the two co-located chroma blocks of the original (full resolution) picture:
 *
 *   ac = acY + acCb + acCr
 *
 * For 4:2:0 the luma term covers 16x16 = 256 pixels and each chroma term covers
 * 8x8 = 64 pixels. As a side effect acEnergyPlane() / acEnergyVar() accumulate
 * the per-plane pixel sums and squared sums in curFrame->m_lowres. In this file
 * it is called from calcAdaptiveQuantFrame().
 *
 * @param curFrame frame holding the original picture (m_fencPic)
 * @param blockX   horizontal luma offset of the block, in pixels
 * @param blockY   vertical luma offset of the block, in pixels
 * @param csp      colour space identifier (X265_CSP_*)
 * @return the summed AC energy of the Y, U and V blocks
 */
uint32_t LookaheadTLD::acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp)
{
    const intptr_t lumaStride = curFrame->m_fencPic->m_stride;
    const intptr_t chromaStride = curFrame->m_fencPic->m_strideC;

    // Chroma subsampling shifts; e.g. both are 1 for 4:2:0.
    const int hShift = CHROMA_H_SHIFT(csp);
    const int vShift = CHROMA_V_SHIFT(csp);

    // Offsets of the block relative to the top-left pixel of each plane.
    const intptr_t lumaOffset = blockX + (blockY * lumaStride);
    const intptr_t chromaOffset = (blockX >> hShift) + ((blockY >> vShift) * chromaStride);

    uint32_t energy = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + lumaOffset, lumaStride, 0, csp);
    for (int plane = 1; plane <= 2; plane++)
        energy += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[plane] + chromaOffset, chromaStride, plane, csp);

    // Reset the MMX state so that later floating point code is not disturbed.
    x265_emms();
    return energy;
}
/** Compute the adaptive quantization data of a frame and, when weighted
 * prediction is enabled, the frame-level sum / AC energy of each plane.
 *
 * The original picture is walked in 16x16 luma blocks. For every block i the
 * function fills, in curFrame->m_lowres:
 *
 *   qpAqOffset[i]      = qp_adj
 *   qpCuTreeOffset[i]  = qp_adj
 *   invQscaleFactor[i] = x265_exp2fix8(qp_adj)
 *
 * where, with energy = acEnergyCu(block) and depth = X265_DEPTH:
 *
 *   aqMode == X265_AQ_NONE:
 *       nothing is written to the three arrays
 *   aqMode != X265_AQ_NONE and aqStrength == 0:
 *       both offsets are zeroed and invQscaleFactor is set to 256
 *   X265_AQ_VARIANCE (and any other non-auto mode):
 *       strength = aqStrength * 1.0397
 *       qp_adj   = strength * (log2(max(energy, 1)) - (14.427 + 2 * (depth - 8)))
 *   X265_AQ_AUTO_VARIANCE:
 *       J        = (energy / 2^(2 * (depth - 8)) + 1)^0.1
 *       mean     = sum(J) / blockCount,  mean2 = sum(J^2) / blockCount
 *       strength = aqStrength * mean
 *       avg      = mean - 0.5 * (mean2 - 11) / mean
 *       qp_adj   = strength * (J - avg)
 *   X265_AQ_AUTO_VARIANCE_BIASED:
 *       as above, plus aqStrength * (1 - 11 / J^2)
 *
 * NOTE(review): the original annotation states that an invQscaleFactor of 256
 * corresponds to a weight of 1 (the factor is applied as value >> 8) and that
 * blockCount (8x8 blocks of the half-resolution frame) equals the number of
 * 16x16 blocks of the original frame - confirm against lowres.h.
 *
 * When bEnableWeightedPred or bEnableWeightedBiPred is set, wp_sum[] keeps the
 * per-plane pixel sum and wp_ssd[] is finally replaced by
 * ssd - (sum^2 + n/2) / n, i.e. roughly n * variance of the plane, where n is
 * the plane size rounded to the nearest multiple of 16 in each dimension.
 *
 * @param curFrame frame holding the original picture
 * @param param    encoder configuration
 */
void LookaheadTLD::calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param)
{
    /* Actual adaptive quantization */
    int maxCol = curFrame->m_fencPic->m_picWidth;
    int maxRow = curFrame->m_fencPic->m_picHeight;
    const int blockCount = curFrame->m_lowres.maxBlocksInRow * curFrame->m_lowres.maxBlocksInCol;
    const bool bWeightedPred = param->bEnableWeightedPred || param->bEnableWeightedBiPred;

    // acEnergyCu() accumulates into these, so they must start from zero.
    for (int plane = 0; plane < 3; plane++)
    {
        curFrame->m_lowres.wp_ssd[plane] = 0;
        curFrame->m_lowres.wp_sum[plane] = 0;
    }

    /* Calculate Qp offset for each 16x16 block in the frame */
    if (param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0)
    {
        /* Need to init it anyways for CU tree */
        if (param->rc.aqMode && param->rc.aqStrength == 0)
        {
            const int cuCount = widthInCU * heightInCU;
            memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
            memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
            for (int cu = 0; cu < cuCount; cu++)
                curFrame->m_lowres.invQscaleFactor[cu] = 256;
        }

        /* Need variance data for weighted prediction */
        if (bWeightedPred)
        {
            // Only the side effect (wp_sum / wp_ssd accumulation) is wanted here.
            for (int y = 0; y < maxRow; y += 16)
                for (int x = 0; x < maxCol; x += 16)
                    acEnergyCu(curFrame, x, y, param->internalCsp);
        }
    }
    else
    {
        const bool bAutoVariance = param->rc.aqMode == X265_AQ_AUTO_VARIANCE ||
                                   param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED;
        double avgAdj = 0, avgAdjPow2 = 0;
        double strength = 0.f;
        double biasStrength = 0.f;

        if (bAutoVariance)
        {
            // First pass: per-block J values, parked in qpCuTreeOffset, and their statistics.
            const double bitDepthCorrection = 1.f / (1 << (2*(X265_DEPTH-8)));
            int idx = 0;
            for (int y = 0; y < maxRow; y += 16)
            {
                for (int x = 0; x < maxCol; x += 16)
                {
                    const uint32_t energy = acEnergyCu(curFrame, x, y, param->internalCsp);
                    const double adj = pow(energy * bitDepthCorrection + 1, 0.1);
                    curFrame->m_lowres.qpCuTreeOffset[idx] = adj;
                    avgAdj += adj;
                    avgAdjPow2 += adj * adj;
                    idx++;
                }
            }

            avgAdj /= blockCount;      // mean of J
            avgAdjPow2 /= blockCount;  // mean of J^2
            strength = param->rc.aqStrength * avgAdj;
            avgAdj = avgAdj - 0.5f * (avgAdjPow2 - (11.f)) / avgAdj;
            biasStrength = param->rc.aqStrength;
        }
        else
            strength = param->rc.aqStrength * 1.0397f;

        // Second pass: final QP offset of every block.
        int blockXY = 0;
        for (int y = 0; y < maxRow; y += 16)
        {
            for (int x = 0; x < maxCol; x += 16)
            {
                double qpAdj;
                if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
                {
                    const double adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
                    qpAdj = strength * (adj - avgAdj) + biasStrength * (1.f - 11.f / (adj * adj));
                }
                else if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
                {
                    const double adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
                    qpAdj = strength * (adj - avgAdj);
                }
                else
                {
                    uint32_t energy = acEnergyCu(curFrame, x, y, param->internalCsp);
                    qpAdj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
                }

                curFrame->m_lowres.qpAqOffset[blockXY] = qpAdj;
                curFrame->m_lowres.qpCuTreeOffset[blockXY] = qpAdj;
                // The larger the energy (variance), the smaller the resulting factor.
                curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qpAdj);
                blockXY++;
            }
        }
    }

    if (bWeightedPred)
    {
        const int hShift = CHROMA_H_SHIFT(param->internalCsp);
        const int vShift = CHROMA_V_SHIFT(param->internalCsp);

        // Round each dimension to the nearest multiple of 16
        // (e.g. 417 -> 416 = 26 * 16, 424 -> 432 = 27 * 16).
        maxCol = ((maxCol + 8) >> 4) << 4;
        maxRow = ((maxRow + 8) >> 4) << 4;

        const int width[3] = { maxCol, maxCol >> hShift, maxCol >> hShift };
        const int height[3] = { maxRow, maxRow >> vShift, maxRow >> vShift };

        for (int plane = 0; plane < 3; plane++)
        {
            const uint64_t sum = curFrame->m_lowres.wp_sum[plane];
            const uint64_t ssd = curFrame->m_lowres.wp_ssd[plane];
            const int pixelCount = width[plane] * height[plane];

            // ssd - sum^2 / n, with n/2 added before the division for rounding.
            curFrame->m_lowres.wp_ssd[plane] = ssd - (sum * sum + pixelCount / 2) / pixelCount;
        }
    }
}
/** Estimate the intra cost and best intra mode of every CU of a lowres frame.
 *
 * For each X265_LOWRES_CU_SIZE block of fenc.lowresPlane[0] the function
 *   1. copies the source pixels and gathers the reference samples
 *      (top-left, above, above-right, left, below-left),
 *   2. evaluates DC and planar prediction,
 *   3. scans the angular modes coarsely (5, 10, ..., 30) and then refines
 *      around the best one at distances 2 and 1,
 *   4. stores best SATD + signalling cost in fenc.lowresCosts[0][0][] and
 *      fenc.intraCost[], and the mode in fenc.intraMode[].
 * Row totals go to fenc.rowSatds[0][0][] and the frame totals (edge blocks
 * excluded unless the frame is at most 2 CUs wide or high) to
 * fenc.costEst[0][0] / fenc.costEstAq[0][0].
 *
 * Layout of the reference sample array for an 8x8 CU:
 *   [0] top-left, [1..8] above, [9..16] above-right,
 *   [17..24] left, [25..32] below-left.
 * neighbours[0] holds the raw samples, neighbours[1] the filtered ones.
 *
 * Worked example from the original annotation (describes the primitives,
 * which are defined elsewhere - NOTE(review): confirm against them):
 *   raw samples    corner 45, above 39 36 32 31 33 31 32 33,
 *                  left 44 45 42 42 42 46 46 44
 *   intra_filter   1-2-1 smoothing; the corner uses its two neighbours,
 *                  filtered[0] = (39 + 2*45 + 44 + 2) >> 2 = 43; the last
 *                  above-right and last below-left samples are copied.
 *   DC prediction  dcValue = (sum(left) + sum(above) + 8) >> 4 = 39; for
 *                  blocks up to 16x16 the first row and column are blended
 *                  with the neighbours, e.g. (33 + 3*39 + 2) >> 2 = 38.
 *   SATD           residual split into 4x4 blocks, Hadamard transformed,
 *                  sum of absolute values halved; the four 4x4 results of
 *                  the example add up to 64 + 112 + 108 + 96 = 380.
 *
 * @param fenc lowres frame to analyse; its cost arrays are overwritten
 */
void LookaheadTLD::lowresIntraEstimate(Lowres& fenc)
{
    ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); // predicted block
    pixel fencIntra[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];                // source block
    pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1];
    pixel* const samples = neighbours[0];
    pixel* const filtered = neighbours[1];

    const int cuSize = X265_LOWRES_CU_SIZE;
    const int cuSize2 = cuSize << 1;              // start of the left column inside samples[]
    const int sizeIdx = X265_LOWRES_CU_BITS - 2;  // index into the primitive tables for this CU size
    const int planar = !!(cuSize >= 8);           // planar reads filtered samples for sizes >= 8

    // The lambda is taken at the fixed lookahead QP; the result is only an estimate.
    const int lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
    const int intraPenalty = 5 * lookAheadLambda;
    const int lowresPenalty = 4; /* fixed CU cost overhead */

    const pixelcmp_t satd = primitives.pu[sizeIdx].satd;

    // With at most two CUs in a dimension there are no interior blocks, so all are scored.
    const bool bTinyFrame = widthInCU <= 2 || heightInCU <= 2;

    int costEst = 0, costEstAq = 0;

    for (int cuY = 0; cuY < heightInCU; cuY++)
    {
        fenc.rowSatds[0][0][cuY] = 0;

        for (int cuX = 0; cuX < widthInCU; cuX++)
        {
            const int cuXY = cuX + cuY * widthInCU;
            const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc.lumaStride;
            pixel* const blockOrg = fenc.lowresPlane[0] + pelOffset;

            /* copy fenc pixels */
            primitives.cu[sizeIdx].copy_pp(fencIntra, cuSize, blockOrg, fenc.lumaStride);

            /* collect reference sample pixels */
            pixel* const corner = blockOrg - (fenc.lumaStride + 1); // pixel at (x - 1, y - 1)
            memcpy(samples, corner, (2 * cuSize + 1) * sizeof(pixel)); /* top */
            for (int i = 1; i <= 2 * cuSize; i++)
                samples[cuSize2 + i] = corner[i * fenc.lumaStride];    /* left */

            primitives.cu[sizeIdx].intra_filter(samples, filtered);

            int cost, icost = me.COST_MAX; // icost: best cost over all modes
            uint32_t ilowmode = 0;         // ilowmode: mode achieving icost

            /* DC and planar */
            primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, samples, 0, cuSize <= 16);
            cost = satd(fencIntra, cuSize, prediction, cuSize);
            COPY2_IF_LT(icost, cost, ilowmode, DC_IDX);

            primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](prediction, cuSize, neighbours[planar], 0, 0);
            cost = satd(fencIntra, cuSize, prediction, cuSize);
            COPY2_IF_LT(icost, cost, ilowmode, PLANAR_IDX);

            /* scan angular predictions */
            int filter, acost = me.COST_MAX;
            uint32_t mode, alowmode = 4;

            // Coarse pass: every fifth angular mode.
            for (mode = 5; mode < 35; mode += 5)
            {
                filter = !!(g_intraFilterFlags[mode] & cuSize);
                primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
                cost = satd(fencIntra, cuSize, prediction, cuSize);
                COPY2_IF_LT(acost, cost, alowmode, mode);
            }

            // Refinement: best - 2, best + 2, then best - 1, best + 1.
            for (uint32_t dist = 2; dist > 0; dist--)
            {
                // Both candidates are fixed before either is tested, so a win by
                // the first does not move the second.
                const uint32_t candidates[2] = { alowmode - dist, alowmode + dist };

                for (int k = 0; k < 2; k++)
                {
                    mode = candidates[k];
                    filter = !!(g_intraFilterFlags[mode] & cuSize);
                    primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
                    cost = satd(fencIntra, cuSize, prediction, cuSize);
                    COPY2_IF_LT(acost, cost, alowmode, mode);
                }
            }

            // Best angular mode against the better of DC / planar.
            COPY2_IF_LT(icost, acost, ilowmode, alowmode);

            icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */

            fenc.lowresCosts[0][0][cuXY] = (uint16_t)(X265_MIN(icost, LOWRES_COST_MASK) | (0 << LOWRES_COST_SHIFT));
            fenc.intraCost[cuXY] = icost;
            fenc.intraMode[cuXY] = (uint8_t)ilowmode;

            /* do not include edge blocks in the frame cost estimates, they are not very accurate */
            const bool bInterior = cuX > 0 && cuX < widthInCU - 1 &&
                                   cuY > 0 && cuY < heightInCU - 1;
            const bool bFrameScoreCU = bInterior || bTinyFrame;

            // AQ scaling is applied only to scored blocks, and only when the factors exist.
            int icostAq = icost;
            if (bFrameScoreCU && fenc.invQscaleFactor)
                icostAq = (icost * fenc.invQscaleFactor[cuXY] + 128) >> 8;

            if (bFrameScoreCU)
            {
                costEst += icost;
                costEstAq += icostAq;
            }

            fenc.rowSatds[0][0][cuY] += icostAq;
        }
    }

    fenc.costEst[0][0] = costEst;
    fenc.costEstAq[0][0] = costEstAq;
}
/** Compute the luma cost of predicting fenc from ref, optionally weighted.
 *
 * When wp.bPresentFlag is set, the reference is first weighted into
 * wbuffer[0] with primitives.weight_pp and the comparison is made against
 * weightedRef.fpelPlane[0] (which allocWeightedRef() points into wbuffer[0]).
 * Otherwise ref.fpelPlane[0] is used directly. The cost is the sum over all
 * 8x8 blocks of min(SATD(ref block, fenc block), fenc.intraCost[block]).
 * In this file it is called from weightsAnalyse().
 *
 * @param fenc current lowres frame
 * @param ref  reference lowres frame
 * @param wp   weight parameters; ignored unless wp.bPresentFlag is set
 * @return the accumulated cost
 */
uint32_t LookaheadTLD::weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp)
{
    const intptr_t stride = fenc.lumaStride;
    pixel *src = ref.fpelPlane[0];

    if (wp.bPresentFlag)
    {
        const int offset = wp.inputOffset << (X265_DEPTH - 8);      // offset scaled to the internal bit depth
        const int scale = wp.inputWeight;                           // weight, in units of 1 / (1 << denom)
        const int denom = wp.log2WeightDenom;
        const int round = denom ? 1 << (denom - 1) : 0;             // rounding term for the final shift
        const int correction = IF_INTERNAL_PREC - X265_DEPTH;       // intermediate interpolation depth
        const int widthHeight = (int)stride;

        // The whole padded buffer is weighted: `stride` samples per row, paddedLines rows.
        primitives.weight_pp(ref.buffer[0], wbuffer[0], stride, widthHeight, paddedLines,
                             scale, round << correction, denom + correction, offset);
        src = weightedRef.fpelPlane[0];
    }

    uint32_t cost = 0;
    int mb = 0; // running 8x8 block index, raster order

    for (int y = 0; y < fenc.lines; y += 8)
    {
        const intptr_t rowOffset = y * stride;

        for (int x = 0; x < fenc.width; x += 8, mb++)
        {
            const intptr_t pixoff = rowOffset + x;
            int satd = primitives.pu[LUMA_8x8].satd(src + pixoff, stride, fenc.fpelPlane[0] + pixoff, stride);
            cost += X265_MIN(satd, fenc.intraCost[mb]);
        }
    }

    return cost;
}
/** Allocate the weighted reference buffers and initialise weightedRef.
 *
 * One allocation of four planes is made, each as large as the distance
 * between fenc.buffer[0] and fenc.buffer[1]. weightedRef.lowresPlane[i] points
 * into plane i at the same offset at which fenc.lowresPlane[0] sits inside
 * fenc.buffer[0]. paddedLines is set to the number of rows of one plane.
 * In this file it is called from weightsAnalyse().
 *
 * @param fenc lowres frame whose buffer geometry is copied
 * @return true on success, false if the allocation failed
 */
bool LookaheadTLD::allocWeightedRef(Lowres& fenc)
{
    const intptr_t planesize = fenc.buffer[1] - fenc.buffer[0];       // size of one padded plane
    const intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0];  // padding in front of the picture data
    paddedLines = (int)(planesize / fenc.lumaStride);

    wbuffer[0] = X265_MALLOC(pixel, 4 * planesize);
    if (!wbuffer[0])
        return false;

    // The remaining three planes follow the first one back to back.
    for (int i = 1; i < 4; i++)
        wbuffer[i] = wbuffer[i - 1] + planesize;

    for (int i = 0; i < 4; i++)
        weightedRef.lowresPlane[i] = wbuffer[i] + padoffset;

    weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0];
    weightedRef.lumaStride = fenc.lumaStride;
    weightedRef.isLowres = true;
    weightedRef.isWeighted = false;
    return true;
}
/** Decide whether ref should be weighted when predicting fenc.
 *
 * A weight is guessed from the ratio of the frames' luma AC energies and an
 * offset from the difference of their means. The weighted and unweighted
 * costs are compared with weightCostLuma(). When weighting wins clearly, the
 * four planes of ref are weighted into wbuffer[], weightedRef.isWeighted is
 * set and the cost ratio is stored in
 * fenc.weightedCostDelta[fenc.frameNum - ref.frameNum].
 *
 * The function leaves weightedRef.isWeighted untouched when it returns early:
 *   - the weighted buffers cannot be allocated,
 *   - the means differ by less than 0.5 and the guessed scale is within
 *     1/128 of 1.0,
 *   - the unweighted cost is already 0,
 *   - the weighted cost is not lower, the weight is the identity, or the gain
 *     is below 0.2%.
 *
 * @param fenc current lowres frame
 * @param ref  reference lowres frame
 */
void LookaheadTLD::weightsAnalyse(Lowres& fenc, Lowres& ref)
{
    /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
    static const float epsilon = 1.f / 128.f;

    int deltaIndex = fenc.frameNum - ref.frameNum;

    WeightParam wp;
    wp.bPresentFlag = false; // the first cost measured below is the unweighted one

    if (!wbuffer[0])
    {
        if (!allocWeightedRef(fenc))
            return;
    }

    float guessScale, fencMean, refMean;
    x265_emms(); // reset the MMX state before floating point work

    // wp_ssd[0] holds the luma AC energy (about n * variance) of each frame.
    if (fenc.wp_ssd[0] && ref.wp_ssd[0])
        guessScale = sqrtf((float)fenc.wp_ssd[0] / ref.wp_ssd[0]);
    else
        guessScale = 1.0f;

    // Mean luma values, normalised to 8-bit range.
    fencMean = (float)fenc.wp_sum[0] / (fenc.lines * fenc.width) / (1 << (X265_DEPTH - 8));
    refMean = (float)ref.wp_sum[0] / (fenc.lines * fenc.width) / (1 << (X265_DEPTH - 8));

    /* Early termination */
    if (fabsf(refMean - fencMean) < 0.5f && fabsf(1.f - guessScale) < epsilon)
        return;

    int minoff = 0, minscale, mindenom;        // best offset, weight and denominator so far
    unsigned int minscore = 0, origscore = 1;  // best cost and unweighted cost
    int found = 0;                             // set when a weighted candidate beats the unweighted cost

    wp.setFromWeightAndOffset((int)(guessScale * 128 + 0.5f), 0, 7, true);
    mindenom = wp.log2WeightDenom;
    minscale = wp.inputWeight;

    // NOTE(review): wp.bPresentFlag is false above and setFromWeightAndOffset() is
    // declared elsewhere; the original annotation describes this call as measuring
    // the unweighted cost - confirm that the flag is left unset by the setter.
    origscore = minscore = weightCostLuma(fenc, ref, wp);

    if (!minscore)
        return;

    unsigned int s = 0;
    int curScale = minscale;
    int curOffset = (int)(fencMean - refMean * curScale / (1 << mindenom) + 0.5f);
    if (curOffset < -128 || curOffset > 127)
    {
        /* Rescale considering the constraints on curOffset. We do it in this order
         * because scale has a much wider range than offset (because of denom), so
         * it should almost never need to be clamped. */
        curOffset = x265_clip3(-128, 127, curOffset);
        curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f);
        curScale = x265_clip3(0, 127, curScale);
    }

    SET_WEIGHT(wp, true, curScale, mindenom, curOffset);
    s = weightCostLuma(fenc, ref, wp);
    COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1);

    /* Use a smaller denominator if possible */
    while (mindenom > 0 && !(minscale & 1))
    {
        mindenom--;
        minscale >>= 1;
    }

    // Reject weighting when it is not better, is the identity weight
    // (scale == 1 << denom with zero offset), or gains less than 0.2%.
    if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f)
        return;

    SET_WEIGHT(wp, true, minscale, mindenom, minoff);

    // set weighted delta cost
    // The ratio must be formed in floating point: both operands are unsigned
    // integers, so a plain division truncates every ratio below 1 to 0.
    fenc.weightedCostDelta[deltaIndex] = (float)minscore / origscore;

    const int offset = wp.inputOffset << (X265_DEPTH - 8);
    const int scale = wp.inputWeight;
    const int denom = wp.log2WeightDenom;
    const int round = denom ? 1 << (denom - 1) : 0;
    const int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
    const intptr_t stride = ref.lumaStride;
    const int widthHeight = (int)stride;

    // Weight all four planes of the reference into the weighted buffers.
    for (int i = 0; i < 4; i++)
        primitives.weight_pp(ref.buffer[i], wbuffer[i], stride, widthHeight, paddedLines,
                             scale, round << correction, denom + correction, offset);

    weightedRef.isWeighted = true;
}
/** Initialise the lookahead state from the encoder configuration.
 *
 * Nothing is allocated here; see create(). Note that
 * m_param->lookaheadSlices may be rewritten: it is forced to 0 without a
 * thread pool and otherwise replaced by the slice count actually used.
 *
 * @param param encoder configuration
 * @param pool  thread pool serving the lookahead, or NULL
 */
Lookahead::Lookahead(x265_param *param, ThreadPool* pool)
{
    m_param = param;
    m_pool = pool;

    m_lastNonB = NULL;
    m_scratch = NULL;
    m_tld = NULL;

    m_isSceneTransition = false;
    m_filled = false;
    m_outputSignalRequired = false;
    m_sliceTypeBusy = false;
    m_isActive = true;

    // Size of the half-resolution picture in 8x8 CUs, rounded up.
    m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
    m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;

    // Number of interior CUs when the picture has any, otherwise all of them.
    if (m_8x8Width > 2 && m_8x8Height > 2)
        m_8x8Blocks = (m_8x8Width - 2) * (m_8x8Height - 2);
    else
        m_8x8Blocks = m_8x8Width * m_8x8Height;

    // Placed one keyframe interval in the past; per the original annotation this
    // makes the first frame a keyframe.
    m_lastKeyframe = -m_param->keyframeMax;

    m_fullQueueSize = X265_MAX(1, m_param->lookaheadDepth);
    m_bAdaptiveQuant = m_param->rc.aqMode || m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred;

    /* If we have a thread pool and are using --b-adapt 2, it is generally
     * preferable to perform all motion searches for each lowres frame in large
     * batched; this will create one job per --bframe per lowres frame, and
     * these jobs are performed by workers bonded to the thread running
     * slicetypeDecide() */
    m_bBatchMotionSearch = m_pool && m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS;

    /* It is also beneficial to pre-calculate all possible frame cost estimates
     * using worker threads bonded to the worker thread running
     * slicetypeDecide(). This creates bframes * bframes jobs which take less
     * time than the motion search batches but there are many of them. This may
     * do much unnecessary work, some frame cost estimates are not needed, so if
     * the thread pool is small we disable this feature after the initial burst
     * of work */
    m_bBatchFrameCosts = m_bBatchMotionSearch;

    // Cooperative slices need worker threads.
    if (!m_pool && m_param->lookaheadSlices)
        m_param->lookaheadSlices = 0;

    if (m_param->lookaheadSlices > 1)
    {
        m_numRowsPerSlice = m_8x8Height / m_param->lookaheadSlices;
        m_numRowsPerSlice = X265_MAX(m_numRowsPerSlice, 10);            // at least 10 rows per slice
        m_numRowsPerSlice = X265_MIN(m_numRowsPerSlice, m_8x8Height);   // but no more than the full picture
        m_numCoopSlices = m_8x8Height / m_numRowsPerSlice;
        m_param->lookaheadSlices = m_numCoopSlices;                     // report actual final slice count
    }
    else
    {
        m_numRowsPerSlice = m_8x8Height;
        m_numCoopSlices = 1;
    }

#if DETAILED_CU_STATS
    m_slicetypeDecideElapsedTime = 0;
    m_preLookaheadElapsedTime = 0;
    m_countSlicetypeDecide = 0;
    m_countPreLookahead = 0;
#endif

    memset(m_histogram, 0, sizeof(m_histogram));
}
#if DETAILED_CU_STATS
/** Sum the batch and cooperative-slice statistics of the lookahead TLDs.
 *
 * Iterates over m_pool->m_numWorkers entries of m_tld when a pool exists and
 * over a single entry otherwise.
 *
 * @param batchElapsedTime     receives the summed batchElapsedTime
 * @param batchCount           receives the summed countBatches
 * @param coopSliceElapsedTime receives the summed coopSliceElapsedTime
 * @param coopSliceCount       receives the summed countCoopSlices
 */
void Lookahead::getWorkerStats(int64_t& batchElapsedTime, uint64_t& batchCount, int64_t& coopSliceElapsedTime, uint64_t& coopSliceCount)
{
    int64_t batchTime = 0, sliceTime = 0;
    uint64_t batches = 0, slices = 0;

    const int tldCount = m_pool ? m_pool->m_numWorkers : 1;
    for (int i = 0; i < tldCount; i++)
    {
        batchTime += m_tld[i].batchElapsedTime;
        sliceTime += m_tld[i].coopSliceElapsedTime;
        batches += m_tld[i].countBatches;
        slices += m_tld[i].countCoopSlices;
    }

    batchElapsedTime = batchTime;
    coopSliceElapsedTime = sliceTime;
    batchCount = batches;
    coopSliceCount = slices;
}
#endif
/** Allocate the per-thread lookahead data and the scratch row.
 *
 * One LookaheadTLD is created per pool worker plus one extra, each
 * initialised with the lowres CU geometry. m_scratch holds one int per CU
 * column.
 *
 * @return true when both allocations succeeded, false otherwise
 */
bool Lookahead::create()
{
    const int workerCount = m_pool ? m_pool->m_numWorkers : 0;
    const int numTLD = 1 + workerCount;

    m_tld = new LookaheadTLD[numTLD];
    for (int t = 0; t < numTLD; t++)
        m_tld[t].init(m_8x8Width, m_8x8Height, m_8x8Blocks);

    m_scratch = X265_MALLOC(int, m_tld[0].widthInCU);

    return m_tld && m_scratch;
}
/** Deactivate the lookahead and wait for a running slice type decision.
 *
 * Does nothing without a thread pool or when the input queue is empty.
 * Otherwise m_isActive is cleared under the input lock, so findJob() starts
 * no further decisions, and the caller blocks on m_outputSignal if a decision
 * is in progress.
 */
void Lookahead::stopJobs()
{
    if (!m_pool || m_inputQueue.empty())
        return;

    m_inputLock.acquire();
    m_isActive = false;
    const bool mustWait = m_sliceTypeBusy;
    m_outputSignalRequired = mustWait; // ask findJob() to trigger m_outputSignal when it finishes
    m_inputLock.release();

    if (mustWait)
        m_outputSignal.wait();
}
/** 函数功能
： 释放内存
/*
调用范围
： 只在主线程Encoder::destroy()中应用
* 返回
： null* */
void Lookahead::destroy()
{
// these two queues will be empty unless the encode was aborted
while (!m_inputQueue.empty())//该队列一般都已经进入输出队列m_outputQueue中，一般不会进入此
{
Frame* curFrame = m_inputQueue.popFront();
curFrame->destroy();
delete curFrame;
}
while (!m_outputQueue.empty())//该队列一般都已经进入DPB.m_picList中，一般不会进入此
{
Frame* curFrame = m_outputQueue.popFront();
curFrame->destroy();
delete curFrame;
}
X265_FREE(m_scratch);
delete [] m_tld;
}
/* The synchronization of slicetypeDecide is managed here. The findJob() method
* polls the occupancy of the input queue. If the queue is
* full, it will run slicetypeDecide() and output a mini-gop of frames to the
* output queue. If the flush() method has been called (implying no new pictures
* will be received) then the input queue is considered full if it has even one
* picture left. getDecidedPicture() removes pictures from the output queue and
* only blocks as a last resort. It does not start removing pictures until
* m_filled is true, which occurs after *more than* the lookahead depth of
* pictures have been input so slicetypeDecide() should have started prior to
* output pictures being withdrawn. The first slicetypeDecide() will obviously
* still require a blocking wait, but after this slicetypeDecide() will maintain
* its lead over the encoder (because one picture is added to the input queue
* each time one is removed from the output) and decides slice types of pictures
* just ahead of when the encoder needs them */
/* Called by API thread */
/** Queue a source frame for slice type decision.
 *
 * Records the requested slice type on the frame, updates m_filled, appends
 * the frame to the input queue and, when a thread pool exists and the queue
 * has reached m_fullQueueSize, wakes a worker.
 *
 * @param curFrame  frame to enqueue
 * @param sliceType slice type requested for the frame (per the original
 *                  annotation: AUTO in the first pass, a concrete type in the
 *                  second pass)
 */
void Lookahead::addPicture(Frame& curFrame, int sliceType)
{
    curFrame.m_lowres.sliceType = sliceType;

    /* determine if the lookahead is (over) filled enough for frames to begin to
     * be consumed by frame encoders */
    if (!m_filled)
    {
        // Logical AND: both operands are already boolean negations.
        if (!m_param->bframes && !m_param->lookaheadDepth)
            m_filled = true; /* zero-latency */
        else if (curFrame.m_poc >= m_param->lookaheadDepth + 2 + m_param->bframes)
            m_filled = true; /* full capacity plus mini-gop lag */
    }

    // The input queue is shared with the worker threads, so it is modified under the lock.
    m_inputLock.acquire();
    m_inputQueue.pushBack(curFrame);
    if (m_pool && m_inputQueue.size() >= m_fullQueueSize)
        tryWakeOne(); // a worker will pick up findJob()
    m_inputLock.release();
}
/* Called by API thread */
/** Tell the lookahead that no further input pictures will arrive.
 *
 * The queue threshold drops to one picture, so every remaining frame triggers
 * a decision, and m_filled is set so getDecidedPicture() starts returning
 * frames.
 */
void Lookahead::flush()
{
    /* force slicetypeDecide to run until the input queue is empty */
    m_fullQueueSize = 1;

    m_filled = true;
}
/** 函数功能
： 触发帧类型决策（在threadMain()主动发起，在getDecidedPicture()被动发起，因为当前发现帧类型不可用，被动发起）
/*
调用范围
： 只在WorkerThread::threadMain()（在addPicture中触发执行）和Lookahead::getDecidedPicture()函数中被调用
* 返回
： null * */
void Lookahead::findJob(int /*workerThreadID*/)
{
bool doDecide;//用于指示是否进行帧类型决策
m_inputLock.acquire();//将m_inputQueue列表加锁，准备读取其数据，加锁防止多线程造成破坏
if (m_inputQueue.size() >= m_fullQueueSize && !m_sliceTypeBusy && m_isActive)//如果当前输入列表的帧数大于等于lookachead最大深度并且当前不再进行sliceDecide并且当前lookachead是触发状态
doDecide = m_sliceTypeBusy = true;//满足条件将其置为true，准备帧类型决策 （其中m_sliceTypeBusy保证系统只有一个线程进行帧类型决策）
else
doDecide = m_helpWanted = false;//不满足条件，将其置为false 准备退出
m_inputLock.release();//释放m_inputQueue列表锁
if (!doDecide) //不满足帧类型决策的条件，直接退出
return;
ProfileLookaheadTime(m_slicetypeDecideElapsedTime, m_countSlicetypeDecide);//统计时间信息（有宏DETAILED_CU_STATS控制）
ProfileScopeEvent(slicetypeDecideEV);
slicetypeDecide(); //获取帧类型并计算frame-cost
m_inputLock.acquire();//将m_inputQueue列表加锁
//与event m_outputSignal;配套使用，初始化为false
true表示需要完成sliceDecide，在Lookahead::findJob中完成sliceDecide置为false并触发m_outputSignal，说明完成sliceDecide
//在Lookahead::getDecidedPicture()函数中会进行检测并阻塞
if (m_outputSignalRequired)//如果前面编码过程中等待所需的帧类型决策完毕
{
m_outputSignal.trigger();//触发帧类型决策完毕，使其不再等待，继续编码
m_outputSignalRequired = false;//帧类型决策完毕，置为false
}
m_sliceTypeBusy = false;//帧类型决策完毕，不再忙
m_inputLock.release();//将m_inputQueue列表解锁
}
/* Called by API thread */
/**
 * Fetch the next frame whose slice type has been decided.
 *
 * @return the frame at the head of the output queue, or NULL when the
 *         lookahead is not yet filled or no decided frame is available
 */
Frame* Lookahead::getDecidedPicture()
{
    if (m_filled)
    {
        m_outputLock.acquire();
        Frame *out = m_outputQueue.popFront();
        m_outputLock.release();

        if (out)
            return out;

        /* Output queue is empty: the encoder got ahead of the lookahead. */
        findJob(-1); /* run slicetypeDecide() if necessary */

        /* If another thread is in the middle of a decision, ask findJob() to
         * signal us when it finishes, then block on that signal. */
        m_inputLock.acquire();
        bool wait = m_outputSignalRequired = m_sliceTypeBusy;
        m_inputLock.release();

        if (wait)
            m_outputSignal.wait();

        /* slicetypeDecide() pushes to m_outputQueue under m_outputLock and may
         * be started again by a worker at any time, so this pop takes the same
         * lock as the first one above. */
        m_outputLock.acquire();
        out = m_outputQueue.popFront();
        m_outputLock.release();
        return out;
    }
    else
        return NULL; /* lookahead not filled yet: nothing may be consumed */
}
/* Called by rate-control to calculate the estimated SATD cost for a given
 * picture.  It assumes dpb->prepareEncode() has already been called for the
 * picture and all the references are established */
/**
 * Store the frame's estimated SATD cost in m_lowres.satdCost and, when VBV is
 * configured, accumulate the per-block lowres costs into each CTU row's
 * satdForVbv / intraSatdForVbv.
 *
 * @param curFrame  frame about to be encoded
 */
void Lookahead::getEstimatedPictureCost(Frame *curFrame)
{
    /* Lowres pictures indexed by POC distance from the L0 reference:
     *   frames[p0] = L0 reference, frames[b] = current, frames[p1] = L1 reference.
     * Example from the original annotation: current POC 6 with references
     * POC 4 and POC 8 gives p0 = 0, b = 2, p1 = 4. */
    Lowres *frames[X265_LOOKAHEAD_MAX];

    // POC distances to each reference
    Slice *slice = curFrame->m_encData->m_slice;
    int p0 = 0, p1, b; /* p0: forward ref index, b: current frame index, p1: backward ref index */
    int poc = slice->m_poc;
    int l0poc = slice->m_refPOCList[0][0];
    int l1poc = slice->m_refPOCList[1][0];

    switch (slice->m_sliceType)
    {
    case I_SLICE:
        frames[p0] = &curFrame->m_lowres;
        b = p1 = 0;
        break;

    case P_SLICE:
        b = p1 = poc - l0poc;
        frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
        frames[b] = &curFrame->m_lowres;
        break;

    case B_SLICE:
        b = poc - l0poc;
        p1 = b + l1poc - poc;
        frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
        frames[b] = &curFrame->m_lowres;
        frames[p1] = &slice->m_refPicList[1][0]->m_lowres;
        break;

    default:
        return;
    }

    X265_CHECK(curFrame->m_lowres.costEst[b - p0][p1 - b] > 0, "Slice cost not estimated\n");

    if (m_param->rc.cuTree && !m_param->rc.bStatRead)
        /* update row satds based on cutree offsets */
        curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
    else if (m_param->rc.aqMode)
        curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
    else
        curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];

    if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate)
    {
        /* aggregate lowres row satds to CTU resolution */
        curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b];
        uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0, intraSum = 0;
        /* number of lowres block rows per CTU row; the factor 2 reflects that
         * the lowres picture is half resolution (per the original annotation) */
        uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
        uint32_t numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
        uint32_t widthInLowresCu = (uint32_t)m_8x8Width, heightInLowresCu = (uint32_t)m_8x8Height;
        double *qp_offset = 0;
        /* Factor in qpoffsets based on Aq/Cutree in CU costs */
        if (m_param->rc.aqMode)
            qp_offset = (frames[b]->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset;

        for (uint32_t row = 0; row < numCuInHeight; row++)
        {
            lowresRow = row * scale;
            for (uint32_t cnt = 0; cnt < scale && lowresRow < heightInLowresCu; lowresRow++, cnt++)
            {
                sum = 0; intraSum = 0;
                lowresCuIdx = lowresRow * widthInLowresCu;
                for (lowresCol = 0; lowresCol < widthInLowresCu; lowresCol++, lowresCuIdx++)
                {
                    /* Mask off the bits above the cost; per the original
                     * annotation they encode the chosen prediction
                     * (0 intra, 1 forward, 2 backward, 3 bidirectional). */
                    uint16_t lowresCuCost = curFrame->m_lowres.lowresCostForRc[lowresCuIdx] & LOWRES_COST_MASK;
                    if (qp_offset)
                    {
                        /* scale both inter and intra cost by the block's QP offset (8-bit fixed point) */
                        lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8);
                        int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx];
                        curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8;
                    }
                    curFrame->m_lowres.lowresCostForRc[lowresCuIdx] = lowresCuCost;
                    sum += lowresCuCost;
                    intraSum += curFrame->m_lowres.intraCost[lowresCuIdx];
                }
                curFrame->m_encData->m_rowStat[row].satdForVbv += sum;
                curFrame->m_encData->m_rowStat[row].intraSatdForVbv += intraSum;
            }
        }
    }
}
/** 函数功能
： 初始化lowres并进行下采样、扩边、计算qpCuTreeOffset等信息，获取整帧的像素和和AC能量、计算当前1/2下采样帧的intra SATD值以及最优intra模式、并行处理
/*
调用范围
： 只在WorkerThread::threadMain()和sliceTypeDecide函数中被调用
* 参数 workerThreadID
： 当前运行的内核号
* 返回
： null * */
void PreLookaheadGroup::processTasks(int workerThreadID) //此函数可在不同线程中执行
{
//workerThreadID为当前的内核号，在sliceTypeDecide中为-1 表示在本线程中继续执行
//在WorkerThread::threadMain()中为大于0的一个内核号，在相应线程中执行
if (workerThreadID < 0)
workerThreadID = m_lookahead.m_pool ? m_lookahead.m_pool->m_numWorkers : 0;
//如果workerThreadID<0 则将其置为最后一个id，因为申请的空间为核数+1，如四个核：0,1,2,3
//-1 则为虚拟的4，此时在本线程中继续执行，其它核号在相应线程中执行
LookaheadTLD& tld = m_lookahead.m_tld[workerThreadID];
m_lock.acquire(); //临界资源加锁
while (m_jobAcquired < m_jobTotal) //m_jobTotal 表示有多少帧需要初始化，m_jobAcquired用于计数,这两个数据是线程之间共有的数据
{
Frame* preFrame = m_preframes[m_jobAcquired++]; //获取需要初始化的帧
ProfileLookaheadTime(m_lookahead.m_preLookaheadElapsedTime, m_lookahead.m_countPreLookahead);//在DETAILED_CU_STATS打开 统计时用到
ProfileScopeEvent(prelookahead);
m_lock.release();//已经读完，可以释放临界资源了
preFrame->m_lowres.init(preFrame->m_fencPic, preFrame->m_poc); //初始化信息，并进行下采样和扩边
if (m_lookahead.m_param->rc.bStatRead && m_lookahead.m_param->rc.cuTree && IS_REFERENCED(preFrame))
/* cu-tree offsets were read from stats file */; //如果当前是2pass，并且不是b（非参考帧)(其它 I i B，这三个都是可以当作参考帧的)，则直接从1pass获取，无须重新计算
else if (m_lookahead.m_bAdaptiveQuant)
tld.calcAdaptiveQuantFrame(preFrame, m_lookahead.m_param);//初始化m_lowres中的qpCuTreeOffset等信息，获取整帧的像素和和AC能量。
tld.lowresIntraEstimate(preFrame->m_lowres);
//计算当前1/2下采样帧的intra SATD值以及最优intra模式
preFrame->m_lowresInit = true;
//标记当前lowres已经初始化完毕
m_lock.acquire();
//临界资源加锁
}
m_lock.release();//已完成，可以释放临界资源了
}
/** 函数功能
： 获取帧类型并计算frame-cost
/*
调用范围
： 只在Lookahead::findJob函数中被调用
* 返回
： null * */
/* called by API thread or worker thread with inputQueueLock acquired */
void Lookahead::slicetypeDecide()
{
PreLookaheadGroup pre(*this); //用于初始化lowres的类
Lowres* frames[X265_LOOKAHEAD_MAX + X265_BFRAME_MAX + 4];//对应下采样数据的信息，长度为lookachead depth 其中frame[0] 为m_lastNonB，Frame[1~depth] 为具体帧
Frame*
list[X265_BFRAME_MAX + 4];
//用于存储m_param->bframes + 2帧
用于寻找最优的b帧位置
memset(frames, 0, sizeof(frames));
memset(list, 0, sizeof(list));
int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX);//设置最大需要查看的帧数
maxSearch = X265_MAX(1, maxSearch);
//容错处理
{
ScopedLock lock(m_inputLock);
//防止数据被多线程破坏
//获取Frame*
list数据，顺序编号
Frame *curFrame = m_inputQueue.first();
int j;
for (j = 0; j < m_param->bframes + 2; j++)
{
if (!curFrame) break;
list[j] = curFrame;
curFrame = curFrame->m_next;
}
//获取对应对应下采样数据的信息，长度为lookachead depth
//获取未初始化的视频pre.m_preframes
curFrame = m_inputQueue.first();
frames[0] = m_lastNonB;
for (j = 0; j < maxSearch; j++)
{
if (!curFrame) break;
frames[j + 1] = &curFrame->m_lowres;
if (!curFrame->m_lowresInit)
pre.m_preframes[pre.m_jobTotal++] = curFrame;
curFrame = curFrame->m_next;
}
maxSearch = j;
}
/* perform pre-analysis on frames which need it, using a bonded task group */
//所有数据都在pre操作，无论在threadmain调用还是下面的procesTask都是操作的该对象,多线程并行处理帧初始化
//初始化lowres并进行下采样、扩边、计算pCuTreeOffset等信息，获取整帧的像素和和AC能量、计算当前1/2下采样帧的intra SATD值以及最优intra模式、并行处理
if (pre.m_jobTotal)
{
if (m_pool)
pre.tryBondPeers(*m_pool, pre.m_jobTotal);//在threadmain中触发相应processtask，只要是sleep状态的核都可以触发
pre.processTasks(-1);
//在本线程中初始化lowres
pre.waitForExit();
//一直等待所有任务都完成
}
if (m_lastNonB && !m_param->rc.bStatRead &&
((m_param->bFrameAdaptive && m_param->bframes) ||
m_param->rc.cuTree || m_param->scenecutThreshold ||
(m_param->lookaheadDepth && m_param->rc.vbvBufferSize)))
//在1pass 并且m_lastNonB不为空会进入； 2pass等情况不会进入
{
slicetypeAnalyse(frames, false);
//获取当前GOP的帧类型，获取可参考帧的qpCuTreeOffset值
}
int bframes, brefs;
//bframes:计数当前GOP中B或者b帧数计数 brefs计数当前X265_TYPE_BREF（B）个数
//循环功能：根据配置情况，修正当前GOP的帧类型，如IDR帧插入等情况
//2pass 进入会有相应帧类型， 1pass进入：只有X265_TYPE_B、X265_TYPE_AUTO、X265_TYPE_P
for (bframes = 0, brefs = 0;; bframes++)
//在1pass中进来的sliceType初始都为X265_TYPE_AUTO，2pass中都会有具体的帧类型
{
Lowres& frm = list[bframes]->m_lowres;
//获取当前GOP的当前帧的小采样帧（里面含有帧类型）
if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid)//容错处理：一般不进入，当前帧类型为B(可参考B帧)
{
//!m_param->bBPyramid 当前B帧关闭，不应该有此帧类型,
frm.sliceType = X265_TYPE_B;//强制将X265_TYPE_BREF(B) 更改为
X265_TYPE_B(b)
// brefs == m_param->bBPyramid
前面已经保证m_param->bBPyramid = 0 这里必须保证brefs个数为0
x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramidn",
frm.frameNum);
}
/* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
* smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */
else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs &&
//容错处理：一般不进入，当前帧类型为B(可参考B帧) bBPyramid打开（可以有B）
m_param->maxNumReferences <= (brefs + 3))
//bref 已经有参考B帧
L0最多参考个数小于等于(brefs+3) （原因：一个可参考B帧，周边应该有两个b帧）
{
frm.sliceType = X265_TYPE_B;//强制将X265_TYPE_BREF(B) 更改为
X265_TYPE_B(b)
x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference framesn",
frm.sliceType, m_param->maxNumReferences);
}
if (/* (!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax)//如果当前帧号与前一个关键帧位置间隔大于最大关键帧间隔，则强制标记当前为IDR帧
{
if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I)
frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;//第一帧为IDR， 其它情况： 打开bOpenGOP为X265_TYPE_I，关闭为X265_TYPE_IDR
bool warn = frm.sliceType != X265_TYPE_IDR;
if (warn && m_param->bOpenGOP)
warn &= frm.sliceType != X265_TYPE_I;
//容错处理：一般不进入，防止上条语句没有赋值成功，再赋值一次
if (warn)
{
x265_log(m_param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe intervaln",
frm.sliceType, frm.frameNum);
frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
}
}
if (frm.sliceType == X265_TYPE_I && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin) //如果当前为 X265_TYPE_I，并且当前I帧间隔大于等于keyframeMin
{
if (m_param->bOpenGOP)
{
m_lastKeyframe = frm.frameNum;
//因为打开bOpenGOP，除第一帧，其它为X265_TYPE_I，所以在此标记当前I帧位置，为后续计算I帧间隔
frm.bKeyframe = true;
//标记当前为关键帧
}
else
frm.sliceType = X265_TYPE_IDR;
//如果关闭bOpenGOP，当前标记为IDR帧
}
if (frm.sliceType == X265_TYPE_IDR)
{
/* Closed GOP */
//打开bOpenGOP,只有第一帧会进入其它不会进入此if，因为在打开bOpenGOP中为X265_TYPE_I， 关闭bOpenGOP只要是X265_TYPE_IDR都会进入
m_lastKeyframe = frm.frameNum;
//标记当前I帧位置，为后续计算I帧间隔
frm.bKeyframe = true;
//标记当前为关键帧
if (bframes > 0)
{
list[bframes - 1]->m_lowres.sliceType = X265_TYPE_P; //如果当前不是当前GOP的第一帧，则将其前一帧设置为P帧，并将bframes--，注意当前并不是直接确定为P帧，后续可能会更改
bframes--;
}
}
if (bframes == m_param->bframes || !list[bframes + 1])//如果当前B或者b帧个数已经达到配置上限，或者下一帧无数据（注意：当前位置是当前GOP的最后一帧，所以应该为非B帧）
{
if (IS_X265_TYPE_B(frm.sliceType))
x265_log(m_param, X265_LOG_WARNING, "specified frame type is not compatible with max B-framesn"); //当前为B或者b帧不合法；一般不进入
if (frm.sliceType == X265_TYPE_AUTO || IS_X265_TYPE_B(frm.sliceType))
frm.sliceType = X265_TYPE_P;//将当前不合法的帧类型强制设置为非B帧（P帧）；一般不进入
}
if (frm.sliceType == X265_TYPE_BREF)
//计数当前X265_TYPE_BREF（B）的个数 注意：1pass 一般不进入vbvLookahead中并没有将类型置到.sliceType中，1pass此时全为b帧，2pass 进入统计可参考B帧个数
brefs++;
if (frm.sliceType == X265_TYPE_AUTO)
//在1pass情况下当前没有帧类型，则设置为X265_TYPE_BREF(B) //2pass 一般不进入
frm.sliceType = X265_TYPE_B;
else if (!IS_X265_TYPE_B(frm.sliceType))//如果当前不是B、b帧，退出，遇到P、I、i帧退出
break;
} //for (bframes = 0, brefs = 0;; bframes++)
if (bframes)
list[bframes - 1]->m_lowres.bLastMiniGopBFrame = true;
//记录当前GOP最后一个B帧，因为bframes为B帧个数，B帧时从0开始计数，所以选择bframes - 1
list[bframes]->m_lowres.leadingBframes = bframes;
//设置当前帧前面有几个Bb帧（当前GOP下,此时位置为当前GOP的后向非B帧）
m_lastNonB = &list[bframes]->m_lowres;
//上面是非B帧才退出的，所以当前为I/P帧，记录最近的非B帧位置
m_histogram[bframes]++;
//统计信息，用于统计GOP中B或者b帧数个数
/* insert a bref into the sequence */
if (m_param->bBPyramid && bframes > 1 && !brefs)
//1pass进入：设置中间帧为可参考B帧 2pass不进入
{
list[bframes / 2]->m_lowres.sliceType = X265_TYPE_BREF;
brefs++;
}
/* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */ //功能，保证RC能够获得framecost
if (m_param->rc.rateControlMode != X265_RC_CQP)
//当前模式不为固定QP模式
{
int p0, p1, b;
//临时变量：b 当前帧 p0 前向帧 p1 后向帧
/* For zero latency tuning, calculate frame cost to be used later in RC */
if (!maxSearch)
//考虑零延迟，没有lookachead情况，前面没有存储frames列表，在此存储相应帧数
{
for (int i = 0; i <= bframes; i++)
//存储frame[0] 非B帧 frame[1~brames] 当前GOP
frames[i + 1] = &list[i]->m_lowres;
}
/* estimate new non-B cost */
p1 = b = bframes + 1;
//选取当前GOP的后向非B帧
p0 = (IS_X265_TYPE_I(frames[bframes + 1]->sliceType)) ? b : 0;
//选取当前GOP的前向非B帧 （p1是I帧，则GOP=1 直接计算当前帧的I-cost即可）
CostEstimateGroup estGroup(*this, frames);
//多线程计算帧间的framecost
estGroup.singleCost(p0, p1, b);
//以前向帧p0为参考，p1=b 为当前搜索帧的 P-cost,如果是I帧，则计算I-cost
if (bframes) //计算每个B帧的 frame-cost
{
p0 = 0; // last nonb 初始化当前的前向参考帧为当前GOP的前向非B帧
for (b = 1; b <= bframes; b++)//遍历所有B帧
{
if (frames[b]->sliceType == X265_TYPE_B)
for (p1 = b; frames[p1]->sliceType == X265_TYPE_B; p1++) //寻找后向参考帧
; // find new nonb or bref
else
p1 = bframes + 1; //如果当前为可参考B帧，则其后向参考帧为当前GOP的后向非B帧
estGroup.singleCost(p0, p1, b);//计算当前b帧的framecost，前向p0，后向 p1
if (frames[b]->sliceType == X265_TYPE_BREF) //如果当前为可参考B帧， 则后面的B帧的前向参考帧将为此帧
p0 = b;
}
}
}
m_inputLock.acquire();
//准备抛出帧类型已经确定的帧，现将其加锁，防止多线程重复修改
/* dequeue all frames from inputQueue that are about to be enqueued
* in the output queue. The order is important because Frame can
* only be in one list at a time */
int64_t pts[X265_BFRAME_MAX + 1];//存储当前GOP的pts（可以理解为poc),前向非B帧不存储，只存储所有B帧和后向非B帧
for (int i = 0; i <= bframes; i++)
{
Frame *curFrame;
curFrame = m_inputQueue.popFront();//获取当前列表中最前面的帧
pts[i] = curFrame->m_pts;
//暂存当前的pts
maxSearch--;
}
m_inputLock.release();
//已经抛出，将临界资源释放
m_outputLock.acquire(); //准备推进 已经决策类型的帧，现将其加锁，防止多线程重复修改
/* add non-B to output queue */
int idx = 0; //临时变量，记录序号
list[bframes]->m_reorderedPts = pts[idx++]; //设置当前GOP后向非B帧（可以理解为P帧）的编码序号
后向P帧获取当前GOP第一个B帧的PTS
m_outputQueue.pushBack(*list[bframes]);//将当前GOP后向非B帧（可以理解为P帧）推入输出列表
/* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */
if (bframes > 1 && m_param->bBPyramid)//如果当前B帧个数大于1并且采用可参考B帧
{
for (int i = 0; i < bframes; i++)
{
if (list[i]->m_lowres.sliceType == X265_TYPE_BREF)
{
list[i]->m_reorderedPts = pts[idx++];//设置当前GOP可参考B帧的编码序号
m_outputQueue.pushBack(*list[i]);
//将当前前GOP可参考B帧推入输出列表
}
}
}
/* add B frames to output queue */
for (int i = 0; i < bframes; i++) //将剩余b帧推入输出列表
{
/* push all the B frames into output queue except B-ref, which already pushed into output queue */
if (list[i]->m_lowres.sliceType != X265_TYPE_BREF)
{
list[i]->m_reorderedPts = pts[idx++];//设置b帧的编码序号
m_outputQueue.pushBack(*list[i]);
//将b帧推入输入列表
}
}
//frame[0] 当前GOP的前向非B帧 m_lastNonB 当前GOP的后向非B帧
bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth)) && !m_param->rc.bStatRead;//2pass 不会进入下面操作， 在1pass中会对I帧进行关键帧分析（判定是否为IDR帧）
if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))//对当前I帧进行分析 功能：重新计算当前I帧的 frame-cost等信息以及qpCuTreeOffset等信息
{
m_inputLock.acquire(); //加锁
Frame *curFrame = m_inputQueue.first();
frames[0] = m_lastNonB;
//获取最近的非B帧
int j;
for (j = 0; j < maxSearch; j++)
//获取除上面抛出去的帧外的剩余帧
{
frames[j + 1] = &curFrame->m_lowres;
curFrame = curFrame->m_next;
}
m_inputLock.release();//解锁
frames[j + 1] = NULL;
slicetypeAnalyse(frames, true); //获取可参考帧的qpCuTreeOffset值
}
m_outputLock.release();//已经推进，将临界资源释放
}
/**
 * Record, for VBV planning, the estimated cost and type of the upcoming
 * frames (in coded order) in plannedSatd[] / plannedType[].
 *
 * The complete plan is stored on the first mini-GOP's non-B frame
 * (frames[0] when keyframe is set). Each B frame of that first mini-GOP also
 * receives the entries for the frames considered to follow it, appended at
 * its own indB cursor.
 *
 * Called from slicetypeAnalyse().
 *
 * @param frames     lookahead window; frames[0] is the previous non-B frame
 * @param numFrames  number of frames in the window after frames[0]
 * @param keyframe   non-zero when called for keyframe analysis
 */
void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
{
    int prevNonB = 0; /* non-B frame that opens the mini-GOP being walked */
    int curNonB = 1;  /* non-B frame that closes it */
    int idx = 0;      /* write cursor into the plan stored on frames[nextNonB] */

    /* find the non-B frame that closes the first mini-GOP */
    while (curNonB < numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
        curNonB++;

    /* The plan is stored on the first mini-GOP's closing non-B frame, or on
     * frames[0] itself for keyframe analysis. */
    const int nextNonB = keyframe ? prevNonB : curNonB;
    const int miniGopEnd = nextNonB;
    const int nextB = prevNonB + 1; /* first B frame of the first mini-GOP */

    int nextBRef = 0; /* B-ref position of the mini-GOP being walked (0: none) */
    int curBRef = 0;  /* B-ref position of the first mini-GOP (0: none) */
    if (m_param->bBPyramid && curNonB - prevNonB > 1)
        curBRef = (prevNonB + curNonB + 1) / 2;

    while (curNonB < numFrames + !keyframe)
    {
        /* P/I cost: This shouldn't include the cost of nextNonB */
        if (nextNonB != curNonB)
        {
            const int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
            frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB);
            frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType;

            /* Save the nextNonB Cost in each B frame of the current miniGop */
            if (curNonB > miniGopEnd)
            {
                for (int slot = nextB; slot < miniGopEnd; slot++)
                {
                    Lowres* dst = frames[slot];
                    dst->plannedSatd[dst->indB] = frames[nextNonB]->plannedSatd[idx];
                    dst->plannedType[dst->indB++] = frames[nextNonB]->plannedType[idx];
                }
            }
            idx++;
        }

        /* Handle the B-frames: coded order */
        if (m_param->bBPyramid && curNonB - prevNonB > 1)
            nextBRef = (prevNonB + curNonB + 1) / 2;

        for (int i = prevNonB + 1; i < curNonB; i++, idx++)
        {
            int64_t satdCost = 0;
            int type = X265_TYPE_B;

            if (!nextBRef)
            {
                /* no B-ref: plain B between the two non-B frames */
                satdCost = vbvFrameCost(frames, prevNonB, curNonB, i);
            }
            else if (i == nextBRef)
            {
                /* the B-ref itself is predicted from both non-B frames */
                satdCost = vbvFrameCost(frames, prevNonB, curNonB, nextBRef);
                type = X265_TYPE_BREF;
            }
            else if (i < nextBRef)
                satdCost = vbvFrameCost(frames, prevNonB, nextBRef, i); /* B before the B-ref */
            else
                satdCost = vbvFrameCost(frames, nextBRef, curNonB, i);  /* B after the B-ref */

            frames[nextNonB]->plannedSatd[idx] = satdCost;
            frames[nextNonB]->plannedType[idx] = type;

            /* Save the nextB Cost in each B frame of the current miniGop */
            const bool isFirstGopBRef = curBRef && curBRef == i;
            if (!isFirstGopBRef) /* the first mini-GOP's own B-ref is not propagated */
            {
                for (int slot = nextB; slot < miniGopEnd; slot++)
                {
                    /* only frames before i, or the frame at the B-ref position, get the entry */
                    if (slot < i || slot == nextBRef)
                    {
                        Lowres* dst = frames[slot];
                        dst->plannedSatd[dst->indB] = satdCost;
                        dst->plannedType[dst->indB++] = type;
                    }
                }
            }
        }

        /* advance to the next mini-GOP */
        prevNonB = curNonB;
        curNonB++;
        while (curNonB <= numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
            curNonB++;
    }

    /* terminate the plan */
    frames[nextNonB]->plannedType[idx] = X265_TYPE_AUTO;
}
/**
 * Estimate the cost of frame b predicted from p0 and p1, as used for VBV
 * planning. Called from Lookahead::vbvLookahead().
 *
 * @param frames  lookahead window
 * @param p0      index of the forward reference
 * @param p1      index of the backward reference
 * @param b       index of the frame being costed
 * @return with AQ off, the plain estimate; with AQ on and cu-tree off, the
 *         AQ-weighted estimate costEstAq; with both on, the result of
 *         frameCostRecalculate()
 */
int64_t Lookahead::vbvFrameCost(Lowres **frames, int p0, int p1, int b)
{
    CostEstimateGroup estGroup(*this, frames);
    /* always run the estimate; the AQ branch below reads costEstAq afterwards
     * (presumably populated by this call - confirm in singleCost) */
    int64_t cost = estGroup.singleCost(p0, p1, b);

    if (m_param->rc.aqMode)
    {
        if (m_param->rc.cuTree)
            return frameCostRecalculate(frames, p0, p1, b);
        else
            return frames[b]->costEstAq[b - p0][p1 - b];
    }
    return cost;
}
/**
 * Choose slice types for the undecided frames in the lookahead window and,
 * when enabled, run cu-tree and VBV lookahead over them.
 *
 * Only the leading part of the decision is kept: at the end, frames from
 * resetStart onwards are set back to X265_TYPE_AUTO so they are analysed
 * again on a later call.
 *
 * Called from slicetypeDecide().
 *
 * @param frames     lookahead window; frames[0] is the last non-B frame and
 *                   frames[1..] are the queued frames
 * @param bKeyframe  true when called for keyframe analysis (all decisions
 *                   from frames[1] onwards are discarded afterwards)
 */
void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe)
{
    /* numFrames: frames to analyse; origNumFrames: that count limited by the
     * keyframe interval; keyintLimit: frames left before a keyframe is due;
     * framecnt: number of undecided frames */
    int numFrames, origNumFrames, keyintLimit, framecnt;
    int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX);
    int cuCount = m_8x8Blocks; /* block count used to scale the thresholds below */
    int resetStart;            /* first frame index whose decision is discarded at the end */
    bool bIsVbvLookahead = m_param->rc.vbvBufferSize && m_param->lookaheadDepth;

    /* count undecided frames */
    for (framecnt = 0; framecnt < maxSearch; framecnt++)
    {
        Lowres *fenc = frames[framecnt + 1]; /* frames[0] is the last non-B frame */
        if (!fenc || fenc->sliceType != X265_TYPE_AUTO)
            break;
    }

    /* nothing undecided: only cu-tree (if enabled) has work to do */
    if (!framecnt)
    {
        if (m_param->rc.cuTree)
            cuTree(frames, 0, bKeyframe);
        return;
    }
    frames[framecnt + 1] = NULL; /* terminate the window after the last undecided frame */

    keyintLimit = m_param->keyframeMax - frames[0]->frameNum + m_lastKeyframe - 1;
    origNumFrames = numFrames = X265_MIN(framecnt, keyintLimit);
    if (bIsVbvLookahead)
        numFrames = framecnt; /* VBV lookahead analyses the whole window */
    else if (m_param->bOpenGOP && numFrames < framecnt)
        numFrames++;          /* open GOP: include the frame where the keyframe is due */
    else if (numFrames == 0)
    {
        /* a keyframe is due on the very first frame */
        frames[1]->sliceType = X265_TYPE_I;
        return;
    }

    if (m_bBatchMotionSearch)
    {
        /* pre-calculate all motion searches, using many worker threads */
        CostEstimateGroup estGroup(*this, frames);
        for (int b = 2; b < numFrames; b++)
        {
            /* i is the reference distance on either side of b */
            for (int i = 1; i <= m_param->bframes + 1; i++)
            {
                int p0 = b - i;
                if (p0 < 0)
                    continue;

                /* Skip search if already done */
                if (frames[b]->lowresMvs[0][i - 1][0].x != 0x7FFF)
                    continue;

                /* perform search to p1 at same distance, if possible */
                int p1 = b + i;
                if (p1 >= numFrames || frames[b]->lowresMvs[1][i - 1][0].x != 0x7FFF)
                    p1 = b; /* out of range or already searched: forward search only */

                estGroup.add(p0, p1, b); /* queue the job; executed by finishBatch() */
            }
        }
        /* auto-disable after the first batch if pool is small */
        m_bBatchMotionSearch &= m_pool->m_numWorkers >= 4;
        estGroup.finishBatch();

        if (m_bBatchFrameCosts)
        {
            /* pre-calculate all frame cost estimates, using many worker threads */
            for (int b = 2; b < numFrames; b++)
            {
                for (int i = 1; i <= m_param->bframes + 1; i++)
                {
                    /* the forward reference must not precede frames[0] */
                    if (b < i)
                        continue;

                    /* only measure frame cost in this pass if motion searches
                     * are already done */
                    if (frames[b]->lowresMvs[0][i - 1][0].x == 0x7FFF)
                        continue; /* 0x7FFF: forward search at this distance not done */

                    int p0 = b - i;

                    /* try every backward distance j (0 means no backward reference) */
                    for (int j = 0; j <= m_param->bframes; j++)
                    {
                        int p1 = b + j;
                        if (p1 >= numFrames)
                            break; /* past the end of the analysed window */

                        /* ensure P1 search is done */
                        if (j && frames[b]->lowresMvs[1][j - 1][0].x == 0x7FFF)
                            continue;

                        /* ensure frame cost is not done */
                        if (frames[b]->costEst[i][j] >= 0)
                            continue;

                        estGroup.add(p0, p1, b); /* queue the job; executed by finishBatch() */
                    }
                }
            }

            /* auto-disable after the first batch if the pool is not large */
            m_bBatchFrameCosts &= m_pool->m_numWorkers > 12;
            estGroup.finishBatch();
        }
    }

    int numBFrames = 0;          /* number of leading B frames in the first mini-GOP */
    int numAnalyzed = numFrames; /* number of frames whose analysis is considered valid */
    bool isScenecut = scenecut(frames, 0, 1, true, origNumFrames);
    /* When scenecut threshold is set, use scenecut detection for I frame placements */
    if (m_param->scenecutThreshold && isScenecut)
    {
        frames[1]->sliceType = X265_TYPE_I;
        return;
    }

    if (m_param->bframes)
    {
        if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS)
        {
            if (numFrames > 1)
            {
                /* best frame-type path found for each length, indexed modulo (X265_BFRAME_MAX + 1) */
                char best_paths[X265_BFRAME_MAX + 1][X265_LOOKAHEAD_MAX + 1] = { "", "P" };
                int best_path_index = numFrames % (X265_BFRAME_MAX + 1); /* slot of the full-length path */

                /* Perform the frame type analysis. */
                for (int j = 2; j <= numFrames; j++)
                    slicetypePath(frames, j, best_paths);

                /* count the leading 'B' characters of the chosen path */
                numBFrames = (int)strspn(best_paths[best_path_index], "B");

                /* Load the results of the analysis into the frame types. */
                for (int j = 1; j < numFrames; j++)
                    frames[j]->sliceType = best_paths[best_path_index][j - 1] == 'B' ? X265_TYPE_B : X265_TYPE_P;
            }
            frames[numFrames]->sliceType = X265_TYPE_P; /* the last analysed frame is always P */
        }
        else if (m_param->bFrameAdaptive == X265_B_ADAPT_FAST)
        {
            CostEstimateGroup estGroup(*this, frames);

            /* Costs compared at position i (singleCost arguments are p0, p1, b):
             *   cost1p0: frame i+1 predicted from i only
             *   cost2p0: frame i+2 predicted from i+1 only
             *   cost1b1: frame i+1 predicted from both i and i+2
             *   cost2p1: frame i+2 predicted from i only */
            int64_t cost1p0, cost2p0, cost1b1, cost2p1;

            for (int i = 0; i <= numFrames - 2; )
            {
                cost2p1 = estGroup.singleCost(i + 0, i + 2, i + 2, true);
                /* more than half of the blocks chose intra: make both frames P */
                if (frames[i + 2]->intraMbs[2] > cuCount / 2)
                {
                    frames[i + 1]->sliceType = X265_TYPE_P;
                    frames[i + 2]->sliceType = X265_TYPE_P;
                    i += 2;
                    continue;
                }

                cost1b1 = estGroup.singleCost(i + 0, i + 2, i + 1);
                cost1p0 = estGroup.singleCost(i + 0, i + 1, i + 1);
                cost2p0 = estGroup.singleCost(i + 1, i + 2, i + 2);

                /* two P frames are cheaper than B followed by P */
                if (cost1p0 + cost2p0 < cost1b1 + cost2p1)
                {
                    frames[i + 1]->sliceType = X265_TYPE_P;
                    i += 1;
                    continue;
                }

// arbitrary and untuned
#define INTER_THRESH 300
#define P_SENS_BIAS (50 - m_param->bFrameBias)
                frames[i + 1]->sliceType = X265_TYPE_B;

                /* extend the run of B frames while the following frame is
                 * still cheap to predict from frame i */
                int j;
                for (j = i + 2; j <= X265_MIN(i + m_param->bframes, numFrames - 1); j++)
                {
                    /* threshold shrinks with distance, floored at INTER_THRESH / 10 */
                    int64_t pthresh = X265_MAX(INTER_THRESH - P_SENS_BIAS * (j - i - 1), INTER_THRESH / 10);
                    int64_t pcost = estGroup.singleCost(i + 0, j + 1, j + 1, true);
                    if (pcost > pthresh * cuCount || frames[j + 1]->intraMbs[j - i + 1] > cuCount / 3)
                        break;
                    frames[j]->sliceType = X265_TYPE_B;
                }

                frames[j]->sliceType = X265_TYPE_P; /* the run ends with a P frame */
                i = j;
            }
            frames[numFrames]->sliceType = X265_TYPE_P; /* the last analysed frame is always P */
            /* count the leading B frames */
            numBFrames = 0;
            while (numBFrames < numFrames && frames[numBFrames + 1]->sliceType == X265_TYPE_B)
                numBFrames++;
        }
        else
        {
            /* fixed pattern: a P frame after every numBFrames B frames */
            numBFrames = X265_MIN(numFrames - 1, m_param->bframes);
            for (int j = 1; j < numFrames; j++)
                frames[j]->sliceType = (j % (numBFrames + 1)) ? X265_TYPE_B : X265_TYPE_P;

            frames[numFrames]->sliceType = X265_TYPE_P; /* the last analysed frame is always P */
        }
        /* Check scenecut on the first minigop. */
        for (int j = 1; j < numBFrames + 1; j++)
        {
            if (scenecut(frames, j, j + 1, false, origNumFrames))
            {
                /* end the mini-GOP with a P frame at j */
                frames[j]->sliceType = X265_TYPE_P;
                numAnalyzed = j;
                break;
            }
        }
        /* keyframe analysis discards every decision; otherwise keep the first mini-GOP */
        resetStart = bKeyframe ? 1 : X265_MIN(numBFrames + 2, numAnalyzed + 1);
    }
    else
    {
        /* B frames disabled: everything is P */
        for (int j = 1; j <= numFrames; j++)
            frames[j]->sliceType = X265_TYPE_P;
        resetStart = bKeyframe ? 1 : 2;
    }
    if (m_param->rc.cuTree)
        cuTree(frames, X265_MIN(numFrames, m_param->keyframeMax), bKeyframe);

    // if (!param->bIntraRefresh)
    /* mark every position where the keyframe interval expires as I, and
     * discard the decisions that follow it */
    for (int j = keyintLimit + 1; j <= numFrames; j += m_param->keyframeMax)
    {
        frames[j]->sliceType = X265_TYPE_I;
        resetStart = X265_MIN(resetStart, j + 1);
    }

    if (bIsVbvLookahead)
        vbvLookahead(frames, numFrames, bKeyframe);
    int maxp1 = X265_MIN(m_param->bframes + 1, origNumFrames);
    /* Restore frame types for all frames that haven't actually been decided yet. */
    for (int j = resetStart; j <= numFrames; j++)
    {
        frames[j]->sliceType = X265_TYPE_AUTO;
        /* If any frame marked as scenecut is being restarted for sliceDecision,
         * undo scene Transition flag */
        if (j <= maxp1 && frames[j]->bScenecut && m_isSceneTransition)
            m_isSceneTransition = false;
    }
}
/**
 * Decide whether frame p1 should be treated as a scene cut relative to p0.
 *
 * When bRealScenecut is set and B-frames are enabled, a window of up to
 * m_param->bframes frames after p0 (capped at numFrames) is probed first so
 * that short flashes and gradual scene transitions are not coded as cuts; the
 * probe updates frames[i]->bScenecut and m_isSceneTransition.
 *
 * frames        - lowres frame list under analysis
 * p0            - index of the earlier (reference) frame
 * p1            - index of the candidate frame
 * bRealScenecut - true for the real scene-cut decision, false for a probe
 * numFrames     - upper bound on the look-ahead window (maxp1 never exceeds it)
 *
 * Returns false when frames[p1]->bScenecut is clear after the analysis,
 * otherwise the result of scenecutInternal(frames, p0, p1, bRealScenecut).
 */
bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames)
{
    /* Only do analysis during a normal scenecut check. */
    if (bRealScenecut && m_param->bframes)
    {
        int origmaxp1 = p0 + 1;
        /* Look ahead to avoid coding short flashes as scenecuts. */
        origmaxp1 += m_param->bframes;
        int maxp1 = X265_MIN(origmaxp1, numFrames); // never look past the end of the list
        bool fluctuate = false;   // set by the second pass when costs vary by more than 10%
        bool noScenecuts = false; // NOTE: despite the name, true means the first pass FOUND a scenecut
        int64_t avgSatdCost = 0;  // running sum, later the average, of the P costs in the window
        if (frames[0]->costEst[1][0] > -1)
            avgSatdCost = frames[0]->costEst[1][0]; // seed with frame 0's cost when it is already available
        int cnt = 1;              // number of cost samples accumulated (for the average)

        /* Where A and B are scenes: AAAAAABBBAAAAAA
         * If BBB is shorter than (maxp1-p0), it is detected as a flash
         * and not considered a scenecut. */
        for (int cp1 = p1; cp1 <= maxp1; cp1++)
        {
            if (!scenecutInternal(frames, p0, cp1, false))
            {
                /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
                for (int i = cp1; i > p0; i--)
                {
                    frames[i]->bScenecut = false;
                    noScenecuts = false;
                }
            }
            else if (scenecutInternal(frames, cp1 - 1, cp1, false))
            {
                /* If current frame is a Scenecut from p0 frame as well as Scenecut from
                 * preceeding frame, mark it as a Scenecut */
                frames[cp1]->bScenecut = true;
                noScenecuts = true;
            }

            /* compute average satdcost of all the frames in the mini-gop to confirm
             * whether there is any great fluctuation among them to rule out false positives */
            X265_CHECK(frames[cp1]->costEst[cp1 - p0][0]!= -1, "costEst is not done \n");
            avgSatdCost += frames[cp1]->costEst[cp1 - p0][0];
            cnt++;
        }

        /* Identify possible scene fluctuations by comparing the satd cost of the frames.
         * This could denote the beginning or ending of scene transitions.
         * During a scene transition(fade in/fade outs), if fluctuate remains false,
         * then the scene had completed its transition or stabilized */
        if (noScenecuts)
        {
            fluctuate = false;
            avgSatdCost /= cnt;
            for (int i = p1; i <= maxp1; i++)
            {
                int64_t curCost  = frames[i]->costEst[i - p0][0];         // cost of frame i predicted from p0
                int64_t prevCost = frames[i - 1]->costEst[i - 1 - p0][0]; // same for the preceding frame
                if (fabs((double)(curCost - avgSatdCost)) > 0.1 * avgSatdCost ||
                    fabs((double)(curCost - prevCost)) > 0.1 * prevCost)
                {
                    fluctuate = true;
                    if (!m_isSceneTransition && frames[i]->bScenecut)
                    {
                        m_isSceneTransition = true;
                        /* just mark the first scenechange in the scene transition as a scenecut. */
                        for (int j = i + 1; j <= maxp1; j++)
                            frames[j]->bScenecut = false;
                        break;
                    }
                }
                frames[i]->bScenecut = false;
            }
        }
        if (!fluctuate && !noScenecuts)
            m_isSceneTransition = false; /* Signal end of scene transitioning */
    }

    /* A frame is always analysed with bRealScenecut = true first, and then bRealScenecut = false,
       the former for I decisions and the latter for P/B decisions. It's possible that the first
       analysis detected scenecuts which were later nulled due to scene transitioning, in which
       case do not return a true scenecut for this frame */
    if (!frames[p1]->bScenecut)
        return false;

    /* Re-evaluate with the caller's bRealScenecut so the biased threshold and debug log apply. */
    return scenecutInternal(frames, p0, p1, bRealScenecut);
}
/**
 * Compare the inter (P) cost of frame p1 predicted from p0 against p1's intra
 * cost and report whether the frame looks like a scene cut.
 *
 * frames        - lowres frame list under analysis
 * p0            - index of the reference frame (need not be adjacent to p1)
 * p1            - index of the frame being tested
 * bRealScenecut - when true the bias is derived from the scenecut threshold and
 *                 the distance to the last keyframe, and a debug line is logged
 *                 for a positive result; when false a fixed bias of 0.05 is used
 *
 * Returns true when pcost >= (1 - bias) * icost.
 */
bool Lookahead::scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut)
{
    Lowres *frame = frames[p1];

    /* Make sure the p0 -> p1 cost estimate exists before reading it below. */
    CostEstimateGroup estGroup(*this, frames);
    estGroup.singleCost(p0, p1, p1);

    int64_t icost = frame->costEst[0][0];        // intra cost of p1
    int64_t pcost = frame->costEst[p1 - p0][0];  // cost of p1 predicted from p0
    int gopSize = frame->frameNum - m_lastKeyframe; // distance from the last keyframe
    float threshMax = (float)(m_param->scenecutThreshold / 100.0);
    /* magic numbers pulled out of thin air */
    float threshMin = (float)(threshMax * 0.25);
    double bias = 0.05; // default for probes; a larger bias makes a scenecut more likely

    if (bRealScenecut)
    {
        if (m_param->keyframeMin == m_param->keyframeMax)
            threshMin = threshMax;
        if (gopSize <= m_param->keyframeMin / 4)
            bias = threshMin / 4;
        else if (gopSize <= m_param->keyframeMin)
            bias = threshMin * gopSize / m_param->keyframeMin;
        else
        {
            /* Interpolate between threshMin and threshMax across [keyframeMin, keyframeMax].
             * NOTE(review): if keyframeMin == keyframeMax and gopSize > keyframeMin this
             * divides 0 by 0 - confirm callers never reach that case. */
            bias = threshMin
                + (threshMax - threshMin)
                * (gopSize - m_param->keyframeMin)
                / (m_param->keyframeMax - m_param->keyframeMin);
        }
    }

    /* Inter prediction is (almost) no cheaper than intra: treat as a scene cut. */
    bool res = pcost >= (1.0 - bias) * icost;
    if (res && bRealScenecut)
    {
        int imb = frame->intraMbs[p1 - p0];
        int pmb = m_8x8Blocks - imb;
        /* icost and pcost are 64-bit: print them with %lld and an explicit cast
         * rather than %d, which mismatches the argument size. */
        x265_log(m_param, X265_LOG_DEBUG, "scene cut at %d Icost:%lld Pcost:%lld ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n",
                 frame->frameNum, (long long)icost, (long long)pcost, 1. - (double)pcost / icost, bias, gopSize, imb, pmb);
    }
    return res;
}
/**
 * Find the cheapest B/P frame-type path of the given length and store it in
 * best_paths[length % (X265_BFRAME_MAX + 1)].
 *
 * Each candidate is the previously stored best path of length `len`, followed
 * by `path` B-frames and a closing P. For example, with length 10 and
 * bframes 3 the candidates are (x = already decided prefix):
 *     xxxxxxxxxP
 *     xxxxxxxxBP
 *     xxxxxxxBBP
 *     xxxxxxBBBP
 *
 * frames     - lowres frame list under analysis
 * length     - number of frames covered by the path
 * best_paths - table of best paths, indexed modulo (X265_BFRAME_MAX + 1)
 */
void Lookahead::slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1])
{
    char paths[2][X265_LOOKAHEAD_MAX + 1]; // double buffer: candidate under test / best so far
    int num_paths = X265_MIN(m_param->bframes + 1, length); // number of candidates (0..bframes trailing Bs)
    int64_t best_cost = 1LL << 62;
    int idx = 0; // buffer receiving the next candidate; the best path lives in paths[idx ^ 1]

    /* Iterate over all currently possible paths */
    for (int path = 0; path < num_paths; path++)
    {
        /* Add suffixes to the current path */
        int len = length - (path + 1); // length of the reused prefix
        memcpy(paths[idx], best_paths[len % (X265_BFRAME_MAX + 1)], len);
        memset(paths[idx] + len, 'B', path);
        strcpy(paths[idx] + len + path, "P"); // also writes the terminating NUL

        /* Calculate the actual cost of the current path */
        int64_t cost = slicetypePathCost(frames, paths[idx], best_cost);
        if (cost < best_cost)
        {
            best_cost = cost;
            idx ^= 1; // keep this candidate; the next one goes into the other buffer
        }
    }

    /* Store the best path. */
    memcpy(best_paths[length % (X265_BFRAME_MAX + 1)], paths[idx ^ 1], length);
}
/**
 * Sum the estimated frame costs of a B/P frame-type path.
 *
 * frames    - lowres frame list under analysis; frames[0] is the anchor that
 *             precedes the first path element
 * path      - NUL-terminated string of 'B'/'P' characters, one per frame
 *             starting at frames[1]; every run of B's must be closed by a 'P'
 * threshold - best cost found so far; accumulation stops early once exceeded
 *
 * Returns the accumulated cost (possibly partial when terminated early).
 */
int64_t Lookahead::slicetypePathCost(Lowres **frames, char *path, int64_t threshold)
{
    int64_t cost = 0;
    int loc = 1;   // first frame of the current mini-GOP
    int cur_p = 0; // index of the preceding non-B frame
    CostEstimateGroup estGroup(*this, frames);

    /* Since the 1st path element is really the second frame: after the decrement
     * path[i] describes frames[i], e.g. frames = P(I) + "BPBBBP". */
    path--;

    /* Walk the path one mini-GOP at a time, e.g. first P-B-P, then P-B-B-B-P. */
    while (path[loc])
    {
        int next_p = loc;
        /* Find the location of the next P-frame. */
        while (path[next_p] != 'P')
            next_p++;

        /* Add the cost of the P-frame found above */
        cost += estGroup.singleCost(cur_p, next_p, next_p);
        /* Early terminate if the cost we have found is larger than the best path cost so far */
        if (cost > threshold)
            break;

        if (m_param->bBPyramid && next_p - cur_p > 2)
        {
            /* B-pyramid with at least two B-frames: the middle B (rounded down) is
             * predicted from both P's; the other B's use it as one of their references. */
            int middle = cur_p + (next_p - cur_p) / 2;
            cost += estGroup.singleCost(cur_p, next_p, middle);
            for (int next_b = loc; next_b < middle && cost < threshold; next_b++)
                cost += estGroup.singleCost(cur_p, middle, next_b);
            for (int next_b = middle + 1; next_b < next_p && cost < threshold; next_b++)
                cost += estGroup.singleCost(middle, next_p, next_b);
        }
        else
        {
            /* Every B-frame is predicted from the two surrounding P's. */
            for (int next_b = loc; next_b < next_p && cost < threshold; next_b++)
                cost += estGroup.singleCost(cur_p, next_p, next_b);
        }

        loc = next_p + 1;
        cur_p = next_p;
    }

    return cost;
}
/**
 * Run CU-tree propagation over the frame list: walking backwards one mini-GOP
 * at a time, make sure the needed cost estimates exist, accumulate propagate
 * costs into the reference frames, and finally derive qpCuTreeOffset for the
 * first non-B frame (and, with B-pyramid and no VBV, for the middle B-frame).
 *
 * frames    - lowres frame list under analysis
 * numframes - index of the last frame in the list
 * bIntra    - when true the walk includes frame 0 (idx = 0), otherwise it
 *             stops at frame 1
 */
void Lookahead::cuTree(Lowres **frames, int numframes, bool bIntra)
{
    int idx = !bIntra;         // lowest frame index the backward walk may reach
    int lastnonb, curnonb = 1; // later / earlier non-B frame bounding the current mini-GOP
    int bframes = 0;           // number of B-frames between curnonb and lastnonb

    x265_emms(); // reset MMX state before floating-point math
    double totalDuration = 0.0;
    for (int j = 0; j <= numframes; j++)
        totalDuration += (double)m_param->fpsDenom / m_param->fpsNum;

    double averageDuration = totalDuration / (numframes + 1); // average frame duration in seconds

    int i = numframes;
    int cuCount = m_8x8Width * m_8x8Height; // number of 8x8 blocks per lowres frame

    /* Walk back to the last non-B frame. */
    while (i > 0 && frames[i]->sliceType == X265_TYPE_B)
        i--;

    lastnonb = i;

    /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
     * be applied to the end of a lookahead buffer of any size.  However, it's most needed when
     * lookahead=0, so that's what's currently implemented. */
    if (!m_param->lookaheadDepth)
    {
        if (bIntra)
        {
            /* Nothing to propagate: clear the costs and reuse the AQ offsets as-is. */
            memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t));
            memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, cuCount * sizeof(double));
            return;
        }
        /* Hand frame 0's propagate cost to the last non-B frame, then clear frame 0's. */
        std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
        memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t));
    }
    else
    {
        if (lastnonb < idx)
            return; // nothing to process
        memset(frames[lastnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
    }

    CostEstimateGroup estGroup(*this, frames);

    while (i-- > idx)
    {
        /* i now points at the frame just before lastnonb; find the previous non-B frame. */
        curnonb = i;
        while (frames[curnonb]->sliceType == X265_TYPE_B && curnonb > 0)
            curnonb--;

        if (curnonb < idx)
            break;

        /* Ensure the cost of lastnonb predicted from curnonb is available. */
        estGroup.singleCost(curnonb, lastnonb, lastnonb);

        /* Per mini-GOP, e.g. P0 b0 B1 b2 P1 (curnonb = P0, lastnonb = P1):
         *  - P0's propagate cost is cleared; P1 keeps what later mini-GOPs accumulated.
         *  - With B-pyramid and more than one B-frame, the middle B's cost is cleared,
         *    every other B propagates into (P0, middle) or (middle, P1), and the
         *    middle B then propagates into (P0, P1) as a referenced frame.
         *  - Otherwise every B propagates into (P0, P1).
         *  - Finally P1 propagates into P0 as a referenced frame. */
        memset(frames[curnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
        bframes = lastnonb - curnonb - 1;
        if (m_param->bBPyramid && bframes > 1)
        {
            int middle = (bframes + 1) / 2 + curnonb; // referenced B-frame in the middle
            estGroup.singleCost(curnonb, lastnonb, middle);
            memset(frames[middle]->propagateCost, 0, cuCount * sizeof(uint16_t));
            while (i > curnonb)
            {
                /* B's before middle use (curnonb, middle); B's after it use (middle, lastnonb). */
                int p0 = i > middle ? middle : curnonb;
                int p1 = i < middle ? middle : lastnonb;
                if (i != middle)
                {
                    estGroup.singleCost(p0, p1, i);
                    estimateCUPropagate(frames, averageDuration, p0, p1, i, 0);
                }
                i--;
            }

            estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, middle, 1);
        }
        else
        {
            while (i > curnonb)
            {
                estGroup.singleCost(curnonb, lastnonb, i);
                estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0);
                i--;
            }
        }

        estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, lastnonb, 1);
        lastnonb = curnonb; // move on to the previous mini-GOP
    }

    if (!m_param->lookaheadDepth)
    {
        /* Propagate the remaining non-B frame into frame 0, then swap the buffers back. */
        estGroup.singleCost(0, lastnonb, lastnonb);
        estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1);
        std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
    }

    /* Derive qpCuTreeOffset for the first non-B frame reached by the walk. */
    cuTreeFinish(frames[lastnonb], averageDuration, lastnonb);
    if (m_param->bBPyramid && bframes > 1 && !m_param->rc.vbvBufferSize)
        cuTreeFinish(frames[lastnonb + (bframes + 1) / 2], averageDuration, 0); // middle B of the first mini-GOP
}
/**
 * Compute the propagate amount of every 8x8 block of frame b and add it,
 * following the block's lowres motion vectors, to the propagate costs of its
 * reference frames p0 and p1.
 *
 * frames          - lowres frame list under analysis
 * averageDuration - average frame duration, used for the fps factor
 * p0              - index of the forward (list 0) reference frame
 * p1              - index of the backward (list 1) reference frame
 * b               - index of the frame being processed
 * referenced      - non-zero when frame b is itself used as a reference: its own
 *                   propagateCost rows are then read row by row, and with VBV
 *                   and lookahead enabled its qpCuTreeOffset is refreshed at the end
 */
void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced)
{
    uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost };
    /* Position of b between p0 and p1 scaled to 0..256 with rounding (128 = exactly halfway). */
    int32_t distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0);
    /* Bi-prediction weight out of 64 for list 0 (32 = equal weights). */
    int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32;
    int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight };
    /* Zero-based distances used to index lowresMvs[list][distance]. */
    int listDist[2] = { b - p0 - 1, p1 - b - 1 };

    memset(m_scratch, 0, m_8x8Width * sizeof(int));

    uint16_t *propagateCost = frames[b]->propagateCost;

    x265_emms(); // reset MMX state before floating-point math
    double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration);

    /* For non-referred frames the source costs are always zero, so just memset one row and re-use it. */
    if (!referenced)
        memset(frames[b]->propagateCost, 0, m_8x8Width * sizeof(uint16_t));

    int32_t strideInCU = m_8x8Width;
    for (uint16_t blocky = 0; blocky < m_8x8Height; blocky++)
    {
        int cuIndex = blocky * strideInCU; // index of the first 8x8 block in this row
        /* Fill m_scratch with the propagate amount of every block in this row. */
        primitives.propagateCost(m_scratch, propagateCost,
                                 frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
                                 frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_8x8Width);

        /* Referenced frames carry real per-row costs; non-referenced ones keep reusing the zeroed first row. */
        if (referenced)
            propagateCost += m_8x8Width;

        for (uint16_t blockx = 0; blockx < m_8x8Width; blockx++, cuIndex++)
        {
            int32_t propagate_amount = m_scratch[blockx];
            /* Don't propagate for an intra block. */
            if (propagate_amount > 0)
            {
                /* Access width-2 bitfield: bit 0 = list 0 used, bit 1 = list 1 used
                 * (0 = intra, 1 = forward, 2 = backward, 3 = bi-predicted). */
                int32_t lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT;
                /* Follow the MVs to the previous frame(s). */
                for (uint16_t list = 0; list < 2; list++)
                {
                    if ((lists_used >> list) & 1)
                    {
                        /* Saturating add into a 16-bit propagate cost. */
#define CLIP_ADD(s, x) (s) = (uint16_t)X265_MIN((s) + (x), (1 << 16) - 1)
                        int32_t listamount = propagate_amount;
                        /* Apply bipred weighting. */
                        if (lists_used == 3)
                            listamount = (listamount * bipredWeights[list] + 32) >> 6; // +32 rounds to nearest

                        MV *mvs = frames[b]->lowresMvs[list][listDist[list]];

                        /* Early termination for simple case of mv0. */
                        if (!mvs[cuIndex].word)
                        {
                            /* Zero MV: the whole amount goes to the co-located block. */
                            CLIP_ADD(refCosts[list][cuIndex], listamount);
                            continue;
                        }

                        int32_t x = mvs[cuIndex].x;
                        int32_t y = mvs[cuIndex].y;
                        /* Upper bits select the 8x8 block the MV points into; the low five bits
                         * (0..31) are the offset inside it (presumably quarter-pel units x 8 pixels). */
                        int32_t cux = (x >> 5) + blockx;
                        int32_t cuy = (y >> 5) + blocky;
                        int32_t idx0 = cux + cuy * strideInCU;    // top-left target block
                        int32_t idx1 = idx0 + 1;                  // right neighbour
                        int32_t idx2 = idx0 + strideInCU;         // lower neighbour
                        int32_t idx3 = idx0 + strideInCU + 1;     // lower-right neighbour
                        x &= 31;
                        y &= 31;
                        /* Bilinear weights of the four overlapped blocks; they sum to 1024. */
                        int32_t idx0weight = (32 - y) * (32 - x);
                        int32_t idx1weight = (32 - y) * x;
                        int32_t idx2weight = y * (32 - x);
                        int32_t idx3weight = y * x;

                        /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
                         * be counted. */
                        if (cux < m_8x8Width - 1 && cuy < m_8x8Height - 1 && cux >= 0 && cuy >= 0)
                        {
                            /* All four target blocks are inside the frame. */
                            CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
                            CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
                            CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
                            CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
                        }
                        else /* Check offsets individually */
                        {
                            /* Only add to the target blocks that lie inside the frame. */
                            if (cux < m_8x8Width && cuy < m_8x8Height && cux >= 0 && cuy >= 0)
                                CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
                            if (cux + 1 < m_8x8Width && cuy < m_8x8Height && cux + 1 >= 0 && cuy >= 0)
                                CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
                            if (cux < m_8x8Width && cuy + 1 < m_8x8Height && cux >= 0 && cuy + 1 >= 0)
                                CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
                            if (cux + 1 < m_8x8Width && cuy + 1 < m_8x8Height && cux + 1 >= 0 && cuy + 1 >= 0)
                                CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
                        }
                    }
                }
            }
        }
    }

    /* With VBV and lookahead enabled, refresh qpCuTreeOffset of referenced frames right away;
     * the third argument is the list-0 distance for a P-frame (b == p1), else 0. */
    if (m_param->rc.vbvBufferSize && m_param->lookaheadDepth && referenced)
        cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0);
}
/**
 * Turn a frame's accumulated propagate costs into per-block qpCuTreeOffset values.
 *
 * frame           - lowres frame whose qpCuTreeOffset is updated
 * averageDuration - average frame duration, used for the fps factor
 * ref0Distance    - 0 for no weighted-prediction adjustment, otherwise the
 *                   distance to the list-0 reference, used to look up
 *                   weightedCostDelta[ref0Distance - 1]
 */
void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance)
{
    /* Duration ratio in 8-bit fixed point (256 when both durations clip to the same value). */
    int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256);
    double weightdelta = 0.0;

    /* When a weighted-cost delta was recorded for this reference distance, add
     * (1 - delta) to the log ratio below. */
    if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
        weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);

    /* Allow the strength to be adjusted via qcompress, since the two concepts
     * are very similar. */
    int cuCount = m_8x8Width * m_8x8Height;
    double strength = 5.0 * (1.0 - m_param->rc.qCompress);

    for (int cuIndex = 0; cuIndex < cuCount; cuIndex++)
    {
        /* Intra cost scaled by invQscaleFactor (8-bit fixed point, rounded). */
        int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8;
        if (intracost)
        {
            int propagateCost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8;
            double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
            /* The more cost propagates from this block, the lower its QP offset becomes. */
            frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - strength * log2_ratio;
        }
    }
}
/**
 * Recompute the cost of frame b from its stored per-block lowres costs, scaled
 * by the current qpCuTreeOffset values, without re-running the lookahead search.
 * For B-frames the stored costEstAq value is returned unchanged.
 *
 * frames - lowres frame list under analysis
 * p0     - index of the forward reference frame
 * p1     - index of the backward reference frame
 * b      - index of the frame being evaluated
 *
 * Side effect (non-B frames): rowSatds[b - p0][p1 - b] is rewritten with the
 * re-weighted per-row sums.
 *
 * Returns the re-weighted frame cost; border blocks are left out of the sum
 * unless the frame is at most two blocks wide or high.
 */
/* If MB-tree changes the quantizers, we need to recalculate the frame cost without
 * re-running lookahead. */
int64_t Lookahead::frameCostRecalculate(Lowres** frames, int p0, int p1, int b)
{
    Lowres* cur = frames[b];
    const int fwdDist = b - p0;
    const int bwdDist = p1 - b;

    if (cur->sliceType == X265_TYPE_B)
        return cur->costEstAq[fwdDist][bwdDist];

    int* rowTotals = cur->rowSatds[fwdDist][bwdDist];
    double* treeOffsets = cur->qpCuTreeOffset;
    int64_t total = 0;

    /* Tiny frames have no interior, so every block counts towards the total. */
    const bool countEveryBlock = m_8x8Width <= 2 || m_8x8Height <= 2;

    x265_emms(); // reset MMX state before floating-point math

    for (int row = m_8x8Height - 1; row >= 0; row--)
    {
        const bool interiorRow = row > 0 && row < m_8x8Height - 1;
        int rowSum = 0;

        for (int col = m_8x8Width - 1; col >= 0; col--)
        {
            const int blockIdx = col + row * m_8x8Width;
            int blockCost = cur->lowresCosts[fwdDist][bwdDist][blockIdx] & LOWRES_COST_MASK;

            /* Scale by 2^offset in 8-bit fixed point, rounding to nearest. */
            blockCost = (blockCost * x265_exp2fix8(treeOffsets[blockIdx]) + 128) >> 8;
            rowSum += blockCost;

            const bool interiorCol = col > 0 && col < m_8x8Width - 1;
            if ((interiorRow && interiorCol) || countEveryBlock)
                total += blockCost;
        }

        rowTotals[row] = rowSum;
    }

    return total;
}
/** 函数功能
：单线程计算当前帧与前后参考帧之间的最优frame cost
/*
调用范围
：只在slicetypeDecide()、vbvFrameCost、slicetypeAnalyse、scenecutInternal、slicetypePathCost和cuTree函数中被调用
* 参数 p0
：前向帧
* 参数 p1
：后向帧
* 参数 b
：当前帧号
* 参数 bIntraPenalty
：会加上intra带来的编码代价;其中在slicetypeAnalyse会传入ture(也可能为false)并且bFrameAdaptive == X265_B_ADAPT_FAST，其它为false
* 返回
：当前帧与前后参考帧之间的最优frame cost* */
int64_t CostEstimateGroup::singleCost(int p0, int p1, int b, bool intraPenalty)
{
LookaheadTLD& tld = m_lookahead.m_tld[m_lookahead.m_pool ? m_lookahead.m_pool->m_numWorkers : 0];//m_tld空间大小为内核数+1
numTLD = 1 + (m_pool ? m_pool->m_numWorkers : 0); 例如当前为4核，则申请空间大小为5
//当前在当前线程完成，顾取最后一个(0,1,2,3分别表示具体某个核，4表示当前线程)
return estimateFrameCost(tld, p0, p1, b, intraPenalty); //获取取每个8x8块的帧间cost(SATD + mvcost + 4) 并 获取当前b帧在p0、p1参考帧下的最优帧cost inter(SATD+mvcost+4)*10/(13 +b)或者 intra (SATD+5+4)
}
/**
 * Queue a frame-cost estimate for later batch processing and switch this
 * group into batch mode. The batch is flushed automatically once it holds
 * MAX_BATCH_SIZE jobs.
 *
 * p0 - index of the forward reference frame
 * p1 - index of the backward reference frame
 * b  - index of the frame to estimate; the references need not be adjacent
 *      to it, e.g. (p0, b, p1) = (5, 6, 7) or (8, 10, 11)
 */
void CostEstimateGroup::add(int p0, int p1, int b)
{
    X265_CHECK(m_batchMode || !m_jobTotal, "single CostEstimateGroup instance cannot mix batch modes\n");
    m_batchMode = true;

    Estimate& e = m_estimates[m_jobTotal++];
    e.p0 = p0;
    e.p1 = p1;
    e.b = b;

    /* The job table is full: run everything queued so far before accepting more. */
    if (m_jobTotal == MAX_BATCH_SIZE)
        finishBatch();
}
/** 函数功能
： 触发并发执行并一直等到所有任务执行完毕：计算每帧与其对应参考帧之间的帧间cost
/*
调用范围
： 只在Lookahead::slicetypeAnalyse和CostEstimateGroup::add函数中被调用
* 返回
： null * */
void CostEstimateGroup::finishBatch()
{
if (m_lookahead.m_pool)
tryBondPeers(*m_lookahead.m_pool, m_jobTotal);//在threadmain中触发相应processtask，只要是sleep状态的核都可以触发
processTasks(-1);
//在当前线程中计算cost
waitForExit();
//等待全部任务完成
m_jobTotal = m_jobAcquired = 0;
//全部完成将job数目记为0
}
/**
 * Worker entry point: repeatedly claim the next unclaimed job and execute it.
 *
 * In batch mode each job is one queued (p0, p1, b) frame-cost estimate. In
 * cooperative mode each job is one horizontal slice of the frame described by
 * m_coop, processed row by row from the bottom up.
 *
 * workerThreadID - pool worker index, or a negative value when called on the
 *                  thread that owns the group; that thread uses the TLD slot
 *                  at index m_numWorkers (or 0 when there is no pool)
 */
void CostEstimateGroup::processTasks(int workerThreadID)
{
    ThreadPool* pool = m_lookahead.m_pool;
    int id = workerThreadID;
    if (workerThreadID < 0)
        id = pool ? pool->m_numWorkers : 0;
    LookaheadTLD& tld = m_lookahead.m_tld[id];

    /* m_jobAcquired is shared between workers: claim jobs under the lock, run them outside it. */
    m_lock.acquire();
    while (m_jobAcquired < m_jobTotal)
    {
        int i = m_jobAcquired++;
        m_lock.release();

        if (m_batchMode)
        {
            ProfileLookaheadTime(tld.batchElapsedTime, tld.countBatches);
            ProfileScopeEvent(estCostSingle);

            Estimate& e = m_estimates[i];
            estimateFrameCost(tld, e.p0, e.p1, e.b, false);
        }
        else
        {
            /* Cooperative mode, set up by estimateFrameCost(): job i is slice i of the frame. */
            ProfileLookaheadTime(tld.coopSliceElapsedTime, tld.countCoopSlices);
            ProfileScopeEvent(estCostCoop);

            X265_CHECK(i < MAX_COOP_SLICES, "impossible number of coop slices\n");

            int firstY = m_lookahead.m_numRowsPerSlice * i;
            /* The last slice absorbs any rows left over after the even split. */
            int lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height - 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1;

            /* Rows are visited bottom-up; the first one visited is flagged as the slice's last row. */
            bool lastRow = true;
            for (int cuY = lastY; cuY >= firstY; cuY--)
            {
                m_frames[m_coop.b]->rowSatds[m_coop.b - m_coop.p0][m_coop.p1 - m_coop.b][cuY] = 0;

                for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--)
                    estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i);

                lastRow = false;
            }
        }

        m_lock.acquire();
    }
    m_lock.release();
}
/**
 * Return the estimated cost of frame b with references p0 and p1, computing
 * and caching it in fenc->costEst[b - p0][p1 - b] on first use.
 *
 * tld           - thread-local lookahead data of the calling thread
 * p0            - index of the forward reference frame
 * p1            - index of the backward reference frame
 * b             - index of the frame being estimated
 * bIntraPenalty - when true, add a penalty proportional to the number of
 *                 intra blocks (intraMbs[b - p0]) to the returned score
 *
 * Returns the frame cost; for B-frames (b != p1) the raw sum is scaled by
 * 100 / (130 + bFrameBias) before it is cached.
 */
int64_t CostEstimateGroup::estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool bIntraPenalty)
{
    Lowres*     fenc  = m_frames[b];
    x265_param* param = m_lookahead.m_param;
    int64_t     score = 0;

    /* Reuse the cached result when both the frame cost and the row costs are available. */
    if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
        score = fenc->costEst[b - p0][p1 - b];
    else
    {
        X265_CHECK(p0 != b, "I frame estimates should always be pre-calculated\n");

        /* A first MV with x == 0x7FFF means no motion search has been run yet for
         * that reference list and distance. */
        bool bDoSearch[2];
        bDoSearch[0] = p0 < b && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF;
        bDoSearch[1] = p1 > b && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF;

#if CHECKED_BUILD
        /* Checked builds tag searches in progress with 0x7FFE to catch duplicated work. */
        X265_CHECK(!(p0 < b && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFE), "motion search batch duplication L0\n");
        X265_CHECK(!(p1 > b && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFE), "motion search batch duplication L1\n");
        if (bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0x7FFE;
        if (bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0x7FFE;
#endif

        /* Weighted prediction is only analysed when a list-0 search is about to run. */
        tld.weightedRef.isWeighted = false;
        if (param->bEnableWeightedPred && bDoSearch[0])
            tld.weightsAnalyse(*m_frames[b], *m_frames[p0]);

        /* Both tables are indexed [b - p0][p1 - b]. */
        fenc->costEst[b - p0][p1 - b] = 0;
        fenc->costEstAq[b - p0][p1 - b] = 0;

        if (!m_batchMode && m_lookahead.m_numCoopSlices > 1 && ((p1 > b) || bDoSearch[0] || bDoSearch[1]))
        {
            /* Use cooperative mode if a thread pool is available and the cost estimate is
             * going to need motion searches or bidir measurements */
            memset(&m_slice, 0, sizeof(Slice) * m_lookahead.m_numCoopSlices);

            m_lock.acquire();
            X265_CHECK(!m_batchMode, "single CostEstimateGroup instance cannot mix batch modes\n");
            m_coop.p0 = p0;
            m_coop.p1 = p1;
            m_coop.b = b;
            m_coop.bDoSearch[0] = bDoSearch[0];
            m_coop.bDoSearch[1] = bDoSearch[1];
            m_jobTotal = m_lookahead.m_numCoopSlices; // one job per slice
            m_jobAcquired = 0;
            m_lock.release();

            tryBondPeers(*m_lookahead.m_pool, m_jobTotal);

            /* The calling thread processes slices too, then waits for the workers. */
            processTasks(-1);

            waitForExit();

            /* Merge the per-slice results. */
            for (int i = 0; i < m_lookahead.m_numCoopSlices; i++)
            {
                fenc->costEst[b - p0][p1 - b] += m_slice[i].costEst;
                fenc->costEstAq[b - p0][p1 - b] += m_slice[i].costEstAq;
                if (p1 == b)
                    fenc->intraMbs[b - p0] += m_slice[i].intraMbs;
            }
        }
        else
        {
            /* Single-threaded path: visit every 8x8 block bottom-up and right-to-left. */
            bool lastRow = true;
            for (int cuY = m_lookahead.m_8x8Height - 1; cuY >= 0; cuY--)
            {
                fenc->rowSatds[b - p0][p1 - b][cuY] = 0;

                for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--)
                    estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1);

                lastRow = false;
            }
        }

        score = fenc->costEst[b - p0][p1 - b];

        /* B-frames are scaled down; a larger bFrameBias lowers their cost further. */
        if (b != p1)
            score = score * 100 / (130 + param->bFrameBias);

        fenc->costEst[b - p0][p1 - b] = score;
    }

    if (bIntraPenalty)
        // arbitrary penalty for I-blocks after B-frames
        score += score * fenc->intraMbs[b - p0] / (tld.ncu * 8);

    return score;
}
/**
 * Estimate the lookahead cost of one lowres CU of frame `b` against its
 * references `p0` (list 0) and `p1` (list 1), and accumulate the result.
 *
 * For every list that still needs a search, a motion search is run and the
 * resulting MV and cost are cached in fenc->lowresMvs / fenc->lowresMvCosts;
 * otherwise the cached cost is reused. When b < p1 two bidirectional
 * candidates are tried as well; otherwise the CU's intra cost competes with
 * the inter cost. A fixed penalty of 4 is added to the inter/bidir cost.
 *
 * According to the original annotation this is only called from
 * CostEstimateGroup::processTasks and CostEstimateGroup::estimateFrameCost.
 *
 * @param tld        per-thread lookahead data (motion estimator, weighted reference)
 * @param cuX        X coordinate of the CU, in lowres CU units
 * @param cuY        Y coordinate of the CU, in lowres CU units
 * @param p0         index into m_frames of the list-0 (forward) reference
 * @param p1         index into m_frames of the list-1 (backward) reference
 * @param b          index into m_frames of the frame being estimated
 * @param bDoSearch  bDoSearch[i] is true when list i needs a motion search;
 *                   when false the cost already stored for that list is reused
 * @param lastRow    true when this CU is on the last row (as decided by the
 *                   caller); MV candidates from the row below are then skipped
 * @param slice      slice index whose m_slice[] accumulators receive the cost,
 *                   or a negative value to accumulate into the frame totals
 *
 * Returns nothing; all results are written into fenc and m_slice.
 */
void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice)
{
    Lowres *fref0 = m_frames[p0]; // list-0 (forward) reference
    Lowres *fref1 = m_frames[p1]; // list-1 (backward) reference
    Lowres *fenc  = m_frames[b];  // frame being estimated

    // Use the weighted reference planes for list 0 when tld.weightedRef is flagged as weighted
    ReferencePlanes *wfref0 = tld.weightedRef.isWeighted ? &tld.weightedRef : fref0;

    const int widthInCU = m_lookahead.m_8x8Width;   // lowres frame width in CUs
    const int heightInCU = m_lookahead.m_8x8Height; // lowres frame height in CUs
    const int bBidir = (b < p1);                    // a list-1 reference exists after b, so bidir is evaluated
    const int cuXY = cuX + cuY * widthInCU;         // raster index of this CU
    const int cuSize = X265_LOWRES_CU_SIZE;
    const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc->lumaStride; // pixel offset of the CU in the luma plane

    // The source block is only needed by the estimator when a search or a bidir SATD will run
    if (bBidir || bDoSearch[0] || bDoSearch[1])
        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);

    /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
    int lowresPenalty = 4;

    // Per-list distance index into lowresMvs / lowresMvCosts: 0 means the
    // adjacent frame in that direction, 1 the next one, and so on.
    int listDist[2] = { b - p0 - 1, p1 - b - 1 };

    MV mvmin, mvmax;             // motion search bounds
    int bcost = tld.me.COST_MAX; // best cost found so far
    int listused = 0;            // 0 = intra/none, 1 = list 0, 2 = list 1, 3 = bidir

    // establish search bounds that don't cross extended frame boundaries
    mvmin.x = (int16_t)(-cuX * cuSize - 8);
    mvmin.y = (int16_t)(-cuY * cuSize - 8);
    mvmax.x = (int16_t)((widthInCU - cuX - 1) * cuSize + 8);
    mvmax.y = (int16_t)((heightInCU - cuY - 1) * cuSize + 8);

    // Visit list 0, and list 1 as well when bidir is possible
    for (int i = 0; i < 1 + bBidir; i++)
    {
        int& fencCost = fenc->lowresMvCosts[i][listDist[i]][cuXY]; // cached cost slot for this list/distance/CU

        if (!bDoSearch[i])
        {
            // No search requested for this list: reuse the cached cost.
            // COPY2_IF_LT is defined elsewhere; per its name it keeps the lower
            // cost in bcost and records i + 1 in listused when it wins.
            COPY2_IF_LT(bcost, fencCost, listused, i + 1);
            continue;
        }

        int numc = 0;   // number of MV candidates collected
        MV mvc[4], mvp; // MV candidates and the chosen MV predictor
        MV* fencMV = &fenc->lowresMvs[i][listDist[i]][cuXY]; // MV slot for this list/distance/CU
        ReferencePlanes* fref = i ? fref1 : wfref0;          // list 1 uses fref1; list 0 uses the (possibly weighted) planes

        /* Reverse-order MV prediction */
        // Candidates come from the right, below, below-left and below-right
        // neighbours (the original annotation notes CUs are processed in
        // reverse order, so those MVs are already available).
#define MVC(mv) mvc[numc++] = mv;
        if (cuX < widthInCU - 1)
            MVC(fencMV[1]);
        if (!lastRow)
        {
            MVC(fencMV[widthInCU]);
            if (cuX > 0)
                MVC(fencMV[widthInCU - 1]);
            if (cuX < widthInCU - 1)
                MVC(fencMV[widthInCU + 1]);
        }
#undef MVC

        if (!numc)
            mvp = 0; // no neighbours available: predict a zero MV
        else
        {
            ALIGN_VAR_32(pixel, subpelbuf[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); // scratch block handed to lowresMC
            int mvpcost = MotionEstimate::COST_MAX; // best candidate cost so far

            /* measure SATD cost of each neighbor MV (estimating merge analysis)
             * and use the lowest cost MV as MVP (estimating AMVP). Since all
             * mvc[] candidates are measured here, none are passed to motionEstimate */
            for (int idx = 0; idx < numc; idx++)
            {
                // stride starts as the scratch-buffer stride; NOTE(review): lowresMC
                // is expected to update it to match the returned pointer - confirm in lowres.h
                intptr_t stride = X265_LOWRES_CU_SIZE;
                pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride);
                int cost = tld.me.bufSATD(src, stride);
                COPY2_IF_LT(mvpcost, cost, mvp, mvc[idx]);
            }
        }

        /* ME will never return a cost larger than the cost @MVP, so we do not
         * have to check that ME cost is more than the estimated merge cost */
        // The best MV is written to *fencMV and the returned cost is cached in fencCost
        fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV);
        COPY2_IF_LT(bcost, fencCost, listused, i + 1);
    }

    if (bBidir) /* B, also consider bidir */
    {
        /* NOTE: the wfref0 (weightp) is not used for BIDIR */

        /* avg(l0-mv, l1-mv) candidate */
        ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); // scratch for the list-0 prediction
        ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); // scratch for the list-1 prediction
        intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
        pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0);
        pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1);

        ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); // averaged list-0/list-1 prediction
        primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
        int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
        COPY2_IF_LT(bcost, bicost, listused, 3);

        /* coloc candidate */
        // Average the co-located (zero-motion) blocks of both references
        src0 = fref0->lowresPlane[0] + pelOffset;
        src1 = fref1->lowresPlane[0] + pelOffset;
        primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
        bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
        COPY2_IF_LT(bcost, bicost, listused, 3);

        bcost += lowresPenalty;
    }
    else /* P, also consider intra */
    {
        bcost += lowresPenalty;
        // Intra wins when its cost is lower than the penalised inter cost
        if (fenc->intraCost[cuXY] < bcost)
        {
            bcost = fenc->intraCost[cuXY];
            listused = 0;
        }
    }

    /* do not include edge blocks in the frame cost estimates, they are not very accurate */
    // Interior CUs are scored; every CU is scored when the frame is at most 2 CUs wide or tall
    const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
                                cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;

    // Scored CUs with an invQscaleFactor table get (bcost * factor + 128) >> 8; all others keep bcost
    int bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8) : bcost;

    if (bFrameScoreCU)
    {
        if (slice < 0)
        {
            // Frame-level accumulation, indexed by the distances to p0 and p1
            fenc->costEst[b - p0][p1 - b] += bcost;
            fenc->costEstAq[b - p0][p1 - b] += bcostAq;
            if (!listused && !bBidir)
                fenc->intraMbs[b - p0]++; // count CUs for which intra was the best choice
        }
        else
        {
            // Slice-level accumulation
            m_slice[slice].costEst += bcost;
            m_slice[slice].costEstAq += bcostAq;
            if (!listused && !bBidir)
                m_slice[slice].intraMbs++; // count CUs for which intra was the best choice
        }
    }

    // Row totals and per-CU costs are recorded for every CU, including edge CUs
    fenc->rowSatds[b - p0][p1 - b][cuY] += bcostAq;
    // Pack the cost (clamped to LOWRES_COST_MASK) together with listused shifted by LOWRES_COST_SHIFT
    fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
}