我是靠谱客的博主 外向电源,最近开发中收集的这篇文章主要介绍Fast gauss blur http://blog.ivank.net/fastest-gaussian-blur.html Fastest Gaussian Blur (in linear time),觉得挺不错的,现在分享给大家,希望可以做个参考。

概述

http://blog.ivank.net/fastest-gaussian-blur.html


Fastest Gaussian Blur (in linear time)

I needed really fast Gaussian blur for one of my projects. After hours of struggling and browsing the internet, I finally found the best solution.

Beginning

My solution is based on Fast image convolutions by Wojciech Jarosz. Presented ideas are very simple and I don't know who is the original author. I am going to describe it a little better and add some mathematics. To get motivated, take a glance at the results. I have implemented this code into Photopea under Filter - Blur - Gaussian Blur.

Definition

The convolution of two 2D functions  f  and  g  is defined as the volume of product of  f  and "shifted"  g . The second function  g  is sometimes called "weight", since it determines, how much of  f  will get into the result

The Gaussian blur of a 2D function can be defined as a convolution of that function with a 2D Gaussian function. Our Gaussian function has an integral of 1 (volume under the surface) and is uniquely defined by one parameter $\sigma$, called the standard deviation. We will also call it "radius" in the text below.

In our discrete finite case, we represent our 2D functions as matrices of values. We compute the volume (integral) as a sum. The Gaussian function has near-zero values beyond some radius, so we will use only the values $-r \le x \le r,\ -r \le y \le r$. This "useful" part of the weight is also called the kernel. The value of the convolution at [i, j] is the weighted average, i.e. the sum of function values around [i, j] multiplied by the weight.

Algorithm 1

For a general discrete convolution of  f  and weight function  w , we can compute the result  b as:

$$b[i,j] = \sum_{y=i-r}^{i+r} \sum_{x=j-r}^{j+r} f[y,x] \cdot w[y,x]$$

For a Gaussian weight, we only need the weights around [i, j] (an area of $4 \cdot r^2$). When our matrix has $n$ values, the time complexity is $O(n \cdot r^2)$. For large radii, e.g. $r = 10$, we have to do $n \cdot 400$ operations, which corresponds to 400 loops over the whole matrix, and that is ugly.

/**
 * Algorithm 1: exact Gaussian blur of one channel by direct 2D convolution.
 *
 * Reads `scl` and writes the rounded, normalised result into `tcl`; both are
 * row-major arrays of `w*h` values. Samples outside the image are taken from
 * the nearest edge pixel (indices are clamped).
 *
 * @param {ArrayLike<number>} scl source channel
 * @param {Array<number>|Uint8Array|Uint8ClampedArray} tcl target channel
 * @param {number} w width in pixels
 * @param {number} h height in pixels
 * @param {number} r standard deviation ("radius") of the Gaussian
 */
function gaussBlur_1 (scl, tcl, w, h, r) {
    // A non-positive radius would divide by zero in the weight formula and
    // fill the target with NaN; a zero-width Gaussian is the identity.
    if (!(r > 0)) {
        for (var p = 0; p < w * h; p++) tcl[p] = scl[p];
        return;
    }
    var rs = Math.ceil(r * 2.57);     // significant radius
    for (var i = 0; i < h; i++)
        for (var j = 0; j < w; j++) {
            var val = 0, wsum = 0;
            for (var iy = i - rs; iy < i + rs + 1; iy++)
                for (var ix = j - rs; ix < j + rs + 1; ix++) {
                    // clamp the sample position to the image
                    var x = Math.min(w - 1, Math.max(0, ix));
                    var y = Math.min(h - 1, Math.max(0, iy));
                    var dsq = (ix - j) * (ix - j) + (iy - i) * (iy - i);
                    var wght = Math.exp(-dsq / (2 * r * r)) / (Math.PI * 2 * r * r);
                    val += scl[y * w + x] * wght;
                    wsum += wght;
                }
            // dividing by the accumulated weight keeps the kernel normalised
            tcl[i * w + j] = Math.round(val / wsum);
        }
}

Algorithm 2

Let's introduce the box blur. It is the convolution of function  f  and weight  w , but weight is constant and lies within a square (box). The nice feature of box blur is, that when you have some weight function having the same variance, it converges to gaussian blur after several passes.

In this algorithm, we will simulate the Gaussian blur with 3 passes of box blur. Let's denote half of the size of the square as $br$ ("box radius"). The constant value of the weight is $1/(2 \cdot br)^2$ (so the sum over the whole weight is 1). We can define box blur as:

$$bb[i,j] = \sum_{y=i-br}^{i+br} \sum_{x=j-br}^{j+br} f[y,x] / (2 \cdot br)^2$$

We have to convert the standard deviation of the Gaussian blur $r$ into the dimensions of the boxes for box blur. I am not very good at calculus, but fortunately I have found this website and used their implementation.

/**
 * Computes the widths of `n` successive box filters whose combined effect
 * approximates a Gaussian of standard deviation `sigma`.
 *
 * The first `m` boxes get the lower odd width `wl`, the remaining ones the
 * next odd width `wu = wl + 2`.
 *
 * @param {number} sigma standard deviation
 * @param {number} n number of boxes
 * @returns {number[]} box widths (odd integers), length `n`
 */
function boxesForGauss(sigma, n)
{
    var wIdeal = Math.sqrt((12 * sigma * sigma / n) + 1);  // ideal averaging filter width
    var wl = Math.floor(wIdeal);
    if (wl % 2 == 0) wl--;                                 // widths must be odd
    var wu = wl + 2;

    var mIdeal = (12 * sigma * sigma - n * wl * wl - 4 * n * wl - 3 * n) / (-4 * wl - 4);
    var m = Math.round(mIdeal);
    // var sigmaActual = Math.sqrt( (m*wl*wl + (n-m)*wu*wu - n)/12 );

    var sizes = [];
    for (var i = 0; i < n; i++) sizes.push(i < m ? wl : wu);
    return sizes;
}

Our algorithm still has the same complexity $O(n \cdot r^2)$, but it has two advantages: first, the area is much smaller ($br$ is almost equal to $\sigma$, while the significant radius for a Gaussian is much larger). The second advantage is that the weight is constant. Even though we have to do it 3 times, it performs faster.

/**
 * Algorithm 2: approximates a Gaussian blur with three passes of a 2D box blur.
 *
 * The passes ping-pong between the two buffers, so `scl` is overwritten as
 * scratch space; the final result ends up in `tcl`.
 *
 * @param {Array<number>} scl source channel (modified)
 * @param {Array<number>} tcl target channel
 * @param {number} w width in pixels
 * @param {number} h height in pixels
 * @param {number} r standard deviation of the Gaussian to approximate
 */
function gaussBlur_2 (scl, tcl, w, h, r) {
    var bxs = boxesForGauss(r, 3);
    // box widths are odd, so (width - 1) / 2 is an integer box radius
    boxBlur_2 (scl, tcl, w, h, (bxs[0] - 1) / 2);
    boxBlur_2 (tcl, scl, w, h, (bxs[1] - 1) / 2);
    boxBlur_2 (scl, tcl, w, h, (bxs[2] - 1) / 2);
}
/**
 * Direct 2D box blur: every output value is the mean of the
 * (2r+1) x (2r+1) square around it, with out-of-image samples clamped to the
 * nearest edge pixel.
 *
 * @param {ArrayLike<number>} scl source channel
 * @param {Array<number>} tcl target channel
 * @param {number} w width in pixels
 * @param {number} h height in pixels
 * @param {number} r box radius
 */
function boxBlur_2 (scl, tcl, w, h, r) {
    for (var i = 0; i < h; i++)
        for (var j = 0; j < w; j++) {
            var val = 0;
            for (var iy = i - r; iy < i + r + 1; iy++)
                for (var ix = j - r; ix < j + r + 1; ix++) {
                    var x = Math.min(w - 1, Math.max(0, ix));
                    var y = Math.min(h - 1, Math.max(0, iy));
                    val += scl[y * w + x];
                }
            tcl[i * w + j] = val / ((r + r + 1) * (r + r + 1));
        }
}

Algorithm 3

We have already simplified gaussian blur into 3 passes of box blur. Let's do a little experiment. Let's define a horizontal blur and total blur:

$$b_h[i,j] = \sum_{x=j-br}^{j+br} f[i,x] / (2 \cdot br) \qquad b_t[i,j] = \sum_{y=i-br}^{i+br} b_h[y,j] / (2 \cdot br)$$

Those two functions are "looping" in a line, producing "one-dimensional blur". Let's see, what total blur corresponds to:

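$$b_t[i,j] = \sum_{y=i-br}^{i+br} b_h[y,j] / (2 \cdot br) = \sum_{y=i-br}^{i+br} \Big( \sum_{x=j-br}^{j+br} f[y,x] / (2 \cdot br) \Big) / (2 \cdot br) = \sum_{y=i-br}^{i+br} \sum_{x=j-br}^{j+br} f[y,x] / (2 \cdot br)^2$$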

We just discovered, that our total blur is box blur! Both total blur and horizontal blur have a complexity  O(nr) , so the whole box blur has  O(nr) .

/**
 * Algorithm 3: approximates a Gaussian blur with three passes of a separable
 * (horizontal + vertical) box blur.
 *
 * The passes ping-pong between the two buffers, so `scl` is overwritten as
 * scratch space; the final result ends up in `tcl`.
 *
 * @param {Array<number>} scl source channel (modified)
 * @param {Array<number>} tcl target channel
 * @param {number} w width in pixels
 * @param {number} h height in pixels
 * @param {number} r standard deviation of the Gaussian to approximate
 */
function gaussBlur_3 (scl, tcl, w, h, r) {
    var bxs = boxesForGauss(r, 3);
    // box widths are odd, so (width - 1) / 2 is an integer box radius
    boxBlur_3 (scl, tcl, w, h, (bxs[0] - 1) / 2);
    boxBlur_3 (tcl, scl, w, h, (bxs[1] - 1) / 2);
    boxBlur_3 (scl, tcl, w, h, (bxs[2] - 1) / 2);
}
/**
 * Separable box blur: a horizontal 1D pass followed by a vertical 1D pass.
 *
 * `scl` is used as the intermediate buffer (it receives the horizontal
 * result), so its contents are overwritten; the final result is in `tcl`.
 *
 * @param {Array<number>} scl source channel (modified)
 * @param {Array<number>} tcl target channel
 * @param {number} w width in pixels
 * @param {number} h height in pixels
 * @param {number} r box radius
 */
function boxBlur_3 (scl, tcl, w, h, r) {
    // copy the input into tcl so that scl can serve as the intermediate buffer
    for (var i = 0; i < scl.length; i++) tcl[i] = scl[i];
    boxBlurH_3(tcl, scl, w, h, r);
    boxBlurT_3(scl, tcl, w, h, r);
}
/**
 * Horizontal 1D box blur: each output value is the mean of the 2r+1 values
 * around it in the same row, with out-of-row samples clamped to the row ends.
 *
 * @param {ArrayLike<number>} scl source channel
 * @param {Array<number>} tcl target channel
 * @param {number} w width in pixels
 * @param {number} h height in pixels
 * @param {number} r box radius
 */
function boxBlurH_3 (scl, tcl, w, h, r) {
    for (var i = 0; i < h; i++)
        for (var j = 0; j < w; j++) {
            var val = 0;
            for (var ix = j - r; ix < j + r + 1; ix++) {
                var x = Math.min(w - 1, Math.max(0, ix));
                val += scl[i * w + x];
            }
            tcl[i * w + j] = val / (r + r + 1);
        }
}
/**
 * Vertical 1D box blur (the "total" pass): each output value is the mean of
 * the 2r+1 values around it in the same column, with out-of-column samples
 * clamped to the column ends.
 *
 * @param {ArrayLike<number>} scl source channel
 * @param {Array<number>} tcl target channel
 * @param {number} w width in pixels
 * @param {number} h height in pixels
 * @param {number} r box radius
 */
function boxBlurT_3 (scl, tcl, w, h, r) {
    for (var i = 0; i < h; i++)
        for (var j = 0; j < w; j++) {
            var val = 0;
            for (var iy = i - r; iy < i + r + 1; iy++) {
                var y = Math.min(h - 1, Math.max(0, iy));
                val += scl[y * w + j];
            }
            tcl[i * w + j] = val / (r + r + 1);
        }
}

Algorithm 4

One-dimensional blur can be computed even faster. E.g. we want to compute the horizontal blur. We compute $b_h[i,j], b_h[i,j+1], b_h[i,j+2], \ldots$. But the neighboring values $b_h[i,j]$ and $b_h[i,j+1]$ are almost the same. The only difference is in one left-most value and one right-most value. So $b_h[i,j+1] = b_h[i,j] + f[i,j+r+1] - f[i,j-r]$.

In our new algorithm, we will compute the one-dimensional blur by creating the accumulator. First, we put the value of left-most cell into it. Then we will compute next values just by editing the previous value in constant time. This 1D blur has the complexity  O(n) (independent on  r ). But it is performed twice to get box blur, which is performed 3 times to get gaussian blur. So the complexity of this gaussian blur is 6 *  O(n) .

/**
 * Algorithm 4: approximates a Gaussian blur with three passes of a separable
 * box blur whose 1D passes use a running accumulator.
 *
 * The passes ping-pong between the two buffers, so `scl` is overwritten as
 * scratch space; the final result ends up in `tcl`.
 *
 * @param {Array<number>} scl source channel (modified)
 * @param {Array<number>} tcl target channel
 * @param {number} w width in pixels
 * @param {number} h height in pixels
 * @param {number} r standard deviation of the Gaussian to approximate
 */
function gaussBlur_4 (scl, tcl, w, h, r) {
    var bxs = boxesForGauss(r, 3);
    // box widths are odd, so (width - 1) / 2 is an integer box radius
    boxBlur_4 (scl, tcl, w, h, (bxs[0] - 1) / 2);
    boxBlur_4 (tcl, scl, w, h, (bxs[1] - 1) / 2);
    boxBlur_4 (scl, tcl, w, h, (bxs[2] - 1) / 2);
}
/**
 * Separable box blur built from the accumulator-based 1D passes: horizontal
 * first, then vertical.
 *
 * `scl` is used as the intermediate buffer (it receives the horizontal
 * result), so its contents are overwritten; the final result is in `tcl`.
 *
 * @param {Array<number>} scl source channel (modified)
 * @param {Array<number>} tcl target channel
 * @param {number} w width in pixels
 * @param {number} h height in pixels
 * @param {number} r box radius
 */
function boxBlur_4 (scl, tcl, w, h, r) {
    // copy the input into tcl so that scl can serve as the intermediate buffer
    for (var i = 0; i < scl.length; i++) tcl[i] = scl[i];
    boxBlurH_4(tcl, scl, w, h, r);
    boxBlurT_4(scl, tcl, w, h, r);
}
/**
 * Horizontal 1D box blur with a running accumulator.
 *
 * The window sum is updated per pixel by adding the value entering on the
 * right and subtracting the value leaving on the left. Positions outside the
 * row are clamped to the first/last pixel of that row, so the function also
 * works when the window (2r+1) is wider than the row; the earlier
 * three-phase loop read and wrote past the row in that case.
 *
 * @param {ArrayLike<number>} scl source channel
 * @param {Array<number>} tcl target channel (must not alias scl)
 * @param {number} w width in pixels
 * @param {number} h height in pixels
 * @param {number} r box radius (non-negative integer)
 */
function boxBlurH_4 (scl, tcl, w, h, r) {
    var iarr = 1 / (r + r + 1);
    var last = w - 1;
    for (var i = 0; i < h; i++) {
        var ti = i * w;
        // Window centred on j = -1: r+1 clamped copies of the first value
        // plus the first r values of the row.
        var val = (r + 1) * scl[ti];
        for (var k = 0; k < r; k++) val += scl[ti + Math.min(k, last)];
        for (var j = 0; j < w; j++) {
            // slide right: j+r enters the window, j-r-1 leaves it
            val += scl[ti + Math.min(j + r, last)] - scl[ti + Math.max(j - r - 1, 0)];
            tcl[ti + j] = Math.round(val * iarr);
        }
    }
}
/**
 * Vertical 1D box blur with a running accumulator.
 *
 * The window sum is updated per pixel by adding the value entering at the
 * bottom and subtracting the value leaving at the top. Positions outside the
 * column are clamped to its first/last pixel, so the function also works when
 * the window (2r+1) is taller than the image; the earlier three-phase loop
 * read and wrote past the column in that case.
 *
 * @param {ArrayLike<number>} scl source channel
 * @param {Array<number>} tcl target channel (must not alias scl)
 * @param {number} w width in pixels
 * @param {number} h height in pixels
 * @param {number} r box radius (non-negative integer)
 */
function boxBlurT_4 (scl, tcl, w, h, r) {
    var iarr = 1 / (r + r + 1);
    var last = h - 1;
    for (var i = 0; i < w; i++) {
        // Window centred on row -1: r+1 clamped copies of the top value plus
        // the first r values of the column.
        var val = (r + 1) * scl[i];
        for (var k = 0; k < r; k++) val += scl[i + Math.min(k, last) * w];
        for (var j = 0; j < h; j++) {
            // slide down: row j+r enters the window, row j-r-1 leaves it
            val += scl[i + Math.min(j + r, last) * w] - scl[i + Math.max(j - r - 1, 0) * w];
            tcl[i + j * w] = Math.round(val * iarr);
        }
    }
}

Results

I was testing all 4 algorithms on an image below (4 channels, 800x200 pixels). Here are the results:

Algorithm Time, r=5 Time, r=10 Time complexity
Algorithm 1 7 077 ms 27 021 ms $O(n \cdot r^2)$
Algorithm 1 (pre-computed weight) 2 452 ms 8 990 ms $O(n \cdot r^2)$
Algorithm 2 586 ms 2 437 ms $O(n \cdot r^2)$
Algorithm 3 230 ms 435 ms $O(n \cdot r)$
Algorithm 4 32 ms 34 ms $O(n)$

Note, that Alg 1 is computing the true Gaussian blur using gaussian kernel, while Alg 2,3,4 are only approximating it with 3 passes of box blur. The difference between Alg 2,3,4 is in complexity of computing box blur, their outputs are the same.


c code as follow:




//利用3个均值模糊 拟合 高斯模糊
//参考:http://blog.ivank.net/fastest-gaussian-blur.html
//横向的均值模糊 srcPix:原始的像素值 destPix将处理过的像素值放入到 destPix中 
/*
 * Horizontal box (mean) blur.
 *
 * Reads rows from srcPix (row stride srcLineBytes) and writes the blurred rows
 * to destPix (row stride dstLineBytes). Pixels are stepped 4 bytes at a time
 * and only bytes 0..2 of each pixel are read and written; byte 3 of the
 * destination is left untouched.
 *
 * Each output pixel is the mean of the source pixels in
 * [x - radius, x + radius] intersected with the row, i.e. the window is
 * truncated (not clamped) at the row ends and the divisor shrinks with it.
 *
 * Fixes over the previous version:
 *  - no reads of the uninitialised locals `color` / `preColor`;
 *  - the right-edge pass now drops a different pixel on every step (the old
 *    loop never advanced its "leaving pixel" pointer);
 *  - rows narrower than 2*radius+1 no longer read past the end of the row.
 */
void boxBlurH(TByte* srcPix, TByte* destPix, int w, int h, int srcLineBytes, int dstLineBytes,int radius)
{
    const int bitcount = 4; /* byte step between neighbouring pixels */

    if (!srcPix || !destPix || w <= 0 || h <= 0 || radius < 0)
        return;

    for (int i = 0; i < h; ++i)
    {
        const TByte* pRow = srcPix + i*srcLineBytes;
        TByte* pDst = destPix + i*dstLineBytes;

        /* running channel sums over source pixels [lo, hi) */
        int r = 0, g = 0, b = 0;
        int lo = 0;
        int hi = 0;

        for (int j = 0; j < w; ++j)
        {
            /* window for output pixel j, written so that j+radius cannot overflow */
            const int first = (j > radius) ? j - radius : 0;
            const int last = (radius < w - 1 - j) ? j + radius : w - 1;
            const TByte* p;
            float iarr;

            while (hi <= last)
            {
                p = pRow + hi*bitcount;
                r += p[0];
                g += p[1];
                b += p[2];
                ++hi;
            }
            while (lo < first)
            {
                p = pRow + lo*bitcount;
                r -= p[0];
                g -= p[1];
                b -= p[2];
                ++lo;
            }

            /* weight = 1 / number of pixels currently in the window */
            iarr = 1.0 / (1.0 * (hi - lo));

            pDst[0] = (int)(r * iarr);
            pDst[1] = (int)(g * iarr);
            pDst[2] = (int)(b * iarr);
            pDst = pDst+bitcount;
        }
    }
}




//列的均值模糊 srcPix:原始的像素值 destPix将处理过的像素值放入到 destPix中 
/*
 * Vertical box (mean) blur.
 *
 * Reads columns from srcPix (row stride srcLineBytes) and writes the blurred
 * columns to destPix (row stride dstLineBytes). Columns are 4 bytes apart and
 * only bytes 0..2 of each pixel are read and written; byte 3 of the
 * destination is left untouched.
 *
 * Each output pixel is the mean of the source pixels in
 * [y - radius, y + radius] intersected with the column, i.e. the window is
 * truncated (not clamped) at the top and bottom and the divisor shrinks with it.
 *
 * Fixes over the previous version:
 *  - the bottom-edge pass now drops a different pixel on every step (the old
 *    loop never advanced its "leaving pixel" pointer);
 *  - images shorter than 2*radius+1 no longer read past the last row.
 */
void boxBlurV(TByte* srcPix, TByte* destPix, int w, int h, int srcLineBytes, int dstLineBytes,int radius)
{
    const int bitcount = 4; /* byte step between neighbouring columns */

    if (!srcPix || !destPix || w <= 0 || h <= 0 || radius < 0)
        return;

    for (int i = 0; i < w; ++i)
    {
        const TByte* pCol = srcPix + i*bitcount;
        TByte* pDst = destPix + i*bitcount;

        /* running channel sums over source rows [lo, hi) */
        int r = 0, g = 0, b = 0;
        int lo = 0;
        int hi = 0;

        for (int j = 0; j < h; ++j)
        {
            /* window for output row j, written so that j+radius cannot overflow */
            const int first = (j > radius) ? j - radius : 0;
            const int last = (radius < h - 1 - j) ? j + radius : h - 1;
            const TByte* p;
            float iarr;

            while (hi <= last)
            {
                p = pCol + hi*srcLineBytes;
                r += p[0];
                g += p[1];
                b += p[2];
                ++hi;
            }
            while (lo < first)
            {
                p = pCol + lo*srcLineBytes;
                r -= p[0];
                g -= p[1];
                b -= p[2];
                ++lo;
            }

            /* weight = 1 / number of pixels currently in the window */
            iarr = 1.0 / (1.0 * (hi - lo));

            pDst[0] = (int)(r * iarr);
            pDst[1] = (int)(g * iarr);
            pDst[2] = (int)(b * iarr);
            pDst = pDst+dstLineBytes;
        }
    }
}


/*
 * Full box blur: a horizontal pass from srcPix into destPix followed by a
 * vertical pass from destPix back into srcPix, so the blurred image ends up
 * in srcPix and destPix only holds the intermediate result.
 *
 * srcLineBytes is the row stride of srcPix and dstLineBytes the row stride of
 * destPix. The vertical pass reads destPix and writes srcPix, so it must be
 * given the strides in the opposite order; the previous version passed them
 * unswapped, which is only correct when both strides happen to be equal.
 *
 * A negative radius is logged and ignored.
 */
void boxBlur(TByte* srcPix, TByte* destPix, int w, int h, int srcLineBytes, int dstLineBytes,int r)
{
    if (r < 0)
    {
        LOGD("boxBlur r < 0: %d", r);
        return;
    }

    boxBlurH(srcPix, destPix, w, h, srcLineBytes, dstLineBytes, r);
    boxBlurV(destPix, srcPix, w, h, dstLineBytes, srcLineBytes, r);
}


//领用n 个 box 拟合 sigma的高斯函数
//参考:http://www.csse.uwa.edu.au/~pk/research/pkpapers/FastGaussianSmoothing.pdf
/*
 * Fills size[0..n-1] with the widths of n box filters that together
 * approximate a Gaussian of standard deviation sigma: the first m entries get
 * the lower odd width, the rest get that width plus two.
 * Reference: http://www.csse.uwa.edu.au/~pk/research/pkpapers/FastGaussianSmoothing.pdf
 */
void boxesForGauss(float sigma, int* size, int n)
{
    float idealWidth = sqrt(12.0 * sigma * sigma / n + 1.0);
    int lowerWidth = floor(idealWidth);
    int upperWidth;
    float idealCount;
    int lowerCount;
    int idx;

    /* box widths have to be odd */
    if (lowerWidth % 2 == 0)
    {
        lowerWidth -= 1;
    }
    upperWidth = lowerWidth + 2;

    idealCount = (12.0 * sigma * sigma - n * lowerWidth * lowerWidth - 4 * n * lowerWidth - 3 * n) / (-4 * lowerWidth - 4);
    /* NOTE(review): BOUND is defined elsewhere; it is used here to round idealCount - confirm its range */
    lowerCount = BOUND(idealCount+0.5);

    for (idx = 0; idx < n; ++idx)
    {
        if (idx < lowerCount)
        {
            size[idx] = lowerWidth;
        }
        else
        {
            size[idx] = upperWidth;
        }
    }
}


/*
 * Approximate Gaussian blur of pSrcBitmap into pDstBitmap using three box-blur
 * passes.
 *
 * The source is first copied into pDstBitmap, which is then blurred in place;
 * a temporary bitmap with the source's format and size serves as scratch space.
 * r is treated as the significant radius: sigma = r / 2.57, because a Gaussian
 * contributes almost nothing beyond 2.57 * sigma.
 *
 * Returns 0 on success, -1 if the scratch bitmap could not be allocated (in
 * that case pDstBitmap is not modified).
 */
int QBB_FastGaussBoxBlur(TBITMAP *pSrcBitmap , TBITMAP *pDstBitmap,int r)
{
    float sigma = 1.0 * r / 2.57;
    TBITMAP stempBitmap = {0};

    /* Three box widths; a fixed local array avoids the previously unchecked malloc. */
    int boxR[3];
    const int boxSize = (int)(sizeof(boxR) / sizeof(boxR[0]));
    int i;

    stempBitmap.dwPixelArrayFormat = pSrcBitmap->dwPixelArrayFormat;
    stempBitmap.lWidth = pSrcBitmap->lWidth;
    stempBitmap.lHeight = pSrcBitmap->lHeight;
    TUtilsBitmapAlloc(&stempBitmap);
    /* NOTE(review): assumes a failed TUtilsBitmapAlloc leaves pPlane[0] null
       (the struct is zero-initialised above) - confirm against the allocator. */
    if (!stempBitmap.pPlane[0])
    {
        return -1;
    }

    BitmapCopy(pSrcBitmap,pDstBitmap);

    /* widths of the boxes that approximate the Gaussian */
    boxesForGauss(sigma, boxR, boxSize);

    /* box widths are odd, so (width - 1) / 2 is the box radius */
    for (i = 0; i < boxSize; ++i)
    {
        boxBlur(pDstBitmap->pPlane[0], stempBitmap.pPlane[0], pDstBitmap->lWidth, pDstBitmap->lHeight, pDstBitmap->lPitch[0], stempBitmap.lPitch[0], (boxR[i] - 1) / 2);
    }

    TUtilsBitmapFree(&stempBitmap);
    return 0;
}


//void gaussBlur2(TByte* pix, int w, int h, int srcLineBytes, int dstLineBytes,int r)
//{
// float sigma = 1.0 * r / 2.57; //2.57 *sigam半径之后基本没有贡献 所以取sigma为 r / 2.57
//
// int boxSize = 3;
// int* boxR = (int*)malloc(sizeof(int) * boxSize); //需要的个数
//
// //计算拟合的半径
// boxesForGauss(sigma, boxR, boxSize);
//
// int* tempPix = (int*)malloc(sizeof(int) * w * h);
//
// boxBlur(pix, tempPix, w, h, srcLineBytes,dstLineBytes,(boxR[0] - 1) / 2);
// boxBlur(pix, tempPix, w, h,  srcLineBytes,dstLineBytes,(boxR[1] - 1) / 2);
// boxBlur(pix, tempPix, w, h,  srcLineBytes,dstLineBytes,(boxR[2] - 1) / 2);
//
// //清理内存
// free(boxR);
// free(tempPix);
//}


/*
 * Horizontal bilateral-filter pass over rows [rc.top, rc.bottom).
 *
 * For every pixel, lSize neighbours are sampled at the byte offsets in
 * space_of_horizontal; each neighbour is weighted by space_weight[k] times
 * color_weight[|db| + |dg| + |dr|], and the weighted mean of bytes 0..2 is
 * written to pDstBitmap. When pMask is given, pixels whose mask byte is zero
 * are copied through unfiltered. space_of_vertical is not used by this pass.
 *
 * Note: pSrcBitmap is expected to be a crop of a border-padded image (2*R
 * larger than pDstBitmap), so that the neighbour offsets stay inside it.
 * Always returns 0.
 */
TRESULT Func_SkinSoft_BilateralFilterBorderH(TBITMAP*pSrcBitmap,TBITMAP   *pDstBitmap,TBITMAP *pMask,TInt32 bitcount,TRECT rc,TLong lSize,float *space_weight,float *color_weight ,int *space_of_horizontal ,int *space_of_vertical)
{
    const int rowWidth = pSrcBitmap->lWidth;
    const int tapCount = lSize;
    const TInt32 firstRow = rc.top;
    const TInt32 endRow = rc.bottom;
    int row;

    for (row = firstRow; row < endRow; row++)
    {
        TByte *srcPx = pSrcBitmap->pPlane[0] + row*pSrcBitmap->lPitch[0];
        TByte *dstPx = pDstBitmap->pPlane[0] + row*pDstBitmap->lPitch[0];
        TByte *maskPx = TNull;
        int col;

        if (pMask)
        {
            maskPx = pMask->pPlane[0] + row*pMask->lPitch[0];
        }

        for (col = 0; col < rowWidth; col++)
        {
            /* centre pixel; also the output when the mask excludes this pixel */
            int b0 = srcPx[0], g0 = srcPx[1], r0 = srcPx[2];

            if (!pMask || (*maskPx))
            {
                float sumb = 0, sumg = 0, sumr = 0, wsum = 0;
                int tap;

                for (tap = 0; tap < tapCount; tap++)
                {
                    const TByte *nb = srcPx + space_of_horizontal[tap];
                    const int b = nb[0], g = nb[1], r = nb[2];
                    const float wvalue = space_weight[tap]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];

                    sumb += b*wvalue;
                    sumg += g*wvalue;
                    sumr += r*wvalue;
                    wsum += wvalue;
                }

                /* normalise and round to nearest */
                wsum = 1.0f/wsum;
                b0 = sumb*wsum+0.5;
                g0 = sumg*wsum+0.5;
                r0 = sumr*wsum+0.5;
            }

            dstPx[0] = (TByte)BOUND(b0);
            dstPx[1] = (TByte)BOUND(g0);
            dstPx[2] = (TByte)BOUND(r0);

            dstPx = dstPx+bitcount;
            srcPx = srcPx+bitcount;
            if (pMask)
            {
                maskPx ++;
            }
        }
    }
    return 0;
}






/*
 * Vertical bilateral-filter pass over rows [rc.top, rc.bottom).
 *
 * For every pixel, lSize neighbours are sampled at the byte offsets in
 * space_of_vertical; each neighbour is weighted by space_weight[k] times
 * color_weight[|db| + |dg| + |dr|], and the weighted mean of bytes 0..2 is
 * written to pDstBitmap. When pMask is given, pixels whose mask byte is zero
 * are copied through unfiltered. space_of_horizontal is not used by this pass.
 *
 * Note: pSrcBitmap is expected to be a crop of a border-padded image (2*R
 * larger than pDstBitmap), so that the neighbour offsets stay inside it.
 * Always returns 0.
 */
TRESULT Func_SkinSoft_BilateralFilterBorderV(TBITMAP*pSrcBitmap,TBITMAP   *pDstBitmap,TBITMAP *pMask,TInt32 bitcount,TRECT rc,TLong lSize,float *space_weight,float *color_weight ,int *space_of_horizontal ,int *space_of_vertical)
{
    const int rowWidth = pSrcBitmap->lWidth;
    const int tapCount = lSize;
    const TInt32 firstRow = rc.top;
    const TInt32 endRow = rc.bottom;
    int row;

    for (row = firstRow; row < endRow; row++)
    {
        TByte *srcPx = pSrcBitmap->pPlane[0] + row*pSrcBitmap->lPitch[0];
        TByte *dstPx = pDstBitmap->pPlane[0] + row*pDstBitmap->lPitch[0];
        TByte *maskPx = TNull;
        int col;

        if (pMask)
        {
            maskPx = pMask->pPlane[0] + row*pMask->lPitch[0];
        }

        for (col = 0; col < rowWidth; col++)
        {
            /* centre pixel; also the output when the mask excludes this pixel */
            int b0 = srcPx[0], g0 = srcPx[1], r0 = srcPx[2];

            if (!pMask || (*maskPx))
            {
                float sumb = 0, sumg = 0, sumr = 0, wsum = 0;
                int tap;

                for (tap = 0; tap < tapCount; tap++)
                {
                    const TByte *nb = srcPx + space_of_vertical[tap];
                    const int b = nb[0], g = nb[1], r = nb[2];
                    const float wvalue = space_weight[tap]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];

                    sumb += b*wvalue;
                    sumg += g*wvalue;
                    sumr += r*wvalue;
                    wsum += wvalue;
                }

                /* normalise and round to nearest */
                wsum = 1.0f/wsum;
                b0 = sumb*wsum+0.5;
                g0 = sumg*wsum+0.5;
                r0 = sumr*wsum+0.5;
            }

            dstPx[0] = (TByte)BOUND(b0);
            dstPx[1] = (TByte)BOUND(g0);
            dstPx[2] = (TByte)BOUND(r0);

            dstPx = dstPx+bitcount;
            srcPx = srcPx+bitcount;
            if (pMask)
            {
                maskPx ++;
            }
        }
    }

    return 0;
}




#ifdef MUTTILD_THREAD
/*
 * Argument bundle for one thread-pool task of the bilateral filter.
 * The task functions below receive a pointer to this struct through
 * sttask->task_arg and forward the fields, in this order, to
 * Func_SkinSoft_BilateralFilterBorderH / Func_SkinSoft_BilateralFilterBorderV.
 */
typedef struct tag_SkinSoft_BilateralFilterBorder_data{
TBITMAP *pSrcBitmap;        /* source bitmap passed to the filter pass */
TBITMAP  *pDstBitmap;       /* destination bitmap passed to the filter pass */
TBITMAP  *pMask;            /* optional mask; may be null */
TInt32 bitcount;            /* byte step between pixels (the dispatcher sets 4) */
TRECT rc;                   /* only rc.top / rc.bottom (row range of this task) are read by the passes */
TLong lSize;                /* number of kernel taps */
float *space_weight;        /* per-tap spatial weights */
float *color_weight;        /* weights indexed by summed absolute channel difference */
int *space_of_horizontal ;  /* per-tap byte offsets for the horizontal pass */
int *space_of_vertical;     /* per-tap byte offsets for the vertical pass */
TRESULT res;                /* set to TOK by the dispatcher; not written by the task functions shown here */
}SkinSoft_BilateralFilterBorder_data;


/*
 * Thread-pool entry point for one horizontal bilateral task: unpacks the
 * SkinSoft_BilateralFilterBorder_data carried in sttask->task_arg and runs
 * Func_SkinSoft_BilateralFilterBorderH on it. The filter's result code is
 * discarded; always returns 0.
 */
static int mtThreadSkinSoft_BilateralFilterBorderHSttask_tFun(qbbsttask_t *sttask)
{
    SkinSoft_BilateralFilterBorder_data *job = (SkinSoft_BilateralFilterBorder_data *)sttask->task_arg;

    Func_SkinSoft_BilateralFilterBorderH(
        job->pSrcBitmap,
        job->pDstBitmap,
        job->pMask,
        job->bitcount,
        job->rc,
        job->lSize,
        job->space_weight,
        job->color_weight,
        job->space_of_horizontal,
        job->space_of_vertical);

    return 0;
}


/*
 * Thread-pool entry point for one vertical bilateral task: unpacks the
 * SkinSoft_BilateralFilterBorder_data carried in sttask->task_arg and runs
 * Func_SkinSoft_BilateralFilterBorderV on it. The filter's result code is
 * discarded; always returns 0.
 */
static int mtThreadSkinSoft_BilateralFilterBorderVSttask_tFun(qbbsttask_t *sttask)
{
    SkinSoft_BilateralFilterBorder_data *job = (SkinSoft_BilateralFilterBorder_data *)sttask->task_arg;

    Func_SkinSoft_BilateralFilterBorderV(
        job->pSrcBitmap,
        job->pDstBitmap,
        job->pMask,
        job->bitcount,
        job->rc,
        job->lSize,
        job->space_weight,
        job->color_weight,
        job->space_of_horizontal,
        job->space_of_vertical);

    return 0;
}


#endif




/*
 * Runs one bilateral pass (horizontal when bVertical is 0, vertical otherwise)
 * from pSrcView into pDstBitmap over all `height` rows.
 *
 * With MUTTILD_THREAD defined and a thread pool supplied, the rows are split
 * into MUTTILD_TASK_NUM bands (band height rounded up to an even number) that
 * are queued on the pool and waited for. If the task array cannot be
 * allocated, the pass is executed on the calling thread instead of being
 * skipped.
 */
static void SkinSoft_RunBilateralPass(THandle TPThreadPool, int bVertical, TBITMAP *pSrcView, TBITMAP *pDstBitmap, TBITMAP *pMask, TInt32 bitcount, int width, int height, int maxk, float *space_weight, float *color_weight, int *space_of_horizontal, int *space_of_vertical)
{
    TRECT rc = {0};

#ifdef MUTTILD_THREAD
    if (TPThreadPool)
    {
        SkinSoft_BilateralFilterBorder_data *pmtData =
            (SkinSoft_BilateralFilterBorder_data *)TMemAlloc(TNull, sizeof(SkinSoft_BilateralFilterBorder_data)*MUTTILD_TASK_NUM);

        if (pmtData)
        {
            TInt32 nYStep = (height+MUTTILD_TASK_NUM-1)/MUTTILD_TASK_NUM;
            TInt32 nTop = 0;
            TInt32 i;

            if (nYStep & 0x1)
            {
                nYStep += 1;
            }
            TMemSet(pmtData, 0, sizeof(SkinSoft_BilateralFilterBorder_data)*MUTTILD_TASK_NUM);

            for (i = 0; i < MUTTILD_TASK_NUM; i++)
            {
                TInt32 nBottom = nTop + nYStep;
                nBottom = MIN(height, nBottom);

                pmtData[i].pSrcBitmap = pSrcView;
                pmtData[i].pDstBitmap = pDstBitmap;
                pmtData[i].pMask = pMask;
                pmtData[i].bitcount = bitcount;
                pmtData[i].rc.top = nTop;
                pmtData[i].rc.bottom = nBottom;
                pmtData[i].lSize = maxk;
                pmtData[i].space_weight = space_weight;
                pmtData[i].color_weight = color_weight;
                pmtData[i].space_of_horizontal = space_of_horizontal;
                pmtData[i].space_of_vertical = space_of_vertical;
                pmtData[i].res = TOK;

                if (bVertical)
                {
                    qbbtpool_addtask(TPThreadPool, mtThreadSkinSoft_BilateralFilterBorderVSttask_tFun ,(TVoid *)(&pmtData[i]));
                }
                else
                {
                    qbbtpool_addtask(TPThreadPool, mtThreadSkinSoft_BilateralFilterBorderHSttask_tFun ,(TVoid *)(&pmtData[i]));
                }
                nTop = nBottom;
            }

            /* the task array must outlive the tasks, so wait before freeing it */
            qbbtpool_task_waitall(TPThreadPool);
            TMemFree(TNull,pmtData);
            return;
        }
        /* allocation failed: fall through and run the pass on this thread */
    }
#else
    (void)TPThreadPool;
#endif

    rc.top = 0;
    rc.bottom = height;
    rc.left = 0;
    rc.right = width;

    if (bVertical)
    {
        Func_SkinSoft_BilateralFilterBorderV(pSrcView,pDstBitmap,pMask, bitcount, rc, maxk,space_weight,color_weight ,space_of_horizontal ,space_of_vertical);
    }
    else
    {
        Func_SkinSoft_BilateralFilterBorderH(pSrcView,pDstBitmap,pMask, bitcount, rc, maxk,space_weight,color_weight ,space_of_horizontal ,space_of_vertical);
    }
}

/*
 * Separable approximation of a bilateral filter: a horizontal pass followed by
 * a vertical pass, each with a 1D kernel of 2*iBilRadius+1 taps.
 *
 * pSrcBitmap is copied into a temporary bitmap padded by iBilRadius pixels on
 * every side (BitmapCopyBoder); the horizontal pass filters that into
 * pDstBitmap, pDstBitmap is padded again, and the vertical pass filters it
 * back into pDstBitmap. pMask (optional) is forwarded to both passes.
 *
 * When bSigmasSet is false the sigmas are derived from the kernel diameter d:
 * sigma_color = 2*d and sigma_space = d/2 (integer division, kept as in the
 * original); otherwise dsigma_color / dsigma_space are used.
 *
 * Returns 0 on success and -1 when iBilRadius is negative or an allocation
 * fails (previously an allocation failure led to a null-pointer dereference).
 */
int QBB_FastBilateralFilterBorder(THandle TPThreadPool,TBITMAP *pSrcBitmap , TBITMAP *pDstBitmap,TBITMAP *pMask,TInt32 iBilRadius,TDouble dsigma_color,TDouble dsigma_space,TBool bSigmasSet)
{
    double sigma_color=40.0;
    double sigma_space=10.0;
    int width = pSrcBitmap->lWidth;
    int height = pSrcBitmap->lHeight;
    int radius = iBilRadius;
    int cn = 3;          /* channels contributing to the colour distance */
    int bitcount = 4;    /* byte step between pixels */
    int d = radius*2 + 1;
    int step = 0;
    double gauss_color_coeff = 0;
    double gauss_space_coeff = 0;
    float *color_weight = TNull;
    float *space_weight = TNull;
    int *space_of_horizontal = TNull;
    int *space_of_vertical = TNull;
    TBITMAP sTempBitmap = {0};
    TBITMAP sTempBitmapcpy = {0};
    int i;
    int maxk;
    double ra;
    int result = -1;

#ifdef _TEST_PERFORMANCE_
    TDWord dwtime = TGetCurTimeStamp();
#endif

    if (radius < 0)
    {
        return -1;
    }

    if(!bSigmasSet)
    {
        sigma_color = d*2;
        sigma_space = d/2;
    }
    else
    {
        sigma_color = dsigma_color;
        sigma_space = dsigma_space;
    }
    gauss_color_coeff = -0.5/(sigma_color*sigma_color);
    gauss_space_coeff = -0.5/(sigma_space*sigma_space);

    space_weight = (float *)malloc(sizeof(float)*d);
    space_of_horizontal = (int *)malloc(sizeof(int)*d);
    space_of_vertical = (int *)malloc(sizeof(int)*d);
    color_weight = (float *)malloc(sizeof(float)*cn*256);
    if (!space_weight || !space_of_horizontal || !space_of_vertical || !color_weight)
    {
        goto CLEANUP;
    }

    /* scratch bitmap: the source size plus a border of `radius` pixels on each side */
    sTempBitmap.dwPixelArrayFormat = pSrcBitmap->dwPixelArrayFormat;
    sTempBitmap.lWidth = pSrcBitmap->lWidth+2*radius;
    sTempBitmap.lHeight = pSrcBitmap->lHeight+2*radius;
    TUtilsBitmapAlloc(&sTempBitmap);
    /* NOTE(review): assumes a failed TUtilsBitmapAlloc leaves pPlane[0] null
       (the struct is zero-initialised above) - confirm against the allocator. */
    if (!sTempBitmap.pPlane[0])
    {
        goto CLEANUP;
    }

    /* view of the scratch bitmap that skips the border; shares its pixel memory */
    sTempBitmapcpy.dwPixelArrayFormat = pSrcBitmap->dwPixelArrayFormat;
    sTempBitmapcpy.lWidth = pSrcBitmap->lWidth;
    sTempBitmapcpy.lHeight = pSrcBitmap->lHeight;
    sTempBitmapcpy.lPitch[0] = sTempBitmap.lPitch[0];
    sTempBitmapcpy.pPlane[0] = sTempBitmap.pPlane[0]+radius*sTempBitmap.lPitch[0]+radius*bitcount;
    step = sTempBitmapcpy.lPitch[0];

    /* colour weights, indexed by the summed absolute channel difference */
    for( i = 0; i < 256*cn; i++ )
        color_weight[i] = (float)exp(i*i*gauss_color_coeff);

    /* spatial weights and the byte offsets of each tap for both directions */
    for ( i=-radius,maxk=0;i<=radius;i++)
    {
        ra=abs(i);
        space_weight[maxk]=(float)exp(ra*ra*gauss_space_coeff);
        space_of_horizontal[maxk]=(int)(i*bitcount);
        space_of_vertical[maxk++]=(int)(i*step);
    }
#ifdef _TEST_PERFORMANCE_
    LOGI("QBB_FastBilateralFilterBorder1 dwtime=%drn",TGetCurTimeStamp()-dwtime);
#endif

    BitmapCopyBoder(pSrcBitmap,&sTempBitmap,radius,radius,radius,radius);
#ifdef _TEST_PERFORMANCE_
    LOGI("QBB_FastBilateralFilterBorder2 dwtime=%drn",TGetCurTimeStamp()-dwtime);
#endif

    /* horizontal pass: padded source -> pDstBitmap */
    SkinSoft_RunBilateralPass(TPThreadPool, 0, &sTempBitmapcpy, pDstBitmap, pMask, bitcount, width, height, maxk, space_weight, color_weight, space_of_horizontal, space_of_vertical);
#ifdef _TEST_PERFORMANCE_
    LOGI("QBB_FastBilateralFilterBorder3 dwtime=%drn",TGetCurTimeStamp()-dwtime);
#endif

    /* pad the horizontal result so the vertical pass can read beyond the edges */
    BitmapCopyBoder(pDstBitmap,&sTempBitmap,radius,radius,radius,radius);
#ifdef _TEST_PERFORMANCE_
    LOGI("QBB_FastBilateralFilterBorder4 dwtime=%drn",TGetCurTimeStamp()-dwtime);
#endif

    /* vertical pass: padded horizontal result -> pDstBitmap */
    SkinSoft_RunBilateralPass(TPThreadPool, 1, &sTempBitmapcpy, pDstBitmap, pMask, bitcount, width, height, maxk, space_weight, color_weight, space_of_horizontal, space_of_vertical);

    result = 0;

CLEANUP:
    /* free(NULL) is a no-op, so partially completed allocations are handled too */
    free(space_weight);
    free(space_of_horizontal);
    free(space_of_vertical);
    free(color_weight);

    if (sTempBitmap.pPlane[0])
    {
        TUtilsBitmapFree(&sTempBitmap);
    }

#ifdef _TEST_PERFORMANCE_
    LOGI("QBB_FastBilateralFilterBorder5 dwtime=%drn",TGetCurTimeStamp()-dwtime);
#endif
    return result;
}




/*
 * Horizontal pass of the separable Gaussian blur, unchecked-border variant.
 *
 * For every row in [rc.top, rc.bottom) and every pixel of that row, the
 * lSize kernel taps are read from the source at byte offsets
 * space_of_horizontal[0..lSize-1] relative to the current pixel and
 * accumulated in fixed point. No bounds checking is done: the caller must
 * guarantee that all of those offsets are readable (GaussianBorderBlur
 * passes a view into a bitmap padded on every side).
 *
 * pSrcBitmapBorder    source bitmap; lWidth/lPitch[0]/pPlane[0] are used
 * pDstBitmap          destination bitmap, written row by row
 * bitcount            bytes per pixel; 1 filters a single channel, any other
 *                     value filters bytes 0..2 of each pixel and leaves the
 *                     remaining bytes of the destination pixel untouched
 * rc                  only rc.top / rc.bottom are used (row band)
 * lSize               number of kernel taps
 * lCenter             unused here (kept for signature parity)
 * pdKernal_1          fixed-point kernel weights, lSize entries
 * space_of_horizontal byte offset of each tap relative to the current pixel
 * space_of_vertical   unused here (kept for signature parity)
 *
 * Always returns 0.
 */
TRESULT Func_SkinSoft_GaussianBorderBlurH(TBITMAP*pSrcBitmapBorder,TBITMAP   *pDstBitmap,TInt32 bitcount,TRECT rc,TLong lSize,TLong lCenter,TInt32 *pdKernal_1 ,int *space_of_horizontal ,int *space_of_vertical)
{
    TByte *pSrcData = TNull, *pDstData = TNull;
    TByte *pSrcDataTemp = TNull;
    TLong lWidth = pSrcBitmapBorder->lWidth;
    TInt32 lTop = rc.top;
    TInt32 lBottom = rc.bottom;
    TInt32 dSumR, dFilterR, dFilterG, dFilterB;
    TInt32 nLimit, i, j;

    (void)lCenter;
    (void)space_of_vertical;

    if (bitcount == 1)
    {
        for (i = lTop; i < lBottom; i++)
        {
            pSrcData = pSrcBitmapBorder->pPlane[0] + i*pSrcBitmapBorder->lPitch[0];
            pDstData = pDstBitmap->pPlane[0] + i*pDstBitmap->lPitch[0];

            for (j = 0; j < lWidth; j++)
            {
                dSumR = 0;
                dFilterR = 0;

                for (nLimit = 0; nLimit < lSize; nLimit++)
                {
                    pSrcDataTemp = pSrcData + space_of_horizontal[nLimit];
                    dFilterR += (pSrcDataTemp[0] * pdKernal_1[nLimit]);
                    dSumR += pdKernal_1[nLimit];
                }

                /* When the weights already add up to FF_ONE a fixed-point
                   conversion replaces the division. */
                if (dSumR != FF_ONE)
                {
                    pDstData[0] = BOUND(dFilterR/dSumR);
                }
                else
                {
                    pDstData[0] = BOUND(FF2INT(dFilterR));
                }

                pDstData = pDstData + bitcount;
                pSrcData = pSrcData + bitcount;
            }
        }
    }
    else
    {
        for (i = lTop; i < lBottom; i++)
        {
            pSrcData = pSrcBitmapBorder->pPlane[0] + i*pSrcBitmapBorder->lPitch[0];
            pDstData = pDstBitmap->pPlane[0] + i*pDstBitmap->lPitch[0];

            for (j = 0; j < lWidth; j++)
            {
                dSumR = 0;
                dFilterR = 0;
                dFilterG = 0;
                dFilterB = 0;

                for (nLimit = 0; nLimit < lSize; nLimit++)
                {
                    pSrcDataTemp = pSrcData + space_of_horizontal[nLimit];
                    dFilterR += (pSrcDataTemp[0] * pdKernal_1[nLimit]);
                    dFilterG += pSrcDataTemp[1] * pdKernal_1[nLimit];
                    dFilterB += pSrcDataTemp[2] * pdKernal_1[nLimit];
                    /* One weight sum serves all three channels. */
                    dSumR += pdKernal_1[nLimit];
                }

                if (dSumR != FF_ONE)
                {
                    pDstData[0] = BOUND(dFilterR/dSumR);
                    pDstData[1] = BOUND(dFilterG/dSumR);
                    pDstData[2] = BOUND(dFilterB/dSumR);
                }
                else
                {
                    pDstData[0] = BOUND(FF2INT(dFilterR));
                    pDstData[1] = BOUND(FF2INT(dFilterG));
                    pDstData[2] = BOUND(FF2INT(dFilterB));
                }

                pDstData = pDstData + bitcount;
                pSrcData = pSrcData + bitcount;
            }
        }
    }
    return 0;
}




/*
 * Vertical pass of the separable Gaussian blur, unchecked-border variant.
 *
 * For every row in [rc.top, rc.bottom) and every pixel of that row, the
 * lSize kernel taps are read from the source at byte offsets
 * space_of_vertical[0..lSize-1] relative to the current pixel and
 * accumulated in fixed point. No bounds checking is done: the caller must
 * guarantee that all of those offsets are readable (GaussianBorderBlur
 * passes a view into a bitmap padded on every side).
 *
 * pSrcBitmapBorder    source bitmap; lWidth/lPitch[0]/pPlane[0] are used
 * pDstBitmap          destination bitmap, written row by row
 * bitcount            bytes per pixel; 1 filters a single channel, any other
 *                     value filters bytes 0..2 of each pixel and leaves the
 *                     remaining bytes of the destination pixel untouched
 * rc                  only rc.top / rc.bottom are used (row band)
 * lSize               number of kernel taps
 * lCenter             unused here (kept for signature parity)
 * pdKernal_1          fixed-point kernel weights, lSize entries
 * space_of_horizontal unused here (kept for signature parity)
 * space_of_vertical   byte offset of each tap relative to the current pixel
 *
 * Always returns 0.
 */
TRESULT Func_SkinSoft_GaussianBorderBlurV(TBITMAP*pSrcBitmapBorder,TBITMAP   *pDstBitmap,TInt32 bitcount,TRECT rc,TLong lSize,TLong lCenter,TInt32 *pdKernal_1 ,int *space_of_horizontal ,int *space_of_vertical)
{
    TByte *pSrcData = TNull, *pDstData = TNull;
    TByte *pSrcDataTemp = TNull;
    TLong lWidth = pSrcBitmapBorder->lWidth;
    TInt32 lTop = rc.top;
    TInt32 lBottom = rc.bottom;
    TInt32 dSumR, dFilterR, dFilterG, dFilterB;
    TInt32 nLimit, i, j;

    (void)lCenter;
    (void)space_of_horizontal;

    if (bitcount == 1)
    {
        for (i = lTop; i < lBottom; i++)
        {
            pSrcData = pSrcBitmapBorder->pPlane[0] + i*pSrcBitmapBorder->lPitch[0];
            pDstData = pDstBitmap->pPlane[0] + i*pDstBitmap->lPitch[0];

            for (j = 0; j < lWidth; j++)
            {
                dSumR = 0;
                dFilterR = 0;

                for (nLimit = 0; nLimit < lSize; nLimit++)
                {
                    pSrcDataTemp = pSrcData + space_of_vertical[nLimit];
                    dFilterR += pSrcDataTemp[0] * pdKernal_1[nLimit];
                    dSumR += pdKernal_1[nLimit];
                }

                /* When the weights already add up to FF_ONE a fixed-point
                   conversion replaces the division. */
                if (dSumR != FF_ONE)
                {
                    pDstData[0] = BOUND(dFilterR/dSumR);
                }
                else
                {
                    pDstData[0] = BOUND(FF2INT(dFilterR));
                }

                pDstData = pDstData + bitcount;
                pSrcData = pSrcData + bitcount;
            }
        }
    }
    else
    {
        for (i = lTop; i < lBottom; i++)
        {
            pSrcData = pSrcBitmapBorder->pPlane[0] + i*pSrcBitmapBorder->lPitch[0];
            pDstData = pDstBitmap->pPlane[0] + i*pDstBitmap->lPitch[0];

            for (j = 0; j < lWidth; j++)
            {
                dSumR = 0;
                dFilterR = 0;
                dFilterG = 0;
                dFilterB = 0;

                for (nLimit = 0; nLimit < lSize; nLimit++)
                {
                    pSrcDataTemp = pSrcData + space_of_vertical[nLimit];
                    dFilterR += pSrcDataTemp[0] * pdKernal_1[nLimit];
                    dFilterG += pSrcDataTemp[1] * pdKernal_1[nLimit];
                    dFilterB += pSrcDataTemp[2] * pdKernal_1[nLimit];
                    /* One weight sum serves all three channels. */
                    dSumR += pdKernal_1[nLimit];
                }

                if (dSumR != FF_ONE)
                {
                    pDstData[0] = BOUND(dFilterR/dSumR);
                    pDstData[1] = BOUND(dFilterG/dSumR);
                    pDstData[2] = BOUND(dFilterB/dSumR);
                }
                else
                {
                    pDstData[0] = BOUND(FF2INT(dFilterR));
                    pDstData[1] = BOUND(FF2INT(dFilterG));
                    pDstData[2] = BOUND(FF2INT(dFilterB));
                }

                pDstData = pDstData + bitcount;
                pSrcData = pSrcData + bitcount;
            }
        }
    }

    return 0;
}


/*
 * Horizontal pass of the separable Gaussian blur with edge clipping.
 *
 * For every row in [rc.top, rc.bottom) and every column j, tap nLimit is
 * used only when lCenter <= j + nLimit < lWidth + lCenter, i.e. when the
 * tapped column (j + nLimit - lCenter) lies inside the image. The weights
 * of the taps actually used are summed and the accumulated value is divided
 * by that sum, so clipped kernels are renormalised near the left and right
 * edges.
 *
 * pSrcBitmap          source bitmap; lWidth/lPitch[0]/pPlane[0] are used
 * pDstBitmap          destination bitmap, written row by row
 * bitcount            bytes per pixel; 1 filters a single channel, any other
 *                     value filters bytes 0..2 of each pixel and leaves the
 *                     remaining bytes of the destination pixel untouched
 * rc                  only rc.top / rc.bottom are used (row band)
 * lSize               number of kernel taps
 * lCenter             index of the centre tap
 * pdKernal_1          fixed-point kernel weights, lSize entries
 * space_of_horizontal byte offset of each tap relative to the current pixel
 * space_of_vertical   unused here (kept for signature parity)
 *
 * Always returns 0.
 */
TRESULT Func_SkinSoft_GaussianBlurH(TBITMAP*pSrcBitmap,TBITMAP   *pDstBitmap,TInt32 bitcount,TRECT rc,TLong lSize,TLong lCenter,TInt32 *pdKernal_1 ,int *space_of_horizontal ,int *space_of_vertical)
{
    TByte *pSrcData = TNull, *pDstData = TNull;
    TByte *pSrcDataTemp = TNull;
    TLong lWidth = pSrcBitmap->lWidth;
    TInt32 lTop = rc.top;
    TInt32 lBottom = rc.bottom;
    /* Exclusive upper limit for (j + nLimit). */
    TInt32 wTemp = lWidth + lCenter;
    TInt32 dSumR, dFilterR, dFilterG, dFilterB;
    TInt32 nLimit, i, j;

    (void)space_of_vertical;

    if (bitcount == 1)
    {
        for (i = lTop; i < lBottom; i++)
        {
            pSrcData = pSrcBitmap->pPlane[0] + i*pSrcBitmap->lPitch[0];
            pDstData = pDstBitmap->pPlane[0] + i*pDstBitmap->lPitch[0];

            for (j = 0; j < lWidth; j++)
            {
                dSumR = 0;
                dFilterR = 0;

                for (nLimit = 0; nLimit < lSize; nLimit++)
                {
                    /* Skip taps that fall outside the row. */
                    if ((j+nLimit) >= lCenter && (j+nLimit) < wTemp)
                    {
                        pSrcDataTemp = pSrcData + space_of_horizontal[nLimit];
                        dFilterR += (pSrcDataTemp[0] * pdKernal_1[nLimit]);
                        dSumR += pdKernal_1[nLimit];
                    }
                }

                /* Full kernel: weights add up to FF_ONE, so a fixed-point
                   conversion replaces the division. */
                if (dSumR != FF_ONE)
                {
                    pDstData[0] = BOUND(dFilterR/dSumR);
                }
                else
                {
                    pDstData[0] = BOUND(FF2INT(dFilterR));
                }

                pDstData = pDstData + bitcount;
                pSrcData = pSrcData + bitcount;
            }
        }
    }
    else
    {
        for (i = lTop; i < lBottom; i++)
        {
            pSrcData = pSrcBitmap->pPlane[0] + i*pSrcBitmap->lPitch[0];
            pDstData = pDstBitmap->pPlane[0] + i*pDstBitmap->lPitch[0];

            for (j = 0; j < lWidth; j++)
            {
                dSumR = 0;
                dFilterR = 0;
                dFilterG = 0;
                dFilterB = 0;

                for (nLimit = 0; nLimit < lSize; nLimit++)
                {
                    /* Skip taps that fall outside the row. */
                    if ((j+nLimit) >= lCenter && (j+nLimit) < wTemp)
                    {
                        pSrcDataTemp = pSrcData + space_of_horizontal[nLimit];
                        dFilterR += (pSrcDataTemp[0] * pdKernal_1[nLimit]);
                        dFilterG += pSrcDataTemp[1] * pdKernal_1[nLimit];
                        dFilterB += pSrcDataTemp[2] * pdKernal_1[nLimit];
                        /* One weight sum serves all three channels. */
                        dSumR += pdKernal_1[nLimit];
                    }
                }

                if (dSumR != FF_ONE)
                {
                    pDstData[0] = BOUND(dFilterR/dSumR);
                    pDstData[1] = BOUND(dFilterG/dSumR);
                    pDstData[2] = BOUND(dFilterB/dSumR);
                }
                else
                {
                    pDstData[0] = BOUND(FF2INT(dFilterR));
                    pDstData[1] = BOUND(FF2INT(dFilterG));
                    pDstData[2] = BOUND(FF2INT(dFilterB));
                }

                pDstData = pDstData + bitcount;
                pSrcData = pSrcData + bitcount;
            }
        }
    }
    return 0;
}






/*
 * Vertical pass of the separable Gaussian blur with edge clipping.
 *
 * For every row i in [rc.top, rc.bottom), tap nLimit is used only when
 * lCenter <= i + nLimit < lHeight + lCenter, i.e. when the tapped row
 * (i + nLimit - lCenter) lies inside the image. The weights of the taps
 * actually used are summed and the accumulated value is divided by that
 * sum, so clipped kernels are renormalised near the top and bottom edges.
 *
 * pSrcBitmap          source bitmap; lWidth/lHeight/lPitch[0]/pPlane[0] are used
 * pDstBitmap          destination bitmap, written row by row
 * bitcount            bytes per pixel; 1 filters a single channel, any other
 *                     value filters bytes 0..2 of each pixel and leaves the
 *                     remaining bytes of the destination pixel untouched
 * rc                  only rc.top / rc.bottom are used (row band)
 * lSize               number of kernel taps
 * lCenter             index of the centre tap
 * pdKernal_1          fixed-point kernel weights, lSize entries
 * space_of_horizontal unused here (kept for signature parity)
 * space_of_vertical   byte offset of each tap relative to the current pixel
 *
 * Always returns 0.
 */
TRESULT Func_SkinSoft_GaussianBlurV(TBITMAP*pSrcBitmap,TBITMAP   *pDstBitmap,TInt32 bitcount,TRECT rc,TLong lSize,TLong lCenter,TInt32 *pdKernal_1 ,int *space_of_horizontal ,int *space_of_vertical)
{
    TByte *pSrcData = TNull, *pDstData = TNull;
    TByte *pSrcDataTemp = TNull;
    TLong lWidth = pSrcBitmap->lWidth;
    TLong lHeight = pSrcBitmap->lHeight;
    TInt32 lTop = rc.top;
    TInt32 lBottom = rc.bottom;
    /* Exclusive upper limit for (i + nLimit). */
    TInt32 wTemp = lHeight + lCenter;
    TInt32 dSumR, dFilterR, dFilterG, dFilterB;
    TInt32 nLimit, i, j;

    (void)space_of_horizontal;

    if (bitcount == 1)
    {
        for (i = lTop; i < lBottom; i++)
        {
            pSrcData = pSrcBitmap->pPlane[0] + i*pSrcBitmap->lPitch[0];
            pDstData = pDstBitmap->pPlane[0] + i*pDstBitmap->lPitch[0];

            for (j = 0; j < lWidth; j++)
            {
                dSumR = 0;
                dFilterR = 0;

                for (nLimit = 0; nLimit < lSize; nLimit++)
                {
                    /* Skip taps that fall above or below the image. */
                    if ((i+nLimit) >= lCenter && (i+nLimit) < wTemp)
                    {
                        pSrcDataTemp = pSrcData + space_of_vertical[nLimit];
                        dFilterR += pSrcDataTemp[0] * pdKernal_1[nLimit];
                        dSumR += pdKernal_1[nLimit];
                    }
                }

                /* Full kernel: weights add up to FF_ONE, so a fixed-point
                   conversion replaces the division. */
                if (dSumR != FF_ONE)
                {
                    pDstData[0] = BOUND(dFilterR/dSumR);
                }
                else
                {
                    pDstData[0] = BOUND(FF2INT(dFilterR));
                }

                pDstData = pDstData + bitcount;
                pSrcData = pSrcData + bitcount;
            }
        }
    }
    else
    {
        for (i = lTop; i < lBottom; i++)
        {
            pSrcData = pSrcBitmap->pPlane[0] + i*pSrcBitmap->lPitch[0];
            pDstData = pDstBitmap->pPlane[0] + i*pDstBitmap->lPitch[0];

            for (j = 0; j < lWidth; j++)
            {
                dSumR = 0;
                dFilterR = 0;
                dFilterG = 0;
                dFilterB = 0;

                for (nLimit = 0; nLimit < lSize; nLimit++)
                {
                    /* Skip taps that fall above or below the image. */
                    if ((i+nLimit) >= lCenter && (i+nLimit) < wTemp)
                    {
                        pSrcDataTemp = pSrcData + space_of_vertical[nLimit];
                        dFilterR += pSrcDataTemp[0] * pdKernal_1[nLimit];
                        dFilterG += pSrcDataTemp[1] * pdKernal_1[nLimit];
                        dFilterB += pSrcDataTemp[2] * pdKernal_1[nLimit];
                        /* One weight sum serves all three channels. */
                        dSumR += pdKernal_1[nLimit];
                    }
                }

                if (dSumR != FF_ONE)
                {
                    pDstData[0] = BOUND(dFilterR/dSumR);
                    pDstData[1] = BOUND(dFilterG/dSumR);
                    pDstData[2] = BOUND(dFilterB/dSumR);
                }
                else
                {
                    pDstData[0] = BOUND(FF2INT(dFilterR));
                    pDstData[1] = BOUND(FF2INT(dFilterG));
                    pDstData[2] = BOUND(FF2INT(dFilterB));
                }

                pDstData = pDstData + bitcount;
                pSrcData = pSrcData + bitcount;
            }
        }
    }

    return 0;
}


#ifdef MUTTILD_THREAD
/* Argument bundle for one thread-pool task of a Gaussian blur pass. The
   fields mirror the parameters of the Func_SkinSoft_Gaussian*Blur* passes. */
typedef struct tag_SkinSoft_GaussianBlur_data{
TBITMAP *pSrcBitmap; /* bitmap the pass reads from */
TBITMAP  *pDstBitmap; /* bitmap the pass writes to */
TInt32 bitcount; /* bytes per pixel handed to the pass */
TRECT rc; /* row band of this task; the passes read rc.top / rc.bottom */
TLong lSize; /* number of kernel taps */
TLong lCenter; /* index of the centre tap */
TInt32 *pdKernal_1 ; /* fixed-point kernel weights, shared by all tasks */
int *space_of_horizontal ; /* per-tap byte offsets for the horizontal pass */
int *space_of_vertical; /* per-tap byte offsets for the vertical pass */
TRESULT res; /* task status; the dispatchers initialise it to TOK */
}SkinSoft_GaussianBlur_data;


/*
 * Thread-pool entry point for the edge-clipped horizontal pass.
 * sttask->task_arg must point to a SkinSoft_GaussianBlur_data; the status
 * returned by the pass is stored in its `res` field. Always returns 0.
 */
static int mtThreadSkinSoft_GaussianBlurHSttask_tFun(qbbsttask_t *sttask)
{
    SkinSoft_GaussianBlur_data *perMTThread = (SkinSoft_GaussianBlur_data *)sttask->task_arg;

    perMTThread->res = Func_SkinSoft_GaussianBlurH(perMTThread->pSrcBitmap,perMTThread->pDstBitmap,perMTThread->bitcount,perMTThread->rc,perMTThread->lSize,perMTThread->lCenter,perMTThread->pdKernal_1 ,perMTThread->space_of_horizontal ,perMTThread->space_of_vertical);
    return 0;
}


/*
 * Thread-pool entry point for the edge-clipped vertical pass.
 * sttask->task_arg must point to a SkinSoft_GaussianBlur_data; the status
 * returned by the pass is stored in its `res` field. Always returns 0.
 */
static int mtThreadSkinSoft_GaussianBlurVSttask_tFun(qbbsttask_t *sttask)
{
    SkinSoft_GaussianBlur_data *perMTThread = (SkinSoft_GaussianBlur_data *)sttask->task_arg;

    perMTThread->res = Func_SkinSoft_GaussianBlurV(perMTThread->pSrcBitmap,perMTThread->pDstBitmap,perMTThread->bitcount,perMTThread->rc,perMTThread->lSize,perMTThread->lCenter,perMTThread->pdKernal_1 ,perMTThread->space_of_horizontal ,perMTThread->space_of_vertical);
    return 0;
}


/*
 * Thread-pool entry point for the unchecked-border horizontal pass.
 * sttask->task_arg must point to a SkinSoft_GaussianBlur_data; the status
 * returned by the pass is stored in its `res` field. Always returns 0.
 */
static int mtThreadSkinSoft_GaussianBorderBlurHSttask_tFun(qbbsttask_t *sttask)
{
    SkinSoft_GaussianBlur_data *perMTThread = (SkinSoft_GaussianBlur_data *)sttask->task_arg;

    perMTThread->res = Func_SkinSoft_GaussianBorderBlurH(perMTThread->pSrcBitmap,perMTThread->pDstBitmap,perMTThread->bitcount,perMTThread->rc,perMTThread->lSize,perMTThread->lCenter,perMTThread->pdKernal_1 ,perMTThread->space_of_horizontal ,perMTThread->space_of_vertical);
    return 0;
}


/*
 * Thread-pool entry point for the unchecked-border vertical pass.
 * sttask->task_arg must point to a SkinSoft_GaussianBlur_data; the status
 * returned by the pass is stored in its `res` field. Always returns 0.
 */
static int mtThreadSkinSoft_GaussianBorderBlurVSttask_tFun(qbbsttask_t *sttask)
{
    SkinSoft_GaussianBlur_data *perMTThread = (SkinSoft_GaussianBlur_data *)sttask->task_arg;

    perMTThread->res = Func_SkinSoft_GaussianBorderBlurV(perMTThread->pSrcBitmap,perMTThread->pDstBitmap,perMTThread->bitcount,perMTThread->rc,perMTThread->lSize,perMTThread->lCenter,perMTThread->pdKernal_1 ,perMTThread->space_of_horizontal ,perMTThread->space_of_vertical);
    return 0;
}


#endif


#define PI 3.14159 

#ifdef MUTTILD_THREAD
/*
 * Runs one blur pass on a thread pool and waits for it to finish.
 *
 * The rows [0, pSrcView->lHeight) are cut into MUTTILD_TASK_NUM consecutive
 * bands (band height rounded up to an even number, last bands clamped to
 * the image height) and pfnTask is queued once per band on hPool.
 *
 * Returns TERR_NO_MEMORY when the task array cannot be allocated, otherwise
 * the first non-TOK status left in a task's `res` field, or TOK.
 */
static TRESULT GaussianBorderBlur_DispatchPass(THandle hPool, int (*pfnTask)(qbbsttask_t *), TBITMAP *pSrcView, TBITMAP *pDstBitmap, TInt32 bitcount, TLong lSize, TLong lCenter, TInt32 *pdKernal_1, int *space_of_horizontal, int *space_of_vertical)
{
    TRESULT res = TOK;
    SkinSoft_GaussianBlur_data *pmtData = TNull;
    TInt32 nYMax = pSrcView->lHeight;
    TInt32 nYStep = (pSrcView->lHeight+MUTTILD_TASK_NUM-1)/MUTTILD_TASK_NUM;
    TInt32 nBottom = 0;
    TInt32 i;

    pmtData = (SkinSoft_GaussianBlur_data *)TMemAlloc(TNull, sizeof(SkinSoft_GaussianBlur_data)*MUTTILD_TASK_NUM);
    if (TNull == pmtData)
    {
        return TERR_NO_MEMORY;
    }
    TMemSet(pmtData, 0, sizeof(SkinSoft_GaussianBlur_data)*MUTTILD_TASK_NUM);

    if (nYStep & 0x1)
    {
        nYStep += 1;
    }

    for (i = 0; i < MUTTILD_TASK_NUM; i++)
    {
        pmtData[i].rc.left = 0;
        pmtData[i].rc.right = pSrcView->lWidth;
        pmtData[i].rc.top = nBottom;
        nBottom += nYStep;
        nBottom = MIN(nYMax, nBottom);
        pmtData[i].rc.bottom = nBottom;

        pmtData[i].pSrcBitmap = pSrcView;
        pmtData[i].pDstBitmap = pDstBitmap;
        /* Must match the pixel size used to build space_of_horizontal. */
        pmtData[i].bitcount = bitcount;
        pmtData[i].lSize = lSize;
        pmtData[i].lCenter = lCenter;
        pmtData[i].pdKernal_1 = pdKernal_1;
        pmtData[i].space_of_horizontal = space_of_horizontal;
        pmtData[i].space_of_vertical = space_of_vertical;
        pmtData[i].res = TOK;

        qbbtpool_addtask(hPool, pfnTask, (TVoid *)(&pmtData[i]));
    }

    /* The task array must stay alive until every band has been processed. */
    qbbtpool_task_waitall(hPool);

    for (i = 0; i < MUTTILD_TASK_NUM; i++)
    {
        if (TOK == res && TOK != pmtData[i].res)
        {
            res = pmtData[i].res;
        }
    }

    TMemFree(TNull, pmtData);
    return res;
}
#endif

/*
 * Separable Gaussian blur that handles image edges by filtering a copy of
 * the image padded with lCenter pixels on every side.
 *
 * TPThreadPool  pool to run the passes on; when it is null and
 *               MUTTILD_THREAD is defined a temporary pool is created for
 *               this call (falling back to serial execution if that fails)
 * pSrcBitmap    source image
 * pDstBitmap    destination image; it also holds the intermediate result of
 *               the horizontal pass
 * lRadius       kernel radius, used when bSigmasSet is false; sigma is then
 *               (lRadius - 1) * 0.3 + 0.8 (same rule as OpenCV)
 * dSigma        standard deviation, used when bSigmasSet is true; the kernel
 *               size is then 1 + 3 * sigma rounded up to an odd number
 * bitcount      bytes per pixel
 * bSigmasSet    selects between dSigma and lRadius
 *
 * Returns TOK on success or TERR_NO_MEMORY when an allocation fails.
 */
TRESULT GaussianBorderBlur(THandle TPThreadPool,TBITMAP *pSrcBitmap,TBITMAP *pDstBitmap,TInt32 lRadius,TDouble dSigma, TInt32 bitcount,TBool bSigmasSet)
{
    TRESULT res = TOK;
    TLong i;
    TLong nLimit;
    TLong lSize;
    TLong lCenter;
    TDouble lSigma;
    TDouble dSum_1 = 0.0;   /* floating-point sum used for normalisation */
    TDouble nDis;
    TInt32 dSum = 0;        /* fixed-point sum after conversion */
    TInt32 bPassesDone = 0;
    TInt32 *pdKernal_1 = TNull;
    TFloat *pdKernal_1Temp = TNull;
    int *space_of_horizontal = TNull;
    int *space_of_vertical = TNull;
    TBITMAP sDstBitmapTemp = {0};
    TBITMAP sTempBitmapcpy = {0};
    TRECT rc = {0};
#ifdef MUTTILD_THREAD
    THandle hPool = TPThreadPool;
    THandle hOwnedPool = TNull;
#endif

    if (bSigmasSet)
    {
        lSigma = dSigma;
        lSize = (TLong)(1+3*lSigma);    /* kernel spans about 3 sigma */
        if (!(lSize&0x1))               /* force an odd tap count */
            lSize = lSize+1;
        lCenter = lSize>>1;
    }
    else
    {
        lCenter = lRadius;
        lSigma = (lCenter - 1)*0.3 + 0.8;
        lSize = 2*lCenter+1;
    }

    /* Padded working copy: lCenter extra pixels on every side. */
    sDstBitmapTemp.dwPixelArrayFormat = pSrcBitmap->dwPixelArrayFormat;
    sDstBitmapTemp.lWidth = pSrcBitmap->lWidth+2*lCenter;
    sDstBitmapTemp.lHeight = pSrcBitmap->lHeight+2*lCenter;
    TUtilsBitmapAlloc(&sDstBitmapTemp);
    if (TNull == sDstBitmapTemp.pPlane[0])
    {
        /* NOTE(review): assumes a failed TUtilsBitmapAlloc leaves pPlane[0]
           null and owns nothing that needs freeing - confirm. */
        return TERR_NO_MEMORY;
    }

    /* View onto the interior of the padded copy, same size as the source. */
    sTempBitmapcpy.dwPixelArrayFormat = pSrcBitmap->dwPixelArrayFormat;
    sTempBitmapcpy.lWidth = pSrcBitmap->lWidth;
    sTempBitmapcpy.lHeight = pSrcBitmap->lHeight;
    sTempBitmapcpy.lPitch[0] = sDstBitmapTemp.lPitch[0];
    sTempBitmapcpy.pPlane[0] = sDstBitmapTemp.pPlane[0]+lCenter*sDstBitmapTemp.lPitch[0]+lCenter*bitcount;

    pdKernal_1 = (TInt32*)malloc(lSize*sizeof(TInt32));
    pdKernal_1Temp = (TFloat *)malloc(lSize*sizeof(TFloat));
    space_of_horizontal = (int *)malloc(sizeof(int)*lSize);
    space_of_vertical = (int *)malloc(sizeof(int)*lSize);
    if (TNull == pdKernal_1 || TNull == pdKernal_1Temp || TNull == space_of_horizontal || TNull == space_of_vertical)
    {
        res = TERR_NO_MEMORY;
        goto EXIT_CLEANUP;
    }

    /* 1-D Gaussian: G(x) = exp(-x*x / (2*sigma*sigma)) / (sqrt(2*pi)*sigma) */
    for (i = 0; i < lSize; i++)
    {
        nDis = (TDouble)(i-lCenter);
        pdKernal_1Temp[i] = exp(-(0.5)*nDis*nDis/(lSigma*lSigma))/(sqrt(2*PI)*lSigma);
        dSum_1 += pdKernal_1Temp[i];
    }

    /* Normalise, then convert every weight to fixed point. */
    for (i = 0; i < lSize; i++)
    {
        pdKernal_1Temp[i] /= dSum_1;
        MFLOAT2FF(pdKernal_1Temp[i],pdKernal_1[i]);
        dSum += pdKernal_1[i];
    }

    /* Push the rounding error into the centre tap so that the fixed-point
       weights add up to exactly FF_ONE. */
    dSum = (dSum-FF_ONE);
    if (dSum != 0)
    {
        pdKernal_1[lSize>>1] = pdKernal_1[lSize>>1]-dSum;
    }

    /* Byte offset of every tap relative to the pixel being filtered. */
    for (i = -lCenter, nLimit = 0; i <= lCenter; i++)
    {
        space_of_horizontal[nLimit] = (int)(i*bitcount);
        space_of_vertical[nLimit++] = (int)(i*sDstBitmapTemp.lPitch[0]);
    }

#ifdef MUTTILD_THREAD
    if (!hPool)
    {
        hOwnedPool = qbbtpool_create(MUTTILD_PTHREAD_NUM);
        hPool = hOwnedPool;
    }

    if (hPool)
    {
        /* H pass: padded source -> destination. */
        BitmapCopyBoder(pSrcBitmap,&sDstBitmapTemp,lCenter,lCenter,lCenter,lCenter);
        res = GaussianBorderBlur_DispatchPass(hPool, mtThreadSkinSoft_GaussianBorderBlurHSttask_tFun, &sTempBitmapcpy, pDstBitmap, bitcount, lSize, lCenter, pdKernal_1, space_of_horizontal, space_of_vertical);

        if (TOK == res)
        {
            /* V pass: padded H result -> destination. */
            BitmapCopyBoder(pDstBitmap,&sDstBitmapTemp,lCenter,lCenter,lCenter,lCenter);
            res = GaussianBorderBlur_DispatchPass(hPool, mtThreadSkinSoft_GaussianBorderBlurVSttask_tFun, &sTempBitmapcpy, pDstBitmap, bitcount, lSize, lCenter, pdKernal_1, space_of_horizontal, space_of_vertical);
        }
        bPassesDone = 1;
    }

    if (hOwnedPool)
    {
        qbbtpool_release(hOwnedPool);
    }
#endif

    if (!bPassesDone)
    {
        /* Serial path: non-threaded build, or no pool could be created. */
        rc.top = 0;
        rc.bottom = pSrcBitmap->lHeight;
        rc.left = 0;
        rc.right = pSrcBitmap->lWidth;

        BitmapCopyBoder(pSrcBitmap,&sDstBitmapTemp,lCenter,lCenter,lCenter,lCenter);
        Func_SkinSoft_GaussianBorderBlurH(&sTempBitmapcpy,pDstBitmap,bitcount,rc,lSize,lCenter,pdKernal_1,space_of_horizontal,space_of_vertical);

        BitmapCopyBoder(pDstBitmap,&sDstBitmapTemp,lCenter,lCenter,lCenter,lCenter);
        Func_SkinSoft_GaussianBorderBlurV(&sTempBitmapcpy,pDstBitmap,bitcount,rc,lSize,lCenter,pdKernal_1,space_of_horizontal,space_of_vertical);
    }

EXIT_CLEANUP:
    if (space_of_horizontal)
        free(space_of_horizontal);
    if (space_of_vertical)
        free(space_of_vertical);
    if (pdKernal_1Temp != TNull)
        free(pdKernal_1Temp);
    if (pdKernal_1 != TNull)
        free(pdKernal_1);
    TUtilsBitmapFree(&sDstBitmapTemp);
    return res;
}




#ifdef MUTTILD_THREAD
/*
 * Runs one blur pass on a thread pool and waits for it to finish.
 *
 * The rows [0, pSrc->lHeight) are cut into MUTTILD_TASK_NUM consecutive
 * bands (band height rounded up to an even number, last bands clamped to
 * the image height) and pfnTask is queued once per band on hPool.
 *
 * Returns TERR_NO_MEMORY when the task array cannot be allocated, otherwise
 * the first non-TOK status left in a task's `res` field, or TOK.
 */
static TRESULT GaussianBlur_DispatchPass(THandle hPool, int (*pfnTask)(qbbsttask_t *), TBITMAP *pSrc, TBITMAP *pDst, TInt32 bitcount, TLong lSize, TLong lCenter, TInt32 *pdKernal_1, int *space_of_horizontal, int *space_of_vertical)
{
    TRESULT res = TOK;
    SkinSoft_GaussianBlur_data *pmtData = TNull;
    TInt32 nYMax = pSrc->lHeight;
    TInt32 nYStep = (pSrc->lHeight+MUTTILD_TASK_NUM-1)/MUTTILD_TASK_NUM;
    TInt32 nBottom = 0;
    TInt32 i;

    pmtData = (SkinSoft_GaussianBlur_data *)TMemAlloc(TNull, sizeof(SkinSoft_GaussianBlur_data)*MUTTILD_TASK_NUM);
    if (TNull == pmtData)
    {
        return TERR_NO_MEMORY;
    }
    TMemSet(pmtData, 0, sizeof(SkinSoft_GaussianBlur_data)*MUTTILD_TASK_NUM);

    if (nYStep & 0x1)
    {
        nYStep += 1;
    }

    for (i = 0; i < MUTTILD_TASK_NUM; i++)
    {
        pmtData[i].rc.left = 0;
        pmtData[i].rc.right = pSrc->lWidth;
        pmtData[i].rc.top = nBottom;
        nBottom += nYStep;
        nBottom = MIN(nYMax, nBottom);
        pmtData[i].rc.bottom = nBottom;

        pmtData[i].pSrcBitmap = pSrc;
        pmtData[i].pDstBitmap = pDst;
        /* Must match the pixel size used to build space_of_horizontal. */
        pmtData[i].bitcount = bitcount;
        pmtData[i].lSize = lSize;
        pmtData[i].lCenter = lCenter;
        pmtData[i].pdKernal_1 = pdKernal_1;
        pmtData[i].space_of_horizontal = space_of_horizontal;
        pmtData[i].space_of_vertical = space_of_vertical;
        pmtData[i].res = TOK;

        qbbtpool_addtask(hPool, pfnTask, (TVoid *)(&pmtData[i]));
    }

    /* The task array must stay alive until every band has been processed. */
    qbbtpool_task_waitall(hPool);

    for (i = 0; i < MUTTILD_TASK_NUM; i++)
    {
        if (TOK == res && TOK != pmtData[i].res)
        {
            res = pmtData[i].res;
        }
    }

    TMemFree(TNull, pmtData);
    return res;
}
#endif

/*
 * Separable Gaussian blur that clips the kernel at the image edges.
 *
 * The horizontal pass writes into a temporary bitmap of the same size as
 * the source; the vertical pass reads that bitmap and writes pDstBitmap.
 *
 * TPThreadPool  pool to run the passes on; when it is null (or
 *               MUTTILD_THREAD is not defined) both passes run serially
 * pSrcBitmap    source image
 * pDstBitmap    destination image
 * dSigma        standard deviation; the kernel size is 1 + 3 * sigma
 *               rounded up to an odd number
 * bitcount      bytes per pixel
 *
 * Returns TOK on success or TERR_NO_MEMORY when an allocation fails.
 */
TRESULT GaussianBlur(THandle TPThreadPool,TBITMAP *pSrcBitmap,TBITMAP *pDstBitmap,TDouble dSigma, TInt32 bitcount)
{
    TRESULT res = TOK;
    TLong i;
    TLong nLimit;
    TLong lSize;
    TLong lCenter;
    TDouble lSigma;
    TDouble dSum_1 = 0.0;   /* floating-point sum used for normalisation */
    TDouble nDis;
    TInt32 dSum = 0;        /* fixed-point sum after conversion */
    TInt32 bPassesDone = 0;
    TInt32 *pdKernal_1 = TNull;
    TFloat *pdKernal_1Temp = TNull;
    int *space_of_horizontal = TNull;
    int *space_of_vertical = TNull;
    TBITMAP sDstBitmapTemp = {0};
    TRECT rc = {0};

    lSigma = dSigma;
    lSize = (TLong)(1+3*lSigma);    /* kernel spans about 3 sigma */
    if (!(lSize&0x1))               /* force an odd tap count */
        lSize = lSize+1;
    lCenter = lSize>>1;

    /* Intermediate bitmap that receives the horizontal pass. */
    sDstBitmapTemp.dwPixelArrayFormat = pSrcBitmap->dwPixelArrayFormat;
    sDstBitmapTemp.lWidth = pSrcBitmap->lWidth;
    sDstBitmapTemp.lHeight = pSrcBitmap->lHeight;
    TUtilsBitmapAlloc(&sDstBitmapTemp);
    if (TNull == sDstBitmapTemp.pPlane[0])
    {
        /* NOTE(review): assumes a failed TUtilsBitmapAlloc leaves pPlane[0]
           null and owns nothing that needs freeing - confirm. */
        return TERR_NO_MEMORY;
    }

    pdKernal_1 = (TInt32*)malloc(lSize*sizeof(TInt32));
    pdKernal_1Temp = (TFloat *)malloc(lSize*sizeof(TFloat));
    space_of_horizontal = (int *)malloc(sizeof(int)*lSize);
    space_of_vertical = (int *)malloc(sizeof(int)*lSize);
    if (TNull == pdKernal_1 || TNull == pdKernal_1Temp || TNull == space_of_horizontal || TNull == space_of_vertical)
    {
        res = TERR_NO_MEMORY;
        goto EXIT_CLEANUP;
    }

    /* 1-D Gaussian: G(x) = exp(-x*x / (2*sigma*sigma)) / (sqrt(2*pi)*sigma) */
    for (i = 0; i < lSize; i++)
    {
        nDis = (TDouble)(i-lCenter);
        pdKernal_1Temp[i] = exp(-(0.5)*nDis*nDis/(lSigma*lSigma))/(sqrt(2*PI)*lSigma);
        dSum_1 += pdKernal_1Temp[i];
    }

    /* Normalise, then convert every weight to fixed point. */
    for (i = 0; i < lSize; i++)
    {
        pdKernal_1Temp[i] /= dSum_1;
        MFLOAT2FF(pdKernal_1Temp[i],pdKernal_1[i]);
        dSum += pdKernal_1[i];
    }

    /* Push the rounding error into the centre tap so that the fixed-point
       weights add up to exactly FF_ONE. */
    dSum = (dSum-FF_ONE);
    if (dSum != 0)
    {
        pdKernal_1[lSize>>1] = pdKernal_1[lSize>>1]-dSum;
    }

    /* Byte offset of every tap relative to the pixel being filtered. The
       vertical stride is that of the temporary bitmap, which is what the
       vertical pass reads. */
    for (i = -lCenter, nLimit = 0; i <= lCenter; i++)
    {
        space_of_horizontal[nLimit] = (int)(i*bitcount);
        space_of_vertical[nLimit++] = (int)(i*sDstBitmapTemp.lPitch[0]);
    }

#ifdef MUTTILD_THREAD
    if (TPThreadPool)
    {
        /* H pass: source -> temporary. */
        res = GaussianBlur_DispatchPass(TPThreadPool, mtThreadSkinSoft_GaussianBlurHSttask_tFun, pSrcBitmap, &sDstBitmapTemp, bitcount, lSize, lCenter, pdKernal_1, space_of_horizontal, space_of_vertical);

        if (TOK == res)
        {
            /* V pass: temporary -> destination. */
            res = GaussianBlur_DispatchPass(TPThreadPool, mtThreadSkinSoft_GaussianBlurVSttask_tFun, &sDstBitmapTemp, pDstBitmap, bitcount, lSize, lCenter, pdKernal_1, space_of_horizontal, space_of_vertical);
        }
        bPassesDone = 1;
    }
#endif

    if (!bPassesDone)
    {
        /* Serial path: non-threaded build, or no pool was supplied. */
        rc.top = 0;
        rc.bottom = pSrcBitmap->lHeight;
        rc.left = 0;
        rc.right = pSrcBitmap->lWidth;
        Func_SkinSoft_GaussianBlurH(pSrcBitmap,&sDstBitmapTemp,bitcount,rc,lSize,lCenter,pdKernal_1,space_of_horizontal,space_of_vertical);
        Func_SkinSoft_GaussianBlurV(&sDstBitmapTemp,pDstBitmap,bitcount,rc,lSize,lCenter,pdKernal_1,space_of_horizontal,space_of_vertical);
    }

EXIT_CLEANUP:
    if (space_of_horizontal)
        free(space_of_horizontal);
    if (space_of_vertical)
        free(space_of_vertical);
    if (pdKernal_1Temp != TNull)
        free(pdKernal_1Temp);
    if (pdKernal_1 != TNull)
        free(pdKernal_1);
    TUtilsBitmapFree(&sDstBitmapTemp);
    return res;
}


 
//EPFFilter(Src)
int QBB_FastBilateralFilter(TBITMAP *pSrcBitmap , TBITMAP *pDstBitmap)
{


double sigma_color=40;
double sigma_space=10.0;
int width = pSrcBitmap->lWidth;
int height = pSrcBitmap->lHeight;
int radius =10;
int cn = 3;
int bitcount = 4;
int d=9;
int step = 0;
float *color_weight = (float *)malloc(sizeof(float)*cn*256);
double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
//radius =sigma_space*1.5;
d = radius*2 + 1;
float *space_weight = (float *)malloc(sizeof(float)*d);
int *space_of_horizontal = (int *)malloc(sizeof(int)*d);
int *space_of_vertical = (int *)malloc(sizeof(int)*d);
TBITMAP sTempBitmap = {0};


sTempBitmap.dwPixelArrayFormat = pSrcBitmap->dwPixelArrayFormat;
sTempBitmap.lWidth = pSrcBitmap->lWidth;
sTempBitmap.lHeight = pSrcBitmap->lHeight;

TUtilsBitmapAlloc(&sTempBitmap);
BitmapCopy(pSrcBitmap,&sTempBitmap);


int i;
for( i = 0; i < 256*cn; i++ )
color_weight[i] = (float)exp(i*i*gauss_color_coeff);


int maxk;
int w;
int h;
int k;
TByte* sptr_k = TNull;
step = pSrcBitmap->lPitch[0];
int b0,g0,r0;
float wvalue;
float sumb=0,sumg=0,sumr=0,wsum=0;
int b,g,r;


for ( i=-radius,maxk=0;i<=radius;i++)
{
double r=abs(i);
space_weight[maxk]=(float)exp(r*r*gauss_space_coeff);
space_of_horizontal[maxk]=(int)(i*bitcount);
space_of_vertical[maxk++]=(int)(i*step);


}


//上
for ( h=0;h<radius;h++)
{
TByte* sptr=  pSrcBitmap->pPlane[0]+h*pSrcBitmap->lPitch[0];  //temp.ptr(h+radius)+radius*cn;
TByte* dptr= sTempBitmap.pPlane[0]+h*sTempBitmap.lPitch[0];
for ( w=0;w<width;w++)
{
sumb=0,sumg=0,sumr=0,wsum=0;
b0=sptr[bitcount*w],g0=sptr[bitcount*w+1],r0=sptr[bitcount*w+2];
k=0;
for (;k<maxk;k++)
{
if((w+k)>=radius && ((w+k)<width))
{
sptr_k=sptr+bitcount*w+space_of_horizontal[k];
b=sptr_k[0],g=sptr_k[1],r=sptr_k[2];
wvalue=space_weight[k]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];
sumb+=b*wvalue;sumg+=g*wvalue;sumr+=r*wvalue;
wsum+=wvalue;


}
}


if(wsum != 0)
{
wsum=1.0f/wsum;
b0 = sumb*wsum+0.5;
g0 = sumg*wsum+0.5;
r0 = sumr*wsum+0.5;
}
dptr[bitcount*w] = (TByte)BOUND(b0); dptr[bitcount*w+1] = (TByte)BOUND(g0); dptr[bitcount*w+2] = (TByte)BOUND(r0);
}


}


for ( h=0;h<radius;h++)
{
TByte* sptr=  sTempBitmap.pPlane[0]+h*sTempBitmap.lPitch[0];  //temp.ptr(h+radius)+radius*cn;
TByte* dptr= pDstBitmap->pPlane[0]+h*pDstBitmap->lPitch[0];
for ( w=0;w<width;w++)
{
sumb=0,sumg=0,sumr=0,wsum=0;
b0=sptr[bitcount*w],g0=sptr[bitcount*w+1],r0=sptr[bitcount*w+2];
k=0;
for (;k<maxk;k++)
{
if((h+k)>=radius && ((h+k)<height))
{
sptr_k=sptr+bitcount*w+space_of_vertical[k];
b=sptr_k[0],g=sptr_k[1],r=sptr_k[2];
wvalue=space_weight[k]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];
sumb+=b*wvalue;sumg+=g*wvalue;sumr+=r*wvalue;
wsum+=wvalue;
}


}


if(wsum != 0)
{
wsum=1.0f/wsum;
b0 = sumb*wsum+0.5;
g0 = sumg*wsum+0.5;
r0 = sumr*wsum+0.5;
}
dptr[bitcount*w] = (TByte)BOUND(b0); dptr[bitcount*w+1] = (TByte)BOUND(g0); dptr[bitcount*w+2] = (TByte)BOUND(r0);
}
}




//下


for ( h=height-radius;h<height;h++)
{
TByte* sptr=  pSrcBitmap->pPlane[0]+h*pSrcBitmap->lPitch[0];  //temp.ptr(h+radius)+radius*cn;
TByte* dptr= sTempBitmap.pPlane[0]+h*sTempBitmap.lPitch[0];
for ( w=0;w<width;w++)
{
sumb=0,sumg=0,sumr=0,wsum=0;
b0=sptr[bitcount*w],g0=sptr[bitcount*w+1],r0=sptr[bitcount*w+2];
k=0;
for (;k<maxk;k++)
{
if((w+k)>=radius && ((w+k)<width))
{
sptr_k=sptr+bitcount*w+space_of_horizontal[k];
b=sptr_k[0],g=sptr_k[1],r=sptr_k[2];
wvalue=space_weight[k]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];
sumb+=b*wvalue;sumg+=g*wvalue;sumr+=r*wvalue;
wsum+=wvalue;


}
}


if(wsum != 0)
{
wsum=1.0f/wsum;
b0 = sumb*wsum+0.5;
g0 = sumg*wsum+0.5;
r0 = sumr*wsum+0.5;
}
dptr[bitcount*w] = (TByte)BOUND(b0); dptr[bitcount*w+1] = (TByte)BOUND(g0); dptr[bitcount*w+2] = (TByte)BOUND(r0);
}


}


for ( h=height-radius;h<height;h++)
{
TByte* sptr=  sTempBitmap.pPlane[0]+h*sTempBitmap.lPitch[0];  //temp.ptr(h+radius)+radius*cn;
TByte* dptr= pDstBitmap->pPlane[0]+h*pDstBitmap->lPitch[0];
for ( w=0;w<width;w++)
{
sumb=0,sumg=0,sumr=0,wsum=0;
b0=sptr[bitcount*w],g0=sptr[bitcount*w+1],r0=sptr[bitcount*w+2];
k=0;
for (;k<maxk;k++)
{
if((h+k)>=radius && ((h+k)<height))
{
sptr_k=sptr+bitcount*w+space_of_vertical[k];
b=sptr_k[0],g=sptr_k[1],r=sptr_k[2];
wvalue=space_weight[k]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];
sumb+=b*wvalue;sumg+=g*wvalue;sumr+=r*wvalue;
wsum+=wvalue;
}


}


if(wsum != 0)
{
wsum=1.0f/wsum;
b0 = sumb*wsum+0.5;
g0 = sumg*wsum+0.5;
r0 = sumr*wsum+0.5;
}
dptr[bitcount*w] = (TByte)BOUND(b0); dptr[bitcount*w+1] = (TByte)BOUND(g0); dptr[bitcount*w+2] = (TByte)BOUND(r0);
}
}




//左
for ( h=0;h<height;h++)
{
TByte* sptr=  pSrcBitmap->pPlane[0]+h*pSrcBitmap->lPitch[0];  //temp.ptr(h+radius)+radius*cn;
TByte* dptr= sTempBitmap.pPlane[0]+h*sTempBitmap.lPitch[0];
for ( w=0;w<radius;w++)
{
float sumb=0,sumg=0,sumr=0,wsum=0;
int b0=sptr[bitcount*w],g0=sptr[bitcount*w+1],r0=sptr[bitcount*w+2];
k=0;
for (;k<maxk;k++)
{
if((w+k)>=radius && ((w+k)<width))
{
sptr_k=sptr+bitcount*w+space_of_horizontal[k];
int b=sptr_k[0],g=sptr_k[1],r=sptr_k[2];
float wvalue=space_weight[k]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];
sumb+=b*wvalue;sumg+=g*wvalue;sumr+=r*wvalue;
wsum+=wvalue;
}
}


if(wsum != 0)
{
wsum=1.0f/wsum;
b0 = sumb*wsum+0.5;
g0 = sumg*wsum+0.5;
r0 = sumr*wsum+0.5;
}
dptr[bitcount*w] = (TByte)BOUND(b0); dptr[bitcount*w+1] = (TByte)BOUND(g0); dptr[bitcount*w+2] = (TByte)BOUND(r0);
}


}


for ( h=0;h<height;h++)
{
TByte* sptr=  sTempBitmap.pPlane[0]+h*sTempBitmap.lPitch[0];  //temp.ptr(h+radius)+radius*cn;
TByte* dptr= pDstBitmap->pPlane[0]+h*pDstBitmap->lPitch[0];
for ( w=0;w<radius;w++)
{
float sumb=0,sumg=0,sumr=0,wsum=0;
int b0=sptr[bitcount*w],g0=sptr[bitcount*w+1],r0=sptr[bitcount*w+2];
k=0;
for (;k<maxk;k++)
{
if((h+k)>=radius && ((h+k)<height))
{
sptr_k=sptr+bitcount*w+space_of_vertical[k];
int b=sptr_k[0],g=sptr_k[1],r=sptr_k[2];
float wvalue=space_weight[k]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];
sumb+=b*wvalue;sumg+=g*wvalue;sumr+=r*wvalue;
wsum+=wvalue;
}


}
if(wsum != 0)
{
wsum=1.0f/wsum;
b0 = sumb*wsum+0.5;
g0 = sumg*wsum+0.5;
r0 = sumr*wsum+0.5;
}
dptr[bitcount*w] = (TByte)BOUND(b0); dptr[bitcount*w+1] = (TByte)BOUND(g0); dptr[bitcount*w+2] = (TByte)BOUND(r0);
}
}




//右
for ( h=0;h<height;h++)
{
TByte* sptr=  pSrcBitmap->pPlane[0]+h*pSrcBitmap->lPitch[0];  //temp.ptr(h+radius)+radius*cn;
TByte* dptr= sTempBitmap.pPlane[0]+h*sTempBitmap.lPitch[0];
for ( w=width-radius;w<width;w++)
{
float sumb=0,sumg=0,sumr=0,wsum=0;
int b0=sptr[bitcount*w],g0=sptr[bitcount*w+1],r0=sptr[bitcount*w+2];
k=0;
for (;k<maxk;k++)
{
if((w+k)>=radius && ((w+k)<width))
{
sptr_k=sptr+bitcount*w+space_of_horizontal[k];
int b=sptr_k[0],g=sptr_k[1],r=sptr_k[2];
float wvalue=space_weight[k]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];
sumb+=b*wvalue;sumg+=g*wvalue;sumr+=r*wvalue;
wsum+=wvalue;
}
}


if(wsum != 0)
{
wsum=1.0f/wsum;
b0 = sumb*wsum+0.5;
g0 = sumg*wsum+0.5;
r0 = sumr*wsum+0.5;
}
dptr[bitcount*w] = (TByte)BOUND(b0); dptr[bitcount*w+1] = (TByte)BOUND(g0); dptr[bitcount*w+2] = (TByte)BOUND(r0);
}


}


for ( h=0;h<height;h++)
{
TByte* sptr=  sTempBitmap.pPlane[0]+h*sTempBitmap.lPitch[0];  //temp.ptr(h+radius)+radius*cn;
TByte* dptr= pDstBitmap->pPlane[0]+h*pDstBitmap->lPitch[0];
for ( w=width-radius;w<width;w++)
{
float sumb=0,sumg=0,sumr=0,wsum=0;
int b0=sptr[bitcount*w],g0=sptr[bitcount*w+1],r0=sptr[bitcount*w+2];
k=0;
for (;k<maxk;k++)
{
if((h+k)>=radius && ((h+k)<height))
{
sptr_k=sptr+bitcount*w+space_of_vertical[k];
int b=sptr_k[0],g=sptr_k[1],r=sptr_k[2];
float wvalue=space_weight[k]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];
sumb+=b*wvalue;sumg+=g*wvalue;sumr+=r*wvalue;
wsum+=wvalue;
}


}
if(wsum != 0)
{
wsum=1.0f/wsum;
b0 = sumb*wsum+0.5;
g0 = sumg*wsum+0.5;
r0 = sumr*wsum+0.5;
}
dptr[bitcount*w] = (TByte)BOUND(b0); dptr[bitcount*w+1] = (TByte)BOUND(g0); dptr[bitcount*w+2] = (TByte)BOUND(r0);
}
}






//内部
for ( h=radius;h<height-radius;h++)
{
TByte* sptr=  pSrcBitmap->pPlane[0]+h*pSrcBitmap->lPitch[0];  //temp.ptr(h+radius)+radius*cn;
TByte* dptr= sTempBitmap.pPlane[0]+h*sTempBitmap.lPitch[0];
for ( w=radius;w<width-radius;w++)
{
float sumb=0,sumg=0,sumr=0,wsum=0;
int b0=sptr[bitcount*w],g0=sptr[bitcount*w+1],r0=sptr[bitcount*w+2];
k=0;
for (;k<maxk;k++)
{
sptr_k=sptr+bitcount*w+space_of_horizontal[k];
int b=sptr_k[0],g=sptr_k[1],r=sptr_k[2];
float wvalue=space_weight[k]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];
sumb+=b*wvalue;sumg+=g*wvalue;sumr+=r*wvalue;
wsum+=wvalue;
}


//if(wsum != 0)
{
wsum=1.0f/wsum;
b0 = sumb*wsum+0.5;
g0 = sumg*wsum+0.5;
r0 = sumr*wsum+0.5;
}
dptr[bitcount*w] = (TByte)BOUND(b0); dptr[bitcount*w+1] = (TByte)BOUND(g0); dptr[bitcount*w+2] = (TByte)BOUND(r0);
}


}




  for ( h=radius;h<height-radius;h++)
{
TByte* sptr=  sTempBitmap.pPlane[0]+h*sTempBitmap.lPitch[0];  //temp.ptr(h+radius)+radius*cn;
TByte* dptr= pDstBitmap->pPlane[0]+h*pDstBitmap->lPitch[0];
for ( w=radius;w<width-radius;w++)
{
float sumb=0,sumg=0,sumr=0,wsum=0;
int b0=sptr[bitcount*w],g0=sptr[bitcount*w+1],r0=sptr[bitcount*w+2];
k=0;
for (;k<maxk;k++)
{
sptr_k=sptr+bitcount*w+space_of_vertical[k];
int b=sptr_k[0],g=sptr_k[1],r=sptr_k[2];
float wvalue=space_weight[k]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];
sumb+=b*wvalue;sumg+=g*wvalue;sumr+=r*wvalue;
wsum+=wvalue;
}
//if(wsum != 0)
{
wsum=1.0f/wsum;
b0 = sumb*wsum+0.5;
g0 = sumg*wsum+0.5;
r0 = sumr*wsum+0.5;
}
dptr[bitcount*w] = (TByte)BOUND(b0); dptr[bitcount*w+1] = (TByte)BOUND(g0); dptr[bitcount*w+2] = (TByte)BOUND(r0);
}
}


  if(space_weight)
 free(space_weight);
  if(space_of_horizontal)
 free(space_of_horizontal);
  if(space_of_vertical)
 free(space_of_vertical);
  if(color_weight)
 free(color_weight);
   TUtilsBitmapFree(&sTempBitmap);
return 0;
}


/*
 * Separable bilateral filter for 4-byte-per-pixel bitmaps with parameters
 * derived from a fixed radius of 10: sigma_color = 2*radius and
 * sigma_space = radius/2.
 *
 * A horizontal pass filters pSrcBitmap into a temporary bitmap and a vertical
 * pass filters the temporary bitmap into pDstBitmap.  Kernel taps that fall
 * outside the image are skipped and the result is renormalised by the weights
 * actually used.  Only the first three bytes of each destination pixel are
 * written.
 *
 * Returns 0 on success and -1 if a working buffer could not be allocated
 * (pDstBitmap is left untouched in that case).
 */
int QBB_FastBilateralFilter2(TBITMAP *pSrcBitmap , TBITMAP *pDstBitmap)
{
	const int radius = 10;
	const int cn = 3;
	const int bitcount = 4;
	const int d = radius*2 + 1;
	const double sigma_color = radius*2;
	const double sigma_space = radius/2.0;
	const double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
	const double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
	int width = pSrcBitmap->lWidth;
	int height = pSrcBitmap->lHeight;
	int step = 0;
	int res = -1;
	float *color_weight = TNull;
	float *space_weight = TNull;
	int *space_of_horizontal = TNull;
	int *space_of_vertical = TNull;
	TBITMAP sTempBitmap = {0};
	int i, maxk, w, h, k, wTemp;
	TByte *sptr_k = TNull;
	TByte *sptr;
	TByte *dptr;
	int b0, g0, r0;
	int b, g, r;
	float wvalue;
	float sumb, sumg, sumr, wsum;

	space_weight = (float *)malloc(sizeof(float)*d);
	space_of_horizontal = (int *)malloc(sizeof(int)*d);
	space_of_vertical = (int *)malloc(sizeof(int)*d);
	color_weight = (float *)malloc(sizeof(float)*cn*256);

	sTempBitmap.dwPixelArrayFormat = pSrcBitmap->dwPixelArrayFormat;
	sTempBitmap.lWidth = pSrcBitmap->lWidth;
	sTempBitmap.lHeight = pSrcBitmap->lHeight;
	TUtilsBitmapAlloc(&sTempBitmap);

	if (space_weight && space_of_horizontal && space_of_vertical && color_weight
		&& sTempBitmap.pPlane[0] != TNull)
	{
		/* The vertical pass reads the temporary bitmap, so use its pitch. */
		step = sTempBitmap.lPitch[0];

		/* Range weight indexed by |db|+|dg|+|dr| (0..765). */
		for (i = 0; i < 256*cn; i++)
		{
			color_weight[i] = (float)exp(i*i*gauss_color_coeff);
		}

		/* Spatial weights and byte offsets; entry 0 is offset -radius. */
		for (i = -radius, maxk = 0; i <= radius; i++, maxk++)
		{
			double dist = abs(i);
			space_weight[maxk] = (float)exp(dist*dist*gauss_space_coeff);
			space_of_horizontal[maxk] = (int)(i*bitcount);
			space_of_vertical[maxk] = (int)(i*step);
		}

		/* Horizontal pass: source -> temporary bitmap.
		   Tap k touches column w+k-radius, valid when radius <= w+k < width+radius. */
		wTemp = width + radius;
		for (h = 0; h < height; h++)
		{
			sptr = pSrcBitmap->pPlane[0] + h*pSrcBitmap->lPitch[0];
			dptr = sTempBitmap.pPlane[0] + h*sTempBitmap.lPitch[0];
			for (w = 0; w < width; w++)
			{
				sumb = 0; sumg = 0; sumr = 0; wsum = 0;
				b0 = sptr[0]; g0 = sptr[1]; r0 = sptr[2];
				for (k = 0; k < maxk; k++)
				{
					if ((w+k) >= radius && (w+k) < wTemp)
					{
						sptr_k = sptr + space_of_horizontal[k];
						b = sptr_k[0]; g = sptr_k[1]; r = sptr_k[2];
						wvalue = space_weight[k]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];
						sumb += b*wvalue; sumg += g*wvalue; sumr += r*wvalue;
						wsum += wvalue;
					}
				}

				if (wsum != 0)
				{
					wsum = 1.0f/wsum;
					b0 = sumb*wsum + 0.5;
					g0 = sumg*wsum + 0.5;
					r0 = sumr*wsum + 0.5;
				}
				dptr[0] = (TByte)BOUND(b0); dptr[1] = (TByte)BOUND(g0); dptr[2] = (TByte)BOUND(r0);
				dptr = dptr + bitcount;
				sptr = sptr + bitcount;
			}
		}

		/* Vertical pass: temporary bitmap -> destination.
		   Tap k touches row h+k-radius, valid when radius <= h+k < height+radius. */
		wTemp = height + radius;
		for (h = 0; h < height; h++)
		{
			sptr = sTempBitmap.pPlane[0] + h*sTempBitmap.lPitch[0];
			dptr = pDstBitmap->pPlane[0] + h*pDstBitmap->lPitch[0];
			for (w = 0; w < width; w++)
			{
				sumb = 0; sumg = 0; sumr = 0; wsum = 0;
				b0 = sptr[0]; g0 = sptr[1]; r0 = sptr[2];
				for (k = 0; k < maxk; k++)
				{
					if ((h+k) >= radius && (h+k) < wTemp)
					{
						sptr_k = sptr + space_of_vertical[k];
						b = sptr_k[0]; g = sptr_k[1]; r = sptr_k[2];
						wvalue = space_weight[k]*color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)];
						sumb += b*wvalue; sumg += g*wvalue; sumr += r*wvalue;
						wsum += wvalue;
					}
				}

				if (wsum != 0)
				{
					wsum = 1.0f/wsum;
					b0 = sumb*wsum + 0.5;
					g0 = sumg*wsum + 0.5;
					r0 = sumr*wsum + 0.5;
				}
				dptr[0] = (TByte)BOUND(b0); dptr[1] = (TByte)BOUND(g0); dptr[2] = (TByte)BOUND(r0);
				dptr = dptr + bitcount;
				sptr = sptr + bitcount;
			}
		}

		res = 0;
	}

	if (space_weight)
		free(space_weight);
	if (space_of_horizontal)
		free(space_of_horizontal);
	if (space_of_vertical)
		free(space_of_vertical);
	if (color_weight)
		free(color_weight);

	TUtilsBitmapFree(&sTempBitmap);
	return res;
}




/*
 * Fixed-point variant of the separable bilateral filter for 4-byte-per-pixel
 * bitmaps (radius 10, sigma_color = 2*radius, sigma_space = radius/2).
 *
 * The spatial and range weights are converted with MFLOAT2FF and combined
 * with FF_Multiply, so every accumulator (sumb/sumg/sumr/wsum) is expressed
 * in the same fixed-point scale.  The weighted mean is therefore obtained by
 * dividing the raw accumulators with rounding; no conversion back to integer
 * is needed and no fractional weight is discarded before the division.
 * NOTE(review): assumes 21 taps of 255*weight fit in an int for the project's
 * fixed-point format, as the accumulation already relied on - confirm.
 *
 * A horizontal pass filters pSrcBitmap into a temporary bitmap and a vertical
 * pass filters the temporary bitmap into pDstBitmap.  Only the first three
 * bytes of each destination pixel are written.
 *
 * Returns 0 on success and -1 if a working buffer could not be allocated
 * (pDstBitmap is left untouched in that case).
 */
int QBB_FastBilateralFilterFFT2(TBITMAP *pSrcBitmap , TBITMAP *pDstBitmap)
{
	const int radius = 10;
	const int cn = 3;
	const int bitcount = 4;
	const int d = radius*2 + 1;
	const double sigma_color = radius*2;
	const double sigma_space = radius/2.0;
	const double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
	const double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
	int width = pSrcBitmap->lWidth;
	int height = pSrcBitmap->lHeight;
	int step = 0;
	int res = -1;
	int *color_weight = TNull;
	int *space_weight = TNull;
	int *space_of_horizontal = TNull;
	int *space_of_vertical = TNull;
	TBITMAP sTempBitmap = {0};
	int i, maxk, w, h, k, wTemp;
	TByte *sptr_k = TNull;
	TByte *sptr;
	TByte *dptr;
	int b0, g0, r0;
	int b, g, r;
	int wvalue;
	int sumb, sumg, sumr, wsum;

	/* The tables hold fixed-point ints, so size them with sizeof(int). */
	space_weight = (int *)malloc(sizeof(int)*d);
	space_of_horizontal = (int *)malloc(sizeof(int)*d);
	space_of_vertical = (int *)malloc(sizeof(int)*d);
	color_weight = (int *)malloc(sizeof(int)*cn*256);

	sTempBitmap.dwPixelArrayFormat = pSrcBitmap->dwPixelArrayFormat;
	sTempBitmap.lWidth = pSrcBitmap->lWidth;
	sTempBitmap.lHeight = pSrcBitmap->lHeight;
	TUtilsBitmapAlloc(&sTempBitmap);

	if (space_weight && space_of_horizontal && space_of_vertical && color_weight
		&& sTempBitmap.pPlane[0] != TNull)
	{
		/* The vertical pass reads the temporary bitmap, so use its pitch. */
		step = sTempBitmap.lPitch[0];

		/* Range weight indexed by |db|+|dg|+|dr| (0..765), in fixed point. */
		for (i = 0; i < 256*cn; i++)
		{
			MFLOAT2FF(exp(i*i*gauss_color_coeff),color_weight[i]);
		}

		/* Spatial weights (fixed point) and byte offsets; entry 0 is -radius. */
		for (i = -radius, maxk = 0; i <= radius; i++, maxk++)
		{
			double dist = abs(i);
			MFLOAT2FF(exp(dist*dist*gauss_space_coeff),space_weight[maxk]);
			space_of_horizontal[maxk] = (int)(i*bitcount);
			space_of_vertical[maxk] = (int)(i*step);
		}

		/* Horizontal pass: source -> temporary bitmap. */
		wTemp = width + radius;
		for (h = 0; h < height; h++)
		{
			sptr = pSrcBitmap->pPlane[0] + h*pSrcBitmap->lPitch[0];
			dptr = sTempBitmap.pPlane[0] + h*sTempBitmap.lPitch[0];
			for (w = 0; w < width; w++)
			{
				sumb = 0; sumg = 0; sumr = 0; wsum = 0;
				b0 = sptr[0]; g0 = sptr[1]; r0 = sptr[2];
				for (k = 0; k < maxk; k++)
				{
					/* Tap k touches column w+k-radius; skip it outside the row. */
					if ((w+k) >= radius && (w+k) < wTemp)
					{
						sptr_k = sptr + space_of_horizontal[k];
						b = sptr_k[0]; g = sptr_k[1]; r = sptr_k[2];
						wvalue = FF_Multiply(space_weight[k],color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)]);
						sumb += b*wvalue; sumg += g*wvalue; sumr += r*wvalue;
						wsum += wvalue;
					}
				}

				if (wsum != 0)
				{
					/* Same scale on both sides: the ratio is the weighted mean. */
					b0 = (sumb + wsum/2)/wsum;
					g0 = (sumg + wsum/2)/wsum;
					r0 = (sumr + wsum/2)/wsum;
				}
				dptr[0] = (TByte)BOUND(b0); dptr[1] = (TByte)BOUND(g0); dptr[2] = (TByte)BOUND(r0);
				dptr = dptr + bitcount;
				sptr = sptr + bitcount;
			}
		}

		/* Vertical pass: temporary bitmap -> destination. */
		wTemp = height + radius;
		for (h = 0; h < height; h++)
		{
			sptr = sTempBitmap.pPlane[0] + h*sTempBitmap.lPitch[0];
			dptr = pDstBitmap->pPlane[0] + h*pDstBitmap->lPitch[0];
			for (w = 0; w < width; w++)
			{
				sumb = 0; sumg = 0; sumr = 0; wsum = 0;
				b0 = sptr[0]; g0 = sptr[1]; r0 = sptr[2];
				for (k = 0; k < maxk; k++)
				{
					/* Tap k touches row h+k-radius; skip it outside the image. */
					if ((h+k) >= radius && (h+k) < wTemp)
					{
						sptr_k = sptr + space_of_vertical[k];
						b = sptr_k[0]; g = sptr_k[1]; r = sptr_k[2];
						wvalue = FF_Multiply(space_weight[k],color_weight[abs(b-b0)+abs(g-g0)+abs(r-r0)]);
						sumb += b*wvalue; sumg += g*wvalue; sumr += r*wvalue;
						wsum += wvalue;
					}
				}

				if (wsum != 0)
				{
					b0 = (sumb + wsum/2)/wsum;
					g0 = (sumg + wsum/2)/wsum;
					r0 = (sumr + wsum/2)/wsum;
				}
				dptr[0] = (TByte)BOUND(b0); dptr[1] = (TByte)BOUND(g0); dptr[2] = (TByte)BOUND(r0);
				dptr = dptr + bitcount;
				sptr = sptr + bitcount;
			}
		}

		res = 0;
	}

	if (space_weight)
		free(space_weight);
	if (space_of_horizontal)
		free(space_of_horizontal);
	if (space_of_vertical)
		free(space_of_vertical);
	if (color_weight)
		free(color_weight);

	TUtilsBitmapFree(&sTempBitmap);
	return res;
}


/*
 * GaussBlur(Dst - Src + 128)
 *
 * Builds a temporary bitmap holding, for the first three bytes of every
 * 4-byte pixel, the clamped value (Dst - Src + 128), then runs GaussianBlur
 * on that temporary bitmap with pDstBitmap as the output.  The fourth byte of
 * each temporary pixel is not written here.
 *
 * TPThreadPool and dSigma are forwarded unchanged to GaussianBlur.
 * Always returns 0.
 */
int QBB_FastGaussBlur(THandle TPThreadPool,TBITMAP *pSrcBitmap , TBITMAP *pDstBitmap,TDouble dSigma)
{
	TBITMAP sDiffBitmap = {0};
	TByte *pOrigPx = TNull;
	TByte *pFilteredPx = TNull;
	TByte *pDiffPx = TNull;
	int rows, cols;
	int row, col, ch;
	int diff;

	sDiffBitmap.dwPixelArrayFormat = pSrcBitmap->dwPixelArrayFormat;
	sDiffBitmap.lWidth = pSrcBitmap->lWidth;
	sDiffBitmap.lHeight = pSrcBitmap->lHeight;
	TUtilsBitmapAlloc(&sDiffBitmap);

	rows = pSrcBitmap->lHeight;
	cols = pSrcBitmap->lWidth;

	for (row = 0; row < rows; row++)
	{
		pOrigPx = pSrcBitmap->pPlane[0] + row*pSrcBitmap->lPitch[0];
		pFilteredPx = pDstBitmap->pPlane[0] + row*pDstBitmap->lPitch[0];
		pDiffPx = sDiffBitmap.pPlane[0] + row*sDiffBitmap.lPitch[0];

		for (col = 0; col < cols; col++)
		{
			/* Three colour bytes per pixel; 128 is the zero-difference level. */
			for (ch = 0; ch < 3; ch++)
			{
				diff = pFilteredPx[ch] - pOrigPx[ch] + 128;
				diff = BOUND(diff);
				pDiffPx[ch] = diff;
			}

			/* Advance one whole 4-byte pixel, skipping the fourth byte. */
			pOrigPx += 4;
			pFilteredPx += 4;
			pDiffPx += 4;
		}
	}

	GaussianBlur(TPThreadPool,&sDiffBitmap,pDstBitmap,dSigma,4);

	TUtilsBitmapFree(&sDiffBitmap);
	return 0;
}




/*
 * Blends the detail layer held in pDstBitmap back onto pSrcBitmap, in place
 * in pDstBitmap.
 *
 * For the first three bytes of every pixel:
 *     mixed = clamp(Dst + Src - 128)
 *     Dst   = (alpha*mixed + (255-alpha)*Src) / 255      (rounded)
 * where alpha is the mask byte scaled by fAlpha and clamped to [0, 255].
 * When pMask is NULL the blend runs at full strength (alpha = 255) and fAlpha
 * is ignored.  The weights sum to 255, so the result is normalised by 255:
 * alpha 0 reproduces Src exactly and alpha 255 reproduces `mixed` exactly.
 *
 * pSrcBitmap : original image, `bitcount` bytes per pixel.
 * pDstBitmap : in/out image, `bitcount` bytes per pixel, same size as source.
 * pMask      : optional mask, one byte per pixel (may be NULL).
 * fAlpha     : global strength applied to the mask value.
 * bitcount   : byte distance between consecutive pixels in Src and Dst.
 *
 * Always returns 0.
 */
TRESULT Func_SkinSoft_FastBlend_WithMask(TBITMAP*pSrcBitmap,TBITMAP   *pDstBitmap,TBITMAP   *pMask, TFloat fAlpha,TInt32 bitcount)
{
	int height, width;
	int j, i, c;
	TByte *pSrc = TNull;
	TByte *pDst = TNull;
	TByte *pMsk = TNull;
	int alpha;
	int orig, mixed;

	height = pSrcBitmap->lHeight;
	width = pSrcBitmap->lWidth;

	for (j = 0; j < height; j++)
	{
		pSrc = pSrcBitmap->pPlane[0] + j*pSrcBitmap->lPitch[0];
		pDst = pDstBitmap->pPlane[0] + j*pDstBitmap->lPitch[0];
		pMsk = pMask ? (pMask->pPlane[0] + j*pMask->lPitch[0]) : TNull;

		for (i = 0; i < width; i++)
		{
			if (pMsk)
			{
				/* Scale in int and clamp so an fAlpha outside [0,1] cannot
				   wrap around when narrowed to a byte. */
				alpha = (int)((*pMsk) * fAlpha);
				if (alpha < 0)
					alpha = 0;
				else if (alpha > 255)
					alpha = 255;
				pMsk++;
			}
			else
			{
				alpha = 255;
			}

			for (c = 0; c < 3; c++)
			{
				orig = pSrc[c];
				mixed = pDst[c] + orig - 128;
				mixed = BOUND(mixed);
				pDst[c] = (TByte)((alpha*mixed + (255-alpha)*orig + 127)/255);
			}

			pDst = pDst + bitcount;
			pSrc = pSrc + bitcount;
		}
	}

	return 0;
}


#ifdef MUTTILD_THREAD
/*
 * Work item for one FastBlend thread-pool task.
 * QBB_FastBlend fills one of these per row band and passes its address as the
 * task argument; the worker forwards the fields to
 * Func_SkinSoft_FastBlend_WithMask.
 */
typedef struct tag_SkinSoft_FastBlend_data{
TBITMAP   sSrcBitmap; /* source bitmap band, filled by BitmapGetRectBmp */
TBITMAP   sDstBitmap; /* destination bitmap band, filled by BitmapGetRectBmp */
TBITMAP   sMask; /* mask band, filled by BitmapGetRectBmp */
TDouble   fAlpha; /* blend strength; stored as TDouble, passed on to a TFloat parameter */
TInt32 nBitCount; /* forwarded as the bitcount argument (QBB_FastBlend stores 4) */
TRESULT res; /* set to TOK when the task is queued; the worker does not update it */
}SkinSoft_FastBlend_data;


/*
 * Thread-pool task entry for FastBlend.
 * Interprets sttask->task_arg as a SkinSoft_FastBlend_data work item and runs
 * Func_SkinSoft_FastBlend_WithMask on the bitmaps it carries.  The callee's
 * return value is not recorded; the task always reports 0.
 */
static int mtThreadSkinSoft_FastBlendSttask_tFun(qbbsttask_t *sttask)
{
	SkinSoft_FastBlend_data *pJob = (SkinSoft_FastBlend_data *)sttask->task_arg;
	TBITMAP *pJobSrc = &pJob->sSrcBitmap;
	TBITMAP *pJobDst = &pJob->sDstBitmap;
	TBITMAP *pJobMask = &pJob->sMask;

	Func_SkinSoft_FastBlend_WithMask(pJobSrc, pJobDst, pJobMask, pJob->fAlpha, pJob->nBitCount);

	return 0;
}
#endif
/*
 * Blend step of the face-blur pipeline: combines the detail layer in
 * pDstBitmap with pSrcBitmap through Func_SkinSoft_FastBlend_WithMask
 * (per channel: clamp(Dst + Src - 128), mixed with Src by mask * fAlpha).
 *
 * When built with MUTTILD_THREAD and both a thread pool and a mask are given,
 * the image is cut into up to MUTTILD_TASK_NUM horizontal bands of an even
 * number of rows and each band is queued on the pool; the call returns after
 * all queued tasks have finished.  Without a pool, or without a mask, the
 * whole image is processed on the calling thread (the per-task data always
 * carries a mask bitmap, so the no-mask case cannot be expressed there).
 *
 * Returns the result code of the operation: TOK (as int) on success,
 * TERR_NO_MEMORY if the task array could not be allocated, or the failing
 * BitmapGetRectBmp code.  Bands queued before a failure are still completed.
 */
int QBB_FastBlend(THandle TPThreadPool,TBITMAP *pSrcBitmap , TBITMAP *pDstBitmap,TBITMAP *pMask,TFloat fAlpha)
{
TRESULT res = TOK;
#ifdef MUTTILD_THREAD
if(TPThreadPool && pMask)
{
	TInt32 nTaskNum = MUTTILD_TASK_NUM;
	TRECT rc;
	TInt32 nYStep = 0;
	TInt32 nYMax = 0;
	TInt32 i = 0;
	SkinSoft_FastBlend_data *pmtData = TNull;

	pmtData = (SkinSoft_FastBlend_data *)TMemAlloc(TNull, sizeof(SkinSoft_FastBlend_data)*MUTTILD_TASK_NUM);
	if(TNull == pmtData)
	{
		res = TERR_NO_MEMORY;
		goto EXIT_TASK;
	}
	TMemSet(pmtData, 0, sizeof(SkinSoft_FastBlend_data)*MUTTILD_TASK_NUM);

	rc.left = 0;
	rc.top = 0;
	rc.right = pSrcBitmap->lWidth;
	rc.bottom = rc.top;

	/* Rows per band, rounded up to an even count. */
	nYStep = (pSrcBitmap->lHeight+MUTTILD_TASK_NUM-1)/MUTTILD_TASK_NUM;
	if(nYStep & 0x1)
	{
		nYStep += 1;
	}
	nYMax = pSrcBitmap->lHeight;

	for (i=0; i<nTaskNum; i++)
	{
		rc.top = rc.bottom;
		rc.bottom += nYStep;
		rc.bottom = MIN(nYMax, rc.bottom);

		/* Rounding the step up can leave trailing bands empty: stop there
		   instead of queueing zero-row tasks. */
		if(rc.top >= rc.bottom)
			break;

		res = BitmapGetRectBmp(pSrcBitmap, rc, &pmtData[i].sSrcBitmap);
		if(TOK != res)
			goto EXIT_TASK;
		res = BitmapGetRectBmp(pDstBitmap, rc, &pmtData[i].sDstBitmap);
		if(TOK != res)
			goto EXIT_TASK;
		res = BitmapGetRectBmp(pMask, rc, &pmtData[i].sMask);
		if(TOK != res)
			goto EXIT_TASK;
		pmtData[i].nBitCount = 4;
		pmtData[i].fAlpha = fAlpha;
		pmtData[i].res = TOK;
		qbbtpool_addtask(TPThreadPool, mtThreadSkinSoft_FastBlendSttask_tFun ,(TVoid *)(&pmtData[i]));
	}

EXIT_TASK:
	/* Queued tasks point into pmtData, so wait before releasing it. */
	qbbtpool_task_waitall(TPThreadPool);
	if(pmtData)
	{
		TMemFree(TNull,pmtData);
		pmtData = TNull;
	}
}
else
{
	res = Func_SkinSoft_FastBlend_WithMask(pSrcBitmap,pDstBitmap,pMask,fAlpha, 4);
}
#else
res = Func_SkinSoft_FastBlend_WithMask(pSrcBitmap,pDstBitmap,pMask,fAlpha, 4);
#endif

return (int)res;
}


/*
 * Face smoothing pipeline.
 *
 * Intended formula (from the original author):
 *   Dest = (Src*(100-Opacity) + (Src + 2*GaussBlur(EPFFilter(Src) - Src + 128) - 256)*Opacity) / 100
 *
 * Steps performed here:
 *   1. copy Src into Dst and run the edge-preserving (bilateral) filter
 *      QBB_FastBilateralFilterBorder on it;
 *   2. QBB_FastGaussBlur: Gaussian-blur the (filtered - Src + 128) layer;
 *   3. QBB_FastBlend: blend that layer back onto Src through the mask.
 *
 * Parameters taken from skinsoftParam:
 *   lSigmaD  -> bilateral radius (original note: 10; range 5-10, controls the
 *               amount of smoothing)
 *   lSigmaSD -> Gaussian sigma   (original note: 1.0; range 0.5-2, controls
 *               the texture of the result)
 *   lSigmaSR -> mask alpha       (original note: 1.0; range 0-1, strength of
 *               the final blend with the original image)
 *
 * With _TEST_PERFORMANCE_ defined, the elapsed time since entry is logged
 * after each step.  The return values of the steps are not inspected; the
 * function always returns 0.
 */
int QBB_FaceBlur(THandle TPThreadPool,TBITMAP *pSrcBitmap , TBITMAP *pDstBitmap,AMSKINSOFT_BIL *skinsoftParam,TBITMAP *pMask)
{
#ifdef _TEST_PERFORMANCE_
TDWord dwtime = TGetCurTimeStamp();
#endif
TInt32 iBilRadius = skinsoftParam->lSigmaD; //10
TDouble dSigmaSR =  skinsoftParam->lSigmaSD; //1.0
TDouble fMaskAlpha = skinsoftParam->lSigmaSR; //1.0

/* Step 1: edge-preserving filter, result in pDstBitmap. */
BitmapCopy(pSrcBitmap,pDstBitmap);
QBB_FastBilateralFilterBorder(TPThreadPool,pSrcBitmap ,pDstBitmap,pMask, iBilRadius,0,0,0);
#ifdef _TEST_PERFORMANCE_
/* The elapsed value is cast so that it matches the %d conversion. */
LOGI("QBB_FaceBlur1 dwtime=%d\r\n",(int)(TGetCurTimeStamp()-dwtime));
#endif

/* Step 2: Gaussian blur of the difference layer. */
QBB_FastGaussBlur(TPThreadPool,pSrcBitmap ,pDstBitmap,dSigmaSR);
#ifdef _TEST_PERFORMANCE_
LOGI("QBB_FaceBlurr2 dwtime=%d\r\n",(int)(TGetCurTimeStamp()-dwtime));
#endif

/* Step 3: blend with the original image through the mask. */
QBB_FastBlend(TPThreadPool,pSrcBitmap , pDstBitmap,pMask,fMaskAlpha);
#ifdef _TEST_PERFORMANCE_
LOGI("QBB_FaceBlur3 dwtime=%d\r\n",(int)(TGetCurTimeStamp()-dwtime));
#endif
return 0;
}




/*
 * In-place stack blur of an interleaved 8-bit image.
 *
 * pix    : image data, w*h pixels of `comp` bytes each, rows stored
 *          contiguously (row stride = w*comp).  Only the first three bytes of
 *          every pixel are blurred; any further bytes are left untouched.
 * w, h   : image size in pixels.
 * comp   : bytes per pixel (must be at least 3).
 * radius : blur radius in pixels (must be at least 1).
 *
 * The blur is separable: a horizontal pass writes three planar scratch
 * channels, then a vertical pass writes the result back into pix.  Each pass
 * keeps a sliding "stack" of 2*radius+1 samples whose weights form a triangle
 * (1..radius+1..1); samples beyond the image edge repeat the edge pixel.
 *
 * The call is a no-op when pix is NULL, the image is empty, comp < 3,
 * radius < 1, or a scratch buffer cannot be allocated.
 *
 * All coordinates are kept in signed ints on purpose: the window offset i is
 * negative for half of the kernel, and comparing it against an unsigned
 * bound would convert it to a huge positive value.
 */
void fastStackBlur(unsigned char* pix, unsigned int w, unsigned int h, unsigned int comp, int radius) {
	int width, height, wm, hm, imageSize;
	int div, divsum, r1;
	unsigned char *rgb = 0;
	unsigned char *r, *g, *b;
	int *vmin = 0;
	int *dv = 0;
	int (*stack)[3] = 0;
	int rsum, gsum, bsum, x, y, i, p, yp, yi, yw;
	int stackpointer, stackstart;
	int *sir;
	int rbs;
	int routsum, goutsum, boutsum;
	int rinsum, ginsum, binsum;
	int clamped;
	const unsigned char *px;
	unsigned char *out;

	if (!pix || w == 0 || h == 0 || comp < 3 || radius < 1) {
		return;
	}

	width = (int)w;
	height = (int)h;
	wm = width - 1;
	hm = height - 1;
	imageSize = width * height;
	div = radius + radius + 1;
	r1 = radius + 1;

	/* Sum of the triangular weights: (radius + 1)^2. */
	divsum = (div + 1) >> 1;
	divsum *= divsum;

	rgb = (unsigned char *)malloc((size_t)imageSize * 3);
	vmin = (int *)malloc((size_t)(width > height ? width : height) * sizeof(int));
	dv = (int *)malloc((size_t)256 * (size_t)divsum * sizeof(int));
	stack = (int (*)[3])malloc((size_t)div * sizeof(*stack));
	if (!rgb || !vmin || !dv || !stack) {
		free(rgb);
		free(vmin);
		free(dv);
		free(stack);
		return;
	}

	r = rgb;
	g = rgb + imageSize;
	b = rgb + imageSize * 2;

	/* Division lookup: dv[s] == s / divsum for every possible weighted sum. */
	for (i = 0; i < 256 * divsum; i++) {
		dv[i] = (i / divsum);
	}

	yw = yi = 0;

	/* ---- Horizontal pass: pix -> r/g/b planes ---- */
	for (y = 0; y < height; y++) {
		rinsum = ginsum = binsum = routsum = goutsum = boutsum = rsum = gsum = bsum = 0;

		/* Prime the stack with the window centred on x = 0 (edge-clamped). */
		for (i = -radius; i <= radius; i++) {
			clamped = i < 0 ? 0 : (i > wm ? wm : i);
			p = yi + clamped;
			px = pix + (size_t)p * comp;
			sir = stack[i + radius];
			sir[0] = px[0];
			sir[1] = px[1];
			sir[2] = px[2];

			rbs = r1 - abs(i);
			rsum += sir[0] * rbs;
			gsum += sir[1] * rbs;
			bsum += sir[2] * rbs;
			if (i > 0) {
				rinsum += sir[0];
				ginsum += sir[1];
				binsum += sir[2];
			}
			else {
				routsum += sir[0];
				goutsum += sir[1];
				boutsum += sir[2];
			}
		}
		stackpointer = radius;

		for (x = 0; x < width; x++) {
			r[yi] = (unsigned char)dv[rsum];
			g[yi] = (unsigned char)dv[gsum];
			b[yi] = (unsigned char)dv[bsum];

			rsum -= routsum;
			gsum -= goutsum;
			bsum -= boutsum;

			/* Slot holding the oldest sample, about to be replaced. */
			stackstart = stackpointer - radius + div;
			sir = stack[stackstart % div];

			routsum -= sir[0];
			goutsum -= sir[1];
			boutsum -= sir[2];

			/* Column entering the window; identical for every row, so it is
			   computed once on the first row. */
			if (y == 0) {
				vmin[x] = (x + radius + 1 < wm) ? (x + radius + 1) : wm;
			}
			p = yw + vmin[x];
			px = pix + (size_t)p * comp;

			sir[0] = px[0];
			sir[1] = px[1];
			sir[2] = px[2];
			rinsum += sir[0];
			ginsum += sir[1];
			binsum += sir[2];

			rsum += rinsum;
			gsum += ginsum;
			bsum += binsum;

			stackpointer = (stackpointer + 1) % div;
			sir = stack[stackpointer];

			routsum += sir[0];
			goutsum += sir[1];
			boutsum += sir[2];

			rinsum -= sir[0];
			ginsum -= sir[1];
			binsum -= sir[2];

			yi++;
		}
		yw += width;
	}

	/* ---- Vertical pass: r/g/b planes -> pix ---- */
	for (x = 0; x < width; x++) {
		rinsum = ginsum = binsum = routsum = goutsum = boutsum = rsum = gsum = bsum = 0;

		/* Prime the stack with the window centred on y = 0.  yp walks from
		   row -radius downwards; rows above the image clamp to row 0 and the
		   walk stops at the last row. */
		yp = -radius * width;
		for (i = -radius; i <= radius; i++) {
			yi = (yp > 0 ? yp : 0) + x;

			sir = stack[i + radius];
			sir[0] = r[yi];
			sir[1] = g[yi];
			sir[2] = b[yi];

			rbs = r1 - abs(i);
			rsum += r[yi] * rbs;
			gsum += g[yi] * rbs;
			bsum += b[yi] * rbs;

			if (i > 0) {
				rinsum += sir[0];
				ginsum += sir[1];
				binsum += sir[2];
			}
			else {
				routsum += sir[0];
				goutsum += sir[1];
				boutsum += sir[2];
			}

			if (i < hm) {
				yp += width;
			}
		}
		yi = x;
		stackpointer = radius;

		for (y = 0; y < height; y++) {
			out = pix + (size_t)yi * comp;
			out[0] = (unsigned char)dv[rsum];
			out[1] = (unsigned char)dv[gsum];
			out[2] = (unsigned char)dv[bsum];

			rsum -= routsum;
			gsum -= goutsum;
			bsum -= boutsum;

			stackstart = stackpointer - radius + div;
			sir = stack[stackstart % div];

			routsum -= sir[0];
			goutsum -= sir[1];
			boutsum -= sir[2];

			/* Row offset entering the window; identical for every column. */
			if (x == 0) {
				vmin[y] = ((y + r1 < hm) ? (y + r1) : hm) * width;
			}
			p = x + vmin[y];

			sir[0] = r[p];
			sir[1] = g[p];
			sir[2] = b[p];

			rinsum += sir[0];
			ginsum += sir[1];
			binsum += sir[2];

			rsum += rinsum;
			gsum += ginsum;
			bsum += binsum;

			stackpointer = (stackpointer + 1) % div;
			sir = stack[stackpointer];

			routsum += sir[0];
			goutsum += sir[1];
			boutsum += sir[2];

			rinsum -= sir[0];
			ginsum -= sir[1];
			binsum -= sir[2];

			yi += width;
		}
	}

	free(rgb);
	free(vmin);
	free(dv);
	free(stack);
}



最后

以上就是外向电源为你收集整理的Fast gauss blur http://blog.ivank.net/fastest-gaussian-blur.html Fastest Gaussian Blur (in linear time)的全部内容,希望文章能够帮你解决Fast gauss blur http://blog.ivank.net/fastest-gaussian-blur.html Fastest Gaussian Blur (in linear time)所遇到的程序开发问题。

如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。

本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
点赞(44)

评论列表共有 0 条评论

立即
投稿
返回
顶部