Download source - 746.44 KB

Introduction

This article is to present you with the way to do Alphablending with CUDA™. What's CUDA? In brief, it is a parallel computing architecture developed by NVIDIA® which is the computing engine in NVIDIA graphics processing units that is accessible to software developers through programming languages.

Background

With a large image, alphablending needs much more CPU time to process. Learning from NVIDIA, CUDA can have image processing performance sped up. Well, you need to give it a try and see how much the difference is between them. This alphablending code is tested on Windows 7 with NVIDIA GeForce G210M.

Alphablending in a Conventional Way

AlphaBlending_Host() is the conventional routine I'm using to do alphablending. I reckon that it's already the fastest way and also has a good performance.

Parameters

PULONG pulFore [in]: Foreground image buffer. Foreground image is, in general, over the background image.
PULONG pulBack [in]: Background image buffer.
PULONG pulResult [out]: The blended image buffer. This image buffer has to be allocated prior to being passed in.
DWORD dwSize [in]: Image dimension size.

void   AlphaBlending_Host(PULONG pulFore , PULONG pulBack, PULONG pulResult, DWORD dwSize)
{
    ULONG   ulResult = 0L ;
    ULONG   ulAlpha = 0L ;
    ULONG   ulVal = 0L ;
    ULONG   ulFore = 0L ;
    ULONG   ulBack = 0L ;

    if ((NULL == pulFore) || (NULL == pulBack) || (NULL == pulResult) || (0L == dwSize))
    {
        return;
    }

    _asm
    {
    FromBeginning:
    //--- get foreground pixel and move buffer forward by 1 ---
    MOV     EAX,pulFore
    MOV     ECX,DWORD PTR [EAX]
    MOV     ulFore,ECX

    MOV     EDX,pulFore
    ADD     EDX,4
    MOV     pulFore,EDX

    //--- get background pixel and move buffer forward by 1 ---
    MOV     EAX,pulBack
    MOV     ECX,DWORD PTR [EAX]
    MOV     ulBack,ECX

    MOV     EDX,pulBack
    ADD     EDX,4
    MOV     pulBack,EDX

    // blend foreground color (F) to a background color (B),
    // using alpha channel value of F
    // Result Z = (Fa * Frgb) + ((1 - Fa) * Brgb)
    // EAX <- Foreground
    // EDX <- Background
    MOV     EAX,ulFore
    MOV     EDX,ulBack
            
    // Test Fa = 255 ? operation of subtraction
    CMP     EAX,0xFF000000      // Fa = 255 ? => Result = EAX
    JNC     ReturnForePixel
    // Test Fa = 0 ? operation of and
    TEST    EAX,0xFF000000      // Fa = 0 ?   => Result = EDX
    JZ      ReturnBackPixel

    JMP     EntryProcess

    ReturnForePixel:
    MOV     ulVal,EAX
    JMP     ProcessFinished

    ReturnBackPixel:
    MOV     ulVal,EDX
    JMP     ProcessFinished

    //--- entry ---
    EntryProcess:

    // Get weight W = Fa * M
    MOV     ECX,EAX             // ECX  <-  Fa Fr Fg Fb
    SHR     ECX,24              // ECX  <-  00 00 00 Fa

    PUSH    EBX

    // P = W * F
    MOV     EBX,EAX             // EBX  <-  Fa Fr Fg Fb
    AND     EAX,0x00FF00FF      // EAX  <-  00 Fr 00 Fb
    AND     EBX,0xFF00FF00      // EBX  <-  Fa 00 Fg 00
    IMUL    EAX,ECX             // EAX  <-  Pr ** Pb **
    SHR     EBX,8               // EBX  <-  00 Fa 00 Fg
    IMUL    EBX,ECX             // EBX  <-  Pa ** Pg **
    ADD     EAX,0x00800080
    AND     EAX,0xFF00FF00      // EAX  <-  Pr 00 Pb 00
    SHR     EAX,8               // EAX  <-  00 Pr ** Pb
    ADD     EBX,0x00800080
    AND     EBX,0xFF00FF00      // EBX  <-  Pa 00 Pg 00
    OR      EAX,EBX             // EAX  <-  Pa Pr Pg Pb

    // W = (1 - W) ; Q = W * B
    XOR     ECX,0x000000FF      // ECX  <-  1 - ECX

    MOV     EBX,EDX             // EBX  <-  Ba Br Bg Bb
    AND     EDX,0x00FF00FF      // EDX  <-  00 Br 00 Bb
    AND     EBX,0xFF00FF00      // EBX  <-  Ba 00 Bg 00
    IMUL    EDX,ECX             // EDX  <-  Qr ** Qb **
    SHR     EBX,8               // EBX  <-  00 Ba 00 Bg
    IMUL    EBX,ECX             // EBX  <-  Qa ** Qg **
    ADD     EDX,0x00800080
    AND     EDX,0xFF00FF00      // EDX  <-  Qr 00 Qb 00
    SHR     EDX,8               // EDX  <-  00 Qr ** Qb
    ADD     EBX,0x00800080
    AND     EBX,0xFF00FF00      // EBX  <-  Qa 00 Qg 00
    OR      EBX,EDX             // EBX  <-  Qa Qr Qg Qb
    // Z = P + Q (assuming no overflow at each byte)
    ADD     EAX,EBX             // EAX  <-  Za Zr Zg Zb

    POP     EBX

    MOV     ulVal,EAX           // new blended RGB color

    ProcessFinished:
    //--- saved to result buffer and move buffer forward by 1 ---
    MOV     EAX,pulResult
    MOV     ECX,ulVal
    MOV     DWORD PTR [EAX],ECX

    MOV     EDX,pulResult
    ADD     EDX,4
    MOV     pulResult,EDX

    //--- check next pixel until the final one ---
    DEC     dwSize
    JNZ     FromBeginning
    }
}

In this case, the processing time is 0.006027 (ms) on CPU.

Alphablending with CUDA

DilutePixel() is to blend pixel with specific alpha channel value and then return the blended pixel. Notice that this function has a __device__ keyword prefixed. It's called by CUDA __global__ function.

Parameters

unsigned long ulPixel [in]: Source device pixel.
unsigned long ulAlpha [in]: The alpha channel value which is used to blend pixel with.

The function returns the blended pixel.

__device__ 
unsigned long DilutePixel(unsigned long ulPixel, unsigned long ulAlpha)
{
    unsigned long nResult = 0;
    
    nResult   = ulPixel;
    ulPixel  &= 0x00ff00ff;
    nResult  &= 0xff00ff00;
    ulPixel  *= ulAlpha;
    nResult >>= 8;
    nResult  *= ulAlpha;
    ulPixel  += 0x00800080;
    ulPixel  &= 0xff00ff00;
    ulPixel >>= 8;
    nResult  += 0x00800080;
    nResult  &= 0xff00ff00;
    nResult  |= ulPixel;

    return( nResult );
}

AlphaBlending_Texture() which is a task thread routine does alphablending with every single pixel of foreground and background image buffers. Notice that this function has a __global__ keyword prefixed.

Parameters

unsigned long* pResult [out]: The blended device image buffer.
unsigned nSize [in]: The image dimension size.

__global__ 
void AlphaBlending_Texture(unsigned long* pResult, unsigned nSize)
{
    unsigned nIndex = (__umul24(blockIdx.x, blockDim.x) + threadIdx.x);
    unsigned long ulPixelF = 0L;
    unsigned long ulPixelB = 0L;
    unsigned long ulAlphaF = 0L;

    if (nIndex >= nSize)
    {
        return;
    }
    ulPixelF = tex1Dfetch(texForegnd, nIndex);
    ulPixelB = tex1Dfetch(texBackgnd, nIndex);
    ulAlphaF = (ulPixelF >> 24L);

    if (ulAlphaF == 0xffL)
    {
        *(pResult + nIndex) = ulPixelF;
    }
    else if (ulAlphaF == 0L)
    {
        *(pResult + nIndex) = ulPixelB;
    }
    else
    {
        ulPixelF = DilutePixel(ulPixelF, ulAlphaF);
        ulPixelB = DilutePixel(ulPixelB, (0xffL ^ ulAlphaF));

        *(pResult + nIndex) = (ulPixelF + ulPixelB);
    }
}

AlphaBlending_Device() is the entry process to handle alphablending of two images.

Parameters

unsigned long* pMemA [in]: Host foreground image buffer.
unsigned long* pMemB [in]: Host background image buffer.
unsigned long* pResult [out]: The host blended image buffer.
unsigned nWidth [in]: Image width.
unsigned nHeight [in]: Image height.

extern "C"
void AlphaBlending_Device(unsigned long* pMemA, unsigned long* pMemB, 
			unsigned long* pMemResult,
                          unsigned nWidth, unsigned nHeight)
{
    unsigned nDimen = (nWidth * nHeight);
    unsigned nSize = (nDimen << 2);
    unsigned char *pDevA = NULL, *pDevB = NULL, *pDevResult = NULL;

    cudaMalloc((void**)&pDevA, nSize);
    cudaMalloc((void**)&pDevB, nSize);

    if (false == gm_bMapHostMemory)
    {
        cudaMalloc((void**)&pDevResult, nSize);
    }
    else
    {
        cudaHostGetDevicePointer((void**)&pDevResult, (void*)pMemResult, 0);
    }

    cudaMemcpy(pDevA, pMemA, nSize, cudaMemcpyHostToDevice);
    cudaMemcpy(pDevB, pMemB, nSize, cudaMemcpyHostToDevice);

    cudaBindTexture(0, texForegnd, pDevA);
    cudaBindTexture(0, texBackgnd, pDevB);

    AlphaBlending_Texture<<<::ceil((float)nDimen / 
	(float)BLOCK_DIM), BLOCK_DIM>>>((unsigned long*)pDevResult, nDimen);

    if (false == gm_bMapHostMemory)
    {
        cudaMemcpy(pMemResult, pDevResult, nSize, cudaMemcpyDeviceToHost);
    }

    cudaUnbindTexture( texForegnd );
    cudaUnbindTexture( texBackgnd );

    cudaFree( pDevA );
    cudaFree( pDevB );
    if (false == gm_bMapHostMemory)
    {
        cudaFree( pDevResult );
    }
}

In this case, the processing time is only 0.000067 (ms) on GPU.

Points of Interest

The result is awesome. The processing time with CUDA is nearly 100x faster than the conventional way. I would like to try another way; OpenCL, which is also a parallel computing language and see how much the further the difference is between them.

Revisions

Version: 1.0 Initial release