Resizing NV12 image using Nearest Neighbor Interpolation and Bilinear Interpolation algorithms

risunch

4.17/5 (3 votes)

Dec 9, 2013

CPOL

2 min read

43646

NV12 Nearest Neighbor Interpolation scaling and Bilinear Interpolation scaling.

Introduction

This article presents an optimized NV12 image scaling program.

There are many kinds of image scaling algorithms. The more complex an algorithm is, the better the image quality it usually preserves, but the lower its performance. I decided to use the two simplest ones, nearest neighbor interpolation and bilinear interpolation, to resize NV12 images.

Background

NV12 is one of the YUV family of formats. Before you read this tip, you need a basic understanding of the format and of what interpolation scaling algorithms are.

If you have tried scaling RGBA images before, it will be easier for you to understand how my program works.

NV12 format

In memory, an NV12 image array looks like: YYYYYYYY... UVUV... NV12 is a semi-planar format, also called YUV420sp. A full-resolution Y plane is followed by a second plane that holds the subsampled U and V samples:

The length of Y plane in memory is 'width * height'.
The length of U or V plane in memory is 'width * height / 4'.
U and V is interleaved.
The Y plane alone is the grey-scale image if the U and V samples are discarded.

So 'width * height * 3 / 2' is the total memory length of the image. Here is a clearer sample at 8*4 resolution:

Logical view:

Obviously, width = 8, height = 4

ylen = 8*4, ulen = 8*4/4, vlen= 8*4/4.
total_length = ylen + ulen + vlen = ylen * 3 / 2

and every four Y values share the same U value and V value.

For example:

Y00 Y01 Y10 Y11 share U00 and V00
Y20 Y21 Y30 Y31 share U10 and V10

Algorithms

Nearest interpolation

srcX = dstX * (srcWidth / dstWidth), srcY = dstY * (srcHeight / dstHeight)

The ratio usually has a fractional part. This algorithm simply discards it (the code truncates the fixed-point source coordinate) and stores the corresponding pixel value from the source image in the destination image array. The result is therefore not great and usually shows a noticeable mosaic (blocky) effect.

Bilinear interpolation

Bilinear interpolation uses both the integer part and the fractional part of the source coordinate to calculate the final pixel value from four neighboring pixels. The fractional part is used as the weight. This removes sharp edges and the mosaic effect.

input: src_nv12_array, src_width, src_height,dest_width,dest_height
output: dst_nv12_array

Code

This is a C version optimized program.

Restrict and register keyword
Use shift operations (fixed-point arithmetic) instead of floating-point division
Move irrelevant code out of inner loop
Prefer logical (bitwise) operations over arithmetic operations inside loops

#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/stat.h>

/* Byte type used for the raw NV12 sample arrays in this file.
 * NOTE(review): <stdint.h> also provides uint8_t; confirm whether this local
 * typedef is still wanted before including that header here. */
typedef unsigned char uint8_t;
 
/**
 * Scale an NV12 image with nearest neighbor sampling.
 *
 * Every destination luma sample is copied from the source sample whose
 * coordinate is the truncated 16.16 fixed-point product of the destination
 * coordinate and the source/destination size ratio. One interleaved U/V pair
 * is copied for every 2x2 block of destination luma samples.
 *
 * @param src input nv12 raw data array (Y plane followed by interleaved UV)
 * @param dst output nv12 raw data result,
 * the memory needs to be allocated outside of the function
 * (dstWidth * dstHeight * 3 / 2 bytes) and must not overlap src
 * @param srcWidth width of the input nv12 image
 * @param srcHeight height of the input nv12 image
 * @param dstWidth width of the output nv12 image
 * @param dstHeight height of the output nv12 image
 *
 * The function returns without writing anything when a pointer is NULL or a
 * dimension is not positive. With odd dimensions only the complete 2x2
 * chroma blocks are read and written, so neither buffer is accessed beyond
 * width * height * 3 / 2 bytes. When the source has no complete chroma block
 * the destination chroma bytes are left untouched.
 */
void nv12_nearest_scale(uint8_t* __restrict src, uint8_t* __restrict dst,
                        int srcWidth, int srcHeight, int dstWidth, int dstHeight)
{
    if (src == NULL || dst == NULL ||
        srcWidth <= 0 || srcHeight <= 0 || dstWidth <= 0 || dstHeight <= 0)
    {
        return;
    }

    const size_t sw = (size_t)srcWidth;
    const size_t sh = (size_t)srcHeight;
    const size_t dw = (size_t)dstWidth;
    const size_t dh = (size_t)dstHeight;

    /* 16.16 fixed-point source step per destination pixel. The shift is done
     * in a 64-bit type so that large widths cannot overflow; the "+ 1"
     * compensates for the truncation of the integer division. */
    const unsigned long long xStep = ((unsigned long long)sw << 16) / dw + 1;
    const unsigned long long yStep = ((unsigned long long)sh << 16) / dh + 1;

    /* Number of complete chroma rows / U-V pairs available in each image. */
    const size_t srcUvRows = sh / 2;
    const size_t srcUvPairs = sw / 2;
    const size_t dstUvRows = dh / 2;
    const int srcHasChroma = (srcUvRows > 0 && srcUvPairs > 0);

    const uint8_t *srcUv = src + sh * sw; /* start of the source UV plane */
    uint8_t *dstUv = dst + dh * dw;       /* start of the dest UV plane */

    for (size_t y = 0; y < dh; ++y)
    {
        size_t srcy = (size_t)((y * yStep) >> 16);
        if (srcy >= sh)
        {
            srcy = sh - 1; /* the "+ 1" bias can overshoot on large upscales */
        }

        const uint8_t *srcRow = src + srcy * sw;
        uint8_t *dstRow = dst + y * dw;

        /* Chroma is emitted once per pair of luma rows, on the even row. */
        const uint8_t *srcUvRow = NULL;
        uint8_t *dstUvRow = NULL;
        if ((y & 1) == 0 && (y / 2) < dstUvRows && srcHasChroma)
        {
            size_t uvRow = srcy / 2;
            if (uvRow >= srcUvRows)
            {
                uvRow = srcUvRows - 1;
            }
            srcUvRow = srcUv + uvRow * sw;
            dstUvRow = dstUv + (y / 2) * dw;
        }

        for (size_t x = 0; x < dw; ++x)
        {
            size_t srcx = (size_t)((x * xStep) >> 16);
            if (srcx >= sw)
            {
                srcx = sw - 1;
            }
            dstRow[x] = srcRow[srcx];

            /* One U/V pair per even column; skip a trailing half pair. */
            if (dstUvRow != NULL && (x & 1) == 0 && x + 1 < dw)
            {
                size_t pair = srcx / 2;
                if (pair >= srcUvPairs)
                {
                    pair = srcUvPairs - 1;
                }
                dstUvRow[x] = srcUvRow[2 * pair];         /* U */
                dstUvRow[x + 1] = srcUvRow[2 * pair + 1]; /* V */
            }
        }
    }
}

/* Blend four neighboring samples with 8-bit fractional weights.
 * p00 is the top-left sample, p10 its right neighbor, p01 the sample below
 * and p11 the diagonal one; fx and fy are in [0, 255]. The four weights add
 * up to 0x10000, so the shifted result always fits in a byte. */
static uint8_t nv12_blend4(int p00, int p10, int p01, int p11, int fx, int fy)
{
    const int wx = 0x100 - fx;
    const int wy = 0x100 - fy;
    const int sum = wx * wy * p00
                  + fx * wy * p10
                  + wx * fy * p01
                  + fx * fy * p11;
    return (uint8_t)(sum >> 16);
}

/* Bilinearly scale one plane holding `channels` interleaved samples per
 * pixel. Sizes are given in pixels, strides in bytes. Neighbors that would
 * fall outside the plane are clamped to the last column/row. Does nothing
 * when any size is not positive. */
static void nv12_bilinear_plane(const uint8_t *src, int srcCols, int srcRows,
                                size_t srcStride,
                                uint8_t *dst, int dstCols, int dstRows,
                                size_t dstStride, int channels)
{
    if (srcCols <= 0 || srcRows <= 0 || dstCols <= 0 || dstRows <= 0)
    {
        return;
    }

    /* 24.8 fixed-point source step per destination pixel. Because the
     * division truncates, (dst - 1) * ratio is always below src << 8, so the
     * integer source coordinate never exceeds the last column/row. */
    const long long xratio = ((long long)srcCols << 8) / dstCols;
    const long long yratio = ((long long)srcRows << 8) / dstRows;

    for (int j = 0; j < dstRows; ++j)
    {
        const long long posy = (long long)j * yratio;
        const int oy = (int)(posy >> 8);
        const int fy = (int)(posy & 0xFF);
        const int oyNext = (oy + 1 < srcRows) ? oy + 1 : oy;

        const uint8_t *rowTop = src + (size_t)oy * srcStride;
        const uint8_t *rowBottom = src + (size_t)oyNext * srcStride;
        uint8_t *out = dst + (size_t)j * dstStride;

        for (int i = 0; i < dstCols; ++i)
        {
            const long long posx = (long long)i * xratio;
            const int ox = (int)(posx >> 8);
            const int fx = (int)(posx & 0xFF);
            const int oxNext = (ox + 1 < srcCols) ? ox + 1 : ox;

            for (int c = 0; c < channels; ++c)
            {
                const size_t left = (size_t)ox * (size_t)channels + (size_t)c;
                const size_t right = (size_t)oxNext * (size_t)channels + (size_t)c;

                out[(size_t)i * (size_t)channels + (size_t)c] =
                    nv12_blend4(rowTop[left], rowTop[right],
                                rowBottom[left], rowBottom[right], fx, fy);
            }
        }
    }
}

/**
 * Scale an NV12 image with bilinear interpolation.
 *
 * The Y plane is interpolated at full resolution. The interleaved UV plane
 * is interpolated at half resolution, with U and V handled as two separate
 * channels so that they are never mixed with each other.
 *
 * @param src input nv12 raw data array (Y plane followed by interleaved UV)
 * @param dst output nv12 raw data result, allocated by the caller with
 * dstWidth * dstHeight * 3 / 2 bytes
 * @param srcWidth width of the input nv12 image
 * @param srcHeight height of the input nv12 image
 * @param dstWidth width of the output nv12 image
 * @param dstHeight height of the output nv12 image
 *
 * The function returns without writing anything when a pointer is NULL or a
 * dimension is not positive. With odd dimensions only complete 2x2 chroma
 * blocks are read and written; when either image has no complete chroma
 * block the destination chroma bytes are left untouched.
 */
void nv12_bilinear_scale (uint8_t* src, uint8_t* dst,
        int srcWidth, int srcHeight, int dstWidth,int dstHeight)
{
    if (src == NULL || dst == NULL ||
        srcWidth <= 0 || srcHeight <= 0 || dstWidth <= 0 || dstHeight <= 0)
    {
        return;
    }

    const size_t srcLumaSize = (size_t)srcWidth * (size_t)srcHeight;
    const size_t dstLumaSize = (size_t)dstWidth * (size_t)dstHeight;

    /* Y plane: one sample per pixel, stride equals the image width. */
    nv12_bilinear_plane(src, srcWidth, srcHeight, (size_t)srcWidth,
                        dst, dstWidth, dstHeight, (size_t)dstWidth, 1);

    /* UV plane: half as many rows, half as many U/V pairs per row, but each
     * row still occupies `width` bytes because U and V are interleaved. */
    nv12_bilinear_plane(src + srcLumaSize, srcWidth / 2, srcHeight / 2,
                        (size_t)srcWidth,
                        dst + dstLumaSize, dstWidth / 2, dstHeight / 2,
                        (size_t)dstWidth, 2);
}

/**
 * Validate the arguments and resize an NV12 image.
 *
 * @param src input nv12 raw data array
 * @param dst output nv12 raw data array, allocated by the caller
 * @param sw width of the input image
 * @param sh height of the input image
 * @param dw width of the output image
 * @param dh height of the output image
 * @return 0 on success, -1 (after printing "params error") when a pointer is
 * NULL or any dimension is zero or negative
 */
int ImageResize(uint8_t * src, uint8_t* dst, int sw,
        int sh,int dw,int dh)
{
    /* Negative sizes are rejected as well as zero: they would otherwise be
     * used in pointer arithmetic and as divisors by the scaler. */
    if( (src == NULL) || (dst == NULL) || (dw <= 0) || (dh <= 0) ||
            (sw <= 0) || (sh <= 0))
    {
        printf("params error\n");
        return -1;
    }

    /* Nearest neighbor is the active algorithm; call
     * nv12_bilinear_scale(src, dst, sw, sh, dw, dh) here instead for
     * smoother output. */
    nv12_nearest_scale(src, dst, sw, sh, dw, dh);
    return 0;
}

/* Largest width or height accepted on the command line. With this bound
 * width * height * 3 / 2 stays far below the range of int and size_t. */
enum { NV12_MAX_DIMENSION = 16384 };

/* Number of resize calls timed by the benchmark loop. */
enum { NV12_BENCH_ITERATIONS = 1000 };

/* Parse a strictly positive decimal image dimension.
 * Returns 0 and stores the value on success; returns -1 when the text is not
 * a complete number, is not positive, or exceeds NV12_MAX_DIMENSION. */
static int parse_dimension(const char *text, int *value)
{
    char *end = NULL;
    long parsed;

    errno = 0;
    parsed = strtol(text, &end, 10);
    if (errno != 0 || end == text || *end != '\0' ||
        parsed <= 0 || parsed > NV12_MAX_DIMENSION)
    {
        return -1;
    }

    *value = (int)parsed;
    return 0;
}

/**
 * Command-line driver: read a raw NV12 frame, resize it, time repeated
 * resizes of the same frame and write the resized frame to the output file.
 *
 * Arguments: <Input NV12file> <Output NV12file> <sw> <sh> <dw> <dh>
 *
 * @return 0 on success, -1 on a usage, parameter, memory or I/O error
 */
int main(int argc,char**argv)
{
    FILE *inputfp = NULL;
    FILE *outputfp = NULL;
    uint8_t *pInBuffer = NULL;
    uint8_t *pOutBuffer = NULL;
    size_t inBytes = 0;
    size_t outBytes = 0;
    int sw = 0;
    int sh = 0;
    int dw = 0;
    int dh = 0;
    int i;
    int rc = -1;
    clock_t start;
    clock_t finish;
    float duration;

    if(argc!=7)
    {
        printf("Input Error!\n");
        printf("Usage :  <Input NV12file> <Output NV12file> <sw> <sh> <dw> <dh>\n");
        return -1;
    }

    /* Validate the sizes before touching any file so that a bad command
     * line does not truncate an existing output file. */
    if(parse_dimension(argv[3], &sw) != 0 || parse_dimension(argv[4], &sh) != 0 ||
       parse_dimension(argv[5], &dw) != 0 || parse_dimension(argv[6], &dh) != 0)
    {
        fprintf(stderr, "parameter error [sw= %s,sh= %s,dw= %s,dh= %s]\n",
                argv[3], argv[4], argv[5], argv[6]);
        return -1;
    }

    /* An NV12 frame occupies width * height * 3 / 2 bytes. */
    inBytes = (size_t)sw * (size_t)sh * 3 / 2;
    outBytes = (size_t)dw * (size_t)dh * 3 / 2;

    inputfp = fopen(argv[1], "rb");
    if (!inputfp)
    {
        fprintf(stderr, "fopen failed for input file[%s]\n",argv[1]);
        goto cleanup;
    }

    outputfp = fopen(argv[2], "wb");
    if (!outputfp)
    {
        fprintf(stderr, "fopen failed for output file[%s]\n",argv[2]);
        goto cleanup;
    }

    pInBuffer = malloc(inBytes);
    /* Zero-filled so that bytes the scaler does not write are defined. */
    pOutBuffer = calloc(outBytes, 1);
    if (!pInBuffer || !pOutBuffer)
    {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }

    if (fread(pInBuffer, 1, inBytes, inputfp) != inBytes)
    {
        fprintf(stderr, "input file[%s] is shorter than one %d*%d NV12 frame\n",
                argv[1], sw, sh);
        goto cleanup;
    }

    if (ImageResize(pInBuffer, pOutBuffer, sw, sh, dw, dh) != 0)
    {
        goto cleanup;
    }

    /* Compute frames per second. The benchmark uses the same sizes the
     * buffers were allocated for, so it stays inside both buffers and leaves
     * the real result in pOutBuffer. */
    start = clock();
    for(i = 0; i < NV12_BENCH_ITERATIONS; ++i)
    {
        ImageResize(pInBuffer, pOutBuffer, sw, sh, dw, dh);
    }
    finish = clock();

    duration = (float)(finish-start)/CLOCKS_PER_SEC;
    if (duration > 0.0f)
    {
        float fps = NV12_BENCH_ITERATIONS / duration;
        printf("nv12Scaling:%d*%d-->%d*%d,time cost:%6.2ffps\n",sw,sh,dw,dh,fps);
    }
    else
    {
        /* The clock did not advance; avoid dividing by zero. */
        printf("nv12Scaling:%d*%d-->%d*%d,time cost: too short to measure\n",sw,sh,dw,dh);
    }

    if (fwrite(pOutBuffer, 1, outBytes, outputfp) != outBytes)
    {
        fprintf(stderr, "fwrite failed for output file[%s]\n",argv[2]);
        goto cleanup;
    }

    rc = 0;

cleanup:
    free(pInBuffer);
    free(pOutBuffer);
    if (inputfp)
    {
        fclose(inputfp);
    }
    /* fclose flushes buffered output, so a failure here means data loss. */
    if (outputfp && fclose(outputfp) != 0)
    {
        fprintf(stderr, "fclose failed for output file[%s]\n",argv[2]);
        rc = -1;
    }
    return rc;
}

Furthermore, I would like to use ARM assembly language to optimize my program, perhaps NEON vectorized assembly for the Android project, or simply the intrinsics of the different CPUs instead. But translating C code to assembly code sometimes requires huge changes (including re-designing the algorithm). This depends on the features of the CPU.