[x265] [PATCH] asm: correct improper stress test cases, modify algorithm of addAvg 16bpp

Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
Mon Feb 24 12:44:34 CET 2014


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1393241924 -19800
#      Mon Feb 24 17:08:44 2014 +0530
# Node ID f2872795807a3150d30397a3cab417ef9140a7fa
# Parent  57ce7f0f4f4cbb9acd401751e8bef7b522774e38
asm: correct improper stress test cases, modify algorithm of addAvg 16bpp

diff -r 57ce7f0f4f4c -r f2872795807a source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Thu Feb 20 16:01:28 2014 -0800
+++ b/source/common/x86/const-a.asm	Mon Feb 24 17:08:44 2014 +0530
@@ -39,7 +39,6 @@
 const pw_1023,     times 8  dw 1023
 const pw_1024,     times 16 dw 1024
 const pw_4096,     times 16 dw 4096
-const pw_16400,    times 8  dw 16400
 const pw_00ff,     times 16 dw 0x00ff
 const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
diff -r 57ce7f0f4f4c -r f2872795807a source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Thu Feb 20 16:01:28 2014 -0800
+++ b/source/common/x86/mc-a.asm	Mon Feb 24 17:08:44 2014 +0530
@@ -54,7 +54,6 @@
 cextern pw_512
 cextern pw_1023
 cextern pw_1024
-cextern pw_16400
 cextern pw_00ff
 cextern pw_pixel_max
 cextern sw_64
@@ -70,9 +69,7 @@
 
 %if HIGH_BIT_DEPTH
 INIT_XMM sse4
-cglobal addAvg_2x4, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova          m7,          [pw_16400]
-    mova          m0,          [pw_1023]
+cglobal addAvg_2x4, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
     add           r3,          r3
     add           r4,          r4
     add           r5,          r5
@@ -91,19 +88,20 @@
     movd          m2,          [r0]
     movd          m4,          [r0 + r3]
     movd          m5,          [r1]
-    movd          m6,          [r1 + r4]
+    movd          m0,          [r1 + r4]
 
     punpckldq     m2,          m4
-    punpckldq     m5,          m6
+    punpckldq     m5,          m0
     punpcklqdq    m1,          m2
     punpcklqdq    m3,          m5
 
     paddw         m1,          m3
-    paddw         m1,          m7
-    psraw         m1,          5
-    pxor          m6,          m6
-    pmaxsw        m1,          m6
-    pminsw        m1,          m0
+    pmulhrsw      m1,          [pw_1024]
+    paddw         m1,          [pw_512]
+
+    pxor          m0,          m0
+    pmaxsw        m1,          m0
+    pminsw        m1,          [pw_1023]
 
     movd          [r2],        m1
     pextrd        [r2 + r5],   m1, 1
@@ -115,11 +113,9 @@
 
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal addAvg_2x8, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-
-    mova          m7,          [pw_16400]
-    mova          m0,          [pw_1023]
-
+cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova          m0,          [pw_512]
+    pxor          m7,          m7
     add           r3,          r3
     add           r4,          r4
     add           r5,          r5
@@ -147,11 +143,11 @@
     punpcklqdq    m3,          m5
 
     paddw         m1,          m3
-    paddw         m1,          m7
-    psraw         m1,          5
-    pxor          m6,          m6
-    pmaxsw        m1,          m6
-    pminsw        m1,          m0
+    pmulhrsw      m1,          [pw_1024]
+    paddw         m1,          m0
+
+    pmaxsw        m1,          m7
+    pminsw        m1,          [pw_1023]
 
     movd          [r2],        m1
     pextrd        [r2 + r5],   m1, 1
@@ -169,10 +165,6 @@
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_4x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-
-    mova           m4,          [pw_16400]
-    mova           m5,          [pw_1023]
-    pxor           m6,          m6
     add            r3,          r3
     add            r4,          r4
     add            r5,          r5
@@ -186,10 +178,12 @@
     punpcklqdq     m2,          m3
 
     paddw          m0,          m2
-    paddw          m0,          m4
-    psraw          m0,          5
-    pmaxsw         m1,          m6
-    pminsw         m1,          m5
+    pmulhrsw       m0,          [pw_1024]
+    paddw          m0,          [pw_512]
+
+    pxor           m6,          m6
+    pmaxsw         m0,          m6
+    pminsw         m0,          [pw_1023]
 
     movh           [r2],        m0
     movhps         [r2 + r5],   m0
@@ -197,9 +191,10 @@
 
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal addAvg_6x8, 6,7,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,             [pw_16400]
+cglobal addAvg_6x8, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,             [pw_512]
     mova        m5,             [pw_1023]
+    mova        m7,             [pw_1024]
     pxor        m6,             m6
     add         r3,             r3
     add         r4,             r4
@@ -209,8 +204,9 @@
     movu        m0,             [r0]
     movu        m2,             [r1]
     paddw       m0,             m2
+    pmulhrsw    m0,             m7
     paddw       m0,             m4
-    psraw       m0,             5
+
     pmaxsw      m0,             m6
     pminsw      m0,             m5
     movh        [r2],           m0
@@ -219,8 +215,9 @@
     movu        m1,             [r0 + r3]
     movu        m3,             [r1 + r4]
     paddw       m1,             m3
+    pmulhrsw    m1,             m7
     paddw       m1,             m4
-    psraw       m1,             5
+
     pmaxsw      m1,             m6
     pminsw      m1,             m5
     movh        [r2 + r5],      m1
@@ -234,9 +231,10 @@
 
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal addAvg_8x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,          [pw_16400]
+cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,          [pw_512]
     mova        m5,          [pw_1023]
+    mova        m7,          [pw_1024]
     pxor        m6,          m6
     add         r3,          r3
     add         r4,          r4
@@ -245,8 +243,9 @@
     movu        m0,          [r0]
     movu        m2,          [r1]
     paddw       m0,          m2
+    pmulhrsw    m0,          m7
     paddw       m0,          m4
-    psraw       m0,          5
+
     pmaxsw      m0,          m6
     pminsw      m0,          m5
     movu        [r2],        m0
@@ -254,8 +253,9 @@
     movu        m1,          [r0 + r3]
     movu        m3,          [r1 + r4]
     paddw       m1,          m3
+    pmulhrsw    m1,          m7
     paddw       m1,          m4
-    psraw       m1,          5
+
     pmaxsw      m1,          m6
     pminsw      m1,          m5
     movu        [r2 + r5],   m1
@@ -263,9 +263,10 @@
 
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal addAvg_8x6, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,          [pw_16400]
+cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,          [pw_512]
     mova        m5,          [pw_1023]
+    mova        m7,          [pw_1024]
     pxor        m6,          m6
     add         r3,          r3
     add         r4,          r4
@@ -275,8 +276,9 @@
     movu        m0,          [r0]
     movu        m2,          [r1]
     paddw       m0,          m2
+    pmulhrsw    m0,          m7
     paddw       m0,          m4
-    psraw       m0,          5
+
     pmaxsw      m0,          m6
     pminsw      m0,          m5
     movu        [r2],        m0
@@ -284,8 +286,9 @@
     movu        m1,          [r0 + r3]
     movu        m3,          [r1 + r4]
     paddw       m1,          m3
+    pmulhrsw    m1,          m7
     paddw       m1,          m4
-    psraw       m1,          5
+
     pmaxsw      m1,          m6
     pminsw      m1,          m5
     movu        [r2 + r5],   m1
diff -r 57ce7f0f4f4c -r f2872795807a source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Thu Feb 20 16:01:28 2014 -0800
+++ b/source/test/pixelharness.cpp	Mon Feb 24 17:08:44 2014 +0530
@@ -60,10 +60,11 @@
     pixel_test_buff  = (pixel**)X265_MALLOC(pixel*, TEST_CASES);
     short_test_buff  = (int16_t**)X265_MALLOC(int16_t*, TEST_CASES);
     short_test_buff1 = (int16_t**)X265_MALLOC(int16_t*, TEST_CASES);
+    short_test_buff2 = (int16_t**)X265_MALLOC(int16_t*, TEST_CASES);
     int_test_buff    = (int**)X265_MALLOC(int*, TEST_CASES);
 
     if (!pbuf1 || !pbuf2 || !pbuf3 || !pbuf4 || !sbuf1 || !sbuf2 || !sbuf3 || !ibuf1 ||
-        !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1)
+        !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1 || !short_test_buff2)
     {
         fprintf(stderr, "malloc failed, unable to initiate tests!\n");
         exit(1);
@@ -74,6 +75,7 @@
         pixel_test_buff[i]  = (pixel*)X265_MALLOC(pixel, BUFFSIZE);
         short_test_buff[i]  = (int16_t*)X265_MALLOC(int16_t, BUFFSIZE);
         short_test_buff1[i] = (int16_t*)X265_MALLOC(int16_t, BUFFSIZE);
+        short_test_buff2[i] = (int16_t*)X265_MALLOC(int16_t, BUFFSIZE);
         int_test_buff[i]    = (int*)X265_MALLOC(int, BUFFSIZE);
         if (!pixel_test_buff[i] || !short_test_buff[i] || !int_test_buff[i] || !short_test_buff1[i])
         {
@@ -88,18 +90,21 @@
     for (int i = 0; i < BUFFSIZE; i++)
     {
         pixel_test_buff[0][i]   = rand() % PIXEL_MAX;
-        short_test_buff[0][i]   = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
-        short_test_buff1[0][i]  = rand() & PIXEL_MAX;                  //For block copy only
+        short_test_buff[0][i]   = (rand() % (2 * SMAX + 1)) - SMAX - 1; // max(SHORT_MIN, min(rand(), SMAX));
+        short_test_buff1[0][i]  = rand() & PIXEL_MAX;                   // For block copy only
+        short_test_buff2[0][i]  = rand() % 16383;                       // for addAvg
         int_test_buff[0][i]     = rand() % SHORT_MAX;
 
         pixel_test_buff[1][i]   = PIXEL_MIN;
         short_test_buff[1][i]   = SMIN;
         short_test_buff1[1][i]  = PIXEL_MIN;
+        short_test_buff2[1][i]  = -16384;
         int_test_buff[1][i]     = SHORT_MIN;
 
         pixel_test_buff[2][i]   = PIXEL_MAX;
         short_test_buff[2][i]   = SMAX;
         short_test_buff1[2][i]  = PIXEL_MAX;
+        short_test_buff2[2][i]  = 16383;
         int_test_buff[2][i]     = SHORT_MAX;
     }
 
@@ -897,8 +902,8 @@
     {
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
-        ref(short_test_buff[index1] + j, short_test_buff[index2] + j, ref_dest, STRIDE, STRIDE, STRIDE);
-        opt(short_test_buff[index1] + j, short_test_buff[index2] + j, opt_dest, STRIDE, STRIDE, STRIDE);
+        ref(short_test_buff2[index1] + j, short_test_buff2[index2] + j, ref_dest, STRIDE, STRIDE, STRIDE);
+        opt(short_test_buff2[index1] + j, short_test_buff2[index2] + j, opt_dest, STRIDE, STRIDE, STRIDE);
 
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
         {
diff -r 57ce7f0f4f4c -r f2872795807a source/test/pixelharness.h
--- a/source/test/pixelharness.h	Thu Feb 20 16:01:28 2014 -0800
+++ b/source/test/pixelharness.h	Mon Feb 24 17:08:44 2014 +0530
@@ -35,7 +35,7 @@
 
     int *ibuf1, **int_test_buff;
 
-    int16_t *sbuf1, *sbuf2, *sbuf3, **short_test_buff, **short_test_buff1;
+    int16_t *sbuf1, *sbuf2, *sbuf3, **short_test_buff, **short_test_buff1, **short_test_buff2;
 
     bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
     bool check_pixelcmp_sp(pixelcmp_sp_t ref, pixelcmp_sp_t opt);


More information about the x265-devel mailing list