[x265] [PATCH] asm: fixed invalid testbench input for addAvg primitive, fixed addition overflow for some block sizes

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Feb 25 06:09:20 CET 2014


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1393304947 -19800
#      Tue Feb 25 10:39:07 2014 +0530
# Node ID 90c71d1f0c17f25406b9e7ab74b8840b40624e4d
# Parent  18894c99e1a71dc79e0ae55d4d4b8ed5d0c59c69
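asm: fixed invalid testbench input for addAvg primitive, fixed addition overflow for some block sizes.

The HIGH_BIT_DEPTH addAvg_2x4, 2x8, 4x2, 6x8, 8x2 and 8x6 kernels no longer
add pw_16400 to the 16-bit sum before psraw 5; they now apply pmulhrsw with
pw_1024 followed by paddw with pw_512. The pw_16400 constant is removed from
const-a.asm and from the cextern list in mc-a.asm.
The pixel test harness gains a dedicated short_test_buff2 input buffer for
the addAvg check, filled with values between -16384 and 16383.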

diff -r 18894c99e1a7 -r 90c71d1f0c17 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Mon Feb 24 19:19:27 2014 -0600
+++ b/source/common/x86/const-a.asm	Tue Feb 25 10:39:07 2014 +0530
@@ -39,7 +39,6 @@
 const pw_1023,     times 8  dw 1023
 const pw_1024,     times 16 dw 1024
 const pw_4096,     times 16 dw 4096
-const pw_16400,    times 8  dw 16400
 const pw_00ff,     times 16 dw 0x00ff
 const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
diff -r 18894c99e1a7 -r 90c71d1f0c17 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Mon Feb 24 19:19:27 2014 -0600
+++ b/source/common/x86/mc-a.asm	Tue Feb 25 10:39:07 2014 +0530
@@ -54,7 +54,6 @@
 cextern pw_512
 cextern pw_1023
 cextern pw_1024
-cextern pw_16400
 cextern pw_00ff
 cextern pw_pixel_max
 cextern sw_64
@@ -67,12 +66,9 @@
 ; r0 = pSrc0,    r1 = pSrc1
 ; r2 = pDst,     r3 = iStride0
 ; r4 = iStride1, r5 = iDstStride
-
 %if HIGH_BIT_DEPTH
 INIT_XMM sse4
-cglobal addAvg_2x4, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova          m7,          [pw_16400]
-    mova          m0,          [pw_1023]
+cglobal addAvg_2x4, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
     add           r3,          r3
     add           r4,          r4
     add           r5,          r5
@@ -91,20 +87,18 @@
     movd          m2,          [r0]
     movd          m4,          [r0 + r3]
     movd          m5,          [r1]
-    movd          m6,          [r1 + r4]
-
+    movd          m0,          [r1 + r4]
     punpckldq     m2,          m4
-    punpckldq     m5,          m6
+    punpckldq     m5,          m0
     punpcklqdq    m1,          m2
     punpcklqdq    m3,          m5
-
     paddw         m1,          m3
-    paddw         m1,          m7
-    psraw         m1,          5
-    pxor          m6,          m6
-    pmaxsw        m1,          m6
-    pminsw        m1,          m0
-
+    pmulhrsw      m1,          [pw_1024]
+    paddw         m1,          [pw_512]
+
+    pxor          m0,          m0
+    pmaxsw        m1,          m0
+    pminsw        m1,          [pw_1023]
     movd          [r2],        m1
     pextrd        [r2 + r5],   m1, 1
     lea           r2,          [r2 + 2 * r5]
@@ -112,14 +106,11 @@
     pextrd        [r2 + r5],   m1, 3
 
     RET
-
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal addAvg_2x8, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-
-    mova          m7,          [pw_16400]
-    mova          m0,          [pw_1023]
-
+cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova          m0,          [pw_512]
+    pxor          m7,          m7
     add           r3,          r3
     add           r4,          r4
     add           r5,          r5
@@ -145,14 +136,12 @@
     punpckldq     m5,          m6
     punpcklqdq    m1,          m2
     punpcklqdq    m3,          m5
-
     paddw         m1,          m3
-    paddw         m1,          m7
-    psraw         m1,          5
-    pxor          m6,          m6
-    pmaxsw        m1,          m6
-    pminsw        m1,          m0
-
+    pmulhrsw      m1,          [pw_1024]
+    paddw         m1,          m0
+
+    pmaxsw        m1,          m7
+    pminsw        m1,          [pw_1023]
     movd          [r2],        m1
     pextrd        [r2 + r5],   m1, 1
     lea           r2,          [r2 + 2 * r5]
@@ -169,10 +158,6 @@
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal addAvg_4x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-
-    mova           m4,          [pw_16400]
-    mova           m5,          [pw_1023]
-    pxor           m6,          m6
     add            r3,          r3
     add            r4,          r4
     add            r5,          r5
@@ -184,22 +169,22 @@
 
     punpcklqdq     m0,          m1
     punpcklqdq     m2,          m3
-
     paddw          m0,          m2
-    paddw          m0,          m4
-    psraw          m0,          5
-    pmaxsw         m1,          m6
-    pminsw         m1,          m5
-
+    pmulhrsw       m0,          [pw_1024]
+    paddw          m0,          [pw_512]
+
+    pxor           m6,          m6
+    pmaxsw         m0,          m6
+    pminsw         m0,          [pw_1023]
     movh           [r2],        m0
     movhps         [r2 + r5],   m0
     RET
-
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal addAvg_6x8, 6,7,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,             [pw_16400]
+cglobal addAvg_6x8, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,             [pw_512]
     mova        m5,             [pw_1023]
+    mova        m7,             [pw_1024]
     pxor        m6,             m6
     add         r3,             r3
     add         r4,             r4
@@ -209,8 +194,9 @@
     movu        m0,             [r0]
     movu        m2,             [r1]
     paddw       m0,             m2
+    pmulhrsw    m0,             m7
     paddw       m0,             m4
-    psraw       m0,             5
+
     pmaxsw      m0,             m6
     pminsw      m0,             m5
     movh        [r2],           m0
@@ -219,8 +205,9 @@
     movu        m1,             [r0 + r3]
     movu        m3,             [r1 + r4]
     paddw       m1,             m3
+    pmulhrsw    m1,             m7
     paddw       m1,             m4
-    psraw       m1,             5
+
     pmaxsw      m1,             m6
     pminsw      m1,             m5
     movh        [r2 + r5],      m1
@@ -231,12 +218,12 @@
     lea         r1,             [r1 + 2 * r4]
 %endrep
     RET
-
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal addAvg_8x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,          [pw_16400]
+cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,          [pw_512]
     mova        m5,          [pw_1023]
+    mova        m7,          [pw_1024]
     pxor        m6,          m6
     add         r3,          r3
     add         r4,          r4
@@ -245,8 +232,9 @@
     movu        m0,          [r0]
     movu        m2,          [r1]
     paddw       m0,          m2
+    pmulhrsw    m0,          m7
     paddw       m0,          m4
-    psraw       m0,          5
+
     pmaxsw      m0,          m6
     pminsw      m0,          m5
     movu        [r2],        m0
@@ -254,18 +242,19 @@
     movu        m1,          [r0 + r3]
     movu        m3,          [r1 + r4]
     paddw       m1,          m3
+    pmulhrsw    m1,          m7
     paddw       m1,          m4
-    psraw       m1,          5
+
     pmaxsw      m1,          m6
     pminsw      m1,          m5
     movu        [r2 + r5],   m1
     RET
-
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal addAvg_8x6, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-    mova        m4,          [pw_16400]
+cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+    mova        m4,          [pw_512]
     mova        m5,          [pw_1023]
+    mova        m7,          [pw_1024]
     pxor        m6,          m6
     add         r3,          r3
     add         r4,          r4
@@ -275,8 +264,9 @@
     movu        m0,          [r0]
     movu        m2,          [r1]
     paddw       m0,          m2
+    pmulhrsw    m0,          m7
     paddw       m0,          m4
-    psraw       m0,          5
+
     pmaxsw      m0,          m6
     pminsw      m0,          m5
     movu        [r2],        m0
@@ -284,8 +274,9 @@
     movu        m1,          [r0 + r3]
     movu        m3,          [r1 + r4]
     paddw       m1,          m3
+    pmulhrsw    m1,          m7
     paddw       m1,          m4
-    psraw       m1,          5
+
     pmaxsw      m1,          m6
     pminsw      m1,          m5
     movu        [r2 + r5],   m1
diff -r 18894c99e1a7 -r 90c71d1f0c17 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Feb 24 19:19:27 2014 -0600
+++ b/source/test/pixelharness.cpp	Tue Feb 25 10:39:07 2014 +0530
@@ -60,10 +60,10 @@
     pixel_test_buff  = (pixel**)X265_MALLOC(pixel*, TEST_CASES);
     short_test_buff  = (int16_t**)X265_MALLOC(int16_t*, TEST_CASES);
     short_test_buff1 = (int16_t**)X265_MALLOC(int16_t*, TEST_CASES);
+    short_test_buff2 = (int16_t**)X265_MALLOC(int16_t*, TEST_CASES);
     int_test_buff    = (int**)X265_MALLOC(int*, TEST_CASES);
-
     if (!pbuf1 || !pbuf2 || !pbuf3 || !pbuf4 || !sbuf1 || !sbuf2 || !sbuf3 || !ibuf1 ||
-        !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1)
+        !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1 || !short_test_buff2)
     {
         fprintf(stderr, "malloc failed, unable to initiate tests!\n");
         exit(1);
@@ -74,6 +74,7 @@
         pixel_test_buff[i]  = (pixel*)X265_MALLOC(pixel, BUFFSIZE);
         short_test_buff[i]  = (int16_t*)X265_MALLOC(int16_t, BUFFSIZE);
         short_test_buff1[i] = (int16_t*)X265_MALLOC(int16_t, BUFFSIZE);
+        short_test_buff2[i] = (int16_t*)X265_MALLOC(int16_t, BUFFSIZE);
         int_test_buff[i]    = (int*)X265_MALLOC(int, BUFFSIZE);
         if (!pixel_test_buff[i] || !short_test_buff[i] || !int_test_buff[i] || !short_test_buff1[i])
         {
@@ -88,21 +89,21 @@
     for (int i = 0; i < BUFFSIZE; i++)
     {
         pixel_test_buff[0][i]   = rand() % PIXEL_MAX;
-        short_test_buff[0][i]   = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
-        short_test_buff1[0][i]  = rand() & PIXEL_MAX;                  //For block copy only
+        short_test_buff[0][i]   = (rand() % (2 * SMAX + 1)) - SMAX - 1; // max(SHORT_MIN, min(rand(), SMAX));
+        short_test_buff1[0][i]  = rand() & PIXEL_MAX;                   // For block copy only
+        short_test_buff2[0][i]  = rand() % 16383;                       // for addAvg
         int_test_buff[0][i]     = rand() % SHORT_MAX;
-
         pixel_test_buff[1][i]   = PIXEL_MIN;
         short_test_buff[1][i]   = SMIN;
         short_test_buff1[1][i]  = PIXEL_MIN;
+        short_test_buff2[1][i]  = -16384;
         int_test_buff[1][i]     = SHORT_MIN;
-
         pixel_test_buff[2][i]   = PIXEL_MAX;
         short_test_buff[2][i]   = SMAX;
         short_test_buff1[2][i]  = PIXEL_MAX;
+        short_test_buff2[2][i]  = 16383;
         int_test_buff[2][i]     = SHORT_MAX;
     }
-
     for (int i = 0; i < bufsize; i++)
     {
         pbuf1[i] = rand() & PIXEL_MAX;
@@ -855,9 +856,8 @@
     {
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
-        ref(short_test_buff[index1] + j, short_test_buff[index2] + j, ref_dest, STRIDE, STRIDE, STRIDE);
-        opt(short_test_buff[index1] + j, short_test_buff[index2] + j, opt_dest, STRIDE, STRIDE, STRIDE);
-
+        ref(short_test_buff2[index1] + j, short_test_buff2[index2] + j, ref_dest, STRIDE, STRIDE, STRIDE);
+        opt(short_test_buff2[index1] + j, short_test_buff2[index2] + j, opt_dest, STRIDE, STRIDE, STRIDE);
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
         {
             return false;
diff -r 18894c99e1a7 -r 90c71d1f0c17 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Mon Feb 24 19:19:27 2014 -0600
+++ b/source/test/pixelharness.h	Tue Feb 25 10:39:07 2014 +0530
@@ -32,11 +32,8 @@
 protected:
 
     pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4, **pixel_test_buff;
-
     int *ibuf1, **int_test_buff;
-
-    int16_t *sbuf1, *sbuf2, *sbuf3, **short_test_buff, **short_test_buff1;
-
+    int16_t *sbuf1, *sbuf2, *sbuf3, **short_test_buff, **short_test_buff1, **short_test_buff2;
     bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
     bool check_pixelcmp_sp(pixelcmp_sp_t ref, pixelcmp_sp_t opt);
     bool check_pixelcmp_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);


More information about the x265-devel mailing list