[x265] [PATCH] asm: fixed invalid testbench input for addAvg primitive, fixed addition overflow for some block sizes
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue Feb 25 06:09:20 CET 2014
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1393304947 -19800
# Tue Feb 25 10:39:07 2014 +0530
# Node ID 90c71d1f0c17f25406b9e7ab74b8840b40624e4d
# Parent 18894c99e1a71dc79e0ae55d4d4b8ed5d0c59c69
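asm: fixed invalid testbench input for addAvg primitive, fixed addition overflow for some block sizes.
The 16-bit add of pw_16400 followed by "psraw 5" is replaced with "pmulhrsw [pw_1024]" followed by "paddw [pw_512]", which yields the same rounded result without overflowing a 16-bit lane; the testbench now feeds addAvg from a dedicated buffer (short_test_buff2) limited to the range [-16384, 16383].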
diff -r 18894c99e1a7 -r 90c71d1f0c17 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Mon Feb 24 19:19:27 2014 -0600
+++ b/source/common/x86/const-a.asm Tue Feb 25 10:39:07 2014 +0530
@@ -39,7 +39,6 @@
const pw_1023, times 8 dw 1023
const pw_1024, times 16 dw 1024
const pw_4096, times 16 dw 4096
-const pw_16400, times 8 dw 16400
const pw_00ff, times 16 dw 0x00ff
const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
diff -r 18894c99e1a7 -r 90c71d1f0c17 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Mon Feb 24 19:19:27 2014 -0600
+++ b/source/common/x86/mc-a.asm Tue Feb 25 10:39:07 2014 +0530
@@ -54,7 +54,6 @@
cextern pw_512
cextern pw_1023
cextern pw_1024
-cextern pw_16400
cextern pw_00ff
cextern pw_pixel_max
cextern sw_64
@@ -67,12 +66,9 @@
; r0 = pSrc0, r1 = pSrc1
; r2 = pDst, r3 = iStride0
; r4 = iStride1, r5 = iDstStride
-
%if HIGH_BIT_DEPTH
INIT_XMM sse4
-cglobal addAvg_2x4, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m7, [pw_16400]
- mova m0, [pw_1023]
+cglobal addAvg_2x4, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
add r3, r3
add r4, r4
add r5, r5
@@ -91,20 +87,18 @@
movd m2, [r0]
movd m4, [r0 + r3]
movd m5, [r1]
- movd m6, [r1 + r4]
-
+ movd m0, [r1 + r4]
punpckldq m2, m4
- punpckldq m5, m6
+ punpckldq m5, m0
punpcklqdq m1, m2
punpcklqdq m3, m5
-
paddw m1, m3
- paddw m1, m7
- psraw m1, 5
- pxor m6, m6
- pmaxsw m1, m6
- pminsw m1, m0
-
+ pmulhrsw m1, [pw_1024]
+ paddw m1, [pw_512]
+
+ pxor m0, m0
+ pmaxsw m1, m0
+ pminsw m1, [pw_1023]
movd [r2], m1
pextrd [r2 + r5], m1, 1
lea r2, [r2 + 2 * r5]
@@ -112,14 +106,11 @@
pextrd [r2 + r5], m1, 3
RET
-
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal addAvg_2x8, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-
- mova m7, [pw_16400]
- mova m0, [pw_1023]
-
+cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m0, [pw_512]
+ pxor m7, m7
add r3, r3
add r4, r4
add r5, r5
@@ -145,14 +136,12 @@
punpckldq m5, m6
punpcklqdq m1, m2
punpcklqdq m3, m5
-
paddw m1, m3
- paddw m1, m7
- psraw m1, 5
- pxor m6, m6
- pmaxsw m1, m6
- pminsw m1, m0
-
+ pmulhrsw m1, [pw_1024]
+ paddw m1, m0
+
+ pmaxsw m1, m7
+ pminsw m1, [pw_1023]
movd [r2], m1
pextrd [r2 + r5], m1, 1
lea r2, [r2 + 2 * r5]
@@ -169,10 +158,6 @@
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal addAvg_4x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-
- mova m4, [pw_16400]
- mova m5, [pw_1023]
- pxor m6, m6
add r3, r3
add r4, r4
add r5, r5
@@ -184,22 +169,22 @@
punpcklqdq m0, m1
punpcklqdq m2, m3
-
paddw m0, m2
- paddw m0, m4
- psraw m0, 5
- pmaxsw m1, m6
- pminsw m1, m5
-
+ pmulhrsw m0, [pw_1024]
+ paddw m0, [pw_512]
+
+ pxor m6, m6
+ pmaxsw m0, m6
+ pminsw m0, [pw_1023]
movh [r2], m0
movhps [r2 + r5], m0
RET
-
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal addAvg_6x8, 6,7,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_16400]
+cglobal addAvg_6x8, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
mova m5, [pw_1023]
+ mova m7, [pw_1024]
pxor m6, m6
add r3, r3
add r4, r4
@@ -209,8 +194,9 @@
movu m0, [r0]
movu m2, [r1]
paddw m0, m2
+ pmulhrsw m0, m7
paddw m0, m4
- psraw m0, 5
+
pmaxsw m0, m6
pminsw m0, m5
movh [r2], m0
@@ -219,8 +205,9 @@
movu m1, [r0 + r3]
movu m3, [r1 + r4]
paddw m1, m3
+ pmulhrsw m1, m7
paddw m1, m4
- psraw m1, 5
+
pmaxsw m1, m6
pminsw m1, m5
movh [r2 + r5], m1
@@ -231,12 +218,12 @@
lea r1, [r1 + 2 * r4]
%endrep
RET
-
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal addAvg_8x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_16400]
+cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
mova m5, [pw_1023]
+ mova m7, [pw_1024]
pxor m6, m6
add r3, r3
add r4, r4
@@ -245,8 +232,9 @@
movu m0, [r0]
movu m2, [r1]
paddw m0, m2
+ pmulhrsw m0, m7
paddw m0, m4
- psraw m0, 5
+
pmaxsw m0, m6
pminsw m0, m5
movu [r2], m0
@@ -254,18 +242,19 @@
movu m1, [r0 + r3]
movu m3, [r1 + r4]
paddw m1, m3
+ pmulhrsw m1, m7
paddw m1, m4
- psraw m1, 5
+
pmaxsw m1, m6
pminsw m1, m5
movu [r2 + r5], m1
RET
-
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal addAvg_8x6, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_16400]
+cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
mova m5, [pw_1023]
+ mova m7, [pw_1024]
pxor m6, m6
add r3, r3
add r4, r4
@@ -275,8 +264,9 @@
movu m0, [r0]
movu m2, [r1]
paddw m0, m2
+ pmulhrsw m0, m7
paddw m0, m4
- psraw m0, 5
+
pmaxsw m0, m6
pminsw m0, m5
movu [r2], m0
@@ -284,8 +274,9 @@
movu m1, [r0 + r3]
movu m3, [r1 + r4]
paddw m1, m3
+ pmulhrsw m1, m7
paddw m1, m4
- psraw m1, 5
+
pmaxsw m1, m6
pminsw m1, m5
movu [r2 + r5], m1
diff -r 18894c99e1a7 -r 90c71d1f0c17 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Feb 24 19:19:27 2014 -0600
+++ b/source/test/pixelharness.cpp Tue Feb 25 10:39:07 2014 +0530
@@ -60,10 +60,10 @@
pixel_test_buff = (pixel**)X265_MALLOC(pixel*, TEST_CASES);
short_test_buff = (int16_t**)X265_MALLOC(int16_t*, TEST_CASES);
short_test_buff1 = (int16_t**)X265_MALLOC(int16_t*, TEST_CASES);
+ short_test_buff2 = (int16_t**)X265_MALLOC(int16_t*, TEST_CASES);
int_test_buff = (int**)X265_MALLOC(int*, TEST_CASES);
-
if (!pbuf1 || !pbuf2 || !pbuf3 || !pbuf4 || !sbuf1 || !sbuf2 || !sbuf3 || !ibuf1 ||
- !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1)
+ !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1 || !short_test_buff2)
{
fprintf(stderr, "malloc failed, unable to initiate tests!\n");
exit(1);
@@ -74,6 +74,7 @@
pixel_test_buff[i] = (pixel*)X265_MALLOC(pixel, BUFFSIZE);
short_test_buff[i] = (int16_t*)X265_MALLOC(int16_t, BUFFSIZE);
short_test_buff1[i] = (int16_t*)X265_MALLOC(int16_t, BUFFSIZE);
+ short_test_buff2[i] = (int16_t*)X265_MALLOC(int16_t, BUFFSIZE); // dedicated addAvg input buffer
int_test_buff[i] = (int*)X265_MALLOC(int, BUFFSIZE);
- if (!pixel_test_buff[i] || !short_test_buff[i] || !int_test_buff[i] || !short_test_buff1[i])
+ if (!pixel_test_buff[i] || !short_test_buff[i] || !int_test_buff[i] || !short_test_buff1[i] || !short_test_buff2[i])
{
@@ -88,21 +89,21 @@
for (int i = 0; i < BUFFSIZE; i++)
{
pixel_test_buff[0][i] = rand() % PIXEL_MAX;
- short_test_buff[0][i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
- short_test_buff1[0][i] = rand() & PIXEL_MAX; //For block copy only
+ short_test_buff[0][i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; // max(SHORT_MIN, min(rand(), SMAX));
+ short_test_buff1[0][i] = rand() & PIXEL_MAX; // For block copy only
+ short_test_buff2[0][i] = rand() % 16383; // for addAvg
int_test_buff[0][i] = rand() % SHORT_MAX;
-
pixel_test_buff[1][i] = PIXEL_MIN;
short_test_buff[1][i] = SMIN;
short_test_buff1[1][i] = PIXEL_MIN;
+ short_test_buff2[1][i] = -16384;
int_test_buff[1][i] = SHORT_MIN;
-
pixel_test_buff[2][i] = PIXEL_MAX;
short_test_buff[2][i] = SMAX;
short_test_buff1[2][i] = PIXEL_MAX;
+ short_test_buff2[2][i] = 16383;
int_test_buff[2][i] = SHORT_MAX;
}
-
for (int i = 0; i < bufsize; i++)
{
pbuf1[i] = rand() & PIXEL_MAX;
@@ -855,9 +856,8 @@
{
int index1 = rand() % TEST_CASES;
int index2 = rand() % TEST_CASES;
- ref(short_test_buff[index1] + j, short_test_buff[index2] + j, ref_dest, STRIDE, STRIDE, STRIDE);
- opt(short_test_buff[index1] + j, short_test_buff[index2] + j, opt_dest, STRIDE, STRIDE, STRIDE);
-
+ ref(short_test_buff2[index1] + j, short_test_buff2[index2] + j, ref_dest, STRIDE, STRIDE, STRIDE);
+ opt(short_test_buff2[index1] + j, short_test_buff2[index2] + j, opt_dest, STRIDE, STRIDE, STRIDE);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
{
return false;
diff -r 18894c99e1a7 -r 90c71d1f0c17 source/test/pixelharness.h
--- a/source/test/pixelharness.h Mon Feb 24 19:19:27 2014 -0600
+++ b/source/test/pixelharness.h Tue Feb 25 10:39:07 2014 +0530
@@ -32,11 +32,8 @@
protected:
pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4, **pixel_test_buff;
-
int *ibuf1, **int_test_buff;
-
- int16_t *sbuf1, *sbuf2, *sbuf3, **short_test_buff, **short_test_buff1;
-
+ int16_t *sbuf1, *sbuf2, *sbuf3, **short_test_buff, **short_test_buff1, **short_test_buff2;
bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
bool check_pixelcmp_sp(pixelcmp_sp_t ref, pixelcmp_sp_t opt);
bool check_pixelcmp_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
More information about the x265-devel mailing list