[x265] [PATCH] asm: correct improper stress test cases, modify algorithm of addAvg 16bpp
Dnyaneshwar G
dnyaneshwar at multicorewareinc.com
Mon Feb 24 12:44:34 CET 2014
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1393241924 -19800
# Mon Feb 24 17:08:44 2014 +0530
# Node ID f2872795807a3150d30397a3cab417ef9140a7fa
# Parent 57ce7f0f4f4cbb9acd401751e8bef7b522774e38
asm: correct improper stress test cases, modify algorithm of addAvg 16bpp
diff -r 57ce7f0f4f4c -r f2872795807a source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Thu Feb 20 16:01:28 2014 -0800
+++ b/source/common/x86/const-a.asm Mon Feb 24 17:08:44 2014 +0530
@@ -39,7 +39,6 @@
const pw_1023, times 8 dw 1023
const pw_1024, times 16 dw 1024
const pw_4096, times 16 dw 4096
-const pw_16400, times 8 dw 16400
const pw_00ff, times 16 dw 0x00ff
const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
diff -r 57ce7f0f4f4c -r f2872795807a source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Thu Feb 20 16:01:28 2014 -0800
+++ b/source/common/x86/mc-a.asm Mon Feb 24 17:08:44 2014 +0530
@@ -54,7 +54,6 @@
cextern pw_512
cextern pw_1023
cextern pw_1024
-cextern pw_16400
cextern pw_00ff
cextern pw_pixel_max
cextern sw_64
@@ -70,9 +69,7 @@
%if HIGH_BIT_DEPTH
INIT_XMM sse4
-cglobal addAvg_2x4, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m7, [pw_16400]
- mova m0, [pw_1023]
+cglobal addAvg_2x4, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
add r3, r3
add r4, r4
add r5, r5
@@ -91,19 +88,20 @@
movd m2, [r0]
movd m4, [r0 + r3]
movd m5, [r1]
- movd m6, [r1 + r4]
+ movd m0, [r1 + r4]
punpckldq m2, m4
- punpckldq m5, m6
+ punpckldq m5, m0
punpcklqdq m1, m2
punpcklqdq m3, m5
paddw m1, m3
- paddw m1, m7
- psraw m1, 5
- pxor m6, m6
- pmaxsw m1, m6
- pminsw m1, m0
+ pmulhrsw m1, [pw_1024]
+ paddw m1, [pw_512]
+
+ pxor m0, m0
+ pmaxsw m1, m0
+ pminsw m1, [pw_1023]
movd [r2], m1
pextrd [r2 + r5], m1, 1
@@ -115,11 +113,9 @@
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal addAvg_2x8, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-
- mova m7, [pw_16400]
- mova m0, [pw_1023]
-
+cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m0, [pw_512]
+ pxor m7, m7
add r3, r3
add r4, r4
add r5, r5
@@ -147,11 +143,11 @@
punpcklqdq m3, m5
paddw m1, m3
- paddw m1, m7
- psraw m1, 5
- pxor m6, m6
- pmaxsw m1, m6
- pminsw m1, m0
+ pmulhrsw m1, [pw_1024]
+ paddw m1, m0
+
+ pmaxsw m1, m7
+ pminsw m1, [pw_1023]
movd [r2], m1
pextrd [r2 + r5], m1, 1
@@ -169,10 +165,6 @@
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal addAvg_4x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
-
- mova m4, [pw_16400]
- mova m5, [pw_1023]
- pxor m6, m6
add r3, r3
add r4, r4
add r5, r5
@@ -186,10 +178,12 @@
punpcklqdq m2, m3
paddw m0, m2
- paddw m0, m4
- psraw m0, 5
- pmaxsw m1, m6
- pminsw m1, m5
+ pmulhrsw m0, [pw_1024]
+ paddw m0, [pw_512]
+
+ pxor m6, m6
+ pmaxsw m0, m6
+ pminsw m0, [pw_1023]
movh [r2], m0
movhps [r2 + r5], m0
@@ -197,9 +191,10 @@
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal addAvg_6x8, 6,7,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_16400]
+cglobal addAvg_6x8, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
mova m5, [pw_1023]
+ mova m7, [pw_1024]
pxor m6, m6
add r3, r3
add r4, r4
@@ -209,8 +204,9 @@
movu m0, [r0]
movu m2, [r1]
paddw m0, m2
+ pmulhrsw m0, m7
paddw m0, m4
- psraw m0, 5
+
pmaxsw m0, m6
pminsw m0, m5
movh [r2], m0
@@ -219,8 +215,9 @@
movu m1, [r0 + r3]
movu m3, [r1 + r4]
paddw m1, m3
+ pmulhrsw m1, m7
paddw m1, m4
- psraw m1, 5
+
pmaxsw m1, m6
pminsw m1, m5
movh [r2 + r5], m1
@@ -234,9 +231,10 @@
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal addAvg_8x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_16400]
+cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
mova m5, [pw_1023]
+ mova m7, [pw_1024]
pxor m6, m6
add r3, r3
add r4, r4
@@ -245,8 +243,9 @@
movu m0, [r0]
movu m2, [r1]
paddw m0, m2
+ pmulhrsw m0, m7
paddw m0, m4
- psraw m0, 5
+
pmaxsw m0, m6
pminsw m0, m5
movu [r2], m0
@@ -254,8 +253,9 @@
movu m1, [r0 + r3]
movu m3, [r1 + r4]
paddw m1, m3
+ pmulhrsw m1, m7
paddw m1, m4
- psraw m1, 5
+
pmaxsw m1, m6
pminsw m1, m5
movu [r2 + r5], m1
@@ -263,9 +263,10 @@
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal addAvg_8x6, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
- mova m4, [pw_16400]
+cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+ mova m4, [pw_512]
mova m5, [pw_1023]
+ mova m7, [pw_1024]
pxor m6, m6
add r3, r3
add r4, r4
@@ -275,8 +276,9 @@
movu m0, [r0]
movu m2, [r1]
paddw m0, m2
+ pmulhrsw m0, m7
paddw m0, m4
- psraw m0, 5
+
pmaxsw m0, m6
pminsw m0, m5
movu [r2], m0
@@ -284,8 +286,9 @@
movu m1, [r0 + r3]
movu m3, [r1 + r4]
paddw m1, m3
+ pmulhrsw m1, m7
paddw m1, m4
- psraw m1, 5
+
pmaxsw m1, m6
pminsw m1, m5
movu [r2 + r5], m1
diff -r 57ce7f0f4f4c -r f2872795807a source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Thu Feb 20 16:01:28 2014 -0800
+++ b/source/test/pixelharness.cpp Mon Feb 24 17:08:44 2014 +0530
@@ -60,10 +60,11 @@
pixel_test_buff = (pixel**)X265_MALLOC(pixel*, TEST_CASES);
short_test_buff = (int16_t**)X265_MALLOC(int16_t*, TEST_CASES);
short_test_buff1 = (int16_t**)X265_MALLOC(int16_t*, TEST_CASES);
+ short_test_buff2 = (int16_t**)X265_MALLOC(int16_t*, TEST_CASES);
int_test_buff = (int**)X265_MALLOC(int*, TEST_CASES);
if (!pbuf1 || !pbuf2 || !pbuf3 || !pbuf4 || !sbuf1 || !sbuf2 || !sbuf3 || !ibuf1 ||
- !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1)
+ !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1 || !short_test_buff2)
{
fprintf(stderr, "malloc failed, unable to initiate tests!\n");
exit(1);
@@ -74,6 +75,7 @@
pixel_test_buff[i] = (pixel*)X265_MALLOC(pixel, BUFFSIZE);
short_test_buff[i] = (int16_t*)X265_MALLOC(int16_t, BUFFSIZE);
short_test_buff1[i] = (int16_t*)X265_MALLOC(int16_t, BUFFSIZE);
+ short_test_buff2[i] = (int16_t*)X265_MALLOC(int16_t, BUFFSIZE);
int_test_buff[i] = (int*)X265_MALLOC(int, BUFFSIZE);
if (!pixel_test_buff[i] || !short_test_buff[i] || !int_test_buff[i] || !short_test_buff1[i])
{
@@ -88,18 +90,21 @@
for (int i = 0; i < BUFFSIZE; i++)
{
pixel_test_buff[0][i] = rand() % PIXEL_MAX;
- short_test_buff[0][i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
- short_test_buff1[0][i] = rand() & PIXEL_MAX; //For block copy only
+ short_test_buff[0][i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; // max(SHORT_MIN, min(rand(), SMAX));
+ short_test_buff1[0][i] = rand() & PIXEL_MAX; // For block copy only
+ short_test_buff2[0][i] = rand() % 16383; // for addAvg
int_test_buff[0][i] = rand() % SHORT_MAX;
pixel_test_buff[1][i] = PIXEL_MIN;
short_test_buff[1][i] = SMIN;
short_test_buff1[1][i] = PIXEL_MIN;
+ short_test_buff2[1][i] = -16384;
int_test_buff[1][i] = SHORT_MIN;
pixel_test_buff[2][i] = PIXEL_MAX;
short_test_buff[2][i] = SMAX;
short_test_buff1[2][i] = PIXEL_MAX;
+ short_test_buff2[2][i] = 16383;
int_test_buff[2][i] = SHORT_MAX;
}
@@ -897,8 +902,8 @@
{
int index1 = rand() % TEST_CASES;
int index2 = rand() % TEST_CASES;
- ref(short_test_buff[index1] + j, short_test_buff[index2] + j, ref_dest, STRIDE, STRIDE, STRIDE);
- opt(short_test_buff[index1] + j, short_test_buff[index2] + j, opt_dest, STRIDE, STRIDE, STRIDE);
+ ref(short_test_buff2[index1] + j, short_test_buff2[index2] + j, ref_dest, STRIDE, STRIDE, STRIDE);
+ opt(short_test_buff2[index1] + j, short_test_buff2[index2] + j, opt_dest, STRIDE, STRIDE, STRIDE);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
{
diff -r 57ce7f0f4f4c -r f2872795807a source/test/pixelharness.h
--- a/source/test/pixelharness.h Thu Feb 20 16:01:28 2014 -0800
+++ b/source/test/pixelharness.h Mon Feb 24 17:08:44 2014 +0530
@@ -35,7 +35,7 @@
int *ibuf1, **int_test_buff;
- int16_t *sbuf1, *sbuf2, *sbuf3, **short_test_buff, **short_test_buff1;
+ int16_t *sbuf1, *sbuf2, *sbuf3, **short_test_buff, **short_test_buff1, **short_test_buff2;
bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
bool check_pixelcmp_sp(pixelcmp_sp_t ref, pixelcmp_sp_t opt);
More information about the x265-devel mailing list