[x265] [PATCH] cvt16to32_cnt optimization
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Sep 2 16:11:31 CEST 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1409571425 -19800
# Node ID 51b5a6d820da97a4178dc42d2ef98ffe1970511b
# Parent c09f34b0ab57b4ce2f5cf4aa59c25d20eb6cbd54
cvt16to32_cnt optimization
diff -r c09f34b0ab57 -r 51b5a6d820da source/common/dct.cpp
--- a/source/common/dct.cpp Mon Aug 25 16:54:19 2014 +0530
+++ b/source/common/dct.cpp Mon Sep 01 17:07:05 2014 +0530
@@ -834,7 +834,7 @@
{
for (int j = 0; j < trSize; j++)
{
- coeff[k * trSize + j] = ((int16_t)residual[k * stride + j]);
+ coeff[k * trSize + j] = residual[k * stride + j];
numSig += (residual[k * stride + j] != 0);
}
}
diff -r c09f34b0ab57 -r 51b5a6d820da source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Mon Aug 25 16:54:19 2014 +0530
+++ b/source/common/x86/blockcopy8.asm Mon Sep 01 17:07:05 2014 +0530
@@ -29,6 +29,10 @@
tab_Vm: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
+cextern pb_4
+cextern pb_1
+cextern pb_16
+cextern pb_64
cextern pw_4
cextern pb_8
cextern pb_32
@@ -3946,52 +3950,47 @@
;--------------------------------------------------------------------------------------
-; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; uint32_t cvt16to32_cnt(int16_t *dst, int16_t *src, intptr_t stride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal cvt16to32_cnt_4, 3,3,5
add r2d, r2d
pxor m4, m4
- ; row 0 & 1
- movh m0, [r1]
- movhps m0, [r1 + r2]
- mova m2, m0
- pmovsxwd m1, m0
- punpckhwd m0, m0
- psrad m0, 16
- movu [r0 + 0 * mmsize], m1
- movu [r0 + 1 * mmsize], m0
-
- ; row 2 & 3
- movh m0, [r1 + r2 * 2]
- lea r2, [r2 * 3]
- movhps m0, [r1 + r2]
- packsswb m2, m0
- pcmpeqb m2, m4
- pmovsxwd m1, m0
- punpckhwd m0, m0
- psrad m0, 16
- movu [r0 + 2 * mmsize], m1
- movu [r0 + 3 * mmsize], m0
-
- ; get count
- ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem
-%if 1
- pmovmskb eax, m2
- not ax
- popcnt ax, ax
+ ; row 0 & 1
+ movh m0, [r1]
+ movh m1, [r1 + r2]
+ movh [r0], m0
+ movh [r0 + 8], m1
+
+ mova m2, [r0]
+
+ ; row 2 & 3
+ movh m0, [r1 + r2 * 2]
+ lea r2, [r2 * 3]
+ movh m1, [r1 + r2]
+ movh [r0 + 16], m0
+ movh [r0 + 24], m1
+
+ mova m0, [r0 + 16]
+ packsswb m2, m0
+ pcmpeqb m2, m4
+
+ ; get count
+ ; CHECK_ME: Intel documents POPCNT as an SSE4.2 instruction, but it is only implemented on Nehalem and later
+%if 0
+ pmovmskb eax, m2
+ not ax
+ popcnt ax, ax
%else
- movhlps m3, m2
- paddw m2, m3
-
- mova m3, [pw_4]
- paddw m3, m2
- psadbw m3, m4
-
- movd eax, m3
-%endif
- RET
+ mova m0, [pb_1]
+ paddb m2, m0
+ psadbw m2, m4
+ pshufd m0, m2, 2
+ paddw m2, m0
+ movd eax, m2
+ %endif
+ RET
INIT_YMM avx2
@@ -4023,71 +4022,65 @@
;--------------------------------------------------------------------------------------
-; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; uint32_t cvt16to32_cnt(int16_t *dst, int16_t *src, intptr_t stride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal cvt16to32_cnt_8, 3,5,6
+cglobal cvt16to32_cnt_8, 3,3,6
add r2d, r2d
pxor m4, m4
- mov r3d, 8/4
- lea r4, [r2 * 3]
pxor m5, m5
-.loop
- ; row 0
+ ; row 0 & 1
+ movu m0, [r1]
+ movu m1, [r1 + r2]
+ movu [r0], m0
+ movu [r0 + 16], m1
+
+ packsswb m0, m1
+ pcmpeqb m0, m4
+ paddb m5, m0
+
+ ; row 2 & 3
+ lea r1, [r1 + 2 * r2]
movu m0, [r1]
- mova m2, m0
- pmovsxwd m1, m0
- punpckhwd m0, m0
- psrad m0, 16
- movu [r0 + 0 * mmsize], m1
- movu [r0 + 1 * mmsize], m0
-
- ; row 1
- movu m0, [r1 + r2]
- packsswb m2, m0
- pcmpeqb m2, m4
- paddb m5, m2
- pmovsxwd m1, m0
- punpckhwd m0, m0
- psrad m0, 16
- movu [r0 + 2 * mmsize], m1
- movu [r0 + 3 * mmsize], m0
-
- ; row 2
- movu m0, [r1 + r2 * 2]
- mova m2, m0
- pmovsxwd m1, m0
- punpckhwd m0, m0
- psrad m0, 16
- movu [r0 + 4 * mmsize], m1
- movu [r0 + 5 * mmsize], m0
-
- ; row 3
- movu m0, [r1 + r4]
- packsswb m2, m0
- pcmpeqb m2, m4
- paddb m5, m2
- pmovsxwd m1, m0
- punpckhwd m0, m0
- psrad m0, 16
- movu [r0 + 6 * mmsize], m1
- movu [r0 + 7 * mmsize], m0
-
- add r0, 8 * mmsize
- lea r1, [r1 + r2 * 4]
- dec r3d
- jnz .loop
+ movu m1, [r1 + r2]
+ movu [r0 + 32], m0
+ movu [r0 + 48], m1
+
+ packsswb m0, m1
+ pcmpeqb m0, m4
+ paddb m5, m0
+
+ ; row 4 & 5
+ lea r1, [r1 + 2 * r2]
+ movu m0, [r1]
+ movu m1, [r1 + r2]
+ movu [r0 + 64], m0
+ movu [r0 + 80], m1
+
+ packsswb m0, m1
+ pcmpeqb m0, m4
+ paddb m5, m0
+
+ ; row 6 & 7
+ lea r1, [r1 + 2 * r2]
+ movu m0, [r1]
+ movu m1, [r1 + r2]
+ movu [r0 + 96], m0
+ movu [r0 + 112], m1
+
+ packsswb m0, m1
+ pcmpeqb m0, m4
+ paddb m5, m0
; get count
- movhlps m3, m5
- paddb m3, m5
-
- paddb m3, [pb_8]
- psadbw m3, m4
-
- movd eax, m3
- RET
+ mova m0, [pb_4]
+ paddb m5, m0
+ psadbw m5, m4
+ pshufd m0, m5, 2
+ paddw m5, m0
+ movd eax, m5
+ RET
INIT_YMM avx2
@@ -4174,58 +4167,69 @@
;--------------------------------------------------------------------------------------
-; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; uint32_t cvt16to32_cnt(int16_t *dst, int16_t *src, intptr_t stride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal cvt16to32_cnt_16, 3,4,7
- add r2d, r2d
- mov r3d, 16/2
- pxor m5, m5
- pxor m6, m6
+cglobal cvt16to32_cnt_16, 3,4,6
+ add r2d, r2d
+ mov r3d, 4
+ pxor m4, m4
+ pxor m5, m5
.loop
; row 0
movu m0, [r1]
- movu m1, [r1 + mmsize]
- packsswb m4, m0, m1
- pcmpeqb m4, m6
- paddb m5, m4
- pmovsxwd m2, m0
- pmovsxwd m0, [r1 + 8]
- pmovsxwd m3, m1
- pmovsxwd m1, [r1 + mmsize + 8]
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m0
- movu [r0 + 2 * mmsize], m3
- movu [r0 + 3 * mmsize], m1
-
- ; row 1
+ movu m1, [r1 + 16]
+ movu [r0], m0
+ movu [r0 + 16], m1
+
+ packsswb m0, m1
+ pcmpeqb m0, m4
+ paddb m5, m0
+
+ ; row 1
movu m0, [r1 + r2]
- movu m1, [r1 + r2 + mmsize]
- packsswb m4, m0, m1
- pcmpeqb m4, m6
- paddb m5, m4
- pmovsxwd m2, m0
- pmovsxwd m0, [r1 + r2 + 8]
- pmovsxwd m3, m1
- pmovsxwd m1, [r1 + r2 + mmsize + 8]
- movu [r0 + 4 * mmsize], m2
- movu [r0 + 5 * mmsize], m0
- movu [r0 + 6 * mmsize], m3
- movu [r0 + 7 * mmsize], m1
-
- add r0, 8 * mmsize
- lea r1, [r1 + r2 * 2]
- dec r3d
- jnz .loop
-
- ; get count
- movhlps m0, m5
- paddb m0, m5
- paddb m0, [pb_32]
- psadbw m0, m6
- movd eax, m0
- RET
+ movu m1, [r1 + r2 + 16]
+ movu [r0 + 32], m0
+ movu [r0 + 48], m1
+
+ packsswb m0, m1
+ pcmpeqb m0, m4
+ paddb m5, m0
+
+ ; row 2
+ movu m0, [r1 + 2 * r2]
+ movu m1, [r1 + 2 * r2 + 16]
+ movu [r0 + 64], m0
+ movu [r0 + 80], m1
+
+ packsswb m0, m1
+ pcmpeqb m0, m4
+ paddb m5, m0
+
+ ; row 3
+ lea r1, [r1 + 2 * r2]
+ movu m0, [r1 + r2]
+ movu m1, [r1 + r2 + 16]
+ movu [r0 + 96], m0
+ movu [r0 + 112], m1
+
+ packsswb m0, m1
+ pcmpeqb m0, m4
+ paddb m5, m0
+
+ add r0, 128
+ lea r1, [r1 + 2 * r2]
+ dec r3d
+ jnz .loop
+
+ mova m0, [pb_16]
+ paddb m5, m0
+ psadbw m5, m4
+ pshufd m0, m5, 2
+ paddw m5, m0
+ movd eax, m5
+ RET
INIT_YMM avx2
@@ -4294,59 +4298,68 @@
movd eax, xm0
RET
-
;--------------------------------------------------------------------------------------
-; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; uint32_t cvt16to32_cnt(int16_t *dst, int16_t *src, intptr_t stride);
;--------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal cvt16to32_cnt_32, 3,4,8
+cglobal cvt16to32_cnt_32, 3,4,6
add r2d, r2d
- mov r3d, 32/1
- pxor m6, m6
- pxor m7, m7
+ mov r3d, 16
+ pxor m4, m4
+ pxor m5, m5
.loop
; row 0
- movu m0, [r1 + 0 * mmsize]
- movu m1, [r1 + 1 * mmsize]
- movu m2, [r1 + 2 * mmsize]
- movu m3, [r1 + 3 * mmsize]
- packsswb m4, m0, m1
- packsswb m5, m2, m3
- pcmpeqb m4, m7
- pcmpeqb m5, m7
- paddb m6, m4
- paddb m6, m5
-
- pmovsxwd m4, m0
- pmovsxwd m5, [r1 + 0 * mmsize + mmsize/2]
- movu [r0 + 0 * mmsize], m4
- movu [r0 + 1 * mmsize], m5
- pmovsxwd m4, m1
- pmovsxwd m5, [r1 + 1 * mmsize + mmsize/2]
- movu [r0 + 2 * mmsize], m4
- movu [r0 + 3 * mmsize], m5
- pmovsxwd m4, m2
- pmovsxwd m5, [r1 + 2 * mmsize + mmsize/2]
- movu [r0 + 4 * mmsize], m4
- movu [r0 + 5 * mmsize], m5
- pmovsxwd m4, m3
- pmovsxwd m5, [r1 + 3 * mmsize + mmsize/2]
- movu [r0 + 6 * mmsize], m4
- movu [r0 + 7 * mmsize], m5
-
- add r0, 8 * mmsize
- add r1, r2
- dec r3d
- jnz .loop
-
- ; get count
- movhlps m0, m6
- paddb m0, m6
- paddb m0, [pb_128]
- psadbw m0, m7
- movd eax, m0
- RET
+ movu m0, [r1]
+ movu m1, [r1 + 16]
+ movu [r0], m0
+ movu [r0 + 16], m1
+
+ packsswb m0, m1
+ pcmpeqb m0, m4
+ paddb m5, m0
+
+ movu m0, [r1 + 32]
+ movu m1, [r1 + 48]
+ movu [r0 + 32], m0
+ movu [r0 + 48], m1
+
+ packsswb m0, m1
+ pcmpeqb m0, m4
+ paddb m5, m0
+
+ ; row 1
+ movu m0, [r1 + r2]
+ movu m1, [r1 + r2 + 16]
+ movu [r0 + 64], m0
+ movu [r0 + 80], m1
+
+ packsswb m0, m1
+ pcmpeqb m0, m4
+ paddb m5, m0
+
+ movu m0, [r1 + r2 + 32]
+ movu m1, [r1 + r2 + 48]
+ movu [r0 + 96], m0
+ movu [r0 + 112], m1
+
+ packsswb m0, m1
+ pcmpeqb m0, m4
+ paddb m5, m0
+
+ add r0, 128
+ lea r1, [r1 + 2 * r2]
+ dec r3d
+ jnz .loop
+
+ ; get count
+ mova m0, [pb_64]
+ paddb m5, m0
+ psadbw m5, m4
+ pshufd m0, m5, 2
+ paddw m5, m0
+ movd eax, m5
+ RET
INIT_YMM avx2
diff -r c09f34b0ab57 -r 51b5a6d820da source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Mon Aug 25 16:54:19 2014 +0530
+++ b/source/common/x86/const-a.asm Mon Sep 01 17:07:05 2014 +0530
@@ -48,6 +48,9 @@
const pb_unpackwq2, db 4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7
const pw_swap, times 2 db 6,7,4,5,2,3,0,1
+const pb_4, times 16 db 4
+const pb_16, times 16 db 16
+const pb_64, times 16 db 64
const pb_01, times 8 db 0,1
const pb_0, times 16 db 0
const pb_1, times 32 db 1
diff -r c09f34b0ab57 -r 51b5a6d820da source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Aug 25 16:54:19 2014 +0530
+++ b/source/test/pixelharness.cpp Mon Sep 01 17:07:05 2014 +0530
@@ -592,14 +592,11 @@
intptr_t stride = STRIDE;
for (int i = 0; i < ITERS; i++)
{
-#ifdef _DEBUG
- memset(ref_dest, 0xCD, sizeof(ref_dest));
- memset(opt_dest, 0xCD, sizeof(opt_dest));
-#endif
- int opt_cnt = (int)checked(opt, opt_dest, sbuf1 + j, stride);
- int ref_cnt = ref(ref_dest, sbuf1 + j, stride);
+ int index = i % TEST_CASES;
+ int opt_cnt = (int)checked(opt, opt_dest, short_test_buff1[index] + j, stride);
+ int ref_cnt = ref(ref_dest, short_test_buff1[index] + j, stride);
- if ((ref_cnt != opt_cnt) || memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int32_t)))
+ if ((ref_cnt != opt_cnt) || memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
return false;
reportfail();
More information about the x265-devel
mailing list