[x265] [PATCH 1 of 2] asm: rewrite and fix bug in weight_pp_sse4 on HIGH_BIT_DEPTH mode
Min Chen
chenm003 at 163.com
Mon Jan 19 11:22:00 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1421662905 -28800
# Node ID a0bb3bb1b076d2ef559ab94bfe81052142d302c3
# Parent bbc333bd4a6207c72c682b3ea88794c67996aa83
asm: rewrite and fix bug in weight_pp_sse4 on HIGH_BIT_DEPTH mode
---
source/common/x86/asm-primitives.cpp | 2 +-
source/common/x86/pixel-util8.asm | 55 +++++++++++++++++++++-------------
source/test/pixelharness.cpp | 45 +++++++++++++++++++++++++++
3 files changed, 80 insertions(+), 22 deletions(-)
diff -r bbc333bd4a62 -r a0bb3bb1b076 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 19 09:59:33 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 19 18:21:45 2015 +0800
@@ -924,7 +924,7 @@
p.planecopy_cp = x265_upShift_8_sse4;
// these fail unit tests
- // p.weight_pp = x265_weight_pp_sse4;
+ p.weight_pp = x265_weight_pp_sse4;
// p.weight_sp = x265_weight_sp_sse4;
p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
diff -r bbc333bd4a62 -r a0bb3bb1b076 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Jan 19 09:59:33 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Mon Jan 19 18:21:45 2015 +0800
@@ -55,6 +55,8 @@
cextern pw_1
cextern pb_1
cextern pw_00ff
+cextern pw_1023
+cextern pw_3fff
cextern pw_2000
cextern pw_pixel_max
cextern pd_1
@@ -856,26 +858,52 @@
;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
;-----------------------------------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal weight_pp, 6, 7, 6
-
- shl r5d, 6 ; m0 = [w0<<6]
+cglobal weight_pp, 4,7,7
+%define correction (14 - BIT_DEPTH)
+%if BIT_DEPTH == 10
+ mova m6, [pw_1023]
+%elif BIT_DEPTH == 12
+ mova m6, [pw_3fff]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
mov r6d, r6m
- shl r6d, 16
- or r6d, r5d ; assuming both (w0<<6) and round are using maximum of 16 bits each.
+ mov r4d, r4m
+ mov r5d, r5m
+ shl r6d, 16 - correction
+ or r6d, r5d ; assuming both (w0) and round are using maximum of 16 bits each.
movd m0, r6d
- pshufd m0, m0, 0 ; m0 = [w0<<6, round]
- movd m1, r7m
+ pshufd m0, m0, 0 ; m0 = [w0, round]
+ mov r5d, r7m
+ sub r5d, correction
+ movd m1, r5d
movd m2, r8m
pshufd m2, m2, 0
mova m5, [pw_1]
sub r2d, r3d
+ add r2d, r2d
shr r3d, 4
.loopH:
mov r5d, r3d
.loopW:
- pmovzxbw m4, [r0]
+ movu m4, [r0]
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m0
+ psrad m3, m1
+ paddd m3, m2 ; TODO: we can put Offset into Round, but we have to analyze Dynamic Range before that.
+
+ punpckhwd m4, m5
+ pmaddwd m4, m0
+ psrad m4, m1
+ paddd m4, m2
+
+ packusdw m3, m4
+ pminuw m3, m6
+ movu [r1], m3
+
+ movu m4, [r0 + mmsize]
punpcklwd m3, m4, m5
pmaddwd m3, m0
psrad m3, m1
@@ -886,33 +914,18 @@
psrad m4, m1
paddd m4, m2
- packssdw m3, m4
- packuswb m3, m3
- movh [r1], m3
-
- pmovzxbw m4, [r0 + 8]
- punpcklwd m3, m4, m5
- pmaddwd m3, m0
- psrad m3, m1
- paddd m3, m2
-
- punpckhwd m4, m5
- pmaddwd m4, m0
- psrad m4, m1
- paddd m4, m2
-
- packssdw m3, m4
- packuswb m3, m3
- movh [r1 + 8], m3
-
- add r0, 16
- add r1, 16
+ packusdw m3, m4
+ pminuw m3, m6
+ movu [r1 + mmsize], m3
+
+ add r0, 2 * mmsize
+ add r1, 2 * mmsize
dec r5d
- jnz .loopW
-
- lea r0, [r0 + r2]
- lea r1, [r1 + r2]
+ jnz .loopW
+
+ add r0, r2
+ add r1, r2
dec r4d
jnz .loopH
diff -r bbc333bd4a62 -r a0bb3bb1b076 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Jan 19 09:59:33 2015 +0530
+++ b/source/test/pixelharness.cpp Mon Jan 19 18:21:45 2015 +0800
@@ -243,7 +243,29 @@
ref(short_test_buff[index] + j, ref_dest, stride, stride, width, height, w0, round << correction, shift + correction, offset);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+ {
+ printf("--- Ref ---\n");
+ for(int y = 0; y < 16; y++)
+ {
+ for(int x = 0; x < 16; x++)
+ {
+ printf("%04X, ", ref_dest[y * stride + x] & 0xFFFF);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ printf("--- Opt ---\n");
+ for(int y = 0; y < 16; y++)
+ {
+ for(int x = 0; x < 16; x++)
+ {
+ printf("%04X, ", opt_dest[y * stride + x] & 0xFFFF);
+ }
+ printf("\n");
+ }
+ printf("\n");
return false;
+ }
reportfail();
j += INCR;
@@ -275,7 +297,30 @@
ref(pixel_test_buff[index] + j, ref_dest, stride, width, height, w0, round << correction, shift + correction, offset);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+ {
+ printf("--- Ref ---\n");
+ for(int y = 0; y < 16; y++)
+ {
+ for(int x = 0; x < 16; x++)
+ {
+ printf("%04X, ", ref_dest[y * stride + x] & 0xFFFF);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ printf("--- Opt ---\n");
+ for(int y = 0; y < 16; y++)
+ {
+ for(int x = 0; x < 16; x++)
+ {
+ printf("%04X, ", opt_dest[y * stride + x] & 0xFFFF);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ checked(opt, pixel_test_buff[index] + j, opt_dest, stride, width, height, w0, round << correction, shift + correction, offset);
return false;
+ }
reportfail();
j += INCR;
More information about the x265-devel mailing list