[x265] [PATCH 2 of 2] asm: rewrite and fix bug in weight_sp_sse4 on HIGH_BIT_DEPTH mode
Min Chen
chenm003 at 163.com
Mon Jan 19 11:22:01 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1421662910 -28800
# Node ID b2f64dbe26392dd6bea2badaccf2869bec883392
# Parent a0bb3bb1b076d2ef559ab94bfe81052142d302c3
asm: rewrite and fix bug in weight_sp_sse4 on HIGH_BIT_DEPTH mode
---
source/common/pixel.cpp | 7 ++
source/common/x86/asm-primitives.cpp | 2 +-
source/common/x86/const-a.asm | 1 +
source/common/x86/pixel-util8.asm | 134 +++++++++++++++++-----------------
source/test/pixelharness.cpp | 10 ++-
5 files changed, 81 insertions(+), 73 deletions(-)
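
For reviewers, a minimal C++ sketch of the scalar computation this primitive performs, inferred from the prototype in pixel-util8.asm and the constants the new asm uses (pw_2000 = IF_INTERNAL_OFFS, pw_1023/pw_3fff = pixel max). The real reference is the function touched in pixel.cpp below; this sketch may differ in clip-helper details:

#include <algorithm>
#include <cstdint>

typedef uint16_t pixel;   // HIGH_BIT_DEPTH build assumption

static void weight_sp_ref(const int16_t* src, pixel* dst,
                          intptr_t srcStride, intptr_t dstStride,
                          int width, int height,
                          int w0, int round, int shift, int offset, int bitDepth)
{
    const int internalOffs = 1 << 13;          // IF_INTERNAL_OFFS, pw_2000 in the asm
    const int maxVal = (1 << bitDepth) - 1;    // pw_1023 (10-bit) / pw_3fff (12-bit)

    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            int v = ((w0 * (src[x] + internalOffs) + round) >> shift) + offset;
            dst[x] = (pixel)std::min(std::max(v, 0), maxVal);
        }
        src += srcStride;
        dst += dstStride;
    }
}
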
diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Jan 19 18:21:45 2015 +0800
+++ b/source/common/pixel.cpp Mon Jan 19 18:21:50 2015 +0800
@@ -520,6 +520,13 @@
{
int x, y;
+ const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
+
+ X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
+ X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");
+ X265_CHECK((shift >= correction), "shift must include the factor correction, please update ASM ABI\n");
+ X265_CHECK(!(round & ((1 << correction) - 1)), "round must include the factor correction, please update ASM ABI\n");
+
for (y = 0; y <= height - 1; y++)
{
for (x = 0; x <= width - 1; )
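
The new checks exist because the rewritten asm below packs w0 and round into adjacent 16-bit lanes of one PMADDWD multiplier ("shl r6d, 16 / or r6d, r6m") and expects the caller to have already folded the IF_INTERNAL_PREC correction into round and shift. A hedged sketch of that packing (illustrative helper, not x265 API):

#include <cassert>
#include <cstdint>

// Pack w0 (low word) and round (high word) into one dword, as the asm does.
// Both must fit a signed 16-bit PMADDWD lane, which is what the checks enforce.
static uint32_t pack_w0_round(int w0, int round)
{
    assert((w0 << 6) <= 32767 && round <= 32767);
    return ((uint32_t)(uint16_t)round << 16) | (uint16_t)w0;
}

// PMADDWD on the interleaved pair [v, 1] then yields v * w0 + 1 * round per dword.
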
diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 19 18:21:45 2015 +0800
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 19 18:21:50 2015 +0800
@@ -925,7 +925,7 @@
p.planecopy_cp = x265_upShift_8_sse4;
// these fail unit tests
p.weight_pp = x265_weight_pp_sse4;
- // p.weight_sp = x265_weight_sp_sse4;
+ p.weight_sp = x265_weight_sp_sse4;
p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
#if X86_64
diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Mon Jan 19 18:21:45 2015 +0800
+++ b/source/common/x86/const-a.asm Mon Jan 19 18:21:50 2015 +0800
@@ -63,6 +63,7 @@
const pb_128, times 16 db 128
const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
+const pw_0_15, times 2 dw 0, 1, 2, 3, 4, 5, 6, 7
const pw_2, times 8 dw 2
const pw_m2, times 8 dw -2
const pw_4, times 8 dw 4
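
pw_0_15 is the lane-index table the rewritten weight_sp uses to build its partial-width mask: the tail count is broadcast to every word lane and compared against 0..7, giving all-ones in exactly the lanes that still need to be written. A sketch with SSE4.1 intrinsics (illustrative names, not x265 API):

#include <smmintrin.h>   // SSE4.1: _mm_blendv_epi8

// Mask is 0xFFFF for word lanes whose index is below the remaining width.
static __m128i tail_mask(int remainder)                             // remainder = width & 7
{
    const __m128i idx = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);     // pw_0_15
    return _mm_cmpgt_epi16(_mm_set1_epi16((short)remainder), idx);
}

// The .widthLess8 path then merges new pixels over the old destination:
//   __m128i merged = _mm_blendv_epi8(oldDst, weighted, tail_mask(width & 7));
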
diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Jan 19 18:21:45 2015 +0800
+++ b/source/common/x86/pixel-util8.asm Mon Jan 19 18:21:50 2015 +0800
@@ -53,6 +53,7 @@
SECTION .text
cextern pw_1
+cextern pw_0_15
cextern pb_1
cextern pw_00ff
cextern pw_1023
@@ -63,7 +64,6 @@
cextern pd_32767
cextern pd_n32768
-
;-----------------------------------------------------------------------------
; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
;-----------------------------------------------------------------------------
@@ -986,84 +986,82 @@
;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
;-------------------------------------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-%if ARCH_X86_64
-cglobal weight_sp, 6, 7+2, 7
- %define tmp_r0 r7
- %define tmp_r1 r8
-%else ; ARCH_X86_64 = 0
-cglobal weight_sp, 6, 7, 7, 0-(2*4)
- %define tmp_r0 [(rsp + 0 * 4)]
- %define tmp_r1 [(rsp + 1 * 4)]
-%endif ; ARCH_X86_64
-
- movd m0, r6m ; m0 = [w0]
-
- movd m1, r7m ; m1 = [round]
- punpcklwd m0, m1
- pshufd m0, m0, 0 ; m0 = [w0 round]
-
- movd m1, r8m ; m1 = [shift]
-
- movd m2, r9m
- pshufd m2, m2, 0 ; m2 =[offset]
-
- mova m3, [pw_1]
- mova m4, [pw_2000]
-
+cglobal weight_sp, 6,7,8
+%if BIT_DEPTH == 10
+ mova m1, [pw_1023]
+%elif BIT_DEPTH == 12
+ mova m1, [pw_3fff]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+ mova m2, [pw_1]
+ mov r6d, r7m
+ shl r6d, 16
+ or r6d, r6m ; assumes both w0 and round use at most 16 bits each
+ movd m3, r6d
+ pshufd m3, m3, 0 ; m3 = [round w0]
+
+ movd m4, r8m ; m4 = [shift]
+ movd m5, r9m
+ pshufd m5, m5, 0 ; m5 = [offset]
+
+ ; correct row stride
+ add r3d, r3d
add r2d, r2d
+ mov r6d, r4d
+ and r6d, ~(mmsize / SIZEOF_PIXEL - 1)
+ sub r3d, r6d
+ sub r3d, r6d
+ sub r2d, r6d
+ sub r2d, r6d
+
+ ; generate partial width mask (MUST BE IN XMM0)
+ mov r6d, r4d
+ and r6d, (mmsize / SIZEOF_PIXEL - 1)
+ movd m0, r6d
+ pshuflw m0, m0, 0
+ punpcklqdq m0, m0
+ pcmpgtw m0, [pw_0_15]
.loopH:
mov r6d, r4d
- ; save old src and dst
- mov tmp_r0, r0
- mov tmp_r1, r1
.loopW:
- movu m5, [r0]
- paddw m5, m4
-
- punpcklwd m6,m5, m3
- pmaddwd m6, m0
- psrad m6, m1
- paddd m6, m2
-
- punpckhwd m5, m3
- pmaddwd m5, m0
- psrad m5, m1
- paddd m5, m2
-
- packssdw m6, m5
- packuswb m6, m6
-
- sub r6d, 8
- jl .width4
- movh [r1], m6
- je .nextH
- add r0, 16
- add r1, 8
-
- jmp .loopW
-
-.width4:
- cmp r6d, -4
- jl .width2
- movd [r1], m6
- je .nextH
- add r1, 4
- pshufd m6, m6, 1
-
-.width2:
- pextrw [r1], m6, 0
+ movu m6, [r0]
+ paddw m6, [pw_2000]
+
+ punpcklwd m7, m6, m2
+ pmaddwd m7, m3
+ psrad m7, m4
+ paddd m7, m5
+
+ punpckhwd m6, m2
+ pmaddwd m6, m3
+ psrad m6, m4
+ paddd m6, m5
+
+ packusdw m7, m6
+ pminuw m7, m1
+
+ sub r6d, (mmsize / SIZEOF_PIXEL)
+ jl .widthLess8
+ movu [r1], m7
+ lea r0, [r0 + mmsize]
+ lea r1, [r1 + mmsize]
+ je .nextH
+ jmp .loopW
+
+.widthLess8:
+ movu m6, [r1]
+ pblendvb m6, m6, m7, m0
+ movu [r1], m6
.nextH:
- mov r0, tmp_r0
- mov r1, tmp_r1
- lea r0, [r0 + r2]
- lea r1, [r1 + r3]
+ add r0, r2
+ add r1, r3
dec r5d
jnz .loopH
-
RET
;-----------------------------------------------------------------
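
For reference, one 8-pixel iteration of the new inner loop expressed with SSE4.1 intrinsics; this is only a sketch mirroring the paddw/pmaddwd/psrad/packusdw/pminuw sequence above, with illustrative names:

#include <smmintrin.h>   // SSE4.1: _mm_packus_epi32, _mm_min_epu16
#include <cstdint>

static __m128i weight_sp_8(const int16_t* src, int w0, int round,
                           int shift, int offset, int maxVal)
{
    const __m128i ones   = _mm_set1_epi16(1);                       // pw_1
    const __m128i offs   = _mm_set1_epi16(0x2000);                  // pw_2000
    const __m128i wr     = _mm_set1_epi32((int)(((uint32_t)(uint16_t)round << 16) | (uint16_t)w0));
    const __m128i off32  = _mm_set1_epi32(offset);
    const __m128i maxPix = _mm_set1_epi16((short)maxVal);           // pw_1023 / pw_3fff
    const __m128i cnt    = _mm_cvtsi32_si128(shift);                 // psrad by register

    __m128i v  = _mm_add_epi16(_mm_loadu_si128((const __m128i*)src), offs);
    __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi16(v, ones), wr);   // v*w0 + round
    __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi16(v, ones), wr);
    lo = _mm_add_epi32(_mm_sra_epi32(lo, cnt), off32);
    hi = _mm_add_epi32(_mm_sra_epi32(hi, cnt), off32);
    return _mm_min_epu16(_mm_packus_epi32(lo, hi), maxPix);         // clamp to [0, maxVal]
}
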
diff -r a0bb3bb1b076 -r b2f64dbe2639 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Jan 19 18:21:45 2015 +0800
+++ b/source/test/pixelharness.cpp Mon Jan 19 18:21:50 2015 +0800
@@ -222,8 +222,8 @@
bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)
{
- ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
- ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+ ALIGN_VAR_16(pixel, ref_dest[64 * (64 + 1)]);
+ ALIGN_VAR_16(pixel, opt_dest[64 * (64 + 1)]);
memset(ref_dest, 0, 64 * 64 * sizeof(pixel));
memset(opt_dest, 0, 64 * 64 * sizeof(pixel));
@@ -236,11 +236,12 @@
int offset = (rand() % 256) - 128;
intptr_t stride = 64;
const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
+
for (int i = 0; i < ITERS; i++)
{
int index = i % TEST_CASES;
- checked(opt, short_test_buff[index] + j, opt_dest, stride, stride, width, height, w0, round << correction, shift + correction, offset);
- ref(short_test_buff[index] + j, ref_dest, stride, stride, width, height, w0, round << correction, shift + correction, offset);
+ checked(opt, short_test_buff[index] + j, opt_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);
+ ref(short_test_buff[index] + j, ref_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
{
@@ -264,6 +265,7 @@
printf("\n");
}
printf("\n");
+ opt(short_test_buff[index] + j, opt_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);
return false;
}