[x265] [PATCH 1 of 3] asm: Assembly SSE2/AVX2 for planecopy_sp_shl
Min Chen
chenm003 at 163.com
Tue Jul 14 02:54:59 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1436834277 25200
# Node ID 0ddd2f402f7bb5ea4c8b2c26d9220873e5bea73d
# Parent 426169ca6c76d3feab03cd0b0c239c9547079a63
asm: Assembly SSE2/AVX2 for planecopy_sp_shl
---
source/common/x86/asm-primitives.cpp | 7 +
source/common/x86/pixel-a.asm | 216 +++++++++++++++++++++++++++++++++-
source/common/x86/pixel.h | 2 +
source/test/pixelharness.cpp | 13 ++-
4 files changed, 235 insertions(+), 3 deletions(-)
diff -r 426169ca6c76 -r 0ddd2f402f7b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jul 13 16:53:29 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jul 13 17:37:57 2015 -0700
@@ -1012,6 +1012,9 @@
LUMA_VSS_FILTERS(sse2);
p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
+ // TODO: planecopy_sp is really planecopy_SC now; this must be fixed
+ //p.planecopy_sp = PFX(downShift_16_sse2);
+ p.planecopy_sp_shl = PFX(upShift_16_sse2);
ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
@@ -1292,6 +1295,10 @@
{
p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
+ // TODO: planecopy_sp is really planecopy_SC now; this must be fixed
+ //p.planecopy_sp = PFX(downShift_16_avx2);
+ p.planecopy_sp_shl = PFX(upShift_16_avx2);
+
p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
diff -r 426169ca6c76 -r 0ddd2f402f7b source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Jul 13 16:53:29 2015 +0530
+++ b/source/common/x86/pixel-a.asm Mon Jul 13 17:37:57 2015 -0700
@@ -70,6 +70,7 @@
cextern pd_2
cextern hmul_16p
cextern pb_movemask
+cextern pw_pixel_max
;=============================================================================
; SATD
@@ -7092,7 +7093,7 @@
; Input 10bit, Output 8bit
;------------------------------------------------------------------------------------------------------------------------
-;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
;------------------------------------------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal downShift_16, 7,7,3
@@ -7466,6 +7467,219 @@
%endif
%endmacro
+
+; Input 10bit, Output 12bit
+;------------------------------------------------------------------------------------------------------------------------
+;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+;------------------------------------------------------------------------------------------------------------------------
+; Computes dst[x] = (src[x] << shift) & pw_pixel_max for every 16-bit sample of a width x height plane.
+;
+; Registers:
+;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4d = width, r5d = height
+;   r6m = shift (read from its argument slot), r6 = column index inside .loopW
+;   m0 = shift count, m3 = pw_pixel_max, m1/m2 = working samples
+;
+; Notes:
+;   * The 'mask' argument is never read; the constant pw_pixel_max is applied instead.
+;     NOTE(review): assumes callers always pass mask == pixel max - confirm.
+;   * All rows except the last are processed in steps of 16 samples, so up to 15 samples past
+;     'width' are read and written inside those rows (presumably covered by stride padding - confirm).
+;   * The last row is processed exactly: 16/8/4/2/1 samples, never touching anything past 'width'.
+;   * width <= 0 or height <= 0 returns without touching memory.
+INIT_XMM sse2
+cglobal upShift_16, 6,7,4
+    movd        m0, r6m                 ; m0 = shift
+    mova        m3, [pw_pixel_max]
+    FIX_STRIDES r1d, r3d
+
+    test        r4d, r4d
+    jle         .end                    ; nothing to do for an empty row
+    dec         r5d                     ; r5d = number of rows handled by the padded loop
+    jl          .end                    ; height <= 0
+    jz          .lastRow                ; height == 1: only the exact last-row path runs
+
+.loopH:
+    xor         r6d, r6d
+.loopW:
+    movu        m1, [r0 + r6 * SIZEOF_PIXEL]
+    movu        m2, [r0 + r6 * SIZEOF_PIXEL + mmsize]
+    psllw       m1, m0
+    psllw       m2, m0
+    ; TODO: if input always valid, we can remove below 2 instructions.
+    pand        m1, m3
+    pand        m2, m3
+    movu        [r2 + r6 * SIZEOF_PIXEL], m1
+    movu        [r2 + r6 * SIZEOF_PIXEL + mmsize], m2
+
+    add         r6, mmsize * 2 / SIZEOF_PIXEL
+    cmp         r6d, r4d
+    jl          .loopW
+
+    ; move to next row
+    add         r0, r1
+    add         r2, r3
+    dec         r5d
+    jnz         .loopH
+
+; processing last row of every frame [to handle a width which is not a multiple of 16]
+; invariant: r4d = samples still to be written in this row, always > 0 at each label below
+.lastRow:
+    cmp         r4d, 16
+    jl          .process8
+
+.loop16:
+    movu        m1, [r0]
+    movu        m2, [r0 + mmsize]
+    psllw       m1, m0
+    psllw       m2, m0
+    pand        m1, m3
+    pand        m2, m3
+    movu        [r2], m1
+    movu        [r2 + mmsize], m2
+
+    add         r0, 2 * mmsize
+    add         r2, 2 * mmsize
+    sub         r4d, 16
+    jz          .end
+    cmp         r4d, 16                 ; only loop again while a full 16-sample group remains
+    jge         .loop16
+
+.process8:
+    cmp         r4d, 8
+    jl          .process4
+    movu        m1, [r0]
+    psllw       m1, m0                  ; left shift, same as every other path
+    pand        m1, m3
+    movu        [r2], m1
+
+    add         r0, mmsize
+    add         r2, mmsize
+    sub         r4d, 8
+    jz          .end
+
+.process4:
+    cmp         r4d, 4
+    jl          .process2
+    movh        m1, [r0]
+    psllw       m1, m0
+    pand        m1, m3
+    movh        [r2], m1
+
+    add         r0, 8
+    add         r2, 8
+    sub         r4d, 4
+    jz          .end
+
+.process2:
+    cmp         r4d, 2
+    jl          .process1
+    movd        m1, [r0]
+    psllw       m1, m0
+    pand        m1, m3
+    movd        [r2], m1
+
+    add         r0, 4
+    add         r2, 4
+    sub         r4d, 2
+    jz          .end
+
+.process1:
+    ; exactly one sample left; r3 (dstStride) is dead here and reused as scratch
+    movzx       r3d, word [r0]          ; load 2 bytes only, no read past the last sample
+    movd        m1, r3d
+    psllw       m1, m0
+    pand        m1, m3
+    movd        r3d, m1
+    mov         [r2], r3w
+.end:
+    RET
+
+; Input 10bit, Output 12bit
+;-------------------------------------------------------------------------------------------------------------------------------------
+;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+;-------------------------------------------------------------------------------------------------------------------------------------
+; Computes dst[x] = (src[x] << shift) & pw_pixel_max for every 16-bit sample of a width x height plane.
+;
+; Registers:
+;   r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4d = width, r5d = height
+;   r6m = shift (read from its argument slot), r6 = column index inside .loopW
+;   xm0 = shift count, m3 = pw_pixel_max broadcast to both 128-bit lanes, m1/m2 = working samples
+;
+; Notes:
+;   * The 'mask' argument is never read; the constant pw_pixel_max is applied instead.
+;     NOTE(review): assumes callers always pass mask == pixel max - confirm.
+;   * All rows except the last are processed in steps of 32 samples, so up to 31 samples past
+;     'width' are read and written inside those rows (presumably covered by stride padding - confirm).
+;   * The last row is processed exactly: 32/16/8/4/2/1 samples, never touching anything past 'width'.
+;   * width <= 0 or height <= 0 returns without touching memory.
+;   * NOTE(review): the original carried "TODO: NO TEST CODE!"; this patch adds a planecopy_sp_shl
+;     check to pixelharness.cpp - confirm it exercises this AVX2 path.
+INIT_YMM avx2
+cglobal upShift_16, 6,7,4
+    movd        xm0, r6m                ; xm0 = shift
+    vbroadcasti128 m3, [pw_pixel_max]
+    FIX_STRIDES r1d, r3d
+
+    test        r4d, r4d
+    jle         .end                    ; nothing to do for an empty row
+    dec         r5d                     ; r5d = number of rows handled by the padded loop
+    jl          .end                    ; height <= 0
+    jz          .lastRow                ; height == 1: only the exact last-row path runs
+
+.loopH:
+    xor         r6d, r6d
+.loopW:
+    movu        m1, [r0 + r6 * SIZEOF_PIXEL]
+    movu        m2, [r0 + r6 * SIZEOF_PIXEL + mmsize]
+    psllw       m1, xm0
+    psllw       m2, xm0
+    pand        m1, m3
+    pand        m2, m3
+    movu        [r2 + r6 * SIZEOF_PIXEL], m1
+    movu        [r2 + r6 * SIZEOF_PIXEL + mmsize], m2
+
+    add         r6, mmsize * 2 / SIZEOF_PIXEL
+    cmp         r6d, r4d
+    jl          .loopW
+
+    ; move to next row
+    add         r0, r1
+    add         r2, r3
+    dec         r5d
+    jnz         .loopH
+
+; processing last row of every frame [to handle a width which is not a multiple of 32]
+; invariant: r4d = samples still to be written in this row, always > 0 at each label below
+.lastRow:
+    cmp         r4d, 32
+    jl          .process16              ; fewer than 32 samples: skip the 32-wide loop entirely
+
+.loop32:
+    movu        m1, [r0]
+    movu        m2, [r0 + mmsize]
+    psllw       m1, xm0
+    psllw       m2, xm0
+    pand        m1, m3
+    pand        m2, m3
+    movu        [r2], m1
+    movu        [r2 + mmsize], m2
+
+    add         r0, 2 * mmsize
+    add         r2, 2 * mmsize
+    sub         r4d, 32
+    jz          .end                    ; width was a multiple of 32: done, no tail
+    cmp         r4d, 32
+    jge         .loop32
+
+.process16:
+    cmp         r4d, 16
+    jl          .process8
+    movu        m1, [r0]
+    psllw       m1, xm0
+    pand        m1, m3
+    movu        [r2], m1
+
+    add         r0, mmsize
+    add         r2, mmsize
+    sub         r4d, 16
+    jz          .end
+
+.process8:
+    cmp         r4d, 8
+    jl          .process4
+    movu        xm1, [r0]
+    psllw       xm1, xm0
+    pand        xm1, xm3
+    movu        [r2], xm1
+
+    add         r0, 16
+    add         r2, 16
+    sub         r4d, 8
+    jz          .end
+
+.process4:
+    cmp         r4d, 4
+    jl          .process2
+    movq        xm1, [r0]
+    psllw       xm1, xm0
+    pand        xm1, xm3
+    movq        [r2], xm1
+
+    add         r0, 8
+    add         r2, 8
+    sub         r4d, 4
+    jz          .end
+
+.process2:
+    cmp         r4d, 2
+    jl          .process1
+    movd        xm1, [r0]
+    psllw       xm1, xm0
+    pand        xm1, xm3
+    movd        [r2], xm1
+
+    add         r0, 4
+    add         r2, 4
+    sub         r4d, 2
+    jz          .end
+
+.process1:
+    ; exactly one sample left; r3 (dstStride) is dead here and reused as scratch
+    movzx       r3d, word [r0]          ; load 2 bytes only, no read past the last sample
+    movd        xm1, r3d
+    psllw       xm1, xm0
+    pand        xm1, xm3
+    movd        r3d, xm1
+    mov         [r2], r3w
+.end:
+    RET
+
+
;---------------------------------------------------------------------------------------------------------------------
;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
;---------------------------------------------------------------------------------------------------------------------
diff -r 426169ca6c76 -r 0ddd2f402f7b source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Jul 13 16:53:29 2015 +0530
+++ b/source/common/x86/pixel.h Mon Jul 13 17:37:57 2015 -0700
@@ -30,6 +30,8 @@
void PFX(downShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+void PFX(upShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
diff -r 426169ca6c76 -r 0ddd2f402f7b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Jul 13 16:53:29 2015 +0530
+++ b/source/test/pixelharness.cpp Mon Jul 13 17:37:57 2015 -0700
@@ -1283,8 +1283,8 @@
for (int i = 0; i < ITERS; i++)
{
int index = i % TEST_CASES;
- checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)255);
- ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)8, (uint16_t)255);
+ checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
+ ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel)))
return false;
@@ -2266,6 +2266,15 @@
}
}
+ if (opt.planecopy_sp_shl)
+ {
+ if (!check_planecopy_sp(ref.planecopy_sp_shl, opt.planecopy_sp_shl))
+ {
+ printf("planecopy_sp_shl failed\n");
+ return false;
+ }
+ }
+
if (opt.planecopy_cp)
{
if (!check_planecopy_cp(ref.planecopy_cp, opt.planecopy_cp))
More information about the x265-devel
mailing list