[x265] [PATCH 2 of 3] asm: code for input pixel upShift/downShift
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Mon Mar 31 14:22:56 CEST 2014
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1396268498 -19800
# Mon Mar 31 17:51:38 2014 +0530
# Node ID 1375d631208818cb290eb462a56e50ea224d0c19
# Parent b0931c9326dc4a20718bf4d642c0ef1fcd7cd494
asm: code for input pixel upShift/downShift
diff -r b0931c9326dc -r 1375d6312088 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Mar 31 17:50:37 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Mar 31 17:51:38 2014 +0530
@@ -1071,6 +1071,7 @@
p.intra_pred[BLOCK_8x8][1] = x265_intra_pred_dc8_sse4;
p.intra_pred[BLOCK_16x16][1] = x265_intra_pred_dc16_sse4;
p.intra_pred[BLOCK_32x32][1] = x265_intra_pred_dc32_sse4;
+ p.planecopy_cp = x265_upShift_8_sse4;
INTRA_ANG_SSE4_COMMON(sse4);
INTRA_ANG_SSE4_HIGH(sse4);
@@ -1169,6 +1170,7 @@
p.idct[IDCT_4x4] = x265_idct4_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
p.count_nonzero = x265_count_nonzero_sse2;
+ p.planecopy_sp = x265_downShift_16_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
diff -r b0931c9326dc -r 1375d6312088 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Mar 31 17:50:37 2014 +0530
+++ b/source/common/x86/pixel-a.asm Mon Mar 31 17:51:38 2014 +0530
@@ -6356,3 +6356,190 @@
RET
%endif ; HIGH_BIT_DEPTH
+; Input 16bpp, Output 8bpp: each 16-bit sample is shifted right by 'shift' (psrlw) and packed to a byte (packuswb)
+;------------------------------------------------------------------------------------------------------------------------
+;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+;------------------------------------------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal downShift_16, 7,7,3 ; 7 args requested; the 8th argument (mask) is never read by this code
+ movd m0, r6d ; m0 = shift
+ add r1, r1 ; r1 *= 2 - presumably converts srcStride from 16-bit samples to bytes (confirm with caller)
+ dec r5d ; main loop covers height - 1 rows, last row is handled below; NOTE(review): height == 1 gives 0 here and the dec/jnz at the end of .loopH would wrap - confirm callers
+.loopH:
+ xor r6, r6 ; r6 = column index (the shift value already lives in m0)
+.loopW:
+ movu m1, [r0 + r6 * 2] ; samples col .. col+7
+ movu m2, [r0 + r6 * 2 + 16] ; samples col+8 .. col+15
+ psrlw m1, m0
+ psrlw m2, m0
+ packuswb m1, m2 ; 16 words -> 16 bytes
+ movu [r2 + r6], m1 ; store 16 output bytes
+
+ add r6, 16 ; 16 pixels per iteration
+ cmp r6d, r4d
+ jl .loopW ; repeat while col < width; a partial final group is still done as a full 16, so up to 15 pixels past width are read and written
+
+ ; move to next row
+ add r0, r1
+ add r2, r3
+ dec r5d
+ jnz .loopH
+
+; Process the last row of the frame separately [handles a width that is not a multiple of 16]
+
+.loop16: ; NOTE(review): executes once before any width check, so width < 16 is still processed as 16 pixels
+ movu m1, [r0]
+ movu m2, [r0 + 16]
+ psrlw m1, m0
+ psrlw m2, m0
+ packuswb m1, m2
+ movu [r2], m1
+
+ add r0, 2 * mmsize ; advance src by 16 samples (32 bytes when mmsize is 16)
+ add r2, mmsize ; advance dst by 16 bytes
+ sub r4d, 16 ; r4d = pixels still to do in this row
+ jz .end
+ cmp r4d, 15
+ jg .loop16 ; at least 16 left: take another full group
+
+ cmp r4d, 8 ; fewer than 16 remain: finish with groups of 8, 4, 2, 1
+ jl .process4
+ movu m1, [r0] ; 8 samples
+ psrlw m1, m0
+ packuswb m1, m1
+ movh [r2], m1 ; store the 8 result bytes
+
+ add r0, mmsize
+ add r2, 8
+ sub r4d, 8
+ jz .end
+
+.process4:
+ cmp r4d, 4
+ jl .process2
+ movh m1,[r0] ; 4 samples (8 bytes)
+ psrlw m1, m0
+ packuswb m1, m1
+ movd [r2], m1 ; store the 4 result bytes
+
+ add r0, 8
+ add r2, 4
+ sub r4d, 4
+ jz .end
+
+.process2:
+ cmp r4d, 2
+ jl .process1
+ movd m1, [r0] ; 2 samples (4 bytes)
+ psrlw m1, m0
+ packuswb m1, m1
+ movd r6, m1 ; r6 is free to reuse: the shift was copied into m0 at entry
+ mov [r2], r6w ; store the 2 result bytes
+
+ add r0, 4
+ add r2, 2
+ sub r4d, 2
+ jz .end
+
+.process1:
+ movd m1, [r0] ; NOTE(review): loads 4 bytes although only one 2-byte sample remains
+ psrlw m1, m0
+ packuswb m1, m1
+ movd r6, m1
+ mov [r2], r6b ; store the single result byte
+.end:
+ RET
+
+; Input 8bpp, Output 16bpp: each byte is zero-extended to a word (pmovzxbw) and shifted left by 'shift' (psllw)
+;---------------------------------------------------------------------------------------------------------------------
+;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
+;---------------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal upShift_8, 7,7,3
+
+ movd m2, r6d ; m2 = shift
+ add r3, r3 ; r3 *= 2 - presumably converts dstStride from 16-bit samples to bytes (confirm with caller)
+ dec r5d ; main loop covers height - 1 rows, last row is handled below; NOTE(review): height == 1 gives 0 here and the dec/jnz at the end of .loopH would wrap - confirm callers
+
+.loopH:
+ xor r6, r6 ; r6 = column index (the shift value already lives in m2)
+.loopW:
+ pmovzxbw m0,[r0 + r6] ; pixels col .. col+7 widened to words
+ pmovzxbw m1,[r0 + r6 + 8] ; pixels col+8 .. col+15 widened to words
+ psllw m0, m2
+ psllw m1, m2
+ movu [r2 + r6 * 2], m0
+ movu [r2 + r6 * 2 + 16], m1
+
+ add r6, 16 ; 16 pixels per iteration
+ cmp r6d, r4d
+ jl .loopW ; repeat while col < width; a partial final group is still done as a full 16, so up to 15 pixels past width are read and written
+
+ ; move to next row
+ add r0, r1
+ add r2, r3
+ dec r5d
+ jnz .loopH
+
+; Process the last row of the frame separately [handles a width that is not a multiple of 16]
+
+.loop16: ; NOTE(review): executes once before any width check, so width < 16 is still processed as 16 pixels
+ pmovzxbw m0,[r0]
+ pmovzxbw m1,[r0 + 8]
+ psllw m0, m2
+ psllw m1, m2
+ movu [r2], m0
+ movu [r2 + 16], m1
+
+ add r0, mmsize ; advance src by 16 bytes (when mmsize is 16)
+ add r2, 2 * mmsize ; advance dst by 16 words (32 bytes)
+ sub r4d, 16 ; r4d = pixels still to do in this row
+ jz .end
+ cmp r4d, 15
+ jg .loop16 ; at least 16 left: take another full group
+
+ cmp r4d, 8 ; fewer than 16 remain: finish with groups of 8, 4, 2, 1
+ jl .process4
+ pmovzxbw m0,[r0] ; 8 pixels
+ psllw m0, m2
+ movu [r2], m0 ; store 8 words
+
+ add r0, 8
+ add r2, mmsize
+ sub r4d, 8
+ jz .end
+
+.process4:
+ cmp r4d, 4
+ jl .process2
+ movd m0,[r0] ; 4 pixels (4 bytes)
+ pmovzxbw m0,m0
+ psllw m0, m2
+ movh [r2], m0 ; store 4 words (8 bytes)
+
+ add r0, 4
+ add r2, 8
+ sub r4d, 4
+ jz .end
+
+.process2:
+ cmp r4d, 2
+ jl .process1
+ movd m0,[r0] ; NOTE(review): loads 4 bytes although only 2 or 3 pixels remain
+ pmovzxbw m0,m0
+ psllw m0, m2
+ movd [r2], m0 ; store 2 words (4 bytes)
+
+ add r0, 2
+ add r2, 4
+ sub r4d, 2
+ jz .end
+
+.process1:
+ movd m0,[r0] ; NOTE(review): loads 4 bytes although only 1 pixel remains
+ pmovzxbw m0,m0
+ psllw m0, m2
+ movd r6, m0 ; r6 is free to reuse: the shift was copied into m2 at entry
+ mov [r2], r6w ; store the single result word
+.end:
+ RET
diff -r b0931c9326dc -r 1375d6312088 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Mar 31 17:50:37 2014 +0530
+++ b/source/common/x86/pixel.h Mon Mar 31 17:51:38 2014 +0530
@@ -200,6 +200,9 @@
ADDAVG(addAvg_64x48)
ADDAVG(addAvg_64x64)
+void x265_downShift_16_sse2(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+void x265_upShift_8_sse4(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift);
+
#undef DECL_PIXELS
#undef DECL_HEVC_SSD
#undef DECL_X1
More information about the x265-devel
mailing list