<div dir="ltr">Cannot apply. Can you please update this to the current tip?<br></div><div class="gmail_extra"><br><div class="gmail_quote">On Thu, Jun 25, 2015 at 2:20 PM, <span dir="ltr"><<a href="mailto:rajesh@multicorewareinc.com" target="_blank">rajesh@multicorewareinc.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Rajesh Paulraj<<a href="mailto:rajesh@multicorewareinc.com">rajesh@multicorewareinc.com</a>><br>
# Date 1435220155 -19800<br>
# Thu Jun 25 13:45:55 2015 +0530<br>
# Node ID 26e8eff8eb5abc1c2fa5dd94f59f620c6040caf9<br>
# Parent 430625004ef81ba9e9e398d4cf12a68a1cd4b664<br>
asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)<br>
<br>
avx2:<br>
planecopy_cp 19.36x 5685.80 110052.08<br>
<br>
sse4:<br>
planecopy_cp 9.65x 10660.20 102850.27<br>
<br>
diff -r 430625004ef8 -r 26e8eff8eb5a source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp Thu Jun 25 13:34:17 2015 +0530<br>
+++ b/source/common/x86/asm-primitives.cpp Thu Jun 25 13:45:55 2015 +0530<br>
@@ -1497,6 +1497,7 @@<br>
p.scale2D_64to32 = PFX(scale2D_64to32_avx2);<br>
p.weight_pp = PFX(weight_pp_avx2);<br>
p.sign = x265_calculateSign_avx2;<br>
+ p.planecopy_cp = PFX(upShift_8_avx2);<br>
<br>
p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);<br>
p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);<br>
diff -r 430625004ef8 -r 26e8eff8eb5a source/common/x86/pixel-a.asm<br>
--- a/source/common/x86/pixel-a.asm Thu Jun 25 13:34:17 2015 +0530<br>
+++ b/source/common/x86/pixel-a.asm Thu Jun 25 13:45:55 2015 +0530<br>
@@ -7388,6 +7388,96 @@<br>
.end:<br>
RET<br>
<br>
+;---------------------------------------------------------------------------------------------------------------------<br>
+;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)<br>
+; dst[x] = src[x] shifted left by 'shift'; each 8-bit source pixel is widened to a 16-bit word. Every row but the<br>
+; last is copied in 32-pixel steps and may run past 'width' (NOTE: assumes stride padding); the last row is exact.<br>
+;---------------------------------------------------------------------------------------------------------------------<br>
+%if ARCH_X86_64<br>
+INIT_YMM avx2<br>
+cglobal upShift_8, 7,8,3<br>
+    movd        xm2, r6d                ; xm2 = shift count used by every psllw<br>
+    add         r3, r3                  ; dstStride: 16-bit pixels to bytes<br>
+    dec         r5d                     ; reserve the last row for the exact-width tail<br>
+    js          .end                    ; height was zero or negative: nothing to copy<br>
+    jz          .lastRow                ; single row: there are no bulk rows<br>
+<br>
+.loopH:<br>
+    xor         r7, r7                  ; r7 = pixel offset inside the current row<br>
+    mov         r6d, r4d                ; r6d = pixels left in the current row<br>
+.loopW:<br>
+    pmovzxbw    m0, [r0 + r7]           ; 16 source bytes widened to 16 words<br>
+    pmovzxbw    m1, [r0 + r7 + 16]<br>
+    psllw       m0, xm2<br>
+    psllw       m1, xm2<br>
+    movu        [r2 + r7 * 2], m0<br>
+    movu        [r2 + r7 * 2 + 32], m1<br>
+    add         r7d, 32<br>
+    sub         r6d, 32<br>
+    jg          .loopW<br>
+    add         r0, r1                  ; move to next row<br>
+    add         r2, r3<br>
+    dec         r5d<br>
+    jnz         .loopH<br>
+<br>
+.lastRow:                               ; copy exactly r4d pixels: chunks of 16, then 8, 4, 2 and 1<br>
+    cmp         r4d, 16<br>
+    jl          .process8<br>
+.loop16:<br>
+    pmovzxbw    m0, [r0]<br>
+    psllw       m0, xm2<br>
+    movu        [r2], m0<br>
+    add         r0, 16                  ; 16 source bytes consumed (not mmsize, which is 32 here)<br>
+    add         r2, 32                  ; 16 words written<br>
+    sub         r4d, 16<br>
+    cmp         r4d, 16<br>
+    jge         .loop16                 ; loop only while a full 16-pixel chunk remains<br>
+<br>
+.process8:<br>
+    cmp         r4d, 8<br>
+    jl          .process4<br>
+    pmovzxbw    xm0, [r0]               ; xmm form: loads 8 bytes, produces 8 words<br>
+    psllw       xm0, xm2<br>
+    movu        [r2], xm0<br>
+    add         r0, 8<br>
+    add         r2, 16<br>
+    sub         r4d, 8<br>
+<br>
+.process4:<br>
+    cmp         r4d, 4<br>
+    jl          .process2<br>
+    movd        xm0, [r0]               ; load only the 4 remaining-chunk bytes<br>
+    pmovzxbw    xm0, xm0<br>
+    psllw       xm0, xm2<br>
+    movq        [r2], xm0<br>
+    add         r0, 4<br>
+    add         r2, 8<br>
+    sub         r4d, 4<br>
+<br>
+.process2:<br>
+    cmp         r4d, 2<br>
+    jl          .process1<br>
+    movzx       r3d, word [r0]          ; r3 (dstStride) is no longer needed on the last row<br>
+    movd        xm0, r3d<br>
+    pmovzxbw    xm0, xm0<br>
+    psllw       xm0, xm2                ; use the shift argument rather than a fixed count<br>
+    movd        [r2], xm0<br>
+    add         r0, 2<br>
+    add         r2, 4<br>
+    sub         r4d, 2<br>
+<br>
+.process1:<br>
+    test        r4d, r4d<br>
+    jle         .end                    ; no pixel left<br>
+    movzx       r3d, byte [r0]<br>
+    movd        xm0, r3d<br>
+    psllw       xm0, xm2<br>
+    movd        r3d, xm0<br>
+    mov         [r2], r3w<br>
+.end:<br>
+    RET<br>
+%endif<br>
+<br>
%macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp<br>
%if cpuflag(ssse3)<br>
pabsd %1, %3<br>
diff -r 430625004ef8 -r 26e8eff8eb5a source/common/x86/pixel.h<br>
--- a/source/common/x86/pixel.h Thu Jun 25 13:34:17 2015 +0530<br>
+++ b/source/common/x86/pixel.h Thu Jun 25 13:45:55 2015 +0530<br>
@@ -31,6 +31,7 @@<br>
void PFX(downShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);<br>
void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);<br>
void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);<br>
+void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);<br>
<br>
#define DECL_PIXELS(cpu) \<br>
FUNCDEF_PU(int, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br></div>