<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Mon, Nov 11, 2013 at 9:52 AM, <span dir="ltr">&lt;<a href="mailto:praveen@multicorewareinc.com" target="_blank">praveen@multicorewareinc.com</a>&gt;</span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Praveen Tiwari<br>
# Date 1384185158 -19800<br>
# Node ID 972a9a01d0b440c919becc8ec17e7187522a2e68<br>
# Parent b83d45863ceb3f88da420646a3789fb787043f6e<br>
asm code for blockcopy_ps_64xN<br></blockquote><div><br></div><div>These are all nice, but the encoder is currently ignoring all of them.</div><div><br></div><div>The next task should be to change all the TComYuv copy methods that take width and height to instead take a LUMA or CHROMA partition enum (int), change those functions to use our new block-based primitives, and then change all callers to pass in the enum, being careful not to call partitionFromSizes() more often than necessary.</div>
<div><br></div><div>--</div><div>Steve </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
diff -r b83d45863ceb -r 972a9a01d0b4 source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp Mon Nov 11 21:06:11 2013 +0530<br>
+++ b/source/common/x86/asm-primitives.cpp Mon Nov 11 21:22:38 2013 +0530<br>
@@ -459,6 +459,10 @@<br>
p.luma_copy_ps[LUMA_16x64] = x265_blockcopy_ps_16x64_sse4;<br>
p.luma_copy_ps[LUMA_32x64] = x265_blockcopy_ps_32x64_sse4;<br>
p.luma_copy_ps[LUMA_48x64] = x265_blockcopy_ps_48x64_sse4;<br>
+ p.luma_copy_ps[LUMA_64x16] = x265_blockcopy_ps_64x16_sse4;<br>
+ p.luma_copy_ps[LUMA_64x32] = x265_blockcopy_ps_64x32_sse4;<br>
+ p.luma_copy_ps[LUMA_64x48] = x265_blockcopy_ps_64x48_sse4;<br>
+ p.luma_copy_ps[LUMA_64x64] = x265_blockcopy_ps_64x64_sse4;<br>
}<br>
if (cpuMask &amp; X265_CPU_AVX)<br>
{<br>
diff -r b83d45863ceb -r 972a9a01d0b4 source/common/x86/blockcopy8.asm<br>
--- a/source/common/x86/blockcopy8.asm Mon Nov 11 21:06:11 2013 +0530<br>
+++ b/source/common/x86/blockcopy8.asm Mon Nov 11 21:22:38 2013 +0530<br>
@@ -2286,3 +2286,77 @@<br>
%endmacro<br>
<br>
BLOCKCOPY_PS_W48_H2 48, 64<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);<br>
+;-----------------------------------------------------------------------------<br>
+%macro BLOCKCOPY_PS_W64_H2 2<br>
+INIT_XMM sse4<br>
+cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride<br>
+<br>
+add r1, r1<br>
+mov r4d, %2/2<br>
+pxor m0, m0<br>
+<br>
+.loop<br>
+ movu m1, [r2]<br>
+ pmovzxbw m2, m1<br>
+ movu [r0], m2<br>
+ punpckhbw m1, m0<br>
+ movu [r0 + 16], m1<br>
+<br>
+ movu m1, [r2 + 16]<br>
+ pmovzxbw m2, m1<br>
+ movu [r0 + 32], m2<br>
+ punpckhbw m1, m0<br>
+ movu [r0 + 48], m1<br>
+<br>
+ movu m1, [r2 + 32]<br>
+ pmovzxbw m2, m1<br>
+ movu [r0 + 64], m2<br>
+ punpckhbw m1, m0<br>
+ movu [r0 + 80], m1<br>
+<br>
+ movu m1, [r2 + 48]<br>
+ pmovzxbw m2, m1<br>
+ movu [r0 + 96], m2<br>
+ punpckhbw m1, m0<br>
+ movu [r0 + 112], m1<br>
+<br>
+ movu m1, [r2 + r3]<br>
+ pmovzxbw m2, m1<br>
+ movu [r0 + r1], m2<br>
+ punpckhbw m1, m0<br>
+ movu [r0 + r1 + 16], m1<br>
+<br>
+ movu m1, [r2 + r3 + 16]<br>
+ pmovzxbw m2, m1<br>
+ movu [r0 + r1 + 32], m2<br>
+ punpckhbw m1, m0<br>
+ movu [r0 + r1 + 48], m1<br>
+<br>
+ movu m1, [r2 + r3 + 32]<br>
+ pmovzxbw m2, m1<br>
+ movu [r0 + r1 + 64], m2<br>
+ punpckhbw m1, m0<br>
+ movu [r0 + r1 + 80], m1<br>
+<br>
+ movu m1, [r2 + r3 + 48]<br>
+ pmovzxbw m2, m1<br>
+ movu [r0 + r1 + 96], m2<br>
+ punpckhbw m1, m0<br>
+ movu [r0 + r1 + 112], m1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loop<br>
+<br>
+RET<br>
+%endmacro<br>
+<br>
+BLOCKCOPY_PS_W64_H2 64, 16<br>
+BLOCKCOPY_PS_W64_H2 64, 32<br>
+BLOCKCOPY_PS_W64_H2 64, 48<br>
+BLOCKCOPY_PS_W64_H2 64, 64<br>
diff -r b83d45863ceb -r 972a9a01d0b4 source/common/x86/blockcopy8.h<br>
--- a/source/common/x86/blockcopy8.h Mon Nov 11 21:06:11 2013 +0530<br>
+++ b/source/common/x86/blockcopy8.h Mon Nov 11 21:22:38 2013 +0530<br>
@@ -125,7 +125,11 @@<br>
#define LUMA_BLOCKCOPY_DEF_SSE4(cpu) \<br>
SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(16, 64, cpu); \<br>
SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(32, 64, cpu); \<br>
- SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(48, 64, cpu);<br>
+ SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(48, 64, cpu); \<br>
+ SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 16, cpu); \<br>
+ SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 32, cpu); \<br>
+ SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 48, cpu); \<br>
+ SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 64, cpu);<br>
<br>
CHROMA_BLOCKCOPY_DEF_SSE4(_sse4);<br>
LUMA_BLOCKCOPY_DEF_SSE4(_sse4);<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>