<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Thu, Oct 31, 2013 at 10:50 AM, <span dir="ltr">&lt;<a href="mailto:nabajit@multicorewareinc.com" target="_blank">nabajit@multicorewareinc.com</a>&gt;</span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"><div class="im"># HG changeset patch<br>
# User Nabajit Deka<br>
</div># Date 1383234610 -19800<br>
# Thu Oct 31 21:20:10 2013 +0530<br>
# Node ID e11e3328ff72aabab96ee2e6dac802a1284fe2d1<br>
<div class="im"># Parent 9a0da4e6d9e363e383eae7243f0c64026a5f6d00<br>
Assembly routines for vertical luma filter for all block sizes<br></div></blockquote><div><br></div><div><div><br></div><div>1> C:/mcw/x265/source/common/x86/ipfilter8.asm:2295: error: undefined symbol `tab_LumaCoeffVerLuma' (first use)</div>
<div>1> C:/mcw/x265/source/common/x86/ipfilter8.asm:2295: error: (Each undefined symbol is reported only once.)</div></div><div> </div><div><br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">
<div class="im">
</div>diff -r 9a0da4e6d9e3 -r e11e3328ff72 source/common/x86/ipfilter8.asm<br>
--- a/source/common/x86/ipfilter8.asm Thu Oct 31 15:10:34 2013 +0530<br>
+++ b/source/common/x86/ipfilter8.asm Thu Oct 31 21:20:10 2013 +0530<br>
@@ -2127,3 +2127,394 @@<br>
jnz .loopH<br>
<br>
RET<br>
+<br>
+%macro PROCESS_LUMA_W4_4R 0<br>
+ movd m0, [r0]<br>
+ movd m1, [r0 + r1]<br>
+ punpcklbw m2, m0, m1 ; m2=[0 1]<br>
+<br>
+ movd m0, [r0 + 2 * r1]<br>
+ punpcklbw m1, m0 ; m1=[1 2]<br>
+ punpcklqdq m2, m1 ; m2=[0 1 1 2]<br>
+ pmaddubsw m7, m2, [r6 + 0 * 16] ; m7 = [0+1 1+2]<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movd m1, [r0 + r1]<br>
+ punpcklbw m6, m0, m1 ; m6=[2 3]<br>
+ movd m0, [r0 + 2 * r1]<br>
+ punpcklbw m1, m0 ; m1=[3 4]<br>
+ punpcklqdq m6, m1 ; m6=[2 3 3 4]<br>
+ pmaddubsw m2, m6, [r6 + 1 * 16] ; m2 = [2+3 3+4]<br>
+ paddw m7, m2 ; m7=[0+1+2+3 1+2+3+4] Row1-2<br>
+ pmaddubsw m6, [r6 + 0 * 16] ; m6 = [2+3 3+4] Row3-4<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movd m1, [r0 + r1]<br>
+ punpcklbw m2, m0, m1 ; m2=[4 5]<br>
+ movd m0, [r0 + 2 * r1]<br>
+ punpcklbw m1, m0 ; m1=[5 6]<br>
+ punpcklqdq m2, m1 ; m2=[4 5 5 6]<br>
+ pmaddubsw m1, m2, [r6 + 2 * 16] ; m1 = [4+5 5+6]<br>
+ paddw m7, m1 ; m7=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2<br>
+ pmaddubsw m2, [r6 + 1 * 16] ; m2 = [4+5 5+6]<br>
+ paddw m6, m2 ; m6=[2+3+4+5 3+4+5+6] Row3-4<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movd m1, [r0 + r1]<br>
+ punpcklbw m2, m0, m1 ; m2=[6 7]<br>
+ movd m0, [r0 + 2 * r1]<br>
+ punpcklbw m1, m0 ; m1=[7 8]<br>
+ punpcklqdq m2, m1 ; m2=[6 7 7 8]<br>
+ pmaddubsw m1, m2, [r6 + 3 * 16] ; m1 = [6+7 7+8]<br>
+ paddw m7, m1 ; m7=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end<br>
+ pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8]<br>
+ paddw m6, m2 ; m6=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movd m1, [r0 + r1]<br>
+ punpcklbw m2, m0, m1 ; m2=[8 9]<br>
+ movd m0, [r0 + 2 * r1]<br>
+ punpcklbw m1, m0 ; m1=[9 10]<br>
+ punpcklqdq m2, m1 ; m2=[8 9 9 10]<br>
+ pmaddubsw m2, [r6 + 3 * 16] ; m2 = [8+9 9+10]<br>
+ paddw m6, m2 ; m6=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end<br>
+%endmacro<br>
+<br>
+%macro PROCESS_LUMA_W8_4R 0<br>
+ movq m0, [r0]<br>
+ movq m1, [r0 + r1]<br>
+ punpcklbw m0, m1<br>
+ pmaddubsw m7, m0, [r6 + 0 *16] ;m7 = [0+1] Row1<br>
+<br>
+ movq m0, [r0 + 2 * r1]<br>
+ punpcklbw m1, m0<br>
+ pmaddubsw m6, m1, [r6 + 0 *16] ;m6 = [1+2] Row2<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m1, [r0 + r1]<br>
+ punpcklbw m0, m1<br>
+ pmaddubsw m5, m0, [r6 + 0 *16] ;m5 = [2+3] Row3<br>
+ pmaddubsw m0, [r6 + 1 * 16]<br>
+ paddw m7, m0 ;m7 = [0+1+2+3] Row1<br>
+<br>
+ movq m0, [r0 + 2 * r1]<br>
+ punpcklbw m1, m0<br>
+ pmaddubsw m4, m1, [r6 + 0 *16] ;m4 = [3+4] Row4<br>
+ pmaddubsw m1, [r6 + 1 * 16]<br>
+ paddw m6, m1 ;m6 = [1+2+3+4] Row2<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m1, [r0 + r1]<br>
+ punpcklbw m0, m1<br>
+ pmaddubsw m2, m0, [r6 + 1 * 16]<br>
+ pmaddubsw m0, [r6 + 2 * 16]<br>
+ paddw m7, m0 ;m7 = [0+1+2+3+4+5] Row1<br>
+ paddw m5, m2 ;m5 = [2+3+4+5] Row3<br>
+<br>
+ movq m0, [r0 + 2 * r1]<br>
+ punpcklbw m1, m0<br>
+ pmaddubsw m2, m1, [r6 + 1 * 16]<br>
+ pmaddubsw m1, [r6 + 2 * 16]<br>
+ paddw m6, m1 ;m6 = [1+2+3+4+5+6] Row2<br>
+ paddw m4, m2 ;m4 = [3+4+5+6] Row4<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m1, [r0 + r1]<br>
+ punpcklbw m0, m1<br>
+ pmaddubsw m2, m0, [r6 + 2 * 16]<br>
+ pmaddubsw m0, [r6 + 3 * 16]<br>
+ paddw m7, m0 ;m7 = [0+1+2+3+4+5+6+7] Row1 end<br>
+ paddw m5, m2 ;m5 = [2+3+4+5+6+7] Row3<br>
+<br>
+ movq m0, [r0 + 2 * r1]<br>
+ punpcklbw m1, m0<br>
+ pmaddubsw m2, m1, [r6 + 2 * 16]<br>
+ pmaddubsw m1, [r6 + 3 * 16]<br>
+ paddw m6, m1 ;m6 = [1+2+3+4+5+6+7+8] Row2 end<br>
+ paddw m4, m2 ;m4 = [3+4+5+6+7+8] Row4<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m1, [r0 + r1]<br>
+ punpcklbw m0, m1<br>
+ pmaddubsw m0, [r6 + 3 * 16]<br>
+ paddw m5, m0 ;m5 = [2+3+4+5+6+7+8+9] Row3 end<br>
+<br>
+ movq m0, [r0 + 2 * r1]<br>
+ punpcklbw m1, m0<br>
+ pmaddubsw m1, [r6 + 3 * 16]<br>
+ paddw m4, m1 ;m4 = [3+4+5+6+7+8+9+10] Row4 end<br>
+%endmacro<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_LUMA_4xN 2<br>
+INIT_XMM sse4<br>
+cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 6<br>
+ lea r5, [r1 + 2 * r1]<br>
+ sub r0, r5<br>
+ shl r4d, 6<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_LumaCoeffVerLuma]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_LumaCoeffVerLuma + r4]<br>
+%endif<br>
+<br>
+ mova m5, [tab_c_512]<br>
+ mov r4d, %2/4<br>
+<br>
+.loopH<br>
+ PROCESS_LUMA_W4_4R<br>
+<br>
+ pmulhrsw m7, m5<br>
+ pmulhrsw m6, m5<br>
+<br>
+ packuswb m7, m7<br>
+ packuswb m6, m6<br>
+<br>
+ movd [r2], m7<br>
+ pshufd m7, m7, 1<br>
+ movd [r2 + r3], m7<br>
+ movd [r2 + 2 * r3], m6<br>
+ pshufd m6, m6, 1<br>
+ lea r5, [r3 + 2 * r3]<br>
+ movd [r2 + r5], m6<br>
+<br>
+ lea r5, [4 * r1]<br>
+ sub r0, r5<br>
+ lea r2, [r2 + 4 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_4xN 4,4<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_4xN 4,8<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_4xN 4,16<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_LUMA_8xN 2<br>
+INIT_XMM sse4<br>
+cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8<br>
+ lea r5, [r1 + 2 * r1]<br>
+ sub r0, r5<br>
+ shl r4d, 6<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_LumaCoeffVerLuma]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_LumaCoeffVerLuma + r4]<br>
+%endif<br>
+<br>
+ mova m3, [tab_c_512]<br>
+ mov r4d, %2/4<br>
+<br>
+.loopH<br>
+ PROCESS_LUMA_W8_4R<br>
+<br>
+ pmulhrsw m7, m3<br>
+ pmulhrsw m6, m3<br>
+ pmulhrsw m5, m3<br>
+ pmulhrsw m4, m3<br>
+<br>
+ packuswb m7, m6<br>
+ packuswb m5, m4<br>
+<br>
+ movlps [r2], m7<br>
+ movhps [r2 + r3], m7<br>
+ movlps [r2 + 2 * r3], m5<br>
+ lea r5, [r3 + 2 * r3]<br>
+ movhps [r2 + r5], m5<br>
+<br>
+ lea r5, [4 * r1]<br>
+ sub r0, r5<br>
+ lea r2, [r2 + 4 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_8xN 8,4<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_8xN 8,8<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_8xN 8,16<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_8xN 8,32<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_LUMA_12xN 2<br>
+INIT_XMM sse4<br>
+cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8<br>
+ lea r5, [r1 + 2 * r1]<br>
+ sub r0, r5<br>
+ shl r4d, 6<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_LumaCoeffVerLuma]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_LumaCoeffVerLuma + r4]<br>
+%endif<br>
+<br>
+ mova m3, [tab_c_512]<br>
+ mov r4d, %2/4<br>
+<br>
+.loopH<br>
+ PROCESS_LUMA_W8_4R<br>
+<br>
+ pmulhrsw m7, m3<br>
+ pmulhrsw m6, m3<br>
+ pmulhrsw m5, m3<br>
+ pmulhrsw m4, m3<br>
+<br>
+ packuswb m7, m6<br>
+ packuswb m5, m4<br>
+<br>
+ movlps [r2], m7<br>
+ movhps [r2 + r3], m7<br>
+ movlps [r2 + 2 * r3], m5<br>
+ lea r5, [r3 + 2 * r3]<br>
+ movhps [r2 + r5], m5<br>
+<br>
+ lea r5, [8 * r1 - 8]<br>
+ sub r0, r5<br>
+ add r2, 8<br>
+<br>
+ PROCESS_LUMA_W4_4R<br>
+<br>
+ pmulhrsw m7, m3<br>
+ pmulhrsw m6, m3<br>
+<br>
+ packuswb m7, m7<br>
+ packuswb m6, m6<br>
+<br>
+ movd [r2], m7<br>
+ pshufd m7, m7, 1<br>
+ movd [r2 + r3], m7<br>
+ movd [r2 + 2 * r3], m6<br>
+ pshufd m6, m6, 1<br>
+ lea r5, [r3 + 2 * r3]<br>
+ movd [r2 + r5], m6<br>
+<br>
+ lea r5, [4 * r1 + 8]<br>
+ sub r0, r5<br>
+ lea r2, [r2 + 4 * r3 - 8]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_12xN 12, 16<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_LUMA 2<br>
+INIT_XMM sse4<br>
+cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8 ,0-1<br>
+ lea r5, [r1 + 2 * r1]<br>
+ sub r0, r5<br>
+ shl r4d, 6<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_LumaCoeffVerLuma]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_LumaCoeffVerLuma + r4]<br>
+%endif<br>
+<br>
+ mova m3, [tab_c_512]<br>
+ mov byte [rsp], %2/4<br>
+<br>
+<br>
+.loopH<br>
+ mov r4d, (%1/8)<br>
+.loopW<br>
+ PROCESS_LUMA_W8_4R<br>
+<br>
+ pmulhrsw m7, m3<br>
+ pmulhrsw m6, m3<br>
+ pmulhrsw m5, m3<br>
+ pmulhrsw m4, m3<br>
+<br>
+ packuswb m7, m6<br>
+ packuswb m5, m4<br>
+<br>
+ movlps [r2], m7<br>
+ movhps [r2 + r3], m7<br>
+ movlps [r2 + 2 * r3], m5<br>
+ lea r5, [r3 + 2 * r3]<br>
+ movhps [r2 + r5], m5<br>
+<br>
+ lea r5, [8 * r1 - 8]<br>
+ sub r0, r5<br>
+ add r2, 8<br>
+ dec r4d<br>
+ jnz .loopW<br>
+<br>
+ lea r0, [r0 + 4 * r1 - %1]<br>
+ lea r2, [r2 + 4 * r3 - %1]<br>
+<br>
+ dec byte [rsp]<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_VER_LUMA 16, 4<br>
+FILTER_VER_LUMA 16, 8<br>
+FILTER_VER_LUMA 16, 12<br>
+FILTER_VER_LUMA 16, 16<br>
+FILTER_VER_LUMA 16, 32<br>
+FILTER_VER_LUMA 16, 64<br>
+FILTER_VER_LUMA 24, 32<br>
+FILTER_VER_LUMA 32, 8<br>
+FILTER_VER_LUMA 32, 16<br>
+FILTER_VER_LUMA 32, 24<br>
+FILTER_VER_LUMA 32, 32<br>
+FILTER_VER_LUMA 32, 64<br>
+FILTER_VER_LUMA 48, 64<br>
+FILTER_VER_LUMA 64, 16<br>
+FILTER_VER_LUMA 64, 32<br>
+FILTER_VER_LUMA 64, 48<br>
+FILTER_VER_LUMA 64, 64<br>
<div class=""><div class="h5">_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</div></div></blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>