<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Tue, Oct 15, 2013 at 12:00 PM, <span dir="ltr"><<a href="mailto:praveen@multicorewareinc.com" target="_blank">praveen@multicorewareinc.com</a>></span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Praveen Tiwari<br>
# Date 1381856430 -19800<br>
# Node ID 203a9b334293c50a3e8741352726f2eef71dddb3<br>
# Parent 39fc3c36e1b1b5fcaa7a7f65ddd21a2ecba1fc06<br>
primitives pointer initialization and chroma asm code for all width sizes<br></blockquote><div><br></div><div>holding off on this patch until the testbench is ready for these.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
diff -r 39fc3c36e1b1 -r 203a9b334293 source/common/ipfilter.cpp<br>
--- a/source/common/ipfilter.cpp Tue Oct 15 21:22:03 2013 +0530<br>
+++ b/source/common/ipfilter.cpp Tue Oct 15 22:30:30 2013 +0530<br>
@@ -497,8 +497,20 @@<br>
namespace x265 {<br>
// x265 private namespace<br>
<br>
+ #define SETUP_PARTITION(W) \<br>
+ p.chroma_hpp[CHROMA_PARTITION_W##W] = interp_horiz_pp##<4, W>;<br>
+<br>
void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)<br>
{<br>
+<br>
+ SETUP_PARTITION(2);<br>
+ SETUP_PARTITION(4);<br>
+ SETUP_PARTITION(6);<br>
+ SETUP_PARTITION(8);<br>
+ SETUP_PARTITION(12);<br>
+ SETUP_PARTITION(16);<br>
+ SETUP_PARTITION(32);<br>
+<br>
p.ipfilter_pp[FILTER_H_P_P_8] = filterHorizontal_p_p<8>;<br>
p.ipfilter_ps[FILTER_H_P_S_8] = filterHorizontal_p_s<8>;<br>
p.ipfilter_ps[FILTER_V_P_S_8] = filterVertical_p_s<8>;<br>
diff -r 39fc3c36e1b1 -r 203a9b334293 source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp Tue Oct 15 21:22:03 2013 +0530<br>
+++ b/source/common/x86/asm-primitives.cpp Tue Oct 15 22:30:30 2013 +0530<br>
@@ -2,6 +2,7 @@<br>
* Copyright (C) 2013 x265 project<br>
*<br>
* Authors: Steve Borho <<a href="mailto:steve@borho.org">steve@borho.org</a>><br>
+ * Praveen Kumar Tiwari <<a href="mailto:praveen@multicorewareinc.com">praveen@multicorewareinc.com</a>><br>
*<br>
* This program is free software; you can redistribute it and/or modify<br>
* it under the terms of the GNU General Public License as published by<br>
@@ -91,7 +92,13 @@<br>
DECL_SUF( x265_pixel_avg_4x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))<br>
DECL_SUF( x265_pixel_avg_4x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))<br>
<br>
-void x265_filterHorizontal_p_p_4_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff);<br>
+void x265_interp_4tap_horiz_pp_w2_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);<br>
+void x265_interp_4tap_horiz_pp_w4_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);<br>
+void x265_interp_4tap_horiz_pp_w6_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);<br>
+void x265_interp_4tap_horiz_pp_w8_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);<br>
+void x265_interp_4tap_horiz_pp_w12_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);<br>
+void x265_interp_4tap_horiz_pp_w16_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);<br>
+void x265_interp_4tap_horiz_pp_w32_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);<br>
}<br>
<br>
using namespace x265;<br>
@@ -273,9 +280,14 @@<br>
p.sa8d[BLOCK_8x8] = x265_pixel_sa8d_8x8_sse4;<br>
p.sa8d[BLOCK_16x16] = x265_pixel_sa8d_16x16_sse4;<br>
SA8D_INTER_FROM_BLOCK(sse4);<br>
-<br>
-#if !defined(X86_64)<br>
- p.ipfilter_pp[FILTER_H_P_P_4] = x265_filterHorizontal_p_p_4_sse4;<br>
+#if !defined(X86_64) // will go away tomorrow once PIC issue is fixed for 64-bit build<br>
+ p.chroma_hpp[CHROMA_PARTITION_W2] = x265_interp_4tap_horiz_pp_w2_sse4;<br>
+ p.chroma_hpp[CHROMA_PARTITION_W4] = x265_interp_4tap_horiz_pp_w4_sse4;<br>
+ p.chroma_hpp[CHROMA_PARTITION_W6] = x265_interp_4tap_horiz_pp_w6_sse4;<br>
+ p.chroma_hpp[CHROMA_PARTITION_W8] = x265_interp_4tap_horiz_pp_w8_sse4;<br>
+ p.chroma_hpp[CHROMA_PARTITION_W12] = x265_interp_4tap_horiz_pp_w12_sse4;<br>
+ p.chroma_hpp[CHROMA_PARTITION_W16] = x265_interp_4tap_horiz_pp_w16_sse4;<br>
+ p.chroma_hpp[CHROMA_PARTITION_W32] = x265_interp_4tap_horiz_pp_w32_sse4;<br>
#endif<br>
}<br>
if (cpuMask & X265_CPU_AVX)<br>
diff -r 39fc3c36e1b1 -r 203a9b334293 source/common/x86/ipfilter8.asm<br>
--- a/source/common/x86/ipfilter8.asm Tue Oct 15 21:22:03 2013 +0530<br>
+++ b/source/common/x86/ipfilter8.asm Tue Oct 15 22:30:30 2013 +0530<br>
@@ -26,109 +26,208 @@<br>
%include "x86inc.asm"<br>
%include "x86util.asm"<br>
<br>
-%if ARCH_X86_64 == 0<br>
-<br>
SECTION_RODATA 32<br>
-tab_leftmask: db -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0<br>
-<br>
-tab_Tm: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8<br>
+tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6<br>
+ db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10<br>
<br>
tab_c_512: times 8 dw 512<br>
<br>
+tab_coeff: db 0, 64, 0, 0<br>
+ db -2, 58, 10, -2<br>
+ db -4, 54, 16, -2<br>
+ db -6, 46, 28, -4<br>
+ db -4, 36, 36, -4<br>
+ db -4, 28, 46, -6<br>
+ db -2, 16, 54, -4<br>
+ db -2, 10, 58, -2<br>
+<br>
SECTION .text<br>
<br>
-%macro FILTER_H4 2<br>
- movu %1, [src + col - 1]<br>
- pshufb %1, Tm4<br>
- pmaddubsw %1, coef2<br>
- movu %2, [src + col + 1]<br>
- pshufb %2, Tm4<br>
- pmaddubsw %2, coef3<br>
- paddw %1, %2<br>
- pmulhrsw %1, c512<br>
- packuswb %1, %1<br>
+%macro FILTER_H4_w2 3<br>
+ movu %1, [srcq - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ phaddw %2, %1<br>
+ pmulhrsw %2, %3<br>
+ packuswb %2, %2<br>
+ pextrw [dstq], %2, 0<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w4 3<br>
+ movu %1, [srcq - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ phaddw %2, %1<br>
+ pmulhrsw %2, %3<br>
+ packuswb %2, %2<br>
+ movd [dstq], %2<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w6 3<br>
+ movu %1, [srcq - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ pmulhrsw %2, %3<br>
+ packuswb %2, %2<br>
+ movd [dstq], %2<br>
+ pextrw [dstq + 4], %2, 2<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w8 3<br>
+ movu %1, [srcq - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ pmulhrsw %2, %3<br>
+ packuswb %2, %2<br>
+ movh [dstq], %2<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w12 3<br>
+ movu %1, [srcq - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ pmulhrsw %2, %3<br>
+ movu %1, [srcq - 1 + 8]<br>
+ pshufb %1, %1, Tm0<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %1, %1<br>
+ pmulhrsw %1, %3<br>
+ packuswb %2, %1<br>
+ movh [dstq], %2<br>
+ pextrd [dstq + 8], %2, 2<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w16 4<br>
+ movu %1, [srcq - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ movu %1, [srcq - 1 + 8]<br>
+ pshufb %4, %1, Tm0<br>
+ pmaddubsw %4, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %4, %1<br>
+ pmulhrsw %2, %3<br>
+ pmulhrsw %4, %3<br>
+ packuswb %2, %4<br>
+ movu [dstq], %2<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w32 4<br>
+ movu %1, [srcq - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ movu %1, [srcq - 1 + 8]<br>
+ pshufb %4, %1, Tm0<br>
+ pmaddubsw %4, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %4, %1<br>
+ pmulhrsw %2, %3<br>
+ pmulhrsw %4, %3<br>
+ packuswb %2, %4<br>
+ movu [dstq], %2<br>
+ movu %1, [srcq - 1 + 16]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ movu %1, [srcq - 1 + 24]<br>
+ pshufb %4, %1, Tm0<br>
+ pmaddubsw %4, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %4, %1<br>
+ pmulhrsw %2, %3<br>
+ pmulhrsw %4, %3<br>
+ packuswb %2, %4<br>
+ movu [dstq + 16], %2<br>
%endmacro<br>
<br>
;-----------------------------------------------------------------------------<br>
-; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)<br>
+; void interp_4tap_horiz_pp_w%1(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx)<br>
;-----------------------------------------------------------------------------<br>
+%macro IPFILTER_CHROMA 1<br>
INIT_XMM sse4<br>
-cglobal filterHorizontal_p_p_4, 0, 7, 8<br>
-%define src r0<br>
-%define dst r1<br>
-%define row r2<br>
-%define col r3<br>
-%define width r4<br>
-%define widthleft r5<br>
-%define mask_offset r6<br>
-%define coef2 m7<br>
-%define coef3 m6<br>
-%define Tm4 m5<br>
-%define c512 m4<br>
-%define x2 m3<br>
-%define x1 m2<br>
-%define x0 m1<br>
-%define leftmask m0<br>
-%define tmp r0<br>
-%define tmp1 r1<br>
-<br>
- mov tmp, r6m<br>
- movd coef2, [tmp ]<br>
- movd coef3, [tmp + 4]<br>
- pshufd coef2, coef2, 0<br>
- pshufd coef3, coef3, 0<br>
- packsswb coef2, coef2<br>
- packsswb coef3, coef3<br>
+cglobal interp_4tap_horiz_pp_w%1, 6, 7, 6, src, srcstride, dst, dststride, height, coeffIdx<br>
+%define coef2 m5<br>
+%define Tm0 m4<br>
+%define Tm1 m3<br>
+%define x2 m2<br>
+%define x1 m1<br>
+%define x0 m0<br>
<br>
- mov width, r4m<br>
- mov widthleft, width<br>
- and width, ~7<br>
- and widthleft, 7<br>
- mov mask_offset, widthleft<br>
- neg mask_offset<br>
+movd coef2, [tab_coeff + r5d * 4]<br>
<br>
- movq leftmask, [tab_leftmask + (7 + mask_offset)]<br>
- mova Tm4, [tab_Tm]<br>
- mova c512, [tab_c_512]<br>
+pshufd coef2, coef2, 0<br>
+mova x2, [tab_c_512]<br>
+mova Tm0, [tab_Tm]<br>
+mova Tm1, [tab_Tm + 16]<br>
<br>
- mov src, r0m<br>
- mov dst, r2m<br>
- mov row, r5m<br>
+.loop<br>
+FILTER_H4_w%1 x0, x1, x2<br>
+add srcq, srcstrideq<br>
+add dstq, dststrideq<br>
<br>
-_loop_row:<br>
- xor col, col<br>
-<br>
-_loop_col:<br>
- cmp col, width<br>
- jge _end_col<br>
+dec r4d<br>
+jnz .loop<br>
+<br>
+RET<br>
+%endmacro<br>
<br>
- FILTER_H4 x0, x1<br>
- movh [dst + col], x0<br>
+IPFILTER_CHROMA 2<br>
+IPFILTER_CHROMA 4<br>
+IPFILTER_CHROMA 6<br>
+IPFILTER_CHROMA 8<br>
+IPFILTER_CHROMA 12<br>
<br>
- add col, 8<br>
- jmp _loop_col<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_pp_w%1(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+%macro IPFILTER_CHROMA_W 1<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_pp_w%1, 6, 7, 7, src, srcstride, dst, dststride, height, coeffIdx<br>
+%define coef2 m6<br>
+%define Tm0 m5<br>
+%define Tm1 m4<br>
+%define x3 m3<br>
+%define x2 m2<br>
+%define x1 m1<br>
+%define x0 m0<br>
<br>
-_end_col:<br>
- test widthleft, widthleft<br>
- jz _next_row<br>
+movd coef2, [tab_coeff + r5d * 4]<br>
<br>
- movq x2, [dst + col]<br>
- FILTER_H4 x0, x1<br>
- pblendvb x2, x2, x0, leftmask<br>
- movh [dst + col], x2<br>
+pshufd coef2, coef2, 0<br>
+mova x2, [tab_c_512]<br>
+mova Tm0, [tab_Tm]<br>
+mova Tm1, [tab_Tm + 16]<br>
<br>
-_next_row:<br>
- add src, r1m<br>
- add dst, r3m<br>
- dec row<br>
+.loop<br>
+FILTER_H4_w%1 x0, x1, x2, x3<br>
+add srcq, srcstrideq<br>
+add dstq, dststrideq<br>
<br>
- test row, row<br>
- jz _end_row<br>
+dec r4d<br>
+jnz .loop<br>
+<br>
+RET<br>
+%endmacro<br>
<br>
- jmp _loop_row<br>
-<br>
-_end_row:<br>
-<br>
- RET<br>
-<br>
-%endif ; ARCH_X86_64 == 0<br>
+IPFILTER_CHROMA_W 16<br>
+IPFILTER_CHROMA_W 32<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>