[x265] [PATCH] primitives pointer initialization and chroma asm code for all width sizes
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Oct 15 19:00:47 CEST 2013
# HG changeset patch
# User Praveen Tiwari
# Date 1381856430 -19800
# Node ID 203a9b334293c50a3e8741352726f2eef71dddb3
# Parent 39fc3c36e1b1b5fcaa7a7f65ddd21a2ecba1fc06
primitives pointer initialization and chroma asm code for all width sizes
diff -r 39fc3c36e1b1 -r 203a9b334293 source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Tue Oct 15 21:22:03 2013 +0530
+++ b/source/common/ipfilter.cpp Tue Oct 15 22:30:30 2013 +0530
@@ -497,8 +497,20 @@
namespace x265 {
// x265 private namespace
+ // Registers the C 4-tap horizontal chroma filter (interp_horiz_pp<4, W>) for block width W.
+ // The template name is written without `##`: pasting an identifier with `<` does not
+ // produce a valid preprocessing token and is rejected by GCC/Clang.
+ #define SETUP_PARTITION(W) \
+ p.chroma_hpp[CHROMA_PARTITION_W##W] = interp_horiz_pp<4, W>;
+
void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
{
+
+ SETUP_PARTITION(2);
+ SETUP_PARTITION(4);
+ SETUP_PARTITION(6);
+ SETUP_PARTITION(8);
+ SETUP_PARTITION(12);
+ SETUP_PARTITION(16);
+ SETUP_PARTITION(32);
+
p.ipfilter_pp[FILTER_H_P_P_8] = filterHorizontal_p_p<8>;
p.ipfilter_ps[FILTER_H_P_S_8] = filterHorizontal_p_s<8>;
p.ipfilter_ps[FILTER_V_P_S_8] = filterVertical_p_s<8>;
diff -r 39fc3c36e1b1 -r 203a9b334293 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Oct 15 21:22:03 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Oct 15 22:30:30 2013 +0530
@@ -2,6 +2,7 @@
* Copyright (C) 2013 x265 project
*
* Authors: Steve Borho <steve at borho.org>
+ * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -91,7 +92,13 @@
DECL_SUF( x265_pixel_avg_4x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x265_pixel_avg_4x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
-void x265_filterHorizontal_p_p_4_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff);
+void x265_interp_4tap_horiz_pp_w2_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
+void x265_interp_4tap_horiz_pp_w4_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
+void x265_interp_4tap_horiz_pp_w6_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
+void x265_interp_4tap_horiz_pp_w8_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
+void x265_interp_4tap_horiz_pp_w12_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
+void x265_interp_4tap_horiz_pp_w16_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
+void x265_interp_4tap_horiz_pp_w32_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
}
using namespace x265;
@@ -273,9 +280,14 @@
p.sa8d[BLOCK_8x8] = x265_pixel_sa8d_8x8_sse4;
p.sa8d[BLOCK_16x16] = x265_pixel_sa8d_16x16_sse4;
SA8D_INTER_FROM_BLOCK(sse4);
-
-#if !defined(X86_64)
- p.ipfilter_pp[FILTER_H_P_P_4] = x265_filterHorizontal_p_p_4_sse4;
+#if !defined(X86_64) // will go away tomorrow once PIC issue is fixed for 64-bit build
+ p.chroma_hpp[CHROMA_PARTITION_W2] = x265_interp_4tap_horiz_pp_w2_sse4;
+ p.chroma_hpp[CHROMA_PARTITION_W4] = x265_interp_4tap_horiz_pp_w4_sse4;
+ p.chroma_hpp[CHROMA_PARTITION_W6] = x265_interp_4tap_horiz_pp_w6_sse4;
+ p.chroma_hpp[CHROMA_PARTITION_W8] = x265_interp_4tap_horiz_pp_w8_sse4;
+ p.chroma_hpp[CHROMA_PARTITION_W12] = x265_interp_4tap_horiz_pp_w12_sse4;
+ p.chroma_hpp[CHROMA_PARTITION_W16] = x265_interp_4tap_horiz_pp_w16_sse4;
+ p.chroma_hpp[CHROMA_PARTITION_W32] = x265_interp_4tap_horiz_pp_w32_sse4;
#endif
}
if (cpuMask & X265_CPU_AVX)
diff -r 39fc3c36e1b1 -r 203a9b334293 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Tue Oct 15 21:22:03 2013 +0530
+++ b/source/common/x86/ipfilter8.asm Tue Oct 15 22:30:30 2013 +0530
@@ -26,109 +26,208 @@
%include "x86inc.asm"
%include "x86util.asm"
-%if ARCH_X86_64 == 0
-
SECTION_RODATA 32
-tab_leftmask: db -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0
-
-tab_Tm: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 ; pshufb indices: four overlapping 4-byte windows starting at bytes 0..3 (loaded as Tm0)
+ db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 ; same pattern for windows starting at bytes 4..7 (loaded from tab_Tm + 16 as Tm1)
tab_c_512: times 8 dw 512 ; pmulhrsw multiplier: (x * 512 + 0x4000) >> 15 == (x + 32) >> 6
+tab_coeff: db 0, 64, 0, 0 ; 4-tap coefficient rows, 4 signed bytes each, addressed as tab_coeff + coeffIdx * 4; every row sums to 64
+ db -2, 58, 10, -2
+ db -4, 54, 16, -2
+ db -6, 46, 28, -4
+ db -4, 36, 36, -4
+ db -4, 28, 46, -6
+ db -2, 16, 54, -4
+ db -2, 10, 58, -2
+
SECTION .text
-%macro FILTER_H4 2
- movu %1, [src + col - 1]
- pshufb %1, Tm4
- pmaddubsw %1, coef2
- movu %2, [src + col + 1]
- pshufb %2, Tm4
- pmaddubsw %2, coef3
- paddw %1, %2
- pmulhrsw %1, c512
- packuswb %1, %1
+%macro FILTER_H4_w2 3 ; filters one row, stores 2 bytes; %1 = load/scratch reg, %2 = result reg, %3 = pmulhrsw multiplier
+ movu %1, [srcq - 1] ; unaligned 16-byte load starting one byte before srcq
+ pshufb %2, %1, Tm0 ; rearrange the loaded bytes with the Tm0 shuffle mask
+ pmaddubsw %2, coef2 ; unsigned source bytes * signed coef2 bytes, adjacent products summed into words
+ phaddw %2, %1 ; low 4 words = pairwise sums of %2; high 4 words are sums of the raw load and are not stored
+ pmulhrsw %2, %3 ; rounded scaling by %3
+ packuswb %2, %2 ; saturate words to unsigned bytes
+ pextrw [dstq], %2, 0 ; store word 0 (2 bytes) at dstq
+%endmacro
+
+%macro FILTER_H4_w4 3 ; filters one row, stores 4 bytes; %1 = load/scratch reg, %2 = result reg, %3 = pmulhrsw multiplier
+ movu %1, [srcq - 1] ; unaligned 16-byte load starting one byte before srcq
+ pshufb %2, %1, Tm0 ; rearrange the loaded bytes with the Tm0 shuffle mask
+ pmaddubsw %2, coef2 ; unsigned source bytes * signed coef2 bytes, adjacent products summed into words
+ phaddw %2, %1 ; low 4 words = pairwise sums of %2; high 4 words are sums of the raw load and are not stored
+ pmulhrsw %2, %3 ; rounded scaling by %3
+ packuswb %2, %2 ; saturate words to unsigned bytes
+ movd [dstq], %2 ; store the low dword (4 bytes) at dstq
+%endmacro
+
+%macro FILTER_H4_w6 3 ; filters one row, stores 6 bytes; %1 = load/scratch reg, %2 = result reg, %3 = pmulhrsw multiplier
+ movu %1, [srcq - 1] ; unaligned 16-byte load starting one byte before srcq
+ pshufb %2, %1, Tm0 ; first shuffle of the load (Tm0 mask)
+ pmaddubsw %2, coef2 ; unsigned source bytes * signed coef2 bytes, adjacent products summed into words
+ pshufb %1, %1, Tm1 ; second shuffle of the same load (Tm1 mask), in place
+ pmaddubsw %1, coef2
+ phaddw %2, %1 ; low 4 words = pairwise sums of %2, high 4 words = pairwise sums of %1
+ pmulhrsw %2, %3 ; rounded scaling by %3
+ packuswb %2, %2 ; saturate the 8 words to unsigned bytes
+ movd [dstq], %2 ; store bytes 0..3
+ pextrw [dstq + 4], %2, 2 ; store word 2 (bytes 4..5)
+%endmacro
+
+%macro FILTER_H4_w8 3 ; filters one row, stores 8 bytes; %1 = load/scratch reg, %2 = result reg, %3 = pmulhrsw multiplier
+ movu %1, [srcq - 1] ; unaligned 16-byte load starting one byte before srcq
+ pshufb %2, %1, Tm0 ; first shuffle of the load (Tm0 mask)
+ pmaddubsw %2, coef2 ; unsigned source bytes * signed coef2 bytes, adjacent products summed into words
+ pshufb %1, %1, Tm1 ; second shuffle of the same load (Tm1 mask), in place
+ pmaddubsw %1, coef2
+ phaddw %2, %1 ; low 4 words = pairwise sums of %2, high 4 words = pairwise sums of %1
+ pmulhrsw %2, %3 ; rounded scaling by %3
+ packuswb %2, %2 ; saturate the 8 words to unsigned bytes
+ movh [dstq], %2 ; store the low 8 bytes at dstq
+%endmacro
+
+%macro FILTER_H4_w12 3 ; filters one row, stores 12 bytes; %1 = load/scratch reg, %2 = result reg, %3 = pmulhrsw multiplier
+ movu %1, [srcq - 1] ; unaligned 16-byte load starting one byte before srcq
+ pshufb %2, %1, Tm0 ; first shuffle of the load (Tm0 mask)
+ pmaddubsw %2, coef2 ; unsigned source bytes * signed coef2 bytes, adjacent products summed into words
+ pshufb %1, %1, Tm1 ; second shuffle of the same load (Tm1 mask), in place
+ pmaddubsw %1, coef2
+ phaddw %2, %1 ; low 4 words = pairwise sums of %2, high 4 words = pairwise sums of %1
+ pmulhrsw %2, %3 ; rounded scaling by %3; %2 now holds 8 result words
+ movu %1, [srcq - 1 + 8] ; second load, 8 bytes further along the row
+ pshufb %1, %1, Tm0 ; only the Tm0 shuffle is applied to this load
+ pmaddubsw %1, coef2
+ phaddw %1, %1 ; 4 pairwise sums, duplicated in both halves
+ pmulhrsw %1, %3
+ packuswb %2, %1 ; bytes 0..7 from %2, bytes 8..15 from %1
+ movh [dstq], %2 ; store bytes 0..7
+ pextrd [dstq + 8], %2, 2 ; store dword 2 (bytes 8..11)
+%endmacro
+
+%macro FILTER_H4_w16 4 ; filters one row, stores 16 bytes; %1 = load/scratch reg, %2 = result reg, %3 = pmulhrsw multiplier, %4 = second accumulator
+ movu %1, [srcq - 1] ; unaligned 16-byte load starting one byte before srcq
+ pshufb %2, %1, Tm0 ; first shuffle of the load (Tm0 mask)
+ pmaddubsw %2, coef2 ; unsigned source bytes * signed coef2 bytes, adjacent products summed into words
+ pshufb %1, %1, Tm1 ; second shuffle of the same load (Tm1 mask), in place
+ pmaddubsw %1, coef2
+ phaddw %2, %1 ; %2 = 8 pairwise sums for the first load
+ movu %1, [srcq - 1 + 8] ; second load, 8 bytes further along the row
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1 ; %4 = 8 pairwise sums for the second load
+ pmulhrsw %2, %3 ; rounded scaling by %3
+ pmulhrsw %4, %3
+ packuswb %2, %4 ; bytes 0..7 from %2, bytes 8..15 from %4
+ movu [dstq], %2 ; unaligned 16-byte store at dstq
+%endmacro
+
+%macro FILTER_H4_w32 4 ; filters one row, stores 32 bytes as two 16-byte halves; %1 = load/scratch reg, %2 = result reg, %3 = pmulhrsw multiplier, %4 = second accumulator
+ movu %1, [srcq - 1] ; first half: unaligned 16-byte load starting one byte before srcq
+ pshufb %2, %1, Tm0 ; first shuffle of the load (Tm0 mask)
+ pmaddubsw %2, coef2 ; unsigned source bytes * signed coef2 bytes, adjacent products summed into words
+ pshufb %1, %1, Tm1 ; second shuffle of the same load (Tm1 mask), in place
+ pmaddubsw %1, coef2
+ phaddw %2, %1 ; %2 = 8 pairwise sums for this load
+ movu %1, [srcq - 1 + 8] ; next load, 8 bytes further along the row
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1 ; %4 = 8 pairwise sums for this load
+ pmulhrsw %2, %3 ; rounded scaling by %3
+ pmulhrsw %4, %3
+ packuswb %2, %4 ; bytes 0..7 from %2, bytes 8..15 from %4
+ movu [dstq], %2 ; store output bytes 0..15
+ movu %1, [srcq - 1 + 16] ; second half: same sequence on source offsets +16 and +24
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ movu %1, [srcq - 1 + 24]
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1
+ pmulhrsw %2, %3
+ pmulhrsw %4, %3
+ packuswb %2, %4
+ movu [dstq + 16], %2 ; store output bytes 16..31
+%endmacro
;-----------------------------------------------------------------------------
-; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
+; void interp_4tap_horiz_pp_w%1(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx)
;-----------------------------------------------------------------------------
+%macro IPFILTER_CHROMA 1 ; %1 = block width; emits interp_4tap_horiz_pp_w%1, whose row body is FILTER_H4_w%1
INIT_XMM sse4
-cglobal filterHorizontal_p_p_4, 0, 7, 8
-%define src r0
-%define dst r1
-%define row r2
-%define col r3
-%define width r4
-%define widthleft r5
-%define mask_offset r6
-%define coef2 m7
-%define coef3 m6
-%define Tm4 m5
-%define c512 m4
-%define x2 m3
-%define x1 m2
-%define x0 m1
-%define leftmask m0
-%define tmp r0
-%define tmp1 r1
-
- mov tmp, r6m
- movd coef2, [tmp ]
- movd coef3, [tmp + 4]
- pshufd coef2, coef2, 0
- pshufd coef3, coef3, 0
- packsswb coef2, coef2
- packsswb coef3, coef3
+cglobal interp_4tap_horiz_pp_w%1, 6, 7, 6, src, srcstride, dst, dststride, height, coeffIdx ; x86inc cglobal: 6 args loaded, 7 GPRs, 6 XMM regs, then the argument names
+%define coef2 m5 ; broadcast filter coefficients
+%define Tm0 m4 ; first shuffle mask (tab_Tm)
+%define Tm1 m3 ; second shuffle mask (tab_Tm + 16)
+%define x2 m2 ; holds tab_c_512, passed to the row macro as %3
+%define x1 m1 ; row macro %2 (result register)
+%define x0 m0 ; row macro %1 (load/scratch register)
- mov width, r4m
- mov widthleft, width
- and width, ~7
- and widthleft, 7
- mov mask_offset, widthleft
- neg mask_offset
+movd coef2, [tab_coeff + r5d * 4] ; load the 4 coefficient bytes of row coeffIdx (r5d); NOTE(review): absolute table address, presumably the 64-bit PIC issue mentioned in asm-primitives.cpp - confirm
- movq leftmask, [tab_leftmask + (7 + mask_offset)]
- mova Tm4, [tab_Tm]
- mova c512, [tab_c_512]
+pshufd coef2, coef2, 0 ; broadcast that dword to all four lanes
+mova x2, [tab_c_512] ; eight words of 512 for pmulhrsw
+mova Tm0, [tab_Tm]
+mova Tm1, [tab_Tm + 16]
- mov src, r0m
- mov dst, r2m
- mov row, r5m
+.loop ; one iteration per row
+FILTER_H4_w%1 x0, x1, x2 ; filter one row from srcq and store it at dstq
+add srcq, srcstrideq ; advance source pointer by one stride
+add dstq, dststrideq ; advance destination pointer by one stride
-_loop_row:
- xor col, col
-
-_loop_col:
- cmp col, width
- jge _end_col
+dec r4d ; r4d = remaining rows (height); NOTE(review): body runs before the test, so height is presumably >= 1 - confirm with callers
+jnz .loop
+
+RET
+%endmacro
- FILTER_H4 x0, x1
- movh [dst + col], x0
+IPFILTER_CHROMA 2
+IPFILTER_CHROMA 4
+IPFILTER_CHROMA 6
+IPFILTER_CHROMA 8
+IPFILTER_CHROMA 12
- add col, 8
- jmp _loop_col
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_w%1(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro IPFILTER_CHROMA_W 1 ; %1 = block width; emits interp_4tap_horiz_pp_w%1 for the row macros that take a fourth (extra accumulator) register
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_w%1, 6, 7, 7, src, srcstride, dst, dststride, height, coeffIdx ; x86inc cglobal: 6 args loaded, 7 GPRs, 7 XMM regs, then the argument names
+%define coef2 m6 ; broadcast filter coefficients
+%define Tm0 m5 ; first shuffle mask (tab_Tm)
+%define Tm1 m4 ; second shuffle mask (tab_Tm + 16)
+%define x3 m3 ; row macro %4 (second accumulator)
+%define x2 m2 ; holds tab_c_512, passed to the row macro as %3
+%define x1 m1 ; row macro %2 (result register)
+%define x0 m0 ; row macro %1 (load/scratch register)
-_end_col:
- test widthleft, widthleft
- jz _next_row
+movd coef2, [tab_coeff + r5d * 4] ; load the 4 coefficient bytes of row coeffIdx (r5d); NOTE(review): absolute table address, presumably the 64-bit PIC issue mentioned in asm-primitives.cpp - confirm
- movq x2, [dst + col]
- FILTER_H4 x0, x1
- pblendvb x2, x2, x0, leftmask
- movh [dst + col], x2
+pshufd coef2, coef2, 0 ; broadcast that dword to all four lanes
+mova x2, [tab_c_512] ; eight words of 512 for pmulhrsw
+mova Tm0, [tab_Tm]
+mova Tm1, [tab_Tm + 16]
-_next_row:
- add src, r1m
- add dst, r3m
- dec row
+.loop ; one iteration per row
+FILTER_H4_w%1 x0, x1, x2, x3 ; filter one row from srcq and store it at dstq
+add srcq, srcstrideq ; advance source pointer by one stride
+add dstq, dststrideq ; advance destination pointer by one stride
- test row, row
- jz _end_row
+dec r4d ; r4d = remaining rows (height); NOTE(review): body runs before the test, so height is presumably >= 1 - confirm with callers
+jnz .loop
+
+RET
+%endmacro
- jmp _loop_row
-
-_end_row:
-
- RET
-
-%endif ; ARCH_X86_64 == 0
+IPFILTER_CHROMA_W 16
+IPFILTER_CHROMA_W 32
More information about the x265-devel
mailing list