[x265] [PATCH] primitives pointer initialization and chroma asm code for all width sizes
Steve Borho
steve at borho.org
Tue Oct 15 19:26:31 CEST 2013
On Tue, Oct 15, 2013 at 12:00 PM, <praveen at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1381856430 -19800
> # Node ID 203a9b334293c50a3e8741352726f2eef71dddb3
> # Parent 39fc3c36e1b1b5fcaa7a7f65ddd21a2ecba1fc06
> primitives pointer initialization and chroma asm code for all width sizes
>
holding off on this patch until the testbench is ready for these.
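
For reference, the check I have in mind is the usual ref-vs-opt comparison over
every coeffIdx and every partition width. A rough fragment for one case,
assuming the new chroma_hpp primitives all share the (src, srcStride, dst,
dstStride, height, coeffIdx) signature declared below; the buffer sizes and
flat layout here are only illustrative, not the final testbench code:

    // compare the C template against the SSE4 kernel for the 8-wide partition
    pixel src[64 * 64], ref_dst[64 * 64], opt_dst[64 * 64];
    for (int i = 0; i < 64 * 64; i++)
        src[i] = rand() & 0xff;                 // 8bpp input samples

    intptr_t stride = 64;
    int height = 16;
    for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
    {
        memset(ref_dst, 0, sizeof(ref_dst));    // so unwritten columns compare equal
        memset(opt_dst, 0, sizeof(opt_dst));

        // start one column in so the [src - 1] load stays inside the buffer
        interp_horiz_pp<4, 8>(src + 1, stride, ref_dst, stride, height, coeffIdx);
        x265_interp_4tap_horiz_pp_w8_sse4(src + 1, stride, opt_dst, stride, height, coeffIdx);

        if (memcmp(ref_dst, opt_dst, sizeof(ref_dst)))
            return false;                       // mismatch against the C reference
    }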
>
> diff -r 39fc3c36e1b1 -r 203a9b334293 source/common/ipfilter.cpp
> --- a/source/common/ipfilter.cpp Tue Oct 15 21:22:03 2013 +0530
> +++ b/source/common/ipfilter.cpp Tue Oct 15 22:30:30 2013 +0530
> @@ -497,8 +497,20 @@
> namespace x265 {
> // x265 private namespace
>
> + #define SETUP_PARTITION(W) \
> + p.chroma_hpp[CHROMA_PARTITION_W##W] = interp_horiz_pp<4, W>;
> +
> void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
> {
> +
> + SETUP_PARTITION(2);
> + SETUP_PARTITION(4);
> + SETUP_PARTITION(6);
> + SETUP_PARTITION(8);
> + SETUP_PARTITION(12);
> + SETUP_PARTITION(16);
> + SETUP_PARTITION(32);
> +
> p.ipfilter_pp[FILTER_H_P_P_8] = filterHorizontal_p_p<8>;
> p.ipfilter_ps[FILTER_H_P_S_8] = filterHorizontal_p_s<8>;
> p.ipfilter_ps[FILTER_V_P_S_8] = filterVertical_p_s<8>;
> diff -r 39fc3c36e1b1 -r 203a9b334293 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Tue Oct 15 21:22:03 2013 +0530
> +++ b/source/common/x86/asm-primitives.cpp Tue Oct 15 22:30:30 2013 +0530
> @@ -2,6 +2,7 @@
> * Copyright (C) 2013 x265 project
> *
> * Authors: Steve Borho <steve at borho.org>
> + * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -91,7 +92,13 @@
> DECL_SUF( x265_pixel_avg_4x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
> DECL_SUF( x265_pixel_avg_4x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
>
> -void x265_filterHorizontal_p_p_4_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff);
> +void x265_interp_4tap_horiz_pp_w2_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> +void x265_interp_4tap_horiz_pp_w4_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> +void x265_interp_4tap_horiz_pp_w6_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> +void x265_interp_4tap_horiz_pp_w8_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> +void x265_interp_4tap_horiz_pp_w12_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> +void x265_interp_4tap_horiz_pp_w16_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> +void x265_interp_4tap_horiz_pp_w32_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx);
> }
>
> using namespace x265;
> @@ -273,9 +280,14 @@
> p.sa8d[BLOCK_8x8] = x265_pixel_sa8d_8x8_sse4;
> p.sa8d[BLOCK_16x16] = x265_pixel_sa8d_16x16_sse4;
> SA8D_INTER_FROM_BLOCK(sse4);
> -
> -#if !defined(X86_64)
> - p.ipfilter_pp[FILTER_H_P_P_4] = x265_filterHorizontal_p_p_4_sse4;
> +#if !defined(X86_64) // will go away tomorrow once PIC issue is fixed for 64-bit build
> + p.chroma_hpp[CHROMA_PARTITION_W2] = x265_interp_4tap_horiz_pp_w2_sse4;
> + p.chroma_hpp[CHROMA_PARTITION_W4] = x265_interp_4tap_horiz_pp_w4_sse4;
> + p.chroma_hpp[CHROMA_PARTITION_W6] = x265_interp_4tap_horiz_pp_w6_sse4;
> + p.chroma_hpp[CHROMA_PARTITION_W8] = x265_interp_4tap_horiz_pp_w8_sse4;
> + p.chroma_hpp[CHROMA_PARTITION_W12] = x265_interp_4tap_horiz_pp_w12_sse4;
> + p.chroma_hpp[CHROMA_PARTITION_W16] = x265_interp_4tap_horiz_pp_w16_sse4;
> + p.chroma_hpp[CHROMA_PARTITION_W32] = x265_interp_4tap_horiz_pp_w32_sse4;
> #endif
> }
> if (cpuMask & X265_CPU_AVX)
> diff -r 39fc3c36e1b1 -r 203a9b334293 source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm Tue Oct 15 21:22:03 2013 +0530
> +++ b/source/common/x86/ipfilter8.asm Tue Oct 15 22:30:30 2013 +0530
> @@ -26,109 +26,208 @@
> %include "x86inc.asm"
> %include "x86util.asm"
>
> -%if ARCH_X86_64 == 0
> -
> SECTION_RODATA 32
> -tab_leftmask: db -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0
> -
> -tab_Tm: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
> +tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
> + db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
>
> tab_c_512: times 8 dw 512
>
> +tab_coeff: db 0, 64, 0, 0
> + db -2, 58, 10, -2
> + db -4, 54, 16, -2
> + db -6, 46, 28, -4
> + db -4, 36, 36, -4
> + db -4, 28, 46, -6
> + db -2, 16, 54, -4
> + db -2, 10, 58, -2
> +
> SECTION .text
>
> -%macro FILTER_H4 2
> - movu %1, [src + col - 1]
> - pshufb %1, Tm4
> - pmaddubsw %1, coef2
> - movu %2, [src + col + 1]
> - pshufb %2, Tm4
> - pmaddubsw %2, coef3
> - paddw %1, %2
> - pmulhrsw %1, c512
> - packuswb %1, %1
> +%macro FILTER_H4_w2 3
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + phaddw %2, %1
> + pmulhrsw %2, %3
> + packuswb %2, %2
> + pextrw [dstq], %2, 0
> +%endmacro
> +
> +%macro FILTER_H4_w4 3
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + phaddw %2, %1
> + pmulhrsw %2, %3
> + packuswb %2, %2
> + movd [dstq], %2
> +%endmacro
> +
> +%macro FILTER_H4_w6 3
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %2, %1
> + pmulhrsw %2, %3
> + packuswb %2, %2
> + movd [dstq], %2
> + pextrw [dstq + 4], %2, 2
> +%endmacro
> +
> +%macro FILTER_H4_w8 3
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %2, %1
> + pmulhrsw %2, %3
> + packuswb %2, %2
> + movh [dstq], %2
> +%endmacro
> +
> +%macro FILTER_H4_w12 3
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %2, %1
> + pmulhrsw %2, %3
> + movu %1, [srcq - 1 + 8]
> + pshufb %1, %1, Tm0
> + pmaddubsw %1, coef2
> + phaddw %1, %1
> + pmulhrsw %1, %3
> + packuswb %2, %1
> + movh [dstq], %2
> + pextrd [dstq + 8], %2, 2
> +%endmacro
> +
> +%macro FILTER_H4_w16 4
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %2, %1
> + movu %1, [srcq - 1 + 8]
> + pshufb %4, %1, Tm0
> + pmaddubsw %4, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %4, %1
> + pmulhrsw %2, %3
> + pmulhrsw %4, %3
> + packuswb %2, %4
> + movu [dstq], %2
> +%endmacro
> +
> +%macro FILTER_H4_w32 4
> + movu %1, [srcq - 1]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %2, %1
> + movu %1, [srcq - 1 + 8]
> + pshufb %4, %1, Tm0
> + pmaddubsw %4, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %4, %1
> + pmulhrsw %2, %3
> + pmulhrsw %4, %3
> + packuswb %2, %4
> + movu [dstq], %2
> + movu %1, [srcq - 1 + 16]
> + pshufb %2, %1, Tm0
> + pmaddubsw %2, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %2, %1
> + movu %1, [srcq - 1 + 24]
> + pshufb %4, %1, Tm0
> + pmaddubsw %4, coef2
> + pshufb %1, %1, Tm1
> + pmaddubsw %1, coef2
> + phaddw %4, %1
> + pmulhrsw %2, %3
> + pmulhrsw %4, %3
> + packuswb %2, %4
> + movu [dstq + 16], %2
> %endmacro
>
>
> ;-----------------------------------------------------------------------------
> -; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
> +; void interp_4tap_horiz_pp_w%1(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx)
>
> ;-----------------------------------------------------------------------------
> +%macro IPFILTER_CHROMA 1
> INIT_XMM sse4
> -cglobal filterHorizontal_p_p_4, 0, 7, 8
> -%define src r0
> -%define dst r1
> -%define row r2
> -%define col r3
> -%define width r4
> -%define widthleft r5
> -%define mask_offset r6
> -%define coef2 m7
> -%define coef3 m6
> -%define Tm4 m5
> -%define c512 m4
> -%define x2 m3
> -%define x1 m2
> -%define x0 m1
> -%define leftmask m0
> -%define tmp r0
> -%define tmp1 r1
> -
> - mov tmp, r6m
> - movd coef2, [tmp ]
> - movd coef3, [tmp + 4]
> - pshufd coef2, coef2, 0
> - pshufd coef3, coef3, 0
> - packsswb coef2, coef2
> - packsswb coef3, coef3
> +cglobal interp_4tap_horiz_pp_w%1, 6, 7, 6, src, srcstride, dst, dststride, height, coeffIdx
> +%define coef2 m5
> +%define Tm0 m4
> +%define Tm1 m3
> +%define x2 m2
> +%define x1 m1
> +%define x0 m0
>
> - mov width, r4m
> - mov widthleft, width
> - and width, ~7
> - and widthleft, 7
> - mov mask_offset, widthleft
> - neg mask_offset
> +movd coef2, [tab_coeff + r5d * 4]
>
> - movq leftmask, [tab_leftmask + (7 + mask_offset)]
> - mova Tm4, [tab_Tm]
> - mova c512, [tab_c_512]
> +pshufd coef2, coef2, 0
> +mova x2, [tab_c_512]
> +mova Tm0, [tab_Tm]
> +mova Tm1, [tab_Tm + 16]
>
> - mov src, r0m
> - mov dst, r2m
> - mov row, r5m
> +.loop
> +FILTER_H4_w%1 x0, x1, x2
> +add srcq, srcstrideq
> +add dstq, dststrideq
>
> -_loop_row:
> - xor col, col
> -
> -_loop_col:
> - cmp col, width
> - jge _end_col
> +dec r4d
> +jnz .loop
> +
> +RET
> +%endmacro
>
> - FILTER_H4 x0, x1
> - movh [dst + col], x0
> +IPFILTER_CHROMA 2
> +IPFILTER_CHROMA 4
> +IPFILTER_CHROMA 6
> +IPFILTER_CHROMA 8
> +IPFILTER_CHROMA 12
>
> - add col, 8
> - jmp _loop_col
>
> +;-----------------------------------------------------------------------------
> +; void interp_4tap_horiz_pp_w%1(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx)
>
> +;-----------------------------------------------------------------------------
> +%macro IPFILTER_CHROMA_W 1
> +INIT_XMM sse4
> +cglobal interp_4tap_horiz_pp_w%1, 6, 7, 7, src, srcstride, dst, dststride, height, coeffIdx
> +%define coef2 m6
> +%define Tm0 m5
> +%define Tm1 m4
> +%define x3 m3
> +%define x2 m2
> +%define x1 m1
> +%define x0 m0
>
> -_end_col:
> - test widthleft, widthleft
> - jz _next_row
> +movd coef2, [tab_coeff + r5d * 4]
>
> - movq x2, [dst + col]
> - FILTER_H4 x0, x1
> - pblendvb x2, x2, x0, leftmask
> - movh [dst + col], x2
> +pshufd coef2, coef2, 0
> +mova x2, [tab_c_512]
> +mova Tm0, [tab_Tm]
> +mova Tm1, [tab_Tm + 16]
>
> -_next_row:
> - add src, r1m
> - add dst, r3m
> - dec row
> +.loop
> +FILTER_H4_w%1 x0, x1, x2, x3
> +add srcq, srcstrideq
> +add dstq, dststrideq
>
> - test row, row
> - jz _end_row
> +dec r4d
> +jnz .loop
> +
> +RET
> +%endmacro
>
> - jmp _loop_row
> -
> -_end_row:
> -
> - RET
> -
> -%endif ; ARCH_X86_64 == 0
> +IPFILTER_CHROMA_W 16
> +IPFILTER_CHROMA_W 32
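
One more note on the interface change: the kernels are now selected by chroma
partition width and take the coefficient row index instead of a coefficient
pointer, so a call site ends up looking roughly like this (the width-to-index
helper here is hypothetical, only to illustrate the lookup):

    // pick the 4-tap horizontal chroma filter matching this block width
    int part = chromaPartitionFromWidth(width);   // hypothetical mapping to CHROMA_PARTITION_W*
    primitives.chroma_hpp[part](src, srcStride, dst, dstStride, height, coeffIdx);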
--
Steve Borho