[x265] [PATCH REVIEW Only] chroma 4XN block, coeffIdx instead of coeff pointer
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Fri Oct 11 18:50:38 CEST 2013
# HG changeset patch
# User Praveen Tiwari
# Date 1381510220 -19800
# Node ID 5a9160e8b0bdc3117c2417bc29453077488efd8e
# Parent c6d89dc62e191f56f63dbcb1781a6494da50a70d
chroma 4XN block, coeffIdx instead of coeff pointer
diff -r c6d89dc62e19 -r 5a9160e8b0bd source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Fri Oct 11 01:47:53 2013 -0500
+++ b/source/common/x86/ipfilter8.asm Fri Oct 11 22:20:20 2013 +0530
@@ -26,107 +26,58 @@
%include "x86inc.asm"
%include "x86util.asm"
-%if ARCH_X86_64 == 0
-
SECTION_RODATA 32
-tab_leftmask: db -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0
-
tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
- db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
tab_c_512: times 8 dw 512
+tab_coeff: db 0, 64, 0, 0, 0, 64, 0, 0, 0, 64, 0, 0, 0, 64, 0, 0 ; row 0 (byte offset 0): one 4-byte tap set repeated four times to fill 16 bytes; presumably one row per coeffIdx value 0..7 -- confirm
+ db -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 ; row 1 (byte offset 16): the row interp_4tap_horiz_pp_w4 loads via [tab_coeff + 16]
+ db -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 ; row 2 (byte offset 32)
+ db -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 ; row 3 (byte offset 48)
+ db -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4 ; row 4 (byte offset 64)
+ db -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6 ; row 5 (byte offset 80)
+ db -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4 ; row 6 (byte offset 96)
+ db -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 ; row 7 (byte offset 112)
+
SECTION .text
-%macro FILTER_H4 3
- movu %1, [src + col - 1]
- pshufb %2, %1, Tm4
+%macro FILTER_H4_w4 3
+ movu %1, [srcq - 1] ; %1 = 16 bytes loaded unaligned, starting one byte before srcq
+ pshufb %2, %1, Tm0 ; %2 = bytes of %1 rearranged by the Tm0 pattern; it is then multiplied by coef2, pair-summed, scaled by %3 and packed to bytes. NOTE(review): the Tm5 shuffle/pmaddubsw of %1 is removed in this patch, so the following phaddw reads %1 as raw loaded bytes and only the words that come from %2 are filter results -- confirm this is intended
pmaddubsw %2, coef2
- pshufb %1, %1, Tm5
- pmaddubsw %1, coef2
phaddw %2, %1
pmulhrsw %2, %3
packuswb %2, %2
%endmacro
+%macro FILTER_H4_w4_CALL 0
+ FILTER_H4_w4 x0, x1, x2 ; process one row: x0 receives the source load, x1 the packed result, x2 is the pmulhrsw multiplier
+
+ movd [dstq], x1 ; write the low 4 bytes of x1 to the destination row
+
+ add srcq, srcstrideq ; step the source pointer by srcstride
+ add dstq, dststrideq ; step the destination pointer by dststride
+%endmacro
+
;-----------------------------------------------------------------------------
-; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
+; void interp_4tap_horiz_pp_w4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx)
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal filterHorizontal_p_p_4, 0, 7, 8
-%define src r0
-%define dst r1
-%define row r2
-%define col r3
-%define width r4
-%define widthleft r5
-%define mask_offset r6
-%define coef2 m7
-%define x3 m6
-%define Tm5 m5
-%define Tm4 m4
-%define x2 m3
-%define x1 m2
-%define x0 m1
-%define leftmask m0
-%define tmp r0
-%define tmp1 r1
-
- mov tmp, r6m
- movu coef2, [tmp]
- packsswb coef2, coef2
- pshufd coef2, coef2, 0
+cglobal interp_4tap_horiz_pp_w4, 6, 6, 5, src, srcstride, dst, dststride, height, coeffIdx
+%define coef2 m4
+%define Tm0 m3
+%define x2 m2
+%define x1 m1
+%define x0 m0
- mova x3, [tab_c_512]
+ mova coef2, [tab_coeff + 16] ; NOTE(review): always loads the second 16-byte row of tab_coeff; the coeffIdx argument is never read in this function -- confirm whether the row should be selected by coeffIdx
+ mova x2, [tab_c_512] ; eight words of 512, passed to FILTER_H4_w4 as its pmulhrsw multiplier
+ mova Tm0, [tab_Tm] ; 16-byte shuffle pattern consumed by pshufb in FILTER_H4_w4
- mov width, r4m
- mov widthleft, width
- and width, ~7
- and widthleft, 7
- mov mask_offset, widthleft
- neg mask_offset
+.loop ; top of the per-row loop; NOTE(review): the label is written without a trailing colon
+FILTER_H4_w4_CALL ; filter one row, store 4 bytes to dst, then advance src and dst by their strides
+dec r4d ; r4 is the fifth cglobal argument (height); NOTE(review): the body runs before this test, so height == 0 is not guarded against
+jnz .loop ; repeat until the row counter reaches zero
+RET
- movq leftmask, [tab_leftmask + (7 + mask_offset)]
- mova Tm4, [tab_Tm]
- mova Tm5, [tab_Tm + 16]
-
- mov src, r0m
- mov dst, r2m
- mov row, r5m
-
-_loop_row:
- xor col, col
-
-_loop_col:
- FILTER_H4 x0, x1, x3
- movh [dst + col], x1
-
- add col, 8
-
- cmp col, width
- jl _loop_col
-
-_end_col:
- test widthleft, widthleft
- jz _next_row
-
- movq x2, [dst + col]
- FILTER_H4 x0, x1, x3
- pblendvb x2, x2, x1, leftmask
- movh [dst + col], x2
-
-_next_row:
- add src, r1m
- add dst, r3m
- dec row
-
- test row, row
- jz _end_row
-
- jmp _loop_row
-
-_end_row:
-
- RET
-
-%endif ; ARCH_X86_64 == 0
More information about the x265-devel
mailing list