[x265] [PATCH] asm: Optimizations and cleanups on ipfilter functions
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Tue Feb 11 14:51:59 CET 2014
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1392125003 -19800
# Tue Feb 11 18:53:23 2014 +0530
# Node ID 7eccc042e269ead4ff5d32f4d853287e30c59044
# Parent 07b5d6b82f5fbcb78ecab12cb8abcf13c78fe552
asm: Optimizations and cleanups on ipfilter functions
diff -r 07b5d6b82f5f -r 7eccc042e269 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon Feb 10 15:05:04 2014 -0600
+++ b/source/common/x86/ipfilter8.asm Tue Feb 11 18:53:23 2014 +0530
@@ -814,7 +814,7 @@
pmaddwd %8, [r5 + %10 * 16]
paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1
paddd %6, %8 ; R1 = H[1+2+3+4]
-%endmacro ; FILTER_HV8_START
+%endmacro ; FILTER_HV8_MID
; Round and Saturate
%macro FILTER_HV8_END 4 ; output in [1, 3]
@@ -830,8 +830,7 @@
packssdw %3, %4
; TODO: is merge better? I think this way is short dependency link
- packuswb %1, %1
- packuswb %3, %3
+ packuswb %1, %3
%endmacro ; FILTER_HV8_END
;-----------------------------------------------------------------------------
@@ -899,8 +898,8 @@
FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
FILTER_HV8_END m3, m0, m4, m1
- movq [r2], m3
- movq [r2 + r3], m4
+ movh [r2], m3
+ movhps [r2 + r3], m3
lea r0, [r0 + 16 * 2]
lea r2, [r2 + r3 * 2]
@@ -915,7 +914,7 @@
;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal interp_4tap_vert_pp_2x4, 4, 7, 8
+cglobal interp_4tap_vert_pp_2x4, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -926,16 +925,15 @@
%else
movd m0, [tab_ChromaCoeff + r4 * 4]
%endif
-
+lea r4, [r1 * 3]
+lea r5, [r0 + 4 * r1]
pshufb m0, [tab_Cm]
-
mova m1, [tab_c_512]
movd m2, [r0]
movd m3, [r0 + r1]
movd m4, [r0 + 2 * r1]
-lea r5, [r0 + 2 * r1]
-movd m5, [r5 + r1]
+movd m5, [r0 + r4]
punpcklbw m2, m3
punpcklbw m6, m4, m5
@@ -943,7 +941,7 @@
pmaddubsw m2, m0
-movd m6, [r0 + 4 * r1]
+movd m6, [r5]
punpcklbw m3, m4
punpcklbw m7, m5, m6
@@ -954,16 +952,11 @@
phaddw m2, m3
pmulhrsw m2, m1
-packuswb m2, m2
-
-pextrw [r2], m2, 0
-pextrw [r2 + r3], m2, 2
-
-lea r5, [r0 + 4 * r1]
-movd m2, [r5 + r1]
+
+movd m7, [r5 + r1]
punpcklbw m4, m5
-punpcklbw m3, m6, m2
+punpcklbw m3, m6, m7
punpcklbw m4, m3
pmaddubsw m4, m0
@@ -971,19 +964,21 @@
movd m3, [r5 + 2 * r1]
punpcklbw m5, m6
-punpcklbw m2, m3
-punpcklbw m5, m2
+punpcklbw m7, m3
+punpcklbw m5, m7
pmaddubsw m5, m0
phaddw m4, m5
pmulhrsw m4, m1
-packuswb m4, m4
-
-pextrw [r2 + 2 * r3], m4, 0
-lea r6, [r2 + 2 * r3]
-pextrw [r6 + r3], m4, 2
+packuswb m2, m4
+
+pextrw [r2], m2, 0
+pextrw [r2 + r3], m2, 2
+lea r2, [r2 + 2 * r3]
+pextrw [r2], m2, 4
+pextrw [r2 + r3], m2, 6
RET
@@ -992,7 +987,7 @@
;-----------------------------------------------------------------------------
%macro FILTER_V4_W2_H4 2
INIT_XMM sse4
-cglobal interp_4tap_vert_pp_2x8, 4, 7, 8
+cglobal interp_4tap_vert_pp_2x8, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -1009,13 +1004,13 @@
mova m1, [tab_c_512]
mov r4d, %2
+lea r5, [3 * r1]
.loop
movd m2, [r0]
movd m3, [r0 + r1]
movd m4, [r0 + 2 * r1]
-lea r5, [r0 + 2 * r1]
-movd m5, [r5 + r1]
+movd m5, [r0 + r5]
punpcklbw m2, m3
punpcklbw m6, m4, m5
@@ -1023,7 +1018,8 @@
pmaddubsw m2, m0
-movd m6, [r0 + 4 * r1]
+lea r0, [r0 + 4 * r1]
+movd m6, [r0]
punpcklbw m3, m4
punpcklbw m7, m5, m6
@@ -1034,39 +1030,35 @@
phaddw m2, m3
pmulhrsw m2, m1
-packuswb m2, m2
-
-pextrw [r2], m2, 0
-pextrw [r2 + r3], m2, 2
-
-lea r5, [r0 + 4 * r1]
-movd m2, [r5 + r1]
+
+movd m7, [r0 + r1]
punpcklbw m4, m5
-punpcklbw m3, m6, m2
+punpcklbw m3, m6, m7
punpcklbw m4, m3
pmaddubsw m4, m0
-movd m3, [r5 + 2 * r1]
+movd m3, [r0 + 2 * r1]
punpcklbw m5, m6
-punpcklbw m2, m3
-punpcklbw m5, m2
+punpcklbw m7, m3
+punpcklbw m5, m7
pmaddubsw m5, m0
phaddw m4, m5
pmulhrsw m4, m1
-packuswb m4, m4
-
-pextrw [r2 + 2 * r3], m4, 0
-lea r6, [r2 + 2 * r3]
-pextrw [r6 + r3], m4, 2
-
-lea r0, [r0 + 4 * r1]
-lea r2, [r2 + 4 * r3]
+packuswb m2, m4
+
+pextrw [r2], m2, 0
+pextrw [r2 + r3], m2, 2
+lea r2, [r2 + 2 * r3]
+pextrw [r2], m2, 4
+pextrw [r2 + r3], m2, 6
+
+lea r2, [r2 + 2 * r3]
sub r4, 4
jnz .loop
@@ -1079,7 +1071,7 @@
; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal interp_4tap_vert_pp_4x2, 4, 6, 8
+cglobal interp_4tap_vert_pp_4x2, 4, 6, 6
mov r4d, r4m
sub r0, r1
@@ -1092,32 +1084,30 @@
%endif
pshufb m0, [tab_Cm]
-
-mova m1, [tab_c_512]
+lea r5, [r0 + 2 * r1]
movd m2, [r0]
movd m3, [r0 + r1]
-movd m4, [r0 + 2 * r1]
-lea r5, [r0 + 2 * r1]
+movd m4, [r5]
movd m5, [r5 + r1]
punpcklbw m2, m3
-punpcklbw m6, m4, m5
-punpcklbw m2, m6
+punpcklbw m1, m4, m5
+punpcklbw m2, m1
pmaddubsw m2, m0
-movd m6, [r0 + 4 * r1]
+movd m1, [r0 + 4 * r1]
punpcklbw m3, m4
-punpcklbw m5, m6
+punpcklbw m5, m1
punpcklbw m3, m5
pmaddubsw m3, m0
phaddw m2, m3
-pmulhrsw m2, m1
+pmulhrsw m2, [tab_c_512]
packuswb m2, m2
movd [r2], m2
pextrd [r2 + r3], m2, 1
@@ -1128,7 +1118,7 @@
; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal interp_4tap_vert_pp_4x4, 4, 7, 8
+cglobal interp_4tap_vert_pp_4x4, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -1141,14 +1131,14 @@
%endif
pshufb m0, [tab_Cm]
-
mova m1, [tab_c_512]
+lea r5, [r0 + 4 * r1]
+lea r4, [r1 * 3]
movd m2, [r0]
movd m3, [r0 + r1]
movd m4, [r0 + 2 * r1]
-lea r5, [r0 + 2 * r1]
-movd m5, [r5 + r1]
+movd m5, [r0 + r4]
punpcklbw m2, m3
punpcklbw m6, m4, m5
@@ -1156,7 +1146,7 @@
pmaddubsw m2, m0
-movd m6, [r0 + 4 * r1]
+movd m6, [r5]
punpcklbw m3, m4
punpcklbw m7, m5, m6
@@ -1167,34 +1157,33 @@
phaddw m2, m3
pmulhrsw m2, m1
-packuswb m2, m2
+
+movd m7, [r5 + r1]
+
+punpcklbw m4, m5
+punpcklbw m3, m6, m7
+punpcklbw m4, m3
+
+pmaddubsw m4, m0
+
+movd m3, [r5 + 2 * r1]
+
+punpcklbw m5, m6
+punpcklbw m7, m3
+punpcklbw m5, m7
+
+pmaddubsw m5, m0
+
+phaddw m4, m5
+
+pmulhrsw m4, m1
+
+packuswb m2, m4
movd [r2], m2
-pextrd [r2 + r3], m2, 1
-
-lea r5, [r0 + 4 * r1]
-movd m2, [r5 + r1]
-
-punpcklbw m4, m5
-punpcklbw m3, m6, m2
-punpcklbw m4, m3
-
-pmaddubsw m4, m0
-
-movd m3, [r5 + 2 * r1]
-
-punpcklbw m5, m6
-punpcklbw m2, m3
-punpcklbw m5, m2
-
-pmaddubsw m5, m0
-
-phaddw m4, m5
-
-pmulhrsw m4, m1
-packuswb m4, m4
-movd [r2 + 2 * r3], m4
-lea r6, [r2 + 2 * r3]
-pextrd [r6 + r3], m4, 1
+pextrd [r2 + r3], m2, 1
+lea r2, [r2 + 2 * r3]
+pextrd [r2], m2, 2
+pextrd [r2 + r3], m2, 3
RET
@@ -1203,7 +1192,7 @@
;-----------------------------------------------------------------------------
%macro FILTER_V4_W4_H4 2
INIT_XMM sse4
-cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -1221,12 +1210,13 @@
mov r4d, %2
+lea r5, [3 * r1]
+
.loop
movd m2, [r0]
movd m3, [r0 + r1]
movd m4, [r0 + 2 * r1]
-lea r5, [r0 + 2 * r1]
-movd m5, [r5 + r1]
+movd m5, [r0 + r5]
punpcklbw m2, m3
punpcklbw m6, m4, m5
@@ -1234,7 +1224,8 @@
pmaddubsw m2, m0
-movd m6, [r0 + 4 * r1]
+lea r0, [r0 + 4 * r1]
+movd m6, [r0]
punpcklbw m3, m4
punpcklbw m7, m5, m6
@@ -1245,37 +1236,34 @@
phaddw m2, m3
pmulhrsw m2, m1
-packuswb m2, m2
+
+movd m7, [r0 + r1]
+
+punpcklbw m4, m5
+punpcklbw m3, m6, m7
+punpcklbw m4, m3
+
+pmaddubsw m4, m0
+
+movd m3, [r0 + 2 * r1]
+
+punpcklbw m5, m6
+punpcklbw m7, m3
+punpcklbw m5, m7
+
+pmaddubsw m5, m0
+
+phaddw m4, m5
+
+pmulhrsw m4, m1
+packuswb m2, m4
movd [r2], m2
pextrd [r2 + r3], m2, 1
-
-lea r5, [r0 + 4 * r1]
-movd m2, [r5 + r1]
-
-punpcklbw m4, m5
-punpcklbw m3, m6, m2
-punpcklbw m4, m3
-
-pmaddubsw m4, m0
-
-movd m3, [r5 + 2 * r1]
-
-punpcklbw m5, m6
-punpcklbw m2, m3
-punpcklbw m5, m2
-
-pmaddubsw m5, m0
-
-phaddw m4, m5
-
-pmulhrsw m4, m1
-packuswb m4, m4
-movd [r2 + 2 * r3], m4
-lea r6, [r2 + 2 * r3]
-pextrd [r6 + r3], m4, 1
-
-lea r0, [r0 + 4 * r1]
-lea r2, [r2 + 4 * r3]
+lea r2, [r2 + 2 * r3]
+pextrd [r2], m2, 2
+pextrd [r2 + r3], m2, 3
+
+lea r2, [r2 + 2 * r3]
sub r4, 4
jnz .loop
@@ -1450,7 +1438,7 @@
; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal interp_4tap_vert_ps_4x2, 4, 6, 8
+cglobal interp_4tap_vert_ps_4x2, 4, 6, 6
mov r4d, r4m
sub r0, r1
@@ -1465,32 +1453,30 @@
pshufb m0, [tab_Cm]
-mova m1, [pw_2000]
-
movd m2, [r0]
movd m3, [r0 + r1]
-movd m4, [r0 + 2 * r1]
lea r5, [r0 + 2 * r1]
+movd m4, [r5]
movd m5, [r5 + r1]
punpcklbw m2, m3
-punpcklbw m6, m4, m5
-punpcklbw m2, m6
+punpcklbw m1, m4, m5
+punpcklbw m2, m1
pmaddubsw m2, m0
-movd m6, [r0 + 4 * r1]
+movd m1, [r0 + 4 * r1]
punpcklbw m3, m4
-punpcklbw m5, m6
+punpcklbw m5, m1
punpcklbw m3, m5
pmaddubsw m3, m0
phaddw m2, m3
-psubw m2, m1
-movlps [r2], m2
+psubw m2, [pw_2000]
+movh [r2], m2
movhps [r2 + r3], m2
RET
@@ -1499,7 +1485,7 @@
; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal interp_4tap_vert_ps_4x4, 4, 7, 8
+cglobal interp_4tap_vert_ps_4x4, 4, 6, 7
mov r4d, r4m
sub r0, r1
@@ -1514,13 +1500,13 @@
pshufb m0, [tab_Cm]
- mova m1, [pw_2000]
+ lea r4, [r1 * 3]
+ lea r5, [r0 + 4 * r1]
movd m2, [r0]
movd m3, [r0 + r1]
movd m4, [r0 + 2 * r1]
- lea r5, [r0 + 2 * r1]
- movd m5, [r5 + r1]
+ movd m5, [r0 + r4]
punpcklbw m2, m3
punpcklbw m6, m4, m5
@@ -1528,21 +1514,22 @@
pmaddubsw m2, m0
- movd m6, [r0 + 4 * r1]
+ movd m6, [r5]
punpcklbw m3, m4
- punpcklbw m7, m5, m6
- punpcklbw m3, m7
+ punpcklbw m1, m5, m6
+ punpcklbw m3, m1
pmaddubsw m3, m0
phaddw m2, m3
+ mova m1, [pw_2000]
+
psubw m2, m1
- movlps [r2], m2
+ movh [r2], m2
movhps [r2 + r3], m2
- lea r5, [r0 + 4 * r1]
movd m2, [r5 + r1]
punpcklbw m4, m5
@@ -1562,9 +1549,9 @@
phaddw m4, m5
psubw m4, m1
- movlps [r2 + 2 * r3], m4
- lea r6, [r2 + 2 * r3]
- movhps [r6 + r3], m4
+ lea r2, [r2 + 2 * r3]
+ movh [r2], m4
+ movhps [r2 + r3], m4
RET
@@ -1573,7 +1560,7 @@
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W4_H4 2
INIT_XMM sse4
-cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
mov r4d, r4m
sub r0, r1
@@ -1591,13 +1578,13 @@
mova m1, [pw_2000]
mov r4d, %2/4
+ lea r5, [3 * r1]
.loop
movd m2, [r0]
movd m3, [r0 + r1]
movd m4, [r0 + 2 * r1]
- lea r5, [r0 + 2 * r1]
- movd m5, [r5 + r1]
+ movd m5, [r0 + r5]
punpcklbw m2, m3
punpcklbw m6, m4, m5
@@ -1605,7 +1592,8 @@
pmaddubsw m2, m0
- movd m6, [r0 + 4 * r1]
+ lea r0, [r0 + 4 * r1]
+ movd m6, [r0]
punpcklbw m3, m4
punpcklbw m7, m5, m6
@@ -1616,11 +1604,10 @@
phaddw m2, m3
psubw m2, m1
- movlps [r2], m2
+ movh [r2], m2
movhps [r2 + r3], m2
- lea r5, [r0 + 4 * r1]
- movd m2, [r5 + r1]
+ movd m2, [r0 + r1]
punpcklbw m4, m5
punpcklbw m3, m6, m2
@@ -1628,7 +1615,7 @@
pmaddubsw m4, m0
- movd m3, [r5 + 2 * r1]
+ movd m3, [r0 + 2 * r1]
punpcklbw m5, m6
punpcklbw m2, m3
@@ -1639,12 +1626,11 @@
phaddw m4, m5
psubw m4, m1
- movlps [r2 + 2 * r3], m4
- lea r6, [r2 + 2 * r3]
- movhps [r6 + r3], m4
-
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
+ lea r2, [r2 + 2 * r3]
+ movh [r2], m4
+ movhps [r2 + r3], m4
+
+ lea r2, [r2 + 2 * r3]
dec r4d
jnz .loop
@@ -1659,15 +1645,15 @@
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W8_H8_H16_H2 2
INIT_XMM sse4
-cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7
mov r4d, r4m
sub r0, r1
add r3d, r3d
%ifdef PIC
- lea r6, [tab_ChromaCoeff]
- movd m5, [r6 + r4 * 4]
+ lea r5, [tab_ChromaCoeff]
+ movd m5, [r5 + r4 * 4]
%else
movd m5, [tab_ChromaCoeff + r4 * 4]
%endif
@@ -1677,34 +1663,34 @@
mova m4, [pw_2000]
mov r4d, %2/2
+ lea r5, [3 * r1]
.loopH
movq m0, [r0]
movq m1, [r0 + r1]
movq m2, [r0 + 2 * r1]
- lea r5, [r0 + 2 * r1]
- movq m3, [r5 + r1]
+ movq m3, [r0 + r5]
punpcklbw m0, m1
- punpcklbw m7, m2, m3
+ punpcklbw m1, m2
+ punpcklbw m2, m3
pmaddubsw m0, m6
- pmaddubsw m7, m5
-
- paddw m0, m7
+ pmaddubsw m2, m5
+
+ paddw m0, m2
psubw m0, m4
movu [r2], m0
movq m0, [r0 + 4 * r1]
- punpcklbw m1, m2
- punpcklbw m7, m3, m0
+ punpcklbw m3, m0
pmaddubsw m1, m6
- pmaddubsw m7, m5
-
- paddw m1, m7
+ pmaddubsw m3, m5
+
+ paddw m1, m3
psubw m1, m4
movu [r2 + r3], m1
@@ -1727,15 +1713,15 @@
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W8_H8_H16_H32 2
INIT_XMM sse4
-cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
mov r4d, r4m
sub r0, r1
add r3d, r3d
%ifdef PIC
- lea r6, [tab_ChromaCoeff]
- movd m5, [r6 + r4 * 4]
+ lea r5, [tab_ChromaCoeff]
+ movd m5, [r5 + r4 * 4]
%else
movd m5, [tab_ChromaCoeff + r4 * 4]
%endif
@@ -1745,55 +1731,54 @@
mova m4, [pw_2000]
mov r4d, %2/4
+ lea r5, [3 * r1]
.loop
movq m0, [r0]
movq m1, [r0 + r1]
movq m2, [r0 + 2 * r1]
- lea r5, [r0 + 2 * r1]
- movq m3, [r5 + r1]
+ movq m3, [r0 + r5]
punpcklbw m0, m1
- punpcklbw m7, m2, m3
+ punpcklbw m1, m2
+ punpcklbw m2, m3
pmaddubsw m0, m6
- pmaddubsw m7, m5
+ pmaddubsw m7, m2, m5
paddw m0, m7
psubw m0, m4
movu [r2], m0
- movq m0, [r0 + 4 * r1]
-
- punpcklbw m1, m2
- punpcklbw m7, m3, m0
+ lea r0, [r0 + 4 * r1]
+ movq m0, [r0]
+
+ punpcklbw m3, m0
pmaddubsw m1, m6
- pmaddubsw m7, m5
+ pmaddubsw m7, m3, m5
paddw m1, m7
psubw m1, m4
movu [r2 + r3], m1
- lea r6, [r0 + 4 * r1]
- movq m1, [r6 + r1]
-
- punpcklbw m2, m3
- punpcklbw m7, m0, m1
+ movq m1, [r0 + r1]
+
+ punpcklbw m0, m1
pmaddubsw m2, m6
- pmaddubsw m7, m5
-
- paddw m2, m7
+ pmaddubsw m0, m5
+
+ paddw m2, m0
psubw m2, m4
- movu [r2 + 2 * r3], m2
-
- movq m2, [r6 + 2 * r1]
-
- punpcklbw m3, m0
+ lea r2, [r2 + 2 * r3]
+ movu [r2], m2
+
+ movq m2, [r0 + 2 * r1]
+
punpcklbw m1, m2
pmaddubsw m3, m6
@@ -1802,11 +1787,9 @@
paddw m3, m1
psubw m3, m4
- lea r5, [r2 + 2 * r3]
- movu [r5 + r3], m3
-
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
+ movu [r2 + r3], m3
+
+ lea r2, [r2 + 2 * r3]
dec r4d
jnz .loop
@@ -1821,15 +1804,15 @@
;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal interp_4tap_vert_ps_6x8, 4, 7, 8
+cglobal interp_4tap_vert_ps_6x8, 4, 6, 8
mov r4d, r4m
sub r0, r1
add r3d, r3d
%ifdef PIC
- lea r6, [tab_ChromaCoeff]
- movd m5, [r6 + r4 * 4]
+ lea r5, [tab_ChromaCoeff]
+ movd m5, [r5 + r4 * 4]
%else
movd m5, [tab_ChromaCoeff + r4 * 4]
%endif
@@ -1837,36 +1820,35 @@
pshufb m6, m5, [tab_Vm]
pshufb m5, [tab_Vm + 16]
mova m4, [pw_2000]
-
+ lea r5, [3 * r1]
mov r4d, 2
.loop
movq m0, [r0]
movq m1, [r0 + r1]
movq m2, [r0 + 2 * r1]
- lea r5, [r0 + 2 * r1]
- movq m3, [r5 + r1]
+ movq m3, [r0 + r5]
punpcklbw m0, m1
- punpcklbw m7, m2, m3
+ punpcklbw m1, m2
+ punpcklbw m2, m3
pmaddubsw m0, m6
- pmaddubsw m7, m5
+ pmaddubsw m7, m2, m5
paddw m0, m7
-
psubw m0, m4
+
movh [r2], m0
pshufd m0, m0, 2
movd [r2 + 8], m0
- movq m0, [r0 + 4 * r1]
-
- punpcklbw m1, m2
- punpcklbw m7, m3, m0
+ lea r0, [r0 + 4 * r1]
+ movq m0, [r0]
+ punpcklbw m3, m0
pmaddubsw m1, m6
- pmaddubsw m7, m5
+ pmaddubsw m7, m3, m5
paddw m1, m7
psubw m1, m4
@@ -1875,25 +1857,21 @@
pshufd m1, m1, 2
movd [r2 + r3 + 8], m1
- lea r6, [r0 + 4 * r1]
- movq m1, [r6 + r1]
-
- punpcklbw m2, m3
- punpcklbw m7, m0, m1
+ movq m1, [r0 + r1]
+ punpcklbw m0, m1
pmaddubsw m2, m6
- pmaddubsw m7, m5
-
- paddw m2, m7
+ pmaddubsw m0, m5
+
+ paddw m2, m0
psubw m2, m4
- movh [r2 + 2 * r3], m2
+ lea r2,[r2 + 2 * r3]
+ movh [r2], m2
pshufd m2, m2, 2
- movd [r2 + 2 * r3 + 8], m2
-
- movq m2,[r6 + 2 * r1]
-
- punpcklbw m3, m0
+ movd [r2 + 8], m2
+
+ movq m2,[r0 + 2 * r1]
punpcklbw m1, m2
pmaddubsw m3, m6
@@ -1902,13 +1880,11 @@
paddw m3, m1
psubw m3, m4
- lea r5,[r2 + 2 * r3]
- movh [r5 + r3], m3
+ movh [r2 + r3], m3
pshufd m3, m3, 2
- movd [r5 + r3 + 8], m3
-
- lea r0, [r0 + 4 * r1]
- lea r2, [r2 + 4 * r3]
+ movd [r2 + r3 + 8], m3
+
+ lea r2, [r2 + 2 * r3]
dec r4d
jnz .loop
@@ -1934,68 +1910,61 @@
pshufb m1, m0, [tab_Vm]
pshufb m0, [tab_Vm + 16]
- mova m7, [pw_2000]
-
mov r4d, 16/2
.loop
movu m2, [r0]
movu m3, [r0 + r1]
- punpcklbw m4, m2, m3,
- punpckhbw m2, m3,
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
pmaddubsw m4, m1
pmaddubsw m2, m1
- movu m5, [r0 + 2 * r1]
- lea r5, [r0 + 2 * r1]
- movu m3, [r5 + r1]
-
- punpcklbw m6, m5, m3,
- punpckhbw m5, m3,
-
+ lea r0, [r0 + 2 * r1]
+ movu m5, [r0]
+ movu m7, [r0 + r1]
+
+ punpcklbw m6, m5, m7
pmaddubsw m6, m0
- pmaddubsw m5, m0
-
paddw m4, m6
- paddw m2, m5
-
- psubw m4, m7
- psubw m2, m7
+
+ punpckhbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m2, m6
+
+ mova m6, [pw_2000]
+
+ psubw m4, m6
+ psubw m2, m6
movu [r2], m4
movh [r2 + 16], m2
- movu m2, [r0 + r1]
- movu m3, [r0 + 2 * r1]
-
- punpcklbw m4, m2, m3,
- punpckhbw m2, m3,
+ punpcklbw m4, m3, m5
+ punpckhbw m3, m5
pmaddubsw m4, m1
- pmaddubsw m2, m1
-
- lea r5, [r0 + 2 * r1]
- movu m5, [r5 + r1]
- movu m3, [r5 + 2 * r1]
-
- punpcklbw m6, m5, m3,
- punpckhbw m5, m3,
-
- pmaddubsw m6, m0
+ pmaddubsw m3, m1
+
+ movu m2, [r0 + 2 * r1]
+
+ punpcklbw m5, m7, m2
+ punpckhbw m7, m2
+
pmaddubsw m5, m0
-
- paddw m4, m6
- paddw m2, m5
-
- psubw m4, m7
- psubw m2, m7
+ pmaddubsw m7, m0
+
+ paddw m4, m5
+ paddw m3, m7
+
+ psubw m4, m6
+ psubw m3, m6
movu [r2 + r3], m4
- movh [r2 + r3 + 16], m2
-
- lea r0, [r0 + 2 * r1]
+ movh [r2 + r3 + 16], m3
+
lea r2, [r2 + 2 * r3]
dec r4d
@@ -2022,7 +1991,6 @@
pshufb m1, m0, [tab_Vm]
pshufb m0, [tab_Vm + 16]
-
mov r4d, %2/2
.loop
@@ -2030,63 +1998,54 @@
movu m3, [r0 + r1]
punpcklbw m4, m2, m3
- punpckhbw m5, m2, m3
+ punpckhbw m2, m3
pmaddubsw m4, m1
- pmaddubsw m5, m1
-
- movu m2, [r0 + 2 * r1]
- lea r5, [r0 + 2 * r1]
- movu m3, [r5 + r1]
-
- punpcklbw m6, m2, m3
- punpckhbw m7, m2, m3
-
+ pmaddubsw m2, m1
+
+ lea r0, [r0 + 2 * r1]
+ movu m5, [r0]
+ movu m7, [r0 + r1]
+
+ punpcklbw m6, m5, m7
pmaddubsw m6, m0
+ paddw m4, m6
+
+ punpckhbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m2, m6
+
+ mova m6, [pw_2000]
+
+ psubw m4, m6
+ psubw m2, m6
+
+ movu [r2], m4
+ movu [r2 + 16], m2
+
+ punpcklbw m4, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m4, m1
+ pmaddubsw m3, m1
+
+ movu m5, [r0 + 2 * r1]
+
+ punpcklbw m2, m7, m5
+ punpckhbw m7, m5
+
+ pmaddubsw m2, m0
pmaddubsw m7, m0
- paddw m4, m6
- paddw m5, m7
-
- mova m6, [pw_2000]
+ paddw m4, m2
+ paddw m3, m7
psubw m4, m6
- psubw m5, m6
-
- movu [r2], m4
- movu [r2 + 16], m5
-
- movu m2, [r0 + r1]
- movu m3, [r0 + 2 * r1]
-
- punpcklbw m4, m2, m3
- punpckhbw m5, m2, m3
-
- pmaddubsw m4, m1
- pmaddubsw m5, m1
-
- lea r5, [r0 + 2 * r1]
- movu m2, [r5 + r1]
- movu m3, [r5 + 2 * r1]
-
- punpcklbw m6, m2, m3,
- punpckhbw m7, m2, m3,
-
- pmaddubsw m6, m0
- pmaddubsw m7, m0
-
- paddw m4, m6
- paddw m5, m7
-
- mova m6, [pw_2000]
-
- psubw m4, m6
- psubw m5, m6
+ psubw m3, m6
movu [r2 + r3], m4
- movu [r2 + r3 + 16], m5
-
- lea r0, [r0 + 2 * r1]
+ movu [r2 + r3 + 16], m3
+
lea r2, [r2 + 2 * r3]
dec r4d
@@ -2120,100 +2079,92 @@
pshufb m1, m0, [tab_Vm]
pshufb m0, [tab_Vm + 16]
- mova m7, [pw_2000]
-
mov r4d, 32/2
.loop
movu m2, [r0]
movu m3, [r0 + r1]
- punpcklbw m4, m2, m3,
- punpckhbw m2, m3,
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
pmaddubsw m4, m1
pmaddubsw m2, m1
- movu m5, [r0 + 2 * r1]
lea r5, [r0 + 2 * r1]
- movu m3, [r5 + r1]
-
- punpcklbw m6, m5, m3,
- punpckhbw m5, m3
-
+
+ movu m5, [r5]
+ movu m7, [r5 + r1]
+
+ punpcklbw m6, m5, m7
pmaddubsw m6, m0
- pmaddubsw m5, m0
-
paddw m4, m6
- paddw m2, m5
-
- psubw m4, m7
- psubw m2, m7
+
+ punpckhbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m2, m6
+
+ mova m6, [pw_2000]
+
+ psubw m4, m6
+ psubw m2, m6
movu [r2], m4
movu [r2 + 16], m2
+ punpcklbw m4, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m4, m1
+ pmaddubsw m3, m1
+
+ movu m2, [r5 + 2 * r1]
+
+ punpcklbw m5, m7, m2
+ punpckhbw m7, m2
+
+ pmaddubsw m5, m0
+ pmaddubsw m7, m0
+
+ paddw m4, m5
+ paddw m3, m7
+
+ psubw m4, m6
+ psubw m3, m6
+
+ movu [r2 + r3], m4
+ movu [r2 + r3 + 16], m3
+
movq m2, [r0 + 16]
movq m3, [r0 + r1 + 16]
- movq m4, [r0 + 2 * r1 + 16]
+ movq m4, [r5 + 16]
movq m5, [r5 + r1 + 16]
punpcklbw m2, m3
- punpcklbw m4, m5
+ punpcklbw m7, m4, m5
pmaddubsw m2, m1
- pmaddubsw m4, m0
-
- paddw m2, m4
- psubw m2, m7
+ pmaddubsw m7, m0
+
+ paddw m2, m7
+ psubw m2, m6
movu [r2 + 32], m2
- movu m2, [r0 + r1]
- movu m3, [r0 + 2 * r1]
-
- punpcklbw m4, m2, m3,
- punpckhbw m2, m3,
-
- pmaddubsw m4, m1
- pmaddubsw m2, m1
-
- lea r5, [r0 + 2 * r1]
- movu m5, [r5 + r1]
- movu m3, [r5 + 2 * r1]
-
- punpcklbw m6, m5, m3,
- punpckhbw m5, m3
-
- pmaddubsw m6, m0
+ movq m2, [r5 + 2 * r1 + 16]
+
+ punpcklbw m3, m4
+ punpcklbw m5, m2
+
+ pmaddubsw m3, m1
pmaddubsw m5, m0
- paddw m4, m6
- paddw m2, m5
-
- psubw m4, m7
- psubw m2, m7
-
- movu [r2 + r3], m4
- movu [r2 + r3 + 16], m2
-
- movq m2, [r0 + r1 + 16]
- movq m3, [r0 + 2 * r1 + 16]
- movq m4, [r5 + r1 + 16]
- movq m5, [r5 + 2 * r1 + 16]
-
- punpcklbw m2, m3
- punpcklbw m4, m5
-
- pmaddubsw m2, m1
- pmaddubsw m4, m0
-
- paddw m2, m4
-
- psubw m2, m7
- movu [r2 + r3 + 32], m2
-
- lea r0, [r0 + 2 * r1]
+ paddw m3, m5
+ psubw m3, m6
+
+ movu [r2 + r3 + 32], m3
+
+ mov r0, r5
lea r2, [r2 + 2 * r3]
dec r4d
@@ -2249,18 +2200,18 @@
movu m2, [r0]
movu m3, [r0 + r1]
- punpcklbw m4, m2, m3,
- punpckhbw m2, m3,
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
pmaddubsw m4, m1
pmaddubsw m2, m1
- movu m3, [r0 + 2 * r1]
lea r5, [r0 + 2 * r1]
+ movu m3, [r5]
movu m5, [r5 + r1]
punpcklbw m6, m3, m5
- punpckhbw m3, m5,
+ punpckhbw m3, m5
pmaddubsw m6, m0
pmaddubsw m3, m0
@@ -2277,17 +2228,17 @@
movu m2, [r0 + 16]
movu m3, [r0 + r1 + 16]
- punpcklbw m4, m2, m3,
- punpckhbw m2, m3,
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
pmaddubsw m4, m1
pmaddubsw m2, m1
- movu m3, [r0 + 2 * r1 + 16]
+ movu m3, [r5 + 16]
movu m5, [r5 + r1 + 16]
punpcklbw m6, m3, m5
- punpckhbw m3, m5,
+ punpckhbw m3, m5
pmaddubsw m6, m0
pmaddubsw m3, m0
@@ -2319,14 +2270,14 @@
;-----------------------------------------------------------------------------
%macro FILTER_V4_W8_H8_H16_H32 2
INIT_XMM sse4
-cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
mov r4d, r4m
sub r0, r1
%ifdef PIC
-lea r6, [tab_ChromaCoeff]
-movd m5, [r6 + r4 * 4]
+lea r5, [tab_ChromaCoeff]
+movd m5, [r5 + r4 * 4]
%else
movd m5, [tab_ChromaCoeff + r4 * 4]
%endif
@@ -2334,6 +2285,7 @@
pshufb m6, m5, [tab_Vm]
pshufb m5, [tab_Vm + 16]
mova m4, [tab_c_512]
+lea r5, [r1 * 3]
mov r4d, %2
@@ -2341,14 +2293,14 @@
movq m0, [r0]
movq m1, [r0 + r1]
movq m2, [r0 + 2 * r1]
-lea r5, [r0 + 2 * r1]
-movq m3, [r5 + r1]
+movq m3, [r0 + r5]
punpcklbw m0, m1
-punpcklbw m7, m2, m3
+punpcklbw m1, m2
+punpcklbw m2, m3
pmaddubsw m0, m6
-pmaddubsw m7, m5
+pmaddubsw m7, m2, m5
paddw m0, m7
@@ -2356,13 +2308,13 @@
packuswb m0, m0
movh [r2], m0
-movq m0, [r0 + 4 * r1]
-
-punpcklbw m1, m2
-punpcklbw m7, m3, m0
+lea r0, [r0 + 4 * r1]
+movq m0, [r0]
+
+punpcklbw m3, m0
pmaddubsw m1, m6
-pmaddubsw m7, m5
+pmaddubsw m7, m3, m5
paddw m1, m7
@@ -2370,25 +2322,19 @@
packuswb m1, m1
movh [r2 + r3], m1
-lea r6, [r0 + 4 * r1]
-movq m1, [r6 + r1]
-
-punpcklbw m2, m3
-punpcklbw m7, m0, m1
+movq m1, [r0 + r1]
+
+punpcklbw m0, m1
pmaddubsw m2, m6
-pmaddubsw m7, m5
-
-paddw m2, m7
+pmaddubsw m0, m5
+
+paddw m2, m0
pmulhrsw m2, m4
-packuswb m2, m2
-movh [r2 + 2 * r3], m2
-
-movq m2, [r6 + 2 * r1]
-
-punpcklbw m3, m0
-punpcklbw m1, m2
+
+movq m7, [r0 + 2 * r1]
+punpcklbw m1, m7
pmaddubsw m3, m6
pmaddubsw m1, m5
@@ -2396,13 +2342,13 @@
paddw m3, m1
pmulhrsw m3, m4
-packuswb m3, m3
-
-lea r5, [r2 + 2 * r3]
-movh [r5 + r3], m3
-
-lea r0, [r0 + 4 * r1]
-lea r2, [r2 + 4 * r3]
+packuswb m2, m3
+
+lea r2, [r2 + 2 * r3]
+movh [r2], m2
+movhps [r2 + r3], m2
+
+lea r2, [r2 + 2 * r3]
sub r4, 4
jnz .loop
@@ -2418,14 +2364,14 @@
;-----------------------------------------------------------------------------
%macro FILTER_V4_W6_H4 2
INIT_XMM sse4
-cglobal interp_4tap_vert_pp_6x8, 4, 7, 8
+cglobal interp_4tap_vert_pp_6x8, 4, 6, 8
mov r4d, r4m
sub r0, r1
%ifdef PIC
-lea r6, [tab_ChromaCoeff]
-movd m5, [r6 + r4 * 4]
+lea r5, [tab_ChromaCoeff]
+movd m5, [r5 + r4 * 4]
%else
movd m5, [tab_ChromaCoeff + r4 * 4]
%endif
@@ -2435,19 +2381,20 @@
mova m4, [tab_c_512]
mov r4d, %2
+lea r5, [3 * r1]
.loop
movq m0, [r0]
movq m1, [r0 + r1]
movq m2, [r0 + 2 * r1]
-lea r5, [r0 + 2 * r1]
-movq m3, [r5 + r1]
+movq m3, [r0 + r5]
punpcklbw m0, m1
-punpcklbw m7, m2, m3
+punpcklbw m1, m2
+punpcklbw m2, m3
pmaddubsw m0, m6
-pmaddubsw m7, m5
+pmaddubsw m7, m2, m5
paddw m0, m7
@@ -2456,13 +2403,13 @@
movd [r2], m0
pextrw [r2 + 4], m0, 2
-movq m0, [r0 + 4 * r1]
-
-punpcklbw m1, m2
-punpcklbw m7, m3, m0
+lea r0, [r0 + 4 * r1]
+
+movq m0, [r0]
+punpcklbw m3, m0
pmaddubsw m1, m6
-pmaddubsw m7, m5
+pmaddubsw m7, m3, m5
paddw m1, m7
@@ -2471,10 +2418,7 @@
movd [r2 + r3], m1
pextrw [r2 + r3 + 4], m1, 2
-lea r6, [r0 + 4 * r1]
-movq m1, [r6 + r1]
-
-punpcklbw m2, m3
+movq m1, [r0 + r1]
punpcklbw m7, m0, m1
pmaddubsw m2, m6
@@ -2484,12 +2428,11 @@
pmulhrsw m2, m4
packuswb m2, m2
-movd [r2 + 2 * r3], m2
-pextrw [r2 + 2 * r3 + 4], m2, 2
-
-movq m2, [r6 + 2 * r1]
-
-punpcklbw m3, m0
+lea r2, [r2 + 2 * r3]
+movd [r2], m2
+pextrw [r2 + 4], m2, 2
+
+movq m2, [r0 + 2 * r1]
punpcklbw m1, m2
pmaddubsw m3, m6
@@ -2500,12 +2443,10 @@
pmulhrsw m3, m4
packuswb m3, m3
-lea r5, [r2 + 2 * r3]
-movd [r5 + r3], m3
-pextrw [r5 + r3 + 4], m3, 2
-
-lea r0, [r0 + 4 * r1]
-lea r2, [r2 + 4 * r3]
+movd [r2 + r3], m3
+pextrw [r2 + r3 + 4], m3, 2
+
+lea r2, [r2 + 2 * r3]
sub r4, 4
jnz .loop
@@ -2534,72 +2475,65 @@
pshufb m1, m0, [tab_Vm]
pshufb m0, [tab_Vm + 16]
-mova m7, [tab_c_512]
-
-mov r4d, %2
+mov r4d, %2
.loop
movu m2, [r0]
movu m3, [r0 + r1]
-punpcklbw m4, m2, m3,
-punpckhbw m2, m3,
+punpcklbw m4, m2, m3
+punpckhbw m2, m3
pmaddubsw m4, m1
pmaddubsw m2, m1
-movu m5, [r0 + 2 * r1]
-lea r5, [r0 + 2 * r1]
-movu m3, [r5 + r1]
-
-punpcklbw m6, m5, m3,
-punpckhbw m5, m3,
-
+lea r0, [r0 + 2 * r1]
+movu m5, [r0]
+movu m7, [r0 + r1]
+
+punpcklbw m6, m5, m7
pmaddubsw m6, m0
-pmaddubsw m5, m0
-
paddw m4, m6
-paddw m2, m5
-
-pmulhrsw m4, m7
-pmulhrsw m2, m7
+
+punpckhbw m6, m5, m7
+pmaddubsw m6, m0
+paddw m2, m6
+
+mova m6, [tab_c_512]
+
+pmulhrsw m4, m6
+pmulhrsw m2, m6
packuswb m4, m2
movh [r2], m4
pextrd [r2 + 8], m4, 2
-movu m2, [r0 + r1]
-movu m3, [r0 + 2 * r1]
-
-punpcklbw m4, m2, m3,
-punpckhbw m2, m3,
+punpcklbw m4, m3, m5
+punpckhbw m3, m5
pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-lea r5, [r0 + 2 * r1]
-movu m5, [r5 + r1]
-movu m3, [r5 + 2 * r1]
-
-punpcklbw m6, m5, m3,
-punpckhbw m5, m3,
-
-pmaddubsw m6, m0
-pmaddubsw m5, m0
-
-paddw m4, m6
-paddw m2, m5
-
-pmulhrsw m4, m7
-pmulhrsw m2, m7
-
-packuswb m4, m2
+pmaddubsw m3, m1
+
+movu m5, [r0 + 2 * r1]
+
+punpcklbw m2, m7, m5
+punpckhbw m7, m5
+
+pmaddubsw m2, m0
+pmaddubsw m7, m0
+
+paddw m4, m2
+paddw m3, m7
+
+pmulhrsw m4, m6
+pmulhrsw m3, m6
+
+packuswb m4, m3
movh [r2 + r3], m4
pextrd [r2 + r3 + 8], m4, 2
-lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
sub r4, 2
@@ -2635,66 +2569,57 @@
movu m2, [r0]
movu m3, [r0 + r1]
-punpcklbw m4, m2, m3,
-punpckhbw m5, m2, m3,
+punpcklbw m4, m2, m3
+punpckhbw m2, m3
pmaddubsw m4, m1
-pmaddubsw m5, m1
-
-movu m2, [r0 + 2 * r1]
-lea r5, [r0 + 2 * r1]
-movu m3, [r5 + r1]
-
-punpcklbw m6, m2, m3,
-punpckhbw m7, m2, m3,
-
+pmaddubsw m2, m1
+
+lea r0, [r0 + 2 * r1]
+movu m5, [r0]
+movu m6, [r0 + r1]
+
+punpckhbw m7, m5, m6
+pmaddubsw m7, m0
+paddw m2, m7
+
+punpcklbw m7, m5, m6
+pmaddubsw m7, m0
+paddw m4, m7
+
+mova m7, [tab_c_512]
+
+pmulhrsw m4, m7
+pmulhrsw m2, m7
+
+packuswb m4, m2
+
+movu [r2], m4
+
+punpcklbw m4, m3, m5
+punpckhbw m3, m5
+
+pmaddubsw m4, m1
+pmaddubsw m3, m1
+
+movu m5, [r0 + 2 * r1]
+
+punpcklbw m2, m6, m5
+punpckhbw m6, m5
+
+pmaddubsw m2, m0
pmaddubsw m6, m0
-pmaddubsw m7, m0
-
-paddw m4, m6;
-paddw m5, m7;
-
-mova m6, [tab_c_512]
-
-pmulhrsw m4, m6
-pmulhrsw m5, m6
-
-packuswb m4, m5
-
-movu [r2], m4
-
-movu m2, [r0 + r1]
-movu m3, [r0 + 2 * r1]
-
-punpcklbw m4, m2, m3,
-punpckhbw m5, m2, m3,
-
-pmaddubsw m4, m1
-pmaddubsw m5, m1
-
-lea r5, [r0 + 2 * r1]
-movu m2, [r5 + r1]
-movu m3, [r5 + 2 * r1]
-
-punpcklbw m6, m2, m3,
-punpckhbw m7, m2, m3,
-
-pmaddubsw m6, m0
-pmaddubsw m7, m0
-
-paddw m4, m6
-paddw m5, m7
-
-mova m6, [tab_c_512]
-
-pmulhrsw m4, m6
-pmulhrsw m5, m6
-
-packuswb m4, m5
+
+paddw m4, m2
+paddw m3, m6
+
+pmulhrsw m4, m7
+pmulhrsw m3, m7
+
+packuswb m4, m3
movu [r2 + r3], m4
-lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
sub r4, 2
@@ -2728,43 +2653,66 @@
pshufb m1, m0, [tab_Vm]
pshufb m0, [tab_Vm + 16]
-mova m7, [tab_c_512]
-
mov r4d, %2
.loop
movu m2, [r0]
movu m3, [r0 + r1]
-punpcklbw m4, m2, m3,
-punpckhbw m2, m3,
+punpcklbw m4, m2, m3
+punpckhbw m2, m3
pmaddubsw m4, m1
pmaddubsw m2, m1
-movu m5, [r0 + 2 * r1]
lea r5, [r0 + 2 * r1]
-movu m3, [r5 + r1]
-
-punpcklbw m6, m5, m3,
-punpckhbw m5, m3
-
+movu m5, [r5]
+movu m7, [r5 + r1]
+
+punpcklbw m6, m5, m7
pmaddubsw m6, m0
+paddw m4, m6
+
+punpckhbw m6, m5, m7
+pmaddubsw m6, m0
+paddw m2, m6
+
+mova m6, [tab_c_512]
+
+pmulhrsw m4, m6
+pmulhrsw m2, m6
+
+packuswb m4, m2
+
+movu [r2], m4
+
+punpcklbw m4, m3, m5
+punpckhbw m3, m5
+
+pmaddubsw m4, m1
+pmaddubsw m3, m1
+
+movu m2, [r5 + 2 * r1]
+
+punpcklbw m5, m7, m2
+punpckhbw m7, m2
+
pmaddubsw m5, m0
-
-paddw m4, m6
-paddw m2, m5
-
-pmulhrsw m4, m7
-pmulhrsw m2, m7
-
-packuswb m4, m2
-
-movu [r2], m4
+pmaddubsw m7, m0
+
+paddw m4, m5
+paddw m3, m7
+
+pmulhrsw m4, m6
+pmulhrsw m3, m6
+
+packuswb m4, m3
+
+movu [r2 + r3], m4
movq m2, [r0 + 16]
movq m3, [r0 + r1 + 16]
-movq m4, [r0 + 2 * r1 + 16]
+movq m4, [r5 + 16]
movq m5, [r5 + r1 + 16]
punpcklbw m2, m3
@@ -2775,57 +2723,28 @@
paddw m2, m4
-pmulhrsw m2, m7
-packuswb m2, m2
+pmulhrsw m2, m6
+
+movq m3, [r0 + r1 + 16]
+movq m4, [r5 + 16]
+movq m5, [r5 + r1 + 16]
+movq m7, [r5 + 2 * r1 + 16]
+
+punpcklbw m3, m4
+punpcklbw m5, m7
+
+pmaddubsw m3, m1
+pmaddubsw m5, m0
+
+paddw m3, m5
+
+pmulhrsw m3, m6
+packuswb m2, m3
+
movh [r2 + 16], m2
-
-movu m2, [r0 + r1]
-movu m3, [r0 + 2 * r1]
-
-punpcklbw m4, m2, m3,
-punpckhbw m2, m3,
-
-pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-lea r5, [r0 + 2 * r1]
-movu m5, [r5 + r1]
-movu m3, [r5 + 2 * r1]
-
-punpcklbw m6, m5, m3,
-punpckhbw m5, m3
-
-pmaddubsw m6, m0
-pmaddubsw m5, m0
-
-paddw m4, m6
-paddw m2, m5
-
-pmulhrsw m4, m7
-pmulhrsw m2, m7
-
-packuswb m4, m2
-
-movu [r2 + r3], m4
-
-movq m2, [r0 + r1 + 16]
-movq m3, [r0 + 2 * r1 + 16]
-movq m4, [r5 + r1 + 16]
-movq m5, [r5 + 2 * r1 + 16]
-
-punpcklbw m2, m3
-punpcklbw m4, m5
-
-pmaddubsw m2, m1
-pmaddubsw m4, m0
-
-paddw m2, m4
-
-pmulhrsw m2, m7
-packuswb m2, m2
-movh [r2 + r3 + 16], m2
-
-lea r0, [r0 + 2 * r1]
+movhps [r2 + r3 + 16], m2
+
+mov r0, r5
lea r2, [r2 + 2 * r3]
sub r4, 2
@@ -2863,18 +2782,18 @@
movu m2, [r0]
movu m3, [r0 + r1]
-punpcklbw m4, m2, m3,
-punpckhbw m2, m3,
+punpcklbw m4, m2, m3
+punpckhbw m2, m3
pmaddubsw m4, m1
pmaddubsw m2, m1
-movu m3, [r0 + 2 * r1]
lea r5, [r0 + 2 * r1]
+movu m3, [r5]
movu m5, [r5 + r1]
punpcklbw m6, m3, m5
-punpckhbw m3, m5,
+punpckhbw m3, m5
pmaddubsw m6, m0
pmaddubsw m3, m0
@@ -2892,17 +2811,17 @@
movu m2, [r0 + 16]
movu m3, [r0 + r1 + 16]
-punpcklbw m4, m2, m3,
-punpckhbw m2, m3,
+punpcklbw m4, m2, m3
+punpckhbw m2, m3
pmaddubsw m4, m1
pmaddubsw m2, m1
-movu m3, [r0 + 2 * r1 + 16]
+movu m3, [r5 + 16]
movu m5, [r5 + r1 + 16]
punpcklbw m6, m3, m5
-punpckhbw m3, m5,
+punpckhbw m3, m5
pmaddubsw m6, m0
pmaddubsw m3, m0
diff -r 07b5d6b82f5f -r 7eccc042e269 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Mon Feb 10 15:05:04 2014 -0600
+++ b/source/test/ipfilterharness.cpp Tue Feb 11 18:53:23 2014 +0530
@@ -158,7 +158,7 @@
rand_coeffIdx = rand() % 8; // Random coeffIdex in the filter
rand_srcStride = rand() % 100; // Randomly generated srcStride
- rand_dstStride = rand() % 100; // Randomly generated dstStride
+ rand_dstStride = rand() % 100 + 32; // Randomly generated dstStride
opt(pixel_buff + 3 * rand_srcStride,
rand_srcStride,
@@ -187,7 +187,7 @@
rand_coeffIdx = rand() % 8; // Random coeffIdex in the filter
rand_srcStride = rand() % 100; // Randomly generated srcStride
- rand_dstStride = rand() % 100; // Randomly generated dstStride
+ rand_dstStride = rand() % 100 + 32; // Randomly generated dstStride
ref(pixel_buff + 3 * rand_srcStride,
rand_srcStride,
More information about the x265-devel mailing list