[x265] [PATCH 2 of 3] asm: fix Main12 bugs in sad_mmx2 & sad_sse2
Min Chen
chenm003 at 163.com
Tue Jul 14 02:55:00 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1436834279 25200
# Node ID ec16d0d21987893aab1da59679b0f1f2bafc78a0
# Parent 0ddd2f402f7bb5ea4c8b2c26d9220873e5bea73d
asm: fix Main12 bugs in sad_mmx2 & sad_sse2
---
source/common/x86/sad16-a.asm | 62 +++++++++++++++++++++++++++--------------
1 file changed, 41 insertions(+), 21 deletions(-)
diff -r 0ddd2f402f7b -r ec16d0d21987 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Mon Jul 13 17:37:57 2015 -0700
+++ b/source/common/x86/sad16-a.asm Mon Jul 13 17:37:59 2015 -0700
@@ -6,6 +6,7 @@
;* Authors: Oskar Arvidsson <oskar at irock.se>
;* Henrik Gramner <henrik at gramner.com>
;* Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
+;* Min Chen <chenm003 at 163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -51,8 +52,14 @@
lea r2, [r2+2*r3]
paddw m1, m2
paddw m3, m4
+ %if BIT_DEPTH <= 10
paddw m0, m1
paddw m0, m3
+ %else
+ paddw m1, m3
+ pmaddwd m1, [pw_1]
+ paddd m0, m1
+ %endif
%endmacro
%macro SAD_INC_2x8P_MMX 0
@@ -70,8 +77,14 @@
lea r2, [r2+4*r3]
paddw m1, m2
paddw m3, m4
+ %if BIT_DEPTH <= 10
paddw m0, m1
paddw m0, m3
+ %else
+ paddw m1, m3
+ pmaddwd m1, [pw_1]
+ paddd m0, m1
+ %endif
%endmacro
%macro SAD_INC_2x4P_MMX 0
@@ -82,8 +95,14 @@
ABSW2 m1, m2, m1, m2, m3, m4
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
+ %if BIT_DEPTH <= 10
paddw m0, m1
paddw m0, m2
+ %else
+ paddw m1, m2
+ pmaddwd m1, [pw_1]
+ paddd m0, m1
+ %endif
%endmacro
;-----------------------------------------------------------------------------
@@ -103,9 +122,17 @@
jg .loop
%endif
%if %1*%2 == 256
+ %if BIT_DEPTH <= 10
HADDUW m0, m1
+ %else
+ HADDD m0, m1
+ %endif
%else
+ %if BIT_DEPTH <= 10
HADDW m0, m1
+ %else
+ HADDD m0, m1
+ %endif
%endif
movd eax, m0
RET
@@ -276,8 +303,9 @@
ABSW2 m3, m4, m3, m4, m7, m5
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m0, m3
+ paddw m1, m3
+ pmaddwd m1, [pw_1]
+ paddd m0, m1
%else
movu m1, [r2]
movu m2, [r2+2*r3]
@@ -286,8 +314,9 @@
ABSW2 m1, m2, m1, m2, m3, m4
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
- paddw m0, m1
- paddw m0, m2
+ paddw m1, m2
+ pmaddwd m1, [pw_1]
+ paddd m0, m1
%endif
%endmacro
@@ -307,8 +336,9 @@
ABSW2 m3, m4, m3, m4, m7, m5
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m8, m3
+ paddw m1, m3
+ pmaddwd m1, [pw_1]
+ paddd m0, m1
%else
movu m1, [r2]
movu m2, [r2 + 2 * r3]
@@ -317,8 +347,9 @@
ABSW2 m1, m2, m1, m2, m3, m4
lea r0, [r0 + 4 * r1]
lea r2, [r2 + 4 * r3]
- paddw m0, m1
- paddw m8, m2
+ paddw m1, m2
+ pmaddwd m1, [pw_1]
+ paddd m0, m1
%endif
%endmacro
@@ -326,7 +357,7 @@
; int pixel_sad_NxM(uint16_t *, intptr_t, uint16_t *, intptr_t)
; ---------------------------------------------------------------------------- -
%macro SAD 2
-cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
+cglobal pixel_sad_%1x%2, 4,5,8
pxor m0, m0
%if %2 == 4
SAD_INC_2ROW %1
@@ -338,12 +369,7 @@
dec r4d
jg .loop
%endif
-%if %2 == 32
- HADDUWD m0, m1
HADDD m0, m1
-%else
- HADDW m0, m1
-%endif
movd eax, xm0
RET
%endmacro
@@ -352,21 +378,15 @@
; int pixel_sad_Nx64(uint16_t *, intptr_t, uint16_t *, intptr_t)
; ---------------------------------------------------------------------------- -
%macro SAD_Nx64 1
-cglobal pixel_sad_%1x64, 4,5-(64&4/4), 9
+cglobal pixel_sad_%1x64, 4,5, 8
pxor m0, m0
- pxor m8, m8
mov r4d, 64 / 2
.loop:
SAD_INC_2ROW_Nx64 %1
dec r4d
jg .loop
- HADDUWD m0, m1
- HADDUWD m8, m1
HADDD m0, m1
- HADDD m8, m1
- paddd m0, m8
-
movd eax, xm0
RET
%endmacro
More information about the x265-devel mailing list