[x265] [PATCH] asm: saoCuOrgE2[0] avx2 code: improve 154c->128c

Divya Manivannan divya at multicorewareinc.com
Thu Apr 23 15:19:10 CEST 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429794634 -19800
#      Thu Apr 23 18:40:34 2015 +0530
# Node ID 8d9267b2128bb647557def1b893d1c1bbada4e37
# Parent  861ffbedeaefd45eb6431d8ce6d5a3b4789f9a2c
asm: saoCuOrgE2[0] avx2 code: improve 154c->128c

diff -r 861ffbedeaef -r 8d9267b2128b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Apr 23 18:13:13 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Apr 23 18:40:34 2015 +0530
@@ -1707,6 +1707,7 @@
         p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
         p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
         p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
+        p.saoCuOrgE2[0] = x265_saoCuOrgE2_avx2;
         p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
         p.saoCuOrgE3[1] = x265_saoCuOrgE3_32_avx2;
         p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
diff -r 861ffbedeaef -r 8d9267b2128b source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Thu Apr 23 18:13:13 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Thu Apr 23 18:40:34 2015 +0530
@@ -452,6 +452,41 @@
          jnz         .loop
     RET
 
+INIT_YMM avx2
+cglobal saoCuOrgE2, 5, 6, 6, rec, bufft, buff1, offsetEo, lcuWidth
+    mov            r5d,   r5m                     ; r5 = 6th argument (stride in the C prototype); NOTE(review): 32-bit load of an intptr_t - assumes 0 <= stride < 2^32, confirm
+    pxor           xm0,   xm0                     ; xm0 = 0
+    mova           xm5,   [pb_128]                ; xm5 = pb_128 (defined elsewhere; presumably 0x80 in every byte, i.e. a sign-bit flip mask - confirm)
+    inc            r1                             ; r1 = bufft + 1 (destination of the sign store below)
+
+    movu           xm1,   [r0]                    ; xm1 = rec[x]
+    movu           xm2,   [r0 + r5 + 1]           ; xm2 = rec[x + stride + 1]
+    pxor           xm3,   xm1,   xm5              ; xm3 = rec ^ pb_128 (biased so the signed pcmpgtb can order the bytes)
+    pxor           xm4,   xm2,   xm5              ; xm4 = neighbour ^ pb_128
+    pcmpgtb        xm2,   xm3,   xm4              ; xm2 = 0xFF in bytes where biased rec > biased neighbour
+    pcmpgtb        xm4,   xm3                     ; xm4 = 0xFF in bytes where biased neighbour > biased rec
+    pand           xm2,   [pb_1]                  ; xm2 &= pb_1 (presumably 1 per byte, turning the mask into +1 - confirm)
+    por            xm2,   xm4                     ; xm2 = per-byte sign: +1, 0, or 0xFF (-1), given pb_1 = 1
+    movu           xm3,   [r2]                    ; xm3 = buff1
+
+    paddb          xm3,   xm2                     ; xm3 = buff1 + sign
+    paddb          xm3,   [pb_2]                  ; xm3 = edgeType
+
+    movu           xm4,   [r3]                    ; xm4 = offsetEo
+    pshufb         xm4,   xm3                     ; xm4 = offsetEo bytes selected by edgeType (per-byte table lookup)
+
+    psubb          xm3,   xm0,   xm2              ; xm3 = 0 - sign (negated sign)
+    movu           [r1],  xm3                     ; store 16 negated-sign bytes at bufft + 1
+
+    pmovzxbw       m2,    xm1                     ; m2 = 16 rec bytes zero-extended to words
+    pmovsxbw       m3,    xm4                     ; m3 = 16 selected offsets sign-extended to words
+
+    paddw          m2,    m3                      ; m2 = rec + offset, 16-bit lanes
+    vextracti128   xm3,   m2,    1                ; xm3 = upper 128 bits of m2 (words 8..15)
+    packuswb       xm2,   xm3                     ; xm2 = 16 sums packed to bytes with unsigned saturation
+    movu           [r0],  xm2                     ; write the 16 result bytes back to rec
+    RET                                           ; handles one 16-byte vector per call; lcuWidth (r4) is never read in this routine
+
+
 ;=======================================================================================================
 ;void saoCuOrgE3(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX)
 ;=======================================================================================================
diff -r 861ffbedeaef -r 8d9267b2128b source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Thu Apr 23 18:13:13 2015 +0530
+++ b/source/common/x86/loopfilter.h	Thu Apr 23 18:40:34 2015 +0530
@@ -32,6 +32,7 @@
 void x265_saoCuOrgE1_2Rows_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 void x265_saoCuOrgE1_2Rows_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
+void x265_saoCuOrgE2_avx2(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
 void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
 void x265_saoCuOrgE3_32_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);


More information about the x265-devel mailing list