[x265] [PATCH] asm: fix mbtree_propagate_cost asm failure, fixes crash in OpenBSD

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Wed Nov 4 15:34:33 CET 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1446645042 -19800
#      Wed Nov 04 19:20:42 2015 +0530
# Node ID 25bada1bb5494fc12d62e87d1b7b788307dd963f
# Parent  c11dd97a8b999414c60dceef8620d3d9055cf4c1
asm: fix mbtree_propagate_cost asm failure, fixes crash in OpenBSD

The SSE2 asm code reads and writes 4 extra bytes if the loop counter is not a
multiple of 2, because it processes 2 int values per iteration.

The AVX asm code reads and writes 4, 8 or 12 extra bytes if the loop counter is
not a multiple of 4, because it processes 4 int values per iteration.

diff -r c11dd97a8b99 -r 25bada1bb549 source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm	Wed Nov 04 17:06:33 2015 +0530
+++ b/source/common/x86/mc-a2.asm	Wed Nov 04 19:20:42 2015 +0530
@@ -995,7 +995,8 @@
 ;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal mbtree_propagate_cost, 6,6,7
+cglobal mbtree_propagate_cost, 7,7,7
+    dec         r6d
     movsd       m6, [r5]
     mulpd       m6, [pd_inv256]
     xor         r5d, r5d
@@ -1044,8 +1045,40 @@
 
     movh        [r0+r5*4], m0
     add         r5d, 2
-    cmp         r5d, r6m
+    cmp         r5d, r6d
     jl         .loop
+
+    xor         r6d, r5d
+    jnz         .even
+    movd        m2, [r2+r5*4]       ; intra
+    movd        m0, [r4+r5*4]       ; invq
+    movd        m3, [r3+r5*2]       ; inter
+    pand        m3, m5
+    punpcklwd   m3, m4
+
+    ; PMINSD
+    pcmpgtd     m1, m2, m3
+    pand        m3, m1
+    pandn       m1, m2
+    por         m3, m1
+
+    movd        m1, [r1+r5*2]       ; prop
+    pmaddwd     m0, m2
+    punpcklwd   m1, m4
+    cvtdq2pd    m0, m0
+    mulpd       m0, m6              ; intra*invq*fps_factor>>8
+    cvtdq2pd    m1, m1              ; prop
+    addpd       m0, m1              ; prop + (intra*invq*fps_factor>>8)
+    cvtdq2pd    m1, m2              ; intra
+    psubd       m2, m3              ; intra - inter
+    cvtdq2pd    m2, m2              ; intra - inter
+    mulpd       m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+
+    divpd       m0, m1
+    addpd       m0, [pd_0_5]
+    cvttpd2dq    m0, m0
+    movd        [r0+r5*4], m0
+.even:
     RET
 
 
@@ -1055,7 +1088,8 @@
 ;-----------------------------------------------------------------------------
 ; FIXME: align loads/stores to 16 bytes
 %macro MBTREE_AVX 0
-cglobal mbtree_propagate_cost, 6,6,7
+cglobal mbtree_propagate_cost, 7,7,7
+    sub             r6d, 3
     vbroadcastsd    m6, [r5]
     mulpd           m6, [pd_inv256]
     xor             r5d, r5d
@@ -1089,9 +1123,81 @@
     cvttpd2dq       xm0, m0
 
     movu            [r0+r5*4], xm0
-    add             r5d, 4
-    cmp             r5d, r6m
+    add             r5d, 4              ; process 4 values in one iteration
+    cmp             r5d, r6d
     jl             .loop
+
+    add             r6d, 3
+    xor             r6d, r5d
+    jz              .even               ; if loop counter is multiple of 4, all values are processed
+
+    and             r6d, 3              ; otherwise, remaining unprocessed values must be 1, 2 or 3
+    cmp             r6d, 1
+    je              .process1           ; if only 1 value is unprocessed
+
+    ; process 2 values here
+    movq            xm2, [r2+r5*4]      ; intra
+    movq            xm0, [r4+r5*4]      ; invq
+    movd            xm3, [r3+r5*2]      ; inter
+    pmovzxwd        xm3, xm3
+    pand            xm3, xm5
+    pminsd          xm3, xm2
+
+    movd            xm1, [r1+r5*2]      ; prop
+    pmovzxwd        xm1, xm1
+    pmaddwd         xm0, xm2
+    cvtdq2pd        m0, xm0
+    cvtdq2pd        m1, xm1             ; prop
+%if cpuflag(avx2)
+    fmaddpd         m0, m0, m6, m1
+%else
+    mulpd           m0, m6              ; intra*invq*fps_factor>>8
+    addpd           m0, m1              ; prop + (intra*invq*fps_factor>>8)
+%endif
+    cvtdq2pd        m1, xm2             ; intra
+    psubd           xm2, xm3            ; intra - inter
+    cvtdq2pd        m2, xm2             ; intra - inter
+    mulpd           m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+
+    divpd           m0, m1
+    addpd           m0, [pd_0_5]
+    cvttpd2dq       xm0, m0
+    movq            [r0+r5*4], xm0
+
+    xor             r6d, 2
+    jz              .even
+    add             r5d, 2
+
+    ; process 1 value here
+.process1:
+    movd            xm2, [r2+r5*4]      ; intra
+    movd            xm0, [r4+r5*4]      ; invq
+    movzx           r6d, word [r3+r5*2] ; inter
+    movd            xm3, r6d
+    pand            xm3, xm5
+    pminsd          xm3, xm2
+
+    movzx           r6d, word [r1+r5*2] ; prop
+    movd            xm1, r6d
+    pmaddwd         xm0, xm2
+    cvtdq2pd        m0, xm0
+    cvtdq2pd        m1, xm1             ; prop
+%if cpuflag(avx2)
+    fmaddpd         m0, m0, m6, m1
+%else
+    mulpd           m0, m6              ; intra*invq*fps_factor>>8
+    addpd           m0, m1              ; prop + (intra*invq*fps_factor>>8)
+%endif
+    cvtdq2pd        m1, xm2             ; intra
+    psubd           xm2, xm3            ; intra - inter
+    cvtdq2pd        m2, xm2             ; intra - inter
+    mulpd           m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+
+    divpd           m0, m1
+    addpd           m0, [pd_0_5]
+    cvttpd2dq       xm0, m0
+    movd            [r0+r5*4], xm0
+.even:
     RET
 %endmacro
 
diff -r c11dd97a8b99 -r 25bada1bb549 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Wed Nov 04 17:06:33 2015 +0530
+++ b/source/test/pixelharness.cpp	Wed Nov 04 19:20:42 2015 +0530
@@ -1360,16 +1360,16 @@
     memset(opt_dest, 0xCD, sizeof(opt_dest));
 
     double fps = 1.0;
-    int width = 16 + rand() % 64;
     int j = 0;
 
     for (int i = 0; i < ITERS; i++)
     {
+        int width = 16 + rand() % 64;
         int index = i % TEST_CASES;
         checked(opt, opt_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
         ref(ref_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
 
-        if (memcmp(ref_dest, opt_dest, width * sizeof(pixel)))
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
             return false;
 
         reportfail();


More information about the x265-devel mailing list