<div dir="ltr"><div dir="ltr"><div>From e5ad11568ec5c7b5a9d624d8f9d8f5810f14f546 Mon Sep 17 00:00:00 2001</div><div>From: Harshitha Suresh <<a href="mailto:harshitha@multicorewareinc.com">harshitha@multicorewareinc.com</a>></div><div>Date: Fri, 13 Sep 2024 13:54:08 +0530</div><div>Subject: [PATCH] AArch64: Add 8bit and 10bit neon intrinsics for intraFilter</div><div> and intrapred DC</div><div><br></div><div>---</div><div> source/common/aarch64/intrapred-prim.cpp | 335 ++++++++++++++++++++++-</div><div> 1 file changed, 321 insertions(+), 14 deletions(-)</div><div><br></div><div>diff --git a/source/common/aarch64/intrapred-prim.cpp b/source/common/aarch64/intrapred-prim.cpp</div><div>index 8624dd2a6..68a9f26ac 100644</div><div>--- a/source/common/aarch64/intrapred-prim.cpp</div><div>+++ b/source/common/aarch64/intrapred-prim.cpp</div><div>@@ -2,7 +2,7 @@</div><div> #include "primitives.h"</div><div> </div><div> </div><div>-#if 1</div><div>+#if HAVE_NEON</div><div> #include "arm64-utils.h"</div><div> #include <arm_neon.h></div><div> </div><div>@@ -12,6 +12,52 @@ namespace</div><div> {</div><div> </div><div> </div><div>+template<int tuSize></div><div>+void intraFilter_neon(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */</div><div>+{</div><div>+    const int tuSize2 = tuSize << 1;</div><div>+    pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2];</div><div>+</div><div>+    uint16x8_t two_vec = vdupq_n_u16(2);</div><div>+#if !HIGH_BIT_DEPTH</div><div>+    {</div><div>+        for(int i = 0; i < tuSize2 + tuSize2; i+=8)</div><div>+         {</div><div>+            uint16x8_t sample1 = vmovl_u8(vld1_u8(&samples[i]));</div><div>+            uint16x8_t sample2 = vmovl_u8(vld1_u8(&samples[i-1]));</div><div>+            uint16x8_t sample3 = vmovl_u8(vld1_u8(&samples[i+1]));</div><div>+</div><div>+            uint16x8_t result1 = vaddq_u16(vshlq_n_u16(sample1,1), sample2 );</div><div>+            uint16x8_t result2 = vaddq_u16(sample3, two_vec);</div><div>+            uint16x8_t result3 = vaddq_u16(result1,result2);</div><div>+            vst1_u8(&filtered[i] , vmovn_u16(vshrq_n_u16(result3, 2)));</div><div>+        }</div><div>+    }</div><div>+#else</div><div>+    {</div><div>+        for(int i = 0; i < tuSize2 + tuSize2; i+=8)</div><div>+        {</div><div>+            uint16x8_t sample1 = vld1q_u16(&samples[i]);</div><div>+            uint16x8_t sample2 = vld1q_u16(&samples[i-1]);</div><div>+            uint16x8_t sample3 = vld1q_u16(&samples[i+1]);</div><div>+</div><div>+            uint16x8_t result1 = vaddq_u16(vshlq_n_u16(sample1,1), sample2 );</div><div>+            uint16x8_t result2 = vaddq_u16(sample3, two_vec);</div><div>+            uint16x8_t result3 = vaddq_u16(result1,result2);</div><div>+            vst1q_u16(&filtered[i] , vshrq_n_u16(result3, 2));</div><div>+        }</div><div>+    }</div><div>+#endif</div><div>+    // filtering top</div><div>+    filtered[tuSize2] = topLast;</div><div>+</div><div>+    // filtering top-left</div><div>+    filtered[0] = ((topLeft << 1) + samples[1] + samples[tuSize2 + 1] + 2) >> 2;</div><div>+</div><div>+    // filtering left</div><div>+    filtered[tuSize2 + 1] = ((samples[tuSize2 + 1] << 1) + topLeft + samples[tuSize2 + 2] + 2) >> 2;</div><div>+    filtered[tuSize2 + tuSize2] = leftLast;</div><div>+}</div><div> </div><div> template<int width></div><div> void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)</div><div>@@ -188,6 +234,7 @@ void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, i</div><div>     }</div><div> }</div><div> </div><div>+#endif</div><div> template<int log2Size></div><div> void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)</div><div> {</div><div>@@ -232,6 +279,270 @@ void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)</div><div>         }</div><div>     }</div><div> }</div><div>+</div><div>+template<int log2Size></div><div>+void planar_pred_neon(pixel * dst, intptr_t dstStride, const pixel * srcPix, int /*dirMode*/, int /*bFilter*/)</div><div>+{</div><div>+    const int blkSize = 1 << log2Size;</div><div>+</div><div>+    const pixel* above = srcPix + 1;</div><div>+    const pixel* left = srcPix + (2 * blkSize + 1);</div><div>+</div><div>+    switch (blkSize) {</div><div>+    case 8:</div><div>+    {</div><div>+        const uint16_t log2SizePlusOne = log2Size + 1;</div><div>+        uint16x8_t blkSizeVec = vdupq_n_u16(blkSize);</div><div>+        uint16x8_t topRight = vdupq_n_u16(above[blkSize]);</div><div>+        uint16_t bottomLeft = left[blkSize];</div><div>+        uint16x8_t oneVec = vdupq_n_u16(1);</div><div>+        uint16x8_t blkSizeSubOneVec = vdupq_n_u16(blkSize - 1);</div><div>+</div><div>+        for (int y = 0; y < blkSize; y++) {</div><div>+            // (blkSize - 1 - y)</div><div>+            uint16x8_t vlkSizeYVec = vdupq_n_u16(blkSize - 1 - y);</div><div>+            // (y + 1) * bottomLeft</div><div>+            uint16x8_t bottomLeftYVec = vdupq_n_u16((y + 1) * bottomLeft);</div><div>+            // left[y]</div><div>+            uint16x8_t leftYVec = vdupq_n_u16(left[y]);</div><div>+</div><div>+            for (int x = 0; x < blkSize; x += 8) {</div><div>+                int idx = y * dstStride + x;</div><div>+                uint16x8_t xvec = { (uint16_t)(x + 0), (uint16_t)(x + 1),</div><div>+                                    (uint16_t)(x + 2), (uint16_t)(x + 3),</div><div>+                                    (uint16_t)(x + 4), (uint16_t)(x + 5),</div><div>+                                    (uint16_t)(x + 6), (uint16_t)(x + 7) };</div><div>+</div><div>+                // (blkSize - 1 - y) * above[x]</div><div>+                uint16x8_t aboveVec = { (uint16_t)(above[x + 0]),</div><div>+                                        (uint16_t)(above[x + 1]),</div><div>+                                        (uint16_t)(above[x + 2]),</div><div>+                                        (uint16_t)(above[x + 3]),</div><div>+                                        (uint16_t)(above[x + 4]),</div><div>+                                        (uint16_t)(above[x + 5]),</div><div>+                                        (uint16_t)(above[x + 6]),</div><div>+                                        (uint16_t)(above[x + 7]) };</div><div>+</div><div>+                aboveVec = vmulq_u16(aboveVec, vlkSizeYVec);</div><div>+</div><div>+                // (blkSize - 1 - x) * left[y]</div><div>+                uint16x8_t first = vsubq_u16(blkSizeSubOneVec, xvec);</div><div>+                first = vmulq_u16(first, leftYVec);</div><div>+</div><div>+                // (x + 1) * topRight</div><div>+                uint16x8_t second = vaddq_u16(xvec, oneVec);</div><div>+                second = vmulq_u16(second, topRight);</div><div>+</div><div>+                uint16x8_t resVec = vaddq_u16(first, second);</div><div>+                resVec = vaddq_u16(resVec, aboveVec);</div><div>+                resVec = vaddq_u16(resVec, bottomLeftYVec);</div><div>+                resVec = vaddq_u16(resVec, blkSizeVec);</div><div>+                resVec = vshrq_n_u16(resVec, log2SizePlusOne);</div><div>+</div><div>+                for (int i = 0; i < 8; i++)</div><div>+                    dst[idx + i] = (pixel)resVec[i];</div><div>+    }</div><div>+}</div><div>+        }</div><div>+    break;</div><div>+    case 4:</div><div>+    case 32:</div><div>+    case 16:</div><div>+    {</div><div>+        const uint32_t log2SizePlusOne = log2Size + 1;</div><div>+        uint32x4_t blkSizeVec = vdupq_n_u32(blkSize);</div><div>+        uint32x4_t topRight = vdupq_n_u32(above[blkSize]);</div><div>+        uint32_t bottomLeft = left[blkSize];</div><div>+        uint32x4_t oneVec = vdupq_n_u32(1);</div><div>+        uint32x4_t blkSizeSubOneVec = vdupq_n_u32(blkSize - 1);</div><div>+</div><div>+        for (int y = 0; y < blkSize; y++) {</div><div>+            // (blkSize - 1 - y)</div><div>+            uint32x4_t vlkSizeYVec = vdupq_n_u32(blkSize - 1 - y);</div><div>+            // (y + 1) * bottomLeft</div><div>+            uint32x4_t bottomLeftYVec = vdupq_n_u32((y + 1) * bottomLeft);</div><div>+            // left[y]</div><div>+            uint32x4_t leftYVec = vdupq_n_u32(left[y]);</div><div>+</div><div>+            for (int x = 0; x < blkSize; x += 4) {</div><div>+                int idx = y * dstStride + x;</div><div>+                uint32x4_t xvec = { (uint32_t)(x + 0), (uint32_t)(x + 1),</div><div>+                                    (uint32_t)(x + 2), (uint32_t)(x + 3) };</div><div>+</div><div>+                // (blkSize - 1 - y) * above[x]</div><div>+                uint32x4_t aboveVec = { (uint32_t)(above[x + 0]),</div><div>+                                        (uint32_t)(above[x + 1]),</div><div>+                                        (uint32_t)(above[x + 2]),</div><div>+                                        (uint32_t)(above[x + 3]) };</div><div>+                aboveVec = vmulq_u32(aboveVec, vlkSizeYVec);</div><div>+</div><div>+                // (blkSize - 1 - x) * left[y]</div><div>+                uint32x4_t first = vsubq_u32(blkSizeSubOneVec, xvec);</div><div>+                first = vmulq_u32(first, leftYVec);</div><div>+</div><div>+                // (x + 1) * topRight</div><div>+                uint32x4_t second = vaddq_u32(xvec, oneVec);</div><div>+                second = vmulq_u32(second, topRight);</div><div>+</div><div>+                uint32x4_t resVec = vaddq_u32(first, second);</div><div>+                resVec = vaddq_u32(resVec, aboveVec);</div><div>+                resVec = vaddq_u32(resVec, bottomLeftYVec);</div><div>+                resVec = vaddq_u32(resVec, blkSizeVec);</div><div>+                resVec = vshrq_n_u32(resVec, log2SizePlusOne);</div><div>+</div><div>+                for (int i = 0; i < 4; i++)</div><div>+                    dst[idx + i] = (pixel)resVec[i];</div><div>+            }</div><div>+        }</div><div>+    }</div><div>+    break;</div><div>+        }</div><div>+}</div><div>+</div><div>+static void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size)</div><div>+{</div><div>+    // boundary pixels processing</div><div>+    pixel topLeft = (pixel)((above[0] + left[0] + 2 * dst[0] + 2) >> 2);</div><div>+    pixel * pdst = dst;</div><div>+</div><div>+    switch (size) {</div><div>+    case 32:</div><div>+    case 16:</div><div>+    case 8:</div><div>+    {</div><div>+        uint16x8_t vconst_3 = vdupq_n_u16(3);</div><div>+        uint16x8_t vconst_2 = vdupq_n_u16(2);</div><div>+        for (int x = 0; x < size; x += 8) {</div><div>+            uint16x8_t vabo = { (uint16_t)(above[x + 0]),</div><div>+                                (uint16_t)(above[x + 1]),</div><div>+                                (uint16_t)(above[x + 2]),</div><div>+                                (uint16_t)(above[x + 3]),</div><div>+                                (uint16_t)(above[x + 4]),</div><div>+                                (uint16_t)(above[x + 5]),</div><div>+                                (uint16_t)(above[x + 6]),</div><div>+                                (uint16_t)(above[x + 7]) };</div><div>+</div><div>+            uint16x8_t vdst = { (uint16_t)(dst[x + 0]),</div><div>+                                (uint16_t)(dst[x + 1]),</div><div>+                                (uint16_t)(dst[x + 2]),</div><div>+                                (uint16_t)(dst[x + 3]),</div><div>+                                (uint16_t)(dst[x + 4]),</div><div>+                                (uint16_t)(dst[x + 5]),</div><div>+                                (uint16_t)(dst[x + 6]),</div><div>+                                (uint16_t)(dst[x + 7]) };</div><div>+            //  dst[x] = (pixel)((above[x] +  3 * dst[x] + 2) >> 2);</div><div>+            vdst = vmulq_u16(vdst, vconst_3);</div><div>+            vdst = vaddq_u16(vdst, vabo);</div><div>+            vdst = vaddq_u16(vdst, vconst_2);</div><div>+            vdst = vshrq_n_u16(vdst, 2);</div><div>+            for (int i = 0; i < 8; i++)</div><div>+                dst[x + i] = (pixel)(vdst[i]);</div><div>+        }</div><div>+        dst += dststride;</div><div>+        for (int y = 1; y < size; y++)</div><div>+        {</div><div>+            *dst = (pixel)((left[y] + 3 * *dst + 2) >> 2);</div><div>+            dst += dststride;</div><div>+        }</div><div>+    }</div><div>+    break;</div><div>+    case 4:</div><div>+    {</div><div>+        uint16x4_t vconst_3 = vdup_n_u16(3);</div><div>+        uint16x4_t vconst_2 = vdup_n_u16(2);</div><div>+        uint16x4_t vabo = { (uint16_t)(above[0]),</div><div>+                            (uint16_t)(above[1]),</div><div>+                            (uint16_t)(above[2]),</div><div>+                            (uint16_t)(above[3]) };</div><div>+        uint16x4_t vdstx = { (uint16_t)(dst[0]),</div><div>+                             (uint16_t)(dst[1]),</div><div>+                             (uint16_t)(dst[2]),</div><div>+                             (uint16_t)(dst[3]) };</div><div>+        vdstx = vmul_u16(vdstx, vconst_3);</div><div>+        vdstx = vadd_u16(vdstx, vabo);</div><div>+        vdstx = vadd_u16(vdstx, vconst_2);</div><div>+        vdstx = vshr_n_u16(vdstx, 2);</div><div>+        for (int i = 0; i < 4; i++)</div><div>+            dst[i] = (pixel)(vdstx[i]);</div><div>+</div><div>+        dst += dststride;</div><div>+        for (int y = 1; y < size; y++)</div><div>+        {</div><div>+            *dst = (pixel)((left[y] + 3 * *dst + 2) >> 2);</div><div>+            dst += dststride;</div><div>+        }</div><div>+    }</div><div>+    break;</div><div>+    }</div><div>+</div><div>+    *pdst = topLeft;</div><div>+}</div><div>+</div><div>+template<int width></div><div>+void intra_pred_dc_neon(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int bFilter)</div><div>+{</div><div>+    int k, l;</div><div>+    int dcVal = width;</div><div>+</div><div>+    switch (width) {</div><div>+    case 32:</div><div>+    case 16:</div><div>+    case 8:</div><div>+    {</div><div>+        for (int i = 0; i < width; i += 8) {</div><div>+            uint16x8_t spa = { (uint16_t)(srcPix[i + 1]),</div><div>+                               (uint16_t)(srcPix[i + 2]),</div><div>+                               (uint16_t)(srcPix[i + 3]),</div><div>+                               (uint16_t)(srcPix[i + 4]),</div><div>+                               (uint16_t)(srcPix[i + 5]),</div><div>+                               (uint16_t)(srcPix[i + 6]),</div><div>+                               (uint16_t)(srcPix[i + 7]),</div><div>+                               (uint16_t)(srcPix[i + 8]) };</div><div>+            uint16x8_t spb = { (uint16_t)(srcPix[2 * width + i + 1]),</div><div>+                               (uint16_t)(srcPix[2 * width + i + 2]),</div><div>+                               (uint16_t)(srcPix[2 * width + i + 3]),</div><div>+                               (uint16_t)(srcPix[2 * width + i + 4]),</div><div>+                               (uint16_t)(srcPix[2 * width + i + 5]),</div><div>+                               (uint16_t)(srcPix[2 * width + i + 6]),</div><div>+                               (uint16_t)(srcPix[2 * width + i + 7]),</div><div>+                               (uint16_t)(srcPix[2 * width + i + 8]) };</div><div>+            uint16x8_t vsp = vaddq_u16(spa, spb);</div><div>+            dcVal += vaddlvq_u16(vsp);</div><div>+        }</div><div>+</div><div>+        dcVal = dcVal / (width + width);</div><div>+        for (k = 0; k < width; k++)</div><div>+            for (l = 0; l < width; l += 8) {</div><div>+                uint16x8_t vdv = vdupq_n_u16((pixel)dcVal);</div><div>+                for (int n = 0; n < 8; n++)</div><div>+                    dst[k * dstStride + l + n] = (pixel)(vdv[n]);</div><div>+            }</div><div>+    }</div><div>+    break;</div><div>+    case 4:</div><div>+    {</div><div>+        uint16x4_t spa = { (uint16_t)(srcPix[1]), (uint16_t)(srcPix[2]),</div><div>+                           (uint16_t)(srcPix[3]), (uint16_t)(srcPix[4]) };</div><div>+        uint16x4_t spb = { (uint16_t)(srcPix[2 * width + 1]),</div><div>+                           (uint16_t)(srcPix[2 * width + 2]),</div><div>+                           (uint16_t)(srcPix[2 * width + 3]),</div><div>+                           (uint16_t)(srcPix[2 * width + 4]) };</div><div>+        uint16x4_t vsp = vadd_u16(spa, spb);</div><div>+        dcVal += vaddlv_u16(vsp);</div><div>+</div><div>+        dcVal = dcVal / (width + width);</div><div>+        for (k = 0; k < width; k++) {</div><div>+            uint16x4_t vdv = vdup_n_u16((pixel)dcVal);</div><div>+            for (int n = 0; n < 4; n++)</div><div>+                dst[k * dstStride + n] = (pixel)(vdv[n]);</div><div>+        }</div><div>+    }</div><div>+    break;</div><div>+    }</div><div>+</div><div>+    if (bFilter)</div><div>+        dcPredFilter(srcPix + 1, srcPix + (2 * width + 1), dst, dstStride, width);</div><div>+}</div><div> }</div><div> </div><div> namespace X265_NS</div><div>@@ -242,6 +553,11 @@ extern "C" void PFX(intra_pred_planar16_neon)(pixel* dst, intptr_t dstStride, co</div><div> </div><div> void setupIntraPrimitives_neon(EncoderPrimitives &p)</div><div> {</div><div>+    <a href="http://p.cu">p.cu</a>[BLOCK_4x4].intra_filter = intraFilter_neon<4>;</div><div>+    <a href="http://p.cu">p.cu</a>[BLOCK_8x8].intra_filter = intraFilter_neon<8>;</div><div>+    <a href="http://p.cu">p.cu</a>[BLOCK_16x16].intra_filter = intraFilter_neon<16>;</div><div>+    <a href="http://p.cu">p.cu</a>[BLOCK_32x32].intra_filter = intraFilter_neon<32>;</div><div>+</div><div>     for (int i = 2; i < NUM_INTRA_MODE; i++)</div><div>     {</div><div>         <a href="http://p.cu">p.cu</a>[BLOCK_8x8].intra_pred[i] = intra_pred_ang_neon<8>;</div><div>@@ -263,22 +579,13 @@ void setupIntraPrimitives_neon(EncoderPrimitives &p)</div><div>     <a href="http://p.cu">p.cu</a>[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_neon);</div><div>     <a href="http://p.cu">p.cu</a>[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_neon);</div><div> #endif</div><div>-}</div><div> </div><div>+    <a href="http://p.cu">p.cu</a>[BLOCK_4x4].intra_pred[DC_IDX] = intra_pred_dc_neon<4>;</div><div>+    <a href="http://p.cu">p.cu</a>[BLOCK_8x8].intra_pred[DC_IDX] = intra_pred_dc_neon<8>;</div><div>+    <a href="http://p.cu">p.cu</a>[BLOCK_16x16].intra_pred[DC_IDX] = intra_pred_dc_neon<16>;</div><div>+    <a href="http://p.cu">p.cu</a>[BLOCK_32x32].intra_pred[DC_IDX] = intra_pred_dc_neon<32>;</div><div> }</div><div>-</div><div>-</div><div>-</div><div>-#else</div><div>-</div><div>-namespace X265_NS</div><div>-{</div><div>-// x265 private namespace</div><div>-void setupIntraPrimitives_neon(EncoderPrimitives &p)</div><div>-{}</div><div> }</div><div> </div><div>-#endif</div><div>-</div><div> </div><div> </div><div>-- </div><div>2.36.0.windows.1</div><div><br></div><div><div dir="ltr" class="gmail_signature"><div dir="ltr"><div><b>__________________________</b></div><div><b>Karam Singh</b></div><div><b>Ph.D. IIT Guwahati</b></div><div><font size="1">Senior Software (Video Coding) Engineer  </font></div><div><font size="1">Mobile: +91 8011279030</font></div><div><font size="1">Block 9A, 6th floor, DLF Cyber City</font></div><div><font size="1">Manapakkam, Chennai 600 089</font></div></div></div></div></div></div>