[vlc-devel] [PATCH 07/16] block cipher: use one lookup table for sbox and permutation

glenvt18 glenvt18 at gmail.com
Fri Jun 26 13:20:01 CEST 2015


---
 src/dvbcsa_bs_block.c | 269 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 187 insertions(+), 82 deletions(-)

diff --git a/src/dvbcsa_bs_block.c b/src/dvbcsa_bs_block.c
index e4f532c..9bf36b1 100644
--- a/src/dvbcsa_bs_block.c
+++ b/src/dvbcsa_bs_block.c
@@ -26,14 +26,115 @@
 #include "dvbcsa/dvbcsa.h"
 #include "dvbcsa_bs.h"
 
-#define BS_XOREQ(a, b)	do { dvbcsa_bs_word_t *_t = &(a); *_t = BS_XOR(*_t, (b)); } while (0)
+/* SIMD targets which support loading streams of two interleaved bytes */
+#ifdef BS_LOAD_DEINTERLEAVE_8
+
+/*
+  sbox + permute table - both values are fetched with a single look-up
+
+  low byte of each 16-bit entry:  sbox output
+  high byte of each 16-bit entry: permuted sbox output
+*/
+static const uint16_t dvbcsa_block_sbox_perm[256] =
+  {
+    0xd43a, 0xd9ea, 0x5168, 0xfdfe, 0xc633, 0x5be9, 0x1888, 0x941a,
+    0x8a83, 0xbbcf, 0x4be1, 0xf77f, 0xdcba, 0xc9e2, 0x5438, 0x8412,
+    0x59e8, 0xe227, 0x4361, 0x2e95, 0x300c, 0xe436, 0x6be5, 0x4570,
+    0xc8a2, 0xa006, 0x8882, 0x757c, 0xa617, 0xcaa3, 0xe026, 0x1349,
+    0xfcbe, 0xd57a, 0x736d, 0xa347, 0x0bc1, 0x0751, 0xba8f, 0xcff3,
+    0x39cc, 0x975b, 0xe367, 0x7ebd, 0x3bcd, 0x1418, 0x1008, 0x1bc9,
+    0xffff, 0x5369, 0xfbef, 0x8203, 0xb14e, 0x1148, 0x914a, 0x2884,
+    0xf63f, 0x6cb4, 0x0410, 0x2004, 0x3ddc, 0x6ff5, 0x355c, 0xa9c6,
+    0xa416, 0xdaab, 0x78ac, 0x314c, 0x4ff1, 0xd16a, 0xf22f, 0x743c,
+    0xd63b, 0x2dd4, 0x2fd5, 0x2c94, 0x0dd0, 0x29c4, 0xc363, 0xc162,
+    0x4771, 0x4aa1, 0x5ff9, 0xb34f, 0xf02e, 0xd8aa, 0x2bc5, 0xa556,
+    0xcbe3, 0x5639, 0x8e93, 0xb9ce, 0x6365, 0x6164, 0x69e4, 0x1558,
+    0x716c, 0x1619, 0x8142, 0x5779, 0x3fdd, 0xf9ee, 0xac96, 0xedf6,
+    0x988a, 0x79ec, 0xb41e, 0x2a85, 0x8753, 0x2345, 0xbdde, 0xdebb,
+    0xf57e, 0x900a, 0x9c9a, 0x8613, 0xd02a, 0x3e9d, 0x89c2, 0xb55e,
+    0x955a, 0xb61f, 0xc432, 0x6635, 0x3c9c, 0x58a8, 0xc773, 0x4430,
+    0x5229, 0x763d, 0xebe7, 0x8c92, 0xaa87, 0x961b, 0xd22b, 0x934b,
+    0x6aa5, 0xa757, 0xae97, 0x0140, 0x2615, 0xe9e6, 0x7cbc, 0xb00e,
+    0xdbeb, 0x8bc3, 0x6434, 0x722d, 0x5cb8, 0x2144, 0x6225, 0x68a4,
+    0x341c, 0xabc7, 0xc223, 0x7bed, 0x0c90, 0xf16e, 0x0550, 0x0000,
+    0x1e99, 0xbc9e, 0x334d, 0x1fd9, 0x9dda, 0x3a8d, 0xf36f, 0xb75f,
+    0xf43e, 0xafd7, 0x4221, 0x6574, 0xa886, 0xbfdf, 0xd36b, 0x2205,
+    0xb88e, 0x375d, 0xe637, 0x0611, 0x8dd2, 0x5028, 0x6775, 0xadd6,
+    0xeaa7, 0xe777, 0x6024, 0xfebf, 0x4df0, 0x4cb0, 0x8002, 0xeeb7,
+    0x5df8, 0x7dfc, 0x0a81, 0x1209, 0x4eb1, 0x0201, 0xe576, 0x0e91,
+    0x777d, 0xb20f, 0x19c8, 0x48a0, 0xcdf2, 0x9bcb, 0x5578, 0x4160,
+    0x0fd1, 0xeff7, 0x49e0, 0x6eb5, 0x1c98, 0xc022, 0xceb3, 0x4020,
+    0x361d, 0xe8a6, 0x9fdb, 0xd77b, 0x1759, 0xbe9f, 0xf8ae, 0x4631,
+    0xdffb, 0x8fd3, 0xecb6, 0x99ca, 0x8343, 0xc572, 0xa207, 0x6df4,
+    0x1dd8, 0x0341, 0x2414, 0x2755, 0x320d, 0x2554, 0x9a8b, 0x5eb9,
+    0x7aad, 0xa146, 0x920b, 0xfaaf, 0x0880, 0x8552, 0x702c, 0xddfa,
+    0x388c, 0x1a89, 0xe166, 0x7ffd, 0xccb2, 0x5aa9, 0x9e9b, 0x09c0,
+  };
+
+#define BLOCK_SBOX_PERMUTE(in_buf, out_buf) \
+    { \
+    uint8_t *src = (uint8_t *)in_buf; \
+    uint16_t *dst = (uint16_t *)out_buf; \
+    uint8_t a, b, c, d; \
+    int j; \
+    for (j = 0; j < BS_BATCH_BYTES * 8; j += 4) \
+      { \
+        a = src[j + 0]; \
+        b = src[j + 1]; \
+        c = src[j + 2]; \
+        d = src[j + 3]; \
+        dst[j + 0] = dvbcsa_block_sbox_perm[a]; \
+        dst[j + 1] = dvbcsa_block_sbox_perm[b]; \
+        dst[j + 2] = dvbcsa_block_sbox_perm[c]; \
+        dst[j + 3] = dvbcsa_block_sbox_perm[d]; \
+      } \
+    }
+
+#else
+
+#define BLOCK_SBOX(in_buf, out_buf) \
+    { \
+    uint8_t *src = (uint8_t *)in_buf; \
+    uint8_t *dst = (uint8_t *)out_buf; \
+    uint8_t a, b, c, d; \
+    int j; \
+    for (j = 0; j < BS_BATCH_BYTES * 8; j += 4) \
+      { \
+        a = src[j + 0]; \
+        b = src[j + 1]; \
+        c = src[j + 2]; \
+        d = src[j + 3]; \
+        dst[j + 0] = dvbcsa_block_sbox[a]; \
+        dst[j + 1] = dvbcsa_block_sbox[b]; \
+        dst[j + 2] = dvbcsa_block_sbox[c]; \
+        dst[j + 3] = dvbcsa_block_sbox[d]; \
+      } \
+    }
+
+#define BLOCK_PERMUTE_LOGIC(in, out) \
+    { \
+    out = BS_OR( \
+        BS_OR( \
+              BS_OR (BS_SHL (BS_AND (in, BS_VAL8(29)), 1), \
+                     BS_SHL (BS_AND (in, BS_VAL8(02)), 6)), \
+              BS_OR (BS_SHL (BS_AND (in, BS_VAL8(04)), 3), \
+                     BS_SHR (BS_AND (in, BS_VAL8(10)), 2))), \
+        BS_OR(       BS_SHR (BS_AND (in, BS_VAL8(40)), 6), \
+                     BS_SHR (BS_AND (in, BS_VAL8(80)), 4))); \
+    }
+
+#endif
 
 DVBCSA_INLINE static inline void
 dvbcsa_bs_block_decrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_t *r)
 {
   dvbcsa_bs_word_t scratch1[8];
+#ifdef BS_LOAD_DEINTERLEAVE_8
+  dvbcsa_bs_word_t scratch2[8 * 2];
+#else
   dvbcsa_bs_word_t scratch2[8];
-  int i, j, g;
+#endif
+  int i, g;
 
   r += 8 * 56;
 
@@ -47,48 +148,48 @@ dvbcsa_bs_block_decrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_
       for (g = 0; g < 8; g++)
         scratch1[g] = BS_XOR(block[i], r6_N[g]);
 
-      /* sbox */
-      {
-      uint8_t *p1, *p2;
-      uint8_t a, b, c, d;
-
-      p1 = (uint8_t *)scratch1;
-      p2 = (uint8_t *)scratch2;
-      for (j = 0; j < BS_BATCH_BYTES * 8; j += 4)
-        {
-          a = p1[j + 0];
-          b = p1[j + 1];
-          c = p1[j + 2];
-          d = p1[j + 3];
-          p2[j + 0] = dvbcsa_block_sbox[a];
-          p2[j + 1] = dvbcsa_block_sbox[b];
-          p2[j + 2] = dvbcsa_block_sbox[c];
-          p2[j + 3] = dvbcsa_block_sbox[d];
-        }
-      }
+#ifdef BS_LOAD_DEINTERLEAVE_8
+      /* sbox + bit permutation */
+      BLOCK_SBOX_PERMUTE(scratch1, scratch2);
+#else
+      /* only sbox */
+      BLOCK_SBOX(scratch1, scratch2);
+#endif
 
       for (g = 0; g < 8; g++)
         {
-          dvbcsa_bs_word_t sbox_out = scratch2[g];
-          dvbcsa_bs_word_t w;
-
-          /* bit permutation */
-
-	  dvbcsa_bs_word_t in = BS_OR(
-				      BS_OR(
-					    BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(29)), 1),
-						   BS_SHL (BS_AND (sbox_out, BS_VAL8(02)), 6)),
-					    BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(04)), 3),
-						   BS_SHR (BS_AND (sbox_out, BS_VAL8(10)), 2))),
-				      BS_OR(       BS_SHR (BS_AND (sbox_out, BS_VAL8(40)), 6),
-					           BS_SHR (BS_AND (sbox_out, BS_VAL8(80)), 4)));
-
-          w = BS_XOR(r[8 * 8 + g], sbox_out);
-	  r[8 * 0 + g] = w;
-	  BS_XOREQ(r[8 * 2 + g], w);
-	  BS_XOREQ(r[8 * 3 + g], w);
-	  BS_XOREQ(r[8 * 4 + g], w);
-	  BS_XOREQ(r[8 * 6 + g], in);
+          dvbcsa_bs_word_t sbox_out, perm_out, w, tmp1, tmp2, tmp3, tmp4;
+
+#ifdef BS_LOAD_DEINTERLEAVE_8
+          BS_LOAD_DEINTERLEAVE_8(scratch2 + g * 2, sbox_out, perm_out);
+#else
+          sbox_out = scratch2[g];
+          BLOCK_PERMUTE_LOGIC(sbox_out, perm_out);
+#endif
+          /*
+              w = r[8 * 8 + g] ^ sbox_out;
+              r[8 * 0 + g] = w;
+              r[8 * 2 + g] ^= w;
+              r[8 * 3 + g] ^= w;
+              r[8 * 4 + g] ^= w;
+              r[8 * 6 + g] ^= perm_out;
+          */
+
+          w = r[8 * 8 + g];
+          tmp1 = r[8 * 2 + g];
+          tmp2 = r[8 * 3 + g];
+          tmp3 = r[8 * 4 + g];
+          w = BS_XOR(w, sbox_out);
+          tmp4 = r[8 * 6 + g];
+          tmp1 = BS_XOR(tmp1, w);
+          tmp2 = BS_XOR(tmp2, w);
+          tmp3 = BS_XOR(tmp3, w);
+          r[8 * 0 + g] = w;
+          r[8 * 2 + g] = tmp1;
+          tmp4 = BS_XOR(tmp4, perm_out);
+          r[8 * 3 + g] = tmp2;
+          r[8 * 4 + g] = tmp3;
+          r[8 * 6 + g] = tmp4;
 	}
     }
 }
@@ -132,8 +233,12 @@ DVBCSA_INLINE static inline void
 dvbcsa_bs_block_encrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_t *r)
 {
   dvbcsa_bs_word_t scratch1[8];
+#ifdef BS_LOAD_DEINTERLEAVE_8
+  dvbcsa_bs_word_t scratch2[8 * 2];
+#else
   dvbcsa_bs_word_t scratch2[8];
-  int i, j, g;
+#endif
+  int i, g;
 
   /* loop over kk[55]..kk[0] */
   for (i = 0; i < 56; i++)
@@ -143,50 +248,50 @@ dvbcsa_bs_block_encrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_
       r += 8;   /* virtual shift of registers */
 
       for (g = 0; g < 8; g++)
-          scratch1[g] = BS_XOR(block[i], r7_N[g]);
-
-      /* sbox */
-      {
-      uint8_t *p1, *p2;
-      uint8_t a, b, c, d;
+         scratch1[g] = BS_XOR(block[i], r7_N[g]);
 
-      p1 = (uint8_t *)scratch1;
-      p2 = (uint8_t *)scratch2;
-      for (j = 0; j < BS_BATCH_BYTES * 8; j += 4)
-        {
-          a = p1[j + 0];
-          b = p1[j + 1];
-          c = p1[j + 2];
-          d = p1[j + 3];
-          p2[j + 0] = dvbcsa_block_sbox[a];
-          p2[j + 1] = dvbcsa_block_sbox[b];
-          p2[j + 2] = dvbcsa_block_sbox[c];
-          p2[j + 3] = dvbcsa_block_sbox[d];
-        }
-      }
+#ifdef BS_LOAD_DEINTERLEAVE_8
+      /* sbox + bit permutation */
+      BLOCK_SBOX_PERMUTE(scratch1, scratch2);
+#else
+      /* only sbox */
+      BLOCK_SBOX(scratch1, scratch2);
+#endif
 
       for (g = 0; g < 8; g++)
         {
-          dvbcsa_bs_word_t sbox_out = scratch2[g];
-          dvbcsa_bs_word_t w = r[-8 * 1 + g];
-
-          /* bit permutation */
-
-	  dvbcsa_bs_word_t in = BS_OR(
-				      BS_OR(
-					    BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(29)), 1),
-						   BS_SHL (BS_AND (sbox_out, BS_VAL8(02)), 6)),
-					    BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(04)), 3),
-						   BS_SHR (BS_AND (sbox_out, BS_VAL8(10)), 2))),
-				      BS_OR(       BS_SHR (BS_AND (sbox_out, BS_VAL8(40)), 6),
-					           BS_SHR (BS_AND (sbox_out, BS_VAL8(80)), 4)));
-
-
-	  r[8 * 7 + g] = BS_XOR(w, sbox_out);
-	  BS_XOREQ(r[8 * 1 + g], w);
-	  BS_XOREQ(r[8 * 2 + g], w);
-	  BS_XOREQ(r[8 * 3 + g], w);
-	  BS_XOREQ(r[8 * 5 + g], in);
+          dvbcsa_bs_word_t sbox_out, perm_out, w, tmp1, tmp2, tmp3, tmp4;
+
+#ifdef BS_LOAD_DEINTERLEAVE_8
+          BS_LOAD_DEINTERLEAVE_8(scratch2 + g * 2, sbox_out, perm_out);
+#else
+          sbox_out = scratch2[g];
+          BLOCK_PERMUTE_LOGIC(sbox_out, perm_out);
+#endif
+          /*
+              w = r[-8 * 1 + g];
+              r[8 * 7 + g] = w ^ sbox_out;
+              r[8 * 1 + g] ^= w;
+              r[8 * 2 + g] ^= w;
+              r[8 * 3 + g] ^= w;
+              r[8 * 5 + g] ^= perm_out;
+          */
+
+          w = r[-8 * 1 + g];
+          tmp1 = r[8 * 1 + g];
+          tmp2 = r[8 * 2 + g];
+          tmp3 = r[8 * 3 + g];
+          sbox_out = BS_XOR(sbox_out, w);
+          tmp4 = r[8 * 5 + g];
+          tmp1 = BS_XOR(tmp1, w);
+          tmp2 = BS_XOR(tmp2, w);
+          tmp3 = BS_XOR(tmp3, w);
+          r[8 * 7 + g] = sbox_out;
+          r[8 * 1 + g] = tmp1;
+          tmp4 = BS_XOR(tmp4, perm_out);
+          r[8 * 2 + g] = tmp2;
+          r[8 * 3 + g] = tmp3;
+          r[8 * 5 + g] = tmp4;
 	}
     }
 }
-- 
1.9.1




More information about the vlc-devel mailing list