[vlc-devel] [PATCH 07/16] block cipher: use one lookup table for sbox and permutation
glenvt18
glenvt18 at gmail.com
Fri Jun 26 13:20:01 CEST 2015
---
src/dvbcsa_bs_block.c | 269 +++++++++++++++++++++++++++++++++++---------------
1 file changed, 187 insertions(+), 82 deletions(-)
diff --git a/src/dvbcsa_bs_block.c b/src/dvbcsa_bs_block.c
index e4f532c..9bf36b1 100644
--- a/src/dvbcsa_bs_block.c
+++ b/src/dvbcsa_bs_block.c
@@ -26,14 +26,115 @@
#include "dvbcsa/dvbcsa.h"
#include "dvbcsa_bs.h"
-#define BS_XOREQ(a, b) do { dvbcsa_bs_word_t *_t = &(a); *_t = BS_XOR(*_t, (b)); } while (0)
+/* SIMD targets which support loading streams of two interleaved bytes */
+#ifdef BS_LOAD_DEINTERLEAVE_8
+
+/*
+ Combined sbox + permutation table: one look-up yields both values.
+
+ Low byte (bits 0-7): sbox output
+ High byte (bits 8-15): bit-permuted sbox output
+*/
+static const uint16_t dvbcsa_block_sbox_perm[256] =
+ {
+ 0xd43a, 0xd9ea, 0x5168, 0xfdfe, 0xc633, 0x5be9, 0x1888, 0x941a,
+ 0x8a83, 0xbbcf, 0x4be1, 0xf77f, 0xdcba, 0xc9e2, 0x5438, 0x8412,
+ 0x59e8, 0xe227, 0x4361, 0x2e95, 0x300c, 0xe436, 0x6be5, 0x4570,
+ 0xc8a2, 0xa006, 0x8882, 0x757c, 0xa617, 0xcaa3, 0xe026, 0x1349,
+ 0xfcbe, 0xd57a, 0x736d, 0xa347, 0x0bc1, 0x0751, 0xba8f, 0xcff3,
+ 0x39cc, 0x975b, 0xe367, 0x7ebd, 0x3bcd, 0x1418, 0x1008, 0x1bc9,
+ 0xffff, 0x5369, 0xfbef, 0x8203, 0xb14e, 0x1148, 0x914a, 0x2884,
+ 0xf63f, 0x6cb4, 0x0410, 0x2004, 0x3ddc, 0x6ff5, 0x355c, 0xa9c6,
+ 0xa416, 0xdaab, 0x78ac, 0x314c, 0x4ff1, 0xd16a, 0xf22f, 0x743c,
+ 0xd63b, 0x2dd4, 0x2fd5, 0x2c94, 0x0dd0, 0x29c4, 0xc363, 0xc162,
+ 0x4771, 0x4aa1, 0x5ff9, 0xb34f, 0xf02e, 0xd8aa, 0x2bc5, 0xa556,
+ 0xcbe3, 0x5639, 0x8e93, 0xb9ce, 0x6365, 0x6164, 0x69e4, 0x1558,
+ 0x716c, 0x1619, 0x8142, 0x5779, 0x3fdd, 0xf9ee, 0xac96, 0xedf6,
+ 0x988a, 0x79ec, 0xb41e, 0x2a85, 0x8753, 0x2345, 0xbdde, 0xdebb,
+ 0xf57e, 0x900a, 0x9c9a, 0x8613, 0xd02a, 0x3e9d, 0x89c2, 0xb55e,
+ 0x955a, 0xb61f, 0xc432, 0x6635, 0x3c9c, 0x58a8, 0xc773, 0x4430,
+ 0x5229, 0x763d, 0xebe7, 0x8c92, 0xaa87, 0x961b, 0xd22b, 0x934b,
+ 0x6aa5, 0xa757, 0xae97, 0x0140, 0x2615, 0xe9e6, 0x7cbc, 0xb00e,
+ 0xdbeb, 0x8bc3, 0x6434, 0x722d, 0x5cb8, 0x2144, 0x6225, 0x68a4,
+ 0x341c, 0xabc7, 0xc223, 0x7bed, 0x0c90, 0xf16e, 0x0550, 0x0000,
+ 0x1e99, 0xbc9e, 0x334d, 0x1fd9, 0x9dda, 0x3a8d, 0xf36f, 0xb75f,
+ 0xf43e, 0xafd7, 0x4221, 0x6574, 0xa886, 0xbfdf, 0xd36b, 0x2205,
+ 0xb88e, 0x375d, 0xe637, 0x0611, 0x8dd2, 0x5028, 0x6775, 0xadd6,
+ 0xeaa7, 0xe777, 0x6024, 0xfebf, 0x4df0, 0x4cb0, 0x8002, 0xeeb7,
+ 0x5df8, 0x7dfc, 0x0a81, 0x1209, 0x4eb1, 0x0201, 0xe576, 0x0e91,
+ 0x777d, 0xb20f, 0x19c8, 0x48a0, 0xcdf2, 0x9bcb, 0x5578, 0x4160,
+ 0x0fd1, 0xeff7, 0x49e0, 0x6eb5, 0x1c98, 0xc022, 0xceb3, 0x4020,
+ 0x361d, 0xe8a6, 0x9fdb, 0xd77b, 0x1759, 0xbe9f, 0xf8ae, 0x4631,
+ 0xdffb, 0x8fd3, 0xecb6, 0x99ca, 0x8343, 0xc572, 0xa207, 0x6df4,
+ 0x1dd8, 0x0341, 0x2414, 0x2755, 0x320d, 0x2554, 0x9a8b, 0x5eb9,
+ 0x7aad, 0xa146, 0x920b, 0xfaaf, 0x0880, 0x8552, 0x702c, 0xddfa,
+ 0x388c, 0x1a89, 0xe166, 0x7ffd, 0xccb2, 0x5aa9, 0x9e9b, 0x09c0,
+ };
+
+#define BLOCK_SBOX_PERMUTE(in_buf, out_buf) \
+ { \
+ uint8_t *src = (uint8_t *)in_buf; \
+ uint16_t *dst = (uint16_t *)out_buf; \
+ uint8_t a, b, c, d; \
+ int j; \
+ for (j = 0; j < BS_BATCH_BYTES * 8; j += 4) \
+ { \
+ a = src[j + 0]; \
+ b = src[j + 1]; \
+ c = src[j + 2]; \
+ d = src[j + 3]; \
+ dst[j + 0] = dvbcsa_block_sbox_perm[a]; \
+ dst[j + 1] = dvbcsa_block_sbox_perm[b]; \
+ dst[j + 2] = dvbcsa_block_sbox_perm[c]; \
+ dst[j + 3] = dvbcsa_block_sbox_perm[d]; \
+ } \
+ }
+
+#else
+
+#define BLOCK_SBOX(in_buf, out_buf) \
+ { \
+ uint8_t *src = (uint8_t *)in_buf; \
+ uint8_t *dst = (uint8_t *)out_buf; \
+ uint8_t a, b, c, d; \
+ int j; \
+ for (j = 0; j < BS_BATCH_BYTES * 8; j += 4) \
+ { \
+ a = src[j + 0]; \
+ b = src[j + 1]; \
+ c = src[j + 2]; \
+ d = src[j + 3]; \
+ dst[j + 0] = dvbcsa_block_sbox[a]; \
+ dst[j + 1] = dvbcsa_block_sbox[b]; \
+ dst[j + 2] = dvbcsa_block_sbox[c]; \
+ dst[j + 3] = dvbcsa_block_sbox[d]; \
+ } \
+ }
+
+#define BLOCK_PERMUTE_LOGIC(in, out) \
+ { \
+ out = BS_OR( \
+ BS_OR( \
+ BS_OR (BS_SHL (BS_AND (in, BS_VAL8(29)), 1), \
+ BS_SHL (BS_AND (in, BS_VAL8(02)), 6)), \
+ BS_OR (BS_SHL (BS_AND (in, BS_VAL8(04)), 3), \
+ BS_SHR (BS_AND (in, BS_VAL8(10)), 2))), \
+ BS_OR( BS_SHR (BS_AND (in, BS_VAL8(40)), 6), \
+ BS_SHR (BS_AND (in, BS_VAL8(80)), 4))); \
+ }
+
+#endif
DVBCSA_INLINE static inline void
dvbcsa_bs_block_decrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_t *r)
{
dvbcsa_bs_word_t scratch1[8];
+#ifdef BS_LOAD_DEINTERLEAVE_8
+ dvbcsa_bs_word_t scratch2[8 * 2];
+#else
dvbcsa_bs_word_t scratch2[8];
- int i, j, g;
+#endif
+ int i, g;
r += 8 * 56;
@@ -47,48 +148,48 @@ dvbcsa_bs_block_decrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_
for (g = 0; g < 8; g++)
scratch1[g] = BS_XOR(block[i], r6_N[g]);
- /* sbox */
- {
- uint8_t *p1, *p2;
- uint8_t a, b, c, d;
-
- p1 = (uint8_t *)scratch1;
- p2 = (uint8_t *)scratch2;
- for (j = 0; j < BS_BATCH_BYTES * 8; j += 4)
- {
- a = p1[j + 0];
- b = p1[j + 1];
- c = p1[j + 2];
- d = p1[j + 3];
- p2[j + 0] = dvbcsa_block_sbox[a];
- p2[j + 1] = dvbcsa_block_sbox[b];
- p2[j + 2] = dvbcsa_block_sbox[c];
- p2[j + 3] = dvbcsa_block_sbox[d];
- }
- }
+#ifdef BS_LOAD_DEINTERLEAVE_8
+ /* sbox + bit permutation */
+ BLOCK_SBOX_PERMUTE(scratch1, scratch2);
+#else
+ /* only sbox */
+ BLOCK_SBOX(scratch1, scratch2);
+#endif
for (g = 0; g < 8; g++)
{
- dvbcsa_bs_word_t sbox_out = scratch2[g];
- dvbcsa_bs_word_t w;
-
- /* bit permutation */
-
- dvbcsa_bs_word_t in = BS_OR(
- BS_OR(
- BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(29)), 1),
- BS_SHL (BS_AND (sbox_out, BS_VAL8(02)), 6)),
- BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(04)), 3),
- BS_SHR (BS_AND (sbox_out, BS_VAL8(10)), 2))),
- BS_OR( BS_SHR (BS_AND (sbox_out, BS_VAL8(40)), 6),
- BS_SHR (BS_AND (sbox_out, BS_VAL8(80)), 4)));
-
- w = BS_XOR(r[8 * 8 + g], sbox_out);
- r[8 * 0 + g] = w;
- BS_XOREQ(r[8 * 2 + g], w);
- BS_XOREQ(r[8 * 3 + g], w);
- BS_XOREQ(r[8 * 4 + g], w);
- BS_XOREQ(r[8 * 6 + g], in);
+ dvbcsa_bs_word_t sbox_out, perm_out, w, tmp1, tmp2, tmp3, tmp4;
+
+#ifdef BS_LOAD_DEINTERLEAVE_8
+ BS_LOAD_DEINTERLEAVE_8(scratch2 + g * 2, sbox_out, perm_out);
+#else
+ sbox_out = scratch2[g];
+ BLOCK_PERMUTE_LOGIC(sbox_out, perm_out);
+#endif
+ /*
+ w = r[8 * 8 + g] ^ sbox_out;
+ r[8 * 0 + g] = w;
+ r[8 * 2 + g] ^= w;
+ r[8 * 3 + g] ^= w;
+ r[8 * 4 + g] ^= w;
+ r[8 * 6 + g] ^= perm_out;
+ */
+
+ w = r[8 * 8 + g];
+ tmp1 = r[8 * 2 + g];
+ tmp2 = r[8 * 3 + g];
+ tmp3 = r[8 * 4 + g];
+ w = BS_XOR(w, sbox_out);
+ tmp4 = r[8 * 6 + g];
+ tmp1 = BS_XOR(tmp1, w);
+ tmp2 = BS_XOR(tmp2, w);
+ tmp3 = BS_XOR(tmp3, w);
+ r[8 * 0 + g] = w;
+ r[8 * 2 + g] = tmp1;
+ tmp4 = BS_XOR(tmp4, perm_out);
+ r[8 * 3 + g] = tmp2;
+ r[8 * 4 + g] = tmp3;
+ r[8 * 6 + g] = tmp4;
}
}
}
@@ -132,8 +233,12 @@ DVBCSA_INLINE static inline void
dvbcsa_bs_block_encrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_t *r)
{
dvbcsa_bs_word_t scratch1[8];
+#ifdef BS_LOAD_DEINTERLEAVE_8
+ dvbcsa_bs_word_t scratch2[8 * 2];
+#else
dvbcsa_bs_word_t scratch2[8];
- int i, j, g;
+#endif
+ int i, g;
/* loop over kk[55]..kk[0] */
for (i = 0; i < 56; i++)
@@ -143,50 +248,50 @@ dvbcsa_bs_block_encrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_
r += 8; /* virtual shift of registers */
for (g = 0; g < 8; g++)
- scratch1[g] = BS_XOR(block[i], r7_N[g]);
-
- /* sbox */
- {
- uint8_t *p1, *p2;
- uint8_t a, b, c, d;
+ scratch1[g] = BS_XOR(block[i], r7_N[g]);
- p1 = (uint8_t *)scratch1;
- p2 = (uint8_t *)scratch2;
- for (j = 0; j < BS_BATCH_BYTES * 8; j += 4)
- {
- a = p1[j + 0];
- b = p1[j + 1];
- c = p1[j + 2];
- d = p1[j + 3];
- p2[j + 0] = dvbcsa_block_sbox[a];
- p2[j + 1] = dvbcsa_block_sbox[b];
- p2[j + 2] = dvbcsa_block_sbox[c];
- p2[j + 3] = dvbcsa_block_sbox[d];
- }
- }
+#ifdef BS_LOAD_DEINTERLEAVE_8
+ /* sbox + bit permutation */
+ BLOCK_SBOX_PERMUTE(scratch1, scratch2);
+#else
+ /* only sbox */
+ BLOCK_SBOX(scratch1, scratch2);
+#endif
for (g = 0; g < 8; g++)
{
- dvbcsa_bs_word_t sbox_out = scratch2[g];
- dvbcsa_bs_word_t w = r[-8 * 1 + g];
-
- /* bit permutation */
-
- dvbcsa_bs_word_t in = BS_OR(
- BS_OR(
- BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(29)), 1),
- BS_SHL (BS_AND (sbox_out, BS_VAL8(02)), 6)),
- BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(04)), 3),
- BS_SHR (BS_AND (sbox_out, BS_VAL8(10)), 2))),
- BS_OR( BS_SHR (BS_AND (sbox_out, BS_VAL8(40)), 6),
- BS_SHR (BS_AND (sbox_out, BS_VAL8(80)), 4)));
-
-
- r[8 * 7 + g] = BS_XOR(w, sbox_out);
- BS_XOREQ(r[8 * 1 + g], w);
- BS_XOREQ(r[8 * 2 + g], w);
- BS_XOREQ(r[8 * 3 + g], w);
- BS_XOREQ(r[8 * 5 + g], in);
+ dvbcsa_bs_word_t sbox_out, perm_out, w, tmp1, tmp2, tmp3, tmp4;
+
+#ifdef BS_LOAD_DEINTERLEAVE_8
+ BS_LOAD_DEINTERLEAVE_8(scratch2 + g * 2, sbox_out, perm_out);
+#else
+ sbox_out = scratch2[g];
+ BLOCK_PERMUTE_LOGIC(sbox_out, perm_out);
+#endif
+ /*
+ w = r[-8 * 1 + g];
+ r[8 * 7 + g] = w ^ sbox_out;
+ r[8 * 1 + g] ^= w;
+ r[8 * 2 + g] ^= w;
+ r[8 * 3 + g] ^= w;
+ r[8 * 5 + g] ^= perm_out;
+ */
+
+ w = r[-8 * 1 + g];
+ tmp1 = r[8 * 1 + g];
+ tmp2 = r[8 * 2 + g];
+ tmp3 = r[8 * 3 + g];
+ sbox_out = BS_XOR(sbox_out, w);
+ tmp4 = r[8 * 5 + g];
+ tmp1 = BS_XOR(tmp1, w);
+ tmp2 = BS_XOR(tmp2, w);
+ tmp3 = BS_XOR(tmp3, w);
+ r[8 * 7 + g] = sbox_out;
+ r[8 * 1 + g] = tmp1;
+ tmp4 = BS_XOR(tmp4, perm_out);
+ r[8 * 2 + g] = tmp2;
+ r[8 * 3 + g] = tmp3;
+ r[8 * 5 + g] = tmp4;
}
}
}
--
1.9.1
More information about the vlc-devel mailing list