Cryptographic Security

 

In particular, breaking encrypted files is an important area of security. With conventional techniques running on ordinary CPUs, this is known to take an astronomical amount of time, but CUDA-based parallel processing can deliver a substantial speedup.

 

ElcomSoft

ElcomSoft, a Russian company specializing in cryptography, has released CUDA/Tesla-based distributed password-recovery software for WPA/WPA2 and other algorithms, including a fast implementation for PGP-encrypted hard disks. More information is available at http://gpu.elcomsoft.com. A client/server-based example application is also provided in the CUDA Zone.

 

Svetlin Manavski

Manavski published a paper on applying CUDA to the AES-128 and AES-256 algorithms: Svetlin A. Manavski, 'CUDA Compatible GPU as an Efficient Hardware Accelerator for AES Cryptography' (ICSPC 2007), 24-27 November 2007. The paper can be downloaded from http://www.manavski.com/downloads/PID505889.pdf

 

The abstract is as follows:

This paper presents a study of the efficiency in applying modern Graphics Processing Units in symmetric key cryptographic solutions. It describes both traditional style approaches based on the OpenGL graphics API and new ones based on the recent technology trends of major hardware vendors. It presents an efficient implementation of the Advanced Encryption Standard (AES) algorithm in the novel CUDA platform by Nvidia. AES is currently the most widely adopted modern symmetric key encryption standard. The performance of the new fastest GPU solution is compared with those of the reference sequential implementations running on an Intel Pentium IV 3.0 GHz CPU. Unlike previous research in this field, the results of this effort show for the first time the GPU can perform as an efficient cryptographic accelerator. The developed solutions run up to 20 times faster than OpenSSL and in the same range of performance of existing hardware based implementations.

 

OpenSSL

A researcher at the University of Tartu in Estonia has released CUDA code for an OpenSSL implementation of AES-128. The code can be downloaded from http://math.ut.ee/~uraes/openssl-gpu

For convenience, the source code is reproduced at the end of this page.
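The engine in that listing registers itself under the id "gpu", so it can be selected through OpenSSL's standard ENGINE/EVP interface. Below is a minimal usage sketch (not part of the original sources); it assumes ENGINE_load_gpu() from the listing has been linked into the application, and it sizes the buffers to the fixed chunk the engine processes per call (GPU_CHUNK is derived from the NUM_BLOCKS/NUM_THREADS/ROUNDS_IN_THREAD constants in the CUDA file).

#include <stdio.h>
#include <stdlib.h>
#include <openssl/engine.h>
#include <openssl/evp.h>

/* Chunk size the listed engine processes per cipher call:
   NUM_BLOCKS * NUM_THREADS * ROUNDS_IN_THREAD * 16 bytes from the CUDA file. */
#define GPU_CHUNK (360 * 512 * 10 * 16)

extern void ENGINE_load_gpu(void);   /* provided by the engine listing below */

int main(void)
{
        ENGINE *e;
        EVP_CIPHER_CTX *ctx;
        unsigned char key[16] = {0};                 /* dummy AES-128 key */
        unsigned char *in, *out;
        int outl = 0, tmpl = 0;

        ENGINE_load_gpu();                           /* register the "gpu" engine */
        e = ENGINE_by_id("gpu");
        if (!e || !ENGINE_init(e)) {
                fprintf(stderr, "gpu engine not available\n");
                return 1;
        }

        in  = calloc(1, GPU_CHUNK);                  /* plaintext, zero-filled   */
        out = malloc(GPU_CHUNK + 16);                /* room for ECB padding     */

        /* Passing 'e' routes aes-128-ecb to gpu_aes_init_key()/gpu_aes_cipher(). */
        ctx = EVP_CIPHER_CTX_new();
        EVP_EncryptInit_ex(ctx, EVP_aes_128_ecb(), e, key, NULL);
        EVP_EncryptUpdate(ctx, out, &outl, in, GPU_CHUNK);
        EVP_EncryptFinal_ex(ctx, out + outl, &tmpl);

        EVP_CIPHER_CTX_free(ctx);
        ENGINE_finish(e);
        ENGINE_free(e);
        free(in);
        free(out);
        return 0;
}

Since cipher_8800() ignores the length argument and always copies GPU_CHUNK bytes, inputs that are not an exact multiple of this chunk would need extra handling in a real application.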

 

TEA Encryption

CUDA-accelerated code for the Tiny Encryption Algorithm (TEA) has been published on the web. The source code can be downloaded from the Google Code site: http://code.google.com/p/pyrit/source/browse/tags/opt/tea_cuda.cu

Core                          Speed
NVIDIA GTX 260                380 MB/s
Intel Core2Duo 2×2.5 GHz       40 MB/s

Throughput comparison of the TEA algorithm on GPU vs. CPU
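For reference, the standard CPU formulation of TEA below (a well-known public algorithm, not taken from the downloaded code) encrypts a single 64-bit block in 32 rounds; the CUDA kernel in the listing at the end of this page unrolls the same rounds and packs two such blocks into each TEA_BLOCK. It can be used to cross-check the TV_* test vectors in that listing on the CPU.

#include <stdint.h>

/* Reference single-block TEA encryption (32 rounds). The round body matches
   the TEA_ROUND macro used by the CUDA kernel at the end of this page. */
void tea_encrypt_block(uint32_t v[2], const uint32_t k[4])
{
        uint32_t v0 = v[0], v1 = v[1], sum = 0;
        const uint32_t delta = 0x9E3779B9;           /* TEA_DELTA */
        int i;

        for (i = 0; i < 32; i++) {
                sum += delta;                        /* equals (i+1) * delta */
                v0  += ((v1 << 4) + k[0]) ^ (v1 + sum) ^ ((v1 >> 5) + k[1]);
                v1  += ((v0 << 4) + k[2]) ^ (v0 + sum) ^ ((v0 >> 5) + k[3]);
        }
        v[0] = v0;
        v[1] = v1;
}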

 

 

The OpenSSL engine source code is as follows.

/*

 * Master's thesis "Using Graphic Processing Unit in Block Cipher Calculations"

 * by Urmas Rosenberg

 *

 * 2007

 *

 * GPU 8800GTS implementation, ENGINE

 */

 

#include <openssl/opensslconf.h>

 

#ifndef OPENSSL_NO_HW

 

//#include <stdio.h>

#include <openssl/engine.h>

 

// GPU stuff

#define GLEW_STATIC 1

#include <GL/glew.h>

#include <GL/glut.h>

#include <Cg/cgGL.h>

 

static ENGINE *ENGINE_gpu (void);

 

void

ENGINE_load_gpu (void)

{

        ENGINE *toadd = ENGINE_gpu ();

        if (!toadd) return;

        ENGINE_add (toadd);

        ENGINE_free (toadd);

        ERR_clear_error ();

}

 

static int gpu_init(ENGINE *e);

static int gpu_finish(ENGINE *e);

static int gpu_ciphers(ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid);

 

static const char *gpu_id = "gpu";

static char gpu_name[100];

 

 

static int

gpu_bind_helper(ENGINE *e)

{

        BIO_snprintf(gpu_name, sizeof(gpu_name),"GPU (8800GTS) accelerated encrypting/decrypting");

        if (!ENGINE_set_id(e, gpu_id) ||

            !ENGINE_set_name(e, gpu_name) ||

            !ENGINE_set_init_function(e, gpu_init) ||

            !ENGINE_set_finish_function(e, gpu_finish) ||

            !ENGINE_set_ciphers (e, gpu_ciphers)) {

               return 0;

        }

        return 1;

}

 

static ENGINE *

ENGINE_gpu(void)

{

        ENGINE *eng = ENGINE_new();

 

        if (!eng) {

               return NULL;

        }

 

        if (!gpu_bind_helper(eng)) {

               ENGINE_free(eng);

               return NULL;

        }

 

        return eng;

}

 

static int

gpu_init(ENGINE *e)

{

        init_8800();

        return 1;

}

 

static int

gpu_finish(ENGINE *e)

{

        finish_8800();

        return 1;

}

 

#define AES_BLOCK_SIZE        16

#define AES_KEY_SIZE_128      16

 

 

static int gpu_cipher_nids[] = {

        NID_aes_128_ecb,

};

static int gpu_cipher_nids_num = (sizeof(gpu_cipher_nids)/

                                    sizeof(gpu_cipher_nids[0]));

 

static int gpu_aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,

                              const unsigned char *iv, int enc);

static int gpu_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,

                             const unsigned char *in, size_t nbytes);

 

#define EVP_CIPHER_block_size_ECB     AES_BLOCK_SIZE

 

#define DECLARE_AES_EVP(ksize,lmode,umode)    \

static const EVP_CIPHER gpu_aes_##ksize##_##lmode = {        \

        NID_aes_##ksize##_##lmode,            \

        EVP_CIPHER_block_size_##umode, \

        AES_KEY_SIZE_##ksize,         \

        AES_BLOCK_SIZE,                       \

        0 | EVP_CIPH_##umode##_MODE,  \

        gpu_aes_init_key,             \

        gpu_aes_cipher,               \

        NULL,                         \

        0,      \

        EVP_CIPHER_set_asn1_iv,               \

        EVP_CIPHER_get_asn1_iv,               \

        NULL,                         \

        NULL                          \

}

 

DECLARE_AES_EVP(128,ecb,ECB);
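/* The invocation above instantiates the EVP_CIPHER table entry gpu_aes_128_ecb,
   wiring NID_aes_128_ecb to gpu_aes_init_key() for key setup and to
   gpu_aes_cipher() for block processing. */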

 

static int

gpu_ciphers (ENGINE *e, const EVP_CIPHER **cipher, const int **nids, int nid)

{

        if (!cipher) {

               *nids = gpu_cipher_nids;

               return gpu_cipher_nids_num;

        }

        switch (nid) {

          case NID_aes_128_ecb:

            *cipher = &gpu_aes_128_ecb;

            break;

          default:

            *cipher = NULL;

            return 0;

        }

        return 1;

}

 

static int

gpu_aes_init_key (EVP_CIPHER_CTX *ctx, const unsigned char *key,

                     const unsigned char *iv, int enc)

{

        // decrypt is not implemented on gpu

        if (!ctx->encrypt)return 0;

 

        int key_len = EVP_CIPHER_CTX_key_length(ctx) * 8;

 

        int i, j;

        unsigned char _key[256];

        memset(_key, 0, sizeof(_key));

 

        switch(key_len) {

                case 128:

                       init_key_8800(key);

 

                       break;

               default:

                       return 0;

        }

 

        return 1;

}

 

static int

gpu_aes_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out_arg,

                  const unsigned char *in_arg, size_t nbytes)

{

        cipher_8800(in_arg, out_arg);

        return 1;

}

 

#endif /* !OPENSSL_NO_HW */

 

 

 

/*

 * Master's thesis "Using Graphic Processing Unit in Block Cipher Calculations"

 * by Urmas Rosenberg

 *

 * 2007

 *

 * GPU 8800GTS implementation, CUDA code

 *

 * AES 128 ECB encrypt implementation

 */

 

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

 

#include <cutil.h>

 

// num_blocks should be n * 12 (12 = number of multiprocessors on the 8800)

#define NUM_BLOCKS 360

#define NUM_THREADS 512

#define ROUNDS_IN_THREAD 10

#define MEM_SIZE (NUM_BLOCKS * NUM_THREADS * ROUNDS_IN_THREAD * 16)
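// => MEM_SIZE is 360 * 512 * 10 * 16 bytes (~29.5 MB): each kernel launch
//    encrypts 1,843,200 AES blocks, one thread handling ROUNDS_IN_THREAD
//    consecutive 16-byte blocks.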

 

extern "C" {

 

uint* isbox;

uint sbox[] = {

0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,

0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,

0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,

0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,

0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,

0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,

0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,

0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,

0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,

0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,

0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,

0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,

0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,

0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,

0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,

0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};

 

__device__ int mix[] = {

0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e,

0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,

0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e,

0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e,

0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e,

0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe,

0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde,

0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe,

0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05,

0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25,

0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45,

0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65,

0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85,

0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5,

0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5,

0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5};

 

const uint rcon[] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36};

 

uint* ikey;

uint _key[256];

 

// modified version from wikipedia

__device__ void mix_column(uint* ur) {

        /*

        uint r[4];

 

        r[0]=(*ur    ) & 0xff;

        r[1]=(*ur>>8 ) & 0xff;

        r[2]=(*ur>>16) & 0xff;

        r[3]=(*ur>>24) & 0xff;

 

        uint a[4];

        uint b[4];*/

        /*int c;

        int h;

        for(c=0;c<4;c++) {

               a[c] = r[c];

               h = r[c] & 0x80;

               b[c] = (r[c] << 1) & 0xff;

               if(h == 0x80)

                       b[c] ^= 0x1b;

        }*/

        /*

        r[0] = b[0] ^ a[3] ^ a[2] ^ b[1] ^ a[1];

        r[1] = b[1] ^ a[0] ^ a[3] ^ b[2] ^ a[2];

        r[2] = b[2] ^ a[1] ^ a[0] ^ b[3] ^ a[3];

        r[3] = b[3] ^ a[2] ^ a[1] ^ b[0] ^ a[0];

        */

 

        /*a[0] = r[0];

        a[1] = r[1];

        a[2] = r[2];

        a[3] = r[3];

        r[0] = mix[a[0]] ^ a[3] ^ a[2] ^ mix[a[1]] ^ a[1];

        r[1] = mix[a[1]] ^ a[0] ^ a[3] ^ mix[a[2]] ^ a[2];

        r[2] = mix[a[2]] ^ a[1] ^ a[0] ^ mix[a[3]] ^ a[3];

        r[3] = mix[a[3]] ^ a[2] ^ a[1] ^ mix[a[0]] ^ a[0];

 

        *ur=(r[3]<<24 | r[2]<<16 | r[1]<<8 | r[0]);*/

 

        /*uint b = 0;

        b |= mix[(*ur>>24)     ]<<24;

        b |= mix[(*ur>>16)&0xff]<<16;

        b |= mix[(*ur>>8 )&0xff]<< 8;

        b |= mix[(*ur    )&0xff]    ;*/

       

        //uint h = *ur & 0x80808080;
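        // SWAR "xtime": the next few lines multiply all four state bytes by 2 in
        // GF(2^8) at once -- shift left within each byte, then XOR 0x1b into the
        // bytes whose high bit was set before the shift (selected by 'mask').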

        uint b = (*ur << 1) & 0xfefefefe;

        uint mask = 0;

        /*if((h&0x000000ff) == 0x00000080) mask |= 0x000000ff;

        if((h&0x0000ff00) == 0x00008000) mask |= 0x0000ff00;

        if((h&0x00ff0000) == 0x00800000) mask |= 0x00ff0000;

        if((h&0xff000000) == 0x80000000) mask |= 0xff000000; */

 

        if(*ur&0x00000080) mask |= 0x000000ff;

        if(*ur&0x00008000) mask |= 0x0000ff00;

        if(*ur&0x00800000) mask |= 0x00ff0000;

        if(*ur&0x80000000) mask |= 0xff000000;

 

        b = (b & ~mask) | ((b ^ 0x1b1b1b1b) & mask);

        *ur = b ^ ((*ur>>24)|(*ur<<8)) ^ ((*ur>>16)|(*ur<<16)) ^ ((b>>8)|(b<<24)) ^ ((*ur>>8)|(*ur<<24));

 

}

 

__global__ void

encrypt_8800(uint* idata, uint* odata, const uint* sbox,

        const uint* key){

        //const unsigned int blocks_per_thread = 128;

 

        const unsigned int bid = blockIdx.x;

        const unsigned int tid = threadIdx.x;

 

        // startbyte. Note unsigned char vs uint, 16 vs 4!

        unsigned long int sb = (bid * NUM_THREADS * ROUNDS_IN_THREAD * 4)

               + (tid * ROUNDS_IN_THREAD * 4);

 

 

        //unsigned char tmp;

        //unsigned long int i;

        int k;

        int rounds;

        uint t0, t1, t2, t3, s0, s1, s2, s3;

 

        for(int l=1;l<=ROUNDS_IN_THREAD;l++){

               k=0;

 

               // columns of block, local variables, each contains 4 bytes

               t0 = idata[sb  ];

               t1 = idata[sb+1];

               t2 = idata[sb+2];

               t3 = idata[sb+3];

 

               //add roundkey

               t0 = t0 ^ key[k++];

               t1 = t1 ^ key[k++];

               t2 = t2 ^ key[k++];

               t3 = t3 ^ key[k++];

 

               for(rounds=1;rounds<=10;rounds++){

                       //subbytes & shift rows

                       s0=sbox[t0 & 0xff] | sbox[(t1>>8) & 0xff]<<8 | sbox[(t2>>16) & 0xff]<<16 | sbox[(t3>>24) & 0xff]<<24;

                       s1=sbox[t1 & 0xff] | sbox[(t2>>8) & 0xff]<<8 | sbox[(t3>>16) & 0xff]<<16 | sbox[(t0>>24) & 0xff]<<24;

                       s2=sbox[t2 & 0xff] | sbox[(t3>>8) & 0xff]<<8 | sbox[(t0>>16) & 0xff]<<16 | sbox[(t1>>24) & 0xff]<<24;

                       s3=sbox[t3 & 0xff] | sbox[(t0>>8) & 0xff]<<8 | sbox[(t1>>16) & 0xff]<<16 | sbox[(t2>>24) & 0xff]<<24;

 

                       //mix columns

                       if(rounds<10){

                              mix_column(&s0);

                              mix_column(&s1);

                              mix_column(&s2);

                              mix_column(&s3);

                       }

 

                       //add roundkey

                       t0 = s0 ^ key[k++];

                       t1 = s1 ^ key[k++];

                       t2 = s2 ^ key[k++];

                       t3 = s3 ^ key[k++];

 

               } //rounds<=10

 

               odata[sb  ] = t0;

               odata[sb+1] = t1;

               odata[sb+2] = t2;

               odata[sb+3] = t3;

 

               sb+=4;

 

        } //l<ROUNDS_IN_THREAD

 

}

 

unsigned char* d_idata;

unsigned char* d_odata;

 

void

init_8800(){

        CUT_CHECK_DEVICE();

 

        CUDA_SAFE_CALL(cudaMalloc((void**) &isbox, 256*sizeof(uint)));

        CUDA_SAFE_CALL(cudaMemcpy(isbox, sbox, 256*sizeof(uint), cudaMemcpyHostToDevice));

 

        CUDA_SAFE_CALL(cudaMalloc((void**) &d_idata, MEM_SIZE));

        CUDA_SAFE_CALL(cudaMalloc((void**) &d_odata, MEM_SIZE));

}

 

void

finish_8800(){

    CUDA_SAFE_CALL(cudaFree(ikey));

    CUDA_SAFE_CALL(cudaFree(isbox));

    CUDA_SAFE_CALL(cudaFree(d_idata));

    CUDA_SAFE_CALL(cudaFree(d_odata));

}

 

void

init_key_8800(const uint* key){

        memset(_key, 0, sizeof(_key));  /* zero the whole round-key buffer */

        int i, j;

 

        for(i=0;i<4;i++)_key[i]=key[i];

        for(j=1;j<=10;j++){

               i=j*4;

               _key[i  ] = (sbox[(_key[i-1]>>8)&0xff]

                                      | (sbox[(_key[i-1]>>16)&0xff]<<8)

                                      | (sbox[(_key[i-1]>>24)&0xff]<<16)

                                      | (sbox[(_key[i-1])&0xff]<<24)) ^ _key[i-4] ^ rcon[j-1];

               _key[i+1] = _key[i+1-4]^_key[i];

               _key[i+2] = _key[i+2-4]^_key[i+1];

               _key[i+3] = _key[i+3-4]^_key[i+2];

        }

 

        CUDA_SAFE_CALL(cudaMalloc((void**) &ikey, 256*sizeof(uint)));

        CUDA_SAFE_CALL(cudaMemcpy(ikey, _key, 256*sizeof(uint), cudaMemcpyHostToDevice));

}

 

 

void

cipher_8800(unsigned char *in, unsigned char *out){

        dim3  grid(NUM_BLOCKS, 1, 1);

        dim3  threads(NUM_THREADS, 1, 1);

 

        CUDA_SAFE_CALL(cudaMemcpy(d_idata, in, MEM_SIZE, cudaMemcpyHostToDevice));

       

        encrypt_8800<<<grid, threads>>>((uint*)d_idata, (uint*)d_odata, isbox, (uint*)ikey);

        CUT_CHECK_ERROR("Kernel execution failed");

 

        CUDA_SAFE_CALL(cudaMemcpy(out, d_odata, MEM_SIZE, cudaMemcpyDeviceToHost));

};

}

 

 

The TEA source code is as follows.

#include <stdlib.h>

#include <stdint.h>

#include <stdio.h>

#include <unistd.h>

#include <cuda/cuda.h>

#include <cuda_runtime.h>

 

/* Execution parameters */

#define THREADS_PER_THREADBLOCK 128

#define BLOCKBUFFER_SIZE (32768 * THREADS_PER_THREADBLOCK)

#define DATASIZE (128*1024*1024 / 16)

#define ITERCOUNT 10

 

/* Data structures */

typedef struct {

uint32_t k0, k1, k2,k3;

} TEA_KEY;

 

typedef struct __align__(16) {

uint32_t x_v0, x_v1, y_v0, y_v1;

} TEA_BLOCK;

 

/* Test-vectors for TEA with 32 rounds */

#define TV_KEY0 0x4E8E7829

#define TV_KEY1 0xC88BA95E

#define TV_KEY2 0xB84E28AF

#define TV_KEY3 0xA0A47295

#define TV_PLAIN0 0x8FADF3B3

#define TV_PLAIN1 0x41EA3A0A

#define TV_CRYPT0 0xED650698

#define TV_CRYPT1 0xCF9F2B79

 

#define TEA_ROUND(block,key,sum) \

{ \

(block).x_v0 += (((block).x_v1<<4) + (key).k0) ^ ((block).x_v1 + sum) ^ (((block).x_v1>>5) + (key).k1); \

(block).x_v1 += (((block).x_v0<<4) + (key).k2) ^ ((block).x_v0 + sum) ^ (((block).x_v0>>5) + (key).k3); \

(block).y_v0 += (((block).y_v1<<4) + (key).k0) ^ ((block).y_v1 + sum) ^ (((block).y_v1>>5) + (key).k1); \

(block).y_v1 += (((block).y_v0<<4) + (key).k2) ^ ((block).y_v0 + sum) ^ (((block).y_v0>>5) + (key).k3); \

}

 

#define TEA_DELTA 0x9E3779B9
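/* Each TEA_ROUND application advances two independent 64-bit TEA blocks
   (x_v0/x_v1 and y_v0/y_v1) by one round; the kernel passes
   sum = round_number * TEA_DELTA directly rather than accumulating it,
   which is equivalent for the standard 32-round schedule. */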

 

 

 

/* ### Device code below ### */

 

__global__

void cuda_encrypt (TEA_BLOCK *v, TEA_KEY key)

{

TEA_BLOCK tmp_v;

int idx = (blockIdx.x * blockDim.x + threadIdx.x);

 

tmp_v = v[idx];

TEA_ROUND(tmp_v, key, TEA_DELTA*1); TEA_ROUND(tmp_v, key, TEA_DELTA*2);

TEA_ROUND(tmp_v, key, TEA_DELTA*3); TEA_ROUND(tmp_v, key, TEA_DELTA*4);

TEA_ROUND(tmp_v, key, TEA_DELTA*5); TEA_ROUND(tmp_v, key, TEA_DELTA*6);

TEA_ROUND(tmp_v, key, TEA_DELTA*7); TEA_ROUND(tmp_v, key, TEA_DELTA*8);

TEA_ROUND(tmp_v, key, TEA_DELTA*9); TEA_ROUND(tmp_v, key, TEA_DELTA*10);

TEA_ROUND(tmp_v, key, TEA_DELTA*11); TEA_ROUND(tmp_v, key, TEA_DELTA*12);

TEA_ROUND(tmp_v, key, TEA_DELTA*13); TEA_ROUND(tmp_v, key, TEA_DELTA*14);

TEA_ROUND(tmp_v, key, TEA_DELTA*15); TEA_ROUND(tmp_v, key, TEA_DELTA*16);

TEA_ROUND(tmp_v, key, TEA_DELTA*17); TEA_ROUND(tmp_v, key, TEA_DELTA*18);

TEA_ROUND(tmp_v, key, TEA_DELTA*19); TEA_ROUND(tmp_v, key, TEA_DELTA*20);

TEA_ROUND(tmp_v, key, TEA_DELTA*21); TEA_ROUND(tmp_v, key, TEA_DELTA*22);

TEA_ROUND(tmp_v, key, TEA_DELTA*23); TEA_ROUND(tmp_v, key, TEA_DELTA*24);

TEA_ROUND(tmp_v, key, TEA_DELTA*25); TEA_ROUND(tmp_v, key, TEA_DELTA*26);

TEA_ROUND(tmp_v, key, TEA_DELTA*27); TEA_ROUND(tmp_v, key, TEA_DELTA*28);

TEA_ROUND(tmp_v, key, TEA_DELTA*29); TEA_ROUND(tmp_v, key, TEA_DELTA*30);

TEA_ROUND(tmp_v, key, TEA_DELTA*31); TEA_ROUND(tmp_v, key, TEA_DELTA*32);

v[idx] = tmp_v;

}

 

 

/* ### Host code below ### */

 

 

/* tea_encrypt() encrypts the data at 'inbuffer' using 'key' and writes the results to 'outbuffer'.

The length of inbuffer *must* be aligned to a 16-byte boundary */

int tea_encrypt(unsigned char* inbuffer, size_t len, unsigned char* outbuffer, TEA_KEY key)

{

void* gpu_databuffer;

cudaEvent_t evt;

size_t transfer_size, numBufferBlocks, numThreadBlocks;

cudaError_t ret;

 

/* numBufferBlocks == number of TEA-double-blocks to encrypt */

numBufferBlocks = len / sizeof(TEA_BLOCK);

if (numBufferBlocks <= 0)

return 0;

 

 

/* We request page-locked memory from the CUDA api. Beware! */

cudaMalloc(&gpu_databuffer, BLOCKBUFFER_SIZE * sizeof(TEA_BLOCK));

while (numBufferBlocks > 0)

{

transfer_size = numBufferBlocks > BLOCKBUFFER_SIZE ? BLOCKBUFFER_SIZE : numBufferBlocks;

cudaMemcpy(gpu_databuffer, inbuffer, transfer_size*sizeof(TEA_BLOCK), cudaMemcpyHostToDevice);

 

cudaEventCreate(&evt);

numThreadBlocks = transfer_size / THREADS_PER_THREADBLOCK;

cuda_encrypt<<<numThreadBlocks, THREADS_PER_THREADBLOCK>>>((TEA_BLOCK *)gpu_databuffer, key);

// usleeping() while the kernel is running saves CPU cycles but may decrease performance

if (cudaEventRecord(evt, NULL) == cudaSuccess)

while (cudaEventQuery(evt) == cudaErrorNotReady) { usleep(2000); }

cudaEventDestroy(evt);

 

ret = cudaGetLastError();

if (ret != cudaSuccess || cudaThreadSynchronize() != cudaSuccess)

{

printf("Kernel failed to run. CUDA threw error message '%s'\n", cudaGetErrorString(ret));

cudaFree(gpu_databuffer);

return 0;

}

 

cudaMemcpy(outbuffer, gpu_databuffer, transfer_size * sizeof(TEA_BLOCK), cudaMemcpyDeviceToHost);

 

inbuffer += transfer_size * sizeof(TEA_BLOCK);

outbuffer += transfer_size * sizeof(TEA_BLOCK);

numBufferBlocks -= transfer_size;

}

cudaFree(gpu_databuffer);

 

return 1;

}

 

int main(int argc, char *argv[])

{

cudaError_t ret;

int i, j, cudadev, cudadevcount;

TEA_KEY key;

TEA_BLOCK* host_databuffer;

struct cudaDeviceProp cuda_devprop;

 

key.k0 = TV_KEY0;

key.k1 = TV_KEY1;

key.k2 = TV_KEY2;

key.k3 = TV_KEY3;

 

cudaGetDeviceCount(&cudadevcount);

ret = cudaGetLastError();

if (ret != cudaSuccess)

{

printf("CUDA failed to report devices with error '%s'\n", cudaGetErrorString(ret));

return EXIT_FAILURE;

}

 

printf("Welcome to TEA-CUDA. We have %i device(s) available:\n", cudadevcount);

for (cudadev = 0; cudadev < cudadevcount; cudadev++)

{

cudaGetDeviceProperties(&cuda_devprop, cudadev);

printf("(%i) '%s'\n", cudadev, &cuda_devprop.name);

}

ret = cudaGetDevice(&cudadev);

if (ret != cudaSuccess)

{

printf("Failed to select device.\n");

return EXIT_FAILURE;

}

printf("\nWorking on device '%s'...\n", &cuda_devprop.name);

 

ret = cudaMallocHost((void**)(&host_databuffer), DATASIZE * sizeof(TEA_BLOCK));

if (ret != cudaSuccess)

{

printf("Failed to allocate page-locked buffer.\n");

return EXIT_FAILURE;

}

 

for (j = 0; j < ITERCOUNT; j++)

{

printf("Run %i... ", j);

for (i = 0; i < DATASIZE; i++)

{

host_databuffer[i].x_v0 = TV_PLAIN0;

host_databuffer[i].x_v1 = TV_PLAIN1;

host_databuffer[i].y_v0 = TV_PLAIN0;

host_databuffer[i].y_v1 = TV_PLAIN1;

}

 

if (!tea_encrypt((unsigned char*)host_databuffer, DATASIZE*sizeof(TEA_BLOCK), (unsigned char*)host_databuffer, key))

{

printf("FAILED IN tea_encrypt()\n");

break;

}

 

for (i = 0; i < DATASIZE; i++)

{

if (host_databuffer[i].x_v0 != TV_CRYPT0 || host_databuffer[i].x_v1 != TV_CRYPT1 || \

host_databuffer[i].y_v0 != TV_CRYPT0 || host_databuffer[i].y_v1 != TV_CRYPT1)

{

printf("%i FAILED to correctly encrypt on GPU.\n", i);

break;

}

 

}

if (i != DATASIZE)

{

break;

} else {

printf("OK\n");

}

}

 

cudaFreeHost(host_databuffer);

 

return EXIT_SUCCESS;

}