/* * Copyright (C) 2008 Apple Inc. All rights reserved. * * This document is the property of Apple Inc. * It is considered confidential and proprietary. * * This document may not be reproduced or transmitted in any form, * in whole or in part, without the express written permission of * Apple Inc. */ #include #include "AppleS5L8900XPKE-hardware.h" #define AppleS5L8900XPKE_hardware_DEBUG 0 #undef absolutetime_to_nanoseconds #if AppleS5L8900XPKE_hardware_DEBUG static void debug_data(uint8_t *data, size_t length) { uint32_t i=0; printf("data %p,%d\n" , data, length); for (i = 0; i < length; i++) { if( !(i%256) ) printf("Seg : %p \n", data); printf("%02X, ", *(data + i)); if ((i % 16) == 15) printf("\n"); } printf("\n"); } #define debug(fmt, args...) printf("AppleS5L8900XPKE-hardware::%s: " fmt "\n", __FUNCTION__ , ##args) #else #define debug(fmt, args...) #define debug_data(data, length) #endif #if IN_KERNEL extern uint64_t iodelay_time[3]; static uint64_t delay_time; #endif uint32_t PkeSetMode(uint32_t uSegSize) { uint32_t uWriteVal; switch (uSegSize) { case PKE_SEG_SIZE_256: uWriteVal = PKE_REGVAL_SEG_SIZE_SEGSIZE_256; break; case PKE_SEG_SIZE_128: uWriteVal = PKE_REGVAL_SEG_SIZE_SEGSIZE_128; break; case PKE_SEG_SIZE_64: uWriteVal = PKE_REGVAL_SEG_SIZE_SEGSIZE_64; break; default: uWriteVal = 0; debug("error setting segment size/mode\n"); } /* ! 
write SEG_SIZE and MODE_ID */ return (uWriteVal | PKE_REGMASK_SEG_SIZE_MUSTBEONE); } uint32_t PkeGetSegmentCount(uint32_t uSegSize) { /* return segment total number */ switch (uSegSize) { case PKE_SEG_SIZE_256: return PKE_SEG_NUM_CASE256; case PKE_SEG_SIZE_128: return PKE_SEG_NUM_CASE128; case PKE_SEG_SIZE_64: return PKE_SEG_NUM_CASE64; default: return 0; } } static uint32_t get_precision(uint32_t uBitLen) { uint32_t uPrecision; if (uBitLen <= PKE_LEN_BIT_512) { uPrecision = PKE_SIZE_PREC_0; } else if ((uBitLen <= PKE_LEN_BIT_1024) && (uBitLen != PKE_LEN_BIT_672) && (uBitLen != PKE_LEN_BIT_864)) { uPrecision = PKE_SIZE_PREC_1; } else if ((uBitLen <= PKE_LEN_BIT_1536) && (uBitLen != PKE_LEN_BIT_1280) && (uBitLen != PKE_LEN_BIT_1408)) { uPrecision = PKE_SIZE_PREC_2; } else { uPrecision = PKE_SIZE_PREC_3; } return uPrecision; } static uint32_t get_chunkSize(uint32_t uBitLen, uint32_t uPrecision) { uint32_t uDiv, uChunk; switch (uPrecision) { case PKE_SIZE_PREC_0: if (uBitLen < PKE_LEN_BIT_128) { uBitLen = PKE_LEN_BIT_128; } uDiv = uBitLen >> 5; uChunk = uDiv; if (uBitLen - (uChunk << 5)) { uDiv++; } uChunk = --uDiv; break; case PKE_SIZE_PREC_1: uDiv = uBitLen >> 6; uChunk = uDiv; if (uBitLen - (uChunk << 6)) { uDiv++; } uChunk = --uDiv; break; case PKE_SIZE_PREC_2: uDiv = (uBitLen / 3) >> 5; uChunk = uDiv; if (uBitLen - ((uChunk * 3) << 5)) { uDiv++; } uChunk = --uDiv; break; case PKE_SIZE_PREC_3: uDiv = uBitLen >> 7; uChunk = uDiv; if (uBitLen - (uChunk << 7)) { uDiv++; } uChunk = --uDiv; break; default: debug("Unsupported precision value: %u", uPrecision); uChunk = 0; } return (((uChunk << 3) + 8) << 2); } static uint32_t config_pke_key_len(uint32_t uBitLen) { uint32_t uDiv, uChunk, uPrecision; /* 1. Get Precision */ uPrecision = get_precision(uBitLen); /* 2. Get chunck size */ uChunk = get_chunkSize(uBitLen, uPrecision); /* 3. Set precision and chunksize */ uDiv = ((uChunk >> 2) - 8) | uPrecision; #if IN_KERNEL //Set the IODelay time based on uBitLen. 
if (uBitLen <= PKE_LEN_BIT_512) { delay_time = iodelay_time[0]; } else if (uBitLen <= PKE_LEN_BIT_1024) { delay_time = iodelay_time[1]; } else { delay_time = iodelay_time[2]; } #endif return uDiv; } static inline uint32_t RSAPkeSetInOut(uint32_t uDest, uint32_t uSrc1, uint32_t uSrc2) { uint32_t uPosition = 0; /*! Set segment ID of src and dest */ uPosition = ((uSrc1 << PKE_REGSHIFT_SEG_ID_A_SEG_ID) | (uSrc2 << PKE_REGSHIFT_SEG_ID_B_SEG_ID) | (uDest << PKE_REGSHIFT_SEG_ID_S_SEG_ID)); return uPosition; } static inline bool RSAPkeRun(volatile struct pke_regs *registers, uint32_t uCmdVal) { /*! Run PKE */ registers->pke_start = uCmdVal; #if IN_KERNEL IODelay(delay_time); #endif do { if (!(registers->pke_start & PKE_REGMASK_START_EXEC_ON)) return true; } while (1); return false; } static bool shift_left(uint32_t *data, uint32_t data_size) { int prev_carry = 0, cur_carry = 0; uint32_t itr = 0; for (itr=0; itr < (data_size/sizeof(uint32_t)); itr++) { cur_carry = data[itr] >> 31; data[itr] = (data[itr] << 1) + prev_carry; prev_carry = cur_carry; } return prev_carry; } static int32_t highestSetBit(uint32_t *data1, uint32_t data_size) { int32_t ix = data_size/sizeof(uint32_t) - 1; // Most significant word. while( ix>=0 && data1[ix]==0) { ix--; } if(ix < 0) return -1; //mods is zero. uint32_t msNZword = data1[ix]; uint32_t b = ((ix+1) * sizeof(uint32_t) * DATA_BYTE_TO_BIT) - 1 ; // Now find the highest set bit. uint32_t mask = 0x80000000; while ((msNZword & mask) == 0) { mask >>= 1; b--; } return b; } static bool is_greater(uint32_t *data1, uint32_t *data2, uint32_t data_size) { uint32_t itr = data_size/sizeof(uint32_t) - 1; //start at the last word. 
do { if(data1[itr] > data2[itr]) return true; else if(data1[itr] < data2[itr]) return false; //check next word; }while(itr--); return false; } /* Do data1 = data1 - data2 */ static uint32_t sub(uint32_t *data1, uint32_t *data2, uint32_t data_size) { uint32_t borrow = 0; uint32_t itr = 0; for(itr=0; itr < (data_size/sizeof(uint32_t)); itr++) { if(borrow) { if(data1[itr] != 0) borrow = 0; //No need to borrow again. data1[itr] -= 1; } if( data1[itr] < data2[itr] ) borrow = 1; data1[itr] -= data2[itr]; } if(borrow) return 1; return 0; } /*Do data1 = data1 + data2 */ static uint32_t add(uint32_t *data1, uint32_t *data2, uint32_t data_size) { uint32_t temp; int carry = 0; uint32_t itr = 0; for(itr=0; itr < (data_size/sizeof(uint32_t)); itr++) { temp = data1[itr] + data2[itr] + carry; if(temp < data1[itr] || (temp == data1[itr] && carry)) carry = 1; else carry = 0; data1[itr] = temp; } return carry; } // The src buffer is assumed to be word aligned at this point. // The hardware register is 4 byte aligned. // The data is in little Endian. static inline void copy_into_segment(volatile uint8_t *memory, uint32_t segment_id, uint32_t segment_size, uint8_t *data, uint32_t length) { volatile uint8_t *dst = memory + segment_id * segment_size; uint32_t ii=0; //Make sure the buffer is word aligned. //assert((data & 3) == 0); //We don't assert in iBoot. The only interface makes sure we have aligned buffer. uint32_t newLength = length & ~3 ; //Copy words. for(ii=0; ii<(newLength/4); ii++) *((uint32_t*)dst + ii) = *((uint32_t*)data + ii); //See if you have less than a word left to copy. if(length & 3) { //have less than 4 bytes left to copy. uint8_t wordData[4] = {0,0,0,0}; uint8_t *bytesLeft = data + newLength; for(ii=0; ii<(length & 3); ii++) { wordData[ii] = *(bytesLeft + ii); } *((uint32_t*)(dst+newLength)) = *((uint32_t*)wordData); } //zero fill the rest of the segment. 
uint32_t bytesCopied = ((length+3) & ~3); uint32_t left = segment_size - bytesCopied; if(left) { uint32_t *dest = (uint32_t *)(dst + bytesCopied); for(ii=0; ii<(left/4); ii++) *(dest+ii) = 0x00000000; } } /* Calculate R0 in software. rzero has memory allocated of size modulus_length and is zero filled. */ bool rsa_cal_R0(uint32_t * const mods, uint32_t modulus_length, uint32_t *rzero) { debug("called"); uint32_t precision = get_precision(modulus_length*DATA_BYTE_TO_BIT); uint32_t chunk_size = get_chunkSize(modulus_length*DATA_BYTE_TO_BIT, precision); //precision is 1,2,3,4 instead of 0,1,2,3 precision += 1; /* Compute 2^b */ uint32_t power = (chunk_size / 16) + 1; if (precision == 3) power = power * 3; power = power + (precision * (chunk_size + 16)); //compute the most significant bit set in the modulus. int32_t b = highestSetBit((uint32_t*)mods, modulus_length); if(b == -1) return false; //store 2^b in rzero. bzero(rzero, modulus_length); rzero[(b/32)] = 1 << (b % 32); /* Calculate (2^(power-b) * 2^b) mod M */ uint32_t itr; for(itr=0; itr<(power-b); itr++) { bool carry = shift_left(rzero, modulus_length); //Shift left. if (carry || is_greater(rzero, mods, modulus_length)) sub(rzero, (uint32_t*)mods, modulus_length); } return true; } /* Compute the value of R^2 using the hardware. The value of rzero (R0) is calculated in softare and passed into the function The hardware should be reset before calling this function. 
The value of R^2 is returned thru' rsquare (memory for this is allocated before calling this function and of modulus_length size) */ static bool rsa_cal_R2modM(volatile struct pke_regs *registers, volatile uint8_t *memory, uint32_t seg_size, uint32_t * const mods, uint32_t mods_length, uint32_t *rsquare, uint32_t *r2_segid) { debug("called"); if(!rsquare) //Need rzero (which should be in rsquare) to proceed return false; //Set the key length; registers->pke_key_len = config_pke_key_len(mods_length*DATA_BYTE_TO_BIT); //Set segment size; registers->pke_seg_size = PkeSetMode(seg_size); registers->pke_seg_sign = 0; registers->pke_seg_id = 0; /* Use hardware to calculate R1 thru' Rn and finally R^2 */ //Load modulus always into seg0. copy_into_segment(memory, mod_segid, seg_size, (uint8_t*)mods, mods_length); //based on seg_size compute the seg_ids to store the (2^(power-b) * 2^b) mod M // and the final value of R^2 uint32_t rzero_segid, rsq_segid; switch (seg_size) { case PKE_SEG_SIZE_256: rzero_segid = PKE_SEG_ID_05; rsq_segid = PKE_SEG_ID_06; break; case PKE_SEG_SIZE_128: rzero_segid = PKE_SEG_ID_12; rsq_segid = PKE_SEG_ID_14; break; case PKE_SEG_SIZE_64: rzero_segid = PKE_SEG_ID_28; rsq_segid = PKE_SEG_ID_29; break; default: //Should not happen. return false; } //Load the rzero into the hardware. copy_into_segment(memory, rzero_segid, seg_size, (uint8_t*)rsquare, mods_length); uint32_t precision = (registers->pke_key_len&PKE_REGMASK_KEY_LEN_PREC_ID) + 1; //Compute the number of hardware iterations based on precision. uint32_t square_count; if (precision == 1 || precision == 3) square_count = 4; else if (precision == 2) square_count = 5; else /* precision == 4 */ square_count = 6; //Set pre load modulus on when using the hardware for the first time. 
uint32_t start_mask = PKE_REGMASK_START_PLDM_ON | PKE_REGMASK_START_EXEC_ON; uint32_t itr = 1; uint32_t temp_segid = rsq_segid; while (itr <= square_count) { //setup seg_id pointers on the hardware registers->pke_seg_id = RSAPkeSetInOut(temp_segid, rzero_segid, rzero_segid); //kick the hardware. if(!RSAPkeRun(registers, start_mask)) return false; itr++; if(itr > square_count) break; //swap the segids and do the square again. temp_segid += rzero_segid; rzero_segid = temp_segid - rzero_segid; temp_segid -= rzero_segid; start_mask = PKE_REGMASK_START_EXEC_ON; //dont have to laod M again. } // Check the sign value of temp_segid and add modulus if negative. uint32_t uRead, uMask; uMask = (1 << temp_segid); uRead = registers->pke_seg_sign & uMask; //If negative then add modulus. if(uRead) { add( (uint32_t*)(memory+(temp_segid*seg_size)), (uint32_t*)mods, mods_length); } *r2_segid = temp_segid; //Clean up the temp space used. bzero((uint8_t*) memory+rzero_segid*seg_size, mods_length); /* Reset the signbits */ registers->pke_seg_sign = 0; memcpy(rsquare, (uint8_t *) memory+rsq_segid*seg_size, mods_length); return true; } /* Calculate exponential. The options parameter passed in indicates if R^2 is passed in or needs to be computed. Options = 00 => Compute R^2 and don't care about caching. *rsquare is ignored. Options = 01 => Compute R^2 and return it for caching. *rsquare has mem allocated and is of size modulus_length Options = 10 => Use passed in R^2 value. Reset the hardware before calling this function. 
*/
/*
 * rsa_cal_exp
 *
 * dst/len                 : receive the result (modulus_length bytes) and its length.
 * options                 : kIOPKEAccelerator* flags selecting whether R^2 is
 *                           computed here or supplied by the caller.
 * rsquare/rsquare_length  : R^2 buffer.  It is required on every path: when
 *                           R^2 is computed here it first holds R0 and then
 *                           receives R^2; otherwise it supplies R^2.
 *                           rsquare_length may be NULL if the caller does not
 *                           need the length back.
 * base, expn, modulus     : operands with their byte lengths; buffers are cast
 *                           to uint32_t* and read as words, so they are assumed
 *                           to be word aligned and padded to a whole word.
 * registers/memory        : PKE register block and PKE segment memory.
 *
 * Returns false on invalid arguments or when a hardware run fails.
 */
bool rsa_cal_exp(void *dst, uint32_t *len, uint32_t options, uint8_t *rsquare, uint32_t *rsquare_length, uint8_t * const base, uint32_t base_length, uint8_t * const expn, uint32_t expn_length, uint8_t * const modulus, uint32_t modulus_length, volatile struct pke_regs *registers, volatile uint8_t *memory)
{
	debug("called");

	/* Perform validation checks */
	if (modulus_length == 0 || (modulus_length % 4))
		return false;

	/* An empty exponent would index one word before the exponent buffer. */
	if (expn_length == 0)
		return false;

	uint32_t segment_size, num_segments;
	if (modulus_length <= (PKE_SEG_SIZE_64)) {
		segment_size = PKE_SEG_SIZE_64;
	} else if (modulus_length <= (PKE_SEG_SIZE_128)) {
		segment_size = PKE_SEG_SIZE_128;
	} else if (modulus_length <= (PKE_SEG_SIZE_256)) {
		segment_size = PKE_SEG_SIZE_256;
	} else {
		return false;	/* key too long */
	}

	num_segments = PkeGetSegmentCount(segment_size);
	if (!num_segments ||
	    base_length > segment_size ||
	    expn_length > segment_size ||
	    modulus_length > segment_size)
		return false;

	/* Sane input, so proceed */

	/* rsquare segid must always be the last seg ID. */
	uint32_t rsq_segid = num_segments - 1;

	if ((options & kIOPKEAcceleratorComputeRsquareMask) == 0) {
		/* Compute rsquare. */
		if (!(options & kIOPKEAcceleratorPreProcessingDone)) {
			/* R0 was not calculated. Should only happen in iBoot. */
			if (!rsquare) {
				/* Should not happen: memory for rsquare is expected to be allocated. */
				return false;
			}
			/* Fails for an all-zero modulus; do not run the hardware on it. */
			if (!rsa_cal_R0((uint32_t *)modulus, modulus_length, (uint32_t *)rsquare))
				return false;
		}

		/* Compute R^2; rsq_segid is updated to the segment holding it. */
		if (!rsa_cal_R2modM(registers, memory, segment_size, (uint32_t *)modulus, modulus_length, (uint32_t *)rsquare, &rsq_segid))
			return false;

		if (rsquare_length)
			*rsquare_length = modulus_length;
	} else {
		/* The caller-supplied R^2 is mandatory on this path. */
		if (!rsquare)
			return false;

		/* This would have been done while calculating R^2. */
		/* Set the key length. */
		registers->pke_key_len = config_pke_key_len(modulus_length * DATA_BYTE_TO_BIT);
		/* Set segment size. */
		registers->pke_seg_size = PkeSetMode(segment_size);
		/* Reset the signbits */
		registers->pke_seg_sign = 0;
		registers->pke_seg_id = 0;

		/* Load modulus into seg0. */
		copy_into_segment(memory, mod_segid, segment_size, modulus, modulus_length);

		/* Copy the rsquare into the hardware segment. */
		memcpy((uint8_t *)memory + (rsq_segid * segment_size), rsquare, modulus_length);
	}

	/* Compute (base * r^2 mod M) using hardware */

	/* Copy base into tmp_segid. */
	copy_into_segment(memory, tmp_segid, segment_size, base, base_length);

	/* Setup the segid pointers. */
	registers->pke_seg_id = RSAPkeSetInOut(iter_segid, tmp_segid, rsq_segid);

	/* Kick the hardware with pre load modulus bit set. */
	if (!RSAPkeRun(registers, (PKE_REGMASK_START_PLDM_ON | PKE_REGMASK_START_EXEC_ON)))
		return false;

	/* Compute (1*R^2modM) using hardware. */
	registers->pke_seg_size |= PKE_REGVAL_SEG_SIZE_FUNC_ID_A1;	/* A*1 */

	/* When PKE_REGVAL_SEG_SIZE_FUNC_ID_A1 is set, the hardware ignores the segid_B pointer. */
	registers->pke_seg_id = RSAPkeSetInOut(acum_segid, rsq_segid, rsq_segid);
	if (!RSAPkeRun(registers, PKE_REGMASK_START_EXEC_ON))
		return false;

	/* Clear the PKE_REGVAL_SEG_SIZE_FUNC_ID_A1 bit. */
	registers->pke_seg_size &= ~PKE_REGVAL_SEG_SIZE_FUNC_ID_A1;

	/* Run the PKE: square-and-multiply, scanning the exponent from its top bit. */
	const uint32_t word_bits = DATA_WORD_TO_BYTE * DATA_BYTE_TO_BIT;
	uint32_t *exp_word = (uint32_t *)expn;

	/* Word aligned byte length */
	uint32_t uExpLen = ((expn_length + (DATA_WORD_TO_BYTE - 1)) & (~0x3));
	uint32_t exp_words = uExpLen / DATA_WORD_TO_BYTE;
	uint32_t last_word = exp_word[exp_words - 1];

	/* Skip the leading zero bits of the most significant exponent word. */
	uint32_t uIndex = word_bits - (highestSetBit(&last_word, DATA_WORD_TO_BYTE) + 1);

	for (; uIndex < (uExpLen * DATA_BYTE_TO_BIT); uIndex++) {
		uint32_t offset = exp_words - (uIndex / word_bits) - 1;
		unsigned int word = exp_word[offset];
		unsigned int bit;

		/* tmp = acum * acum */
		registers->pke_seg_id = RSAPkeSetInOut(tmp_segid, acum_segid, acum_segid);
		if (!RSAPkeRun(registers, PKE_REGMASK_START_EXEC_ON))
			return false;

		bit = word & (0x80000000 >> (uIndex % word_bits));
		if (bit) {
			/* acum = tmp * iter */
			registers->pke_seg_id = RSAPkeSetInOut(acum_segid, tmp_segid, iter_segid);
			if (!RSAPkeRun(registers, PKE_REGMASK_START_EXEC_ON))
				return false;
		} else {
			/* acum = tmp, carrying the sign bit of tmp along with the data. */
			memcpy((uint8_t *)memory + acum_segid * segment_size,
			       (uint8_t *)memory + tmp_segid * segment_size,
			       segment_size);
			if (registers->pke_seg_sign & (0x01 << tmp_segid))
				registers->pke_seg_sign |= (0x01 << acum_segid);
			else
				registers->pke_seg_sign &= ~(0x01 << acum_segid);
		}
	}

	/* Factor out R^-1 from the result */
	registers->pke_seg_size |= PKE_REGVAL_SEG_SIZE_FUNC_ID_A1;	/* A*1 */
	registers->pke_seg_id = RSAPkeSetInOut(tmp_segid, acum_segid, acum_segid);
	if (!RSAPkeRun(registers, PKE_REGMASK_START_EXEC_ON))
		return false;
	registers->pke_seg_size &= ~PKE_REGVAL_SEG_SIZE_FUNC_ID_A1;	/* A*B */

	/* Check the sign value of tmp_segid and add modulus if negative. */
	uint32_t uMask = (1 << tmp_segid);
	uint32_t uRead = registers->pke_seg_sign & uMask;
	if (uRead) {
		registers->pke_seg_sign &= ~uMask;	/* clear the sign */
		add((uint32_t *)(memory + (tmp_segid * segment_size)), (uint32_t *)modulus, modulus_length);
	}

	/* Copy data out */
	memcpy(dst, (uint8_t *)memory + (tmp_segid * segment_size), modulus_length);
	*len = modulus_length;

	return true;
}

#if IN_KERNEL
/*
 * internalTest
 *
 * Times one hardware multiply for an operand of mod_size bytes using fixed
 * fill patterns, and stores the elapsed time in microseconds, minus a fixed
 * 5 microseconds and clamped at zero, in *time_micro.
 *
 * Returns false when mod_size exceeds the largest segment or the hardware
 * run fails.  The registers and the touched segments are cleared on success.
 */
bool internalTest(volatile struct pke_regs *registers, volatile uint8_t *memory, uint32_t mod_size, uint64_t *time_micro)
{
	/*
	 * Fixed-size, word-aligned scratch buffer: copy_into_segment() reads its
	 * source as 32-bit words, and a VLA sized by the unvalidated mod_size
	 * could exhaust the stack.
	 */
	uint32_t buff_words[PKE_SEG_SIZE_256 / sizeof(uint32_t)];
	uint8_t *buff = (uint8_t *)buff_words;

	uint32_t segment_size;
	if (mod_size <= (PKE_SEG_SIZE_64)) {
		segment_size = PKE_SEG_SIZE_64;
	} else if (mod_size <= (PKE_SEG_SIZE_128)) {
		segment_size = PKE_SEG_SIZE_128;
	} else if (mod_size <= (PKE_SEG_SIZE_256)) {
		segment_size = PKE_SEG_SIZE_256;
	} else {
		return false;	/* key too long */
	}

	uint32_t rsq_segid;
	switch (segment_size) {
	case PKE_SEG_SIZE_256:
		rsq_segid = PKE_SEG_ID_06;
		break;
	case PKE_SEG_SIZE_128:
		rsq_segid = PKE_SEG_ID_14;
		break;
	case PKE_SEG_SIZE_64:
		rsq_segid = PKE_SEG_ID_29;
		break;
	default:
		/* Should not happen. */
		return false;
	}

	registers->pke_key_len = config_pke_key_len(mod_size * DATA_BYTE_TO_BIT);
	/* Set segment size. */
	registers->pke_seg_size = PkeSetMode(segment_size);
	/* Reset the signbits */
	registers->pke_seg_sign = 0;
	registers->pke_seg_id = 0;

	debug("key: %x, reg_key: %x, modsize:%u, reg_segsize: %u", config_pke_key_len(mod_size*DATA_BYTE_TO_BIT), registers->pke_key_len, mod_size, registers->pke_seg_size);

	/* Load modulus into seg0. */
	memset(buff, 0xF0, mod_size);
	copy_into_segment(memory, mod_segid, segment_size, buff, mod_size);

	/* Set R^2 */
	memset(buff, 0xE5, mod_size);
	copy_into_segment(memory, rsq_segid, segment_size, buff, mod_size);

	/* Set A */
	memset(buff, 0xD2, mod_size);
	copy_into_segment(memory, iter_segid, segment_size, buff, mod_size);

	/* Setup the segid pointers. */
	registers->pke_seg_id = RSAPkeSetInOut(iter_segid, tmp_segid, tmp_segid);

	uint64_t tasm0, tasm1, tasm2, asm_ns;
	tasm0 = mach_absolute_time();
	tasm1 = mach_absolute_time();

	/* Kick the hardware with pre load modulus bit set. */
	if (!RSAPkeRun(registers, (PKE_REGMASK_START_PLDM_ON | PKE_REGMASK_START_EXEC_ON)))
		return false;

	tasm2 = mach_absolute_time();

	/*
	 * Elapsed run time minus the cost of one mach_absolute_time() call.
	 * Clamp instead of letting the unsigned subtraction wrap.
	 */
	uint64_t run_ticks = tasm2 - tasm1;
	uint64_t overhead_ticks = tasm1 - tasm0;
	uint64_t net_ticks = (run_ticks > overhead_ticks) ? (run_ticks - overhead_ticks) : 0;
	absolutetime_to_nanoseconds(net_ticks, &asm_ns);
	//debug("Single multiply time: %lluns\n", asm_ns);

	/* Same 5 microsecond adjustment as before, but never below zero. */
	uint64_t elapsed_us = asm_ns / 1000;
	*time_micro = (elapsed_us > 5) ? (elapsed_us - 5) : 0;

	/* Clear the hardware. */
	registers->pke_key_len = 0;
	registers->pke_seg_size = 0;
	registers->pke_seg_sign = 0;
	registers->pke_seg_id = 0;

	bzero((uint8_t *)memory + (tmp_segid * segment_size), mod_size);
	bzero((uint8_t *)memory + (mod_segid * segment_size), mod_size);
	bzero((uint8_t *)memory + (iter_segid * segment_size), mod_size);

	return true;
}
#endif