/*
 * Context preserved from the diff view this listing was extracted from
 * (the +/- markers of the hunks were lost in extraction):
 *
 * CMakeLists.txt (+2 −2)
 *   @@ -13,7 +13,7 @@ if(${CMAKE_HOST_SYSTEM_NAME} STREQUAL "Linux")
 *     "${CMAKE_CXX_FLAGS} \
 *     -fPIC \
 *     -Wall \
 *     -O1 \
 *     -O2 \
 *     -g "
 *     )
 *   @@ -21,7 +21,7 @@ if(${CMAKE_HOST_SYSTEM_NAME} STREQUAL "Linux")
 *     "${CMAKE_C_FLAGS} \
 *     -fPIC \
 *     -Wall \
 *     -O1 \
 *     -O2 \
 *     -g "
 *     )
 *     endif()
 *
 * src/crypto/sha.cpp (+141 −0)
 */

// Project header of this translation unit (its contents are not visible in
// this listing). The guard lets this section also be compiled stand-alone.
#if defined(__has_include)
#  if __has_include("sha.h")
#    include "sha.h"
#  endif
#else
#  include "sha.h"
#endif
#include <cstdint>
#include <cstring>
#include <vector>

// ─── SHA-256 with SHA-NI hardware acceleration (runtime dispatch) ───
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
#include <immintrin.h>

/// @brief Computes the SHA-256 digest of @p input using the x86 SHA extensions.
///
/// The function is compiled for the "sha" and "sse4.1" targets regardless of
/// the global compiler flags, so it must only be called after a runtime check
/// such as cpu_has_shani() succeeded.
///
/// @param input Message bytes to hash (may be empty).
/// @return The 32-byte digest, most significant byte of each state word first.
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("sha,sse4.1")))
#endif
static std::vector<uint8_t> sha256_hash_shani(const std::vector<uint8_t>& input) {
    // SHA-256 initial hash value, split into the words A..D and E..H.
    alignas(16) static const uint32_t H0[4] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a};
    alignas(16) static const uint32_t H1[4] = {0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};

    // SHA-256 round constants K[0..63].
    alignas(16) static const uint32_t K256[64] = {
        0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
        0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
        0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
        0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
        0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
        0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
        0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
        0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    };

    // Byte shuffle that reverses the bytes inside every 32-bit lane
    // (big-endian message words <-> little-endian registers).
    const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

    // Padding: full 64-byte blocks are hashed straight from `input`; only the
    // remainder, the 0x80 marker, the zero fill and the 64-bit big-endian bit
    // length are materialised, in a one- or two-block tail buffer.
    const size_t fullBlocks = input.size() / 64;
    const size_t rem = input.size() % 64;
    const uint64_t bitLen = static_cast<uint64_t>(input.size()) * 8;

    alignas(16) uint8_t tail[128] = {0};
    if (rem != 0) {
        std::memcpy(tail, input.data() + fullBlocks * 64, rem);
    }
    tail[rem] = 0x80;
    // The marker plus the 8 length bytes fit in one block only if rem < 56.
    const size_t tailBlocks = (rem < 56) ? 1 : 2;
    uint8_t* const lenField = tail + tailBlocks * 64 - 8;
    for (int i = 0; i < 8; ++i) {
        lenField[i] = static_cast<uint8_t>((bitLen >> (56 - 8 * i)) & 0xFF);
    }
    const size_t totalBlocks = fullBlocks + tailBlocks;

    // Repack the initial state into the register layout used by
    // _mm_sha256rnds2_epu32: STATE0 = ABEF, STATE1 = CDGH.
    __m128i TMP = _mm_shuffle_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(H0)), 0xB1);
    __m128i STATE1 = _mm_shuffle_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(H1)), 0x1B);
    __m128i STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);   // ABEF
    STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0);        // CDGH

    for (size_t b = 0; b < totalBlocks; ++b) {
        const uint8_t* const block = (b < fullBlocks) ? input.data() + b * 64
                                                      : tail + (b - fullBlocks) * 64;
        const __m128i ABEF_SAVE = STATE0;
        const __m128i CDGH_SAVE = STATE1;

        // W[j] holds four consecutive schedule words; initially W[0..15].
        __m128i W[4];
        for (int j = 0; j < 4; ++j) {
            W[j] = _mm_shuffle_epi8(
                _mm_loadu_si128(reinterpret_cast<const __m128i*>(block + 16 * j)), MASK);
        }

        // 16 groups of four rounds each (t = 4 * g).
        for (int g = 0; g < 16; ++g) {
            __m128i& cur = W[g & 3];
            if (g >= 4) {
                // `cur` still holds W[t-16..t-13]; turn it in place into
                // W[t..t+3] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16].
                const __m128i w12 = W[(g + 1) & 3];  // W[t-12..t-9]
                const __m128i w8  = W[(g + 2) & 3];  // W[t-8 ..t-5]
                const __m128i w4  = W[(g + 3) & 3];  // W[t-4 ..t-1]
                cur = _mm_sha256msg1_epu32(cur, w12);                  // + s0(W[t-15+i])
                cur = _mm_add_epi32(cur, _mm_alignr_epi8(w4, w8, 4));  // + W[t-7+i]
                cur = _mm_sha256msg2_epu32(cur, w4);                   // + s1(W[t-2+i])
            }

            // Two sha256rnds2 calls perform four rounds; the second call
            // consumes the upper two (W + K) words moved down by the shuffle.
            __m128i wk = _mm_add_epi32(
                cur, _mm_loadu_si128(reinterpret_cast<const __m128i*>(&K256[4 * g])));
            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, wk);
            wk = _mm_shuffle_epi32(wk, 0x0E);
            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, wk);
        }

        // Feed-forward: add the state from before this block.
        STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
        STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
    }

    // Undo the ABEF/CDGH packing so memory order is A..D then E..H, then
    // byte-swap every word to big-endian for the output digest.
    TMP = _mm_shuffle_epi32(STATE0, 0x1B);
    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);
    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0);
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);
    STATE0 = _mm_shuffle_epi8(STATE0, MASK);
    STATE1 = _mm_shuffle_epi8(STATE1, MASK);

    std::vector<uint8_t> hash(32);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(hash.data()), STATE0);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(hash.data() + 16), STATE1);
    return hash;
}

/// @brief Reports whether the running CPU can execute sha256_hash_shani().
///
/// Checks both instruction sets that function is compiled for. On compilers
/// other than GCC/Clang no detection is attempted and false is returned.
static bool cpu_has_shani() {
#if defined(__GNUC__) || defined(__clang__)
    return __builtin_cpu_supports("sha") && __builtin_cpu_supports("sse4.1");
#else
    return false;
#endif
}

#endif // x86

// ── Remainder of the diff view (definitions are truncated in this listing; kept verbatim) ──
// std::vector<uint8_t> netplus::sha1_hash(const std::vector<uint8_t>& input) {
// #define SHA1_ROL(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
// ...
// @@ -68,6 +204,11 @@ std::vector<uint8_t> netplus::sha1_hash(const std::vector<uint8_t>& input) {
// }
// std::vector<uint8_t> netplus::sha256_hash(const std::vector<uint8_t>& input) {
// #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
//     static const bool has_shani = cpu_has_shani();
//     if (has_shani) return sha256_hash_shani(input);
// #endif
//     // SHA-256 - correct implementation based on FIPS 180-4
// #define SHA256_ROR(value, bits) (((value) >> (bits)) | ((value) << (32 - (bits))))
// ...
CMakeLists.txt +2 −2 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ if(${CMAKE_HOST_SYSTEM_NAME} STREQUAL "Linux") "${CMAKE_CXX_FLAGS} \ -fPIC \ -Wall \ -O1 \ -O2 \ -g " ) Loading @@ -21,7 +21,7 @@ if(${CMAKE_HOST_SYSTEM_NAME} STREQUAL "Linux") "${CMAKE_C_FLAGS} \ -fPIC \ -Wall \ -O1 \ -O2 \ -g " ) endif() Loading
/*
 * Context preserved from the diff view this listing was extracted from:
 *
 * src/crypto/sha.cpp (+141 −0)
 */

// Project header of this translation unit (its contents are not visible in
// this listing). The guard lets this section also be compiled stand-alone.
#if defined(__has_include)
#  if __has_include("sha.h")
#    include "sha.h"
#  endif
#else
#  include "sha.h"
#endif
#include <cstdint>
#include <cstring>
#include <vector>

// ─── SHA-256 with SHA-NI hardware acceleration (runtime dispatch) ───
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
#include <immintrin.h>

/// @brief Computes the SHA-256 digest of @p input using the x86 SHA extensions.
///
/// The function is compiled for the "sha" and "sse4.1" targets regardless of
/// the global compiler flags, so it must only be called after a runtime check
/// such as cpu_has_shani() succeeded.
///
/// @param input Message bytes to hash (may be empty).
/// @return The 32-byte digest, most significant byte of each state word first.
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("sha,sse4.1")))
#endif
static std::vector<uint8_t> sha256_hash_shani(const std::vector<uint8_t>& input) {
    // SHA-256 initial hash value, split into the words A..D and E..H.
    alignas(16) static const uint32_t H0[4] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a};
    alignas(16) static const uint32_t H1[4] = {0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};

    // SHA-256 round constants K[0..63].
    alignas(16) static const uint32_t K256[64] = {
        0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
        0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
        0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
        0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
        0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
        0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
        0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
        0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    };

    // Byte shuffle that reverses the bytes inside every 32-bit lane
    // (big-endian message words <-> little-endian registers).
    const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

    // Padding: full 64-byte blocks are hashed straight from `input`; only the
    // remainder, the 0x80 marker, the zero fill and the 64-bit big-endian bit
    // length are materialised, in a one- or two-block tail buffer.
    const size_t fullBlocks = input.size() / 64;
    const size_t rem = input.size() % 64;
    const uint64_t bitLen = static_cast<uint64_t>(input.size()) * 8;

    alignas(16) uint8_t tail[128] = {0};
    if (rem != 0) {
        std::memcpy(tail, input.data() + fullBlocks * 64, rem);
    }
    tail[rem] = 0x80;
    // The marker plus the 8 length bytes fit in one block only if rem < 56.
    const size_t tailBlocks = (rem < 56) ? 1 : 2;
    uint8_t* const lenField = tail + tailBlocks * 64 - 8;
    for (int i = 0; i < 8; ++i) {
        lenField[i] = static_cast<uint8_t>((bitLen >> (56 - 8 * i)) & 0xFF);
    }
    const size_t totalBlocks = fullBlocks + tailBlocks;

    // Repack the initial state into the register layout used by
    // _mm_sha256rnds2_epu32: STATE0 = ABEF, STATE1 = CDGH.
    __m128i TMP = _mm_shuffle_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(H0)), 0xB1);
    __m128i STATE1 = _mm_shuffle_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(H1)), 0x1B);
    __m128i STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);   // ABEF
    STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0);        // CDGH

    for (size_t b = 0; b < totalBlocks; ++b) {
        const uint8_t* const block = (b < fullBlocks) ? input.data() + b * 64
                                                      : tail + (b - fullBlocks) * 64;
        const __m128i ABEF_SAVE = STATE0;
        const __m128i CDGH_SAVE = STATE1;

        // W[j] holds four consecutive schedule words; initially W[0..15].
        __m128i W[4];
        for (int j = 0; j < 4; ++j) {
            W[j] = _mm_shuffle_epi8(
                _mm_loadu_si128(reinterpret_cast<const __m128i*>(block + 16 * j)), MASK);
        }

        // 16 groups of four rounds each (t = 4 * g).
        for (int g = 0; g < 16; ++g) {
            __m128i& cur = W[g & 3];
            if (g >= 4) {
                // `cur` still holds W[t-16..t-13]; turn it in place into
                // W[t..t+3] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16].
                const __m128i w12 = W[(g + 1) & 3];  // W[t-12..t-9]
                const __m128i w8  = W[(g + 2) & 3];  // W[t-8 ..t-5]
                const __m128i w4  = W[(g + 3) & 3];  // W[t-4 ..t-1]
                cur = _mm_sha256msg1_epu32(cur, w12);                  // + s0(W[t-15+i])
                cur = _mm_add_epi32(cur, _mm_alignr_epi8(w4, w8, 4));  // + W[t-7+i]
                cur = _mm_sha256msg2_epu32(cur, w4);                   // + s1(W[t-2+i])
            }

            // Two sha256rnds2 calls perform four rounds; the second call
            // consumes the upper two (W + K) words moved down by the shuffle.
            __m128i wk = _mm_add_epi32(
                cur, _mm_loadu_si128(reinterpret_cast<const __m128i*>(&K256[4 * g])));
            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, wk);
            wk = _mm_shuffle_epi32(wk, 0x0E);
            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, wk);
        }

        // Feed-forward: add the state from before this block.
        STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
        STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
    }

    // Undo the ABEF/CDGH packing so memory order is A..D then E..H, then
    // byte-swap every word to big-endian for the output digest.
    TMP = _mm_shuffle_epi32(STATE0, 0x1B);
    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);
    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0);
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);
    STATE0 = _mm_shuffle_epi8(STATE0, MASK);
    STATE1 = _mm_shuffle_epi8(STATE1, MASK);

    std::vector<uint8_t> hash(32);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(hash.data()), STATE0);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(hash.data() + 16), STATE1);
    return hash;
}

/// @brief Reports whether the running CPU can execute sha256_hash_shani().
///
/// Checks both instruction sets that function is compiled for. On compilers
/// other than GCC/Clang no detection is attempted and false is returned.
static bool cpu_has_shani() {
#if defined(__GNUC__) || defined(__clang__)
    return __builtin_cpu_supports("sha") && __builtin_cpu_supports("sse4.1");
#else
    return false;
#endif
}

#endif // x86

// ── Remainder of the diff view (definitions are truncated in this listing; kept verbatim) ──
// std::vector<uint8_t> netplus::sha1_hash(const std::vector<uint8_t>& input) {
// #define SHA1_ROL(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
// ...
// @@ -68,6 +204,11 @@ std::vector<uint8_t> netplus::sha1_hash(const std::vector<uint8_t>& input) {
// }
// std::vector<uint8_t> netplus::sha256_hash(const std::vector<uint8_t>& input) {
// #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
//     static const bool has_shani = cpu_has_shani();
//     if (has_shani) return sha256_hash_shani(input);
// #endif
//     // SHA-256 - correct implementation based on FIPS 180-4
// #define SHA256_ROR(value, bits) (((value) >> (bits)) | ((value) << (32 - (bits))))
// ...