test (06016625) · Commits · jan.koester / libnetplus

CMakeLists.txt

+2 −2

Original line number	Diff line number	Diff line
		@@ -13,7 +13,7 @@ if(${CMAKE_HOST_SYSTEM_NAME} STREQUAL "Linux")
		"${CMAKE_CXX_FLAGS} \
		-fPIC \
		-Wall \
		-O1 \
		-O2 \
		-g "
		)

		@@ -21,7 +21,7 @@ if(${CMAKE_HOST_SYSTEM_NAME} STREQUAL "Linux")
		"${CMAKE_C_FLAGS} \
		-fPIC \
		-Wall \
		-O1 \
		-O2 \
		-g "
		)
		endif()

src/crypto/sha.cpp

+141 −0

Original line number	Diff line number	Diff line
		#include "sha.h"
		#include <cstring>

		// ─── SHA-256 with SHA-NI hardware acceleration (runtime dispatch) ───
		#if defined(__x86_64__) \|\| defined(_M_X64) \|\| defined(__i386__) \|\| defined(_M_IX86)
		#include <immintrin.h>

		// Computes the SHA-256 digest of `input` using the x86 SHA extensions
		// (sha256rnds2 / sha256msg1 / sha256msg2).
		//
		// The function is compiled with the "sha" and "sse4.1" target features
		// enabled, so the caller must verify at run time that the CPU supports them
		// before calling it.
		//
		// @param input  message bytes (may be empty)
		// @return       the 32-byte digest, most significant byte of each word first
		#if defined(__GNUC__) || defined(__clang__)
		__attribute__((target("sha,sse4.1")))
		#endif
		static std::vector<uint8_t> sha256_hash_shani(const std::vector<uint8_t>& input) {
		    // Initial hash value, split into the words A..D and E..H.
		    alignas(16) static const uint32_t H_ABCD[4] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a};
		    alignas(16) static const uint32_t H_EFGH[4] = {0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
		    // Round constants K[0..63].
		    alignas(16) static const uint32_t K256[64] = {
		        0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
		        0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
		        0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
		        0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
		        0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
		        0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
		        0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
		        0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
		    };
		    // pshufb control that reverses the byte order inside every 32-bit lane.
		    const __m128i BSWAP32 = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

		    // Padding: message || 0x80 || zeros || 64-bit big-endian bit length,
		    // rounded up to a whole number of 64-byte blocks.
		    const size_t paddedLen = ((input.size() + 1 + 8 + 63) / 64) * 64;
		    const uint64_t bitLen = static_cast<uint64_t>(input.size()) * 8;
		    std::vector<uint8_t> msg;
		    msg.reserve(paddedLen);
		    msg.assign(input.begin(), input.end());
		    msg.push_back(0x80);
		    msg.resize(paddedLen - 8, 0x00);
		    for (int shift = 56; shift >= 0; shift -= 8) {
		        msg.push_back(static_cast<uint8_t>(bitLen >> shift));
		    }

		    // sha256rnds2 expects the state split as {A,B,E,F} and {C,D,G,H}.
		    // Lane contents below are listed from lane 0 (lowest) to lane 3.
		    __m128i TMP    = _mm_loadu_si128(reinterpret_cast<const __m128i*>(H_ABCD));
		    __m128i STATE1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(H_EFGH));
		    TMP    = _mm_shuffle_epi32(TMP, 0xB1);               // B A D C
		    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);            // H G F E
		    __m128i STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);    // F E B A
		    STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0);         // H G D C

		    for (size_t offset = 0; offset < msg.size(); offset += 64) {
		        const uint8_t* block = msg.data() + offset;
		        const __m128i abefSave = STATE0;
		        const __m128i cdghSave = STATE1;

		        // W[] is a ring of the four most recent schedule groups (4 words each).
		        // Groups 0..3 are the block itself, converted from big-endian.
		        __m128i W[4];
		        for (int g = 0; g < 4; ++g) {
		            const __m128i raw = _mm_loadu_si128(reinterpret_cast<const __m128i*>(block + 16 * g));
		            W[g] = _mm_shuffle_epi8(raw, BSWAP32);
		        }

		        // 16 groups of 4 rounds. Group g consumes words W[4g..4g+3] together
		        // with K[4g..4g+3].
		        for (int g = 0; g < 16; ++g) {
		            __m128i& cur = W[g & 3];
		            if (g >= 4) {
		                // Before the update the ring holds groups g-4 (in `cur`), g-3, g-2
		                // and g-1. Per word: W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16].
		                const __m128i& older  = W[(g + 1) & 3];  // group g-3
		                const __m128i& prev2  = W[(g + 2) & 3];  // group g-2
		                const __m128i& prev1  = W[(g + 3) & 3];  // group g-1
		                __m128i next = _mm_sha256msg1_epu32(cur, older);              // W[t-16] + s0(W[t-15])
		                next = _mm_add_epi32(next, _mm_alignr_epi8(prev1, prev2, 4)); // + W[t-7]
		                cur = _mm_sha256msg2_epu32(next, prev1);                      // + s1(W[t-2])
		            }

		            __m128i wk = _mm_add_epi32(
		                cur, _mm_loadu_si128(reinterpret_cast<const __m128i*>(&K256[4 * g])));
		            // Each sha256rnds2 performs two rounds using the low two lanes of wk.
		            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, wk);
		            wk = _mm_shuffle_epi32(wk, 0x0E);  // move the upper two lanes down
		            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, wk);
		        }

		        // Feed-forward: add the compressed block into the running state.
		        STATE0 = _mm_add_epi32(STATE0, abefSave);
		        STATE1 = _mm_add_epi32(STATE1, cdghSave);
		    }

		    // Restore the A..D / E..H word order, then byte-swap every word so the
		    // stored digest is big-endian.
		    TMP    = _mm_shuffle_epi32(STATE0, 0x1B);            // A B E F
		    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);            // G H C D
		    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0);         // A B C D
		    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);            // E F G H
		    STATE0 = _mm_shuffle_epi8(STATE0, BSWAP32);
		    STATE1 = _mm_shuffle_epi8(STATE1, BSWAP32);

		    std::vector<uint8_t> hash(32);
		    _mm_storeu_si128(reinterpret_cast<__m128i*>(hash.data()), STATE0);
		    _mm_storeu_si128(reinterpret_cast<__m128i*>(hash.data() + 16), STATE1);
		    return hash;
		}

		// Reports whether the running CPU can execute the SHA-NI SHA-256 kernel.
		//
		// The kernel is built for the "sha" and "sse4.1" targets and also uses
		// SSSE3 byte shuffles, so all three features are checked. On compilers
		// without __builtin_cpu_supports the answer is always false, which makes
		// callers fall back to the portable implementation.
		static bool cpu_has_shani() {
		#if defined(__GNUC__) || defined(__clang__)
		    return __builtin_cpu_supports("sha") != 0
		        && __builtin_cpu_supports("sse4.1") != 0
		        && __builtin_cpu_supports("ssse3") != 0;
		#else
		    return false;
		#endif
		}
		#endif // x86

		std::vector<uint8_t> netplus::sha1_hash(const std::vector<uint8_t>& input) {

		#define SHA1_ROL(value, bits) (((value) << (bits)) \| ((value) >> (32 - (bits))))
		@@ -68,6 +204,11 @@ std::vector<uint8_t> netplus::sha1_hash(const std::vector<uint8_t>& input) {
		}

		std::vector<uint8_t> netplus::sha256_hash(const std::vector<uint8_t>& input) {
		#if defined(__x86_64__) \|\| defined(_M_X64) \|\| defined(__i386__) \|\| defined(_M_IX86)
		static const bool has_shani = cpu_has_shani();
		if (has_shani) return sha256_hash_shani(input);
		#endif

		// SHA-256 - correct implementation based on FIPS 180-4
		#define SHA256_ROR(value, bits) (((value) >> (bits)) \| ((value) << (32 - (bits))))