Commit 06016625 authored by jan.koester's avatar jan.koester
Browse files

test

parent 1e531e31
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ if(${CMAKE_HOST_SYSTEM_NAME} STREQUAL "Linux")
        "${CMAKE_CXX_FLAGS} \
        -fPIC \
        -Wall \
        -O1 \
        -O2 \
        -g "
    )

@@ -21,7 +21,7 @@ if(${CMAKE_HOST_SYSTEM_NAME} STREQUAL "Linux")
        "${CMAKE_C_FLAGS} \
        -fPIC \
        -Wall \
        -O1 \
        -O2 \
        -g "
    )
endif()
+141 −0
Original line number Diff line number Diff line
#include "sha.h"
#include <cstring>

// ─── SHA-256 with SHA-NI hardware acceleration (runtime dispatch) ───
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
#include <immintrin.h>

// Computes the SHA-256 digest of `input` with the x86 SHA extensions (SHA-NI).
//
// The function is compiled for the "sha" and "sse4.1" targets, so it must
// only be called on CPUs that provide them; the caller is expected to check
// for that first.
//
// input:   message bytes to hash (may be empty).
// returns: the 32-byte digest, most significant byte of each word first.
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("sha,sse4.1")))
#endif
static std::vector<uint8_t> sha256_hash_shani(const std::vector<uint8_t>& input) {
    // SHA-256 initial hash value: words A..D and E..H.
    alignas(16) static const uint32_t H0[4] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a};
    alignas(16) static const uint32_t H1[4] = {0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
    // SHA-256 round constants K[0..63].
    static const uint32_t K256[64] = {
        0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
        0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
        0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
        0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
        0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
        0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
        0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
        0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    };
    // Shuffle mask that reverses the bytes of every 32-bit lane
    // (big-endian message/digest words <-> little-endian host words).
    const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);

    // Padding. Whole 64-byte blocks are hashed straight out of `input`; only
    // the trailing partial block, the 0x80 marker, the zero fill and the
    // 64-bit big-endian bit length are staged in a small local buffer.
    const size_t fullBlocks = input.size() / 64;
    const size_t rem = input.size() % 64;
    // Marker (1 byte) + length (8 bytes) fit into the last block only if
    // at most 55 message bytes are left over.
    const size_t tailBlocks = (rem < 56) ? 1 : 2;
    const uint64_t bitLen = static_cast<uint64_t>(input.size()) * 8;

    uint8_t tail[128] = {0};
    if (rem != 0) std::memcpy(tail, input.data() + fullBlocks * 64, rem);
    tail[rem] = 0x80;
    for (size_t i = 0; i < 8; ++i)
        tail[tailBlocks * 64 - 1 - i] = static_cast<uint8_t>(bitLen >> (i * 8));

    // Init state. Lane names below are written from the highest 32-bit lane
    // to the lowest. _mm_sha256rnds2_epu32 wants the working variables split
    // as ABEF / CDGH.
    __m128i STATE0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(H0)); // DCBA
    __m128i STATE1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(H1)); // HGFE
    __m128i TMP = _mm_shuffle_epi32(STATE0, 0xB1); // CDAB
    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);      // EFGH
    STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);      // ABEF
    STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0);   // CDGH

    const size_t totalBlocks = fullBlocks + tailBlocks;
    for (size_t b = 0; b < totalBlocks; ++b) {
        const uint8_t* block = (b < fullBlocks) ? input.data() + b * 64
                                                : tail + (b - fullBlocks) * 64;

        const __m128i ABEF_SAVE = STATE0;
        const __m128i CDGH_SAVE = STATE1;

        // Rolling window over the message schedule: while group g is being
        // processed, W[g & 3] holds schedule words 4g..4g+3 and the other
        // three registers hold the three preceding groups.
        __m128i W[4];
        for (int j = 0; j < 4; ++j) {
            W[j] = _mm_shuffle_epi8(
                _mm_loadu_si128(reinterpret_cast<const __m128i*>(block + 16 * j)), MASK);
        }

        // 16 groups of 4 rounds each.
        for (int g = 0; g < 16; ++g) {
            __m128i& cur = W[g & 3];
            if (g >= 4) {
                // `cur` still holds group g-4. Expand it into group g:
                //   W[t] = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])
                const __m128i prev3 = W[(g + 1) & 3]; // group g-3
                const __m128i prev2 = W[(g + 2) & 3]; // group g-2
                const __m128i prev1 = W[(g + 3) & 3]; // group g-1
                __m128i next = _mm_sha256msg1_epu32(cur, prev3);              // W[t-16] + s0(W[t-15])
                next = _mm_add_epi32(next, _mm_alignr_epi8(prev1, prev2, 4)); // + W[t-7]
                cur = _mm_sha256msg2_epu32(next, prev1);                      // + s1(W[t-2])
            }

            // Each sha256rnds2 consumes the two low lanes of W+K and returns
            // the new ABEF; the previous ABEF becomes the new CDGH, which is
            // why the two state registers swap roles between the calls.
            __m128i WK = _mm_add_epi32(
                cur, _mm_loadu_si128(reinterpret_cast<const __m128i*>(&K256[4 * g])));
            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, WK);
            WK = _mm_shuffle_epi32(WK, 0x0E); // move the upper two lanes down
            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, WK);
        }

        // Add the compressed chunk to the running hash value.
        STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
        STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
    }

    // Rearrange back to ABCDEFGH and byte-swap to big-endian
    TMP = _mm_shuffle_epi32(STATE0, 0x1B);        // FEBA
    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);     // DCHG
    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0);  // DCBA
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);     // HGFE

    STATE0 = _mm_shuffle_epi8(STATE0, MASK);
    STATE1 = _mm_shuffle_epi8(STATE1, MASK);

    std::vector<uint8_t> hash(32);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(hash.data()), STATE0);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(hash.data() + 16), STATE1);
    return hash;
}

// Reports whether the running CPU advertises the SHA instruction set
// extension, as answered by the compiler's __builtin_cpu_supports("sha").
// On compilers other than GCC/Clang no detection is attempted and the
// result is always false.
static bool cpu_has_shani() {
    bool supported = false;
#if defined(__GNUC__) || defined(__clang__)
    supported = (__builtin_cpu_supports("sha") != 0);
#endif
    return supported;
}
#endif // x86

std::vector<uint8_t> netplus::sha1_hash(const std::vector<uint8_t>& input) {

    #define SHA1_ROL(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
@@ -68,6 +204,11 @@ std::vector<uint8_t> netplus::sha1_hash(const std::vector<uint8_t>& input) {
}

std::vector<uint8_t> netplus::sha256_hash(const std::vector<uint8_t>& input) {
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
    static const bool has_shani = cpu_has_shani();
    if (has_shani) return sha256_hash_shani(input);
#endif

    // SHA-256 - correct implementation based on FIPS 180-4
    #define SHA256_ROR(value, bits) (((value) >> (bits)) | ((value) << (32 - (bits))))