Commit e0ca5ae8 authored by jan.koester's avatar jan.koester
Browse files

deb

parent 85b26bdb
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
libnetplus (20260425+1) unstable; urgency=medium

  * RSA: replace separate multiply + reduce with fused CIOS
    (Coarsely Integrated Operand Scanning) in montgomeryMultiply_into,
    removing the separate schoolbook multiplication pass (and its 2n-word
    intermediate product) and improving cache locality

 -- Jan Koester <jan.koester@tuxist.de>  Sat, 25 Apr 2026 12:00:00 +0200

libnetplus (20260424+22) unstable; urgency=medium

  * Fix remaining compiler warnings: remove unused derOctetString, enlarge
+44 −27
Original line number Diff line number Diff line
@@ -1219,50 +1219,67 @@ namespace netplus {
        return final_res;
    }

    // In-place Montgomery multiply: writes result into |out|, uses |scratch| as temp.
    // Both scratch and out must be pre-allocated with sufficient capacity.
    // In-place Montgomery multiply using CIOS (Coarsely Integrated Operand Scanning).
    // Fuses multiplication and reduction into a single pass for better cache locality.
    // scratch is used as the n+2 word accumulator T.
    void rsa::montgomeryMultiply_into(const bigInt& a, const bigInt& b,
                                       const bigInt& mod, uint32_t n_prime,
                                       bigInt& scratch, bigInt& out) {
        const size_t n = mod.used;
        const size_t need = 2 * n + 2;  // extra word for carry propagation
        const size_t tlen = n + 2;

        // Ensure scratch has enough room
        if (scratch.capacity < need) scratch.reserve(need);
        // Use scratch as T accumulator
        if (scratch.capacity < tlen) scratch.reserve(tlen);
        uint32_t* __restrict__ T = scratch.data.get();
        std::memset(T, 0, tlen * sizeof(uint32_t));

        // Perform schoolbook multiplication (scratch = a * b)
        multiply(a, b, scratch);
        // Ensure any limbs beyond the product are zero for carry propagation
        if (scratch.capacity < need) scratch.reserve(need);
        for (size_t i = scratch.used; i < need; ++i)
            scratch.data[i] = 0;

        // Montgomery Reduction — use raw pointers
        uint32_t* __restrict__ sp = scratch.data.get();
        const uint32_t* __restrict__ ap = a.data.get();
        const uint32_t* __restrict__ bp = b.data.get();
        const uint32_t* __restrict__ mp = mod.data.get();
        const size_t au = a.used;

        for (size_t i = 0; i < n; i++) {
            const uint64_t mi = (uint64_t)(sp[i] * n_prime);
        for (size_t i = 0; i < n; ++i) {
            // Step 1: T += a[i] * b
            const uint64_t ai = (i < au) ? (uint64_t)ap[i] : 0;
            uint64_t carry = 0;
            for (size_t j = 0; j < n; j++) {
                uint64_t cur = (uint64_t)sp[i + j] + mi * (uint64_t)mp[j] + carry;
                sp[i + j] = (uint32_t)cur;
            for (size_t j = 0; j < n; ++j) {
                uint64_t cur = (uint64_t)T[j] + ai * (uint64_t)bp[j] + carry;
                T[j] = (uint32_t)cur;
                carry = cur >> 32;
            }
            size_t k = i + n;
            while (carry > 0) {
                uint64_t cur = (uint64_t)sp[k] + carry;
                sp[k] = (uint32_t)cur;
            {
                uint64_t cur = (uint64_t)T[n] + carry;
                T[n] = (uint32_t)cur;
                T[n + 1] = (uint32_t)(cur >> 32);
            }

            // Step 2: Montgomery reduction — T += m_i * mod, then shift right by one word
            const uint64_t mi = (uint64_t)((uint32_t)(T[0] * n_prime));
            carry = 0;
            // First word: T[0] + mi*mod[0] is zero mod 2^32, just capture carry
            {
                uint64_t cur = (uint64_t)T[0] + mi * (uint64_t)mp[0];
                carry = cur >> 32;
            }
            for (size_t j = 1; j < n; ++j) {
                uint64_t cur = (uint64_t)T[j] + mi * (uint64_t)mp[j] + carry;
                T[j - 1] = (uint32_t)cur;
                carry = cur >> 32;
            }
            {
                uint64_t cur = (uint64_t)T[n] + carry;
                T[n - 1] = (uint32_t)cur;
                carry = cur >> 32;
                k++;
            }
            T[n] = T[n + 1] + (uint32_t)carry;
            T[n + 1] = 0;
        }

        // Shift right by N words (divide by R) — write into out
        // Result is T[0..n] (n words, or n + 1 words when T[n] != 0)
        if (out.capacity < n + 1) out.reserve(n + 1);
        uint32_t* __restrict__ op = out.data.get();
        std::memcpy(op, sp + n, (n + 1) * sizeof(uint32_t));
        out.used = n + 1;
        std::memcpy(op, T, (n + 1) * sizeof(uint32_t));
        out.used = (T[n] != 0) ? n + 1 : n;
        while (out.used > 1 && op[out.used - 1] == 0) out.used--;

        // Final step: if out >= mod, subtract mod in-place