Loading debian/changelog +9 −0 Original line number Diff line number Diff line libnetplus (20260425+1) unstable; urgency=medium * RSA: replace separate multiply + reduce with fused CIOS (Coarsely Integrated Operand Scanning) in montgomeryMultiply_into, eliminating a full schoolbook multiplication pass and improving cache locality -- Jan Koester <jan.koester@tuxist.de> Sat, 25 Apr 2026 12:00:00 +0200 libnetplus (20260424+22) unstable; urgency=medium * Fix remaining compiler warnings: remove unused derOctetString, enlarge Loading src/crypto/rsa.cpp +44 −27 Original line number Diff line number Diff line Loading @@ -1219,50 +1219,67 @@ namespace netplus { return final_res; } // In-place Montgomery multiply: writes result into |out|, uses |scratch| as temp. // Both scratch and out must be pre-allocated with sufficient capacity. // In-place Montgomery multiply using CIOS (Coarsely Integrated Operand Scanning). // Fuses multiplication and reduction into a single pass for better cache locality. // scratch is used as the n+2 word accumulator T. 
void rsa::montgomeryMultiply_into(const bigInt& a, const bigInt& b, const bigInt& mod, uint32_t n_prime, bigInt& scratch, bigInt& out) { const size_t n = mod.used; const size_t need = 2 * n + 2; // extra word for carry propagation const size_t tlen = n + 2; // Ensure scratch has enough room if (scratch.capacity < need) scratch.reserve(need); // Use scratch as T accumulator if (scratch.capacity < tlen) scratch.reserve(tlen); uint32_t* __restrict__ T = scratch.data.get(); std::memset(T, 0, tlen * sizeof(uint32_t)); // Perform schoolbook multiplication (scratch = a * b) multiply(a, b, scratch); // Ensure any limbs beyond the product are zero for carry propagation if (scratch.capacity < need) scratch.reserve(need); for (size_t i = scratch.used; i < need; ++i) scratch.data[i] = 0; // Montgomery Reduction — use raw pointers uint32_t* __restrict__ sp = scratch.data.get(); const uint32_t* __restrict__ ap = a.data.get(); const uint32_t* __restrict__ bp = b.data.get(); const uint32_t* __restrict__ mp = mod.data.get(); const size_t au = a.used; for (size_t i = 0; i < n; i++) { const uint64_t mi = (uint64_t)(sp[i] * n_prime); for (size_t i = 0; i < n; ++i) { // Step 1: T += a[i] * b const uint64_t ai = (i < au) ? 
(uint64_t)ap[i] : 0; uint64_t carry = 0; for (size_t j = 0; j < n; j++) { uint64_t cur = (uint64_t)sp[i + j] + mi * (uint64_t)mp[j] + carry; sp[i + j] = (uint32_t)cur; for (size_t j = 0; j < n; ++j) { uint64_t cur = (uint64_t)T[j] + ai * (uint64_t)bp[j] + carry; T[j] = (uint32_t)cur; carry = cur >> 32; } size_t k = i + n; while (carry > 0) { uint64_t cur = (uint64_t)sp[k] + carry; sp[k] = (uint32_t)cur; { uint64_t cur = (uint64_t)T[n] + carry; T[n] = (uint32_t)cur; T[n + 1] = (uint32_t)(cur >> 32); } // Step 2: Montgomery reduction — T += m_i * mod, then shift right by one word const uint64_t mi = (uint64_t)((uint32_t)(T[0] * n_prime)); carry = 0; // First word: T[0] + mi*mod[0] is zero mod 2^32, just capture carry { uint64_t cur = (uint64_t)T[0] + mi * (uint64_t)mp[0]; carry = cur >> 32; } for (size_t j = 1; j < n; ++j) { uint64_t cur = (uint64_t)T[j] + mi * (uint64_t)mp[j] + carry; T[j - 1] = (uint32_t)cur; carry = cur >> 32; } { uint64_t cur = (uint64_t)T[n] + carry; T[n - 1] = (uint32_t)cur; carry = cur >> 32; k++; } T[n] = T[n + 1] + (uint32_t)carry; T[n + 1] = 0; } // Shift right by N words (divide by R) — write into out // Result is T[0..n] (up to n + 1 words; T[n] is the top carry word) if (out.capacity < n + 1) out.reserve(n + 1); uint32_t* __restrict__ op = out.data.get(); std::memcpy(op, sp + n, (n + 1) * sizeof(uint32_t)); out.used = n + 1; std::memcpy(op, T, (n + 1) * sizeof(uint32_t)); out.used = (T[n] != 0) ? n + 1 : n; while (out.used > 1 && op[out.used - 1] == 0) out.used--; // Final step: if out >= mod, subtract mod in-place Loading Loading
debian/changelog +9 −0 Original line number Diff line number Diff line libnetplus (20260425+1) unstable; urgency=medium * RSA: replace separate multiply + reduce with fused CIOS (Coarsely Integrated Operand Scanning) in montgomeryMultiply_into, eliminating a full schoolbook multiplication pass and improving cache locality -- Jan Koester <jan.koester@tuxist.de> Sat, 25 Apr 2026 12:00:00 +0200 libnetplus (20260424+22) unstable; urgency=medium * Fix remaining compiler warnings: remove unused derOctetString, enlarge Loading
src/crypto/rsa.cpp +44 −27 Original line number Diff line number Diff line Loading @@ -1219,50 +1219,67 @@ namespace netplus { return final_res; } // In-place Montgomery multiply: writes result into |out|, uses |scratch| as temp. // Both scratch and out must be pre-allocated with sufficient capacity. // In-place Montgomery multiply using CIOS (Coarsely Integrated Operand Scanning). // Fuses multiplication and reduction into a single pass for better cache locality. // scratch is used as the n+2 word accumulator T. void rsa::montgomeryMultiply_into(const bigInt& a, const bigInt& b, const bigInt& mod, uint32_t n_prime, bigInt& scratch, bigInt& out) { const size_t n = mod.used; const size_t need = 2 * n + 2; // extra word for carry propagation const size_t tlen = n + 2; // Ensure scratch has enough room if (scratch.capacity < need) scratch.reserve(need); // Use scratch as T accumulator if (scratch.capacity < tlen) scratch.reserve(tlen); uint32_t* __restrict__ T = scratch.data.get(); std::memset(T, 0, tlen * sizeof(uint32_t)); // Perform schoolbook multiplication (scratch = a * b) multiply(a, b, scratch); // Ensure any limbs beyond the product are zero for carry propagation if (scratch.capacity < need) scratch.reserve(need); for (size_t i = scratch.used; i < need; ++i) scratch.data[i] = 0; // Montgomery Reduction — use raw pointers uint32_t* __restrict__ sp = scratch.data.get(); const uint32_t* __restrict__ ap = a.data.get(); const uint32_t* __restrict__ bp = b.data.get(); const uint32_t* __restrict__ mp = mod.data.get(); const size_t au = a.used; for (size_t i = 0; i < n; i++) { const uint64_t mi = (uint64_t)(sp[i] * n_prime); for (size_t i = 0; i < n; ++i) { // Step 1: T += a[i] * b const uint64_t ai = (i < au) ? 
(uint64_t)ap[i] : 0; uint64_t carry = 0; for (size_t j = 0; j < n; j++) { uint64_t cur = (uint64_t)sp[i + j] + mi * (uint64_t)mp[j] + carry; sp[i + j] = (uint32_t)cur; for (size_t j = 0; j < n; ++j) { uint64_t cur = (uint64_t)T[j] + ai * (uint64_t)bp[j] + carry; T[j] = (uint32_t)cur; carry = cur >> 32; } size_t k = i + n; while (carry > 0) { uint64_t cur = (uint64_t)sp[k] + carry; sp[k] = (uint32_t)cur; { uint64_t cur = (uint64_t)T[n] + carry; T[n] = (uint32_t)cur; T[n + 1] = (uint32_t)(cur >> 32); } // Step 2: Montgomery reduction — T += m_i * mod, then shift right by one word const uint64_t mi = (uint64_t)((uint32_t)(T[0] * n_prime)); carry = 0; // First word: T[0] + mi*mod[0] is zero mod 2^32, just capture carry { uint64_t cur = (uint64_t)T[0] + mi * (uint64_t)mp[0]; carry = cur >> 32; } for (size_t j = 1; j < n; ++j) { uint64_t cur = (uint64_t)T[j] + mi * (uint64_t)mp[j] + carry; T[j - 1] = (uint32_t)cur; carry = cur >> 32; } { uint64_t cur = (uint64_t)T[n] + carry; T[n - 1] = (uint32_t)cur; carry = cur >> 32; k++; } T[n] = T[n + 1] + (uint32_t)carry; T[n + 1] = 0; } // Shift right by N words (divide by R) — write into out // Result is T[0..n] (up to n + 1 words; T[n] is the top carry word) if (out.capacity < n + 1) out.reserve(n + 1); uint32_t* __restrict__ op = out.data.get(); std::memcpy(op, sp + n, (n + 1) * sizeof(uint32_t)); out.used = n + 1; std::memcpy(op, T, (n + 1) * sizeof(uint32_t)); out.used = (T[n] != 0) ? n + 1 : n; while (out.used > 1 && op[out.used - 1] == 0) out.used--; // Final step: if out >= mod, subtract mod in-place Loading