deb (e0ca5ae8) · Commits · jan.koester / libnetplus

debian/changelog

+9 −0

Original line number	Diff line number	Diff line
		libnetplus (20260425+1) unstable; urgency=medium

		* RSA: replace separate multiply + reduce with fused CIOS
		(Coarsely Integrated Operand Scanning) in montgomeryMultiply_into,
		eliminating a full schoolbook multiplication pass and improving
		cache locality

		-- Jan Koester <jan.koester@tuxist.de> Sat, 25 Apr 2026 12:00:00 +0200

		libnetplus (20260424+22) unstable; urgency=medium

		* Fix remaining compiler warnings: remove unused derOctetString, enlarge

src/crypto/rsa.cpp

+44 −27

Original line number	Diff line number	Diff line
		@@ -1219,50 +1219,67 @@ namespace netplus {
		return final_res;
		}

		// In-place Montgomery multiply: writes result into |out|, uses |scratch| as temp.
		// Both scratch and out must be pre-allocated with sufficient capacity.
		// In-place Montgomery multiply using CIOS (Coarsely Integrated Operand Scanning).
		// Fuses multiplication and reduction into a single pass for better cache locality.
		// scratch is used as the n+2 word accumulator T.
		void rsa::montgomeryMultiply_into(const bigInt& a, const bigInt& b,
		const bigInt& mod, uint32_t n_prime,
		bigInt& scratch, bigInt& out) {
		const size_t n = mod.used;
		const size_t need = 2 * n + 2; // extra word for carry propagation
		const size_t tlen = n + 2;

		// Ensure scratch has enough room
		if (scratch.capacity < need) scratch.reserve(need);
		// Use scratch as T accumulator
		if (scratch.capacity < tlen) scratch.reserve(tlen);
		uint32_t* __restrict__ T = scratch.data.get();
		std::memset(T, 0, tlen * sizeof(uint32_t));

		// Perform schoolbook multiplication (scratch = a * b)
		multiply(a, b, scratch);
		// Ensure any limbs beyond the product are zero for carry propagation
		if (scratch.capacity < need) scratch.reserve(need);
		for (size_t i = scratch.used; i < need; ++i)
		scratch.data[i] = 0;

		// Montgomery Reduction — use raw pointers
		uint32_t* __restrict__ sp = scratch.data.get();
		const uint32_t* __restrict__ ap = a.data.get();
		const uint32_t* __restrict__ bp = b.data.get();
		const uint32_t* __restrict__ mp = mod.data.get();
		const size_t au = a.used;

		for (size_t i = 0; i < n; i++) {
		const uint64_t mi = (uint64_t)(sp[i] * n_prime);
		for (size_t i = 0; i < n; ++i) {
		// Step 1: T += a[i] * b
		const uint64_t ai = (i < au) ? (uint64_t)ap[i] : 0;
		uint64_t carry = 0;
		for (size_t j = 0; j < n; j++) {
		uint64_t cur = (uint64_t)sp[i + j] + mi * (uint64_t)mp[j] + carry;
		sp[i + j] = (uint32_t)cur;
		for (size_t j = 0; j < n; ++j) {
		uint64_t cur = (uint64_t)T[j] + ai * (uint64_t)bp[j] + carry;
		T[j] = (uint32_t)cur;
		carry = cur >> 32;
		}
		size_t k = i + n;
		while (carry > 0) {
		uint64_t cur = (uint64_t)sp[k] + carry;
		sp[k] = (uint32_t)cur;
		{
		uint64_t cur = (uint64_t)T[n] + carry;
		T[n] = (uint32_t)cur;
		T[n + 1] = (uint32_t)(cur >> 32);
		}

		// Step 2: Montgomery reduction — T += m_i * mod, then shift right by one word
		const uint64_t mi = (uint64_t)((uint32_t)(T[0] * n_prime));
		carry = 0;
		// First word: T[0] + mi*mod[0] is zero mod 2^32, just capture carry
		{
		uint64_t cur = (uint64_t)T[0] + mi * (uint64_t)mp[0];
		carry = cur >> 32;
		}
		for (size_t j = 1; j < n; ++j) {
		uint64_t cur = (uint64_t)T[j] + mi * (uint64_t)mp[j] + carry;
		T[j - 1] = (uint32_t)cur;
		carry = cur >> 32;
		}
		{
		uint64_t cur = (uint64_t)T[n] + carry;
		T[n - 1] = (uint32_t)cur;
		carry = cur >> 32;
		k++;
		}
		T[n] = T[n + 1] + (uint32_t)carry;
		T[n + 1] = 0;
		}

		// Shift right by N words (divide by R) — write into out
		// Result is T[0..n]: n words plus a possible carry word in T[n]
		if (out.capacity < n + 1) out.reserve(n + 1);
		uint32_t* __restrict__ op = out.data.get();
		std::memcpy(op, sp + n, (n + 1) * sizeof(uint32_t));
		out.used = n + 1;
		std::memcpy(op, T, (n + 1) * sizeof(uint32_t));
		out.used = (T[n] != 0) ? n + 1 : n;
		while (out.used > 1 && op[out.used - 1] == 0) out.used--;

		// Final step: if out >= mod, subtract mod in-place