Commit e5323477 authored by Heiko Carstens's avatar Heiko Carstens
Browse files

Merge branch 'decompressor-memory-tracking' into features



Vasily Gorbik says:

===================
Combine and generalize all methods for finding unused memory in
decompressor, while decreasing complexity, add memory holes support,
while improving error handling (especially in low-memory conditions)
and debug-ability.
===================

Signed-off-by: default avatarHeiko Carstens <hca@linux.ibm.com>
parents 7229ea86 557b1970
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -35,7 +35,7 @@ endif

CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char

obj-y	:= head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o vmem.o
obj-y	:= head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o
obj-y	+= string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
obj-y	+= version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o
obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE))	+= uv.o
+33 −6
Original line number Diff line number Diff line
@@ -8,6 +8,8 @@

#ifndef __ASSEMBLY__

#include <asm/physmem_info.h>

struct machine_info {
	unsigned char has_edat1 : 1;
	unsigned char has_edat2 : 1;
@@ -30,24 +32,44 @@ struct vmlinux_info {
	unsigned long init_mm_off;
	unsigned long swapper_pg_dir_off;
	unsigned long invalid_pg_dir_off;
#ifdef CONFIG_KASAN
	unsigned long kasan_early_shadow_page_off;
	unsigned long kasan_early_shadow_pte_off;
	unsigned long kasan_early_shadow_pmd_off;
	unsigned long kasan_early_shadow_pud_off;
	unsigned long kasan_early_shadow_p4d_off;
#endif
};

void startup_kernel(void);
unsigned long detect_memory(unsigned long *safe_addr);
void mem_detect_set_usable_limit(unsigned long limit);
unsigned long detect_max_physmem_end(void);
void detect_physmem_online_ranges(unsigned long max_physmem_end);
void physmem_set_usable_limit(unsigned long limit);
void physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size);
void physmem_free(enum reserved_range_type type);
/* for continuous/multiple allocations per type */
unsigned long physmem_alloc_top_down(enum reserved_range_type type, unsigned long size,
				     unsigned long align);
/* for single allocations, 1 per type */
unsigned long physmem_alloc_range(enum reserved_range_type type, unsigned long size,
				  unsigned long align, unsigned long min, unsigned long max,
				  bool die_on_oom);
bool ipl_report_certs_intersects(unsigned long addr, unsigned long size,
				 unsigned long *intersection_start);
bool is_ipl_block_dump(void);
void store_ipl_parmblock(void);
unsigned long read_ipl_report(unsigned long safe_addr);
int read_ipl_report(void);
void save_ipl_cert_comp_list(void);
void setup_boot_command_line(void);
void parse_boot_command_line(void);
void verify_facilities(void);
void print_missing_facilities(void);
void sclp_early_setup_buffer(void);
void print_pgm_check_info(void);
unsigned long get_random_base(unsigned long safe_addr);
unsigned long get_random_base(void);
void setup_vmem(unsigned long asce_limit);
unsigned long vmem_estimate_memory_needs(unsigned long online_mem_total);
void __printf(1, 2) decompressor_printk(const char *fmt, ...);
void print_stacktrace(unsigned long sp);
void error(char *m);

extern struct machine_info machine;
@@ -62,7 +84,7 @@ extern char __boot_data_start[], __boot_data_end[];
extern char __boot_data_preserved_start[], __boot_data_preserved_end[];
extern char _decompressor_syms_start[], _decompressor_syms_end[];
extern char _stack_start[], _stack_end[];
extern char _end[];
extern char _end[], _decompressor_end[];
extern unsigned char _compressed_start[];
extern unsigned char _compressed_end[];
extern struct vmlinux_info _vmlinux_info;
@@ -70,5 +92,10 @@ extern struct vmlinux_info _vmlinux_info;

#define __abs_lowcore_pa(x)	(((unsigned long)(x) - __abs_lowcore) % sizeof(struct lowcore))

static inline bool intersects(unsigned long addr0, unsigned long size0,
			      unsigned long addr1, unsigned long size1)
{
	return addr0 + size0 > addr1 && addr1 + size1 > addr0;
}
#endif /* __ASSEMBLY__ */
#endif /* BOOT_BOOT_H */
+50 −50
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@
#include <asm/sclp.h>
#include <asm/sections.h>
#include <asm/boot_data.h>
#include <asm/physmem_info.h>
#include <uapi/asm/ipl.h>
#include "boot.h"

@@ -16,20 +17,16 @@ unsigned long __bootdata_preserved(ipl_cert_list_size);
unsigned long __bootdata(early_ipl_comp_list_addr);
unsigned long __bootdata(early_ipl_comp_list_size);

static struct ipl_rb_certificates *certs;
static struct ipl_rb_components *comps;
static bool ipl_report_needs_saving;

#define for_each_rb_entry(entry, rb) \
	for (entry = rb->entries; \
	     (void *) entry + sizeof(*entry) <= (void *) rb + rb->len; \
	     entry++)

static inline bool intersects(unsigned long addr0, unsigned long size0,
			      unsigned long addr1, unsigned long size1)
{
	return addr0 + size0 > addr1 && addr1 + size1 > addr0;
}

static unsigned long find_bootdata_space(struct ipl_rb_components *comps,
					 struct ipl_rb_certificates *certs,
					 unsigned long safe_addr)
static unsigned long get_cert_comp_list_size(void)
{
	struct ipl_rb_certificate_entry *cert;
	struct ipl_rb_component_entry *comp;
@@ -44,36 +41,27 @@ static unsigned long find_bootdata_space(struct ipl_rb_components *comps,
	ipl_cert_list_size = 0;
	for_each_rb_entry(cert, certs)
		ipl_cert_list_size += sizeof(unsigned int) + cert->len;
	size = ipl_cert_list_size + early_ipl_comp_list_size;
	return ipl_cert_list_size + early_ipl_comp_list_size;
}

	/*
	 * Start from safe_addr to find a free memory area large
	 * enough for the IPL report boot data. This area is used
	 * for ipl_cert_list_addr/ipl_cert_list_size and
	 * early_ipl_comp_list_addr/early_ipl_comp_list_size. It must
	 * not overlap with any component or any certificate.
	 */
repeat:
	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && initrd_data.start && initrd_data.size &&
	    intersects(initrd_data.start, initrd_data.size, safe_addr, size))
		safe_addr = initrd_data.start + initrd_data.size;
	for_each_rb_entry(comp, comps)
		if (intersects(safe_addr, size, comp->addr, comp->len)) {
			safe_addr = comp->addr + comp->len;
			goto repeat;
bool ipl_report_certs_intersects(unsigned long addr, unsigned long size,
				 unsigned long *intersection_start)
{
	struct ipl_rb_certificate_entry *cert;

	if (!ipl_report_needs_saving)
		return false;

	for_each_rb_entry(cert, certs) {
		if (intersects(addr, size, cert->addr, cert->len)) {
			*intersection_start = cert->addr;
			return true;
		}
	for_each_rb_entry(cert, certs)
		if (intersects(safe_addr, size, cert->addr, cert->len)) {
			safe_addr = cert->addr + cert->len;
			goto repeat;
	}
	early_ipl_comp_list_addr = safe_addr;
	ipl_cert_list_addr = safe_addr + early_ipl_comp_list_size;

	return safe_addr + size;
	return false;
}

static void copy_components_bootdata(struct ipl_rb_components *comps)
static void copy_components_bootdata(void)
{
	struct ipl_rb_component_entry *comp, *ptr;

@@ -82,7 +70,7 @@ static void copy_components_bootdata(struct ipl_rb_components *comps)
		memcpy(ptr++, comp, sizeof(*ptr));
}

static void copy_certificates_bootdata(struct ipl_rb_certificates *certs)
static void copy_certificates_bootdata(void)
{
	struct ipl_rb_certificate_entry *cert;
	void *ptr;
@@ -96,10 +84,8 @@ static void copy_certificates_bootdata(struct ipl_rb_certificates *certs)
	}
}

unsigned long read_ipl_report(unsigned long safe_addr)
int read_ipl_report(void)
{
	struct ipl_rb_certificates *certs;
	struct ipl_rb_components *comps;
	struct ipl_pl_hdr *pl_hdr;
	struct ipl_rl_hdr *rl_hdr;
	struct ipl_rb_hdr *rb_hdr;
@@ -112,7 +98,7 @@ unsigned long read_ipl_report(unsigned long safe_addr)
	 */
	if (!ipl_block_valid ||
	    !(ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR))
		return safe_addr;
		return -1;
	ipl_secure_flag = !!(ipl_block.hdr.flags & IPL_PL_FLAG_SIPL);
	/*
	 * There is an IPL report, to find it load the pointer to the
@@ -150,16 +136,30 @@ unsigned long read_ipl_report(unsigned long safe_addr)
	 * With either the component list or the certificate list
	 * missing the kernel will stay ignorant of secure IPL.
	 */
	if (!comps || !certs)
		return safe_addr;
	if (!comps || !certs) {
		certs = NULL;
		return -1;
	}

	/*
	 * Copy component and certificate list to a safe area
	 * where the decompressed kernel can find them.
	 */
	safe_addr = find_bootdata_space(comps, certs, safe_addr);
	copy_components_bootdata(comps);
	copy_certificates_bootdata(certs);
	ipl_report_needs_saving = true;
	physmem_reserve(RR_IPLREPORT, (unsigned long)pl_hdr,
			(unsigned long)rl_end - (unsigned long)pl_hdr);
	return 0;
}

void save_ipl_cert_comp_list(void)
{
	unsigned long size;

	if (!ipl_report_needs_saving)
		return;

	size = get_cert_comp_list_size();
	early_ipl_comp_list_addr = physmem_alloc_top_down(RR_CERT_COMP_LIST, size, sizeof(int));
	ipl_cert_list_addr = early_ipl_comp_list_addr + early_ipl_comp_list_size;

	return safe_addr;
	copy_components_bootdata();
	copy_certificates_bootdata();
	physmem_free(RR_IPLREPORT);
	ipl_report_needs_saving = false;
}
+9 −106
Original line number Diff line number Diff line
@@ -3,7 +3,7 @@
 * Copyright IBM Corp. 2019
 */
#include <linux/pgtable.h>
#include <asm/mem_detect.h>
#include <asm/physmem_info.h>
#include <asm/cpacf.h>
#include <asm/timex.h>
#include <asm/sclp.h>
@@ -91,113 +91,16 @@ static int get_random(unsigned long limit, unsigned long *value)
	return 0;
}

/*
 * To randomize kernel base address we have to consider several facts:
 * 1. physical online memory might not be continuous and have holes. mem_detect
 *    info contains list of online memory ranges we should consider.
 * 2. we have several memory regions which are occupied and we should not
 *    overlap and destroy them. Currently safe_addr tells us the border below
 *    which all those occupied regions are. We are safe to use anything above
 *    safe_addr.
 * 3. the upper limit might apply as well, even if memory above that limit is
 *    online. Currently those limitations are:
 *    3.1. Limit set by "mem=" kernel command line option
 *    3.2. memory reserved at the end for kasan initialization.
 * 4. kernel base address must be aligned to THREAD_SIZE (kernel stack size).
 *    Which is required for CONFIG_CHECK_STACK. Currently THREAD_SIZE is 4 pages
 *    (16 pages when the kernel is built with kasan enabled)
 * Assumptions:
 * 1. kernel size (including .bss size) and upper memory limit are page aligned.
 * 2. mem_detect memory region start is THREAD_SIZE aligned / end is PAGE_SIZE
 *    aligned (in practice memory configurations granularity on z/VM and LPAR
 *    is 1mb).
 *
 * To guarantee uniform distribution of kernel base address among all suitable
 * addresses we generate random value just once. For that we need to build a
 * continuous range in which every value would be suitable. We can build this
 * range by simply counting all suitable addresses (let's call them positions)
 * which would be valid as kernel base address. To count positions we iterate
 * over online memory ranges. For each range which is big enough for the
 * kernel image we count all suitable addresses we can put the kernel image at
 * that is
 * (end - start - kernel_size) / THREAD_SIZE + 1
 * Two functions count_valid_kernel_positions and position_to_address help
 * to count positions in memory range given and then convert position back
 * to address.
 */
static unsigned long count_valid_kernel_positions(unsigned long kernel_size,
						  unsigned long _min,
						  unsigned long _max)
{
	unsigned long start, end, pos = 0;
	int i;

	for_each_mem_detect_usable_block(i, &start, &end) {
		if (_min >= end)
			continue;
		if (start >= _max)
			break;
		start = max(_min, start);
		end = min(_max, end);
		if (end - start < kernel_size)
			continue;
		pos += (end - start - kernel_size) / THREAD_SIZE + 1;
	}

	return pos;
}

static unsigned long position_to_address(unsigned long pos, unsigned long kernel_size,
				 unsigned long _min, unsigned long _max)
{
	unsigned long start, end;
	int i;

	for_each_mem_detect_usable_block(i, &start, &end) {
		if (_min >= end)
			continue;
		if (start >= _max)
			break;
		start = max(_min, start);
		end = min(_max, end);
		if (end - start < kernel_size)
			continue;
		if ((end - start - kernel_size) / THREAD_SIZE + 1 >= pos)
			return start + (pos - 1) * THREAD_SIZE;
		pos -= (end - start - kernel_size) / THREAD_SIZE + 1;
	}

	return 0;
}

unsigned long get_random_base(unsigned long safe_addr)
unsigned long get_random_base(void)
{
	unsigned long usable_total = get_mem_detect_usable_total();
	unsigned long memory_limit = get_mem_detect_end();
	unsigned long base_pos, max_pos, kernel_size;
	int i;

	/*
	 * Avoid putting kernel in the end of physical memory
	 * which vmem and kasan code will use for shadow memory and
	 * pgtable mapping allocations.
	 */
	memory_limit -= kasan_estimate_memory_needs(usable_total);
	memory_limit -= vmem_estimate_memory_needs(usable_total);

	safe_addr = ALIGN(safe_addr, THREAD_SIZE);
	kernel_size = vmlinux.image_size + vmlinux.bss_size;
	if (safe_addr + kernel_size > memory_limit)
		return 0;
	unsigned long vmlinux_size = vmlinux.image_size + vmlinux.bss_size;
	unsigned long minimal_pos = vmlinux.default_lma + vmlinux_size;
	unsigned long random;

	max_pos = count_valid_kernel_positions(kernel_size, safe_addr, memory_limit);
	if (!max_pos) {
		sclp_early_printk("KASLR disabled: not enough memory\n");
	/* [vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size : physmem_info.usable] */
	if (get_random(physmem_info.usable - minimal_pos, &random))
		return 0;
	}

	/* we need a value in the range [1, base_pos] inclusive */
	if (get_random(max_pos, &base_pos))
		return 0;
	return position_to_address(base_pos + 1, kernel_size, safe_addr, memory_limit);
	return physmem_alloc_range(RR_VMLINUX, vmlinux_size, THREAD_SIZE,
				   vmlinux.default_lma, minimal_pos + random, false);
}
+2 −3
Original line number Diff line number Diff line
@@ -123,11 +123,10 @@ void decompressor_printk(const char *fmt, ...)
	sclp_early_printk(buf);
}

static noinline void print_stacktrace(void)
void print_stacktrace(unsigned long sp)
{
	struct stack_info boot_stack = { STACK_TYPE_TASK, (unsigned long)_stack_start,
					 (unsigned long)_stack_end };
	unsigned long sp = S390_lowcore.gpregs_save_area[15];
	bool first = true;

	decompressor_printk("Call Trace:\n");
@@ -173,7 +172,7 @@ void print_pgm_check_info(void)
			    gpregs[8], gpregs[9], gpregs[10], gpregs[11]);
	decompressor_printk("      %016lx %016lx %016lx %016lx\n",
			    gpregs[12], gpregs[13], gpregs[14], gpregs[15]);
	print_stacktrace();
	print_stacktrace(S390_lowcore.gpregs_save_area[15]);
	decompressor_printk("Last Breaking-Event-Address:\n");
	decompressor_printk(" [<%016lx>] %pS\n", (unsigned long)S390_lowcore.pgm_last_break,
			    (void *)S390_lowcore.pgm_last_break);
Loading