Commit 5e049663 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 fixes from Ted Ts'o:
 "Regression and bug fixes:

   - Performance regression fix from 5.18 on a Rasberry Pi

   - Fix extent parsing bug which triggers a BUG_ON when a (corrupted)
     extent tree has has a non-root node when zero entries.

   - Fix a livelock where in the right (wrong) circumstances a large
     number of nfsd threads can try to write to a nearly full file
     system, and retry for hours(!)"

* tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: limit the number of retries after discarding preallocations blocks
  ext4: fix bug in extents parsing when eh_entries == 0 and eh_depth > 0
  ext4: use buckets for cr 1 block scan instead of rbtree
  ext4: use locality group preallocation for small closed files
  ext4: make directory inode spreading reflect flexbg size
  ext4: avoid unnecessary spreading of allocations among groups
  ext4: make mballoc try target group first even with mb_optimize_scan
parents 4207d595 80fa46d6
Loading
Loading
Loading
Loading
+5 −5
Original line number Diff line number Diff line
@@ -167,8 +167,6 @@ enum SHIFT_DIRECTION {
#define EXT4_MB_CR0_OPTIMIZED		0x8000
/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
#define EXT4_MB_CR1_OPTIMIZED		0x00010000
/* Perform linear traversal for one group */
#define EXT4_MB_SEARCH_NEXT_LINEAR	0x00020000
struct ext4_allocation_request {
	/* target inode for block we're allocating */
	struct inode *inode;
@@ -1600,8 +1598,8 @@ struct ext4_sb_info {
	struct list_head s_discard_list;
	struct work_struct s_discard_work;
	atomic_t s_retry_alloc_pending;
	struct rb_root s_mb_avg_fragment_size_root;
	rwlock_t s_mb_rb_lock;
	struct list_head *s_mb_avg_fragment_size;
	rwlock_t *s_mb_avg_fragment_size_locks;
	struct list_head *s_mb_largest_free_orders;
	rwlock_t *s_mb_largest_free_orders_locks;

@@ -3413,6 +3411,8 @@ struct ext4_group_info {
	ext4_grpblk_t	bb_first_free;	/* first free block */
	ext4_grpblk_t	bb_free;	/* total free blocks */
	ext4_grpblk_t	bb_fragments;	/* nr of freespace fragments */
	int		bb_avg_fragment_size_order;	/* order of average
							   fragment in BG */
	ext4_grpblk_t	bb_largest_free_order;/* order of largest frag in BG */
	ext4_group_t	bb_group;	/* Group number */
	struct          list_head bb_prealloc_list;
@@ -3420,7 +3420,7 @@ struct ext4_group_info {
	void            *bb_bitmap;
#endif
	struct rw_semaphore alloc_sem;
	struct rb_node	bb_avg_fragment_size_rb;
	struct list_head bb_avg_fragment_size_node;
	struct list_head bb_largest_free_order_node;
	ext4_grpblk_t	bb_counters[];	/* Nr of free power-of-two-block
					 * regions, index is order.
+4 −0
Original line number Diff line number Diff line
@@ -460,6 +460,10 @@ static int __ext4_ext_check(const char *function, unsigned int line,
		error_msg = "invalid eh_entries";
		goto corrupted;
	}
	if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
		error_msg = "eh_entries is 0 but eh_depth is > 0";
		goto corrupted;
	}
	if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
		error_msg = "invalid extent entries";
		goto corrupted;
+1 −1
Original line number Diff line number Diff line
@@ -510,7 +510,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
		goto fallback;
	}

	max_dirs = ndirs / ngroups + inodes_per_group / 16;
	max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16;
	min_inodes = avefreei - inodes_per_group*flex_size / 4;
	if (min_inodes < 1)
		min_inodes = 1;
+144 −174
Original line number Diff line number Diff line
@@ -140,13 +140,15 @@
 *    number of buddy bitmap orders possible) number of lists. Group-infos are
 *    placed in appropriate lists.
 *
 * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root)
 * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
 *
 *    Locking: sbi->s_mb_rb_lock (rwlock)
 *    Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks)
 *
 *    This is a red black tree consisting of group infos and the tree is sorted
 *    by average fragment sizes (which is calculated as ext4_group_info->bb_free
 *    / ext4_group_info->bb_fragments).
 *    This is an array of lists where in the i-th list there are groups with
 *    average fragment size >= 2^i and < 2^(i+1). The average fragment size
 *    is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
 *    Note that we don't bother with a special list for completely empty groups
 *    so we only have MB_NUM_ORDERS(sb) lists.
 *
 * When "mb_optimize_scan" mount option is set, mballoc consults the above data
 * structures to decide the order in which groups are to be traversed for
@@ -160,7 +162,8 @@
 *
 * At CR = 1, we only consider groups where average fragment size > request
 * size. So, we lookup a group which has average fragment size just above or
 * equal to request size using our rb tree (data structure 2) in O(log N) time.
 * equal to request size using our average fragment size group lists (data
 * structure 2) in O(1) time.
 *
 * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
 * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
@@ -802,65 +805,51 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
	}
}

static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new,
			int (*cmp)(struct rb_node *, struct rb_node *))
static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
{
	struct rb_node **iter = &root->rb_node, *parent = NULL;

	while (*iter) {
		parent = *iter;
		if (cmp(new, *iter) > 0)
			iter = &((*iter)->rb_left);
		else
			iter = &((*iter)->rb_right);
	}

	rb_link_node(new, parent, iter);
	rb_insert_color(new, root);
}

static int
ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2)
{
	struct ext4_group_info *grp1 = rb_entry(rb1,
						struct ext4_group_info,
						bb_avg_fragment_size_rb);
	struct ext4_group_info *grp2 = rb_entry(rb2,
						struct ext4_group_info,
						bb_avg_fragment_size_rb);
	int num_frags_1, num_frags_2;

	num_frags_1 = grp1->bb_fragments ?
		grp1->bb_free / grp1->bb_fragments : 0;
	num_frags_2 = grp2->bb_fragments ?
		grp2->bb_free / grp2->bb_fragments : 0;

	return (num_frags_2 - num_frags_1);
}
	int order;

	/*
 * Reinsert grpinfo into the avg_fragment_size tree with new average
 * fragment size.
	 * We don't bother with a special lists groups with only 1 block free
	 * extents and for completely empty groups.
	 */
	order = fls(len) - 2;
	if (order < 0)
		return 0;
	if (order == MB_NUM_ORDERS(sb))
		order--;
	return order;
}

/* Move group to appropriate avg_fragment_size list */
static void
mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int new_order;

	if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
		return;

	write_lock(&sbi->s_mb_rb_lock);
	if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) {
		rb_erase(&grp->bb_avg_fragment_size_rb,
				&sbi->s_mb_avg_fragment_size_root);
		RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb);
	}
	new_order = mb_avg_fragment_size_order(sb,
					grp->bb_free / grp->bb_fragments);
	if (new_order == grp->bb_avg_fragment_size_order)
		return;

	ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root,
		&grp->bb_avg_fragment_size_rb,
		ext4_mb_avg_fragment_size_cmp);
	write_unlock(&sbi->s_mb_rb_lock);
	if (grp->bb_avg_fragment_size_order != -1) {
		write_lock(&sbi->s_mb_avg_fragment_size_locks[
					grp->bb_avg_fragment_size_order]);
		list_del(&grp->bb_avg_fragment_size_node);
		write_unlock(&sbi->s_mb_avg_fragment_size_locks[
					grp->bb_avg_fragment_size_order]);
	}
	grp->bb_avg_fragment_size_order = new_order;
	write_lock(&sbi->s_mb_avg_fragment_size_locks[
					grp->bb_avg_fragment_size_order]);
	list_add_tail(&grp->bb_avg_fragment_size_node,
		&sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
	write_unlock(&sbi->s_mb_avg_fragment_size_locks[
					grp->bb_avg_fragment_size_order]);
}

/*
@@ -909,86 +898,56 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
		*new_cr = 1;
	} else {
		*group = grp->bb_group;
		ac->ac_last_optimal_group = *group;
		ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
	}
}

/*
 * Choose next group by traversing average fragment size tree. Updates *new_cr
 * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that
 * the linear search should continue for one iteration since there's lock
 * contention on the rb tree lock.
 * Choose next group by traversing average fragment size list of suitable
 * order. Updates *new_cr if cr level needs an update.
 */
static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
		int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	int avg_fragment_size, best_so_far;
	struct rb_node *node, *found;
	struct ext4_group_info *grp;

	/*
	 * If there is contention on the lock, instead of waiting for the lock
	 * to become available, just continue searching lineraly. We'll resume
	 * our rb tree search later starting at ac->ac_last_optimal_group.
	 */
	if (!read_trylock(&sbi->s_mb_rb_lock)) {
		ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR;
		return;
	}
	struct ext4_group_info *grp, *iter;
	int i;

	if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
		if (sbi->s_mb_stats)
			atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
		/* We have found something at CR 1 in the past */
		grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group);
		for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL;
		     found = rb_next(found)) {
			grp = rb_entry(found, struct ext4_group_info,
				       bb_avg_fragment_size_rb);
	}

	for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
	     i < MB_NUM_ORDERS(ac->ac_sb); i++) {
		if (list_empty(&sbi->s_mb_avg_fragment_size[i]))
			continue;
		read_lock(&sbi->s_mb_avg_fragment_size_locks[i]);
		if (list_empty(&sbi->s_mb_avg_fragment_size[i])) {
			read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
			continue;
		}
		grp = NULL;
		list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i],
				    bb_avg_fragment_size_node) {
			if (sbi->s_mb_stats)
				atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
			if (likely(ext4_mb_good_group(ac, grp->bb_group, 1)))
			if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) {
				grp = iter;
				break;
			}
		goto done;
	}

	node = sbi->s_mb_avg_fragment_size_root.rb_node;
	best_so_far = 0;
	found = NULL;

	while (node) {
		grp = rb_entry(node, struct ext4_group_info,
			       bb_avg_fragment_size_rb);
		avg_fragment_size = 0;
		if (ext4_mb_good_group(ac, grp->bb_group, 1)) {
			avg_fragment_size = grp->bb_fragments ?
				grp->bb_free / grp->bb_fragments : 0;
			if (!best_so_far || avg_fragment_size < best_so_far) {
				best_so_far = avg_fragment_size;
				found = node;
			}
		}
		if (avg_fragment_size > ac->ac_g_ex.fe_len)
			node = node->rb_right;
		else
			node = node->rb_left;
		read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
		if (grp)
			break;
	}

done:
	if (found) {
		grp = rb_entry(found, struct ext4_group_info,
			       bb_avg_fragment_size_rb);
	if (grp) {
		*group = grp->bb_group;
		ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
	} else {
		*new_cr = 2;
	}

	read_unlock(&sbi->s_mb_rb_lock);
	ac->ac_last_optimal_group = *group;
}

static inline int should_optimize_scan(struct ext4_allocation_context *ac)
@@ -1017,11 +976,6 @@ next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
		goto inc_and_return;
	}

	if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) {
		ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR;
		goto inc_and_return;
	}

	return group;
inc_and_return:
	/*
@@ -1049,8 +1003,10 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
{
	*new_cr = ac->ac_criteria;

	if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining)
	if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) {
		*group = next_linear_group(ac, *group, ngroups);
		return;
	}

	if (*new_cr == 0) {
		ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
@@ -1075,23 +1031,25 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int i;

	if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) {
	for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
		if (grp->bb_counters[i] > 0)
			break;
	/* No need to move between order lists? */
	if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
	    i == grp->bb_largest_free_order) {
		grp->bb_largest_free_order = i;
		return;
	}

	if (grp->bb_largest_free_order >= 0) {
		write_lock(&sbi->s_mb_largest_free_orders_locks[
					      grp->bb_largest_free_order]);
		list_del_init(&grp->bb_largest_free_order_node);
		write_unlock(&sbi->s_mb_largest_free_orders_locks[
					      grp->bb_largest_free_order]);
	}
	grp->bb_largest_free_order = -1; /* uninit */

	for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) {
		if (grp->bb_counters[i] > 0) {
	grp->bb_largest_free_order = i;
			break;
		}
	}
	if (test_opt2(sb, MB_OPTIMIZE_SCAN) &&
	    grp->bb_largest_free_order >= 0 && grp->bb_free) {
	if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
		write_lock(&sbi->s_mb_largest_free_orders_locks[
					      grp->bb_largest_free_order]);
		list_add_tail(&grp->bb_largest_free_order_node,
@@ -1148,13 +1106,13 @@ void ext4_mb_generate_buddy(struct super_block *sb,
					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
	}
	mb_set_largest_free_order(sb, grp);
	mb_update_avg_fragment_size(sb, grp);

	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));

	period = get_cycles() - period;
	atomic_inc(&sbi->s_mb_buddies_generated);
	atomic64_add(period, &sbi->s_mb_generation_time);
	mb_update_avg_fragment_size(sb, grp);
}

/* The buddy information is attached the buddy cache inode
@@ -2636,7 +2594,7 @@ static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
	ext4_group_t prefetch_grp = 0, ngroups, group, i;
	int cr = -1;
	int cr = -1, new_cr;
	int err = 0, first_err = 0;
	unsigned int nr = 0, prefetch_ios = 0;
	struct ext4_sb_info *sbi;
@@ -2707,17 +2665,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
		 * from the goal value specified
		 */
		group = ac->ac_g_ex.fe_group;
		ac->ac_last_optimal_group = group;
		ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
		prefetch_grp = group;

		for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups),
			     i++) {
			int ret = 0, new_cr;
		for (i = 0, new_cr = cr; i < ngroups; i++,
		     ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
			int ret = 0;

			cond_resched();

			ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups);
			if (new_cr != cr) {
				cr = new_cr;
				goto repeat;
@@ -2991,9 +2946,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock)
	struct super_block *sb = pde_data(file_inode(seq->file));
	unsigned long position;

	read_lock(&EXT4_SB(sb)->s_mb_rb_lock);

	if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
	if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
		return NULL;
	position = *pos + 1;
	return (void *) ((unsigned long) position);
@@ -3005,7 +2958,7 @@ static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, lof
	unsigned long position;

	++*pos;
	if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
	if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb))
		return NULL;
	position = *pos + 1;
	return (void *) ((unsigned long) position);
@@ -3017,29 +2970,22 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	unsigned long position = ((unsigned long) v);
	struct ext4_group_info *grp;
	struct rb_node *n;
	unsigned int count, min, max;
	unsigned int count;

	position--;
	if (position >= MB_NUM_ORDERS(sb)) {
		seq_puts(seq, "fragment_size_tree:\n");
		n = rb_first(&sbi->s_mb_avg_fragment_size_root);
		if (!n) {
			seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n");
			return 0;
		}
		grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
		min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
		count = 1;
		while (rb_next(n)) {
			count++;
			n = rb_next(n);
		}
		grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
		max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
		position -= MB_NUM_ORDERS(sb);
		if (position == 0)
			seq_puts(seq, "avg_fragment_size_lists:\n");

		seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n",
			   min, max, count);
		count = 0;
		read_lock(&sbi->s_mb_avg_fragment_size_locks[position]);
		list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position],
				    bb_avg_fragment_size_node)
			count++;
		read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]);
		seq_printf(seq, "\tlist_order_%u_groups: %u\n",
					(unsigned int)position, count);
		return 0;
	}

@@ -3049,9 +2995,11 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
		seq_puts(seq, "max_free_order_lists:\n");
	}
	count = 0;
	read_lock(&sbi->s_mb_largest_free_orders_locks[position]);
	list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
			    bb_largest_free_order_node)
		count++;
	read_unlock(&sbi->s_mb_largest_free_orders_locks[position]);
	seq_printf(seq, "\tlist_order_%u_groups: %u\n",
		   (unsigned int)position, count);

@@ -3059,11 +3007,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
}

static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
__releases(&EXT4_SB(sb)->s_mb_rb_lock)
{
	struct super_block *sb = pde_data(file_inode(seq->file));

	read_unlock(&EXT4_SB(sb)->s_mb_rb_lock);
}

const struct seq_operations ext4_mb_seq_structs_summary_ops = {
@@ -3176,8 +3120,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
	init_rwsem(&meta_group_info[i]->alloc_sem);
	meta_group_info[i]->bb_free_root = RB_ROOT;
	INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
	RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb);
	INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
	meta_group_info[i]->bb_avg_fragment_size_order = -1;  /* uninit */
	meta_group_info[i]->bb_group = group;

	mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
@@ -3426,7 +3371,24 @@ int ext4_mb_init(struct super_block *sb)
		i++;
	} while (i < MB_NUM_ORDERS(sb));

	sbi->s_mb_avg_fragment_size_root = RB_ROOT;
	sbi->s_mb_avg_fragment_size =
		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
			GFP_KERNEL);
	if (!sbi->s_mb_avg_fragment_size) {
		ret = -ENOMEM;
		goto out;
	}
	sbi->s_mb_avg_fragment_size_locks =
		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
			GFP_KERNEL);
	if (!sbi->s_mb_avg_fragment_size_locks) {
		ret = -ENOMEM;
		goto out;
	}
	for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
		INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
		rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
	}
	sbi->s_mb_largest_free_orders =
		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
			GFP_KERNEL);
@@ -3445,7 +3407,6 @@ int ext4_mb_init(struct super_block *sb)
		INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
		rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
	}
	rwlock_init(&sbi->s_mb_rb_lock);

	spin_lock_init(&sbi->s_md_lock);
	sbi->s_mb_free_pending = 0;
@@ -3516,6 +3477,8 @@ int ext4_mb_init(struct super_block *sb)
	free_percpu(sbi->s_locality_groups);
	sbi->s_locality_groups = NULL;
out:
	kfree(sbi->s_mb_avg_fragment_size);
	kfree(sbi->s_mb_avg_fragment_size_locks);
	kfree(sbi->s_mb_largest_free_orders);
	kfree(sbi->s_mb_largest_free_orders_locks);
	kfree(sbi->s_mb_offsets);
@@ -3582,6 +3545,8 @@ int ext4_mb_release(struct super_block *sb)
		kvfree(group_info);
		rcu_read_unlock();
	}
	kfree(sbi->s_mb_avg_fragment_size);
	kfree(sbi->s_mb_avg_fragment_size_locks);
	kfree(sbi->s_mb_largest_free_orders);
	kfree(sbi->s_mb_largest_free_orders_locks);
	kfree(sbi->s_mb_offsets);
@@ -5193,6 +5158,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	int bsbits = ac->ac_sb->s_blocksize_bits;
	loff_t size, isize;
	bool inode_pa_eligible, group_pa_eligible;

	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
		return;
@@ -5200,25 +5166,27 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
		return;

	group_pa_eligible = sbi->s_mb_group_prealloc > 0;
	inode_pa_eligible = true;
	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
		>> bsbits;

	/* No point in using inode preallocation for closed files */
	if ((size == isize) && !ext4_fs_is_busy(sbi) &&
	    !inode_is_open_for_write(ac->ac_inode)) {
		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
		return;
	}
	    !inode_is_open_for_write(ac->ac_inode))
		inode_pa_eligible = false;

	if (sbi->s_mb_group_prealloc <= 0) {
		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
		return;
	}

	/* don't use group allocation for large files */
	size = max(size, isize);
	if (size > sbi->s_mb_stream_request) {
	/* Don't use group allocation for large files */
	if (size > sbi->s_mb_stream_request)
		group_pa_eligible = false;

	if (!group_pa_eligible) {
		if (inode_pa_eligible)
			ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
		else
			ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
		return;
	}

@@ -5565,6 +5533,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
	ext4_fsblk_t block = 0;
	unsigned int inquota = 0;
	unsigned int reserv_clstrs = 0;
	int retries = 0;
	u64 seq;

	might_sleep();
@@ -5667,7 +5636,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
			ar->len = ac->ac_b_ex.fe_len;
		}
	} else {
		if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
		if (++retries < 3 &&
		    ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
			goto repeat;
		/*
		 * If block allocation fails then the pa allocated above
+0 −1
Original line number Diff line number Diff line
@@ -178,7 +178,6 @@ struct ext4_allocation_context {
	/* copy of the best found extent taken before preallocation efforts */
	struct ext4_free_extent ac_f_ex;

	ext4_group_t ac_last_optimal_group;
	__u32 ac_groups_considered;
	__u32 ac_flags;		/* allocation hints */
	__u16 ac_groups_scanned;