Skip to content

Commit db4ae18

Browse files
boryasgregkh
authored andcommitted
btrfs: fix racy bitfield write in btrfs_clear_space_info_full()
[ Upstream commit 38e8187 ] >From the memory-barriers.txt document regarding memory barrier ordering guarantees: (*) These guarantees do not apply to bitfields, because compilers often generate code to modify these using non-atomic read-modify-write sequences. Do not attempt to use bitfields to synchronize parallel algorithms. (*) Even in cases where bitfields are protected by locks, all fields in a given bitfield must be protected by one lock. If two fields in a given bitfield are protected by different locks, the compiler's non-atomic read-modify-write sequences can cause an update to one field to corrupt the value of an adjacent field. btrfs_space_info has a bitfield sharing an underlying word consisting of the fields full, chunk_alloc, and flush: struct btrfs_space_info { struct btrfs_fs_info * fs_info; /* 0 8 */ struct btrfs_space_info * parent; /* 8 8 */ ... int clamp; /* 172 4 */ unsigned int full:1; /* 176: 0 4 */ unsigned int chunk_alloc:1; /* 176: 1 4 */ unsigned int flush:1; /* 176: 2 4 */ ... Therefore, to be safe from parallel read-modify-writes losing a write to one of the bitfield members protected by a lock, all writes to all the bitfields must use the lock. They almost universally do, except for btrfs_clear_space_info_full() which iterates over the space_infos and writes out found->full = 0 without a lock. Imagine that we have one thread completing a transaction in which we finished deleting a block_group and are thus calling btrfs_clear_space_info_full() while simultaneously the data reclaim ticket infrastructure is running do_async_reclaim_data_space(): T1 T2 btrfs_commit_transaction btrfs_clear_space_info_full data_sinfo->full = 0 READ: full:0, chunk_alloc:0, flush:1 do_async_reclaim_data_space(data_sinfo) spin_lock(&space_info->lock); if(list_empty(tickets)) space_info->flush = 0; READ: full: 0, chunk_alloc:0, flush:1 MOD/WRITE: full: 0, chunk_alloc:0, flush:0 spin_unlock(&space_info->lock); return; MOD/WRITE: full:0, chunk_alloc:0, flush:1 and now data_sinfo->flush is 1 but the reclaim worker has exited. This breaks the invariant that flush is 0 iff there is no work queued or running. Once this invariant is violated, future allocations that go into __reserve_bytes() will add tickets to space_info->tickets but will see space_info->flush is set to 1 and not queue the work. After this, they will block forever on the resulting ticket, as it is now impossible to kick the worker again. I also confirmed by looking at the assembly of the affected kernel that it is doing RMW operations. For example, to set the flush (3rd) bit to 0, the assembly is: andb $0xfb,0x60(%rbx) and similarly for setting the full (1st) bit to 0: andb $0xfe,-0x20(%rax) So I think this is really a bug on practical systems. I have observed a number of systems in this exact state, but am currently unable to reproduce it. Rather than leaving this footgun lying around for the future, take advantage of the fact that there is room in the struct anyway, and that it is already quite large and simply change the three bitfield members to bools. This avoids writes to space_info->full having any effect on writes to space_info->flush, regardless of locking. Fixes: 957780e ("Btrfs: introduce ticketed enospc infrastructure") Reviewed-by: Qu Wenruo <wqu@suse.com> Signed-off-by: Boris Burkov <boris@bur.io> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> [ The context change is due to the commit cc0517f ("btrfs: tweak extent/chunk allocation for space_info sub-space") in v6.16 which is irrelevant to the logic of this patch. ] Signed-off-by: Rahul Sharma <black.hawk@163.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent 0328bb0 commit db4ae18

3 files changed

Lines changed: 17 additions & 17 deletions

File tree

fs/btrfs/block-group.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4195,7 +4195,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
41954195
mutex_unlock(&fs_info->chunk_mutex);
41964196
} else {
41974197
/* Proceed with allocation */
4198-
space_info->chunk_alloc = 1;
4198+
space_info->chunk_alloc = true;
41994199
wait_for_alloc = false;
42004200
spin_unlock(&space_info->lock);
42014201
}
@@ -4244,7 +4244,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
42444244
spin_lock(&space_info->lock);
42454245
if (ret < 0) {
42464246
if (ret == -ENOSPC)
4247-
space_info->full = 1;
4247+
space_info->full = true;
42484248
else
42494249
goto out;
42504250
} else {
@@ -4254,7 +4254,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
42544254

42554255
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
42564256
out:
4257-
space_info->chunk_alloc = 0;
4257+
space_info->chunk_alloc = false;
42584258
spin_unlock(&space_info->lock);
42594259
mutex_unlock(&fs_info->chunk_mutex);
42604260

fs/btrfs/space-info.c

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
183183
struct btrfs_space_info *found;
184184

185185
list_for_each_entry(found, head, list)
186-
found->full = 0;
186+
found->full = false;
187187
}
188188

189189
/*
@@ -364,7 +364,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
364364
found->bytes_readonly += block_group->bytes_super;
365365
btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable);
366366
if (block_group->length > 0)
367-
found->full = 0;
367+
found->full = false;
368368
btrfs_try_granting_tickets(info, found);
369369
spin_unlock(&found->lock);
370370

@@ -1140,7 +1140,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
11401140
spin_lock(&space_info->lock);
11411141
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
11421142
if (!to_reclaim) {
1143-
space_info->flush = 0;
1143+
space_info->flush = false;
11441144
spin_unlock(&space_info->lock);
11451145
return;
11461146
}
@@ -1152,7 +1152,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
11521152
flush_space(fs_info, space_info, to_reclaim, flush_state, false);
11531153
spin_lock(&space_info->lock);
11541154
if (list_empty(&space_info->tickets)) {
1155-
space_info->flush = 0;
1155+
space_info->flush = false;
11561156
spin_unlock(&space_info->lock);
11571157
return;
11581158
}
@@ -1195,7 +1195,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
11951195
flush_state = FLUSH_DELAYED_ITEMS_NR;
11961196
commit_cycles--;
11971197
} else {
1198-
space_info->flush = 0;
1198+
space_info->flush = false;
11991199
}
12001200
} else {
12011201
flush_state = FLUSH_DELAYED_ITEMS_NR;
@@ -1357,7 +1357,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
13571357

13581358
spin_lock(&space_info->lock);
13591359
if (list_empty(&space_info->tickets)) {
1360-
space_info->flush = 0;
1360+
space_info->flush = false;
13611361
spin_unlock(&space_info->lock);
13621362
return;
13631363
}
@@ -1368,7 +1368,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
13681368
flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
13691369
spin_lock(&space_info->lock);
13701370
if (list_empty(&space_info->tickets)) {
1371-
space_info->flush = 0;
1371+
space_info->flush = false;
13721372
spin_unlock(&space_info->lock);
13731373
return;
13741374
}
@@ -1385,7 +1385,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
13851385
data_flush_states[flush_state], false);
13861386
spin_lock(&space_info->lock);
13871387
if (list_empty(&space_info->tickets)) {
1388-
space_info->flush = 0;
1388+
space_info->flush = false;
13891389
spin_unlock(&space_info->lock);
13901390
return;
13911391
}
@@ -1402,7 +1402,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
14021402
if (maybe_fail_all_tickets(fs_info, space_info))
14031403
flush_state = 0;
14041404
else
1405-
space_info->flush = 0;
1405+
space_info->flush = false;
14061406
} else {
14071407
flush_state = 0;
14081408
}
@@ -1418,7 +1418,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
14181418

14191419
aborted_fs:
14201420
maybe_fail_all_tickets(fs_info, space_info);
1421-
space_info->flush = 0;
1421+
space_info->flush = false;
14221422
spin_unlock(&space_info->lock);
14231423
}
14241424

@@ -1787,7 +1787,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
17871787
*/
17881788
maybe_clamp_preempt(fs_info, space_info);
17891789

1790-
space_info->flush = 1;
1790+
space_info->flush = true;
17911791
trace_btrfs_trigger_flush(fs_info,
17921792
space_info->flags,
17931793
orig_bytes, flush,

fs/btrfs/space-info.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,11 +136,11 @@ struct btrfs_space_info {
136136
flushing. The value is >> clamp, so turns
137137
out to be a 2^clamp divisor. */
138138

139-
unsigned int full:1; /* indicates that we cannot allocate any more
139+
bool full; /* indicates that we cannot allocate any more
140140
chunks for this space */
141-
unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
141+
bool chunk_alloc; /* set if we are allocating a chunk */
142142

143-
unsigned int flush:1; /* set if we are trying to make space */
143+
bool flush; /* set if we are trying to make space */
144144

145145
unsigned int force_alloc; /* set if we need to force a chunk
146146
alloc for this space */

0 commit comments

Comments
 (0)