diff --git a/kernel/crates/another_ext4/src/ext4/alloc.rs b/kernel/crates/another_ext4/src/ext4/alloc.rs index a78cb8624c..038ae7a10e 100644 --- a/kernel/crates/another_ext4/src/ext4/alloc.rs +++ b/kernel/crates/another_ext4/src/ext4/alloc.rs @@ -6,6 +6,17 @@ use crate::prelude::*; use crate::return_error; impl Ext4 { + fn restore_block_allocation_state( + &self, + bitmap_block: &Block, + bg: &BlockGroupRef, + sb: &SuperBlock, + ) -> Result<()> { + self.write_block(bitmap_block)?; + self.write_block_group_with_csum(&mut BlockGroupRef::new(bg.id, bg.desc))?; + self.write_super_block(sb) + } + fn block_group_first_block(sb: &SuperBlock, bgid: BlockGroupId) -> PBlockId { bgid as PBlockId * sb.blocks_per_group() as PBlockId } @@ -182,6 +193,9 @@ impl Ext4 { // extent physical block numbers are absolute filesystem block numbers. let bitmap_block_id = bg.desc.block_bitmap_block(); let mut bitmap_block = self.read_block(bitmap_block_id)?; + let old_bitmap_block = bitmap_block.clone(); + let old_bg = BlockGroupRef::new(bg.id, bg.desc); + let old_sb = sb; let mut bitmap = Bitmap::new(&mut *bitmap_block.data, blocks_in_group); let bit = match bitmap.find_and_set_first_clear_bit(0, blocks_in_group) { @@ -197,11 +211,29 @@ impl Ext4 { // Update block group counters bg.desc .set_free_blocks_count(bg.desc.get_free_blocks_count() - 1); - self.write_block_group_with_csum(&mut bg)?; + if let Err(err) = self.write_block_group_with_csum(&mut bg) { + return match self.restore_block_allocation_state( + &old_bitmap_block, + &old_bg, + &old_sb, + ) { + Ok(()) => Err(err), + Err(rollback_err) => Err(rollback_err), + }; + } // Update superblock counters sb.set_free_blocks_count(sb.free_blocks_count() - 1); - self.write_super_block(&sb)?; + if let Err(err) = self.write_super_block(&sb) { + return match self.restore_block_allocation_state( + &old_bitmap_block, + &old_bg, + &old_sb, + ) { + Ok(()) => Err(err), + Err(rollback_err) => Err(rollback_err), + }; + } trace!("Alloc block {} ok", 
fblock); return Ok(fblock); @@ -231,6 +263,9 @@ impl Ext4 { // Load block bitmap let bitmap_block_id = bg.desc.block_bitmap_block(); let mut bitmap_block = self.read_block(bitmap_block_id)?; + let old_bitmap_block = bitmap_block.clone(); + let old_bg = BlockGroupRef::new(bg.id, bg.desc); + let old_sb = sb; let mut bitmap = Bitmap::new(&mut *bitmap_block.data, blocks_in_group); // Free the block @@ -245,11 +280,21 @@ impl Ext4 { // Update block group counters bg.desc .set_free_blocks_count(bg.desc.get_free_blocks_count() + 1); - self.write_block_group_with_csum(&mut bg)?; + if let Err(err) = self.write_block_group_with_csum(&mut bg) { + return match self.restore_block_allocation_state(&old_bitmap_block, &old_bg, &old_sb) { + Ok(()) => Err(err), + Err(rollback_err) => Err(rollback_err), + }; + } // Update superblock counters sb.set_free_blocks_count(sb.free_blocks_count() + 1); - self.write_super_block(&sb)?; + if let Err(err) = self.write_super_block(&sb) { + return match self.restore_block_allocation_state(&old_bitmap_block, &old_bg, &old_sb) { + Ok(()) => Err(err), + Err(rollback_err) => Err(rollback_err), + }; + } trace!("Free block {} ok", pblock); Ok(()) diff --git a/kernel/crates/another_ext4/src/ext4/low_level.rs b/kernel/crates/another_ext4/src/ext4/low_level.rs index 83ceb4b164..57a45ed2e5 100644 --- a/kernel/crates/another_ext4/src/ext4/low_level.rs +++ b/kernel/crates/another_ext4/src/ext4/low_level.rs @@ -1146,8 +1146,7 @@ impl Ext4 { } } - /// Set extended attribute of a file. This function will not check name conflict, - /// call `getxattr` to check beforehand. + /// Set extended attribute of a file. /// /// # Params /// @@ -1159,20 +1158,73 @@ impl Ext4 { /// /// `ENOSPC` - xattr block does not have enough space pub fn setxattr(&self, inode: InodeId, name: &str, value: &[u8]) -> Result<()> { + self.setxattr_with_flags(inode, name, value, false, false) + } + + /// Set extended attribute of a file with Linux create/replace semantics. 
+ /// + /// Existing xattr blocks are modified on a cloned candidate block first and + /// written back only after the whole operation succeeds. This preserves the + /// old value when replacing with a value that does not fit. + pub fn setxattr_with_flags( + &self, + inode: InodeId, + name: &str, + value: &[u8], + create: bool, + replace: bool, + ) -> Result<()> { + let _mutation_guard = + self.inode_mutation_locks[self.inode_mutation_lock_index(inode)].lock(); let mut inode_ref = self.read_inode(inode)?; let xattr_block_id = inode_ref.inode.xattr_block(); if xattr_block_id == 0 { + if replace { + return_error!(ErrCode::ENODATA, "Xattr {} does not exist", name); + } // lazy allocate xattr block let pblock = self.alloc_block(&mut inode_ref)?; - inode_ref.inode.set_xattr_block(pblock); - self.write_inode_with_csum(&mut inode_ref)?; + let old_xattr_block = xattr_block_id; + let result = (|| { + let mut xattr_block = XattrBlock::new(self.read_block(pblock)?); + xattr_block.init(); + if !xattr_block.insert(name, value) { + return_error!( + ErrCode::ENOSPC, + "Xattr block of Inode {} does not have enough space", + inode + ); + } + self.write_block(&xattr_block.block())?; + inode_ref.inode.set_xattr_block(pblock); + self.write_inode_with_csum(&mut inode_ref)?; + Ok(()) + })(); + if let Err(err) = result { + inode_ref.inode.set_xattr_block(old_xattr_block); + return match self.dealloc_block(&mut inode_ref, pblock) { + Ok(()) => Err(err), + Err(rollback_err) => Err(rollback_err), + }; + } + return Ok(()); } - let mut xattr_block = XattrBlock::new(self.read_block(inode_ref.inode.xattr_block())?); - if xattr_block_id == 0 { - xattr_block.init(); + + let xattr_block = XattrBlock::new(self.read_block(xattr_block_id)?); + let exists = xattr_block.get(name).is_some(); + if exists && create { + return_error!(ErrCode::EEXIST, "Xattr {} already exists", name); } - if xattr_block.insert(name, value) { - self.write_block(&xattr_block.block())?; + if !exists && replace { + 
return_error!(ErrCode::ENODATA, "Xattr {} does not exist", name); + } + + let mut new_xattr_block = xattr_block; + if exists { + let _ = new_xattr_block.remove(name); + } + if new_xattr_block.insert(name, value) { + self.write_block(&new_xattr_block.block())?; Ok(()) } else { return_error!( @@ -1194,6 +1246,8 @@ impl Ext4 { /// /// `ENODATA` - the attribute does not exist pub fn removexattr(&self, inode: InodeId, name: &str) -> Result<()> { + let _mutation_guard = + self.inode_mutation_locks[self.inode_mutation_lock_index(inode)].lock(); let inode_ref = self.read_inode(inode)?; let xattr_block_id = inode_ref.inode.xattr_block(); if xattr_block_id == 0 { @@ -1231,6 +1285,7 @@ impl Ext4 { #[cfg(test)] mod tests { use super::*; + use crate::FileType; struct StubBlockDevice { sb_block: Block, @@ -1303,4 +1358,312 @@ mod tests { assert_eq!(err.code(), ErrCode::EIO); assert_eq!(buf, [0x5a; 16]); } + + const TEST_BLOCK_COUNT: usize = 16; + const TEST_BLOCK_BITMAP: PBlockId = 2; + const TEST_INODE_BITMAP: PBlockId = 3; + const TEST_INODE_TABLE: PBlockId = 4; + const TEST_XATTR_BLOCK: PBlockId = 5; + const TEST_INITIAL_FREE_BLOCKS: u64 = (TEST_BLOCK_COUNT as u64) - 5; + + struct FailingBlockDevice { + blocks: spin::Mutex>, + fail_reads: spin::Mutex>, + fail_writes: spin::Mutex>, + } + + impl FailingBlockDevice { + fn new() -> Self { + let mut blocks = BTreeMap::new(); + for block_id in 0..TEST_BLOCK_COUNT as PBlockId { + blocks.insert(block_id, Block::new(block_id, Box::new([0u8; BLOCK_SIZE]))); + } + + let mut sb_block = blocks.remove(&0).unwrap(); + Self::write_u32(&mut sb_block, BASE_OFFSET + 4, TEST_BLOCK_COUNT as u32); + Self::write_u32( + &mut sb_block, + BASE_OFFSET + 12, + TEST_INITIAL_FREE_BLOCKS as u32, + ); + Self::write_u32(&mut sb_block, BASE_OFFSET + 20, 0); + Self::write_u32(&mut sb_block, BASE_OFFSET + 24, 2); + Self::write_u32(&mut sb_block, BASE_OFFSET + 28, 2); + Self::write_u32(&mut sb_block, BASE_OFFSET + 32, TEST_BLOCK_COUNT as u32); + 
Self::write_u32(&mut sb_block, BASE_OFFSET + 40, 16); + Self::write_u16(&mut sb_block, BASE_OFFSET + 56, 0xef53); + Self::write_u32(&mut sb_block, BASE_OFFSET + 84, 1); + Self::write_u16(&mut sb_block, BASE_OFFSET + 88, SB_GOOD_INODE_SIZE as u16); + Self::write_u16(&mut sb_block, BASE_OFFSET + 254, SB_GOOD_DESC_SIZE as u16); + blocks.insert(0, sb_block); + + let mut bgdt = blocks.remove(&1).unwrap(); + Self::write_u32(&mut bgdt, 0, TEST_BLOCK_BITMAP as u32); + Self::write_u32(&mut bgdt, 4, TEST_INODE_BITMAP as u32); + Self::write_u32(&mut bgdt, 8, TEST_INODE_TABLE as u32); + Self::write_u16(&mut bgdt, 12, TEST_INITIAL_FREE_BLOCKS as u16); + blocks.insert(1, bgdt); + + let mut bitmap = blocks.remove(&TEST_BLOCK_BITMAP).unwrap(); + bitmap.data[0] = 0b0001_1111; + blocks.insert(TEST_BLOCK_BITMAP, bitmap); + + let mut inode_table = blocks.remove(&TEST_INODE_TABLE).unwrap(); + let mut inode = Inode::default(); + inode.set_mode(InodeMode::from_type_and_perm( + FileType::RegularFile, + InodeMode::from_bits_retain(0o644), + )); + inode.set_link_count(1); + inode_table.write_offset_as(SB_GOOD_INODE_SIZE, &inode); + blocks.insert(TEST_INODE_TABLE, inode_table); + + Self { + blocks: spin::Mutex::new(blocks), + fail_reads: spin::Mutex::new(Vec::new()), + fail_writes: spin::Mutex::new(Vec::new()), + } + } + + fn write_u16(block: &mut Block, offset: usize, value: u16) { + block.write_offset(offset, &value.to_le_bytes()); + } + + fn write_u32(block: &mut Block, offset: usize, value: u32) { + block.write_offset(offset, &value.to_le_bytes()); + } + + fn fail_once_on_read(&self, block_id: PBlockId) { + self.fail_reads.lock().push(block_id); + } + + fn fail_once_on_write(&self, block_id: PBlockId) { + self.fail_writes.lock().push(block_id); + } + + fn take_failure(list: &mut Vec, block_id: PBlockId) -> bool { + if let Some(pos) = list.iter().position(|&id| id == block_id) { + list.remove(pos); + true + } else { + false + } + } + + fn block_bitmap_bit_is_set(&self, bit: usize) -> bool 
{ + let blocks = self.blocks.lock(); + let block = blocks.get(&TEST_BLOCK_BITMAP).unwrap(); + (block.data[bit / 8] & (1 << (bit % 8))) != 0 + } + + fn bg_free_blocks(&self) -> u64 { + let blocks = self.blocks.lock(); + let block = blocks.get(&1).unwrap(); + u16::from_le_bytes(block.data[12..14].try_into().unwrap()) as u64 + } + + fn sb_free_blocks(&self) -> u64 { + let blocks = self.blocks.lock(); + let block = blocks.get(&0).unwrap(); + u32::from_le_bytes( + block.data[BASE_OFFSET + 12..BASE_OFFSET + 16] + .try_into() + .unwrap(), + ) as u64 + } + + fn disk_inode_xattr_block(&self) -> PBlockId { + let blocks = self.blocks.lock(); + let block = blocks.get(&TEST_INODE_TABLE).unwrap(); + let inode: Inode = block.read_offset_as(SB_GOOD_INODE_SIZE); + inode.xattr_block() + } + } + + impl BlockDevice for FailingBlockDevice { + fn read_block(&self, block_id: PBlockId) -> Result { + if Self::take_failure(&mut self.fail_reads.lock(), block_id) { + return Err(Ext4Error::new(ErrCode::EIO)); + } + self.blocks + .lock() + .get(&block_id) + .cloned() + .ok_or_else(|| Ext4Error::new(ErrCode::EIO)) + } + + fn write_block(&self, block: &Block) -> Result<()> { + if Self::take_failure(&mut self.fail_writes.lock(), block.id) { + return Err(Ext4Error::new(ErrCode::EIO)); + } + self.blocks.lock().insert(block.id, block.clone()); + Ok(()) + } + } + + fn load_failing_test_fs() -> (Arc, Ext4) { + let block_device = Arc::new(FailingBlockDevice::new()); + let fs = Ext4::load(block_device.clone()).unwrap(); + (block_device, fs) + } + + fn assert_xattr_alloc_rolled_back(fs: &Ext4, block_device: &FailingBlockDevice) { + assert!(!block_device.block_bitmap_bit_is_set(TEST_XATTR_BLOCK as usize)); + assert_eq!(block_device.bg_free_blocks(), TEST_INITIAL_FREE_BLOCKS); + assert_eq!(block_device.sb_free_blocks(), TEST_INITIAL_FREE_BLOCKS); + assert_eq!( + fs.read_block_group(0).unwrap().desc.get_free_blocks_count(), + TEST_INITIAL_FREE_BLOCKS + ); + assert_eq!( + 
fs.read_super_block_cached().free_blocks_count(), + TEST_INITIAL_FREE_BLOCKS + ); + assert_eq!(block_device.disk_inode_xattr_block(), 0); + } + + fn assert_allocation_state( + fs: &Ext4, + block_device: &FailingBlockDevice, + allocated: bool, + free_blocks: u64, + ) { + assert_eq!( + block_device.block_bitmap_bit_is_set(TEST_XATTR_BLOCK as usize), + allocated + ); + assert_eq!(block_device.bg_free_blocks(), free_blocks); + assert_eq!(block_device.sb_free_blocks(), free_blocks); + assert_eq!( + fs.read_block_group(0).unwrap().desc.get_free_blocks_count(), + free_blocks + ); + assert_eq!( + fs.read_super_block_cached().free_blocks_count(), + free_blocks + ); + } + + #[test] + fn setxattr_rolls_back_when_new_xattr_block_read_fails() { + let (block_device, fs) = load_failing_test_fs(); + block_device.fail_once_on_read(TEST_XATTR_BLOCK); + + let err = fs + .setxattr_with_flags(2, "user.rollback", b"value", false, false) + .unwrap_err(); + + assert_eq!(err.code(), ErrCode::EIO); + assert_xattr_alloc_rolled_back(&fs, &block_device); + } + + #[test] + fn setxattr_rolls_back_when_new_xattr_block_write_fails() { + let (block_device, fs) = load_failing_test_fs(); + block_device.fail_once_on_write(TEST_XATTR_BLOCK); + + let err = fs + .setxattr_with_flags(2, "user.rollback", b"value", false, false) + .unwrap_err(); + + assert_eq!(err.code(), ErrCode::EIO); + assert_xattr_alloc_rolled_back(&fs, &block_device); + } + + #[test] + fn setxattr_rolls_back_when_inode_write_fails() { + let (block_device, fs) = load_failing_test_fs(); + block_device.fail_once_on_write(TEST_INODE_TABLE); + + let err = fs + .setxattr_with_flags(2, "user.rollback", b"value", false, false) + .unwrap_err(); + + assert_eq!(err.code(), ErrCode::EIO); + assert_xattr_alloc_rolled_back(&fs, &block_device); + } + + #[test] + fn setxattr_rolls_back_when_new_xattr_does_not_fit() { + let (block_device, fs) = load_failing_test_fs(); + let value = vec![0x5au8; BLOCK_SIZE]; + + let err = fs + .setxattr_with_flags(2, 
"user.rollback", &value, false, false) + .unwrap_err(); + + assert_eq!(err.code(), ErrCode::ENOSPC); + assert_xattr_alloc_rolled_back(&fs, &block_device); + } + + #[test] + fn block_group_cache_updates_only_after_disk_write_succeeds() { + let (block_device, fs) = load_failing_test_fs(); + let mut bg = fs.read_block_group(0).unwrap(); + bg.desc.set_free_blocks_count(TEST_INITIAL_FREE_BLOCKS - 1); + block_device.fail_once_on_write(1); + + let err = fs.write_block_group_with_csum(&mut bg).unwrap_err(); + + assert_eq!(err.code(), ErrCode::EIO); + assert_eq!( + fs.read_block_group(0).unwrap().desc.get_free_blocks_count(), + TEST_INITIAL_FREE_BLOCKS + ); + assert_eq!(block_device.bg_free_blocks(), TEST_INITIAL_FREE_BLOCKS); + } + + #[test] + fn alloc_block_rolls_back_when_block_group_write_fails() { + let (block_device, fs) = load_failing_test_fs(); + let mut inode = fs.read_inode(2).unwrap(); + block_device.fail_once_on_write(1); + + let err = fs.alloc_block(&mut inode).unwrap_err(); + + assert_eq!(err.code(), ErrCode::EIO); + assert_allocation_state(&fs, &block_device, false, TEST_INITIAL_FREE_BLOCKS); + } + + #[test] + fn alloc_block_rolls_back_when_superblock_write_fails() { + let (block_device, fs) = load_failing_test_fs(); + let mut inode = fs.read_inode(2).unwrap(); + block_device.fail_once_on_write(0); + + let err = fs.alloc_block(&mut inode).unwrap_err(); + + assert_eq!(err.code(), ErrCode::EIO); + assert_allocation_state(&fs, &block_device, false, TEST_INITIAL_FREE_BLOCKS); + } + + #[test] + fn dealloc_block_rolls_back_when_block_group_write_fails() { + let (block_device, fs) = load_failing_test_fs(); + let mut inode = fs.read_inode(2).unwrap(); + let pblock = fs.alloc_block(&mut inode).unwrap(); + assert_eq!(pblock, TEST_XATTR_BLOCK); + assert_allocation_state(&fs, &block_device, true, TEST_INITIAL_FREE_BLOCKS - 1); + block_device.fail_once_on_write(1); + + let err = fs.dealloc_block(&mut inode, pblock).unwrap_err(); + + assert_eq!(err.code(), ErrCode::EIO); + 
assert_allocation_state(&fs, &block_device, true, TEST_INITIAL_FREE_BLOCKS - 1); + } + + #[test] + fn dealloc_block_rolls_back_when_superblock_write_fails() { + let (block_device, fs) = load_failing_test_fs(); + let mut inode = fs.read_inode(2).unwrap(); + let pblock = fs.alloc_block(&mut inode).unwrap(); + assert_eq!(pblock, TEST_XATTR_BLOCK); + assert_allocation_state(&fs, &block_device, true, TEST_INITIAL_FREE_BLOCKS - 1); + block_device.fail_once_on_write(0); + + let err = fs.dealloc_block(&mut inode, pblock).unwrap_err(); + + assert_eq!(err.code(), ErrCode::EIO); + assert_allocation_state(&fs, &block_device, true, TEST_INITIAL_FREE_BLOCKS - 1); + } } diff --git a/kernel/crates/another_ext4/src/ext4/rw.rs b/kernel/crates/another_ext4/src/ext4/rw.rs index d0c45c7733..db0ca5eee2 100644 --- a/kernel/crates/another_ext4/src/ext4/rw.rs +++ b/kernel/crates/another_ext4/src/ext4/rw.rs @@ -108,18 +108,17 @@ impl Ext4 { /// Write a block group descriptor to disk and update cache, without checksum. #[allow(unused)] pub(super) fn write_block_group_without_csum(&self, bg_ref: &BlockGroupRef) -> Result<()> { - // Update cache - if let Some(cached) = self.cached_block_groups.get(bg_ref.id as usize) { - *cached.lock() = bg_ref.desc; - } - // Write to disk let sb = self.read_super_block_cached(); let desc_per_block = BLOCK_SIZE as u32 / sb.desc_size() as u32; let block_id = sb.first_data_block() + bg_ref.id / desc_per_block + 1; let offset = (bg_ref.id % desc_per_block) * sb.desc_size() as u32; let mut block = self.read_block(block_id as PBlockId)?; block.write_offset_as(offset as usize, &bg_ref.desc); - self.write_block(&block) + self.write_block(&block)?; + if let Some(cached) = self.cached_block_groups.get(bg_ref.id as usize) { + *cached.lock() = bg_ref.desc; + } + Ok(()) } /// Get disk position of an inode. Return block id and offset within the block. 
diff --git a/kernel/src/arch/loongarch64/process/mod.rs b/kernel/src/arch/loongarch64/process/mod.rs index 5952b5af82..f047e14ee6 100644 --- a/kernel/src/arch/loongarch64/process/mod.rs +++ b/kernel/src/arch/loongarch64/process/mod.rs @@ -52,6 +52,10 @@ impl ArchPCBInfo { pub fn new(kstack: &KernelStack) -> Self { todo!("la64: ArchPCBInfo::new") } + + /// Synchronize hardware-backed current-thread state before common fork code + /// clones `ArchPCBInfo`. + pub fn sync_current_state_before_fork(&mut self) {} } impl ProcessControlBlock { diff --git a/kernel/src/arch/riscv64/process/mod.rs b/kernel/src/arch/riscv64/process/mod.rs index 55025badfa..102fc35c7d 100644 --- a/kernel/src/arch/riscv64/process/mod.rs +++ b/kernel/src/arch/riscv64/process/mod.rs @@ -432,6 +432,10 @@ impl ArchPCBInfo { *self = from.clone(); } + /// Synchronize hardware-backed current-thread state before common fork code + /// clones `ArchPCBInfo`. + pub fn sync_current_state_before_fork(&mut self) {} + pub fn set_stack(&mut self, stack: VirtAddr) { self.ksp = stack.data(); } diff --git a/kernel/src/arch/x86_64/interrupt/trap.rs b/kernel/src/arch/x86_64/interrupt/trap.rs index fc1cae9d1c..60c5b6faaf 100644 --- a/kernel/src/arch/x86_64/interrupt/trap.rs +++ b/kernel/src/arch/x86_64/interrupt/trap.rs @@ -200,6 +200,20 @@ unsafe extern "C" fn do_bounds(regs: &'static TrapFrame, error_code: u64) { /// 处理未定义操作码异常 6 #UD #[no_mangle] unsafe extern "C" fn do_undefined_opcode(regs: &'static TrapFrame, error_code: u64) { + if regs.is_from_user() { + CurrentIrqArch::interrupt_enable(); + if let Err(err) = force_kernel_signal_to_current(Signal::SIGILL) { + error!( + "failed to send SIGILL for user undefined opcode, pid: {:?}, rip: {:#x}, error_code: {:#x}, err: {:?}", + ProcessManager::current_pid(), + regs.rip, + error_code, + err + ); + } + return; + } + error!( "do_undefined_opcode(6), \tError code: {:#x},\trsp: {:#x},\trip: {:#x},\t CPU: {}, \tpid: {:?}", error_code, diff --git 
a/kernel/src/arch/x86_64/ipc/signal.rs b/kernel/src/arch/x86_64/ipc/signal.rs index 1a9455b55b..17a54dd77f 100644 --- a/kernel/src/arch/x86_64/ipc/signal.rs +++ b/kernel/src/arch/x86_64/ipc/signal.rs @@ -29,7 +29,7 @@ use crate::{ }, mm::MemoryManagementArch, process::ProcessManager, - syscall::user_access::UserBufferWriter, + syscall::user_access::{UserBufferReader, UserBufferWriter}, }; /// 信号处理的栈的栈指针的最小对齐数量 @@ -37,6 +37,36 @@ pub const STACK_ALIGN: u64 = 16; /// 信号最大值 pub const MAX_SIG_NUM: usize = 64; +const UC_FP_XSTATE: u64 = 0x1; +const UC_SIGCONTEXT_SS: u64 = 0x2; +const UC_STRICT_RESTORE_SS: u64 = 0x4; + +const FP_XSTATE_MAGIC1: u32 = 0x4650_5853; +const FP_XSTATE_MAGIC2: u32 = 0x4650_5845; +const FP_XSTATE_MAGIC2_SIZE: usize = size_of::(); +const FPSTATE_FRAME_SIZE: usize = size_of::() + FP_XSTATE_MAGIC2_SIZE; + +const X86_EFLAGS_CF: u64 = 1 << 0; +const X86_EFLAGS_PF: u64 = 1 << 2; +const X86_EFLAGS_AF: u64 = 1 << 4; +const X86_EFLAGS_ZF: u64 = 1 << 6; +const X86_EFLAGS_SF: u64 = 1 << 7; +const X86_EFLAGS_TF: u64 = 1 << 8; +const X86_EFLAGS_DF: u64 = 1 << 10; +const X86_EFLAGS_OF: u64 = 1 << 11; +const X86_EFLAGS_RF: u64 = 1 << 16; +const X86_EFLAGS_AC: u64 = 1 << 18; +const FIX_EFLAGS: u64 = X86_EFLAGS_AC + | X86_EFLAGS_OF + | X86_EFLAGS_DF + | X86_EFLAGS_TF + | X86_EFLAGS_SF + | X86_EFLAGS_ZF + | X86_EFLAGS_AF + | X86_EFLAGS_PF + | X86_EFLAGS_CF + | X86_EFLAGS_RF; + // ===== Linux 兼容的信号栈帧结构 ===== /// XSAVE header 结构(64 字节,位于偏移 512) @@ -78,6 +108,16 @@ struct UserFpState64 { pub reserved3: [u32; 12], } +#[repr(C)] +#[derive(Debug, Clone, Copy, Default)] +struct FpxSwBytes { + magic1: u32, + extended_size: u32, + xfeatures: u64, + xstate_size: u32, + padding: [u32; 7], +} + /// 完整的 XSAVE 状态结构(包含 AVX 扩展) /// 布局: /// - 0-511: FXSAVE 兼容区域 (UserFpState64) @@ -162,11 +202,8 @@ impl UserSigSet { } } -/// 与 Linux 兼容的 ucontext 结构 -/// 参考: /usr/include/bits/types/struct_ucontext.h -/// -/// 注意:为了支持 AVX,我们扩展了 __fpregs_mem 以包含完整的 XSAVE 状态。 -/// 这与 Linux 
的布局略有不同,但对用户态透明(通过 fpstate 指针访问)。 +/// 与 Linux x86_64 内核 sigframe 兼容的 ucontext 结构 +/// 参考: Linux arch/x86/include/uapi/asm/ucontext.h #[repr(C)] #[derive(Debug, Clone, Copy)] struct UserUContext { @@ -175,25 +212,43 @@ struct UserUContext { pub uc_stack: StackT, pub uc_mcontext: UserSigContext, pub uc_sigmask: UserSigSet, // 使用 Linux 兼容的 1024-bit sigset - /// 实际的 fpstate 数据(包含完整的 XSAVE 状态以支持 AVX) - pub __fpregs_mem: UserXState, } // 编译期校验关键字段偏移量与 Linux 的兼容性 -// 注意:uc_sigmask 之前的字段保持与 Linux 兼容 -// __fpregs_mem 由于 UserXState 需要 64 字节对齐,可能有填充 const _: () = { assert!(core::mem::offset_of!(UserUContext, uc_stack) == 16); assert!(core::mem::offset_of!(UserUContext, uc_mcontext) == 40); assert!(core::mem::offset_of!(UserUContext, uc_sigmask) == 296); - // __fpregs_mem 需要 64 字节对齐,所以偏移量会被调整 - // 424 + padding to 64-byte boundary = 448 - assert!(core::mem::offset_of!(UserUContext, __fpregs_mem) % 64 == 0); + assert!(core::mem::size_of::() == 424); // UserXState = 512 (FXSAVE) + 64 (header) + 256 (AVX) = 832 bytes assert!(core::mem::size_of::() == 832); }; impl UserXState { + fn user_size() -> usize { + size_of::() + } + + fn build_sw_bytes(&self) -> FpxSwBytes { + FpxSwBytes { + magic1: FP_XSTATE_MAGIC1, + extended_size: (Self::user_size() + FP_XSTATE_MAGIC2_SIZE) as u32, + xfeatures: self.header.xfeatures, + xstate_size: Self::user_size() as u32, + padding: [0; 7], + } + } + + fn install_sw_bytes(&mut self) { + let sw_bytes = self.build_sw_bytes(); + self.fpstate.reserved3[0] = sw_bytes.magic1; + self.fpstate.reserved3[1] = sw_bytes.extended_size; + self.fpstate.reserved3[2] = sw_bytes.xfeatures as u32; + self.fpstate.reserved3[3] = (sw_bytes.xfeatures >> 32) as u32; + self.fpstate.reserved3[4] = sw_bytes.xstate_size; + self.fpstate.reserved3[5..12].copy_from_slice(&sw_bytes.padding); + } + fn validate_for_sigreturn(&self) -> Result<(), SystemError> { let mxcsr_mask = FpState::mxcsr_feature_mask(); if self.fpstate.mxcsr & !mxcsr_mask != 0 { @@ -290,11 +345,13 @@ impl 
UserXState { } } - Self { + let mut user_xstate = Self { fpstate, header, avx, - } + }; + user_xstate.install_sw_bytes(); + user_xstate } /// 从用户态 XSAVE 状态转换回内核 FpState @@ -468,8 +525,13 @@ impl UserUContext { /// 从 TrapFrame 创建 UserUContext #[inline(never)] pub fn from_trapframe(frame: &TrapFrame, oldset: &SigSet, cr2: u64) -> Self { + let mut uc_flags = UC_SIGCONTEXT_SS | UC_STRICT_RESTORE_SS; + if FpState::is_xsave_enabled() { + uc_flags |= UC_FP_XSTATE; + } + Self { - uc_flags: 0, + uc_flags, uc_link: core::ptr::null_mut(), uc_stack: StackT { ss_sp: core::ptr::null_mut(), @@ -507,7 +569,6 @@ impl UserUContext { reserved1: [0; 8], }, uc_sigmask: UserSigSet::from_kernel_sigset(oldset), - __fpregs_mem: UserXState::default(), } } @@ -530,7 +591,7 @@ impl UserUContext { frame.rcx = self.uc_mcontext.rcx; frame.rsp = self.uc_mcontext.rsp; frame.rip = self.uc_mcontext.rip; - frame.rflags = self.uc_mcontext.eflags; + frame.rflags = (frame.rflags & !FIX_EFLAGS) | (self.uc_mcontext.eflags & FIX_EFLAGS); // 注意: cs, ss 等段寄存器不恢复,由内核管理 } } @@ -582,54 +643,51 @@ impl Default for X86SigStack { } /// Linux 兼容的信号栈帧结构 -/// 这个结构布局与 Linux 完全兼容,用户态可以通过 ucontext 访问寄存器和 FP 状态 -#[repr(C, align(16))] +/// x86_64 Linux 布局为 pretcode, ucontext, siginfo,FP/XSAVE 状态跟随在结构之后。 +#[repr(C)] #[derive(Debug, Clone, Copy)] struct SigFrame { /// 指向restorer的地址的指针 pub ret_code_ptr: *mut c_void, + /// ucontext_t 结构 + pub ucontext: UserUContext, /// siginfo_t 结构 pub siginfo: PosixSigInfo, - /// ucontext_t 结构(内含 fpstate 和 __ssp) - pub ucontext: UserUContext, } impl SigFrame { - /// 安全地设置 fpstate 指针,指向 ucontext 内的 __fpregs_mem 的 FXSAVE 兼容部分 - pub fn setup_fpstate_pointer(&mut self) { - // fpstate 指针指向 UserXState 的 fpstate 字段(FXSAVE 兼容部分) + /// 设置 fpstate 指针,指向 sigframe 后方独立的 64 字节对齐 XSAVE 区域。 + pub fn setup_fpstate_pointer(&mut self, fpstate_ptr: *mut UserXState) { self.ucontext.uc_mcontext.fpstate = - &mut self.ucontext.__fpregs_mem.fpstate as *mut UserFpState64; + unsafe { &mut (*fpstate_ptr).fpstate as 
*mut UserFpState64 }; } - /// 安全地获取完整 fpstate (包含 AVX) 的可变引用 - pub fn fpstate_mut(&mut self) -> &mut UserXState { - &mut self.ucontext.__fpregs_mem - } - - /// 从栈帧恢复 fpstate,包含安全性检查(防止 SROP 攻击) + /// 从 sigcontext 指向的用户 fpstate 恢复完整 XSAVE 状态。 /// 返回包含完整 XSAVE 状态(包括 AVX)的 FpState pub fn restore_fpstate(&self) -> Result, SystemError> { if self.ucontext.uc_mcontext.fpstate.is_null() { return Ok(None); } - // 验证指针确实指向 ucontext 内的 __fpregs_mem.fpstate - let expected_addr = &self.ucontext.__fpregs_mem.fpstate as *const UserFpState64; - if !core::ptr::eq(self.ucontext.uc_mcontext.fpstate as *const _, expected_addr) { - // 指针被篡改,这可能是 SROP 攻击 - error!( - "fpstate pointer mismatch: expected={:p}, got={:p}, possible SROP attack", - expected_addr, self.ucontext.uc_mcontext.fpstate - ); - return Err(SystemError::EFAULT); - } - - // 使用 UserXState::to_kernel_fpstate 恢复完整的 XSAVE 状态(包括 AVX) - self.ucontext.__fpregs_mem.to_kernel_fpstate().map(Some) + let fpstate_ptr = self.ucontext.uc_mcontext.fpstate as *const UserXState; + let reader = UserBufferReader::new(fpstate_ptr, size_of::(), true)?; + let mut user_xstate = UserXState::default(); + reader.copy_one_from_user(&mut user_xstate, 0)?; + user_xstate.to_kernel_fpstate().map(Some) } } +const _: () = { + assert!(core::mem::offset_of!(SigFrame, ucontext) == 8); + assert!(core::mem::offset_of!(SigFrame, siginfo) == 432); + assert!(core::mem::size_of::() == 560); +}; + +struct SignalFrameLocation { + frame: *mut SigFrame, + fpstate: *mut UserXState, +} + unsafe fn do_signal(frame: &mut TrapFrame, got_signal: &mut bool) { let pcb = ProcessManager::current_pcb(); @@ -755,6 +813,12 @@ unsafe fn do_signal(frame: &mut TrapFrame, got_signal: &mut bool) { } *got_signal = true; + let mut blocked = oldset | sigaction.mask(); + if !sigaction.flags().contains(SigFlags::SA_NODEFER) { + blocked.insert(sig_number.into()); + } + set_current_blocked(&mut blocked); + // 注意!由于handle_signal里面可能会退出进程, // 因此这里需要检查清楚:上面所有的锁、arc指针都被释放了。否则会产生资源泄露的问题! 
let res: Result = @@ -840,7 +904,6 @@ impl SignalArch for X86_64SignalArch { } let frame = unsafe { &*frame_ptr }; - // 1. 恢复信号掩码(从 1024-bit 用户态格式转换到 64-bit 内核格式) let mut sigmask = frame.ucontext.uc_sigmask.to_kernel_sigset(); set_current_blocked(&mut sigmask); @@ -988,7 +1051,8 @@ fn setup_frame( } // 分配新的信号栈帧 - let frame_ptr: *mut SigFrame = get_stack(sigaction, trap_frame, size_of::()); + let frame_location = get_stack(sigaction, trap_frame, size_of::()); + let frame_ptr = frame_location.frame; // 验证地址位于用户空间 UserBufferWriter::new(frame_ptr, size_of::(), true).map_err(|_| { @@ -999,6 +1063,14 @@ fn setup_frame( ); SystemError::EFAULT })?; + UserBufferWriter::new(frame_location.fpstate, FPSTATE_FRAME_SIZE, true).map_err(|_| { + error!("In setup_frame: fpstate access check failed"); + let _ = crate::ipc::kill::send_signal_to_pid( + ProcessManager::current_pcb().raw_pid(), + Signal::SIGSEGV, + ); + SystemError::EFAULT + })?; // 获取栈帧的可变引用(唯一需要 unsafe 的地方) let frame = unsafe { &mut *frame_ptr }; @@ -1022,10 +1094,18 @@ fn setup_frame( // 3. 写入用户栈(可能触发缺页,必须在释放锁后进行) frame.ucontext = user_ucontext; if let Some(fpstate) = user_fpstate { - *frame.fpstate_mut() = fpstate; + let mut fp_writer = + UserBufferWriter::new(frame_location.fpstate, size_of::(), true)?; + fp_writer.copy_one_to_user(&fpstate, 0)?; + let mut magic_writer = UserBufferWriter::new( + unsafe { (frame_location.fpstate as *mut u8).add(size_of::()) as *mut u32 }, + FP_XSTATE_MAGIC2_SIZE, + true, + )?; + magic_writer.copy_one_to_user(&FP_XSTATE_MAGIC2, 0)?; } // 设置 fpstate 指针指向栈帧内的 fpstate - frame.setup_fpstate_pointer(); + frame.setup_fpstate_pointer(frame_location.fpstate); // 4. 复制 siginfo info.copy_posix_siginfo_to_user(&mut frame.siginfo as *mut PosixSigInfo) @@ -1042,18 +1122,20 @@ fn setup_frame( // 6. 
设置 trap_frame,准备进入信号处理函数 trap_frame.rdi = sig as u64; // 参数1: 信号编号 + trap_frame.rax = 0; // Linux x86_64: support handlers declared without prototypes trap_frame.rsi = &frame.siginfo as *const _ as u64; // 参数2: siginfo_t* trap_frame.rdx = &frame.ucontext as *const _ as u64; // 参数3: ucontext_t* trap_frame.rsp = frame_ptr as u64; trap_frame.rip = handler_addr as u64; trap_frame.cs = (USER_CS.bits() | 0x3) as u64; trap_frame.ds = (USER_DS.bits() | 0x3) as u64; + trap_frame.rflags &= !(X86_EFLAGS_DF | X86_EFLAGS_RF | X86_EFLAGS_TF); Ok(0) } #[inline(always)] -fn get_stack(sigaction: &mut Sigaction, frame: &TrapFrame, size: usize) -> *mut SigFrame { +fn get_stack(sigaction: &mut Sigaction, frame: &TrapFrame, size: usize) -> SignalFrameLocation { let pcb = ProcessManager::current_pcb(); let stack = pcb.sig_altstack(); @@ -1064,14 +1146,20 @@ fn get_stack(sigaction: &mut Sigaction, frame: &TrapFrame, size: usize) -> *mut && !stack.flags.contains(SigStackFlags::SS_DISABLE) && !stack.on_sig_stack(frame.rsp as usize) { - rsp = stack.sp + stack.size as usize - size; + rsp = stack.sp + stack.size as usize; } else { - // 默认使用用户栈:rsp - 红区(128) - size - rsp = (frame.rsp as usize) - 128 - size; + // 默认使用用户栈:先跳过 x86_64 ABI red zone。 + rsp = (frame.rsp as usize) - 128; } - // 16字节对齐,减8是为了保持 x86_64 ABI 的栈对齐约定 + let fpstate = (rsp - FPSTATE_FRAME_SIZE) & !(64 - 1); + rsp = fpstate - size; + + // 16字节对齐,减8是为了保持 x86_64 ABI 的栈对齐约定。 rsp = (rsp & !(STACK_ALIGN - 1) as usize) - 8; - rsp as *mut SigFrame + SignalFrameLocation { + frame: rsp as *mut SigFrame, + fpstate: fpstate as *mut UserXState, + } } diff --git a/kernel/src/arch/x86_64/mm/fault.rs b/kernel/src/arch/x86_64/mm/fault.rs index 5143013eb7..449208c70e 100644 --- a/kernel/src/arch/x86_64/mm/fault.rs +++ b/kernel/src/arch/x86_64/mm/fault.rs @@ -18,7 +18,7 @@ use crate::{ mm::{ fault::{FaultFlags, PageFaultHandler, PageFaultMessage}, ucontext::{AddressSpace, LockedVMA}, - VirtAddr, VmFaultReason, VmFlags, + VirtAddr, VirtRegion, 
VmFaultReason, VmFlags, }, process::ProcessManager, }; @@ -316,12 +316,27 @@ impl X86_64MMArch { let current_address_space: Arc = AddressSpace::current().unwrap(); let mut space_guard = current_address_space.write(); + let fault_region = VirtRegion::new( + VirtAddr::new(address.data() & !MMArch::PAGE_OFFSET_MASK), + MMArch::PAGE_SIZE, + ); let mut fault; loop { let vma = space_guard.mappings.find_nearest(address); let vma = match vma { Some(vma) => vma, None => { + if space_guard + .mappings + .first_reservation_conflict(fault_region) + .is_some() + { + drop(space_guard); + current_address_space.wait_for_no_reservation_conflict(fault_region); + space_guard = current_address_space.write(); + continue; + } + log::error!( "pid:{}, can not find nearest vma, \n\terror_code: {:?}, address: {:#x}, rip: {:#x}", ProcessManager::current_pid().data(), @@ -345,8 +360,30 @@ impl X86_64MMArch { drop(guard); if !region.contains(address) { + if space_guard + .mappings + .first_reservation_conflict(fault_region) + .is_some() + { + drop(space_guard); + current_address_space.wait_for_no_reservation_conflict(fault_region); + space_guard = current_address_space.write(); + continue; + } + if vm_flags.contains(VmFlags::VM_GROWSDOWN) { let extension_size = region.start() - address; + let extension_region = VirtRegion::new(address, extension_size); + if space_guard + .mappings + .first_reservation_conflict(extension_region) + .is_some() + { + drop(space_guard); + current_address_space.wait_for_no_reservation_conflict(extension_region); + space_guard = current_address_space.write(); + continue; + } // 首先检查地址是否在栈的合理扩展范围内 // 如果地址距离栈底太远(超过最大栈限制),则这不是一个栈扩展请求, @@ -403,8 +440,6 @@ impl X86_64MMArch { address.data(), flags ); - log::error!("fault rip: {:#x}", regs.rip); - // 地址不在VMA范围内,检查是否需要异常表修复 if handle_kernel_access_failed(regs) { return; // 已通过异常表修复 diff --git a/kernel/src/arch/x86_64/process/mod.rs b/kernel/src/arch/x86_64/process/mod.rs index 013bfac72c..7bb15725ce 100644 --- 
a/kernel/src/arch/x86_64/process/mod.rs +++ b/kernel/src/arch/x86_64/process/mod.rs @@ -288,6 +288,15 @@ impl ArchPCBInfo { *self = from.clone_all(); self.gsdata = gsdata; } + + /// Synchronize hardware-backed current-thread state before common fork code + /// clones `ArchPCBInfo`. + pub fn sync_current_state_before_fork(&mut self) { + unsafe { + self.save_fsbase(); + self.save_gsbase(); + } + } } impl ProcessControlBlock { @@ -361,6 +370,11 @@ impl ProcessManager { // 注意:需要使用mut guard以便保存FP状态 let mut current_arch_guard = current_pcb.arch_info_irqsave(); + unsafe { + current_arch_guard.save_fsbase(); + current_arch_guard.save_gsbase(); + } + // 在拷贝FP状态之前,先从硬件寄存器保存当前的FP状态 // 这样确保即使在信号处理函数中fork,子进程也能继承fork时刻的真实FP寄存器状态 current_arch_guard.save_fp_state(); diff --git a/kernel/src/arch/x86_64/process/syscall.rs b/kernel/src/arch/x86_64/process/syscall.rs index e6ed6ac2dc..ba718c80f6 100644 --- a/kernel/src/arch/x86_64/process/syscall.rs +++ b/kernel/src/arch/x86_64/process/syscall.rs @@ -25,6 +25,10 @@ impl Syscall { ) -> Result<(), SystemError> { // debug!("write proc_init_info to user stack done"); + // glibc treats %rdx at _start as rtld_fini and calls it on exit. + // It must not inherit execve's envp argument across a successful exec. 
+ regs.rdx = 0; + // (兼容旧版libc)把argv的指针写到寄存器内 // TODO: 改写旧版libc,不再需要这个兼容 regs.rdi = param.init_info().args.len() as u64; diff --git a/kernel/src/debug/klog/loglevel.rs b/kernel/src/debug/klog/loglevel.rs index 202608fd7c..d47102fbd0 100644 --- a/kernel/src/debug/klog/loglevel.rs +++ b/kernel/src/debug/klog/loglevel.rs @@ -24,6 +24,10 @@ use crate::init::{ pub static KERNEL_LOG_LEVEL: KernelLogLevel = KernelLogLevel::new(); const QUIET_CONSOLE_LOGLEVEL: u8 = 4; +const DEFAULT_CONSOLE_LOGLEVEL: u8 = 7; +const DEFAULT_MESSAGE_LOGLEVEL: u8 = 4; +const MINIMUM_CONSOLE_LOGLEVEL: u8 = 1; +const MAXIMUM_CONSOLE_LOGLEVEL: u8 = 8; /// 日志级别 #[derive(Default, Clone, PartialEq, Debug)] @@ -86,16 +90,16 @@ impl KernelLogLevel { /// 创建新的内核日志级别配置 /// /// 默认值遵循Linux内核标准: - /// - console_loglevel: 7 (DEBUG) + /// - console_loglevel: 7 (默认输出INFO及以上级别,不输出DEBUG) /// - default_message_loglevel: 4 (WARNING) /// - minimum_console_loglevel: 1 (ALERT) - /// - default_console_loglevel: 7 (DEBUG) + /// - default_console_loglevel: 7 pub const fn new() -> Self { Self { - console_level: AtomicU8::new(7), // DEBUG级别 - default_message_level: AtomicU8::new(4), // WARNING级别 - minimum_level: AtomicU8::new(1), // ALERT级别 - default_console_level: AtomicU8::new(7), // DEBUG级别 + console_level: AtomicU8::new(DEFAULT_CONSOLE_LOGLEVEL), + default_message_level: AtomicU8::new(DEFAULT_MESSAGE_LOGLEVEL), + minimum_level: AtomicU8::new(MINIMUM_CONSOLE_LOGLEVEL), + default_console_level: AtomicU8::new(DEFAULT_CONSOLE_LOGLEVEL), } } @@ -105,27 +109,28 @@ impl KernelLogLevel { /// - `message_level`: 消息级别 (0-7) /// /// # 返回值 - /// - `true`: 消息级别 <= 控制台级别,应该输出 - /// - `false`: 消息级别 > 控制台级别,应该过滤 + /// - `true`: 消息级别 < 控制台级别,应该输出 + /// - `false`: 消息级别 >= 控制台级别,应该过滤 /// /// # Linux语义 - /// 遵循Linux内核的过滤规则:message_level <= console_loglevel + /// 遵循Linux内核的过滤规则:message_level < console_loglevel pub fn should_print(&self, message_level: LogLevel) -> bool { let console_level = self.console_level.load(Ordering::Acquire); - // 
Linux语义:message_level <= console_loglevel 时输出 - message_level as u8 <= console_level + // Linux语义:message_level >= console_loglevel 时抑制输出。 + // 因此默认console_loglevel=7时,INFO(6)会输出,DEBUG(7)不会输出。 + (message_level as u8) < console_level } /// 设置控制台日志级别 /// /// # 参数 - /// - `level`: 新的控制台级别 (0-7) + /// - `level`: 新的控制台级别 (0-8) /// /// # 返回值 /// - `Ok(())`: 设置成功 /// - `Err(SystemError::EINVAL)`: 级别值无效 pub fn set_console_level(&self, level: u8) -> Result<(), SystemError> { - if level <= 7 { + if level <= MAXIMUM_CONSOLE_LOGLEVEL { self.console_level.store(level, Ordering::Release); Ok(()) } else { @@ -169,15 +174,16 @@ impl KernelLogLevel { /// loglevel命令行参数处理 /// -/// 支持格式:loglevel=N (N=0-7) +/// 支持格式:loglevel=N (N=0-8) /// 示例:loglevel=4 只输出WARNING及以上级别的日志 +/// 示例:loglevel=8 输出DEBUG及以上级别的日志 #[linkme::distributed_slice(KCMDLINE_PARAM_KV)] static LOGLEVEL_PARAM: KernelCmdlineParameter = KernelCmdlineParameter::KV(KernelCmdlineKV { name: "loglevel", value: None, initialized: false, supplied: false, - default: "7", // 默认DEBUG级别 + default: "7", }); kernel_cmdline_param_arg!(QUIET_PARAM, quiet, false, false); @@ -213,10 +219,14 @@ pub fn handle_loglevel_param() { continue; }; if let Ok(level) = value_str.parse::() { - if level <= 7 { + if level <= MAXIMUM_CONSOLE_LOGLEVEL { selected_level = Some(level); } else { - log::warn!("loglevel: invalid level {}, must be 0-7", level); + log::warn!( + "loglevel: invalid level {}, must be 0-{}", + level, + MAXIMUM_CONSOLE_LOGLEVEL + ); } } else { log::warn!("loglevel: invalid value '{}', must be a number", value_str); diff --git a/kernel/src/driver/tty/pty/mod.rs b/kernel/src/driver/tty/pty/mod.rs index 3cd8f1a825..8e366189e9 100644 --- a/kernel/src/driver/tty/pty/mod.rs +++ b/kernel/src/driver/tty/pty/mod.rs @@ -98,6 +98,13 @@ impl PtyCommon { return Err(SystemError::EIO); } + if core.driver().tty_driver_sub_type() == TtyDriverSubType::PtySlave + && link_core.count() != 1 + { + core.flags_write().insert(TtyFlag::IO_ERROR); + return 
Err(SystemError::EIO); + } + core.flags_write().remove(TtyFlag::IO_ERROR); link_core.flags_write().remove(TtyFlag::OTHER_CLOSED); core.flags_write().insert(TtyFlag::THROTTLED); diff --git a/kernel/src/driver/tty/pty/unix98pty.rs b/kernel/src/driver/tty/pty/unix98pty.rs index 6a8cca6002..f2872e882a 100644 --- a/kernel/src/driver/tty/pty/unix98pty.rs +++ b/kernel/src/driver/tty/pty/unix98pty.rs @@ -2,7 +2,6 @@ use alloc::{ string::ToString, sync::{Arc, Weak}, }; -use core::sync::atomic::{AtomicBool, Ordering}; use system_error::SystemError; use crate::{ @@ -19,7 +18,10 @@ use crate::{ file::FileFlags, FilePrivateData, FileSystem, FileType, IndexNode, InodeMode, MountFS, }, }, - libs::{casting::DowncastArc, mutex::MutexGuard}, + libs::{ + casting::DowncastArc, + mutex::{Mutex, MutexGuard}, + }, mm::VirtAddr, process::ProcessManager, syscall::user_access::UserBufferWriter, @@ -52,14 +54,21 @@ struct PtyDevPtsLink { /// devpts 文件系统本体,用于精确回收索引(避免再去 downcast/全局路径查找) devpts: Weak, index: usize, - /// master 侧(ptmx)最后一个 fd 已关闭 - master_closed: AtomicBool, - /// slave 侧(/dev/pts/N)最后一个 fd 已关闭 - slave_closed: AtomicBool, - /// 目录项是否已经 unlink(通常在 master close 时执行) - unlinked: AtomicBool, - /// 索引是否已经归还(仅在 master+slave 都关闭后才允许归还) - index_freed: AtomicBool, + state: Mutex, +} + +#[derive(Debug, Default)] +struct PtyDevPtsState { + /// master 侧(ptmx)最后一个 fd 已关闭。 + master_closed: bool, + /// slave open 已经进入 driver open,但尚未提交为 active fd。 + slave_opening: usize, + /// 是否存在已成功打开的 userspace slave fd。 + slave_active: bool, + /// 目录项是否已经 unlink(通常在 master close 时执行)。 + unlinked: bool, + /// 索引是否已经归还(仅在 master close 且无 opening/active slave 后允许归还)。 + index_freed: bool, } impl crate::driver::tty::tty_driver::TtyCorePrivateField for PtyDevPtsLink { @@ -74,23 +83,20 @@ impl PtyDevPtsLink { pts_root, devpts, index, - master_closed: AtomicBool::new(false), - slave_closed: AtomicBool::new(false), - unlinked: AtomicBool::new(false), - index_freed: AtomicBool::new(false), + state: 
Mutex::new(PtyDevPtsState::default()), } } fn on_close(&self, subtype: TtyDriverSubType) { match subtype { TtyDriverSubType::PtyMaster => { - self.master_closed.store(true, Ordering::SeqCst); + self.state.lock().master_closed = true; // Linux 语义:master 关闭后,/dev/pts/N 目录项应从 devpts 中消失; // 但索引不能立即复用(slave 可能仍持有打开的 fd),因此 unlink 与 free_index 分离。 self.try_unlink_once(); } TtyDriverSubType::PtySlave => { - self.slave_closed.store(true, Ordering::SeqCst); + self.state.lock().slave_active = false; } _ => {} } @@ -98,8 +104,57 @@ impl PtyDevPtsLink { self.try_free_index_when_fully_closed(); } + fn begin_slave_open(&self) -> Result<(), SystemError> { + let mut state = self.state.lock(); + if state.master_closed || state.index_freed { + return Err(SystemError::EIO); + } + state.slave_opening += 1; + Ok(()) + } + + fn finish_slave_open(&self) { + { + let mut state = self.state.lock(); + if state.slave_opening == 0 { + log::warn!( + "PtyDevPtsLink: finish slave open without matching begin, index={}", + self.index + ); + return; + } + state.slave_opening -= 1; + state.slave_active = true; + } + self.try_free_index_when_fully_closed(); + } + + fn abort_slave_open(&self) { + { + let mut state = self.state.lock(); + if state.slave_opening == 0 { + log::warn!( + "PtyDevPtsLink: abort slave open without matching begin, index={}", + self.index + ); + return; + } + state.slave_opening -= 1; + } + self.try_free_index_when_fully_closed(); + } + fn try_unlink_once(&self) { - if self.unlinked.swap(true, Ordering::SeqCst) { + let should_unlink = { + let mut state = self.state.lock(); + if state.unlinked { + false + } else { + state.unlinked = true; + true + } + }; + if !should_unlink { return; } if let Some(root) = self.pts_root.upgrade() { @@ -108,16 +163,32 @@ impl PtyDevPtsLink { } fn try_free_index_when_fully_closed(&self) { - if !(self.master_closed.load(Ordering::SeqCst) && self.slave_closed.load(Ordering::SeqCst)) - { + let (should_unlink, should_free_index) = { + let mut state = 
self.state.lock(); + if !state.master_closed + || state.slave_opening != 0 + || state.slave_active + || state.index_freed + { + (false, false) + } else { + state.index_freed = true; + let should_unlink = !state.unlinked; + state.unlinked = true; + (should_unlink, true) + } + }; + + if !should_free_index { return; } - if self.index_freed.swap(true, Ordering::SeqCst) { - return; + + if should_unlink { + if let Some(root) = self.pts_root.upgrade() { + let _ = root.unlink(&self.index.to_string()); + } } - // 兜底:如果 master 未触发 unlink(异常路径),在最终回收时再尝试一次。 - self.try_unlink_once(); if let Some(devpts) = self.devpts.upgrade() { devpts.free_index(self.index); } @@ -139,7 +210,29 @@ impl TtyOperation for Unix98PtyDriverInner { } fn open(&self, tty: &TtyCoreData) -> Result<(), SystemError> { - PtyCommon::pty_common_open(tty) + let subtype = tty.driver().tty_driver_sub_type(); + + if subtype == TtyDriverSubType::PtySlave { + if let Some(hook_arc) = tty.private_fields() { + if let Some(hook) = hook_arc.as_any().downcast_ref::() { + hook.begin_slave_open()?; + return match PtyCommon::pty_common_open(tty) { + Ok(()) => { + hook.finish_slave_open(); + Ok(()) + } + Err(err) => { + hook.abort_slave_open(); + Err(err) + } + }; + } + } + } + + PtyCommon::pty_common_open(tty)?; + + Ok(()) } fn write(&self, tty: &TtyCoreData, buf: &[u8], nr: usize) -> Result { @@ -323,17 +416,19 @@ impl TtyOperation for Unix98PtyDriverInner { fn close(&self, tty: Arc) -> Result<(), SystemError> { let driver = tty.core().driver(); - // 通过 hook 精确管理 devpts 目录项与索引生命周期 - if let Some(hook_arc) = tty.private_fields() { - if let Some(hook) = hook_arc.as_any().downcast_ref::() { - hook.on_close(driver.tty_driver_sub_type()); - } - } + let core = tty.core(); + let subtype = driver.tty_driver_sub_type(); + + core.flags_write().insert(TtyFlag::IO_ERROR); + core.read_wq().wakeup_all(); + core.write_wq().wakeup_all(); + core.contorl_info_irqsave().packet = false; - if driver.tty_driver_sub_type() == 
TtyDriverSubType::PtySlave { - driver.ttys().remove(&tty.core().index()); - if let Some(link) = tty.core().link() { + if subtype == TtyDriverSubType::PtySlave { + let mut peer_closed = true; + if let Some(link) = core.link() { let link_core = link.core(); + peer_closed = link_core.flags().contains(TtyFlag::IO_ERROR); // set OTHER_CLOSED flag to tell master side that the slave side is closed link_core.flags_write().insert(TtyFlag::OTHER_CLOSED); // wake up waiting read/write queues on master side @@ -343,16 +438,31 @@ impl TtyOperation for Unix98PtyDriverInner { let epitems = link_core.epitems(); let _ = EventPoll::wakeup_epoll(epitems, EPollEventType::EPOLLHUP); } - } else if driver.tty_driver_sub_type() == TtyDriverSubType::PtyMaster { + if peer_closed { + driver.ttys().remove(&core.index()); + } + } else if subtype == TtyDriverSubType::PtyMaster { // master 侧最后关闭:从 driver 表移除自身(避免泄漏);devpts 的释放由 hook 统一处理 - driver.ttys().remove(&tty.core().index()); - if let Some(link) = tty.core().link() { + driver.ttys().remove(&core.index()); + core.flags_write().insert(TtyFlag::OTHER_CLOSED); + if let Some(link) = core.link() { let link_core = link.core(); link_core.flags_write().insert(TtyFlag::OTHER_CLOSED); link_core.read_wq().wakeup_all(); link_core.write_wq().wakeup_all(); let epitems = link_core.epitems(); let _ = EventPoll::wakeup_epoll(epitems, EPollEventType::EPOLLHUP); + if link_core.flags().contains(TtyFlag::IO_ERROR) { + link_core.driver().ttys().remove(&link_core.index()); + } + } + } + + // 通过 hook 精确管理 devpts 目录项与索引生命周期。必须放在 driver 表 + // 解绑之后,避免 index 释放后被新 PTY 复用又被旧 close 删除。 + if let Some(hook_arc) = tty.private_fields() { + if let Some(hook) = hook_arc.as_any().downcast_ref::() { + hook.on_close(subtype); } } @@ -399,7 +509,13 @@ pub fn ptmx_open( let index = fsinfo.alloc_index()?; - let tty = ptm_driver().init_tty_device(Some(index))?; + let tty = match ptm_driver().init_tty_device(Some(index)) { + Ok(tty) => tty, + Err(err) => { + 
fsinfo.free_index(index); + return Err(err); + } + }; // 设置privdata *data = FilePrivateData::Tty(TtyFilePrivateData { @@ -410,11 +526,17 @@ pub fn ptmx_open( let core = tty.core(); core.flags_write().insert(TtyFlag::PTY_LOCK); - let _ = pts_root_inode.create( + if let Err(err) = pts_root_inode.create( &index.to_string(), FileType::CharDevice, InodeMode::from_bits_truncate(0x666), - )?; + ) { + ptm_driver().ttys().remove(&index); + pts_driver().ttys().remove(&index); + fsinfo.free_index(index); + *data = FilePrivateData::Unused; + return Err(err); + } // 在 master/slave 两端记录 devpts 根目录与 fs,用于精确清理: // - master close: unlink /dev/pts/N @@ -429,7 +551,14 @@ pub fn ptmx_open( slave.set_private_fields(hook); } - ptm_driver().driver_funcs().open(core)?; + if let Err(err) = ptm_driver().driver_funcs().open(core) { + ptm_driver().ttys().remove(&index); + pts_driver().ttys().remove(&index); + let _ = pts_root_inode.unlink(&index.to_string()); + fsinfo.free_index(index); + *data = FilePrivateData::Unused; + return Err(err); + } Ok(()) } diff --git a/kernel/src/driver/tty/tty_core.rs b/kernel/src/driver/tty/tty_core.rs index f10c8a23a5..ae70937145 100644 --- a/kernel/src/driver/tty/tty_core.rs +++ b/kernel/src/driver/tty/tty_core.rs @@ -11,7 +11,7 @@ use system_error::SystemError; use crate::{ arch::ipc::signal::Signal, - driver::{base::device::device_number::DeviceNumber, tty::pty::ptm_driver}, + driver::base::device::device_number::DeviceNumber, filesystem::epoll::{event_poll::LockedEPItemLinkedList, EPollEventType, EPollItem}, libs::{ rwlock::{RwLock, RwLockReadGuard, RwLockUpgradableGuard, RwLockWriteGuard}, @@ -42,14 +42,6 @@ pub struct TtyCore { line_discipline: Arc, } -impl Drop for TtyCore { - fn drop(&mut self) { - if self.core.driver().tty_driver_sub_type() == TtyDriverSubType::PtySlave { - ptm_driver().ttys().remove(&self.core().index); - } - } -} - impl TtyCore { #[inline(never)] pub fn new(driver: Arc, index: usize) -> Arc { @@ -422,6 +414,10 @@ impl TtyCoreData { 
self.flags.write_irqsave() } + pub fn private_fields(&self) -> Option> { + self.privete_fields.lock().clone() + } + #[inline] pub fn termios(&self) -> RwLockReadGuard<'_, Termios> { self.termios.read_irqsave() diff --git a/kernel/src/driver/tty/tty_device.rs b/kernel/src/driver/tty/tty_device.rs index 235aa29669..a286df6be8 100644 --- a/kernel/src/driver/tty/tty_device.rs +++ b/kernel/src/driver/tty/tty_device.rs @@ -266,6 +266,8 @@ impl IndexNode for TtyDevice { let ret = tty.open(tty.core()); if let Err(err) = ret { + tty.core().dec_count(); + *data = FilePrivateData::Unused; if err == SystemError::ENOSYS { return Err(SystemError::ENODEV); } diff --git a/kernel/src/driver/tty/tty_driver.rs b/kernel/src/driver/tty/tty_driver.rs index 8d34ed9cae..5fda58d300 100644 --- a/kernel/src/driver/tty/tty_driver.rs +++ b/kernel/src/driver/tty/tty_driver.rs @@ -336,7 +336,15 @@ impl TtyDriver { tty.set_port(ports[core.index()].clone()); } // log::debug!("init_tty_device: to ldisc_setup"); - TtyLdiscManager::ldisc_setup(tty.clone(), tty.core().link())?; + if let Err(err) = TtyLdiscManager::ldisc_setup(tty.clone(), tty.core().link()) { + if self.tty_driver_type == TtyDriverType::Pty { + self.ttys.lock().remove(&idx); + if let Some(other_driver) = self.other_pty_driver() { + other_driver.ttys().remove(&idx); + } + } + return Err(err); + } // 对 PTY 来说,用户可见的设备节点由 devpts 挂载点下的动态节点提供, // 不应再向全局 devfs 注册(否则在新实例复用索引时会因已有的 ptm/ptsX 节点返回 EEXIST)。 diff --git a/kernel/src/driver/tty/tty_ldisc/ntty.rs b/kernel/src/driver/tty/tty_ldisc/ntty.rs index 9dc28ed7a8..a1483f1019 100644 --- a/kernel/src/driver/tty/tty_ldisc/ntty.rs +++ b/kernel/src/driver/tty/tty_ldisc/ntty.rs @@ -12,7 +12,7 @@ use crate::{ driver::tty::{ termios::{ControlCharIndex, InputMode, LocalMode, OutputMode, Termios}, tty_core::{EchoOperation, TtyCore, TtyCoreData, TtyFlag, TtyIoctlCmd, TtyPacketStatus}, - tty_driver::{TtyDriverFlag, TtyOperation}, + tty_driver::{TtyDriverFlag, TtyDriverSubType, TtyOperation}, 
tty_job_control::TtyJobCtrlManager, }, filesystem::{epoll::EPollEventType, vfs::file::FileFlags}, @@ -1680,8 +1680,10 @@ impl TtyLineDiscipline for NTtyLinediscipline { let core = tty.core(); if !ldata.input_available(core.termios(), false) { if core.flags().contains(TtyFlag::OTHER_CLOSED) { - // 对端已关闭且无数据可读,返回EOF而不是EIO,符合常规PTY语义 - ret = Ok(0); + if core.driver().tty_driver_sub_type() == TtyDriverSubType::PtyMaster { + // Linux pty master read after the last slave close returns EIO. + ret = Err(SystemError::EIO); + } break; } @@ -1709,14 +1711,16 @@ impl TtyLineDiscipline for NTtyLinediscipline { // let wakeup_helper = Timer::new(helper, timeout); // wakeup_helper.activate(); // drop(termios); - drop(ldata); - core.read_wq() - .sleep((EPollEventType::EPOLLIN | EPollEventType::EPOLLRDNORM).bits() as u64); + core.read_wq().sleep_unlock_spinlock( + (EPollEventType::EPOLLIN | EPollEventType::EPOLLRDNORM).bits() as u64, + ldata, + ); continue; } if ldata.icanon && !core.termios().local_mode.contains(LocalMode::EXTPROC) { - if ldata.canon_copy_from_read_buf(buf, &mut nr, &mut offset)? 
{ + let more = ldata.canon_copy_from_read_buf(buf, &mut nr, &mut offset)?; + if more { *cookie = true; offset += len - nr; return Ok(offset); @@ -1780,7 +1784,8 @@ impl TtyLineDiscipline for NTtyLinediscipline { return Err(SystemError::ERESTARTSYS); } if core.flags().contains(TtyFlag::HUPPED) - || core.flags().contains(TtyFlag::OTHER_CLOSED) + || (core.flags().contains(TtyFlag::OTHER_CLOSED) + && core.driver().tty_driver_sub_type() != TtyDriverSubType::PtyMaster) || core.flags().contains(TtyFlag::HUPPING) { return Err(SystemError::EIO); @@ -1956,13 +1961,14 @@ impl TtyLineDiscipline for NTtyLinediscipline { ldata.icanon = termios.local_mode.contains(LocalMode::ICANON); // 设置回显 - if termios.local_mode.contains(LocalMode::ECHO) { - ldata.echo = true; - } + ldata.echo = termios.local_mode.contains(LocalMode::ECHO); if termios.input_mode.contains(InputMode::ISTRIP) || termios.input_mode.contains(InputMode::IUCLC) || termios.input_mode.contains(InputMode::IGNCR) + || termios.input_mode.contains(InputMode::ICRNL) + || termios.input_mode.contains(InputMode::INLCR) + || termios.local_mode.contains(LocalMode::ICANON) || termios.input_mode.contains(InputMode::IXON) || termios.local_mode.contains(LocalMode::ISIG) || termios.local_mode.contains(LocalMode::ECHO) @@ -2042,7 +2048,7 @@ impl TtyLineDiscipline for NTtyLinediscipline { ldata .char_map - .set(ControlCharIndex::DISABLE_CHAR as usize, true); + .set(ControlCharIndex::DISABLE_CHAR as usize, false); ldata.raw = false; ldata.real_raw = false; } else { diff --git a/kernel/src/driver/virtio/transport_pci.rs b/kernel/src/driver/virtio/transport_pci.rs index 26f703bc73..de16215f0f 100644 --- a/kernel/src/driver/virtio/transport_pci.rs +++ b/kernel/src/driver/virtio/transport_pci.rs @@ -13,12 +13,13 @@ use crate::exception::IrqNumber; use crate::libs::volatile::{ReadOnly, Volatile, VolatileReadable, VolatileWritable, WriteOnly}; use crate::mm::VirtAddr; -use alloc::sync::Arc; +use alloc::{sync::Arc, vec, vec::Vec}; use core::{ 
fmt::{self, Display, Formatter}, mem::{align_of, size_of}, ptr::{self, addr_of_mut, NonNull}, }; +use log::warn; use virtio_drivers::{ transport::{DeviceStatus, DeviceType, Transport}, Error, Hal, PhysAddr, @@ -94,6 +95,8 @@ pub struct PciTransport { /// The start of the queue notification region within some BAR. notify_region: NonNull<[WriteOnly]>, notify_off_multiplier: u32, + /// Cached notify-region indices keyed by virtqueue index. + queue_notify_indices: Vec>, /// The ISR status register within some BAR. isr_status: NonNull>, /// The VirtIO device-specific configuration within some BAR. @@ -185,7 +188,7 @@ impl PciTransport { } } - let common_cfg = get_bar_region::<_>( + let common_cfg = get_bar_region::( &device.standard_device_bar.read(), &common_cfg.ok_or(VirtioPciError::MissingCommonConfig)?, )?; @@ -199,6 +202,8 @@ impl PciTransport { //debug!("notify.offset={},notify.length={}",notify_cfg.offset,notify_cfg.length); let notify_region = get_bar_region_slice::<_>(&device.standard_device_bar.read(), ¬ify_cfg)?; + let queue_count = unsafe { volread!(common_cfg, num_queues) as usize }; + let queue_notify_indices = vec![None; queue_count]; let isr_status = get_bar_region::<_>( &device.standard_device_bar.read(), &isr_cfg.ok_or(VirtioPciError::MissingIsrConfig)?, @@ -217,6 +222,7 @@ impl PciTransport { common_cfg, notify_region, notify_off_multiplier, + queue_notify_indices, isr_status, config_space, irq, @@ -232,6 +238,62 @@ impl PciTransport { pub fn irq(&self) -> IrqNumber { self.irq } + + fn cache_queue_notify_index(&mut self, queue: u16) -> Option { + let queue_index = queue as usize; + if queue_index >= self.queue_notify_indices.len() { + warn!( + "VirtIO PCI notify queue {} out of range, num_queues={}", + queue, + self.queue_notify_indices.len() + ); + return None; + } + + unsafe { + volwrite!(self.common_cfg, queue_select, queue); + let queue_notify_off = volread!(self.common_cfg, queue_notify_off); + let offset_bytes = + 
usize::from(queue_notify_off).checked_mul(self.notify_off_multiplier as usize); + let Some(offset_bytes) = offset_bytes else { + warn!( + "VirtIO PCI notify offset overflow: queue={}, notify_off={}, multiplier={}", + queue, queue_notify_off, self.notify_off_multiplier + ); + return None; + }; + + let Some(end_offset_bytes) = offset_bytes.checked_add(size_of::()) else { + warn!( + "VirtIO PCI notify offset end overflow: queue={}, offset_bytes={}", + queue, offset_bytes + ); + return None; + }; + let notify_region_len_bytes = self.notify_region.len() * size_of::(); + if end_offset_bytes > notify_region_len_bytes { + warn!( + "VirtIO PCI notify offset out of range: queue={}, offset_bytes={}, notify_region_len_bytes={}", + queue, + offset_bytes, + notify_region_len_bytes + ); + return None; + } + + let index = offset_bytes / size_of::(); + self.queue_notify_indices[queue_index] = Some(index); + Some(index) + } + } + + fn fail_bad_notify_config(&mut self, queue: u16) -> ! { + self.set_status(DeviceStatus::FAILED); + panic!( + "VirtIO PCI queue {} has invalid or missing notification register", + queue + ); + } } impl Transport for PciTransport { @@ -274,17 +336,19 @@ impl Transport for PciTransport { } fn notify(&mut self, queue: u16) { - // Safe because the common config and notify region pointers are valid and we checked in - // get_bar_region that they were aligned. - unsafe { - volwrite!(self.common_cfg, queue_select, queue); - // TODO: Consider caching this somewhere (per queue). 
- let queue_notify_off = volread!(self.common_cfg, queue_notify_off); + let queue_index = queue as usize; + let Some(index) = self + .queue_notify_indices + .get(queue_index) + .copied() + .flatten() + .or_else(|| self.cache_queue_notify_index(queue)) + else { + self.fail_bad_notify_config(queue); + }; - let offset_bytes = usize::from(queue_notify_off) * self.notify_off_multiplier as usize; - let index = offset_bytes / size_of::(); - addr_of_mut!((*self.notify_region.as_ptr())[index]).vwrite(queue); - } + // Safe because notify_region is a valid BAR mapping and the cached index is bounds-checked. + unsafe { addr_of_mut!((*self.notify_region.as_ptr())[index]).vwrite(queue) }; } fn set_status(&mut self, status: DeviceStatus) { @@ -323,6 +387,9 @@ impl Transport for PciTransport { volwrite!(self.common_cfg, queue_desc, descriptors as u64); volwrite!(self.common_cfg, queue_driver, driver_area as u64); volwrite!(self.common_cfg, queue_device, device_area as u64); + if self.cache_queue_notify_index(queue).is_none() { + self.fail_bad_notify_config(queue); + } // 这里设置队列中断对应的中断项 if matches!(*self.device.irq_type.read(), IrqType::Msix { .. 
}) { if queue == QUEUE_RECEIVE { @@ -359,6 +426,9 @@ impl Transport for PciTransport { volwrite!(self.common_cfg, queue_driver, 0); volwrite!(self.common_cfg, queue_device, 0); } + if let Some(index) = self.queue_notify_indices.get_mut(queue as usize) { + *index = None; + } } fn queue_used(&mut self, queue: u16) -> bool { diff --git a/kernel/src/filesystem/debugfs.rs b/kernel/src/filesystem/debugfs.rs new file mode 100644 index 0000000000..e7b6e8c4e4 --- /dev/null +++ b/kernel/src/filesystem/debugfs.rs @@ -0,0 +1,75 @@ +use core::any::Any; + +use alloc::sync::Arc; +use system_error::SystemError; + +use crate::debug::sysfs::debugfs_kobj; +use crate::driver::base::kobject::KObject; +use crate::filesystem::vfs::{ + FileSystem, FileSystemMakerData, FsInfo, IndexNode, Magic, MountableFileSystem, SuperBlock, + FSMAKER, +}; +use crate::register_mountable_fs; + +use linkme::distributed_slice; + +const DEBUGFS_MAX_NAMELEN: u64 = 255; +const DEBUGFS_BLOCK_SIZE: u64 = 4096; + +#[derive(Debug)] +pub struct DebugFs { + root: Arc, +} + +impl DebugFs { + fn new() -> Result, SystemError> { + let root = debugfs_kobj().inode().ok_or(SystemError::ENOENT)?; + Ok(Arc::new(Self { root })) + } +} + +impl FileSystem for DebugFs { + fn root_inode(&self) -> Arc { + self.root.clone() + } + + fn info(&self) -> FsInfo { + FsInfo { + blk_dev_id: 0, + max_name_len: DEBUGFS_MAX_NAMELEN as usize, + } + } + + fn as_any_ref(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "debugfs" + } + + fn super_block(&self) -> SuperBlock { + SuperBlock::new( + Magic::DEBUGFS_MAGIC, + DEBUGFS_BLOCK_SIZE, + DEBUGFS_MAX_NAMELEN, + ) + } +} + +impl MountableFileSystem for DebugFs { + fn make_mount_data( + _raw_data: Option<&str>, + _source: &str, + ) -> Result>, SystemError> { + Ok(None) + } + + fn make_fs( + _data: Option<&dyn FileSystemMakerData>, + ) -> Result, SystemError> { + Ok(Self::new()?) 
+ } +} + +register_mountable_fs!(DebugFs, DEBUGFSMAKER, "debugfs"); diff --git a/kernel/src/filesystem/devpts/mod.rs b/kernel/src/filesystem/devpts/mod.rs index 45e691102b..3b4dec9580 100644 --- a/kernel/src/filesystem/devpts/mod.rs +++ b/kernel/src/filesystem/devpts/mod.rs @@ -108,7 +108,6 @@ impl DevPtsFs { pub fn free_index(&self, idx: usize) { self.pts_ida.lock().free(idx); - self.pts_count.fetch_sub(1, Ordering::SeqCst); } fn install_ptmx_node(&self) { @@ -382,7 +381,12 @@ impl IndexNode for LockedDevPtsFSInode { fn unlink(&self, name: &str) -> Result<(), SystemError> { let mut guard = self.inner.lock(); - guard.children_unchecked_mut().remove(name); + if guard.children_unchecked_mut().remove(name).is_some() + && name.as_bytes().iter().all(u8::is_ascii_digit) + { + let fs = guard.fs.upgrade().unwrap(); + fs.pts_count.fetch_sub(1, Ordering::SeqCst); + } Ok(()) } } diff --git a/kernel/src/filesystem/epoll/event_poll.rs b/kernel/src/filesystem/epoll/event_poll.rs index 4f6d220f39..0d3c6a4e7e 100644 --- a/kernel/src/filesystem/epoll/event_poll.rs +++ b/kernel/src/filesystem/epoll/event_poll.rs @@ -761,13 +761,41 @@ impl EventPoll { } if let Some(removed) = epoll.ep_items.remove(&fd) { - let mut rs = epoll.ready_state.lock_irqsave(); - rs.ready_list.retain(|item| !Arc::ptr_eq(item, &removed)); + Self::remove_ready_item(epoll, &removed); } Ok(()) } + /// Remove an epoll item when the referenced file is being released. + /// + /// This is the DragonOS equivalent of Linux `eventpoll_release_file()`, + /// called from the file cleanup path rather than from `epoll_ctl(DEL)`. 
+ pub fn release_file_epitem(epitem: &Arc) { + let Some(epoll) = epitem.epoll().upgrade() else { + return; + }; + let mut epoll = epoll.lock(); + let fd = epitem.fd(); + let Some(current) = epoll.ep_items.get(&fd).cloned() else { + return; + }; + if !Arc::ptr_eq(&current, epitem) { + return; + } + + epoll.ep_items.remove(&fd); + Self::remove_ready_item(&mut epoll, &current); + } + + fn remove_ready_item(epoll: &mut MutexGuard, epitem: &Arc) { + let mut rs = epoll.ready_state.lock_irqsave(); + rs.ready_list.retain(|item| !Arc::ptr_eq(item, epitem)); + if let Some(ovflist) = rs.ovflist.as_mut() { + ovflist.retain(|item| !Arc::ptr_eq(item, epitem)); + } + } + /// ## 修改已经注册的监听事件 /// /// ### 参数 diff --git a/kernel/src/filesystem/eventfd.rs b/kernel/src/filesystem/eventfd.rs index f335afb317..71a3b7716e 100644 --- a/kernel/src/filesystem/eventfd.rs +++ b/kernel/src/filesystem/eventfd.rs @@ -264,20 +264,23 @@ impl IndexNode for EventFdInode { if val == u64::MAX { return Err(SystemError::EINVAL); } + let pollflag; loop { + let mut eventfd = self.eventfd.lock(); + // Allow write when count + val <= EVENTFD_MAX. + if EVENTFD_MAX.saturating_sub(eventfd.count) >= val { + eventfd.count += val; + pollflag = + EPollEventType::from_bits_truncate(self.do_poll(&data, &eventfd)? as u32); + break; + } + if eventfd.flags.contains(EventFdFlags::EFD_NONBLOCK) { + return Err(SystemError::EAGAIN_OR_EWOULDBLOCK); + } + drop(eventfd); if ProcessManager::current_pcb().has_pending_signal() { return Err(SystemError::ERESTARTSYS); } - { - let eventfd = self.eventfd.lock(); - // Allow write when count + val <= EVENTFD_MAX. 
- if EVENTFD_MAX.saturating_sub(eventfd.count) >= val { - break; - } - if eventfd.flags.contains(EventFdFlags::EFD_NONBLOCK) { - return Err(SystemError::EAGAIN_OR_EWOULDBLOCK); - } - } // 等待计数下降(被 read 消费) self.wait_queue.wait_event_interruptible( || { @@ -287,15 +290,8 @@ impl IndexNode for EventFdInode { None::, )?; } - let mut eventfd = self.eventfd.lock(); - eventfd.count += val; - drop(eventfd); self.wait_queue.wakeup_all(None); - let eventfd = self.eventfd.lock(); - let pollflag = EPollEventType::from_bits_truncate(self.do_poll(&data, &eventfd)? as u32); - drop(eventfd); - // 唤醒epoll中等待的进程 EventPoll::wakeup_epoll(&self.epitems, pollflag)?; return Ok(8); diff --git a/kernel/src/filesystem/ext4/inode.rs b/kernel/src/filesystem/ext4/inode.rs index 29bf06e1c9..6ab90309af 100644 --- a/kernel/src/filesystem/ext4/inode.rs +++ b/kernel/src/filesystem/ext4/inode.rs @@ -1,11 +1,11 @@ use crate::{ arch::MMArch, - driver::base::device::device_number::DeviceNumber, + driver::base::device::device_number::{DeviceNumber, Major}, filesystem::{ page_cache::{AsyncPageCacheBackend, PageCache}, vfs::{ self, syscall::RenameFlags, utils::DName, vcore::generate_inode_id, FilePrivateData, - IndexNode, InodeFlags, InodeId, InodeMode, SpecialNodeData, + IndexNode, InodeFlags, InodeId, InodeMode, SpecialNodeData, XattrFlags, }, }, ipc::pipe::LockedPipeInode, @@ -18,6 +18,7 @@ use crate::{ }; use alloc::{ collections::BTreeMap, + format, string::String, sync::{Arc, Weak}, vec::Vec, @@ -29,6 +30,8 @@ use system_error::SystemError; use super::filesystem::Ext4FileSystem; +const WHITEOUT_DEV: DeviceNumber = DeviceNumber::new(Major::UNNAMED_MAJOR, 0); + bitflags! 
{ /// Inode 脏状态标志位,对应 Linux `inode->i_state` 中的 `I_DIRTY_*` 位。 pub(super) struct InodeDirtyState: u32 { @@ -643,7 +646,7 @@ impl IndexNode for LockedExt4Inode { Ok(copy_len) } - fn setxattr(&self, name: &str, value: &[u8]) -> Result { + fn setxattr(&self, name: &str, value: &[u8], flags: XattrFlags) -> Result { let guard = self.0.lock(); let ext4 = &guard.concret_fs().fs; let inode_num = guard.inner_inode_num; @@ -652,13 +655,58 @@ impl IndexNode for LockedExt4Inode { return Err(SystemError::EPERM); } - if ext4.getxattr(inode_num, name).is_ok() { - ext4.removexattr(inode_num, name)?; + ext4.setxattr_with_flags( + inode_num, + name, + value, + flags.contains(XattrFlags::CREATE), + flags.contains(XattrFlags::REPLACE), + )?; + + Ok(0) + } + + fn listxattr(&self, buf: &mut [u8]) -> Result { + let guard = self.0.lock(); + let ext4 = &guard.concret_fs().fs; + let inode_num = guard.inner_inode_num; + + let names = ext4.listxattr(inode_num)?; + let total_len = names.iter().try_fold(0usize, |acc, name| { + acc.checked_add(name.len()) + .and_then(|len| len.checked_add(1)) + .ok_or(SystemError::E2BIG) + })?; + + if buf.is_empty() { + return Ok(total_len); + } + if buf.len() < total_len { + return Err(SystemError::ERANGE); } - // 调用another_ext4库的setxattr接口 - ext4.setxattr(inode_num, name, value)?; + let mut offset = 0; + for name in names { + let name_bytes = name.as_bytes(); + let next = offset + name_bytes.len(); + buf[offset..next].copy_from_slice(name_bytes); + buf[next] = 0; + offset = next + 1; + } + + Ok(total_len) + } + fn removexattr(&self, name: &str) -> Result { + let guard = self.0.lock(); + let ext4 = &guard.concret_fs().fs; + let inode_num = guard.inner_inode_num; + + if ext4.getattr(inode_num)?.ftype == FileType::SymLink { + return Err(SystemError::EPERM); + } + + ext4.removexattr(inode_num, name)?; Ok(0) } @@ -742,17 +790,17 @@ impl IndexNode for LockedExt4Inode { let old_dname = DName::from(old_name); let new_dname = DName::from(new_name); - // Same directory, 
same name -> no-op - if src_inode_num == target_inode_num && old_dname == new_dname { - return Ok(()); - } - // NOREPLACE check (VFS layer responsibility - ext4 lib doesn't know about flags) if flags.contains(RenameFlags::NOREPLACE) && ext4.lookup(target_inode_num, new_name).is_ok() { return Err(SystemError::EEXIST); } + // Same directory, same name -> no-op + if src_inode_num == target_inode_num && old_dname == new_dname { + return Ok(()); + } + // RENAME_EXCHANGE: 原子交换两个文件/目录 if flags.contains(RenameFlags::EXCHANGE) { // VFS 层已验证目标存在,直接调用 exchange @@ -781,8 +829,44 @@ impl IndexNode for LockedExt4Inode { } } - // ext4 library now correctly handles atomic replace - ext4.rename(src_inode_num, old_name, target_inode_num, new_name)?; + if flags.contains(RenameFlags::WHITEOUT) { + let mut temp_name = String::new(); + for _ in 0..32 { + let candidate = format!(".dragonos-whiteout-{}", generate_inode_id().data()); + if ext4.lookup(src_inode_num, &candidate).is_ok() { + continue; + } + ext4.mknod( + src_inode_num, + &candidate, + another_ext4::InodeMode::CHARDEV + | another_ext4::InodeMode::from_bits_retain(0o600), + WHITEOUT_DEV.major().data(), + WHITEOUT_DEV.minor(), + )?; + temp_name = candidate; + break; + } + if temp_name.is_empty() { + return Err(SystemError::EEXIST); + } + + if let Err(err) = + ext4.rename_exchange(src_inode_num, old_name, src_inode_num, &temp_name) + { + let _ = ext4.unlink(src_inode_num, &temp_name); + return Err(err.into()); + } + + if let Err(err) = ext4.rename(src_inode_num, &temp_name, target_inode_num, new_name) { + let _ = ext4.rename_exchange(src_inode_num, old_name, src_inode_num, &temp_name); + let _ = ext4.unlink(src_inode_num, &temp_name); + return Err(err.into()); + } + } else { + // ext4 library now correctly handles atomic replace + ext4.rename(src_inode_num, old_name, target_inode_num, new_name)?; + } // Update cache self.update_rename_cache( diff --git a/kernel/src/filesystem/fat/fs.rs b/kernel/src/filesystem/fat/fs.rs index 
ef0fa6635e..332f926376 100644 --- a/kernel/src/filesystem/fat/fs.rs +++ b/kernel/src/filesystem/fat/fs.rs @@ -50,6 +50,7 @@ use super::{ const FAT_MAX_NAMELEN: u64 = 255; const FAT_LRU_CACHE_SIZE: usize = 4096; +const FAT_NEGATIVE_CHILDREN_CACHE_SIZE: usize = 256; /// FAT32文件系统的最大的文件大小 pub const MAX_FILE_SIZE: u64 = 0xffff_ffff; @@ -145,6 +146,9 @@ pub struct FATInode { /// 子Inode的map. 该数据结构用作缓存区。其中,它的key表示inode的名称。 /// 请注意,由于FAT的查询过程对大小写不敏感,因此我们选择让key全部是大写的,方便统一操作。 children: HashMap>, + /// 有界负向目录项缓存。FAT 没有全局 VFS dcache;动态链接器等热路径会反复探测 + /// 不存在的 hwcaps/tls 路径。缓存 ENOENT 可以避免每次 miss 都重新线性扫描目录。 + negative_children: LruCache, /// 当前inode的元数据 metadata: Metadata, /// 指向inode所在的文件系统对象的指针 @@ -201,6 +205,25 @@ impl FATInode { // log::warn!("update_time has not yet been implemented"); } + fn negative_children_cache() -> LruCache { + LruCache::new(NonZeroUsize::new(FAT_NEGATIVE_CHILDREN_CACHE_SIZE).unwrap()) + } + + fn invalidate_negative_children(&mut self) { + self.negative_children = Self::negative_children_cache(); + } + + fn mark_child_negative(&mut self, search_name: String) { + self.negative_children.put(search_name, ()); + } + + fn mark_child_absent(&mut self, name: &str) { + let search_name = to_search_name(name); + self.children.remove(&search_name); + self.invalidate_negative_children(); + self.mark_child_negative(search_name); + } + fn find(&mut self, name: &str) -> Result, SystemError> { match &self.inode_type { FATDirEntry::Dir(d) => { @@ -209,10 +232,20 @@ impl FATInode { if let Some(entry) = self.children.get(&search_name) { return Ok(entry.clone()); } + if self.negative_children.get(&search_name).is_some() { + return Err(SystemError::ENOENT); + } // 在缓存区找不到 // 在磁盘查找 let fat_entry: FATDirEntry = - d.find_entry(name, None, None, self.fs.upgrade().unwrap())?; + match d.find_entry(name, None, None, self.fs.upgrade().unwrap()) { + Ok(entry) => entry, + Err(SystemError::ENOENT) => { + self.mark_child_negative(search_name); + return Err(SystemError::ENOENT); + } + 
Err(e) => return Err(e), + }; let dname = DName::from(name); // 创建新的inode let entry_inode: Arc = LockedFATInode::new( @@ -222,7 +255,9 @@ impl FATInode { fat_entry, ); // 加入缓存区, 由于FAT文件系统的大小写不敏感问题,因此存入缓存区的key应当是全大写的 - self.children.insert(search_name, entry_inode.clone()); + self.children + .insert(search_name.clone(), entry_inode.clone()); + self.negative_children.pop(&search_name); return Ok(entry_inode); } FATDirEntry::UnInit => { @@ -255,6 +290,7 @@ impl LockedFATInode { parent, self_ref: Weak::default(), children: HashMap::new(), + negative_children: FATInode::negative_children_cache(), fs: Arc::downgrade(&fs), inode_type, metadata: Metadata { @@ -309,10 +345,25 @@ impl LockedFATInode { new_name: &str, flags: RenameFlags, ) -> Result<(), SystemError> { + if flags.contains(RenameFlags::WHITEOUT) { + return Err(SystemError::EINVAL); + } if old_name == new_name { + if flags.contains(RenameFlags::NOREPLACE) { + return Err(SystemError::EEXIST); + } return Ok(()); } + let old_key = to_search_name(old_name); + let new_key = to_search_name(new_name); let mut guard = self.0.lock(); + if old_key == new_key { + guard.find(old_name)?; + if flags.contains(RenameFlags::NOREPLACE) { + return Err(SystemError::EEXIST); + } + return Ok(()); + } let old_inode = guard.find(old_name)?; let new_inode = guard.find(new_name).ok(); if flags.contains(RenameFlags::NOREPLACE) && new_inode.is_some() { @@ -342,9 +393,11 @@ impl LockedFATInode { }; // remove entries old_inode_guard.inode_type = old_dir.rename(fs, old_name, new_name, new_inode)?; - let old_inode = guard.children.remove(&to_search_name(old_name)).unwrap(); + let old_inode = guard.children.remove(&old_key).unwrap(); // the new_name should refer to old_inode - guard.children.insert(to_search_name(new_name), old_inode); + guard.invalidate_negative_children(); + guard.mark_child_negative(old_key); + guard.children.insert(new_key, old_inode); Ok(()) } @@ -357,6 +410,9 @@ impl LockedFATInode { target: &Arc, flags: RenameFlags, ) -> 
Result<(), SystemError> { + if flags.contains(RenameFlags::WHITEOUT) { + return Err(SystemError::EINVAL); + } let mut old_guard = self.0.lock(); let other: &LockedFATInode = target .downcast_ref::() @@ -406,13 +462,13 @@ impl LockedFATInode { old_inode_guard.inode_type = old_dir.rename_across(fs, new_dir, old_name, new_name, new_inode)?; // 将源节点从父目录中删除 - let old_inode = old_guard - .children - .remove(&to_search_name(old_name)) - .unwrap(); - new_guard - .children - .insert(to_search_name(new_name), old_inode); + let old_key = to_search_name(old_name); + let new_key = to_search_name(new_name); + let old_inode = old_guard.children.remove(&old_key).unwrap(); + old_guard.invalidate_negative_children(); + old_guard.mark_child_negative(old_key); + new_guard.invalidate_negative_children(); + new_guard.children.insert(new_key, old_inode); Ok(()) } } @@ -668,6 +724,7 @@ impl FATFileSystem { parent: Weak::default(), self_ref: Weak::default(), children: HashMap::new(), + negative_children: FATInode::negative_children_cache(), fs: Weak::default(), inode_type: FATDirEntry::UnInit, metadata: Metadata { @@ -1942,26 +1999,29 @@ impl IndexNode for LockedFATInode { FATDirEntry::Dir(d) => match file_type { FileType::File => { d.create_file(name, fs)?; + guard.invalidate_negative_children(); return Ok(guard.find(name)?); } FileType::Dir => { d.create_dir(name, fs)?; + guard.invalidate_negative_children(); + let child = guard.find(name)?; // 刚创建的目录,确保自身 nlink >= 2,并更新父目录 nlink - if let Some(child) = guard.children.get(&to_search_name(name)) { - let mut child_md = child.0.lock(); - if child_md.metadata.nlinks < 2 { - child_md.metadata.nlinks = 2; - } + let mut child_md = child.0.lock(); + if child_md.metadata.nlinks < 2 { + child_md.metadata.nlinks = 2; } + drop(child_md); // 父目录因为新增子目录,多一个链接(来自子目录的 "..") guard.metadata.nlinks += 1; - return Ok(guard.find(name)?); + return Ok(child); } FileType::SymLink => return Err(SystemError::ENOSYS), FileType::Socket => { d.create_file(name, fs)?; 
+ guard.invalidate_negative_children(); return Ok(guard.find(name)?); } @@ -2137,6 +2197,7 @@ impl IndexNode for LockedFATInode { ent, ); // 加入缓存区, 由于FAT文件系统的大小写不敏感问题,因此存入缓存区的key应当是全大写的 + guard.negative_children.pop(&search_name); guard.children.insert(search_name, entry_inode.clone()); } } @@ -2179,6 +2240,8 @@ impl IndexNode for LockedFATInode { if nod.is_some() { let file_type = target_guard.metadata.file_type; if file_type == FileType::Pipe { + guard.invalidate_negative_children(); + guard.mark_child_negative(to_search_name(name)); return Ok(()); } } @@ -2199,6 +2262,9 @@ impl IndexNode for LockedFATInode { // 再从磁盘删除 let r = dir.remove(guard.fs.upgrade().unwrap().clone(), name, true); drop(target_guard); + if r.is_ok() { + guard.mark_child_absent(name); + } return r; } @@ -2232,6 +2298,7 @@ impl IndexNode for LockedFATInode { if guard.metadata.nlinks > 0 { guard.metadata.nlinks -= 1; } + guard.mark_child_absent(name); return Ok(()); } Err(r) => { @@ -2366,7 +2433,9 @@ impl IndexNode for LockedFATInode { return Err(SystemError::EINVAL); } - inode.children.insert(to_search_name(filename), nod.clone()); + let search_name = to_search_name(filename); + inode.invalidate_negative_children(); + inode.children.insert(search_name, nod.clone()); Ok(nod) } diff --git a/kernel/src/filesystem/fuse/conn.rs b/kernel/src/filesystem/fuse/conn.rs index 45db032822..25d40064e7 100644 --- a/kernel/src/filesystem/fuse/conn.rs +++ b/kernel/src/filesystem/fuse/conn.rs @@ -25,12 +25,12 @@ use super::protocol::{ FuseInterruptIn, FuseOutHeader, FuseWriteIn, FUSE_ABORT_ERROR, FUSE_ASYNC_DIO, FUSE_ASYNC_READ, FUSE_ATOMIC_O_TRUNC, FUSE_AUTO_INVAL_DATA, FUSE_BIG_WRITES, FUSE_DESTROY, FUSE_DONT_MASK, FUSE_DO_READDIRPLUS, FUSE_EXPLICIT_INVAL_DATA, FUSE_EXPORT_SUPPORT, FUSE_FLUSH, FUSE_FORGET, - FUSE_FSYNC, FUSE_FSYNCDIR, FUSE_HANDLE_KILLPRIV, FUSE_INIT, FUSE_INIT_EXT, FUSE_INTERRUPT, - FUSE_KERNEL_MINOR_VERSION, FUSE_KERNEL_VERSION, FUSE_LOOKUP, FUSE_MAX_PAGES, - FUSE_MIN_READ_BUFFER, 
FUSE_NOTIFY_DELETE, FUSE_NOTIFY_INVAL_ENTRY, FUSE_NOTIFY_INVAL_INODE, - FUSE_NOTIFY_POLL, FUSE_NOTIFY_RETRIEVE, FUSE_NOTIFY_STORE, FUSE_NO_OPENDIR_SUPPORT, - FUSE_NO_OPEN_SUPPORT, FUSE_PARALLEL_DIROPS, FUSE_POSIX_ACL, FUSE_POSIX_LOCKS, - FUSE_READDIRPLUS_AUTO, FUSE_SUBMOUNTS, + FUSE_FSYNC, FUSE_FSYNCDIR, FUSE_GETXATTR, FUSE_HANDLE_KILLPRIV, FUSE_INIT, FUSE_INIT_EXT, + FUSE_INTERRUPT, FUSE_KERNEL_MINOR_VERSION, FUSE_KERNEL_VERSION, FUSE_LISTXATTR, FUSE_LOOKUP, + FUSE_MAX_PAGES, FUSE_MIN_READ_BUFFER, FUSE_NOTIFY_DELETE, FUSE_NOTIFY_INVAL_ENTRY, + FUSE_NOTIFY_INVAL_INODE, FUSE_NOTIFY_POLL, FUSE_NOTIFY_RETRIEVE, FUSE_NOTIFY_STORE, + FUSE_NO_OPENDIR_SUPPORT, FUSE_NO_OPEN_SUPPORT, FUSE_PARALLEL_DIROPS, FUSE_POSIX_ACL, + FUSE_POSIX_LOCKS, FUSE_READDIRPLUS_AUTO, FUSE_REMOVEXATTR, FUSE_SETXATTR, FUSE_SUBMOUNTS, }; fn wait_with_recheck(waitq: &WaitQueue, mut check: F) -> Result @@ -177,6 +177,10 @@ struct FuseConnInner { no_flush: bool, no_fsync: bool, no_fsyncdir: bool, + no_getxattr: bool, + no_setxattr: bool, + no_listxattr: bool, + no_removexattr: bool, no_interrupt: bool, max_write_cap: usize, pending: VecDeque>, @@ -199,6 +203,7 @@ impl FuseConn { // Keep this in sync with `sys_read.rs` userspace chunking size. 
const USER_READ_CHUNK: usize = 64 * 1024; const MIN_MAX_WRITE: usize = 4096; + const DEFAULT_MAX_READAHEAD: usize = 128 * MMArch::PAGE_SIZE; pub fn new() -> Arc { Self::new_with_max_write_cap( @@ -236,6 +241,10 @@ impl FuseConn { no_flush: false, no_fsync: false, no_fsyncdir: false, + no_getxattr: false, + no_setxattr: false, + no_listxattr: false, + no_removexattr: false, no_interrupt: false, max_write_cap, pending: VecDeque::new(), @@ -376,6 +385,28 @@ impl FuseConn { } } + pub fn no_xattr(&self, opcode: u32) -> bool { + let g = self.inner.lock(); + match opcode { + FUSE_GETXATTR => g.no_getxattr, + FUSE_SETXATTR => g.no_setxattr, + FUSE_LISTXATTR => g.no_listxattr, + FUSE_REMOVEXATTR => g.no_removexattr, + _ => false, + } + } + + pub fn mark_no_xattr(&self, opcode: u32) { + let mut g = self.inner.lock(); + match opcode { + FUSE_GETXATTR => g.no_getxattr = true, + FUSE_SETXATTR => g.no_setxattr = true, + FUSE_LISTXATTR => g.no_listxattr = true, + FUSE_REMOVEXATTR => g.no_removexattr = true, + _ => {} + } + } + fn alloc_unique(&self) -> u64 { self.next_unique.fetch_add(2, Ordering::Relaxed) } @@ -459,7 +490,12 @@ impl FuseConn { let mut g = self.inner.lock(); should_destroy = g.connected && g.initialized; g.mounted = false; - g.pending.clear(); + // Filesystem teardown queues accumulated FORGET requests before + // the connection enters on_umount(). Preserve those no-reply + // requests so the daemon can release lookup references before + // it receives DESTROY; drop ordinary requests that can no longer + // complete after unmount. 
+ g.pending.retain(|req| req.opcode == FUSE_FORGET); processing = g.processing.values().cloned().collect(); g.processing.clear(); } @@ -697,7 +733,7 @@ impl FuseConn { let init_in = FuseInitIn { major: FUSE_KERNEL_VERSION, minor: FUSE_KERNEL_MINOR_VERSION, - max_readahead: 0, + max_readahead: Self::DEFAULT_MAX_READAHEAD as u32, flags: flags as u32, flags2: (flags >> 32) as u32, unused: [0; 11], @@ -1076,6 +1112,10 @@ impl FuseConn { (opcode, SystemError::from_i32(errno)), (FUSE_LOOKUP, Some(SystemError::ENOENT)) | (FUSE_FLUSH, Some(SystemError::ENOSYS)) + | (FUSE_GETXATTR, Some(SystemError::ENOSYS)) + | (FUSE_SETXATTR, Some(SystemError::ENOSYS)) + | (FUSE_LISTXATTR, Some(SystemError::ENOSYS)) + | (FUSE_REMOVEXATTR, Some(SystemError::ENOSYS)) | (FUSE_INTERRUPT, Some(SystemError::EAGAIN_OR_EWOULDBLOCK)) ) } @@ -1102,4 +1142,19 @@ impl FuseConn { let g = self.inner.lock(); core::cmp::max(Self::MIN_MAX_WRITE, g.max_read as usize) } + + pub fn max_pages(&self) -> usize { + let g = self.inner.lock(); + core::cmp::max(1, g.init.max_pages as usize) + } + + pub fn max_readahead_pages(&self) -> usize { + let g = self.inner.lock(); + let bytes = if g.init.max_readahead == 0 { + Self::DEFAULT_MAX_READAHEAD + } else { + g.init.max_readahead as usize + }; + core::cmp::max(1, bytes >> MMArch::PAGE_SHIFT) + } } diff --git a/kernel/src/filesystem/fuse/fs.rs b/kernel/src/filesystem/fuse/fs.rs index 273fdb5037..334fe17176 100644 --- a/kernel/src/filesystem/fuse/fs.rs +++ b/kernel/src/filesystem/fuse/fs.rs @@ -3,6 +3,7 @@ use alloc::{ sync::{Arc, Weak}, vec::Vec, }; +use core::sync::atomic::{AtomicU8, Ordering}; use system_error::SystemError; use crate::{ @@ -65,11 +66,42 @@ pub struct FuseFS { super_block: SuperBlock, conn: Arc, nodes: Mutex>>, + retired_nodes: Mutex>>, + state: AtomicU8, default_permissions: bool, is_submount: bool, } impl FuseFS { + const STATE_ACTIVE: u8 = 0; + const STATE_TEARING_DOWN: u8 = 1; + const STATE_DEAD: u8 = 2; + + fn should_retire_node( + node: &Arc, + 
generation: Option, + cached: Option<&Metadata>, + ) -> bool { + if let Some(gen) = generation { + let old_gen = node.generation(); + if old_gen != 0 && old_gen != gen { + return true; + } + } + if let (Some(old_type), Some(md)) = (node.cached_file_type(), cached) { + if old_type != md.file_type { + return true; + } + } + false + } + + fn retire_stale_node(&self, node: Arc) { + self.retired_nodes.lock().push(Arc::downgrade(&node)); + node.clear_lookup_cache_tree(); + node.clear_parent(); + } + fn parse_opt_u32_decimal(v: &str) -> Result { v.parse::().map_err(|_| SystemError::EINVAL) } @@ -158,157 +190,183 @@ impl FuseFS { pub fn get_or_create_node( self: &Arc, nodeid: u64, - parent_nodeid: u64, + parent: Option>, cached: Option, - ) -> Arc { - self.get_or_create_node_with_generation(nodeid, parent_nodeid, cached, None) + ) -> Result, SystemError> { + self.get_or_create_node_with_generation(nodeid, parent, cached, None, 0) } pub fn get_or_create_node_with_generation( self: &Arc, nodeid: u64, - parent_nodeid: u64, + parent: Option>, cached: Option, generation: Option, - ) -> Arc { + lookup_refs: u64, + ) -> Result, SystemError> { if nodeid == self.root.nodeid() { - return self.root.clone(); + if parent.is_some() || lookup_refs != 0 { + return Err(SystemError::EIO); + } + return Ok(self.root.clone()); } + let parent_nodeid = parent + .as_ref() + .map(|node| node.nodeid()) + .unwrap_or(FUSE_ROOT_ID); - let mut nodes = self.nodes.lock(); - if let Some(w) = nodes.get(&nodeid) { - if let Some(n) = w.upgrade() { + let mut stale_node = None; + let node = { + let mut nodes = self.nodes.lock(); + if self.state.load(Ordering::Acquire) != Self::STATE_ACTIVE { + return Err(SystemError::ESHUTDOWN); + } + if let Some(n) = nodes.get(&nodeid).and_then(|node| node.upgrade()) { if let Some(gen) = generation { - let old_gen = n.generation(); - if old_gen != 0 && old_gen != gen { + if Self::should_retire_node(&n, Some(gen), cached.as_ref()) { n.mark_stale(); nodes.remove(&nodeid); + 
stale_node = Some(n); } else { n.set_generation(gen); n.set_parent_nodeid(parent_nodeid); + n.set_parent_if_absent(parent); if let Some(md) = cached { n.set_cached_metadata(md); } - return n; + n.inc_lookup(lookup_refs); + return Ok(n); } } else { n.set_parent_nodeid(parent_nodeid); + n.set_parent_if_absent(parent); if let Some(md) = cached { n.set_cached_metadata(md); } - return n; + n.inc_lookup(lookup_refs); + return Ok(n); } + } else { + nodes.remove(&nodeid); } - } - let n = FuseNode::new( - Arc::downgrade(self), - self.conn.clone(), - nodeid, - parent_nodeid, - cached, - ); - if let Some(gen) = generation { - n.set_generation(gen); - } - nodes.insert(nodeid, Arc::downgrade(&n)); - n - } - - pub(crate) fn find_cached_child( - self: &Arc, - parent_nodeid: u64, - name: &str, - ) -> Option> { - let mut stale = Vec::new(); - let nodes = self.nodes.lock(); - for (nodeid, weak) in nodes.iter() { - let Some(node) = weak.upgrade() else { - stale.push(*nodeid); - continue; - }; - if node.parent_fuse_nodeid() == parent_nodeid && node.has_dname(name) { - return Some(node); - } - } - drop(nodes); - if !stale.is_empty() { - let mut nodes = self.nodes.lock(); - for nodeid in stale { - nodes.remove(&nodeid); + let n = FuseNode::new( + Arc::downgrade(self), + self.conn.clone(), + nodeid, + parent_nodeid, + parent, + cached, + ); + if let Some(gen) = generation { + n.set_generation(gen); } + n.inc_lookup(lookup_refs); + nodes.insert(nodeid, Arc::downgrade(&n)); + n + }; + if let Some(stale_node) = stale_node { + self.retire_stale_node(stale_node); } - None + Ok(node) } pub(crate) fn get_or_create_node_for_link( self: &Arc, nodeid: u64, - parent_nodeid: u64, + parent: Option>, cached: Option, generation: Option, - ) -> Arc { + lookup_refs: u64, + ) -> Result, SystemError> { if nodeid == self.root.nodeid() { - return self.root.clone(); + if parent.is_some() || lookup_refs != 0 { + return Err(SystemError::EIO); + } + return Ok(self.root.clone()); } + let parent_nodeid = parent + 
.as_ref() + .map(|node| node.nodeid()) + .unwrap_or(FUSE_ROOT_ID); - let mut nodes = self.nodes.lock(); - if let Some(w) = nodes.get(&nodeid) { - if let Some(n) = w.upgrade() { + let mut stale_node = None; + let node = { + let mut nodes = self.nodes.lock(); + if self.state.load(Ordering::Acquire) != Self::STATE_ACTIVE { + return Err(SystemError::ESHUTDOWN); + } + if let Some(n) = nodes.get(&nodeid).and_then(|node| node.upgrade()) { if let Some(gen) = generation { - let old_gen = n.generation(); - if old_gen != 0 && old_gen != gen { + if Self::should_retire_node(&n, Some(gen), cached.as_ref()) { n.mark_stale(); nodes.remove(&nodeid); + stale_node = Some(n); } else { n.set_generation(gen); + n.set_parent_if_absent(parent); if let Some(md) = cached { n.set_cached_metadata(md); } - return n; + n.inc_lookup(lookup_refs); + return Ok(n); } } else { + n.set_parent_if_absent(parent); if let Some(md) = cached { n.set_cached_metadata(md); } - return n; + n.inc_lookup(lookup_refs); + return Ok(n); } + } else { + nodes.remove(&nodeid); } - } - let n = FuseNode::new( - Arc::downgrade(self), - self.conn.clone(), - nodeid, - parent_nodeid, - cached, - ); - if let Some(gen) = generation { - n.set_generation(gen); + let n = FuseNode::new( + Arc::downgrade(self), + self.conn.clone(), + nodeid, + parent_nodeid, + parent, + cached, + ); + if let Some(gen) = generation { + n.set_generation(gen); + } + n.inc_lookup(lookup_refs); + nodes.insert(nodeid, Arc::downgrade(&n)); + n + }; + if let Some(stale_node) = stale_node { + self.retire_stale_node(stale_node); } - nodes.insert(nodeid, Arc::downgrade(&n)); - n + Ok(node) } /// 为 virtiofs announce-submounts 创建子挂载树(共享同一 FuseConn)。 pub fn new_submount( parent: &Arc, + root_parent: Arc, root_nodeid: u64, - parent_nodeid: u64, root_md: Metadata, ) -> Arc { let conn = parent.conn.clone(); + let parent_nodeid = root_parent.nodeid(); let fs = Arc::new_cyclic(|weak| FuseFS { root: FuseNode::new( weak.clone(), conn.clone(), root_nodeid, 
parent_nodeid, + Some(root_parent), Some(root_md), ), super_block: parent.super_block.clone(), conn, nodes: Mutex::new(BTreeMap::new()), + retired_nodes: Mutex::new(Vec::new()), + state: AtomicU8::new(Self::STATE_ACTIVE), default_permissions: parent.default_permissions, is_submount: true, }); @@ -317,6 +375,66 @@ impl FuseFS { .insert(root_nodeid, Arc::downgrade(&fs.root)); fs } + + fn teardown_nodes(&self) { + if self + .state + .compare_exchange( + Self::STATE_ACTIVE, + Self::STATE_TEARING_DOWN, + Ordering::AcqRel, + Ordering::Acquire, + ) + .is_err() + { + return; + } + + let live_nodes: Vec> = { + let mut nodes = self.nodes.lock(); + let mut live = Vec::new(); + live.push(self.root.clone()); + nodes.retain(|_, node| { + if let Some(node) = node.upgrade() { + live.push(node); + true + } else { + false + } + }); + live + }; + let retired_nodes: Vec> = { + let mut retired = self.retired_nodes.lock(); + let mut live = Vec::new(); + retired.retain(|node| { + if let Some(node) = node.upgrade() { + live.push(node); + true + } else { + false + } + }); + live + }; + + for node in live_nodes.iter().chain(retired_nodes.iter()) { + node.mark_stale(); + } + for node in live_nodes.iter().chain(retired_nodes.iter()) { + node.clear_lookup_cache_tree(); + } + for node in live_nodes.iter().chain(retired_nodes.iter()) { + node.flush_forget(); + } + for node in live_nodes.iter().chain(retired_nodes.iter()) { + node.clear_parent(); + } + + self.nodes.lock().clear(); + self.retired_nodes.lock().clear(); + self.state.store(Self::STATE_DEAD, Ordering::Release); + } } /// DragonOS currently mounts announced FUSE submounts eagerly at lookup time. 
@@ -360,12 +478,7 @@ pub fn fuse_try_automount_submount( } let parent_fs = fuse_node.fuse_fs().ok_or(SystemError::ENOENT)?; - let sub_fs = FuseFS::new_submount( - &parent_fs, - fuse_node.nodeid(), - fuse_node.parent_fuse_nodeid(), - md, - ); + let sub_fs = FuseFS::new_submount(&parent_fs, fuse_node.clone(), fuse_node.nodeid(), md); let mount_path = match mount_path_override { Some(path) => path, None => { @@ -470,14 +583,20 @@ impl MountableFileSystem for FuseFS { conn.clone(), FUSE_ROOT_ID, FUSE_ROOT_ID, + None, Some(root_md), ), super_block, conn: conn.clone(), nodes: Mutex::new(BTreeMap::new()), + retired_nodes: Mutex::new(Vec::new()), + state: AtomicU8::new(Self::STATE_ACTIVE), default_permissions: mount_data.default_permissions, is_submount: false, }); + fs.nodes + .lock() + .insert(FUSE_ROOT_ID, Arc::downgrade(&fs.root)); if let Err(e) = conn.enqueue_init() { conn.rollback_mount_setup(); @@ -586,6 +705,12 @@ impl FileSystem for FuseFS { .map(|cache| !cache.is_page_ready(page_index)) .unwrap_or(true); + if major { + let mut ra_state = file.get_ra_state(); + let _ = node.mmap_readahead_with_open(page_index, 1, &mut ra_state, fh, file_flags); + let _ = file.set_ra_state(ra_state); + } + match node.fault_page_with_open(page_index, fh, file_flags) { Ok(page) => { pfm.set_page(page); @@ -659,18 +784,11 @@ impl FileSystem for FuseFS { start_pgoff: usize, end_pgoff: usize, ) -> VmFaultReason { - let _ = (pfm, start_pgoff, end_pgoff); - VmFaultReason::VM_FAULT_SIGBUS + PageFaultHandler::filemap_map_pages(pfm, start_pgoff, end_pgoff) } fn on_umount(&self) { - let live_nodes: Vec> = { - let nodes = self.nodes.lock(); - nodes.values().filter_map(|w| w.upgrade()).collect() - }; - for node in live_nodes { - node.flush_forget(); - } + self.teardown_nodes(); if !self.is_submount { self.conn.on_umount(); } diff --git a/kernel/src/filesystem/fuse/inode.rs b/kernel/src/filesystem/fuse/inode.rs index 0e20c64c78..9372f84dea 100644 --- a/kernel/src/filesystem/fuse/inode.rs +++ 
b/kernel/src/filesystem/fuse/inode.rs @@ -1,9 +1,10 @@ use alloc::{ + collections::BTreeMap, string::{String, ToString}, sync::{Arc, Weak}, vec::Vec, }; -use core::mem::size_of; +use core::mem::{replace, size_of, take}; use core::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}; use system_error::SystemError; @@ -20,14 +21,14 @@ use crate::{ syscall::RenameFlags, utils::DName, FilePrivateData, FileSystem, FileType, IndexNode, InodeFlags, InodeId, InodeMode, - Metadata, + Metadata, XattrFlags, }, }, libs::{ casting::DowncastArc, mutex::{Mutex, MutexGuard}, }, - mm::MemoryManagementArch, + mm::{readahead::FileReadaheadState, MemoryManagementArch}, time::PosixTimeSpec, }; @@ -40,16 +41,17 @@ use super::{ protocol::{ fuse_pack_struct, fuse_read_struct, FuseAccessIn, FuseAttr, FuseAttrOut, FuseCreateIn, FuseDirent, FuseDirentPlus, FuseEntryOut, FuseFallocateIn, FuseFlushIn, FuseFsyncIn, - FuseGetattrIn, FuseLinkIn, FuseMkdirIn, FuseMknodIn, FuseOpenIn, FuseOpenOut, FuseReadIn, - FuseReleaseIn, FuseRename2In, FuseRenameIn, FuseSetattrIn, FuseWriteIn, FuseWriteOut, - FATTR_ATIME, FATTR_CTIME, FATTR_FH, FATTR_GID, FATTR_LOCKOWNER, FATTR_MODE, FATTR_MTIME, - FATTR_SIZE, FATTR_UID, FOPEN_DIRECT_IO, FOPEN_KEEP_CACHE, FOPEN_NOFLUSH, FOPEN_NONSEEKABLE, - FOPEN_STREAM, FUSE_ACCESS, FUSE_CREATE, FUSE_FALLOCATE, FUSE_FLUSH, FUSE_FSYNC, - FUSE_FSYNCDIR, FUSE_FSYNC_FDATASYNC, FUSE_GETATTR, FUSE_LINK, FUSE_LOOKUP, FUSE_MKDIR, - FUSE_MKNOD, FUSE_OPEN, FUSE_OPENDIR, FUSE_READ, FUSE_READDIR, FUSE_READDIRPLUS, - FUSE_READLINK, FUSE_READ_LOCKOWNER, FUSE_RELEASE, FUSE_RELEASEDIR, FUSE_RENAME, - FUSE_RENAME2, FUSE_RMDIR, FUSE_ROOT_ID, FUSE_SETATTR, FUSE_SYMLINK, FUSE_UNLINK, - FUSE_WRITE, FUSE_WRITE_CACHE, FUSE_WRITE_LOCKOWNER, + FuseGetattrIn, FuseGetxattrIn, FuseGetxattrOut, FuseLinkIn, FuseMkdirIn, FuseMknodIn, + FuseOpenIn, FuseOpenOut, FuseReadIn, FuseReleaseIn, FuseRename2In, FuseRenameIn, + FuseSetattrIn, FuseSetxattrInCompat, FuseWriteIn, FuseWriteOut, FATTR_ATIME, 
FATTR_CTIME, + FATTR_FH, FATTR_GID, FATTR_LOCKOWNER, FATTR_MODE, FATTR_MTIME, FATTR_SIZE, FATTR_UID, + FOPEN_DIRECT_IO, FOPEN_KEEP_CACHE, FOPEN_NOFLUSH, FOPEN_NONSEEKABLE, FOPEN_STREAM, + FUSE_ACCESS, FUSE_CREATE, FUSE_FALLOCATE, FUSE_FLUSH, FUSE_FSYNC, FUSE_FSYNCDIR, + FUSE_FSYNC_FDATASYNC, FUSE_GETATTR, FUSE_GETXATTR, FUSE_LINK, FUSE_LISTXATTR, FUSE_LOOKUP, + FUSE_MKDIR, FUSE_MKNOD, FUSE_OPEN, FUSE_OPENDIR, FUSE_READ, FUSE_READDIR, FUSE_READDIRPLUS, + FUSE_READLINK, FUSE_READ_LOCKOWNER, FUSE_RELEASE, FUSE_RELEASEDIR, FUSE_REMOVEXATTR, + FUSE_RENAME, FUSE_RENAME2, FUSE_RMDIR, FUSE_ROOT_ID, FUSE_SETATTR, FUSE_SETXATTR, + FUSE_SYMLINK, FUSE_UNLINK, FUSE_WRITE, FUSE_WRITE_CACHE, FUSE_WRITE_LOCKOWNER, }, }; @@ -60,10 +62,12 @@ pub struct FuseNode { self_ref: Weak, nodeid: u64, parent_nodeid: Mutex, + parent: Mutex>>, name: Mutex>, cached_metadata: Mutex>, page_cache: Mutex>>, writeback_handles: Mutex>>, + lookup_cache: Mutex>, direct_io_lock: Mutex<()>, cached_metadata_deadline_ns: AtomicU64, lookup_count: AtomicU64, @@ -74,6 +78,13 @@ pub struct FuseNode { stale: AtomicBool, } +#[derive(Debug, Clone)] +struct FuseLookupCacheEntry { + child: Arc, + generation: u64, + deadline_ns: u64, +} + #[derive(Debug)] struct FusePageCacheBackend { node: Weak, @@ -113,12 +124,16 @@ impl PageCacheBackend for FusePageCacheBackend { impl FuseNode { const FUSE_DIRENT_ALIGN: usize = 8; + const LOOKUP_CACHE_MAX_ENTRIES: usize = 1024; + const XATTR_SIZE_MAX: usize = 65536; + const XATTR_LIST_MAX: usize = 65536; pub fn new( fs: Weak, conn: Arc, nodeid: u64, parent_nodeid: u64, + parent: Option>, cached: Option, ) -> Arc { let has_cached = cached.is_some(); @@ -128,10 +143,12 @@ impl FuseNode { self_ref: self_ref.clone(), nodeid, parent_nodeid: Mutex::new(parent_nodeid), + parent: Mutex::new(parent), name: Mutex::new(None), cached_metadata: Mutex::new(cached), page_cache: Mutex::new(None), writeback_handles: Mutex::new(Vec::new()), + lookup_cache: Mutex::new(BTreeMap::new()), 
direct_io_lock: Mutex::new(()), cached_metadata_deadline_ns: AtomicU64::new(if has_cached { u64::MAX } else { 0 }), lookup_count: AtomicU64::new(0), @@ -157,7 +174,7 @@ impl FuseNode { self.stale.store(true, Ordering::Release); } - fn check_not_stale(&self) -> Result<(), SystemError> { + pub(crate) fn check_not_stale(&self) -> Result<(), SystemError> { if self.stale.load(Ordering::Acquire) { return Err(SystemError::ESTALE); } @@ -187,6 +204,37 @@ impl FuseNode { *self.parent_nodeid.lock() = parent; } + pub(crate) fn set_parent_if_absent(&self, parent: Option>) { + let Some(parent) = parent else { + return; + }; + if parent.nodeid() == self.nodeid { + return; + } + let mut guard = self.parent.lock(); + if guard.is_none() { + *guard = Some(parent); + } + } + + pub(crate) fn set_parent(&self, parent: Option>) { + if parent + .as_ref() + .is_some_and(|parent| parent.nodeid() == self.nodeid) + { + return; + } + *self.parent.lock() = parent; + } + + pub(crate) fn clear_parent(&self) { + *self.parent.lock() = None; + } + + pub(crate) fn cached_file_type(&self) -> Option { + self.cached_metadata.lock().as_ref().map(|md| md.file_type) + } + pub fn set_cached_metadata(&self, md: Metadata) { *self.cached_metadata.lock() = Some(md); self.cached_metadata_deadline_ns @@ -236,6 +284,146 @@ impl FuseNode { Self::now_ns().saturating_add(delta_ns) } + fn cache_lookup_child( + &self, + name: &str, + child: &Arc, + generation: u64, + valid: u64, + valid_nsec: u32, + ) { + if child.nodeid() == self.nodeid { + return; + } + let deadline_ns = Self::cache_deadline(valid, valid_nsec); + self.prune_lookup_cache(); + + let mut removed = Vec::new(); + { + let mut cache = self.lookup_cache.lock(); + if deadline_ns == 0 { + if let Some(entry) = cache.remove(name) { + removed.push(entry); + } + } else { + if !cache.contains_key(name) && cache.len() >= Self::LOOKUP_CACHE_MAX_ENTRIES { + if let Some(victim) = cache.keys().next().cloned() { + if let Some(entry) = cache.remove(&victim) { + 
removed.push(entry); + } + } + } + if let Some(entry) = cache.get_mut(name) { + if Arc::ptr_eq(&entry.child, child) { + entry.generation = generation; + entry.deadline_ns = deadline_ns; + } else { + let old_entry = replace( + entry, + FuseLookupCacheEntry { + child: child.clone(), + generation, + deadline_ns, + }, + ); + removed.push(old_entry); + } + } else { + cache.insert( + name.to_string(), + FuseLookupCacheEntry { + child: child.clone(), + generation, + deadline_ns, + }, + ); + } + } + } + Self::clear_removed_lookup_entries(removed); + } + + fn invalidate_lookup_cache(&self, name: &str) { + if let Some(entry) = self.remove_lookup_cache_entry(name) { + Self::clear_removed_lookup_entries(vec![entry]); + } + } + + fn invalidate_child_name(&self, name: &str) { + let removed = self.remove_lookup_cache_entry(name); + if let Some(child) = removed.as_ref().map(|entry| entry.child.clone()) { + child.clear_dname_if(name); + } + if let Some(entry) = removed { + Self::clear_removed_lookup_entries(vec![entry]); + } + } + + fn lookup_cached_child(&self, name: &str) -> Option> { + self.prune_lookup_cache(); + let cache = self.lookup_cache.lock(); + let entry = cache.get(name).cloned()?; + Some(entry.child) + } + + fn remove_lookup_cache_entry(&self, name: &str) -> Option { + self.lookup_cache.lock().remove(name) + } + + fn lookup_cache_entry_expired_or_stale( + parent_nodeid: u64, + name: &str, + entry: &FuseLookupCacheEntry, + now: u64, + ) -> bool { + (entry.deadline_ns != u64::MAX && now >= entry.deadline_ns) + || entry.child.check_not_stale().is_err() + || entry.child.generation() != entry.generation + || entry.child.parent_fuse_nodeid() != parent_nodeid + || !entry.child.has_dname(name) + } + + fn take_lookup_cache_entries(&self) -> Vec { + let mut cache = self.lookup_cache.lock(); + take(&mut *cache).into_values().collect() + } + + pub(crate) fn clear_lookup_cache_tree(&self) { + Self::clear_removed_lookup_entries(self.take_lookup_cache_entries()); + } + + fn 
prune_lookup_cache(&self) { + let now = Self::now_ns(); + let removed = { + let mut cache = self.lookup_cache.lock(); + let stale_keys: Vec = cache + .iter() + .filter_map(|(name, entry)| { + if Self::lookup_cache_entry_expired_or_stale(self.nodeid, name, entry, now) { + Some(name.clone()) + } else { + None + } + }) + .collect(); + let mut removed = Vec::new(); + for key in stale_keys { + if let Some(entry) = cache.remove(&key) { + removed.push(entry); + } + } + removed + }; + Self::clear_removed_lookup_entries(removed); + } + + fn clear_removed_lookup_entries(entries: Vec) { + let mut stack = entries; + while let Some(entry) = stack.pop() { + stack.extend(entry.child.take_lookup_cache_entries()); + } + } + pub(crate) fn conn(&self) -> &Arc { &self.conn } @@ -374,6 +562,25 @@ impl FuseNode { payload } + fn fuse_xattr_unsupported(&self, opcode: u32) -> SystemError { + self.conn.mark_no_xattr(opcode); + SystemError::EOPNOTSUPP_OR_ENOTSUP + } + + fn verify_xattr_list(list: &[u8]) -> Result<(), SystemError> { + let mut idx = 0usize; + while idx < list.len() { + let Some(end) = list[idx..].iter().position(|b| *b == 0) else { + return Err(SystemError::EIO); + }; + if end == 0 { + return Err(SystemError::EIO); + } + idx += end + 1; + } + Ok(()) + } + fn pack_two_names_payload(first: &str, second: &str) -> Vec { let mut payload = Vec::with_capacity(first.len() + second.len() + 2); payload.extend_from_slice(first.as_bytes()); @@ -650,21 +857,68 @@ impl FuseNode { (base_len + Self::FUSE_DIRENT_ALIGN - 1) & !(Self::FUSE_DIRENT_ALIGN - 1) } - fn cache_child_from_entry(&self, entry: &FuseEntryOut, name: &str) { + fn entry_file_type(attr: &FuseAttr) -> Result { + let mode = InodeMode::from_bits_truncate(attr.mode); + match mode & InodeMode::S_IFMT { + t if t == InodeMode::S_IFDIR => Ok(FileType::Dir), + t if t == InodeMode::S_IFREG => Ok(FileType::File), + t if t == InodeMode::S_IFLNK => Ok(FileType::SymLink), + t if t == InodeMode::S_IFCHR => Ok(FileType::CharDevice), + t if t 
== InodeMode::S_IFBLK => Ok(FileType::BlockDevice), + t if t == InodeMode::S_IFSOCK => Ok(FileType::Socket), + t if t == InodeMode::S_IFIFO => Ok(FileType::Pipe), + _ => Err(SystemError::EIO), + } + } + + fn metadata_from_valid_entry( + entry: &FuseEntryOut, + zero_nodeid_error: SystemError, + expected_type: Option, + ) -> Result { if entry.nodeid == 0 { - return; + return Err(zero_nodeid_error); + } + if entry.attr.size > i64::MAX as u64 { + return Err(SystemError::EIO); + } + let file_type = Self::entry_file_type(&entry.attr)?; + if expected_type.is_some_and(|expected| expected != file_type) { + return Err(SystemError::EIO); } - if let Some(fs) = self.fs.upgrade() { - let md = Self::attr_to_metadata(&entry.attr); + Ok(Self::attr_to_metadata(&entry.attr)) + } + + fn cache_child_from_entry(&self, entry: &FuseEntryOut, name: &str) { + let mut consumed = false; + let result = (|| { + let md = Self::metadata_from_valid_entry(entry, SystemError::EIO, None)?; + if entry.nodeid == self.nodeid { + return Err(SystemError::EIO); + } + let fs = self.fs.upgrade().ok_or(SystemError::ENOENT)?; + let parent = self.self_ref.upgrade().ok_or(SystemError::ENOENT)?; let child = fs.get_or_create_node_with_generation( entry.nodeid, - self.nodeid, + Some(parent), Some(md.clone()), Some(entry.generation), - ); + 1, + )?; + consumed = true; child.set_dname(name); - child.inc_lookup(1); child.set_cached_metadata_with_valid(md, entry.attr_valid, entry.attr_valid_nsec); + self.cache_lookup_child( + name, + &child, + entry.generation, + entry.entry_valid, + entry.entry_valid_nsec, + ); + Ok::<(), SystemError>(()) + })(); + if result.is_err() && entry.nodeid != 0 && !consumed { + let _ = self.conn.queue_forget(entry.nodeid, 1); } } @@ -690,6 +944,8 @@ impl FuseNode { names.push(name.to_string()); self.cache_child_from_entry(&plus.entry_out, name); } + } else if plus.entry_out.nodeid != 0 { + let _ = self.conn.queue_forget(plus.entry_out.nodeid, 1); } last_off = dirent.off; @@ -738,24 +994,7 @@ 
impl FuseNode { fn attr_to_metadata(attr: &FuseAttr) -> Metadata { let mode = InodeMode::from_bits_truncate(attr.mode); - let ifmt = mode.bits() & InodeMode::S_IFMT.bits(); - let file_type = if ifmt == InodeMode::S_IFDIR.bits() { - FileType::Dir - } else if ifmt == InodeMode::S_IFREG.bits() { - FileType::File - } else if ifmt == InodeMode::S_IFLNK.bits() { - FileType::SymLink - } else if ifmt == InodeMode::S_IFCHR.bits() { - FileType::CharDevice - } else if ifmt == InodeMode::S_IFBLK.bits() { - FileType::BlockDevice - } else if ifmt == InodeMode::S_IFSOCK.bits() { - FileType::Socket - } else if ifmt == InodeMode::S_IFIFO.bits() { - FileType::Pipe - } else { - FileType::File - }; + let file_type = Self::entry_file_type(attr).unwrap_or(FileType::File); let inode_id = InodeId::new(attr.ino as usize); @@ -875,7 +1114,14 @@ impl FuseNode { self.set_open_private_data(data, opcode, out.fh, file_flags, out.open_flags, false) } - fn release_common(&self, opcode: u32, fh: u64, file_flags: u32, lock_owner: u64) { + fn release_common_for_node( + &self, + opcode: u32, + nodeid: u64, + fh: u64, + file_flags: u32, + lock_owner: u64, + ) { let inarg = FuseReleaseIn { fh, flags: file_flags, @@ -883,7 +1129,6 @@ impl FuseNode { lock_owner, }; let conn = self.conn.clone(); - let nodeid = self.nodeid; let payload = fuse_pack_struct(&inarg).to_vec(); if let Err(err) = conn.request_nocreds_background(opcode, nodeid, &payload) { log::warn!( @@ -896,6 +1141,10 @@ impl FuseNode { } } + fn release_common(&self, opcode: u32, fh: u64, file_flags: u32, lock_owner: u64) { + self.release_common_for_node(opcode, self.nodeid, fh, file_flags, lock_owner); + } + fn ensure_dir(&self) -> Result<(), SystemError> { let md = self.cached_or_fetch_metadata()?; if md.file_type != FileType::Dir { @@ -976,26 +1225,42 @@ impl FuseNode { &self, entry: &FuseEntryOut, name: Option<&str>, + expected_type: FileType, ) -> Result, SystemError> { - self.check_not_stale()?; - let md = 
Self::attr_to_metadata(&entry.attr); - let fs = self.fs.upgrade().ok_or(SystemError::ENOENT)?; - let child = fs.get_or_create_node_with_generation( - entry.nodeid, - self.nodeid, - Some(md), - Some(entry.generation), - ); - if let Some(name) = name { - child.set_dname(name); + let mut consumed = false; + let result = (|| { + self.check_not_stale()?; + let md = Self::metadata_from_valid_entry(entry, SystemError::EIO, Some(expected_type))?; + if entry.nodeid == self.nodeid { + return Err(SystemError::EIO); + } + let fs = self.fs.upgrade().ok_or(SystemError::ENOENT)?; + let parent = self.self_ref.upgrade().ok_or(SystemError::ENOENT)?; + let child = fs.get_or_create_node_with_generation( + entry.nodeid, + Some(parent), + Some(md.clone()), + Some(entry.generation), + 1, + )?; + if let Some(name) = name { + child.set_dname(name); + self.cache_lookup_child( + name, + &child, + entry.generation, + entry.entry_valid, + entry.entry_valid_nsec, + ); + } + consumed = true; + child.set_cached_metadata_with_valid(md, entry.attr_valid, entry.attr_valid_nsec); + Ok(child as Arc) + })(); + if result.is_err() && entry.nodeid != 0 && !consumed { + let _ = self.conn.queue_forget(entry.nodeid, 1); } - child.inc_lookup(1); - child.set_cached_metadata_with_valid( - Self::attr_to_metadata(&entry.attr), - entry.attr_valid, - entry.attr_valid_nsec, - ); - Ok(child) + result } fn read_direct_with_open( @@ -1089,6 +1354,133 @@ impl FuseNode { ) } + fn fill_page_cache_range_with_open( + &self, + page_cache: &Arc, + start_page: usize, + end_page: usize, + file_size: usize, + fh: u64, + file_flags: u32, + ) -> Result<(usize, Option), SystemError> { + if start_page >= end_page || file_size == 0 { + return Ok((0, None)); + } + + let max_read = self.conn().max_read(); + let max_pages_by_read = core::cmp::max(1, max_read >> MMArch::PAGE_SHIFT); + let max_pages = core::cmp::max( + 1, + core::cmp::min(max_pages_by_read, self.conn().max_pages()), + ); + let mut total_read = 0usize; + let mut 
truncate_eof = None; + + let mut idx = start_page; + while idx < end_page { + if page_cache.is_page_ready(idx) { + idx += 1; + continue; + } + + let run_start = idx; + let mut run_end = run_start + 1; + while run_end < end_page + && run_end - run_start < max_pages + && !page_cache.is_page_ready(run_end) + { + run_end += 1; + } + + let read_offset = run_start + .checked_mul(MMArch::PAGE_SIZE) + .ok_or(SystemError::EOVERFLOW)?; + if read_offset >= file_size { + break; + } + + let read_pages_len = (run_end - run_start) + .checked_mul(MMArch::PAGE_SIZE) + .ok_or(SystemError::EOVERFLOW)?; + let read_len = core::cmp::min( + core::cmp::min(read_pages_len, max_read), + file_size - read_offset, + ); + if read_len == 0 { + break; + } + + let mut read_buf = vec![0u8; read_len]; + let bytes_read = self.read_direct_with_open( + read_offset, + read_len, + &mut read_buf, + fh, + file_flags, + 0, + )?; + if bytes_read == 0 { + let (eof, should_truncate) = self.note_short_read_eof(run_start, 0, file_size)?; + if should_truncate { + truncate_eof = Some(eof); + } + break; + } + + let covered_pages = bytes_read.div_ceil(MMArch::PAGE_SIZE); + let pages_to_commit = core::cmp::min(run_end - run_start, covered_pages); + let mut saw_short_page = false; + for rel_page in 0..pages_to_commit { + let page_idx = run_start + rel_page; + let page_offset = rel_page * MMArch::PAGE_SIZE; + let page_read_len = + core::cmp::min(MMArch::PAGE_SIZE, bytes_read.saturating_sub(page_offset)); + if page_read_len == 0 { + break; + } + + let mut filled_len = None; + let page = page_cache.manager().commit_page_with(page_idx, |_, dst| { + dst.fill(0); + dst[..page_read_len] + .copy_from_slice(&read_buf[page_offset..page_offset + page_read_len]); + filled_len = Some(page_read_len); + Ok(page_read_len) + })?; + drop(page); + + if filled_len.is_some() { + total_read += 1; + } + + if page_read_len < MMArch::PAGE_SIZE { + let (eof, should_truncate) = + self.note_short_read_eof(page_idx, page_read_len, file_size)?; + 
if should_truncate { + truncate_eof = Some(eof); + } + saw_short_page = true; + break; + } + } + + if bytes_read < read_len && !saw_short_page { + let (eof, should_truncate) = + self.note_short_read_eof(run_start, bytes_read, file_size)?; + if should_truncate { + truncate_eof = Some(eof); + } + } + + if saw_short_page || bytes_read < read_len { + break; + } + idx = run_end; + } + + Ok((total_read, truncate_eof)) + } + fn read_cached_with_open( &self, offset: usize, @@ -1108,9 +1500,27 @@ impl FuseNode { let end_page_index = (offset + read_len - 1) >> MMArch::PAGE_SHIFT; let page_cache = self.ensure_page_cache()?; let _invalidate = page_cache.invalidate_read(); + let last_file_page = (file_size - 1) >> MMArch::PAGE_SHIFT; + let max_pages_by_read = core::cmp::max(1, self.conn().max_read() >> MMArch::PAGE_SHIFT); + let max_pages_by_conn = core::cmp::min(max_pages_by_read, self.conn().max_pages()); + let readaround_pages = core::cmp::min(max_pages_by_conn, 16); + let prefetch_end = core::cmp::min( + last_file_page + 1, + core::cmp::max( + end_page_index + 1, + start_page_index.saturating_add(readaround_pages), + ), + ); + let (_, mut truncate_eof) = self.fill_page_cache_range_with_open( + &page_cache, + start_page_index, + prefetch_end, + file_size, + fh, + file_flags, + )?; let mut dst_offset = 0usize; - let mut truncate_eof = None; for page_index in start_page_index..=end_page_index { let page_start = page_index << MMArch::PAGE_SHIFT; let page_end = page_start + MMArch::PAGE_SIZE; @@ -1222,6 +1632,68 @@ impl FuseNode { Ok(page) } + pub(crate) fn mmap_readahead_with_open( + &self, + page_index: usize, + req_pages: usize, + ra_state: &mut FileReadaheadState, + fh: u64, + file_flags: u32, + ) -> Result { + if req_pages == 0 { + return Ok(0); + } + + let md = self.cached_metadata_snapshot().ok_or(SystemError::EIO)?; + let file_size = md.size.max(0) as usize; + if file_size == 0 { + return Ok(0); + } + + let last_file_page = (file_size - 1) >> MMArch::PAGE_SHIFT; + if 
page_index > last_file_page { + return Ok(0); + } + + let page_cache = self.ensure_page_cache()?; + let max_pages_by_read = core::cmp::max(1, self.conn().max_read() >> MMArch::PAGE_SHIFT); + let max_pages_by_conn = core::cmp::min(max_pages_by_read, self.conn().max_pages()); + let max_pages = core::cmp::max( + 1, + core::cmp::min( + ra_state.ra_pages, + core::cmp::min(max_pages_by_conn, self.conn().max_readahead_pages()), + ), + ); + let pages_to_read = core::cmp::min(max_pages, core::cmp::max(req_pages, 16)); + let end_page = core::cmp::min(last_file_page + 1, page_index.saturating_add(pages_to_read)); + + let (total_read, truncate_eof) = self.fill_page_cache_range_with_open( + &page_cache, + page_index, + end_page, + file_size, + fh, + file_flags, + )?; + + if let Some(eof) = truncate_eof { + if matches!( + self.cached_metadata_snapshot(), + Some(md) if md.size.max(0) as usize == eof + ) { + self.truncate_page_cache(eof)?; + } + } + + ra_state.start = page_index; + ra_state.size = end_page.saturating_sub(page_index); + ra_state.async_size = ra_state.size.saturating_sub(req_pages); + ra_state.prev_index = end_page.saturating_sub(1) as i64; + + Ok(total_read) + } + fn update_cached_pages_after_write( &self, page_cache: &Arc, @@ -1454,6 +1926,105 @@ impl IndexNode for FuseNode { Ok(()) } + fn getxattr(&self, name: &str, buf: &mut [u8]) -> Result { + self.check_not_stale()?; + if self.conn.no_xattr(FUSE_GETXATTR) { + return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP); + } + let requested = core::cmp::min(buf.len(), Self::XATTR_SIZE_MAX); + let inarg = FuseGetxattrIn { + size: requested as u32, + padding: 0, + }; + let payload_in = Self::pack_struct_and_name_payload(&inarg, name); + let payload = match self.conn().request(FUSE_GETXATTR, self.nodeid, &payload_in) { + Ok(payload) => payload, + Err(SystemError::ENOSYS) => return Err(self.fuse_xattr_unsupported(FUSE_GETXATTR)), + Err(err) => return Err(err), + }; + + if buf.is_empty() { + let out: FuseGetxattrOut = 
fuse_read_struct(&payload)?; + return Ok(core::cmp::min(out.size as usize, Self::XATTR_SIZE_MAX)); + } + if payload.len() > buf.len() { + return Err(SystemError::ERANGE); + } + if payload.len() > Self::XATTR_SIZE_MAX { + return Err(SystemError::E2BIG); + } + buf[..payload.len()].copy_from_slice(&payload); + Ok(payload.len()) + } + + fn setxattr(&self, name: &str, value: &[u8], flags: XattrFlags) -> Result { + self.check_not_stale()?; + if self.conn.no_xattr(FUSE_SETXATTR) { + return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP); + } + if value.len() > Self::XATTR_SIZE_MAX { + return Err(SystemError::E2BIG); + } + let inarg = FuseSetxattrInCompat { + size: value.len() as u32, + flags: flags.bits() as u32, + }; + let mut payload_in = Self::pack_struct_and_name_payload(&inarg, name); + payload_in.extend_from_slice(value); + match self.conn().request(FUSE_SETXATTR, self.nodeid, &payload_in) { + Ok(_) => Ok(0), + Err(SystemError::ENOSYS) => Err(self.fuse_xattr_unsupported(FUSE_SETXATTR)), + Err(err) => Err(err), + } + } + + fn listxattr(&self, buf: &mut [u8]) -> Result { + self.check_not_stale()?; + if self.conn.no_xattr(FUSE_LISTXATTR) { + return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP); + } + let requested = core::cmp::min(buf.len(), Self::XATTR_LIST_MAX); + let inarg = FuseGetxattrIn { + size: requested as u32, + padding: 0, + }; + let payload = + match self + .conn() + .request(FUSE_LISTXATTR, self.nodeid, fuse_pack_struct(&inarg)) + { + Ok(payload) => payload, + Err(SystemError::ENOSYS) => return Err(self.fuse_xattr_unsupported(FUSE_LISTXATTR)), + Err(err) => return Err(err), + }; + + if buf.is_empty() { + let out: FuseGetxattrOut = fuse_read_struct(&payload)?; + return Ok(core::cmp::min(out.size as usize, Self::XATTR_LIST_MAX)); + } + if payload.len() > buf.len() { + return Err(SystemError::ERANGE); + } + if payload.len() > Self::XATTR_LIST_MAX { + return Err(SystemError::E2BIG); + } + Self::verify_xattr_list(&payload)?; + buf[..payload.len()].copy_from_slice(&payload); 
+ Ok(payload.len()) + } + + fn removexattr(&self, name: &str) -> Result { + self.check_not_stale()?; + if self.conn.no_xattr(FUSE_REMOVEXATTR) { + return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP); + } + match self.request_name(FUSE_REMOVEXATTR, self.nodeid, name) { + Ok(_) => Ok(0), + Err(SystemError::ENOSYS) => Err(self.fuse_xattr_unsupported(FUSE_REMOVEXATTR)), + Err(err) => Err(err), + } + } + fn truncate_before_open(&self, flags: &FileFlags) -> bool { flags.contains(FileFlags::O_TRUNC) && !self @@ -2015,44 +2586,85 @@ impl IndexNode for FuseNode { self.check_not_stale()?; self.ensure_dir()?; if name == "." { - let fs = self.fs.upgrade().ok_or(SystemError::ENOENT)?; - return Ok(fs.get_or_create_node(self.nodeid, *self.parent_nodeid.lock(), None)); + let this = self.self_ref.upgrade().ok_or(SystemError::ENOENT)?; + return Ok(this); } if name == ".." { return self.parent(); } - let payload = self.request_name(FUSE_LOOKUP, self.nodeid, name)?; + if let Some(child) = self.lookup_cached_child(name) { + return Ok(child); + } + + let payload = match self.request_name(FUSE_LOOKUP, self.nodeid, name) { + Ok(payload) => payload, + Err(err) => { + self.invalidate_lookup_cache(name); + return Err(err); + } + }; let entry: FuseEntryOut = fuse_read_struct(&payload)?; - let md = Self::attr_to_metadata(&entry.attr); + let md = Self::metadata_from_valid_entry(&entry, SystemError::ENOENT, None).inspect_err( + |_| { + if entry.nodeid != 0 { + let _ = self.conn.queue_forget(entry.nodeid, 1); + } + }, + )?; + if entry.nodeid == self.nodeid { + let _ = self.conn.queue_forget(entry.nodeid, 1); + return Err(SystemError::EIO); + } - let fs = self.fs.upgrade().ok_or(SystemError::ENOENT)?; - let child = fs.get_or_create_node_with_generation( - entry.nodeid, - self.nodeid, - Some(md), - Some(entry.generation), - ); + let mut consumed = false; + let result = (|| { + let fs = self.fs.upgrade().ok_or(SystemError::ENOENT)?; + let parent = self.self_ref.upgrade().ok_or(SystemError::ENOENT)?; + 
let child = fs.get_or_create_node_with_generation( + entry.nodeid, + Some(parent), + Some(md.clone()), + Some(entry.generation), + 1, + )?; + consumed = true; + Ok(child) + })(); + let child = match result { + Ok(child) => child, + Err(err) => { + if entry.nodeid != 0 && !consumed { + let _ = self.conn.queue_forget(entry.nodeid, 1); + } + return Err(err); + } + }; child.set_dname(name); child .lookup_attr_flags .store(entry.attr.flags, Ordering::Relaxed); - child.inc_lookup(1); - child.set_cached_metadata_with_valid( - Self::attr_to_metadata(&entry.attr), - entry.attr_valid, - entry.attr_valid_nsec, + child.set_cached_metadata_with_valid(md, entry.attr_valid, entry.attr_valid_nsec); + self.cache_lookup_child( + name, + &child, + entry.generation, + entry.entry_valid, + entry.entry_valid_nsec, ); Ok(child) } fn parent(&self) -> Result, SystemError> { let fs = self.fs.upgrade().ok_or(SystemError::ENOENT)?; + if let Some(parent) = self.parent.lock().clone() { + return Ok(parent); + } let parent_nodeid = *self.parent_nodeid.lock(); if parent_nodeid == self.nodeid { return Ok(fs.root_node()); } - Ok(fs.get_or_create_node(parent_nodeid, parent_nodeid, None)) + Err(SystemError::ESTALE) } fn create( @@ -2080,8 +2692,17 @@ impl IndexNode for FuseNode { Err(SystemError::ENOSYS) => return self.create_with_data(name, file_type, mode, 0), Err(e) => return Err(e), }; - let (entry, _) = Self::parse_create_reply(&payload)?; - self.create_node_from_entry(&entry, Some(name)) + let (entry, open_out) = Self::parse_create_reply(&payload)?; + if entry.nodeid != 0 { + self.release_common_for_node( + FUSE_RELEASE, + entry.nodeid, + open_out.fh, + FileFlags::O_RDONLY.bits(), + 0, + ); + } + self.create_node_from_entry(&entry, Some(name), FileType::File) } fn create_with_data( @@ -2103,7 +2724,7 @@ impl IndexNode for FuseNode { let payload_in = Self::pack_struct_and_name_payload(&inarg, name); let payload = self.conn().request(FUSE_MKDIR, self.nodeid, &payload_in)?; let entry: FuseEntryOut 
= fuse_read_struct(&payload)?; - self.create_node_from_entry(&entry, Some(name)) + self.create_node_from_entry(&entry, Some(name), FileType::Dir) } FileType::File => { let inarg = FuseMknodIn { @@ -2115,7 +2736,7 @@ impl IndexNode for FuseNode { let payload_in = Self::pack_struct_and_name_payload(&inarg, name); let payload = self.conn().request(FUSE_MKNOD, self.nodeid, &payload_in)?; let entry: FuseEntryOut = fuse_read_struct(&payload)?; - self.create_node_from_entry(&entry, Some(name)) + self.create_node_from_entry(&entry, Some(name), FileType::File) } FileType::SymLink => { let mut payload_in = Vec::with_capacity(name.len() + 2); @@ -2126,7 +2747,7 @@ impl IndexNode for FuseNode { .conn() .request(FUSE_SYMLINK, self.nodeid, &payload_in)?; let entry: FuseEntryOut = fuse_read_struct(&payload)?; - self.create_node_from_entry(&entry, Some(name)) + self.create_node_from_entry(&entry, Some(name), FileType::SymLink) } _ => Err(SystemError::ENOSYS), } @@ -2140,7 +2761,7 @@ impl IndexNode for FuseNode { .conn() .request(FUSE_SYMLINK, self.nodeid, &payload_in)?; let entry: FuseEntryOut = fuse_read_struct(&payload)?; - self.create_node_from_entry(&entry, Some(name)) + self.create_node_from_entry(&entry, Some(name), FileType::SymLink) } fn link(&self, name: &str, other: &Arc) -> Result<(), SystemError> { @@ -2150,33 +2771,44 @@ impl IndexNode for FuseNode { .as_any_ref() .downcast_ref::() .ok_or(SystemError::EXDEV)?; + let expected_type = target.cached_or_fetch_metadata()?.file_type; let inarg = FuseLinkIn { oldnodeid: target.nodeid, }; let payload_in = Self::pack_struct_and_name_payload(&inarg, name); let payload = self.conn().request(FUSE_LINK, self.nodeid, &payload_in)?; let entry: FuseEntryOut = fuse_read_struct(&payload)?; - let md = Self::attr_to_metadata(&entry.attr); - let fs = self.fs.upgrade().ok_or(SystemError::ENOENT)?; - let child = fs.get_or_create_node_for_link( - entry.nodeid, - self.nodeid, - Some(md), - Some(entry.generation), - ); - child.inc_lookup(1); - 
child.set_cached_metadata_with_valid( - Self::attr_to_metadata(&entry.attr), - entry.attr_valid, - entry.attr_valid_nsec, - ); - Ok(()) + let mut consumed = false; + let result = (|| { + let md = + Self::metadata_from_valid_entry(&entry, SystemError::EIO, Some(expected_type))?; + let fs = self.fs.upgrade().ok_or(SystemError::ENOENT)?; + let parent = self.self_ref.upgrade().ok_or(SystemError::ENOENT)?; + let child = fs.get_or_create_node_for_link( + entry.nodeid, + Some(parent), + Some(md.clone()), + Some(entry.generation), + 1, + )?; + consumed = true; + child.set_cached_metadata_with_valid(md, entry.attr_valid, entry.attr_valid_nsec); + Ok(()) + })(); + if result.is_err() && entry.nodeid != 0 && !consumed { + let _ = self.conn.queue_forget(entry.nodeid, 1); + } + if result.is_ok() { + self.invalidate_lookup_cache(name); + } + result } fn unlink(&self, name: &str) -> Result<(), SystemError> { self.check_not_stale()?; self.ensure_dir()?; let _ = self.request_name(FUSE_UNLINK, self.nodeid, name)?; + self.invalidate_child_name(name); Ok(()) } @@ -2184,6 +2816,7 @@ impl IndexNode for FuseNode { self.check_not_stale()?; self.ensure_dir()?; let _ = self.request_name(FUSE_RMDIR, self.nodeid, name)?; + self.invalidate_child_name(name); Ok(()) } @@ -2221,41 +2854,35 @@ impl IndexNode for FuseNode { payload_in.push(0); payload_in.extend_from_slice(new_name.as_bytes()); payload_in.push(0); - let cached_old = self - .fs - .upgrade() - .and_then(|fs| fs.find_cached_child(self.nodeid, old_name)) - .or_else(|| { - self.find(old_name) - .ok() - .and_then(|inode| inode.downcast_arc::()) - }); - let cached_new = target_any - .fs - .upgrade() - .and_then(|fs| fs.find_cached_child(target_any.nodeid, new_name)) - .or_else(|| { - if flag.contains(RenameFlags::EXCHANGE) { - target_any - .find(new_name) - .ok() - .and_then(|inode| inode.downcast_arc::()) - } else { - None - } - }); + let cached_old = self.lookup_cached_child(old_name).or_else(|| { + self.find(old_name) + .ok() + 
.and_then(|inode| inode.downcast_arc::()) + }); + let cached_new = target_any.lookup_cached_child(new_name).or_else(|| { + target_any + .find(new_name) + .ok() + .and_then(|inode| inode.downcast_arc::()) + }); let r = self.conn().request(opcode, self.nodeid, &payload_in); if opcode == FUSE_RENAME2 && matches!(r, Err(SystemError::ENOSYS)) { return Err(SystemError::EINVAL); } let _ = r?; + self.invalidate_lookup_cache(old_name); + target_any.invalidate_lookup_cache(new_name); if let Some(node) = cached_old { node.set_parent_nodeid(target_any.nodeid); + node.set_parent(Some( + target_any.self_ref.upgrade().ok_or(SystemError::ENOENT)?, + )); node.set_dname(new_name); } if let Some(node) = cached_new { if flag.contains(RenameFlags::EXCHANGE) { node.set_parent_nodeid(self.nodeid); + node.set_parent(Some(self.self_ref.upgrade().ok_or(SystemError::ENOENT)?)); node.set_dname(old_name); } else { node.clear_dname_if(new_name); @@ -2279,6 +2906,8 @@ impl IndexNode for FuseNode { impl Drop for FuseNode { fn drop(&mut self) { + self.clear_lookup_cache_tree(); self.flush_forget(); + self.clear_parent(); } } diff --git a/kernel/src/filesystem/fuse/protocol.rs b/kernel/src/filesystem/fuse/protocol.rs index dbc1f00d1d..8358f85206 100644 --- a/kernel/src/filesystem/fuse/protocol.rs +++ b/kernel/src/filesystem/fuse/protocol.rs @@ -33,6 +33,10 @@ pub const FUSE_WRITE: u32 = 16; pub const FUSE_STATFS: u32 = 17; pub const FUSE_RELEASE: u32 = 18; pub const FUSE_FSYNC: u32 = 20; +pub const FUSE_SETXATTR: u32 = 21; +pub const FUSE_GETXATTR: u32 = 22; +pub const FUSE_LISTXATTR: u32 = 23; +pub const FUSE_REMOVEXATTR: u32 = 24; pub const FUSE_FLUSH: u32 = 25; pub const FUSE_INIT: u32 = 26; pub const FUSE_OPENDIR: u32 = 27; @@ -422,6 +426,27 @@ pub struct FuseFsyncIn { pub padding: u32, } +#[repr(C)] +#[derive(Debug, Clone, Copy)] +pub struct FuseSetxattrInCompat { + pub size: u32, + pub flags: u32, +} + +#[repr(C)] +#[derive(Debug, Clone, Copy)] +pub struct FuseGetxattrIn { + pub size: u32, + 
pub padding: u32, +} + +#[repr(C)] +#[derive(Debug, Clone, Copy)] +pub struct FuseGetxattrOut { + pub size: u32, + pub padding: u32, +} + #[repr(C)] #[derive(Debug, Clone, Copy)] pub struct FuseAccessIn { diff --git a/kernel/src/filesystem/fuse/virtiofs.rs b/kernel/src/filesystem/fuse/virtiofs.rs index 15e1058b5a..60c214684d 100644 --- a/kernel/src/filesystem/fuse/virtiofs.rs +++ b/kernel/src/filesystem/fuse/virtiofs.rs @@ -24,9 +24,10 @@ use crate::{ virtio_impl::HalImpl, }, filesystem::vfs::{ - FileSystem, FileSystemMakerData, FsInfo, IndexNode, MountableFileSystem, SuperBlock, - FSMAKER, + file::File, FileSystem, FileSystemMakerData, FsInfo, IndexNode, MountableFileSystem, + SuperBlock, FSMAKER, }, + mm::{fault::PageFaultMessage, VirtRegion, VmFaultReason, VmFlags}, process::{kthread::KernelThreadClosure, kthread::KernelThreadMechanism, ProcessManager}, register_mountable_fs, time::{sleep::nanosleep, PosixTimeSpec}, @@ -811,6 +812,31 @@ impl FileSystem for VirtioFsFs { self.inner.permission_policy() } + unsafe fn fault(&self, pfm: &mut PageFaultMessage) -> VmFaultReason { + self.inner.fault(pfm) + } + + unsafe fn page_mkwrite(&self, pfm: &mut PageFaultMessage) -> VmFaultReason { + self.inner.page_mkwrite(pfm) + } + + fn mprotect(&self, old_vm_flags: VmFlags, new_vm_flags: VmFlags) -> Result<(), SystemError> { + self.inner.mprotect(old_vm_flags, new_vm_flags) + } + + fn vma_close(&self, file: &Arc, region: VirtRegion, vm_flags: VmFlags) { + self.inner.vma_close(file, region, vm_flags) + } + + unsafe fn map_pages( + &self, + pfm: &mut PageFaultMessage, + start_pgoff: usize, + end_pgoff: usize, + ) -> VmFaultReason { + self.inner.map_pages(pfm, start_pgoff, end_pgoff) + } + fn on_umount(&self) { self.inner.on_umount(); self.instance.wait_session_released(self.session_id); diff --git a/kernel/src/filesystem/mod.rs b/kernel/src/filesystem/mod.rs index 58a7e9ae4b..5ce9d7e02a 100644 --- a/kernel/src/filesystem/mod.rs +++ b/kernel/src/filesystem/mod.rs @@ -1,4 +1,5 @@ 
pub mod cgroup2; +pub mod debugfs; pub mod devfs; pub mod devpts; pub mod epoll; @@ -9,6 +10,7 @@ pub mod fs; pub mod fuse; pub mod kernfs; pub mod mbr; +pub mod mqueue; pub mod overlayfs; pub mod page_cache; pub mod poll; diff --git a/kernel/src/filesystem/mqueue.rs b/kernel/src/filesystem/mqueue.rs new file mode 100644 index 0000000000..58674dfada --- /dev/null +++ b/kernel/src/filesystem/mqueue.rs @@ -0,0 +1,177 @@ +use core::any::Any; + +use alloc::string::String; +use alloc::sync::{Arc, Weak}; +use alloc::vec; +use alloc::vec::Vec; +use system_error::SystemError; + +use crate::driver::base::device::device_number::DeviceNumber; +use crate::filesystem::vfs::utils::DName; +use crate::filesystem::vfs::{ + file::FilePrivateData, vcore::generate_inode_id, FileSystem, FileSystemMakerData, FileType, + FsInfo, IndexNode, InodeFlags, InodeId, InodeMode, Magic, Metadata, MountableFileSystem, + SuperBlock, FSMAKER, +}; +use crate::libs::mutex::{Mutex, MutexGuard}; +use crate::register_mountable_fs; +use crate::time::PosixTimeSpec; + +use linkme::distributed_slice; + +const MQUEUE_MAX_NAMELEN: u64 = 255; +const MQUEUE_BLOCK_SIZE: u64 = 4096; + +#[derive(Debug)] +pub struct MqueueFs { + root: Arc, + super_block: SuperBlock, +} + +#[derive(Debug)] +pub struct MqueueRootInode { + self_ref: Weak, + fs: Mutex>, + metadata: Metadata, +} + +impl MqueueFs { + fn new() -> Arc { + let super_block = + SuperBlock::new(Magic::MQUEUE_MAGIC, MQUEUE_BLOCK_SIZE, MQUEUE_MAX_NAMELEN); + + Arc::new_cyclic(|weak_fs| { + let root = Arc::new_cyclic(|weak_root| MqueueRootInode { + self_ref: weak_root.clone(), + fs: Mutex::new(weak_fs.clone()), + metadata: Metadata { + dev_id: 0, + inode_id: generate_inode_id(), + size: 0, + blk_size: MQUEUE_BLOCK_SIZE as usize, + blocks: 0, + atime: PosixTimeSpec::default(), + mtime: PosixTimeSpec::default(), + ctime: PosixTimeSpec::default(), + btime: PosixTimeSpec::default(), + file_type: FileType::Dir, + mode: InodeMode::S_IFDIR | InodeMode::S_ISVTX | 
InodeMode::S_IRWXUGO, + nlinks: 2, + uid: 0, + gid: 0, + raw_dev: DeviceNumber::default(), + flags: InodeFlags::empty(), + }, + }); + + Self { root, super_block } + }) + } +} + +impl FileSystem for MqueueFs { + fn root_inode(&self) -> Arc { + self.root.clone() + } + + fn info(&self) -> FsInfo { + FsInfo { + blk_dev_id: 0, + max_name_len: MQUEUE_MAX_NAMELEN as usize, + } + } + + fn as_any_ref(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "mqueue" + } + + fn super_block(&self) -> SuperBlock { + self.super_block.clone() + } +} + +impl MountableFileSystem for MqueueFs { + fn make_mount_data( + _raw_data: Option<&str>, + _source: &str, + ) -> Result>, SystemError> { + Ok(None) + } + + fn make_fs( + _data: Option<&dyn FileSystemMakerData>, + ) -> Result, SystemError> { + Ok(Self::new()) + } +} + +register_mountable_fs!(MqueueFs, MQUEUEFSMAKER, "mqueue"); + +impl IndexNode for MqueueRootInode { + fn read_at( + &self, + _offset: usize, + _len: usize, + _buf: &mut [u8], + _data: MutexGuard, + ) -> Result { + Err(SystemError::EISDIR) + } + + fn write_at( + &self, + _offset: usize, + _len: usize, + _buf: &[u8], + _data: MutexGuard, + ) -> Result { + Err(SystemError::EISDIR) + } + + fn find(&self, name: &str) -> Result, SystemError> { + match name { + "" | "." | ".." 
=> Ok(self.self_ref.upgrade().ok_or(SystemError::ENOENT)?), + _ => Err(SystemError::ENOENT), + } + } + + fn get_entry_name(&self, ino: InodeId) -> Result { + if ino == self.metadata.inode_id { + return Ok(String::from(".")); + } + Err(SystemError::ENOENT) + } + + fn list(&self) -> Result, SystemError> { + Ok(vec![String::from("."), String::from("..")]) + } + + fn fs(&self) -> Arc { + self.fs.lock().upgrade().unwrap() + } + + fn metadata(&self) -> Result { + Ok(self.metadata.clone()) + } + + fn as_any_ref(&self) -> &dyn Any { + self + } + + fn dname(&self) -> Result { + Ok(DName::from("mqueue")) + } + + fn create( + &self, + _name: &str, + _file_type: FileType, + _mode: InodeMode, + ) -> Result, SystemError> { + Err(SystemError::ENOSYS) + } +} diff --git a/kernel/src/filesystem/overlayfs/copy_up.rs b/kernel/src/filesystem/overlayfs/copy_up.rs index 7de9a661b6..61c661f7a1 100644 --- a/kernel/src/filesystem/overlayfs/copy_up.rs +++ b/kernel/src/filesystem/overlayfs/copy_up.rs @@ -1,11 +1,16 @@ use super::OvlInode; -use crate::{ - filesystem::vfs::{FileType, IndexNode, Metadata}, - libs::mutex::Mutex, +use crate::filesystem::vfs::{ + file::{File, FileFlags}, + FileType, IndexNode, Metadata, }; +use alloc::string::String; use alloc::sync::Arc; use system_error::SystemError; +const COPY_UP_CHUNK_SIZE: usize = 64 * 1024; +type UpperCleanup = Option<(Arc, String)>; +type CreatedUpper = (Arc, UpperCleanup); + impl OvlInode { pub fn copy_up(&self) -> Result<(), SystemError> { let mut upper_inode = self.upper_inode.lock(); @@ -16,14 +21,47 @@ impl OvlInode { let lower_inode = self.lower_inodes.first().ok_or(SystemError::ENOENT)?; let metadata = lower_inode.metadata()?; - let new_upper_inode = self.create_upper_inode(metadata.clone())?; + let (new_upper_inode, cleanup) = self.create_upper_inode(metadata.clone())?; + + let copy_result = (|| -> Result<(), SystemError> { + if metadata.file_type == FileType::File { + let size = metadata.size.max(0) as usize; + let lower_file = 
File::new(lower_inode.clone(), FileFlags::O_RDONLY)?; + let upper_file = File::new(new_upper_inode.clone(), FileFlags::O_WRONLY)?; + let mut buffer = vec![0u8; COPY_UP_CHUNK_SIZE.min(size.max(1))]; + let mut offset = 0usize; + + while offset < size { + let chunk_len = (size - offset).min(buffer.len()); + let read_len = lower_file.pread(offset, chunk_len, &mut buffer[..chunk_len])?; + if read_len == 0 { + return Err(SystemError::EIO); + } + + let mut written = 0usize; + while written < read_len { + let n = upper_file.pwrite( + offset + written, + read_len - written, + &buffer[written..read_len], + )?; + if n == 0 { + return Err(SystemError::EIO); + } + written += n; + } + offset += read_len; + } + } - if metadata.file_type == FileType::File { - let mut buffer = vec![0u8; metadata.size as usize]; - let lock = Mutex::new(crate::filesystem::vfs::FilePrivateData::Unused); - lower_inode.read_at(0, metadata.size as usize, &mut buffer, lock.lock())?; + Ok(()) + })(); - new_upper_inode.write_at(0, metadata.size as usize, &buffer, lock.lock())?; + if let Err(err) = copy_result { + if let Some((parent, name)) = cleanup { + let _ = parent.unlink(&name); + } + return Err(err); } *upper_inode = Some(new_upper_inode); @@ -31,10 +69,10 @@ impl OvlInode { Ok(()) } - fn create_upper_inode(&self, metadata: Metadata) -> Result, SystemError> { + fn create_upper_inode(&self, metadata: Metadata) -> Result { let upper_root_inode = self.upper_root_inode()?; if self.redirect.is_empty() { - return Ok(upper_root_inode); + return Ok((upper_root_inode, None)); } let (parent_path, name) = match self.redirect.rsplit_once('/') { @@ -44,9 +82,10 @@ impl OvlInode { let parent_inode = self.ensure_upper_dir_path(parent_path)?; if let Ok(existing) = parent_inode.find(name) { - return Ok(existing); + return Ok((existing, None)); } - parent_inode.create_with_data(name, metadata.file_type, metadata.mode, 0) + let inode = parent_inode.create_with_data(name, metadata.file_type, metadata.mode, 0)?; + 
Ok((inode, Some((parent_inode, name.into())))) } } diff --git a/kernel/src/filesystem/overlayfs/mod.rs b/kernel/src/filesystem/overlayfs/mod.rs index 56695d3aef..ccdb5a180a 100644 --- a/kernel/src/filesystem/overlayfs/mod.rs +++ b/kernel/src/filesystem/overlayfs/mod.rs @@ -2,22 +2,30 @@ pub mod copy_up; pub mod entry; +use super::page_cache::PageCache; use super::ramfs::{LockedRamFSInode, RamFSInode}; +use super::vfs::file::{File, FileFlags, FilePrivateData}; use super::vfs::utils::DName; +use super::vfs::vcore; use super::vfs::FSMAKER; use super::vfs::{ - self, FileSystem, FileType, FsInfo, IndexNode, Metadata, MountableFileSystem, SuperBlock, + self, syscall::RenameFlags, FileSystem, FileType, FsInfo, IndexNode, Metadata, + MountableFileSystem, SuperBlock, }; use crate::driver::base::device::device_number::DeviceNumber; use crate::driver::base::device::device_number::Major; use crate::filesystem::vfs::{FileSystemMaker, FileSystemMakerData}; -use crate::libs::mutex::Mutex; +use crate::libs::{casting::DowncastArc, mutex::Mutex}; +use crate::mm::VmFlags; use crate::process::ProcessManager; use crate::register_mountable_fs; -use alloc::string::String; +use alloc::format; +use alloc::string::{String, ToString}; use alloc::sync::Arc; use alloc::sync::Weak; use alloc::vec::Vec; +use core::mem; +use core::sync::atomic::{AtomicUsize, Ordering}; use entry::{OvlEntry, OvlLayer}; use linkme::distributed_slice; use system_error::SystemError; @@ -25,7 +33,42 @@ use system_error::SystemError; const WHITEOUT_MODE: u64 = 0o020000 | 0o600; // whiteout字符设备文件模式与权限 const WHITEOUT_DEV: DeviceNumber = DeviceNumber::new(Major::UNNAMED_MAJOR, 0); // Whiteout 文件设备号 const WHITEOUT_FLAG: u64 = 0x1; +static OVL_TEMP_ID: AtomicUsize = AtomicUsize::new(0); type LowerRoot = (String, Arc); +type WorkdirTemp = (Arc, Arc, String); + +#[derive(Debug, Clone)] +pub struct OverlayFilePrivateData { + inner: Arc>, +} + +#[derive(Debug)] +struct OverlayFilePrivateDataInner { + backing_file: Arc, + 
backing_is_upper: bool, + flags: FileFlags, +} + +impl OverlayFilePrivateData { + fn new(backing_file: Arc, backing_is_upper: bool, flags: FileFlags) -> Self { + Self { + inner: Arc::new(Mutex::new(OverlayFilePrivateDataInner { + backing_file, + backing_is_upper, + flags, + })), + } + } + + pub fn set_flags(&mut self, flags: FileFlags) -> Result<(), SystemError> { + let mut inner = self.inner.lock(); + inner + .backing_file + .set_flags(OvlInode::backing_open_flags(flags))?; + inner.flags = flags; + Ok(()) + } +} #[derive(Debug)] pub struct OverlayMountData { @@ -79,9 +122,10 @@ struct OverlayFS { numfs: u32, numdatalayer: usize, layers: Vec, // 第0层为读写层,后面是只读层 - workdir: Arc, + workdir: Arc, root_inode: Arc, super_block: SuperBlock, + mutation_lock: Mutex<()>, } #[derive(Debug)] @@ -205,12 +249,18 @@ impl MountableFileSystem for OverlayFS { let lower_layers = lower_layers?; - let workdir = Arc::new(OvlInode::new( - mount_data.work_dir.clone(), - FileType::Dir, - None, - Vec::new(), - )); + let workdir_inode = root_inode + .lookup(&mount_data.work_dir) + .map_err(|_| SystemError::EINVAL)?; + if upper_file_type != FileType::Dir || workdir_inode.metadata()?.file_type != FileType::Dir + { + return Err(SystemError::EINVAL); + } + if Arc::ptr_eq(&upper_inode, &workdir_inode) + || !Arc::ptr_eq(&upper_inode.fs(), &workdir_inode.fs()) + { + return Err(SystemError::EINVAL); + } if lower_roots.is_empty() { return Err(SystemError::EINVAL); @@ -235,7 +285,6 @@ impl MountableFileSystem for OverlayFS { for layer in &layers { layer.mnt.set_fs(weak_fs.clone()); } - workdir.set_fs(weak_fs.clone()); root_inode.set_fs(weak_fs.clone()); OverlayFS { @@ -243,9 +292,10 @@ impl MountableFileSystem for OverlayFS { numfs: 1, numdatalayer: lower_roots.len(), layers, - workdir, + workdir: workdir_inode, root_inode, super_block: super_block.clone(), + mutation_lock: Mutex::new(()), } }); Ok(fs) @@ -286,6 +336,19 @@ impl OvlInode { upper_inode.clone().ok_or(SystemError::EROFS) } + fn 
writable_upper_inode(&self) -> Result, SystemError> { + if let Some(inode) = self.upper_inode.lock().clone() { + return Ok(inode); + } + + self.copy_up()?; + self.upper_inode.lock().clone().ok_or(SystemError::EROFS) + } + + fn workdir_inode(&self) -> Result, SystemError> { + Ok(self.overlay_fs()?.workdir.clone()) + } + fn child_redirect(&self, name: &str) -> String { if self.redirect.is_empty() { String::from(name) @@ -395,6 +458,85 @@ impl OvlInode { } } + fn create_workdir_temp(&self, create: F) -> Result + where + F: Fn(&Arc, &str) -> Result, SystemError>, + { + let workdir = self.workdir_inode()?; + for _ in 0..32 { + let id = OVL_TEMP_ID.fetch_add(1, Ordering::Relaxed); + let name = format!(".dragonos-ovl-{}", id); + match create(&workdir, &name) { + Ok(inode) => return Ok((workdir, inode, name)), + Err(SystemError::EEXIST) => continue, + Err(err) => return Err(err), + } + } + + Err(SystemError::EEXIST) + } + + fn cleanup_workdir_temp(workdir: &Arc, name: &str) { + let Ok(inode) = workdir.find(name) else { + return; + }; + let Ok(metadata) = inode.metadata() else { + return; + }; + + if metadata.file_type == FileType::Dir { + let _ = workdir.rmdir(name); + } else { + let _ = workdir.unlink(name); + } + } + + fn create_over_whiteout( + &self, + name: &str, + create_temp: F, + is_dir: bool, + ) -> Result, SystemError> + where + F: Fn(&Arc, &str) -> Result, SystemError>, + { + let upper_inode = self.writable_upper_inode()?; + match upper_inode.find(name) { + Ok(inode) if Self::is_whiteout_inode(&inode) => {} + Ok(_) => return Err(SystemError::EEXIST), + Err(SystemError::ENOENT) => return create_temp(&upper_inode, name), + Err(err) => return Err(err), + } + + let (workdir, temp_inode, temp_name) = self.create_workdir_temp(create_temp)?; + let commit_result = if is_dir { + workdir.move_to( + &temp_name, + &upper_inode, + name, + vfs::syscall::RenameFlags::EXCHANGE, + ) + } else { + workdir.move_to( + &temp_name, + &upper_inode, + name, + 
vfs::syscall::RenameFlags::empty(), + ) + }; + + if let Err(err) = commit_result { + Self::cleanup_workdir_temp(&workdir, &temp_name); + return Err(err); + } + + if is_dir { + Self::cleanup_workdir_temp(&workdir, &temp_name); + } + + upper_inode.find(name).or(Ok(temp_inode)) + } + fn is_dot_entry(name: &str) -> bool { name == "." || name == ".." } @@ -402,9 +544,117 @@ impl OvlInode { fn is_dir_empty(inode: &Arc) -> Result { Ok(inode.list()?.iter().all(|entry| Self::is_dot_entry(entry))) } + + fn downcast_overlay_inode(inode: Arc) -> Result, SystemError> { + inode.downcast_arc::().ok_or(SystemError::EXDEV) + } + + fn lookup_overlay_child(&self, name: &str) -> Result, SystemError> { + Self::downcast_overlay_inode(self.find(name)?) + } + + fn has_upper(&self) -> bool { + self.upper_inode.lock().is_some() + } + + fn has_lower(&self) -> bool { + !self.lower_inodes.is_empty() + } + + fn is_pure_upper(&self) -> bool { + self.has_upper() && !self.has_lower() + } + + fn is_dir(&self) -> bool { + self.file_type == FileType::Dir + } + + fn parent_redirect(&self) -> Option<&str> { + if self.redirect.is_empty() { + return None; + } + + match self.redirect.rsplit_once('/') { + Some((parent, _)) => Some(parent), + None => Some(""), + } + } + + fn open_flags_need_copy_up(flags: &FileFlags) -> bool { + let access = flags.access_flags(); + access == FileFlags::O_WRONLY + || access == FileFlags::O_RDWR + || flags.contains(FileFlags::O_TRUNC) + } + + fn backing_open_flags(mut flags: FileFlags) -> FileFlags { + flags.remove( + FileFlags::O_CREAT | FileFlags::O_EXCL | FileFlags::O_NOCTTY | FileFlags::O_TRUNC, + ); + flags + } + + fn current_realdata_inode(&self) -> Result<(Arc, bool), SystemError> { + if let Some(inode) = self.upper_inode.lock().clone() { + return Ok((inode, true)); + } + + let lower_inode = self.lower_inodes.first().ok_or(SystemError::ENOENT)?; + Ok((lower_inode.clone(), false)) + } + + fn open_backing_file(&self, flags: FileFlags) -> Result { + if 
Self::open_flags_need_copy_up(&flags) { + self.copy_up()?; + } + + let (backing_inode, backing_is_upper) = self.current_realdata_inode()?; + let backing_file = Arc::new(File::new(backing_inode, Self::backing_open_flags(flags))?); + if flags.contains(FileFlags::O_TRUNC) && backing_is_upper { + vcore::vfs_truncate_file( + backing_file.inode(), + 0, + vcore::current_file_lock_owner_id(), + || backing_file.private_data.lock(), + )?; + } + Ok(OverlayFilePrivateData::new( + backing_file, + backing_is_upper, + flags, + )) + } + + fn backing_file_for_io( + &self, + data: crate::libs::mutex::MutexGuard, + ) -> Result<(Arc, bool), SystemError> { + let FilePrivateData::Overlayfs(overlay_data) = &*data else { + return Err(SystemError::EBADF); + }; + let overlay_data = overlay_data.clone(); + drop(data); + + let inner = overlay_data.inner.lock(); + Ok((inner.backing_file.clone(), inner.backing_is_upper)) + } } impl IndexNode for OvlInode { + fn open( + &self, + mut data: crate::libs::mutex::MutexGuard, + flags: &FileFlags, + ) -> Result<(), SystemError> { + let overlay_data = self.open_backing_file(*flags)?; + *data = FilePrivateData::Overlayfs(overlay_data); + Ok(()) + } + + fn truncate_before_open(&self, _flags: &FileFlags) -> bool { + false + } + fn read_at( &self, offset: usize, @@ -412,28 +662,18 @@ impl IndexNode for OvlInode { buf: &mut [u8], data: crate::libs::mutex::MutexGuard, ) -> Result { - if let Some(ref upper_inode) = *self.upper_inode.lock() { - return upper_inode.read_at(offset, len, buf, data); - } - - let mut lower_inodes = self.lower_inodes.iter(); - if let Some(lower_inode) = lower_inodes.next() { - match lower_inode.read_at(offset, len, buf, data) { - Ok(read_len) => return Ok(read_len), - Err(mut err) => { - for lower_inode in lower_inodes { - let lock = Mutex::new(vfs::FilePrivateData::Unused); - match lower_inode.read_at(offset, len, buf, lock.lock()) { - Ok(read_len) => return Ok(read_len), - Err(next_err) => err = next_err, - } - } - return Err(err); 
- } - } + if self.file_type == FileType::SymLink { + drop(data); + let (backing_inode, _) = self.current_realdata_inode()?; + return backing_inode.read_at( + offset, + len, + buf, + crate::libs::mutex::Mutex::new(FilePrivateData::Unused).lock(), + ); } - - Err(SystemError::ENOENT) + let (backing_file, _) = self.backing_file_for_io(data)?; + backing_file.pread(offset, len, buf) } fn write_at( @@ -443,30 +683,21 @@ impl IndexNode for OvlInode { buf: &[u8], data: crate::libs::mutex::MutexGuard, ) -> Result { - if (*self.upper_inode.lock()).is_none() { - self.copy_up()?; - } - if let Some(ref upper_inode) = *self.upper_inode.lock() { - return upper_inode.write_at(offset, len, buf, data); - } - - Err(SystemError::EROFS) + let (backing_file, _) = self.backing_file_for_io(data)?; + backing_file.pwrite(offset, len, buf) } fn sync_file( &self, datasync: bool, - _data: crate::libs::mutex::MutexGuard, + data: crate::libs::mutex::MutexGuard, ) -> Result<(), SystemError> { - if let Some(ref upper_inode) = *self.upper_inode.lock() { - return upper_inode.sync_file(datasync, _data); - } - - if !self.lower_inodes.is_empty() { - return Ok(()); + let (backing_file, backing_is_upper) = self.backing_file_for_io(data)?; + if backing_is_upper { + backing_file.sync_range_and_check_wb_error(0, usize::MAX, datasync) + } else { + Ok(()) } - - Err(SystemError::ENOENT) } fn sync_file_range( @@ -476,15 +707,69 @@ impl IndexNode for OvlInode { datasync: bool, data: crate::libs::mutex::MutexGuard, ) -> Result<(), SystemError> { - if let Some(ref upper_inode) = *self.upper_inode.lock() { - return upper_inode.sync_file_range(start, end, datasync, data); + let (backing_file, backing_is_upper) = self.backing_file_for_io(data)?; + if backing_is_upper { + backing_file.sync_range_and_check_wb_error(start, end, datasync) + } else { + Ok(()) } + } - if !self.lower_inodes.is_empty() { - return Ok(()); + fn flush_file( + &self, + data: crate::libs::mutex::MutexGuard, + lock_owner: u64, + ) -> Result<(), 
SystemError> { + let (backing_file, _) = self.backing_file_for_io(data)?; + backing_file.flush_for_close(lock_owner) + } + + fn close( + &self, + mut data: crate::libs::mutex::MutexGuard, + ) -> Result<(), SystemError> { + let old = mem::replace(&mut *data, FilePrivateData::Unused); + drop(data); + if let FilePrivateData::Overlayfs(overlay_data) = old { + drop(overlay_data); } + Ok(()) + } - Err(SystemError::ENOENT) + fn check_mmap_file( + &self, + file: &Arc, + len: usize, + offset: usize, + vm_flags: VmFlags, + ) -> Result<(), SystemError> { + let (backing_file, _) = self.backing_file_for_io(file.private_data.lock())?; + backing_file + .inode() + .check_mmap_file(&backing_file, len, offset, vm_flags) + } + + fn mmap_effective_file(&self, file: &Arc) -> Result, SystemError> { + let (backing_file, _) = self.backing_file_for_io(file.private_data.lock())?; + Ok(backing_file) + } + + fn mmap_file( + &self, + file: &Arc, + start: usize, + len: usize, + offset: usize, + vm_flags: VmFlags, + ) -> Result<(), SystemError> { + let (backing_file, _) = self.backing_file_for_io(file.private_data.lock())?; + backing_file + .inode() + .mmap_file(&backing_file, start, len, offset, vm_flags) + } + + fn page_cache(&self) -> Option> { + None } fn fs(&self) -> Arc { @@ -509,7 +794,27 @@ impl IndexNode for OvlInode { } fn dname(&self) -> Result { - Ok(DName::from(self.redirect.clone())) + Ok(DName::from( + self.redirect + .rsplit('/') + .next() + .unwrap_or(&self.redirect) + .to_string(), + )) + } + + fn parent(&self) -> Result, SystemError> { + let fs = self.overlay_fs()?; + let Some(parent_redirect) = self.parent_redirect() else { + return Ok(fs.root_inode.clone()); + }; + + if parent_redirect.is_empty() { + return Ok(fs.root_inode.clone()); + } + + let root: Arc = fs.root_inode.clone(); + root.lookup(parent_redirect) } fn list(&self) -> Result, system_error::SystemError> { @@ -563,14 +868,14 @@ impl IndexNode for OvlInode { name: &str, mode: vfs::InodeMode, ) -> Result, 
system_error::SystemError> { - if let Some(ref upper_inode) = *self.upper_inode.lock() { - upper_inode.mkdir(name, mode) - } else { - Err(SystemError::EROFS) - } + let fs = self.overlay_fs()?; + let _mutation_guard = fs.mutation_lock.lock(); + self.create_over_whiteout(name, |dir, temp_name| dir.mkdir(temp_name, mode), true) } fn rmdir(&self, name: &str) -> Result<(), SystemError> { + let fs = self.overlay_fs()?; + let _mutation_guard = fs.mutation_lock.lock(); if let Some(ref upper_inode) = *self.upper_inode.lock() { match upper_inode.rmdir(name) { Ok(()) => return Ok(()), @@ -597,6 +902,8 @@ impl IndexNode for OvlInode { } fn unlink(&self, name: &str) -> Result<(), SystemError> { + let fs = self.overlay_fs()?; + let _mutation_guard = fs.mutation_lock.lock(); if let Some(ref upper_inode) = *self.upper_inode.lock() { match upper_inode.unlink(name) { Ok(()) => return Ok(()), @@ -624,11 +931,17 @@ impl IndexNode for OvlInode { name: &str, other: &Arc, ) -> Result<(), system_error::SystemError> { - if let Some(ref upper_inode) = *self.upper_inode.lock() { - upper_inode.link(name, other) - } else { - Err(SystemError::EROFS) - } + let fs = self.overlay_fs()?; + let _mutation_guard = fs.mutation_lock.lock(); + self.create_over_whiteout( + name, + |dir, temp_name| { + dir.link(temp_name, other)?; + dir.find(temp_name) + }, + false, + ) + .map(|_| ()) } fn create( @@ -637,9 +950,104 @@ impl IndexNode for OvlInode { file_type: vfs::FileType, mode: vfs::InodeMode, ) -> Result, system_error::SystemError> { - let upper_inode = self.upper_inode.lock().clone().ok_or(SystemError::EROFS)?; - self.remove_whiteout_if_present(name)?; - upper_inode.create(name, file_type, mode) + let fs = self.overlay_fs()?; + let _mutation_guard = fs.mutation_lock.lock(); + self.create_over_whiteout( + name, + |dir, temp_name| dir.create(temp_name, file_type, mode), + file_type == FileType::Dir, + ) + } + + fn move_to( + &self, + old_name: &str, + target: &Arc, + new_name: &str, + flags: RenameFlags, 
+ ) -> Result<(), SystemError> { + if flags.contains(RenameFlags::WHITEOUT) { + return Err(SystemError::EINVAL); + } + + let fs = self.overlay_fs()?; + let _mutation_guard = fs.mutation_lock.lock(); + + let target_ovl = target + .clone() + .downcast_arc::() + .ok_or(SystemError::EXDEV)?; + + let source = self.lookup_overlay_child(old_name)?; + let target_had_whiteout = target_ovl.has_whiteout(new_name); + let target_child = match target_ovl.lookup_overlay_child(new_name) { + Ok(inode) => Some(inode), + Err(SystemError::ENOENT) => None, + Err(err) => return Err(err), + }; + + if flags.contains(RenameFlags::NOREPLACE) && target_child.is_some() { + return Err(SystemError::EEXIST); + } + + if flags.contains(RenameFlags::EXCHANGE) { + let target_child = target_child.ok_or(SystemError::ENOENT)?; + if (source.is_dir() && source.has_lower()) + || (target_child.is_dir() && target_child.has_lower()) + { + return Err(SystemError::EXDEV); + } + + source.copy_up()?; + target_child.copy_up()?; + let old_upper_dir = self.writable_upper_inode()?; + let new_upper_dir = target_ovl.writable_upper_inode()?; + return old_upper_dir.move_to(old_name, &new_upper_dir, new_name, flags); + } + + if self.redirect == target_ovl.redirect && old_name == new_name { + return Ok(()); + } + + let source_needs_whiteout = source.has_lower(); + if source_needs_whiteout && source.is_dir() { + return Err(SystemError::EXDEV); + } + + if let Some(target_child) = target_child { + if source.is_dir() && !target_child.is_dir() { + return Err(SystemError::ENOTDIR); + } + if !source.is_dir() && target_child.is_dir() { + return Err(SystemError::EISDIR); + } + if source.is_dir() && target_child.is_dir() { + let target_node: Arc = target_child.clone(); + if !Self::is_dir_empty(&target_node)? 
{ + return Err(SystemError::ENOTEMPTY); + } + } + } + + if !source.is_pure_upper() { + source.copy_up()?; + } + + let old_upper_dir = self.writable_upper_inode()?; + let new_upper_dir = target_ovl.writable_upper_inode()?; + let mut upper_flags = flags; + if target_had_whiteout { + upper_flags.remove(RenameFlags::NOREPLACE); + if source.is_dir() { + old_upper_dir.move_to(old_name, &new_upper_dir, new_name, RenameFlags::EXCHANGE)?; + Self::cleanup_workdir_temp(&old_upper_dir, old_name); + return Ok(()); + } + } + if source_needs_whiteout { + upper_flags.insert(RenameFlags::WHITEOUT); + } + old_upper_dir.move_to(old_name, &new_upper_dir, new_name, upper_flags) } fn find(&self, name: &str) -> Result, system_error::SystemError> { @@ -724,11 +1132,16 @@ impl IndexNode for OvlInode { mode: vfs::InodeMode, dev_t: crate::driver::base::device::device_number::DeviceNumber, ) -> Result, system_error::SystemError> { - let upper_inode = self.upper_inode.lock(); - if let Some(ref inode) = *upper_inode { - inode.mknod(filename, mode, dev_t) - } else { - Err(SystemError::EROFS) + let fs = self.overlay_fs()?; + let _mutation_guard = fs.mutation_lock.lock(); + if FileType::from(mode) == FileType::CharDevice && dev_t == WHITEOUT_DEV { + return Err(SystemError::EPERM); } + + self.create_over_whiteout( + filename, + |dir, temp_name| dir.mknod(temp_name, mode, dev_t), + FileType::from(mode) == FileType::Dir, + ) } } diff --git a/kernel/src/filesystem/page_cache.rs b/kernel/src/filesystem/page_cache.rs index a1ee5cd5fe..a0084c4635 100644 --- a/kernel/src/filesystem/page_cache.rs +++ b/kernel/src/filesystem/page_cache.rs @@ -96,6 +96,17 @@ impl MmFilePageGroup { } } +/// Policy for zapping page-cache backed file mappings. +/// +/// This mirrors Linux's `unmap_mapping_pages(..., even_cows)`: cache invalidation +/// must preserve private COW data, while truncate must also drop COWed private +/// PTEs so future access faults against the new file size. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum UnmapMappingMode { + CacheOnly, + EvenCow, +} + lazy_static! { static ref PAGECACHE_IO_WQS: Vec> = { let mut wqs = Vec::new(); @@ -1583,6 +1594,31 @@ impl PageCache { &self, start_page_index: usize, end_page_index_exclusive: Option, + ) -> Result<(), SystemError> { + self.unmap_mapping_pages_with_mode( + start_page_index, + end_page_index_exclusive, + UnmapMappingMode::CacheOnly, + ) + } + + pub fn unmap_mapping_pages_even_cow( + &self, + start_page_index: usize, + end_page_index_exclusive: Option, + ) -> Result<(), SystemError> { + self.unmap_mapping_pages_with_mode( + start_page_index, + end_page_index_exclusive, + UnmapMappingMode::EvenCow, + ) + } + + fn unmap_mapping_pages_with_mode( + &self, + start_page_index: usize, + end_page_index_exclusive: Option, + mode: UnmapMappingMode, ) -> Result<(), SystemError> { loop { let (seq, snapshot) = @@ -1610,7 +1646,7 @@ impl PageCache { let _pt_edit = group.mm.page_table_edit(); let mut tlb = MmuGather::gather(&group.mm); for (vma, region) in group.ranges { - vma.unmap_range(region, &mm_guard.user_mapper.utable, &mut tlb); + vma.unmap_range(region, &mm_guard.user_mapper.utable, &mut tlb, mode); } tlb.finish(); } @@ -1630,7 +1666,7 @@ impl PageCache { fn truncate_locked(&self, new_size: usize) -> Result<(), SystemError> { let hole_start_page = page_align_up(new_size) >> MMArch::PAGE_SHIFT; - self.unmap_mapping_pages(hole_start_page, None)?; + self.unmap_mapping_pages_even_cow(hole_start_page, None)?; let first_full_truncate_page = page_align_up(new_size) >> MMArch::PAGE_SHIFT; let truncate_indices: Vec = { @@ -1714,7 +1750,7 @@ impl PageCache { } } - self.unmap_mapping_pages(hole_start_page, None)?; + self.unmap_mapping_pages_even_cow(hole_start_page, None)?; Ok(()) } diff --git a/kernel/src/filesystem/procfs/pid/maps.rs b/kernel/src/filesystem/procfs/pid/maps.rs index 2c849df24a..403490105f 100644 --- a/kernel/src/filesystem/procfs/pid/maps.rs +++ 
b/kernel/src/filesystem/procfs/pid/maps.rs @@ -119,7 +119,7 @@ fn generate_maps_content(target: &ProcPidTarget) -> Result, SystemError> .absolute_path() .unwrap_or_default(); - let as_guard = vm.read(); + let as_guard = vm.read_guard_no_reservations(); // 收集并按地址排序 let mut vmas: Vec> = as_guard.mappings.iter_vmas().cloned().collect(); diff --git a/kernel/src/filesystem/procfs/pid/stat.rs b/kernel/src/filesystem/procfs/pid/stat.rs index 0d9c60accf..ff84382946 100644 --- a/kernel/src/filesystem/procfs/pid/stat.rs +++ b/kernel/src/filesystem/procfs/pid/stat.rs @@ -153,7 +153,7 @@ impl FileOps for StatFileOps { .unwrap_or(1); let (vsize_bytes, rss_pages) = user_vm .map(|vm| { - let guard = vm.read(); + let guard = vm.read_guard_no_reservations(); let bytes = guard.vma_usage_bytes(); let pages = (bytes.saturating_add(MMArch::PAGE_SIZE - 1)) >> MMArch::PAGE_SHIFT; (bytes as u64, pages as u64) diff --git a/kernel/src/filesystem/procfs/pid/statm.rs b/kernel/src/filesystem/procfs/pid/statm.rs index 3623ae85c3..119152d61f 100644 --- a/kernel/src/filesystem/procfs/pid/statm.rs +++ b/kernel/src/filesystem/procfs/pid/statm.rs @@ -57,7 +57,7 @@ impl FileOps for StatmFileOps { // 获取进程内存信息(简化实现) let size_pages = user_vm .map(|vm| { - let guard = vm.read(); + let guard = vm.read_guard_no_reservations(); // statm 第一列为总虚拟内存页数 (guard .vma_usage_bytes() diff --git a/kernel/src/filesystem/ramfs/mod.rs b/kernel/src/filesystem/ramfs/mod.rs index c39fe3d1ff..7192a00f24 100644 --- a/kernel/src/filesystem/ramfs/mod.rs +++ b/kernel/src/filesystem/ramfs/mod.rs @@ -7,7 +7,7 @@ use crate::libs::rwsem::RwSem; use crate::register_mountable_fs; use crate::{ arch::MMArch, - driver::base::device::device_number::DeviceNumber, + driver::base::device::device_number::{DeviceNumber, Major}, filesystem::vfs::{vcore::generate_inode_id, FileType}, ipc::pipe::LockedPipeInode, libs::casting::DowncastArc, @@ -37,6 +37,157 @@ use super::vfs::{Magic, MountableFileSystem, SuperBlock}; /// RamFS的inode名称的最大长度 const 
RAMFS_MAX_NAMELEN: usize = 64; const RAMFS_BLOCK_SIZE: u64 = 512; +const WHITEOUT_DEV: DeviceNumber = DeviceNumber::new(Major::UNNAMED_MAJOR, 0); + +fn ramfs_move_entry_between_dirs( + src_dir: &mut RamFSInode, + dst_dir: &mut RamFSInode, + old_key: &DName, + new_key: &DName, + flags: RenameFlags, +) -> Result<(), SystemError> { + if src_dir.metadata.file_type != FileType::Dir || dst_dir.metadata.file_type != FileType::Dir { + return Err(SystemError::ENOTDIR); + } + + let src_self = src_dir.self_ref.upgrade().ok_or(SystemError::EIO)?; + let dst_self = dst_dir.self_ref.upgrade().ok_or(SystemError::EIO)?; + let inode_to_move = src_dir + .children + .get(old_key) + .cloned() + .ok_or(SystemError::ENOENT)?; + let old_type = inode_to_move.0.lock().metadata.file_type; + + if flags.contains(RenameFlags::EXCHANGE) { + let existing = dst_dir + .children + .get(new_key) + .cloned() + .ok_or(SystemError::ENOENT)?; + if Arc::ptr_eq(&inode_to_move, &existing) { + return Ok(()); + } + let existing_type = existing.0.lock().metadata.file_type; + + src_dir.children.insert(old_key.clone(), existing.clone()); + dst_dir + .children + .insert(new_key.clone(), inode_to_move.clone()); + if old_type == FileType::Dir { + src_dir.metadata.nlinks = src_dir.metadata.nlinks.saturating_sub(1); + dst_dir.metadata.nlinks = dst_dir.metadata.nlinks.saturating_add(1); + } + if existing_type == FileType::Dir { + dst_dir.metadata.nlinks = dst_dir.metadata.nlinks.saturating_sub(1); + src_dir.metadata.nlinks = src_dir.metadata.nlinks.saturating_add(1); + } + + { + let mut moved = inode_to_move.0.lock(); + moved.parent = Arc::downgrade(&dst_self); + moved.name = new_key.clone(); + } + { + let mut replaced = existing.0.lock(); + replaced.parent = Arc::downgrade(&src_self); + replaced.name = old_key.clone(); + } + return Ok(()); + } + + if let Some(existing) = dst_dir.children.get(new_key).cloned() { + if flags.contains(RenameFlags::NOREPLACE) { + return Err(SystemError::EEXIST); + } + + let (existing_id, 
existing_type, existing_dir_nonempty) = { + let guard = existing.0.lock(); + let t = guard.metadata.file_type; + let nonempty = t == FileType::Dir && !guard.children.is_empty(); + (guard.metadata.inode_id, t, nonempty) + }; + let to_move_id = inode_to_move.0.lock().metadata.inode_id; + if existing_id == to_move_id { + src_dir.children.remove(old_key); + return Ok(()); + } + + if old_type == FileType::Dir && existing_type != FileType::Dir { + return Err(SystemError::ENOTDIR); + } + if old_type != FileType::Dir && existing_type == FileType::Dir { + return Err(SystemError::EISDIR); + } + if old_type == FileType::Dir && existing_dir_nonempty { + return Err(SystemError::ENOTEMPTY); + } + + dst_dir.children.remove(new_key); + let mut existing_guard = existing.0.lock(); + if existing_type == FileType::Dir { + dst_dir.metadata.nlinks = dst_dir.metadata.nlinks.saturating_sub(1); + existing_guard.metadata.nlinks = 0; + } else { + existing_guard.metadata.nlinks = existing_guard.metadata.nlinks.saturating_sub(1); + } + } + + src_dir.children.remove(old_key); + if flags.contains(RenameFlags::WHITEOUT) { + ramfs_insert_whiteout(src_dir, old_key)?; + } + if old_type == FileType::Dir { + src_dir.metadata.nlinks = src_dir.metadata.nlinks.saturating_sub(1); + dst_dir.metadata.nlinks = dst_dir.metadata.nlinks.saturating_add(1); + } + dst_dir + .children + .insert(new_key.clone(), inode_to_move.clone()); + + let mut moved = inode_to_move.0.lock(); + moved.parent = Arc::downgrade(&dst_self); + moved.name = new_key.clone(); + Ok(()) +} + +fn ramfs_insert_whiteout(dir: &mut RamFSInode, name: &DName) -> Result<(), SystemError> { + if dir.children.contains_key(name) { + return Err(SystemError::EEXIST); + } + + let whiteout = Arc::new(LockedRamFSInode(Mutex::new(RamFSInode { + parent: dir.self_ref.clone(), + self_ref: Weak::default(), + children: BTreeMap::new(), + data: Vec::new(), + metadata: Metadata { + dev_id: 0, + inode_id: generate_inode_id(), + size: 0, + blk_size: 0, + blocks: 0, + 
atime: PosixTimeSpec::default(), + mtime: PosixTimeSpec::default(), + ctime: PosixTimeSpec::default(), + btime: PosixTimeSpec::default(), + file_type: FileType::CharDevice, + mode: InodeMode::S_IFCHR | InodeMode::from_bits_truncate(0o600), + nlinks: 1, + uid: 0, + gid: 0, + raw_dev: WHITEOUT_DEV, + flags: InodeFlags::empty(), + }, + fs: dir.fs.clone(), + special_node: None, + name: name.clone(), + }))); + whiteout.0.lock().self_ref = Arc::downgrade(&whiteout); + dir.children.insert(name.clone(), whiteout); + Ok(()) +} + /// @brief 内存文件系统的Inode结构体 #[derive(Debug)] pub struct LockedRamFSInode(pub Mutex); @@ -523,49 +674,95 @@ impl IndexNode for LockedRamFSInode { new_name: &str, flags: RenameFlags, ) -> Result<(), SystemError> { - let inode_to_move = self - .find(old_name)? + let old_key = DName::from(old_name); + let new_name = DName::from(new_name); + let target_locked = target + .clone() .downcast_arc::() .ok_or(SystemError::EINVAL)?; - let new_name = DName::from(new_name); + let self_id = self.0.lock().metadata.inode_id; + let target_id = target_locked.0.lock().metadata.inode_id; - inode_to_move.0.lock().name = new_name.clone(); + if self_id == target_id { + let mut dir = self.0.lock(); + let inode_to_move = dir + .children + .get(&old_key) + .cloned() + .ok_or(SystemError::ENOENT)?; + let old_type = inode_to_move.0.lock().metadata.file_type; - let target_id = target.metadata()?.inode_id; + if flags.contains(RenameFlags::EXCHANGE) { + let existing = dir + .children + .get(&new_name) + .cloned() + .ok_or(SystemError::ENOENT)?; + let to_move_id = inode_to_move.0.lock().metadata.inode_id; + let existing_id = existing.0.lock().metadata.inode_id; + if existing_id == to_move_id { + return Ok(()); + } - let mut self_inode = self.0.lock(); - // 判断是否在同一目录下, 是则进行重命名 - if target_id == self_inode.metadata.inode_id { - if flags.contains(RenameFlags::NOREPLACE) && self_inode.children.contains_key(&new_name) - { - return Err(SystemError::EEXIST); + 
dir.children.insert(old_key.clone(), existing.clone()); + dir.children.insert(new_name.clone(), inode_to_move.clone()); + existing.0.lock().name = old_key; + inode_to_move.0.lock().name = new_name; + return Ok(()); } - self_inode.children.remove(&DName::from(old_name)); - self_inode.children.insert(new_name, inode_to_move); - return Ok(()); - } - drop(self_inode); - // 修改其对父节点的引用 - inode_to_move.0.lock().parent = Arc::downgrade( - &target - .clone() - .downcast_arc::() - .ok_or(SystemError::EINVAL)?, - ); + if let Some(existing) = dir.children.get(&new_name).cloned() { + if flags.contains(RenameFlags::NOREPLACE) { + return Err(SystemError::EEXIST); + } + + let existing_id = existing.0.lock().metadata.inode_id; + let to_move_id = inode_to_move.0.lock().metadata.inode_id; + if existing_id == to_move_id { + return Ok(()); + } - // 在新的目录下创建一个硬链接 - target.link(new_name.as_ref(), &(inode_to_move as Arc))?; + let existing_type = existing.0.lock().metadata.file_type; + if old_type == FileType::Dir && existing_type != FileType::Dir { + return Err(SystemError::ENOTDIR); + } + if old_type != FileType::Dir && existing_type == FileType::Dir { + return Err(SystemError::EISDIR); + } + if old_type == FileType::Dir && !existing.0.lock().children.is_empty() { + return Err(SystemError::ENOTEMPTY); + } - // 取消现有的目录下的这个硬链接 - if let Err(e) = self.unlink(old_name) { - // 当操作失败时回退操作 - target.unlink(new_name.as_ref())?; - return Err(e); + dir.children.remove(&new_name); + let mut existing_guard = existing.0.lock(); + if existing_type == FileType::Dir { + dir.metadata.nlinks = dir.metadata.nlinks.saturating_sub(1); + existing_guard.metadata.nlinks = 0; + } else { + existing_guard.metadata.nlinks = + existing_guard.metadata.nlinks.saturating_sub(1); + } + } + + dir.children.remove(&old_key); + if flags.contains(RenameFlags::WHITEOUT) { + ramfs_insert_whiteout(&mut dir, &old_key)?; + } + dir.children.insert(new_name.clone(), inode_to_move.clone()); + inode_to_move.0.lock().name = new_name; + 
return Ok(()); } - return Ok(()); + if self_id < target_id { + let mut src_dir = self.0.lock(); + let mut dst_dir = target_locked.0.lock(); + ramfs_move_entry_between_dirs(&mut src_dir, &mut dst_dir, &old_key, &new_name, flags) + } else { + let mut dst_dir = target_locked.0.lock(); + let mut src_dir = self.0.lock(); + ramfs_move_entry_between_dirs(&mut src_dir, &mut dst_dir, &old_key, &new_name, flags) + } } fn find(&self, name: &str) -> Result, SystemError> { diff --git a/kernel/src/filesystem/tmpfs/mod.rs b/kernel/src/filesystem/tmpfs/mod.rs index ce48a4ed0c..522f9eeeaa 100644 --- a/kernel/src/filesystem/tmpfs/mod.rs +++ b/kernel/src/filesystem/tmpfs/mod.rs @@ -13,7 +13,7 @@ use crate::register_mountable_fs; use crate::{ arch::mm::LockedFrameAllocator, arch::MMArch, - driver::base::device::device_number::DeviceNumber, + driver::base::device::device_number::{DeviceNumber, Major}, filesystem::vfs::{vcore::generate_inode_id, FileType}, ipc::pipe::LockedPipeInode, libs::casting::DowncastArc, @@ -45,6 +45,7 @@ const TMPFS_BLOCK_SIZE: u64 = 4096; const TMPFS_DEFAULT_MIN_SIZE_BYTES: usize = 16 * 1024 * 1024; // 16MiB const TMPFS_DEFAULT_MAX_SIZE_BYTES: usize = 4 * 1024 * 1024 * 1024; // 4GiB +const WHITEOUT_DEV: DeviceNumber = DeviceNumber::new(Major::UNNAMED_MAJOR, 0); #[derive(Debug)] struct TmpfsPageCacheBackend { @@ -106,18 +107,55 @@ fn tmpfs_move_entry_between_dirs( .ok_or(SystemError::ENOENT)?; let old_type = inode_to_move.0.lock().metadata.file_type; - if let Some(existing) = dst_dir.children.get(new_key) { + if flags.contains(RenameFlags::EXCHANGE) { + let existing = dst_dir + .children + .get(new_key) + .cloned() + .ok_or(SystemError::ENOENT)?; + if Arc::ptr_eq(&inode_to_move, &existing) { + return Ok(()); + } + let existing_type = existing.0.lock().metadata.file_type; + + src_dir.children.insert(old_key.clone(), existing.clone()); + dst_dir + .children + .insert(new_key.clone(), inode_to_move.clone()); + if old_type == FileType::Dir { + 
src_dir.metadata.nlinks = src_dir.metadata.nlinks.saturating_sub(1); + dst_dir.metadata.nlinks = dst_dir.metadata.nlinks.saturating_add(1); + } + if existing_type == FileType::Dir { + dst_dir.metadata.nlinks = dst_dir.metadata.nlinks.saturating_sub(1); + src_dir.metadata.nlinks = src_dir.metadata.nlinks.saturating_add(1); + } + + { + let mut moved = inode_to_move.0.lock(); + moved.parent = Arc::downgrade(&dst_self); + moved.name = new_key.clone(); + } + { + let mut replaced = existing.0.lock(); + replaced.parent = Arc::downgrade(&src_self); + replaced.name = old_key.clone(); + } + return Ok(()); + } + + if let Some(existing) = dst_dir.children.get(new_key).cloned() { if flags.contains(RenameFlags::NOREPLACE) { return Err(SystemError::EEXIST); } // Avoid self-deadlock: `existing` may be `src_dir`/`dst_dir` itself. - if Arc::ptr_eq(existing, &src_self) { + if Arc::ptr_eq(&existing, &src_self) { // Example: rename("dir/subdir", "dir") -> ENOTEMPTY (dir not empty). // Linux expects ENOTEMPTY for this case (TargetIsAncestorOfSource). return Err(SystemError::ENOTEMPTY); } - if Arc::ptr_eq(existing, &dst_self) { + if Arc::ptr_eq(&existing, &dst_self) { // Shouldn't happen in normal tmpfs (no self entry), but treat as busy. return Err(SystemError::EBUSY); } @@ -137,27 +175,32 @@ fn tmpfs_move_entry_between_dirs( return Ok(()); } - if old_type != existing_type { - return Err(if old_type == FileType::Dir { - SystemError::ENOTDIR - } else { - SystemError::EISDIR - }); + if old_type == FileType::Dir && existing_type != FileType::Dir { + return Err(SystemError::ENOTDIR); + } + if old_type != FileType::Dir && existing_type == FileType::Dir { + return Err(SystemError::EISDIR); } - if old_type == FileType::Dir && existing_dir_nonempty { return Err(SystemError::ENOTEMPTY); } // Remove existing destination entry (replacement). 
dst_dir.children.remove(new_key); - if old_type == FileType::Dir { + let mut existing_guard = existing.0.lock(); + if existing_type == FileType::Dir { dst_dir.metadata.nlinks = dst_dir.metadata.nlinks.saturating_sub(1); + existing_guard.metadata.nlinks = 0; + } else { + existing_guard.metadata.nlinks = existing_guard.metadata.nlinks.saturating_sub(1); } } // Remove from source directory. src_dir.children.remove(old_key); + if flags.contains(RenameFlags::WHITEOUT) { + tmpfs_insert_whiteout(src_dir, old_key)?; + } if old_type == FileType::Dir { src_dir.metadata.nlinks = src_dir.metadata.nlinks.saturating_sub(1); dst_dir.metadata.nlinks = dst_dir.metadata.nlinks.saturating_add(1); @@ -174,6 +217,43 @@ fn tmpfs_move_entry_between_dirs( Ok(()) } +fn tmpfs_insert_whiteout(dir: &mut TmpfsInode, name: &DName) -> Result<(), SystemError> { + if dir.children.contains_key(name) { + return Err(SystemError::EEXIST); + } + + let whiteout = Arc::new(LockedTmpfsInode(Mutex::new(TmpfsInode { + parent: dir.self_ref.clone(), + self_ref: Weak::default(), + children: BTreeMap::new(), + page_cache: None, + metadata: Metadata { + dev_id: 0, + inode_id: generate_inode_id(), + size: 0, + blk_size: 0, + blocks: 0, + atime: PosixTimeSpec::default(), + mtime: PosixTimeSpec::default(), + ctime: PosixTimeSpec::default(), + btime: PosixTimeSpec::default(), + file_type: FileType::CharDevice, + mode: InodeMode::S_IFCHR | InodeMode::from_bits_truncate(0o600), + nlinks: 1, + uid: 0, + gid: 0, + raw_dev: WHITEOUT_DEV, + flags: InodeFlags::empty(), + }, + fs: dir.fs.clone(), + special_node: None, + name: name.clone(), + }))); + whiteout.0.lock().self_ref = Arc::downgrade(&whiteout); + dir.children.insert(name.clone(), whiteout); + Ok(()) +} + #[derive(Debug)] pub struct LockedTmpfsInode(pub Mutex); @@ -1019,6 +1099,9 @@ impl IndexNode for LockedTmpfsInode { if Arc::ptr_eq(&(self.0.lock().self_ref.upgrade().unwrap()), &target_locked) && old_key == new_key { + if flags.contains(RenameFlags::NOREPLACE) { 
+ return Err(SystemError::EEXIST); + } return Ok(()); } @@ -1036,7 +1119,26 @@ impl IndexNode for LockedTmpfsInode { .ok_or(SystemError::ENOENT)?; let old_type = inode_to_move.0.lock().metadata.file_type; - if let Some(existing) = dir.children.get(&new_key) { + if flags.contains(RenameFlags::EXCHANGE) { + let existing = dir + .children + .get(&new_key) + .cloned() + .ok_or(SystemError::ENOENT)?; + let to_move_id = inode_to_move.0.lock().metadata.inode_id; + let existing_id = existing.0.lock().metadata.inode_id; + if existing_id == to_move_id { + return Ok(()); + } + + dir.children.insert(old_key.clone(), existing.clone()); + dir.children.insert(new_key.clone(), inode_to_move.clone()); + existing.0.lock().name = old_key; + inode_to_move.0.lock().name = new_key; + return Ok(()); + } + + if let Some(existing) = dir.children.get(&new_key).cloned() { if flags.contains(RenameFlags::NOREPLACE) { return Err(SystemError::EEXIST); } @@ -1049,12 +1151,11 @@ impl IndexNode for LockedTmpfsInode { } let existing_type = existing.0.lock().metadata.file_type; - if old_type != existing_type { - return Err(if old_type == FileType::Dir { - SystemError::ENOTDIR - } else { - SystemError::EISDIR - }); + if old_type == FileType::Dir && existing_type != FileType::Dir { + return Err(SystemError::ENOTDIR); + } + if old_type != FileType::Dir && existing_type == FileType::Dir { + return Err(SystemError::EISDIR); } if old_type == FileType::Dir && !existing.0.lock().children.is_empty() { @@ -1063,10 +1164,21 @@ impl IndexNode for LockedTmpfsInode { // Remove existing destination entry (replacement). dir.children.remove(&new_key); + let mut existing_guard = existing.0.lock(); + if existing_type == FileType::Dir { + dir.metadata.nlinks = dir.metadata.nlinks.saturating_sub(1); + existing_guard.metadata.nlinks = 0; + } else { + existing_guard.metadata.nlinks = + existing_guard.metadata.nlinks.saturating_sub(1); + } } // Move entry within the same directory. 
dir.children.remove(&old_key); + if flags.contains(RenameFlags::WHITEOUT) { + tmpfs_insert_whiteout(&mut dir, &old_key)?; + } dir.children.insert(new_key.clone(), inode_to_move.clone()); inode_to_move.0.lock().name = new_key; return Ok(()); diff --git a/kernel/src/filesystem/vfs/file.rs b/kernel/src/filesystem/vfs/file.rs index 36ec5b6a1c..984bfcfa3c 100644 --- a/kernel/src/filesystem/vfs/file.rs +++ b/kernel/src/filesystem/vfs/file.rs @@ -21,11 +21,15 @@ use crate::{ }, filesystem::{ devfs::{devfs_lookup_device_by_devnum, LockedDevFSInode}, - epoll::{event_poll::EPollPrivateData, EPollItem}, + epoll::{ + event_poll::{EPollPrivateData, EventPoll, LockedEPItemLinkedList}, + EPollItem, + }, ext4::inode::LockedExt4Inode, fat::fs::LockedFATInode, fuse::private_data::FuseFilePrivateData, kernfs::callback::KernFilePrivateData, + overlayfs::OverlayFilePrivateData, page_cache::PageCache, procfs::ProcfsFilePrivateData, ramfs::LockedRamFSInode, @@ -232,6 +236,8 @@ pub enum FilePrivateData { SocketCreate, /// FUSE file private data. Fuse(FuseFilePrivateData), + /// OverlayFS per-open backing file private data. + Overlayfs(OverlayFilePrivateData), /// kernfs/debugfs per-open callback state. Kernfs(Option), /// 不需要文件私有信息 @@ -245,7 +251,7 @@ impl Default for FilePrivateData { } impl FilePrivateData { - pub fn update_flags(&mut self, flags: FileFlags) { + pub fn update_flags(&mut self, flags: FileFlags) -> Result<(), SystemError> { match self { FilePrivateData::Pipefs(pdata) => { pdata.set_flags(flags); @@ -256,8 +262,12 @@ impl FilePrivateData { FilePrivateData::Fuse(pdata) => { pdata.set_flags(flags); } + FilePrivateData::Overlayfs(pdata) => { + pdata.set_flags(flags)?; + } _ => {} } + Ok(()) } pub fn is_pid(&self) -> bool { @@ -525,6 +535,12 @@ pub struct File { wb_error_seq: Mutex, /// 当前 open file description 已观测到的 superblock 写回错误序列。 sb_error_seq: Mutex, + /// epoll items that reference this open file description. 
+ /// + /// Linux removes these from their owning epoll instances during `__fput()` + /// via `eventpoll_release(file)`. DragonOS keeps the same lifetime edge + /// here so fd numbers can be safely reused after close. + epitems: Arc, } impl File { @@ -831,6 +847,7 @@ impl File { ra_state: Mutex::new(FileReadaheadState::new()), wb_error_seq: Mutex::new(wb_error_seq), sb_error_seq: Mutex::new(sb_error_seq), + epitems: Arc::new(LockedEPItemLinkedList::default()), }; return Ok(f); @@ -1362,6 +1379,7 @@ impl File { ra_state: Mutex::new(self.ra_state.lock().clone()), wb_error_seq: Mutex::new(*self.wb_error_seq.lock()), sb_error_seq: Mutex::new(*self.sb_error_seq.lock()), + epitems: Arc::new(LockedEPItemLinkedList::default()), }; // 调用inode的open方法,让inode知道有新的文件打开了这个inode // TODO: reopen is not a good idea for some inodes, need a better design @@ -1463,9 +1481,9 @@ impl File { let new_bits = (new_flags.bits() & SETFL_MASK) | (old_flags.bits() & !SETFL_MASK); new_flags = FileFlags::from_bits_truncate(new_bits); + self.private_data.lock().update_flags(new_flags)?; // 更新文件的打开模式 *self.flags.write() = new_flags; - self.private_data.lock().update_flags(new_flags); return Ok(()); } @@ -1517,15 +1535,22 @@ impl File { let private_data = self.private_data.lock(); self.inode .as_pollable_inode()? - .add_epitem(epitem, &private_data) + .add_epitem(epitem.clone(), &private_data)?; + self.epitems.lock_irqsave().push_back(epitem); + Ok(()) } /// Remove epitems associated with the epoll pub fn remove_epitem(&self, epitem: &Arc) -> Result<(), SystemError> { let private_data = self.private_data.lock(); - self.inode + let result = self + .inode .as_pollable_inode()? 
- .remove_epitem(epitem, &private_data) + .remove_epitem(epitem, &private_data); + self.epitems + .lock_irqsave() + .retain(|x| !Arc::ptr_eq(x, epitem)); + result } /// Poll the file for events @@ -1607,6 +1632,17 @@ impl File { impl Drop for File { fn drop(&mut self) { + let epitems = { + let mut guard = self.epitems.lock_irqsave(); + let snapshot = guard.iter().cloned().collect::>(); + guard.clear(); + snapshot + }; + for epitem in epitems { + EventPoll::release_file_epitem(&epitem); + let _ = self.remove_epitem(&epitem); + } + super::flock::release_all_for_file(self); if self.mode.read().contains(FileMode::FMODE_WRITER) { if let Some(mnt_inode) = self.inode.clone().downcast_arc::() { diff --git a/kernel/src/filesystem/vfs/mod.rs b/kernel/src/filesystem/vfs/mod.rs index 738aa20801..f56ec19f61 100644 --- a/kernel/src/filesystem/vfs/mod.rs +++ b/kernel/src/filesystem/vfs/mod.rs @@ -389,6 +389,10 @@ pub trait IndexNode: Any + Sync + Send + Debug + CastFromSync { Ok(()) } + fn mmap_effective_file(&self, file: &Arc) -> Result, SystemError> { + Ok(file.clone()) + } + fn mmap_file( &self, _file: &Arc, @@ -417,8 +421,7 @@ pub trait IndexNode: Any + Sync + Send + Debug + CastFromSync { _data: MutexGuard, _flags: &FileFlags, ) -> Result<(), SystemError> { - // 若文件系统没有实现此方法,则返回"不支持" - return Err(SystemError::ENOSYS); + Ok(()) } /// Adjust per-open file mode bits after `open()` initialized private data. 
@@ -1043,7 +1046,12 @@ pub trait IndexNode: Any + Sync + Send + Debug + CastFromSync { /// /// @return 成功:Ok(0) /// 失败:Err(错误码) - fn setxattr(&self, _name: &str, _value: &[u8]) -> Result { + fn setxattr( + &self, + _name: &str, + _value: &[u8], + _flags: XattrFlags, + ) -> Result { log::warn!( "setxattr not implemented for {}", crate::libs::name::get_type_name(&self) @@ -1051,6 +1059,24 @@ pub trait IndexNode: Any + Sync + Send + Debug + CastFromSync { return Err(SystemError::ENOSYS); } + /// @brief 列出扩展属性名,返回实际列表长度。 + fn listxattr(&self, _buf: &mut [u8]) -> Result { + log::warn!( + "listxattr not implemented for {}", + crate::libs::name::get_type_name(&self) + ); + return Err(SystemError::ENOSYS); + } + + /// @brief 删除指定扩展属性。 + fn removexattr(&self, _name: &str) -> Result { + log::warn!( + "removexattr not implemented for {}", + crate::libs::name::get_type_name(&self) + ); + return Err(SystemError::ENOSYS); + } + /// # 将当前Inode转换为 Socket 引用 /// /// # 返回值 @@ -1444,6 +1470,8 @@ bitflags! { const PROC_MAGIC = 0x9fa0; const RAMFS_MAGIC = 0x858458f6; const DEVPTS_MAGIC = 0x1cd1; + const DEBUGFS_MAGIC = 0x64626720; + const MQUEUE_MAGIC = 0x19800202; const MOUNT_MAGIC = 61267; const PIPEFS_MAGIC = 0x50495045; const EVENTFD_MAGIC = 0x45564446; // "EVDF" in ASCII @@ -1469,6 +1497,14 @@ pub enum WritebackSyncMode { All, } +bitflags! { + /// Flags controlling Linux extended attribute set semantics. 
+ pub struct XattrFlags: i32 { + const CREATE = 0x1; + const REPLACE = 0x2; + } +} + #[derive(Debug, Clone, Copy)] pub struct WritebackControl { pub sync_mode: WritebackSyncMode, diff --git a/kernel/src/filesystem/vfs/mount/mod.rs b/kernel/src/filesystem/vfs/mount/mod.rs index 6c6dc8c4ed..5b1f12a21d 100644 --- a/kernel/src/filesystem/vfs/mount/mod.rs +++ b/kernel/src/filesystem/vfs/mount/mod.rs @@ -2,7 +2,7 @@ use super::{ file::{FileFlags, FileMode}, utils::DName, FilePrivateData, FileSystem, FileType, IndexNode, InodeId, InodeMode, PollableInode, - SuperBlock, + SuperBlock, XattrFlags, }; use crate::{ driver::base::device::device_number::{DeviceNumber, Major}, @@ -682,16 +682,12 @@ impl MountFS { all_descendants.reverse(); for child_mfs in &all_descendants { - if let Some(path) = mntns.mount_list().get_mount_path_by_mountfs(child_mfs) { - mntns.remove_mount(path.as_str()); - } + mntns.remove_mount_exact(child_mfs); let _ = child_mfs.umount(); } // 3. Finally unmount the root mount itself - if let Some(path) = mntns.mount_list().get_mount_path_by_mountfs(root) { - mntns.remove_mount(path.as_str()); - } + mntns.remove_mount_exact(root); let _ = root.umount(); } @@ -1196,10 +1192,10 @@ impl MountFSInode { loop { // Reached the current namespace root: stop. if current.is_mountpoint_root()? 
- && current - .mount_fs - .namespace() - .is_some_and(|ns| Arc::ptr_eq(¤t.mount_fs, ns.root_mntfs())) + && current.mount_fs.namespace().is_some_and(|ns| { + let ns_root = ns.root_mntfs(); + Arc::ptr_eq(¤t.mount_fs, &ns_root) + }) { break; } @@ -1308,6 +1304,13 @@ impl IndexNode for MountFSInode { .check_mmap_file(file, len, offset, vm_flags) } + fn mmap_effective_file( + &self, + file: &Arc, + ) -> Result, SystemError> { + self.inner_inode.mmap_effective_file(file) + } + fn mmap_file( &self, file: &Arc, @@ -1796,9 +1799,18 @@ impl IndexNode for MountFSInode { self.inner_inode.getxattr(name, buf) } - fn setxattr(&self, name: &str, value: &[u8]) -> Result { + fn setxattr(&self, name: &str, value: &[u8], flags: XattrFlags) -> Result { + self.ensure_mount_writable()?; + self.inner_inode.setxattr(name, value, flags) + } + + fn listxattr(&self, buf: &mut [u8]) -> Result { + self.inner_inode.listxattr(buf) + } + + fn removexattr(&self, name: &str) -> Result { self.ensure_mount_writable()?; - self.inner_inode.setxattr(name, value) + self.inner_inode.removexattr(name) } } @@ -2084,6 +2096,45 @@ impl MountList { None } + /// Remove a specific mount record by MountFS identity. + /// + /// The mount path is resolved from `mfs2mp` while holding the MountList write + /// lock, so callers that already hold the target `MountFS` do not lose object + /// identity by round-tripping through a path and popping the current top mount. 
+ #[inline(never)] + pub fn remove_exact(&self, fs: &Arc) -> Option> { + let mut inner = self.inner.write(); + let path = inner.mfs2mp.get(fs).cloned()?; + let Some(mut stack) = inner.mounts.remove(&path) else { + inner.mfs2mp.remove(fs); + inner.mfs2ino.remove(fs); + return None; + }; + + clear_mount_list_stack_indexes(&mut inner, &path, &stack); + let pos = stack.iter().rposition(|rec| Arc::ptr_eq(&rec.fs, fs)); + let removed = match pos { + Some(pos) => stack.remove(pos), + None => { + inner.mfs2mp.remove(fs); + inner.mfs2ino.remove(fs); + reindex_mount_list_stack(&mut inner, &path, &stack); + if !stack.is_empty() { + inner.mounts.insert(path, stack); + } + return None; + } + }; + let removed_fs = removed.fs.clone(); + + reindex_mount_list_stack(&mut inner, &path, &stack); + if !stack.is_empty() { + inner.mounts.insert(path, stack); + } + + Some(removed_fs) + } + pub fn rewrite_paths(&self, mut rewrite: F) where F: FnMut(&str) -> Option, @@ -2163,6 +2214,40 @@ impl MountList { } } +fn clear_mount_list_stack_indexes( + inner: &mut InnerMountList, + path: &Arc, + stack: &[MountRecord], +) { + for rec in stack { + inner.mfs2mp.remove(&rec.fs); + if let Some(ino) = rec.ino { + inner.mfs2ino.remove(&rec.fs); + if inner + .ino2mp + .get(&ino) + .is_some_and(|mapped| Arc::ptr_eq(mapped, path)) + { + inner.ino2mp.remove(&ino); + } + } + } +} + +fn reindex_mount_list_stack( + inner: &mut InnerMountList, + path: &Arc, + stack: &[MountRecord], +) { + for rec in stack { + inner.mfs2mp.insert(rec.fs.clone(), path.clone()); + if let Some(ino) = rec.ino { + inner.mfs2ino.insert(rec.fs.clone(), ino); + inner.ino2mp.insert(ino, path.clone()); + } + } +} + impl Debug for MountList { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let inner = self.inner.read(); diff --git a/kernel/src/filesystem/vfs/syscall/mod.rs b/kernel/src/filesystem/vfs/syscall/mod.rs index f9ea5c9056..076f564076 100644 --- a/kernel/src/filesystem/vfs/syscall/mod.rs +++ 
b/kernel/src/filesystem/vfs/syscall/mod.rs @@ -130,10 +130,16 @@ mod sys_unlink; mod sys_utimes; mod sys_fgetxattr; +mod sys_flistxattr; +mod sys_fremovexattr; mod sys_fsetxattr; mod sys_getxattr; mod sys_lgetxattr; +mod sys_listxattr; +mod sys_llistxattr; +mod sys_lremovexattr; mod sys_lsetxattr; +mod sys_removexattr; mod sys_setxattr; mod xattr_utils; @@ -142,10 +148,6 @@ pub const SEEK_CUR: u32 = 1; pub const SEEK_END: u32 = 2; pub const SEEK_MAX: u32 = 3; -// 扩展属性操作标志 -pub const XATTR_CREATE: i32 = 0x1; // 设置值,如果属性不存在则创建,已存在返回则失败 -pub const XATTR_REPLACE: i32 = 0x2; // 设置值,如果属性已存在则替换,不存在返回则失败 - bitflags! { /// Flags used in the `renameat2` system call. /// diff --git a/kernel/src/filesystem/vfs/syscall/rename_utils.rs b/kernel/src/filesystem/vfs/syscall/rename_utils.rs index a073611d65..4265cc62a9 100644 --- a/kernel/src/filesystem/vfs/syscall/rename_utils.rs +++ b/kernel/src/filesystem/vfs/syscall/rename_utils.rs @@ -32,6 +32,14 @@ pub fn do_renameat2( filename_to: *const u8, flags: u32, ) -> Result { + let flags = RenameFlags::from_bits(flags).ok_or(SystemError::EINVAL)?; + + if flags.contains(RenameFlags::EXCHANGE) + && (flags.contains(RenameFlags::NOREPLACE) || flags.contains(RenameFlags::WHITEOUT)) + { + return Err(SystemError::EINVAL); + } + let filename_from = vfs_check_and_clone_cstr(filename_from, Some(MAX_PATHLEN))? .into_string() .map_err(|_| SystemError::EINVAL)?; @@ -63,15 +71,6 @@ pub fn do_renameat2( return Err(SystemError::ENAMETOOLONG); } - let flags = RenameFlags::from_bits_truncate(flags); - - // 标志互斥性检查(Linux 语义:EXCHANGE 与 NOREPLACE/WHITEOUT 互斥) - if flags.contains(RenameFlags::EXCHANGE) - && (flags.contains(RenameFlags::NOREPLACE) || flags.contains(RenameFlags::WHITEOUT)) - { - return Err(SystemError::EINVAL); - } - if flags.contains(RenameFlags::NOREPLACE) && (new_filename == "." 
|| new_filename == "..") { return Err(SystemError::EEXIST); } @@ -86,13 +85,23 @@ pub fn do_renameat2( } let old_inode = old_parent_inode.lookup(old_filename)?; - if old_inode.metadata()?.file_type == crate::filesystem::vfs::FileType::Dir { + let old_inode_type = old_inode.metadata()?.file_type; + if old_inode_type == crate::filesystem::vfs::FileType::Dir { // 仅当把目录移动到其自身或其子树下时拦截 if is_ancestor(&old_inode, &new_parent_inode) { return Err(SystemError::EINVAL); } } + if flags.contains(RenameFlags::EXCHANGE) { + let new_inode = new_parent_inode.lookup(new_filename)?; + if new_inode.metadata()?.file_type == crate::filesystem::vfs::FileType::Dir + && is_ancestor(&new_inode, &old_parent_inode) + { + return Err(SystemError::EINVAL); + } + } + // 不要在这里检查 new_parent 是否是 old 的祖先: // 这会把同目录/向上移动的合法情况误判为 ENOTEMPTY。 // 非空目录覆盖应由具体文件系统在 move_to/rename 实现中返回 ENOTEMPTY。 diff --git a/kernel/src/filesystem/vfs/syscall/sys_flistxattr.rs b/kernel/src/filesystem/vfs/syscall/sys_flistxattr.rs new file mode 100644 index 0000000000..6aef1d617a --- /dev/null +++ b/kernel/src/filesystem/vfs/syscall/sys_flistxattr.rs @@ -0,0 +1,45 @@ +//! System call handler for sys_flistxattr. 
+ +use super::xattr_utils::fd_listxattr; +use crate::{ + arch::{interrupt::TrapFrame, syscall::nr::SYS_FLISTXATTR}, + syscall::table::{FormattedSyscallParam, Syscall}, +}; +use alloc::{string::ToString, vec::Vec}; +use system_error::SystemError; + +pub struct SysFlistxattrHandle; + +impl Syscall for SysFlistxattrHandle { + fn num_args(&self) -> usize { + 3 + } + + fn handle(&self, args: &[usize], _frame: &mut TrapFrame) -> Result { + fd_listxattr(Self::fd(args), Self::buf(args), Self::size(args)) + } + + fn entry_format(&self, args: &[usize]) -> Vec { + vec![ + FormattedSyscallParam::new("fd", Self::fd(args).to_string()), + FormattedSyscallParam::new("buf", format!("{:#x}", Self::buf(args) as usize)), + FormattedSyscallParam::new("size", Self::size(args).to_string()), + ] + } +} + +impl SysFlistxattrHandle { + fn fd(args: &[usize]) -> i32 { + args[0] as i32 + } + + fn buf(args: &[usize]) -> *mut u8 { + args[1] as *mut u8 + } + + fn size(args: &[usize]) -> usize { + args[2] + } +} + +syscall_table_macros::declare_syscall!(SYS_FLISTXATTR, SysFlistxattrHandle); diff --git a/kernel/src/filesystem/vfs/syscall/sys_fremovexattr.rs b/kernel/src/filesystem/vfs/syscall/sys_fremovexattr.rs new file mode 100644 index 0000000000..d31b254ca9 --- /dev/null +++ b/kernel/src/filesystem/vfs/syscall/sys_fremovexattr.rs @@ -0,0 +1,40 @@ +//! System call handler for sys_fremovexattr. 
+ +use super::xattr_utils::fd_removexattr; +use crate::{ + arch::{interrupt::TrapFrame, syscall::nr::SYS_FREMOVEXATTR}, + syscall::table::{FormattedSyscallParam, Syscall}, +}; +use alloc::{string::ToString, vec::Vec}; +use system_error::SystemError; + +pub struct SysFremovexattrHandle; + +impl Syscall for SysFremovexattrHandle { + fn num_args(&self) -> usize { + 2 + } + + fn handle(&self, args: &[usize], _frame: &mut TrapFrame) -> Result { + fd_removexattr(Self::fd(args), Self::name(args)) + } + + fn entry_format(&self, args: &[usize]) -> Vec { + vec![ + FormattedSyscallParam::new("fd", Self::fd(args).to_string()), + FormattedSyscallParam::new("name", format!("{:#x}", Self::name(args) as usize)), + ] + } +} + +impl SysFremovexattrHandle { + fn fd(args: &[usize]) -> i32 { + args[0] as i32 + } + + fn name(args: &[usize]) -> *const u8 { + args[1] as *const u8 + } +} + +syscall_table_macros::declare_syscall!(SYS_FREMOVEXATTR, SysFremovexattrHandle); diff --git a/kernel/src/filesystem/vfs/syscall/sys_listxattr.rs b/kernel/src/filesystem/vfs/syscall/sys_listxattr.rs new file mode 100644 index 0000000000..a421a8cc5a --- /dev/null +++ b/kernel/src/filesystem/vfs/syscall/sys_listxattr.rs @@ -0,0 +1,51 @@ +//! System call handler for sys_listxattr. 
+ +use super::xattr_utils::path_listxattr; +use crate::{ + arch::{interrupt::TrapFrame, syscall::nr::SYS_LISTXATTR}, + filesystem::vfs::VFS_MAX_FOLLOW_SYMLINK_TIMES, + syscall::table::{FormattedSyscallParam, Syscall}, +}; +use alloc::{string::ToString, vec::Vec}; +use system_error::SystemError; + +pub struct SysListxattrHandle; + +impl Syscall for SysListxattrHandle { + fn num_args(&self) -> usize { + 3 + } + + fn handle(&self, args: &[usize], _frame: &mut TrapFrame) -> Result { + path_listxattr( + Self::path(args), + Self::buf(args), + Self::size(args), + VFS_MAX_FOLLOW_SYMLINK_TIMES, + ) + } + + fn entry_format(&self, args: &[usize]) -> Vec { + vec![ + FormattedSyscallParam::new("path", format!("{:#x}", Self::path(args) as usize)), + FormattedSyscallParam::new("buf", format!("{:#x}", Self::buf(args) as usize)), + FormattedSyscallParam::new("size", Self::size(args).to_string()), + ] + } +} + +impl SysListxattrHandle { + fn path(args: &[usize]) -> *const u8 { + args[0] as *const u8 + } + + fn buf(args: &[usize]) -> *mut u8 { + args[1] as *mut u8 + } + + fn size(args: &[usize]) -> usize { + args[2] + } +} + +syscall_table_macros::declare_syscall!(SYS_LISTXATTR, SysListxattrHandle); diff --git a/kernel/src/filesystem/vfs/syscall/sys_llistxattr.rs b/kernel/src/filesystem/vfs/syscall/sys_llistxattr.rs new file mode 100644 index 0000000000..561a22f083 --- /dev/null +++ b/kernel/src/filesystem/vfs/syscall/sys_llistxattr.rs @@ -0,0 +1,45 @@ +//! System call handler for sys_llistxattr. 
+ +use super::xattr_utils::path_listxattr; +use crate::{ + arch::{interrupt::TrapFrame, syscall::nr::SYS_LLISTXATTR}, + syscall::table::{FormattedSyscallParam, Syscall}, +}; +use alloc::{string::ToString, vec::Vec}; +use system_error::SystemError; + +pub struct SysLlistxattrHandle; + +impl Syscall for SysLlistxattrHandle { + fn num_args(&self) -> usize { + 3 + } + + fn handle(&self, args: &[usize], _frame: &mut TrapFrame) -> Result { + path_listxattr(Self::path(args), Self::buf(args), Self::size(args), 0) + } + + fn entry_format(&self, args: &[usize]) -> Vec { + vec![ + FormattedSyscallParam::new("path", format!("{:#x}", Self::path(args) as usize)), + FormattedSyscallParam::new("buf", format!("{:#x}", Self::buf(args) as usize)), + FormattedSyscallParam::new("size", Self::size(args).to_string()), + ] + } +} + +impl SysLlistxattrHandle { + fn path(args: &[usize]) -> *const u8 { + args[0] as *const u8 + } + + fn buf(args: &[usize]) -> *mut u8 { + args[1] as *mut u8 + } + + fn size(args: &[usize]) -> usize { + args[2] + } +} + +syscall_table_macros::declare_syscall!(SYS_LLISTXATTR, SysLlistxattrHandle); diff --git a/kernel/src/filesystem/vfs/syscall/sys_lremovexattr.rs b/kernel/src/filesystem/vfs/syscall/sys_lremovexattr.rs new file mode 100644 index 0000000000..e167cf59e8 --- /dev/null +++ b/kernel/src/filesystem/vfs/syscall/sys_lremovexattr.rs @@ -0,0 +1,40 @@ +//! System call handler for sys_lremovexattr. 
+ +use super::xattr_utils::path_removexattr; +use crate::{ + arch::{interrupt::TrapFrame, syscall::nr::SYS_LREMOVEXATTR}, + syscall::table::{FormattedSyscallParam, Syscall}, +}; +use alloc::vec::Vec; +use system_error::SystemError; + +pub struct SysLremovexattrHandle; + +impl Syscall for SysLremovexattrHandle { + fn num_args(&self) -> usize { + 2 + } + + fn handle(&self, args: &[usize], _frame: &mut TrapFrame) -> Result { + path_removexattr(Self::path(args), Self::name(args), 0) + } + + fn entry_format(&self, args: &[usize]) -> Vec { + vec![ + FormattedSyscallParam::new("path", format!("{:#x}", Self::path(args) as usize)), + FormattedSyscallParam::new("name", format!("{:#x}", Self::name(args) as usize)), + ] + } +} + +impl SysLremovexattrHandle { + fn path(args: &[usize]) -> *const u8 { + args[0] as *const u8 + } + + fn name(args: &[usize]) -> *const u8 { + args[1] as *const u8 + } +} + +syscall_table_macros::declare_syscall!(SYS_LREMOVEXATTR, SysLremovexattrHandle); diff --git a/kernel/src/filesystem/vfs/syscall/sys_mount.rs b/kernel/src/filesystem/vfs/syscall/sys_mount.rs index 3bddbf4cac..6c13a89aea 100644 --- a/kernel/src/filesystem/vfs/syscall/sys_mount.rs +++ b/kernel/src/filesystem/vfs/syscall/sys_mount.rs @@ -12,6 +12,7 @@ use crate::{ }, libs::casting::DowncastArc, process::{ + cred::{ns_capable, CAPFlags}, namespace::propagation::{ change_mnt_propagation_recursive, flags_to_propagation_type, is_propagation_change, propagate_moved_tree, @@ -145,6 +146,13 @@ impl SysMountHandle { syscall_table_macros::declare_syscall!(SYS_MOUNT, SysMountHandle); +/// Linux `may_mount()`: modifying the current mount namespace requires +/// CAP_SYS_ADMIN in that namespace's owning user namespace. 
+pub(super) fn may_mount() -> bool { + let current_mntns = ProcessManager::current_mntns(); + ns_capable(current_mntns.user_ns(), CAPFlags::CAP_SYS_ADMIN) +} + /// # do_mount - Dispatch a mount operation /// /// Resolves `target` in the current mount namespace and dispatches the request @@ -173,13 +181,14 @@ pub fn do_mount( data: Option, mount_flags: MountFlags, ) -> Result<(), SystemError> { + let requested_target = target.as_deref().unwrap_or(""); let (current_node, rest_path) = user_path_at( &ProcessManager::current_pcb(), AtFlags::AT_FDCWD.bits(), - target.as_deref().unwrap_or(""), + requested_target, )?; let inode = current_node.lookup_follow_symlink(&rest_path, VFS_MAX_FOLLOW_SYMLINK_TIMES)?; - let resolved_target_path = inode.absolute_path()?; + let resolved_target_path = resolved_mount_target_path(requested_target, &inode)?; return path_mount( source, &resolved_target_path, @@ -190,6 +199,47 @@ pub fn do_mount( ); } +fn resolved_mount_target_path( + requested_path: &str, + inode: &Arc, +) -> Result { + match inode.absolute_path() { + Ok(path) => Ok(path), + Err(SystemError::ENOSYS) => Ok(normalize_requested_mount_path(requested_path)), + Err(err) => Err(err), + } +} + +fn normalize_requested_mount_path(path: &str) -> String { + let base = if path.starts_with('/') { + String::from("/") + } else { + ProcessManager::current_pcb().basic().cwd() + }; + + let mut components: Vec<&str> = base.split('/').filter(|part| !part.is_empty()).collect(); + for component in path.split('/').filter(|part| !part.is_empty()) { + match component { + "." => {} + ".." 
=> { + components.pop(); + } + _ => components.push(component), + } + } + + if components.is_empty() { + return String::from("/"); + } + + let mut normalized = String::new(); + for component in components { + normalized.push('/'); + normalized.push_str(component); + } + normalized +} + fn path_mount( source: Option, target_path: &str, @@ -206,6 +256,10 @@ fn path_mount( return Err(SystemError::EINVAL); } + if !may_mount() { + return Err(SystemError::EPERM); + } + let mut mnt_flags = MountFlags::empty(); // Default to relatime unless overriden diff --git a/kernel/src/filesystem/vfs/syscall/sys_pivot_root.rs b/kernel/src/filesystem/vfs/syscall/sys_pivot_root.rs index f4ac382160..70238c18a4 100644 --- a/kernel/src/filesystem/vfs/syscall/sys_pivot_root.rs +++ b/kernel/src/filesystem/vfs/syscall/sys_pivot_root.rs @@ -16,13 +16,15 @@ use crate::{ FileSystem, FileType, IndexNode, MAX_PATHLEN, VFS_MAX_FOLLOW_SYMLINK_TIMES, }, libs::casting::DowncastArc, - process::{all_process, cred::CAPFlags, ProcessControlBlock, ProcessManager}, + process::{all_process, ProcessControlBlock, ProcessManager}, syscall::{ table::{FormattedSyscallParam, Syscall}, user_access::vfs_check_and_clone_cstr, }, }; +use super::sys_mount::may_mount; + pub struct SysPivotRootHandle; struct PivotRootTargets { @@ -90,6 +92,10 @@ fn resolve_pivot_root_targets( new_root_ptr: *const u8, put_old_ptr: *const u8, ) -> Result { + if !may_mount() { + return Err(SystemError::EPERM); + } + if new_root_ptr.is_null() || put_old_ptr.is_null() { return Err(SystemError::EFAULT); } @@ -106,10 +112,6 @@ fn resolve_pivot_root_targets( } let current_pcb = ProcessManager::current_pcb(); - if !current_pcb.cred().has_capability(CAPFlags::CAP_SYS_ADMIN) { - return Err(SystemError::EPERM); - } - let current_mntns = ProcessManager::current_mntns(); let namespace_root_inode = current_mntns.root_inode(); let current_root_inode = namespace_root_inode.clone(); @@ -170,8 +172,8 @@ fn resolve_pivot_root_targets( return 
Err(SystemError::EINVAL); } - let old_new_root_path = new_root_inode.absolute_path()?; - let put_old_path_before = put_old_inode.absolute_path()?; + let old_new_root_path = resolved_visible_path(&new_root_path, &new_root_inode)?; + let put_old_path_before = resolved_visible_path(&put_old_path, &put_old_inode)?; let new_put_old_path = put_old_path_before .strip_prefix(&old_new_root_path) .map(normalize_visible_path) @@ -236,6 +238,47 @@ fn same_path_ref(left: &Arc, right: &Arc) -> bool Arc::ptr_eq(&left.fs(), &right.fs()) && left_meta.inode_id == right_meta.inode_id } +fn resolved_visible_path( + requested_path: &str, + inode: &Arc, +) -> Result { + match inode.absolute_path() { + Ok(path) => Ok(path), + Err(SystemError::ENOSYS) => Ok(normalize_requested_path(requested_path)), + Err(err) => Err(err), + } +} + +fn normalize_requested_path(path: &str) -> String { + let base = if path.starts_with('/') { + String::from("/") + } else { + ProcessManager::current_pcb().basic().cwd() + }; + + let mut components: Vec<&str> = base.split('/').filter(|part| !part.is_empty()).collect(); + for component in path.split('/').filter(|part| !part.is_empty()) { + match component { + "." => {} + ".." => { + components.pop(); + } + _ => components.push(component), + } + } + + if components.is_empty() { + return String::from("/"); + } + + let mut normalized = String::new(); + for component in components { + normalized.push('/'); + normalized.push_str(component); + } + normalized +} + fn repair_same_namespace_fs_refs( target_mntns: &Arc, current_task: &Arc, diff --git a/kernel/src/filesystem/vfs/syscall/sys_removexattr.rs b/kernel/src/filesystem/vfs/syscall/sys_removexattr.rs new file mode 100644 index 0000000000..86abda0f3a --- /dev/null +++ b/kernel/src/filesystem/vfs/syscall/sys_removexattr.rs @@ -0,0 +1,45 @@ +//! System call handler for sys_removexattr. 
+ +use super::xattr_utils::path_removexattr; +use crate::{ + arch::{interrupt::TrapFrame, syscall::nr::SYS_REMOVEXATTR}, + filesystem::vfs::VFS_MAX_FOLLOW_SYMLINK_TIMES, + syscall::table::{FormattedSyscallParam, Syscall}, +}; +use alloc::vec::Vec; +use system_error::SystemError; + +pub struct SysRemovexattrHandle; + +impl Syscall for SysRemovexattrHandle { + fn num_args(&self) -> usize { + 2 + } + + fn handle(&self, args: &[usize], _frame: &mut TrapFrame) -> Result { + path_removexattr( + Self::path(args), + Self::name(args), + VFS_MAX_FOLLOW_SYMLINK_TIMES, + ) + } + + fn entry_format(&self, args: &[usize]) -> Vec { + vec![ + FormattedSyscallParam::new("path", format!("{:#x}", Self::path(args) as usize)), + FormattedSyscallParam::new("name", format!("{:#x}", Self::name(args) as usize)), + ] + } +} + +impl SysRemovexattrHandle { + fn path(args: &[usize]) -> *const u8 { + args[0] as *const u8 + } + + fn name(args: &[usize]) -> *const u8 { + args[1] as *const u8 + } +} + +syscall_table_macros::declare_syscall!(SYS_REMOVEXATTR, SysRemovexattrHandle); diff --git a/kernel/src/filesystem/vfs/syscall/sys_umount2.rs b/kernel/src/filesystem/vfs/syscall/sys_umount2.rs index 2f9bf5e9ec..3a7909194c 100644 --- a/kernel/src/filesystem/vfs/syscall/sys_umount2.rs +++ b/kernel/src/filesystem/vfs/syscall/sys_umount2.rs @@ -1,5 +1,7 @@ //! System call handler for sys_umount. 
+use super::sys_mount::may_mount; + use crate::{ arch::{interrupt::TrapFrame, syscall::nr::SYS_UMOUNT2}, filesystem::vfs::{ @@ -95,6 +97,10 @@ pub fn do_umount2( let (work, rest) = user_path_at(&ProcessManager::current_pcb(), dirfd, target)?; let target_inode = work.lookup_follow_symlink(&rest, VFS_MAX_FOLLOW_SYMLINK_TIMES)?; + if !may_mount() { + return Err(SystemError::EPERM); + } + let path = visible_umount_path(&target_inode)?; let current_mntns = ProcessManager::current_mntns(); @@ -115,7 +121,13 @@ pub fn do_umount2( ); return Err(err); } - let _ = current_mntns.remove_mount(&path); + if current_mntns.remove_mount_exact(&fs).is_none() { + log::error!( + "do_umount2: mount_list exact remove miss for resolved='{}', fs='{}'", + path, + fs.name() + ); + } Ok(fs) } diff --git a/kernel/src/filesystem/vfs/syscall/xattr_utils.rs b/kernel/src/filesystem/vfs/syscall/xattr_utils.rs index 00c1a58109..f4f0b32324 100644 --- a/kernel/src/filesystem/vfs/syscall/xattr_utils.rs +++ b/kernel/src/filesystem/vfs/syscall/xattr_utils.rs @@ -1,14 +1,61 @@ -use super::{XATTR_CREATE, XATTR_REPLACE}; use crate::{ - filesystem::vfs::{syscall::AtFlags, utils::user_path_at, IndexNode, MAX_PATHLEN}, + filesystem::vfs::{syscall::AtFlags, utils::user_path_at, IndexNode, XattrFlags, MAX_PATHLEN}, process::ProcessManager, syscall::user_access::{ check_and_clone_cstr, vfs_check_and_clone_cstr, UserBufferReader, UserBufferWriter, }, }; -use alloc::{sync::Arc, vec::Vec}; +use alloc::{string::String, sync::Arc, vec::Vec}; use system_error::SystemError; +const XATTR_LIST_MAX: usize = 65536; +const XATTR_NAME_MAX: usize = 255; +const XATTR_SIZE_MAX: usize = 65536; + +struct SetxattrArgs { + name: String, + value: Vec, + flags: XattrFlags, +} + +fn clone_xattr_name(name_ptr: *const u8) -> Result { + let name = check_and_clone_cstr(name_ptr, Some(XATTR_NAME_MAX + 1))?; + if name.as_bytes().is_empty() || name.as_bytes().len() > XATTR_NAME_MAX { + return Err(SystemError::ERANGE); + } + + 
name.into_string().map_err(|_| SystemError::EINVAL) +} + +fn parse_setxattr_flags(flags: i32) -> Result { + XattrFlags::from_bits(flags).ok_or(SystemError::EINVAL) +} + +fn prepare_setxattr_args( + name_ptr: *const u8, + value_ptr: *const u8, + size: usize, + flags: i32, +) -> Result { + let flags = parse_setxattr_flags(flags)?; + let name = clone_xattr_name(name_ptr)?; + + if size > XATTR_SIZE_MAX { + return Err(SystemError::E2BIG); + } + + let value = if size == 0 { + Vec::new() + } else { + let user_buffer_reader = UserBufferReader::new(value_ptr, size, true)?; + let mut value = vec![0u8; size]; + user_buffer_reader.copy_from_user_protected(&mut value, 0)?; + value + }; + + Ok(SetxattrArgs { name, value, flags }) +} + /// Extended attribute GET operations pub(super) fn path_getxattr( path_ptr: *const u8, @@ -47,27 +94,139 @@ pub(super) fn fd_getxattr( do_getxattr(inode, name_ptr, buf_ptr, size) } -fn do_getxattr( +/// Extended attribute LIST operations +pub(super) fn path_listxattr( + path_ptr: *const u8, + buf_ptr: *mut u8, + size: usize, + lookup_flags: usize, +) -> Result { + let path = vfs_check_and_clone_cstr(path_ptr, Some(MAX_PATHLEN))? + .into_string() + .map_err(|_| SystemError::EINVAL)?; + + let pcb = ProcessManager::current_pcb(); + let (current_node, rest_path) = user_path_at(&pcb, AtFlags::AT_FDCWD.bits(), &path)?; + let inode = current_node.lookup_follow_symlink(&rest_path, lookup_flags)?; + + do_listxattr(inode, buf_ptr, size) +} + +pub(super) fn fd_listxattr(fd: i32, buf_ptr: *mut u8, size: usize) -> Result { + let binding = ProcessManager::current_pcb().fd_table(); + let fd_table_guard = binding.read(); + + let file = fd_table_guard + .get_file_by_fd(fd) + .ok_or(SystemError::EBADF)?; + let inode = file.inode(); + + do_listxattr(inode, buf_ptr, size) +} + +fn do_listxattr( inode: Arc, - name_ptr: *const u8, buf_ptr: *mut u8, size: usize, ) -> Result { - let name = check_and_clone_cstr(name_ptr, None)? 
+ if size == 0 { + let mut temp_buf = Vec::new(); + return inode.listxattr(&mut temp_buf); + } + + let capped_size = core::cmp::min(size, XATTR_LIST_MAX); + let mut list = vec![0u8; capped_size]; + let actual_size = match inode.listxattr(&mut list) { + Err(SystemError::ERANGE) if capped_size == XATTR_LIST_MAX => { + return Err(SystemError::E2BIG) + } + result => result?, + }; + if actual_size > capped_size { + if capped_size == XATTR_LIST_MAX { + return Err(SystemError::E2BIG); + } + return Err(SystemError::ERANGE); + } + + if actual_size == 0 { + return Ok(0); + } + + let mut user_buffer_writer = UserBufferWriter::new(buf_ptr, actual_size, true)?; + user_buffer_writer.copy_to_user(&list[..actual_size], 0)?; + Ok(actual_size) +} + +/// Extended attribute REMOVE operations +pub(super) fn path_removexattr( + path_ptr: *const u8, + name_ptr: *const u8, + lookup_flags: usize, +) -> Result { + let path = vfs_check_and_clone_cstr(path_ptr, Some(MAX_PATHLEN))? .into_string() .map_err(|_| SystemError::EINVAL)?; + let pcb = ProcessManager::current_pcb(); + let (current_node, rest_path) = user_path_at(&pcb, AtFlags::AT_FDCWD.bits(), &path)?; + let inode = current_node.lookup_follow_symlink(&rest_path, lookup_flags)?; + + do_removexattr(inode, name_ptr) +} + +pub(super) fn fd_removexattr(fd: i32, name_ptr: *const u8) -> Result { + let binding = ProcessManager::current_pcb().fd_table(); + let fd_table_guard = binding.read(); + + let file = fd_table_guard + .get_file_by_fd(fd) + .ok_or(SystemError::EBADF)?; + let inode = file.inode(); + + do_removexattr(inode, name_ptr) +} + +fn do_removexattr(inode: Arc, name_ptr: *const u8) -> Result { + let name = clone_xattr_name(name_ptr)?; + + inode.removexattr(&name) +} + +fn do_getxattr( + inode: Arc, + name_ptr: *const u8, + buf_ptr: *mut u8, + size: usize, +) -> Result { + let name = clone_xattr_name(name_ptr)?; + if size == 0 { // 只返回需要的缓冲区大小 let mut temp_buf = Vec::new(); let result_size = inode.getxattr(&name, &mut temp_buf)?; 
Ok(result_size) } else { - let mut user_buffer_writer = UserBufferWriter::new(buf_ptr, size, true)?; - let user_buf = user_buffer_writer.buffer(0)?; + let capped_size = core::cmp::min(size, XATTR_SIZE_MAX); + let mut value = vec![0u8; capped_size]; + let actual_size = match inode.getxattr(&name, &mut value) { + Err(SystemError::ERANGE) if capped_size == XATTR_SIZE_MAX => { + return Err(SystemError::E2BIG) + } + result => result?, + }; + if actual_size > capped_size { + if capped_size == XATTR_SIZE_MAX { + return Err(SystemError::E2BIG); + } + return Err(SystemError::ERANGE); + } + if actual_size == 0 { + return Ok(0); + } - // 读取属性值 - let actual_size = inode.getxattr(&name, user_buf)?; + let mut user_buffer_writer = UserBufferWriter::new(buf_ptr, actual_size, true)?; + user_buffer_writer.copy_to_user(&value[..actual_size], 0)?; Ok(actual_size) } } @@ -81,6 +240,8 @@ pub(super) fn path_setxattr( lookup_flags: usize, flags: i32, ) -> Result { + let xattr = prepare_setxattr_args(name_ptr, value_ptr, size, flags)?; + let path = vfs_check_and_clone_cstr(path_ptr, Some(MAX_PATHLEN))? .into_string() .map_err(|_| SystemError::EINVAL)?; @@ -89,7 +250,7 @@ pub(super) fn path_setxattr( let (current_node, rest_path) = user_path_at(&pcb, AtFlags::AT_FDCWD.bits(), &path)?; let inode = current_node.lookup_follow_symlink(&rest_path, lookup_flags)?; - do_setxattr(inode, name_ptr, value_ptr, size, flags) + do_setxattr(inode, xattr) } pub(super) fn fd_setxattr( @@ -107,29 +268,10 @@ pub(super) fn fd_setxattr( .ok_or(SystemError::EBADF)?; let inode = file.inode(); - do_setxattr(inode, name_ptr, value_ptr, size, flags) + let xattr = prepare_setxattr_args(name_ptr, value_ptr, size, flags)?; + do_setxattr(inode, xattr) } -fn do_setxattr( - inode: Arc, - name_ptr: *const u8, - value_ptr: *const u8, - size: usize, - flags: i32, -) -> Result { - let name = check_and_clone_cstr(name_ptr, None)? 
- .into_string() - .map_err(|_| SystemError::EINVAL)?; - - if (flags & XATTR_CREATE != 0) && inode.getxattr(&name, &mut Vec::new()).is_ok() { - return Err(SystemError::EEXIST); - } - if (flags & XATTR_REPLACE != 0) && inode.getxattr(&name, &mut Vec::new()).is_err() { - return Err(SystemError::ENODATA); - } - - let user_buffer_reader = UserBufferReader::new(value_ptr, size, true)?; - let value_buf = user_buffer_reader.buffer(0)?; - - inode.setxattr(&name, value_buf) +fn do_setxattr(inode: Arc, xattr: SetxattrArgs) -> Result { + inode.setxattr(&xattr.name, &xattr.value, xattr.flags) } diff --git a/kernel/src/filesystem/vfs/vcore.rs b/kernel/src/filesystem/vfs/vcore.rs index c6d6b8ab72..89a44ab9dc 100644 --- a/kernel/src/filesystem/vfs/vcore.rs +++ b/kernel/src/filesystem/vfs/vcore.rs @@ -100,7 +100,7 @@ fn migrate_virtual_filesystem( let current_mntns = ProcessManager::current_mntns(); let old_root_inode = current_mntns.root_inode(); - let old_mntfs = current_mntns.root_mntfs().clone(); + let old_mntfs = current_mntns.root_mntfs(); let new_fs = MountFS::new( new_fs, None, @@ -132,9 +132,7 @@ fn migrate_virtual_filesystem( .mount_from(old_root_inode.find("sys").expect("sys not mounted!")) .expect("Failed to migrate filesystem of sys"); - unsafe { - current_mntns.force_change_root_mountfs(new_fs); - } + current_mntns.force_change_root_mountfs(new_fs); // 换根后需要同步更新“当前进程”的 fs root/pwd。 // 我们的路径解析(绝对路径)以进程 fs root 为起点;若不更新,后续诸如 /dev/pts 的挂载、 diff --git a/kernel/src/ipc/pipe.rs b/kernel/src/ipc/pipe.rs index dcdeda7b60..202eca4f9b 100644 --- a/kernel/src/ipc/pipe.rs +++ b/kernel/src/ipc/pipe.rs @@ -1426,72 +1426,40 @@ impl IndexNode for LockedPipeInode { } let mut guard = self.inner.lock(); - - // 写端关闭 - if accflags == FileFlags::O_WRONLY { - assert!(guard.writer > 0); - guard.writer -= 1; - // 如果已经没有写端了,则唤醒读端 - if guard.writer == 0 { - // poll/epoll 语义仍然是 HUP,但 Linux 的 SIGIO/fasync 在这里上报 POLL_IN - // (读端被唤醒后可读到 EOF)。 - let poll_flags = FileFlags::O_RDONLY; - let poll_data 
= FilePrivateData::Pipefs(PipeFsPrivateData { flags: poll_flags }); - let pollflag = guard - .poll(&poll_data) - .map(|v| EPollEventType::from_bits_truncate(v as u32)) - .unwrap_or(EPollEventType::EPOLLHUP); - drop(guard); // 先释放 inner 锁,避免潜在的死锁 - self.read_wait_queue - .wakeup_all(Some(ProcessState::Blocked(true))); - let _ = EventPoll::wakeup_epoll(&self.epitems, pollflag); - self.read_fasync_items.send_sigio(FASYNC_POLL_IN); - return Ok(()); + match accflags { + FileFlags::O_RDONLY => { + assert!(guard.reader > 0); + guard.reader -= 1; } - } - - // 读端关闭 - if accflags == FileFlags::O_RDONLY { - assert!(guard.reader > 0); - guard.reader -= 1; - // 如果已经没有读端了,则唤醒写端 - if guard.reader == 0 { - // poll/epoll 语义仍然是 ERR,但 Linux 的 SIGIO/fasync 在这里上报 POLL_OUT - // (写端被唤醒后下一次 write 再观察到 EPIPE)。 - let poll_data = FilePrivateData::Pipefs(PipeFsPrivateData { - flags: FileFlags::O_WRONLY, - }); - let pollflag = guard - .poll(&poll_data) - .map(|v| EPollEventType::from_bits_truncate(v as u32)) - .unwrap_or(EPollEventType::EPOLLERR); - - drop(guard); // 先释放 inner 锁,避免死锁 - self.write_wait_queue.wakeup_all(None); - let _ = EventPoll::wakeup_epoll(&self.epitems, pollflag); - self.write_fasync_items.send_sigio(FASYNC_POLL_OUT); - return Ok(()); + FileFlags::O_WRONLY => { + assert!(guard.writer > 0); + guard.writer -= 1; + } + FileFlags::O_RDWR => { + assert!(guard.reader > 0); + assert!(guard.writer > 0); + guard.reader -= 1; + guard.writer -= 1; } + _ => {} } - // O_RDWR 模式关闭:同时减少读写计数 - if accflags == FileFlags::O_RDWR { - assert!(guard.reader > 0); - assert!(guard.writer > 0); - guard.reader -= 1; - guard.writer -= 1; - let wake_reader = guard.writer == 0; - let wake_writer = guard.reader == 0; - drop(guard); // 先释放 inner 锁 + // Linux pipe_release() wakes both wait queues and notifies both fasync + // sides only when close leaves exactly one endpoint class present. 
+ let release_notify = (guard.reader == 0) != (guard.writer == 0); + let pollflag = if release_notify { + guard.poll_both_ends() + } else { + EPollEventType::empty() + }; + drop(guard); - // 如果已经没有写端了,则唤醒读端 - if wake_reader { - self.read_wait_queue.wakeup_all(None); - } - // 如果已经没有读端了,则唤醒写端 - if wake_writer { - self.write_wait_queue.wakeup_all(None); - } + if release_notify { + self.read_wait_queue.wakeup_all(None); + self.write_wait_queue.wakeup_all(None); + let _ = EventPoll::wakeup_epoll(&self.epitems, pollflag); + self.read_fasync_items.send_sigio(FASYNC_POLL_IN); + self.write_fasync_items.send_sigio(FASYNC_POLL_OUT); } return Ok(()); diff --git a/kernel/src/ipc/syscall/sys_shmat.rs b/kernel/src/ipc/syscall/sys_shmat.rs index 55252d58e1..6d0397fc85 100644 --- a/kernel/src/ipc/syscall/sys_shmat.rs +++ b/kernel/src/ipc/syscall/sys_shmat.rs @@ -12,7 +12,7 @@ use crate::{ page::{page_manager_lock, DeferredFlusher, EntryFlags}, syscall::ProtFlags, ucontext::{AddressSpace, PhysmapParams, VMA}, - VirtAddr, VmFlags, + VirtAddr, VirtRegion, VmFlags, }, process::ProcessManager, syscall::{table::Syscall, user_access::UserBufferReader}, @@ -39,12 +39,22 @@ pub(super) fn do_kernel_shmat( shmflg: ShmFlags, ) -> Result { let ipcns = ProcessManager::current_ipcns(); - let mut shm_manager_guard = ipcns.shm.lock(); let current_address_space = AddressSpace::current()?; - let mut address_write_guard = current_address_space.write(); + let size = { + let mut shm_manager_guard = ipcns.shm.lock(); + let kernel_shm = shm_manager_guard.get_mut(&id).ok_or(SystemError::EINVAL)?; + page_align_up(kernel_shm.size()) + }; + + let mut address_write_guard = if vaddr.data() == 0 { + current_address_space.write() + } else { + current_address_space.write_guard_no_reservation_conflict(VirtRegion::new(vaddr, size)) + }; + + let mut shm_manager_guard = ipcns.shm.lock(); let kernel_shm = shm_manager_guard.get_mut(&id).ok_or(SystemError::EINVAL)?; - let size = page_align_up(kernel_shm.size()); let mut 
phys = PhysPageFrame::new(kernel_shm.start_paddr()); let count = PageFrameCount::from_bytes(size).unwrap(); let r = match vaddr.data() { diff --git a/kernel/src/ipc/syscall/sys_shmdt.rs b/kernel/src/ipc/syscall/sys_shmdt.rs index 41aec2805c..025eb7be05 100644 --- a/kernel/src/ipc/syscall/sys_shmdt.rs +++ b/kernel/src/ipc/syscall/sys_shmdt.rs @@ -2,8 +2,8 @@ use crate::arch::interrupt::TrapFrame; use crate::mm::mmu_gather::MmuGather; use crate::syscall::table::FormattedSyscallParam; use crate::{ - arch::syscall::nr::SYS_SHMDT, - mm::{ucontext::AddressSpace, VirtAddr}, + arch::{syscall::nr::SYS_SHMDT, MMArch}, + mm::{ucontext::AddressSpace, MemoryManagementArch, VirtAddr, VirtRegion}, syscall::table::Syscall, }; use alloc::vec::Vec; @@ -42,7 +42,8 @@ impl Syscall for SysShmdtHandle { fn handle(&self, args: &[usize], _frame: &mut TrapFrame) -> Result { let vaddr = Self::vaddr(args); let current_address_space = AddressSpace::current()?; - let mut address_write_guard = current_address_space.write(); + let mut address_write_guard = current_address_space + .write_guard_no_reservation_conflict(VirtRegion::new(vaddr, MMArch::PAGE_SIZE)); // 获取vma let vma = address_write_guard diff --git a/kernel/src/libs/elf.rs b/kernel/src/libs/elf.rs index 72b822e03e..03785b2c7b 100644 --- a/kernel/src/libs/elf.rs +++ b/kernel/src/libs/elf.rs @@ -213,12 +213,13 @@ impl ElfLoader { #[allow(clippy::too_many_arguments)] fn map_readonly_segment( user_vm_guard: &mut RwSemWriteGuard<'_, InnerAddressSpace>, - param: &ExecParam, + param: &mut ExecParam, addr_to_map: VirtAddr, prot: ProtFlags, map_flags: MapFlags, file_offset: usize, beginning_page_offset: usize, + seg_in_file_size: usize, map_size: usize, total_size: usize, map_err_handler: impl FnOnce(SystemError) -> SystemError, @@ -243,18 +244,13 @@ impl ElfLoader { return Err(SystemError::EINVAL); } - let file = param.file(); + let tmp_prot = if !prot.contains(ProtFlags::PROT_WRITE) { + prot | ProtFlags::PROT_WRITE + } else { + prot + }; let 
start_page = user_vm_guard - .file_mapping_with_file( - file, - addr_to_map, - map_len, - prot, - map_flags, - file_page_offset, - false, - false, - ) + .map_anonymous(addr_to_map, map_len, tmp_prot, map_flags, false, true) .map_err(map_err_handler)?; let mapped = start_page.virt_address(); @@ -269,6 +265,20 @@ impl ElfLoader { } } + Self::do_load_file( + mapped + beginning_page_offset, + seg_in_file_size, + file_offset, + param, + )?; + if tmp_prot != prot { + user_vm_guard.mprotect( + VirtPageFrame::new(mapped), + PageFrameCount::from_bytes(page_align_up(map_size)).unwrap(), + prot, + )?; + } + Ok((mapped, true)) } @@ -366,6 +376,7 @@ impl ElfLoader { *map_flags, file_offset, beginning_page_offset, + seg_in_file_size, map_size, total_size, map_err_handler, @@ -704,6 +715,7 @@ impl ElfLoader { init_info .auxv .insert(AtType::PageSize as u8, MMArch::PAGE_SIZE); + init_info.auxv.insert(AtType::Flags as u8, 0); init_info.auxv.insert(AtType::Phdr as u8, phdr_vaddr.data()); init_info .auxv @@ -715,6 +727,16 @@ impl ElfLoader { AtType::Base as u8, interpreter_base.unwrap_or(VirtAddr::new(0)).data(), ); + let cred = crate::process::ProcessManager::current_pcb().cred(); + init_info.auxv.insert(AtType::Uid as u8, cred.uid.data()); + init_info.auxv.insert(AtType::EUid as u8, cred.euid.data()); + init_info.auxv.insert(AtType::Gid as u8, cred.gid.data()); + init_info.auxv.insert(AtType::EGid as u8, cred.egid.data()); + init_info.auxv.insert(AtType::HwCap as u8, 0); + init_info.auxv.insert(AtType::ClkTck as u8, 100); + init_info.auxv.insert(AtType::Secure as u8, 0); + init_info.auxv.insert(AtType::HwCap2 as u8, 0); + init_info.auxv.insert(AtType::MinSigStackSize as u8, 2048); // 添加 rseq 相关的 auxv init_info @@ -810,8 +832,17 @@ impl ElfLoader { data_buf.clear(); data_buf.resize(size, 0); - file.read(size, data_buf) - .expect("read program header table failed"); + let read_len = file.read(size, data_buf).map_err(|e| { + error!("read program header table failed: {:?}", e); + 
elf::ParseError::BadOffset(phoff as u64) + })?; + if read_len != size { + error!( + "short read program header table: expected {}, got {}", + size, read_len + ); + return Err(elf::ParseError::BadOffset(phoff as u64)); + } let buf = data_buf.get_bytes(0..size)?; return Ok(Some(elf::segment::SegmentTable::new( diff --git a/kernel/src/libs/futex/futex.rs b/kernel/src/libs/futex/futex.rs index c58a08084e..4fed442ca5 100644 --- a/kernel/src/libs/futex/futex.rs +++ b/kernel/src/libs/futex/futex.rs @@ -22,7 +22,7 @@ use crate::{ mutex::{Mutex, MutexGuard}, wait_queue::{Waiter, Waker}, }, - mm::{ucontext::AddressSpace, MemoryManagementArch, VirtAddr}, + mm::{ucontext::AddressSpace, MemoryManagementArch, VirtAddr, VirtRegion}, process::{ProcessControlBlock, ProcessManager, RawPid}, syscall::user_access::{UserBufferReader, UserBufferWriter}, time::{ @@ -586,7 +586,8 @@ impl Futex { // 共享:需要生成能跨进程匹配的键 // 按照 Linux 语义,共享 futex 基于物理页帧号(PFN)或文件身份 let address_space = AddressSpace::current()?; - let as_guard = address_space.read(); + let as_guard = address_space + .read_guard_no_reservation_conflict(VirtRegion::new(uaddr, MMArch::PAGE_SIZE)); let vma = as_guard .mappings .contains(uaddr) diff --git a/kernel/src/mm/fault.rs b/kernel/src/mm/fault.rs index 65dcca3428..e5286cb1d1 100644 --- a/kernel/src/mm/fault.rs +++ b/kernel/src/mm/fault.rs @@ -423,8 +423,8 @@ impl PageFaultHandler { let mapper = &mut pfm.mapper; let mut page_manager_guard = page_manager_lock(); - if let Ok(page) = - page_manager_guard.copy_page(&cache_page.phys_address(), mapper.allocator_mut()) + if let Ok(page) = page_manager_guard + .copy_page_as_normal(&cache_page.phys_address(), mapper.allocator_mut()) { pfm.cow_page = Some(page.clone()); } else { @@ -668,7 +668,7 @@ impl PageFaultHandler { // 私有文件映射,必须拷贝页面 let new_page = { let mut page_manager_guard = page_manager_lock(); - match page_manager_guard.copy_page(&old_paddr, mapper.allocator_mut()) { + match page_manager_guard.copy_page_as_normal(&old_paddr, 
mapper.allocator_mut()) { Ok(page) => page, Err(_) => return VmFaultReason::VM_FAULT_OOM, } @@ -815,9 +815,12 @@ impl PageFaultHandler { VirtAddr::new(addr.data() + ((pgoff - start_pgoff) << MMArch::PAGE_SHIFT)); if mapper.get_entry(address, 0).is_none() { let mut flags = vma_guard.flags(); - if vma_guard - .vm_flags() - .contains(VmFlags::VM_SHARED | VmFlags::VM_WRITE) + let is_private_file_vma = vma_guard.vm_file().is_some() + && !vma_guard.vm_flags().contains(VmFlags::VM_SHARED); + if is_private_file_vma + || vma_guard + .vm_flags() + .contains(VmFlags::VM_SHARED | VmFlags::VM_WRITE) { flags = flags.set_write(false); } @@ -981,9 +984,9 @@ impl PageFaultHandler { let _pt_edit = mm.page_table_edit(); let mapper = &mut pfm.mapper; - let page_to_map = if flags.contains(FaultFlags::FAULT_FLAG_WRITE) - && !vma_guard.vm_flags().contains(VmFlags::VM_SHARED) - { + let is_private_file_vma = + vma_guard.vm_file().is_some() && !vma_guard.vm_flags().contains(VmFlags::VM_SHARED); + let page_to_map = if flags.contains(FaultFlags::FAULT_FLAG_WRITE) && is_private_file_vma { // 私有文件映射的写时复制 cow_page.expect("no cow_page in PageFaultMessage") } else { @@ -995,9 +998,10 @@ impl PageFaultHandler { let mlocked = vma_guard.vm_flags().contains(VmFlags::VM_LOCKED); let mut map_flags = vma_guard.flags(); - if vma_guard - .vm_flags() - .contains(VmFlags::VM_SHARED | VmFlags::VM_WRITE) + if (is_private_file_vma + || vma_guard + .vm_flags() + .contains(VmFlags::VM_SHARED | VmFlags::VM_WRITE)) && !flags.contains(FaultFlags::FAULT_FLAG_WRITE) { map_flags = map_flags.set_write(false); diff --git a/kernel/src/mm/page.rs b/kernel/src/mm/page.rs index 8b0bfbf78d..2bd0953663 100644 --- a/kernel/src/mm/page.rs +++ b/kernel/src/mm/page.rs @@ -202,6 +202,23 @@ impl PageManager { &mut self, old_phys: &PhysAddr, allocator: &mut dyn FrameAllocator, + ) -> Result, SystemError> { + self.copy_page_with_type(old_phys, allocator, None) + } + + pub fn copy_page_as_normal( + &mut self, + old_phys: &PhysAddr, + 
allocator: &mut dyn FrameAllocator, + ) -> Result, SystemError> { + self.copy_page_with_type(old_phys, allocator, Some(PageType::Normal)) + } + + fn copy_page_with_type( + &mut self, + old_phys: &PhysAddr, + allocator: &mut dyn FrameAllocator, + page_type: Option, ) -> Result, SystemError> { let old_page = self.get(old_phys).ok_or(SystemError::EINVAL)?; let paddr = unsafe { allocator.allocate_one().ok_or(SystemError::ENOMEM)? }; @@ -210,6 +227,9 @@ impl PageManager { let page = Page::copy(old_page.read(), paddr) .inspect_err(|_| unsafe { allocator.free_one(paddr) })?; + if let Some(page_type) = page_type { + page.write().set_page_type(page_type); + } self.insert(&page)?; @@ -998,11 +1018,11 @@ impl PageTable { entry.set_flags(new_flags); new_table.set_entry(i, entry); } else { - let phys = allocator.allocate_one()?; let mut page_manager_guard = page_manager_lock(); let old_phys = entry.address().unwrap(); - page_manager_guard.copy_page(&old_phys, allocator).ok()?; - new_table.set_entry(i, PageEntry::new(phys, entry.flags())); + let page = page_manager_guard.copy_page(&old_phys, allocator).ok()?; + new_table + .set_entry(i, PageEntry::new(page.phys_address(), entry.flags())); } } } diff --git a/kernel/src/mm/syscall/sys_brk.rs b/kernel/src/mm/syscall/sys_brk.rs index ee3fe2950c..654faa74d8 100644 --- a/kernel/src/mm/syscall/sys_brk.rs +++ b/kernel/src/mm/syscall/sys_brk.rs @@ -2,7 +2,6 @@ use crate::arch::{interrupt::TrapFrame, syscall::nr::SYS_BRK}; use crate::mm::ucontext::AddressSpace; -use crate::mm::MemoryManagementArch; use crate::mm::VirtAddr; use crate::syscall::table::{FormattedSyscallParam, Syscall}; use system_error::SystemError; @@ -30,23 +29,7 @@ impl Syscall for SysBrkHandle { let addr = Self::addr(args); let new_addr = VirtAddr::new(addr); let address_space = AddressSpace::current()?; - let mut address_space = address_space.write(); - - if new_addr < address_space.brk_start || new_addr >= crate::arch::MMArch::USER_END_VADDR { - return 
Ok(address_space.brk.data()); - } - if new_addr == address_space.brk { - return Ok(address_space.brk.data()); - } - - unsafe { - address_space - .set_brk(VirtAddr::new(crate::libs::align::page_align_up( - new_addr.data(), - ))) - .ok(); - return Ok(address_space.sbrk(0).unwrap().data()); - } + return address_space.set_brk_wait(new_addr); } /// Formats the syscall arguments for display/debugging purposes. diff --git a/kernel/src/mm/syscall/sys_get_mempolicy.rs b/kernel/src/mm/syscall/sys_get_mempolicy.rs index 8e52141545..c034dbf03b 100644 --- a/kernel/src/mm/syscall/sys_get_mempolicy.rs +++ b/kernel/src/mm/syscall/sys_get_mempolicy.rs @@ -1,7 +1,7 @@ //! System call handler for the get_mempolicy system call. -use crate::arch::{interrupt::TrapFrame, syscall::nr::SYS_GET_MEMPOLICY}; -use crate::mm::{ucontext::AddressSpace, VirtAddr}; +use crate::arch::{interrupt::TrapFrame, syscall::nr::SYS_GET_MEMPOLICY, MMArch}; +use crate::mm::{ucontext::AddressSpace, MemoryManagementArch, VirtAddr, VirtRegion}; use crate::syscall::table::{FormattedSyscallParam, Syscall}; use alloc::vec::Vec; use system_error::SystemError; @@ -130,7 +130,9 @@ impl SysGetMempolicy { /// 获取VMA的内存策略 fn get_vma_mempolicy(addr: VirtAddr) -> Result { let current_as = AddressSpace::current()?; - let as_guard = current_as.read(); + let page = VirtAddr::new(addr.data() & !MMArch::PAGE_OFFSET_MASK); + let as_guard = + current_as.read_guard_no_reservation_conflict(VirtRegion::new(page, MMArch::PAGE_SIZE)); // 检查地址是否在有效的VMA中 if let Some(_vma) = as_guard.mappings.contains(addr) { diff --git a/kernel/src/mm/syscall/sys_madvise.rs b/kernel/src/mm/syscall/sys_madvise.rs index b93fb74620..526e9f9c29 100644 --- a/kernel/src/mm/syscall/sys_madvise.rs +++ b/kernel/src/mm/syscall/sys_madvise.rs @@ -71,8 +71,7 @@ impl Syscall for SysMadviseHandle { let page_count = PageFrameCount::new(aligned_len / MMArch::PAGE_SIZE); current_address_space - .write() - .madvise(start_frame, page_count, madv_flags) + 
.madvise_wait(start_frame, page_count, madv_flags) .map_err(|_| SystemError::EINVAL)?; return Ok(0); } diff --git a/kernel/src/mm/syscall/sys_mincore.rs b/kernel/src/mm/syscall/sys_mincore.rs index d375c29eb1..05a473c2b8 100644 --- a/kernel/src/mm/syscall/sys_mincore.rs +++ b/kernel/src/mm/syscall/sys_mincore.rs @@ -50,9 +50,7 @@ impl Syscall for SysMincoreHandle { let mut writer = UserBufferWriter::new_checked(vec as *mut u8, page_count, true)?; let buf: &mut [u8] = writer.buffer(0)?; let page_count = PageFrameCount::new(page_count); - current_address_space - .read() - .mincore(start_frame, page_count, buf)?; + current_address_space.mincore_wait(start_frame, page_count, buf)?; return Ok(0); } diff --git a/kernel/src/mm/syscall/sys_mlock.rs b/kernel/src/mm/syscall/sys_mlock.rs index 8b82de17ea..02091968ca 100644 --- a/kernel/src/mm/syscall/sys_mlock.rs +++ b/kernel/src/mm/syscall/sys_mlock.rs @@ -7,7 +7,8 @@ use crate::{ arch::{interrupt::TrapFrame, syscall::nr::SYS_MLOCK, MMArch}, libs::align::page_align_down, mm::{ - access_ok, can_do_mlock, ucontext::AddressSpace, MemoryManagementArch, VirtAddr, VmFlags, + access_ok, can_do_mlock, ucontext::AddressSpace, MemoryManagementArch, VirtAddr, + VirtRegion, VmFlags, }, process::{cred::CAPFlags, resource::RLimitID, ProcessManager}, syscall::table::{FormattedSyscallParam, Syscall}, @@ -62,13 +63,21 @@ pub(super) fn do_mlock( } let vm = AddressSpace::current()?; - let mut guard = vm.write_interruptible()?; - - let new_pages = guard.count_unlocked_pages_for_mlock(start, len)?; - check_mlock_rlimit(guard.locked_vm, new_pages)?; - - guard.apply_vma_lock_flags(start, len, new_flags, false)?; - Ok(0) + let region = VirtRegion::new(start, len); + loop { + let mut guard = vm.write_interruptible()?; + if guard.mappings.first_reservation_conflict(region).is_some() { + drop(guard); + vm.wait_for_no_reservation_conflict_interruptible(region)?; + continue; + } + + let new_pages = guard.count_unlocked_pages_for_mlock(start, len)?; + 
check_mlock_rlimit(guard.locked_vm, new_pages)?; + + guard.apply_vma_lock_flags(start, len, new_flags, false)?; + return Ok(0); + } } pub(super) fn check_mlock_rlimit(locked_vm: usize, new_pages: usize) -> Result<(), SystemError> { diff --git a/kernel/src/mm/syscall/sys_mlock2.rs b/kernel/src/mm/syscall/sys_mlock2.rs index 54d73eb0d6..fb3d3a3d53 100644 --- a/kernel/src/mm/syscall/sys_mlock2.rs +++ b/kernel/src/mm/syscall/sys_mlock2.rs @@ -5,7 +5,7 @@ use system_error::SystemError; use crate::{ arch::{interrupt::TrapFrame, syscall::nr::SYS_MLOCK2}, - mm::{access_ok, can_do_mlock, ucontext::AddressSpace, VirtAddr, VmFlags}, + mm::{access_ok, can_do_mlock, ucontext::AddressSpace, VirtAddr, VirtRegion, VmFlags}, syscall::table::{FormattedSyscallParam, Syscall}, }; @@ -68,16 +68,25 @@ fn do_mlock2(start: VirtAddr, len: usize, flags: usize) -> Result Result { } let vm = AddressSpace::current()?; - let mut guard = vm.write_interruptible()?; - if flags & MCL_CURRENT != 0 { - let new_pages = guard.count_unlocked_pages_for_mlockall()?; - check_mlock_rlimit(guard.locked_vm, new_pages)?; + loop { + let mut guard = vm.write_interruptible()?; + if guard.mappings.first_reservation_region().is_some() { + drop(guard); + vm.wait_for_no_reservations_interruptible()?; + continue; + } + + if flags & MCL_CURRENT != 0 { + let new_pages = guard.count_unlocked_pages_for_mlockall()?; + check_mlock_rlimit(guard.locked_vm, new_pages)?; + } + guard.set_mlock_future(VmFlags::VM_NONE); + + if flags & MCL_CURRENT != 0 { + guard.apply_mlockall_current(lock_flags)?; + } + + if flags & MCL_FUTURE != 0 { + guard.set_mlock_future(lock_flags); + } + + // TODO: when fault-time page locking is implemented, VM_LOCKONFAULT should + // mark pages unevictable on demand instead of relying only on VMA state. 
+ return Ok(0); } - guard.set_mlock_future(VmFlags::VM_NONE); - - if flags & MCL_CURRENT != 0 { - guard.apply_mlockall_current(lock_flags)?; - } - - if flags & MCL_FUTURE != 0 { - guard.set_mlock_future(lock_flags); - } - - // TODO: when fault-time page locking is implemented, VM_LOCKONFAULT should - // mark pages unevictable on demand instead of relying only on VMA state. - Ok(0) } syscall_table_macros::declare_syscall!(SYS_MLOCKALL, SysMlockallHandle); diff --git a/kernel/src/mm/syscall/sys_mmap.rs b/kernel/src/mm/syscall/sys_mmap.rs index 66edb86bc7..7e95aedbd3 100644 --- a/kernel/src/mm/syscall/sys_mmap.rs +++ b/kernel/src/mm/syscall/sys_mmap.rs @@ -140,7 +140,7 @@ impl Syscall for SysMmapHandle { let current_address_space = AddressSpace::current()?; let start_page = if map_flags.contains(MapFlags::MAP_ANONYMOUS) { // 匿名映射 - current_address_space.write().map_anonymous( + current_address_space.map_anonymous_wait( start_vaddr, len, prot_flags, @@ -150,7 +150,7 @@ impl Syscall for SysMmapHandle { )? 
} else { // 文件映射 - current_address_space.write().file_mapping( + current_address_space.file_mapping( start_vaddr, len, prot_flags, diff --git a/kernel/src/mm/syscall/sys_mprotect.rs b/kernel/src/mm/syscall/sys_mprotect.rs index 08250028b9..c961b845bd 100644 --- a/kernel/src/mm/syscall/sys_mprotect.rs +++ b/kernel/src/mm/syscall/sys_mprotect.rs @@ -62,8 +62,7 @@ impl Syscall for SysMprotectHandle { let page_count = PageFrameCount::from_bytes(len_aligned).unwrap(); current_address_space - .write() - .mprotect(start_frame, page_count, prot_flags) + .mprotect_wait(start_frame, page_count, prot_flags) .map_err(|_| SystemError::EINVAL)?; return Ok(0); } diff --git a/kernel/src/mm/syscall/sys_mremap.rs b/kernel/src/mm/syscall/sys_mremap.rs index b27086e7f0..f3906c8dc0 100644 --- a/kernel/src/mm/syscall/sys_mremap.rs +++ b/kernel/src/mm/syscall/sys_mremap.rs @@ -6,7 +6,7 @@ use crate::mm::syscall::sys_munmap::do_munmap; use crate::mm::syscall::MremapFlags; use crate::mm::ucontext::AddressSpace; use crate::mm::MemoryManagementArch; -use crate::mm::{MMArch, VirtAddr, VmFlags}; +use crate::mm::{MMArch, VirtAddr, VirtRegion, VmFlags}; use crate::syscall::table::{FormattedSyscallParam, Syscall}; use system_error::SystemError; @@ -71,11 +71,23 @@ impl Syscall for SysMremapHandle { } let current_address_space = AddressSpace::current()?; - let vma = current_address_space.read().mappings.contains(old_vaddr); - if vma.is_none() { + let vma = loop { + let guard = current_address_space.read(); + if let Some(vma) = guard.mappings.contains(old_vaddr) { + break vma; + } + let probe_region = VirtRegion::new(old_vaddr, MMArch::PAGE_SIZE); + if guard + .mappings + .first_reservation_conflict(probe_region) + .is_some() + { + drop(guard); + current_address_space.wait_for_no_reservation_conflict(probe_region); + continue; + } return Err(SystemError::EINVAL); - } - let vma = vma.unwrap(); + }; let (vm_flags, vma_region) = { let g = vma.lock(); (*g.vm_flags(), *g.region()) @@ -127,7 +139,7 @@ 
impl Syscall for SysMremapHandle { } // 重映射到新内存区域 - let r = current_address_space.write().mremap( + let r = current_address_space.mremap_wait( old_vaddr, old_len, new_len, diff --git a/kernel/src/mm/syscall/sys_msync.rs b/kernel/src/mm/syscall/sys_msync.rs index 5a4081138e..ff4ff10b05 100644 --- a/kernel/src/mm/syscall/sys_msync.rs +++ b/kernel/src/mm/syscall/sys_msync.rs @@ -5,7 +5,7 @@ use crate::arch::{interrupt::TrapFrame, syscall::nr::SYS_MSYNC, MMArch}; use crate::mm::{ syscall::{MsFlags, VmFlags}, ucontext::AddressSpace, - MemoryManagementArch, VirtAddr, + MemoryManagementArch, VirtAddr, VirtRegion, }; use crate::syscall::table::{FormattedSyscallParam, Syscall}; @@ -61,8 +61,9 @@ impl Syscall for SysMsyncHandle { let current_address_space = AddressSpace::current()?; let mut err = Err(SystemError::ENOMEM); let mut unmapped_error = Ok(0); + let initial_region = VirtRegion::new(VirtAddr::new(start), end - start); let mut next_vma = current_address_space - .read() + .read_guard_no_reservation_conflict(initial_region) .mappings .find_nearest(VirtAddr::new(start)); loop { @@ -125,8 +126,9 @@ impl Syscall for SysMsyncHandle { err = unmapped_error; break; } + let remaining = VirtRegion::new(VirtAddr::new(start), end - start); next_vma = current_address_space - .read() + .read_guard_no_reservation_conflict(remaining) .mappings .find_nearest(VirtAddr::new(start)); } else { diff --git a/kernel/src/mm/syscall/sys_munlock.rs b/kernel/src/mm/syscall/sys_munlock.rs index 9af095b176..cd8d97ed4b 100644 --- a/kernel/src/mm/syscall/sys_munlock.rs +++ b/kernel/src/mm/syscall/sys_munlock.rs @@ -5,7 +5,7 @@ use system_error::SystemError; use crate::{ arch::{interrupt::TrapFrame, syscall::nr::SYS_MUNLOCK}, - mm::{access_ok, ucontext::AddressSpace, VirtAddr, VmFlags}, + mm::{access_ok, ucontext::AddressSpace, VirtAddr, VirtRegion, VmFlags}, syscall::table::{FormattedSyscallParam, Syscall}, }; @@ -30,9 +30,17 @@ impl Syscall for SysMunlockHandle { } let vm = 
AddressSpace::current()?; - let mut guard = vm.write_interruptible()?; - guard.apply_vma_lock_flags(start, len, VmFlags::VM_NONE, false)?; - Ok(0) + let region = VirtRegion::new(start, len); + loop { + let mut guard = vm.write_interruptible()?; + if guard.mappings.first_reservation_conflict(region).is_some() { + drop(guard); + vm.wait_for_no_reservation_conflict_interruptible(region)?; + continue; + } + guard.apply_vma_lock_flags(start, len, VmFlags::VM_NONE, false)?; + return Ok(0); + } } fn entry_format(&self, args: &[usize]) -> Vec { diff --git a/kernel/src/mm/syscall/sys_munlockall.rs b/kernel/src/mm/syscall/sys_munlockall.rs index 91c727a8d0..259be06a2f 100644 --- a/kernel/src/mm/syscall/sys_munlockall.rs +++ b/kernel/src/mm/syscall/sys_munlockall.rs @@ -18,9 +18,16 @@ impl Syscall for SysMunlockallHandle { fn handle(&self, _args: &[usize], _frame: &mut TrapFrame) -> Result { let vm = AddressSpace::current()?; - let mut guard = vm.write_interruptible()?; - guard.clear_all_vma_lock_flags()?; - Ok(0) + loop { + let mut guard = vm.write_interruptible()?; + if guard.mappings.first_reservation_region().is_some() { + drop(guard); + vm.wait_for_no_reservations_interruptible()?; + continue; + } + guard.clear_all_vma_lock_flags()?; + return Ok(0); + } } fn entry_format(&self, _args: &[usize]) -> Vec { diff --git a/kernel/src/mm/syscall/sys_munmap.rs b/kernel/src/mm/syscall/sys_munmap.rs index 49c77bf90e..1c44bba0d9 100644 --- a/kernel/src/mm/syscall/sys_munmap.rs +++ b/kernel/src/mm/syscall/sys_munmap.rs @@ -81,8 +81,7 @@ pub(super) fn do_munmap(start_vaddr: VirtAddr, len: usize) -> Result Result { let address_space = AddressSpace::current()?; assert!(address_space.read().user_mapper.utable.is_current()); - let mut address_space = address_space.write(); - let r = unsafe { address_space.sbrk(incr) }?; + let r = address_space.sbrk_wait(incr)?; return Ok(r.data()); } diff --git a/kernel/src/mm/ucontext.rs b/kernel/src/mm/ucontext.rs index c0b4f0eb2e..d81d22e628 100644 --- 
a/kernel/src/mm/ucontext.rs +++ b/kernel/src/mm/ucontext.rs @@ -23,17 +23,21 @@ use system_error::SystemError; use crate::{ arch::{mm::PageMapper, CurrentIrqArch, MMArch}, exception::InterruptArch, - filesystem::vfs::{ - file::{File, FileMode}, - FileType, InodeId, + filesystem::{ + page_cache::UnmapMappingMode, + vfs::{ + file::{File, FileMode}, + FileType, InodeId, + }, }, ipc::shm::{ShmFlags, ShmId}, libs::{ align::page_align_up, cpumask::CpuMask, mutex::{Mutex, MutexGuard}, - rwsem::RwSem, + rwsem::{RwSem, RwSemReadGuard, RwSemWriteGuard}, spinlock::SpinLock, + wait_queue::WaitQueue, }, mm::{mmu_gather::MmuGather, page::page_manager_lock, PhysAddr}, process::{cred::CAPFlags, resource::RLimitID, ProcessManager}, @@ -71,6 +75,10 @@ static LOCKEDVMA_ID_ALLOCATOR: SpinLock = /// 用于为每个地址空间分配一个全局唯一且递增的ID static ADDRESS_SPACE_ID_ALLOCATOR: AtomicU64 = AtomicU64::new(1); +pub type MmapReservationId = u64; + +static MMAP_RESERVATION_ID_ALLOCATOR: AtomicU64 = AtomicU64::new(1); + #[derive(Debug)] pub struct AddressSpace { /// 全局唯一的地址空间ID,用于标识不同的地址空间 @@ -98,6 +106,8 @@ pub struct AddressSpace { page_table_edit_lock: Mutex<()>, /// 使用RwSem而非RwLock,因为地址空间操作可能需要进行I/O(如页缺失时的文件读取) inner: RwSem, + /// 等待未发布的 mmap reservation 提交或取消。 + reservation_wait: WaitQueue, } impl AddressSpace { @@ -112,6 +122,7 @@ impl AddressSpace { tlb_gen: AtomicU64::new(0), page_table_edit_lock: Mutex::new(()), inner: RwSem::new(inner), + reservation_wait: WaitQueue::default(), }); // Back-fill the Weak so that InnerAddressSpace methods can obtain // the outer Arc to construct MmuGather / initiate TLB shootdown. 
@@ -207,13 +218,631 @@ impl AddressSpace { crate::mm::tlb::flush_tlb_mm(self); } - #[inline] - pub fn page_table_edit(&self) -> MutexGuard<'_, ()> { - debug_assert!( - CurrentIrqArch::is_irq_enabled(), - "page_table_edit_lock must not be taken with interrupts disabled" - ); - self.page_table_edit_lock.lock() + #[inline] + pub fn page_table_edit(&self) -> MutexGuard<'_, ()> { + debug_assert!( + CurrentIrqArch::is_irq_enabled(), + "page_table_edit_lock must not be taken with interrupts disabled" + ); + self.page_table_edit_lock.lock() + } + + pub fn wait_for_no_reservation_conflict(self: &Arc, region: VirtRegion) { + self.reservation_wait.wait_until(|| { + let guard = self.write(); + if guard.mappings.first_reservation_conflict(region).is_none() { + Some(()) + } else { + None + } + }); + } + + pub fn wait_for_no_reservation_conflict_interruptible( + self: &Arc, + region: VirtRegion, + ) -> Result<(), SystemError> { + self.reservation_wait.wait_until_interruptible(|| { + let guard = self.write(); + if guard.mappings.first_reservation_conflict(region).is_none() { + Some(()) + } else { + None + } + }) + } + + pub fn wait_for_no_reservations(self: &Arc) { + self.reservation_wait.wait_until(|| { + let guard = self.write(); + if guard.mappings.first_reservation_region().is_none() { + Some(()) + } else { + None + } + }); + } + + pub fn wait_for_no_reservations_interruptible(self: &Arc) -> Result<(), SystemError> { + self.reservation_wait.wait_until_interruptible(|| { + let guard = self.write(); + if guard.mappings.first_reservation_region().is_none() { + Some(()) + } else { + None + } + }) + } + + pub fn read_guard_no_reservation_conflict( + self: &Arc, + region: VirtRegion, + ) -> RwSemReadGuard<'_, InnerAddressSpace> { + self.reservation_wait.wait_until(|| { + let guard = self.read(); + if guard.mappings.first_reservation_conflict(region).is_none() { + Some(guard) + } else { + None + } + }) + } + + pub fn write_guard_no_reservation_conflict( + self: &Arc, + region: 
VirtRegion, + ) -> RwSemWriteGuard<'_, InnerAddressSpace> { + self.reservation_wait.wait_until(|| { + let guard = self.write(); + if guard.mappings.first_reservation_conflict(region).is_none() { + Some(guard) + } else { + None + } + }) + } + + pub fn read_guard_no_reservations(self: &Arc) -> RwSemReadGuard<'_, InnerAddressSpace> { + self.reservation_wait.wait_until(|| { + let guard = self.read(); + if guard.mappings.first_reservation_region().is_none() { + Some(guard) + } else { + None + } + }) + } + + fn wake_reservation_waiters(&self) { + self.reservation_wait.wake_all(); + } + + fn round_mmap_hint(start_vaddr: VirtAddr, round_to_min: bool) -> Option { + let addr = start_vaddr.data() & (!MMArch::PAGE_OFFSET_MASK); + if (addr != 0) && round_to_min && (addr < DEFAULT_MMAP_MIN_ADDR) { + Some(VirtAddr::new(page_align_up(DEFAULT_MMAP_MIN_ADDR))) + } else if addr == 0 { + None + } else { + Some(VirtAddr::new(addr)) + } + } + + fn reservation_region_for_hint( + start_vaddr: VirtAddr, + len: usize, + round_to_min: bool, + ) -> Option { + Self::round_mmap_hint(start_vaddr, round_to_min).map(|start| VirtRegion::new(start, len)) + } + + #[allow(clippy::too_many_arguments)] + pub fn map_anonymous_wait( + self: &Arc, + start_vaddr: VirtAddr, + len: usize, + prot_flags: ProtFlags, + map_flags: MapFlags, + round_to_min: bool, + allocate_at_once: bool, + ) -> Result { + let len = page_align_up(len); + loop { + let mut guard = self.write(); + if let Some(region) = Self::reservation_region_for_hint(start_vaddr, len, round_to_min) + { + if guard.mappings.first_reservation_conflict(region).is_some() { + drop(guard); + self.wait_for_no_reservation_conflict(region); + continue; + } + } + + guard.check_rlimit_as_for_bytes(len)?; + return guard.map_anonymous( + start_vaddr, + len, + prot_flags, + map_flags, + round_to_min, + allocate_at_once, + ); + } + } + + #[allow(clippy::too_many_arguments)] + pub fn file_mapping( + self: &Arc, + start_vaddr: VirtAddr, + len: usize, + prot_flags: 
ProtFlags, + map_flags: MapFlags, + fd: i32, + offset: usize, + round_to_min: bool, + allocate_at_once: bool, + ) -> Result { + let binding = ProcessManager::current_pcb().fd_table(); + let fd_table_guard = binding.read(); + let file = fd_table_guard + .get_file_by_fd(fd) + .ok_or(SystemError::EBADF)?; + drop(fd_table_guard); + + self.file_mapping_with_file( + file, + start_vaddr, + len, + prot_flags, + map_flags, + offset, + round_to_min, + allocate_at_once, + ) + } + + #[allow(clippy::too_many_arguments)] + pub fn file_mapping_with_file( + self: &Arc, + file: Arc, + start_vaddr: VirtAddr, + len: usize, + prot_flags: ProtFlags, + map_flags: MapFlags, + offset: usize, + round_to_min: bool, + allocate_at_once: bool, + ) -> Result { + let len = page_align_up(len); + if len == 0 { + return Err(SystemError::EINVAL); + } + + let _force_lazy_on_page_fault_arch = allocate_at_once && MMArch::PAGE_FAULT_ENABLED; + + let file_mode = file.mode(); + if file_mode.contains(FileMode::FMODE_PATH) { + return Err(SystemError::EBADF); + } + + let wants_access = prot_flags != ProtFlags::PROT_NONE; + if wants_access && !file_mode.contains(FileMode::FMODE_READ) { + return Err(SystemError::EACCES); + } + if prot_flags.contains(ProtFlags::PROT_EXEC) && !file_mode.contains(FileMode::FMODE_READ) { + return Err(SystemError::EACCES); + } + if prot_flags.contains(ProtFlags::PROT_WRITE) { + if map_flags.contains(MapFlags::MAP_SHARED) { + if !file_mode.contains(FileMode::FMODE_WRITE) { + return Err(SystemError::EACCES); + } + } else if !file_mode.contains(FileMode::FMODE_READ) { + return Err(SystemError::EACCES); + } + } + + if matches!(file.file_type(), FileType::Pipe | FileType::Dir) { + return Err(SystemError::ENODEV); + } + if (offset & (MMArch::PAGE_SIZE - 1)) != 0 { + return Err(SystemError::EINVAL); + } + + let pgoff = offset >> MMArch::PAGE_SHIFT; + let page_count = PageFrameCount::from_bytes(len).unwrap(); + let may_write = + !map_flags.contains(MapFlags::MAP_SHARED) || 
file_mode.contains(FileMode::FMODE_WRITE); + let vma_file = file.inode().mmap_effective_file(&file)?; + + loop { + let mut guard = self.write(); + let page = match Self::round_mmap_hint(start_vaddr, round_to_min) { + Some(vaddr) => { + let mmap_min = guard.mmap_min; + match guard.find_free_at(mmap_min, vaddr, len, map_flags) { + Ok(region) => VirtPageFrame::new(region.start()), + Err(SystemError::EAGAIN_OR_EWOULDBLOCK) => { + let region = VirtRegion::new(vaddr, len); + drop(guard); + self.wait_for_no_reservation_conflict(region); + continue; + } + Err(err) => return Err(err), + } + } + None => { + let region = guard + .mappings + .find_free(guard.mmap_min, len) + .ok_or(SystemError::ENOMEM)?; + VirtPageFrame::new(region.start()) + } + }; + let region = VirtRegion::new(page.virt_address(), len); + + let mut vm_flags = VmFlags::from(prot_flags) + | VmFlags::from(map_flags) + | guard.mlock_future + | VmFlags::VM_MAYREAD + | VmFlags::VM_MAYEXEC; + if may_write { + vm_flags |= VmFlags::VM_MAYWRITE; + } + + if vm_flags.contains(VmFlags::VM_LOCKED) { + let error = if map_flags.contains(MapFlags::MAP_LOCKED) + && !InnerAddressSpace::has_mlock_quota() + { + SystemError::EPERM + } else { + SystemError::EAGAIN_OR_EWOULDBLOCK + }; + guard.check_mlock_rlimit_for_pages(page_count.data(), error)?; + } + guard.check_rlimit_as_for_bytes(len)?; + + file.inode().check_mmap_file(&file, len, offset, vm_flags)?; + + let reservation_id = guard.mappings.reserve_region(region)?; + let entry_flags = EntryFlags::from_prot_flags(prot_flags, true); + let lazy_vma = if MMArch::PAGE_FAULT_ENABLED { + Some(LockedVMA::new(VMA::new( + region, + vm_flags, + entry_flags, + Some(vma_file.clone()), + Some(pgoff), + false, + ))) + } else { + None + }; + drop(guard); + + let mut reservation = MmapReservationGuard::new(self.clone(), reservation_id); + let hook_result = + file.inode() + .mmap_file(&file, region.start().data(), len, offset, vm_flags); + let mut guard = self.write(); + + if let Err(err) = 
hook_result { + if err != SystemError::ENOSYS { + if guard.mappings.cancel_reservation(reservation_id).is_some() { + drop(guard); + reservation.disarm(); + self.wake_reservation_waiters(); + } else { + drop(guard); + reservation.disarm(); + } + return Err(err); + } + } + + let new_vma = if let Some(vma) = lazy_vma { + vma + } else { + let mut flusher = crate::mm::page::DeferredFlusher::new(); + compiler_fence(Ordering::SeqCst); + let _pt_edit = self.page_table_edit(); + match VMA::zeroed( + page, + page_count, + vm_flags, + entry_flags, + &mut guard.user_mapper.utable, + &mut flusher, + Some(vma_file.clone()), + Some(pgoff), + ) { + Ok(vma) => vma, + Err(err) => { + if guard.mappings.cancel_reservation(reservation_id).is_some() { + drop(guard); + reservation.disarm(); + self.wake_reservation_waiters(); + } else { + drop(guard); + reservation.disarm(); + } + return Err(err); + } + } + }; + + let new_locked_vm = if vm_flags.contains(VmFlags::VM_LOCKED) { + let error = if map_flags.contains(MapFlags::MAP_LOCKED) + && !InnerAddressSpace::has_mlock_quota() + { + SystemError::EPERM + } else { + SystemError::EAGAIN_OR_EWOULDBLOCK + }; + if let Err(err) = guard.check_mlock_rlimit_for_pages(page_count.data(), error) { + if guard.mappings.cancel_reservation(reservation_id).is_some() { + drop(guard); + reservation.disarm(); + self.wake_reservation_waiters(); + } else { + drop(guard); + reservation.disarm(); + } + return Err(err); + } + Some( + guard + .locked_vm + .checked_add(page_count.data()) + .ok_or(SystemError::ENOMEM)?, + ) + } else { + None + }; + + if let Err(err) = guard.mappings.commit_reserved_vma(reservation_id, new_vma) { + drop(guard); + return Err(err); + } + + if let Some(new_locked_vm) = new_locked_vm { + guard.locked_vm = new_locked_vm; + } + reservation.disarm(); + drop(guard); + self.wake_reservation_waiters(); + return Ok(page); + } + } + + pub fn munmap_wait( + self: &Arc, + start_page: VirtPageFrame, + page_count: PageFrameCount, + ) -> Result<(), 
SystemError> { + let region = VirtRegion::new(start_page.virt_address(), page_count.bytes()); + loop { + let mut guard = self.write(); + if guard.mappings.first_reservation_conflict(region).is_some() { + drop(guard); + self.wait_for_no_reservation_conflict(region); + continue; + } + return guard.munmap(start_page, page_count); + } + } + + pub fn mprotect_wait( + self: &Arc, + start_page: VirtPageFrame, + page_count: PageFrameCount, + prot_flags: ProtFlags, + ) -> Result<(), SystemError> { + let region = VirtRegion::new(start_page.virt_address(), page_count.bytes()); + loop { + let mut guard = self.write(); + if guard.mappings.first_reservation_conflict(region).is_some() { + drop(guard); + self.wait_for_no_reservation_conflict(region); + continue; + } + return guard.mprotect(start_page, page_count, prot_flags); + } + } + + pub fn madvise_wait( + self: &Arc, + start_page: VirtPageFrame, + page_count: PageFrameCount, + behavior: MadvFlags, + ) -> Result<(), SystemError> { + let region = VirtRegion::new(start_page.virt_address(), page_count.bytes()); + loop { + let mut guard = self.write(); + if guard.mappings.first_reservation_conflict(region).is_some() { + drop(guard); + self.wait_for_no_reservation_conflict(region); + continue; + } + return guard.madvise(start_page, page_count, behavior); + } + } + + pub fn mincore_wait( + self: &Arc, + start_page: VirtPageFrame, + page_count: PageFrameCount, + vec: &mut [u8], + ) -> Result<(), SystemError> { + let region = VirtRegion::new(start_page.virt_address(), page_count.bytes()); + loop { + let guard = self.read(); + if guard.mappings.first_reservation_conflict(region).is_some() { + drop(guard); + self.wait_for_no_reservation_conflict(region); + continue; + } + return guard.mincore(start_page, page_count, vec); + } + } + + pub fn mremap_wait( + self: &Arc, + old_vaddr: VirtAddr, + old_len: usize, + new_len: usize, + mremap_flags: MremapFlags, + new_vaddr: VirtAddr, + vm_flags: VmFlags, + ) -> Result { + loop { + let mut guard 
= self.write(); + let mut wait_region = None; + if old_len != 0 { + let old_region = VirtRegion::new(old_vaddr, old_len); + if guard + .mappings + .first_reservation_conflict(old_region) + .is_some() + { + wait_region = Some(old_region); + } else if new_len > old_len { + let grow_region = VirtRegion::new(old_vaddr + old_len, new_len - old_len); + if guard + .mappings + .first_reservation_conflict(grow_region) + .is_some() + { + wait_region = Some(grow_region); + } + } + } + if wait_region.is_none() && mremap_flags.contains(MremapFlags::MREMAP_FIXED) { + let new_region = VirtRegion::new(new_vaddr, new_len); + if guard + .mappings + .first_reservation_conflict(new_region) + .is_some() + { + wait_region = Some(new_region); + } + } + + if let Some(region) = wait_region { + drop(guard); + self.wait_for_no_reservation_conflict(region); + continue; + } + + match guard.mremap( + old_vaddr, + old_len, + new_len, + mremap_flags, + new_vaddr, + vm_flags, + ) { + Err(SystemError::EAGAIN_OR_EWOULDBLOCK) => { + let retry_region = if mremap_flags.contains(MremapFlags::MREMAP_FIXED) { + VirtRegion::new(new_vaddr, new_len) + } else if new_len > old_len { + VirtRegion::new(old_vaddr + old_len, new_len - old_len) + } else { + VirtRegion::new(old_vaddr, old_len.max(MMArch::PAGE_SIZE)) + }; + if guard + .mappings + .first_reservation_conflict(retry_region) + .is_some() + { + drop(guard); + self.wait_for_no_reservation_conflict(retry_region); + continue; + } + return Err(SystemError::EAGAIN_OR_EWOULDBLOCK); + } + other => return other, + } + } + } + + pub fn set_brk_wait(self: &Arc, new_addr: VirtAddr) -> Result { + loop { + let mut guard = self.write(); + + if new_addr < guard.brk_start || new_addr >= MMArch::USER_END_VADDR { + return Ok(guard.brk.data()); + } + if new_addr == guard.brk { + return Ok(guard.brk.data()); + } + + let new_brk = VirtAddr::new(page_align_up(new_addr.data())); + let wait_region = if new_brk > guard.brk { + Some(VirtRegion::new(guard.brk, new_brk - guard.brk)) 
+ } else if new_brk < guard.brk { + Some(VirtRegion::new(new_brk, guard.brk - new_brk)) + } else { + None + }; + if let Some(region) = wait_region { + if guard.mappings.first_reservation_conflict(region).is_some() { + drop(guard); + self.wait_for_no_reservation_conflict(region); + continue; + } + } + + unsafe { + guard.set_brk(new_brk).ok(); + return Ok(guard.sbrk(0).unwrap().data()); + } + } + } + + pub fn sbrk_wait(self: &Arc, incr: isize) -> Result { + loop { + let mut guard = self.write(); + if incr == 0 { + return Ok(guard.brk); + } + + let requested = if incr > 0 { + guard.brk + incr as usize + } else { + guard.brk - incr.unsigned_abs() + }; + let new_brk = VirtAddr::new(page_align_up(requested.data())); + let wait_region = if new_brk > guard.brk { + Some(VirtRegion::new(guard.brk, new_brk - guard.brk)) + } else if new_brk < guard.brk { + Some(VirtRegion::new(new_brk, guard.brk - new_brk)) + } else { + None + }; + + if let Some(region) = wait_region { + if guard.mappings.first_reservation_conflict(region).is_some() { + drop(guard); + self.wait_for_no_reservation_conflict(region); + continue; + } + } + + return unsafe { guard.sbrk(incr) }; + } + } + + pub fn try_clone_wait(self: &Arc) -> Result, SystemError> { + loop { + let mut guard = self.write(); + if let Some(region) = guard.mappings.first_reservation_region() { + drop(guard); + self.wait_for_no_reservation_conflict(region); + continue; + } + return guard.try_clone(); + } } } @@ -247,6 +876,39 @@ impl core::ops::DerefMut for AddressSpace { } } +struct MmapReservationGuard { + mm: Arc, + id: MmapReservationId, + active: bool, +} + +impl MmapReservationGuard { + fn new(mm: Arc, id: MmapReservationId) -> Self { + Self { + mm, + id, + active: true, + } + } + + fn disarm(&mut self) { + self.active = false; + } +} + +impl Drop for MmapReservationGuard { + fn drop(&mut self) { + if !self.active { + return; + } + let mut guard = self.mm.write(); + if guard.mappings.cancel_reservation(self.id).is_some() { + 
drop(guard); + self.mm.wake_reservation_waiters(); + } + } +} + /// @brief 用户地址空间结构体(每个进程都有一个) #[derive(Debug)] pub struct InnerAddressSpace { @@ -290,13 +952,15 @@ struct VmaCloseNotification { impl InnerAddressSpace { /// 当前地址空间已占用的虚拟内存字节数(简单求和所有 VMA 尺寸) pub fn vma_usage_bytes(&self) -> usize { - self.mappings + let vma_bytes = self + .mappings .iter_vmas() .map(|v| { let g = v.lock(); g.region().size() }) - .sum() + .sum::(); + vma_bytes.saturating_add(self.mappings.reservation_usage_bytes()) } pub fn new(_create_stack: bool) -> Result { @@ -328,6 +992,10 @@ impl InnerAddressSpace { /// 返回克隆后的,新的地址空间的Arc指针 #[inline(never)] pub fn try_clone(&mut self) -> Result, SystemError> { + if self.mappings.first_reservation_region().is_some() { + return Err(SystemError::EAGAIN_OR_EWOULDBLOCK); + } + let new_addr_space = AddressSpace::new(false)?; let mut new_guard = new_addr_space.write(); @@ -557,6 +1225,26 @@ impl InnerAddressSpace { Ok(()) } + fn check_rlimit_as_for_bytes(&self, len: usize) -> Result<(), SystemError> { + let rlim_as = ProcessManager::current_pcb() + .get_rlimit(RLimitID::As) + .rlim_cur as usize; + if rlim_as == usize::MAX { + return Ok(()); + } + + let allowance = MMArch::PAGE_SIZE; + if self + .vma_usage_bytes() + .checked_add(len) + .is_none_or(|v| v > rlim_as.saturating_add(allowance)) + { + Err(SystemError::ENOMEM) + } else { + Ok(()) + } + } + fn mlock_fault_flags(vm_flags: VmFlags) -> Option { if vm_flags.contains(VmFlags::VM_WRITE) { Some(FaultFlags::FAULT_FLAG_WRITE) @@ -764,232 +1452,6 @@ impl InnerAddressSpace { return Ok(start_page); } - /// 进行文件页映射 - /// - /// ## 参数 - /// - /// - `file`:要映射的文件(直接传入 File,而非通过 fd_table 查找) - /// - `start_vaddr`:映射的起始地址 - /// - `len`:映射的长度 - /// - `prot_flags`:保护标志 - /// - `map_flags`:映射标志 - /// - `offset`:映射偏移量 - /// - `round_to_min`:是否将`start_vaddr`对齐到`mmap_min`,如果为`true`,则当`start_vaddr`不为0时,会对齐到`mmap_min`,否则仅向下对齐到页边界 - /// - `allocate_at_once`:是否立即分配物理空间(文件映射通常应为按需缺页;此参数仅在禁用缺页机制时被强制为 true) - /// - /// ## 返回 
- /// - /// 返回映射的起始虚拟页帧 - #[allow(clippy::too_many_arguments)] - pub fn file_mapping_with_file( - &mut self, - file: Arc, - start_vaddr: VirtAddr, - len: usize, - prot_flags: ProtFlags, - map_flags: MapFlags, - offset: usize, - round_to_min: bool, - allocate_at_once: bool, - ) -> Result { - let allocate_at_once = if MMArch::PAGE_FAULT_ENABLED { - allocate_at_once - } else { - true - }; - // 用于对齐hint的函数 - let round_hint_to_min = |hint: VirtAddr| { - // 先把hint向下对齐到页边界 - let addr = hint.data() & (!MMArch::PAGE_OFFSET_MASK); - // 如果hint不是0,且hint小于DEFAULT_MMAP_MIN_ADDR,则对齐到DEFAULT_MMAP_MIN_ADDR - if (addr != 0) && round_to_min && (addr < DEFAULT_MMAP_MIN_ADDR) { - Some(VirtAddr::new(page_align_up(DEFAULT_MMAP_MIN_ADDR))) - } else if addr == 0 { - None - } else { - Some(VirtAddr::new(addr)) - } - }; - - let len = page_align_up(len); - - // 权限检查遵循 Linux 语义: - // - O_PATH 直接返回 EBADF - // - 除 PROT_NONE 外,映射需要读权限;PROT_WRITE 另外需要写权限(MAP_PRIVATE 也需要读以便 COW) - // - PROT_EXEC 视为读检查 - let file_mode = file.mode(); - if file_mode.contains(FileMode::FMODE_PATH) { - return Err(SystemError::EBADF); - } - - let wants_access = prot_flags != ProtFlags::PROT_NONE; - if wants_access && !file_mode.contains(FileMode::FMODE_READ) { - return Err(SystemError::EACCES); - } - if prot_flags.contains(ProtFlags::PROT_EXEC) && !file_mode.contains(FileMode::FMODE_READ) { - return Err(SystemError::EACCES); - } - if prot_flags.contains(ProtFlags::PROT_WRITE) { - if map_flags.contains(MapFlags::MAP_SHARED) { - if !file_mode.contains(FileMode::FMODE_WRITE) { - return Err(SystemError::EACCES); - } - } else if !file_mode.contains(FileMode::FMODE_READ) { - return Err(SystemError::EACCES); - } - } - - if matches!(file.file_type(), FileType::Pipe | FileType::Dir) { - return Err(SystemError::ENODEV); - } - - // offset需要4K对齐 - if (offset & (MMArch::PAGE_SIZE - 1)) != 0 { - return Err(SystemError::EINVAL); - } - let pgoff = offset >> MMArch::PAGE_SHIFT; - - let page_count = 
PageFrameCount::from_bytes(len).unwrap(); - let may_write = - !map_flags.contains(MapFlags::MAP_SHARED) || file_mode.contains(FileMode::FMODE_WRITE); - let mut precheck_vm_flags = VmFlags::from(prot_flags) - | VmFlags::from(map_flags) - | self.mlock_future - | VmFlags::VM_MAYREAD - | VmFlags::VM_MAYEXEC; - if may_write { - precheck_vm_flags |= VmFlags::VM_MAYWRITE; - } - file.inode() - .check_mmap_file(&file, len, offset, precheck_vm_flags)?; - - let start_page: VirtPageFrame = self.mmap( - round_hint_to_min(start_vaddr), - page_count, - prot_flags, - map_flags, - |page, count, vm_flags, flags, mapper, flusher| { - let vm_flags = if may_write { - vm_flags - } else { - vm_flags & !VmFlags::VM_MAYWRITE - }; - if allocate_at_once { - VMA::zeroed( - page, - count, - vm_flags, - flags, - mapper, - flusher, - Some(file.clone()), - Some(pgoff), - ) - } else { - Ok(LockedVMA::new(VMA::new( - VirtRegion::new(page.virt_address(), count.data() * MMArch::PAGE_SIZE), - vm_flags, - flags, - Some(file.clone()), - Some(pgoff), - false, - ))) - } - }, - )?; - - // todo!(impl mmap for other file) - // https://github.com/DragonOS-Community/DragonOS/pull/912#discussion_r1765334272 - // 传入实际映射后的起始虚拟地址,而非用户传入的 hint - let vma = self.mappings.contains(start_page.virt_address()); - let vm_flags = vma - .as_ref() - .map(|vma| *vma.lock().vm_flags()) - .unwrap_or(VmFlags::empty()); - - match file.inode().mmap_file( - &file, - start_page.virt_address().data(), - len, - offset, - vm_flags, - ) { - Ok(_) => { - if let Some(vma) = vma { - self.mappings.attach_vma(&vma); - } - self.post_map_population(start_page.virt_address(), len, map_flags); - Ok(start_page) - } - Err(SystemError::ENOSYS) => { - if let Some(vma) = vma { - self.mappings.attach_vma(&vma); - } - self.post_map_population(start_page.virt_address(), len, map_flags); - Ok(start_page) - } // 文件系统未实现 mmap,视为成功 - Err(SystemError::ENODEV) => { - let _ = self.munmap(start_page, page_count); - Err(SystemError::ENODEV) - } - Err(e) => { - 
let _ = self.munmap(start_page, page_count); - Err(e) - } - } - } - - /// 进行文件页映射 - /// - /// ## 参数 - /// - /// - `start_vaddr`:映射的起始地址 - /// - `len`:映射的长度 - /// - `prot_flags`:保护标志 - /// - `map_flags`:映射标志 - /// - `fd`:文件描述符 - /// - `offset`:映射偏移量 - /// - `round_to_min`:是否将`start_vaddr`对齐到`mmap_min`,如果为`true`,则当`start_vaddr`不为0时,会对齐到`mmap_min`,否则仅向下对齐到页边界 - /// - `allocate_at_once`:是否立即分配物理空间 - /// - /// ## 返回 - /// - /// 返回映射的起始虚拟页帧 - #[allow(clippy::too_many_arguments)] - pub fn file_mapping( - &mut self, - start_vaddr: VirtAddr, - len: usize, - prot_flags: ProtFlags, - map_flags: MapFlags, - fd: i32, - offset: usize, - round_to_min: bool, - allocate_at_once: bool, - ) -> Result { - let binding = ProcessManager::current_pcb().fd_table(); - let fd_table_guard = binding.read(); - - let file = fd_table_guard.get_file_by_fd(fd); - if file.is_none() { - return Err(SystemError::EBADF); - } - // drop guard 以避免无法调度的问题 - drop(fd_table_guard); - - let file = file.unwrap(); - self.file_mapping_with_file( - file, - start_vaddr, - len, - prot_flags, - map_flags, - offset, - round_to_min, - allocate_at_once, - ) - } - /// 向进程的地址空间映射页面 /// /// # 参数 @@ -1057,7 +1519,6 @@ impl InnerAddressSpace { }; self.check_mlock_rlimit_for_pages(page_count.data(), error)?; } - // debug!("mmap: page: {:?}, region={region:?}", page.virt_address()); compiler_fence(Ordering::SeqCst); @@ -2017,6 +2478,14 @@ impl InnerAddressSpace { return Err(SystemError::EINVAL); } + if self + .mappings + .first_reservation_conflict(requested) + .is_some() + { + return Err(SystemError::EAGAIN_OR_EWOULDBLOCK); + } + let has_conflict = self.mappings.conflicts(requested).next().is_some(); if has_conflict { if flags.contains(MapFlags::MAP_FIXED_NOREPLACE) { @@ -2093,12 +2562,20 @@ impl Drop for UserMapper { } /// 用户空间映射信息 +#[derive(Clone, Copy, Debug)] +struct MmapReservation { + id: MmapReservationId, + region: VirtRegion, +} + #[derive(Debug)] pub struct UserMappings { /// 当前用户空间的虚拟内存区域 vmas: HashSet>, /// 
当前用户空间的VMA空洞 vm_holes: BTreeMap, + /// 正在建立、但尚未发布为 VMA 的 mmap 地址预约。 + reservations: BTreeMap, /// 所属地址空间,用于在 VMA 生命周期变更时回填反向引用 owner: Weak, } @@ -2109,6 +2586,7 @@ impl UserMappings { vmas: HashSet::new(), vm_holes: core::iter::once((VirtAddr::new(0), MMArch::USER_END_VADDR.data())) .collect::>(), + reservations: BTreeMap::new(), owner: Weak::new(), }; } @@ -2198,6 +2676,82 @@ impl UserMappings { return r; } + pub fn first_reservation_conflict(&self, request: VirtRegion) -> Option { + self.reservations + .values() + .find(|reservation| reservation.region.collide(&request)) + .map(|reservation| reservation.id) + } + + pub fn first_reservation_region(&self) -> Option { + self.reservations + .values() + .next() + .map(|reservation| reservation.region) + } + + fn reservation_usage_bytes(&self) -> usize { + self.reservations + .values() + .map(|reservation| reservation.region.size()) + .sum() + } + + fn region_available_for_reservation(&self, region: VirtRegion) -> bool { + self.conflicts(region).next().is_none() && self.first_reservation_conflict(region).is_none() + } + + fn reserve_region(&mut self, region: VirtRegion) -> Result { + if !self.region_available_for_reservation(region) { + return Err(SystemError::EAGAIN_OR_EWOULDBLOCK); + } + + let id = MMAP_RESERVATION_ID_ALLOCATOR.fetch_add(1, Ordering::Relaxed); + self.reserve_hole(®ion); + self.reservations + .insert(region.start(), MmapReservation { id, region }); + Ok(id) + } + + fn cancel_reservation(&mut self, id: MmapReservationId) -> Option { + let start = self + .reservations + .iter() + .find_map(|(start, reservation)| (reservation.id == id).then_some(*start))?; + let reservation = self.reservations.remove(&start)?; + self.unreserve_hole(&reservation.region); + Some(reservation.region) + } + + fn remove_reservation_for_commit( + &mut self, + id: MmapReservationId, + region: VirtRegion, + ) -> Result<(), SystemError> { + let start = self + .reservations + .iter() + .find_map(|(start, reservation)| 
(reservation.id == id).then_some(*start)) + .ok_or(SystemError::EFAULT)?; + let reservation = *self.reservations.get(&start).ok_or(SystemError::EFAULT)?; + if reservation.region != region { + return Err(SystemError::EFAULT); + } + self.reservations.remove(&start); + Ok(()) + } + + fn commit_reserved_vma( + &mut self, + id: MmapReservationId, + vma: Arc, + ) -> Result<(), SystemError> { + let region = vma.lock().region; + self.remove_reservation_for_commit(id, region)?; + self.insert_vma(vma); + Ok(()) + } + /// 在当前进程的地址空间中,寻找第一个符合条件的空闲的虚拟内存范围。 /// /// @param min_vaddr 最小的起始地址 @@ -2478,26 +3032,60 @@ impl LockedVMA { /// /// This is used by file truncate/invalidate paths: future access should fault back in against /// the updated file size/content instead of tearing down the VMA object. - pub fn unmap_range(&self, region: VirtRegion, mapper: &PageMapper, tlb: &mut MmuGather<'_>) { + pub fn unmap_range( + &self, + region: VirtRegion, + mapper: &PageMapper, + tlb: &mut MmuGather<'_>, + mode: UnmapMappingMode, + ) { let self_guard = self.lock(); let Some(intersection) = self_guard.region().intersect(®ion) else { return; }; + let vma_start = self_guard.region().start(); + let backing_pgoff = self_guard.backing_page_offset(); + let file_page_cache = self_guard + .vm_file() + .and_then(|file| file.inode().page_cache()); drop(self_guard); let mut page_manager_guard = page_manager_lock(); for page in intersection.pages() { - if mapper.translate(page.virt_address()).is_none() { + let virt = page.virt_address(); + let Some((paddr, _)) = mapper.translate(virt) else { continue; + }; + + let page_arc = page_manager_guard.get_unwrap(&paddr); + if let Some(page_cache) = file_page_cache.as_ref() { + let Some(base_pgoff) = backing_pgoff else { + continue; + }; + let pgoff = base_pgoff + ((virt.data() - vma_start.data()) >> MMArch::PAGE_SHIFT); + let page_guard = page_arc.read(); + let is_target_page = match page_guard.page_type() { + PageType::File(info) if info.index == pgoff => 
info + .page_cache + .upgrade() + .is_some_and(|mapped_cache| Arc::ptr_eq(&mapped_cache, page_cache)), + // Truncate must also zap private COW pages. For file VMAs those pages are + // represented as normal pages, while shared file mappings remain page-cache + // backed and are covered by the PageType::File branch above. + PageType::Normal if mode == UnmapMappingMode::EvenCow => true, + _ => false, + }; + drop(page_guard); + if !is_target_page { + continue; + } } - let Some((paddr, _, flush)) = - (unsafe { mapper.unmap_phys_preserve_tables(page.virt_address()) }) + let Some((paddr, _, flush)) = (unsafe { mapper.unmap_phys_preserve_tables(virt) }) else { continue; }; - let page_arc = page_manager_guard.get_unwrap(&paddr); let can_dealloc = { let mut page_guard = page_arc.write(); page_guard.remove_vma(self); @@ -2510,7 +3098,7 @@ impl LockedVMA { } unsafe { flush.ignore() }; - tlb.accumulate_range(page.virt_address()); + tlb.accumulate_range(virt); } } diff --git a/kernel/src/process/abi.rs b/kernel/src/process/abi.rs index baff69c8f2..d04949d12d 100644 --- a/kernel/src/process/abi.rs +++ b/kernel/src/process/abi.rs @@ -52,7 +52,7 @@ pub enum AtType { /// Filename of program. ExecFn = 31, /// Minimal stack size for signal delivery. 
- MinSigStackSize, + MinSigStackSize = 51, } impl TryFrom for AtType { diff --git a/kernel/src/process/exit.rs b/kernel/src/process/exit.rs index 68bb28a27f..5f5e070fb8 100644 --- a/kernel/src/process/exit.rs +++ b/kernel/src/process/exit.rs @@ -281,6 +281,14 @@ fn get_thread_group_leader(pcb: &Arc) -> Arc) -> RawPid { + let current = ProcessManager::current_pcb(); + let leader = get_thread_group_leader(¤t); + child_pcb + .task_pid_nr_ns(PidType::PID, Some(leader.active_pid_ns())) + .unwrap_or(RawPid(0)) +} + /// 参考 https://code.dragonos.org.cn/xref/linux-6.1.9/kernel/exit.c#1573 fn do_wait(kwo: &mut KernelWaitOption) -> Result { let mut tmp_child_pcb: Option> = None; @@ -397,7 +405,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { let stopsig = Signal::SIGSTOP as i32; kwo.no_task_error = None; kwo.ret_info = Some(WaitIdInfo { - pid: pcb.task_pid_vnr(), + pid: wait_visible_pid(&pcb), status: stopsig, cause: SigChildCode::Stopped.into(), }); @@ -406,7 +414,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { if !kwo.options.contains(WaitOption::WNOWAIT) { pcb.sighand().flags_remove(SignalFlags::CLD_STOPPED); } - scan_result = Some(Ok((*pid).into())); + scan_result = Some(Ok(wait_visible_pid(&pcb).into())); break; } else if kwo.options.contains(WaitOption::WCONTINUED) @@ -414,7 +422,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { { kwo.no_task_error = None; kwo.ret_info = Some(WaitIdInfo { - pid: pcb.task_pid_vnr(), + pid: wait_visible_pid(&pcb), status: Signal::SIGCONT as i32, cause: SigChildCode::Continued.into(), }); @@ -423,7 +431,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { if !kwo.options.contains(WaitOption::WNOWAIT) { pcb.sighand().flags_remove(SignalFlags::CLD_CONTINUED); } - scan_result = Some(Ok((*pid).into())); + scan_result = Some(Ok(wait_visible_pid(&pcb).into())); break; } else if pcb.is_zombie() && kwo.options.contains(WaitOption::WEXITED) { @@ -438,7 +446,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { let status8 
= wstatus_to_waitid_status(raw); kwo.no_task_error = None; kwo.ret_info = Some(WaitIdInfo { - pid: pcb.task_pid_vnr(), + pid: wait_visible_pid(&pcb), status: status8, cause: SigChildCode::Exited.into(), }); @@ -451,7 +459,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { account_reaped_child_rusage(&child_rusage); pid_to_release = Some(pcb.raw_pid()); } - scan_result = Some(Ok((*pid).into())); + scan_result = Some(Ok(wait_visible_pid(&pcb).into())); break; } @@ -512,7 +520,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { let stopsig = Signal::SIGSTOP as i32; kwo.no_task_error = None; kwo.ret_info = Some(WaitIdInfo { - pid: pcb.task_pid_vnr(), + pid: wait_visible_pid(&pcb), status: stopsig, cause: SigChildCode::Stopped.into(), }); @@ -521,7 +529,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { if !kwo.options.contains(WaitOption::WNOWAIT) { pcb.sighand().flags_remove(SignalFlags::CLD_STOPPED); } - scan_result = Some(Ok((*pid).into())); + scan_result = Some(Ok(wait_visible_pid(&pcb).into())); break; } else if kwo.options.contains(WaitOption::WCONTINUED) @@ -529,7 +537,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { { kwo.no_task_error = None; kwo.ret_info = Some(WaitIdInfo { - pid: pcb.task_pid_vnr(), + pid: wait_visible_pid(&pcb), status: Signal::SIGCONT as i32, cause: SigChildCode::Continued.into(), }); @@ -538,7 +546,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { if !kwo.options.contains(WaitOption::WNOWAIT) { pcb.sighand().flags_remove(SignalFlags::CLD_CONTINUED); } - scan_result = Some(Ok((*pid).into())); + scan_result = Some(Ok(wait_visible_pid(&pcb).into())); break; } else if pcb.is_zombie() && kwo.options.contains(WaitOption::WEXITED) { @@ -553,7 +561,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { let status8 = wstatus_to_waitid_status(raw); kwo.no_task_error = None; kwo.ret_info = Some(WaitIdInfo { - pid: pcb.task_pid_vnr(), + pid: wait_visible_pid(&pcb), status: status8, cause: SigChildCode::Exited.into(), }); 
@@ -566,7 +574,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { account_reaped_child_rusage(&child_rusage); pid_to_release = Some(pcb.raw_pid()); } - scan_result = Some(Ok((*pid).into())); + scan_result = Some(Ok(wait_visible_pid(&pcb).into())); break; } @@ -663,7 +671,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { let stopsig = Signal::SIGSTOP as i32; kwo.no_task_error = None; kwo.ret_info = Some(WaitIdInfo { - pid: pcb.task_pid_vnr(), + pid: wait_visible_pid(&pcb), status: stopsig, cause: SigChildCode::Stopped.into(), }); @@ -673,14 +681,14 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { pcb.sighand().flags_remove(SignalFlags::CLD_STOPPED); } - scan_result = Some(Ok(pcb.task_pid_vnr().into())); + scan_result = Some(Ok(wait_visible_pid(&pcb).into())); break; } else if kwo.options.contains(WaitOption::WCONTINUED) && pcb.sighand().flags_contains(SignalFlags::CLD_CONTINUED) { kwo.no_task_error = None; kwo.ret_info = Some(WaitIdInfo { - pid: pcb.task_pid_vnr(), + pid: wait_visible_pid(&pcb), status: Signal::SIGCONT as i32, cause: SigChildCode::Continued.into(), }); @@ -690,7 +698,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { pcb.sighand().flags_remove(SignalFlags::CLD_CONTINUED); } - scan_result = Some(Ok(pcb.task_pid_vnr().into())); + scan_result = Some(Ok(wait_visible_pid(&pcb).into())); break; } else if pcb.is_zombie() && kwo.options.contains(WaitOption::WEXITED) { if reap_blocked_by_group_exec(&pcb) { @@ -704,7 +712,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { let status8 = wstatus_to_waitid_status(raw); kwo.no_task_error = None; kwo.ret_info = Some(WaitIdInfo { - pid: pcb.task_pid_vnr(), + pid: wait_visible_pid(&pcb), status: status8, cause: SigChildCode::Exited.into(), }); @@ -718,7 +726,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { pid_to_release = Some(pcb.raw_pid()); } - scan_result = Some(Ok(pcb.task_pid_vnr().into())); + scan_result = Some(Ok(wait_visible_pid(&pcb).into())); break; } } @@ -790,7 +798,7 @@ 
fn do_wait(kwo: &mut KernelWaitOption) -> Result { let stopsig = Signal::SIGSTOP as i32; kwo.no_task_error = None; kwo.ret_info = Some(WaitIdInfo { - pid: pcb.task_pid_vnr(), + pid: wait_visible_pid(&pcb), status: stopsig, cause: SigChildCode::Stopped.into(), }); @@ -799,7 +807,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { if !kwo.options.contains(WaitOption::WNOWAIT) { pcb.sighand().flags_remove(SignalFlags::CLD_STOPPED); } - scan_result = Some(Ok(pcb.task_pid_vnr().into())); + scan_result = Some(Ok(wait_visible_pid(&pcb).into())); break; } else if kwo.options.contains(WaitOption::WCONTINUED) @@ -807,7 +815,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { { kwo.no_task_error = None; kwo.ret_info = Some(WaitIdInfo { - pid: pcb.task_pid_vnr(), + pid: wait_visible_pid(&pcb), status: Signal::SIGCONT as i32, cause: SigChildCode::Continued.into(), }); @@ -816,7 +824,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { if !kwo.options.contains(WaitOption::WNOWAIT) { pcb.sighand().flags_remove(SignalFlags::CLD_CONTINUED); } - scan_result = Some(Ok(pcb.task_pid_vnr().into())); + scan_result = Some(Ok(wait_visible_pid(&pcb).into())); break; } else if pcb.is_zombie() && kwo.options.contains(WaitOption::WEXITED) { @@ -831,7 +839,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { let status8 = wstatus_to_waitid_status(raw); kwo.no_task_error = None; kwo.ret_info = Some(WaitIdInfo { - pid: pcb.task_pid_vnr(), + pid: wait_visible_pid(&pcb), status: status8, cause: SigChildCode::Exited.into(), }); @@ -844,7 +852,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result { account_reaped_child_rusage(&child_rusage); pid_to_release = Some(pcb.raw_pid()); } - scan_result = Some(Ok(pcb.task_pid_vnr().into())); + scan_result = Some(Ok(wait_visible_pid(&pcb).into())); break; } @@ -923,7 +931,7 @@ fn do_waitpid( // child_pcb.raw_pid() // ); kwo.ret_info = Some(WaitIdInfo { - pid: child_pcb.task_pid_vnr(), + pid: wait_visible_pid(&child_pcb), status: Signal::SIGCONT 
as i32, cause: SigChildCode::Continued.into(), }); @@ -936,7 +944,7 @@ fn do_waitpid( if !kwo.options.contains(WaitOption::WNOWAIT) { child_pcb.sighand().flags_remove(SignalFlags::CLD_CONTINUED); } - return Some(Ok(child_pcb.raw_pid().data())); + return Some(Ok(wait_visible_pid(&child_pcb).into())); } let state = child_pcb.sched_info().state(); @@ -967,7 +975,7 @@ fn do_waitpid( // 填充 waitid 信息 // log::debug!("do_waitpid: report CLD_STOPPED for pid={:?}", child_pcb.raw_pid()); kwo.ret_info = Some(WaitIdInfo { - pid: child_pcb.task_pid_vnr(), + pid: wait_visible_pid(&child_pcb), status: stopsig, cause: SigChildCode::Stopped.into(), }); @@ -982,7 +990,7 @@ fn do_waitpid( child_pcb.sighand().flags_remove(SignalFlags::CLD_STOPPED); } - return Some(Ok(child_pcb.raw_pid().data())); + return Some(Ok(wait_visible_pid(&child_pcb).into())); } ProcessState::Exited(status) => { if !child_pcb.is_zombie() { @@ -991,7 +999,7 @@ fn do_waitpid( if reap_blocked_by_group_exec(&child_pcb) { return None; } - let pid = child_pcb.task_pid_vnr(); + let pid = wait_visible_pid(&child_pcb); // Linux 语义:若等待集合未包含 WEXITED,则不报告退出事件 if likely(!kwo.options.contains(WaitOption::WEXITED)) { return None; diff --git a/kernel/src/process/fork.rs b/kernel/src/process/fork.rs index cc39e03004..a7901e01ca 100644 --- a/kernel/src/process/fork.rs +++ b/kernel/src/process/fork.rs @@ -269,6 +269,11 @@ impl ProcessManager { args: KernelCloneArgs, ) -> Result { let current_pcb = ProcessManager::current_pcb(); + let caller_pid_ns = if current_pcb.raw_pid().data() == 0 { + None + } else { + Some(current_pcb.active_pid_ns()) + }; let new_kstack: KernelStack = KernelStack::new()?; @@ -307,7 +312,9 @@ impl ProcessManager { return Ok(pcb.raw_pid()); } - return Ok(pcb.pid().pid_vnr()); + return pcb + .task_pid_nr_ns(PidType::PID, caller_pid_ns) + .ok_or(SystemError::EINVAL); } fn copy_flags( @@ -361,8 +368,7 @@ impl ProcessManager { return Ok(()); } let new_address_space = old_address_space - .write() - .try_clone() + 
.try_clone_wait() .map_err(|_| SystemError::ENOMEM)?; unsafe { new_pcb.basic_mut().set_user_vm(Some(new_address_space)) }; return Ok(()); @@ -602,8 +608,11 @@ impl ProcessManager { // TODO: 克隆前应该锁信号处理,等待克隆完成后再处理 // 克隆架构相关 - let guard = current_pcb.arch_info_irqsave(); - unsafe { pcb.arch_info().clone_from(&guard) }; + let mut guard = current_pcb.arch_info_irqsave(); + guard.sync_current_state_before_fork(); + unsafe { + pcb.arch_info().clone_from(&guard); + } drop(guard); // 为内核线程设置WorkerPrivate @@ -810,7 +819,30 @@ impl ProcessManager { } } - // 拷贝 pidfd + let clone_into_cgroup_target = Self::resolve_clone_into_cgroup_target(&clone_args)?; + let reserved_cgroup = if pcb.raw_pid() > RawPid(0) { + let charge_node = clone_into_cgroup_target + .as_ref() + .unwrap_or(&pcb.task_cgroup_node()) + .clone(); + let src_node = pcb.task_cgroup_node(); + let guard = cgroup_accounting_lock().lock(); + cgroup_can_fork_in(&charge_node, 1)?; + if let Some(target_node) = clone_into_cgroup_target { + cgroup_migrate_vet_dst_with_src(&src_node, &target_node, 1)?; + pcb.set_task_cgroup_node_for_fork(target_node); + } + let cgroup = pcb.task_cgroup_node(); + cgroup.charge_pids(1); + drop(guard); + Some(cgroup) + } else { + None + }; + + // 安装 pidfd 会对父进程 fd 表产生外部可见副作用,必须放在 cgroup + // admission 成功之后;若后续发布前失败,需要显式回滚 fd 和 pids 预留。 + let mut installed_pidfd = None; if clone_flags.contains(CloneFlags::CLONE_PIDFD) { let pid = pcb.raw_pid().0 as i32; let root_inode = ProcessManager::current_mntns().root_inode(); @@ -819,23 +851,57 @@ impl ProcessManager { ProcessManager::current_pcb().raw_pid().data(), pid ); - let new_inode = root_inode.create(&name, FileType::File, InodeMode::S_IRWXUGO)?; - let file = File::new(new_inode, FileFlags::O_RDWR | FileFlags::O_CLOEXEC)?; + let new_inode = match root_inode.create(&name, FileType::File, InodeMode::S_IRWXUGO) { + Ok(inode) => inode, + Err(err) => { + Self::rollback_failed_fork(current_pcb, None, reserved_cgroup.as_ref()); + return Err(err); + } + }; 
+ let file = match File::new(new_inode, FileFlags::O_RDWR | FileFlags::O_CLOEXEC) { + Ok(file) => file, + Err(err) => { + Self::rollback_failed_fork(current_pcb, None, reserved_cgroup.as_ref()); + return Err(err); + } + }; { let mut guard = file.private_data.lock(); *guard = FilePrivateData::Pid(PidPrivateData::new(pid)); } - let r = current_pcb.fd_table().write().alloc_fd(file, None, true)?; - let mut writer = UserBufferWriter::new( - clone_args.parent_tid.data() as *mut i32, - core::mem::size_of::(), - true, - )?; + let fd = match current_pcb.fd_table().write().alloc_fd(file, None, true) { + Ok(fd) => fd, + Err(err) => { + Self::rollback_failed_fork(current_pcb, None, reserved_cgroup.as_ref()); + return Err(err); + } + }; + + let write_pidfd_result = (|| -> Result<(), SystemError> { + let mut writer = UserBufferWriter::new( + clone_args.pidfd.data() as *mut i32, + core::mem::size_of::(), + true, + )?; + writer.copy_one_to_user(&(fd as i32), 0) + })(); + if let Err(err) = write_pidfd_result { + Self::rollback_failed_fork(current_pcb, Some(fd), reserved_cgroup.as_ref()); + return Err(err); + } - writer.copy_one_to_user(&(r as i32), 0)?; + installed_pidfd = Some(fd); } + // 新任务的默认落点 CPU 应在 wake_up_new_task() 时再选择;这里只保留显式 hint, + // 以避免 fork 长路径内父任务迁移导致的“过早采样当前 CPU”问题。 + pcb.sched_info().mark_new_task(clone_args.target_cpu); + sched_cgroup_fork(pcb); + + // 处理 rseq 状态。按 Linux copy_process() 顺序,应在任务对外可见前完成。 + crate::process::rseq::rseq_fork(pcb, clone_flags.contains(CloneFlags::CLONE_VM)); + let pid = pcb.pid(); if pcb.is_thread_group_leader() { if pcb.raw_pid() == RawPid(1) { @@ -881,13 +947,17 @@ impl ProcessManager { pcb.attach_pid(PidType::SID); } else { let group_leader = pcb.threads_read_irqsave().group_leader().unwrap(); - current_pcb.sighand().with_group_exec_check(|| { + let group_exec_result = current_pcb.sighand().with_group_exec_check(|| { pcb.task_join_group_stop(); group_leader .threads_write_irqsave() .group_tasks .push(Arc::downgrade(pcb)); - })?; + 
}); + if let Err(err) = group_exec_result { + Self::rollback_failed_fork(current_pcb, installed_pidfd, reserved_cgroup.as_ref()); + return Err(err); + } // 确保非组长线程的 TGID 与组长一致 let leader_tgid_pid = group_leader.pid(); @@ -932,24 +1002,10 @@ impl ProcessManager { } } - let clone_into_cgroup_target = Self::resolve_clone_into_cgroup_target(&clone_args)?; - if pcb.raw_pid() > RawPid(0) { - let charge_node = clone_into_cgroup_target - .as_ref() - .unwrap_or(&pcb.task_cgroup_node()) - .clone(); - let src_node = pcb.task_cgroup_node(); - let _cgroup_guard = cgroup_accounting_lock().lock(); - cgroup_can_fork_in(&charge_node, 1)?; - if let Some(target_node) = clone_into_cgroup_target { - cgroup_migrate_vet_dst_with_src(&src_node, &target_node, 1)?; - pcb.set_task_cgroup_node_for_fork(target_node); - } let cgroup = pcb.task_cgroup_node(); - cgroup.charge_pids(1); - cgroup.add_task(pcb.raw_pid()); ProcessManager::add_pcb(pcb.clone()); + cgroup.add_task(pcb.raw_pid()); pcb.mark_visible_thread_accounted(); inc_visible_thread_count(); account_successful_fork(); @@ -961,15 +1017,35 @@ impl ProcessManager { pcb.thread.write_irqsave().set_child_tid = Some(clone_args.child_tid); } - // 新任务的默认落点 CPU 应在 wake_up_new_task() 时再选择;这里只保留显式 hint, - // 以避免 fork 长路径内父任务迁移导致的“过早采样当前 CPU”问题。 - pcb.sched_info().mark_new_task(clone_args.target_cpu); - sched_cgroup_fork(pcb); + Ok(()) + } - // 处理 rseq 状态 - crate::process::rseq::rseq_fork(pcb, clone_flags.contains(CloneFlags::CLONE_VM)); + fn rollback_failed_fork( + current_pcb: &Arc, + installed_pidfd: Option, + reserved_cgroup: Option<&Arc>, + ) { + if let Some(fd) = installed_pidfd { + let dropped = { + let fd_table = current_pcb.fd_table(); + let mut fd_table_guard = fd_table.write(); + fd_table_guard.drop_fd(fd) + }; + match dropped { + Ok(dropped) => { + if let Err(err) = dropped.finish_close() { + warn!("fork: failed to close rolled back pidfd: {:?}", err); + } + } + Err(err) => { + warn!("fork: failed to roll back pidfd {}: {:?}", fd, err); 
+ } + } + } - Ok(()) + if let Some(cgroup) = reserved_cgroup { + cgroup.uncharge_pids(1); + } } fn copy_fs( diff --git a/kernel/src/process/namespace/mnt.rs b/kernel/src/process/namespace/mnt.rs index 37dbb0c805..31ed7e3094 100644 --- a/kernel/src/process/namespace/mnt.rs +++ b/kernel/src/process/namespace/mnt.rs @@ -3,7 +3,7 @@ use crate::{ mount::{MountFSInode, MountFlags, MountList, MountPath}, FileSystem, IndexNode, InodeId, MountFS, }, - libs::{once::Once, spinlock::SpinLock}, + libs::{once::Once, rwsem::RwSem}, process::{fork::CloneFlags, namespace::NamespaceType, ProcessManager}, }; use alloc::string::{String, ToString}; @@ -44,12 +44,12 @@ pub struct MntNamespace { ns_common: NsCommon, self_ref: Weak, _user_ns: Arc, - root_mountfs: Arc, - inner: SpinLock, + inner: RwSem, } pub struct InnerMntNamespace { _dead: bool, + root_mountfs: Arc, mount_list: Arc, } @@ -78,8 +78,8 @@ impl MntNamespace { ns_common: NsCommon::new(0, NamespaceType::Mount), self_ref: self_ref.clone(), _user_ns: super::user_namespace::INIT_USER_NAMESPACE.clone(), - root_mountfs: ramfs.clone(), - inner: SpinLock::new(InnerMntNamespace { + inner: RwSem::new(InnerMntNamespace { + root_mountfs: ramfs.clone(), mount_list, _dead: false, }), @@ -100,12 +100,10 @@ impl MntNamespace { /// Forcibly replace the root mount filesystem of this MountNamespace. /// /// This method is only for use during DragonOS initialization. 
- pub unsafe fn force_change_root_mountfs(&self, new_root: Arc) { - let inner_guard = self.inner.lock(); - let ptr = self as *const Self as *mut Self; - let self_mut = (ptr).as_mut().unwrap(); - self_mut.root_mountfs = new_root.clone(); + pub fn force_change_root_mountfs(&self, new_root: Arc) { + let mut inner_guard = self.inner.write(); let (path, _, _) = inner_guard.mount_list.get_mount_point("/").unwrap(); + inner_guard.root_mountfs = new_root.clone(); inner_guard.mount_list.insert(None, path, new_root); @@ -120,7 +118,8 @@ impl MntNamespace { old_put_old_path: &str, new_put_old_path: &str, ) -> Result<(), SystemError> { - let old_root = self.root_mountfs.clone(); + let mut inner_guard = self.inner.write(); + let old_root = Self::root_mntfs_locked(&inner_guard); let old_root_mountpoint = old_root.self_mountpoint(); let new_root_mountpoint = new_root.self_mountpoint().ok_or(SystemError::EINVAL)?; let new_root_parent = new_root_mountpoint.mount_fs(); @@ -160,10 +159,7 @@ impl MntNamespace { new_root.set_self_mountpoint(None); - let inner_guard = self.inner.lock(); - let ptr = self as *const Self as *mut Self; - let self_mut = unsafe { (ptr).as_mut().unwrap() }; - self_mut.root_mountfs = new_root.clone(); + inner_guard.root_mountfs = new_root.clone(); inner_guard.mount_list.remove("/"); if put_old_is_new_root { @@ -229,6 +225,7 @@ impl MntNamespace { old_source_path: &str, new_target_path: &str, ) -> Result<(), SystemError> { + let inner = self.inner.write(); let moving_mounts = collect_mount_subtree(source_mfs); let old_mountpoint = source_mfs.self_mountpoint().ok_or(SystemError::EINVAL)?; let old_parent = old_mountpoint.mount_fs(); @@ -257,7 +254,6 @@ impl MntNamespace { // target_mp_id. The ino in the mount_list root record must be updated accordingly, // otherwise copy_mnt_ns() will fail when traversing mountpoints and looking up // target_mp_id in ino2mp. 
- let inner = self.inner.lock(); let move_result = inner.mount_list.move_subtree( source_mfs, &moving_mounts, @@ -265,7 +261,6 @@ impl MntNamespace { old_source_path, new_target_path, ); - drop(inner); if let Err(e) = move_result { target_parent.mountpoints().remove(&target_mp_id); @@ -285,9 +280,9 @@ impl MntNamespace { ns_common, self_ref: self_ref.clone(), _user_ns, - root_mountfs: new_root.clone(), - inner: SpinLock::new(InnerMntNamespace { + inner: RwSem::new(InnerMntNamespace { _dead: false, + root_mountfs: new_root.clone(), mount_list: MountList::new(), }), }); @@ -326,9 +321,9 @@ impl MntNamespace { // Return the current mount namespace if CLONE_NEWNS is not set return Ok(self.self_ref.upgrade().unwrap()); } - let inner = self.inner.lock(); + let inner = self.inner.read(); - let old_root_mntfs = self.root_mntfs().clone(); + let old_root_mntfs = Self::root_mntfs_locked(&inner); let mut queue: Vec = Vec::new(); // The root mntfs is special, so it is copied separately. @@ -349,9 +344,10 @@ impl MntNamespace { } let new_mntns = self.copy_with_mountfs(new_root_mntfs, user_ns); + let new_mntns_root = new_mntns.root_mntfs(); for x in inner.mount_list.clone_inner().values() { - if Arc::ptr_eq(x, new_mntns.root_mntfs()) { + if Arc::ptr_eq(x, &new_mntns_root) { continue; // Skip the root mountfs } } @@ -372,7 +368,7 @@ impl MntNamespace { queue.push(MountFSCopyInfo { old_mount_fs: mfs.clone(), - parent_mount_fs: new_mntns.root_mntfs().clone(), + parent_mount_fs: new_mntns_root.clone(), self_mp_inode_id: *ino, mount_path, }); @@ -439,13 +435,18 @@ impl MntNamespace { Ok(new_mntns) } - pub fn root_mntfs(&self) -> &Arc { - &self.root_mountfs + fn root_mntfs_locked(inner: &InnerMntNamespace) -> Arc { + inner.root_mountfs.clone() + } + + pub fn root_mntfs(&self) -> Arc { + Self::root_mntfs_locked(&self.inner.read()) } /// Get the root inode of this mount namespace pub fn root_inode(&self) -> Arc { - self.root_mountfs.root_inode() + let root = self.root_mntfs(); + 
root.root_inode() } pub fn add_mount( @@ -454,23 +455,27 @@ impl MntNamespace { mount_path: Arc, mntfs: Arc, ) -> Result<(), SystemError> { - self.inner.lock().mount_list.insert(ino, mount_path, mntfs); + self.inner.write().mount_list.insert(ino, mount_path, mntfs); Ok(()) } pub fn mount_list(&self) -> Arc { - self.inner.lock().mount_list.clone() + self.inner.read().mount_list.clone() } pub fn remove_mount(&self, mount_path: &str) -> Option> { - self.inner.lock().mount_list.remove(mount_path) + self.inner.write().mount_list.remove(mount_path) + } + + pub fn remove_mount_exact(&self, mntfs: &Arc) -> Option> { + self.inner.write().mount_list.remove_exact(mntfs) } pub fn get_mount_point( &self, mount_point: &str, ) -> Option<(Arc, String, Arc)> { - self.inner.lock().mount_list.get_mount_point(mount_point) + self.inner.read().mount_list.get_mount_point(mount_point) } } diff --git a/kernel/src/process/namespace/propagation.rs b/kernel/src/process/namespace/propagation.rs index 59e32d6483..576820e509 100644 --- a/kernel/src/process/namespace/propagation.rs +++ b/kernel/src/process/namespace/propagation.rs @@ -1336,9 +1336,7 @@ fn umount_at_peer(peer_mnt: &Arc, mountpoint_id: InodeId) -> Result<(), // 先从 mount_list 移除,再清 namespace,避免 "namespace=None 但 mount_list 仍有记录" 的 TOCTOU 中间态。 if let Some(ns) = child.namespace() { - if let Some(mp) = ns.mount_list().get_mount_path_by_mountfs(&child) { - ns.remove_mount(mp.as_str()); - } + ns.remove_mount_exact(&child); } child.clear_namespace(); diff --git a/kernel/src/process/syscall/clone_utils.rs b/kernel/src/process/syscall/clone_utils.rs index f4559fccb9..8e112d5937 100644 --- a/kernel/src/process/syscall/clone_utils.rs +++ b/kernel/src/process/syscall/clone_utils.rs @@ -5,6 +5,7 @@ use crate::arch::ipc::signal::MAX_SIG_NUM; use crate::arch::MMArch; use crate::mm::{MemoryManagementArch, VirtAddr}; use crate::process::fork::{CloneFlags, KernelCloneArgs, MAX_PID_NS_LEVEL}; +use crate::process::pid::PidType; use 
crate::process::{KernelStack, ProcessControlBlock, ProcessManager}; use crate::sched::completion::Completion; use crate::syscall::user_access::{write_one_to_user_protected, UserBufferReader}; @@ -77,9 +78,14 @@ pub fn do_clone( // 克隆pcb ProcessManager::copy_process(&current_pcb, &pcb, clone_args, frame)?; + let child_vpid = pcb + .task_pid_nr_ns(PidType::PID, Some(current_pcb.active_pid_ns())) + .ok_or(SystemError::EINVAL)? + .data(); + if flags.contains(CloneFlags::CLONE_PARENT_SETTID) { // 对齐 Linux:fork 已成功,不因 parent_tid 写回失败而撤销子任务。 - let child_tid = pcb.pid().pid_vnr().data() as i32; + let child_tid = child_vpid as i32; let _ = unsafe { write_one_to_user_protected(parent_tid, &child_tid) }; } @@ -103,7 +109,7 @@ pub fn do_clone( vfork.wait_for_completion_interruptible()?; } - return Ok(pcb.raw_pid().0); + return Ok(child_vpid); } impl KernelCloneArgs { diff --git a/kernel/src/process/syscall/sys_clone.rs b/kernel/src/process/syscall/sys_clone.rs index 6e424d6359..39b46fc460 100644 --- a/kernel/src/process/syscall/sys_clone.rs +++ b/kernel/src/process/syscall/sys_clone.rs @@ -53,6 +53,9 @@ impl Syscall for SysClone { let mut clone_args = KernelCloneArgs::new(); clone_args.flags = flags; clone_args.stack = stack; + // legacy clone() 复用 parent_tid 作为 CLONE_PIDFD 的输出地址; + // clone3() 则通过独立的 pidfd 字段传入。 + clone_args.pidfd = parent_tid; clone_args.parent_tid = parent_tid; clone_args.child_tid = child_tid; clone_args.tls = tls; diff --git a/kernel/src/syscall/user_access.rs b/kernel/src/syscall/user_access.rs index 725c126127..cf123b2cd3 100644 --- a/kernel/src/syscall/user_access.rs +++ b/kernel/src/syscall/user_access.rs @@ -13,7 +13,7 @@ use defer::defer; use crate::{ arch::MMArch, - mm::{access_ok, MemoryManagementArch, VirtAddr, VmFlags}, + mm::{access_ok, MemoryManagementArch, VirtAddr, VirtRegion, VmFlags}, process::ProcessManager, }; @@ -928,7 +928,10 @@ fn check_user_access_by_page_table(addr: VirtAddr, size: usize, check_write: boo // Calculate number of pages to
check (rounded up) let pages = aligned_size / MMArch::PAGE_SIZE; - let guard = vm.read(); + let guard = vm.read_guard_no_reservation_conflict(VirtRegion::new( + VirtAddr::new(aligned_addr), + aligned_size, + )); for i in 0..pages { let page_addr = aligned_addr + i * MMArch::PAGE_SIZE; let flags = match guard.user_mapper.utable.translate(VirtAddr::new(page_addr)) { @@ -1061,13 +1064,14 @@ pub fn user_accessible_len(addr: VirtAddr, size: usize, check_write: bool) -> us None => return 0, }; - let vma_read_guard = vm.read(); - let mappings = &vma_read_guard.mappings; - let mut checked = 0usize; let mut current = addr; while checked < size { + let current_page = VirtAddr::new(current.data() & !MMArch::PAGE_OFFSET_MASK); + let vma_read_guard = + vm.read_guard_no_reservation_conflict(VirtRegion::new(current_page, MMArch::PAGE_SIZE)); + let mappings = &vma_read_guard.mappings; // 判断当前地址是否落在一个有效 VMA 中 let Some(vma) = mappings.contains(current) else { break; diff --git a/kernel/src/virt/vm/kvm_host/mod.rs b/kernel/src/virt/vm/kvm_host/mod.rs index ccee2c2d6a..eefe360fef 100644 --- a/kernel/src/virt/vm/kvm_host/mod.rs +++ b/kernel/src/virt/vm/kvm_host/mod.rs @@ -74,8 +74,7 @@ impl LockedVm { .basic() .user_vm() .unwrap() - .write() - .try_clone()?, + .try_clone_wait()?, max_vcpus: CurrentKvmManager::KVM_MAX_VCPUS, memslots_set, memslots, diff --git a/user/apps/about/Makefile b/user/apps/about/Makefile index 28d16564df..789f2d608f 100644 --- a/user/apps/about/Makefile +++ b/user/apps/about/Makefile @@ -15,6 +15,6 @@ install: all mv about $(DADK_CURRENT_BUILD_DIR)/about.elf clean: - rm about *.o sys_version.h + rm -f about *.o sys_version.h fmt: diff --git a/user/apps/riscv_init/Makefile b/user/apps/riscv_init/Makefile index 8df0859b19..98a179c9a4 100644 --- a/user/apps/riscv_init/Makefile +++ b/user/apps/riscv_init/Makefile @@ -17,7 +17,7 @@ install: all clean: - rm init *.o + rm -f init *.o $(MAKE) -C riscv_rust_init ARCH=$(ARCH) clean fmt: diff --git 
a/user/apps/test_sqlite3/Makefile b/user/apps/test_sqlite3/Makefile index d5c0589fd9..73280d99dd 100644 --- a/user/apps/test_sqlite3/Makefile +++ b/user/apps/test_sqlite3/Makefile @@ -17,7 +17,7 @@ install: all mv test_sqlite3 $(DADK_CURRENT_BUILD_DIR)/test_sqlite3 clean: - rm test_sqlite3 *.o + rm -f test_sqlite3 *.o __download_sqlite3: @echo "Download sqlite3 from https://mirrors.dragonos.org.cn/pub/third_party/sqlite/$(SQLITE_FILENAME).zip" diff --git a/user/apps/tests/dunitest/scripts/run_tests.sh b/user/apps/tests/dunitest/scripts/run_tests.sh index f9282248c9..30f98dca00 100644 --- a/user/apps/tests/dunitest/scripts/run_tests.sh +++ b/user/apps/tests/dunitest/scripts/run_tests.sh @@ -13,7 +13,11 @@ BIN_DIR="$BASE_DIR/bin" RESULTS="$BASE_DIR/results" echo "[dunit] start running tests..." -"$RUNNER" --bin-dir "$BIN_DIR" --results-dir "$RESULTS" +if [ "${DUNITEST_PATTERN:-}" != "" ]; then + "$RUNNER" --bin-dir "$BIN_DIR" --results-dir "$RESULTS" --pattern "$DUNITEST_PATTERN" +else + "$RUNNER" --bin-dir "$BIN_DIR" --results-dir "$RESULTS" +fi status=$? 
echo "[dunit] 测试完成, status=$status" exit $status diff --git a/user/apps/tests/dunitest/suites/fuse/fuse_extended.cc b/user/apps/tests/dunitest/suites/fuse/fuse_extended.cc index 8ca4092f73..4594371e02 100644 --- a/user/apps/tests/dunitest/suites/fuse/fuse_extended.cc +++ b/user/apps/tests/dunitest/suites/fuse/fuse_extended.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include "fuse_gtest_common.h" @@ -34,6 +35,20 @@ static void fuse_sigsegv_longjmp_handler(int sig) { #define POSIX_FADV_NOREUSE 5 #endif +#ifndef XATTR_NAME_MAX +#define XATTR_NAME_MAX 255 +#endif + +#ifndef XATTR_SIZE_MAX +#define XATTR_SIZE_MAX 65536 +#endif + +static void fill_user_xattr_name(char *buf, size_t len) { + memset(buf, 'a', len); + memcpy(buf, "user.", strlen("user.")); + buf[len] = '\0'; +} + static int ext_test_p2_ops() { const char *mp = "/tmp/test_fuse_p2_ops"; int f = -1; @@ -246,58 +261,15 @@ static int ext_test_p2_ops() { return -1; } -static void ext_sigusr1_handler(int signo) { - (void)signo; -} - -struct ext_reader_ctx { - char path[256]; - volatile int done; - ssize_t nread; - int err; -}; - -static void *ext_reader_thread(void *arg) { - struct ext_reader_ctx *ctx = (struct ext_reader_ctx *)arg; - int fd = open(ctx->path, O_RDONLY); - if (fd < 0) { - ctx->nread = -1; - ctx->err = errno; - ctx->done = 1; - return NULL; - } - - char buf[64]; - ssize_t n = read(fd, buf, sizeof(buf)); - if (n < 0) { - ctx->nread = -1; - ctx->err = errno; - } else { - ctx->nread = n; - ctx->err = 0; - } - close(fd); - ctx->done = 1; - return NULL; -} - -static int ext_test_p3_interrupt() { - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = ext_sigusr1_handler; - sigemptyset(&sa.sa_mask); - sa.sa_flags = 0; - - struct sigaction old_sa; - if (sigaction(SIGUSR1, &sa, &old_sa) != 0) { - printf("[FAIL] sigaction(SIGUSR1): %s (errno=%d)\n", strerror(errno), errno); - return -1; - } +static int ext_test_positive_lookup_cache_respects_entry_ttl() { + const char *mp = 
"/tmp/test_fuse_lookup_cache"; + char hello[256]; + char missing[256]; + struct stat st; + char buf[32]; - const char *mp = "/tmp/test_fuse_p3_interrupt"; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); - sigaction(SIGUSR1, &old_sa, NULL); return -1; } @@ -305,36 +277,27 @@ static int ext_test_p3_interrupt() { if (fd < 0) { printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); rmdir(mp); - sigaction(SIGUSR1, &old_sa, NULL); return -1; } volatile int stop = 0; volatile int init_done = 0; - volatile uint32_t interrupt_count = 0; - volatile uint64_t blocked_read_unique = 0; - volatile uint64_t last_interrupt_header_unique = 0; - volatile uint64_t last_interrupt_target = 0; + volatile uint32_t lookup_count = 0; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; - args.enable_write_ops = 0; - args.stop_on_destroy = 1; - args.block_read_until_interrupt = 1000; - args.interrupt_count = &interrupt_count; - args.blocked_read_unique = &blocked_read_unique; - args.last_interrupt_header_unique = &last_interrupt_header_unique; - args.last_interrupt_target = &last_interrupt_target; + args.lookup_count = &lookup_count; + args.entry_valid_sec = 60; + args.attr_valid_sec = 60; - pthread_t daemon_th; - if (pthread_create(&daemon_th, NULL, fuse_daemon_thread, &args) != 0) { - printf("[FAIL] pthread_create(daemon)\n"); + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); close(fd); rmdir(mp); - sigaction(SIGUSR1, &old_sa, NULL); return -1; } @@ -344,73 +307,54 @@ static int ext_test_p3_interrupt() { printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); stop = 1; close(fd); - pthread_join(daemon_th, NULL); + pthread_join(th, NULL); rmdir(mp); - sigaction(SIGUSR1, &old_sa, NULL); return -1; } + if (fuseg_wait_init(&init_done) != 0) { printf("[FAIL] init 
handshake timeout\n"); goto fail; } - struct ext_reader_ctx rctx; - memset(&rctx, 0, sizeof(rctx)); - snprintf(rctx.path, sizeof(rctx.path), "%s/hello.txt", mp); - - pthread_t reader_th; - if (pthread_create(&reader_th, NULL, ext_reader_thread, &rctx) != 0) { - printf("[FAIL] pthread_create(reader)\n"); - goto fail; - } - - for (int i = 0; i < 200; i++) { - if (blocked_read_unique != 0) { - break; + snprintf(hello, sizeof(hello), "%s/hello.txt", mp); + for (int i = 0; i < 3; ++i) { + if (stat(hello, &st) != 0) { + printf("[FAIL] stat hello iteration %d: %s (errno=%d)\n", i, strerror(errno), errno); + goto fail; + } + int f = open(hello, O_RDONLY); + if (f < 0) { + printf("[FAIL] open hello iteration %d: %s (errno=%d)\n", i, strerror(errno), errno); + goto fail; + } + ssize_t n = read(f, buf, sizeof(buf)); + int saved_errno = errno; + close(f); + if (n <= 0) { + errno = saved_errno; + printf("[FAIL] read hello iteration %d: %s (errno=%d)\n", i, strerror(errno), errno); + goto fail; } - usleep(5 * 1000); - } - if (blocked_read_unique == 0) { - printf("[FAIL] timed out waiting for blocked read request\n"); - stop = 1; - pthread_join(reader_th, NULL); - goto fail; - } - - if (pthread_kill(reader_th, SIGUSR1) != 0) { - printf("[FAIL] pthread_kill(SIGUSR1)\n"); - stop = 1; - pthread_join(reader_th, NULL); - goto fail; } - pthread_join(reader_th, NULL); - if (rctx.nread != -1 || rctx.err != EINTR) { - printf("[FAIL] reader expected EINTR, nread=%zd err=%d (%s)\n", rctx.nread, rctx.err, - strerror(rctx.err)); + if (lookup_count != 1) { + printf("[FAIL] positive lookup cache expected 1 lookup, got %u\n", lookup_count); goto fail; } - for (int i = 0; i < 500; i++) { - if (interrupt_count > 0) { - break; + snprintf(missing, sizeof(missing), "%s/missing.txt", mp); + for (int i = 0; i < 2; ++i) { + if (stat(missing, &st) == 0 || errno != ENOENT) { + printf("[FAIL] stat missing iteration %d expected ENOENT, errno=%d (%s)\n", i, + errno, strerror(errno)); + goto fail; } - 
usleep(5 * 1000); } - if (interrupt_count == 0) { - printf("[FAIL] expected FUSE_INTERRUPT request\n"); - goto fail; - } - if (last_interrupt_target == 0 || last_interrupt_target != blocked_read_unique) { - printf("[FAIL] interrupt target mismatch: blocked=%llu interrupt_target=%llu\n", - (unsigned long long)blocked_read_unique, (unsigned long long)last_interrupt_target); - goto fail; - } - if (last_interrupt_header_unique != (blocked_read_unique | 1ULL)) { - printf("[FAIL] interrupt header unique mismatch: blocked=%llu header=%llu\n", - (unsigned long long)blocked_read_unique, - (unsigned long long)last_interrupt_header_unique); + if (lookup_count != 3) { + printf("[FAIL] ordinary ENOENT should not be long-term cached, lookup_count=%u\n", + lookup_count); goto fail; } @@ -420,9 +364,8 @@ static int ext_test_p3_interrupt() { } stop = 1; close(fd); - pthread_join(daemon_th, NULL); + pthread_join(th, NULL); rmdir(mp); - sigaction(SIGUSR1, &old_sa, NULL); return 0; fail: @@ -430,15 +373,24 @@ static int ext_test_p3_interrupt() { fail_no_umount: stop = 1; close(fd); - pthread_join(daemon_th, NULL); + pthread_join(th, NULL); rmdir(mp); - sigaction(SIGUSR1, &old_sa, NULL); return -1; } -static int ext_test_p3_noopen_readdirplus_notify() { - const char *mp = "/tmp/test_fuse_p3_noopen"; - ssize_t wn = -1; +static int ext_test_xattr_ops() { + const char *mp = "/tmp/test_fuse_xattr"; + char path[256]; + char list[64] = {}; + char small[4] = {}; + char value[64] = {}; + char name_255[XATTR_NAME_MAX + 1] = {}; + char name_256[XATTR_NAME_MAX + 2] = {}; + static char value_too_large[XATTR_SIZE_MAX + 1]; + static char max_xattr_buf[XATTR_SIZE_MAX + 1]; + ssize_t n = 0; + uint32_t set_count_before = 0; + if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); return -1; @@ -453,28 +405,22 @@ static int ext_test_p3_noopen_readdirplus_notify() { volatile int stop = 0; volatile int init_done = 0; - volatile uint32_t open_count = 0; - 
volatile uint32_t opendir_count = 0; - volatile uint32_t release_count = 0; - volatile uint32_t releasedir_count = 0; - volatile uint32_t readdirplus_count = 0; + volatile uint32_t getxattr_count = 0; + volatile uint32_t setxattr_count = 0; + volatile uint32_t listxattr_count = 0; + volatile uint32_t removexattr_count = 0; + volatile uint32_t last_setxattr_flags = UINT32_MAX; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; - args.enable_write_ops = 0; - args.stop_on_destroy = 1; - args.open_count = &open_count; - args.opendir_count = &opendir_count; - args.release_count = &release_count; - args.releasedir_count = &releasedir_count; - args.readdirplus_count = &readdirplus_count; - args.force_open_enosys = 1; - args.force_opendir_enosys = 1; - args.init_out_flags_override = FUSE_INIT_EXT | FUSE_MAX_PAGES | FUSE_NO_OPEN_SUPPORT | - FUSE_NO_OPENDIR_SUPPORT | FUSE_DO_READDIRPLUS; + args.getxattr_count = &getxattr_count; + args.setxattr_count = &setxattr_count; + args.listxattr_count = &listxattr_count; + args.removexattr_count = &removexattr_count; + args.last_setxattr_flags = &last_setxattr_flags; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -499,169 +445,194 @@ static int ext_test_p3_noopen_readdirplus_notify() { goto fail; } - char file_path[256]; - snprintf(file_path, sizeof(file_path), "%s/hello.txt", mp); - for (int i = 0; i < 2; i++) { - int f = open(file_path, O_RDONLY); - if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", file_path, strerror(errno), errno); - goto fail; - } - char buf[64]; - ssize_t n = read(f, buf, sizeof(buf) - 1); - close(f); - if (n <= 0) { - printf("[FAIL] read(%s): %s (errno=%d)\n", file_path, strerror(errno), errno); - goto fail; - } + snprintf(path, sizeof(path), "%s/hello.txt", mp); + errno = 0; + n = listxattr(path, NULL, 0); + if (n <= 0) { + printf("[FAIL] listxattr size returned %zd errno=%d (%s)\n", n, errno, 
strerror(errno)); + goto fail; } - - for (int i = 0; i < 2; i++) { - DIR *dir = opendir(mp); - if (!dir) { - printf("[FAIL] opendir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); - goto fail; - } - int saw = 0; - struct dirent *de; - while ((de = readdir(dir)) != NULL) { - if (strcmp(de->d_name, "hello.txt") == 0) { - saw = 1; - } - } - closedir(dir); - if (!saw) { - printf("[FAIL] readdir didn't see hello.txt\n"); - goto fail; - } + n = listxattr(path, list, sizeof(list)); + if (n <= 0 || memcmp(list, "user.dragonos", sizeof("user.dragonos")) != 0) { + printf("[FAIL] listxattr value n=%zd first='%s' errno=%d\n", n, list, errno); + goto fail; + } + if (listxattr_count != 2) { + printf("[FAIL] listxattr_count=%u expected=2\n", listxattr_count); + goto fail; } - struct { - struct fuse_out_header out; - struct fuse_notify_inval_inode_out inval; - } notify_msg; - memset(¬ify_msg, 0, sizeof(notify_msg)); - notify_msg.out.len = sizeof(notify_msg); - notify_msg.out.error = FUSE_NOTIFY_INVAL_INODE; - notify_msg.out.unique = 0; - notify_msg.inval.ino = 2; - notify_msg.inval.off = 0; - notify_msg.inval.len = -1; - wn = write(fd, ¬ify_msg, sizeof(notify_msg)); - if (wn != (ssize_t)sizeof(notify_msg)) { - printf("[FAIL] write notify: wn=%zd errno=%d (%s)\n", wn, errno, strerror(errno)); + args.force_listxattr_erange_at_max = 1; + errno = 0; + if (listxattr(path, max_xattr_buf, sizeof(max_xattr_buf)) != -1 || errno != E2BIG) { + printf("[FAIL] listxattr max-size ERANGE errno=%d expected=%d\n", errno, E2BIG); goto fail; } + if (listxattr_count != 3) { + printf("[FAIL] listxattr max-size count=%u expected=3\n", listxattr_count); + goto fail; + } + args.force_listxattr_erange_at_max = 0; - usleep(100 * 1000); + n = getxattr(path, "user.dragonos", NULL, 0); + if (n != (ssize_t)strlen("virtiofs-xattr")) { + printf("[FAIL] getxattr size n=%zd errno=%d (%s)\n", n, errno, strerror(errno)); + goto fail; + } + errno = 0; + if (getxattr(path, "user.dragonos", small, sizeof(small)) != 
-1 || errno != ERANGE) { + printf("[FAIL] getxattr small buffer errno=%d expected=%d\n", errno, ERANGE); + goto fail; + } + n = getxattr(path, "user.dragonos", value, sizeof(value)); + if (n != (ssize_t)strlen("virtiofs-xattr") || + memcmp(value, "virtiofs-xattr", strlen("virtiofs-xattr")) != 0) { + printf("[FAIL] getxattr value n=%zd value='%s' errno=%d\n", n, value, errno); + goto fail; + } + if (getxattr_count != 3) { + printf("[FAIL] getxattr_count=%u expected=3\n", getxattr_count); + goto fail; + } - if (open_count != 1 || opendir_count != 1 || release_count != 0 || releasedir_count != 0 || - readdirplus_count == 0) { - printf("[FAIL] counters open=%u opendir=%u release=%u releasedir=%u readdirplus=%u\n", - open_count, opendir_count, release_count, releasedir_count, readdirplus_count); + args.force_getxattr_erange_at_max = 1; + errno = 0; + if (getxattr(path, "user.dragonos", max_xattr_buf, sizeof(max_xattr_buf)) != -1 || + errno != E2BIG) { + printf("[FAIL] getxattr max-size ERANGE errno=%d expected=%d\n", errno, E2BIG); + goto fail; + } + if (getxattr_count != 4) { + printf("[FAIL] getxattr max-size count=%u expected=4\n", getxattr_count); goto fail; } + args.force_getxattr_erange_at_max = 0; - if (umount(mp) != 0) { - printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); - goto fail_no_umount; + set_count_before = setxattr_count; + errno = 0; + if (setxattr(path, "user.dragonos", "new", 3, 0x4) != -1 || errno != EINVAL) { + printf("[FAIL] setxattr invalid flags errno=%d expected=%d\n", errno, EINVAL); + goto fail; + } + if (setxattr_count != set_count_before) { + printf("[FAIL] invalid flags reached fuse daemon count=%u before=%u\n", setxattr_count, + set_count_before); + goto fail; } - stop = 1; - close(fd); - pthread_join(th, NULL); - rmdir(mp); - return 0; -fail: - umount(mp); -fail_no_umount: - stop = 1; - close(fd); - pthread_join(th, NULL); - rmdir(mp); - return -1; -} - -static int ext_test_open_zero_fh_valid() { - const char *mp 
= "/tmp/test_fuse_zero_fh"; - if (ensure_dir(mp) != 0) { - printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); - return -1; + errno = 0; + if (setxattr(path, "user.dragonos", value_too_large, sizeof(value_too_large), 0) != -1 || + errno != E2BIG) { + printf("[FAIL] setxattr oversized value errno=%d expected=%d\n", errno, E2BIG); + goto fail; } - - int fd = open("/dev/fuse", O_RDWR); - if (fd < 0) { - printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); - rmdir(mp); - return -1; + if (setxattr_count != set_count_before) { + printf("[FAIL] oversized value reached fuse daemon count=%u before=%u\n", setxattr_count, + set_count_before); + goto fail; } - volatile int stop = 0; - volatile int init_done = 0; - volatile uint32_t open_count = 0; - volatile uint32_t read_count = 0; - volatile uint64_t last_open_fh = UINT64_MAX; - volatile uint64_t last_read_fh = UINT64_MAX; - - struct fuse_daemon_args args; - memset(&args, 0, sizeof(args)); - args.fd = fd; - args.stop = &stop; - args.init_done = &init_done; - args.stop_on_destroy = 1; - args.open_count = &open_count; - args.read_count = &read_count; - args.last_open_fh = &last_open_fh; - args.last_read_fh = &last_read_fh; - args.has_hello_open_fh_override = 1; - args.hello_open_fh_override = 0; - - pthread_t th; - if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { - printf("[FAIL] pthread_create\n"); - close(fd); - rmdir(mp); - return -1; + errno = 0; + if (setxattr(path, "", "new", 3, 0) != -1 || errno != ERANGE) { + printf("[FAIL] setxattr empty name errno=%d expected=%d\n", errno, ERANGE); + goto fail; + } + if (setxattr_count != set_count_before) { + printf("[FAIL] empty name reached fuse daemon count=%u before=%u\n", setxattr_count, + set_count_before); + goto fail; } - char opts[256]; - snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", fd); - if (mount("none", mp, "fuse", 0, opts) != 0) { - printf("[FAIL] mount(fuse): %s 
(errno=%d)\n", strerror(errno), errno); - stop = 1; - close(fd); - pthread_join(th, NULL); - rmdir(mp); - return -1; + fill_user_xattr_name(name_255, XATTR_NAME_MAX); + if (setxattr(path, name_255, "new", 3, 0) != 0) { + printf("[FAIL] setxattr 255-byte name failed errno=%d (%s)\n", errno, strerror(errno)); + goto fail; } - if (fuseg_wait_init(&init_done) != 0) { - printf("[FAIL] init handshake timeout\n"); + if (last_setxattr_flags != 0) { + printf("[FAIL] setxattr 255-byte name flags=%u expected=0\n", last_setxattr_flags); goto fail; } + set_count_before = setxattr_count; - char path[256]; - char buf[128]; - snprintf(path, sizeof(path), "%s/hello.txt", mp); - if (fuseg_read_file_cstr(path, buf, sizeof(buf)) < 0) { - printf("[FAIL] read(%s): %s (errno=%d)\n", path, strerror(errno), errno); + fill_user_xattr_name(name_256, XATTR_NAME_MAX + 1); + errno = 0; + if (setxattr(path, name_256, "new", 3, 0) != -1 || errno != ERANGE) { + printf("[FAIL] setxattr 256-byte name errno=%d expected=%d\n", errno, ERANGE); goto fail; } - if (strcmp(buf, "hello from fuse\n") != 0) { - printf("[FAIL] content mismatch: got='%s'\n", buf); + if (setxattr_count != set_count_before) { + printf("[FAIL] 256-byte name reached fuse daemon count=%u before=%u\n", setxattr_count, + set_count_before); goto fail; } - usleep(100 * 1000); - if (open_count == 0 || read_count == 0 || last_open_fh != 0 || last_read_fh != 0) { - printf("[FAIL] fh counters open=%u read=%u open_fh=%llu read_fh=%llu\n", open_count, - read_count, (unsigned long long)last_open_fh, (unsigned long long)last_read_fh); + if (setxattr(path, "user.zero", nullptr, 0, 0) != 0) { + printf("[FAIL] setxattr zero-size null value failed errno=%d (%s)\n", errno, + strerror(errno)); + goto fail; + } + if (last_setxattr_flags != 0) { + printf("[FAIL] setxattr zero-size null flags=%u expected=0\n", last_setxattr_flags); goto fail; } - if (umount(mp) != 0) { - printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); - goto 
fail_no_umount; + if (setxattr(path, "user.dragonos", "new", 3, 0) != 0) { + printf("[FAIL] setxattr failed errno=%d (%s)\n", errno, strerror(errno)); + goto fail; + } + if (last_setxattr_flags != 0) { + printf("[FAIL] setxattr flags=%u expected=0\n", last_setxattr_flags); + goto fail; + } + errno = 0; + if (setxattr(path, "user.dragonos", "new", 3, XATTR_CREATE) != -1 || errno != EEXIST) { + printf("[FAIL] setxattr XATTR_CREATE errno=%d expected=%d\n", errno, EEXIST); + goto fail; + } + if (last_setxattr_flags != XATTR_CREATE) { + printf("[FAIL] setxattr flags=%u expected XATTR_CREATE=%d\n", last_setxattr_flags, + XATTR_CREATE); + goto fail; + } + if (setxattr(path, "user.created", "new", 3, XATTR_CREATE) != 0) { + printf("[FAIL] setxattr XATTR_CREATE missing failed errno=%d (%s)\n", errno, + strerror(errno)); + goto fail; + } + if (last_setxattr_flags != XATTR_CREATE) { + printf("[FAIL] setxattr flags=%u expected missing XATTR_CREATE=%d\n", + last_setxattr_flags, XATTR_CREATE); + goto fail; } + if (setxattr(path, "user.dragonos", "new", 3, XATTR_REPLACE) != 0) { + printf("[FAIL] setxattr XATTR_REPLACE failed errno=%d (%s)\n", errno, strerror(errno)); + goto fail; + } + if (last_setxattr_flags != XATTR_REPLACE) { + printf("[FAIL] setxattr flags=%u expected XATTR_REPLACE=%d\n", last_setxattr_flags, + XATTR_REPLACE); + goto fail; + } + errno = 0; + if (setxattr(path, "user.missing", "new", 3, XATTR_REPLACE) != -1 || errno != ENODATA) { + printf("[FAIL] setxattr XATTR_REPLACE missing errno=%d expected=%d\n", errno, ENODATA); + goto fail; + } + if (last_setxattr_flags != XATTR_REPLACE) { + printf("[FAIL] setxattr flags=%u expected missing XATTR_REPLACE=%d\n", + last_setxattr_flags, XATTR_REPLACE); + goto fail; + } + if (removexattr(path, "user.dragonos") != 0) { + printf("[FAIL] removexattr failed errno=%d (%s)\n", errno, strerror(errno)); + goto fail; + } + if (setxattr_count != 7 || removexattr_count != 1) { + printf("[FAIL] set/remove counts set=%u remove=%u\n", 
setxattr_count, removexattr_count); + goto fail; + } + + umount(mp); stop = 1; close(fd); pthread_join(th, NULL); @@ -670,7 +641,6 @@ static int ext_test_open_zero_fh_valid() { fail: umount(mp); -fail_no_umount: stop = 1; close(fd); pthread_join(th, NULL); @@ -678,9 +648,10 @@ static int ext_test_open_zero_fh_valid() { return -1; } -static int ext_test_noopen_fsync_uses_zero_fh() { - const char *mp = "/tmp/test_fuse_noopen_fsync"; - int f = -1; +static int ext_test_xattr_enosys_is_cached() { + const char *mp = "/tmp/test_fuse_xattr_enosys"; + char path[256]; + if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); return -1; @@ -695,23 +666,15 @@ static int ext_test_noopen_fsync_uses_zero_fh() { volatile int stop = 0; volatile int init_done = 0; - volatile uint32_t open_count = 0; - volatile uint32_t fsync_count = 0; - volatile uint32_t release_count = 0; - volatile uint64_t last_fsync_fh = UINT64_MAX; + volatile uint32_t listxattr_count = 0; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; - args.stop_on_destroy = 1; - args.open_count = &open_count; - args.fsync_count = &fsync_count; - args.release_count = &release_count; - args.last_fsync_fh = &last_fsync_fh; - args.force_open_enosys = 1; - args.init_out_flags_override = FUSE_INIT_EXT | FUSE_MAX_PAGES | FUSE_NO_OPEN_SUPPORT; + args.listxattr_count = &listxattr_count; + args.force_xattr_enosys = 1; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -736,31 +699,22 @@ static int ext_test_noopen_fsync_uses_zero_fh() { goto fail; } - char path[256]; snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDONLY); - if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); - goto fail; - } - if (fsync(f) != 0) { - printf("[FAIL] fsync(no-open file): %s (errno=%d)\n", strerror(errno), errno); - close(f); - goto fail; + for 
(int i = 0; i < 2; ++i) { + errno = 0; + if (listxattr(path, NULL, 0) != -1 || + (errno != EOPNOTSUPP && errno != ENOTSUP)) { + printf("[FAIL] listxattr ENOSYS cache iter=%d errno=%d (%s)\n", i, errno, + strerror(errno)); + goto fail; + } } - close(f); - - usleep(100 * 1000); - if (open_count != 1 || fsync_count == 0 || release_count != 0 || last_fsync_fh != 0) { - printf("[FAIL] counters open=%u fsync=%u release=%u fsync_fh=%llu\n", open_count, - fsync_count, release_count, (unsigned long long)last_fsync_fh); + if (listxattr_count != 1) { + printf("[FAIL] listxattr ENOSYS should be cached, count=%u\n", listxattr_count); goto fail; } - if (umount(mp) != 0) { - printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); - goto fail_no_umount; - } + umount(mp); stop = 1; close(fd); pthread_join(th, NULL); @@ -769,7 +723,6 @@ static int ext_test_noopen_fsync_uses_zero_fh() { fail: umount(mp); -fail_no_umount: stop = 1; close(fd); pthread_join(th, NULL); @@ -777,12 +730,58 @@ static int ext_test_noopen_fsync_uses_zero_fh() { return -1; } -static int ext_test_fsync_enosys_cached_success() { - const char *mp = "/tmp/test_fuse_fsync_enosys"; - int f = -1; - int dfd = -1; +static void ext_sigusr1_handler(int signo) { + (void)signo; +} + +struct ext_reader_ctx { + char path[256]; + volatile int done; + ssize_t nread; + int err; +}; + +static void *ext_reader_thread(void *arg) { + struct ext_reader_ctx *ctx = (struct ext_reader_ctx *)arg; + int fd = open(ctx->path, O_RDONLY); + if (fd < 0) { + ctx->nread = -1; + ctx->err = errno; + ctx->done = 1; + return NULL; + } + + char buf[64]; + ssize_t n = read(fd, buf, sizeof(buf)); + if (n < 0) { + ctx->nread = -1; + ctx->err = errno; + } else { + ctx->nread = n; + ctx->err = 0; + } + close(fd); + ctx->done = 1; + return NULL; +} + +static int ext_test_p3_interrupt() { + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = ext_sigusr1_handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + + struct 
sigaction old_sa; + if (sigaction(SIGUSR1, &sa, &old_sa) != 0) { + printf("[FAIL] sigaction(SIGUSR1): %s (errno=%d)\n", strerror(errno), errno); + return -1; + } + + const char *mp = "/tmp/test_fuse_p3_interrupt"; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + sigaction(SIGUSR1, &old_sa, NULL); return -1; } @@ -790,30 +789,36 @@ static int ext_test_fsync_enosys_cached_success() { if (fd < 0) { printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); rmdir(mp); + sigaction(SIGUSR1, &old_sa, NULL); return -1; } volatile int stop = 0; volatile int init_done = 0; - volatile uint32_t fsync_count = 0; - volatile uint32_t fsyncdir_count = 0; + volatile uint32_t interrupt_count = 0; + volatile uint64_t blocked_read_unique = 0; + volatile uint64_t last_interrupt_header_unique = 0; + volatile uint64_t last_interrupt_target = 0; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; + args.enable_write_ops = 0; args.stop_on_destroy = 1; - args.fsync_count = &fsync_count; - args.fsyncdir_count = &fsyncdir_count; - args.force_fsync_errno = ENOSYS; - args.force_fsyncdir_errno = ENOSYS; + args.block_read_until_interrupt = 1000; + args.interrupt_count = &interrupt_count; + args.blocked_read_unique = &blocked_read_unique; + args.last_interrupt_header_unique = &last_interrupt_header_unique; + args.last_interrupt_target = &last_interrupt_target; - pthread_t th; - if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { - printf("[FAIL] pthread_create\n"); + pthread_t daemon_th; + if (pthread_create(&daemon_th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create(daemon)\n"); close(fd); rmdir(mp); + sigaction(SIGUSR1, &old_sa, NULL); return -1; } @@ -823,8 +828,9 @@ static int ext_test_fsync_enosys_cached_success() { printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); stop = 1; close(fd); - 
pthread_join(th, NULL); + pthread_join(daemon_th, NULL); rmdir(mp); + sigaction(SIGUSR1, &old_sa, NULL); return -1; } if (fuseg_wait_init(&init_done) != 0) { @@ -832,35 +838,63 @@ static int ext_test_fsync_enosys_cached_success() { goto fail; } - char path[256]; - snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDONLY); - if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); - goto fail; - } - if (fsync(f) != 0 || fsync(f) != 0) { - printf("[FAIL] fsync(file ENOSYS cache): %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - close(f); - f = -1; + struct ext_reader_ctx rctx; + memset(&rctx, 0, sizeof(rctx)); + snprintf(rctx.path, sizeof(rctx.path), "%s/hello.txt", mp); - dfd = open(mp, O_RDONLY | O_DIRECTORY); - if (dfd < 0) { - printf("[FAIL] open dirfd(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + pthread_t reader_th; + if (pthread_create(&reader_th, NULL, ext_reader_thread, &rctx) != 0) { + printf("[FAIL] pthread_create(reader)\n"); goto fail; } - if (fsync(dfd) != 0 || fsync(dfd) != 0) { - printf("[FAIL] fsync(dir ENOSYS cache): %s (errno=%d)\n", strerror(errno), errno); + + for (int i = 0; i < 200; i++) { + if (blocked_read_unique != 0) { + break; + } + usleep(5 * 1000); + } + if (blocked_read_unique == 0) { + printf("[FAIL] timed out waiting for blocked read request\n"); + stop = 1; + pthread_join(reader_th, NULL); goto fail; } - close(dfd); - dfd = -1; - if (fsync_count != 1 || fsyncdir_count != 1) { - printf("[FAIL] ENOSYS fsync cache counters fsync=%u fsyncdir=%u\n", fsync_count, - fsyncdir_count); + if (pthread_kill(reader_th, SIGUSR1) != 0) { + printf("[FAIL] pthread_kill(SIGUSR1)\n"); + stop = 1; + pthread_join(reader_th, NULL); + goto fail; + } + pthread_join(reader_th, NULL); + + if (rctx.nread != -1 || rctx.err != EINTR) { + printf("[FAIL] reader expected EINTR, nread=%zd err=%d (%s)\n", rctx.nread, rctx.err, + strerror(rctx.err)); + goto fail; + } + + for (int i = 0; i < 500; 
i++) { + if (interrupt_count > 0) { + break; + } + usleep(5 * 1000); + } + + if (interrupt_count == 0) { + printf("[FAIL] expected FUSE_INTERRUPT request\n"); + goto fail; + } + if (last_interrupt_target == 0 || last_interrupt_target != blocked_read_unique) { + printf("[FAIL] interrupt target mismatch: blocked=%llu interrupt_target=%llu\n", + (unsigned long long)blocked_read_unique, (unsigned long long)last_interrupt_target); + goto fail; + } + if (last_interrupt_header_unique != (blocked_read_unique | 1ULL)) { + printf("[FAIL] interrupt header unique mismatch: blocked=%llu header=%llu\n", + (unsigned long long)blocked_read_unique, + (unsigned long long)last_interrupt_header_unique); goto fail; } @@ -870,31 +904,25 @@ static int ext_test_fsync_enosys_cached_success() { } stop = 1; close(fd); - pthread_join(th, NULL); + pthread_join(daemon_th, NULL); rmdir(mp); + sigaction(SIGUSR1, &old_sa, NULL); return 0; fail: - if (f >= 0) { - close(f); - } - if (dfd >= 0) { - close(dfd); - } umount(mp); fail_no_umount: stop = 1; close(fd); - pthread_join(th, NULL); + pthread_join(daemon_th, NULL); rmdir(mp); + sigaction(SIGUSR1, &old_sa, NULL); return -1; } -static int ext_test_open_release_flags_match_linux() { - const char *mp = "/tmp/test_fuse_open_flags"; - int requested = O_RDWR | O_NOCTTY | O_TRUNC | O_APPEND | O_NONBLOCK; - uint32_t expected_open = (uint32_t)(requested & ~(O_CREAT | O_EXCL | O_NOCTTY)); - int f = -1; +static int ext_test_p3_noopen_readdirplus_notify() { + const char *mp = "/tmp/test_fuse_p3_noopen"; + ssize_t wn = -1; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); return -1; @@ -909,19 +937,28 @@ static int ext_test_open_release_flags_match_linux() { volatile int stop = 0; volatile int init_done = 0; - volatile uint32_t last_open_flags = 0; - volatile uint32_t last_release_flags = 0; + volatile uint32_t open_count = 0; + volatile uint32_t opendir_count = 0; + volatile uint32_t release_count = 0; + 
volatile uint32_t releasedir_count = 0; + volatile uint32_t readdirplus_count = 0; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; - args.enable_write_ops = 1; + args.enable_write_ops = 0; args.stop_on_destroy = 1; - args.last_open_in_flags = &last_open_flags; - args.last_release_in_flags = &last_release_flags; - args.init_out_flags_override = FUSE_INIT_EXT | FUSE_MAX_PAGES | FUSE_ATOMIC_O_TRUNC; + args.open_count = &open_count; + args.opendir_count = &opendir_count; + args.release_count = &release_count; + args.releasedir_count = &releasedir_count; + args.readdirplus_count = &readdirplus_count; + args.force_open_enosys = 1; + args.force_opendir_enosys = 1; + args.init_out_flags_override = FUSE_INIT_EXT | FUSE_MAX_PAGES | FUSE_NO_OPEN_SUPPORT | + FUSE_NO_OPENDIR_SUPPORT | FUSE_DO_READDIRPLUS; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -946,23 +983,66 @@ static int ext_test_open_release_flags_match_linux() { goto fail; } - char path[256]; - snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, requested); - if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); - goto fail; + char file_path[256]; + snprintf(file_path, sizeof(file_path), "%s/hello.txt", mp); + for (int i = 0; i < 2; i++) { + int f = open(file_path, O_RDONLY); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", file_path, strerror(errno), errno); + goto fail; + } + char buf[64]; + ssize_t n = read(f, buf, sizeof(buf) - 1); + close(f); + if (n <= 0) { + printf("[FAIL] read(%s): %s (errno=%d)\n", file_path, strerror(errno), errno); + goto fail; + } } - close(f); - usleep(100 * 1000); - if (last_open_flags != expected_open) { - printf("[FAIL] open flags got=0%o expected=0%o\n", last_open_flags, expected_open); + for (int i = 0; i < 2; i++) { + DIR *dir = opendir(mp); + if (!dir) { + printf("[FAIL] opendir(%s): %s (errno=%d)\n", mp, 
strerror(errno), errno); + goto fail; + } + int saw = 0; + struct dirent *de; + while ((de = readdir(dir)) != NULL) { + if (strcmp(de->d_name, "hello.txt") == 0) { + saw = 1; + } + } + closedir(dir); + if (!saw) { + printf("[FAIL] readdir didn't see hello.txt\n"); + goto fail; + } + } + + struct { + struct fuse_out_header out; + struct fuse_notify_inval_inode_out inval; + } notify_msg; + memset(¬ify_msg, 0, sizeof(notify_msg)); + notify_msg.out.len = sizeof(notify_msg); + notify_msg.out.error = FUSE_NOTIFY_INVAL_INODE; + notify_msg.out.unique = 0; + notify_msg.inval.ino = 2; + notify_msg.inval.off = 0; + notify_msg.inval.len = -1; + wn = write(fd, ¬ify_msg, sizeof(notify_msg)); + if (wn != (ssize_t)sizeof(notify_msg)) { + printf("[FAIL] write notify: wn=%zd errno=%d (%s)\n", wn, errno, strerror(errno)); goto fail; } - if (last_release_flags != (uint32_t)requested) { - printf("[FAIL] release flags got=0%o expected=0%o\n", last_release_flags, - (uint32_t)requested); + + usleep(100 * 1000); + + if (open_count != 1 || opendir_count != 1 || release_count != 0 || releasedir_count != 0 || + readdirplus_count == 0) { + printf("[FAIL] counters open=%u opendir=%u release=%u releasedir=%u readdirplus=%u\n", + open_count, opendir_count, release_count, releasedir_count, readdirplus_count); goto fail; } @@ -986,14 +1066,8 @@ static int ext_test_open_release_flags_match_linux() { return -1; } -static int ext_test_fsetfl_updates_fuse_io_flags() { - const char *mp = "/tmp/test_fuse_fsetfl_flags"; - int requested = O_RDWR; - int f = -1; - int old_flags = -1; - uint32_t expected_open = (uint32_t)requested; - uint32_t expected_setfl = 0; - char buf[8]; +static int ext_test_open_zero_fh_valid() { + const char *mp = "/tmp/test_fuse_zero_fh"; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); return -1; @@ -1010,42 +1084,21 @@ static int ext_test_fsetfl_updates_fuse_io_flags() { volatile int init_done = 0; volatile uint32_t open_count 
= 0; volatile uint32_t read_count = 0; - volatile uint32_t write_count = 0; - volatile uint32_t flush_count = 0; - volatile uint32_t release_count = 0; - volatile uint32_t last_open_flags = 0; - volatile uint32_t last_read_flags = 0; - volatile uint32_t last_write_flags = 0; - volatile uint32_t last_flush_uid = UINT32_MAX; - volatile uint32_t last_flush_gid = UINT32_MAX; - volatile uint32_t last_flush_pid = 0; - volatile uint32_t last_release_flags = 0; - volatile uint32_t last_release_uid = UINT32_MAX; - volatile uint32_t last_release_gid = UINT32_MAX; - volatile uint32_t last_release_pid = UINT32_MAX; + volatile uint64_t last_open_fh = UINT64_MAX; + volatile uint64_t last_read_fh = UINT64_MAX; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; - args.enable_write_ops = 1; args.stop_on_destroy = 1; args.open_count = &open_count; args.read_count = &read_count; - args.write_count = &write_count; - args.flush_count = &flush_count; - args.release_count = &release_count; - args.last_open_in_flags = &last_open_flags; - args.last_read_open_flags = &last_read_flags; - args.last_write_open_flags = &last_write_flags; - args.last_flush_uid = &last_flush_uid; - args.last_flush_gid = &last_flush_gid; - args.last_flush_pid = &last_flush_pid; - args.last_release_in_flags = &last_release_flags; - args.last_release_uid = &last_release_uid; - args.last_release_gid = &last_release_gid; - args.last_release_pid = &last_release_pid; + args.last_open_fh = &last_open_fh; + args.last_read_fh = &last_read_fh; + args.has_hello_open_fh_override = 1; + args.hello_open_fh_override = 0; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -1071,67 +1124,27 @@ static int ext_test_fsetfl_updates_fuse_io_flags() { } char path[256]; + char buf[128]; snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, requested); - if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", path, 
strerror(errno), errno); - goto fail; - } - - old_flags = fcntl(f, F_GETFL); - if (old_flags < 0) { - printf("[FAIL] fcntl(F_GETFL): %s (errno=%d)\n", strerror(errno), errno); + if (fuseg_read_file_cstr(path, buf, sizeof(buf)) < 0) { + printf("[FAIL] read(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - if (fcntl(f, F_SETFL, old_flags | O_NONBLOCK) != 0) { - printf("[FAIL] fcntl(F_SETFL): %s (errno=%d)\n", strerror(errno), errno); + if (strcmp(buf, "hello from fuse\n") != 0) { + printf("[FAIL] content mismatch: got='%s'\n", buf); goto fail; } - memset(buf, 0, sizeof(buf)); - if (read(f, buf, 5) != 5 || memcmp(buf, "hello", 5) != 0) { - printf("[FAIL] read after F_SETFL got='%.*s' errno=%d\n", 5, buf, errno); - goto fail; - } - if (write(f, "X", 1) != 1) { - printf("[FAIL] write after F_SETFL: %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - close(f); - f = -1; usleep(100 * 1000); - - expected_setfl = (uint32_t)(old_flags | O_NONBLOCK); - if (open_count != 1 || read_count != 1 || write_count != 1 || flush_count != 1 || - release_count != 1) { - printf("[FAIL] counters open=%u read=%u write=%u flush=%u release=%u\n", open_count, - read_count, write_count, flush_count, release_count); - goto fail; - } - if (last_open_flags != expected_open) { - printf("[FAIL] open flags got=0%o expected=0%o\n", last_open_flags, expected_open); + if (open_count == 0 || read_count == 0 || last_open_fh != 0 || last_read_fh != 0) { + printf("[FAIL] fh counters open=%u read=%u open_fh=%llu read_fh=%llu\n", open_count, + read_count, (unsigned long long)last_open_fh, (unsigned long long)last_read_fh); goto fail; } - if ((last_read_flags & O_NONBLOCK) == 0 || last_write_flags != expected_setfl || - last_release_flags != expected_setfl) { - printf("[FAIL] updated flags read=0%o write=0%o release=0%o expected=0%o\n", - last_read_flags, last_write_flags, last_release_flags, expected_setfl); - goto fail; - } - if (last_flush_uid != 0 || last_flush_gid != 0 || 
last_flush_pid == 0) { - printf("[FAIL] flush should use caller credentials uid=%u gid=%u pid=%u\n", - last_flush_uid, last_flush_gid, last_flush_pid); - goto fail; - } - if (last_release_uid != 0 || last_release_gid != 0 || last_release_pid != 0) { - printf("[FAIL] release should use nocreds uid=%u gid=%u pid=%u\n", last_release_uid, - last_release_gid, last_release_pid); - goto fail; - } - - if (umount(mp) != 0) { - printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); - goto fail_no_umount; + + if (umount(mp) != 0) { + printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + goto fail_no_umount; } stop = 1; close(fd); @@ -1140,9 +1153,6 @@ static int ext_test_fsetfl_updates_fuse_io_flags() { return 0; fail: - if (f >= 0) { - close(f); - } umount(mp); fail_no_umount: stop = 1; @@ -1152,78 +1162,8 @@ static int ext_test_fsetfl_updates_fuse_io_flags() { return -1; } -static int ext_test_fsetfl_updates_fuse_dev_nonblock() { - int fd = open("/dev/fuse", O_RDWR); - if (fd < 0) { - printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); - return -1; - } - - int old_flags = fcntl(fd, F_GETFL); - if (old_flags < 0) { - printf("[FAIL] fcntl(F_GETFL): %s (errno=%d)\n", strerror(errno), errno); - close(fd); - return -1; - } - if ((old_flags & O_NONBLOCK) != 0) { - printf("[FAIL] /dev/fuse unexpectedly opened nonblocking: flags=0%o\n", old_flags); - close(fd); - return -1; - } - if (fcntl(fd, F_SETFL, old_flags | O_NONBLOCK) != 0) { - printf("[FAIL] fcntl(F_SETFL O_NONBLOCK): %s (errno=%d)\n", strerror(errno), errno); - close(fd); - return -1; - } - - pid_t child = fork(); - if (child < 0) { - printf("[FAIL] fork: %s (errno=%d)\n", strerror(errno), errno); - close(fd); - return -1; - } - if (child == 0) { - unsigned char *buf = (unsigned char *)malloc(FUSE_TEST_BUF_SIZE); - if (!buf) { - _exit(11); - } - ssize_t n = read(fd, buf, FUSE_TEST_BUF_SIZE); - int saved_errno = errno; - free(buf); - if (n < 0 && (saved_errno 
== EAGAIN || saved_errno == EWOULDBLOCK)) { - _exit(0); - } - _exit(12); - } - - for (int i = 0; i < 50; i++) { - int status = 0; - pid_t got = waitpid(child, &status, WNOHANG); - if (got == child) { - close(fd); - if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { - return 0; - } - printf("[FAIL] child read did not return EAGAIN, status=%d\n", status); - return -1; - } - if (got < 0) { - printf("[FAIL] waitpid: %s (errno=%d)\n", strerror(errno), errno); - close(fd); - return -1; - } - usleep(20 * 1000); - } - - kill(child, SIGKILL); - waitpid(child, NULL, 0); - close(fd); - printf("[FAIL] /dev/fuse read blocked after F_SETFL O_NONBLOCK\n"); - return -1; -} - -static int ext_test_fopen_noflush_skips_flush() { - const char *mp = "/tmp/test_fuse_noflush"; +static int ext_test_noopen_fsync_uses_zero_fh() { + const char *mp = "/tmp/test_fuse_noopen_fsync"; int f = -1; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); @@ -1239,8 +1179,10 @@ static int ext_test_fopen_noflush_skips_flush() { volatile int stop = 0; volatile int init_done = 0; - volatile uint32_t flush_count = 0; + volatile uint32_t open_count = 0; + volatile uint32_t fsync_count = 0; volatile uint32_t release_count = 0; + volatile uint64_t last_fsync_fh = UINT64_MAX; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); @@ -1248,9 +1190,12 @@ static int ext_test_fopen_noflush_skips_flush() { args.stop = &stop; args.init_done = &init_done; args.stop_on_destroy = 1; - args.flush_count = &flush_count; + args.open_count = &open_count; + args.fsync_count = &fsync_count; args.release_count = &release_count; - args.hello_open_out_flags = FOPEN_NOFLUSH; + args.last_fsync_fh = &last_fsync_fh; + args.force_open_enosys = 1; + args.init_out_flags_override = FUSE_INIT_EXT | FUSE_MAX_PAGES | FUSE_NO_OPEN_SUPPORT; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -1282,12 +1227,17 @@ static int 
ext_test_fopen_noflush_skips_flush() { printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } + if (fsync(f) != 0) { + printf("[FAIL] fsync(no-open file): %s (errno=%d)\n", strerror(errno), errno); + close(f); + goto fail; + } close(f); - f = -1; usleep(100 * 1000); - if (flush_count != 0 || release_count != 1) { - printf("[FAIL] noflush counters flush=%u release=%u\n", flush_count, release_count); + if (open_count != 1 || fsync_count == 0 || release_count != 0 || last_fsync_fh != 0) { + printf("[FAIL] counters open=%u fsync=%u release=%u fsync_fh=%llu\n", open_count, + fsync_count, release_count, (unsigned long long)last_fsync_fh); goto fail; } @@ -1302,9 +1252,6 @@ static int ext_test_fopen_noflush_skips_flush() { return 0; fail: - if (f >= 0) { - close(f); - } umount(mp); fail_no_umount: stop = 1; @@ -1314,12 +1261,10 @@ static int ext_test_fopen_noflush_skips_flush() { return -1; } -static int ext_test_close_returns_flush_error_and_closes_fd() { - const char *mp = "/tmp/test_fuse_close_flush_error"; +static int ext_test_fsync_enosys_cached_success() { + const char *mp = "/tmp/test_fuse_fsync_enosys"; int f = -1; - int oldfd = -1; - int rc = 0; - char tmp = 0; + int dfd = -1; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); return -1; @@ -1334,8 +1279,8 @@ static int ext_test_close_returns_flush_error_and_closes_fd() { volatile int stop = 0; volatile int init_done = 0; - volatile uint32_t flush_count = 0; - volatile uint32_t release_count = 0; + volatile uint32_t fsync_count = 0; + volatile uint32_t fsyncdir_count = 0; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); @@ -1343,9 +1288,10 @@ static int ext_test_close_returns_flush_error_and_closes_fd() { args.stop = &stop; args.init_done = &init_done; args.stop_on_destroy = 1; - args.flush_count = &flush_count; - args.release_count = &release_count; - args.force_flush_errno = EIO; + args.fsync_count = &fsync_count; 
+ args.fsyncdir_count = &fsyncdir_count; + args.force_fsync_errno = ENOSYS; + args.force_fsyncdir_errno = ENOSYS; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -1377,25 +1323,28 @@ static int ext_test_close_returns_flush_error_and_closes_fd() { printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - - errno = 0; - oldfd = f; - rc = close(f); + if (fsync(f) != 0 || fsync(f) != 0) { + printf("[FAIL] fsync(file ENOSYS cache): %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + close(f); f = -1; - if (rc != -1 || errno != EIO) { - printf("[FAIL] close should return EIO rc=%d errno=%d\n", rc, errno); + + dfd = open(mp, O_RDONLY | O_DIRECTORY); + if (dfd < 0) { + printf("[FAIL] open dirfd(%s): %s (errno=%d)\n", mp, strerror(errno), errno); goto fail; } - errno = 0; - if (read(oldfd, &tmp, 1) != -1 || errno != EBADF) { - printf("[FAIL] close error must still close fd read_errno=%d\n", errno); + if (fsync(dfd) != 0 || fsync(dfd) != 0) { + printf("[FAIL] fsync(dir ENOSYS cache): %s (errno=%d)\n", strerror(errno), errno); goto fail; } + close(dfd); + dfd = -1; - usleep(100 * 1000); - if (flush_count != 1 || release_count != 1) { - printf("[FAIL] close flush error counters flush=%u release=%u\n", flush_count, - release_count); + if (fsync_count != 1 || fsyncdir_count != 1) { + printf("[FAIL] ENOSYS fsync cache counters fsync=%u fsyncdir=%u\n", fsync_count, + fsyncdir_count); goto fail; } @@ -1413,6 +1362,9 @@ static int ext_test_close_returns_flush_error_and_closes_fd() { if (f >= 0) { close(f); } + if (dfd >= 0) { + close(dfd); + } umount(mp); fail_no_umount: stop = 1; @@ -1422,8 +1374,10 @@ static int ext_test_close_returns_flush_error_and_closes_fd() { return -1; } -static int ext_test_flush_enosys_cached_success() { - const char *mp = "/tmp/test_fuse_flush_enosys"; +static int ext_test_open_release_flags_match_linux() { + const char *mp = "/tmp/test_fuse_open_flags"; + int requested = O_RDWR | 
O_NOCTTY | O_TRUNC | O_APPEND | O_NONBLOCK; + uint32_t expected_open = (uint32_t)(requested & ~(O_CREAT | O_EXCL | O_NOCTTY)); int f = -1; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); @@ -1439,16 +1393,19 @@ static int ext_test_flush_enosys_cached_success() { volatile int stop = 0; volatile int init_done = 0; - volatile uint32_t flush_count = 0; + volatile uint32_t last_open_flags = 0; + volatile uint32_t last_release_flags = 0; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; + args.enable_write_ops = 1; args.stop_on_destroy = 1; - args.flush_count = &flush_count; - args.force_flush_errno = ENOSYS; + args.last_open_in_flags = &last_open_flags; + args.last_release_in_flags = &last_release_flags; + args.init_out_flags_override = FUSE_INIT_EXT | FUSE_MAX_PAGES | FUSE_ATOMIC_O_TRUNC; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -1475,23 +1432,21 @@ static int ext_test_flush_enosys_cached_success() { char path[256]; snprintf(path, sizeof(path), "%s/hello.txt", mp); - for (int i = 0; i < 2; ++i) { - f = open(path, O_RDONLY); - if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); - goto fail; - } - if (close(f) != 0) { - printf("[FAIL] close after FLUSH ENOSYS: %s (errno=%d)\n", strerror(errno), errno); - f = -1; - goto fail; - } - f = -1; + f = open(path, requested); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + goto fail; } + close(f); usleep(100 * 1000); - if (flush_count != 1) { - printf("[FAIL] FLUSH ENOSYS should be cached, flush_count=%u\n", flush_count); + if (last_open_flags != expected_open) { + printf("[FAIL] open flags got=0%o expected=0%o\n", last_open_flags, expected_open); + goto fail; + } + if (last_release_flags != (uint32_t)requested) { + printf("[FAIL] release flags got=0%o expected=0%o\n", 
last_release_flags, + (uint32_t)requested); goto fail; } @@ -1506,9 +1461,6 @@ static int ext_test_flush_enosys_cached_success() { return 0; fail: - if (f >= 0) { - close(f); - } umount(mp); fail_no_umount: stop = 1; @@ -1518,12 +1470,14 @@ static int ext_test_flush_enosys_cached_success() { return -1; } -static int ext_test_fopen_nonseekable_mode(uint32_t open_out_flags, const char *mp, - int expect_stream) { +static int ext_test_fsetfl_updates_fuse_io_flags() { + const char *mp = "/tmp/test_fuse_fsetfl_flags"; + int requested = O_RDWR; int f = -1; + int old_flags = -1; + uint32_t expected_open = (uint32_t)requested; + uint32_t expected_setfl = 0; char buf[8]; - ssize_t n = -1; - volatile uint64_t last_write_offset = UINT64_MAX; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); return -1; @@ -1538,6 +1492,21 @@ static int ext_test_fopen_nonseekable_mode(uint32_t open_out_flags, const char * volatile int stop = 0; volatile int init_done = 0; + volatile uint32_t open_count = 0; + volatile uint32_t read_count = 0; + volatile uint32_t write_count = 0; + volatile uint32_t flush_count = 0; + volatile uint32_t release_count = 0; + volatile uint32_t last_open_flags = 0; + volatile uint32_t last_read_flags = 0; + volatile uint32_t last_write_flags = 0; + volatile uint32_t last_flush_uid = UINT32_MAX; + volatile uint32_t last_flush_gid = UINT32_MAX; + volatile uint32_t last_flush_pid = 0; + volatile uint32_t last_release_flags = 0; + volatile uint32_t last_release_uid = UINT32_MAX; + volatile uint32_t last_release_gid = UINT32_MAX; + volatile uint32_t last_release_pid = UINT32_MAX; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); @@ -1546,8 +1515,21 @@ static int ext_test_fopen_nonseekable_mode(uint32_t open_out_flags, const char * args.init_done = &init_done; args.enable_write_ops = 1; args.stop_on_destroy = 1; - args.hello_open_out_flags = open_out_flags; - args.last_write_offset = &last_write_offset; + 
args.open_count = &open_count; + args.read_count = &read_count; + args.write_count = &write_count; + args.flush_count = &flush_count; + args.release_count = &release_count; + args.last_open_in_flags = &last_open_flags; + args.last_read_open_flags = &last_read_flags; + args.last_write_open_flags = &last_write_flags; + args.last_flush_uid = &last_flush_uid; + args.last_flush_gid = &last_flush_gid; + args.last_flush_pid = &last_flush_pid; + args.last_release_in_flags = &last_release_flags; + args.last_release_uid = &last_release_uid; + args.last_release_gid = &last_release_gid; + args.last_release_pid = &last_release_pid; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -1574,58 +1556,63 @@ static int ext_test_fopen_nonseekable_mode(uint32_t open_out_flags, const char * char path[256]; snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDWR); + f = open(path, requested); if (f < 0) { printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - errno = 0; - if (lseek(f, 0, SEEK_SET) >= 0 || errno != ESPIPE) { - printf("[FAIL] lseek expected ESPIPE, ret errno=%d (%s)\n", errno, strerror(errno)); - goto fail; - } - errno = 0; - if (pread(f, buf, 1, 0) >= 0 || errno != ESPIPE) { - printf("[FAIL] pread expected ESPIPE, errno=%d (%s)\n", errno, strerror(errno)); + old_flags = fcntl(f, F_GETFL); + if (old_flags < 0) { + printf("[FAIL] fcntl(F_GETFL): %s (errno=%d)\n", strerror(errno), errno); goto fail; } - errno = 0; - if (pwrite(f, "x", 1, 0) >= 0 || errno != ESPIPE) { - printf("[FAIL] pwrite expected ESPIPE, errno=%d (%s)\n", errno, strerror(errno)); + if (fcntl(f, F_SETFL, old_flags | O_NONBLOCK) != 0) { + printf("[FAIL] fcntl(F_SETFL): %s (errno=%d)\n", strerror(errno), errno); goto fail; } memset(buf, 0, sizeof(buf)); if (read(f, buf, 5) != 5 || memcmp(buf, "hello", 5) != 0) { - printf("[FAIL] ordinary read failed got='%.*s' errno=%d\n", 5, buf, errno); + printf("[FAIL] read after F_SETFL 
got='%.*s' errno=%d\n", 5, buf, errno); goto fail; } - memset(buf, 0, sizeof(buf)); - n = read(f, buf, 5); - if (expect_stream) { - if (n != 5 || memcmp(buf, "hello", 5) != 0) { - printf("[FAIL] stream read did not restart at offset 0 got n=%zd data='%.*s' errno=%d\n", - n, 5, buf, errno); - goto fail; - } - if (write(f, "Z", 1) != 1) { - printf("[FAIL] stream write failed: %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - if (last_write_offset != 0) { - printf("[FAIL] stream write offset expected 0 got %llu\n", - (unsigned long long)last_write_offset); - goto fail; - } - } else if (n != 5 || memcmp(buf, " from", 5) != 0) { - printf("[FAIL] nonseekable sequential read should advance offset got n=%zd data='%.*s'\n", n, - 5, buf); + if (write(f, "X", 1) != 1) { + printf("[FAIL] write after F_SETFL: %s (errno=%d)\n", strerror(errno), errno); goto fail; } - close(f); f = -1; + usleep(100 * 1000); + + expected_setfl = (uint32_t)(old_flags | O_NONBLOCK); + if (open_count != 1 || read_count != 1 || write_count != 1 || flush_count != 1 || + release_count != 1) { + printf("[FAIL] counters open=%u read=%u write=%u flush=%u release=%u\n", open_count, + read_count, write_count, flush_count, release_count); + goto fail; + } + if (last_open_flags != expected_open) { + printf("[FAIL] open flags got=0%o expected=0%o\n", last_open_flags, expected_open); + goto fail; + } + if ((last_read_flags & O_NONBLOCK) == 0 || last_write_flags != expected_setfl || + last_release_flags != expected_setfl) { + printf("[FAIL] updated flags read=0%o write=0%o release=0%o expected=0%o\n", + last_read_flags, last_write_flags, last_release_flags, expected_setfl); + goto fail; + } + if (last_flush_uid != 0 || last_flush_gid != 0 || last_flush_pid == 0) { + printf("[FAIL] flush should use caller credentials uid=%u gid=%u pid=%u\n", + last_flush_uid, last_flush_gid, last_flush_pid); + goto fail; + } + if (last_release_uid != 0 || last_release_gid != 0 || last_release_pid != 0) { + 
printf("[FAIL] release should use nocreds uid=%u gid=%u pid=%u\n", last_release_uid, + last_release_gid, last_release_pid); + goto fail; + } + if (umount(mp) != 0) { printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); goto fail_no_umount; @@ -1649,7 +1636,78 @@ static int ext_test_fopen_nonseekable_mode(uint32_t open_out_flags, const char * return -1; } -static int ext_test_fopen_nonseekable_dir_mode(uint32_t open_out_flags, const char *mp) { +static int ext_test_fsetfl_updates_fuse_dev_nonblock() { + int fd = open("/dev/fuse", O_RDWR); + if (fd < 0) { + printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + return -1; + } + + int old_flags = fcntl(fd, F_GETFL); + if (old_flags < 0) { + printf("[FAIL] fcntl(F_GETFL): %s (errno=%d)\n", strerror(errno), errno); + close(fd); + return -1; + } + if ((old_flags & O_NONBLOCK) != 0) { + printf("[FAIL] /dev/fuse unexpectedly opened nonblocking: flags=0%o\n", old_flags); + close(fd); + return -1; + } + if (fcntl(fd, F_SETFL, old_flags | O_NONBLOCK) != 0) { + printf("[FAIL] fcntl(F_SETFL O_NONBLOCK): %s (errno=%d)\n", strerror(errno), errno); + close(fd); + return -1; + } + + pid_t child = fork(); + if (child < 0) { + printf("[FAIL] fork: %s (errno=%d)\n", strerror(errno), errno); + close(fd); + return -1; + } + if (child == 0) { + unsigned char *buf = (unsigned char *)malloc(FUSE_TEST_BUF_SIZE); + if (!buf) { + _exit(11); + } + ssize_t n = read(fd, buf, FUSE_TEST_BUF_SIZE); + int saved_errno = errno; + free(buf); + if (n < 0 && (saved_errno == EAGAIN || saved_errno == EWOULDBLOCK)) { + _exit(0); + } + _exit(12); + } + + for (int i = 0; i < 50; i++) { + int status = 0; + pid_t got = waitpid(child, &status, WNOHANG); + if (got == child) { + close(fd); + if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { + return 0; + } + printf("[FAIL] child read did not return EAGAIN, status=%d\n", status); + return -1; + } + if (got < 0) { + printf("[FAIL] waitpid: %s (errno=%d)\n", 
strerror(errno), errno); + close(fd); + return -1; + } + usleep(20 * 1000); + } + + kill(child, SIGKILL); + waitpid(child, NULL, 0); + close(fd); + printf("[FAIL] /dev/fuse read blocked after F_SETFL O_NONBLOCK\n"); + return -1; +} + +static int ext_test_fopen_noflush_skips_flush() { + const char *mp = "/tmp/test_fuse_noflush"; int f = -1; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); @@ -1665,10 +1723,8 @@ static int ext_test_fopen_nonseekable_dir_mode(uint32_t open_out_flags, const ch volatile int stop = 0; volatile int init_done = 0; - volatile uint32_t releasedir_count = 0; - volatile uint32_t last_releasedir_uid = UINT32_MAX; - volatile uint32_t last_releasedir_gid = UINT32_MAX; - volatile uint32_t last_releasedir_pid = UINT32_MAX; + volatile uint32_t flush_count = 0; + volatile uint32_t release_count = 0; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); @@ -1676,11 +1732,9 @@ static int ext_test_fopen_nonseekable_dir_mode(uint32_t open_out_flags, const ch args.stop = &stop; args.init_done = &init_done; args.stop_on_destroy = 1; - args.root_open_out_flags = open_out_flags; - args.releasedir_count = &releasedir_count; - args.last_releasedir_uid = &last_releasedir_uid; - args.last_releasedir_gid = &last_releasedir_gid; - args.last_releasedir_pid = &last_releasedir_pid; + args.flush_count = &flush_count; + args.release_count = &release_count; + args.hello_open_out_flags = FOPEN_NOFLUSH; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -1705,26 +1759,19 @@ static int ext_test_fopen_nonseekable_dir_mode(uint32_t open_out_flags, const ch goto fail; } - f = open(mp, O_RDONLY | O_DIRECTORY); + char path[256]; + snprintf(path, sizeof(path), "%s/hello.txt", mp); + f = open(path, O_RDONLY); if (f < 0) { - printf("[FAIL] open(%s, O_DIRECTORY): %s (errno=%d)\n", mp, strerror(errno), errno); - goto fail; - } - - errno = 0; - if (lseek(f, 0, SEEK_SET) >= 0 || errno != 
ESPIPE) { - printf("[FAIL] dir lseek expected ESPIPE, errno=%d (%s)\n", errno, strerror(errno)); + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - close(f); f = -1; - usleep(100 * 1000); - if (releasedir_count != 1 || last_releasedir_uid != 0 || last_releasedir_gid != 0 || - last_releasedir_pid != 0) { - printf("[FAIL] releasedir nocreds count=%u uid=%u gid=%u pid=%u\n", releasedir_count, - last_releasedir_uid, last_releasedir_gid, last_releasedir_pid); + usleep(100 * 1000); + if (flush_count != 0 || release_count != 1) { + printf("[FAIL] noflush counters flush=%u release=%u\n", flush_count, release_count); goto fail; } @@ -1751,10 +1798,12 @@ static int ext_test_fopen_nonseekable_dir_mode(uint32_t open_out_flags, const ch return -1; } -static int ext_test_atomic_otrunc_uses_open_without_setattr() { - const char *mp = "/tmp/test_fuse_atomic_otrunc"; - int requested = O_RDWR | O_TRUNC; +static int ext_test_close_returns_flush_error_and_closes_fd() { + const char *mp = "/tmp/test_fuse_close_flush_error"; int f = -1; + int oldfd = -1; + int rc = 0; + char tmp = 0; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); return -1; @@ -1769,21 +1818,18 @@ static int ext_test_atomic_otrunc_uses_open_without_setattr() { volatile int stop = 0; volatile int init_done = 0; - volatile uint32_t last_open_flags = 0; - volatile uint32_t open_count = 0; - volatile uint32_t setattr_count = 0; + volatile uint32_t flush_count = 0; + volatile uint32_t release_count = 0; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; - args.enable_write_ops = 0; args.stop_on_destroy = 1; - args.open_count = &open_count; - args.setattr_count = &setattr_count; - args.last_open_in_flags = &last_open_flags; - args.init_out_flags_override = FUSE_INIT_EXT | FUSE_MAX_PAGES | FUSE_ATOMIC_O_TRUNC; + args.flush_count = &flush_count; + 
args.release_count = &release_count; + args.force_flush_errno = EIO; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -1810,21 +1856,30 @@ static int ext_test_atomic_otrunc_uses_open_without_setattr() { char path[256]; snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, requested); + f = open(path, O_RDONLY); if (f < 0) { printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - close(f); - f = -1; - usleep(100 * 1000); - if (open_count != 1 || (last_open_flags & O_TRUNC) == 0) { - printf("[FAIL] open counters/flags open=%u flags=0%o\n", open_count, last_open_flags); + errno = 0; + oldfd = f; + rc = close(f); + f = -1; + if (rc != -1 || errno != EIO) { + printf("[FAIL] close should return EIO rc=%d errno=%d\n", rc, errno); goto fail; } - if (setattr_count != 0) { - printf("[FAIL] atomic O_TRUNC unexpectedly sent SETATTR count=%u\n", setattr_count); + errno = 0; + if (read(oldfd, &tmp, 1) != -1 || errno != EBADF) { + printf("[FAIL] close error must still close fd read_errno=%d\n", errno); + goto fail; + } + + usleep(100 * 1000); + if (flush_count != 1 || release_count != 1) { + printf("[FAIL] close flush error counters flush=%u release=%u\n", flush_count, + release_count); goto fail; } @@ -1851,8 +1906,8 @@ static int ext_test_atomic_otrunc_uses_open_without_setattr() { return -1; } -static int ext_test_ftruncate_setattr_uses_open_fh() { - const char *mp = "/tmp/test_fuse_ftruncate_fh"; +static int ext_test_flush_enosys_cached_success() { + const char *mp = "/tmp/test_fuse_flush_enosys"; int f = -1; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); @@ -1868,45 +1923,16 @@ static int ext_test_ftruncate_setattr_uses_open_fh() { volatile int stop = 0; volatile int init_done = 0; - volatile uint32_t open_count = 0; - volatile uint32_t setattr_count = 0; - volatile uint32_t fallocate_count = 0; - volatile uint32_t write_count = 0; - 
volatile uint64_t last_open_fh = 0; - volatile uint32_t last_setattr_valid = 0; - volatile uint64_t last_setattr_fh = 0; - volatile uint64_t last_setattr_size = 0; - volatile uint64_t last_setattr_lock_owner = 0; - volatile uint64_t last_fallocate_fh = 0; - volatile uint64_t last_fallocate_offset = 0; - volatile uint64_t last_fallocate_length = 0; - volatile uint32_t last_fallocate_mode = 0; - volatile uint64_t last_write_offset = 0; - volatile uint32_t last_write_size = 0; + volatile uint32_t flush_count = 0; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; - args.enable_write_ops = 1; args.stop_on_destroy = 1; - args.open_count = &open_count; - args.setattr_count = &setattr_count; - args.fallocate_count = &fallocate_count; - args.write_count = &write_count; - args.last_open_fh = &last_open_fh; - args.last_setattr_valid = &last_setattr_valid; - args.last_setattr_fh = &last_setattr_fh; - args.last_setattr_size = &last_setattr_size; - args.last_setattr_lock_owner = &last_setattr_lock_owner; - args.last_fallocate_fh = &last_fallocate_fh; - args.last_fallocate_offset = &last_fallocate_offset; - args.last_fallocate_length = &last_fallocate_length; - args.last_fallocate_mode = &last_fallocate_mode; - args.last_write_offset = &last_write_offset; - args.last_write_size = &last_write_size; - args.next_open_fh = 940; + args.flush_count = &flush_count; + args.force_flush_errno = ENOSYS; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -1933,162 +1959,23 @@ static int ext_test_ftruncate_setattr_uses_open_fh() { char path[256]; snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDWR); - if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); - goto fail; - } - if (ftruncate(f, 7) != 0) { - printf("[FAIL] ftruncate: %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - close(f); - f = -1; - - usleep(100 * 1000); - 
if (open_count != 1 || setattr_count != 1) { - printf("[FAIL] counters open=%u setattr=%u\n", open_count, setattr_count); - goto fail; - } - if ((last_setattr_valid & FATTR_SIZE) == 0 || (last_setattr_valid & FATTR_FH) == 0 || - (last_setattr_valid & FATTR_LOCKOWNER) == 0 || last_setattr_fh != 940 || - last_setattr_size != 7 || last_setattr_lock_owner == 0) { - printf("[FAIL] setattr valid=0x%x fh=%llu size=%llu lock_owner=%llu\n", - last_setattr_valid, (unsigned long long)last_setattr_fh, - (unsigned long long)last_setattr_size, - (unsigned long long)last_setattr_lock_owner); - goto fail; - } - - last_setattr_valid = 0; - last_setattr_fh = 0; - last_setattr_size = 0; - last_setattr_lock_owner = 0; - if (truncate(path, 5) != 0) { - printf("[FAIL] truncate(path): %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - usleep(100 * 1000); - if (setattr_count != 2) { - printf("[FAIL] path truncate setattr_count=%u\n", setattr_count); - goto fail; - } - if ((last_setattr_valid & FATTR_SIZE) == 0 || (last_setattr_valid & FATTR_FH) != 0 || - (last_setattr_valid & FATTR_LOCKOWNER) == 0 || last_setattr_size != 5 || - last_setattr_lock_owner == 0) { - printf("[FAIL] path setattr valid=0x%x fh=%llu size=%llu lock_owner=%llu\n", - last_setattr_valid, (unsigned long long)last_setattr_fh, - (unsigned long long)last_setattr_size, - (unsigned long long)last_setattr_lock_owner); - goto fail; - } - - last_setattr_valid = 0; - last_setattr_fh = 0; - last_setattr_size = 0; - last_setattr_lock_owner = 0; - f = open(path, O_RDWR | O_TRUNC); - if (f < 0) { - printf("[FAIL] open(O_TRUNC): %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - close(f); - f = -1; - usleep(100 * 1000); - if (setattr_count != 3) { - printf("[FAIL] open(O_TRUNC) setattr_count=%u\n", setattr_count); - goto fail; - } - if ((last_setattr_valid & FATTR_SIZE) == 0 || (last_setattr_valid & FATTR_FH) != 0 || - (last_setattr_valid & FATTR_LOCKOWNER) == 0 || last_setattr_size != 0 || - 
last_setattr_lock_owner == 0) { - printf("[FAIL] open truncate setattr valid=0x%x fh=%llu size=%llu lock_owner=%llu\n", - last_setattr_valid, (unsigned long long)last_setattr_fh, - (unsigned long long)last_setattr_size, - (unsigned long long)last_setattr_lock_owner); - goto fail; - } - - setattr_count = 0; - fallocate_count = 0; - last_open_fh = 0; - last_fallocate_fh = 0; - last_fallocate_offset = 0; - last_fallocate_length = 0; - last_fallocate_mode = 0; - f = open(path, O_RDWR); - if (f < 0) { - printf("[FAIL] open for fallocate: %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - if (syscall(SYS_fallocate, f, 0, 0, 16) != 0) { - printf("[FAIL] fallocate: %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - close(f); - f = -1; - usleep(100 * 1000); - if (setattr_count != 0 || fallocate_count != 1 || last_fallocate_fh != last_open_fh || - last_fallocate_offset != 0 || last_fallocate_length != 16 || last_fallocate_mode != 0) { - printf("[FAIL] fallocate counters setattr=%u fallocate=%u fh=%llu open_fh=%llu " - "offset=%llu length=%llu mode=%u\n", - setattr_count, fallocate_count, (unsigned long long)last_fallocate_fh, - (unsigned long long)last_open_fh, (unsigned long long)last_fallocate_offset, - (unsigned long long)last_fallocate_length, last_fallocate_mode); - goto fail; - } - struct stat st; - if (stat(path, &st) != 0 || st.st_size != 16) { - printf("[FAIL] stat after fallocate rc/size errno=%d (%s) size=%lld\n", errno, - strerror(errno), (long long)st.st_size); - goto fail; - } - - setattr_count = 0; - fallocate_count = 0; - f = open(path, O_RDWR); - if (f < 0) { - printf("[FAIL] open for fallocate overflow: %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - if (syscall(SYS_fallocate, f, 0, INT64_MAX - 1, 4) == 0 || errno != EFBIG) { - printf("[FAIL] fallocate overflow expected EFBIG, errno=%d (%s)\n", errno, - strerror(errno)); - goto fail; - } - close(f); - f = -1; - usleep(100 * 1000); - if (setattr_count != 0 || 
fallocate_count != 0) { - printf("[FAIL] fallocate overflow sent requests setattr=%u fallocate=%u\n", setattr_count, - fallocate_count); - goto fail; + for (int i = 0; i < 2; ++i) { + f = open(path, O_RDONLY); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + goto fail; + } + if (close(f) != 0) { + printf("[FAIL] close after FLUSH ENOSYS: %s (errno=%d)\n", strerror(errno), errno); + f = -1; + goto fail; + } + f = -1; } - setattr_count = 0; - last_setattr_valid = 0; - last_setattr_fh = 0; - last_setattr_size = 0; - last_setattr_lock_owner = 0; - write_count = 0; - last_write_offset = 0; - last_write_size = 0; - f = open(path, O_RDWR); - if (f < 0) { - printf("[FAIL] open for pwrite: %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - if (pwrite(f, "xy", 2, 9) != 2) { - printf("[FAIL] pwrite hole: %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - close(f); - f = -1; usleep(100 * 1000); - if (setattr_count != 0 || write_count != 1 || last_write_offset != 9 || last_write_size != 2) { - printf("[FAIL] pwrite hole counters setattr=%u write=%u offset=%llu size=%u\n", - setattr_count, write_count, (unsigned long long)last_write_offset, - last_write_size); + if (flush_count != 1) { + printf("[FAIL] FLUSH ENOSYS should be cached, flush_count=%u\n", flush_count); goto fail; } @@ -2115,8 +2002,12 @@ static int ext_test_ftruncate_setattr_uses_open_fh() { return -1; } -static int ext_test_init_requests_linux_no_open_support() { - const char *mp = "/tmp/test_fuse_init_flags"; +static int ext_test_fopen_nonseekable_mode(uint32_t open_out_flags, const char *mp, + int expect_stream) { + int f = -1; + char buf[8]; + ssize_t n = -1; + volatile uint64_t last_write_offset = UINT64_MAX; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); return -1; @@ -2131,15 +2022,16 @@ static int ext_test_init_requests_linux_no_open_support() { volatile int stop = 0; volatile int 
init_done = 0; - volatile uint32_t init_flags = 0; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; + args.enable_write_ops = 1; args.stop_on_destroy = 1; - args.init_in_flags = &init_flags; + args.hello_open_out_flags = open_out_flags; + args.last_write_offset = &last_write_offset; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -2164,12 +2056,60 @@ static int ext_test_init_requests_linux_no_open_support() { goto fail; } - if ((init_flags & FUSE_NO_OPEN_SUPPORT) == 0 || - (init_flags & FUSE_NO_OPENDIR_SUPPORT) == 0) { - printf("[FAIL] INIT flags missing no-open support bits: flags=0x%x\n", init_flags); + char path[256]; + snprintf(path, sizeof(path), "%s/hello.txt", mp); + f = open(path, O_RDWR); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + goto fail; + } + + errno = 0; + if (lseek(f, 0, SEEK_SET) >= 0 || errno != ESPIPE) { + printf("[FAIL] lseek expected ESPIPE, ret errno=%d (%s)\n", errno, strerror(errno)); + goto fail; + } + errno = 0; + if (pread(f, buf, 1, 0) >= 0 || errno != ESPIPE) { + printf("[FAIL] pread expected ESPIPE, errno=%d (%s)\n", errno, strerror(errno)); + goto fail; + } + errno = 0; + if (pwrite(f, "x", 1, 0) >= 0 || errno != ESPIPE) { + printf("[FAIL] pwrite expected ESPIPE, errno=%d (%s)\n", errno, strerror(errno)); + goto fail; + } + + memset(buf, 0, sizeof(buf)); + if (read(f, buf, 5) != 5 || memcmp(buf, "hello", 5) != 0) { + printf("[FAIL] ordinary read failed got='%.*s' errno=%d\n", 5, buf, errno); + goto fail; + } + memset(buf, 0, sizeof(buf)); + n = read(f, buf, 5); + if (expect_stream) { + if (n != 5 || memcmp(buf, "hello", 5) != 0) { + printf("[FAIL] stream read did not restart at offset 0 got n=%zd data='%.*s' errno=%d\n", + n, 5, buf, errno); + goto fail; + } + if (write(f, "Z", 1) != 1) { + printf("[FAIL] stream write failed: %s (errno=%d)\n", strerror(errno), errno); + goto 
fail; + } + if (last_write_offset != 0) { + printf("[FAIL] stream write offset expected 0 got %llu\n", + (unsigned long long)last_write_offset); + goto fail; + } + } else if (n != 5 || memcmp(buf, " from", 5) != 0) { + printf("[FAIL] nonseekable sequential read should advance offset got n=%zd data='%.*s'\n", n, + 5, buf); goto fail; } + close(f); + f = -1; if (umount(mp) != 0) { printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); goto fail_no_umount; @@ -2181,6 +2121,9 @@ static int ext_test_init_requests_linux_no_open_support() { return 0; fail: + if (f >= 0) { + close(f); + } umount(mp); fail_no_umount: stop = 1; @@ -2190,8 +2133,8 @@ static int ext_test_init_requests_linux_no_open_support() { return -1; } -static int ext_test_p4_subtype_mount() { - const char *mp = "/tmp/test_fuse_p4_subtype"; +static int ext_test_fopen_nonseekable_dir_mode(uint32_t open_out_flags, const char *mp) { + int f = -1; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); return -1; @@ -2206,16 +2149,25 @@ static int ext_test_p4_subtype_mount() { volatile int stop = 0; volatile int init_done = 0; + volatile uint32_t releasedir_count = 0; + volatile uint32_t last_releasedir_uid = UINT32_MAX; + volatile uint32_t last_releasedir_gid = UINT32_MAX; + volatile uint32_t last_releasedir_pid = UINT32_MAX; + struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; - args.enable_write_ops = 0; args.stop_on_destroy = 1; - - pthread_t th; - if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + args.root_open_out_flags = open_out_flags; + args.releasedir_count = &releasedir_count; + args.last_releasedir_uid = &last_releasedir_uid; + args.last_releasedir_gid = &last_releasedir_gid; + args.last_releasedir_pid = &last_releasedir_pid; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { printf("[FAIL] pthread_create\n"); 
close(fd); rmdir(mp); @@ -2224,222 +2176,754 @@ static int ext_test_p4_subtype_mount() { char opts[256]; snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", fd); - if (mount("none", mp, "fuse.fuse3_demo", 0, opts) != 0) { - printf("[FAIL] mount(fuse.fuse3_demo): %s (errno=%d)\n", strerror(errno), errno); + if (mount("none", mp, "fuse", 0, opts) != 0) { + printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); stop = 1; close(fd); pthread_join(th, NULL); rmdir(mp); return -1; } - - for (int i = 0; i < 200; i++) { - if (init_done) { - break; - } - usleep(10 * 1000); - } - if (!init_done) { + if (fuseg_wait_init(&init_done) != 0) { printf("[FAIL] init handshake timeout\n"); - umount(mp); - stop = 1; - close(fd); - pthread_join(th, NULL); - rmdir(mp); - return -1; + goto fail; } - char file_path[256]; - snprintf(file_path, sizeof(file_path), "%s/hello.txt", mp); + f = open(mp, O_RDONLY | O_DIRECTORY); + if (f < 0) { + printf("[FAIL] open(%s, O_DIRECTORY): %s (errno=%d)\n", mp, strerror(errno), errno); + goto fail; + } - char buf[128]; - if (fuseg_read_file_cstr(file_path, buf, sizeof(buf)) < 0) { - printf("[FAIL] read(%s): %s (errno=%d)\n", file_path, strerror(errno), errno); - umount(mp); - stop = 1; - close(fd); - pthread_join(th, NULL); - rmdir(mp); - return -1; + errno = 0; + if (lseek(f, 0, SEEK_SET) >= 0 || errno != ESPIPE) { + printf("[FAIL] dir lseek expected ESPIPE, errno=%d (%s)\n", errno, strerror(errno)); + goto fail; } - if (strcmp(buf, "hello from fuse\n") != 0) { - printf("[FAIL] content mismatch: got='%s'\n", buf); - umount(mp); - stop = 1; - close(fd); - pthread_join(th, NULL); - rmdir(mp); - return -1; + + close(f); + f = -1; + usleep(100 * 1000); + + if (releasedir_count != 1 || last_releasedir_uid != 0 || last_releasedir_gid != 0 || + last_releasedir_pid != 0) { + printf("[FAIL] releasedir nocreds count=%u uid=%u gid=%u pid=%u\n", releasedir_count, + last_releasedir_uid, last_releasedir_gid, 
last_releasedir_pid); + goto fail; } if (umount(mp) != 0) { printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); - stop = 1; - close(fd); - pthread_join(th, NULL); - rmdir(mp); - return -1; + goto fail_no_umount; } - stop = 1; close(fd); pthread_join(th, NULL); rmdir(mp); return 0; -} - -static int ext_run_child_drop_priv_and_stat(const char *mp, int expect_errno, int expect_success) { - pid_t pid = fork(); - if (pid < 0) { - return -1; - } - if (pid == 0) { - if (setgid(1000) != 0) { - _exit(30); - } - if (setuid(1000) != 0) { - _exit(31); - } - - struct stat st; - int r = stat(mp, &st); - if (expect_success) { - if (r != 0) - _exit(10); - char p[256]; - snprintf(p, sizeof(p), "%s/hello.txt", mp); - int fd = open(p, O_RDONLY); - if (fd < 0) - _exit(11); - char buf[64]; - ssize_t n = read(fd, buf, sizeof(buf) - 1); - close(fd); - if (n < 0) - _exit(12); - buf[n] = '\0'; - if (strcmp(buf, "hello from fuse\n") != 0) - _exit(13); - _exit(0); - } - - if (r != 0 && errno == expect_errno) { - _exit(0); - } - if (r != 0) { - _exit(21); - } - - /* - * Linux 语义下,目录本身的 stat 可能成功;真正的拒绝点通常体现在 - * 访问目录内对象(例如 open/stat 子路径)。 - */ - char p[256]; - snprintf(p, sizeof(p), "%s/hello.txt", mp); - int fd = open(p, O_RDONLY); - if (fd >= 0) { - close(fd); - _exit(22); - } - if (errno != expect_errno) { - _exit(23); - } - _exit(0); - } - int status = 0; - if (waitpid(pid, &status, 0) < 0) { - return -1; - } - if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { - errno = ECHILD; - return -1; +fail: + if (f >= 0) { + close(f); } - return 0; + umount(mp); +fail_no_umount: + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; } -static int ext_run_permission_case(const char *mp, const char *opts, uint32_t root_mode_override, - uint32_t hello_mode_override, int expect_errno, - int expect_success) { +static int ext_test_atomic_otrunc_uses_open_without_setattr() { + const char *mp = "/tmp/test_fuse_atomic_otrunc"; + int requested = O_RDWR | 
O_TRUNC; + int f = -1; if (ensure_dir(mp) != 0) { + printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); return -1; } int fd = open("/dev/fuse", O_RDWR); if (fd < 0) { + printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); rmdir(mp); return -1; } volatile int stop = 0; volatile int init_done = 0; + volatile uint32_t last_open_flags = 0; + volatile uint32_t open_count = 0; + volatile uint32_t setattr_count = 0; + struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; args.enable_write_ops = 0; - args.exit_after_init = 0; - args.root_mode_override = root_mode_override; - args.hello_mode_override = hello_mode_override; + args.stop_on_destroy = 1; + args.open_count = &open_count; + args.setattr_count = &setattr_count; + args.last_open_in_flags = &last_open_flags; + args.init_out_flags_override = FUSE_INIT_EXT | FUSE_MAX_PAGES | FUSE_ATOMIC_O_TRUNC; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); close(fd); rmdir(mp); return -1; } - char full_opts[512]; - snprintf(full_opts, sizeof(full_opts), "fd=%d,%s", fd, opts); - if (mount("none", mp, "fuse", 0, full_opts) != 0) { + char opts[256]; + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", fd); + if (mount("none", mp, "fuse", 0, opts) != 0) { + printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); stop = 1; close(fd); pthread_join(th, NULL); rmdir(mp); return -1; } - if (fuseg_wait_init(&init_done) != 0) { - umount(mp); - stop = 1; - close(fd); - pthread_join(th, NULL); - rmdir(mp); - return -1; + printf("[FAIL] init handshake timeout\n"); + goto fail; } - if (ext_run_child_drop_priv_and_stat(mp, expect_errno, expect_success) != 0) { - umount(mp); - stop = 1; - close(fd); - pthread_join(th, NULL); - rmdir(mp); - return -1; + char path[256]; + snprintf(path, sizeof(path), "%s/hello.txt", mp); + f = 
open(path, requested); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + goto fail; } + close(f); + f = -1; - umount(mp); - rmdir(mp); + usleep(100 * 1000); + if (open_count != 1 || (last_open_flags & O_TRUNC) == 0) { + printf("[FAIL] open counters/flags open=%u flags=0%o\n", open_count, last_open_flags); + goto fail; + } + if (setattr_count != 0) { + printf("[FAIL] atomic O_TRUNC unexpectedly sent SETATTR count=%u\n", setattr_count); + goto fail; + } + + if (umount(mp) != 0) { + printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + goto fail_no_umount; + } stop = 1; close(fd); pthread_join(th, NULL); + rmdir(mp); return 0; + +fail: + if (f >= 0) { + close(f); + } + umount(mp); +fail_no_umount: + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; } -static int ext_test_permissions() { - const uint32_t DIR_NO_PERM = 0040000; - const uint32_t REG_NO_PERM = 0100000; +static int ext_test_ftruncate_setattr_uses_open_fh() { + const char *mp = "/tmp/test_fuse_ftruncate_fh"; + int f = -1; + if (ensure_dir(mp) != 0) { + printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + return -1; + } - { - const char *mp = "/tmp/test_fuse_perm_owner"; - if (ext_run_permission_case(mp, "rootmode=040755,user_id=0,group_id=0", 0, 0, EACCES, 0) != - 0) { - printf("[FAIL] mount owner restriction\n"); - return -1; - } + int fd = open("/dev/fuse", O_RDWR); + if (fd < 0) { + printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + rmdir(mp); + return -1; } - { - const char *mp = "/tmp/test_fuse_perm_default"; + volatile int stop = 0; + volatile int init_done = 0; + volatile uint32_t open_count = 0; + volatile uint32_t setattr_count = 0; + volatile uint32_t fallocate_count = 0; + volatile uint32_t write_count = 0; + volatile uint64_t last_open_fh = 0; + volatile uint32_t last_setattr_valid = 0; + volatile uint64_t last_setattr_fh = 0; + volatile uint64_t 
last_setattr_size = 0; + volatile uint64_t last_setattr_lock_owner = 0; + volatile uint64_t last_fallocate_fh = 0; + volatile uint64_t last_fallocate_offset = 0; + volatile uint64_t last_fallocate_length = 0; + volatile uint32_t last_fallocate_mode = 0; + volatile uint64_t last_write_offset = 0; + volatile uint32_t last_write_size = 0; + + struct fuse_daemon_args args; + memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.enable_write_ops = 1; + args.stop_on_destroy = 1; + args.open_count = &open_count; + args.setattr_count = &setattr_count; + args.fallocate_count = &fallocate_count; + args.write_count = &write_count; + args.last_open_fh = &last_open_fh; + args.last_setattr_valid = &last_setattr_valid; + args.last_setattr_fh = &last_setattr_fh; + args.last_setattr_size = &last_setattr_size; + args.last_setattr_lock_owner = &last_setattr_lock_owner; + args.last_fallocate_fh = &last_fallocate_fh; + args.last_fallocate_offset = &last_fallocate_offset; + args.last_fallocate_length = &last_fallocate_length; + args.last_fallocate_mode = &last_fallocate_mode; + args.last_write_offset = &last_write_offset; + args.last_write_size = &last_write_size; + args.next_open_fh = 940; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); + close(fd); + rmdir(mp); + return -1; + } + + char opts[256]; + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", fd); + if (mount("none", mp, "fuse", 0, opts) != 0) { + printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + if (fuseg_wait_init(&init_done) != 0) { + printf("[FAIL] init handshake timeout\n"); + goto fail; + } + + char path[256]; + snprintf(path, sizeof(path), "%s/hello.txt", mp); + f = open(path, O_RDWR); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), 
errno); + goto fail; + } + if (ftruncate(f, 7) != 0) { + printf("[FAIL] ftruncate: %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + close(f); + f = -1; + + usleep(100 * 1000); + if (open_count != 1 || setattr_count != 1) { + printf("[FAIL] counters open=%u setattr=%u\n", open_count, setattr_count); + goto fail; + } + if ((last_setattr_valid & FATTR_SIZE) == 0 || (last_setattr_valid & FATTR_FH) == 0 || + (last_setattr_valid & FATTR_LOCKOWNER) == 0 || last_setattr_fh != 940 || + last_setattr_size != 7 || last_setattr_lock_owner == 0) { + printf("[FAIL] setattr valid=0x%x fh=%llu size=%llu lock_owner=%llu\n", + last_setattr_valid, (unsigned long long)last_setattr_fh, + (unsigned long long)last_setattr_size, + (unsigned long long)last_setattr_lock_owner); + goto fail; + } + + last_setattr_valid = 0; + last_setattr_fh = 0; + last_setattr_size = 0; + last_setattr_lock_owner = 0; + if (truncate(path, 5) != 0) { + printf("[FAIL] truncate(path): %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + usleep(100 * 1000); + if (setattr_count != 2) { + printf("[FAIL] path truncate setattr_count=%u\n", setattr_count); + goto fail; + } + if ((last_setattr_valid & FATTR_SIZE) == 0 || (last_setattr_valid & FATTR_FH) != 0 || + (last_setattr_valid & FATTR_LOCKOWNER) == 0 || last_setattr_size != 5 || + last_setattr_lock_owner == 0) { + printf("[FAIL] path setattr valid=0x%x fh=%llu size=%llu lock_owner=%llu\n", + last_setattr_valid, (unsigned long long)last_setattr_fh, + (unsigned long long)last_setattr_size, + (unsigned long long)last_setattr_lock_owner); + goto fail; + } + + last_setattr_valid = 0; + last_setattr_fh = 0; + last_setattr_size = 0; + last_setattr_lock_owner = 0; + f = open(path, O_RDWR | O_TRUNC); + if (f < 0) { + printf("[FAIL] open(O_TRUNC): %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + close(f); + f = -1; + usleep(100 * 1000); + if (setattr_count != 3) { + printf("[FAIL] open(O_TRUNC) setattr_count=%u\n", setattr_count); + goto 
fail; + } + if ((last_setattr_valid & FATTR_SIZE) == 0 || (last_setattr_valid & FATTR_FH) != 0 || + (last_setattr_valid & FATTR_LOCKOWNER) == 0 || last_setattr_size != 0 || + last_setattr_lock_owner == 0) { + printf("[FAIL] open truncate setattr valid=0x%x fh=%llu size=%llu lock_owner=%llu\n", + last_setattr_valid, (unsigned long long)last_setattr_fh, + (unsigned long long)last_setattr_size, + (unsigned long long)last_setattr_lock_owner); + goto fail; + } + + setattr_count = 0; + fallocate_count = 0; + last_open_fh = 0; + last_fallocate_fh = 0; + last_fallocate_offset = 0; + last_fallocate_length = 0; + last_fallocate_mode = 0; + f = open(path, O_RDWR); + if (f < 0) { + printf("[FAIL] open for fallocate: %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + if (syscall(SYS_fallocate, f, 0, 0, 16) != 0) { + printf("[FAIL] fallocate: %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + close(f); + f = -1; + usleep(100 * 1000); + if (setattr_count != 0 || fallocate_count != 1 || last_fallocate_fh != last_open_fh || + last_fallocate_offset != 0 || last_fallocate_length != 16 || last_fallocate_mode != 0) { + printf("[FAIL] fallocate counters setattr=%u fallocate=%u fh=%llu open_fh=%llu " + "offset=%llu length=%llu mode=%u\n", + setattr_count, fallocate_count, (unsigned long long)last_fallocate_fh, + (unsigned long long)last_open_fh, (unsigned long long)last_fallocate_offset, + (unsigned long long)last_fallocate_length, last_fallocate_mode); + goto fail; + } + struct stat st; + if (stat(path, &st) != 0 || st.st_size != 16) { + printf("[FAIL] stat after fallocate rc/size errno=%d (%s) size=%lld\n", errno, + strerror(errno), (long long)st.st_size); + goto fail; + } + + setattr_count = 0; + fallocate_count = 0; + f = open(path, O_RDWR); + if (f < 0) { + printf("[FAIL] open for fallocate overflow: %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + if (syscall(SYS_fallocate, f, 0, INT64_MAX - 1, 4) == 0 || errno != EFBIG) { + printf("[FAIL] 
fallocate overflow expected EFBIG, errno=%d (%s)\n", errno, + strerror(errno)); + goto fail; + } + close(f); + f = -1; + usleep(100 * 1000); + if (setattr_count != 0 || fallocate_count != 0) { + printf("[FAIL] fallocate overflow sent requests setattr=%u fallocate=%u\n", setattr_count, + fallocate_count); + goto fail; + } + + setattr_count = 0; + last_setattr_valid = 0; + last_setattr_fh = 0; + last_setattr_size = 0; + last_setattr_lock_owner = 0; + write_count = 0; + last_write_offset = 0; + last_write_size = 0; + f = open(path, O_RDWR); + if (f < 0) { + printf("[FAIL] open for pwrite: %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + if (pwrite(f, "xy", 2, 9) != 2) { + printf("[FAIL] pwrite hole: %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + close(f); + f = -1; + usleep(100 * 1000); + if (setattr_count != 0 || write_count != 1 || last_write_offset != 9 || last_write_size != 2) { + printf("[FAIL] pwrite hole counters setattr=%u write=%u offset=%llu size=%u\n", + setattr_count, write_count, (unsigned long long)last_write_offset, + last_write_size); + goto fail; + } + + if (umount(mp) != 0) { + printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + goto fail_no_umount; + } + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return 0; + +fail: + if (f >= 0) { + close(f); + } + umount(mp); +fail_no_umount: + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; +} + +static int ext_test_init_requests_linux_no_open_support() { + const char *mp = "/tmp/test_fuse_init_flags"; + if (ensure_dir(mp) != 0) { + printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + return -1; + } + + int fd = open("/dev/fuse", O_RDWR); + if (fd < 0) { + printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + rmdir(mp); + return -1; + } + + volatile int stop = 0; + volatile int init_done = 0; + volatile uint32_t init_flags = 0; + + struct fuse_daemon_args args; + 
memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.stop_on_destroy = 1; + args.init_in_flags = &init_flags; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); + close(fd); + rmdir(mp); + return -1; + } + + char opts[256]; + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", fd); + if (mount("none", mp, "fuse", 0, opts) != 0) { + printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + if (fuseg_wait_init(&init_done) != 0) { + printf("[FAIL] init handshake timeout\n"); + goto fail; + } + + if ((init_flags & FUSE_NO_OPEN_SUPPORT) == 0 || + (init_flags & FUSE_NO_OPENDIR_SUPPORT) == 0) { + printf("[FAIL] INIT flags missing no-open support bits: flags=0x%x\n", init_flags); + goto fail; + } + + if (umount(mp) != 0) { + printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + goto fail_no_umount; + } + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return 0; + +fail: + umount(mp); +fail_no_umount: + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; +} + +static int ext_test_p4_subtype_mount() { + const char *mp = "/tmp/test_fuse_p4_subtype"; + if (ensure_dir(mp) != 0) { + printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + return -1; + } + + int fd = open("/dev/fuse", O_RDWR); + if (fd < 0) { + printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + rmdir(mp); + return -1; + } + + volatile int stop = 0; + volatile int init_done = 0; + struct fuse_daemon_args args; + memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.enable_write_ops = 0; + args.stop_on_destroy = 1; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] 
pthread_create\n"); + close(fd); + rmdir(mp); + return -1; + } + + char opts[256]; + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", fd); + if (mount("none", mp, "fuse.fuse3_demo", 0, opts) != 0) { + printf("[FAIL] mount(fuse.fuse3_demo): %s (errno=%d)\n", strerror(errno), errno); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + + for (int i = 0; i < 200; i++) { + if (init_done) { + break; + } + usleep(10 * 1000); + } + if (!init_done) { + printf("[FAIL] init handshake timeout\n"); + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + + char file_path[256]; + snprintf(file_path, sizeof(file_path), "%s/hello.txt", mp); + + char buf[128]; + if (fuseg_read_file_cstr(file_path, buf, sizeof(buf)) < 0) { + printf("[FAIL] read(%s): %s (errno=%d)\n", file_path, strerror(errno), errno); + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + if (strcmp(buf, "hello from fuse\n") != 0) { + printf("[FAIL] content mismatch: got='%s'\n", buf); + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + + if (umount(mp) != 0) { + printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return 0; +} + +static int ext_run_child_drop_priv_and_stat(const char *mp, int expect_errno, int expect_success) { + pid_t pid = fork(); + if (pid < 0) { + return -1; + } + if (pid == 0) { + if (setgid(1000) != 0) { + _exit(30); + } + if (setuid(1000) != 0) { + _exit(31); + } + + struct stat st; + int r = stat(mp, &st); + if (expect_success) { + if (r != 0) + _exit(10); + char p[256]; + snprintf(p, sizeof(p), "%s/hello.txt", mp); + int fd = open(p, O_RDONLY); + if (fd < 0) + _exit(11); + char buf[64]; + ssize_t n = read(fd, buf, sizeof(buf) - 1); + 
close(fd); + if (n < 0) + _exit(12); + buf[n] = '\0'; + if (strcmp(buf, "hello from fuse\n") != 0) + _exit(13); + _exit(0); + } + + if (r != 0 && errno == expect_errno) { + _exit(0); + } + if (r != 0) { + _exit(21); + } + + /* + * Linux 语义下,目录本身的 stat 可能成功;真正的拒绝点通常体现在 + * 访问目录内对象(例如 open/stat 子路径)。 + */ + char p[256]; + snprintf(p, sizeof(p), "%s/hello.txt", mp); + int fd = open(p, O_RDONLY); + if (fd >= 0) { + close(fd); + _exit(22); + } + if (errno != expect_errno) { + _exit(23); + } + _exit(0); + } + + int status = 0; + if (waitpid(pid, &status, 0) < 0) { + return -1; + } + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + errno = ECHILD; + return -1; + } + return 0; +} + +static int ext_run_permission_case(const char *mp, const char *opts, uint32_t root_mode_override, + uint32_t hello_mode_override, int expect_errno, + int expect_success) { + if (ensure_dir(mp) != 0) { + return -1; + } + + int fd = open("/dev/fuse", O_RDWR); + if (fd < 0) { + rmdir(mp); + return -1; + } + + volatile int stop = 0; + volatile int init_done = 0; + struct fuse_daemon_args args; + memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.enable_write_ops = 0; + args.exit_after_init = 0; + args.root_mode_override = root_mode_override; + args.hello_mode_override = hello_mode_override; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + close(fd); + rmdir(mp); + return -1; + } + + char full_opts[512]; + snprintf(full_opts, sizeof(full_opts), "fd=%d,%s", fd, opts); + if (mount("none", mp, "fuse", 0, full_opts) != 0) { + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + + if (fuseg_wait_init(&init_done) != 0) { + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + + if (ext_run_child_drop_priv_and_stat(mp, expect_errno, expect_success) != 0) { + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + 
return -1; + } + + umount(mp); + rmdir(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + return 0; +} + +static int ext_test_permissions() { + const uint32_t DIR_NO_PERM = 0040000; + const uint32_t REG_NO_PERM = 0100000; + + { + const char *mp = "/tmp/test_fuse_perm_owner"; + if (ext_run_permission_case(mp, "rootmode=040755,user_id=0,group_id=0", 0, 0, EACCES, 0) != + 0) { + printf("[FAIL] mount owner restriction\n"); + return -1; + } + } + + { + const char *mp = "/tmp/test_fuse_perm_default"; if (ext_run_permission_case( mp, "rootmode=040000,user_id=0,group_id=0,allow_other,default_permissions", DIR_NO_PERM, REG_NO_PERM, EACCES, 0) != 0) { @@ -2448,185 +2932,917 @@ static int ext_test_permissions() { } } - { - const char *mp = "/tmp/test_fuse_perm_remote"; - if (ext_run_permission_case(mp, "rootmode=040000,user_id=0,group_id=0,allow_other", - DIR_NO_PERM, REG_NO_PERM, 0, 1) != 0) { - printf("[FAIL] remote permission model allow\n"); - return -1; - } + { + const char *mp = "/tmp/test_fuse_perm_remote"; + if (ext_run_permission_case(mp, "rootmode=040000,user_id=0,group_id=0,allow_other", + DIR_NO_PERM, REG_NO_PERM, 0, 1) != 0) { + printf("[FAIL] remote permission model allow\n"); + return -1; + } + } + + return 0; +} + +static int ext_test_clone() { + const char *mp = "/tmp/test_fuse_clone"; + DIR *d = NULL; + int found = 0; + struct dirent *de = NULL; + char p[256]; + struct stat st; + char buf[128]; + int n = -1; + if (ensure_dir(mp) != 0) { + printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + return -1; + } + + int master_fd = open("/dev/fuse", O_RDWR); + if (master_fd < 0) { + printf("[FAIL] open(/dev/fuse master): %s (errno=%d)\n", strerror(errno), errno); + rmdir(mp); + return -1; + } + + volatile int stop = 0; + volatile int init_done = 0; + + struct fuse_daemon_args master_args; + memset(&master_args, 0, sizeof(master_args)); + master_args.fd = master_fd; + master_args.stop = &stop; + master_args.init_done = &init_done; + 
master_args.enable_write_ops = 0; + master_args.exit_after_init = 1; + + pthread_t master_th; + if (pthread_create(&master_th, NULL, fuse_daemon_thread, &master_args) != 0) { + printf("[FAIL] pthread_create(master)\n"); + close(master_fd); + rmdir(mp); + return -1; + } + + char opts[256]; + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", master_fd); + if (mount("none", mp, "fuse", 0, opts) != 0) { + printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); + stop = 1; + close(master_fd); + pthread_join(master_th, NULL); + rmdir(mp); + return -1; + } + + for (int i = 0; i < 100; i++) { + if (init_done) + break; + usleep(10 * 1000); + } + if (!init_done) { + printf("[FAIL] init handshake timeout\n"); + umount(mp); + stop = 1; + close(master_fd); + pthread_join(master_th, NULL); + rmdir(mp); + return -1; + } + + pthread_join(master_th, NULL); + + int clone_fd = open("/dev/fuse", O_RDWR); + if (clone_fd < 0) { + printf("[FAIL] open(/dev/fuse clone): %s (errno=%d)\n", strerror(errno), errno); + umount(mp); + close(master_fd); + rmdir(mp); + return -1; + } + + uint32_t oldfd_u32 = (uint32_t)master_fd; + if (ioctl(clone_fd, FUSE_DEV_IOC_CLONE, &oldfd_u32) != 0) { + printf("[FAIL] ioctl(FUSE_DEV_IOC_CLONE): %s (errno=%d)\n", strerror(errno), errno); + umount(mp); + close(clone_fd); + close(master_fd); + rmdir(mp); + return -1; + } + + struct fuse_daemon_args clone_args; + memset(&clone_args, 0, sizeof(clone_args)); + clone_args.fd = clone_fd; + clone_args.stop = &stop; + clone_args.init_done = &init_done; + clone_args.enable_write_ops = 0; + clone_args.exit_after_init = 0; + + pthread_t clone_th; + if (pthread_create(&clone_th, NULL, fuse_daemon_thread, &clone_args) != 0) { + printf("[FAIL] pthread_create(clone)\n"); + umount(mp); + close(clone_fd); + close(master_fd); + rmdir(mp); + return -1; + } + + d = opendir(mp); + if (!d) { + printf("[FAIL] opendir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + goto fail; + } + found = 
0; + while ((de = readdir(d)) != NULL) { + if (strcmp(de->d_name, "hello.txt") == 0) { + found = 1; + break; + } + } + closedir(d); + if (!found) { + printf("[FAIL] readdir: hello.txt not found\n"); + goto fail; + } + + snprintf(p, sizeof(p), "%s/hello.txt", mp); + if (stat(p, &st) != 0) { + printf("[FAIL] stat(%s): %s (errno=%d)\n", p, strerror(errno), errno); + goto fail; + } + if (!S_ISREG(st.st_mode)) { + printf("[FAIL] stat: expected regular file\n"); + goto fail; + } + + n = fuseg_read_file_cstr(p, buf, sizeof(buf)); + if (n < 0) { + printf("[FAIL] read(%s): %s (errno=%d)\n", p, strerror(errno), errno); + goto fail; + } + if (strcmp(buf, "hello from fuse\n") != 0) { + printf("[FAIL] content mismatch: got='%s'\n", buf); + goto fail; + } + + umount(mp); + rmdir(mp); + stop = 1; + close(clone_fd); + close(master_fd); + pthread_join(clone_th, NULL); + return 0; + +fail: + umount(mp); + stop = 1; + close(clone_fd); + close(master_fd); + pthread_join(clone_th, NULL); + rmdir(mp); + return -1; +} + +static int ext_test_large_read_over_max_write() { + const char *mp = "/tmp/test_fuse_large_read"; + const size_t data_size = 6000; + char path[256]; + char *buf = NULL; + int n = -1; + + if (ensure_dir(mp) != 0) { + printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + return -1; + } + + int fd = open("/dev/fuse", O_RDWR); + if (fd < 0) { + printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + rmdir(mp); + return -1; + } + + volatile int stop = 0; + volatile int init_done = 0; + volatile uint32_t read_count = 0; + volatile uint64_t read_offsets[4] = {0}; + volatile uint32_t read_sizes[4] = {0}; + + struct fuse_daemon_args args; + memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.read_count = &read_count; + args.read_offsets = read_offsets; + args.read_sizes = read_sizes; + args.read_trace_capacity = 4; + args.hello_data_size_override = data_size; + + pthread_t th; + 
if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); + close(fd); + rmdir(mp); + return -1; + } + + char opts[256]; + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", + fd); + if (mount("none", mp, "fuse", 0, opts) != 0) { + printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + if (fuseg_wait_init(&init_done) != 0) { + printf("[FAIL] init handshake timeout\n"); + goto fail; + } + + buf = (char *)malloc(data_size); + if (!buf) { + printf("[FAIL] malloc read buffer\n"); + goto fail; + } + + snprintf(path, sizeof(path), "%s/hello.txt", mp); + n = fuseg_read_file(path, buf, data_size); + if (n < 0) { + printf("[FAIL] read(%s): %s (errno=%d)\n", path, strerror(errno), errno); + goto fail; + } + if ((size_t)n != data_size) { + printf("[FAIL] read size mismatch: got=%d expected=%zu read_count=%u\n", n, data_size, + read_count); + goto fail; + } + for (size_t i = 0; i < data_size; i++) { + char expected = (char)('A' + (i % 26)); + if (buf[i] != expected) { + printf("[FAIL] read data mismatch at %zu: got=%d expected=%d\n", i, buf[i], + expected); + goto fail; + } + } + if (read_count != 2 || read_offsets[0] != 0 || read_offsets[1] != 4096 || + read_sizes[0] != 4096 || read_sizes[1] > 4096 || read_sizes[1] == 0) { + printf("[FAIL] unexpected FUSE_READ split: count=%u off0=%llu size0=%u off1=%llu size1=%u\n", + read_count, (unsigned long long)read_offsets[0], read_sizes[0], + (unsigned long long)read_offsets[1], read_sizes[1]); + goto fail; + } + + free(buf); + buf = NULL; + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return 0; + +fail: + if (buf) { + free(buf); + } + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; +} + +static int ext_test_cached_read_uses_open_fh_without_extra_open() { + const char *mp = 
"/tmp/test_fuse_cached_read_fh"; + char path[256]; + char buf[32]; + int f = -1; + ssize_t n = -1; + ssize_t first_n = -1; + + if (ensure_dir(mp) != 0) { + printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + return -1; + } + + int fd = open("/dev/fuse", O_RDWR); + if (fd < 0) { + printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + rmdir(mp); + return -1; + } + + volatile int stop = 0; + volatile int init_done = 0; + volatile uint32_t open_count = 0; + volatile uint32_t read_count = 0; + volatile uint64_t read_fhs[4] = {0}; + + struct fuse_daemon_args args; + memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.open_count = &open_count; + args.read_count = &read_count; + args.read_fhs = read_fhs; + args.read_trace_capacity = 4; + args.next_open_fh = 100; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); + close(fd); + rmdir(mp); + return -1; + } + + char opts[256]; + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", + fd); + if (mount("none", mp, "fuse", 0, opts) != 0) { + printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + if (fuseg_wait_init(&init_done) != 0) { + printf("[FAIL] init handshake timeout\n"); + goto fail; + } + + snprintf(path, sizeof(path), "%s/hello.txt", mp); + f = open(path, O_RDONLY); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + goto fail; + } + n = pread(f, buf, sizeof(buf), 0); + if (n <= 0) { + printf("[FAIL] first pread got=%zd errno=%d\n", n, errno); + close(f); + goto fail; + } + first_n = n; + memset(buf, 0, sizeof(buf)); + n = pread(f, buf, sizeof(buf), 0); + close(f); + f = -1; + if (n != first_n) { + printf("[FAIL] second pread got=%zd errno=%d\n", n, errno); + goto fail; + } 
+ if (open_count != 1 || read_count != 1 || read_fhs[0] != 100) { + printf("[FAIL] cached read counters open=%u read=%u fh0=%llu\n", open_count, + read_count, (unsigned long long)read_fhs[0]); + goto fail; + } + + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return 0; + +fail: + if (f >= 0) { + close(f); + } + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; +} + +static int ext_test_cached_short_read_updates_eof() { + const char *mp = "/tmp/test_fuse_cached_short_read"; + char path[256]; + char buf[32]; + int f = -1; + ssize_t n = -1; + + if (ensure_dir(mp) != 0) { + printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + return -1; + } + + int fd = open("/dev/fuse", O_RDWR); + if (fd < 0) { + printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + rmdir(mp); + return -1; + } + + volatile int stop = 0; + volatile int init_done = 0; + volatile uint32_t read_count = 0; + volatile uint64_t read_offsets[4] = {0}; + volatile uint32_t read_sizes[4] = {0}; + + struct fuse_daemon_args args; + memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.read_count = &read_count; + args.read_offsets = read_offsets; + args.read_sizes = read_sizes; + args.read_trace_capacity = 4; + args.hello_data_size_override = 8192; + args.hello_read_size_override = 5; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); + close(fd); + rmdir(mp); + return -1; + } + + char opts[256]; + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", + fd); + if (mount("none", mp, "fuse", 0, opts) != 0) { + printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + if (fuseg_wait_init(&init_done) != 0) { + printf("[FAIL] init handshake 
timeout\n"); + goto fail; + } + + snprintf(path, sizeof(path), "%s/hello.txt", mp); + f = open(path, O_RDONLY); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + goto fail; + } + + memset(buf, 0x7f, sizeof(buf)); + n = pread(f, buf, sizeof(buf), 0); + if (n != 5 || memcmp(buf, "ABCDE", 5) != 0) { + printf("[FAIL] short cached pread got=%zd data='%.*s' read=%u errno=%d\n", n, 5, buf, + read_count, errno); + goto fail; + } + memset(buf, 0x7f, sizeof(buf)); + n = pread(f, buf, sizeof(buf), 5); + if (n != 0) { + printf("[FAIL] EOF cached pread got=%zd read=%u errno=%d\n", n, read_count, errno); + goto fail; + } + + if (read_count != 1 || read_offsets[0] != 0 || read_sizes[0] != 4096) { + printf("[FAIL] short read trace count=%u off0=%llu size0=%u\n", read_count, + (unsigned long long)read_offsets[0], read_sizes[0]); + goto fail; + } + + close(f); + f = -1; + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return 0; + +fail: + if (f >= 0) { + close(f); + } + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; +} + +static int ext_test_cached_read_sees_write_through_update() { + const char *mp = "/tmp/test_fuse_cached_read_write"; + char path[256]; + char buf[16]; + int f = -1; + + if (ensure_dir(mp) != 0) { + printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + return -1; + } + + int fd = open("/dev/fuse", O_RDWR); + if (fd < 0) { + printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + rmdir(mp); + return -1; + } + + volatile int stop = 0; + volatile int init_done = 0; + volatile uint32_t open_count = 0; + volatile uint32_t read_count = 0; + volatile uint32_t write_count = 0; + volatile uint64_t last_write_fh = 0; + + struct fuse_daemon_args args; + memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.enable_write_ops = 1; + args.open_count = &open_count; + 
args.read_count = &read_count; + args.write_count = &write_count; + args.last_write_fh = &last_write_fh; + args.next_open_fh = 300; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); + close(fd); + rmdir(mp); + return -1; + } + + char opts[256]; + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", + fd); + if (mount("none", mp, "fuse", 0, opts) != 0) { + printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + if (fuseg_wait_init(&init_done) != 0) { + printf("[FAIL] init handshake timeout\n"); + goto fail; + } + + snprintf(path, sizeof(path), "%s/hello.txt", mp); + f = open(path, O_RDWR); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + goto fail; + } + memset(buf, 0, sizeof(buf)); + if (pread(f, buf, 5, 0) != 5 || memcmp(buf, "hello", 5) != 0) { + printf("[FAIL] first cached pread got='%.*s' read=%u errno=%d\n", 5, buf, read_count, + errno); + goto fail; + } + if (pwrite(f, "CACHE", 5, 0) != 5) { + printf("[FAIL] pwrite CACHE: %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + memset(buf, 0, sizeof(buf)); + if (pread(f, buf, 5, 0) != 5 || memcmp(buf, "CACHE", 5) != 0) { + printf("[FAIL] second cached pread got='%.*s' read=%u errno=%d\n", 5, buf, read_count, + errno); + goto fail; + } + if (open_count != 1 || read_count != 1 || write_count != 1 || last_write_fh != 300) { + printf("[FAIL] cached write counters open=%u read=%u write=%u wfh=%llu\n", open_count, + read_count, write_count, (unsigned long long)last_write_fh); + goto fail; } + close(f); + f = -1; + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); return 0; + +fail: + if (f >= 0) { + close(f); + } + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; } -static int ext_test_clone() { - 
const char *mp = "/tmp/test_fuse_clone"; - DIR *d = NULL; - int found = 0; - struct dirent *de = NULL; - char p[256]; - struct stat st; - char buf[128]; - int n = -1; +static int ext_test_mmap_sees_write_through_update() { + const char *mp = "/tmp/test_fuse_mmap_write_through"; + char path[256]; + char buf[16]; + int f = -1; + void *addr = MAP_FAILED; + pid_t child = -1; + struct mmap_write_shared_state { + volatile int stop; + volatile int init_done; + volatile uint32_t open_count; + volatile uint32_t read_count; + volatile uint32_t write_count; + volatile uint64_t last_write_fh; + volatile uint64_t read_fhs[4]; + }; + struct mmap_write_shared_state *shared = + (struct mmap_write_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (shared == MAP_FAILED) { + printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); + return -1; + } + memset(shared, 0, sizeof(*shared)); + if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + munmap(shared, sizeof(*shared)); return -1; } - int master_fd = open("/dev/fuse", O_RDWR); - if (master_fd < 0) { - printf("[FAIL] open(/dev/fuse master): %s (errno=%d)\n", strerror(errno), errno); + int fd = open("/dev/fuse", O_RDWR); + if (fd < 0) { + printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - volatile int stop = 0; - volatile int init_done = 0; - - struct fuse_daemon_args master_args; - memset(&master_args, 0, sizeof(master_args)); - master_args.fd = master_fd; - master_args.stop = &stop; - master_args.init_done = &init_done; - master_args.enable_write_ops = 0; - master_args.exit_after_init = 1; + child = fork(); + if (child < 0) { + printf("[FAIL] fork fuse daemon: %s (errno=%d)\n", strerror(errno), errno); + close(fd); + munmap(shared, sizeof(*shared)); + rmdir(mp); + return -1; + } + if (child == 0) { + struct 
fuse_daemon_args child_args; + memset(&child_args, 0, sizeof(child_args)); + child_args.fd = fd; + child_args.stop = &shared->stop; + child_args.init_done = &shared->init_done; + child_args.stop_on_destroy = 1; + child_args.enable_write_ops = 1; + child_args.open_count = &shared->open_count; + child_args.read_count = &shared->read_count; + child_args.write_count = &shared->write_count; + child_args.last_write_fh = &shared->last_write_fh; + child_args.read_fhs = shared->read_fhs; + child_args.read_trace_capacity = 4; + child_args.next_open_fh = 320; + fuse_daemon_thread(&child_args); + _exit(0); + } - pthread_t master_th; - if (pthread_create(&master_th, NULL, fuse_daemon_thread, &master_args) != 0) { - printf("[FAIL] pthread_create(master)\n"); - close(master_fd); + char opts[256]; + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", + fd); + if (mount("none", mp, "fuse", 0, opts) != 0) { + printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); + shared->stop = 1; + close(fd); + kill(child, SIGTERM); + waitpid(child, NULL, 0); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } + if (fuseg_wait_init(&shared->init_done) != 0) { + printf("[FAIL] init handshake timeout\n"); + goto fail; + } + + snprintf(path, sizeof(path), "%s/hello.txt", mp); + f = open(path, O_RDWR); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + goto fail; + } + addr = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, f, 0); + if (addr == MAP_FAILED) { + printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); + goto fail; + } + if (((volatile char *)addr)[0] != 'h') { + printf("[FAIL] mmap warmup first byte got=%d\n", ((volatile char *)addr)[0]); + goto fail; + } + if (pwrite(f, "MMAP!", 5, 0) != 5) { + printf("[FAIL] pwrite MMAP!: %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + if (memcmp(addr, "MMAP!", 5) != 0) { + printf("[FAIL] mmap page did not observe write-through 
update, got='%.*s'\n", 5, + (char *)addr); + goto fail; + } + memset(buf, 0, sizeof(buf)); + if (pread(f, buf, 5, 0) != 5 || memcmp(buf, "MMAP!", 5) != 0) { + printf("[FAIL] cached pread after mmap write got='%.*s' read=%u errno=%d\n", 5, buf, + shared->read_count, errno); + goto fail; + } + if (shared->open_count != 1 || shared->read_count != 1 || shared->write_count != 1 || + shared->last_write_fh != 320 || shared->read_fhs[0] != 320) { + printf("[FAIL] mmap write-through counters open=%u read=%u write=%u rfh=%llu wfh=%llu\n", + shared->open_count, shared->read_count, shared->write_count, + (unsigned long long)shared->read_fhs[0], + (unsigned long long)shared->last_write_fh); + goto fail; + } + + munmap(addr, 4096); + addr = MAP_FAILED; + close(f); + f = -1; + umount(mp); + shared->stop = 1; + close(fd); + waitpid(child, NULL, 0); + munmap(shared, sizeof(*shared)); + rmdir(mp); + return 0; + +fail: + if (addr != MAP_FAILED) { + munmap(addr, 4096); + } + if (f >= 0) { + close(f); + } + umount(mp); + shared->stop = 1; + close(fd); + if (child > 0) { + kill(child, SIGTERM); + waitpid(child, NULL, 0); + } + munmap(shared, sizeof(*shared)); + rmdir(mp); + return -1; +} + +static int ext_test_mmap_fault_uses_open_fh_without_extra_open() { + const char *mp = "/tmp/test_fuse_mmap_fh"; + char path[256]; + int f = -1; + void *addr = MAP_FAILED; + volatile char c = 0; + pid_t child = -1; + struct mmap_shared_state { + volatile int stop; + volatile int init_done; + volatile uint32_t open_count; + volatile uint32_t read_count; + volatile uint64_t read_fhs[4]; + }; + struct mmap_shared_state *shared = + (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (shared == MAP_FAILED) { + printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); + return -1; + } + memset(shared, 0, sizeof(*shared)); - char opts[256]; - snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", 
master_fd); - if (mount("none", mp, "fuse", 0, opts) != 0) { - printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); - stop = 1; - close(master_fd); - pthread_join(master_th, NULL); - rmdir(mp); + if (ensure_dir(mp) != 0) { + printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + munmap(shared, sizeof(*shared)); return -1; } - for (int i = 0; i < 100; i++) { - if (init_done) - break; - usleep(10 * 1000); - } - if (!init_done) { - printf("[FAIL] init handshake timeout\n"); - umount(mp); - stop = 1; - close(master_fd); - pthread_join(master_th, NULL); + int fd = open("/dev/fuse", O_RDWR); + if (fd < 0) { + printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - pthread_join(master_th, NULL); - - int clone_fd = open("/dev/fuse", O_RDWR); - if (clone_fd < 0) { - printf("[FAIL] open(/dev/fuse clone): %s (errno=%d)\n", strerror(errno), errno); - umount(mp); - close(master_fd); + child = fork(); + if (child < 0) { + printf("[FAIL] fork fuse daemon: %s (errno=%d)\n", strerror(errno), errno); + close(fd); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - - uint32_t oldfd_u32 = (uint32_t)master_fd; - if (ioctl(clone_fd, FUSE_DEV_IOC_CLONE, &oldfd_u32) != 0) { - printf("[FAIL] ioctl(FUSE_DEV_IOC_CLONE): %s (errno=%d)\n", strerror(errno), errno); - umount(mp); - close(clone_fd); - close(master_fd); - rmdir(mp); - return -1; + if (child == 0) { + struct fuse_daemon_args child_args; + memset(&child_args, 0, sizeof(child_args)); + child_args.fd = fd; + child_args.stop = &shared->stop; + child_args.init_done = &shared->init_done; + child_args.stop_on_destroy = 1; + child_args.open_count = &shared->open_count; + child_args.read_count = &shared->read_count; + child_args.read_fhs = shared->read_fhs; + child_args.read_trace_capacity = 4; + child_args.next_open_fh = 200; + fuse_daemon_thread(&child_args); + _exit(0); } - struct fuse_daemon_args 
clone_args; - memset(&clone_args, 0, sizeof(clone_args)); - clone_args.fd = clone_fd; - clone_args.stop = &stop; - clone_args.init_done = &init_done; - clone_args.enable_write_ops = 0; - clone_args.exit_after_init = 0; - - pthread_t clone_th; - if (pthread_create(&clone_th, NULL, fuse_daemon_thread, &clone_args) != 0) { - printf("[FAIL] pthread_create(clone)\n"); - umount(mp); - close(clone_fd); - close(master_fd); + char opts[256]; + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", + fd); + if (mount("none", mp, "fuse", 0, opts) != 0) { + printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); + shared->stop = 1; + close(fd); + kill(child, SIGTERM); + waitpid(child, NULL, 0); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - - d = opendir(mp); - if (!d) { - printf("[FAIL] opendir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + if (fuseg_wait_init(&shared->init_done) != 0) { + printf("[FAIL] init handshake timeout\n"); goto fail; } - found = 0; - while ((de = readdir(d)) != NULL) { - if (strcmp(de->d_name, "hello.txt") == 0) { - found = 1; - break; - } - } - closedir(d); - if (!found) { - printf("[FAIL] readdir: hello.txt not found\n"); + snprintf(path, sizeof(path), "%s/hello.txt", mp); + f = open(path, O_RDONLY); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - - snprintf(p, sizeof(p), "%s/hello.txt", mp); - if (stat(p, &st) != 0) { - printf("[FAIL] stat(%s): %s (errno=%d)\n", p, strerror(errno), errno); + addr = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, f, 0); + if (addr == MAP_FAILED) { + printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); + close(f); goto fail; } - if (!S_ISREG(st.st_mode)) { - printf("[FAIL] stat: expected regular file\n"); + c = ((volatile char *)addr)[0]; + if (c != 'h') { + printf("[FAIL] mmap first byte got=%d\n", c); + munmap(addr, 4096); + close(f); goto fail; } + munmap(addr, 4096); + 
addr = MAP_FAILED; + close(f); + f = -1; - n = fuseg_read_file_cstr(p, buf, sizeof(buf)); - if (n < 0) { - printf("[FAIL] read(%s): %s (errno=%d)\n", p, strerror(errno), errno); - goto fail; - } - if (strcmp(buf, "hello from fuse\n") != 0) { - printf("[FAIL] content mismatch: got='%s'\n", buf); + if (shared->open_count != 1 || shared->read_count != 1 || shared->read_fhs[0] != 200) { + printf("[FAIL] mmap counters open=%u read=%u fh0=%llu\n", shared->open_count, + shared->read_count, (unsigned long long)shared->read_fhs[0]); goto fail; } umount(mp); + shared->stop = 1; + close(fd); + waitpid(child, NULL, 0); + munmap(shared, sizeof(*shared)); rmdir(mp); - stop = 1; - close(clone_fd); - close(master_fd); - pthread_join(clone_th, NULL); return 0; fail: + if (addr != MAP_FAILED) { + munmap(addr, 4096); + } + if (f >= 0) { + close(f); + } umount(mp); - stop = 1; - close(clone_fd); - close(master_fd); - pthread_join(clone_th, NULL); + shared->stop = 1; + close(fd); + if (child > 0) { + kill(child, SIGTERM); + waitpid(child, NULL, 0); + } + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } -static int ext_test_large_read_over_max_write() { - const char *mp = "/tmp/test_fuse_large_read"; - const size_t data_size = 6000; +static int ext_test_mmap_fault_batches_readaround_pages() { + const char *mp = "/tmp/test_fuse_mmap_readaround"; + const size_t page_size = 4096; + const size_t page_count = 8; + const size_t map_len = page_size * page_count; char path[256]; - char *buf = NULL; - int n = -1; + int f = -1; + void *addr = MAP_FAILED; + volatile unsigned int checksum = 0; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); @@ -2643,8 +3859,8 @@ static int ext_test_large_read_over_max_write() { volatile int stop = 0; volatile int init_done = 0; volatile uint32_t read_count = 0; - volatile uint64_t read_offsets[4] = {0}; - volatile uint32_t read_sizes[4] = {0}; + volatile uint64_t read_offsets[8] = {0}; + volatile 
uint32_t read_sizes[8] = {0}; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); @@ -2654,8 +3870,9 @@ static int ext_test_large_read_over_max_write() { args.read_count = &read_count; args.read_offsets = read_offsets; args.read_sizes = read_sizes; - args.read_trace_capacity = 4; - args.hello_data_size_override = data_size; + args.read_trace_capacity = 8; + args.hello_generated_size_override = map_len; + args.init_out_max_write_override = map_len; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -2666,7 +3883,7 @@ static int ext_test_large_read_over_max_write() { } char opts[256]; - snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=32768", fd); if (mount("none", mp, "fuse", 0, opts) != 0) { printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); @@ -2681,41 +3898,45 @@ static int ext_test_large_read_over_max_write() { goto fail; } - buf = (char *)malloc(data_size); - if (!buf) { - printf("[FAIL] malloc read buffer\n"); - goto fail; - } - snprintf(path, sizeof(path), "%s/hello.txt", mp); - n = fuseg_read_file(path, buf, data_size); - if (n < 0) { - printf("[FAIL] read(%s): %s (errno=%d)\n", path, strerror(errno), errno); + f = open(path, O_RDONLY); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - if ((size_t)n != data_size) { - printf("[FAIL] read size mismatch: got=%d expected=%zu read_count=%u\n", n, data_size, - read_count); + addr = mmap(NULL, map_len, PROT_READ, MAP_PRIVATE, f, 0); + if (addr == MAP_FAILED) { + printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - for (size_t i = 0; i < data_size; i++) { - char expected = (char)('A' + (i % 26)); - if (buf[i] != expected) { - printf("[FAIL] read data mismatch at %zu: got=%d expected=%d\n", i, buf[i], - expected); + + for (size_t i = 0; i < 
page_count; i++) { + size_t offset = i * page_size; + unsigned char c = ((volatile unsigned char *)addr)[offset]; + unsigned char expected = (unsigned char)('A' + (offset % 26)); + if (c != expected) { + printf("[FAIL] mmap data mismatch page=%zu got=%u expected=%u read_count=%u\n", i, c, + expected, read_count); goto fail; } + checksum += c; } - if (read_count != 2 || read_offsets[0] != 0 || read_offsets[1] != 4096 || - read_sizes[0] != 4096 || read_sizes[1] > 4096 || read_sizes[1] == 0) { - printf("[FAIL] unexpected FUSE_READ split: count=%u off0=%llu size0=%u off1=%llu size1=%u\n", + if (checksum == 0) { + printf("[FAIL] checksum unexpectedly zero\n"); + goto fail; + } + + if (read_count != 1 || read_offsets[0] != 0 || read_sizes[0] != map_len) { + printf("[FAIL] mmap readaround not batched: count=%u off0=%llu size0=%u off1=%llu size1=%u\n", read_count, (unsigned long long)read_offsets[0], read_sizes[0], (unsigned long long)read_offsets[1], read_sizes[1]); goto fail; } - free(buf); - buf = NULL; + munmap(addr, map_len); + addr = MAP_FAILED; + close(f); + f = -1; umount(mp); stop = 1; close(fd); @@ -2724,8 +3945,11 @@ static int ext_test_large_read_over_max_write() { return 0; fail: - if (buf) { - free(buf); + if (addr != MAP_FAILED) { + munmap(addr, map_len); + } + if (f >= 0) { + close(f); } umount(mp); stop = 1; @@ -2735,13 +3959,11 @@ static int ext_test_large_read_over_max_write() { return -1; } -static int ext_test_cached_read_uses_open_fh_without_extra_open() { - const char *mp = "/tmp/test_fuse_cached_read_fh"; +static int ext_test_direct_io_read_bypasses_page_cache() { + const char *mp = "/tmp/test_fuse_direct_read"; char path[256]; char buf[32]; int f = -1; - ssize_t n = -1; - ssize_t first_n = -1; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); @@ -2770,7 +3992,8 @@ static int ext_test_cached_read_uses_open_fh_without_extra_open() { args.read_count = &read_count; args.read_fhs = read_fhs; 
args.read_trace_capacity = 4; - args.next_open_fh = 100; + args.next_open_fh = 700; + args.hello_open_out_flags = FOPEN_DIRECT_IO; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -2802,24 +4025,24 @@ static int ext_test_cached_read_uses_open_fh_without_extra_open() { printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - n = pread(f, buf, sizeof(buf), 0); - if (n <= 0) { - printf("[FAIL] first pread got=%zd errno=%d\n", n, errno); - close(f); + memset(buf, 0, sizeof(buf)); + if (pread(f, buf, 5, 0) != 5 || memcmp(buf, "hello", 5) != 0) { + printf("[FAIL] first direct pread got='%.*s' read=%u errno=%d\n", 5, buf, read_count, + errno); goto fail; } - first_n = n; memset(buf, 0, sizeof(buf)); - n = pread(f, buf, sizeof(buf), 0); - close(f); - f = -1; - if (n != first_n) { - printf("[FAIL] second pread got=%zd errno=%d\n", n, errno); + if (pread(f, buf, 5, 0) != 5 || memcmp(buf, "hello", 5) != 0) { + printf("[FAIL] second direct pread got='%.*s' read=%u errno=%d\n", 5, buf, read_count, + errno); goto fail; } - if (open_count != 1 || read_count != 1 || read_fhs[0] != 100) { - printf("[FAIL] cached read counters open=%u read=%u fh0=%llu\n", open_count, - read_count, (unsigned long long)read_fhs[0]); + close(f); + f = -1; + + if (open_count != 1 || read_count != 2 || read_fhs[0] != 700 || read_fhs[1] != 700) { + printf("[FAIL] direct read counters open=%u read=%u fh0=%llu fh1=%llu\n", open_count, + read_count, (unsigned long long)read_fhs[0], (unsigned long long)read_fhs[1]); goto fail; } @@ -2842,12 +4065,12 @@ static int ext_test_cached_read_uses_open_fh_without_extra_open() { return -1; } -static int ext_test_cached_short_read_updates_eof() { - const char *mp = "/tmp/test_fuse_cached_short_read"; +static int ext_test_direct_io_write_invalidates_cached_read() { + const char *mp = "/tmp/test_fuse_direct_write_inval"; char path[256]; - char buf[32]; - int f = -1; - ssize_t n = -1; + char buf[16]; + int 
cached_fd = -1; + int direct_fd = -1; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); @@ -2863,21 +4086,24 @@ static int ext_test_cached_short_read_updates_eof() { volatile int stop = 0; volatile int init_done = 0; + volatile uint32_t open_count = 0; volatile uint32_t read_count = 0; - volatile uint64_t read_offsets[4] = {0}; - volatile uint32_t read_sizes[4] = {0}; + volatile uint32_t write_count = 0; + volatile uint32_t open_out_flags = 0; + volatile uint64_t last_write_fh = 0; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; + args.enable_write_ops = 1; + args.open_count = &open_count; args.read_count = &read_count; - args.read_offsets = read_offsets; - args.read_sizes = read_sizes; - args.read_trace_capacity = 4; - args.hello_data_size_override = 8192; - args.hello_read_size_override = 5; + args.write_count = &write_count; + args.dynamic_hello_open_out_flags = &open_out_flags; + args.last_write_fh = &last_write_fh; + args.next_open_fh = 520; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -2904,34 +4130,56 @@ static int ext_test_cached_short_read_updates_eof() { } snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDONLY); - if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + cached_fd = open(path, O_RDWR); + if (cached_fd < 0) { + printf("[FAIL] open cached fd: %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + memset(buf, 0, sizeof(buf)); + if (pread(cached_fd, buf, 5, 0) != 5 || memcmp(buf, "hello", 5) != 0) { + printf("[FAIL] initial cached pread got='%.*s' read=%u errno=%d\n", 5, buf, read_count, + errno); goto fail; } - memset(buf, 0x7f, sizeof(buf)); - n = pread(f, buf, sizeof(buf), 0); - if (n != 5 || memcmp(buf, "ABCDE", 5) != 0) { - printf("[FAIL] short cached pread got=%zd data='%.*s' read=%u errno=%d\n", n, 5, buf, - 
read_count, errno); + open_out_flags = FOPEN_DIRECT_IO; + direct_fd = open(path, O_WRONLY); + if (direct_fd < 0) { + printf("[FAIL] open direct fd: %s (errno=%d)\n", strerror(errno), errno); goto fail; } - memset(buf, 0x7f, sizeof(buf)); - n = pread(f, buf, sizeof(buf), 5); - if (n != 0) { - printf("[FAIL] EOF cached pread got=%zd read=%u errno=%d\n", n, read_count, errno); + if (pwrite(direct_fd, "DIO!!", 5, 0) != 5) { + printf("[FAIL] direct pwrite: %s (errno=%d)\n", strerror(errno), errno); goto fail; } + if (pwrite(direct_fd, "TAIL!", 5, 20) != 5) { + printf("[FAIL] direct pwrite extend: %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + close(direct_fd); + direct_fd = -1; + open_out_flags = 0; - if (read_count != 1 || read_offsets[0] != 0 || read_sizes[0] != 4096) { - printf("[FAIL] short read trace count=%u off0=%llu size0=%u\n", read_count, - (unsigned long long)read_offsets[0], read_sizes[0]); + memset(buf, 0, sizeof(buf)); + if (pread(cached_fd, buf, 5, 0) != 5 || memcmp(buf, "DIO!!", 5) != 0) { + printf("[FAIL] cached pread after direct write got='%.*s' read=%u errno=%d\n", 5, buf, + read_count, errno); + goto fail; + } + memset(buf, 0, sizeof(buf)); + if (pread(cached_fd, buf, 5, 20) != 5 || memcmp(buf, "TAIL!", 5) != 0) { + printf("[FAIL] cached pread after direct extend got='%.*s' read=%u errno=%d\n", 5, buf, + read_count, errno); + goto fail; + } + if (open_count != 2 || read_count != 2 || write_count != 2 || last_write_fh != 521) { + printf("[FAIL] direct write counters open=%u read=%u write=%u wfh=%llu\n", open_count, + read_count, write_count, (unsigned long long)last_write_fh); goto fail; } - close(f); - f = -1; + close(cached_fd); + cached_fd = -1; umount(mp); stop = 1; close(fd); @@ -2940,8 +4188,11 @@ static int ext_test_cached_short_read_updates_eof() { return 0; fail: - if (f >= 0) { - close(f); + if (direct_fd >= 0) { + close(direct_fd); + } + if (cached_fd >= 0) { + close(cached_fd); } umount(mp); stop = 1; @@ -2951,135 +4202,201 
@@ static int ext_test_cached_short_read_updates_eof() { return -1; } -static int ext_test_cached_read_sees_write_through_update() { - const char *mp = "/tmp/test_fuse_cached_read_write"; +static int ext_test_direct_io_mmap_policy() { + const char *mp = "/tmp/test_fuse_direct_mmap"; char path[256]; - char buf[16]; int f = -1; + void *addr = MAP_FAILED; + volatile char c = 0; + char warm = 0; + pid_t child = -1; + struct direct_mmap_shared_state { + volatile int stop; + volatile int init_done; + volatile uint32_t open_out_flags; + volatile unsigned char first_byte; + volatile uint32_t open_count; + volatile uint32_t read_count; + volatile uint64_t read_fhs[4]; + }; + struct direct_mmap_shared_state *shared = + (struct direct_mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (shared == MAP_FAILED) { + printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); + return -1; + } + memset(shared, 0, sizeof(*shared)); if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + munmap(shared, sizeof(*shared)); return -1; } int fd = open("/dev/fuse", O_RDWR); if (fd < 0) { printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - volatile int stop = 0; - volatile int init_done = 0; - volatile uint32_t open_count = 0; - volatile uint32_t read_count = 0; - volatile uint32_t write_count = 0; - volatile uint64_t last_write_fh = 0; - - struct fuse_daemon_args args; - memset(&args, 0, sizeof(args)); - args.fd = fd; - args.stop = &stop; - args.init_done = &init_done; - args.enable_write_ops = 1; - args.open_count = &open_count; - args.read_count = &read_count; - args.write_count = &write_count; - args.last_write_fh = &last_write_fh; - args.next_open_fh = 300; - - pthread_t th; - if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { - printf("[FAIL] 
pthread_create\n"); + child = fork(); + if (child < 0) { + printf("[FAIL] fork fuse daemon: %s (errno=%d)\n", strerror(errno), errno); close(fd); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } + if (child == 0) { + struct fuse_daemon_args child_args; + memset(&child_args, 0, sizeof(child_args)); + child_args.fd = fd; + child_args.stop = &shared->stop; + child_args.init_done = &shared->init_done; + child_args.stop_on_destroy = 1; + child_args.open_count = &shared->open_count; + child_args.read_count = &shared->read_count; + child_args.read_fhs = shared->read_fhs; + child_args.read_trace_capacity = 4; + child_args.next_open_fh = 800; + child_args.dynamic_hello_open_out_flags = &shared->open_out_flags; + child_args.dynamic_hello_first_byte = &shared->first_byte; + fuse_daemon_thread(&child_args); + _exit(0); + } char opts[256]; snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", fd); if (mount("none", mp, "fuse", 0, opts) != 0) { printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); - stop = 1; + shared->stop = 1; close(fd); - pthread_join(th, NULL); + kill(child, SIGTERM); + waitpid(child, NULL, 0); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - if (fuseg_wait_init(&init_done) != 0) { + if (fuseg_wait_init(&shared->init_done) != 0) { printf("[FAIL] init handshake timeout\n"); goto fail; } snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDWR); + f = open(path, O_RDONLY); if (f < 0) { printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - memset(buf, 0, sizeof(buf)); - if (pread(f, buf, 5, 0) != 5 || memcmp(buf, "hello", 5) != 0) { - printf("[FAIL] first cached pread got='%.*s' read=%u errno=%d\n", 5, buf, read_count, + if (pread(f, &warm, 1, 0) != 1 || warm != 'h') { + printf("[FAIL] warm cached read got=%d read=%u errno=%d\n", warm, shared->read_count, errno); goto fail; } - if (pwrite(f, "CACHE", 5, 0) != 5) { - printf("[FAIL] pwrite 
CACHE: %s (errno=%d)\n", strerror(errno), errno); + close(f); + f = -1; + + shared->open_out_flags = FOPEN_DIRECT_IO; + shared->first_byte = 'Z'; + + f = open(path, O_RDONLY); + if (f < 0) { + printf("[FAIL] direct open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - memset(buf, 0, sizeof(buf)); - if (pread(f, buf, 5, 0) != 5 || memcmp(buf, "CACHE", 5) != 0) { - printf("[FAIL] second cached pread got='%.*s' read=%u errno=%d\n", 5, buf, read_count, - errno); + + errno = 0; + addr = mmap(NULL, 4096, PROT_READ, MAP_SHARED, f, 0); + if (addr != MAP_FAILED) { + printf("[FAIL] direct_io MAP_SHARED unexpectedly succeeded\n"); + munmap(addr, 4096); + addr = MAP_FAILED; goto fail; } - if (open_count != 1 || read_count != 1 || write_count != 1 || last_write_fh != 300) { - printf("[FAIL] cached write counters open=%u read=%u write=%u wfh=%llu\n", open_count, - read_count, write_count, (unsigned long long)last_write_fh); + if (errno != ENODEV) { + printf("[FAIL] direct_io MAP_SHARED errno=%d expected=%d\n", errno, ENODEV); + goto fail; + } + + addr = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, f, 0); + if (addr == MAP_FAILED) { + printf("[FAIL] direct_io MAP_PRIVATE mmap: %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + c = ((volatile char *)addr)[0]; + if (c != 'Z') { + printf("[FAIL] direct_io MAP_PRIVATE first byte got=%d\n", c); + goto fail; + } + if (shared->open_count != 2 || shared->read_count != 2 || shared->read_fhs[1] != 801) { + printf("[FAIL] direct mmap counters open=%u read=%u fh0=%llu fh1=%llu\n", + shared->open_count, shared->read_count, (unsigned long long)shared->read_fhs[0], + (unsigned long long)shared->read_fhs[1]); goto fail; } + munmap(addr, 4096); + addr = MAP_FAILED; close(f); f = -1; umount(mp); - stop = 1; + shared->stop = 1; close(fd); - pthread_join(th, NULL); + waitpid(child, NULL, 0); + munmap(shared, sizeof(*shared)); rmdir(mp); return 0; fail: + if (addr != MAP_FAILED) { + munmap(addr, 4096); + } if (f >= 0) { 
close(f); } umount(mp); - stop = 1; + shared->stop = 1; close(fd); - pthread_join(th, NULL); + if (child > 0) { + kill(child, SIGTERM); + waitpid(child, NULL, 0); + } + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } -static int ext_test_mmap_sees_write_through_update() { - const char *mp = "/tmp/test_fuse_mmap_write_through"; +static int ext_test_shared_writable_mmap_msync_writeback() { + const char *mp = "/tmp/test_fuse_mmap_shared_write"; char path[256]; - char buf[16]; int f = -1; void *addr = MAP_FAILED; - pid_t child = -1; - struct mmap_write_shared_state { + volatile char c = 0; + pid_t daemon = -1; + const uint32_t expected_writeback_flags = FUSE_WRITE_CACHE; + struct mmap_shared_state { volatile int stop; volatile int init_done; volatile uint32_t open_count; volatile uint32_t read_count; volatile uint32_t write_count; volatile uint64_t last_write_fh; - volatile uint64_t read_fhs[4]; + volatile uint32_t last_open_pid; + volatile uint64_t last_write_offset; + volatile uint32_t last_write_size; + volatile uint32_t last_write_flags; + volatile uint32_t last_write_open_flags; + volatile uint32_t last_write_uid; + volatile uint32_t last_write_gid; + volatile uint32_t last_write_pid; }; - struct mmap_write_shared_state *shared = - (struct mmap_write_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); + struct mmap_shared_state *shared = + (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (shared == MAP_FAILED) { printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); return -1; @@ -3100,29 +4417,35 @@ static int ext_test_mmap_sees_write_through_update() { return -1; } - child = fork(); - if (child < 0) { + daemon = fork(); + if (daemon < 0) { printf("[FAIL] fork fuse daemon: %s (errno=%d)\n", strerror(errno), errno); close(fd); munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - if (child == 0) { + if 
(daemon == 0) { struct fuse_daemon_args child_args; memset(&child_args, 0, sizeof(child_args)); child_args.fd = fd; child_args.stop = &shared->stop; child_args.init_done = &shared->init_done; - child_args.stop_on_destroy = 1; child_args.enable_write_ops = 1; + child_args.stop_on_destroy = 1; child_args.open_count = &shared->open_count; child_args.read_count = &shared->read_count; child_args.write_count = &shared->write_count; child_args.last_write_fh = &shared->last_write_fh; - child_args.read_fhs = shared->read_fhs; - child_args.read_trace_capacity = 4; - child_args.next_open_fh = 320; + child_args.last_open_pid = &shared->last_open_pid; + child_args.last_write_offset = &shared->last_write_offset; + child_args.last_write_size = &shared->last_write_size; + child_args.last_write_flags = &shared->last_write_flags; + child_args.last_write_open_flags = &shared->last_write_open_flags; + child_args.last_write_uid = &shared->last_write_uid; + child_args.last_write_gid = &shared->last_write_gid; + child_args.last_write_pid = &shared->last_write_pid; + child_args.next_open_fh = 900; fuse_daemon_thread(&child_args); _exit(0); } @@ -3134,8 +4457,8 @@ static int ext_test_mmap_sees_write_through_update() { printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); shared->stop = 1; close(fd); - kill(child, SIGTERM); - waitpid(child, NULL, 0); + kill(daemon, SIGTERM); + waitpid(daemon, NULL, 0); munmap(shared, sizeof(*shared)); rmdir(mp); return -1; @@ -3151,36 +4474,35 @@ static int ext_test_mmap_sees_write_through_update() { printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - addr = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, f, 0); + addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, f, 0); if (addr == MAP_FAILED) { printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); + close(f); goto fail; } - if (((volatile char *)addr)[0] != 'h') { - printf("[FAIL] mmap warmup first byte got=%d\n", ((volatile char 
*)addr)[0]); - goto fail; - } - if (pwrite(f, "MMAP!", 5, 0) != 5) { - printf("[FAIL] pwrite MMAP!: %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - if (memcmp(addr, "MMAP!", 5) != 0) { - printf("[FAIL] mmap page did not observe write-through update, got='%.*s'\n", 5, - (char *)addr); + + c = ((volatile char *)addr)[0]; + if (c != 'h') { + printf("[FAIL] shared writable mmap first byte got=%d\n", c); goto fail; } - memset(buf, 0, sizeof(buf)); - if (pread(f, buf, 5, 0) != 5 || memcmp(buf, "MMAP!", 5) != 0) { - printf("[FAIL] cached pread after mmap write got='%.*s' read=%u errno=%d\n", 5, buf, - shared->read_count, errno); + ((volatile char *)addr)[1] = 'M'; + if (msync(addr, 4096, MS_SYNC) != 0) { + printf("[FAIL] msync(shared writable mmap): %s (errno=%d)\n", strerror(errno), errno); goto fail; } if (shared->open_count != 1 || shared->read_count != 1 || shared->write_count != 1 || - shared->last_write_fh != 320 || shared->read_fhs[0] != 320) { - printf("[FAIL] mmap write-through counters open=%u read=%u write=%u rfh=%llu wfh=%llu\n", + shared->last_write_fh != 900 || shared->last_write_offset != 0 || + shared->last_write_size != 16 || shared->last_write_flags != expected_writeback_flags || + shared->last_write_open_flags != 0 || shared->last_write_uid != 0 || + shared->last_write_gid != 0 || shared->last_open_pid == 0 || + shared->last_write_pid != shared->last_open_pid) { + printf("[FAIL] shared writable mmap counters open=%u read=%u write=%u wfh=%llu open_pid=%u off=%llu size=%u wflags=%u oflags=%u uid=%u gid=%u pid=%u\n", shared->open_count, shared->read_count, shared->write_count, - (unsigned long long)shared->read_fhs[0], - (unsigned long long)shared->last_write_fh); + (unsigned long long)shared->last_write_fh, shared->last_open_pid, + (unsigned long long)shared->last_write_offset, shared->last_write_size, + shared->last_write_flags, shared->last_write_open_flags, shared->last_write_uid, + shared->last_write_gid, shared->last_write_pid); goto 
fail; } @@ -3191,7 +4513,7 @@ static int ext_test_mmap_sees_write_through_update() { umount(mp); shared->stop = 1; close(fd); - waitpid(child, NULL, 0); + waitpid(daemon, NULL, 0); munmap(shared, sizeof(*shared)); rmdir(mp); return 0; @@ -3206,32 +4528,35 @@ static int ext_test_mmap_sees_write_through_update() { umount(mp); shared->stop = 1; close(fd); - if (child > 0) { - kill(child, SIGTERM); - waitpid(child, NULL, 0); + if (daemon > 0) { + kill(daemon, SIGTERM); + waitpid(daemon, NULL, 0); } munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } -static int ext_test_mmap_fault_uses_open_fh_without_extra_open() { - const char *mp = "/tmp/test_fuse_mmap_fh"; +static int ext_test_shared_mmap_dirty_then_pwrite_keeps_latest_data() { + const char *mp = "/tmp/test_fuse_mmap_dirty_pwrite"; char path[256]; int f = -1; void *addr = MAP_FAILED; volatile char c = 0; - pid_t child = -1; - struct mmap_shared_state { + pid_t daemon = -1; + struct dirty_pwrite_shared_state { volatile int stop; volatile int init_done; - volatile uint32_t open_count; volatile uint32_t read_count; - volatile uint64_t read_fhs[4]; + volatile uint32_t write_count; + volatile uint64_t last_write_offset; + volatile uint32_t last_write_size; + volatile uint32_t last_write_flags; + volatile unsigned char last_write_watch_byte; }; - struct mmap_shared_state *shared = - (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); + struct dirty_pwrite_shared_state *shared = + (struct dirty_pwrite_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (shared == MAP_FAILED) { printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); return -1; @@ -3252,26 +4577,30 @@ static int ext_test_mmap_fault_uses_open_fh_without_extra_open() { return -1; } - child = fork(); - if (child < 0) { + daemon = fork(); + if (daemon < 0) { printf("[FAIL] fork fuse daemon: %s (errno=%d)\n", 
strerror(errno), errno); close(fd); munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - if (child == 0) { + if (daemon == 0) { struct fuse_daemon_args child_args; memset(&child_args, 0, sizeof(child_args)); child_args.fd = fd; child_args.stop = &shared->stop; child_args.init_done = &shared->init_done; + child_args.enable_write_ops = 1; child_args.stop_on_destroy = 1; - child_args.open_count = &shared->open_count; child_args.read_count = &shared->read_count; - child_args.read_fhs = shared->read_fhs; - child_args.read_trace_capacity = 4; - child_args.next_open_fh = 200; + child_args.write_count = &shared->write_count; + child_args.last_write_offset = &shared->last_write_offset; + child_args.last_write_size = &shared->last_write_size; + child_args.last_write_flags = &shared->last_write_flags; + child_args.last_write_watch_byte = &shared->last_write_watch_byte; + child_args.write_watch_offset = 1; + child_args.next_open_fh = 901; fuse_daemon_thread(&child_args); _exit(0); } @@ -3283,8 +4612,8 @@ static int ext_test_mmap_fault_uses_open_fh_without_extra_open() { printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); shared->stop = 1; close(fd); - kill(child, SIGTERM); - waitpid(child, NULL, 0); + kill(daemon, SIGTERM); + waitpid(daemon, NULL, 0); munmap(shared, sizeof(*shared)); rmdir(mp); return -1; @@ -3295,39 +4624,54 @@ static int ext_test_mmap_fault_uses_open_fh_without_extra_open() { } snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDONLY); + f = open(path, O_RDWR); if (f < 0) { printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - addr = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, f, 0); + addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, f, 0); if (addr == MAP_FAILED) { printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); - close(f); goto fail; } + c = ((volatile char *)addr)[0]; if (c != 'h') { - printf("[FAIL] mmap first byte got=%d\n", c); - 
munmap(addr, 4096); - close(f); + printf("[FAIL] shared writable mmap first byte got=%d\n", c); + goto fail; + } + ((volatile char *)addr)[1] = 'M'; + if (pwrite(f, "P", 1, 1) != 1) { + printf("[FAIL] pwrite over dirty mmap byte: %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + if (((volatile char *)addr)[1] != 'P') { + printf("[FAIL] mmap cache was not updated by overlapping pwrite got=%d\n", + ((volatile char *)addr)[1]); goto fail; } + if (msync(addr, 4096, MS_SYNC) != 0) { + printf("[FAIL] msync(shared dirty pwrite): %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + if (shared->write_count < 2 || shared->last_write_offset != 0 || + shared->last_write_size != 16 || shared->last_write_flags != FUSE_WRITE_CACHE || + shared->last_write_watch_byte != 'P') { + printf("[FAIL] dirty mmap pwrite counters read=%u write=%u off=%llu size=%u flags=%u watched=%u\n", + shared->read_count, shared->write_count, + (unsigned long long)shared->last_write_offset, shared->last_write_size, + shared->last_write_flags, shared->last_write_watch_byte); + goto fail; + } + munmap(addr, 4096); addr = MAP_FAILED; close(f); f = -1; - - if (shared->open_count != 1 || shared->read_count != 1 || shared->read_fhs[0] != 200) { - printf("[FAIL] mmap counters open=%u read=%u fh0=%llu\n", shared->open_count, - shared->read_count, (unsigned long long)shared->read_fhs[0]); - goto fail; - } - umount(mp); shared->stop = 1; close(fd); - waitpid(child, NULL, 0); + waitpid(daemon, NULL, 0); munmap(shared, sizeof(*shared)); rmdir(mp); return 0; @@ -3342,278 +4686,359 @@ static int ext_test_mmap_fault_uses_open_fh_without_extra_open() { umount(mp); shared->stop = 1; close(fd); - if (child > 0) { - kill(child, SIGTERM); - waitpid(child, NULL, 0); + if (daemon > 0) { + kill(daemon, SIGTERM); + waitpid(daemon, NULL, 0); } munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } -static int ext_test_direct_io_read_bypasses_page_cache() { - const char *mp = "/tmp/test_fuse_direct_read"; 
+static int ext_test_shared_writable_mmap_osync_writeback() { + const char *mp = "/tmp/test_fuse_mmap_shared_osync"; char path[256]; - char buf[32]; int f = -1; + void *addr = MAP_FAILED; + volatile char c = 0; + pid_t daemon = -1; + const uint32_t expected_writeback_flags = FUSE_WRITE_CACHE; + const size_t page_size = 4096; + const size_t map_len = page_size * 2; + const char marker = 'Z'; + struct mmap_shared_state { + volatile int stop; + volatile int init_done; + volatile uint32_t open_count; + volatile uint32_t read_count; + volatile uint32_t write_count; + volatile uint32_t fsync_count; + volatile uint64_t last_write_fh; + volatile uint64_t last_write_offset; + volatile uint32_t last_write_size; + volatile uint32_t last_write_flags; + volatile uint32_t last_write_open_flags; + volatile uint64_t last_fsync_fh; + volatile uint32_t write_count_at_fsync; + volatile uint32_t last_write_flags_at_fsync; + }; + struct mmap_shared_state *shared = + (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (shared == MAP_FAILED) { + printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); + return -1; + } + memset(shared, 0, sizeof(*shared)); if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + munmap(shared, sizeof(*shared)); return -1; } int fd = open("/dev/fuse", O_RDWR); if (fd < 0) { printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - volatile int stop = 0; - volatile int init_done = 0; - volatile uint32_t open_count = 0; - volatile uint32_t read_count = 0; - volatile uint64_t read_fhs[4] = {0}; - - struct fuse_daemon_args args; - memset(&args, 0, sizeof(args)); - args.fd = fd; - args.stop = &stop; - args.init_done = &init_done; - args.open_count = &open_count; - args.read_count = &read_count; - args.read_fhs = read_fhs; - 
args.read_trace_capacity = 4; - args.next_open_fh = 700; - args.hello_open_out_flags = FOPEN_DIRECT_IO; - - pthread_t th; - if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { - printf("[FAIL] pthread_create\n"); + daemon = fork(); + if (daemon < 0) { + printf("[FAIL] fork fuse daemon: %s (errno=%d)\n", strerror(errno), errno); close(fd); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } + if (daemon == 0) { + struct fuse_daemon_args child_args; + memset(&child_args, 0, sizeof(child_args)); + child_args.fd = fd; + child_args.stop = &shared->stop; + child_args.init_done = &shared->init_done; + child_args.enable_write_ops = 1; + child_args.stop_on_destroy = 1; + child_args.open_count = &shared->open_count; + child_args.read_count = &shared->read_count; + child_args.write_count = &shared->write_count; + child_args.fsync_count = &shared->fsync_count; + child_args.last_write_fh = &shared->last_write_fh; + child_args.last_write_offset = &shared->last_write_offset; + child_args.last_write_size = &shared->last_write_size; + child_args.last_write_flags = &shared->last_write_flags; + child_args.last_write_open_flags = &shared->last_write_open_flags; + child_args.last_fsync_fh = &shared->last_fsync_fh; + child_args.write_count_at_fsync = &shared->write_count_at_fsync; + child_args.last_write_flags_at_fsync = &shared->last_write_flags_at_fsync; + child_args.next_open_fh = 930; + child_args.hello_data_size_override = map_len; + fuse_daemon_thread(&child_args); + _exit(0); + } char opts[256]; snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", fd); if (mount("none", mp, "fuse", 0, opts) != 0) { printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); - stop = 1; + shared->stop = 1; close(fd); - pthread_join(th, NULL); + kill(daemon, SIGTERM); + waitpid(daemon, NULL, 0); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - if (fuseg_wait_init(&init_done) != 0) { + if 
(fuseg_wait_init(&shared->init_done) != 0) { printf("[FAIL] init handshake timeout\n"); goto fail; } snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDONLY); + f = open(path, O_RDWR | O_SYNC); if (f < 0) { printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - memset(buf, 0, sizeof(buf)); - if (pread(f, buf, 5, 0) != 5 || memcmp(buf, "hello", 5) != 0) { - printf("[FAIL] first direct pread got='%.*s' read=%u errno=%d\n", 5, buf, read_count, - errno); + addr = mmap(NULL, map_len, PROT_READ | PROT_WRITE, MAP_SHARED, f, 0); + if (addr == MAP_FAILED) { + printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); + close(f); goto fail; } - memset(buf, 0, sizeof(buf)); - if (pread(f, buf, 5, 0) != 5 || memcmp(buf, "hello", 5) != 0) { - printf("[FAIL] second direct pread got='%.*s' read=%u errno=%d\n", 5, buf, read_count, - errno); + + c = ((volatile char *)addr)[0]; + if (c != 'A') { + printf("[FAIL] shared writable mmap first byte got=%d\n", c); + goto fail; + } + ((volatile char *)addr)[2] = 'F'; + if (pwrite(f, &marker, 1, (off_t)page_size) != 1) { + printf("[FAIL] pwrite(O_SYNC): %s (errno=%d)\n", strerror(errno), errno); goto fail; } - close(f); - f = -1; - if (open_count != 1 || read_count != 2 || read_fhs[0] != 700 || read_fhs[1] != 700) { - printf("[FAIL] direct read counters open=%u read=%u fh0=%llu fh1=%llu\n", open_count, - read_count, (unsigned long long)read_fhs[0], (unsigned long long)read_fhs[1]); + if (shared->open_count != 1 || shared->read_count != 1 || shared->write_count != 2 || + shared->fsync_count != 1 || shared->last_write_fh != 930 || shared->last_fsync_fh != 930 || + shared->last_write_offset != 0 || shared->last_write_size != page_size || + shared->last_write_flags != expected_writeback_flags || shared->last_write_open_flags != 0 || + shared->write_count_at_fsync != 2 || + shared->last_write_flags_at_fsync != expected_writeback_flags) { + printf("[FAIL] shared mmap osync 
counters open=%u read=%u write=%u fsync=%u wfh=%llu fsh=%llu off=%llu size=%u wflags=%u oflags=%u fsync_writes=%u fsync_wflags=%u\n", + shared->open_count, shared->read_count, shared->write_count, shared->fsync_count, + (unsigned long long)shared->last_write_fh, + (unsigned long long)shared->last_fsync_fh, + (unsigned long long)shared->last_write_offset, shared->last_write_size, + shared->last_write_flags, shared->last_write_open_flags, + shared->write_count_at_fsync, shared->last_write_flags_at_fsync); goto fail; } + munmap(addr, map_len); + addr = MAP_FAILED; + close(f); + f = -1; umount(mp); - stop = 1; + shared->stop = 1; close(fd); - pthread_join(th, NULL); + waitpid(daemon, NULL, 0); + munmap(shared, sizeof(*shared)); rmdir(mp); return 0; fail: + if (addr != MAP_FAILED) { + munmap(addr, map_len); + } if (f >= 0) { close(f); } umount(mp); - stop = 1; + shared->stop = 1; close(fd); - pthread_join(th, NULL); + if (daemon > 0) { + kill(daemon, SIGTERM); + waitpid(daemon, NULL, 0); + } + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } -static int ext_test_direct_io_write_invalidates_cached_read() { - const char *mp = "/tmp/test_fuse_direct_write_inval"; +static int ext_test_shared_mmap_mprotect_writeback() { + const char *mp = "/tmp/test_fuse_mmap_mprotect_write"; char path[256]; - char buf[16]; - int cached_fd = -1; - int direct_fd = -1; + int f = -1; + void *addr = MAP_FAILED; + volatile char c = 0; + pid_t daemon = -1; + struct mmap_shared_state { + volatile int stop; + volatile int init_done; + volatile uint32_t open_count; + volatile uint32_t read_count; + volatile uint32_t write_count; + volatile uint64_t last_write_fh; + }; + struct mmap_shared_state *shared = + (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (shared == MAP_FAILED) { + printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); + return -1; + } + memset(shared, 0, sizeof(*shared)); if 
(ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + munmap(shared, sizeof(*shared)); return -1; } int fd = open("/dev/fuse", O_RDWR); if (fd < 0) { printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - volatile int stop = 0; - volatile int init_done = 0; - volatile uint32_t open_count = 0; - volatile uint32_t read_count = 0; - volatile uint32_t write_count = 0; - volatile uint32_t open_out_flags = 0; - volatile uint64_t last_write_fh = 0; - - struct fuse_daemon_args args; - memset(&args, 0, sizeof(args)); - args.fd = fd; - args.stop = &stop; - args.init_done = &init_done; - args.enable_write_ops = 1; - args.open_count = &open_count; - args.read_count = &read_count; - args.write_count = &write_count; - args.dynamic_hello_open_out_flags = &open_out_flags; - args.last_write_fh = &last_write_fh; - args.next_open_fh = 520; - - pthread_t th; - if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { - printf("[FAIL] pthread_create\n"); + daemon = fork(); + if (daemon < 0) { + printf("[FAIL] fork fuse daemon: %s (errno=%d)\n", strerror(errno), errno); close(fd); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } + if (daemon == 0) { + struct fuse_daemon_args child_args; + memset(&child_args, 0, sizeof(child_args)); + child_args.fd = fd; + child_args.stop = &shared->stop; + child_args.init_done = &shared->init_done; + child_args.enable_write_ops = 1; + child_args.stop_on_destroy = 1; + child_args.open_count = &shared->open_count; + child_args.read_count = &shared->read_count; + child_args.write_count = &shared->write_count; + child_args.last_write_fh = &shared->last_write_fh; + child_args.next_open_fh = 910; + fuse_daemon_thread(&child_args); + _exit(0); + } char opts[256]; snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", fd); if (mount("none", mp, "fuse", 0, opts) != 0) { 
printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); - stop = 1; + shared->stop = 1; close(fd); - pthread_join(th, NULL); + kill(daemon, SIGTERM); + waitpid(daemon, NULL, 0); + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - if (fuseg_wait_init(&init_done) != 0) { + if (fuseg_wait_init(&shared->init_done) != 0) { printf("[FAIL] init handshake timeout\n"); goto fail; } snprintf(path, sizeof(path), "%s/hello.txt", mp); - cached_fd = open(path, O_RDWR); - if (cached_fd < 0) { - printf("[FAIL] open cached fd: %s (errno=%d)\n", strerror(errno), errno); + f = open(path, O_RDWR); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - memset(buf, 0, sizeof(buf)); - if (pread(cached_fd, buf, 5, 0) != 5 || memcmp(buf, "hello", 5) != 0) { - printf("[FAIL] initial cached pread got='%.*s' read=%u errno=%d\n", 5, buf, read_count, - errno); + addr = mmap(NULL, 4096, PROT_READ, MAP_SHARED, f, 0); + if (addr == MAP_FAILED) { + printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); + close(f); goto fail; } - open_out_flags = FOPEN_DIRECT_IO; - direct_fd = open(path, O_WRONLY); - if (direct_fd < 0) { - printf("[FAIL] open direct fd: %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - if (pwrite(direct_fd, "DIO!!", 5, 0) != 5) { - printf("[FAIL] direct pwrite: %s (errno=%d)\n", strerror(errno), errno); + c = ((volatile char *)addr)[0]; + if (c != 'h') { + printf("[FAIL] mmap first byte got=%d\n", c); goto fail; } - if (pwrite(direct_fd, "TAIL!", 5, 20) != 5) { - printf("[FAIL] direct pwrite extend: %s (errno=%d)\n", strerror(errno), errno); + if (shared->open_count != 1 || shared->read_count != 1) { + printf("[FAIL] before mprotect counters open=%u read=%u\n", shared->open_count, + shared->read_count); goto fail; } - close(direct_fd); - direct_fd = -1; - open_out_flags = 0; - - memset(buf, 0, sizeof(buf)); - if (pread(cached_fd, buf, 5, 0) != 5 || memcmp(buf, "DIO!!", 5) != 0) 
{ - printf("[FAIL] cached pread after direct write got='%.*s' read=%u errno=%d\n", 5, buf, - read_count, errno); + if (mprotect(addr, 4096, PROT_READ | PROT_WRITE) != 0) { + printf("[FAIL] mprotect shared writable FUSE mapping: %s (errno=%d)\n", strerror(errno), + errno); goto fail; } - memset(buf, 0, sizeof(buf)); - if (pread(cached_fd, buf, 5, 20) != 5 || memcmp(buf, "TAIL!", 5) != 0) { - printf("[FAIL] cached pread after direct extend got='%.*s' read=%u errno=%d\n", 5, buf, - read_count, errno); + ((volatile char *)addr)[2] = 'P'; + if (msync(addr, 4096, MS_SYNC) != 0) { + printf("[FAIL] msync(after mprotect): %s (errno=%d)\n", strerror(errno), errno); goto fail; } - if (open_count != 2 || read_count != 2 || write_count != 2 || last_write_fh != 521) { - printf("[FAIL] direct write counters open=%u read=%u write=%u wfh=%llu\n", open_count, - read_count, write_count, (unsigned long long)last_write_fh); + if (shared->open_count != 1 || shared->read_count != 1 || shared->write_count != 1 || + shared->last_write_fh != 910) { + printf("[FAIL] after mprotect counters open=%u read=%u write=%u wfh=%llu\n", + shared->open_count, shared->read_count, shared->write_count, + (unsigned long long)shared->last_write_fh); goto fail; } - close(cached_fd); - cached_fd = -1; + munmap(addr, 4096); + addr = MAP_FAILED; + close(f); + f = -1; umount(mp); - stop = 1; + shared->stop = 1; close(fd); - pthread_join(th, NULL); + waitpid(daemon, NULL, 0); + munmap(shared, sizeof(*shared)); rmdir(mp); return 0; fail: - if (direct_fd >= 0) { - close(direct_fd); + if (addr != MAP_FAILED) { + munmap(addr, 4096); } - if (cached_fd >= 0) { - close(cached_fd); + if (f >= 0) { + close(f); } umount(mp); - stop = 1; + shared->stop = 1; close(fd); - pthread_join(th, NULL); + if (daemon > 0) { + kill(daemon, SIGTERM); + waitpid(daemon, NULL, 0); + } + munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } -static int ext_test_direct_io_mmap_policy() { - const char *mp = "/tmp/test_fuse_direct_mmap"; 
+static int ext_test_shared_mmap_readonly_fd_mprotect_write_denied() { + const char *mp = "/tmp/test_fuse_mmap_readonly_mprotect"; char path[256]; int f = -1; void *addr = MAP_FAILED; volatile char c = 0; - char warm = 0; - pid_t child = -1; - struct direct_mmap_shared_state { + pid_t daemon = -1; + struct mmap_shared_state { volatile int stop; volatile int init_done; - volatile uint32_t open_out_flags; - volatile unsigned char first_byte; volatile uint32_t open_count; volatile uint32_t read_count; - volatile uint64_t read_fhs[4]; + volatile uint32_t write_count; + volatile uint64_t last_write_fh; }; - struct direct_mmap_shared_state *shared = - (struct direct_mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); + struct mmap_shared_state *shared = + (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (shared == MAP_FAILED) { printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); return -1; @@ -3634,28 +5059,27 @@ static int ext_test_direct_io_mmap_policy() { return -1; } - child = fork(); - if (child < 0) { + daemon = fork(); + if (daemon < 0) { printf("[FAIL] fork fuse daemon: %s (errno=%d)\n", strerror(errno), errno); close(fd); munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - if (child == 0) { + if (daemon == 0) { struct fuse_daemon_args child_args; memset(&child_args, 0, sizeof(child_args)); child_args.fd = fd; child_args.stop = &shared->stop; child_args.init_done = &shared->init_done; + child_args.enable_write_ops = 1; child_args.stop_on_destroy = 1; child_args.open_count = &shared->open_count; child_args.read_count = &shared->read_count; - child_args.read_fhs = shared->read_fhs; - child_args.read_trace_capacity = 4; - child_args.next_open_fh = 800; - child_args.dynamic_hello_open_out_flags = &shared->open_out_flags; - child_args.dynamic_hello_first_byte = &shared->first_byte; + 
child_args.write_count = &shared->write_count; + child_args.last_write_fh = &shared->last_write_fh; + child_args.next_open_fh = 930; fuse_daemon_thread(&child_args); _exit(0); } @@ -3667,8 +5091,8 @@ static int ext_test_direct_io_mmap_policy() { printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); shared->stop = 1; close(fd); - kill(child, SIGTERM); - waitpid(child, NULL, 0); + kill(daemon, SIGTERM); + waitpid(daemon, NULL, 0); munmap(shared, sizeof(*shared)); rmdir(mp); return -1; @@ -3681,53 +5105,31 @@ static int ext_test_direct_io_mmap_policy() { snprintf(path, sizeof(path), "%s/hello.txt", mp); f = open(path, O_RDONLY); if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); - goto fail; - } - if (pread(f, &warm, 1, 0) != 1 || warm != 'h') { - printf("[FAIL] warm cached read got=%d read=%u errno=%d\n", warm, shared->read_count, - errno); - goto fail; - } - close(f); - f = -1; - - shared->open_out_flags = FOPEN_DIRECT_IO; - shared->first_byte = 'Z'; - - f = open(path, O_RDONLY); - if (f < 0) { - printf("[FAIL] direct open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + printf("[FAIL] open(%s, O_RDONLY): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - - errno = 0; addr = mmap(NULL, 4096, PROT_READ, MAP_SHARED, f, 0); - if (addr != MAP_FAILED) { - printf("[FAIL] direct_io MAP_SHARED unexpectedly succeeded\n"); - munmap(addr, 4096); - addr = MAP_FAILED; - goto fail; - } - if (errno != ENODEV) { - printf("[FAIL] direct_io MAP_SHARED errno=%d expected=%d\n", errno, ENODEV); + if (addr == MAP_FAILED) { + printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); + close(f); goto fail; } - addr = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, f, 0); - if (addr == MAP_FAILED) { - printf("[FAIL] direct_io MAP_PRIVATE mmap: %s (errno=%d)\n", strerror(errno), errno); + c = ((volatile char *)addr)[0]; + if (c != 'h') { + printf("[FAIL] readonly shared mmap first byte got=%d\n", c); goto 
fail; } - c = ((volatile char *)addr)[0]; - if (c != 'Z') { - printf("[FAIL] direct_io MAP_PRIVATE first byte got=%d\n", c); + errno = 0; + if (mprotect(addr, 4096, PROT_READ | PROT_WRITE) == 0) { + printf("[FAIL] mprotect unexpectedly allowed write upgrade on readonly fd\n"); goto fail; } - if (shared->open_count != 2 || shared->read_count != 2 || shared->read_fhs[1] != 801) { - printf("[FAIL] direct mmap counters open=%u read=%u fh0=%llu fh1=%llu\n", - shared->open_count, shared->read_count, (unsigned long long)shared->read_fhs[0], - (unsigned long long)shared->read_fhs[1]); + if (shared->open_count != 1 || shared->read_count != 1 || shared->write_count != 0 || + shared->last_write_fh != 0) { + printf("[FAIL] readonly mprotect counters open=%u read=%u write=%u wfh=%llu\n", + shared->open_count, shared->read_count, shared->write_count, + (unsigned long long)shared->last_write_fh); goto fail; } @@ -3738,7 +5140,7 @@ static int ext_test_direct_io_mmap_policy() { umount(mp); shared->stop = 1; close(fd); - waitpid(child, NULL, 0); + waitpid(daemon, NULL, 0); munmap(shared, sizeof(*shared)); rmdir(mp); return 0; @@ -3753,23 +5155,22 @@ static int ext_test_direct_io_mmap_policy() { umount(mp); shared->stop = 1; close(fd); - if (child > 0) { - kill(child, SIGTERM); - waitpid(child, NULL, 0); + if (daemon > 0) { + kill(daemon, SIGTERM); + waitpid(daemon, NULL, 0); } munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } -static int ext_test_shared_writable_mmap_msync_writeback() { - const char *mp = "/tmp/test_fuse_mmap_shared_write"; +static int ext_test_shared_writable_mmap_munmap_writeback_without_msync() { + const char *mp = "/tmp/test_fuse_mmap_munmap_writeback"; char path[256]; int f = -1; void *addr = MAP_FAILED; - volatile char c = 0; - pid_t daemon = -1; const uint32_t expected_writeback_flags = FUSE_WRITE_CACHE; + pid_t daemon = -1; struct mmap_shared_state { volatile int stop; volatile int init_done; @@ -3777,14 +5178,10 @@ static int 
ext_test_shared_writable_mmap_msync_writeback() { volatile uint32_t read_count; volatile uint32_t write_count; volatile uint64_t last_write_fh; - volatile uint32_t last_open_pid; volatile uint64_t last_write_offset; volatile uint32_t last_write_size; volatile uint32_t last_write_flags; volatile uint32_t last_write_open_flags; - volatile uint32_t last_write_uid; - volatile uint32_t last_write_gid; - volatile uint32_t last_write_pid; }; struct mmap_shared_state *shared = (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, @@ -3829,15 +5226,11 @@ static int ext_test_shared_writable_mmap_msync_writeback() { child_args.read_count = &shared->read_count; child_args.write_count = &shared->write_count; child_args.last_write_fh = &shared->last_write_fh; - child_args.last_open_pid = &shared->last_open_pid; child_args.last_write_offset = &shared->last_write_offset; child_args.last_write_size = &shared->last_write_size; child_args.last_write_flags = &shared->last_write_flags; child_args.last_write_open_flags = &shared->last_write_open_flags; - child_args.last_write_uid = &shared->last_write_uid; - child_args.last_write_gid = &shared->last_write_gid; - child_args.last_write_pid = &shared->last_write_pid; - child_args.next_open_fh = 900; + child_args.next_open_fh = 940; fuse_daemon_thread(&child_args); _exit(0); } @@ -3873,33 +5266,30 @@ static int ext_test_shared_writable_mmap_msync_writeback() { goto fail; } - c = ((volatile char *)addr)[0]; - if (c != 'h') { - printf("[FAIL] shared writable mmap first byte got=%d\n", c); + if (((volatile char *)addr)[0] != 'h') { + printf("[FAIL] shared close-writeback mmap first byte got=%d\n", + ((volatile char *)addr)[0]); goto fail; } - ((volatile char *)addr)[1] = 'M'; - if (msync(addr, 4096, MS_SYNC) != 0) { - printf("[FAIL] msync(shared writable mmap): %s (errno=%d)\n", strerror(errno), errno); + ((volatile char *)addr)[3] = 'C'; + if (munmap(addr, 4096) != 0) { + printf("[FAIL] munmap(shared writable mmap): 
%s (errno=%d)\n", strerror(errno), errno); goto fail; } + addr = MAP_FAILED; + if (shared->open_count != 1 || shared->read_count != 1 || shared->write_count != 1 || - shared->last_write_fh != 900 || shared->last_write_offset != 0 || + shared->last_write_fh != 940 || shared->last_write_offset != 0 || shared->last_write_size != 16 || shared->last_write_flags != expected_writeback_flags || - shared->last_write_open_flags != 0 || shared->last_write_uid != 0 || - shared->last_write_gid != 0 || shared->last_open_pid == 0 || - shared->last_write_pid != shared->last_open_pid) { - printf("[FAIL] shared writable mmap counters open=%u read=%u write=%u wfh=%llu open_pid=%u off=%llu size=%u wflags=%u oflags=%u uid=%u gid=%u pid=%u\n", + shared->last_write_open_flags != 0) { + printf("[FAIL] munmap writeback counters open=%u read=%u write=%u wfh=%llu off=%llu size=%u wflags=%u oflags=%u\n", shared->open_count, shared->read_count, shared->write_count, - (unsigned long long)shared->last_write_fh, shared->last_open_pid, + (unsigned long long)shared->last_write_fh, (unsigned long long)shared->last_write_offset, shared->last_write_size, - shared->last_write_flags, shared->last_write_open_flags, shared->last_write_uid, - shared->last_write_gid, shared->last_write_pid); + shared->last_write_flags, shared->last_write_open_flags); goto fail; } - munmap(addr, 4096); - addr = MAP_FAILED; close(f); f = -1; umount(mp); @@ -3929,26 +5319,28 @@ static int ext_test_shared_writable_mmap_msync_writeback() { return -1; } -static int ext_test_shared_mmap_dirty_then_pwrite_keeps_latest_data() { - const char *mp = "/tmp/test_fuse_mmap_dirty_pwrite"; +static int ext_test_shared_mmap_subrange_mprotect_writeback_preserves_vma() { + const char *mp = "/tmp/test_fuse_mmap_mprotect_subrange"; + const size_t page_size = 4096; + const size_t map_len = page_size * 2; char path[256]; int f = -1; void *addr = MAP_FAILED; volatile char c = 0; pid_t daemon = -1; - struct dirty_pwrite_shared_state { + struct 
sigaction old_segv; + bool segv_handler_installed = false; + struct mmap_shared_state { volatile int stop; volatile int init_done; + volatile uint32_t open_count; volatile uint32_t read_count; volatile uint32_t write_count; - volatile uint64_t last_write_offset; - volatile uint32_t last_write_size; - volatile uint32_t last_write_flags; - volatile unsigned char last_write_watch_byte; + volatile uint64_t last_write_fh; }; - struct dirty_pwrite_shared_state *shared = - (struct dirty_pwrite_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); + struct mmap_shared_state *shared = + (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (shared == MAP_FAILED) { printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); return -1; @@ -3983,16 +5375,14 @@ static int ext_test_shared_mmap_dirty_then_pwrite_keeps_latest_data() { child_args.fd = fd; child_args.stop = &shared->stop; child_args.init_done = &shared->init_done; - child_args.enable_write_ops = 1; child_args.stop_on_destroy = 1; + child_args.enable_write_ops = 1; + child_args.open_count = &shared->open_count; child_args.read_count = &shared->read_count; child_args.write_count = &shared->write_count; - child_args.last_write_offset = &shared->last_write_offset; - child_args.last_write_size = &shared->last_write_size; - child_args.last_write_flags = &shared->last_write_flags; - child_args.last_write_watch_byte = &shared->last_write_watch_byte; - child_args.write_watch_offset = 1; - child_args.next_open_fh = 901; + child_args.last_write_fh = &shared->last_write_fh; + child_args.hello_data_size_override = map_len; + child_args.next_open_fh = 920; fuse_daemon_thread(&child_args); _exit(0); } @@ -4021,42 +5411,70 @@ static int ext_test_shared_mmap_dirty_then_pwrite_keeps_latest_data() { printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - addr = 
mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, f, 0); + addr = mmap(NULL, map_len, PROT_READ, MAP_SHARED, f, 0); if (addr == MAP_FAILED) { printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); + close(f); goto fail; } c = ((volatile char *)addr)[0]; - if (c != 'h') { - printf("[FAIL] shared writable mmap first byte got=%d\n", c); + if (c != 'A') { + printf("[FAIL] first page byte got=%d\n", c); goto fail; } - ((volatile char *)addr)[1] = 'M'; - if (pwrite(f, "P", 1, 1) != 1) { - printf("[FAIL] pwrite over dirty mmap byte: %s (errno=%d)\n", strerror(errno), errno); + c = ((volatile char *)addr)[page_size]; + if (c != 'O') { + printf("[FAIL] second page byte got=%d\n", c); goto fail; } - if (((volatile char *)addr)[1] != 'P') { - printf("[FAIL] mmap cache was not updated by overlapping pwrite got=%d\n", - ((volatile char *)addr)[1]); + if (shared->open_count != 1 || shared->read_count != 2) { + printf("[FAIL] before subrange mprotect counters open=%u read=%u\n", + shared->open_count, shared->read_count); goto fail; } - if (msync(addr, 4096, MS_SYNC) != 0) { - printf("[FAIL] msync(shared dirty pwrite): %s (errno=%d)\n", strerror(errno), errno); + if (mprotect((char *)addr + page_size, page_size, PROT_READ | PROT_WRITE) != 0) { + printf("[FAIL] subrange mprotect(shared writable): %s (errno=%d)\n", strerror(errno), + errno); goto fail; } - if (shared->write_count < 2 || shared->last_write_offset != 0 || - shared->last_write_size != 16 || shared->last_write_flags != FUSE_WRITE_CACHE || - shared->last_write_watch_byte != 'P') { - printf("[FAIL] dirty mmap pwrite counters read=%u write=%u off=%llu size=%u flags=%u watched=%u\n", - shared->read_count, shared->write_count, - (unsigned long long)shared->last_write_offset, shared->last_write_size, - shared->last_write_flags, shared->last_write_watch_byte); + ((volatile char *)addr)[page_size + 1] = 'S'; + if (msync((char *)addr + page_size, page_size, MS_SYNC) != 0) { + printf("[FAIL] msync(subrange 
shared writable): %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + if (shared->write_count != 1 || shared->last_write_fh != 920) { + printf("[FAIL] subrange writeback counters write=%u wfh=%llu\n", shared->write_count, + (unsigned long long)shared->last_write_fh); + goto fail; + } + if (mprotect(addr, page_size, PROT_NONE) != 0) { + printf("[FAIL] mprotect(PROT_NONE first page): %s (errno=%d)\n", strerror(errno), errno); goto fail; } - munmap(addr, 4096); + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = fuse_sigsegv_longjmp_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, &old_segv) != 0) { + printf("[FAIL] sigaction(SIGSEGV): %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + segv_handler_installed = true; + g_fuse_sigsegv_seen = 0; + if (sigsetjmp(g_fuse_sigsegv_jmp, 1) == 0) { + c = ((volatile char *)addr)[0]; + (void)c; + } + sigaction(SIGSEGV, &old_segv, NULL); + segv_handler_installed = false; + if (!g_fuse_sigsegv_seen) { + printf("[FAIL] first page remained readable after PROT_NONE\n"); + goto fail; + } + + munmap(addr, map_len); addr = MAP_FAILED; close(f); f = -1; @@ -4069,8 +5487,11 @@ static int ext_test_shared_mmap_dirty_then_pwrite_keeps_latest_data() { return 0; fail: + if (segv_handler_installed) { + sigaction(SIGSEGV, &old_segv, NULL); + } if (addr != MAP_FAILED) { - munmap(addr, 4096); + munmap(addr, map_len); } if (f >= 0) { close(f); @@ -4087,32 +5508,21 @@ static int ext_test_shared_mmap_dirty_then_pwrite_keeps_latest_data() { return -1; } -static int ext_test_shared_writable_mmap_osync_writeback() { - const char *mp = "/tmp/test_fuse_mmap_shared_osync"; +static int ext_test_shared_mmap_unfaulted_mprotect_prot_none() { + const char *mp = "/tmp/test_fuse_mmap_unfaulted_mprotect"; + const size_t page_size = 4096; char path[256]; int f = -1; void *addr = MAP_FAILED; volatile char c = 0; pid_t daemon = -1; - const uint32_t expected_writeback_flags = FUSE_WRITE_CACHE; - const size_t 
page_size = 4096; - const size_t map_len = page_size * 2; - const char marker = 'Z'; + struct sigaction old_segv; + bool segv_handler_installed = false; struct mmap_shared_state { volatile int stop; - volatile int init_done; - volatile uint32_t open_count; - volatile uint32_t read_count; - volatile uint32_t write_count; - volatile uint32_t fsync_count; - volatile uint64_t last_write_fh; - volatile uint64_t last_write_offset; - volatile uint32_t last_write_size; - volatile uint32_t last_write_flags; - volatile uint32_t last_write_open_flags; - volatile uint64_t last_fsync_fh; - volatile uint32_t write_count_at_fsync; - volatile uint32_t last_write_flags_at_fsync; + volatile int init_done; + volatile uint32_t open_count; + volatile uint32_t read_count; }; struct mmap_shared_state *shared = (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, @@ -4151,22 +5561,10 @@ static int ext_test_shared_writable_mmap_osync_writeback() { child_args.fd = fd; child_args.stop = &shared->stop; child_args.init_done = &shared->init_done; - child_args.enable_write_ops = 1; child_args.stop_on_destroy = 1; child_args.open_count = &shared->open_count; child_args.read_count = &shared->read_count; - child_args.write_count = &shared->write_count; - child_args.fsync_count = &shared->fsync_count; - child_args.last_write_fh = &shared->last_write_fh; - child_args.last_write_offset = &shared->last_write_offset; - child_args.last_write_size = &shared->last_write_size; - child_args.last_write_flags = &shared->last_write_flags; - child_args.last_write_open_flags = &shared->last_write_open_flags; - child_args.last_fsync_fh = &shared->last_fsync_fh; - child_args.write_count_at_fsync = &shared->write_count_at_fsync; - child_args.last_write_flags_at_fsync = &shared->last_write_flags_at_fsync; - child_args.next_open_fh = 930; - child_args.hello_data_size_override = map_len; + child_args.hello_data_size_override = page_size; fuse_daemon_thread(&child_args); _exit(0); } @@ 
-4190,46 +5588,53 @@ static int ext_test_shared_writable_mmap_osync_writeback() { } snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDWR | O_SYNC); + f = open(path, O_RDWR); if (f < 0) { printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - addr = mmap(NULL, map_len, PROT_READ | PROT_WRITE, MAP_SHARED, f, 0); + addr = mmap(NULL, page_size, PROT_READ, MAP_SHARED, f, 0); if (addr == MAP_FAILED) { printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); close(f); goto fail; } - - c = ((volatile char *)addr)[0]; - if (c != 'A') { - printf("[FAIL] shared writable mmap first byte got=%d\n", c); + if (shared->open_count != 1 || shared->read_count != 0) { + printf("[FAIL] before unfaulted mprotect counters open=%u read=%u\n", + shared->open_count, shared->read_count); goto fail; } - ((volatile char *)addr)[2] = 'F'; - if (pwrite(f, &marker, 1, (off_t)page_size) != 1) { - printf("[FAIL] pwrite(O_SYNC): %s (errno=%d)\n", strerror(errno), errno); + if (mprotect(addr, page_size, PROT_NONE) != 0) { + printf("[FAIL] mprotect(PROT_NONE unfaulted): %s (errno=%d)\n", strerror(errno), errno); goto fail; } - if (shared->open_count != 1 || shared->read_count != 1 || shared->write_count != 2 || - shared->fsync_count != 1 || shared->last_write_fh != 930 || shared->last_fsync_fh != 930 || - shared->last_write_offset != 0 || shared->last_write_size != page_size || - shared->last_write_flags != expected_writeback_flags || shared->last_write_open_flags != 0 || - shared->write_count_at_fsync != 2 || - shared->last_write_flags_at_fsync != expected_writeback_flags) { - printf("[FAIL] shared mmap osync counters open=%u read=%u write=%u fsync=%u wfh=%llu fsh=%llu off=%llu size=%u wflags=%u oflags=%u fsync_writes=%u fsync_wflags=%u\n", - shared->open_count, shared->read_count, shared->write_count, shared->fsync_count, - (unsigned long long)shared->last_write_fh, - (unsigned long long)shared->last_fsync_fh, - (unsigned long 
long)shared->last_write_offset, shared->last_write_size, - shared->last_write_flags, shared->last_write_open_flags, - shared->write_count_at_fsync, shared->last_write_flags_at_fsync); + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = fuse_sigsegv_longjmp_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, &old_segv) != 0) { + printf("[FAIL] sigaction(SIGSEGV): %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + segv_handler_installed = true; + g_fuse_sigsegv_seen = 0; + if (sigsetjmp(g_fuse_sigsegv_jmp, 1) == 0) { + c = ((volatile char *)addr)[0]; + (void)c; + } + sigaction(SIGSEGV, &old_segv, NULL); + segv_handler_installed = false; + if (!g_fuse_sigsegv_seen) { + printf("[FAIL] unfaulted PROT_NONE mapping remained readable\n"); + goto fail; + } + if (shared->read_count != 0) { + printf("[FAIL] unfaulted PROT_NONE triggered read_count=%u\n", shared->read_count); goto fail; } - munmap(addr, map_len); + munmap(addr, page_size); addr = MAP_FAILED; close(f); f = -1; @@ -4242,8 +5647,11 @@ static int ext_test_shared_writable_mmap_osync_writeback() { return 0; fail: + if (segv_handler_installed) { + sigaction(SIGSEGV, &old_segv, NULL); + } if (addr != MAP_FAILED) { - munmap(addr, map_len); + munmap(addr, page_size); } if (f >= 0) { close(f); @@ -4260,20 +5668,22 @@ static int ext_test_shared_writable_mmap_osync_writeback() { return -1; } -static int ext_test_shared_mmap_mprotect_writeback() { - const char *mp = "/tmp/test_fuse_mmap_mprotect_write"; +static int ext_test_mmap_truncate_unmaps_stale_page() { + const char *mp = "/tmp/test_fuse_mmap_truncate"; + const size_t page_size = 4096; + const size_t map_len = page_size * 2; char path[256]; int f = -1; void *addr = MAP_FAILED; volatile char c = 0; pid_t daemon = -1; + struct sigaction old_bus; + bool bus_handler_installed = false; struct mmap_shared_state { volatile int stop; volatile int init_done; volatile uint32_t open_count; volatile uint32_t read_count; - volatile 
uint32_t write_count; - volatile uint64_t last_write_fh; }; struct mmap_shared_state *shared = (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, @@ -4312,13 +5722,11 @@ static int ext_test_shared_mmap_mprotect_writeback() { child_args.fd = fd; child_args.stop = &shared->stop; child_args.init_done = &shared->init_done; - child_args.enable_write_ops = 1; child_args.stop_on_destroy = 1; + child_args.enable_write_ops = 1; child_args.open_count = &shared->open_count; child_args.read_count = &shared->read_count; - child_args.write_count = &shared->write_count; - child_args.last_write_fh = &shared->last_write_fh; - child_args.next_open_fh = 910; + child_args.hello_data_size_override = map_len; fuse_daemon_thread(&child_args); _exit(0); } @@ -4347,42 +5755,54 @@ static int ext_test_shared_mmap_mprotect_writeback() { printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); goto fail; } - addr = mmap(NULL, 4096, PROT_READ, MAP_SHARED, f, 0); + addr = mmap(NULL, map_len, PROT_READ, MAP_PRIVATE, f, 0); if (addr == MAP_FAILED) { printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); close(f); goto fail; } - c = ((volatile char *)addr)[0]; - if (c != 'h') { - printf("[FAIL] mmap first byte got=%d\n", c); + c = ((volatile char *)addr)[page_size]; + if (c != 'O') { + printf("[FAIL] second page byte before truncate got=%d\n", c); goto fail; } if (shared->open_count != 1 || shared->read_count != 1) { - printf("[FAIL] before mprotect counters open=%u read=%u\n", shared->open_count, + printf("[FAIL] before truncate counters open=%u read=%u\n", shared->open_count, shared->read_count); goto fail; } - if (mprotect(addr, 4096, PROT_READ | PROT_WRITE) != 0) { - printf("[FAIL] mprotect shared writable FUSE mapping: %s (errno=%d)\n", strerror(errno), - errno); + if (ftruncate(f, page_size) != 0) { + printf("[FAIL] ftruncate: %s (errno=%d)\n", strerror(errno), errno); goto fail; } - ((volatile char *)addr)[2] = 'P'; - if 
(msync(addr, 4096, MS_SYNC) != 0) { - printf("[FAIL] msync(after mprotect): %s (errno=%d)\n", strerror(errno), errno); + + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = fuse_sigbus_longjmp_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGBUS, &sa, &old_bus) != 0) { + printf("[FAIL] sigaction(SIGBUS): %s (errno=%d)\n", strerror(errno), errno); goto fail; } - if (shared->open_count != 1 || shared->read_count != 1 || shared->write_count != 1 || - shared->last_write_fh != 910) { - printf("[FAIL] after mprotect counters open=%u read=%u write=%u wfh=%llu\n", - shared->open_count, shared->read_count, shared->write_count, - (unsigned long long)shared->last_write_fh); + bus_handler_installed = true; + g_fuse_sigbus_seen = 0; + if (sigsetjmp(g_fuse_sigbus_jmp, 1) == 0) { + c = ((volatile char *)addr)[page_size]; + (void)c; + } + sigaction(SIGBUS, &old_bus, NULL); + bus_handler_installed = false; + if (!g_fuse_sigbus_seen) { + printf("[FAIL] truncated second page remained readable read=%u\n", shared->read_count); + goto fail; + } + if (shared->read_count != 1) { + printf("[FAIL] truncated EOF fault issued extra FUSE_READ count=%u\n", shared->read_count); goto fail; } - munmap(addr, 4096); + munmap(addr, map_len); addr = MAP_FAILED; close(f); f = -1; @@ -4395,8 +5815,11 @@ static int ext_test_shared_mmap_mprotect_writeback() { return 0; fail: + if (bus_handler_installed) { + sigaction(SIGBUS, &old_bus, NULL); + } if (addr != MAP_FAILED) { - munmap(addr, 4096); + munmap(addr, map_len); } if (f >= 0) { close(f); @@ -4413,830 +5836,831 @@ static int ext_test_shared_mmap_mprotect_writeback() { return -1; } -static int ext_test_shared_mmap_readonly_fd_mprotect_write_denied() { - const char *mp = "/tmp/test_fuse_mmap_readonly_mprotect"; +static int ext_test_fadvise_without_page_cache() { + const char *mp = "/tmp/test_fuse_fadvise"; char path[256]; int f = -1; - void *addr = MAP_FAILED; - volatile char c = 0; - pid_t daemon = -1; - struct 
mmap_shared_state { - volatile int stop; - volatile int init_done; - volatile uint32_t open_count; - volatile uint32_t read_count; - volatile uint32_t write_count; - volatile uint64_t last_write_fh; + const int advices[] = { + POSIX_FADV_NORMAL, POSIX_FADV_RANDOM, POSIX_FADV_SEQUENTIAL, + POSIX_FADV_WILLNEED, POSIX_FADV_DONTNEED, + POSIX_FADV_NOREUSE, }; - struct mmap_shared_state *shared = - (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (shared == MAP_FAILED) { - printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); + + if (ensure_dir(mp) != 0) { + printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); return -1; } - memset(shared, 0, sizeof(*shared)); + + int fd = open("/dev/fuse", O_RDWR); + if (fd < 0) { + printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); + rmdir(mp); + return -1; + } + + volatile int stop = 0; + volatile int init_done = 0; + + struct fuse_daemon_args args; + memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.stop_on_destroy = 1; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); + close(fd); + rmdir(mp); + return -1; + } + + char opts[256]; + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", fd); + if (mount("none", mp, "fuse", 0, opts) != 0) { + printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } + if (fuseg_wait_init(&init_done) != 0) { + printf("[FAIL] init handshake timeout\n"); + goto fail; + } + + snprintf(path, sizeof(path), "%s/hello.txt", mp); + f = open(path, O_RDONLY); + if (f < 0) { + printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + goto fail; + } + + for (size_t i = 0; i < sizeof(advices) / 
sizeof(advices[0]); i++) { + int rc = posix_fadvise(f, 0, 0, advices[i]); + if (rc != 0) { + printf("[FAIL] posix_fadvise(advice=%d): rc=%d\n", advices[i], rc); + goto fail; + } + } + + if (posix_fadvise(f, 0, -1, POSIX_FADV_NORMAL) != EINVAL) { + printf("[FAIL] posix_fadvise negative len should return EINVAL\n"); + goto fail; + } + + close(f); + f = -1; + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return 0; + +fail: + if (f >= 0) { + close(f); + } + umount(mp); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; +} + +static int ext_test_mount_on_fuse_dir_uses_namespace_path() { + const char *mp = "/tmp/test_fuse_mount_target"; + char dir_path[512]; + char marker_path[1024]; + int ramfs_mounted = 0; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); - munmap(shared, sizeof(*shared)); return -1; } int fd = open("/dev/fuse", O_RDWR); if (fd < 0) { printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); - munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - daemon = fork(); - if (daemon < 0) { - printf("[FAIL] fork fuse daemon: %s (errno=%d)\n", strerror(errno), errno); + volatile int stop = 0; + volatile int init_done = 0; + + struct fuse_daemon_args args; + memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.enable_write_ops = 1; + args.stop_on_destroy = 1; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); close(fd); - munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - if (daemon == 0) { - struct fuse_daemon_args child_args; - memset(&child_args, 0, sizeof(child_args)); - child_args.fd = fd; - child_args.stop = &shared->stop; - child_args.init_done = &shared->init_done; - child_args.enable_write_ops = 1; - child_args.stop_on_destroy = 1; - child_args.open_count = &shared->open_count; - 
child_args.read_count = &shared->read_count; - child_args.write_count = &shared->write_count; - child_args.last_write_fh = &shared->last_write_fh; - child_args.next_open_fh = 930; - fuse_daemon_thread(&child_args); - _exit(0); - } char opts[256]; - snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", - fd); + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", fd); if (mount("none", mp, "fuse", 0, opts) != 0) { printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); - shared->stop = 1; + stop = 1; close(fd); - kill(daemon, SIGTERM); - waitpid(daemon, NULL, 0); - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); rmdir(mp); return -1; } - if (fuseg_wait_init(&shared->init_done) != 0) { + if (fuseg_wait_init(&init_done) != 0) { printf("[FAIL] init handshake timeout\n"); goto fail; } - snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDONLY); - if (f < 0) { - printf("[FAIL] open(%s, O_RDONLY): %s (errno=%d)\n", path, strerror(errno), errno); + snprintf(dir_path, sizeof(dir_path), "%s/ramfs_target", mp); + if (mkdir(dir_path, 0755) != 0) { + printf("[FAIL] mkdir(%s): %s (errno=%d)\n", dir_path, strerror(errno), errno); goto fail; } - addr = mmap(NULL, 4096, PROT_READ, MAP_SHARED, f, 0); - if (addr == MAP_FAILED) { - printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); - close(f); + + if (mount("", dir_path, "ramfs", 0, NULL) != 0) { + printf("[FAIL] mount(ramfs on fuse dir): %s (errno=%d)\n", strerror(errno), errno); goto fail; } + ramfs_mounted = 1; - c = ((volatile char *)addr)[0]; - if (c != 'h') { - printf("[FAIL] readonly shared mmap first byte got=%d\n", c); + snprintf(marker_path, sizeof(marker_path), "%s/marker", dir_path); + if (fuseg_write_file(marker_path, "mounted") != 0) { + printf("[FAIL] write marker under ramfs: %s (errno=%d)\n", strerror(errno), errno); goto fail; } - errno = 0; - if (mprotect(addr, 4096, PROT_READ | PROT_WRITE) == 
0) { - printf("[FAIL] mprotect unexpectedly allowed write upgrade on readonly fd\n"); - goto fail; + + if (umount(dir_path) != 0) { + printf("[FAIL] umount(%s): %s (errno=%d)\n", dir_path, strerror(errno), errno); + goto fail_no_ramfs_umount; } - if (shared->open_count != 1 || shared->read_count != 1 || shared->write_count != 0 || - shared->last_write_fh != 0) { - printf("[FAIL] readonly mprotect counters open=%u read=%u write=%u wfh=%llu\n", - shared->open_count, shared->read_count, shared->write_count, - (unsigned long long)shared->last_write_fh); + ramfs_mounted = 0; + if (rmdir(dir_path) != 0) { + printf("[FAIL] rmdir(%s): %s (errno=%d)\n", dir_path, strerror(errno), errno); goto fail; } - munmap(addr, 4096); - addr = MAP_FAILED; - close(f); - f = -1; umount(mp); - shared->stop = 1; + stop = 1; close(fd); - waitpid(daemon, NULL, 0); - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); rmdir(mp); return 0; fail: - if (addr != MAP_FAILED) { - munmap(addr, 4096); - } - if (f >= 0) { - close(f); + if (ramfs_mounted) { + umount(dir_path); } +fail_no_ramfs_umount: + rmdir(dir_path); umount(mp); - shared->stop = 1; + stop = 1; close(fd); - if (daemon > 0) { - kill(daemon, SIGTERM); - waitpid(daemon, NULL, 0); - } - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); rmdir(mp); return -1; } -static int ext_test_shared_writable_mmap_munmap_writeback_without_msync() { - const char *mp = "/tmp/test_fuse_mmap_munmap_writeback"; - char path[256]; - int f = -1; - void *addr = MAP_FAILED; - const uint32_t expected_writeback_flags = FUSE_WRITE_CACHE; - pid_t daemon = -1; - struct mmap_shared_state { - volatile int stop; - volatile int init_done; - volatile uint32_t open_count; - volatile uint32_t read_count; - volatile uint32_t write_count; - volatile uint64_t last_write_fh; - volatile uint64_t last_write_offset; - volatile uint32_t last_write_size; - volatile uint32_t last_write_flags; - volatile uint32_t last_write_open_flags; - }; - struct mmap_shared_state 
*shared = - (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (shared == MAP_FAILED) { - printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); - return -1; - } - memset(shared, 0, sizeof(*shared)); +static int ext_test_rename_updates_fuse_dir_cwd_path() { + const char *mp = "/tmp/test_fuse_rename_path"; + char old_path[512]; + char new_path[512]; + char cwd[512]; + int dir_fd = -1; + int ramfs_mounted = 0; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); - munmap(shared, sizeof(*shared)); return -1; } int fd = open("/dev/fuse", O_RDWR); if (fd < 0) { printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); - munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - daemon = fork(); - if (daemon < 0) { - printf("[FAIL] fork fuse daemon: %s (errno=%d)\n", strerror(errno), errno); + volatile int stop = 0; + volatile int init_done = 0; + + struct fuse_daemon_args args; + memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.enable_write_ops = 1; + args.stop_on_destroy = 1; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); close(fd); - munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - if (daemon == 0) { - struct fuse_daemon_args child_args; - memset(&child_args, 0, sizeof(child_args)); - child_args.fd = fd; - child_args.stop = &shared->stop; - child_args.init_done = &shared->init_done; - child_args.enable_write_ops = 1; - child_args.stop_on_destroy = 1; - child_args.open_count = &shared->open_count; - child_args.read_count = &shared->read_count; - child_args.write_count = &shared->write_count; - child_args.last_write_fh = &shared->last_write_fh; - child_args.last_write_offset = &shared->last_write_offset; - child_args.last_write_size = &shared->last_write_size; - 
child_args.last_write_flags = &shared->last_write_flags; - child_args.last_write_open_flags = &shared->last_write_open_flags; - child_args.next_open_fh = 940; - fuse_daemon_thread(&child_args); - _exit(0); - } char opts[256]; - snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", - fd); + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", fd); if (mount("none", mp, "fuse", 0, opts) != 0) { printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); - shared->stop = 1; + stop = 1; close(fd); - kill(daemon, SIGTERM); - waitpid(daemon, NULL, 0); - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); rmdir(mp); return -1; } - if (fuseg_wait_init(&shared->init_done) != 0) { + if (fuseg_wait_init(&init_done) != 0) { printf("[FAIL] init handshake timeout\n"); goto fail; } - snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDWR); - if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + snprintf(old_path, sizeof(old_path), "%s/old_dir", mp); + snprintf(new_path, sizeof(new_path), "%s/new_dir", mp); + if (mkdir(old_path, 0755) != 0) { + printf("[FAIL] mkdir(%s): %s (errno=%d)\n", old_path, strerror(errno), errno); goto fail; } - addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, f, 0); - if (addr == MAP_FAILED) { - printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); - close(f); + dir_fd = open(old_path, O_RDONLY | O_DIRECTORY); + if (dir_fd < 0) { + printf("[FAIL] open dir fd %s: %s (errno=%d)\n", old_path, strerror(errno), errno); goto fail; } - - if (((volatile char *)addr)[0] != 'h') { - printf("[FAIL] shared close-writeback mmap first byte got=%d\n", - ((volatile char *)addr)[0]); + if (rename(old_path, new_path) != 0) { + printf("[FAIL] rename(%s -> %s): %s (errno=%d)\n", old_path, new_path, strerror(errno), + errno); goto fail; } - ((volatile char *)addr)[3] = 'C'; - if (munmap(addr, 4096) != 0) { - 
printf("[FAIL] munmap(shared writable mmap): %s (errno=%d)\n", strerror(errno), errno); + if (fchdir(dir_fd) != 0) { + printf("[FAIL] fchdir renamed dir fd: %s (errno=%d)\n", strerror(errno), errno); goto fail; } - addr = MAP_FAILED; + if (!getcwd(cwd, sizeof(cwd))) { + printf("[FAIL] getcwd after rename: %s (errno=%d)\n", strerror(errno), errno); + goto fail_chdir_root; + } + if (strcmp(cwd, new_path) != 0) { + printf("[FAIL] getcwd after rename: got '%s', want '%s'\n", cwd, new_path); + goto fail_chdir_root; + } + if (chdir("/") != 0) { + printf("[FAIL] chdir(/): %s (errno=%d)\n", strerror(errno), errno); + goto fail; + } + close(dir_fd); + dir_fd = -1; - if (shared->open_count != 1 || shared->read_count != 1 || shared->write_count != 1 || - shared->last_write_fh != 940 || shared->last_write_offset != 0 || - shared->last_write_size != 16 || shared->last_write_flags != expected_writeback_flags || - shared->last_write_open_flags != 0) { - printf("[FAIL] munmap writeback counters open=%u read=%u write=%u wfh=%llu off=%llu size=%u wflags=%u oflags=%u\n", - shared->open_count, shared->read_count, shared->write_count, - (unsigned long long)shared->last_write_fh, - (unsigned long long)shared->last_write_offset, shared->last_write_size, - shared->last_write_flags, shared->last_write_open_flags); + if (mount("", new_path, "ramfs", 0, NULL) != 0) { + printf("[FAIL] mount(ramfs on renamed fuse dir): %s (errno=%d)\n", strerror(errno), + errno); + goto fail; + } + ramfs_mounted = 1; + if (umount(new_path) != 0) { + printf("[FAIL] umount(%s): %s (errno=%d)\n", new_path, strerror(errno), errno); + goto fail_no_ramfs_umount; + } + ramfs_mounted = 0; + if (rmdir(new_path) != 0) { + printf("[FAIL] rmdir(%s): %s (errno=%d)\n", new_path, strerror(errno), errno); goto fail; } - close(f); - f = -1; umount(mp); - shared->stop = 1; + stop = 1; close(fd); - waitpid(daemon, NULL, 0); - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); rmdir(mp); return 0; +fail_chdir_root: + { + 
int ignored_chdir = chdir("/"); + (void)ignored_chdir; + } fail: - if (addr != MAP_FAILED) { - munmap(addr, 4096); + if (dir_fd >= 0) { + close(dir_fd); } - if (f >= 0) { - close(f); + if (ramfs_mounted) { + umount(new_path); } +fail_no_ramfs_umount: + rmdir(new_path); + rmdir(old_path); umount(mp); - shared->stop = 1; + stop = 1; close(fd); - if (daemon > 0) { - kill(daemon, SIGTERM); - waitpid(daemon, NULL, 0); - } - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); rmdir(mp); return -1; } -static int ext_test_shared_mmap_subrange_mprotect_writeback_preserves_vma() { - const char *mp = "/tmp/test_fuse_mmap_mprotect_subrange"; - const size_t page_size = 4096; - const size_t map_len = page_size * 2; - char path[256]; - int f = -1; - void *addr = MAP_FAILED; - volatile char c = 0; - pid_t daemon = -1; - struct sigaction old_segv; - bool segv_handler_installed = false; - struct mmap_shared_state { - volatile int stop; - volatile int init_done; - volatile uint32_t open_count; - volatile uint32_t read_count; - volatile uint32_t write_count; - volatile uint64_t last_write_fh; - }; - struct mmap_shared_state *shared = - (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (shared == MAP_FAILED) { - printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); - return -1; - } - memset(shared, 0, sizeof(*shared)); +static int ext_test_lookup_nodes_forgotten_before_umount_when_unreferenced() { + const char *mp = "/tmp/test_fuse_lookup_lifetime"; + char parent_path[512]; + char child_path[512]; + struct stat st; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); - munmap(shared, sizeof(*shared)); return -1; } int fd = open("/dev/fuse", O_RDWR); if (fd < 0) { printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); - munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - daemon = fork(); - if (daemon < 
0) { - printf("[FAIL] fork fuse daemon: %s (errno=%d)\n", strerror(errno), errno); + volatile int stop = 0; + volatile int init_done = 0; + volatile uint32_t forget_count = 0; + volatile uint64_t forget_nlookup_sum = 0; + volatile uint64_t forget_trace_nodeids[32] = {0}; + volatile uint64_t forget_trace_nlookups[32] = {0}; + volatile uint32_t destroy_count = 0; + uint32_t forget_count_before_umount = 0; + uint64_t forget_sum_before_umount = 0; + uint32_t distinct_nonroot_before_umount = 0; + + struct fuse_daemon_args args; + memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.enable_write_ops = 1; + args.stop_on_destroy = 1; + args.forget_count = &forget_count; + args.forget_nlookup_sum = &forget_nlookup_sum; + args.forget_trace_nodeids = forget_trace_nodeids; + args.forget_trace_nlookups = forget_trace_nlookups; + args.forget_trace_capacity = 32; + args.destroy_count = &destroy_count; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); close(fd); - munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - if (daemon == 0) { - struct fuse_daemon_args child_args; - memset(&child_args, 0, sizeof(child_args)); - child_args.fd = fd; - child_args.stop = &shared->stop; - child_args.init_done = &shared->init_done; - child_args.stop_on_destroy = 1; - child_args.enable_write_ops = 1; - child_args.open_count = &shared->open_count; - child_args.read_count = &shared->read_count; - child_args.write_count = &shared->write_count; - child_args.last_write_fh = &shared->last_write_fh; - child_args.hello_data_size_override = map_len; - child_args.next_open_fh = 920; - fuse_daemon_thread(&child_args); - _exit(0); - } char opts[256]; - snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", - fd); + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", fd); if (mount("none", mp, "fuse", 0, opts) != 0) { 
printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); - shared->stop = 1; + stop = 1; close(fd); - kill(daemon, SIGTERM); - waitpid(daemon, NULL, 0); - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); rmdir(mp); return -1; } - if (fuseg_wait_init(&shared->init_done) != 0) { + if (fuseg_wait_init(&init_done) != 0) { printf("[FAIL] init handshake timeout\n"); goto fail; } - snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDWR); - if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + snprintf(parent_path, sizeof(parent_path), "%s/parent", mp); + snprintf(child_path, sizeof(child_path), "%s/parent/child", mp); + if (mkdir(parent_path, 0755) != 0) { + printf("[FAIL] mkdir(%s): %s (errno=%d)\n", parent_path, strerror(errno), errno); goto fail; } - addr = mmap(NULL, map_len, PROT_READ, MAP_SHARED, f, 0); - if (addr == MAP_FAILED) { - printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); - close(f); + if (mkdir(child_path, 0755) != 0) { + printf("[FAIL] mkdir(%s): %s (errno=%d)\n", child_path, strerror(errno), errno); goto fail; } - - c = ((volatile char *)addr)[0]; - if (c != 'A') { - printf("[FAIL] first page byte got=%d\n", c); + if (stat(child_path, &st) != 0 || !S_ISDIR(st.st_mode)) { + printf("[FAIL] stat(%s): %s (errno=%d)\n", child_path, strerror(errno), errno); goto fail; } - c = ((volatile char *)addr)[page_size]; - if (c != 'O') { - printf("[FAIL] second page byte got=%d\n", c); + if (stat(parent_path, &st) != 0 || !S_ISDIR(st.st_mode)) { + printf("[FAIL] stat(%s) after child lookup: %s (errno=%d)\n", parent_path, + strerror(errno), errno); goto fail; } - if (shared->open_count != 1 || shared->read_count != 2) { - printf("[FAIL] before subrange mprotect counters open=%u read=%u\n", - shared->open_count, shared->read_count); - goto fail; + + for (int i = 0; i < 200 && forget_nlookup_sum < 2; i++) { + usleep(10 * 1000); } - if (mprotect((char *)addr + 
page_size, page_size, PROT_READ | PROT_WRITE) != 0) { - printf("[FAIL] subrange mprotect(shared writable): %s (errno=%d)\n", strerror(errno), - errno); + if (forget_count == 0 || forget_nlookup_sum < 2) { + printf("[FAIL] unreferenced FUSE lookup nodes not forgotten before umount: " + "count=%u nlookup=%llu\n", + forget_count, (unsigned long long)forget_nlookup_sum); goto fail; } - ((volatile char *)addr)[page_size + 1] = 'S'; - if (msync((char *)addr + page_size, page_size, MS_SYNC) != 0) { - printf("[FAIL] msync(subrange shared writable): %s (errno=%d)\n", strerror(errno), errno); - goto fail; + for (uint32_t i = 0; i < forget_count && i < 32; i++) { + if (forget_trace_nodeids[i] == 1) { + printf("[FAIL] root node unexpectedly forgotten before umount at index=%u " + "nlookup=%llu\n", + i, (unsigned long long)forget_trace_nlookups[i]); + goto fail; + } } - if (shared->write_count != 1 || shared->last_write_fh != 920) { - printf("[FAIL] subrange writeback counters write=%u wfh=%llu\n", shared->write_count, - (unsigned long long)shared->last_write_fh); - goto fail; + distinct_nonroot_before_umount = 0; + for (uint32_t i = 0; i < forget_count && i < 32; i++) { + if (forget_trace_nodeids[i] == 0 || forget_trace_nodeids[i] == 1) { + continue; + } + bool seen = false; + for (uint32_t j = 0; j < i; j++) { + if (forget_trace_nodeids[j] == forget_trace_nodeids[i]) { + seen = true; + break; + } + } + if (!seen) { + distinct_nonroot_before_umount++; + } } - if (mprotect(addr, page_size, PROT_NONE) != 0) { - printf("[FAIL] mprotect(PROT_NONE first page): %s (errno=%d)\n", strerror(errno), errno); + if (distinct_nonroot_before_umount < 2) { + printf("[FAIL] expected at least two distinct non-root nodes forgotten before umount, " + "got=%u count=%u nlookup=%llu\n", + distinct_nonroot_before_umount, forget_count, + (unsigned long long)forget_nlookup_sum); goto fail; } - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = fuse_sigsegv_longjmp_handler; - 
sigemptyset(&sa.sa_mask); - if (sigaction(SIGSEGV, &sa, &old_segv) != 0) { - printf("[FAIL] sigaction(SIGSEGV): %s (errno=%d)\n", strerror(errno), errno); + forget_count_before_umount = forget_count; + forget_sum_before_umount = forget_nlookup_sum; + + if (umount(mp) != 0) { + printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); goto fail; } - segv_handler_installed = true; - g_fuse_sigsegv_seen = 0; - if (sigsetjmp(g_fuse_sigsegv_jmp, 1) == 0) { - c = ((volatile char *)addr)[0]; - (void)c; + for (int i = 0; i < 200 && destroy_count == 0; i++) { + usleep(10 * 1000); } - sigaction(SIGSEGV, &old_segv, NULL); - segv_handler_installed = false; - if (!g_fuse_sigsegv_seen) { - printf("[FAIL] first page remained readable after PROT_NONE\n"); - goto fail; + if (destroy_count == 0) { + printf("[FAIL] timed out waiting for FUSE_DESTROY after umount\n"); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; } - - munmap(addr, map_len); - addr = MAP_FAILED; - close(f); - f = -1; - umount(mp); - shared->stop = 1; close(fd); - waitpid(daemon, NULL, 0); - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); + if (destroy_count != 1 || forget_count < forget_count_before_umount || + forget_nlookup_sum < forget_sum_before_umount) { + printf("[FAIL] FUSE teardown lost forget accounting or missed destroy: " + "forget=%u/%u nlookup=%llu/%llu destroy=%u\n", + forget_count, forget_count_before_umount, (unsigned long long)forget_nlookup_sum, + (unsigned long long)forget_sum_before_umount, destroy_count); + rmdir(mp); + return -1; + } + for (uint32_t i = 0; i < forget_count && i < 32; i++) { + if (forget_trace_nodeids[i] == 1) { + printf("[FAIL] root node unexpectedly forgotten at index=%u nlookup=%llu\n", i, + (unsigned long long)forget_trace_nlookups[i]); + rmdir(mp); + return -1; + } + } rmdir(mp); return 0; fail: - if (segv_handler_installed) { - sigaction(SIGSEGV, &old_segv, NULL); - } - if (addr != MAP_FAILED) { - munmap(addr, 
map_len); - } - if (f >= 0) { - close(f); - } umount(mp); - shared->stop = 1; + stop = 1; close(fd); - if (daemon > 0) { - kill(daemon, SIGTERM); - waitpid(daemon, NULL, 0); - } - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); rmdir(mp); return -1; } -static int ext_test_shared_mmap_unfaulted_mprotect_prot_none() { - const char *mp = "/tmp/test_fuse_mmap_unfaulted_mprotect"; - const size_t page_size = 4096; - char path[256]; - int f = -1; - void *addr = MAP_FAILED; - volatile char c = 0; - pid_t daemon = -1; - struct sigaction old_segv; - bool segv_handler_installed = false; - struct mmap_shared_state { - volatile int stop; - volatile int init_done; - volatile uint32_t open_count; - volatile uint32_t read_count; - }; - struct mmap_shared_state *shared = - (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (shared == MAP_FAILED) { - printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); - return -1; +static bool forget_trace_contains(volatile uint64_t *nodeids, uint32_t count, uint64_t nodeid) { + for (uint32_t i = 0; i < count && i < 32; i++) { + if (nodeids[i] == nodeid) { + return true; + } } - memset(shared, 0, sizeof(*shared)); + return false; +} + +static bool forget_trace_contains_pair(volatile uint64_t *nodeids, + volatile uint64_t *nlookups, + uint32_t count, uint64_t nodeid, + uint64_t nlookup) { + for (uint32_t i = 0; i < count && i < 32; i++) { + if (nodeids[i] == nodeid && nlookups[i] == nlookup) { + return true; + } + } + return false; +} + +static int ext_test_positive_lookup_cache_expires_and_forgets_before_umount() { + const char *mp = "/tmp/test_fuse_positive_lookup_lifetime"; + char parent_path[512]; + char child_path[512]; + char hello_path[512]; + struct stat st; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); - munmap(shared, sizeof(*shared)); return -1; } int fd = 
open("/dev/fuse", O_RDWR); if (fd < 0) { printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); - munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - daemon = fork(); - if (daemon < 0) { - printf("[FAIL] fork fuse daemon: %s (errno=%d)\n", strerror(errno), errno); + volatile int stop = 0; + volatile int init_done = 0; + volatile uint32_t forget_count = 0; + volatile uint64_t forget_nlookup_sum = 0; + volatile uint64_t forget_trace_nodeids[32] = {0}; + volatile uint64_t forget_trace_nlookups[32] = {0}; + volatile uint32_t destroy_count = 0; + uint32_t forget_count_before_umount = 0; + uint64_t forget_sum_before_umount = 0; + uint64_t parent_nodeid = 0; + uint64_t child_nodeid = 0; + + struct fuse_daemon_args args; + memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.enable_write_ops = 1; + args.stop_on_destroy = 1; + args.forget_count = &forget_count; + args.forget_nlookup_sum = &forget_nlookup_sum; + args.forget_trace_nodeids = forget_trace_nodeids; + args.forget_trace_nlookups = forget_trace_nlookups; + args.forget_trace_capacity = 32; + args.destroy_count = &destroy_count; + args.entry_valid_sec = 1; + args.attr_valid_sec = 1; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); close(fd); - munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - if (daemon == 0) { - struct fuse_daemon_args child_args; - memset(&child_args, 0, sizeof(child_args)); - child_args.fd = fd; - child_args.stop = &shared->stop; - child_args.init_done = &shared->init_done; - child_args.stop_on_destroy = 1; - child_args.open_count = &shared->open_count; - child_args.read_count = &shared->read_count; - child_args.hello_data_size_override = page_size; - fuse_daemon_thread(&child_args); - _exit(0); - } char opts[256]; - snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", - fd); + snprintf(opts, 
sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", fd); if (mount("none", mp, "fuse", 0, opts) != 0) { printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); - shared->stop = 1; + stop = 1; close(fd); - kill(daemon, SIGTERM); - waitpid(daemon, NULL, 0); - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); rmdir(mp); return -1; } - if (fuseg_wait_init(&shared->init_done) != 0) { + if (fuseg_wait_init(&init_done) != 0) { printf("[FAIL] init handshake timeout\n"); goto fail; } - snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDWR); - if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + snprintf(parent_path, sizeof(parent_path), "%s/parent", mp); + snprintf(child_path, sizeof(child_path), "%s/parent/child", mp); + snprintf(hello_path, sizeof(hello_path), "%s/hello.txt", mp); + if (mkdir(parent_path, 0755) != 0) { + printf("[FAIL] mkdir(%s): %s (errno=%d)\n", parent_path, strerror(errno), errno); goto fail; } - addr = mmap(NULL, page_size, PROT_READ, MAP_SHARED, f, 0); - if (addr == MAP_FAILED) { - printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); - close(f); + if (stat(parent_path, &st) != 0 || !S_ISDIR(st.st_mode)) { + printf("[FAIL] stat(%s): %s (errno=%d)\n", parent_path, strerror(errno), errno); goto fail; } - if (shared->open_count != 1 || shared->read_count != 0) { - printf("[FAIL] before unfaulted mprotect counters open=%u read=%u\n", - shared->open_count, shared->read_count); + parent_nodeid = (uint64_t)st.st_ino; + if (mkdir(child_path, 0755) != 0) { + printf("[FAIL] mkdir(%s): %s (errno=%d)\n", child_path, strerror(errno), errno); goto fail; } - if (mprotect(addr, page_size, PROT_NONE) != 0) { - printf("[FAIL] mprotect(PROT_NONE unfaulted): %s (errno=%d)\n", strerror(errno), errno); + if (stat(child_path, &st) != 0 || !S_ISDIR(st.st_mode)) { + printf("[FAIL] stat(%s): %s (errno=%d)\n", child_path, strerror(errno), errno); goto fail; } + 
child_nodeid = (uint64_t)st.st_ino; - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = fuse_sigsegv_longjmp_handler; - sigemptyset(&sa.sa_mask); - if (sigaction(SIGSEGV, &sa, &old_segv) != 0) { - printf("[FAIL] sigaction(SIGSEGV): %s (errno=%d)\n", strerror(errno), errno); + usleep(2000 * 1000); + if (stat(hello_path, &st) != 0 || !S_ISREG(st.st_mode)) { + printf("[FAIL] stat(%s) after TTL: %s (errno=%d)\n", hello_path, strerror(errno), errno); goto fail; } - segv_handler_installed = true; - g_fuse_sigsegv_seen = 0; - if (sigsetjmp(g_fuse_sigsegv_jmp, 1) == 0) { - c = ((volatile char *)addr)[0]; - (void)c; + + for (int i = 0; i < 200; i++) { + uint32_t count = forget_count; + if (forget_trace_contains(forget_trace_nodeids, count, parent_nodeid) && + forget_trace_contains(forget_trace_nodeids, count, child_nodeid)) { + break; + } + usleep(10 * 1000); } - sigaction(SIGSEGV, &old_segv, NULL); - segv_handler_installed = false; - if (!g_fuse_sigsegv_seen) { - printf("[FAIL] unfaulted PROT_NONE mapping remained readable\n"); + if (!forget_trace_contains(forget_trace_nodeids, forget_count, parent_nodeid) || + !forget_trace_contains(forget_trace_nodeids, forget_count, child_nodeid)) { + printf("[FAIL] positive TTL cache-only nodes were not forgotten before umount: " + "count=%u nlookup=%llu parent=%llu child=%llu saw_parent=%d saw_child=%d\n", + forget_count, (unsigned long long)forget_nlookup_sum, + (unsigned long long)parent_nodeid, (unsigned long long)child_nodeid, + forget_trace_contains(forget_trace_nodeids, forget_count, parent_nodeid), + forget_trace_contains(forget_trace_nodeids, forget_count, child_nodeid)); goto fail; } - if (shared->read_count != 0) { - printf("[FAIL] unfaulted PROT_NONE triggered read_count=%u\n", shared->read_count); + if (forget_trace_contains(forget_trace_nodeids, forget_count, 1)) { + printf("[FAIL] root node unexpectedly forgotten before umount\n"); goto fail; } - munmap(addr, page_size); - addr = MAP_FAILED; - 
close(f); - f = -1; - umount(mp); - shared->stop = 1; + forget_count_before_umount = forget_count; + forget_sum_before_umount = forget_nlookup_sum; + if (umount(mp) != 0) { + printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + goto fail; + } + for (int i = 0; i < 200 && destroy_count == 0; i++) { + usleep(10 * 1000); + } + if (destroy_count == 0) { + printf("[FAIL] timed out waiting for FUSE_DESTROY after umount\n"); + stop = 1; + close(fd); + pthread_join(th, NULL); + rmdir(mp); + return -1; + } close(fd); - waitpid(daemon, NULL, 0); - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); + if (destroy_count != 1 || forget_count < forget_count_before_umount || + forget_nlookup_sum < forget_sum_before_umount || + forget_trace_contains(forget_trace_nodeids, forget_count, 1)) { + printf("[FAIL] FUSE teardown regressed: forget=%u/%u nlookup=%llu/%llu destroy=%u " + "root_forget=%d\n", + forget_count, forget_count_before_umount, (unsigned long long)forget_nlookup_sum, + (unsigned long long)forget_sum_before_umount, destroy_count, + forget_trace_contains(forget_trace_nodeids, forget_count, 1)); + rmdir(mp); + return -1; + } rmdir(mp); return 0; fail: - if (segv_handler_installed) { - sigaction(SIGSEGV, &old_segv, NULL); - } - if (addr != MAP_FAILED) { - munmap(addr, page_size); - } - if (f >= 0) { - close(f); - } umount(mp); - shared->stop = 1; + stop = 1; close(fd); - if (daemon > 0) { - kill(daemon, SIGTERM); - waitpid(daemon, NULL, 0); - } - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); rmdir(mp); return -1; } -static int ext_test_mmap_truncate_unmaps_stale_page() { - const char *mp = "/tmp/test_fuse_mmap_truncate"; - const size_t page_size = 4096; - const size_t map_len = page_size * 2; - char path[256]; - int f = -1; - void *addr = MAP_FAILED; - volatile char c = 0; - pid_t daemon = -1; - struct sigaction old_bus; - bool bus_handler_installed = false; - struct mmap_shared_state { - volatile int stop; - volatile int 
init_done; - volatile uint32_t open_count; - volatile uint32_t read_count; - }; - struct mmap_shared_state *shared = - (struct mmap_shared_state *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (shared == MAP_FAILED) { - printf("[FAIL] mmap(shared counters): %s (errno=%d)\n", strerror(errno), errno); - return -1; - } - memset(shared, 0, sizeof(*shared)); +static int ext_test_active_directory_parent_survives_lookup_cache_prune() { + const char *mp = "/tmp/test_fuse_active_parent_prune"; + char parent_path[512]; + char child_path[512]; + char hello_path[512]; + struct stat st; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); - munmap(shared, sizeof(*shared)); return -1; } int fd = open("/dev/fuse", O_RDWR); if (fd < 0) { printf("[FAIL] open(/dev/fuse): %s (errno=%d)\n", strerror(errno), errno); - munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - daemon = fork(); - if (daemon < 0) { - printf("[FAIL] fork fuse daemon: %s (errno=%d)\n", strerror(errno), errno); + volatile int stop = 0; + volatile int init_done = 0; + volatile uint32_t destroy_count = 0; + volatile uint32_t lookup_count = 0; + uint32_t lookup_count_before_parent_relookup = 0; + + struct fuse_daemon_args args; + memset(&args, 0, sizeof(args)); + args.fd = fd; + args.stop = &stop; + args.init_done = &init_done; + args.enable_write_ops = 1; + args.stop_on_destroy = 1; + args.destroy_count = &destroy_count; + args.lookup_count = &lookup_count; + args.entry_valid_sec = 1; + args.attr_valid_sec = 1; + + pthread_t th; + if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { + printf("[FAIL] pthread_create\n"); close(fd); - munmap(shared, sizeof(*shared)); rmdir(mp); return -1; } - if (daemon == 0) { - struct fuse_daemon_args child_args; - memset(&child_args, 0, sizeof(child_args)); - child_args.fd = fd; - child_args.stop = &shared->stop; - child_args.init_done = &shared->init_done; - 
child_args.stop_on_destroy = 1; - child_args.enable_write_ops = 1; - child_args.open_count = &shared->open_count; - child_args.read_count = &shared->read_count; - child_args.hello_data_size_override = map_len; - fuse_daemon_thread(&child_args); - _exit(0); - } char opts[256]; - snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0,max_read=4096", - fd); + snprintf(opts, sizeof(opts), "fd=%d,rootmode=040755,user_id=0,group_id=0", fd); if (mount("none", mp, "fuse", 0, opts) != 0) { printf("[FAIL] mount(fuse): %s (errno=%d)\n", strerror(errno), errno); - shared->stop = 1; + stop = 1; close(fd); - kill(daemon, SIGTERM); - waitpid(daemon, NULL, 0); - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); rmdir(mp); return -1; } - if (fuseg_wait_init(&shared->init_done) != 0) { + if (fuseg_wait_init(&init_done) != 0) { printf("[FAIL] init handshake timeout\n"); goto fail; } - snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDWR); - if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + snprintf(parent_path, sizeof(parent_path), "%s/parent", mp); + snprintf(child_path, sizeof(child_path), "%s/parent/child", mp); + snprintf(hello_path, sizeof(hello_path), "%s/hello.txt", mp); + if (mkdir(parent_path, 0755) != 0 || mkdir(child_path, 0755) != 0) { + printf("[FAIL] mkdir active parent tree: %s (errno=%d)\n", strerror(errno), errno); goto fail; } - addr = mmap(NULL, map_len, PROT_READ, MAP_PRIVATE, f, 0); - if (addr == MAP_FAILED) { - printf("[FAIL] mmap(%s): %s (errno=%d)\n", path, strerror(errno), errno); - close(f); + if (chdir(child_path) != 0) { + printf("[FAIL] chdir(%s): %s (errno=%d)\n", child_path, strerror(errno), errno); goto fail; } - - c = ((volatile char *)addr)[page_size]; - if (c != 'O') { - printf("[FAIL] second page byte before truncate got=%d\n", c); - goto fail; + usleep(2000 * 1000); + if (stat(hello_path, &st) != 0 || !S_ISREG(st.st_mode)) { + printf("[FAIL] stat(%s) after 
TTL: %s (errno=%d)\n", hello_path, strerror(errno), errno); + goto fail_chdir; } - if (shared->open_count != 1 || shared->read_count != 1) { - printf("[FAIL] before truncate counters open=%u read=%u\n", shared->open_count, - shared->read_count); - goto fail; + lookup_count_before_parent_relookup = lookup_count; + if (stat(parent_path, &st) != 0 || !S_ISDIR(st.st_mode)) { + printf("[FAIL] stat(%s) after prune: %s (errno=%d)\n", parent_path, strerror(errno), + errno); + goto fail_chdir; } - if (ftruncate(f, page_size) != 0) { - printf("[FAIL] ftruncate: %s (errno=%d)\n", strerror(errno), errno); - goto fail; + if (lookup_count <= lookup_count_before_parent_relookup) { + printf("[FAIL] parent cache entry was not pruned: before=%u after=%u\n", + lookup_count_before_parent_relookup, lookup_count); + goto fail_chdir; } - - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = fuse_sigbus_longjmp_handler; - sigemptyset(&sa.sa_mask); - if (sigaction(SIGBUS, &sa, &old_bus) != 0) { - printf("[FAIL] sigaction(SIGBUS): %s (errno=%d)\n", strerror(errno), errno); - goto fail; + if (stat("..", &st) != 0 || !S_ISDIR(st.st_mode)) { + printf("[FAIL] stat(..) 
after cache prune: %s (errno=%d)\n", strerror(errno), errno); + goto fail_chdir; } - bus_handler_installed = true; - g_fuse_sigbus_seen = 0; - if (sigsetjmp(g_fuse_sigbus_jmp, 1) == 0) { - c = ((volatile char *)addr)[page_size]; - (void)c; + + if (chdir("/") != 0) { + printf("[FAIL] chdir(/): %s (errno=%d)\n", strerror(errno), errno); + goto fail_chdir; } - sigaction(SIGBUS, &old_bus, NULL); - bus_handler_installed = false; - if (!g_fuse_sigbus_seen) { - printf("[FAIL] truncated second page remained readable read=%u\n", shared->read_count); - goto fail; + if (umount(mp) != 0) { + printf("[FAIL] umount(%s): %s (errno=%d)\n", mp, strerror(errno), errno); + goto fail_no_umount; } - if (shared->read_count != 1) { - printf("[FAIL] truncated EOF fault issued extra FUSE_READ count=%u\n", shared->read_count); - goto fail; + for (int i = 0; i < 200 && destroy_count == 0; i++) { + usleep(10 * 1000); } - - munmap(addr, map_len); - addr = MAP_FAILED; - close(f); - f = -1; - umount(mp); - shared->stop = 1; + stop = 1; close(fd); - waitpid(daemon, NULL, 0); - munmap(shared, sizeof(*shared)); + pthread_join(th, NULL); rmdir(mp); + if (destroy_count == 0) { + printf("[FAIL] timed out waiting for FUSE_DESTROY after umount\n"); + return -1; + } return 0; -fail: - if (bus_handler_installed) { - sigaction(SIGBUS, &old_bus, NULL); - } - if (addr != MAP_FAILED) { - munmap(addr, map_len); - } - if (f >= 0) { - close(f); +fail_chdir: + if (chdir("/") != 0) { + printf("[FAIL] cleanup chdir(/): %s (errno=%d)\n", strerror(errno), errno); } +fail: umount(mp); - shared->stop = 1; - close(fd); - if (daemon > 0) { - kill(daemon, SIGTERM); - waitpid(daemon, NULL, 0); - } - munmap(shared, sizeof(*shared)); +fail_no_umount: + stop = 1; + close(fd); + pthread_join(th, NULL); rmdir(mp); return -1; } -static int ext_test_fadvise_without_page_cache() { - const char *mp = "/tmp/test_fuse_fadvise"; - char path[256]; - int f = -1; - const int advices[] = { - POSIX_FADV_NORMAL, POSIX_FADV_RANDOM, 
POSIX_FADV_SEQUENTIAL, - POSIX_FADV_WILLNEED, POSIX_FADV_DONTNEED, - POSIX_FADV_NOREUSE, - }; +static int ext_test_lookup_self_alias_rejected_and_forgotten() { + const char *mp = "/tmp/test_fuse_self_alias"; + char parent_path[512]; + char alias_path[512]; + struct stat st; + uint64_t parent_nodeid = 0; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); @@ -5252,13 +6676,26 @@ static int ext_test_fadvise_without_page_cache() { volatile int stop = 0; volatile int init_done = 0; + volatile uint32_t forget_count = 0; + volatile uint64_t forget_nlookup_sum = 0; + volatile uint64_t forget_trace_nodeids[32] = {0}; + volatile uint64_t forget_trace_nlookups[32] = {0}; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; + args.enable_write_ops = 1; args.stop_on_destroy = 1; + args.lookup_self_alias = 1; + args.forget_count = &forget_count; + args.forget_nlookup_sum = &forget_nlookup_sum; + args.forget_trace_nodeids = forget_trace_nodeids; + args.forget_trace_nlookups = forget_trace_nlookups; + args.forget_trace_capacity = 32; + args.entry_valid_sec = 60; + args.attr_valid_sec = 60; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -5283,28 +6720,37 @@ static int ext_test_fadvise_without_page_cache() { goto fail; } - snprintf(path, sizeof(path), "%s/hello.txt", mp); - f = open(path, O_RDONLY); - if (f < 0) { - printf("[FAIL] open(%s): %s (errno=%d)\n", path, strerror(errno), errno); + snprintf(parent_path, sizeof(parent_path), "%s/parent", mp); + snprintf(alias_path, sizeof(alias_path), "%s/parent/self_alias", mp); + if (mkdir(parent_path, 0755) != 0) { + printf("[FAIL] mkdir(%s): %s (errno=%d)\n", parent_path, strerror(errno), errno); + goto fail; + } + if (stat(parent_path, &st) != 0 || !S_ISDIR(st.st_mode)) { + printf("[FAIL] stat(%s): %s (errno=%d)\n", parent_path, strerror(errno), errno); goto fail; } + 
parent_nodeid = (uint64_t)st.st_ino; - for (size_t i = 0; i < sizeof(advices) / sizeof(advices[0]); i++) { - int rc = posix_fadvise(f, 0, 0, advices[i]); - if (rc != 0) { - printf("[FAIL] posix_fadvise(advice=%d): rc=%d\n", advices[i], rc); - goto fail; + errno = 0; + if (stat(alias_path, &st) == 0 || errno != EIO) { + printf("[FAIL] self alias lookup expected EIO, ret_errno=%d\n", errno); + goto fail; + } + for (int i = 0; i < 200; i++) { + if (forget_trace_contains(forget_trace_nodeids, forget_count, parent_nodeid)) { + break; } + usleep(10 * 1000); } - - if (posix_fadvise(f, 0, -1, POSIX_FADV_NORMAL) != EINVAL) { - printf("[FAIL] posix_fadvise negative len should return EINVAL\n"); + if (!forget_trace_contains(forget_trace_nodeids, forget_count, parent_nodeid) || + forget_nlookup_sum == 0) { + printf("[FAIL] self alias lookup ref was not forgotten: parent=%llu count=%u sum=%llu\n", + (unsigned long long)parent_nodeid, forget_count, + (unsigned long long)forget_nlookup_sum); goto fail; } - close(f); - f = -1; umount(mp); stop = 1; close(fd); @@ -5313,9 +6759,6 @@ static int ext_test_fadvise_without_page_cache() { return 0; fail: - if (f >= 0) { - close(f); - } umount(mp); stop = 1; close(fd); @@ -5324,11 +6767,11 @@ static int ext_test_fadvise_without_page_cache() { return -1; } -static int ext_test_mount_on_fuse_dir_uses_namespace_path() { - const char *mp = "/tmp/test_fuse_mount_target"; - char dir_path[512]; - char marker_path[1024]; - int ramfs_mounted = 0; +static int ext_test_same_generation_type_mismatch_stales_old_node() { + const char *mp = "/tmp/test_fuse_type_mismatch"; + char file_path[512]; + int old_fd = -1; + struct stat st; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); @@ -5350,7 +6793,6 @@ static int ext_test_mount_on_fuse_dir_uses_namespace_path() { args.fd = fd; args.stop = &stop; args.init_done = &init_done; - args.enable_write_ops = 1; args.stop_on_destroy = 1; pthread_t th; @@ 
-5376,33 +6818,34 @@ static int ext_test_mount_on_fuse_dir_uses_namespace_path() { goto fail; } - snprintf(dir_path, sizeof(dir_path), "%s/ramfs_target", mp); - if (mkdir(dir_path, 0755) != 0) { - printf("[FAIL] mkdir(%s): %s (errno=%d)\n", dir_path, strerror(errno), errno); + snprintf(file_path, sizeof(file_path), "%s/hello.txt", mp); + old_fd = open(file_path, O_RDONLY); + if (old_fd < 0) { + printf("[FAIL] open old hello: %s (errno=%d)\n", strerror(errno), errno); goto fail; } - - if (mount("", dir_path, "ramfs", 0, NULL) != 0) { - printf("[FAIL] mount(ramfs on fuse dir): %s (errno=%d)\n", strerror(errno), errno); + char buf[64]; + if (read(old_fd, buf, sizeof(buf)) <= 0) { + printf("[FAIL] initial read old hello: %s (errno=%d)\n", strerror(errno), errno); goto fail; } - ramfs_mounted = 1; - snprintf(marker_path, sizeof(marker_path), "%s/marker", dir_path); - if (fuseg_write_file(marker_path, "mounted") != 0) { - printf("[FAIL] write marker under ramfs: %s (errno=%d)\n", strerror(errno), errno); + args.fs.nodes[1].is_dir = 1; + args.fs.nodes[1].mode = S_IFDIR | 0755; + args.fs.nodes[1].size = 0; + if (stat(file_path, &st) != 0 || !S_ISDIR(st.st_mode)) { + printf("[FAIL] stat same-generation replacement dir: %s (errno=%d) mode=%o\n", + strerror(errno), errno, st.st_mode); goto fail; } - if (umount(dir_path) != 0) { - printf("[FAIL] umount(%s): %s (errno=%d)\n", dir_path, strerror(errno), errno); - goto fail_no_ramfs_umount; - } - ramfs_mounted = 0; - if (rmdir(dir_path) != 0) { - printf("[FAIL] rmdir(%s): %s (errno=%d)\n", dir_path, strerror(errno), errno); + errno = 0; + if (pread(old_fd, buf, sizeof(buf), 0) >= 0 || errno != ESTALE) { + printf("[FAIL] old fd after type mismatch expected ESTALE, errno=%d\n", errno); goto fail; } + close(old_fd); + old_fd = -1; umount(mp); stop = 1; @@ -5412,11 +6855,9 @@ static int ext_test_mount_on_fuse_dir_uses_namespace_path() { return 0; fail: - if (ramfs_mounted) { - umount(dir_path); + if (old_fd >= 0) { + close(old_fd); } 
-fail_no_ramfs_umount: - rmdir(dir_path); umount(mp); stop = 1; close(fd); @@ -5425,13 +6866,13 @@ static int ext_test_mount_on_fuse_dir_uses_namespace_path() { return -1; } -static int ext_test_rename_updates_fuse_dir_cwd_path() { - const char *mp = "/tmp/test_fuse_rename_path"; - char old_path[512]; - char new_path[512]; - char cwd[512]; - int dir_fd = -1; - int ramfs_mounted = 0; +static int ext_test_readdirplus_generation_mismatch_stales_old_node() { + const char *mp = "/tmp/test_fuse_readdirplus_generation"; + char file_path[512]; + int old_fd = -1; + int new_fd = -1; + DIR *dir = NULL; + int saw = 0; if (ensure_dir(mp) != 0) { printf("[FAIL] ensure_dir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); @@ -5447,14 +6888,18 @@ static int ext_test_rename_updates_fuse_dir_cwd_path() { volatile int stop = 0; volatile int init_done = 0; + volatile uint32_t readdirplus_count = 0; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); args.fd = fd; args.stop = &stop; args.init_done = &init_done; - args.enable_write_ops = 1; args.stop_on_destroy = 1; + args.readdirplus_count = &readdirplus_count; + args.force_opendir_enosys = 1; + args.init_out_flags_override = + FUSE_INIT_EXT | FUSE_MAX_PAGES | FUSE_NO_OPENDIR_SUPPORT | FUSE_DO_READDIRPLUS; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -5479,56 +6924,57 @@ static int ext_test_rename_updates_fuse_dir_cwd_path() { goto fail; } - snprintf(old_path, sizeof(old_path), "%s/old_dir", mp); - snprintf(new_path, sizeof(new_path), "%s/new_dir", mp); - if (mkdir(old_path, 0755) != 0) { - printf("[FAIL] mkdir(%s): %s (errno=%d)\n", old_path, strerror(errno), errno); - goto fail; - } - dir_fd = open(old_path, O_RDONLY | O_DIRECTORY); - if (dir_fd < 0) { - printf("[FAIL] open dir fd %s: %s (errno=%d)\n", old_path, strerror(errno), errno); + snprintf(file_path, sizeof(file_path), "%s/hello.txt", mp); + old_fd = open(file_path, O_RDONLY); + if (old_fd < 0) { + printf("[FAIL] open old 
hello: %s (errno=%d)\n", strerror(errno), errno); goto fail; } - if (rename(old_path, new_path) != 0) { - printf("[FAIL] rename(%s -> %s): %s (errno=%d)\n", old_path, new_path, strerror(errno), - errno); + char buf[64]; + if (read(old_fd, buf, sizeof(buf)) <= 0) { + printf("[FAIL] initial read old hello: %s (errno=%d)\n", strerror(errno), errno); goto fail; } - if (fchdir(dir_fd) != 0) { - printf("[FAIL] fchdir renamed dir fd: %s (errno=%d)\n", strerror(errno), errno); + + args.fs.nodes[1].generation = 2; + dir = opendir(mp); + if (!dir) { + printf("[FAIL] opendir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); goto fail; } - if (!getcwd(cwd, sizeof(cwd))) { - printf("[FAIL] getcwd after rename: %s (errno=%d)\n", strerror(errno), errno); - goto fail_chdir_root; - } - if (strcmp(cwd, new_path) != 0) { - printf("[FAIL] getcwd after rename: got '%s', want '%s'\n", cwd, new_path); - goto fail_chdir_root; + struct dirent *de; + while ((de = readdir(dir)) != NULL) { + if (strcmp(de->d_name, "hello.txt") == 0) { + saw = 1; + } } - if (chdir("/") != 0) { - printf("[FAIL] chdir(/): %s (errno=%d)\n", strerror(errno), errno); + closedir(dir); + dir = NULL; + if (!saw || readdirplus_count == 0) { + printf("[FAIL] expected hello.txt from READDIRPLUS, saw=%d count=%u\n", saw, + readdirplus_count); goto fail; } - close(dir_fd); - dir_fd = -1; - if (mount("", new_path, "ramfs", 0, NULL) != 0) { - printf("[FAIL] mount(ramfs on renamed fuse dir): %s (errno=%d)\n", strerror(errno), - errno); + errno = 0; + if (pread(old_fd, buf, sizeof(buf), 0) >= 0) { + printf("[FAIL] stale old fd read unexpectedly succeeded\n"); goto fail; } - ramfs_mounted = 1; - if (umount(new_path) != 0) { - printf("[FAIL] umount(%s): %s (errno=%d)\n", new_path, strerror(errno), errno); - goto fail_no_ramfs_umount; + close(old_fd); + old_fd = -1; + + new_fd = open(file_path, O_RDONLY); + if (new_fd < 0) { + printf("[FAIL] open fresh hello: %s (errno=%d)\n", strerror(errno), errno); + goto fail; } - 
ramfs_mounted = 0; - if (rmdir(new_path) != 0) { - printf("[FAIL] rmdir(%s): %s (errno=%d)\n", new_path, strerror(errno), errno); + if (read(new_fd, buf, sizeof(buf)) <= 0) { + printf("[FAIL] read fresh hello: %s (errno=%d)\n", strerror(errno), errno); goto fail; } + close(new_fd); + new_fd = -1; umount(mp); stop = 1; @@ -5537,21 +6983,16 @@ static int ext_test_rename_updates_fuse_dir_cwd_path() { rmdir(mp); return 0; -fail_chdir_root: - { - int ignored_chdir = chdir("/"); - (void)ignored_chdir; - } fail: - if (dir_fd >= 0) { - close(dir_fd); + if (dir) { + closedir(dir); } - if (ramfs_mounted) { - umount(new_path); + if (new_fd >= 0) { + close(new_fd); + } + if (old_fd >= 0) { + close(old_fd); } -fail_no_ramfs_umount: - rmdir(new_path); - rmdir(old_path); umount(mp); stop = 1; close(fd); @@ -5560,11 +7001,8 @@ static int ext_test_rename_updates_fuse_dir_cwd_path() { return -1; } -static int ext_test_readdirplus_generation_mismatch_stales_old_node() { - const char *mp = "/tmp/test_fuse_readdirplus_generation"; - char file_path[512]; - int old_fd = -1; - int new_fd = -1; +static int ext_test_readdirplus_invalid_attr_forgets_unconsumed_entry() { + const char *mp = "/tmp/test_fuse_readdirplus_invalid_attr"; DIR *dir = NULL; int saw = 0; @@ -5583,6 +7021,9 @@ static int ext_test_readdirplus_generation_mismatch_stales_old_node() { volatile int stop = 0; volatile int init_done = 0; volatile uint32_t readdirplus_count = 0; + volatile uint32_t forget_count = 0; + volatile uint64_t forget_trace_nodeids[32] = {0}; + volatile uint64_t forget_trace_nlookups[32] = {0}; struct fuse_daemon_args args; memset(&args, 0, sizeof(args)); @@ -5590,10 +7031,18 @@ static int ext_test_readdirplus_generation_mismatch_stales_old_node() { args.stop = &stop; args.init_done = &init_done; args.stop_on_destroy = 1; - args.readdirplus_count = &readdirplus_count; args.force_opendir_enosys = 1; args.init_out_flags_override = FUSE_INIT_EXT | FUSE_MAX_PAGES | FUSE_NO_OPENDIR_SUPPORT | 
FUSE_DO_READDIRPLUS; + args.entry_valid_sec = 60; + args.attr_valid_sec = 60; + args.readdirplus_count = &readdirplus_count; + args.readdirplus_invalid_attr_name = "hello.txt"; + args.readdirplus_invalid_attr_size = 0x8000000000000000ULL; + args.forget_count = &forget_count; + args.forget_trace_nodeids = forget_trace_nodeids; + args.forget_trace_nlookups = forget_trace_nlookups; + args.forget_trace_capacity = 32; pthread_t th; if (pthread_create(&th, NULL, fuse_daemon_thread, &args) != 0) { @@ -5618,19 +7067,6 @@ static int ext_test_readdirplus_generation_mismatch_stales_old_node() { goto fail; } - snprintf(file_path, sizeof(file_path), "%s/hello.txt", mp); - old_fd = open(file_path, O_RDONLY); - if (old_fd < 0) { - printf("[FAIL] open old hello: %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - char buf[64]; - if (read(old_fd, buf, sizeof(buf)) <= 0) { - printf("[FAIL] initial read old hello: %s (errno=%d)\n", strerror(errno), errno); - goto fail; - } - - args.fs.nodes[1].generation = 2; dir = opendir(mp); if (!dir) { printf("[FAIL] opendir(%s): %s (errno=%d)\n", mp, strerror(errno), errno); @@ -5650,25 +7086,19 @@ static int ext_test_readdirplus_generation_mismatch_stales_old_node() { goto fail; } - errno = 0; - if (pread(old_fd, buf, sizeof(buf), 0) >= 0) { - printf("[FAIL] stale old fd read unexpectedly succeeded\n"); - goto fail; - } - close(old_fd); - old_fd = -1; - - new_fd = open(file_path, O_RDONLY); - if (new_fd < 0) { - printf("[FAIL] open fresh hello: %s (errno=%d)\n", strerror(errno), errno); - goto fail; + for (int i = 0; i < 200; i++) { + if (forget_trace_contains_pair(forget_trace_nodeids, forget_trace_nlookups, forget_count, 2, + 1)) { + break; + } + usleep(10 * 1000); } - if (read(new_fd, buf, sizeof(buf)) <= 0) { - printf("[FAIL] read fresh hello: %s (errno=%d)\n", strerror(errno), errno); + if (!forget_trace_contains_pair(forget_trace_nodeids, forget_trace_nlookups, forget_count, 2, + 1)) { + printf("[FAIL] invalid READDIRPLUS entry 
was not forgotten before umount: count=%u\n", + forget_count); goto fail; } - close(new_fd); - new_fd = -1; umount(mp); stop = 1; @@ -5681,12 +7111,6 @@ static int ext_test_readdirplus_generation_mismatch_stales_old_node() { if (dir) { closedir(dir); } - if (new_fd >= 0) { - close(new_fd); - } - if (old_fd >= 0) { - close(old_fd); - } umount(mp); stop = 1; close(fd); @@ -6026,6 +7450,18 @@ TEST(FuseExtended, OpsAccessCreateSymlinkLinkRename2FlushFsync) { ASSERT_EQ(0, ext_test_p2_ops()); } +TEST(FuseExtended, PositiveLookupCacheRespectsEntryTtl) { + ASSERT_EQ(0, ext_test_positive_lookup_cache_respects_entry_ttl()); +} + +TEST(FuseExtended, XattrOps) { + ASSERT_EQ(0, ext_test_xattr_ops()); +} + +TEST(FuseExtended, XattrEnosysIsCached) { + ASSERT_EQ(0, ext_test_xattr_enosys_is_cached()); +} + TEST(FuseExtended, InterruptDeliversFuseInterrupt) { ASSERT_EQ(0, ext_test_p3_interrupt()); } @@ -6062,6 +7498,10 @@ TEST(FuseExtended, MmapFaultUsesOpenFhWithoutExtraOpen) { ASSERT_EQ(0, ext_test_mmap_fault_uses_open_fh_without_extra_open()); } +TEST(FuseExtended, DISABLED_MmapFaultBatchesReadaroundPages) { + ASSERT_EQ(0, ext_test_mmap_fault_batches_readaround_pages()); +} + TEST(FuseExtended, DirectIoReadBypassesPageCache) { ASSERT_EQ(0, ext_test_direct_io_read_bypasses_page_cache()); } @@ -6118,6 +7558,22 @@ TEST(FuseExtended, MountRamfsOnFuseDirectoryUsesNamespacePath) { ASSERT_EQ(0, ext_test_mount_on_fuse_dir_uses_namespace_path()); } +TEST(FuseExtended, LookupNodesForgottenBeforeUmountWhenUnreferenced) { + ASSERT_EQ(0, ext_test_lookup_nodes_forgotten_before_umount_when_unreferenced()); +} + +TEST(FuseExtended, PositiveLookupCacheExpiresAndForgetsBeforeUmount) { + ASSERT_EQ(0, ext_test_positive_lookup_cache_expires_and_forgets_before_umount()); +} + +TEST(FuseExtended, ActiveDirectoryParentSurvivesLookupCachePrune) { + ASSERT_EQ(0, ext_test_active_directory_parent_survives_lookup_cache_prune()); +} + +TEST(FuseExtended, LookupSelfAliasRejectedAndForgotten) { + ASSERT_EQ(0, 
ext_test_lookup_self_alias_rejected_and_forgotten()); +} + TEST(FuseExtended, RenameUpdatesFuseDirectoryCwdPath) { ASSERT_EQ(0, ext_test_rename_updates_fuse_dir_cwd_path()); } @@ -6126,6 +7582,14 @@ TEST(FuseExtended, ReaddirplusGenerationMismatchStalesOldNode) { ASSERT_EQ(0, ext_test_readdirplus_generation_mismatch_stales_old_node()); } +TEST(FuseExtended, ReaddirplusInvalidAttrForgetsUnconsumedEntry) { + ASSERT_EQ(0, ext_test_readdirplus_invalid_attr_forgets_unconsumed_entry()); +} + +TEST(FuseExtended, SameGenerationTypeMismatchStalesOldNode) { + ASSERT_EQ(0, ext_test_same_generation_type_mismatch_stales_old_node()); +} + TEST(FuseExtended, CreateGenerationMismatchStalesOldNode) { ASSERT_EQ(0, ext_test_create_generation_mismatch_stales_old_node()); } diff --git a/user/apps/tests/dunitest/suites/fuse/fuse_test_simplefs_local.h b/user/apps/tests/dunitest/suites/fuse/fuse_test_simplefs_local.h index b3d263374b..1dc87341b2 100644 --- a/user/apps/tests/dunitest/suites/fuse/fuse_test_simplefs_local.h +++ b/user/apps/tests/dunitest/suites/fuse/fuse_test_simplefs_local.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #define FUSE_TEST_LOG_PREFIX "[fuse-test] " @@ -114,6 +115,18 @@ static inline int fuse_test_log_enabled(void) { #ifndef FUSE_FSYNC #define FUSE_FSYNC 20 #endif +#ifndef FUSE_SETXATTR +#define FUSE_SETXATTR 21 +#endif +#ifndef FUSE_GETXATTR +#define FUSE_GETXATTR 22 +#endif +#ifndef FUSE_LISTXATTR +#define FUSE_LISTXATTR 23 +#endif +#ifndef FUSE_REMOVEXATTR +#define FUSE_REMOVEXATTR 24 +#endif #ifndef FUSE_FLUSH #define FUSE_FLUSH 25 #endif @@ -379,6 +392,21 @@ struct fuse_write_out { uint32_t padding; }; +struct fuse_setxattr_in_compat { + uint32_t size; + uint32_t flags; +}; + +struct fuse_getxattr_in { + uint32_t size; + uint32_t padding; +}; + +struct fuse_getxattr_out { + uint32_t size; + uint32_t padding; +}; + struct fuse_fallocate_in { uint64_t fh; uint64_t offset; @@ -682,11 +710,15 @@ struct fuse_daemon_args { volatile unsigned 
char *dynamic_hello_first_byte; volatile uint32_t *forget_count; volatile uint64_t *forget_nlookup_sum; + volatile uint64_t *forget_trace_nodeids; + volatile uint64_t *forget_trace_nlookups; + uint32_t forget_trace_capacity; volatile uint32_t *destroy_count; volatile uint32_t *init_in_flags; volatile uint32_t *init_in_flags2; volatile uint32_t *init_in_max_readahead; volatile uint32_t *access_count; + volatile uint32_t *lookup_count; volatile uint32_t *flush_count; volatile uint32_t *last_flush_uid; volatile uint32_t *last_flush_gid; @@ -699,6 +731,11 @@ struct fuse_daemon_args { volatile uint32_t *opendir_count; volatile uint32_t *setattr_count; volatile uint32_t *fallocate_count; + volatile uint32_t *getxattr_count; + volatile uint32_t *setxattr_count; + volatile uint32_t *listxattr_count; + volatile uint32_t *removexattr_count; + volatile uint32_t *last_setxattr_flags; volatile uint32_t *last_setattr_valid; volatile uint64_t *last_setattr_fh; volatile uint64_t *last_setattr_size; @@ -747,7 +784,10 @@ struct fuse_daemon_args { volatile uint64_t *last_interrupt_header_unique; volatile uint64_t *last_interrupt_target; uint32_t access_deny_mask; + uint64_t entry_valid_sec; + uint64_t attr_valid_sec; uint32_t init_out_flags_override; + uint32_t init_out_max_write_override; uint64_t write_watch_offset; uint64_t hello_open_fh_override; uint64_t next_open_fh; @@ -755,7 +795,10 @@ struct fuse_daemon_args { uint64_t create_generation_override; uint64_t link_generation_override; uint64_t hello_generation_override; + const char *readdirplus_invalid_attr_name; + uint64_t readdirplus_invalid_attr_size; int link_reuse_old_nodeid; + int lookup_self_alias; int allow_rename_replace; int has_hello_open_fh_override; int force_open_enosys; @@ -763,9 +806,13 @@ struct fuse_daemon_args { int force_flush_errno; int force_fsync_errno; int force_fsyncdir_errno; + int force_xattr_enosys; + int force_getxattr_erange_at_max; + int force_listxattr_erange_at_max; int 
block_read_until_interrupt; size_t hello_data_size_override; size_t hello_read_size_override; + size_t hello_generated_size_override; struct simplefs fs; }; @@ -797,6 +844,8 @@ static inline int simplefs_fill_entry_reply(struct fuse_daemon_args *a, const st memset(&out, 0, sizeof(out)); out.nodeid = node->nodeid; out.generation = node->generation; + out.entry_valid = a->entry_valid_sec; + out.attr_valid = a->attr_valid_sec; simplefs_fill_attr(node, &out.attr); return fuse_write_reply(a->fd, h->unique, 0, &out, sizeof(out)); } @@ -891,7 +940,7 @@ static inline int fuse_handle_one(struct fuse_daemon_args *a, const unsigned cha } out.flags = init_flags; out.flags2 = 0; - out.max_write = 4096; + out.max_write = a->init_out_max_write_override ? a->init_out_max_write_override : 4096; out.max_pages = 32; if (fuse_write_reply(a->fd, h->unique, 0, &out, sizeof(out)) != 0) { return -1; @@ -903,13 +952,23 @@ static inline int fuse_handle_one(struct fuse_daemon_args *a, const unsigned cha if (payload_len < sizeof(struct fuse_forget_in)) return -1; const struct fuse_forget_in *in = (const struct fuse_forget_in *)payload; + uint32_t idx = 0; + if (a->forget_count) + idx = *a->forget_count; if (a->forget_count) (*a->forget_count)++; if (a->forget_nlookup_sum) (*a->forget_nlookup_sum) += in->nlookup; + if (a->forget_trace_nodeids && a->forget_trace_nlookups && + idx < a->forget_trace_capacity) { + a->forget_trace_nodeids[idx] = h->nodeid; + a->forget_trace_nlookups[idx] = in->nlookup; + } return 0; } case FUSE_LOOKUP: { + if (a->lookup_count) + (*a->lookup_count)++; const char *name = (const char *)payload; if (payload_len == 0 || name[payload_len - 1] != '\0') { return -1; @@ -918,6 +977,16 @@ static inline int fuse_handle_one(struct fuse_daemon_args *a, const unsigned cha if (!parent || !simplefs_node_is_dir(parent)) { return fuse_write_reply(a->fd, h->unique, -ENOENT, NULL, 0); } + if (a->lookup_self_alias && strcmp(name, "self_alias") == 0) { + struct fuse_entry_out out; + 
memset(&out, 0, sizeof(out)); + out.nodeid = parent->nodeid; + out.generation = parent->generation; + out.entry_valid = a->entry_valid_sec; + out.attr_valid = a->attr_valid_sec; + simplefs_fill_attr(parent, &out.attr); + return fuse_write_reply(a->fd, h->unique, 0, &out, sizeof(out)); + } struct simplefs_node *child = simplefs_find_child(&a->fs, h->nodeid, name); if (!child) { return fuse_write_reply(a->fd, h->unique, -ENOENT, NULL, 0); @@ -926,6 +995,8 @@ static inline int fuse_handle_one(struct fuse_daemon_args *a, const unsigned cha memset(&out, 0, sizeof(out)); out.nodeid = child->nodeid; out.generation = child->generation; + out.entry_valid = a->entry_valid_sec; + out.attr_valid = a->attr_valid_sec; simplefs_fill_attr(child, &out.attr); return fuse_write_reply(a->fd, h->unique, 0, &out, sizeof(out)); } @@ -1044,6 +1115,10 @@ static inline int fuse_handle_one(struct fuse_daemon_args *a, const unsigned cha } } size_t effective_size = node->size; + int generated_hello = h->nodeid == 2 && a->hello_generated_size_override > 0; + if (generated_hello) { + effective_size = a->hello_generated_size_override; + } if (h->nodeid == 2 && a->hello_read_size_override > 0 && a->hello_read_size_override < effective_size) { effective_size = a->hello_read_size_override; @@ -1051,7 +1126,8 @@ static inline int fuse_handle_one(struct fuse_daemon_args *a, const unsigned cha if (in->offset >= effective_size) { return fuse_write_reply(a->fd, h->unique, 0, NULL, 0); } - if (h->nodeid == 2 && a->dynamic_hello_first_byte && *a->dynamic_hello_first_byte != 0 + if (!generated_hello && h->nodeid == 2 && a->dynamic_hello_first_byte + && *a->dynamic_hello_first_byte != 0 && node->size > 0) { node->data[0] = *a->dynamic_hello_first_byte; } @@ -1060,6 +1136,18 @@ static inline int fuse_handle_one(struct fuse_daemon_args *a, const unsigned cha if (to_copy > remain) { to_copy = remain; } + if (generated_hello) { + unsigned char *generated = (unsigned char *)malloc(to_copy); + if (!generated) { + 
return fuse_write_reply(a->fd, h->unique, -ENOMEM, NULL, 0); + } + for (size_t i = 0; i < to_copy; i++) { + generated[i] = (unsigned char)('A' + ((in->offset + i) % 26)); + } + int ret = fuse_write_reply(a->fd, h->unique, 0, generated, to_copy); + free(generated); + return ret; + } return fuse_write_reply(a->fd, h->unique, 0, node->data + in->offset, to_copy); } case FUSE_READDIR: @@ -1144,6 +1232,10 @@ static inline int fuse_handle_one(struct fuse_daemon_args *a, const unsigned cha dp.entry_out.nodeid = c->nodeid; dp.entry_out.generation = c->generation; simplefs_fill_attr(c, &dp.entry_out.attr); + if (a->readdirplus_invalid_attr_name && + strcmp(c->name, a->readdirplus_invalid_attr_name) == 0) { + dp.entry_out.attr.size = a->readdirplus_invalid_attr_size; + } dp.dirent.ino = c->ino; dp.dirent.off = child_base + 1; dp.dirent.namelen = (uint32_t)nmlen; @@ -1683,6 +1775,112 @@ static inline int fuse_handle_one(struct fuse_daemon_args *a, const unsigned cha simplefs_fill_attr(node, &out.attr); return fuse_write_reply(a->fd, h->unique, 0, &out, sizeof(out)); } + case FUSE_GETXATTR: { + if (a->getxattr_count) { + (*a->getxattr_count)++; + } + if (a->force_xattr_enosys) { + return fuse_write_reply(a->fd, h->unique, -ENOSYS, NULL, 0); + } + if (payload_len < sizeof(struct fuse_getxattr_in) + 1) { + return -1; + } + const struct fuse_getxattr_in *in = (const struct fuse_getxattr_in *)payload; + const char *name = (const char *)(payload + sizeof(*in)); + size_t name_len = payload_len - sizeof(*in); + if (name[name_len - 1] != '\0') { + return -1; + } + if (strcmp(name, "user.dragonos") != 0) { + return fuse_write_reply(a->fd, h->unique, -ENODATA, NULL, 0); + } + const char value[] = "virtiofs-xattr"; + size_t value_len = sizeof(value) - 1; + if (in->size == 0) { + struct fuse_getxattr_out out; + memset(&out, 0, sizeof(out)); + out.size = (uint32_t)value_len; + return fuse_write_reply(a->fd, h->unique, 0, &out, sizeof(out)); + } + if (a->force_getxattr_erange_at_max && 
in->size == 65536) { + return fuse_write_reply(a->fd, h->unique, -ERANGE, NULL, 0); + } + if (in->size < value_len) { + return fuse_write_reply(a->fd, h->unique, -ERANGE, NULL, 0); + } + return fuse_write_reply(a->fd, h->unique, 0, value, value_len); + } + case FUSE_LISTXATTR: { + if (a->listxattr_count) { + (*a->listxattr_count)++; + } + if (a->force_xattr_enosys) { + return fuse_write_reply(a->fd, h->unique, -ENOSYS, NULL, 0); + } + if (payload_len < sizeof(struct fuse_getxattr_in)) { + return -1; + } + const struct fuse_getxattr_in *in = (const struct fuse_getxattr_in *)payload; + const char list[] = "user.dragonos"; + size_t list_len = sizeof(list); + if (in->size == 0) { + struct fuse_getxattr_out out; + memset(&out, 0, sizeof(out)); + out.size = (uint32_t)list_len; + return fuse_write_reply(a->fd, h->unique, 0, &out, sizeof(out)); + } + if (a->force_listxattr_erange_at_max && in->size == 65536) { + return fuse_write_reply(a->fd, h->unique, -ERANGE, NULL, 0); + } + if (in->size < list_len) { + return fuse_write_reply(a->fd, h->unique, -ERANGE, NULL, 0); + } + return fuse_write_reply(a->fd, h->unique, 0, list, list_len); + } + case FUSE_SETXATTR: { + if (a->setxattr_count) { + (*a->setxattr_count)++; + } + if (a->force_xattr_enosys) { + return fuse_write_reply(a->fd, h->unique, -ENOSYS, NULL, 0); + } + if (payload_len < sizeof(struct fuse_setxattr_in_compat) + 1) { + return -1; + } + const struct fuse_setxattr_in_compat *in = + (const struct fuse_setxattr_in_compat *)payload; + if (a->last_setxattr_flags) { + *a->last_setxattr_flags = in->flags; + } + const char *name = (const char *)payload + sizeof(struct fuse_setxattr_in_compat); + size_t name_len = strnlen(name, payload_len - sizeof(struct fuse_setxattr_in_compat)); + if (name_len >= payload_len - sizeof(struct fuse_setxattr_in_compat)) { + return -1; + } + if ((in->flags & XATTR_CREATE) && strcmp(name, "user.dragonos") == 0) { + return fuse_write_reply(a->fd, h->unique, -EEXIST, NULL, 0); + } + if 
((in->flags & XATTR_REPLACE) && strcmp(name, "user.missing") == 0) { + return fuse_write_reply(a->fd, h->unique, -ENODATA, NULL, 0); + } + return fuse_write_reply(a->fd, h->unique, 0, NULL, 0); + } + case FUSE_REMOVEXATTR: { + if (a->removexattr_count) { + (*a->removexattr_count)++; + } + if (a->force_xattr_enosys) { + return fuse_write_reply(a->fd, h->unique, -ENOSYS, NULL, 0); + } + const char *name = (const char *)payload; + if (payload_len == 0 || name[payload_len - 1] != '\0') { + return -1; + } + if (strcmp(name, "user.dragonos") != 0) { + return fuse_write_reply(a->fd, h->unique, -ENODATA, NULL, 0); + } + return fuse_write_reply(a->fd, h->unique, 0, NULL, 0); + } case FUSE_FALLOCATE: { if (!a->enable_write_ops) { return fuse_write_reply(a->fd, h->unique, -ENOSYS, NULL, 0); @@ -1763,6 +1961,9 @@ static inline void *fuse_daemon_thread(void *arg) { } a->fs.nodes[1].size = size; } + if (a->hello_generated_size_override > 0) { + a->fs.nodes[1].size = a->hello_generated_size_override; + } while (!*a->stop) { FUSE_TEST_LOG("daemon read start"); diff --git a/user/apps/tests/dunitest/suites/normal/cubesandbox_pty_exec_chain.cc b/user/apps/tests/dunitest/suites/normal/cubesandbox_pty_exec_chain.cc new file mode 100644 index 0000000000..9e65f2b376 --- /dev/null +++ b/user/apps/tests/dunitest/suites/normal/cubesandbox_pty_exec_chain.cc @@ -0,0 +1,1636 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace { + +class UniqueFd { +public: + UniqueFd() = default; + explicit UniqueFd(int fd) : fd_(fd) {} + UniqueFd(const UniqueFd&) = delete; + UniqueFd& operator=(const UniqueFd&) = delete; + + UniqueFd(UniqueFd&& other) noexcept : fd_(other.fd_) { other.fd_ = -1; } + + UniqueFd& operator=(UniqueFd&& other) noexcept { + if (this != &other) { + reset(); + fd_ = other.fd_; + other.fd_ = -1; + } + return *this; + } + + ~UniqueFd() { reset(); } + 
+ int get() const { return fd_; } + + void reset(int fd = -1) { + if (fd_ >= 0) { + close(fd_); + } + fd_ = fd; + } + +private: + int fd_ = -1; +}; + +struct PtyPair { + UniqueFd master; + UniqueFd slave; +}; + +PtyPair OpenPty() { + int master = -1; + int slave = -1; + if (openpty(&master, &slave, nullptr, nullptr, nullptr) < 0) { + ADD_FAILURE() << "openpty failed: errno=" << errno << " (" << strerror(errno) << ")"; + return {}; + } + return PtyPair{UniqueFd(master), UniqueFd(slave)}; +} + +void SetNonblock(int fd) { + int flags = fcntl(fd, F_GETFL); + ASSERT_GE(flags, 0) << "fcntl(F_GETFL) failed: errno=" << errno << " (" << strerror(errno) + << ")"; + ASSERT_EQ(0, fcntl(fd, F_SETFL, flags | O_NONBLOCK)) + << "fcntl(F_SETFL, O_NONBLOCK) failed: errno=" << errno << " (" << strerror(errno) + << ")"; +} + +void SetRawByteMode(int fd) { + struct termios term = {}; + ASSERT_EQ(0, tcgetattr(fd, &term)) + << "tcgetattr failed: errno=" << errno << " (" << strerror(errno) << ")"; + + term.c_iflag = 0; + term.c_oflag = 0; + term.c_lflag = 0; + term.c_cflag |= CS8; + term.c_cc[VMIN] = 1; + term.c_cc[VTIME] = 0; + + ASSERT_EQ(0, tcsetattr(fd, TCSANOW, &term)) + << "tcsetattr failed: errno=" << errno << " (" << strerror(errno) << ")"; +} + +bool WriteAll(int fd, const char* data, size_t len) { + size_t written = 0; + while (written < len) { + ssize_t ret = write(fd, data + written, len - written); + if (ret > 0) { + written += static_cast(ret); + continue; + } + if (ret < 0 && errno == EINTR) { + continue; + } + return false; + } + return true; +} + +bool SetNonblockNoAssert(int fd) { + int flags = fcntl(fd, F_GETFL); + if (flags < 0) { + return false; + } + return fcntl(fd, F_SETFL, flags | O_NONBLOCK) == 0; +} + +bool WaitForChild(pid_t child, int* status, int rounds = 300) { + for (int i = 0; i < rounds; ++i) { + pid_t ret = waitpid(child, status, WNOHANG); + if (ret == child) { + return true; + } + if (ret < 0 && errno != EINTR) { + return false; + } + usleep(10 * 1000); 
+ } + return false; +} + +void WriteReport(int report_fd, const std::string& report) { + WriteAll(report_fd, report.c_str(), report.size()); +} + +void KillAndReap(pid_t child) { + kill(child, SIGKILL); + waitpid(child, nullptr, 0); +} + +bool ReadUntilContains(int fd, const std::string& needle, std::string* output, int timeout_ms) { + const size_t search_from = 0; + int elapsed_ms = 0; + while (elapsed_ms < timeout_ms) { + struct pollfd pfd = { + .fd = fd, + .events = POLLIN | POLLERR | POLLHUP, + .revents = 0, + }; + + sigset_t mask; + sigemptyset(&mask); + struct timespec ts = { + .tv_sec = 0, + .tv_nsec = 10 * 1000 * 1000, + }; + int ret = ppoll(&pfd, 1, &ts, &mask); + if (ret < 0) { + if (errno == EINTR) { + continue; + } + return false; + } + if (ret == 0) { + elapsed_ms += 10; + continue; + } + + if ((pfd.revents & POLLIN) != 0) { + char buf[256] = {}; + ssize_t n = read(fd, buf, sizeof(buf)); + if (n > 0) { + output->append(buf, static_cast(n)); + if (output->find(needle, search_from) != std::string::npos) { + return true; + } + continue; + } + if (n < 0 && errno == EINTR) { + continue; + } + if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + continue; + } + return false; + } + + if ((pfd.revents & (POLLERR | POLLHUP)) != 0) { + return output->find(needle, search_from) != std::string::npos; + } + + usleep(10 * 1000); + elapsed_ms += 10; + } + return false; +} + +bool ReadUntilContainsFrom(int fd, const std::string& needle, std::string* output, + size_t search_from, int timeout_ms) { + if (output->find(needle, search_from) != std::string::npos) { + return true; + } + + int elapsed_ms = 0; + while (elapsed_ms < timeout_ms) { + struct pollfd pfd = { + .fd = fd, + .events = POLLIN | POLLERR | POLLHUP, + .revents = 0, + }; + + sigset_t mask; + sigemptyset(&mask); + struct timespec ts = { + .tv_sec = 0, + .tv_nsec = 10 * 1000 * 1000, + }; + int ret = ppoll(&pfd, 1, &ts, &mask); + if (ret < 0) { + if (errno == EINTR) { + continue; + } + return false; + } 
+ if (ret == 0) { + elapsed_ms += 10; + continue; + } + + if ((pfd.revents & POLLIN) != 0) { + char buf[256] = {}; + ssize_t n = read(fd, buf, sizeof(buf)); + if (n > 0) { + output->append(buf, static_cast(n)); + if (output->find(needle, search_from) != std::string::npos) { + return true; + } + continue; + } + if (n < 0 && errno == EINTR) { + continue; + } + if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + continue; + } + return false; + } + + if ((pfd.revents & (POLLERR | POLLHUP)) != 0) { + return output->find(needle, search_from) != std::string::npos; + } + + usleep(10 * 1000); + elapsed_ms += 10; + } + return false; +} + +bool WaitUntilAtomic(const std::atomic& value, int expected_mask, int timeout_ms) { + for (int elapsed_ms = 0; elapsed_ms < timeout_ms; elapsed_ms += 10) { + if ((value.load(std::memory_order_acquire) & expected_mask) == expected_mask) { + return true; + } + usleep(10 * 1000); + } + return false; +} + +bool OutputLooksLikeShellPrompt(const std::string& output) { + return output.find("# ") != std::string::npos || output.find("$ ") != std::string::npos; +} + +void ExecUnameProgram() { + execl("/bin/uname", "uname", "-a", nullptr); + execl("/usr/bin/uname", "uname", "-a", nullptr); + execl("/bin/busybox", "busybox", "uname", "-a", nullptr); + _exit(127); +} + +void ExecLsProgram() { + execl("/bin/ls", "ls", "/", nullptr); + execl("/usr/bin/ls", "ls", "/", nullptr); + execl("/bin/busybox", "busybox", "ls", "/", nullptr); + _exit(127); +} + +std::string CollectFdUntilChildExit(int fd, pid_t child, int timeout_ms, int* status) { + std::string output; + for (int elapsed_ms = 0; elapsed_ms < timeout_ms; elapsed_ms += 10) { + struct pollfd pfd = { + .fd = fd, + .events = POLLIN | POLLERR | POLLHUP, + .revents = 0, + }; + struct timespec ts = { + .tv_sec = 0, + .tv_nsec = 10 * 1000 * 1000, + }; + sigset_t empty; + sigemptyset(&empty); + int pret = ppoll(&pfd, 1, &ts, &empty); + if (pret < 0) { + if (errno == EINTR) { + continue; + } + 
ADD_FAILURE() << "ppoll failed while collecting child output: errno=" << errno << " (" + << strerror(errno) << ")"; + break; + } + + if (pret > 0 && (pfd.revents & POLLIN) != 0) { + std::array buf = {}; + for (;;) { + ssize_t n = read(fd, buf.data(), buf.size()); + if (n > 0) { + output.append(buf.data(), static_cast(n)); + continue; + } + if (n < 0 && errno == EINTR) { + continue; + } + if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + break; + } + break; + } + } + + pid_t wait_ret = waitpid(child, status, WNOHANG); + if (wait_ret == child) { + for (;;) { + std::array buf = {}; + ssize_t n = read(fd, buf.data(), buf.size()); + if (n > 0) { + output.append(buf.data(), static_cast(n)); + continue; + } + if (n < 0 && errno == EINTR) { + continue; + } + break; + } + return output; + } + if (wait_ret < 0 && errno != EINTR) { + ADD_FAILURE() << "waitpid failed while collecting child output: errno=" << errno << " (" + << strerror(errno) << ")"; + break; + } + + if (pret > 0 && (pfd.revents & (POLLERR | POLLHUP)) != 0) { + continue; + } + } + + kill(child, SIGKILL); + waitpid(child, status, 0); + ADD_FAILURE() << "child did not exit within " << timeout_ms << " ms, captured: " << output; + return output; +} + +std::string ReadProcPrintk() { + UniqueFd fd(open("/proc/sys/kernel/printk", O_RDONLY)); + if (fd.get() < 0) { + ADD_FAILURE() << "open /proc/sys/kernel/printk failed: errno=" << errno << " (" + << strerror(errno) << ")"; + return {}; + } + + std::array buf = {}; + ssize_t n = read(fd.get(), buf.data(), buf.size() - 1); + if (n < 0) { + ADD_FAILURE() << "read /proc/sys/kernel/printk failed: errno=" << errno << " (" + << strerror(errno) << ")"; + return {}; + } + return std::string(buf.data(), static_cast(n)); +} + +void ExpectZeroLengthReadReturnsZero(int fd, const char* name) { + errno = 0; + EXPECT_EQ(0, read(fd, nullptr, 0)) + << name << " zero-length read should match Linux semantics, errno=" << errno << " (" + << strerror(errno) << ")"; +} + +void 
ExpectZeroLengthWriteReturnsZero(int fd, const char* name) { + errno = 0; + EXPECT_EQ(0, write(fd, nullptr, 0)) + << name << " zero-length write should match Linux semantics, errno=" << errno << " (" + << strerror(errno) << ")"; +} + +void WriteProcPrintk(const char* value) { + UniqueFd fd(open("/proc/sys/kernel/printk", O_WRONLY)); + ASSERT_GE(fd.get(), 0) << "open /proc/sys/kernel/printk for write failed: errno=" << errno + << " (" << strerror(errno) << ")"; + ASSERT_EQ(static_cast(strlen(value)), write(fd.get(), value, strlen(value))) + << "write /proc/sys/kernel/printk failed: errno=" << errno << " (" << strerror(errno) + << ")"; +} + +void BindSlaveAsControllingTty(int slave_fd) { + if (setsid() < 0) { + _exit(120); + } + if (ioctl(slave_fd, TIOCSCTTY, 0) != 0) { + _exit(121); + } + if (tcgetpgrp(slave_fd) != getpgrp()) { + _exit(122); + } + + dup2(slave_fd, STDIN_FILENO); + dup2(slave_fd, STDOUT_FILENO); + dup2(slave_fd, STDERR_FILENO); + if (slave_fd > STDERR_FILENO) { + close(slave_fd); + } +} + +void ExecDefaultShellOnSlave(int slave_fd) { + BindSlaveAsControllingTty(slave_fd); + + execl("/bin/sh", "sh", nullptr); + execl("/usr/bin/sh", "sh", nullptr); + execl("/bin/busybox", "busybox", "sh", nullptr); + _exit(127); +} + +void ExecBusyBoxShellOnSlave(int slave_fd) { + BindSlaveAsControllingTty(slave_fd); + + execl("/bin/busybox", "busybox", "sh", nullptr); + _exit(127); +} + +void ExecUnameOnSlave(int slave_fd) { + BindSlaveAsControllingTty(slave_fd); + ExecUnameProgram(); +} + +void RunShellCommandSequence(void (*exec_shell)(int), const char* shell_name) { + PtyPair pair = OpenPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + SetNonblock(pair.master.get()); + + pid_t child = fork(); + ASSERT_GE(child, 0) << strerror(errno); + if (child == 0) { + pair.master.reset(); + exec_shell(pair.slave.get()); + } + + pair.slave.reset(); + + std::string output; + ASSERT_TRUE(WriteAll(pair.master.get(), "ls /\n", strlen("ls /\n"))) + << 
"write ls failed through " << shell_name << ": errno=" << errno << " (" + << strerror(errno) << ")"; + ASSERT_TRUE(ReadUntilContains(pair.master.get(), "bin", &output, 3000)) + << "did not observe ls output through " << shell_name << ", captured: " << output; + + ASSERT_TRUE(WriteAll(pair.master.get(), "uname -a\n", strlen("uname -a\n"))) + << "write uname failed through " << shell_name << ": errno=" << errno << " (" + << strerror(errno) << ")"; + ASSERT_TRUE(ReadUntilContains(pair.master.get(), "Linux", &output, 5000)) + << "did not observe uname output through " << shell_name << ", captured: " << output; + + ASSERT_TRUE(WriteAll(pair.master.get(), "exit\n", strlen("exit\n"))) + << "write exit failed through " << shell_name << ": errno=" << errno << " (" + << strerror(errno) << ")"; + + int status = 0; + if (!WaitForChild(child, &status)) { + kill(child, SIGKILL); + waitpid(child, nullptr, 0); + FAIL() << shell_name << " did not exit after command sequence, captured: " << output; + } + ASSERT_TRUE(WIFEXITED(status) || WIFSIGNALED(status)); +} + +void RunRepeatedShellCommandSequence(void (*exec_shell)(int), const char* shell_name) { + PtyPair pair = OpenPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + SetNonblock(pair.master.get()); + + pid_t child = fork(); + ASSERT_GE(child, 0) << strerror(errno); + if (child == 0) { + pair.master.reset(); + exec_shell(pair.slave.get()); + } + + pair.slave.reset(); + + std::string output; + for (int i = 0; i < 4; ++i) { + size_t before_ls = output.size(); + ASSERT_TRUE(WriteAll(pair.master.get(), "ls /\n", strlen("ls /\n"))) + << "write ls failed through " << shell_name << " iteration " << i + << ": errno=" << errno << " (" << strerror(errno) << ")"; + ASSERT_TRUE(ReadUntilContainsFrom(pair.master.get(), "bin", &output, before_ls, 3000)) + << "did not observe fresh ls output through " << shell_name << " iteration " << i + << ", captured: " << output; + + size_t before_uname = output.size(); + 
ASSERT_TRUE(WriteAll(pair.master.get(), "uname -a\n", strlen("uname -a\n"))) + << "write uname failed through " << shell_name << " iteration " << i + << ": errno=" << errno << " (" << strerror(errno) << ")"; + ASSERT_TRUE( + ReadUntilContainsFrom(pair.master.get(), "Linux", &output, before_uname, 5000)) + << "did not observe fresh uname output through " << shell_name << " iteration " << i + << ", captured: " << output; + } + + ASSERT_TRUE(WriteAll(pair.master.get(), "exit\n", strlen("exit\n"))) + << "write exit failed through " << shell_name << ": errno=" << errno << " (" + << strerror(errno) << ")"; + + int status = 0; + if (!WaitForChild(child, &status)) { + kill(child, SIGKILL); + waitpid(child, nullptr, 0); + FAIL() << shell_name << " did not exit after repeated command sequence, captured: " + << output; + } + ASSERT_TRUE(WIFEXITED(status) || WIFSIGNALED(status)); +} + +enum MockShimStage : int { + kShimChildForked = 1 << 0, + kShimStdinForwardStarted = 1 << 1, + kShimStdinForwardFinished = 1 << 2, + kShimSawLsOutput = 1 << 3, + kShimSawUnameOutput = 1 << 4, + kShimChildExited = 1 << 5, +}; + +void AppendStage(std::string* out, int stages, int bit, const char* name) { + if ((stages & bit) == 0) { + return; + } + if (!out->empty()) { + out->append("|"); + } + out->append(name); +} + +std::string DescribeShimStages(int stages) { + std::string out; + AppendStage(&out, stages, kShimChildForked, "child-forked"); + AppendStage(&out, stages, kShimStdinForwardStarted, "stdin-forward-started"); + AppendStage(&out, stages, kShimStdinForwardFinished, "stdin-forward-finished"); + AppendStage(&out, stages, kShimSawLsOutput, "saw-ls-output"); + AppendStage(&out, stages, kShimSawUnameOutput, "saw-uname-output"); + AppendStage(&out, stages, kShimChildExited, "child-exited"); + return out.empty() ? 
"none" : out; +} + +struct MockShimForwarder { + int source_fd = -1; + int pty_master_fd = -1; + std::atomic* stages = nullptr; +}; + +enum MockShimConcurrentStage : int { + kConcurrentChildForked = 1 << 0, + kConcurrentStdinForwardStarted = 1 << 1, + kConcurrentStdinForwardFinished = 1 << 2, + kConcurrentStdoutForwardStarted = 1 << 3, + kConcurrentSawStartMarker = 1 << 4, + kConcurrentSawLsOutput = 1 << 5, + kConcurrentSawUnameOutput = 1 << 6, + kConcurrentSawEndMarker = 1 << 7, + kConcurrentChildExited = 1 << 8, + kConcurrentStdoutForwardFinished = 1 << 9, +}; + +std::string DescribeConcurrentStages(int stages) { + std::string out; + AppendStage(&out, stages, kConcurrentChildForked, "child-forked"); + AppendStage(&out, stages, kConcurrentStdinForwardStarted, "stdin-forward-started"); + AppendStage(&out, stages, kConcurrentStdinForwardFinished, "stdin-forward-finished"); + AppendStage(&out, stages, kConcurrentStdoutForwardStarted, "stdout-forward-started"); + AppendStage(&out, stages, kConcurrentSawStartMarker, "saw-start-marker"); + AppendStage(&out, stages, kConcurrentSawLsOutput, "saw-ls-output"); + AppendStage(&out, stages, kConcurrentSawUnameOutput, "saw-uname-output"); + AppendStage(&out, stages, kConcurrentSawEndMarker, "saw-end-marker"); + AppendStage(&out, stages, kConcurrentChildExited, "child-exited"); + AppendStage(&out, stages, kConcurrentStdoutForwardFinished, "stdout-forward-finished"); + return out.empty() ? 
"none" : out; +} + +struct MockShimStdoutForwarder { + int pty_master_fd = -1; + int client_stdout_fd = -1; + std::atomic* stages = nullptr; + std::atomic* child_exited = nullptr; +}; + +void* ForwardClientInputToPty(void* arg) { + auto* forwarder = reinterpret_cast(arg); + forwarder->stages->fetch_or(kShimStdinForwardStarted, std::memory_order_release); + + std::array buf = {}; + for (;;) { + ssize_t n = read(forwarder->source_fd, buf.data(), buf.size()); + if (n == 0) { + break; + } + if (n < 0) { + if (errno == EINTR) { + continue; + } + return reinterpret_cast(1); + } + + if (!WriteAll(forwarder->pty_master_fd, buf.data(), static_cast(n))) { + return reinterpret_cast(2); + } + } + + forwarder->stages->fetch_or(kShimStdinForwardFinished, std::memory_order_release); + return nullptr; +} + +void* ForwardPtyOutputToClient(void* arg) { + auto* forwarder = reinterpret_cast(arg); + forwarder->stages->fetch_or(kConcurrentStdoutForwardStarted, std::memory_order_release); + + int idle_after_child_exit = 0; + std::array buf = {}; + for (;;) { + struct pollfd pfd = { + .fd = forwarder->pty_master_fd, + .events = POLLIN | POLLERR | POLLHUP, + .revents = 0, + }; + struct timespec ts = { + .tv_sec = 0, + .tv_nsec = 10 * 1000 * 1000, + }; + sigset_t empty; + sigemptyset(&empty); + int ret = ppoll(&pfd, 1, &ts, &empty); + if (ret < 0) { + if (errno == EINTR) { + continue; + } + close(forwarder->client_stdout_fd); + return reinterpret_cast(1); + } + + if (ret == 0) { + if (forwarder->child_exited->load(std::memory_order_acquire) != 0 && + ++idle_after_child_exit >= 5) { + break; + } + continue; + } + idle_after_child_exit = 0; + + if ((pfd.revents & POLLIN) != 0) { + for (;;) { + ssize_t n = read(forwarder->pty_master_fd, buf.data(), buf.size()); + if (n > 0) { + if (!WriteAll(forwarder->client_stdout_fd, buf.data(), + static_cast(n))) { + close(forwarder->client_stdout_fd); + return reinterpret_cast(2); + } + continue; + } + if (n == 0 || (n < 0 && errno == EIO)) { + break; + } 
+ if (n < 0 && errno == EINTR) { + continue; + } + if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + break; + } + close(forwarder->client_stdout_fd); + return reinterpret_cast(3); + } + } + + if ((pfd.revents & (POLLERR | POLLHUP)) != 0) { + break; + } + } + + forwarder->stages->fetch_or(kConcurrentStdoutForwardFinished, std::memory_order_release); + close(forwarder->client_stdout_fd); + return nullptr; +} + +void RunCubeShimLikeExecChain(void (*exec_shell)(int), const char* shell_name) { + // Model the interactive Cube exec path at the kernel ABI boundary: + // client stdin pipe -> shim forwarder thread -> PTY master -> shell on the + // controlling PTY slave -> shim stdout polling on the PTY master. + PtyPair pair = OpenPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + SetNonblock(pair.master.get()); + + int client_stdin[2] = {-1, -1}; + ASSERT_EQ(0, pipe(client_stdin)) << strerror(errno); + UniqueFd client_stdin_read(client_stdin[0]); + UniqueFd client_stdin_write(client_stdin[1]); + + std::atomic stages{0}; + + pid_t child = fork(); + ASSERT_GE(child, 0) << strerror(errno); + if (child == 0) { + client_stdin_read.reset(); + client_stdin_write.reset(); + pair.master.reset(); + exec_shell(pair.slave.get()); + } + + stages.fetch_or(kShimChildForked, std::memory_order_release); + pair.slave.reset(); + + MockShimForwarder forwarder = { + .source_fd = client_stdin_read.get(), + .pty_master_fd = pair.master.get(), + .stages = &stages, + }; + + pthread_t stdin_thread = {}; + ASSERT_EQ(0, pthread_create(&stdin_thread, nullptr, ForwardClientInputToPty, &forwarder)) + << strerror(errno); + + ASSERT_TRUE(WaitUntilAtomic(stages, kShimStdinForwardStarted, 1000)) + << "stdin forwarder did not start for " << shell_name; + + constexpr char kCommands[] = "ls /\nuname -a\nexit\n"; + ASSERT_TRUE(WriteAll(client_stdin_write.get(), kCommands, strlen(kCommands))) + << "client stdin write failed for " << shell_name << ": errno=" << errno << " (" + 
<< strerror(errno) << ")"; + client_stdin_write.reset(); + + std::string output; + int elapsed_ms = 0; + while (elapsed_ms < 5000) { + struct pollfd pfd = { + .fd = pair.master.get(), + .events = POLLIN | POLLERR | POLLHUP, + .revents = 0, + }; + struct timespec ts = { + .tv_sec = 0, + .tv_nsec = 10 * 1000 * 1000, + }; + sigset_t empty; + sigemptyset(&empty); + int ret = ppoll(&pfd, 1, &ts, &empty); + if (ret < 0) { + if (errno == EINTR) { + continue; + } + FAIL() << "ppoll failed in mock shim stdout loop for " << shell_name + << ": errno=" << errno << " (" << strerror(errno) << ")"; + } + if (ret == 0) { + elapsed_ms += 10; + continue; + } + + if ((pfd.revents & POLLIN) != 0) { + std::array buf = {}; + ssize_t n = read(pair.master.get(), buf.data(), buf.size()); + if (n > 0) { + output.append(buf.data(), static_cast(n)); + if (output.find("bin") != std::string::npos) { + stages.fetch_or(kShimSawLsOutput, std::memory_order_release); + } + if (output.find("Linux") != std::string::npos) { + stages.fetch_or(kShimSawUnameOutput, std::memory_order_release); + break; + } + continue; + } + if (n < 0 && (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)) { + continue; + } + } + + if ((pfd.revents & (POLLERR | POLLHUP)) != 0) { + break; + } + } + + void* thread_result = nullptr; + ASSERT_EQ(0, pthread_join(stdin_thread, &thread_result)) << strerror(errno); + EXPECT_EQ(nullptr, thread_result) << "stdin forwarder failed for " << shell_name; + + int status = 0; + if (!WaitForChild(child, &status)) { + int observed = stages.load(std::memory_order_acquire); + kill(child, SIGKILL); + waitpid(child, nullptr, 0); + FAIL() << "mock shim child did not exit for " << shell_name << ", stages=0x" << std::hex + << observed << " (" << DescribeShimStages(observed) << "), captured: " << output; + } + stages.fetch_or(kShimChildExited, std::memory_order_release); + + constexpr int kExpectedStages = kShimChildForked | kShimStdinForwardStarted | + kShimStdinForwardFinished | 
kShimSawLsOutput | + kShimSawUnameOutput | kShimChildExited; + int observed = stages.load(std::memory_order_acquire); + EXPECT_EQ(kExpectedStages, observed) + << "mock shim chain did not reach all stages for " << shell_name + << ", stages=0x" << std::hex << observed << " (" << DescribeShimStages(observed) << ")" + << ", captured output: " << output; + ASSERT_TRUE(WIFEXITED(status) || WIFSIGNALED(status)); +} + +void RunCubeShimLikeConcurrentForwarders(void (*exec_shell)(int), const char* shell_name) { + PtyPair pair = OpenPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + SetNonblock(pair.master.get()); + + int client_stdin[2] = {-1, -1}; + ASSERT_EQ(0, pipe(client_stdin)) << strerror(errno); + UniqueFd client_stdin_read(client_stdin[0]); + UniqueFd client_stdin_write(client_stdin[1]); + + int client_stdout[2] = {-1, -1}; + ASSERT_EQ(0, pipe(client_stdout)) << strerror(errno); + UniqueFd client_stdout_read(client_stdout[0]); + int client_stdout_write = client_stdout[1]; + SetNonblock(client_stdout_read.get()); + + std::atomic stages{0}; + std::atomic child_exited{0}; + + pid_t child = fork(); + ASSERT_GE(child, 0) << strerror(errno); + if (child == 0) { + client_stdin_read.reset(); + client_stdin_write.reset(); + client_stdout_read.reset(); + close(client_stdout_write); + pair.master.reset(); + exec_shell(pair.slave.get()); + } + + stages.fetch_or(kConcurrentChildForked, std::memory_order_release); + pair.slave.reset(); + + MockShimForwarder stdin_forwarder = { + .source_fd = client_stdin_read.get(), + .pty_master_fd = pair.master.get(), + .stages = &stages, + }; + MockShimStdoutForwarder stdout_forwarder = { + .pty_master_fd = pair.master.get(), + .client_stdout_fd = client_stdout_write, + .stages = &stages, + .child_exited = &child_exited, + }; + + pthread_t stdin_thread = {}; + ASSERT_EQ(0, pthread_create(&stdin_thread, nullptr, ForwardClientInputToPty, &stdin_forwarder)) + << strerror(errno); + + pthread_t stdout_thread = {}; + 
ASSERT_EQ(0, + pthread_create(&stdout_thread, nullptr, ForwardPtyOutputToClient, &stdout_forwarder)) + << strerror(errno); + + ASSERT_TRUE(WaitUntilAtomic(stages, kShimStdinForwardStarted, 1000)) + << "stdin forwarder did not start for " << shell_name; + ASSERT_TRUE(WaitUntilAtomic(stages, kConcurrentStdoutForwardStarted, 1000)) + << "stdout forwarder did not start for " << shell_name; + + constexpr char kCommands[] = + "echo cube-atomic-start\n" + "ls /\n" + "uname -a\n" + "echo cube-atomic-end\n" + "exit\n"; + ASSERT_TRUE(WriteAll(client_stdin_write.get(), kCommands, strlen(kCommands))) + << "client stdin write failed for " << shell_name << ": errno=" << errno << " (" + << strerror(errno) << ")"; + client_stdin_write.reset(); + + std::string output; + ASSERT_TRUE(ReadUntilContainsFrom(client_stdout_read.get(), "cube-atomic-start", &output, + 0, 3000)) + << "did not observe start marker for " << shell_name << ", captured: " << output; + stages.fetch_or(kConcurrentSawStartMarker, std::memory_order_release); + + ASSERT_TRUE(ReadUntilContainsFrom(client_stdout_read.get(), "bin", &output, 0, 3000)) + << "did not observe ls output for " << shell_name << ", captured: " << output; + stages.fetch_or(kConcurrentSawLsOutput, std::memory_order_release); + + ASSERT_TRUE(ReadUntilContainsFrom(client_stdout_read.get(), "Linux", &output, 0, 5000)) + << "did not observe uname output for " << shell_name << ", captured: " << output; + stages.fetch_or(kConcurrentSawUnameOutput, std::memory_order_release); + + ASSERT_TRUE(ReadUntilContainsFrom(client_stdout_read.get(), "cube-atomic-end", &output, + 0, 3000)) + << "did not observe end marker for " << shell_name << ", captured: " << output; + stages.fetch_or(kConcurrentSawEndMarker, std::memory_order_release); + + void* stdin_result = nullptr; + ASSERT_EQ(0, pthread_join(stdin_thread, &stdin_result)) << strerror(errno); + EXPECT_EQ(nullptr, stdin_result) << "stdin forwarder failed for " << shell_name; + + int status = 0; + if 
(!WaitForChild(child, &status)) { + int observed = stages.load(std::memory_order_acquire); + kill(child, SIGKILL); + waitpid(child, nullptr, 0); + child_exited.store(1, std::memory_order_release); + pthread_join(stdout_thread, nullptr); + FAIL() << "child did not exit for " << shell_name << ", stages=0x" << std::hex + << observed << " (" << DescribeConcurrentStages(observed) + << "), captured: " << output; + } + stages.fetch_or(kConcurrentChildExited, std::memory_order_release); + child_exited.store(1, std::memory_order_release); + + void* stdout_result = nullptr; + ASSERT_EQ(0, pthread_join(stdout_thread, &stdout_result)) << strerror(errno); + EXPECT_EQ(nullptr, stdout_result) << "stdout forwarder failed for " << shell_name; + + constexpr int kExpectedStages = + kConcurrentChildForked | kConcurrentStdinForwardStarted | + kConcurrentStdinForwardFinished | kConcurrentStdoutForwardStarted | + kConcurrentSawStartMarker | kConcurrentSawLsOutput | kConcurrentSawUnameOutput | + kConcurrentSawEndMarker | kConcurrentChildExited | kConcurrentStdoutForwardFinished; + int observed = stages.load(std::memory_order_acquire); + EXPECT_EQ(kExpectedStages, observed) + << "concurrent shim chain did not reach all stages for " << shell_name + << ", stages=0x" << std::hex << observed << " (" + << DescribeConcurrentStages(observed) << ")" + << ", captured output: " << output; + ASSERT_TRUE(WIFEXITED(status) || WIFSIGNALED(status)); +} + +void RunCubeShimLikeByteStreamInput(void (*exec_shell)(int), const char* shell_name) { + PtyPair pair = OpenPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + SetNonblock(pair.master.get()); + + int client_stdin[2] = {-1, -1}; + ASSERT_EQ(0, pipe(client_stdin)) << strerror(errno); + UniqueFd client_stdin_read(client_stdin[0]); + UniqueFd client_stdin_write(client_stdin[1]); + + int client_stdout[2] = {-1, -1}; + ASSERT_EQ(0, pipe(client_stdout)) << strerror(errno); + UniqueFd client_stdout_read(client_stdout[0]); + int 
client_stdout_write = client_stdout[1]; + SetNonblock(client_stdout_read.get()); + + std::atomic stages{0}; + std::atomic child_exited{0}; + + pid_t child = fork(); + ASSERT_GE(child, 0) << strerror(errno); + if (child == 0) { + client_stdin_read.reset(); + client_stdin_write.reset(); + client_stdout_read.reset(); + close(client_stdout_write); + pair.master.reset(); + exec_shell(pair.slave.get()); + } + + stages.fetch_or(kConcurrentChildForked, std::memory_order_release); + pair.slave.reset(); + + MockShimForwarder stdin_forwarder = { + .source_fd = client_stdin_read.get(), + .pty_master_fd = pair.master.get(), + .stages = &stages, + }; + MockShimStdoutForwarder stdout_forwarder = { + .pty_master_fd = pair.master.get(), + .client_stdout_fd = client_stdout_write, + .stages = &stages, + .child_exited = &child_exited, + }; + + pthread_t stdin_thread = {}; + ASSERT_EQ(0, pthread_create(&stdin_thread, nullptr, ForwardClientInputToPty, &stdin_forwarder)) + << strerror(errno); + + pthread_t stdout_thread = {}; + ASSERT_EQ(0, + pthread_create(&stdout_thread, nullptr, ForwardPtyOutputToClient, &stdout_forwarder)) + << strerror(errno); + + ASSERT_TRUE(WaitUntilAtomic(stages, kShimStdinForwardStarted, 1000)) + << "stdin forwarder did not start for " << shell_name; + ASSERT_TRUE(WaitUntilAtomic(stages, kConcurrentStdoutForwardStarted, 1000)) + << "stdout forwarder did not start for " << shell_name; + + constexpr char kCommands[] = + "echo cube-byte-start\n" + "echo cube-byte-done\n" + "exit\n"; + for (size_t i = 0; i < sizeof(kCommands) - 1; ++i) { + ASSERT_TRUE(WriteAll(client_stdin_write.get(), &kCommands[i], 1)) + << "client byte write failed for " << shell_name << " at byte " << i + << ": errno=" << errno << " (" << strerror(errno) << ")"; + usleep(1000); + } + client_stdin_write.reset(); + + std::string output; + ASSERT_TRUE(ReadUntilContainsFrom(client_stdout_read.get(), "cube-byte-start", &output, + 0, 5000)) + << "did not observe byte-stream start marker for " << 
shell_name + << ", captured: " << output; + ASSERT_TRUE(ReadUntilContainsFrom(client_stdout_read.get(), "cube-byte-done", &output, + 0, 5000)) + << "did not observe byte-stream done marker for " << shell_name + << ", captured: " << output; + + void* stdin_result = nullptr; + ASSERT_EQ(0, pthread_join(stdin_thread, &stdin_result)) << strerror(errno); + EXPECT_EQ(nullptr, stdin_result) << "stdin forwarder failed for " << shell_name; + + int status = 0; + if (!WaitForChild(child, &status)) { + int observed = stages.load(std::memory_order_acquire); + kill(child, SIGKILL); + waitpid(child, nullptr, 0); + child_exited.store(1, std::memory_order_release); + pthread_join(stdout_thread, nullptr); + FAIL() << "byte-stream child did not exit for " << shell_name << ", stages=0x" + << std::hex << observed << " (" << DescribeConcurrentStages(observed) + << "), captured: " << output; + } + child_exited.store(1, std::memory_order_release); + + void* stdout_result = nullptr; + ASSERT_EQ(0, pthread_join(stdout_thread, &stdout_result)) << strerror(errno); + EXPECT_EQ(nullptr, stdout_result) << "stdout forwarder failed for " << shell_name; + ASSERT_TRUE(WIFEXITED(status) || WIFSIGNALED(status)); +} + +int RunVforkExecLsAndReport(int report_fd) { + int stdout_pipe[2] = {-1, -1}; + if (pipe(stdout_pipe) != 0) { + return 10; + } + + pid_t child = vfork(); + if (child == 0) { + close(stdout_pipe[0]); + dup2(stdout_pipe[1], STDOUT_FILENO); + dup2(stdout_pipe[1], STDERR_FILENO); + if (stdout_pipe[1] > STDERR_FILENO) { + close(stdout_pipe[1]); + } + ExecLsProgram(); + } + if (child < 0) { + return 11; + } + + close(stdout_pipe[1]); + + std::string output; + std::array buf = {}; + for (;;) { + ssize_t n = read(stdout_pipe[0], buf.data(), buf.size()); + if (n > 0) { + output.append(buf.data(), static_cast(n)); + continue; + } + if (n < 0 && errno == EINTR) { + continue; + } + break; + } + close(stdout_pipe[0]); + + int status = 0; + if (waitpid(child, &status, 0) != child) { + return 12; + } + 
if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + return 13; + } + if (output.find("bin") == std::string::npos) { + return 14; + } + char ok = '1'; + if (write(report_fd, &ok, 1) != 1) { + return 15; + } + return 0; +} + +int RunPidNamespaceInteractiveShellAndReport(int report_fd) { + int master = -1; + int slave = -1; + if (openpty(&master, &slave, nullptr, nullptr, nullptr) != 0) { + WriteReport(report_fd, "openpty failed"); + return 30; + } + if (!SetNonblockNoAssert(master)) { + WriteReport(report_fd, "set nonblock failed"); + close(master); + close(slave); + return 31; + } + + pid_t shell = fork(); + if (shell == 0) { + close(master); + ExecDefaultShellOnSlave(slave); + } + if (shell < 0) { + WriteReport(report_fd, "fork shell failed"); + close(master); + close(slave); + return 32; + } + + close(slave); + + std::string output; + ReadUntilContains(master, " ", &output, 1000); + + constexpr char kCommands[] = + "echo cube-pidns-start\n" + "ls /\n" + "uname -a\n" + "echo cube-pidns-end\n" + "exit\n"; + if (!WriteAll(master, kCommands, strlen(kCommands))) { + WriteReport(report_fd, "write commands failed: " + std::to_string(errno)); + kill(shell, SIGKILL); + waitpid(shell, nullptr, 0); + close(master); + return 33; + } + + if (!ReadUntilContains(master, "cube-pidns-start", &output, 3000)) { + WriteReport(report_fd, "missing start marker: " + output); + kill(shell, SIGKILL); + waitpid(shell, nullptr, 0); + close(master); + return 34; + } + if (!ReadUntilContains(master, "bin", &output, 3000)) { + WriteReport(report_fd, "missing ls output: " + output); + kill(shell, SIGKILL); + waitpid(shell, nullptr, 0); + close(master); + return 35; + } + if (!ReadUntilContains(master, "Linux", &output, 5000)) { + WriteReport(report_fd, "missing uname output: " + output); + kill(shell, SIGKILL); + waitpid(shell, nullptr, 0); + close(master); + return 36; + } + if (!ReadUntilContains(master, "cube-pidns-end", &output, 3000)) { + WriteReport(report_fd, "missing end marker: " + 
output); + kill(shell, SIGKILL); + waitpid(shell, nullptr, 0); + close(master); + return 37; + } + + int status = 0; + if (!WaitForChild(shell, &status)) { + WriteReport(report_fd, "shell did not exit: " + output); + kill(shell, SIGKILL); + waitpid(shell, nullptr, 0); + close(master); + return 38; + } + close(master); + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + WriteReport(report_fd, "shell bad status: " + std::to_string(status) + " output: " + + output); + return 39; + } + + WriteReport(report_fd, "OK"); + return 0; +} + +void ExpectVforkExecLsCompletesInChildPidNamespace() { + int report_pipe[2] = {-1, -1}; + ASSERT_EQ(0, pipe(report_pipe)) << strerror(errno); + UniqueFd report_read(report_pipe[0]); + UniqueFd report_write(report_pipe[1]); + SetNonblock(report_read.get()); + + pid_t outer = fork(); + ASSERT_GE(outer, 0) << strerror(errno); + if (outer == 0) { + report_read.reset(); + if (unshare(CLONE_NEWPID) != 0) { + _exit(errno == ENOSYS || errno == EINVAL ? 77 : 20); + } + + pid_t init = fork(); + if (init == 0) { + int rc = RunVforkExecLsAndReport(report_write.get()); + _exit(rc); + } + if (init < 0) { + _exit(21); + } + + int status = 0; + if (waitpid(init, &status, 0) != init) { + _exit(22); + } + if (!WIFEXITED(status)) { + _exit(23); + } + _exit(WEXITSTATUS(status)); + } + + report_write.reset(); + + std::string report; + int status = 0; + bool exited = false; + for (int elapsed_ms = 0; elapsed_ms < 5000; elapsed_ms += 10) { + char ch = 0; + ssize_t n = read(report_read.get(), &ch, 1); + if (n == 1) { + report.push_back(ch); + } + pid_t ret = waitpid(outer, &status, WNOHANG); + if (ret == outer) { + exited = true; + break; + } + ASSERT_FALSE(ret < 0 && errno != EINTR) << strerror(errno); + usleep(10 * 1000); + } + + if (!exited) { + KillAndReap(outer); + FAIL() << "vfork+exec /bin/ls did not finish inside a child PID namespace"; + } + ASSERT_TRUE(WIFEXITED(status)) << "outer status=" << status; + if (WEXITSTATUS(status) == 77) { + 
GTEST_SKIP() << "CLONE_NEWPID is not available"; + } + ASSERT_EQ(0, WEXITSTATUS(status)) << "outer exit status=" << WEXITSTATUS(status); + EXPECT_EQ("1", report); +} + +void ExpectInteractiveShellCommandsCompleteInChildPidNamespace() { + int report_pipe[2] = {-1, -1}; + ASSERT_EQ(0, pipe(report_pipe)) << strerror(errno); + UniqueFd report_read(report_pipe[0]); + UniqueFd report_write(report_pipe[1]); + SetNonblock(report_read.get()); + + pid_t outer = fork(); + ASSERT_GE(outer, 0) << strerror(errno); + if (outer == 0) { + report_read.reset(); + if (unshare(CLONE_NEWPID) != 0) { + _exit(errno == ENOSYS || errno == EINVAL ? 77 : 40); + } + + pid_t init = fork(); + if (init == 0) { + int rc = RunPidNamespaceInteractiveShellAndReport(report_write.get()); + _exit(rc); + } + if (init < 0) { + _exit(41); + } + + int status = 0; + if (waitpid(init, &status, 0) != init) { + _exit(42); + } + if (!WIFEXITED(status)) { + _exit(43); + } + _exit(WEXITSTATUS(status)); + } + + report_write.reset(); + + std::string report; + int status = 0; + bool exited = false; + for (int elapsed_ms = 0; elapsed_ms < 10000; elapsed_ms += 10) { + std::array buf = {}; + ssize_t n = read(report_read.get(), buf.data(), buf.size()); + if (n > 0) { + report.append(buf.data(), static_cast(n)); + } + + pid_t ret = waitpid(outer, &status, WNOHANG); + if (ret == outer) { + exited = true; + break; + } + ASSERT_FALSE(ret < 0 && errno != EINTR) << strerror(errno); + usleep(10 * 1000); + } + + if (!exited) { + KillAndReap(outer); + FAIL() << "interactive shell commands did not finish inside a child PID namespace, " + "captured report: " + << report; + } + ASSERT_TRUE(WIFEXITED(status)) << "outer status=" << status << ", report: " << report; + if (WEXITSTATUS(status) == 77) { + GTEST_SKIP() << "CLONE_NEWPID is not available"; + } + ASSERT_EQ(0, WEXITSTATUS(status)) + << "outer exit status=" << WEXITSTATUS(status) << ", report: " << report; + EXPECT_EQ("OK", report); +} + +TEST(CubeSandboxPtyExecChain, 
BlockingPtyMasterReadDoesNotBlockConcurrentWrite) { + PtyPair pair = OpenPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + + pid_t shell = fork(); + ASSERT_GE(shell, 0) << strerror(errno); + if (shell == 0) { + pair.master.reset(); + ExecDefaultShellOnSlave(pair.slave.get()); + } + + pair.slave.reset(); + + int captured_pipe[2] = {-1, -1}; + ASSERT_EQ(0, pipe(captured_pipe)) << strerror(errno); + UniqueFd captured_read(captured_pipe[0]); + UniqueFd captured_write(captured_pipe[1]); + SetNonblock(captured_read.get()); + + pid_t reader = fork(); + ASSERT_GE(reader, 0) << strerror(errno); + if (reader == 0) { + captured_read.reset(); + + std::array buf = {}; + for (;;) { + ssize_t n = read(pair.master.get(), buf.data(), buf.size()); + if (n > 0) { + if (!WriteAll(captured_write.get(), buf.data(), static_cast(n))) { + _exit(2); + } + continue; + } + if (n < 0 && errno == EINTR) { + continue; + } + _exit(n == 0 ? 0 : 1); + } + } + + captured_write.reset(); + + std::string output; + ASSERT_TRUE(ReadUntilContainsFrom(captured_read.get(), " ", &output, 0, 3000)) + << "reader did not capture any shell output"; + ASSERT_TRUE(OutputLooksLikeShellPrompt(output)) << "initial prompt not captured: " << output; + + constexpr char kEchoCommand[] = "echo cube-blocking-read-ok\n"; + ASSERT_TRUE(WriteAll(pair.master.get(), kEchoCommand, strlen(kEchoCommand))) + << "write to PTY master failed while reader is blocked: errno=" << errno << " (" + << strerror(errno) << ")"; + + const size_t after_prompt = output.size(); + ASSERT_TRUE(ReadUntilContainsFrom(captured_read.get(), "cube-blocking-read-ok", &output, + after_prompt, 5000)) + << "blocking reader did not observe command output after concurrent write, captured: " + << output; + + ASSERT_TRUE(WriteAll(pair.master.get(), "exit\n", strlen("exit\n"))) + << "write exit failed: errno=" << errno << " (" << strerror(errno) << ")"; + + int shell_status = 0; + if (!WaitForChild(shell, &shell_status)) { + kill(shell, 
SIGKILL); + waitpid(shell, nullptr, 0); + kill(reader, SIGKILL); + waitpid(reader, nullptr, 0); + FAIL() << "shell did not exit after blocking-read test, captured: " << output; + } + + pair.master.reset(); + + int reader_status = 0; + if (!WaitForChild(reader, &reader_status)) { + kill(reader, SIGKILL); + waitpid(reader, nullptr, 0); + FAIL() << "PTY master reader did not exit after shell close, captured: " << output; + } + + ASSERT_TRUE(WIFEXITED(shell_status) || WIFSIGNALED(shell_status)); + ASSERT_TRUE(WIFEXITED(reader_status) || WIFSIGNALED(reader_status)); +} + +TEST(CubeSandboxPtyExecChain, VforkExecLsCompletesInChildPidNamespace) { + ExpectVforkExecLsCompletesInChildPidNamespace(); +} + +TEST(CubeSandboxPtyExecChain, InteractiveShellCommandsCompleteInChildPidNamespace) { + ExpectInteractiveShellCommandsCompleteInChildPidNamespace(); +} + +TEST(CubeSandboxPtyExecChain, PipeExecCollectsUnameStdout) { + int stdout_pipe[2] = {-1, -1}; + ASSERT_EQ(0, pipe(stdout_pipe)) << strerror(errno); + UniqueFd stdout_read(stdout_pipe[0]); + UniqueFd stdout_write(stdout_pipe[1]); + SetNonblock(stdout_read.get()); + + pid_t child = fork(); + ASSERT_GE(child, 0) << strerror(errno); + if (child == 0) { + stdout_read.reset(); + dup2(stdout_write.get(), STDOUT_FILENO); + dup2(stdout_write.get(), STDERR_FILENO); + if (stdout_write.get() > STDERR_FILENO) { + stdout_write.reset(); + } + ExecUnameProgram(); + } + + stdout_write.reset(); + + int status = 0; + std::string output = CollectFdUntilChildExit(stdout_read.get(), child, 5000, &status); + ASSERT_TRUE(WIFEXITED(status)) << "uname status=" << status << ", output: " << output; + ASSERT_EQ(0, WEXITSTATUS(status)) << "uname output: " << output; + EXPECT_NE(std::string::npos, output.find("Linux")) << "uname output: " << output; +} + +TEST(CubeSandboxPtyExecChain, ZeroLengthPipeIoMatchesLinux) { + int fds[2] = {-1, -1}; + ASSERT_EQ(0, pipe(fds)) << strerror(errno); + UniqueFd read_end(fds[0]); + UniqueFd write_end(fds[1]); + + 
ExpectZeroLengthReadReturnsZero(read_end.get(), "pipe read end"); + ExpectZeroLengthWriteReturnsZero(write_end.get(), "pipe write end"); + + ASSERT_TRUE(WriteAll(write_end.get(), "x", 1)) + << "pipe write failed after zero-length operations: errno=" << errno << " (" + << strerror(errno) << ")"; + char observed = 0; + ASSERT_EQ(1, read(read_end.get(), &observed, 1)) << strerror(errno); + EXPECT_EQ('x', observed); +} + +TEST(CubeSandboxPtyExecChain, PtyExecDirectUnameEmitsOutputAndExits) { + PtyPair pair = OpenPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + SetNonblock(pair.master.get()); + + pid_t child = fork(); + ASSERT_GE(child, 0) << strerror(errno); + if (child == 0) { + pair.master.reset(); + ExecUnameOnSlave(pair.slave.get()); + } + + pair.slave.reset(); + + int status = 0; + std::string output = CollectFdUntilChildExit(pair.master.get(), child, 5000, &status); + ASSERT_TRUE(WIFEXITED(status)) << "uname status=" << status << ", output: " << output; + ASSERT_EQ(0, WEXITSTATUS(status)) << "uname output: " << output; + EXPECT_NE(std::string::npos, output.find("Linux")) << "uname output: " << output; +} + +TEST(CubeSandboxPtyExecChain, ZeroLengthPtyIoMatchesLinux) { + PtyPair pair = OpenPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + SetRawByteMode(pair.slave.get()); + + ExpectZeroLengthReadReturnsZero(pair.master.get(), "pty master"); + ExpectZeroLengthReadReturnsZero(pair.slave.get(), "pty slave"); + ExpectZeroLengthWriteReturnsZero(pair.master.get(), "pty master"); + ExpectZeroLengthWriteReturnsZero(pair.slave.get(), "pty slave"); + + ASSERT_TRUE(WriteAll(pair.master.get(), "z", 1)) + << "pty master write failed after zero-length operations: errno=" << errno << " (" + << strerror(errno) << ")"; + char observed = 0; + ASSERT_EQ(1, read(pair.slave.get(), &observed, 1)) << strerror(errno); + EXPECT_EQ('z', observed); +} + +TEST(CubeSandboxPtyExecChain, DefaultPrintkLevelDoesNotEnableDebugConsoleSpam) { + 
ASSERT_EQ("7\t4\t1\t7\n", ReadProcPrintk()); + + WriteProcPrintk("8\n"); + EXPECT_EQ("8\t4\t1\t7\n", ReadProcPrintk()); + + WriteProcPrintk("7\n"); + EXPECT_EQ("7\t4\t1\t7\n", ReadProcPrintk()); +} + +TEST(CubeSandboxPtyExecChain, RawPtyByteReadsSurviveSigmaskAndPpoll) { + PtyPair pair = OpenPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + SetRawByteMode(pair.slave.get()); + + int ready_pipe[2] = {-1, -1}; + ASSERT_EQ(0, pipe(ready_pipe)) << strerror(errno); + + pid_t child = fork(); + ASSERT_GE(child, 0) << strerror(errno); + if (child == 0) { + close(ready_pipe[0]); + pair.master.reset(); + + char ready = 'r'; + if (write(ready_pipe[1], &ready, 1) != 1) { + _exit(2); + } + + constexpr char kExpected[] = "ls\nuname -a\n"; + for (size_t i = 0; i < sizeof(kExpected) - 1; ++i) { + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGCHLD); + if (sigprocmask(SIG_SETMASK, &mask, nullptr) < 0) { + _exit(3); + } + + struct pollfd pfd = { + .fd = pair.slave.get(), + .events = POLLIN | POLLERR | POLLHUP, + .revents = 0, + }; + sigset_t empty; + sigemptyset(&empty); + int pret = ppoll(&pfd, 1, nullptr, &empty); + if (pret <= 0 || (pfd.revents & POLLIN) == 0) { + _exit(4); + } + + if (sigprocmask(SIG_SETMASK, &empty, nullptr) < 0) { + _exit(5); + } + + char ch = 0; + ssize_t n = read(pair.slave.get(), &ch, 1); + if (n != 1 || ch != kExpected[i]) { + _exit(6); + } + } + _exit(0); + } + + close(ready_pipe[1]); + pair.slave.reset(); + + char ready = 0; + ASSERT_EQ(1, read(ready_pipe[0], &ready, 1)) << strerror(errno); + ASSERT_EQ('r', ready); + close(ready_pipe[0]); + + ASSERT_TRUE(WriteAll(pair.master.get(), "ls\nuname -a\n", strlen("ls\nuname -a\n"))) + << "write master failed: errno=" << errno << " (" << strerror(errno) << ")"; + + int status = 0; + if (!WaitForChild(child, &status)) { + kill(child, SIGKILL); + waitpid(child, nullptr, 0); + FAIL() << "raw pty byte reader timed out"; + } + ASSERT_TRUE(WIFEXITED(status)); + EXPECT_EQ(0, 
WEXITSTATUS(status)); +} + +TEST(CubeSandboxPtyExecChain, ShellRunsLsThenUnameThroughControllingPty) { + RunShellCommandSequence(ExecDefaultShellOnSlave, "default shell"); +} + +TEST(CubeSandboxPtyExecChain, ShellRepeatedlyRunsLsThenUnameThroughControllingPty) { + RunRepeatedShellCommandSequence(ExecDefaultShellOnSlave, "default shell"); +} + +TEST(CubeSandboxPtyExecChain, BusyBoxAshRunsLsThenUnameThroughControllingPty) { + if (access("/bin/busybox", X_OK) != 0) { + GTEST_SKIP() << "/bin/busybox is not available on this host"; + } + RunShellCommandSequence(ExecBusyBoxShellOnSlave, "busybox ash"); +} + +TEST(CubeSandboxPtyExecChain, MockShimForwardsClientInputAndCollectsExecOutput) { + RunCubeShimLikeExecChain(ExecDefaultShellOnSlave, "default shell"); +} + +TEST(CubeSandboxPtyExecChain, MockShimConcurrentForwardersPublishExecProgress) { + RunCubeShimLikeConcurrentForwarders(ExecDefaultShellOnSlave, "default shell"); +} + +TEST(CubeSandboxPtyExecChain, MockShimByteStreamInputCommandsReachShell) { + RunCubeShimLikeByteStreamInput(ExecDefaultShellOnSlave, "default shell"); +} + +} // namespace + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/user/apps/tests/dunitest/suites/normal/debugfs_mount.cc b/user/apps/tests/dunitest/suites/normal/debugfs_mount.cc new file mode 100644 index 0000000000..052d4d253e --- /dev/null +++ b/user/apps/tests/dunitest/suites/normal/debugfs_mount.cc @@ -0,0 +1,90 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "cap_common.h" + +namespace { + +int ensure_dir(const char* path) { + struct stat st = {}; + if (stat(path, &st) == 0) { + return S_ISDIR(st.st_mode) ? 
0 : -1; + } + return mkdir(path, 0755); +} + +TEST(DebugFsMount, MountCreatesDebugFilesystem) { + char root[128] = {}; + snprintf(root, sizeof(root), "/tmp/debugfs_mount_%d", getpid()); + + ASSERT_EQ(0, ensure_dir("/tmp")); + ASSERT_EQ(0, mkdir(root, 0755)) << strerror(errno); + + if (mount("none", root, "debugfs", 0, nullptr) != 0) { + int saved_errno = errno; + rmdir(root); + FAIL() << "mount debugfs failed: errno=" << saved_errno << " (" << strerror(saved_errno) + << ")"; + } + + struct stat st = {}; + ASSERT_EQ(0, stat(root, &st)) << strerror(errno); + EXPECT_TRUE(S_ISDIR(st.st_mode)); + + ASSERT_EQ(0, umount(root)) << strerror(errno); + ASSERT_EQ(0, rmdir(root)) << strerror(errno); +} + +TEST(DebugFsMount, MountRequiresCapSysAdmin) { + char root[128] = {}; + snprintf(root, sizeof(root), "/tmp/debugfs_mount_no_cap_%d", getpid()); + + ASSERT_EQ(0, ensure_dir("/tmp")); + ASSERT_EQ(0, mkdir(root, 0755)) << strerror(errno); + + pid_t child = fork(); + ASSERT_GE(child, 0) << strerror(errno); + + if (child == 0) { + cap_user_data_t zero[_LINUX_CAPABILITY_U32S_3] = {}; + if (capset_errno(_LINUX_CAPABILITY_VERSION_3, 0, zero) != 0) { + _exit(10); + } + + errno = 0; + if (mount("none", root, "debugfs", 0, nullptr) == 0) { + umount(root); + _exit(11); + } + _exit(errno == EPERM ? 0 : 12); + } + + int status = 0; + ASSERT_EQ(child, waitpid(child, &status, 0)) << strerror(errno); + + bool child_exited = WIFEXITED(status); + int child_exit_code = child_exited ? 
WEXITSTATUS(status) : -1; + if (child_exit_code == 11) { + umount(root); + } + + EXPECT_TRUE(child_exited) << "child terminated abnormally, status=" << status; + EXPECT_EQ(0, child_exit_code) + << "child expected debugfs mount to fail with EPERM after dropping caps"; + + ASSERT_EQ(0, rmdir(root)) << strerror(errno); +} + +} // namespace + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/user/apps/tests/dunitest/suites/normal/eventfd_pending_signal.cc b/user/apps/tests/dunitest/suites/normal/eventfd_pending_signal.cc new file mode 100644 index 0000000000..9f310b1bad --- /dev/null +++ b/user/apps/tests/dunitest/suites/normal/eventfd_pending_signal.cc @@ -0,0 +1,107 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +class UniqueFd { +public: + UniqueFd() = default; + explicit UniqueFd(int fd) : fd_(fd) {} + UniqueFd(const UniqueFd&) = delete; + UniqueFd& operator=(const UniqueFd&) = delete; + + ~UniqueFd() { reset(); } + + int get() const { return fd_; } + + void reset(int fd = -1) { + if (fd_ >= 0) { + close(fd_); + } + fd_ = fd; + } + +private: + int fd_ = -1; +}; + +void BlockSignal(int signum, sigset_t* old_mask) { + sigset_t blocked; + sigemptyset(&blocked); + sigaddset(&blocked, signum); + ASSERT_EQ(0, sigprocmask(SIG_BLOCK, &blocked, old_mask)) << strerror(errno); +} + +void RestoreSigmask(const sigset_t& old_mask) { + ASSERT_EQ(0, sigprocmask(SIG_SETMASK, &old_mask, nullptr)) << strerror(errno); +} + +void SendQueuedSignalToSelf(int signum) { + siginfo_t info {}; + info.si_signo = signum; + info.si_errno = 0; + info.si_code = SI_QUEUE; + info.si_pid = getpid(); + info.si_uid = getuid(); + info.si_value.sival_int = 0xefd; + + errno = 0; + long ret = syscall(__NR_rt_sigqueueinfo, getpid(), signum, &info); + ASSERT_EQ(0, ret) << "rt_sigqueueinfo failed: errno=" << errno << " (" << strerror(errno) + << ")"; +} + +void 
ExpectQueuedSignal(int signum) { + sigset_t waitset; + sigemptyset(&waitset); + sigaddset(&waitset, signum); + + siginfo_t received {}; + timespec timeout {}; + timeout.tv_sec = 2; + int ret = sigtimedwait(&waitset, &received, &timeout); + ASSERT_EQ(signum, ret) << "sigtimedwait failed: errno=" << errno << " (" << strerror(errno) + << ")"; + EXPECT_EQ(SI_QUEUE, received.si_code); + EXPECT_EQ(0xefd, received.si_value.sival_int); +} + +TEST(EventFdPendingSignal, WriteSucceedsWhenCounterHasSpace) { + sigset_t old_mask; + BlockSignal(SIGUSR1, &old_mask); + SendQueuedSignalToSelf(SIGUSR1); + + UniqueFd fd(eventfd(0, EFD_NONBLOCK)); + ASSERT_GE(fd.get(), 0) << "eventfd failed: " << strerror(errno); + + uint64_t value = 1; + ssize_t written = write(fd.get(), &value, sizeof(value)); + EXPECT_EQ(static_cast(sizeof(value)), written) + << "eventfd write should not fail merely because a signal is pending, errno=" << errno + << " (" << strerror(errno) << ")"; + + uint64_t observed = 0; + ssize_t read_bytes = read(fd.get(), &observed, sizeof(observed)); + ASSERT_EQ(static_cast(sizeof(observed)), read_bytes) + << "eventfd read failed after successful write, errno=" << errno << " (" << strerror(errno) + << ")"; + EXPECT_EQ(value, observed); + + ExpectQueuedSignal(SIGUSR1); + RestoreSigmask(old_mask); +} + +} // namespace + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/user/apps/tests/dunitest/suites/normal/exec_abi.cc b/user/apps/tests/dunitest/suites/normal/exec_abi.cc new file mode 100644 index 0000000000..05040c7ffb --- /dev/null +++ b/user/apps/tests/dunitest/suites/normal/exec_abi.cc @@ -0,0 +1,177 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +char g_self_path[PATH_MAX] = {}; + +int check_auxv_credentials() { + if (getauxval(AT_UID) != static_cast(getuid())) { + return 41; + } + if (getauxval(AT_EUID) != 
static_cast(geteuid())) { + return 42; + } + if (getauxval(AT_GID) != static_cast(getgid())) { + return 43; + } + if (getauxval(AT_EGID) != static_cast(getegid())) { + return 44; + } + return 0; +} + +#if defined(__x86_64__) + +constexpr unsigned char kCheckRdxElf[] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x3e, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x78, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x38, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + + 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + + // _start: + // test %rdx,%rdx + // jnz fail + // exit(0) + // fail: + // exit(42) + 0x48, 0x85, 0xd2, 0x75, 0x09, 0x31, 0xff, 0xb8, 0x3c, 0x00, 0x00, 0x00, + 0x0f, 0x05, 0xbf, 0x2a, 0x00, 0x00, 0x00, 0xb8, 0x3c, 0x00, 0x00, 0x00, + 0x0f, 0x05, +}; + +void write_all(int fd, const void* data, size_t size) { + const char* p = static_cast(data); + while (size > 0) { + ssize_t n = write(fd, p, size); + ASSERT_GT(n, 0) << "write failed: errno=" << errno << " (" << strerror(errno) << ")"; + p += n; + size -= static_cast(n); + } +} + +void write_check_rdx_elf(char* path, size_t path_size) { + snprintf(path, path_size, "/tmp/exec_abi_check_rdx_%d", getpid()); + int fd = open(path, O_CREAT | O_TRUNC | O_WRONLY, 0755); + ASSERT_GE(fd, 0) << "open(" << path << ") failed: errno=" << errno << " (" + << strerror(errno) << ")"; + + write_all(fd, kCheckRdxElf, sizeof(kCheckRdxElf)); + ASSERT_EQ(0, close(fd)) << "close(" << path << ") failed: errno=" << errno << " (" + << 
strerror(errno) << ")"; + ASSERT_EQ(0, chmod(path, 0755)) << "chmod(" << path << ") failed: errno=" << errno << " (" + << strerror(errno) << ")"; +} + +#endif + +void ensure_tmp_dir() { + if (mkdir("/tmp", 0755) != 0 && errno != EEXIST) { + FAIL() << "mkdir(/tmp) failed: errno=" << errno << " (" << strerror(errno) << ")"; + } +} + +} // namespace + +TEST(ExecAbi, X86_64ExecClearsRdxForProgramEntry) { +#if !defined(__x86_64__) + GTEST_SKIP() << "x86_64-specific exec register ABI test"; +#else + ensure_tmp_dir(); +#endif +} + +#if defined(__x86_64__) + +TEST(ExecAbi, X86_64ExecClearsRdxWhenEnvpIsNonNull) { + ensure_tmp_dir(); + + char path[128] = {}; + write_check_rdx_elf(path, sizeof(path)); + + pid_t child = fork(); + ASSERT_GE(child, 0) << "fork failed: errno=" << errno << " (" << strerror(errno) << ")"; + + if (child == 0) { + char arg0[] = "check-rdx"; + char env0[] = "DRAGONOS_EXEC_ABI_RDX=non-null-envp"; + char* const argv[] = {arg0, nullptr}; + char* const envp[] = {env0, nullptr}; + execve(path, argv, envp); + _exit(errno); + } + + int status = 0; + ASSERT_EQ(child, waitpid(child, &status, 0)) + << "waitpid failed: errno=" << errno << " (" << strerror(errno) << ")"; + unlink(path); + + ASSERT_TRUE(WIFEXITED(status)) << "child did not exit normally, status=" << status; + EXPECT_EQ(0, WEXITSTATUS(status)) + << "exec entry %rdx was not cleared; exit 42 means old envp leaked into %rdx"; +} + +#endif + +TEST(ExecAbi, AuxvUidGidFollowCredentialsAtExec) { + ASSERT_NE('\0', g_self_path[0]) << "self executable path was not initialized"; + + pid_t child = fork(); + ASSERT_GE(child, 0) << "fork failed: errno=" << errno << " (" << strerror(errno) << ")"; + + if (child == 0) { + if (setgid(1234) != 0 || setuid(1234) != 0) { + _exit(120); + } + + char env0[] = "DRAGONOS_EXEC_ABI_CHECK_AUXV=1"; + char* const argv[] = {g_self_path, nullptr}; + char* const envp[] = {env0, nullptr}; + execve(g_self_path, argv, envp); + _exit(errno); + } + + int status = 0; + ASSERT_EQ(child, 
waitpid(child, &status, 0)) + << "waitpid failed: errno=" << errno << " (" << strerror(errno) << ")"; + + ASSERT_TRUE(WIFEXITED(status)) << "child did not exit normally, status=" << status; + EXPECT_EQ(0, WEXITSTATUS(status)) + << "exec auxv uid/gid entries did not match process credentials"; +} + +int main(int argc, char** argv) { + if (getenv("DRAGONOS_EXEC_ABI_CHECK_AUXV") != nullptr) { + return check_auxv_credentials(); + } + + ssize_t path_len = readlink("/proc/self/exe", g_self_path, sizeof(g_self_path) - 1); + if (path_len > 0) { + g_self_path[path_len] = '\0'; + } else if (argc > 0 && argv[0] != nullptr) { + strncpy(g_self_path, argv[0], sizeof(g_self_path) - 1); + g_self_path[sizeof(g_self_path) - 1] = '\0'; + } + + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/user/apps/tests/dunitest/suites/normal/ext4_xattr.cc b/user/apps/tests/dunitest/suites/normal/ext4_xattr.cc new file mode 100644 index 0000000000..33fc352abf --- /dev/null +++ b/user/apps/tests/dunitest/suites/normal/ext4_xattr.cc @@ -0,0 +1,183 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace { + +constexpr long kExt4SuperMagic = 0xEF53; + +class TempFile { + public: + TempFile() { + char tmpl[] = "/root/dunitest_ext4_xattr_XXXXXX"; + fd_ = mkstemp(tmpl); + if (fd_ >= 0) { + path_ = tmpl; + } + } + + ~TempFile() { + if (fd_ >= 0) { + close(fd_); + } + if (!path_.empty()) { + unlink(path_.c_str()); + } + } + + TempFile(const TempFile&) = delete; + TempFile& operator=(const TempFile&) = delete; + + bool valid() const { + return fd_ >= 0; + } + + const char* path() const { + return path_.c_str(); + } + + private: + std::string path_; + int fd_ = -1; +}; + +void ExpectValue(const char* path, const char* name, const char* expected) { + char buf[32] = {}; + errno = 0; + ssize_t n = getxattr(path, name, buf, sizeof(buf)); + ASSERT_EQ(static_cast(strlen(expected)), n) + << 
"getxattr failed errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_EQ(0, memcmp(buf, expected, strlen(expected))); +} + +bool IsXattrUnsupported(int err) { + return err == ENOTSUP || err == ENOSYS || err == EOPNOTSUPP; +} + +std::vector SplitXattrList(const std::vector& list, ssize_t len) { + std::vector names; + size_t start = 0; + for (size_t i = 0; i < static_cast(len); ++i) { + if (list[i] == '\0') { + names.emplace_back(list.data() + start, i - start); + start = i + 1; + } + } + EXPECT_EQ(static_cast(len), start) << "xattr list is not NUL terminated"; + return names; +} + +bool ContainsName(const std::vector& names, const char* name) { + return std::find(names.begin(), names.end(), name) != names.end(); +} + +std::vector ExpectListContains(const char* path, const char* name) { + errno = 0; + ssize_t needed = listxattr(path, nullptr, 0); + EXPECT_GT(needed, 0) << "listxattr size failed errno=" << errno << " (" << strerror(errno) + << ")"; + if (needed <= 0) { + return {}; + } + + if (needed > 1) { + std::vector small(static_cast(needed - 1)); + errno = 0; + EXPECT_EQ(-1, listxattr(path, small.data(), small.size())); + EXPECT_EQ(ERANGE, errno); + } + + std::vector list(static_cast(needed)); + errno = 0; + ssize_t n = listxattr(path, list.data(), list.size()); + EXPECT_EQ(needed, n) << "listxattr value failed errno=" << errno << " (" << strerror(errno) + << ")"; + if (n != needed) { + return {}; + } + + auto names = SplitXattrList(list, n); + EXPECT_TRUE(ContainsName(names, name)) << "xattr list does not contain " << name; + return names; +} + +} // namespace + +TEST(Ext4Xattr, CreateReplaceFlagsAndFailurePreserveValue) { + struct statfs st = {}; + ASSERT_EQ(0, statfs("/root", &st)) << "statfs(/root) failed: " << strerror(errno); + if (st.f_type != kExt4SuperMagic) { + GTEST_SKIP() << "/root is not ext4, f_type=0x" << std::hex << st.f_type; + } + + TempFile file; + ASSERT_TRUE(file.valid()) << "mkstemp failed: " << strerror(errno); + + constexpr const 
char* kName = "user.dragonos_ext4_flags"; + + errno = 0; + if (setxattr(file.path(), kName, "base", 4, 0) != 0) { + if (IsXattrUnsupported(errno)) { + GTEST_SKIP() << "xattr is not supported on ext4 path"; + } + FAIL() << "initial setxattr failed errno=" << errno << " (" << strerror(errno) << ")"; + } + ExpectValue(file.path(), kName, "base"); + + errno = 0; + EXPECT_EQ(-1, setxattr(file.path(), kName, "create", 6, XATTR_CREATE)); + EXPECT_EQ(EEXIST, errno); + ExpectValue(file.path(), kName, "base"); + + errno = 0; + ASSERT_EQ(0, setxattr(file.path(), kName, "replace", 7, XATTR_REPLACE)) + << "replace existing failed errno=" << errno << " (" << strerror(errno) << ")"; + ExpectValue(file.path(), kName, "replace"); + + constexpr const char* kMissing = "user.dragonos_ext4_missing"; + errno = 0; + EXPECT_EQ(-1, setxattr(file.path(), kMissing, "value", 5, XATTR_REPLACE)); + EXPECT_EQ(ENODATA, errno); + ExpectValue(file.path(), kName, "replace"); + + errno = 0; + ASSERT_EQ(0, setxattr(file.path(), kMissing, "created", 7, XATTR_CREATE)) + << "create missing failed errno=" << errno << " (" << strerror(errno) << ")"; + ExpectValue(file.path(), kMissing, "created"); + + auto names = ExpectListContains(file.path(), kName); + EXPECT_TRUE(ContainsName(names, kMissing)) << "xattr list does not contain " << kMissing; + + errno = 0; + ASSERT_EQ(0, removexattr(file.path(), kName)) + << "removexattr existing failed errno=" << errno << " (" << strerror(errno) << ")"; + + char buf[32] = {}; + errno = 0; + EXPECT_EQ(-1, getxattr(file.path(), kName, buf, sizeof(buf))); + EXPECT_EQ(ENODATA, errno); + ExpectValue(file.path(), kMissing, "created"); + + names = ExpectListContains(file.path(), kMissing); + EXPECT_FALSE(ContainsName(names, kName)) << "removed xattr is still listed"; + + errno = 0; + EXPECT_EQ(-1, removexattr(file.path(), kName)); + EXPECT_EQ(ENODATA, errno); +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff 
--git a/user/apps/tests/dunitest/suites/normal/mmap_truncate_cow.cc b/user/apps/tests/dunitest/suites/normal/mmap_truncate_cow.cc new file mode 100644 index 0000000000..e0d177fb7d --- /dev/null +++ b/user/apps/tests/dunitest/suites/normal/mmap_truncate_cow.cc @@ -0,0 +1,136 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace { + +size_t PageSize() { + const long ps = sysconf(_SC_PAGESIZE); + return ps > 0 ? static_cast(ps) : 4096; +} + +class TempFile { + public: + TempFile() { + char tmpl[] = "/tmp/dunitest_mmap_truncate_cow_XXXXXX"; + fd_ = mkstemp(tmpl); + if (fd_ >= 0) { + path_ = tmpl; + } + } + + ~TempFile() { + if (fd_ >= 0) { + close(fd_); + } + if (!path_.empty()) { + unlink(path_.c_str()); + } + } + + TempFile(const TempFile&) = delete; + TempFile& operator=(const TempFile&) = delete; + + bool valid() const { + return fd_ >= 0; + } + + int fd() const { + return fd_; + } + + private: + std::string path_; + int fd_ = -1; +}; + +void ExpectChildDiesBySignal(int signal, void (*fn)()) { + const pid_t child = fork(); + ASSERT_GE(child, 0) << "fork failed: errno=" << errno << " (" << strerror(errno) << ")"; + + if (child == 0) { + fn(); + _exit(0); + } + + int status = 0; + ASSERT_EQ(child, waitpid(child, &status, 0)) + << "waitpid failed: errno=" << errno << " (" << strerror(errno) << ")"; + ASSERT_TRUE(WIFSIGNALED(status)) << "child exited without signal, status=" << status; + EXPECT_EQ(signal, WTERMSIG(status)) << "unexpected signal, status=" << status; +} + +volatile char* g_mapping = nullptr; + +void ReadMappedByte() { + const volatile char byte = g_mapping[0]; + (void)byte; +} + +} // namespace + +TEST(MmapTruncateCow, PrivateCowPageIsInvalidatedAfterTruncateToZero) { + const size_t ps = PageSize(); + TempFile file; + ASSERT_TRUE(file.valid()) << "mkstemp failed: errno=" << errno << " (" << strerror(errno) + << ")"; + ASSERT_EQ(0, ftruncate(file.fd(), static_cast(ps))) + << 
"ftruncate to page failed: errno=" << errno << " (" << strerror(errno) << ")"; + + void* mapping = mmap(nullptr, ps, PROT_READ | PROT_WRITE, MAP_PRIVATE, file.fd(), 0); + ASSERT_NE(MAP_FAILED, mapping) << "mmap failed: errno=" << errno << " (" << strerror(errno) + << ")"; + + memset(mapping, 'a', ps); + ASSERT_EQ(0, ftruncate(file.fd(), 0)) + << "ftruncate to zero failed: errno=" << errno << " (" << strerror(errno) << ")"; + + g_mapping = static_cast(mapping); + ExpectChildDiesBySignal(SIGBUS, ReadMappedByte); + + ASSERT_EQ(0, munmap(mapping, ps)) << "munmap failed: errno=" << errno << " (" + << strerror(errno) << ")"; + g_mapping = nullptr; +} + +TEST(MmapTruncateCow, PartialPageTruncateKeepsContainingPageAndInvalidatesFollowingCowPage) { + const size_t ps = PageSize(); + TempFile file; + ASSERT_TRUE(file.valid()) << "mkstemp failed: errno=" << errno << " (" << strerror(errno) + << ")"; + ASSERT_EQ(0, ftruncate(file.fd(), static_cast(ps * 2))) + << "ftruncate to two pages failed: errno=" << errno << " (" << strerror(errno) << ")"; + + void* mapping = mmap(nullptr, ps * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE, file.fd(), 0); + ASSERT_NE(MAP_FAILED, mapping) << "mmap failed: errno=" << errno << " (" << strerror(errno) + << ")"; + + memset(mapping, 'b', ps * 2); + ASSERT_EQ(0, ftruncate(file.fd(), static_cast(ps / 2))) + << "partial ftruncate failed: errno=" << errno << " (" << strerror(errno) << ")"; + + auto* bytes = static_cast(mapping); + EXPECT_EQ('b', bytes[0]); + + g_mapping = bytes + ps; + ExpectChildDiesBySignal(SIGBUS, ReadMappedByte); + + ASSERT_EQ(0, munmap(mapping, ps * 2)) << "munmap failed: errno=" << errno << " (" + << strerror(errno) << ")"; + g_mapping = nullptr; +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/user/apps/tests/dunitest/suites/normal/mount_reconfigure.cc b/user/apps/tests/dunitest/suites/normal/mount_reconfigure.cc index fd9ea289d7..3a27d240a2 100644 --- 
a/user/apps/tests/dunitest/suites/normal/mount_reconfigure.cc +++ b/user/apps/tests/dunitest/suites/normal/mount_reconfigure.cc @@ -862,6 +862,52 @@ TEST(MountReconfigure, StackedMountKeepsOriginalTarget) { rmdir(base); } +TEST(MountReconfigure, StackedMountRepeatedUnmountKeepsLowerIndex) { + const char *base = "/tmp/test_stacked_mount_repeated"; + const char *target = "/tmp/test_stacked_mount_repeated/target"; + char lower_marker[256]; + char upper_marker[256]; + + ensure_dir("/tmp"); + ensure_dir(base); + ensure_dir(target); + + if (unshare(CLONE_NEWNS) != 0) { + GTEST_SKIP() << strerror(errno); + } + + mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL); + + for (int i = 0; i < 16; ++i) { + snprintf(lower_marker, sizeof(lower_marker), "%s/lower_marker_%d", target, i); + snprintf(upper_marker, sizeof(upper_marker), "%s/upper_marker_%d", target, i); + + ASSERT_EQ(0, mount("", target, "ramfs", 0, NULL)) << strerror(errno); + ASSERT_EQ(0, write_file(lower_marker)) << strerror(errno); + + ASSERT_EQ(0, mount("", target, "ramfs", 0, NULL)) << strerror(errno); + ASSERT_EQ(0, write_file(upper_marker)) << strerror(errno); + + ASSERT_EQ(0, umount(target)) << "top umount failed at round " << i << ": " + << strerror(errno); + EXPECT_TRUE(path_exists(lower_marker)) << "lower mount lost at round " << i; + EXPECT_FALSE(path_exists(upper_marker)) << "upper mount remained visible at round " << i; + + ASSERT_EQ(0, unshare(CLONE_NEWNS)) << "copy_mnt_ns failed at round " << i << ": " + << strerror(errno); + EXPECT_TRUE(path_exists(lower_marker)) << "lower mount index lost after unshare at round " + << i; + EXPECT_FALSE(path_exists(upper_marker)) << "upper mount reappeared after unshare at round " + << i; + + ASSERT_EQ(0, umount(target)) << "lower umount failed at round " << i << ": " + << strerror(errno); + } + + rmdir(target); + rmdir(base); +} + int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git 
a/user/apps/tests/dunitest/suites/normal/mqueue_mount.cc b/user/apps/tests/dunitest/suites/normal/mqueue_mount.cc new file mode 100644 index 0000000000..77b4eff3cc --- /dev/null +++ b/user/apps/tests/dunitest/suites/normal/mqueue_mount.cc @@ -0,0 +1,47 @@ +#include + +#include +#include +#include +#include +#include + +namespace { + +int ensure_dir(const char* path) { + struct stat st = {}; + if (stat(path, &st) == 0) { + return S_ISDIR(st.st_mode) ? 0 : -1; + } + return mkdir(path, 0755); +} + +TEST(MqueueMount, MountCreatesMqueueFilesystem) { + char root[128] = {}; + snprintf(root, sizeof(root), "/tmp/mqueue_mount_%d", getpid()); + + ASSERT_EQ(0, ensure_dir("/tmp")); + ASSERT_EQ(0, mkdir(root, 0755)) << strerror(errno); + + if (mount("mqueue", root, "mqueue", 0, nullptr) != 0) { + int saved_errno = errno; + rmdir(root); + FAIL() << "mount mqueue failed: errno=" << saved_errno << " (" << strerror(saved_errno) + << ")"; + } + + struct stat st = {}; + ASSERT_EQ(0, stat(root, &st)) << strerror(errno); + EXPECT_TRUE(S_ISDIR(st.st_mode)); + EXPECT_EQ(static_cast(01777), st.st_mode & static_cast(07777)); + + ASSERT_EQ(0, umount(root)) << strerror(errno); + ASSERT_EQ(0, rmdir(root)) << strerror(errno); +} + +} // namespace + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/user/apps/tests/dunitest/suites/normal/overlayfs_semantics.cc b/user/apps/tests/dunitest/suites/normal/overlayfs_semantics.cc index 236bf6f046..215362d9d1 100644 --- a/user/apps/tests/dunitest/suites/normal/overlayfs_semantics.cc +++ b/user/apps/tests/dunitest/suites/normal/overlayfs_semantics.cc @@ -9,11 +9,29 @@ #include #include #include +#include #include +#include #include namespace { +#ifndef __NR_renameat2 +#define __NR_renameat2 316 +#endif + +#ifndef RENAME_NOREPLACE +#define RENAME_NOREPLACE (1U << 0) +#endif + +#ifndef RENAME_EXCHANGE +#define RENAME_EXCHANGE (1U << 1) +#endif + +#ifndef RENAME_WHITEOUT +#define 
RENAME_WHITEOUT (1U << 2) +#endif + int ensure_dir(const char* path) { struct stat st = {}; if (stat(path, &st) == 0) { @@ -22,6 +40,119 @@ int ensure_dir(const char* path) { return mkdir(path, 0755); } +std::string join_path(const std::string& dir, const char* name) { + return dir + "/" + name; +} + +bool path_exists(const std::string& path) { + struct stat st = {}; + return stat(path.c_str(), &st) == 0; +} + +bool is_whiteout(const std::string& path) { + struct stat st = {}; + if (lstat(path.c_str(), &st) != 0) { + return false; + } + return S_ISCHR(st.st_mode) && major(st.st_rdev) == 0 && minor(st.st_rdev) == 0; +} + +int write_text(const std::string& path, const char* text) { + int fd = open(path.c_str(), O_CREAT | O_WRONLY | O_TRUNC, 0644); + if (fd < 0) { + return -1; + } + size_t len = strlen(text); + ssize_t written = write(fd, text, len); + int saved_errno = errno; + close(fd); + errno = saved_errno; + return written == static_cast(len) ? 0 : -1; +} + +std::string read_text(const std::string& path) { + char buf[128] = {}; + int fd = open(path.c_str(), O_RDONLY); + if (fd < 0) { + return {}; + } + ssize_t n = read(fd, buf, sizeof(buf) - 1); + int saved_errno = errno; + close(fd); + errno = saved_errno; + if (n < 0) { + return {}; + } + return std::string(buf, static_cast(n)); +} + +long renameat2_call(const std::string& old_path, const std::string& new_path, unsigned flags) { + return syscall(__NR_renameat2, AT_FDCWD, old_path.c_str(), AT_FDCWD, new_path.c_str(), flags); +} + +void remove_recursive(const std::string& path) { + struct stat st = {}; + if (lstat(path.c_str(), &st) != 0) { + return; + } + if (!S_ISDIR(st.st_mode)) { + unlink(path.c_str()); + return; + } + + DIR* dir = opendir(path.c_str()); + if (dir != nullptr) { + while (dirent* ent = readdir(dir)) { + if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { + continue; + } + remove_recursive(join_path(path, ent->d_name)); + } + closedir(dir); + } + rmdir(path.c_str()); +} + 
+struct OverlayRenameEnv { + std::string root; + std::string upper; + std::string lower; + std::string work; + std::string merged; +}; + +OverlayRenameEnv make_overlay_env(const char* name) { + std::string root = std::string("/tmp/") + name + "_" + std::to_string(getpid()); + OverlayRenameEnv env = {}; + env.root = root; + env.upper = join_path(root, "u"); + env.lower = join_path(root, "l"); + env.work = join_path(root, "w"); + env.merged = join_path(root, "m"); + return env; +} + +void cleanup_overlay_env(const OverlayRenameEnv& env) { + umount(env.merged.c_str()); + remove_recursive(env.root); +} + +bool setup_overlay_env(const OverlayRenameEnv& env) { + if (ensure_dir("/tmp") != 0 || ensure_dir(env.root.c_str()) != 0 + || ensure_dir(env.upper.c_str()) != 0 || ensure_dir(env.lower.c_str()) != 0 + || ensure_dir(env.work.c_str()) != 0 || ensure_dir(env.merged.c_str()) != 0) { + cleanup_overlay_env(env); + return false; + } + std::string options = + "lowerdir=" + env.lower + ",upperdir=" + env.upper + ",workdir=" + env.work; + if (mount("overlay", env.merged.c_str(), "overlay", 0, options.c_str()) != 0) { + cleanup_overlay_env(env); + return false; + } + return true; +} + void remove_tree(const char* root) { char path[256] = {}; @@ -161,6 +292,97 @@ TEST(OverlayFsSemantics, CreateOverWhiteoutAfterLowerUnlink) { remove_tree(root); } +TEST(OverlayFsSemantics, MkdirOverWhiteoutAfterLowerUnlink) { + char root[128] = {}; + char upper[160] = {}; + char lower[160] = {}; + char work[160] = {}; + char merged[160] = {}; + char lower_x[192] = {}; + char merged_x[192] = {}; + char options[512] = {}; + + snprintf(root, sizeof(root), "/tmp/overlayfs_whiteout_mkdir_%d", getpid()); + snprintf(upper, sizeof(upper), "%s/u", root); + snprintf(lower, sizeof(lower), "%s/l", root); + snprintf(work, sizeof(work), "%s/w", root); + snprintf(merged, sizeof(merged), "%s/m", root); + snprintf(lower_x, sizeof(lower_x), "%s/x", lower); + snprintf(merged_x, sizeof(merged_x), "%s/x", merged); + + 
ASSERT_EQ(0, ensure_dir("/tmp")); + ASSERT_EQ(0, ensure_dir(root)); + ASSERT_EQ(0, ensure_dir(upper)); + ASSERT_EQ(0, ensure_dir(lower)); + ASSERT_EQ(0, ensure_dir(work)); + ASSERT_EQ(0, ensure_dir(merged)); + + FILE* lower_file = fopen(lower_x, "w"); + ASSERT_NE(nullptr, lower_file) << strerror(errno); + fclose(lower_file); + + snprintf(options, sizeof(options), "lowerdir=%s,upperdir=%s,workdir=%s", lower, upper, work); + if (mount("overlay", merged, "overlay", 0, options) != 0) { + remove_tree(root); + GTEST_SKIP() << strerror(errno); + } + + ASSERT_EQ(0, unlink(merged_x)) << strerror(errno); + ASSERT_EQ(0, mkdir(merged_x, 0755)) << strerror(errno); + + struct stat st = {}; + ASSERT_EQ(0, stat(merged_x, &st)) << strerror(errno); + EXPECT_TRUE(S_ISDIR(st.st_mode)); + + remove_tree(root); +} + +TEST(OverlayFsSemantics, MknodWhiteoutOnOverlayIsDenied) { + char root[128] = {}; + char upper[160] = {}; + char lower[160] = {}; + char work[160] = {}; + char merged[160] = {}; + char lower_x[192] = {}; + char merged_x[192] = {}; + char options[512] = {}; + + snprintf(root, sizeof(root), "/tmp/overlayfs_whiteout_mknod_%d", getpid()); + snprintf(upper, sizeof(upper), "%s/u", root); + snprintf(lower, sizeof(lower), "%s/l", root); + snprintf(work, sizeof(work), "%s/w", root); + snprintf(merged, sizeof(merged), "%s/m", root); + snprintf(lower_x, sizeof(lower_x), "%s/x", lower); + snprintf(merged_x, sizeof(merged_x), "%s/x", merged); + + ASSERT_EQ(0, ensure_dir("/tmp")); + ASSERT_EQ(0, ensure_dir(root)); + ASSERT_EQ(0, ensure_dir(upper)); + ASSERT_EQ(0, ensure_dir(lower)); + ASSERT_EQ(0, ensure_dir(work)); + ASSERT_EQ(0, ensure_dir(merged)); + + FILE* lower_file = fopen(lower_x, "w"); + ASSERT_NE(nullptr, lower_file) << strerror(errno); + fclose(lower_file); + + snprintf(options, sizeof(options), "lowerdir=%s,upperdir=%s,workdir=%s", lower, upper, work); + if (mount("overlay", merged, "overlay", 0, options) != 0) { + remove_tree(root); + GTEST_SKIP() << strerror(errno); + } + + 
ASSERT_EQ(0, unlink(merged_x)) << strerror(errno); + ASSERT_EQ(-1, mknod(merged_x, S_IFCHR | 0600, makedev(0, 0))); + EXPECT_EQ(EPERM, errno); + + struct stat st = {}; + ASSERT_EQ(-1, stat(merged_x, &st)); + EXPECT_EQ(ENOENT, errno); + + remove_tree(root); +} + TEST(OverlayFsSemantics, LowerWhiteoutHidesLowerLayers) { char root[128] = {}; char upper[160] = {}; @@ -332,6 +554,636 @@ TEST(OverlayFsSemantics, UnlinkLowerWhiteoutReturnsEnoent) { rmdir(root); } +TEST(OverlayFsSemantics, MkdirUnderLowerOnlyDirCopiesUpParent) { + char root[128] = {}; + char upper[160] = {}; + char lower[160] = {}; + char work[160] = {}; + char merged[160] = {}; + char lower_dev[192] = {}; + char lower_pts[224] = {}; + char upper_dev[192] = {}; + char upper_pts[224] = {}; + char merged_pts[224] = {}; + char options[512] = {}; + + snprintf(root, sizeof(root), "/tmp/overlayfs_mkdir_lower_%d", getpid()); + snprintf(upper, sizeof(upper), "%s/u", root); + snprintf(lower, sizeof(lower), "%s/l", root); + snprintf(work, sizeof(work), "%s/w", root); + snprintf(merged, sizeof(merged), "%s/m", root); + snprintf(lower_dev, sizeof(lower_dev), "%s/dev", lower); + snprintf(lower_pts, sizeof(lower_pts), "%s/pts", lower_dev); + snprintf(upper_dev, sizeof(upper_dev), "%s/dev", upper); + snprintf(upper_pts, sizeof(upper_pts), "%s/pts", upper_dev); + snprintf(merged_pts, sizeof(merged_pts), "%s/dev/pts", merged); + + ASSERT_EQ(0, ensure_dir("/tmp")); + ASSERT_EQ(0, ensure_dir(root)); + ASSERT_EQ(0, ensure_dir(upper)); + ASSERT_EQ(0, ensure_dir(lower)); + ASSERT_EQ(0, ensure_dir(work)); + ASSERT_EQ(0, ensure_dir(merged)); + ASSERT_EQ(0, mkdir(lower_dev, 0755)); + + snprintf(options, sizeof(options), "lowerdir=%s,upperdir=%s,workdir=%s", lower, upper, work); + if (mount("overlay", merged, "overlay", 0, options) != 0) { + rmdir(merged); + rmdir(work); + rmdir(lower_dev); + rmdir(lower); + rmdir(upper); + rmdir(root); + GTEST_SKIP() << strerror(errno); + } + + ASSERT_EQ(0, mkdir(merged_pts, 0755)) << 
strerror(errno); + + struct stat st = {}; + ASSERT_EQ(0, stat(merged_pts, &st)) << strerror(errno); + EXPECT_TRUE(S_ISDIR(st.st_mode)); + ASSERT_EQ(0, stat(upper_dev, &st)) << strerror(errno); + EXPECT_TRUE(S_ISDIR(st.st_mode)); + ASSERT_EQ(0, stat(upper_pts, &st)) << strerror(errno); + EXPECT_TRUE(S_ISDIR(st.st_mode)); + ASSERT_EQ(-1, stat(lower_pts, &st)); + EXPECT_EQ(ENOENT, errno); + + umount(merged); + rmdir(merged); + rmdir(upper_pts); + rmdir(upper_dev); + rmdir(work); + rmdir(lower_dev); + rmdir(lower); + rmdir(upper); + rmdir(root); +} + +TEST(OverlayFsSemantics, BindMountOnOverlayChildUsesNamespacePath) { + char root[128] = {}; + char upper[160] = {}; + char lower[160] = {}; + char work[160] = {}; + char merged[160] = {}; + char lower_tmp[192] = {}; + char source[160] = {}; + char source_file[192] = {}; + char merged_tmp[192] = {}; + char mounted_file[224] = {}; + char options[512] = {}; + + snprintf(root, sizeof(root), "/tmp/overlayfs_bind_child_%d", getpid()); + snprintf(upper, sizeof(upper), "%s/u", root); + snprintf(lower, sizeof(lower), "%s/l", root); + snprintf(work, sizeof(work), "%s/w", root); + snprintf(merged, sizeof(merged), "%s/m", root); + snprintf(lower_tmp, sizeof(lower_tmp), "%s/tmp", lower); + snprintf(source, sizeof(source), "%s/src", root); + snprintf(source_file, sizeof(source_file), "%s/token", source); + snprintf(merged_tmp, sizeof(merged_tmp), "%s/tmp", merged); + snprintf(mounted_file, sizeof(mounted_file), "%s/token", merged_tmp); + + ASSERT_EQ(0, ensure_dir("/tmp")); + ASSERT_EQ(0, ensure_dir(root)); + ASSERT_EQ(0, ensure_dir(upper)); + ASSERT_EQ(0, ensure_dir(lower)); + ASSERT_EQ(0, ensure_dir(work)); + ASSERT_EQ(0, ensure_dir(merged)); + ASSERT_EQ(0, ensure_dir(source)); + ASSERT_EQ(0, mkdir(lower_tmp, 0755)); + + FILE* fp = fopen(source_file, "w"); + ASSERT_NE(nullptr, fp) << strerror(errno); + ASSERT_EQ(5U, fwrite("token", 1, 5, fp)); + fclose(fp); + + snprintf(options, sizeof(options), "lowerdir=%s,upperdir=%s,workdir=%s", 
lower, upper, work); + if (mount("overlay", merged, "overlay", 0, options) != 0) { + unlink(source_file); + rmdir(source); + rmdir(merged); + rmdir(work); + rmdir(lower_tmp); + rmdir(lower); + rmdir(upper); + rmdir(root); + GTEST_SKIP() << strerror(errno); + } + + ASSERT_EQ(0, mount(source, merged_tmp, nullptr, MS_BIND | MS_REC, nullptr)) << strerror(errno); + + struct stat st = {}; + ASSERT_EQ(0, stat(mounted_file, &st)) << strerror(errno); + EXPECT_TRUE(S_ISREG(st.st_mode)); + + umount(merged_tmp); + umount(merged); + unlink(source_file); + rmdir(source); + rmdir(merged); + rmdir(work); + rmdir(lower_tmp); + rmdir(lower); + rmdir(upper); + rmdir(root); +} + +TEST(OverlayFsSemantics, OpenOverlayDirectoryWithoutFsOpenHook) { + char root[128] = {}; + char upper[160] = {}; + char lower[160] = {}; + char work[160] = {}; + char merged[160] = {}; + char lower_dir[192] = {}; + char merged_dir[192] = {}; + char options[512] = {}; + + snprintf(root, sizeof(root), "/tmp/overlayfs_open_dir_%d", getpid()); + snprintf(upper, sizeof(upper), "%s/u", root); + snprintf(lower, sizeof(lower), "%s/l", root); + snprintf(work, sizeof(work), "%s/w", root); + snprintf(merged, sizeof(merged), "%s/m", root); + snprintf(lower_dir, sizeof(lower_dir), "%s/dir", lower); + snprintf(merged_dir, sizeof(merged_dir), "%s/dir", merged); + + ASSERT_EQ(0, ensure_dir("/tmp")); + ASSERT_EQ(0, ensure_dir(root)); + ASSERT_EQ(0, ensure_dir(upper)); + ASSERT_EQ(0, ensure_dir(lower)); + ASSERT_EQ(0, ensure_dir(work)); + ASSERT_EQ(0, ensure_dir(merged)); + ASSERT_EQ(0, mkdir(lower_dir, 0755)); + + snprintf(options, sizeof(options), "lowerdir=%s,upperdir=%s,workdir=%s", lower, upper, work); + if (mount("overlay", merged, "overlay", 0, options) != 0) { + rmdir(merged); + rmdir(work); + rmdir(lower_dir); + rmdir(lower); + rmdir(upper); + rmdir(root); + GTEST_SKIP() << strerror(errno); + } + + int root_fd = open(merged, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + ASSERT_GE(root_fd, 0) << strerror(errno); + 
close(root_fd); + + int child_fd = open(merged_dir, O_RDONLY | O_DIRECTORY | O_CLOEXEC); + ASSERT_GE(child_fd, 0) << strerror(errno); + close(child_fd); + + umount(merged); + rmdir(merged); + rmdir(work); + rmdir(lower_dir); + rmdir(lower); + rmdir(upper); + rmdir(root); +} + +TEST(RenameAt2Semantics, RejectsUnknownWhiteoutAndInvalidFlagCombinations) { + std::string root = std::string("/tmp/renameat2_flags_") + std::to_string(getpid()); + ASSERT_EQ(0, ensure_dir("/tmp")); + ASSERT_EQ(0, ensure_dir(root.c_str())); + std::string old_path = join_path(root, "old"); + std::string new_path = join_path(root, "new"); + + ASSERT_EQ(0, write_text(old_path, "old")); + errno = 0; + EXPECT_EQ(-1, renameat2_call(old_path, new_path, 0x80000000U)); + EXPECT_EQ(EINVAL, errno); + EXPECT_EQ("old", read_text(old_path)); + EXPECT_FALSE(path_exists(new_path)); + + errno = 0; + EXPECT_EQ(-1, renameat2_call(old_path, new_path, 0x80000000U | RENAME_NOREPLACE)); + EXPECT_EQ(EINVAL, errno); + EXPECT_EQ("old", read_text(old_path)); + EXPECT_FALSE(path_exists(new_path)); + + errno = 0; + EXPECT_EQ(-1, renameat2_call(join_path(root, "missing"), new_path, 0x80000000U)); + EXPECT_EQ(EINVAL, errno); + EXPECT_EQ("old", read_text(old_path)); + EXPECT_FALSE(path_exists(new_path)); + + ASSERT_EQ(0, write_text(new_path, "new")); + errno = 0; + EXPECT_EQ(-1, renameat2_call(old_path, new_path, RENAME_EXCHANGE | RENAME_NOREPLACE)); + EXPECT_EQ(EINVAL, errno); + EXPECT_EQ("old", read_text(old_path)); + EXPECT_EQ("new", read_text(new_path)); + + errno = 0; + EXPECT_EQ(-1, renameat2_call(old_path, new_path, RENAME_EXCHANGE | RENAME_WHITEOUT)); + EXPECT_EQ(EINVAL, errno); + EXPECT_EQ("old", read_text(old_path)); + EXPECT_EQ("new", read_text(new_path)); + + errno = 0; + EXPECT_EQ(-1, renameat2_call(old_path, old_path, RENAME_NOREPLACE)); + EXPECT_EQ(EEXIST, errno); + EXPECT_EQ("old", read_text(old_path)); + + remove_recursive(root); +} + +TEST(RenameAt2Semantics, WhiteoutRenamesAndLeavesCharZeroZero) { + 
std::string root = std::string("/tmp/renameat2_whiteout_") + std::to_string(getpid()); + ASSERT_EQ(0, ensure_dir("/tmp")); + ASSERT_EQ(0, ensure_dir(root.c_str())); + if (mount("tmpfs", root.c_str(), "tmpfs", 0, "") != 0) { + remove_recursive(root); + GTEST_SKIP() << strerror(errno); + } + std::string old_path = join_path(root, "old"); + std::string new_path = join_path(root, "new"); + + ASSERT_EQ(0, write_text(old_path, "old")); + ASSERT_EQ(0, renameat2_call(old_path, new_path, RENAME_WHITEOUT)) << strerror(errno); + + EXPECT_TRUE(is_whiteout(old_path)); + EXPECT_EQ("old", read_text(new_path)); + umount(root.c_str()); + remove_recursive(root); +} + +TEST(RenameAt2Semantics, TmpfsExchangeDirAndFileUpdatesParentNlink) { + std::string root = std::string("/tmp/tmpfs_exchange_nlink_") + std::to_string(getpid()); + ASSERT_EQ(0, ensure_dir("/tmp")); + ASSERT_EQ(0, ensure_dir(root.c_str())); + if (mount("tmpfs", root.c_str(), "tmpfs", 0, "") != 0) { + remove_recursive(root); + GTEST_SKIP() << strerror(errno); + } + + std::string a = join_path(root, "a"); + std::string b = join_path(root, "b"); + std::string dir = join_path(a, "dir"); + std::string file = join_path(b, "file"); + ASSERT_EQ(0, mkdir(a.c_str(), 0755)); + ASSERT_EQ(0, mkdir(b.c_str(), 0755)); + ASSERT_EQ(0, mkdir(dir.c_str(), 0755)); + ASSERT_EQ(0, write_text(file, "file")); + + struct stat a_before = {}; + struct stat b_before = {}; + ASSERT_EQ(0, stat(a.c_str(), &a_before)) << strerror(errno); + ASSERT_EQ(0, stat(b.c_str(), &b_before)) << strerror(errno); + + ASSERT_EQ(0, renameat2_call(dir, file, RENAME_EXCHANGE)) << strerror(errno); + + struct stat a_after = {}; + struct stat b_after = {}; + ASSERT_EQ(0, stat(a.c_str(), &a_after)) << strerror(errno); + ASSERT_EQ(0, stat(b.c_str(), &b_after)) << strerror(errno); + EXPECT_EQ(a_before.st_nlink - 1, a_after.st_nlink); + EXPECT_EQ(b_before.st_nlink + 1, b_after.st_nlink); + EXPECT_EQ("file", read_text(join_path(a, "dir"))); + struct stat moved_dir = {}; + 
ASSERT_EQ(0, stat(join_path(b, "file").c_str(), &moved_dir)) << strerror(errno); + EXPECT_TRUE(S_ISDIR(moved_dir.st_mode)); + + umount(root.c_str()); + remove_recursive(root); +} + +TEST(RenameAt2Semantics, TmpfsSameDirDirReplaceUpdatesParentNlink) { + std::string root = std::string("/tmp/tmpfs_dir_replace_nlink_") + std::to_string(getpid()); + ASSERT_EQ(0, ensure_dir("/tmp")); + ASSERT_EQ(0, ensure_dir(root.c_str())); + if (mount("tmpfs", root.c_str(), "tmpfs", 0, "") != 0) { + remove_recursive(root); + GTEST_SKIP() << strerror(errno); + } + + std::string old_dir = join_path(root, "old"); + std::string new_dir = join_path(root, "new"); + ASSERT_EQ(0, mkdir(old_dir.c_str(), 0755)); + ASSERT_EQ(0, mkdir(new_dir.c_str(), 0755)); + struct stat before = {}; + ASSERT_EQ(0, stat(root.c_str(), &before)) << strerror(errno); + + ASSERT_EQ(0, rename(old_dir.c_str(), new_dir.c_str())) << strerror(errno); + + struct stat after = {}; + ASSERT_EQ(0, stat(root.c_str(), &after)) << strerror(errno); + EXPECT_EQ(before.st_nlink - 1, after.st_nlink); + EXPECT_FALSE(path_exists(old_dir)); + struct stat moved = {}; + ASSERT_EQ(0, stat(new_dir.c_str(), &moved)) << strerror(errno); + EXPECT_TRUE(S_ISDIR(moved.st_mode)); + + umount(root.c_str()); + remove_recursive(root); +} + +TEST(RenameAt2Semantics, TmpfsExchangeAncestorDirectoryReturnsEinval) { + std::string root = std::string("/tmp/tmpfs_exchange_ancestor_") + std::to_string(getpid()); + ASSERT_EQ(0, ensure_dir("/tmp")); + ASSERT_EQ(0, ensure_dir(root.c_str())); + if (mount("tmpfs", root.c_str(), "tmpfs", 0, "") != 0) { + remove_recursive(root); + GTEST_SKIP() << strerror(errno); + } + + std::string a = join_path(root, "a"); + std::string b = join_path(a, "b"); + std::string child = join_path(b, "child"); + ASSERT_EQ(0, mkdir(a.c_str(), 0755)); + ASSERT_EQ(0, mkdir(b.c_str(), 0755)); + ASSERT_EQ(0, write_text(child, "child")); + + errno = 0; + EXPECT_EQ(-1, renameat2_call(b, a, RENAME_EXCHANGE)); + EXPECT_EQ(EINVAL, errno); + 
// A file that exists only in the upper layer is renamed through the merged
// view; afterwards the old name is gone and the new name carries the content
// in both the merged view and the upper layer.
TEST(OverlayFsSemantics, UpperOnlyRenameMovesEntry) {
    auto env = make_overlay_env("overlayfs_rename_upper");
    std::string src_in_upper = join_path(env.upper, "old");
    std::string dst_in_upper = join_path(env.upper, "new");
    std::string src_in_merged = join_path(env.merged, "old");
    std::string dst_in_merged = join_path(env.merged, "new");

    // Create the directory skeleton, parents first.
    const char* const layout[] = {
        "/tmp",
        env.root.c_str(),
        env.upper.c_str(),
        env.lower.c_str(),
        env.work.c_str(),
        env.merged.c_str(),
    };
    for (const char* dir : layout) {
        ASSERT_EQ(0, ensure_dir(dir));
    }

    ASSERT_EQ(0, write_text(src_in_upper, "upper-old"));
    ASSERT_TRUE(setup_overlay_env(env)) << strerror(errno);

    const int rename_rc = rename(src_in_merged.c_str(), dst_in_merged.c_str());
    ASSERT_EQ(0, rename_rc) << strerror(errno);

    EXPECT_FALSE(path_exists(src_in_merged));
    EXPECT_EQ("upper-old", read_text(dst_in_merged));
    EXPECT_FALSE(path_exists(src_in_upper));
    EXPECT_EQ("upper-old", read_text(dst_in_upper));
    cleanup_overlay_env(env);
}
// RENAME_EXCHANGE between an upper-layer file ("old") and a lower-only file
// ("new"): both names swap content in the merged view, the upper layer ends up
// holding both files, and the lower layer's copy is left unchanged.
TEST(OverlayFsSemantics, ExchangeCopiesUpLowerTarget) {
    auto env = make_overlay_env("overlayfs_exchange_lower_target");
    std::string old_in_upper = join_path(env.upper, "old");
    std::string new_in_upper = join_path(env.upper, "new");
    std::string new_in_lower = join_path(env.lower, "new");
    std::string old_in_merged = join_path(env.merged, "old");
    std::string new_in_merged = join_path(env.merged, "new");

    // Create the directory skeleton, parents first.
    const char* const layout[] = {
        "/tmp",
        env.root.c_str(),
        env.upper.c_str(),
        env.lower.c_str(),
        env.work.c_str(),
        env.merged.c_str(),
    };
    for (const char* dir : layout) {
        ASSERT_EQ(0, ensure_dir(dir));
    }

    ASSERT_EQ(0, write_text(old_in_upper, "upper-old"));
    ASSERT_EQ(0, write_text(new_in_lower, "lower-new"));
    ASSERT_TRUE(setup_overlay_env(env)) << strerror(errno);

    const int exchange_rc = renameat2_call(old_in_merged, new_in_merged, RENAME_EXCHANGE);
    ASSERT_EQ(0, exchange_rc) << strerror(errno);

    // Merged view: the two names have swapped content.
    EXPECT_EQ("lower-new", read_text(old_in_merged));
    EXPECT_EQ("upper-old", read_text(new_in_merged));
    // Upper layer: holds both files after the exchange.
    EXPECT_EQ("lower-new", read_text(old_in_upper));
    EXPECT_EQ("upper-old", read_text(new_in_upper));
    // Lower layer: original content is still there.
    EXPECT_EQ("lower-new", read_text(new_in_lower));
    cleanup_overlay_env(env);
}
// Renaming a lower-only regular file onto an existing upper-layer directory
// must fail with EISDIR, and the failed attempt must not leave a copied-up
// "old" entry in the upper layer.
TEST(OverlayFsSemantics, LowerFileRenameToDirectoryFailsWithoutCopyUp) {
    auto env = make_overlay_env("overlayfs_lower_file_to_dir_no_copyup");
    std::string old_in_lower = join_path(env.lower, "old");
    std::string old_in_upper = join_path(env.upper, "old");
    std::string new_in_upper = join_path(env.upper, "new");
    std::string old_in_merged = join_path(env.merged, "old");
    std::string new_in_merged = join_path(env.merged, "new");

    // Create the directory skeleton, parents first.
    const char* const layout[] = {
        "/tmp",
        env.root.c_str(),
        env.upper.c_str(),
        env.lower.c_str(),
        env.work.c_str(),
        env.merged.c_str(),
    };
    for (const char* dir : layout) {
        ASSERT_EQ(0, ensure_dir(dir));
    }

    ASSERT_EQ(0, write_text(old_in_lower, "lower-old"));
    ASSERT_EQ(0, mkdir(new_in_upper.c_str(), 0755));
    ASSERT_TRUE(setup_overlay_env(env)) << strerror(errno);

    // Snapshot the result and errno together so the two checks below look at
    // the same call.
    errno = 0;
    const int rename_rc = rename(old_in_merged.c_str(), new_in_merged.c_str());
    const int rename_errno = errno;
    EXPECT_EQ(-1, rename_rc);
    EXPECT_EQ(EISDIR, rename_errno);

    EXPECT_EQ("lower-old", read_text(old_in_merged));
    EXPECT_TRUE(path_exists(new_in_merged));
    EXPECT_FALSE(path_exists(old_in_upper));
    cleanup_overlay_env(env);
}
// Renaming an upper-layer directory onto an empty lower-only directory is
// expected to succeed: the old name disappears and the new name exists in both
// the merged view and the upper layer.
TEST(OverlayFsSemantics, UpperDirRenameOverEmptyLowerDirSucceeds) {
    auto env = make_overlay_env("overlayfs_upper_dir_over_empty_lower_dir");
    std::string old_in_upper = join_path(env.upper, "old");
    std::string new_in_upper = join_path(env.upper, "new");
    std::string new_in_lower = join_path(env.lower, "new");
    std::string old_in_merged = join_path(env.merged, "old");
    std::string new_in_merged = join_path(env.merged, "new");

    // Create the directory skeleton, parents first.
    const char* const layout[] = {
        "/tmp",
        env.root.c_str(),
        env.upper.c_str(),
        env.lower.c_str(),
        env.work.c_str(),
        env.merged.c_str(),
    };
    for (const char* dir : layout) {
        ASSERT_EQ(0, ensure_dir(dir));
    }

    ASSERT_EQ(0, mkdir(old_in_upper.c_str(), 0755));
    ASSERT_EQ(0, mkdir(new_in_lower.c_str(), 0755));
    ASSERT_TRUE(setup_overlay_env(env)) << strerror(errno);

    const int rename_rc = rename(old_in_merged.c_str(), new_in_merged.c_str());
    ASSERT_EQ(0, rename_rc) << strerror(errno);

    EXPECT_FALSE(path_exists(old_in_merged));
    EXPECT_TRUE(path_exists(new_in_merged));
    EXPECT_FALSE(path_exists(old_in_upper));
    EXPECT_TRUE(path_exists(new_in_upper));
    cleanup_overlay_env(env);
}
// Sleeps for `millis` milliseconds. When nanosleep() is interrupted by a
// signal (EINTR) the sleep is resumed with the remaining time that nanosleep()
// wrote back; any other failure ends the sleep early.
void SleepForMillis(long millis) {
    timespec pending {};
    pending.tv_sec = millis / 1000;
    pending.tv_nsec = (millis % 1000) * 1000 * 1000;

    int rc = 0;
    do {
        // The same struct is passed as request and remainder so a retry only
        // waits for what is left.
        rc = nanosleep(&pending, &pending);
    } while (rc != 0 && errno == EINTR);
}
// Fills the pipe behind `write_fd` until a non-blocking write would block.
//
// The descriptor is switched to O_NONBLOCK for the duration of the fill and is
// left in blocking mode afterwards (O_NONBLOCK cleared), including when the
// fill itself fails.
//
// Returns 0 when the pipe is full, -1 on any fcntl()/write() failure; on a
// write failure errno still describes that write.
int FillPipeNonblock(int write_fd) {
    const int old_flags = fcntl(write_fd, F_GETFL);
    if (old_flags < 0) {
        return -1;
    }
    if (fcntl(write_fd, F_SETFL, old_flags | O_NONBLOCK) != 0) {
        return -1;
    }

    const std::vector<char> bytes(4096, 'x');
    int result = 0;
    for (;;) {
        const ssize_t n = write(write_fd, bytes.data(), bytes.size());
        if (n > 0) {
            continue;
        }
        if (n < 0 && errno == EINTR) {
            // A signal arrived mid-write; that is not a pipe error, retry.
            continue;
        }
        if (!(n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))) {
            // Zero-length write or a real error: report failure.
            result = -1;
        }
        break;
    }

    // Put the descriptor back into blocking mode on every path, but keep the
    // errno of a failed write visible to the caller.
    const int write_errno = errno;
    if (fcntl(write_fd, F_SETFL, old_flags & ~O_NONBLOCK) != 0) {
        return -1;
    }
    if (result != 0) {
        errno = write_errno;
    }
    return result;
}
// Two independent checks on fresh pipes:
//   1. a read end registered for EPOLLIN reports EPOLLHUP once the write end
//      is closed;
//   2. a write end registered for EPOLLOUT reports EPOLLERR once the read end
//      is closed.
TEST(PipeReleaseWakeup, EpollReportsHupAndErrAfterEndpointClose) {
    int fds[2] = {-1, -1};
    ASSERT_EQ(0, pipe(fds)) << strerror(errno);

    int epfd = epoll_create1(0);
    ASSERT_GE(epfd, 0) << strerror(errno);

    epoll_event ev {};
    ev.events = EPOLLIN;
    ev.data.fd = fds[0];
    ASSERT_EQ(0, epoll_ctl(epfd, EPOLL_CTL_ADD, fds[0], &ev)) << strerror(errno);

    ASSERT_EQ(0, close(fds[1])) << strerror(errno);
    epoll_event out {};
    ASSERT_EQ(1, epoll_wait(epfd, &out, 1, 1000)) << strerror(errno);
    // The cast needs an explicit target type to compile; `int` matches fds[].
    EXPECT_EQ(fds[0], static_cast<int>(out.data.fd));
    EXPECT_NE(0U, out.events & EPOLLHUP);

    close(fds[0]);
    close(epfd);

    // Second half: same procedure from the writer's point of view.
    ASSERT_EQ(0, pipe(fds)) << strerror(errno);
    epfd = epoll_create1(0);
    ASSERT_GE(epfd, 0) << strerror(errno);

    ev = {};
    ev.events = EPOLLOUT;
    ev.data.fd = fds[1];
    ASSERT_EQ(0, epoll_ctl(epfd, EPOLL_CTL_ADD, fds[1], &ev)) << strerror(errno);

    ASSERT_EQ(0, close(fds[0])) << strerror(errno);
    out = {};
    ASSERT_EQ(1, epoll_wait(epfd, &out, 1, 1000)) << strerror(errno);
    EXPECT_EQ(fds[1], static_cast<int>(out.data.fd));
    EXPECT_NE(0U, out.events & EPOLLERR);

    close(fds[1]);
    close(epfd);
}
SIGUSR1); + ResetSignalState(); + + int epfd = epoll_create1(0); + ASSERT_GE(epfd, 0) << strerror(errno); + epoll_event ev {}; + ev.events = EPOLLIN; + ev.data.fd = read_fd; + ASSERT_EQ(0, epoll_ctl(epfd, EPOLL_CTL_ADD, read_fd, &ev)) << strerror(errno); + + ASSERT_EQ(0, close(rdwr_fd)) << strerror(errno); + ASSERT_TRUE(WaitForSignal()) << "reader did not receive SIGIO after O_RDWR writer vanished"; + EXPECT_EQ(SIGUSR1, g_signal_number); + EXPECT_EQ(read_fd, g_signal_fd); + EXPECT_EQ(POLL_IN, g_signal_code); + EXPECT_EQ(static_cast(EPOLLIN | EPOLLRDNORM), g_signal_band); + + epoll_event out {}; + ASSERT_EQ(1, epoll_wait(epfd, &out, 1, 1000)) << strerror(errno); + EXPECT_EQ(read_fd, static_cast(out.data.fd)); + EXPECT_NE(0U, out.events & EPOLLHUP); + + close(epfd); + close(read_fd); + CleanupTempFifo(path); +} + +TEST(PipeReleaseWakeup, RdwrCloseNotifiesRemainingWriter) { + std::string path = MakeTempFifo(); + ASSERT_FALSE(path.empty()) << strerror(errno); + + int rdwr_fd = open(path.c_str(), O_RDWR | O_NONBLOCK); + ASSERT_GE(rdwr_fd, 0) << strerror(errno); + int write_fd = open(path.c_str(), O_WRONLY | O_NONBLOCK); + ASSERT_GE(write_fd, 0) << strerror(errno); + + InstallSignalHandler(SIGUSR2); + EnableFasyncSignal(write_fd, SIGUSR2); + ResetSignalState(); + + ASSERT_EQ(0, close(rdwr_fd)) << strerror(errno); + ASSERT_TRUE(WaitForSignal()) << "writer did not receive SIGIO after O_RDWR reader vanished"; + EXPECT_EQ(SIGUSR2, g_signal_number); + EXPECT_EQ(write_fd, g_signal_fd); + EXPECT_EQ(POLL_OUT, g_signal_code); + EXPECT_EQ(static_cast(EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND), g_signal_band); + + close(write_fd); + CleanupTempFifo(path); +} + +TEST(PipeReleaseWakeup, OPathCloseDoesNotNotifyPipeEndpoint) { + std::string path = MakeTempFifo(); + ASSERT_FALSE(path.empty()) << strerror(errno); + + int rdwr_fd = open(path.c_str(), O_RDWR | O_NONBLOCK); + ASSERT_GE(rdwr_fd, 0) << strerror(errno); + int read_fd = open(path.c_str(), O_RDONLY | O_NONBLOCK); + ASSERT_GE(read_fd, 
namespace {

// Sleeps for `millis` milliseconds, resuming with the remaining time whenever
// nanosleep() is interrupted by a signal. Any other nanosleep() failure ends
// the sleep early.
void SleepForMillis(long millis) {
    timespec remaining {};
    remaining.tv_sec = millis / 1000;
    remaining.tv_nsec = (millis % 1000) * 1000 * 1000;
    for (;;) {
        const bool slept = nanosleep(&remaining, &remaining) == 0;
        if (slept || errno != EINTR) {
            return;
        }
    }
}

// Polls `child` with WNOHANG up to `rounds` times, sleeping 10 ms after every
// unsuccessful poll. Returns true once the child has been reaped (its wait
// status is stored in `*status`), false when waitpid() fails with anything
// other than EINTR or when all rounds are used up.
bool WaitForChild(pid_t child, int* status, int rounds = 300) {
    for (int polls_left = rounds; polls_left > 0; --polls_left) {
        const pid_t reaped = waitpid(child, status, WNOHANG);
        if (reaped == child) {
            return true;
        }
        const bool hard_error = reaped < 0 && errno != EINTR;
        if (hard_error) {
            return false;
        }
        SleepForMillis(10);
    }
    return false;
}

}  // namespace
// Reads everything currently available from `fd` (the callers in this file
// put the descriptor into non-blocking mode first).
//
//   fd      - descriptor to drain.
//   revents - poll() result flags for `fd`; POLLHUP marks the stream finished.
//   eof     - set to true on end-of-file, on a read error other than
//             EAGAIN/EWOULDBLOCK, or when POLLHUP was reported. Never reset to
//             false here.
//   dst     - optional sink; bytes read are appended when non-null, discarded
//             otherwise.
void DrainReadyFd(int fd, short revents, bool* eof, std::vector<char>* dst) {
    char buf[512];
    for (;;) {
        const ssize_t n = read(fd, buf, sizeof(buf));
        if (n > 0) {
            if (dst != nullptr) {
                dst->insert(dst->end(), buf, buf + n);
            }
            continue;
        }
        if (n < 0 && errno == EINTR) {
            // Retry instead of leaving the loop: with POLLHUP set, stopping
            // here would flag EOF while unread data may still be queued.
            continue;
        }
        if (n == 0 || (errno != EAGAIN && errno != EWOULDBLOCK)) {
            *eof = true;
        }
        break;
    }

    if ((revents & POLLHUP) != 0) {
        *eof = true;
    }
}
waitpid(child, status, WNOHANG); + if (ret == child) { + return true; + } + if (ret < 0 && errno != EINTR) { + return false; + } + SleepForMillis(10); + } + return false; +} + +void* SiblingExecThread(void*) { + char arg0[] = "/proc/self/exe"; + char arg1[] = "--spawn-exec-pipe-race-exec-exit"; + char* const argv[] = {arg0, arg1, nullptr}; + char* const envp[] = {nullptr}; + execve("/proc/self/exe", argv, envp); + _exit(errno); +} + +void RunSiblingExecHelper() { + pthread_t thread; + if (pthread_create(&thread, nullptr, SiblingExecThread, nullptr) != 0) { + _exit(1); + } + + for (;;) { + pause(); + } +} + +void TriggerSiblingExecOnce() { + pid_t child = fork(); + ASSERT_GE(child, 0) << "fork sibling trigger failed: " << strerror(errno); + if (child == 0) { + char arg0[] = "/proc/self/exe"; + char* const argv[] = {arg0, const_cast(kSiblingExecMode), nullptr}; + char* const envp[] = {nullptr}; + execve("/proc/self/exe", argv, envp); + _exit(errno); + } + + int status = 0; + ASSERT_TRUE(WaitForChildExit(child, &status, 5000)) + << "sibling exec trigger did not exit"; + ASSERT_TRUE(WIFEXITED(status)) << "sibling trigger status=" << status; + ASSERT_EQ(0, WEXITSTATUS(status)) << "sibling trigger status=" << status; +} + +void SpawnGtestHelpWithCloexecPipeOnce(int iter) { + int err_pipe[2] = {-1, -1}; + int out_pipe[2] = {-1, -1}; + int stderr_pipe[2] = {-1, -1}; + + ASSERT_EQ(0, pipe(err_pipe)) << strerror(errno); + ASSERT_EQ(0, pipe(out_pipe)) << strerror(errno); + ASSERT_EQ(0, pipe(stderr_pipe)) << strerror(errno); + ASSERT_TRUE(SetCloseOnExec(err_pipe[1])) << strerror(errno); + ASSERT_TRUE(SetNonblock(err_pipe[0])) << strerror(errno); + ASSERT_TRUE(SetNonblock(out_pipe[0])) << strerror(errno); + ASSERT_TRUE(SetNonblock(stderr_pipe[0])) << strerror(errno); + + pid_t child = fork(); + ASSERT_GE(child, 0) << "fork failed at iter=" << iter << ": " << strerror(errno); + if (child == 0) { + close(err_pipe[0]); + close(out_pipe[0]); + close(stderr_pipe[0]); + + if 
(setpgid(0, 0) != 0) { + int saved = errno; + WriteErrnoAndExit(err_pipe[1], saved, 120); + } + + if (dup2(out_pipe[1], STDOUT_FILENO) < 0 || + dup2(stderr_pipe[1], STDERR_FILENO) < 0) { + int saved = errno; + WriteErrnoAndExit(err_pipe[1], saved, 121); + } + + close(out_pipe[1]); + close(stderr_pipe[1]); + + char arg0[] = "/proc/self/exe"; + char arg1[] = "--gtest_help"; + char* const argv[] = {arg0, arg1, nullptr}; + execve("/proc/self/exe", argv, environ); + + int saved = errno; + WriteErrnoAndExit(err_pipe[1], saved, 127); + } + + close(err_pipe[1]); + err_pipe[1] = -1; + close(out_pipe[1]); + out_pipe[1] = -1; + close(stderr_pipe[1]); + stderr_pipe[1] = -1; + + bool err_eof = false; + bool out_eof = false; + bool stderr_eof = false; + bool child_exited = false; + int status = 0; + std::vector err; + std::vector out; + std::vector child_stderr; + + for (int waited_ms = 0; waited_ms < 5000; waited_ms += 10) { + pollfd fds[3] = {}; + fds[0].fd = err_eof ? -1 : err_pipe[0]; + fds[0].events = err_eof ? 0 : (POLLIN | POLLHUP); + fds[1].fd = out_eof ? -1 : out_pipe[0]; + fds[1].events = out_eof ? 0 : (POLLIN | POLLHUP); + fds[2].fd = stderr_eof ? -1 : stderr_pipe[0]; + fds[2].events = stderr_eof ? 
0 : (POLLIN | POLLHUP); + poll(fds, 3, 10); + + if (!err_eof) { + DrainReadyFd(err_pipe[0], fds[0].revents, &err_eof, &err); + } + if (!out_eof) { + DrainReadyFd(out_pipe[0], fds[1].revents, &out_eof, &out); + } + if (!stderr_eof) { + DrainReadyFd(stderr_pipe[0], fds[2].revents, &stderr_eof, &child_stderr); + } + + if (!child_exited) { + pid_t ret = waitpid(child, &status, WNOHANG); + if (ret == child) { + child_exited = true; + } else if (ret < 0 && errno != EINTR) { + FAIL() << "waitpid failed at iter=" << iter << ": " << strerror(errno); + } + } + + if (err_eof && out_eof && stderr_eof && child_exited) { + break; + } + } + + CloseIfOpen(&err_pipe[0]); + CloseIfOpen(&out_pipe[0]); + CloseIfOpen(&stderr_pipe[0]); + + if (!(err_eof && out_eof && stderr_eof && child_exited)) { + KillProcessGroup(child); + waitpid(child, nullptr, 0); + } + + ASSERT_TRUE(err_eof) << "exec CLOEXEC error pipe did not reach EOF at iter=" << iter; + ASSERT_TRUE(err.empty()) << "pre-exec/exec error pipe has data at iter=" << iter; + ASSERT_TRUE(out_eof) << "stdout pipe did not reach EOF at iter=" << iter; + ASSERT_TRUE(stderr_eof) << "stderr pipe did not reach EOF at iter=" << iter; + ASSERT_TRUE(child_exited) << "child did not exit at iter=" << iter; + ASSERT_TRUE(WIFEXITED(status)) << "child status=" << status << " iter=" << iter; + ASSERT_EQ(0, WEXITSTATUS(status)) << "child status=" << status << " iter=" << iter; + ASSERT_FALSE(out.empty()) << "gtest help produced no stdout at iter=" << iter; +} + +} // namespace + +TEST(SpawnExecPipeRace, GtestHelpSpawnAfterSiblingExecStress) { + for (int i = 0; i < kIterations; ++i) { + if ((i % 16) == 0) { + ASSERT_NO_FATAL_FAILURE(TriggerSiblingExecOnce()); + } + ASSERT_NO_FATAL_FAILURE(SpawnGtestHelpWithCloexecPipeOnce(i)); + } +} + +int main(int argc, char** argv) { + if (argc >= 2 && strcmp(argv[1], kExecExitMode) == 0) { + return 0; + } + if (argc >= 2 && strcmp(argv[1], kSiblingExecMode) == 0) { + RunSiblingExecHelper(); + return 1; + } + + 
::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/user/apps/tests/dunitest/suites/normal/tty_pty_hangup.cc b/user/apps/tests/dunitest/suites/normal/tty_pty_hangup.cc new file mode 100644 index 0000000000..5b0cb2857a --- /dev/null +++ b/user/apps/tests/dunitest/suites/normal/tty_pty_hangup.cc @@ -0,0 +1,613 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +#ifndef TIOCPKT +constexpr int kTiocpkt = 0x5420; +#else +constexpr int kTiocpkt = TIOCPKT; +#endif + +#ifndef TIOCGPTN +constexpr int kTiocgptn = 0x80045430; +#else +constexpr int kTiocgptn = TIOCGPTN; +#endif + +#ifndef TIOCPKT_FLUSHWRITE +constexpr unsigned char kTiocpktFlushWrite = 2; +#else +constexpr unsigned char kTiocpktFlushWrite = TIOCPKT_FLUSHWRITE; +#endif + +class UniqueFd { +public: + UniqueFd() = default; + explicit UniqueFd(int fd) : fd_(fd) {} + UniqueFd(const UniqueFd&) = delete; + UniqueFd& operator=(const UniqueFd&) = delete; + + UniqueFd(UniqueFd&& other) noexcept : fd_(other.fd_) { other.fd_ = -1; } + + UniqueFd& operator=(UniqueFd&& other) noexcept { + if (this != &other) { + reset(); + fd_ = other.fd_; + other.fd_ = -1; + } + return *this; + } + + ~UniqueFd() { reset(); } + + int get() const { return fd_; } + + int release() { + int fd = fd_; + fd_ = -1; + return fd; + } + + void reset(int fd = -1) { + if (fd_ >= 0) { + close(fd_); + } + fd_ = fd; + } + +private: + int fd_ = -1; +}; + +struct PtyPair { + UniqueFd master; + UniqueFd slave; +}; + +PtyPair OpenRawPty(char* name = nullptr) { + int master = -1; + int slave = -1; + if (openpty(&master, &slave, name, nullptr, nullptr) < 0) { + ADD_FAILURE() << "openpty failed: errno=" << errno << " (" << strerror(errno) << ")"; + return {}; + } + + PtyPair pair{UniqueFd(master), UniqueFd(slave)}; + + struct termios term = {}; + if (tcgetattr(pair.slave.get(), &term) < 0) { + ADD_FAILURE() << "tcgetattr failed: 
errno=" << errno << " (" << strerror(errno) << ")"; + return pair; + } + + term.c_iflag = 0; + term.c_oflag = 0; + term.c_lflag = 0; + term.c_cflag |= CS8; + term.c_cc[VMIN] = 1; + term.c_cc[VTIME] = 0; + + if (tcsetattr(pair.slave.get(), TCSANOW, &term) < 0) { + ADD_FAILURE() << "tcsetattr failed: errno=" << errno << " (" << strerror(errno) << ")"; + } + + return pair; +} + +PtyPair OpenCanonicalNoEchoPty() { + int master = -1; + int slave = -1; + if (openpty(&master, &slave, nullptr, nullptr, nullptr) < 0) { + ADD_FAILURE() << "openpty failed: errno=" << errno << " (" << strerror(errno) << ")"; + return {}; + } + + PtyPair pair{UniqueFd(master), UniqueFd(slave)}; + + struct termios term = {}; + if (tcgetattr(pair.slave.get(), &term) < 0) { + ADD_FAILURE() << "tcgetattr failed: errno=" << errno << " (" << strerror(errno) << ")"; + return pair; + } + + term.c_lflag |= ICANON; + term.c_lflag &= ~ECHO; + term.c_cc[VMIN] = 1; + term.c_cc[VTIME] = 0; + + if (tcsetattr(pair.slave.get(), TCSANOW, &term) < 0) { + ADD_FAILURE() << "tcsetattr failed: errno=" << errno << " (" << strerror(errno) << ")"; + } + + return pair; +} + +void SetNonblock(int fd) { + int flags = fcntl(fd, F_GETFL); + ASSERT_GE(flags, 0) << "fcntl(F_GETFL) failed: errno=" << errno << " (" << strerror(errno) + << ")"; + ASSERT_EQ(0, fcntl(fd, F_SETFL, flags | O_NONBLOCK)) + << "fcntl(F_SETFL, O_NONBLOCK) failed: errno=" << errno << " (" << strerror(errno) + << ")"; +} + +short PollEvents(int fd) { + struct pollfd pfd = { + .fd = fd, + .events = POLLIN | POLLOUT | POLLERR | POLLHUP, + .revents = 0, + }; + int ret = poll(&pfd, 1, 0); + EXPECT_GE(ret, 0) << "poll failed: errno=" << errno << " (" << strerror(errno) << ")"; + return pfd.revents; +} + +void ExpectReadErrno(int fd, int expected_errno) { + char ch = 0; + errno = 0; + EXPECT_EQ(-1, read(fd, &ch, 1)); + EXPECT_EQ(expected_errno, errno) << "unexpected read errno=" << errno << " (" + << strerror(errno) << ")"; +} + +bool IsWouldBlock(int err) { + 
return err == EAGAIN +#if EWOULDBLOCK != EAGAIN + || err == EWOULDBLOCK +#endif + ; +} + +struct ConcurrentSlaveOpenArgs { + const char* slave_name; + int start_read_fd; + int opened_fd; + int open_errno; +}; + +void* ConcurrentSlaveOpen(void* raw) { + auto* args = static_cast(raw); + char token = 0; + ssize_t n = read(args->start_read_fd, &token, 1); + close(args->start_read_fd); + if (n != 1) { + args->opened_fd = -1; + args->open_errno = errno == 0 ? EIO : errno; + return nullptr; + } + + errno = 0; + args->opened_fd = open(args->slave_name, O_RDWR | O_NOCTTY); + args->open_errno = args->opened_fd >= 0 ? 0 : errno; + return nullptr; +} + +bool WaitForChild(pid_t child, int* status, int rounds = 300) { + for (int i = 0; i < rounds; ++i) { + pid_t ret = waitpid(child, status, WNOHANG); + if (ret == child) { + return true; + } + if (ret < 0 && errno != EINTR) { + return false; + } + usleep(10 * 1000); + } + return false; +} + +TEST(TtyPtyHangup, MasterReadAfterSlaveCloseReturnsEio) { + PtyPair pair = OpenRawPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + + pair.slave.reset(); + + ExpectReadErrno(pair.master.get(), EIO); +} + +TEST(TtyPtyHangup, CanonicalReaderDoesNotMissLineWakeup) { + PtyPair pair = OpenCanonicalNoEchoPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + + int ready_pipe[2] = {-1, -1}; + ASSERT_EQ(0, pipe(ready_pipe)) << strerror(errno); + + pid_t child = fork(); + ASSERT_GE(child, 0) << strerror(errno); + if (child == 0) { + close(ready_pipe[0]); + pair.master.reset(); + + for (int i = 0; i < 200; ++i) { + char ready = 'r'; + if (write(ready_pipe[1], &ready, 1) != 1) { + _exit(2); + } + + char buf[32] = {}; + ssize_t n = read(pair.slave.get(), buf, sizeof(buf)); + if (n != 5 || memcmp(buf, "line\n", 5) != 0) { + _exit(3); + } + } + _exit(0); + } + + close(ready_pipe[1]); + pair.slave.reset(); + + for (int i = 0; i < 200; ++i) { + char ready = 0; + ASSERT_EQ(1, read(ready_pipe[0], &ready, 1)) 
<< strerror(errno); + ASSERT_EQ('r', ready); + ASSERT_EQ(5, write(pair.master.get(), "line\n", 5)) << strerror(errno); + } + close(ready_pipe[0]); + + int status = 0; + if (!WaitForChild(child, &status)) { + kill(child, SIGKILL); + waitpid(child, nullptr, 0); + FAIL() << "canonical pty reader did not consume all lines"; + } + ASSERT_TRUE(WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + +TEST(TtyPtyHangup, MasterPollAfterSlaveCloseReportsHupAndOut) { + PtyPair pair = OpenRawPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + + pair.slave.reset(); + + short revents = PollEvents(pair.master.get()); + EXPECT_NE(0, revents & POLLHUP); + EXPECT_NE(0, revents & POLLOUT); +} + +TEST(TtyPtyHangup, MasterDrainsBufferedDataBeforeEio) { + PtyPair pair = OpenRawPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + + ASSERT_EQ(3, write(pair.slave.get(), "abc", 3)) + << "write slave failed: errno=" << errno << " (" << strerror(errno) << ")"; + pair.slave.reset(); + + char buf[4] = {}; + ASSERT_EQ(3, read(pair.master.get(), buf, 3)) + << "read buffered data failed: errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_STREQ("abc", buf); + ExpectReadErrno(pair.master.get(), EIO); +} + +TEST(TtyPtyHangup, MasterWriteAfterSlaveCloseSucceeds) { + PtyPair pair = OpenRawPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + + pair.slave.reset(); + + EXPECT_EQ(1, write(pair.master.get(), "x", 1)) + << "master write after slave close failed: errno=" << errno << " (" << strerror(errno) + << ")"; +} + +TEST(TtyPtyHangup, NonblockEmptyMasterReadStillEagainBeforeHangup) { + PtyPair pair = OpenRawPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + + SetNonblock(pair.master.get()); + + char ch = 0; + errno = 0; + EXPECT_EQ(-1, read(pair.master.get(), &ch, 1)); + EXPECT_TRUE(IsWouldBlock(errno)) << "empty nonblocking pty master read errno=" << errno << " (" + << strerror(errno) 
<< ")"; +} + +TEST(TtyPtyHangup, SlaveCanReopenWhileMasterAliveAfterLastSlaveClose) { + char slave_name[128] = {}; + PtyPair pair = OpenRawPty(slave_name); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + ASSERT_NE('\0', slave_name[0]); + + pair.slave.reset(); + ExpectReadErrno(pair.master.get(), EIO); + + UniqueFd reopened(open(slave_name, O_RDWR | O_NOCTTY)); + ASSERT_GE(reopened.get(), 0) << "reopen(" << slave_name << ") failed: errno=" << errno + << " (" << strerror(errno) << ")"; + + SetNonblock(pair.master.get()); + char ch = 0; + errno = 0; + EXPECT_EQ(-1, read(pair.master.get(), &ch, 1)); + EXPECT_TRUE(IsWouldBlock(errno)) << "master should stop reporting hangup after slave reopen," + << " errno=" << errno << " (" << strerror(errno) << ")"; + + ASSERT_EQ(1, write(reopened.get(), "r", 1)) + << "write reopened slave failed: errno=" << errno << " (" << strerror(errno) << ")"; + ASSERT_EQ(1, read(pair.master.get(), &ch, 1)) + << "master read from reopened slave failed: errno=" << errno << " (" << strerror(errno) + << ")"; + EXPECT_EQ('r', ch); +} + +TEST(TtyPtyHangup, ReopenedSlaveKeepsIndexReservedAfterMasterClose) { + char first_name[128] = {}; + PtyPair pair = OpenRawPty(first_name); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + ASSERT_NE('\0', first_name[0]); + + pair.slave.reset(); + + UniqueFd reopened(open(first_name, O_RDWR | O_NOCTTY)); + ASSERT_GE(reopened.get(), 0) << "reopen(" << first_name << ") failed: errno=" << errno + << " (" << strerror(errno) << ")"; + + pair.master.reset(); + + char second_name[128] = {}; + PtyPair second = OpenRawPty(second_name); + ASSERT_GE(second.master.get(), 0); + ASSERT_GE(second.slave.get(), 0); + ASSERT_NE('\0', second_name[0]); + EXPECT_NE(0, strcmp(first_name, second_name)) + << "pty index was reused while a reopened slave fd was still alive"; +} + +TEST(TtyPtyHangup, ConcurrentSlaveOpenAndMasterCloseNeverReusesLiveIndex) { + constexpr int kIterations = 64; + int 
reopened_success = 0; + int eio_failures = 0; + int enoent_failures = 0; + int other_failures = 0; + + for (int i = 0; i < kIterations; ++i) { + SCOPED_TRACE(i); + char first_name[128] = {}; + PtyPair pair = OpenRawPty(first_name); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + ASSERT_NE('\0', first_name[0]); + + pair.slave.reset(); + + int start_pipe[2] = {-1, -1}; + ASSERT_EQ(0, pipe(start_pipe)) << "pipe failed: errno=" << errno << " (" + << strerror(errno) << ")"; + UniqueFd write_end(start_pipe[1]); + + ConcurrentSlaveOpenArgs args = { + .slave_name = first_name, + .start_read_fd = start_pipe[0], + .opened_fd = -1, + .open_errno = 0, + }; + + pthread_t thread = {}; + ASSERT_EQ(0, pthread_create(&thread, nullptr, ConcurrentSlaveOpen, &args)) + << "pthread_create failed"; + + ASSERT_EQ(1, write(write_end.get(), "x", 1)) + << "failed to release slave opener: errno=" << errno << " (" << strerror(errno) + << ")"; + write_end.reset(); + pair.master.reset(); + + ASSERT_EQ(0, pthread_join(thread, nullptr)) << "pthread_join failed"; + + if (args.opened_fd >= 0) { + ++reopened_success; + UniqueFd reopened(args.opened_fd); + + char second_name[128] = {}; + PtyPair second = OpenRawPty(second_name); + ASSERT_GE(second.master.get(), 0); + ASSERT_GE(second.slave.get(), 0); + ASSERT_NE('\0', second_name[0]); + EXPECT_NE(0, strcmp(first_name, second_name)) + << "pty index was reused while concurrent reopened slave fd was still alive"; + continue; + } + + if (args.open_errno == EIO) { + ++eio_failures; + } else if (args.open_errno == ENOENT) { + ++enoent_failures; + } else { + ++other_failures; + ADD_FAILURE() << "unexpected concurrent slave open errno=" << args.open_errno << " (" + << strerror(args.open_errno) << ")"; + } + } + + EXPECT_EQ(0, other_failures) + << "success=" << reopened_success << " eio=" << eio_failures + << " enoent=" << enoent_failures; +} + +TEST(TtyPtyHangup, MasterThenSlaveCloseAllowsCleanIndexReuse) { + char first_name[128] = {}; 
+ PtyPair first = OpenRawPty(first_name); + ASSERT_GE(first.master.get(), 0); + ASSERT_GE(first.slave.get(), 0); + ASSERT_NE('\0', first_name[0]); + + first.master.reset(); + first.slave.reset(); + + char second_name[128] = {}; + PtyPair second = OpenRawPty(second_name); + ASSERT_GE(second.master.get(), 0); + ASSERT_GE(second.slave.get(), 0); + ASSERT_STREQ(first_name, second_name) + << "test expects the freed pty index to be immediately reusable"; + + ASSERT_EQ(1, write(second.slave.get(), "z", 1)) + << "write to reused slave failed: errno=" << errno << " (" << strerror(errno) << ")"; + char ch = 0; + ASSERT_EQ(1, read(second.master.get(), &ch, 1)) + << "read from reused master failed: errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_EQ('z', ch); +} + +TEST(TtyPtyHangup, MasterOnlyCloseReleasesDevptsIndex) { + constexpr uint32_t kPtyMax = 128; + constexpr int kIterations = 160; + bool seen[kPtyMax] = {}; + bool saw_reuse = false; + + for (int i = 0; i < kIterations; ++i) { + UniqueFd master(open("/dev/ptmx", O_RDWR | O_NOCTTY)); + ASSERT_GE(master.get(), 0) << "open(/dev/ptmx) failed at iteration " << i + << ": errno=" << errno << " (" << strerror(errno) << ")"; + + uint32_t index = UINT32_MAX; + ASSERT_EQ(0, ioctl(master.get(), kTiocgptn, &index)) + << "ioctl(TIOCGPTN) failed at iteration " << i << ": errno=" << errno << " (" + << strerror(errno) << ")"; + + ASSERT_LT(index, kPtyMax) << "unexpected PTY index " << index; + if (seen[index]) { + saw_reuse = true; + } + seen[index] = true; + } + + EXPECT_TRUE(saw_reuse) << "master-only open/close did not visibly reuse any devpts index"; +} + +TEST(TtyPtyHangup, ClosingOneOfMultipleSlaveFdsDoesNotHangupMaster) { + PtyPair pair = OpenRawPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + + UniqueFd second_slave(dup(pair.slave.get())); + ASSERT_GE(second_slave.get(), 0) << "dup(slave) failed: errno=" << errno << " (" + << strerror(errno) << ")"; + + pair.slave.reset(); + 
SetNonblock(pair.master.get()); + + short revents = PollEvents(pair.master.get()); + EXPECT_EQ(0, revents & POLLHUP); + + char ch = 0; + errno = 0; + EXPECT_EQ(-1, read(pair.master.get(), &ch, 1)); + EXPECT_TRUE(IsWouldBlock(errno)) << "closing one slave fd should not hang up master, errno=" + << errno << " (" << strerror(errno) << ")"; + + second_slave.reset(); + revents = PollEvents(pair.master.get()); + EXPECT_NE(0, revents & POLLHUP); + ExpectReadErrno(pair.master.get(), EIO); +} + +TEST(TtyPtyHangup, PacketStatusIsDeliveredBeforeHangupEio) { + PtyPair pair = OpenRawPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + + int on = 1; + ASSERT_EQ(0, ioctl(pair.master.get(), kTiocpkt, &on)) + << "ioctl(TIOCPKT) failed: errno=" << errno << " (" << strerror(errno) << ")"; + ASSERT_EQ(0, tcflush(pair.slave.get(), TCOFLUSH)) + << "tcflush(TCOFLUSH) failed: errno=" << errno << " (" << strerror(errno) << ")"; + + pair.slave.reset(); + + unsigned char status = 0; + ASSERT_EQ(1, read(pair.master.get(), &status, 1)) + << "read packet status failed: errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_EQ(kTiocpktFlushWrite, status & kTiocpktFlushWrite); + ExpectReadErrno(pair.master.get(), EIO); +} + +TEST(TtyPtyHangup, MasterCloseMakesSlaveObserveHangup) { + PtyPair pair = OpenRawPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + + pair.master.reset(); + + short revents = PollEvents(pair.slave.get()); + EXPECT_NE(0, revents & POLLHUP); + + char ch = 0; + errno = 0; + EXPECT_EQ(0, read(pair.slave.get(), &ch, 1)); + + errno = 0; + EXPECT_EQ(-1, write(pair.slave.get(), "x", 1)); + EXPECT_EQ(EIO, errno) << "slave write after master close errno=" << errno << " (" + << strerror(errno) << ")"; +} + +TEST(TtyPtyHangup, ChildExitDrainsSlaveOutputBeforeMasterEio) { + PtyPair pair = OpenRawPty(); + ASSERT_GE(pair.master.get(), 0); + ASSERT_GE(pair.slave.get(), 0); + + const char message[] = "short-output\n"; + pid_t child = 
fork(); + ASSERT_GE(child, 0) << "fork failed: errno=" << errno << " (" << strerror(errno) << ")"; + + if (child == 0) { + close(pair.master.release()); + ssize_t written = write(pair.slave.get(), message, sizeof(message) - 1); + pair.slave.reset(); + _exit(written == static_cast(sizeof(message) - 1) ? 0 : 1); + } + + pair.slave.reset(); + + char buf[sizeof(message)] = {}; + size_t total = 0; + while (total < sizeof(message) - 1) { + ssize_t n = read(pair.master.get(), buf + total, sizeof(message) - 1 - total); + if (n < 0 && errno == EINTR) { + continue; + } + ASSERT_GT(n, 0) << "master failed to drain child output: errno=" << errno << " (" + << strerror(errno) << ")"; + total += static_cast(n); + } + EXPECT_STREQ(message, buf); + + int status = 0; + ASSERT_EQ(child, waitpid(child, &status, 0)) + << "waitpid failed: errno=" << errno << " (" << strerror(errno) << ")"; + ASSERT_TRUE(WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + ExpectReadErrno(pair.master.get(), EIO); +} + +} // namespace + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/user/apps/tests/dunitest/suites/normal/virtiofs_smoke.cc b/user/apps/tests/dunitest/suites/normal/virtiofs_smoke.cc new file mode 100644 index 0000000000..cbabf9a2a0 --- /dev/null +++ b/user/apps/tests/dunitest/suites/normal/virtiofs_smoke.cc @@ -0,0 +1,304 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef EOPNOTSUPP +#define EOPNOTSUPP 95 +#endif + +namespace { + +int ensure_dir(const char* path) { + struct stat st = {}; + if (stat(path, &st) == 0) { + return S_ISDIR(st.st_mode) ? 
0 : -1; + } + return mkdir(path, 0755); +} + +void best_effort_umount(const char* path) { + if (umount(path) != 0 && errno != EINVAL && errno != ENOENT) { + ADD_FAILURE() << "umount(" << path << ") failed: errno=" << errno << " (" + << strerror(errno) << ")"; + } +} + +void best_effort_rmdir(const char* path) { + if (rmdir(path) != 0 && errno != ENOENT && errno != ENOTEMPTY) { + ADD_FAILURE() << "rmdir(" << path << ") failed: errno=" << errno << " (" + << strerror(errno) << ")"; + } +} + +void cleanup_tree(const char* root) { + char path[256] = {}; + snprintf(path, sizeof(path), "%s/local_busybox", root); + unlink(path); + snprintf(path, sizeof(path), "%s/merged", root); + best_effort_umount(path); + best_effort_rmdir(path); + snprintf(path, sizeof(path), "%s/mnt", root); + best_effort_umount(path); + best_effort_rmdir(path); + snprintf(path, sizeof(path), "%s/upper", root); + best_effort_rmdir(path); + snprintf(path, sizeof(path), "%s/work", root); + best_effort_rmdir(path); + best_effort_rmdir(root); +} + +void assert_file_contains_prefix(const char* path, const char* expected) { + int fd = open(path, O_RDONLY); + ASSERT_GE(fd, 0) << "open(" << path << ") failed: " << strerror(errno); + + char buf[128] = {}; + ssize_t n = read(fd, buf, sizeof(buf) - 1); + int saved_errno = errno; + close(fd); + + ASSERT_GT(n, 0) << "read(" << path << ") failed: " << strerror(saved_errno); + EXPECT_EQ(0, strncmp(buf, expected, strlen(expected))) + << "path=" << path << " content=" << buf; +} + +void assert_copy_file(const char* src, const char* dst) { + int in = open(src, O_RDONLY); + ASSERT_GE(in, 0) << "open(" << src << ") failed: " << strerror(errno); + int out = open(dst, O_CREAT | O_TRUNC | O_WRONLY, 0755); + ASSERT_GE(out, 0) << "open(" << dst << ") failed: " << strerror(errno); + + char buf[8192]; + for (;;) { + ssize_t n = read(in, buf, sizeof(buf)); + ASSERT_GE(n, 0) << "read(" << src << ") failed: " << strerror(errno); + if (n == 0) { + break; + } + ssize_t off = 0; + 
while (off < n) { + ssize_t written = write(out, buf + off, n - off); + ASSERT_GT(written, 0) << "write(" << dst << ") failed: " << strerror(errno); + off += written; + } + } + ASSERT_EQ(0, close(out)) << "close(" << dst << ") failed: " << strerror(errno); + ASSERT_EQ(0, close(in)) << "close(" << src << ") failed: " << strerror(errno); + ASSERT_EQ(0, chmod(dst, 0755)) << "chmod(" << dst << ") failed: " << strerror(errno); +} + +void assert_mmap_matches_pread(const char* path) { + pid_t child = fork(); + ASSERT_GE(child, 0) << "fork failed: " << strerror(errno); + + if (child == 0) { + int fd = open(path, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "open(%s) failed: %s\n", path, strerror(errno)); + _exit(101); + } + struct stat st = {}; + if (fstat(fd, &st) != 0 || st.st_size <= 0) { + fprintf(stderr, "fstat(%s) failed or empty: %s\n", path, strerror(errno)); + _exit(102); + } + void* map = mmap(nullptr, st.st_size, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0); + if (map == MAP_FAILED) { + fprintf(stderr, "mmap(%s) failed: %s\n", path, strerror(errno)); + _exit(103); + } + + char buf[4096]; + const unsigned char* mapped = static_cast(map); + for (off_t off = 0; off < st.st_size; off += sizeof(buf)) { + size_t want = st.st_size - off < (off_t)sizeof(buf) ? 
st.st_size - off : sizeof(buf); + ssize_t n = pread(fd, buf, want, off); + if (n != (ssize_t)want) { + fprintf(stderr, "pread(%s, off=%ld) got %zd want %zu errno=%d\n", path, + (long)off, n, want, errno); + _exit(104); + } + if (memcmp(mapped + off, buf, want) != 0) { + fprintf(stderr, "mmap/pread mismatch path=%s off=%ld len=%zu\n", path, (long)off, + want); + _exit(105); + } + } + munmap(map, st.st_size); + close(fd); + _exit(0); + } + + int status = 0; + ASSERT_EQ(child, waitpid(child, &status, 0)) << "waitpid failed: " << strerror(errno); + ASSERT_TRUE(WIFEXITED(status)) << "mmap compare child signaled, status=" << status; + EXPECT_EQ(0, WEXITSTATUS(status)) << "mmap compare failed for " << path + << " status=" << status; +} + +void assert_dir_has_entry(const char* path, const char* name) { + DIR* dir = opendir(path); + ASSERT_NE(nullptr, dir) << "opendir(" << path << ") failed: " << strerror(errno); + + bool found = false; + while (dirent* ent = readdir(dir)) { + if (strcmp(ent->d_name, name) == 0) { + found = true; + break; + } + } + closedir(dir); + EXPECT_TRUE(found) << path << " missing " << name; +} + +void assert_listxattr_reaches_filesystem(const char* path) { + errno = 0; + ssize_t n = listxattr(path, nullptr, 0); + if (n >= 0) { + return; + } + ASSERT_NE(ENOSYS, errno) << "listxattr syscall is not registered for " << path; + ASSERT_TRUE(errno == EOPNOTSUPP || errno == ENOTSUP || errno == ENODATA) + << "listxattr(" << path << ") failed unexpectedly: errno=" << errno << " (" + << strerror(errno) << ")"; +} + +void assert_exec_busybox(const char* busybox, const char* applet, int iteration) { + pid_t child = fork(); + ASSERT_GE(child, 0) << "fork failed: " << strerror(errno); + + if (child == 0) { + char* const argv[] = { + const_cast("busybox"), + const_cast(applet), + const_cast("-a"), + nullptr, + }; + execv(busybox, argv); + _exit(127); + } + + int status = 0; + for (int i = 0; i < 100; ++i) { + pid_t ret = waitpid(child, &status, WNOHANG); + if (ret 
== child) { + ASSERT_TRUE(WIFEXITED(status)) + << "child status=" << status << " busybox=" << busybox << " applet=" << applet + << " iteration=" << iteration; + EXPECT_EQ(0, WEXITSTATUS(status)) << "child status=" << status + << " busybox=" << busybox; + return; + } + ASSERT_EQ(0, ret) << "waitpid failed: " << strerror(errno); + usleep(100000); + } + + kill(child, SIGKILL); + waitpid(child, &status, 0); + FAIL() << "exec timed out for " << busybox << " " << applet << " iteration=" << iteration; +} + +void assert_repeated_busybox_exec(const char* busybox, const char* applet, int count) { + for (int i = 0; i < count; ++i) { + assert_exec_busybox(busybox, applet, i); + } +} + +void assert_directory_probe_loop(const char* path, int count, bool probe_xattr) { + for (int i = 0; i < count; ++i) { + assert_dir_has_entry(path, "busybox"); + if (probe_xattr) { + assert_listxattr_reaches_filesystem(path); + } + + char child[256] = {}; + snprintf(child, sizeof(child), "%s/hello.txt", path); + struct stat st = {}; + ASSERT_EQ(0, lstat(child, &st)) << "lstat(" << child << ") failed: " << strerror(errno); + ASSERT_TRUE(S_ISREG(st.st_mode)) << child << " is not a regular file"; + } +} + +bool should_skip_missing_virtiofs(int err) { + return err == ENODEV || err == ENOENT || err == EINVAL || err == EOPNOTSUPP || err == ENOSYS; +} + +} // namespace + +TEST(VirtioFsSmoke, MountReadExecAndOverlayLower) { + char root[128] = {}; + char mnt[160] = {}; + char upper[160] = {}; + char work[160] = {}; + char merged[160] = {}; + char path[256] = {}; + char options[512] = {}; + + snprintf(root, sizeof(root), "/tmp/virtiofs_smoke_%d", getpid()); + snprintf(mnt, sizeof(mnt), "%s/mnt", root); + snprintf(upper, sizeof(upper), "%s/upper", root); + snprintf(work, sizeof(work), "%s/work", root); + snprintf(merged, sizeof(merged), "%s/merged", root); + + ASSERT_EQ(0, ensure_dir("/tmp")) << strerror(errno); + ASSERT_EQ(0, ensure_dir(root)) << strerror(errno); + ASSERT_EQ(0, ensure_dir(mnt)) << 
strerror(errno); + ASSERT_EQ(0, ensure_dir(upper)) << strerror(errno); + ASSERT_EQ(0, ensure_dir(work)) << strerror(errno); + ASSERT_EQ(0, ensure_dir(merged)) << strerror(errno); + + assert_repeated_busybox_exec("/bin/busybox", "ls", 1); + assert_repeated_busybox_exec("/bin/busybox", "uname", 1); + + if (mount("hostshare", mnt, "virtiofs", 0, nullptr) != 0) { + int err = errno; + cleanup_tree(root); + if (should_skip_missing_virtiofs(err)) { + GTEST_SKIP() << "virtiofs hostshare is unavailable: errno=" << err << " (" + << strerror(err) << ")"; + } + FAIL() << "mount virtiofs failed: errno=" << err << " (" << strerror(err) << ")"; + } + + assert_directory_probe_loop(mnt, 3, true); + snprintf(path, sizeof(path), "%s/hello.txt", mnt); + assert_file_contains_prefix(path, "virtiofs-host-file"); + + snprintf(path, sizeof(path), "%s/busybox", mnt); + assert_mmap_matches_pread(path); + char local_copy[160] = {}; + snprintf(local_copy, sizeof(local_copy), "%s/local_busybox", root); + assert_copy_file(path, local_copy); + assert_repeated_busybox_exec(local_copy, "ls", 1); + assert_repeated_busybox_exec(local_copy, "uname", 1); + assert_repeated_busybox_exec(path, "ls", 2); + assert_repeated_busybox_exec(path, "uname", 3); + + snprintf(options, sizeof(options), "lowerdir=%s,upperdir=%s,workdir=%s", mnt, upper, work); + ASSERT_EQ(0, mount("overlay", merged, "overlay", 0, options)) + << "mount overlay failed: " << strerror(errno); + + assert_directory_probe_loop(merged, 3, false); + snprintf(path, sizeof(path), "%s/busybox", merged); + assert_mmap_matches_pread(path); + assert_repeated_busybox_exec(path, "ls", 2); + assert_repeated_busybox_exec(path, "uname", 3); + + cleanup_tree(root); +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/user/apps/tests/dunitest/whitelist.txt b/user/apps/tests/dunitest/whitelist.txt index 0be1a35b1a..ea801a0497 100644 --- a/user/apps/tests/dunitest/whitelist.txt +++ 
b/user/apps/tests/dunitest/whitelist.txt @@ -3,12 +3,14 @@ demo/gtest_demo normal/capability normal/fdatasync +normal/ext4_xattr normal/fallocate_semantics normal/syncfs_semantics normal/fcntl_lock normal/fcntl_signal normal/epoll_timeout_budget normal/devpts_dir_read +normal/debugfs_mount normal/test_pivot_root normal/mount_reconfigure normal/mount_propagation @@ -18,6 +20,7 @@ normal/test_fchown_o_path normal/proc_self_limits normal/proc_fd_devfs_readlink normal/mlock_semantics +normal/mmap_truncate_cow normal/sched_affinity normal/sync_file_range normal/splice_concurrent_io @@ -52,3 +55,10 @@ normal/devfs_ptmx_unlink normal/devtmpfs_semantics normal/mknod_socket normal/pmem_block +normal/pipe_release_wakeup +normal/pipe_waitqueue_wakeup +normal/spawn_exec_pipe_race +normal/tty_pty_hangup +normal/cubesandbox_pty_exec_chain +normal/eventfd_pending_signal +normal/virtiofs_smoke