diff --git a/kernel/src/arch/x86_64/mm/fault.rs b/kernel/src/arch/x86_64/mm/fault.rs index 449208c70e..7b43ebebea 100644 --- a/kernel/src/arch/x86_64/mm/fault.rs +++ b/kernel/src/arch/x86_64/mm/fault.rs @@ -12,7 +12,7 @@ use crate::{ }, exception::{extable::ExceptionTableManager, InterruptArch}, ipc::{ - signal::force_sig_fault_to_current, + signal::{force_kernel_signal_to_current, force_sig_fault_to_current}, signal_types::{BUS_ADRERR, SEGV_ACCERR, SEGV_MAPERR}, }, mm::{ @@ -516,7 +516,9 @@ impl X86_64MMArch { regs.rip, fault ); - // TODO: OOM 处理 + if let Err(err) = force_kernel_signal_to_current(Signal::SIGKILL) { + error!("failed to send SIGKILL for page fault OOM: {:?}", err); + } return; } else if fault.contains(VmFaultReason::VM_FAULT_SIGBUS) || fault.contains(VmFaultReason::VM_FAULT_HWPOISON) diff --git a/kernel/src/filesystem/ext4/filesystem.rs b/kernel/src/filesystem/ext4/filesystem.rs index 7762862c28..3a209671b6 100644 --- a/kernel/src/filesystem/ext4/filesystem.rs +++ b/kernel/src/filesystem/ext4/filesystem.rs @@ -11,7 +11,7 @@ use crate::{ VFS_MAX_FOLLOW_SYMLINK_TIMES, }, }, - libs::mutex::Mutex, + libs::{mutex::Mutex, rwsem::RwSem}, mm::{ fault::{PageFaultHandler, PageFaultMessage}, VmFaultReason, @@ -297,6 +297,7 @@ impl Ext4FileSystem { dirty_state: super::inode::InodeDirtyState::empty(), }), Mutex::new(()), + RwSem::new(()), ) }); diff --git a/kernel/src/filesystem/ext4/inode.rs b/kernel/src/filesystem/ext4/inode.rs index 6ab90309af..6a66ec0539 100644 --- a/kernel/src/filesystem/ext4/inode.rs +++ b/kernel/src/filesystem/ext4/inode.rs @@ -12,6 +12,7 @@ use crate::{ libs::{ casting::DowncastArc, mutex::{Mutex, MutexGuard}, + rwsem::RwSem, }, mm::{truncate::truncate_inode_pages, MemoryManagementArch}, time::PosixTimeSpec, @@ -81,7 +82,11 @@ pub struct Ext4Inode { } #[derive(Debug)] -pub struct LockedExt4Inode(pub(super) Mutex, pub(super) Mutex<()>); +pub struct LockedExt4Inode( + pub(super) Mutex, + pub(super) Mutex<()>, + pub(super) RwSem<()>, +); 
impl IndexNode for LockedExt4Inode { fn mmap(&self, _start: usize, _len: usize, _offset: usize) -> Result<(), SystemError> { @@ -208,6 +213,7 @@ impl IndexNode for LockedExt4Inode { if len == 0 { return Ok(0); } + let _size_guard = self.2.read(); let buf = &buf[0..len]; let (fs, inode_num, page_cache) = { @@ -553,31 +559,51 @@ impl IndexNode for LockedExt4Inode { } fn resize(&self, len: usize) -> Result<(), SystemError> { - let guard = self.0.lock(); - let ext4 = &guard.concret_fs().fs; - // 仅调整文件大小,其他属性保持不变 - ext4.setattr( - guard.inner_inode_num, - another_ext4::SetAttr { - mode: None, - uid: None, - gid: None, - size: Some(len as u64), - atime: None, - mtime: None, - ctime: None, - crtime: None, - }, - ) - .map_err(SystemError::from)?; - drop(guard); - // 更新缓存的文件大小 + let _size_guard = self.2.write(); + let (fs, inode_num, page_cache, cached_size) = { + let guard = self.0.lock(); + ( + guard.concret_fs(), + guard.inner_inode_num, + guard.page_cache.clone(), + guard.cached_file_size, + ) + }; + let old_size = match cached_size { + Some(size) => size, + None => fs.fs.getattr(inode_num)?.size, + }; { - let mut guard = self.0.lock(); - guard.cached_file_size = Some(len as u64); - guard - .dirty_state - .remove(InodeDirtyState::SIZE_DIRTY | InodeDirtyState::MTIME_DIRTY); + let _io_guard = self.1.lock(); + let ext4 = &fs.fs; + // 仅调整文件大小,其他属性保持不变 + ext4.setattr( + inode_num, + another_ext4::SetAttr { + mode: None, + uid: None, + gid: None, + size: Some(len as u64), + atime: None, + mtime: None, + ctime: None, + crtime: None, + }, + ) + .map_err(SystemError::from)?; + // 更新缓存的文件大小 + { + let mut guard = self.0.lock(); + guard.cached_file_size = Some(len as u64); + guard + .dirty_state + .remove(InodeDirtyState::SIZE_DIRTY | InodeDirtyState::MTIME_DIRTY); + } + } + if len < old_size as usize { + if let Some(page_cache) = page_cache { + page_cache.truncate(len)?; + } } Ok(()) } @@ -1004,6 +1030,7 @@ impl LockedExt4Inode { LockedExt4Inode( 
Mutex::new(Ext4Inode::new(inode_num, fs_ptr.clone(), dname, parent)), Mutex::new(()), + RwSem::new(()), ) }); let mut guard = inode.0.lock(); diff --git a/kernel/src/filesystem/fat/fs.rs b/kernel/src/filesystem/fat/fs.rs index 332f926376..0377d12268 100644 --- a/kernel/src/filesystem/fat/fs.rs +++ b/kernel/src/filesystem/fat/fs.rs @@ -1,6 +1,5 @@ use crate::arch::MMArch; use crate::filesystem::vfs::syscall::RenameFlags; -use crate::mm::truncate::truncate_inode_pages; use crate::mm::MemoryManagementArch; use alloc::string::ToString; use alloc::{ @@ -24,6 +23,7 @@ use crate::filesystem::vfs::utils::DName; use crate::filesystem::vfs::{Magic, SpecialNodeData, SuperBlock}; use crate::ipc::pipe::LockedPipeInode; use crate::libs::casting::DowncastArc; +use crate::libs::rwsem::RwSem; use crate::mm::fault::{PageFaultHandler, PageFaultMessage}; use crate::mm::VmFaultReason; use crate::{ @@ -126,7 +126,7 @@ pub struct FATFileSystem { /// FAT文件系统的Inode #[derive(Debug)] -pub struct LockedFATInode(Mutex); +pub struct LockedFATInode(Mutex, RwSem<()>); #[derive(Debug)] pub struct LockedFATFsInfo(Mutex); @@ -286,39 +286,42 @@ impl LockedFATInode { FileType::File }; - let inode: Arc = Arc::new(LockedFATInode(Mutex::new(FATInode { - parent, - self_ref: Weak::default(), - children: HashMap::new(), - negative_children: FATInode::negative_children_cache(), - fs: Arc::downgrade(&fs), - inode_type, - metadata: Metadata { - dev_id: 0, - inode_id: generate_inode_id(), - size: 0, - blk_size: fs.bpb.bytes_per_sector as usize, - blocks: if let FATType::FAT32(_) = fs.bpb.fat_type { - fs.bpb.total_sectors_32 as usize - } else { - fs.bpb.total_sectors_16 as usize + let inode: Arc = Arc::new(LockedFATInode( + Mutex::new(FATInode { + parent, + self_ref: Weak::default(), + children: HashMap::new(), + negative_children: FATInode::negative_children_cache(), + fs: Arc::downgrade(&fs), + inode_type, + metadata: Metadata { + dev_id: 0, + inode_id: generate_inode_id(), + size: 0, + blk_size: 
fs.bpb.bytes_per_sector as usize, + blocks: if let FATType::FAT32(_) = fs.bpb.fat_type { + fs.bpb.total_sectors_32 as usize + } else { + fs.bpb.total_sectors_16 as usize + }, + atime: PosixTimeSpec::default(), + mtime: PosixTimeSpec::default(), + ctime: PosixTimeSpec::default(), + btime: PosixTimeSpec::default(), + file_type, + mode: InodeMode::S_IRWXUGO, + flags: InodeFlags::empty(), + nlinks: if file_type == FileType::Dir { 2 } else { 1 }, + uid: 0, + gid: 0, + raw_dev: DeviceNumber::default(), }, - atime: PosixTimeSpec::default(), - mtime: PosixTimeSpec::default(), - ctime: PosixTimeSpec::default(), - btime: PosixTimeSpec::default(), - file_type, - mode: InodeMode::S_IRWXUGO, - flags: InodeFlags::empty(), - nlinks: if file_type == FileType::Dir { 2 } else { 1 }, - uid: 0, - gid: 0, - raw_dev: DeviceNumber::default(), - }, - special_node: None, - dname, - page_cache: None, - }))); + special_node: None, + dname, + page_cache: None, + }), + RwSem::new(()), + )); if !inode.0.lock().inode_type.is_dir() { let backend = Arc::new(AsyncPageCacheBackend::new( @@ -720,39 +723,42 @@ impl FATFileSystem { bpb.rsvd_sec_cnt as u64 + (bpb.num_fats as u64 * fat_size) + root_dir_sectors; // 创建文件系统的根节点 - let root_inode: Arc = Arc::new(LockedFATInode(Mutex::new(FATInode { - parent: Weak::default(), - self_ref: Weak::default(), - children: HashMap::new(), - negative_children: FATInode::negative_children_cache(), - fs: Weak::default(), - inode_type: FATDirEntry::UnInit, - metadata: Metadata { - dev_id: 0, - inode_id: generate_inode_id(), - size: 0, - blk_size: bpb.bytes_per_sector as usize, - blocks: if let FATType::FAT32(_) = bpb.fat_type { - bpb.total_sectors_32 as usize - } else { - bpb.total_sectors_16 as usize + let root_inode: Arc = Arc::new(LockedFATInode( + Mutex::new(FATInode { + parent: Weak::default(), + self_ref: Weak::default(), + children: HashMap::new(), + negative_children: FATInode::negative_children_cache(), + fs: Weak::default(), + inode_type: FATDirEntry::UnInit, + 
metadata: Metadata { + dev_id: 0, + inode_id: generate_inode_id(), + size: 0, + blk_size: bpb.bytes_per_sector as usize, + blocks: if let FATType::FAT32(_) = bpb.fat_type { + bpb.total_sectors_32 as usize + } else { + bpb.total_sectors_16 as usize + }, + atime: PosixTimeSpec::default(), + mtime: PosixTimeSpec::default(), + ctime: PosixTimeSpec::default(), + btime: PosixTimeSpec::default(), + file_type: FileType::Dir, + mode: InodeMode::S_IRWXUGO, + flags: InodeFlags::empty(), + nlinks: 2, + uid: 0, + gid: 0, + raw_dev: DeviceNumber::default(), }, - atime: PosixTimeSpec::default(), - mtime: PosixTimeSpec::default(), - ctime: PosixTimeSpec::default(), - btime: PosixTimeSpec::default(), - file_type: FileType::Dir, - mode: InodeMode::S_IRWXUGO, - flags: InodeFlags::empty(), - nlinks: 2, - uid: 0, - gid: 0, - raw_dev: DeviceNumber::default(), - }, - special_node: None, - dname: DName::default(), - page_cache: None, - }))); + special_node: None, + dname: DName::default(), + page_cache: None, + }), + RwSem::new(()), + )); let result: Arc = Arc::new(FATFileSystem { gendisk, @@ -1836,6 +1842,7 @@ impl LockedFATInode { } fn try_write_pagecache(&self, offset: usize, buf: &[u8]) -> Result { + let _size_guard = self.1.read(); let page_cache = self.0.lock().page_cache.clone(); if let Some(page_cache) = page_cache { let write_len = PageCache::write(&page_cache, offset, buf)?; @@ -2091,20 +2098,15 @@ impl IndexNode for LockedFATInode { Ok(()) } fn resize(&self, len: usize) -> Result<(), SystemError> { + let _size_guard = self.1.write(); //检查是否超过fat支持的最大容量 if (len as u64) > MAX_FILE_SIZE { return Err(SystemError::EFBIG); } - // 先调整页缓存:清除被截断区间的缓存页,再缩容缓存大小 - if let Some(page_cache) = self.page_cache() { - let start_page = (len + MMArch::PAGE_SIZE - 1) >> MMArch::PAGE_SHIFT; - truncate_inode_pages(page_cache.clone(), start_page); - page_cache.manager().resize(len)?; - } - let mut guard: MutexGuard = self.0.lock(); let fs: &Arc = &guard.fs.upgrade().unwrap(); let old_size = 
guard.metadata.size as usize; + let page_cache = guard.page_cache.clone(); match &mut guard.inode_type { FATDirEntry::File(file) | FATDirEntry::VolId(file) => { @@ -2127,7 +2129,26 @@ impl IndexNode for LockedFATInode { } } Ordering::Less => { - file.truncate(fs, len as u64)?; + guard.metadata.size = len as i64; + drop(guard); + if let Some(page_cache) = page_cache { + page_cache.manager().resize(len)?; + } + let mut guard: MutexGuard = self.0.lock(); + let fs: &Arc = &guard.fs.upgrade().unwrap(); + match &mut guard.inode_type { + FATDirEntry::File(file) | FATDirEntry::VolId(file) => { + file.truncate(fs, len as u64)?; + guard.synchronize_metadata(); + guard.metadata.size = len as i64; + return Ok(()); + } + FATDirEntry::Dir(_) => return Err(SystemError::ENOSYS), + FATDirEntry::UnInit => { + error!("FATFS: param: Inode_type uninitialized."); + return Err(SystemError::EROFS); + } + } } } // 同步元数据:从文件对象获取最新大小,并确保一致 diff --git a/kernel/src/filesystem/page_cache.rs b/kernel/src/filesystem/page_cache.rs index a0084c4635..f505e73eb7 100644 --- a/kernel/src/filesystem/page_cache.rs +++ b/kernel/src/filesystem/page_cache.rs @@ -275,6 +275,7 @@ pub struct PageCache { writeback_error: ErrSeq, unevictable: AtomicBool, is_shmem: AtomicBool, + reclassify_lock: Mutex<()>, manager: PageCacheManager, } @@ -283,53 +284,15 @@ pub struct InnerPageCache { #[allow(unused)] id: usize, pages: HashMap>, + page_indices: BTreeSet, dirty_pages: BTreeSet, page_cache_ref: Weak, } -#[derive(Debug, Clone, Copy)] -struct EvictPolicy { - allow_dirty: bool, - allow_mapped: bool, - allow_writeback: bool, -} - -impl EvictPolicy { - const fn clean_only() -> Self { - Self { - allow_dirty: false, - allow_mapped: false, - allow_writeback: false, - } - } - - fn can_evict(self, entry: &PageEntry) -> bool { - let state = entry.state(); - if matches!(state, PageState::Loading) { - return false; - } - if !self.allow_writeback && state == PageState::Writeback { - return false; - } - - let guard = 
entry.page.read(); - let dirty = guard.flags().contains(PageFlags::PG_DIRTY); - let mapped = guard.map_count() != 0; - drop(guard); - - if dirty && !self.allow_dirty { - return false; - } - if mapped && !self.allow_mapped { - return false; - } - true - } -} - /// 描述一次从页缓存到目标缓冲区的拷贝 pub struct CopyItem { entry: Arc, + _pin: PageEntryPin, page_index: usize, page_offset: usize, sub_len: usize, @@ -385,6 +348,8 @@ impl PageState { struct PageEntry { page: Arc, state: AtomicU8, + accounted_unevictable: AtomicBool, + active_users: AtomicUsize, wait_queue: WaitQueue, } @@ -463,6 +428,11 @@ impl PageCacheManager { self.upgrade()?.get_or_create_page_for_read(page_index) } + pub fn commit_page_pinned(&self, page_index: usize) -> Result { + self.upgrade()? + .get_or_create_page_for_read_pinned(page_index) + } + pub fn commit_page_with(&self, page_index: usize, fill: F) -> Result, SystemError> where F: FnOnce(usize, &mut [u8]) -> Result, @@ -474,6 +444,13 @@ impl PageCacheManager { self.upgrade()?.get_or_create_page_zero(page_index) } + pub fn commit_overwrite_pinned( + &self, + page_index: usize, + ) -> Result { + self.upgrade()?.get_or_create_page_zero_pinned(page_index) + } + pub fn prefetch_page(&self, page_index: usize) -> Result<(), SystemError> { self.upgrade()?.start_async_read(page_index) } @@ -502,6 +479,12 @@ impl PageCacheManager { .and_then(|cache| cache.get_ready_page(page_index)) } + pub fn peek_page_pinned(&self, page_index: usize) -> Option { + self.upgrade() + .ok() + .and_then(|cache| cache.get_ready_page_pinned(page_index)) + } + pub fn get_page_any(&self, page_index: usize) -> Option> { self.upgrade() .ok() @@ -676,7 +659,8 @@ impl PageCacheManager { } pub fn resize(&self, len: usize) -> Result<(), SystemError> { - self.upgrade()?.lock().resize(len) + let cache = self.upgrade()?; + cache.truncate(len) } pub fn writeback_range(&self, start_index: usize, end_index: usize) -> Result<(), SystemError> { @@ -706,15 +690,9 @@ impl PageCacheManager { let entries: 
Vec> = { let inner = cache.inner.lock(); inner - .pages - .iter() - .filter_map(|(idx, entry)| { - if *idx >= start_index && *idx <= end_index { - Some(entry.clone()) - } else { - None - } - }) + .page_indices + .range(start_index..=end_index) + .filter_map(|idx| inner.pages.get(idx).cloned()) .collect() }; @@ -757,7 +735,9 @@ impl PageCacheManager { PageState::UpToDate | PageState::Dirty => {} } - page.write().add_flags(PageFlags::PG_DIRTY); + { + page.write().add_flags(PageFlags::PG_DIRTY); + } let mut inner = cache.inner.lock(); let Some(current) = inner.get_entry(page_index) else { @@ -810,8 +790,7 @@ impl PageCacheManager { ) -> Result { Ok(self .upgrade()? - .lock() - .invalidate_range(start_index, end_index)) + .evict_clean_pages_for_invalidate(Some((start_index, end_index)))) } pub fn discard_clean_range( @@ -820,56 +799,21 @@ impl PageCacheManager { end_index: usize, ) -> Result { let cache = self.upgrade()?; - let indices: Vec = { - let guard = cache.inner.lock(); - (start_index..=end_index) - .filter(|index| guard.get_entry(*index).is_some()) - .collect() - }; + if cache.is_shmem() { + return Ok(0); + } + let indices = cache.clean_evict_indices(Some((start_index, end_index))); let mut discarded = 0; for page_index in indices { - loop { - let entry = { - let guard = cache.inner.lock(); - guard.get_entry(page_index) - }; - let Some(entry) = entry else { - break; - }; - - match entry.state() { - PageState::Loading => { - let _ = entry.wait_ready(); - continue; - } - PageState::UpToDate | PageState::Error => {} - PageState::Dirty | PageState::Writeback => break, - } - - let removed = { - let mut guard = cache.inner.lock(); - let Some(current) = guard.get_entry(page_index) else { - break; - }; - if !Arc::ptr_eq(¤t, &entry) - || !matches!(current.state(), PageState::UpToDate | PageState::Error) - { - continue; - } - guard.remove_page(page_index) - }; - - if let Some(page) = removed { - let paddr = page.phys_address(); - let can_remove_from_manager = 
page.read().can_deallocate(); - let _ = page_reclaimer_lock().remove_page(&paddr); - if can_remove_from_manager { - page_manager_lock().remove_page(&paddr); - } - discarded += 1; + if let Some(page) = cache.remove_clean_page_candidate(page_index) { + let paddr = page.phys_address(); + let can_remove_from_manager = page.read().can_deallocate(); + let _ = page_reclaimer_lock().remove_page(&paddr); + if can_remove_from_manager { + page_manager_lock().remove_page(&paddr); } - break; + discarded += 1; } } @@ -877,22 +821,20 @@ impl PageCacheManager { } pub fn invalidate_all_clean(&self) -> Result { - Ok(self.upgrade()?.lock().evict_clean_pages()) + let cache = self.upgrade()?; + if cache.is_shmem() { + return Ok(0); + } + let dropped = cache.evict_clean_pages_for_invalidate(None); + Ok(dropped) } pub(crate) fn discard_clean_page(&self, page_index: usize) -> Result<(), SystemError> { let cache = self.upgrade()?; - let removed = { - let mut guard = cache.lock(); - let Some(entry) = guard.get_entry(page_index) else { - return Ok(()); - }; - if entry.state() != PageState::UpToDate { - return Ok(()); - } - guard.remove_page(page_index) - }; - if let Some(page) = removed { + if cache.is_shmem() { + return Ok(()); + } + if let Some(page) = cache.remove_clean_page_candidate(page_index) { cache.discard_unlinked_page(&page); } Ok(()) @@ -902,10 +844,68 @@ impl PageCacheManager { Ok(self.upgrade()?.lock().pages_count()) } + pub fn supports_clean_reclaim(&self) -> bool { + self.upgrade() + .map(|cache| !cache.is_shmem()) + .unwrap_or(false) + } + pub fn remove_page(&self, page_index: usize) -> Result>, SystemError> { Ok(self.upgrade()?.lock().remove_page(page_index)) } + pub fn remove_clean_page_for_reclaim( + &self, + page_index: usize, + expected_page: &Arc, + ) -> Result>, SystemError> { + let cache = self.upgrade()?; + if cache.is_shmem() { + return Ok(None); + } + let entry = match cache.lock().get_entry(page_index) { + Some(entry) => entry, + None => return Ok(None), + }; + 
if !Arc::ptr_eq(&entry.page, expected_page) + || cache.mapping_unevictable() + || entry.active_users() != 0 + { + return Ok(None); + } + let state = entry.state(); + if matches!( + state, + PageState::Loading | PageState::Writeback | PageState::Error + ) { + return Ok(None); + } + let page_reclaimable = { + let page_guard = entry.page.write(); + !page_guard.flags().intersects( + PageFlags::PG_DIRTY | PageFlags::PG_WRITEBACK | PageFlags::PG_UNEVICTABLE, + ) && page_guard.map_count() == 0 + }; + if !page_reclaimable { + return Ok(None); + } + + let mut guard = cache.lock(); + let Some(current) = guard.get_entry(page_index) else { + return Ok(None); + }; + if !Arc::ptr_eq(¤t, &entry) + || !Arc::ptr_eq(¤t.page, expected_page) + || cache.mapping_unevictable() + || current.active_users() != 0 + || current.state() != state + { + return Ok(None); + } + let removed = guard.remove_page(page_index); + Ok(removed) + } + pub fn writeback_page(&self, page_index: usize) -> Result<(), SystemError> { let cache = self.upgrade()?; let entry = match cache.inner.lock().get_entry(page_index) { @@ -1037,17 +1037,19 @@ impl PageCacheManager { { let mut guard = page.write(); guard.remove_flags(PageFlags::PG_ERROR); - if guard.flags().contains(PageFlags::PG_DIRTY) { - cache.account_state_transition(PageState::Writeback, PageState::Dirty); - entry.set_state(PageState::Dirty); - let mut inner = cache.inner.lock(); - inner.dirty_pages.insert(page_index); - } else { - cache.account_state_transition(PageState::Writeback, PageState::UpToDate); - entry.set_state(PageState::UpToDate); - let mut inner = cache.inner.lock(); - inner.dirty_pages.remove(&page_index); - } + } + + let page_dirty = page.read().flags().contains(PageFlags::PG_DIRTY); + if page_dirty { + cache.account_state_transition(PageState::Writeback, PageState::Dirty); + entry.set_state(PageState::Dirty); + let mut inner = cache.inner.lock(); + inner.dirty_pages.insert(page_index); + } else { + 
cache.account_state_transition(PageState::Writeback, PageState::UpToDate); + entry.set_state(PageState::UpToDate); + let mut inner = cache.inner.lock(); + inner.dirty_pages.remove(&page_index); } entry.wait_queue.wake_all(); Ok(()) @@ -1222,6 +1224,8 @@ impl PageEntry { Self { page, state: AtomicU8::new(state as u8), + accounted_unevictable: AtomicBool::new(false), + active_users: AtomicUsize::new(0), wait_queue: WaitQueue::default(), } } @@ -1234,6 +1238,39 @@ impl PageEntry { self.state.store(state as u8, Ordering::Release); } + fn account_unevictable_if_needed(&self) { + if !self.accounted_unevictable.swap(true, Ordering::AcqRel) { + pc_stats::inc_unevictable(); + } + } + + fn unaccount_unevictable_if_needed(&self) { + if self.accounted_unevictable.swap(false, Ordering::AcqRel) { + pc_stats::dec_unevictable(); + } + } + + fn active_users(&self) -> usize { + self.active_users.load(Ordering::Acquire) + } + + fn wait_inactive(&self) { + self.wait_queue.wait_until(|| { + if self.active_users() == 0 { + Some(()) + } else { + None + } + }); + } + + fn pin(self: &Arc) -> PageEntryPin { + self.active_users.fetch_add(1, Ordering::AcqRel); + PageEntryPin { + entry: self.clone(), + } + } + fn compare_exchange_state( &self, current: PageState, @@ -1270,11 +1307,48 @@ impl PageEntry { } } +struct PageEntryPin { + entry: Arc, +} + +impl core::fmt::Debug for PageEntryPin { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("PageEntryPin") + .field("paddr", &self.entry.page.phys_address()) + .finish() + } +} + +impl Drop for PageEntryPin { + fn drop(&mut self) { + if self.entry.active_users.fetch_sub(1, Ordering::AcqRel) == 1 { + self.entry.wait_queue.wake_all(); + } + } +} + +#[derive(Debug)] +pub struct PageCachePagePin { + page: Arc, + _pin: PageEntryPin, +} + +impl PageCachePagePin { + fn new(page: Arc, pin: PageEntryPin) -> Self { + Self { page, _pin: pin } + } + + pub fn page(&self) -> Arc { + self.page.clone() + } +} + impl 
InnerPageCache { pub fn new(page_cache_ref: Weak, id: usize) -> InnerPageCache { Self { id, pages: HashMap::new(), + page_indices: BTreeSet::new(), dirty_pages: BTreeSet::new(), page_cache_ref, } @@ -1286,9 +1360,10 @@ impl InnerPageCache { pub fn remove_page(&mut self, offset: usize) -> Option> { let entry = self.pages.remove(&offset)?; + self.page_indices.remove(&offset); self.dirty_pages.remove(&offset); if let Some(cache) = self.page_cache_ref.upgrade() { - cache.account_entry_remove(entry.state()); + cache.account_entry_remove(&entry); } Some(entry.page.clone()) } @@ -1298,10 +1373,15 @@ impl InnerPageCache { } fn insert_entry(&mut self, offset: usize, entry: Arc) { - self.pages.insert(offset, entry); if let Some(cache) = self.page_cache_ref.upgrade() { - cache.account_entry_insert(); + cache.account_entry_insert(&entry); + } + if let Some(old_entry) = self.pages.insert(offset, entry) { + if let Some(cache) = self.page_cache_ref.upgrade() { + cache.account_entry_remove(&old_entry); + } } + self.page_indices.insert(offset); } fn is_page_ready(&self, offset: usize) -> bool { @@ -1311,89 +1391,32 @@ impl InnerPageCache { .unwrap_or(false) } - pub fn resize(&mut self, len: usize) -> Result<(), SystemError> { - let page_num = page_align_up(len) / MMArch::PAGE_SIZE; - - let mut reclaimer = page_reclaimer_lock(); - for (i, entry) in self.pages.drain_filter(|index, entry| { - *index >= page_num && entry.state().is_ready() && entry.state() != PageState::Writeback - }) { - self.dirty_pages.remove(&i); - let _ = reclaimer.remove_page(&entry.page.phys_address()); - if let Some(cache) = self.page_cache_ref.upgrade() { - cache.account_entry_remove(entry.state()); - } - } - - if page_num > 0 { - let last_page_index = page_num - 1; - let last_len = len - last_page_index * MMArch::PAGE_SIZE; - if let Some(page) = self.get_page(last_page_index) { - unsafe { - page.write().truncate(last_len); - }; - } - // 对于新文件,最后一页不存在是正常的,不需要返回错误 - // 只有当文件需要截断到更小的尺寸时,才需要处理最后一页 - } - - Ok(()) 
- } - pub fn pages_count(&self) -> usize { return self.pages.len(); } - - fn evict_pages_inner(&mut self, range: Option<(usize, usize)>, policy: EvictPolicy) -> usize { - let mut evicted = 0; - let mut page_reclaimer = page_reclaimer_lock(); - let indices: Vec = match range { - Some((start, end)) => (start..=end).collect(), - None => self.pages.keys().cloned().collect(), - }; - - for idx in indices { - if let Some(entry) = self.pages.get(&idx) { - if !policy.can_evict(entry) { - continue; - } - if Arc::strong_count(&entry.page) > 3 { - continue; - } - if let Some(removed_page) = self.remove_page(idx) { - let paddr = removed_page.phys_address(); - page_manager_lock().remove_page(&paddr); - let _ = page_reclaimer.remove_page(&paddr); - evicted += 1; - } - } - } - - evicted - } - - /// 驱逐指定范围的干净页 - /// - /// 只驱逐干净的、无外部引用的页 - pub fn invalidate_range(&mut self, start_index: usize, end_index: usize) -> usize { - self.evict_pages_inner(Some((start_index, end_index)), EvictPolicy::clean_only()) - } - - fn evict_clean_pages(&mut self) -> usize { - self.evict_pages_inner(None, EvictPolicy::clean_only()) - } } impl Drop for InnerPageCache { fn drop(&mut self) { // log::debug!("page cache drop"); + let page_addrs = self + .pages + .values() + .map(|entry| entry.page.phys_address()) + .collect::>(); let mut page_manager = page_manager_lock(); for entry in self.pages.values() { if let Some(cache) = self.page_cache_ref.upgrade() { - cache.account_entry_remove(entry.state()); + cache.account_entry_remove(entry); } page_manager.remove_page(&entry.page.phys_address()); } + drop(page_manager); + + let mut reclaimer = page_reclaimer_lock(); + for paddr in page_addrs { + reclaimer.remove_page(&paddr); + } } } @@ -1429,6 +1452,7 @@ impl PageCache { writeback_error: ErrSeq::new(), unevictable: AtomicBool::new(false), is_shmem: AtomicBool::new(false), + reclassify_lock: Mutex::new(()), manager: PageCacheManager::new(weak.clone()), }); register_page_cache(&cache); @@ -1660,14 +1684,28 @@ 
impl PageCache { } pub fn truncate(&self, new_size: usize) -> Result<(), SystemError> { - let _invalidate = self.invalidate_write(); - self.truncate_locked(new_size) - } - - fn truncate_locked(&self, new_size: usize) -> Result<(), SystemError> { let hole_start_page = page_align_up(new_size) >> MMArch::PAGE_SHIFT; - self.unmap_mapping_pages_even_cow(hole_start_page, None)?; + loop { + // Keep the MM lock order out of invalidate_write: + // first tear down existing PTEs, then block new faults while removing cache pages. + self.unmap_mapping_pages_even_cow(hole_start_page, None)?; + let truncate_committed = { + let _invalidate = self.invalidate_write(); + self.truncate_locked(new_size)? + }; + + if truncate_committed { + // Match Linux truncate_pagecache(): private COW pages can appear after + // the first unmap and before cache truncation commits, so unmap again + // after releasing invalidate_write to preserve the global lock order. + self.unmap_mapping_pages_even_cow(hole_start_page, None)?; + return Ok(()); + } + } + } + + fn truncate_locked(&self, new_size: usize) -> Result { let first_full_truncate_page = page_align_up(new_size) >> MMArch::PAGE_SHIFT; let truncate_indices: Vec = { let guard = self.inner.lock(); @@ -1704,20 +1742,50 @@ impl PageCache { _ => {} } + if entry.active_users() != 0 { + entry.wait_inactive(); + continue; + } + + let mut retry_after_unmap = false; let removed_page = { - let mut guard = self.inner.lock(); - guard.remove_page(page_index) + let page_guard = entry.page.read(); + if page_guard.map_count() != 0 { + retry_after_unmap = true; + None + } else { + drop(page_guard); + + let mut guard = self.inner.lock(); + let Some(current) = guard.get_entry(page_index) else { + break; + }; + if !Arc::ptr_eq(¤t, &entry) { + continue; + } + if current.active_users() != 0 { + drop(guard); + current.wait_inactive(); + continue; + } + + let page_guard = current.page.read(); + if page_guard.map_count() != 0 { + retry_after_unmap = true; + None + } else 
{ + drop(page_guard); + guard.remove_page(page_index) + } + } }; + + if retry_after_unmap { + return Ok(false); + } + if let Some(page) = removed_page { - let paddr = page.phys_address(); - let can_remove_from_manager = page.read().can_deallocate(); - // The page is no longer reachable from this page cache, so it must not - // remain on the file-page reclaimer LRU even if existing mappings still - // keep its Page metadata alive via page_manager. - let _ = page_reclaimer_lock().remove_page(&paddr); - if can_remove_from_manager { - page_manager_lock().remove_page(&paddr); - } + self.discard_unlinked_page(&page); } break; } @@ -1750,9 +1818,7 @@ impl PageCache { } } - self.unmap_mapping_pages_even_cow(hole_start_page, None)?; - - Ok(()) + Ok(true) } pub fn mkclean_page( @@ -1826,13 +1892,86 @@ impl PageCache { } pub fn drop_clean_pages(&self) -> usize { - self.inner.lock().evict_clean_pages() + if self.is_shmem() { + return 0; + } + self.evict_clean_pages_for_invalidate(None) + } + + fn clean_evict_indices(&self, range: Option<(usize, usize)>) -> Vec { + let guard = self.inner.lock(); + match range { + Some((start, end)) => guard.page_indices.range(start..=end).copied().collect(), + None => guard.page_indices.iter().copied().collect(), + } + } + + fn remove_clean_page_candidate(&self, page_index: usize) -> Option> { + loop { + let entry = { + let guard = self.inner.lock(); + guard.get_entry(page_index) + }?; + + match entry.state() { + PageState::Loading => { + let _ = entry.wait_ready(); + continue; + } + PageState::UpToDate | PageState::Error => {} + PageState::Dirty | PageState::Writeback => return None, + } + + if self.mapping_unevictable() || entry.active_users() != 0 { + return None; + } + + let page_reclaimable = { + let page_guard = entry.page.read(); + !page_guard.flags().intersects( + PageFlags::PG_DIRTY | PageFlags::PG_WRITEBACK | PageFlags::PG_UNEVICTABLE, + ) && page_guard.map_count() == 0 + }; + if !page_reclaimable { + return None; + } + + let mut 
guard = self.inner.lock(); + let current = guard.get_entry(page_index)?; + if !Arc::ptr_eq(¤t, &entry) { + continue; + } + if self.mapping_unevictable() + || current.active_users() != 0 + || !matches!(current.state(), PageState::UpToDate | PageState::Error) + { + return None; + } + return guard.remove_page(page_index); + } + } + + fn evict_clean_pages_for_invalidate(&self, range: Option<(usize, usize)>) -> usize { + let mut evicted = 0; + for page_index in self.clean_evict_indices(range) { + if let Some(page) = self.remove_clean_page_candidate(page_index) { + let paddr = page.phys_address(); + page_manager_lock().remove_page(&paddr); + let _ = page_reclaimer_lock().remove_page(&paddr); + evicted += 1; + } + } + evicted } /// Mark this page cache as unevictable (or revert). When enabled, newly created /// pages will carry PG_UNEVICTABLE to keep the reclaimer from reclaiming them. - pub fn set_unevictable(&self, unevictable: bool) { - self.unevictable.store(unevictable, Ordering::Relaxed); + pub fn set_unevictable(&self, unevictable: bool) -> bool { + self.unevictable.swap(unevictable, Ordering::Relaxed) + } + + pub fn mapping_unevictable(&self) -> bool { + self.unevictable.load(Ordering::Relaxed) } pub fn set_shmem(&self, shmem: bool) { @@ -1844,38 +1983,158 @@ impl PageCache { } fn page_flags(&self) -> PageFlags { - if self.unevictable.load(Ordering::Relaxed) { + if self.mapping_unevictable() { PageFlags::PG_LRU | PageFlags::PG_UNEVICTABLE } else { PageFlags::PG_LRU } } - fn account_entry_insert(&self) { + pub fn reclassify_unevictable_pages(&self, old_mapping_unevictable: bool) { + const RECLASSIFY_BATCH: usize = 64; + + let _reclassify_guard = self.reclassify_lock.lock(); + let mapping_unevictable = self.mapping_unevictable(); + if old_mapping_unevictable == mapping_unevictable { + return; + } + + let mut next_index = 0usize; + loop { + let entries = { + let guard = self.inner.lock(); + guard + .page_indices + .range(next_index..) 
+ .take(RECLASSIFY_BATCH) + .filter_map(|index| { + guard.pages.get(index).cloned().map(|entry| (*index, entry)) + }) + .collect::>() + }; + if entries.is_empty() { + break; + } + let last_index = entries[entries.len() - 1].0; + if last_index == usize::MAX { + next_index = usize::MAX; + } else { + next_index = last_index + 1; + } + + for (index, entry) in entries { + let page = &entry.page; + if mapping_unevictable { + if !self.mapping_unevictable() { + return; + } + let guard = self.inner.lock(); + let Some(current) = guard.pages.get(&index) else { + continue; + }; + if !Arc::ptr_eq(current, &entry) { + continue; + } + if !self.mapping_unevictable() { + continue; + } + + let mut page_guard = page.write(); + let was_unevictable = page_guard.flags().contains(PageFlags::PG_UNEVICTABLE); + if !was_unevictable { + page_guard.add_flags(PageFlags::PG_UNEVICTABLE); + } + let paddr = page.phys_address(); + drop(page_guard); + entry.account_unevictable_if_needed(); + drop(guard); + if !was_unevictable { + let _ = page_reclaimer_lock().remove_page(&paddr); + } + } else { + let guard = self.inner.lock(); + let Some(current) = guard.pages.get(&index) else { + continue; + }; + if !Arc::ptr_eq(current, &entry) || self.mapping_unevictable() { + continue; + } + + let mut page_guard = page.write(); + let keep_unevictable = page_guard.has_unevictable_source(); + let was_unevictable = page_guard.flags().contains(PageFlags::PG_UNEVICTABLE); + entry.unaccount_unevictable_if_needed(); + if !keep_unevictable && was_unevictable { + page_guard.remove_flags(PageFlags::PG_UNEVICTABLE); + let paddr = page.phys_address(); + let should_reclaim = + !self.is_shmem() && page_guard.flags().contains(PageFlags::PG_LRU); + drop(page_guard); + drop(guard); + if should_reclaim { + page_reclaimer_lock().insert_page(paddr, page); + } + } + } + } + if next_index == usize::MAX { + break; + } + } + } + + fn account_entry_insert(&self, entry: &PageEntry) { pc_stats::inc_file_pages(); if self.is_shmem() { 
pc_stats::inc_shmem_pages(); } - if self.unevictable.load(Ordering::Relaxed) { - pc_stats::inc_unevictable(); + if self.mapping_unevictable() { + entry.account_unevictable_if_needed(); } } - fn account_entry_remove(&self, state: PageState) { + fn reconcile_entry_unevictable_for_insert(&self, entry: &PageEntry) { + let mapping_unevictable = self.mapping_unevictable(); + let paddr = entry.page.phys_address(); + if mapping_unevictable { + let mut page_guard = entry.page.write(); + let was_unevictable = page_guard.flags().contains(PageFlags::PG_UNEVICTABLE); + if !was_unevictable { + page_guard.add_flags(PageFlags::PG_UNEVICTABLE); + } + drop(page_guard); + if !was_unevictable { + let _ = page_reclaimer_lock().remove_page(&paddr); + } + return; + } + + entry.unaccount_unevictable_if_needed(); + let mut page_guard = entry.page.write(); + let was_unevictable = page_guard.flags().contains(PageFlags::PG_UNEVICTABLE); + if was_unevictable && !page_guard.has_unevictable_source() { + page_guard.remove_flags(PageFlags::PG_UNEVICTABLE); + let should_reclaim = page_guard.flags().contains(PageFlags::PG_LRU); + drop(page_guard); + if should_reclaim { + page_reclaimer_lock().insert_page(paddr, &entry.page); + } + } + } + + fn account_entry_remove(&self, entry: &PageEntry) { pc_stats::dec_file_pages(); if self.is_shmem() { pc_stats::dec_shmem_pages(); } - if self.unevictable.load(Ordering::Relaxed) { - pc_stats::dec_unevictable(); - } + entry.unaccount_unevictable_if_needed(); + let state = entry.state(); match state { PageState::Dirty => pc_stats::dec_file_dirty(), PageState::Writeback => pc_stats::dec_file_writeback(), _ => {} } } - fn account_state_transition(&self, old: PageState, new: PageState) { if old == new { return; @@ -1974,29 +2233,29 @@ impl PageCache { return Ok(entry); } - let mut page = Some(self.allocate_page( - page_cache_ref.expect("page_cache_ref should exist"), - page_index, - )?); - let (entry, need_populate) = { - let mut guard = self.inner.lock(); + let guard 
= self.inner.lock(); if let Some(entry) = guard.get_entry(page_index) { (entry, false) } else { - let entry = Arc::new(PageEntry::new( - page.take().expect("allocated page must exist"), - PageState::Loading, - )); - guard.insert_entry(page_index, entry.clone()); - (entry, true) + drop(guard); + let page = self.allocate_page( + page_cache_ref.expect("page_cache_ref should exist"), + page_index, + )?; + let mut guard = self.inner.lock(); + if let Some(entry) = guard.get_entry(page_index) { + self.discard_unlinked_page(&page); + (entry, false) + } else { + let entry = Arc::new(PageEntry::new(page, PageState::Loading)); + guard.insert_entry(page_index, entry.clone()); + (entry, true) + } } }; if !need_populate { - if let Some(page) = page.take() { - self.discard_unlinked_page(&page); - } let state = entry.state(); if state.is_ready() { return Ok(entry); @@ -2007,6 +2266,7 @@ impl PageCache { let _ = entry.wait_ready()?; return Ok(entry); } + self.reconcile_entry_unevictable_for_insert(&entry); let populate_result = if populate_backend { self.populate_page_from_backend(page_index, &entry.page) @@ -2029,15 +2289,30 @@ impl PageCache { } } + fn get_or_create_entry_pinned( + &self, + page_index: usize, + populate_backend: bool, + ) -> Result<(Arc<PageEntry>, PageEntryPin), SystemError> { + loop { + let entry = self.get_or_create_entry(page_index, populate_backend)?; + let guard = self.inner.lock(); + let Some(current) = guard.get_entry(page_index) else { + continue; + }; + if !Arc::ptr_eq(&current, &entry) || !entry.state().is_ready() { + continue; + } + let pin = entry.pin(); + return Ok((entry, pin)); + } + } + fn remove_failed_entry(&self, page_index: usize, entry: &Arc<PageEntry>) { let mut guard = self.inner.lock(); if let Some(current) = guard.get_entry(page_index) { if Arc::ptr_eq(&current, entry) { - guard.pages.remove(&page_index); - guard.dirty_pages.remove(&page_index); - if let Some(cache) = guard.page_cache_ref.upgrade() { - cache.account_entry_remove(entry.state()); - } +
guard.remove_page(page_index); } } self.discard_unlinked_page(&entry.page); @@ -2052,24 +2327,24 @@ impl PageCache { if entry.state() != PageState::Error { return; } - guard.dirty_pages.remove(&page_index); - let removed = guard.pages.remove(&page_index); - if let Some(entry) = removed.as_ref() { - if let Some(cache) = guard.page_cache_ref.upgrade() { - cache.account_entry_remove(entry.state()); - } - } - removed + guard.remove_page(page_index) }; - if let Some(entry) = removed { - self.discard_unlinked_page(&entry.page); + if let Some(page) = removed { + self.discard_unlinked_page(&page); } } fn discard_unlinked_page(&self, page: &Arc) { let paddr = page.phys_address(); - page_manager_lock().remove_page(&paddr); + let can_remove_from_manager = { + let mut page_guard = page.write(); + page_guard.clear_unlinked_file_mapping_unevictable(); + page_guard.can_deallocate() + }; + if can_remove_from_manager { + page_manager_lock().remove_page(&paddr); + } let _ = page_reclaimer_lock().remove_page(&paddr); } @@ -2096,12 +2371,16 @@ impl PageCache { return Ok(()); } - let page = self.allocate_page( - page_cache_ref.expect("page_cache_ref should exist"), - page_index, - )?; - let entry = { + let guard = self.inner.lock(); + if guard.get_entry(page_index).is_some() { + return Ok(()); + } + drop(guard); + let page = self.allocate_page( + page_cache_ref.expect("page_cache_ref should exist"), + page_index, + )?; let mut guard = self.inner.lock(); if guard.get_entry(page_index).is_some() { self.discard_unlinked_page(&page); @@ -2111,6 +2390,7 @@ impl PageCache { guard.insert_entry(page_index, entry.clone()); entry }; + self.reconcile_entry_unevictable_for_insert(&entry); let backend = self.backend(); let inode = self.inode(); @@ -2161,10 +2441,27 @@ impl PageCache { .map(|entry| entry.page.clone()) } + pub fn get_ready_page_pinned(&self, page_index: usize) -> Option { + let guard = self.inner.lock(); + let entry = guard.get_entry(page_index)?; + if !entry.state().is_ready() { + 
return None; + } + let pin = entry.pin(); + Some(PageCachePagePin::new(entry.page.clone(), pin)) + } + pub fn get_or_create_page_for_read(&self, page_index: usize) -> Result, SystemError> { Ok(self.get_or_create_entry(page_index, true)?.page.clone()) } + pub fn get_or_create_page_for_read_pinned( + &self, + page_index: usize, + ) -> Result { + self.get_or_create_page_pinned(page_index, true) + } + pub fn get_or_create_page_with( &self, page_index: usize, @@ -2196,29 +2493,29 @@ impl PageCache { return Ok(page); } - let mut page = Some(self.allocate_page( - page_cache_ref.expect("page_cache_ref should exist"), - page_index, - )?); - let (entry, need_populate) = { - let mut guard = self.inner.lock(); + let guard = self.inner.lock(); if let Some(entry) = guard.get_entry(page_index) { (entry, false) } else { - let entry = Arc::new(PageEntry::new( - page.take().expect("allocated page must exist"), - PageState::Loading, - )); - guard.insert_entry(page_index, entry.clone()); - (entry, true) + drop(guard); + let page = self.allocate_page( + page_cache_ref.expect("page_cache_ref should exist"), + page_index, + )?; + let mut guard = self.inner.lock(); + if let Some(entry) = guard.get_entry(page_index) { + self.discard_unlinked_page(&page); + (entry, false) + } else { + let entry = Arc::new(PageEntry::new(page, PageState::Loading)); + guard.insert_entry(page_index, entry.clone()); + (entry, true) + } } }; if !need_populate { - if let Some(page) = page.take() { - self.discard_unlinked_page(&page); - } let state = entry.state(); if state.is_ready() { return Ok(entry.page.clone()); @@ -2228,6 +2525,7 @@ impl PageCache { } return entry.wait_ready(); } + self.reconcile_entry_unevictable_for_insert(&entry); let populate_result = { let mut tmp = vec![0; MMArch::PAGE_SIZE]; @@ -2263,6 +2561,32 @@ impl PageCache { Ok(self.get_or_create_entry(page_index, false)?.page.clone()) } + pub fn get_or_create_page_zero_pinned( + &self, + page_index: usize, + ) -> Result { + 
self.get_or_create_page_pinned(page_index, false) + } + + fn get_or_create_page_pinned( + &self, + page_index: usize, + populate_backend: bool, + ) -> Result<PageCachePagePin, SystemError> { + loop { + let entry = self.get_or_create_entry(page_index, populate_backend)?; + let guard = self.inner.lock(); + let Some(current) = guard.get_entry(page_index) else { + continue; + }; + if !Arc::ptr_eq(&current, &entry) || !entry.state().is_ready() { + continue; + } + let pin = entry.pin(); + return Ok(PageCachePagePin::new(entry.page.clone(), pin)); + } + } + pub fn mark_page_dirty(&self, page_index: usize) { let mut guard = self.inner.lock(); if let Some(entry) = guard.get_entry(page_index) { @@ -2311,11 +2635,21 @@ impl PageCache { /// Insert a pre-allocated page into page cache and mark it ready. /// This is for special in-kernel users (e.g. perf ring buffers). pub fn insert_ready_page(&self, page_index: usize, page: Arc) -> Result<(), SystemError> { + let entry = Arc::new(PageEntry::new(page, PageState::UpToDate)); + let _reclassify_guard = self.reclassify_lock.lock(); + { + let guard = self.inner.lock(); + if guard.get_entry(page_index).is_some() { + return Err(SystemError::EEXIST); + } + } + self.reconcile_entry_unevictable_for_insert(&entry); let mut guard = self.inner.lock(); if guard.get_entry(page_index).is_some() { + drop(guard); + self.discard_unlinked_page(&entry.page); return Err(SystemError::EEXIST); } - let entry = Arc::new(PageEntry::new(page, PageState::UpToDate)); guard.insert_entry(page_index, entry); Ok(()) } @@ -2362,9 +2696,10 @@ impl PageCache { continue; } - let entry = self.get_or_create_entry(page_index, true)?; + let (entry, pin) = self.get_or_create_entry_pinned(page_index, true)?; copies.push(CopyItem { entry, + _pin: pin, page_index, page_offset: read_start - page_start, sub_len: page_read_len, @@ -2417,9 +2752,10 @@ impl PageCache { write_start == page_start && page_write_len == MMArch::PAGE_SIZE; let populate_backend = !self.is_shmem() && !full_page_overwrite;
self.discard_error_entry(page_index); - let entry = self.get_or_create_entry(page_index, populate_backend)?; + let (entry, pin) = self.get_or_create_entry_pinned(page_index, populate_backend)?; copies.push(CopyItem { entry, + _pin: pin, page_index, page_offset: write_start - page_start, sub_len: page_write_len, diff --git a/kernel/src/filesystem/tmpfs/mod.rs b/kernel/src/filesystem/tmpfs/mod.rs index 522f9eeeaa..1e871d294b 100644 --- a/kernel/src/filesystem/tmpfs/mod.rs +++ b/kernel/src/filesystem/tmpfs/mod.rs @@ -2,7 +2,7 @@ use core::any::Any; use core::intrinsics::unlikely; use core::sync::atomic::{AtomicU64, Ordering}; -use crate::filesystem::page_cache::{PageCache, PageCacheBackend}; +use crate::filesystem::page_cache::{PageCache, PageCacheBackend, PageCachePagePin}; use crate::filesystem::vfs::syscall::RenameFlags; use crate::filesystem::vfs::{FileSystemMakerData, FSMAKER}; use crate::libs::rwsem::RwSem; @@ -39,6 +39,7 @@ use super::vfs::{ use linkme::distributed_slice; use super::vfs::{Magic, MountableFileSystem, SuperBlock}; +use lazy_static::lazy_static; const TMPFS_MAX_NAMELEN: usize = 255; const TMPFS_BLOCK_SIZE: u64 = 4096; @@ -222,7 +223,7 @@ fn tmpfs_insert_whiteout(dir: &mut TmpfsInode, name: &DName) -> Result<(), Syste return Err(SystemError::EEXIST); } - let whiteout = Arc::new(LockedTmpfsInode(Mutex::new(TmpfsInode { + let whiteout = Arc::new(LockedTmpfsInode::new(TmpfsInode { parent: dir.self_ref.clone(), self_ref: Weak::default(), children: BTreeMap::new(), @@ -248,14 +249,20 @@ fn tmpfs_insert_whiteout(dir: &mut TmpfsInode, name: &DName) -> Result<(), Syste fs: dir.fs.clone(), special_node: None, name: name.clone(), - }))); + })); whiteout.0.lock().self_ref = Arc::downgrade(&whiteout); dir.children.insert(name.clone(), whiteout); Ok(()) } #[derive(Debug)] -pub struct LockedTmpfsInode(pub Mutex); +pub struct LockedTmpfsInode(pub Mutex, RwSem<()>); + +impl LockedTmpfsInode { + fn new(inode: TmpfsInode) -> Self { + Self(Mutex::new(inode), 
RwSem::new(())) + } +} #[derive(Debug)] pub struct Tmpfs { @@ -265,6 +272,41 @@ pub struct Tmpfs { current_size: AtomicU64, } +#[derive(Debug)] +pub struct TmpfsShmemFile { + inode: Arc, + fs: Arc, + inode_id: InodeId, + page_cache: Arc, + charged_size: usize, +} + +impl TmpfsShmemFile { + pub fn inode(&self) -> Arc { + self.inode.clone() + } + + pub fn inode_id(&self) -> InodeId { + self.inode_id + } + + pub fn page_cache(&self) -> Arc { + self.page_cache.clone() + } + + pub fn set_locked(&self, locked: bool) -> (Arc, bool) { + let page_cache = self.page_cache(); + let old_locked = page_cache.set_unevictable(locked); + (page_cache, old_locked) + } +} + +impl Drop for TmpfsShmemFile { + fn drop(&mut self) { + self.fs.decrease_size(self.charged_size); + } +} + #[derive(Debug)] pub struct TmpfsInode { parent: Weak, @@ -462,7 +504,10 @@ impl Tmpfs { let size_limit = mount_data .size_bytes .or_else(|| Some(Self::default_size_bytes() as u64)); + Self::new_with_size_limit(mount_data.mode, size_limit) + } + fn new_with_size_limit(mode: Option, size_limit: Option) -> Arc { let mut sb = SuperBlock::new( Magic::TMPFS_MAGIC, TMPFS_BLOCK_SIZE, @@ -476,7 +521,7 @@ impl Tmpfs { sb.bavail = blocks; } - let root: Arc = Arc::new(LockedTmpfsInode(Mutex::new(TmpfsInode::new()))); + let root: Arc = Arc::new(LockedTmpfsInode::new(TmpfsInode::new())); let result: Arc = Arc::new(Tmpfs { root_inode: root, @@ -489,12 +534,16 @@ impl Tmpfs { root_guard.parent = Arc::downgrade(&result.root_inode); root_guard.self_ref = Arc::downgrade(&result.root_inode); root_guard.fs = Arc::downgrade(&result); - root_guard.metadata.mode = mount_data.mode.unwrap_or(InodeMode::S_IRWXUGO); + root_guard.metadata.mode = mode.unwrap_or(InodeMode::S_IRWXUGO); drop(root_guard); result } + pub fn new_unlimited(mode: Option) -> Arc { + Self::new_with_size_limit(mode, None) + } + /// 原子地增加文件系统使用的大小 /// 返回Ok(())如果更新成功,Err(SystemError::ENOSPC)如果超过限制 /// 使用compare_exchange_weak循环确保并发安全 @@ -541,6 +590,89 @@ impl Tmpfs { 
self.update_superblock_free(new); } } + + fn create_unlinked_shmem_inode( + self: &Arc, + name: DName, + mode: InodeMode, + size: usize, + ) -> Result, SystemError> { + if size > i64::MAX as usize { + return Err(SystemError::EOVERFLOW); + } + let charged_size = size + .checked_add(MMArch::PAGE_SIZE - 1) + .ok_or(SystemError::EOVERFLOW)? + & !(MMArch::PAGE_SIZE - 1); + let charged_size_u64 = charged_size as u64; + let blocks_u64 = Self::bytes_to_blocks_ceil(size as u64); + if blocks_u64 > usize::MAX as u64 { + return Err(SystemError::EOVERFLOW); + } + self.increase_size(charged_size_u64)?; + + let inode_id = generate_inode_id(); + let result: Arc = Arc::new(LockedTmpfsInode::new(TmpfsInode { + parent: Weak::default(), + self_ref: Weak::default(), + children: BTreeMap::new(), + page_cache: None, + metadata: Metadata { + dev_id: 0, + inode_id, + size: size as i64, + blk_size: TMPFS_BLOCK_SIZE as usize, + blocks: blocks_u64 as usize, + atime: PosixTimeSpec::default(), + mtime: PosixTimeSpec::default(), + ctime: PosixTimeSpec::default(), + btime: PosixTimeSpec::default(), + file_type: FileType::File, + mode, + flags: InodeFlags::empty(), + nlinks: 0, + uid: 0, + gid: 0, + raw_dev: DeviceNumber::default(), + }, + fs: Arc::downgrade(self), + special_node: None, + name, + })); + + result.0.lock().self_ref = Arc::downgrade(&result); + let inode_dyn: Arc = result.clone(); + let backend = Arc::new(TmpfsPageCacheBackend::new(Arc::downgrade(&inode_dyn))); + let pc = PageCache::new(Some(Arc::downgrade(&inode_dyn)), Some(backend)); + pc.set_shmem(true); + result.0.lock().page_cache = Some(pc.clone()); + + Ok(Arc::new(TmpfsShmemFile { + inode: inode_dyn, + fs: self.clone(), + inode_id, + page_cache: pc, + charged_size, + })) + } +} + +lazy_static! 
{ + static ref SYSV_SHMEM_TMPFS: Arc = Tmpfs::new_unlimited(Some(InodeMode::S_IRWXUGO)); +} + +pub fn create_unlinked_shmem_file(size: usize) -> Result, SystemError> { + static NEXT_SYSV_SHMEM_NAME: AtomicU64 = AtomicU64::new(1); + let name = format!( + "SYSV{:08x}", + NEXT_SYSV_SHMEM_NAME.fetch_add(1, Ordering::Relaxed) + ); + let name = DName::from(name.as_str()); + SYSV_SHMEM_TMPFS.create_unlinked_shmem_inode( + name, + InodeMode::S_IRUSR | InodeMode::S_IWUSR, + size, + ) } impl MountableFileSystem for Tmpfs { @@ -734,13 +866,15 @@ impl IndexNode for LockedTmpfsInode { if len == 0 { return Ok(0); } + let _size_guard = self.1.read(); let inode = self.0.lock(); if inode.metadata.file_type == FileType::Dir { return Err(SystemError::EISDIR); } let page_cache = inode.page_cache.clone().ok_or(SystemError::EIO)?; let old_size = inode.metadata.size as usize; - let new_size = (offset + len).max(old_size); + let write_end = offset.checked_add(len).ok_or(SystemError::EFBIG)?; + let new_size = write_end.max(old_size); let size_diff = new_size.saturating_sub(old_size) as u64; // 获取文件系统引用 @@ -750,7 +884,7 @@ impl IndexNode for LockedTmpfsInode { .downcast_ref::() .ok_or(SystemError::EIO)?; - // 先预留空间,失败直接返回 + // 先预留空间,失败直接返回;后续 page-cache/拷贝失败时必须回滚本次预留。 if size_diff > 0 { tmpfs.increase_size(size_diff)?; } @@ -758,10 +892,10 @@ impl IndexNode for LockedTmpfsInode { drop(inode); let start_page_index = offset >> MMArch::PAGE_SHIFT; - let end_page_index = (offset + len - 1) >> MMArch::PAGE_SHIFT; + let end_page_index = (write_end - 1) >> MMArch::PAGE_SHIFT; // 两阶段写入:同样避免在持有 page_cache 锁时触碰用户缓冲区(SelfRead)。 struct WriteItem { - page: Arc, + pin: PageCachePagePin, page_index: usize, page_offset: usize, sub_len: usize, @@ -773,16 +907,24 @@ impl IndexNode for LockedTmpfsInode { let page_end = page_start + MMArch::PAGE_SIZE; let write_start = core::cmp::max(offset, page_start); - let write_end = core::cmp::min(offset + len, page_end); - let page_write_len = 
write_end.saturating_sub(write_start); + let page_write_end = core::cmp::min(write_end, page_end); + let page_write_len = page_write_end.saturating_sub(write_start); if page_write_len == 0 { continue; } - let page = page_cache.manager().commit_overwrite(page_index)?; + let pin = match page_cache.manager().commit_overwrite_pinned(page_index) { + Ok(pin) => pin, + Err(err) => { + if size_diff > 0 { + tmpfs.decrease_size(size_diff as usize); + } + return Err(err); + } + }; items.push(WriteItem { - page, + pin, page_index, page_offset: write_start - page_start, sub_len: page_write_len, @@ -799,19 +941,38 @@ impl IndexNode for LockedTmpfsInode { volatile_read!(buf[src_off]); volatile_read!(buf[src_off + it.sub_len - 1]); - let mut page_guard = it.page.write(); + let page = it.pin.page(); + let mut page_guard = page.write(); unsafe { page_guard.as_slice_mut()[it.page_offset..it.page_offset + it.sub_len] .copy_from_slice(&buf[src_off..src_off + it.sub_len]); } page_guard.add_flags(crate::mm::page::PageFlags::PG_DIRTY); - page_cache.manager().update_page(it.page_index)?; + if let Err(err) = page_cache.manager().update_page(it.page_index) { + if size_diff > 0 { + tmpfs.decrease_size(size_diff as usize); + } + return Err(err); + } src_off += it.sub_len; } - // 更新文件大小 + // 更新文件大小并按当前 inode size 结算本次预留,避免并发扩容写重复 charge。 let mut inode = self.0.lock(); - if new_size > old_size { + let committed_size = inode.metadata.size as usize; + let actual_growth = new_size.saturating_sub(committed_size); + if actual_growth > size_diff as usize { + let extra = actual_growth - size_diff as usize; + if let Err(err) = tmpfs.increase_size(extra as u64) { + if size_diff > 0 { + tmpfs.decrease_size(size_diff as usize); + } + return Err(err); + } + } else if (size_diff as usize) > actual_growth { + tmpfs.decrease_size(size_diff as usize - actual_growth); + } + if new_size > committed_size { inode.metadata.size = new_size as i64; } Ok(len) @@ -843,11 +1004,15 @@ impl IndexNode for LockedTmpfsInode { 
} fn resize(&self, len: usize) -> Result<(), SystemError> { - let mut inode = self.0.lock(); - if inode.metadata.file_type == FileType::File { + let _size_guard = self.1.write(); + let (old_size, new_size, page_cache, fs) = { + let mut inode = self.0.lock(); + if inode.metadata.file_type != FileType::File { + return Err(SystemError::EINVAL); + } + let old_size = inode.metadata.size as usize; let new_size = len; - let size_diff = new_size.saturating_sub(old_size) as i64; // 获取文件系统引用 let fs = inode.fs.upgrade().ok_or(SystemError::EIO)?; @@ -856,27 +1021,31 @@ impl IndexNode for LockedTmpfsInode { .downcast_ref::() .ok_or(SystemError::EIO)?; - // 如果扩大,原子地预留空间 - if size_diff > 0 { - tmpfs.increase_size(size_diff as u64)?; + let growth = new_size.saturating_sub(old_size); + if growth > 0 { + tmpfs.increase_size(growth as u64)?; } - // 调整页缓存(会释放多余页,并截断最后一页) - if let Some(pc) = inode.page_cache.clone() { - pc.manager().resize(len)?; - } + // Linux truncate_setsize() writes the new i_size before truncating page cache. + // Drop the inode lock before page-cache unmap/truncate so page faults do not + // form an inode-lock/MM-lock ABBA with the truncate path. 
+ inode.metadata.size = len as i64; + (old_size, new_size, inode.page_cache.clone(), fs) + }; - // 如果缩小,减少current_size - if size_diff < 0 { - tmpfs.decrease_size((-size_diff) as usize); + if new_size < old_size { + if let Some(pc) = page_cache { + pc.manager().resize(len)?; } - inode.metadata.size = len as i64; - - Ok(()) - } else { - Err(SystemError::EINVAL) + let tmpfs = fs + .as_any_ref() + .downcast_ref::() + .ok_or(SystemError::EIO)?; + tmpfs.decrease_size(old_size - new_size); } + + Ok(()) } fn fallocate_file( @@ -907,7 +1076,7 @@ impl IndexNode for LockedTmpfsInode { return Err(SystemError::EEXIST); } - let result: Arc = Arc::new(LockedTmpfsInode(Mutex::new(TmpfsInode { + let result: Arc = Arc::new(LockedTmpfsInode::new(TmpfsInode { parent: inode.self_ref.clone(), self_ref: Weak::default(), children: BTreeMap::new(), @@ -933,7 +1102,7 @@ impl IndexNode for LockedTmpfsInode { fs: inode.fs.clone(), special_node: None, name: name.clone(), - }))); + })); result.0.lock().self_ref = Arc::downgrade(&result); @@ -948,7 +1117,6 @@ impl IndexNode for LockedTmpfsInode { Some(Arc::downgrade(&result) as Weak), Some(backend), ); - pc.set_unevictable(true); pc.set_shmem(true); result.0.lock().page_cache = Some(pc); } @@ -1313,7 +1481,7 @@ impl IndexNode for LockedTmpfsInode { _ => return Err(SystemError::EINVAL), }; - let nod = Arc::new(LockedTmpfsInode(Mutex::new(TmpfsInode { + let nod = Arc::new(LockedTmpfsInode::new(TmpfsInode { parent: inode.self_ref.clone(), self_ref: Weak::default(), children: BTreeMap::new(), @@ -1339,7 +1507,7 @@ impl IndexNode for LockedTmpfsInode { fs: inode.fs.clone(), special_node: None, name: filename.clone(), - }))); + })); nod.0.lock().self_ref = Arc::downgrade(&nod); diff --git a/kernel/src/filesystem/vfs/vcore.rs b/kernel/src/filesystem/vfs/vcore.rs index 89a44ab9dc..8f2ef44e3c 100644 --- a/kernel/src/filesystem/vfs/vcore.rs +++ b/kernel/src/filesystem/vfs/vcore.rs @@ -768,12 +768,6 @@ where 
clear_suid_sgid_after_size_change(inode.as_ref())?; } - if result.is_ok() && len < old_size as usize { - if let Some(page_cache) = inode.page_cache() { - page_cache.truncate(len)?; - } - } - result } diff --git a/kernel/src/ipc/id.rs b/kernel/src/ipc/id.rs new file mode 100644 index 0000000000..b11d93b3bb --- /dev/null +++ b/kernel/src/ipc/id.rs @@ -0,0 +1,78 @@ +use ida::IdAllocator; +use system_error::SystemError; + +/// Linux-compatible SysV IPC id allocator. +/// +/// A user-visible IPC id is encoded as `(seq << IPC_ID_SEQ_SHIFT) | idx`. +/// The low bits address the object table, and the high bits distinguish stale +/// userspace ids after an index is reused. +#[derive(Debug)] +pub struct IpcIdAllocator { + ida: IdAllocator, + seq: usize, + last_idx: Option, +} + +#[derive(Debug, Clone, Copy)] +pub struct IpcId { + pub raw: usize, + pub idx: usize, + pub seq: usize, +} + +impl IpcIdAllocator { + pub const IPC_ID_INDEX_BITS: usize = 15; + pub const IPC_ID_IDX_MASK: usize = (1usize << Self::IPC_ID_INDEX_BITS) - 1; + pub const IPC_ID_SEQ_SHIFT: usize = Self::IPC_ID_INDEX_BITS; + pub const IPC_ID_SEQ_MAX: usize = (i32::MAX as usize) >> Self::IPC_ID_SEQ_SHIFT; + + pub fn new(max_ids: usize) -> Result { + if max_ids == 0 || max_ids > Self::IPC_ID_IDX_MASK + 1 { + return Err(SystemError::EINVAL); + } + + Ok(Self { + ida: IdAllocator::new(0, max_ids).ok_or(SystemError::EINVAL)?, + seq: 0, + last_idx: None, + }) + } + + pub fn alloc(&mut self) -> Result { + let idx = self.ida.alloc().ok_or(SystemError::ENOSPC)?; + if let Some(last_idx) = self.last_idx { + if idx <= last_idx { + self.seq += 1; + if self.seq >= Self::IPC_ID_SEQ_MAX { + self.seq = 0; + } + } + } + self.last_idx = Some(idx); + + Ok(IpcId { + raw: Self::build_raw(idx, self.seq), + idx, + seq: self.seq, + }) + } + + pub fn free_idx(&mut self, idx: usize) { + self.ida.free(idx); + } + + pub fn decode(raw: usize) -> Result { + if raw > i32::MAX as usize { + return Err(SystemError::EINVAL); + } + + let idx = 
raw & Self::IPC_ID_IDX_MASK; + let seq = raw >> Self::IPC_ID_SEQ_SHIFT; + Ok(IpcId { raw, idx, seq }) + } + + #[inline] + pub fn build_raw(idx: usize, seq: usize) -> usize { + (seq << Self::IPC_ID_SEQ_SHIFT) | idx + } +} diff --git a/kernel/src/ipc/mod.rs b/kernel/src/ipc/mod.rs index b062eefc1a..ea636f6178 100644 --- a/kernel/src/ipc/mod.rs +++ b/kernel/src/ipc/mod.rs @@ -1,4 +1,5 @@ pub mod generic_signal; +pub mod id; pub mod kill; pub mod pipe; pub mod shm; diff --git a/kernel/src/ipc/shm.rs b/kernel/src/ipc/shm.rs index 84ce513e39..430292171a 100644 --- a/kernel/src/ipc/shm.rs +++ b/kernel/src/ipc/shm.rs @@ -1,28 +1,123 @@ use crate::{ - arch::mm::LockedFrameAllocator, - libs::align::page_align_up, - mm::{ - allocator::page_frame::{FrameAllocator, PageFrameCount, PhysPageFrame}, - page::{page_manager_lock, PageFlags, PageType}, - PhysAddr, + arch::MMArch, + filesystem::{ + page_cache::PageCache, + tmpfs::{create_unlinked_shmem_file, TmpfsShmemFile}, + vfs::{ + file::{File, FileFlags}, + InodeId, + }, + }, + ipc::id::IpcIdAllocator, + libs::mutex::Mutex, + mm::MemoryManagementArch, + process::{ + cred::{capable, ns_capable, CAPFlags, Cred, Kgid, Kuid}, + namespace::{ + ipc_namespace::IpcNamespace, + user_namespace::{map_id_down, map_id_up, UserNamespace}, + }, + resource::RLimitID, + ProcessManager, RawPid, }, - process::{cred::Cred, ProcessManager, RawPid}, - syscall::user_access::{UserBufferReader, UserBufferWriter}, time::PosixTimeSpec, }; use alloc::sync::Arc; -use core::fmt; +use core::{ + fmt, + hash::{Hash, Hasher}, + sync::atomic::{AtomicU64, Ordering}, +}; use hashbrown::HashMap; -use ida::IdAllocator; use num::ToPrimitive; use system_error::SystemError; /// 用于创建新的私有IPC对象 pub const IPC_PRIVATE: ShmKey = ShmKey::new(0); +const DEFAULT_OVERFLOW_ID: u32 = 65534; int_like!(ShmId, usize); int_like!(ShmKey, usize); +static NEXT_SYSV_SHM_ATTACH_ID: AtomicU64 = AtomicU64::new(1); + +lazy_static::lazy_static! 
{ + static ref SYSV_SHM_MEMLOCK_ACCOUNT: Mutex> = + Mutex::new(HashMap::new()); +} + +pub type SysVShmBackingRef = Arc; + +pub trait SysVShmBacking: fmt::Debug + Send + Sync { + fn inode_id(&self) -> InodeId; + + fn open_file(&self, readonly: bool) -> Result, SystemError>; + + fn resident_pages(&self) -> Result; + + fn set_locked(&self, locked: bool) -> (Arc, bool); +} + +impl SysVShmBacking for TmpfsShmemFile { + fn inode_id(&self) -> InodeId { + TmpfsShmemFile::inode_id(self) + } + + fn open_file(&self, readonly: bool) -> Result, SystemError> { + let flags = if readonly { + FileFlags::O_LARGEFILE + } else { + FileFlags::O_RDWR | FileFlags::O_LARGEFILE + }; + Ok(Arc::new(File::new(self.inode(), flags)?)) + } + + fn resident_pages(&self) -> Result { + self.page_cache().manager().pages_count() + } + + fn set_locked(&self, locked: bool) -> (Arc, bool) { + TmpfsShmemFile::set_locked(self, locked) + } +} + +#[derive(Clone, Copy, Eq)] +struct SysVShmMemlockAccountKey { + user_ns: usize, + uid: usize, +} + +impl SysVShmMemlockAccountKey { + fn new(user_ns: &Arc, uid: usize) -> Self { + Self { + user_ns: Arc::as_ptr(user_ns) as usize, + uid, + } + } +} + +impl PartialEq for SysVShmMemlockAccountKey { + fn eq(&self, other: &Self) -> bool { + self.user_ns == other.user_ns && self.uid == other.uid + } +} + +impl Hash for SysVShmMemlockAccountKey { + fn hash(&self, state: &mut H) { + self.user_ns.hash(state); + self.uid.hash(state); + } +} + +impl fmt::Debug for SysVShmMemlockAccountKey { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SysVShmMemlockAccountKey") + .field("user_ns", &format_args!("{:#x}", self.user_ns)) + .field("uid", &self.uid) + .finish() + } +} + bitflags! 
{ pub struct ShmFlags:u32{ const PERM_MASK = 0o777; @@ -106,15 +201,168 @@ impl PartialEq for ShmCtlCmd { } } +pub struct SysVShmAttach { + attach_id: u64, + ipcns: Arc, + shmid: ShmId, + backing_inode_id: InodeId, + size: usize, + attach_file: Arc, +} + +impl fmt::Debug for SysVShmAttach { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SysVShmAttach") + .field("attach_id", &self.attach_id) + .field("shmid", &self.shmid) + .field("backing_inode_id", &self.backing_inode_id) + .finish_non_exhaustive() + } +} + +impl SysVShmAttach { + pub fn new( + ipcns: Arc, + shmid: ShmId, + backing_inode_id: InodeId, + size: usize, + attach_file: Arc, + ) -> Arc { + Arc::new(Self { + attach_id: NEXT_SYSV_SHM_ATTACH_ID.fetch_add(1, Ordering::Relaxed), + ipcns, + shmid, + backing_inode_id, + size, + attach_file, + }) + } + + pub fn attach_id(&self) -> u64 { + self.attach_id + } + + pub fn shmid(&self) -> ShmId { + self.shmid + } + + pub fn backing_inode_id(&self) -> InodeId { + self.backing_inode_id + } + + pub fn size(&self) -> usize { + self.size + } + + pub fn attach_file(&self) -> Arc { + self.attach_file.clone() + } + + pub fn open_vma(&self) -> Result<(), SystemError> { + let mut guard = self.ipcns.shm.lock(); + guard.attach_open(self.shmid, self.backing_inode_id) + } + + pub fn close_vma(&self) { + let destroy = { + let mut guard = self.ipcns.shm.lock(); + guard.attach_close(self.shmid, self.backing_inode_id) + }; + if let Some(destroy) = destroy { + destroy.finish_or_log("SysV SHM close_vma destroy cleanup"); + } + } +} + +pub struct SysVShmAttachGuard { + ipcns: Arc, + shmid: ShmId, + backing: SysVShmBackingRef, + backing_inode_id: InodeId, + size: usize, + active: bool, +} + +impl fmt::Debug for SysVShmAttachGuard { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SysVShmAttachGuard") + .field("shmid", &self.shmid) + .field("backing_inode_id", &self.backing_inode_id) + .field("size", &self.size) + 
.field("active", &self.active) + .finish_non_exhaustive() + } +} + +impl SysVShmAttachGuard { + fn new( + ipcns: Arc, + shmid: ShmId, + backing: SysVShmBackingRef, + backing_inode_id: InodeId, + size: usize, + ) -> Self { + Self { + ipcns, + shmid, + backing, + backing_inode_id, + size, + active: true, + } + } + + pub fn size(&self) -> usize { + self.size + } + + pub fn create_attach(&self, readonly: bool) -> Result, SystemError> { + let attach_file = self.backing.open_file(readonly)?; + Ok(SysVShmAttach::new( + self.ipcns.clone(), + self.shmid, + self.backing_inode_id, + self.size, + attach_file, + )) + } + + pub fn finish(mut self) { + self.release_pin(); + } + + fn release_pin(&mut self) { + if !self.active { + return; + } + self.active = false; + let destroy = { + let mut guard = self.ipcns.shm.lock(); + guard.attach_end(self.shmid, self.backing_inode_id) + }; + if let Some(destroy) = destroy { + destroy.finish_or_log("SysV SHM attach guard destroy cleanup"); + } + } +} + +impl Drop for SysVShmAttachGuard { + fn drop(&mut self) { + self.release_pin(); + } +} + /// 共享内存管理器 #[derive(Debug)] pub struct ShmManager { /// ShmId分配器 - id_allocator: IdAllocator, - /// ShmId映射共享内存信息表 - id2shm: HashMap, + id_allocator: IpcIdAllocator, + /// 低位 IPC idx 映射共享内存信息表 + id2shm: HashMap, /// ShmKey映射ShmId表 key2id: HashMap, + /// SysV SHM namespace-wide allocated pages, matching Linux shm_tot/SHMALL accounting. 
+ total_pages: usize, } impl Default for ShmManager { @@ -124,11 +372,59 @@ impl Default for ShmManager { } impl ShmManager { + const IPC_READ: u32 = 0o4; + const IPC_WRITE: u32 = 0o2; + const IPC_EXEC: u32 = 0o1; + pub fn new() -> Self { ShmManager { - id_allocator: IdAllocator::new(0, usize::MAX - 1).unwrap(), + id_allocator: IpcIdAllocator::new(PosixShmMetaInfo::SHMMNI).unwrap(), id2shm: HashMap::new(), key2id: HashMap::new(), + total_pages: 0, + } + } + + pub fn page_count_for_size(size: usize) -> Result { + let rounded = size + .checked_add(MMArch::PAGE_SIZE - 1) + .ok_or(SystemError::ENOSPC)? + & !(MMArch::PAGE_SIZE - 1); + Ok(rounded >> MMArch::PAGE_SHIFT) + } + + pub fn validate_new_segment_size(&self, size: usize) -> Result { + if !(PosixShmMetaInfo::SHMMIN..=PosixShmMetaInfo::SHMMAX).contains(&size) { + return Err(SystemError::EINVAL); + } + + let numpages = Self::page_count_for_size(size)?; + let total_pages_after = self + .total_pages + .checked_add(numpages) + .ok_or(SystemError::ENOSPC)?; + if total_pages_after > PosixShmMetaInfo::SHMALL { + return Err(SystemError::ENOSPC); + } + + Ok(numpages) + } + + fn release_total_pages(&mut self, pages: usize) { + if let Some(total_pages) = self.total_pages.checked_sub(pages) { + self.total_pages = total_pages; + } else { + log::error!( + "SysV SHM total_pages accounting underflow: total_pages={}, release={}", + self.total_pages, + pages + ); + debug_assert!( + false, + "SysV SHM total_pages accounting underflow: total_pages={}, release={}", + self.total_pages, pages + ); + self.total_pages = 0; } } @@ -144,120 +440,380 @@ impl ShmManager { /// /// 成功:共享内存id /// 失败:对应错误码 - pub fn add( + pub fn add_prepared( &mut self, key: ShmKey, size: usize, shmflg: ShmFlags, + backing: SysVShmBackingRef, + numpages: usize, ) -> Result { - // 判断共享内存大小是否过小或溢出 - if !(PosixShmMetaInfo::SHMMIN..=PosixShmMetaInfo::SHMMAX).contains(&size) { + let expected_numpages = self.validate_new_segment_size(size)?; + if expected_numpages != 
numpages { return Err(SystemError::EINVAL); } - - let id = self.id_allocator.alloc().ok_or(SystemError::ENOSPC)?; - // TODO: 实现 IPC 序列号机制以防止 ID 重用攻击 - // 参考 Linux ipc_idr_alloc(): - // - 跟踪 last_idx,检测 idx 回绕时递增 seq - // - 构建最终 ID: (seq << SEQ_SHIFT) | idx - // - 在获取对象时验证 seq (ipc_checkid) - // - // 具体实现方式见 (https://github.com/DragonOS-Community/DragonOS/issues/1678) - let shm_id = ShmId::new(id); - - // 分配共享内存页面 - let page_count = - PageFrameCount::from_bytes(page_align_up(size)).ok_or(SystemError::EINVAL)?; - // 创建共享内存page,并添加到PAGE_MANAGER中 - let mut page_manager_guard = page_manager_lock(); - let (paddr, _page) = page_manager_guard.create_pages( - PageType::Shm, - PageFlags::PG_UNEVICTABLE, - &mut LockedFrameAllocator, - page_count, - )?; + let total_pages_after = self + .total_pages + .checked_add(numpages) + .ok_or(SystemError::ENOSPC)?; + let ipc_id = self.id_allocator.alloc()?; + let shm_id = ShmId::new(ipc_id.raw); // 创建共享内存段信息结构体 let current_cred = ProcessManager::current_pcb().cred(); - let kern_ipc_perm = - KernIpcPerm::new_with_cred(shm_id, key, current_cred, shmflg & ShmFlags::PERM_MASK); - let shm_kernel = KernelShm::new(kern_ipc_perm, paddr, size); + let kern_ipc_perm = KernIpcPerm::new_with_cred( + shm_id, + key, + current_cred, + shmflg & ShmFlags::PERM_MASK, + ipc_id.seq, + ); + let shm_kernel = KernelShm::new(kern_ipc_perm, backing, size, numpages); // 更新共享内存管理器相关映射表 - self.key2id.insert(key, shm_id); - self.id2shm.insert(shm_id, shm_kernel); + if key != IPC_PRIVATE { + self.key2id.insert(key, shm_id); + } + self.id2shm.insert(ipc_id.idx, shm_kernel); + self.total_pages = total_pages_after; return Ok(shm_id.data()); } + pub fn create_default_backing(size: usize) -> Result { + Ok(create_unlinked_shmem_file(size)?) 
+ } + pub fn contains_key(&self, key: &ShmKey) -> Option<&ShmId> { self.key2id.get(key) } - pub fn get_mut(&mut self, id: &ShmId) -> Option<&mut KernelShm> { - self.id2shm.get_mut(id) + pub fn get_by_shmid_checked(&self, id: ShmId) -> Result<&KernelShm, SystemError> { + let decoded = IpcIdAllocator::decode(id.data())?; + let kernel_shm = self.id2shm.get(&decoded.idx).ok_or(SystemError::EINVAL)?; + if kernel_shm.kern_ipc_perm.id != id || kernel_shm.kern_ipc_perm.seq != decoded.seq { + return Err(SystemError::EINVAL); + } + + Ok(kernel_shm) + } + + pub fn get_by_shmid_checked_mut(&mut self, id: ShmId) -> Result<&mut KernelShm, SystemError> { + let decoded = IpcIdAllocator::decode(id.data())?; + let kernel_shm = self + .id2shm + .get_mut(&decoded.idx) + .ok_or(SystemError::EINVAL)?; + if kernel_shm.kern_ipc_perm.id != id || kernel_shm.kern_ipc_perm.seq != decoded.seq { + return Err(SystemError::EINVAL); + } + + Ok(kernel_shm) + } + + fn get_by_index_for_shm_stat(&self, idx: usize) -> Result<&KernelShm, SystemError> { + if idx > IpcIdAllocator::IPC_ID_IDX_MASK { + return Err(SystemError::EINVAL); + } + self.id2shm.get(&idx).ok_or(SystemError::EINVAL) + } + + fn get_by_attach_token_mut( + &mut self, + id: ShmId, + backing_inode_id: InodeId, + ) -> Result<&mut KernelShm, SystemError> { + let decoded = IpcIdAllocator::decode(id.data())?; + let kernel_shm = self + .id2shm + .get_mut(&decoded.idx) + .ok_or(SystemError::EINVAL)?; + if kernel_shm.kern_ipc_perm.id != id || kernel_shm.kern_ipc_perm.seq != decoded.seq { + return Err(SystemError::EINVAL); + } + if kernel_shm.backing_inode_id != backing_inode_id { + return Err(SystemError::EINVAL); + } + + Ok(kernel_shm) } pub fn free_key(&mut self, key: &ShmKey) { self.key2id.remove(key); } - pub fn free_id(&mut self, id: &ShmId) { - self.id2shm.remove(id); - self.id_allocator.free(id.0); + pub fn free_id(&mut self, id: &ShmId) -> Option { + let Ok(decoded) = IpcIdAllocator::decode(id.data()) else { + return None; + }; + let 
current = self.id2shm.get(&decoded.idx)?; + if current.kern_ipc_perm.id != *id || current.kern_ipc_perm.seq != decoded.seq { + return None; + } + if let Some(shm) = self.id2shm.remove(&decoded.idx) { + self.release_total_pages(shm.numpages); + self.id_allocator.free_idx(decoded.idx); + self.key2id.remove(&shm.kern_ipc_perm.key); + return Some(KernelShmDestroy::new(shm)); + } + None + } + + fn cred_in_group(cred: &Cred, gid: Kgid) -> bool { + cred.fsgid == gid + || cred.groups.contains(&gid) + || cred + .group_info + .as_ref() + .map(|group_info| group_info.gids.contains(&gid)) + .unwrap_or(false) } - pub fn ipc_info(&self, user_buf: *const u8, from_user: bool) -> Result { - let mut user_buffer_writer = UserBufferWriter::new( - user_buf as *mut u8, - core::mem::size_of::(), - from_user, - )?; + fn ipc_permission( + kern_ipc_perm: &KernIpcPerm, + requested: u32, + target_user_ns: &Arc, + ) -> Result<(), SystemError> { + let requested = ((requested >> 6) | (requested >> 3) | requested) & 0o7; + if requested == 0 { + return Ok(()); + } + + let cred = ProcessManager::current_pcb().cred(); + let mut granted = kern_ipc_perm.mode.bits(); + if cred.euid == kern_ipc_perm.cuid || cred.euid == kern_ipc_perm.uid { + granted >>= 6; + } else if Self::cred_in_group(&cred, kern_ipc_perm.cgid) + || Self::cred_in_group(&cred, kern_ipc_perm.gid) + { + granted >>= 3; + } - let shm_meta_info = PosixShmMetaInfo::new(); - user_buffer_writer.copy_one_to_user(&shm_meta_info, 0)?; + if (requested & !(granted & 0o7)) != 0 + && !ns_capable(target_user_ns, CAPFlags::CAP_IPC_OWNER) + { + return Err(SystemError::EACCES); + } - return Ok(0); + Ok(()) + } + + fn check_control_permission( + kern_ipc_perm: &KernIpcPerm, + target_user_ns: &Arc, + ) -> Result<(), SystemError> { + let cred = ProcessManager::current_pcb().cred(); + if cred.euid == kern_ipc_perm.cuid + || cred.euid == kern_ipc_perm.uid + || ns_capable(target_user_ns, CAPFlags::CAP_SYS_ADMIN) + { + Ok(()) + } else { + 
Err(SystemError::EPERM) + } + } + + fn check_lock_permission( + kern_ipc_perm: &KernIpcPerm, + target_user_ns: &Arc, + ) -> Result<(), SystemError> { + let cred = ProcessManager::current_pcb().cred(); + if cred.euid == kern_ipc_perm.cuid + || cred.euid == kern_ipc_perm.uid + || ns_capable(target_user_ns, CAPFlags::CAP_IPC_LOCK) + { + Ok(()) + } else { + Err(SystemError::EPERM) + } + } + + pub(crate) fn charge_memlock_for_shm(size: usize) -> Result { + let pcb = ProcessManager::current_pcb(); + let rlimit = pcb.get_rlimit(RLimitID::Memlock).rlim_cur; + let bytes = size + .checked_add(MMArch::PAGE_SIZE - 1) + .ok_or(SystemError::ENOMEM)? + & !(MMArch::PAGE_SIZE - 1); + let cred = pcb.cred(); + let uid = cred.uid.data(); + let account_user_ns = cred.user_ns.clone(); + let account_key = SysVShmMemlockAccountKey::new(&account_user_ns, uid); + let mut guard = SYSV_SHM_MEMLOCK_ACCOUNT.lock(); + let current = guard.get(&account_key).copied().unwrap_or(0); + let next = current.checked_add(bytes).ok_or(SystemError::ENOMEM)?; + if (next as u128) > rlimit as u128 && !capable(CAPFlags::CAP_IPC_LOCK) { + return Err(SystemError::ENOMEM); + } + + guard.insert(account_key, next); + Ok(SysVShmMemlockToken { + account_user_ns, + account_key, + bytes, + }) + } + + pub fn check_existing_key_permission( + &self, + id: ShmId, + shmflg: ShmFlags, + ) -> Result<(), SystemError> { + let kernel_shm = self.get_by_shmid_checked(id)?; + let requested = shmflg.bits() & ShmFlags::PERM_MASK.bits(); + let target_user_ns = ProcessManager::current_ipcns().user_ns.clone(); + Self::ipc_permission(&kernel_shm.kern_ipc_perm, requested, &target_user_ns) + } + + fn maybe_take_destroy_candidate_locked(&mut self, id: ShmId) -> Option { + let decoded = IpcIdAllocator::decode(id.data()).ok()?; + let shm = self.id2shm.get(&decoded.idx)?; + if shm.kern_ipc_perm.id != id || shm.kern_ipc_perm.seq != decoded.seq { + return None; + } + if !shm.mode().contains(ShmFlags::SHM_DEST) || shm.nattch() != 0 || shm.pin_count 
!= 0 { + return None; + } + + let shm = self.id2shm.remove(&decoded.idx)?; + self.release_total_pages(shm.numpages); + self.id_allocator.free_idx(decoded.idx); + self.key2id.remove(&shm.kern_ipc_perm.key); + Some(KernelShmDestroy::new(shm)) + } + + pub fn attach_begin( + &mut self, + ipcns: Arc, + id: ShmId, + readonly: bool, + executable: bool, + ) -> Result { + let kernel_shm = self.get_by_shmid_checked_mut(id)?; + let mut requested = Self::IPC_READ; + if !readonly { + requested |= Self::IPC_WRITE; + } + if executable { + requested |= Self::IPC_EXEC; + } + Self::ipc_permission(&kernel_shm.kern_ipc_perm, requested, &ipcns.user_ns)?; + let backing = kernel_shm.backing.clone(); + let backing_inode_id = kernel_shm.backing_inode_id; + let size = kernel_shm.size(); + kernel_shm.pin_count = kernel_shm + .pin_count + .checked_add(1) + .ok_or(SystemError::EOVERFLOW)?; + Ok(SysVShmAttachGuard::new( + ipcns, + id, + backing, + backing_inode_id, + size, + )) } - pub fn shm_info(&self, user_buf: *const u8, from_user: bool) -> Result { + pub fn attach_open(&mut self, id: ShmId, backing_inode_id: InodeId) -> Result<(), SystemError> { + let kernel_shm = self.get_by_attach_token_mut(id, backing_inode_id)?; + kernel_shm.update_atim(); + kernel_shm.increase_count()?; + Ok(()) + } + + fn attach_close(&mut self, id: ShmId, backing_inode_id: InodeId) -> Option { + let kernel_shm = match self.get_by_attach_token_mut(id, backing_inode_id) { + Ok(kernel_shm) => kernel_shm, + Err(err) => { + log::error!( + "SysV SHM attach_close token mismatch for shmid={}, backing_inode_id={:?}: {:?}", + id.data(), + backing_inode_id, + err + ); + debug_assert!( + false, + "SysV SHM attach_close token mismatch for shmid={}", + id.data() + ); + return None; + } + }; + kernel_shm.update_dtim(); + kernel_shm.decrease_count(); + self.maybe_take_destroy_candidate_locked(id) + } + + fn attach_end(&mut self, id: ShmId, backing_inode_id: InodeId) -> Option { + let kernel_shm = match 
self.get_by_attach_token_mut(id, backing_inode_id) { + Ok(kernel_shm) => kernel_shm, + Err(err) => { + log::error!( + "SysV SHM attach_end token mismatch for shmid={}, backing_inode_id={:?}: {:?}", + id.data(), + backing_inode_id, + err + ); + debug_assert!( + false, + "SysV SHM attach_end token mismatch for shmid={}", + id.data() + ); + return None; + } + }; + if let Some(pin_count) = kernel_shm.pin_count.checked_sub(1) { + kernel_shm.pin_count = pin_count; + } else { + log::error!("SysV SHM pin_count underflow for shmid={}", id.data()); + debug_assert!( + false, + "SysV SHM pin_count underflow for shmid={}", + id.data() + ); + kernel_shm.pin_count = 0; + } + self.maybe_take_destroy_candidate_locked(id) + } + + fn current_max_index(&self) -> usize { + self.id2shm.keys().copied().max().unwrap_or(0) + } + + pub fn ipc_info_data(&self) -> (usize, PosixShmMetaInfo) { + (self.current_max_index(), PosixShmMetaInfo::new()) + } + + pub fn shm_info_data(&self) -> Result<(usize, PosixShmInfo), SystemError> { // 已使用id数量 let used_ids = self.id2shm.len().to_i32().ok_or(SystemError::EOVERFLOW)?; - // 共享内存总和 - let shm_tot = self.id2shm.iter().fold(0, |acc, (_, kernel_shm)| { - acc + PageFrameCount::from_bytes(page_align_up(kernel_shm.shm_size)) - .unwrap() - .data() - }); - let shm_info = PosixShmInfo::new(used_ids, shm_tot, 0, 0, 0, 0); - - let mut user_buffer_writer = UserBufferWriter::new( - user_buf as *mut u8, - core::mem::size_of::(), - from_user, - )?; - user_buffer_writer.copy_one_to_user(&shm_info, 0)?; - - return Ok(0); + let shm_rss = self.id2shm.values().try_fold(0usize, |acc, shm| { + let resident = shm.backing.resident_pages()?; + acc.checked_add(resident).ok_or(SystemError::EOVERFLOW) + })?; + let shm_info = PosixShmInfo::new(used_ids, self.total_pages, shm_rss, 0, 0, 0); + Ok((self.current_max_index(), shm_info)) } - pub fn shm_stat( + pub fn shm_stat_data( &self, id: ShmId, cmd: ShmCtlCmd, - user_buf: *const u8, - from_user: bool, - ) -> Result { - let 
kernel_shm = self.id2shm.get(&id).ok_or(SystemError::EINVAL)?; + ) -> Result<(usize, PosixShmIdDs), SystemError> { + let kernel_shm = match cmd { + ShmCtlCmd::IpcStat => self.get_by_shmid_checked(id)?, + ShmCtlCmd::ShmStat | ShmCtlCmd::ShmtStatAny => { + self.get_by_index_for_shm_stat(id.data())? + } + _ => return Err(SystemError::EINVAL), + }; + if cmd != ShmCtlCmd::ShmtStatAny { + let target_user_ns = ProcessManager::current_ipcns().user_ns.clone(); + Self::ipc_permission(&kernel_shm.kern_ipc_perm, Self::IPC_READ, &target_user_ns)?; + } let kern_ipc_perm = &kernel_shm.kern_ipc_perm; - let _key = kern_ipc_perm - .key - .data() - .to_i32() - .ok_or(SystemError::EOVERFLOW)?; - let _mode = kern_ipc_perm.mode.bits(); - - let shm_perm = PosixIpcPerm::try_from(kern_ipc_perm)?; + let current_user_ns = ProcessManager::current_user_ns(); + let shm_perm = kern_ipc_perm.to_posix(¤t_user_ns)?; let shm_segsz = kernel_shm.shm_size; let shm_atime = kernel_shm.shm_atim.tv_sec; let shm_dtime = kernel_shm.shm_dtim.tv_sec; @@ -286,110 +842,178 @@ impl ShmManager { _unused2: 0, }; - let mut user_buffer_writer = UserBufferWriter::new( - user_buf as *mut u8, - core::mem::size_of::(), - from_user, - )?; - user_buffer_writer.copy_one_to_user(&shm_id_ds, 0)?; - let r: usize = if cmd == ShmCtlCmd::IpcStat { 0 } else { - id.data() + kern_ipc_perm.id.data() }; - return Ok(r); + return Ok((r, shm_id_ds)); } - pub fn ipc_set( - &mut self, - id: ShmId, - user_buf: *const u8, - from_user: bool, - ) -> Result { - let kernel_shm = self.id2shm.get_mut(&id).ok_or(SystemError::EINVAL)?; - - let user_buffer_reader = - UserBufferReader::new(user_buf, core::mem::size_of::(), from_user)?; - let mut shm_id_ds = PosixShmIdDs::default(); - user_buffer_reader.copy_one_from_user(&mut shm_id_ds, 0)?; + pub fn ipc_set(&mut self, id: ShmId, shm_id_ds: PosixShmIdDs) -> Result { + let kernel_shm = self.get_by_shmid_checked_mut(id)?; + let target_user_ns = ProcessManager::current_ipcns().user_ns.clone(); + 
Self::check_control_permission(&kernel_shm.kern_ipc_perm, &target_user_ns)?; - kernel_shm.copy_from(shm_id_ds); + let current_user_ns = ProcessManager::current_user_ns(); + kernel_shm.copy_from(shm_id_ds, ¤t_user_ns)?; return Ok(0); } - pub fn ipc_rmid(&mut self, id: ShmId) -> Result { - let kernel_shm = self.id2shm.get_mut(&id).ok_or(SystemError::EINVAL)?; - kernel_shm.set_mode(ShmFlags::SHM_DEST, true); + pub(crate) fn ipc_rmid(&mut self, id: ShmId) -> Result, SystemError> { + let key = { + let kernel_shm = self.get_by_shmid_checked_mut(id)?; + let target_user_ns = ProcessManager::current_ipcns().user_ns.clone(); + Self::check_control_permission(&kernel_shm.kern_ipc_perm, &target_user_ns)?; + // Linux do_shm_rmid() marks an attached segment as SHM_DEST and + // hides its key, but does not refresh shm_ctim. IPC_SET remains + // the metadata-changing operation that updates shm_ctim. + kernel_shm.set_mode_no_ctime(ShmFlags::SHM_DEST, true); + let key = kernel_shm.kern_ipc_perm.key; + kernel_shm.kern_ipc_perm.key = IPC_PRIVATE; + key + }; + self.free_key(&key); + Ok(self.maybe_take_destroy_candidate_locked(id)) + } - let mut cur_phys = PhysPageFrame::new(kernel_shm.shm_start_paddr); - let count = PageFrameCount::from_bytes(page_align_up(kernel_shm.shm_size)) - .ok_or(SystemError::EINVAL)?; - let key = kernel_shm.kern_ipc_perm.key; - let id = kernel_shm.kern_ipc_perm.id; - let map_count = kernel_shm.map_count(); - - let mut page_manager_guard = page_manager_lock(); - if map_count > 0 { - // 设置共享内存物理页当映射计数等于0时可被回收 - // TODO 后续需要加入到lru中 - for _ in 0..count.data() { - let paddr = cur_phys.phys_address(); - let page = page_manager_guard.get(&paddr).ok_or(SystemError::EFAULT)?; - page.write().remove_flags(PageFlags::PG_UNEVICTABLE); - - cur_phys = cur_phys.next(); - } + pub(crate) fn shm_lock_begin(&mut self, id: ShmId) -> Result { + let kernel_shm = self.get_by_shmid_checked_mut(id)?; + let target_user_ns = ProcessManager::current_ipcns().user_ns.clone(); + 
Self::check_lock_permission(&kernel_shm.kern_ipc_perm, &target_user_ns)?; + let has_target_ns_cap = ns_capable(&target_user_ns, CAPFlags::CAP_IPC_LOCK); + if ProcessManager::current_pcb() + .get_rlimit(RLimitID::Memlock) + .rlim_cur + == 0 + && !has_target_ns_cap + { + return Err(SystemError::EPERM); + } + if kernel_shm.mode().contains(ShmFlags::SHM_LOCKED) { + return Ok(ShmLockBegin::Done(None)); + } - // 释放key,不让后续进程连接 - self.free_key(&key); - } else { - // 释放共享内存物理页 - for _ in 0..count.data() { - let paddr = cur_phys.phys_address(); - unsafe { - LockedFrameAllocator.free(paddr, PageFrameCount::new(1)); - } - // 将已回收的物理页面对应的Page从PAGE_MANAGER中删去 - page_manager_guard.remove_page(&paddr); - cur_phys = cur_phys.next(); - } + Ok(ShmLockBegin::NeedCharge { + size: kernel_shm.shm_size, + }) + } - // 释放key和id - self.free_id(&id); - self.free_key(&key) + pub(crate) fn shm_lock_commit( + &mut self, + id: ShmId, + token: SysVShmMemlockToken, + ) -> Result, bool)>, SystemError> { + let kernel_shm = match self.get_by_shmid_checked_mut(id) { + Ok(kernel_shm) => kernel_shm, + Err(err) => { + token.release(); + return Err(err); + } + }; + let target_user_ns = ProcessManager::current_ipcns().user_ns.clone(); + if let Err(err) = Self::check_lock_permission(&kernel_shm.kern_ipc_perm, &target_user_ns) { + token.release(); + return Err(err); + } + if kernel_shm.mode().contains(ShmFlags::SHM_LOCKED) { + token.release(); + return Ok(None); } - return Ok(0); + let page_cache = kernel_shm.backing.set_locked(true); + kernel_shm.memlock_token = Some(token); + kernel_shm.set_mode_no_ctime(ShmFlags::SHM_LOCKED, true); + Ok(Some(page_cache)) } - pub fn shm_lock(&mut self, id: ShmId) -> Result { - let kernel_shm = self.id2shm.get_mut(&id).ok_or(SystemError::EINVAL)?; - kernel_shm.set_mode(ShmFlags::SHM_LOCKED, true); + pub fn shm_unlock(&mut self, id: ShmId) -> Result, bool)>, SystemError> { + let kernel_shm = self.get_by_shmid_checked_mut(id)?; + let target_user_ns = 
ProcessManager::current_ipcns().user_ns.clone(); + Self::check_lock_permission(&kernel_shm.kern_ipc_perm, &target_user_ns)?; + if !kernel_shm.mode().contains(ShmFlags::SHM_LOCKED) { + return Ok(None); + } - return Ok(0); + let page_cache = kernel_shm.backing.set_locked(false); + kernel_shm.set_mode_no_ctime(ShmFlags::SHM_LOCKED, false); + if let Some(token) = kernel_shm.memlock_token.take() { + token.release(); + } + Ok(Some(page_cache)) } +} - pub fn shm_unlock(&mut self, id: ShmId) -> Result { - let kernel_shm = self.id2shm.get_mut(&id).ok_or(SystemError::EINVAL)?; - kernel_shm.set_mode(ShmFlags::SHM_LOCKED, false); +#[derive(Debug)] +pub(crate) enum ShmLockBegin { + Done(Option<(Arc, bool)>), + NeedCharge { size: usize }, +} - return Ok(0); +pub(crate) struct SysVShmMemlockToken { + account_user_ns: Arc, + account_key: SysVShmMemlockAccountKey, + bytes: usize, +} + +impl fmt::Debug for SysVShmMemlockToken { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SysVShmMemlockToken") + .field( + "account_user_ns", + &format_args!("{:#x}", Arc::as_ptr(&self.account_user_ns) as usize), + ) + .field("account_key", &self.account_key) + .field("bytes", &self.bytes) + .finish() + } +} + +impl SysVShmMemlockToken { + fn release(self) { + let mut guard = SYSV_SHM_MEMLOCK_ACCOUNT.lock(); + if let Some(current) = guard.get_mut(&self.account_key) { + if let Some(next) = current.checked_sub(self.bytes) { + *current = next; + } else { + log::error!( + "SysV SHM memlock accounting underflow: current={}, release={}", + *current, + self.bytes + ); + debug_assert!( + false, + "SysV SHM memlock accounting underflow: current={}, release={}", + *current, self.bytes + ); + *current = 0; + } + if *current == 0 { + guard.remove(&self.account_key); + } + } } } + /// 共享内存段信息 #[derive(Debug)] pub struct KernelShm { /// 权限信息 kern_ipc_perm: KernIpcPerm, - /// 共享内存段起始物理地址 - shm_start_paddr: PhysAddr, + /// 共享内存段底层 backing。当前默认实现为 tmpfs,但 SysV IPC 层只依赖此抽象。 + backing: 
SysVShmBackingRef, + /// backing inode id cached at creation time; read under shm SpinLock without touching tmpfs mutexes. + backing_inode_id: InodeId, /// 共享内存段大小(bytes),注意是用户指定的大小(未经过页面对齐) shm_size: usize, - /// 映射计数 - map_count: usize, + /// 共享内存段页面数,用于 SysV SHMALL/shm_tot accounting. + numpages: usize, + /// live SysV VMA descriptor 计数 + nattch: usize, + /// attach 正在建立过程中的临时 pin 计数 + pin_count: usize, /// 最后一次 attach 的时间 shm_atim: PosixTimeSpec, /// 最后一次 detach 的时间 @@ -400,28 +1024,35 @@ pub struct KernelShm { shm_cprid: RawPid, /// 最后操作者进程id (这里的操作者是指最后一次 attach 或 detach 操作的进程,创建共享内存段的进程不算操作者) shm_lprid: RawPid, + /// SysV SHM_LOCK memlock accounting token. + memlock_token: Option, } impl KernelShm { - pub fn new(kern_ipc_perm: KernIpcPerm, shm_start_paddr: PhysAddr, shm_size: usize) -> Self { + pub fn new( + kern_ipc_perm: KernIpcPerm, + backing: SysVShmBackingRef, + shm_size: usize, + numpages: usize, + ) -> Self { let shm_cprid = ProcessManager::current_pid(); KernelShm { kern_ipc_perm, - shm_start_paddr, + backing_inode_id: backing.inode_id(), + backing, shm_size, - map_count: 0, + numpages, + nattch: 0, + pin_count: 0, shm_atim: PosixTimeSpec::new(0, 0), shm_dtim: PosixTimeSpec::new(0, 0), shm_ctim: PosixTimeSpec::now(), shm_cprid, shm_lprid: RawPid::new(0), // 初始值为0,表示尚未有进程对这个共享内存段执行 attach 或 detach 操作,对齐 Linux 行为 + memlock_token: None, } } - pub fn start_paddr(&self) -> PhysAddr { - self.shm_start_paddr - } - pub fn size(&self) -> usize { self.shm_size } @@ -452,39 +1083,129 @@ impl KernelShm { /// 共享内存段的映射计数(有多少个不同的VMA映射) pub fn map_count(&self) -> usize { - self.map_count + self.nattch } - pub fn copy_from(&mut self, shm_id_ds: PosixShmIdDs) { - self.kern_ipc_perm.uid = shm_id_ds.uid() as usize; - self.kern_ipc_perm.gid = shm_id_ds.gid() as usize; + pub fn nattch(&self) -> usize { + self.nattch + } + + pub fn copy_from( + &mut self, + shm_id_ds: PosixShmIdDs, + user_ns: &Arc, + ) -> Result<(), SystemError> { + let uid = 
KernIpcPerm::make_kuid(user_ns, shm_id_ds.uid())?; + let gid = KernIpcPerm::make_kgid(user_ns, shm_id_ds.gid())?; let perm_bits = ShmFlags::from_bits_truncate(shm_id_ds.mode()) & ShmFlags::PERM_MASK; + self.kern_ipc_perm.uid = uid; + self.kern_ipc_perm.gid = gid; self.kern_ipc_perm.mode.remove(ShmFlags::PERM_MASK); self.kern_ipc_perm.mode.insert(perm_bits); self.update_ctim(); + Ok(()) } pub fn set_mode(&mut self, shmflg: ShmFlags, set: bool) { + self.set_mode_no_ctime(shmflg, set); + self.update_ctim(); + } + + pub fn set_mode_no_ctime(&mut self, shmflg: ShmFlags, set: bool) { if set { self.kern_ipc_perm.mode.insert(shmflg); } else { self.kern_ipc_perm.mode.remove(shmflg); } - - self.update_ctim(); } pub fn mode(&self) -> &ShmFlags { &self.kern_ipc_perm.mode } - pub fn increase_count(&mut self) { - self.map_count += 1; + pub fn increase_count(&mut self) -> Result<(), SystemError> { + self.nattch = self.nattch.checked_add(1).ok_or(SystemError::EOVERFLOW)?; + Ok(()) } pub fn decrease_count(&mut self) { - assert!(self.map_count > 0, "map_count is zero"); - self.map_count -= 1; + assert!(self.nattch > 0, "nattch is zero"); + self.nattch -= 1; + } + + fn prepare_destroy_cleanup(&mut self) -> Option<(Arc, bool)> { + let reclassify = if self.mode().contains(ShmFlags::SHM_LOCKED) { + let reclassify = self.backing.set_locked(false); + self.set_mode_no_ctime(ShmFlags::SHM_LOCKED, false); + Some(reclassify) + } else { + None + }; + if let Some(token) = self.memlock_token.take() { + token.release(); + } + reclassify + } +} + +pub struct KernelShmDestroy { + shm: Option, +} + +impl fmt::Debug for KernelShmDestroy { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("KernelShmDestroy") + .field("pending", &self.shm.is_some()) + .finish() + } +} + +impl KernelShmDestroy { + fn new(shm: KernelShm) -> Self { + Self { shm: Some(shm) } + } + + pub fn finish(mut self) { + let Some(mut shm) = self.shm.take() else { + return; + }; + let reclassify = 
shm.prepare_destroy_cleanup(); + drop(shm); + if let Some((page_cache, old_mapping_unevictable)) = reclassify { + page_cache.reclassify_unevictable_pages(old_mapping_unevictable); + } + } + + pub fn finish_or_log(self, context: &str) { + let _ = context; + self.finish(); + } +} + +impl Drop for KernelShmDestroy { + fn drop(&mut self) { + if self.shm.is_some() { + log::error!("KernelShmDestroy dropped without explicit finish()"); + debug_assert!(false, "KernelShmDestroy dropped without explicit finish()"); + } + } +} + +impl Drop for KernelShm { + fn drop(&mut self) { + if let Some(token) = self.memlock_token.take() { + log::error!( + "KernelShm dropped with unreleased SysV SHM_LOCK memlock token; releasing token" + ); + token.release(); + } + if self.mode().contains(ShmFlags::SHM_LOCKED) { + log::error!("KernelShm dropped while SHM_LOCKED; explicit destroy cleanup was skipped"); + debug_assert!( + false, + "KernelShm dropped while SHM_LOCKED; explicit destroy cleanup was skipped" + ); + } } } @@ -495,14 +1216,14 @@ pub struct KernIpcPerm { id: ShmId, /// 共享内存段键值,由创建共享内存用户指定 key: ShmKey, - /// 共享内存段拥有者用户id - uid: usize, - /// 共享内存段拥有者所在组id - gid: usize, - /// 共享内存段创建者用户id - cuid: usize, - /// 共享内存段创建者所在组id - cgid: usize, + /// 共享内存段拥有者用户id(kernel-global uid) + uid: Kuid, + /// 共享内存段拥有者所在组id(kernel-global gid) + gid: Kgid, + /// 共享内存段创建者用户id(kernel-global uid) + cuid: Kuid, + /// 共享内存段创建者所在组id(kernel-global gid) + cgid: Kgid, /// 共享内存段权限模式 mode: ShmFlags, /// 序列号:用于在 ShmId 被重用的时候进行区分 @@ -511,18 +1232,71 @@ pub struct KernIpcPerm { } impl KernIpcPerm { - pub fn new_with_cred(id: ShmId, key: ShmKey, cred: Arc, mode: ShmFlags) -> Self { + pub fn new_with_cred( + id: ShmId, + key: ShmKey, + cred: Arc, + mode: ShmFlags, + seq: usize, + ) -> Self { KernIpcPerm { id, key, - uid: cred.uid.data(), - gid: cred.gid.data(), - cuid: cred.uid.data(), - cgid: cred.gid.data(), + uid: cred.euid, + gid: cred.egid, + cuid: cred.euid, + cgid: cred.egid, mode, - seq: 0, + seq, } } + + fn 
make_kuid(user_ns: &Arc, uid: u32) -> Result { + let inner = user_ns.inner.lock(); + map_id_down(&inner.uid_map, uid) + .map(|uid| Kuid::new(uid as usize)) + .ok_or(SystemError::EINVAL) + } + + fn make_kgid(user_ns: &Arc, gid: u32) -> Result { + let inner = user_ns.inner.lock(); + map_id_down(&inner.gid_map, gid) + .map(|gid| Kgid::new(gid as usize)) + .ok_or(SystemError::EINVAL) + } + + fn kuid_to_user(user_ns: &Arc, kuid: Kuid) -> u32 { + let Ok(uid) = u32::try_from(kuid.data()) else { + return DEFAULT_OVERFLOW_ID; + }; + let inner = user_ns.inner.lock(); + map_id_up(&inner.uid_map, uid).unwrap_or(DEFAULT_OVERFLOW_ID) + } + + fn kgid_to_user(user_ns: &Arc, kgid: Kgid) -> u32 { + let Ok(gid) = u32::try_from(kgid.data()) else { + return DEFAULT_OVERFLOW_ID; + }; + let inner = user_ns.inner.lock(); + map_id_up(&inner.gid_map, gid).unwrap_or(DEFAULT_OVERFLOW_ID) + } + + fn to_posix(&self, user_ns: &Arc) -> Result { + let key = self.key.data() as u32 as i32; + + Ok(PosixIpcPerm { + key, + uid: Self::kuid_to_user(user_ns, self.uid), + gid: Self::kgid_to_user(user_ns, self.gid), + cuid: Self::kuid_to_user(user_ns, self.cuid), + cgid: Self::kgid_to_user(user_ns, self.cgid), + mode: self.mode.bits(), + seq: self.seq.to_i32().ok_or(SystemError::EOVERFLOW)?, + _pad1: 0, + _unused1: 0, + _unused2: 0, + }) + } } /// 共享内存元信息,符合POSIX标准 @@ -578,6 +1352,8 @@ impl PosixShmMetaInfo { pub struct PosixShmInfo { /// 已使用id数 used_ids: i32, + /// 显式填充,避免 copy_to_user 时泄漏 repr(C) 隐式 padding。 + _pad0: i32, /// 共享内存总量(pages) shm_tot: usize, /// 保留在内存中的共享内存大小 @@ -601,6 +1377,7 @@ impl PosixShmInfo { ) -> Self { PosixShmInfo { used_ids, + _pad0: 0, shm_tot, shm_rss, shm_swp, @@ -670,28 +1447,3 @@ pub struct PosixIpcPerm { _unused1: usize, _unused2: usize, } - -impl TryFrom<&KernIpcPerm> for PosixIpcPerm { - type Error = SystemError; - - fn try_from(kern_ipc_perm: &KernIpcPerm) -> Result { - let key = kern_ipc_perm - .key - .data() - .to_i32() - .ok_or(SystemError::EOVERFLOW)?; - - 
Ok(PosixIpcPerm { - key, - uid: kern_ipc_perm.uid as u32, - gid: kern_ipc_perm.gid as u32, - cuid: kern_ipc_perm.cuid as u32, - cgid: kern_ipc_perm.cgid as u32, - mode: kern_ipc_perm.mode.bits(), - seq: kern_ipc_perm.seq as i32, - _pad1: 0, - _unused1: 0, - _unused2: 0, - }) - } -} diff --git a/kernel/src/ipc/syscall/sys_shmat.rs b/kernel/src/ipc/syscall/sys_shmat.rs index 6d0397fc85..b77b58c5d7 100644 --- a/kernel/src/ipc/syscall/sys_shmat.rs +++ b/kernel/src/ipc/syscall/sys_shmat.rs @@ -3,19 +3,14 @@ use crate::arch::interrupt::TrapFrame; use crate::syscall::table::FormattedSyscallParam; use crate::{ arch::syscall::nr::SYS_SHMAT, - arch::MMArch, ipc::shm::{ShmFlags, ShmId}, - libs::align::page_align_up, mm::{ - allocator::page_frame::{PageFrameCount, PhysPageFrame, VirtPageFrame}, - mmu_gather::MmuGather, - page::{page_manager_lock, DeferredFlusher, EntryFlags}, - syscall::ProtFlags, - ucontext::{AddressSpace, PhysmapParams, VMA}, - VirtAddr, VirtRegion, VmFlags, + syscall::{MapFlags, ProtFlags}, + ucontext::{AddressSpace, FileMappingWithFileArgs}, + MemoryManagementArch, VirtAddr, }, process::ProcessManager, - syscall::{table::Syscall, user_access::UserBufferReader}, + syscall::table::Syscall, }; use syscall_table_macros::declare_syscall; use system_error::SystemError; @@ -38,127 +33,101 @@ pub(super) fn do_kernel_shmat( vaddr: VirtAddr, shmflg: ShmFlags, ) -> Result { + let user_supplied_addr = vaddr.data() != 0; + let mut addr = vaddr; + let shmlba = crate::arch::MMArch::SHMLBA; + + if user_supplied_addr { + if !addr.check_aligned(shmlba) { + if shmflg.contains(ShmFlags::SHM_RND) { + addr = VirtAddr::new(addr.data() & !(shmlba - 1)); + if addr.data() == 0 && shmflg.contains(ShmFlags::SHM_REMAP) { + return Err(SystemError::EINVAL); + } + } else { + return Err(SystemError::EINVAL); + } + } + } else if shmflg.contains(ShmFlags::SHM_REMAP) { + return Err(SystemError::EINVAL); + } + let ipcns = ProcessManager::current_ipcns(); let current_address_space = 
AddressSpace::current()?; - let size = { + let attach_guard = { let mut shm_manager_guard = ipcns.shm.lock(); - let kernel_shm = shm_manager_guard.get_mut(&id).ok_or(SystemError::EINVAL)?; - page_align_up(kernel_shm.size()) - }; - - let mut address_write_guard = if vaddr.data() == 0 { - current_address_space.write() - } else { - current_address_space.write_guard_no_reservation_conflict(VirtRegion::new(vaddr, size)) + shm_manager_guard.attach_begin( + ipcns.clone(), + id, + shmflg.contains(ShmFlags::SHM_RDONLY), + shmflg.contains(ShmFlags::SHM_EXEC), + )? }; - - let mut shm_manager_guard = ipcns.shm.lock(); - let kernel_shm = shm_manager_guard.get_mut(&id).ok_or(SystemError::EINVAL)?; - let mut phys = PhysPageFrame::new(kernel_shm.start_paddr()); - let count = PageFrameCount::from_bytes(size).unwrap(); - let r = match vaddr.data() { - // 找到空闲区域并映射到共享内存 - 0 => { - // 找到空闲区域 - let region = address_write_guard - .mappings - .find_free(vaddr, size) - .ok_or(SystemError::EINVAL)?; - let vm_flags = VmFlags::from(shmflg); - let destination = VirtPageFrame::new(region.start()); - let page_flags: EntryFlags = - EntryFlags::from_prot_flags(ProtFlags::from(vm_flags), true); - // New region mapping: no prior PTE, no TLB shootdown needed; - // use DeferredFlusher to silently consume internal PageFlush tokens. - let flusher = DeferredFlusher::new(); - - // 将共享内存映射到对应虚拟区域 - let params = PhysmapParams { - phys, - destination, - count, - vm_flags, - flags: page_flags, - shm_id: Some(id), - }; - let vma = VMA::physmap(params, &mut address_write_guard.user_mapper.utable, flusher)?; - - // 将VMA加入到当前进程的VMA列表中 - address_write_guard.mappings.insert_vma(vma); - - region.start().data() + let size = attach_guard + .size() + .checked_add(crate::arch::MMArch::PAGE_SIZE - 1) + .ok_or(SystemError::EINVAL)? 
+ & !(crate::arch::MMArch::PAGE_SIZE - 1); + if user_supplied_addr { + let end = addr.data().checked_add(size).ok_or(SystemError::EINVAL)?; + if end > crate::arch::MMArch::USER_END_VADDR.data() { + return Err(SystemError::EINVAL); } - // 指定虚拟地址 - _ => { - // 获取对应vma - let vma = address_write_guard - .mappings - .contains(vaddr) - .ok_or(SystemError::EINVAL)?; - if vma.lock().region().start() != vaddr { - return Err(SystemError::EINVAL); - } - - // 验证用户虚拟内存区域是否有效 - let _ = UserBufferReader::new(vaddr.data() as *const u8, size, true)?; + } - // 必须在取消映射前获取到EntryFlags - let page_flags = address_write_guard - .user_mapper - .utable - .translate(vaddr) - .ok_or(SystemError::EINVAL)? - .1; + let readonly = shmflg.contains(ShmFlags::SHM_RDONLY); + let sysv_attach = attach_guard.create_attach(readonly)?; + let attach_file = sysv_attach.attach_file(); + let mut prot_flags = ProtFlags::PROT_READ; + if !readonly { + prot_flags |= ProtFlags::PROT_WRITE; + } + if shmflg.contains(ShmFlags::SHM_EXEC) { + prot_flags |= ProtFlags::PROT_EXEC; + } + let mut map_flags = MapFlags::MAP_SHARED; + if user_supplied_addr { + if shmflg.contains(ShmFlags::SHM_REMAP) { + map_flags |= MapFlags::MAP_FIXED; + } else { + // Linux checks the no-remap collision while holding mmap_write_lock. + // Use DragonOS' no-replace fixed mapping path so the conflict check + // and VMA insertion are performed atomically under the address-space + // write lock instead of relying on a syscall-layer pre-check. + map_flags |= MapFlags::MAP_FIXED_NOREPLACE; + } + } - // Unmap the old mapping via MmuGather: cross-core shootdown first, then free physical pages (INV-3). 
+ let mapped = current_address_space + .file_mapping_with_file_ext(FileMappingWithFileArgs { + file: attach_file, + start_vaddr: addr, + len: size, + prot_flags, + map_flags, + may_exec: true, + offset: 0, + round_to_min: !user_supplied_addr, + allocate_at_once: false, + sysv_shm: Some(sysv_attach), + fixed_noreplace_conflict_error_before_mmap_min: if user_supplied_addr + && !shmflg.contains(ShmFlags::SHM_REMAP) { - let mut tlb = MmuGather::gather(¤t_address_space); - vma.unmap(&mut address_write_guard.user_mapper.utable, &mut tlb); - tlb.finish(); - } - - // 将该虚拟内存区域映射到共享内存区域 - let mut page_manager_guard = page_manager_lock(); - let mut virt = VirtPageFrame::new(vaddr); - for _ in 0..count.data() { - let r = unsafe { - address_write_guard.user_mapper.utable.map_phys( - virt.virt_address(), - phys.phys_address(), - page_flags, - ) - } - .expect("Failed to map zero, may be OOM error"); - r.flush(); - - // 将vma加入到对应Page的anon_vma - page_manager_guard - .get_unwrap(&phys.phys_address()) - .write() - .insert_vma(vma.clone()); - - phys = phys.next(); - virt = virt.next(); + Some(SystemError::EINVAL) + } else { + None + }, + }) + .map_err(|err| { + if err == SystemError::EEXIST && map_flags.contains(MapFlags::MAP_FIXED_NOREPLACE) { + SystemError::EINVAL + } else { + err } - - // 更新vma的映射状态 - let mut vma_guard = vma.lock(); - vma_guard.set_mapped(true); - vma_guard.set_shm_id(Some(id)); - drop(vma_guard); - - vaddr.data() - } - }; - - // 更新最后一次连接时间 - kernel_shm.update_atim(); - - // 映射计数增加 - kernel_shm.increase_count(); - - Ok(r) + })?; + attach_guard.finish(); + Ok(mapped.virt_address().data()) } impl SysShmatHandle { diff --git a/kernel/src/ipc/syscall/sys_shmctl.rs b/kernel/src/ipc/syscall/sys_shmctl.rs index 44fd769e20..aeba51007a 100644 --- a/kernel/src/ipc/syscall/sys_shmctl.rs +++ b/kernel/src/ipc/syscall/sys_shmctl.rs @@ -2,9 +2,12 @@ use crate::alloc::vec::Vec; use crate::arch::interrupt::TrapFrame; use crate::{ arch::syscall::nr::SYS_SHMCTL, - 
ipc::shm::{ShmCtlCmd, ShmId}, + ipc::shm::{ + PosixShmIdDs, PosixShmInfo, PosixShmMetaInfo, ShmCtlCmd, ShmId, ShmLockBegin, ShmManager, + }, process::ProcessManager, syscall::table::{FormattedSyscallParam, Syscall}, + syscall::user_access::{UserBufferReader, UserBufferWriter}, }; use syscall_table_macros::declare_syscall; use system_error::SystemError; @@ -35,21 +38,88 @@ pub(super) fn do_kernel_shmctl( match cmd { // 查看共享内存元信息 - ShmCtlCmd::IpcInfo => shm_manager_guard.ipc_info(user_buf, from_user), + ShmCtlCmd::IpcInfo => { + let (ret, shm_meta_info) = shm_manager_guard.ipc_info_data(); + drop(shm_manager_guard); + let mut user_buffer_writer = UserBufferWriter::new( + user_buf as *mut u8, + core::mem::size_of::(), + from_user, + )?; + user_buffer_writer.copy_one_to_user(&shm_meta_info, 0)?; + Ok(ret) + } // 查看共享内存使用信息 - ShmCtlCmd::ShmInfo => shm_manager_guard.shm_info(user_buf, from_user), + ShmCtlCmd::ShmInfo => { + let (ret, shm_info) = shm_manager_guard.shm_info_data()?; + drop(shm_manager_guard); + let mut user_buffer_writer = UserBufferWriter::new( + user_buf as *mut u8, + core::mem::size_of::(), + from_user, + )?; + user_buffer_writer.copy_one_to_user(&shm_info, 0)?; + Ok(ret) + } // 查看id对应的共享内存信息 ShmCtlCmd::ShmStat | ShmCtlCmd::ShmtStatAny | ShmCtlCmd::IpcStat => { - shm_manager_guard.shm_stat(id, cmd, user_buf, from_user) + let (ret, shm_id_ds) = shm_manager_guard.shm_stat_data(id, cmd)?; + drop(shm_manager_guard); + let mut user_buffer_writer = UserBufferWriter::new( + user_buf as *mut u8, + core::mem::size_of::(), + from_user, + )?; + user_buffer_writer.copy_one_to_user(&shm_id_ds, 0)?; + Ok(ret) } // 设置KernIpcPerm - ShmCtlCmd::IpcSet => shm_manager_guard.ipc_set(id, user_buf, from_user), + ShmCtlCmd::IpcSet => { + drop(shm_manager_guard); + let user_buffer_reader = + UserBufferReader::new(user_buf, core::mem::size_of::(), from_user)?; + let mut shm_id_ds = PosixShmIdDs::default(); + user_buffer_reader.copy_one_from_user(&mut shm_id_ds, 0)?; + let ipcns 
= ProcessManager::current_ipcns(); + let mut shm_manager_guard = ipcns.shm.lock(); + shm_manager_guard.ipc_set(id, shm_id_ds) + } // 将共享内存段设置为可回收状态 - ShmCtlCmd::IpcRmid => shm_manager_guard.ipc_rmid(id), + ShmCtlCmd::IpcRmid => { + let destroy = shm_manager_guard.ipc_rmid(id)?; + drop(shm_manager_guard); + if let Some(destroy) = destroy { + destroy.finish(); + } + Ok(0) + } // 锁住共享内存段,不允许内存置换 - ShmCtlCmd::ShmLock => shm_manager_guard.shm_lock(id), + ShmCtlCmd::ShmLock => { + let begin = shm_manager_guard.shm_lock_begin(id)?; + drop(shm_manager_guard); + let reclassify = match begin { + ShmLockBegin::Done(reclassify) => reclassify, + ShmLockBegin::NeedCharge { size } => { + let token = ShmManager::charge_memlock_for_shm(size)?; + let ipcns = ProcessManager::current_ipcns(); + let mut shm_manager_guard = ipcns.shm.lock(); + shm_manager_guard.shm_lock_commit(id, token)? + } + }; + if let Some((page_cache, old_mapping_unevictable)) = reclassify { + page_cache.reclassify_unevictable_pages(old_mapping_unevictable); + } + Ok(0) + } // 解锁共享内存段,允许内存置换 - ShmCtlCmd::ShmUnlock => shm_manager_guard.shm_unlock(id), + ShmCtlCmd::ShmUnlock => { + let reclassify = shm_manager_guard.shm_unlock(id)?; + drop(shm_manager_guard); + if let Some((page_cache, old_mapping_unevictable)) = reclassify { + page_cache.reclassify_unevictable_pages(old_mapping_unevictable); + } + Ok(0) + } // 无效操作码 ShmCtlCmd::Default => Err(SystemError::EINVAL), } @@ -85,6 +155,9 @@ impl Syscall for SysShmctlHandle { } fn handle(&self, args: &[usize], frame: &mut TrapFrame) -> Result { + if args[0] > i32::MAX as usize || args[1] > i32::MAX as usize { + return Err(SystemError::EINVAL); + } let id = Self::id(args); let cmd = Self::cmd(args); let user_buf = Self::user_buf(args); diff --git a/kernel/src/ipc/syscall/sys_shmdt.rs b/kernel/src/ipc/syscall/sys_shmdt.rs index 025eb7be05..34871f7e7e 100644 --- a/kernel/src/ipc/syscall/sys_shmdt.rs +++ b/kernel/src/ipc/syscall/sys_shmdt.rs @@ -1,9 +1,8 @@ use 
crate::arch::interrupt::TrapFrame; -use crate::mm::mmu_gather::MmuGather; use crate::syscall::table::FormattedSyscallParam; use crate::{ - arch::{syscall::nr::SYS_SHMDT, MMArch}, - mm::{ucontext::AddressSpace, MemoryManagementArch, VirtAddr, VirtRegion}, + arch::syscall::nr::SYS_SHMDT, + mm::{ucontext::AddressSpace, VirtAddr}, syscall::table::Syscall, }; use alloc::vec::Vec; @@ -42,28 +41,8 @@ impl Syscall for SysShmdtHandle { fn handle(&self, args: &[usize], _frame: &mut TrapFrame) -> Result { let vaddr = Self::vaddr(args); let current_address_space = AddressSpace::current()?; - let mut address_write_guard = current_address_space - .write_guard_no_reservation_conflict(VirtRegion::new(vaddr, MMArch::PAGE_SIZE)); - - // 获取vma - let vma = address_write_guard - .mappings - .contains(vaddr) - .ok_or(SystemError::EINVAL)?; - - // 判断vaddr是否为起始地址 - if vma.lock().region().start() != vaddr { - return Err(SystemError::EINVAL); - } - - // Unmap via MmuGather: shootdown first, then free physical pages (INV-3). 
- { - let mut tlb = MmuGather::gather(¤t_address_space); - vma.unmap(&mut address_write_guard.user_mapper.utable, &mut tlb); - tlb.finish(); - } - - return Ok(0); + current_address_space.detach_sysv_shm_wait(vaddr)?; + Ok(0) } } diff --git a/kernel/src/ipc/syscall/sys_shmget.rs b/kernel/src/ipc/syscall/sys_shmget.rs index c7c5bcc6c7..f50b120ad0 100644 --- a/kernel/src/ipc/syscall/sys_shmget.rs +++ b/kernel/src/ipc/syscall/sys_shmget.rs @@ -3,7 +3,7 @@ use crate::arch::interrupt::TrapFrame; use crate::syscall::table::FormattedSyscallParam; use crate::{ arch::syscall::nr::SYS_SHMGET, - ipc::shm::{ShmFlags, ShmKey, IPC_PRIVATE}, + ipc::shm::{ShmFlags, ShmKey, ShmManager, IPC_PRIVATE}, process::ProcessManager, syscall::table::Syscall, }; @@ -34,37 +34,61 @@ pub(super) fn do_kernel_shmget( return Err(SystemError::ENOSYS); } + fn existing_segment_result( + shm_manager: &mut ShmManager, + id: crate::ipc::shm::ShmId, + size: usize, + shmflg: ShmFlags, + ) -> Result { + if shmflg.contains(ShmFlags::IPC_CREAT | ShmFlags::IPC_EXCL) { + return Err(SystemError::EEXIST); + } + + let kernel_shm = shm_manager.get_by_shmid_checked(id)?; + if size > kernel_shm.size() { + return Err(SystemError::EINVAL); + } + + shm_manager.check_existing_key_permission(id, shmflg)?; + Ok(id.data()) + } + let ipcns = ProcessManager::current_ipcns(); - let mut shm_manager_guard = ipcns.shm.lock(); match key { - IPC_PRIVATE => shm_manager_guard.add(key, size, shmflg), + IPC_PRIVATE => { + let numpages = { + let shm_manager_guard = ipcns.shm.lock(); + shm_manager_guard.validate_new_segment_size(size)? 
+ }; + let backing = ShmManager::create_default_backing(size)?; + let mut shm_manager_guard = ipcns.shm.lock(); + shm_manager_guard.add_prepared(key, size, shmflg, backing, numpages) + } _ => { - let id = shm_manager_guard.contains_key(&key); - - if let Some(id) = id { - let id = *id; - if shmflg.contains(ShmFlags::IPC_CREAT | ShmFlags::IPC_EXCL) { - // IPC_CREAT | IPC_EXCL with existing segment -> EEXIST (Linux semantics) - return Err(SystemError::EEXIST); - } + let create_numpages = { + let mut shm_manager_guard = ipcns.shm.lock(); + let id = shm_manager_guard.contains_key(&key).copied(); - let kernel_shm = shm_manager_guard.get_mut(&id).ok_or(SystemError::EINVAL)?; - - if size > kernel_shm.size() { - // request_size > existing segment size -> EINVAL (Linux semantics) - return Err(SystemError::EINVAL); + if let Some(id) = id { + return existing_segment_result(&mut shm_manager_guard, id, size, shmflg); } - return Ok(id.data()); - } else { if !shmflg.contains(ShmFlags::IPC_CREAT) { // no existing segment and no IPC_CREAT -> ENOENT (Linux semantics) return Err(SystemError::ENOENT); } - return shm_manager_guard.add(key, size, shmflg); + shm_manager_guard.validate_new_segment_size(size)? 
+ }; + + let backing = ShmManager::create_default_backing(size)?; + let mut shm_manager_guard = ipcns.shm.lock(); + if let Some(id) = shm_manager_guard.contains_key(&key).copied() { + return existing_segment_result(&mut shm_manager_guard, id, size, shmflg); } + + shm_manager_guard.add_prepared(key, size, shmflg, backing, create_numpages) } } } @@ -72,7 +96,7 @@ pub(super) fn do_kernel_shmget( impl SysShmgetHandle { #[inline(always)] fn key(args: &[usize]) -> ShmKey { - ShmKey::new(args[0]) + ShmKey::new(args[0] as u32 as usize) } #[inline(always)] diff --git a/kernel/src/mm/fault.rs b/kernel/src/mm/fault.rs index e5286cb1d1..8ee6f574a1 100644 --- a/kernel/src/mm/fault.rs +++ b/kernel/src/mm/fault.rs @@ -10,6 +10,7 @@ use system_error::SystemError; use crate::{ arch::{mm::PageMapper, MMArch}, + filesystem::page_cache::PageCachePagePin, libs::align::align_down, mm::{ page::{page_manager_lock, EntryFlags}, @@ -57,6 +58,8 @@ pub struct PageFaultMessage<'a> { backing_pgoff: Option, /// 缺页对应PageCache中的文件页 page: Option>, + /// PageCache entry pin held until the fault either installs a PTE/rmap or fails. 
+ page_pin: Option, /// 写时拷贝需要的页面 cow_page: Option>, /// 缺页所属的地址空间。 @@ -84,6 +87,7 @@ impl<'a> PageFaultMessage<'a> { flags, backing_pgoff, page: None, + page_pin: None, mapper, cow_page: None, mm, @@ -146,18 +150,14 @@ impl PageFaultHandler { fn attach_fault_mapped_page(page: &Arc, vma: &Arc, mlocked: bool) { let mut page_guard = page.write(); - page_guard.insert_vma(vma.clone()); - if mlocked { - page_guard.add_flags(PageFlags::PG_UNEVICTABLE); - } + page_guard.insert_vma(vma.clone(), mlocked); } fn detach_fault_mapped_page(page: &Arc, vma: &Arc) { let mut page_guard = page.write(); page_guard.remove_vma(vma.as_ref()); - if !InnerAddressSpace::page_should_remain_unevictable(&page_guard) { - page_guard.remove_flags(PageFlags::PG_UNEVICTABLE); - } + drop(page_guard); + InnerAddressSpace::remove_page_unevictable_if_unneeded(page); } fn file_page_cache( @@ -504,7 +504,8 @@ impl PageFaultHandler { // 将pagecache页设为脏页,以便回收时能够回写 cache_page.write().add_flags(PageFlags::PG_DIRTY); - if let PageType::File(info) = cache_page.read().page_type().clone() { + let page_type = { cache_page.read().page_type().clone() }; + if let PageType::File(info) = page_type { if let Some(page_cache) = info.page_cache.upgrade() { page_cache.mark_page_dirty(info.index); } @@ -612,7 +613,8 @@ impl PageFaultHandler { let table = mapper.get_table(address, 0).unwrap(); let i = table.index_of(address).unwrap(); old_page.write().add_flags(PageFlags::PG_DIRTY); - if let PageType::File(info) = old_page.read().page_type().clone() { + let page_type = { old_page.read().page_type().clone() }; + if let PageType::File(info) = page_type { if let Some(page_cache) = info.page_cache.upgrade() { page_cache.mark_page_dirty(info.index); } @@ -796,38 +798,44 @@ impl PageFaultHandler { let _pt_edit = mm.page_table_edit(); let mapper = &mut pfm.mapper; let mlocked = vma_guard.vm_flags().contains(VmFlags::VM_LOCKED); + let vma_flags = *vma_guard.vm_flags(); + let entry_flags = vma_guard.flags(); + let region_start = 
vma_guard.region().start; + let backing_page_offset = vma_guard + .backing_page_offset() + .expect("backing_page_offset is none"); + let is_private_file_vma = + vma_guard.vm_file().is_some() && !vma_flags.contains(VmFlags::VM_SHARED); // 起始页地址 - let addr = vma_guard.region().start - + ((start_pgoff - - vma_guard - .backing_page_offset() - .expect("backing_page_offset is none")) - << MMArch::PAGE_SHIFT); + let addr = region_start + ((start_pgoff - backing_page_offset) << MMArch::PAGE_SHIFT); + drop(vma_guard); for pgoff in start_pgoff..end_pgoff { - if let Some(page) = page_cache.manager().peek_page(pgoff) { + if let Some(page_pin) = page_cache.manager().peek_page_pinned(pgoff) { + let page = page_pin.page(); let page_guard = page.upread(); - if page_guard.flags().contains(PageFlags::PG_UPTODATE) { - let phys = page.phys_address(); - - let address = - VirtAddr::new(addr.data() + ((pgoff - start_pgoff) << MMArch::PAGE_SHIFT)); - if mapper.get_entry(address, 0).is_none() { - let mut flags = vma_guard.flags(); - let is_private_file_vma = vma_guard.vm_file().is_some() - && !vma_guard.vm_flags().contains(VmFlags::VM_SHARED); - if is_private_file_vma - || vma_guard - .vm_flags() - .contains(VmFlags::VM_SHARED | VmFlags::VM_WRITE) - { - flags = flags.set_write(false); - } - mapper.map_phys(address, phys, flags).unwrap().flush(); + if !page_guard.flags().contains(PageFlags::PG_UPTODATE) { + continue; + } + let phys = page.phys_address(); + drop(page_guard); + + let address = + VirtAddr::new(addr.data() + ((pgoff - start_pgoff) << MMArch::PAGE_SHIFT)); + if mapper.get_entry(address, 0).is_none() { + let mut flags = entry_flags; + if is_private_file_vma + || vma_flags.contains(VmFlags::VM_SHARED | VmFlags::VM_WRITE) + { + flags = flags.set_write(false); } - drop(page_guard); Self::attach_fault_mapped_page(&page, &vma, mlocked); + if let Some(flush) = mapper.map_phys(address, phys, flags) { + flush.flush(); + } else { + Self::detach_fault_mapped_page(&page, &vma); + } } } } @@ 
-871,9 +879,14 @@ impl PageFaultHandler { ret = VmFaultReason::VM_FAULT_MAJOR; } - match page_cache.manager().commit_page(backing_pgoff) { - Ok(page) => { + match page_cache.manager().commit_page_pinned(backing_pgoff) { + Ok(page_pin) => { + let page = page_pin.page(); pfm.page = Some(page); + pfm.page_pin = Some(page_pin); + } + Err(SystemError::ENOMEM) => { + return VmFaultReason::VM_FAULT_OOM; } Err(_) => { return VmFaultReason::VM_FAULT_SIGBUS; @@ -956,11 +969,14 @@ impl PageFaultHandler { } } - match page_cache.manager().commit_overwrite(backing_pgoff) { - Ok(page) => { + match page_cache.manager().commit_overwrite_pinned(backing_pgoff) { + Ok(page_pin) => { + let page = page_pin.page(); pfm.page = Some(page); + pfm.page_pin = Some(page_pin); VmFaultReason::empty() } + Err(SystemError::ENOMEM) => VmFaultReason::VM_FAULT_OOM, Err(_) => VmFaultReason::VM_FAULT_SIGBUS, } } @@ -1006,10 +1022,18 @@ impl PageFaultHandler { { map_flags = map_flags.set_write(false); } + drop(vma_guard); - mapper.map_phys(address, page_phys, map_flags); - Self::attach_fault_mapped_page(&page_to_map, &pfm.vma(), mlocked); - VmFaultReason::VM_FAULT_COMPLETED + Self::attach_fault_mapped_page(&page_to_map, &vma, mlocked); + let result = if let Some(flush) = mapper.map_phys(address, page_phys, map_flags) { + flush.flush(); + VmFaultReason::VM_FAULT_COMPLETED + } else { + Self::detach_fault_mapped_page(&page_to_map, &vma); + VmFaultReason::VM_FAULT_OOM + }; + pfm.page_pin.take(); + result } /// Map a zeroed anonymous page for /dev/zero style mappings. 
diff --git a/kernel/src/mm/madvise.rs b/kernel/src/mm/madvise.rs index b0baeb22dd..d537633eb8 100644 --- a/kernel/src/mm/madvise.rs +++ b/kernel/src/mm/madvise.rs @@ -1,86 +1,31 @@ +use crate::arch::mm::PageMapper; use system_error::SystemError; -use crate::arch::{mm::PageMapper, MMArch}; - -use super::{ - mmu_gather::MmuGather, syscall::MadvFlags, ucontext::LockedVMA, MemoryManagementArch, VirtAddr, - VmFlags, -}; +use super::{mmu_gather::MmuGather, syscall::MadvFlags, ucontext::LockedVMA, VmFlags}; impl LockedVMA { - pub fn do_madvise( + pub fn madvise_updated_flags( &self, behavior: MadvFlags, - mapper: &mut PageMapper, - tlb: &mut MmuGather<'_>, - ) -> Result<(), SystemError> { - //TODO https://code.dragonos.org.cn/xref/linux-6.6.21/mm/madvise.c?fi=madvise#do_madvise - let mut vma = self.lock(); + ) -> Result, SystemError> { + let vma = self.lock(); let mut new_flags = *vma.vm_flags(); match behavior { MadvFlags::MADV_DONTNEED | MadvFlags::MADV_DONTNEED_LOCKED => { - if behavior == MadvFlags::MADV_DONTNEED - && vma.vm_flags().contains(VmFlags::VM_LOCKED) - { - return Err(SystemError::EINVAL); - } - - // MADV_DONTNEED: 释放指定范围内的页面 - // 这是glibc在pthread_create时用来管理线程栈的关键操作 - // 参考: https://code.dragonos.org.cn/xref/linux-6.6.21/mm/madvise.c#madvise_dontneed_single_vma - - let region = *vma.region(); - drop(vma); - - // 遍历VMA覆盖的所有页面,解除映射 - let start_page = region.start(); - let end_page = region.end(); - let mut current_page = start_page; - - while current_page < end_page { - let virt_addr = VirtAddr::new(current_page.data()); - if let Some((_paddr, _)) = mapper.translate(virt_addr) { - // 只有当页面已经映射时才需要解除映射 - unsafe { - if let Some((_, _, flush)) = mapper.unmap_phys(virt_addr, false) { - // Local PTE cleared; actual TLB invalidation is performed uniformly by MmuGather. - // Note: the current implementation does not reclaim physical pages (keeping legacy - // behavior). To support real reclamation of anon pages in the future, call - // `tlb.stash_paddr(_paddr)` here. 
- flush.ignore(); - tlb.accumulate_range(virt_addr); - } - } - } - current_page = VirtAddr::new(current_page.data() + MMArch::PAGE_SIZE); - } - - return Ok(()); + debug_assert!( + false, + "MADV_DONTNEED is a range operation, not a VMA flag update" + ); + return Ok(None); } - MadvFlags::MADV_REMOVE => { - // TODO - } - - MadvFlags::MADV_WILLNEED => { - // TODO - } - - MadvFlags::MADV_COLD => { - // TODO - } - - MadvFlags::MADV_PAGEOUT => { - // TODO - } - - MadvFlags::MADV_FREE => { - // TODO - } - - MadvFlags::MADV_POPULATE_READ | MadvFlags::MADV_POPULATE_WRITE => { - // TODO - } + MadvFlags::MADV_REMOVE + | MadvFlags::MADV_WILLNEED + | MadvFlags::MADV_COLD + | MadvFlags::MADV_PAGEOUT + | MadvFlags::MADV_FREE + | MadvFlags::MADV_POPULATE_READ + | MadvFlags::MADV_POPULATE_WRITE => {} MadvFlags::MADV_NORMAL => { new_flags = new_flags & !VmFlags::VM_RAND_READ & !VmFlags::VM_SEQ_READ @@ -97,6 +42,10 @@ impl LockedVMA { MadvFlags::MADV_DOFORK => { if vma.vm_flags().contains(VmFlags::VM_IO) { + debug_assert!( + false, + "MADV_DOFORK on VM_IO must be rejected before VMA split" + ); return Err(SystemError::EINVAL); } new_flags &= !VmFlags::VM_DONTCOPY; @@ -104,6 +53,9 @@ impl LockedVMA { MadvFlags::MADV_WIPEONFORK => { //MADV_WIPEONFORK仅支持匿名映射,后续实现其他映射方式后要在此处添加判断条件 + if vma.vm_file().is_some() || vma.vm_flags().contains(VmFlags::VM_SHARED) { + return Err(SystemError::EINVAL); + } new_flags |= VmFlags::VM_WIPEONFORK; } @@ -112,7 +64,15 @@ impl LockedVMA { MadvFlags::MADV_DONTDUMP => new_flags |= VmFlags::VM_DONTDUMP, //MADV_DODUMP不支持巨页映射,后续需要添加判断条件 - MadvFlags::MADV_DODUMP => new_flags &= !VmFlags::VM_DONTDUMP, + MadvFlags::MADV_DODUMP => { + let special_flags = VmFlags::VM_IO | VmFlags::VM_PFNMAP | VmFlags::VM_DONTEXPAND; + if !vma.vm_flags().contains(VmFlags::VM_HUGETLB) + && vma.vm_flags().intersects(special_flags) + { + return Err(SystemError::EINVAL); + } + new_flags &= !VmFlags::VM_DONTDUMP; + } MadvFlags::MADV_MERGEABLE | MadvFlags::MADV_UNMERGEABLE => {} @@ -121,7 
+81,19 @@ impl LockedVMA { MadvFlags::MADV_COLLAPSE => {} _ => {} } - vma.set_vm_flags(new_flags); - Ok(()) + Ok(Some(new_flags)) + } + + pub fn do_madvise( + &self, + behavior: MadvFlags, + _mapper: &mut PageMapper, + _tlb: &mut MmuGather<'_>, + ) { + //TODO https://code.dragonos.org.cn/xref/linux-6.6.21/mm/madvise.c?fi=madvise#do_madvise + let Ok(Some(new_flags)) = self.madvise_updated_flags(behavior) else { + return; + }; + self.lock().set_vm_flags(new_flags); } } diff --git a/kernel/src/mm/mod.rs b/kernel/src/mm/mod.rs index aa5bb6cb6e..bbff4f1bde 100644 --- a/kernel/src/mm/mod.rs +++ b/kernel/src/mm/mod.rs @@ -514,6 +514,11 @@ pub trait MemoryManagementArch: Clone + Copy + Debug { /// 每个页面的大小 const PAGE_SIZE: usize = 1 << Self::PAGE_SHIFT; + /// SysV SHM attach ABI alignment. + /// + /// Linux exposes this as SHMLBA. It equals PAGE_SIZE on x86_64, but it is + /// intentionally modeled as a separate ABI constant. + const SHMLBA: usize = Self::PAGE_SIZE; /// 通过这个mask,获取地址的页内偏移量 const PAGE_OFFSET_MASK: usize = Self::PAGE_SIZE - 1; /// 通过这个mask,获取页的首地址 diff --git a/kernel/src/mm/page.rs b/kernel/src/mm/page.rs index 2bd0953663..d371965bbf 100644 --- a/kernel/src/mm/page.rs +++ b/kernel/src/mm/page.rs @@ -228,7 +228,9 @@ impl PageManager { let page = Page::copy(old_page.read(), paddr) .inspect_err(|_| unsafe { allocator.free_one(paddr) })?; if let Some(page_type) = page_type { - page.write().set_page_type(page_type); + let mut guard = page.write(); + guard.set_page_type(page_type); + guard.clear_mapping_unevictable_source_for_cow(); } self.insert(&page)?; @@ -357,6 +359,10 @@ impl PageReclaimer { for page in victims { let mut guard = page.write(); if let PageType::File(info) = guard.page_type().clone() { + if guard.flags().contains(PageFlags::PG_UNEVICTABLE) { + continue; + } + // Never evict a file-backed page that is still mapped into any VMA. 
// Our eviction path removes the page from page_cache/page_manager; dropping a // still-mapped page will trip InnerPage::drop assertions and can crash userland. @@ -383,11 +389,17 @@ impl PageReclaimer { let _ = page_cache.manager().writeback_page(page_index); } else { let mut guard = page.write(); - Self::page_writeback(&mut guard, true); + guard.remove_flags(PageFlags::PG_DIRTY | PageFlags::PG_WRITEBACK); + drop(guard); + page_manager_lock().remove_page(&paddr); + continue; } guard = page.write(); - if guard.flags().contains(PageFlags::PG_DIRTY) { + if guard.flags().intersects( + PageFlags::PG_DIRTY | PageFlags::PG_WRITEBACK | PageFlags::PG_UNEVICTABLE, + ) || guard.map_count() != 0 + { drop(guard); page_reclaimer_lock().insert_page(paddr, &page); continue; @@ -398,13 +410,22 @@ impl PageReclaimer { // // FileMapInfo 内保存 Weak 以避免 PageCache <-> Page 的强引用环。 // 如果此时 PageCache 已被释放(upgrade 失败),说明其 pages 映射也已销毁,无需再 remove。 - if let Some(page_cache) = info.page_cache.upgrade() { - if !page_cache.is_page_ready(page_index) { - drop(guard); + let page_cache = info.page_cache.upgrade(); + drop(guard); + + if let Some(page_cache) = page_cache { + if !page_cache.manager().supports_clean_reclaim() { + continue; + } + let removed = page_cache + .manager() + .remove_clean_page_for_reclaim(page_index, &page) + .ok() + .flatten(); + if removed.is_none() { page_reclaimer_lock().insert_page(paddr, &page); continue; } - let _ = page_cache.manager().remove_page(page_index); } page_manager_lock().remove_page(&paddr); } @@ -463,6 +484,10 @@ impl PageReclaimer { for vma in unmapped_vmas { guard.remove_vma(vma.as_ref()); } + if guard.flags().contains(PageFlags::PG_UNEVICTABLE) && !guard.has_unevictable_source() + { + guard.remove_flags(PageFlags::PG_UNEVICTABLE); + } } let page_start = page_index * MMArch::PAGE_SIZE; @@ -671,6 +696,19 @@ impl Page { pub struct InnerPage { /// 映射到当前page的VMA vma_set: HashSet>, + /// 当前对该页贡献 mlock/unevictable 原因的 VMA 子集。 + /// + /// 该集合必须始终是 `vma_set` 
的子集。它让 page-cache reclassify + /// 可以 O(1) 判断页是否仍因 VMA mlock 不可回收,而不需要反向锁住所有 VMA。 + mlocked_vmas: HashSet>, + /// 内核子系统显式创建的 non-page-cache unevictable 页。 + intrinsic_unevictable: bool, + /// Backing object lifetime pins. + /// + /// 这只表达“页面仍被 backing 对象拥有,不能从 page_manager 释放”,不表达 + /// reclaim policy,也不应设置 `PG_UNEVICTABLE`。shared-anon backing 使用它 + /// 替代旧的 `PG_UNEVICTABLE` lifetime workaround。 + backing_lifetime_pins: usize, /// 标志 flags: PageFlags, /// 页面所在物理地址 @@ -681,8 +719,13 @@ pub struct InnerPage { impl InnerPage { pub fn new(phys_addr: PhysAddr, page_type: PageType, flags: PageFlags) -> Self { + let intrinsic_unevictable = + flags.contains(PageFlags::PG_UNEVICTABLE) && !matches!(page_type, PageType::File(_)); Self { vma_set: HashSet::new(), + mlocked_vmas: HashSet::new(), + intrinsic_unevictable, + backing_lifetime_pins: 0, flags, phys_addr, page_type, @@ -690,18 +733,29 @@ impl InnerPage { } /// 将vma加入anon_vma - pub fn insert_vma(&mut self, vma: Arc) { + pub fn insert_vma(&mut self, vma: Arc, vma_mlocked: bool) { let was_mapped = self.map_count() > 0; - self.vma_set.insert(vma); + let inserted = self.vma_set.insert(vma.clone()); + if inserted && vma_mlocked { + self.mlocked_vmas.insert(vma); + self.flags.insert(PageFlags::PG_UNEVICTABLE); + } if !was_mapped && matches!(self.page_type, PageType::File(_)) { pc_stats::inc_file_mapped(); } } /// 将vma从anon_vma中删去 + /// + /// This only removes the VMA from the page-level source sets. Callers that + /// may have removed the last unevictable source must subsequently run + /// `InnerAddressSpace::remove_page_unevictable_if_unneeded`. 
pub fn remove_vma(&mut self, vma: &LockedVMA) { let was_mapped = self.map_count() > 0; let removed = self.vma_set.remove(vma); + if removed { + self.mlocked_vmas.remove(vma); + } if removed && was_mapped && self.map_count() == 0 @@ -713,13 +767,78 @@ impl InnerPage { /// 判断当前物理页是否能被回 pub fn can_deallocate(&self) -> bool { - self.map_count() == 0 && !self.flags.contains(PageFlags::PG_UNEVICTABLE) + self.map_count() == 0 + && !self.flags.contains(PageFlags::PG_UNEVICTABLE) + && self.backing_lifetime_pins == 0 + } + + /// Whether a VMA unmap may release this page from the global page manager. + /// + /// Page-cache backed pages are owned by their address_space/PageCache. VMA + /// teardown only removes PTE/rmap state; the page cache eviction path is the + /// authority that unlinks and drops those pages from page_manager. + pub fn can_deallocate_after_vma_unmap(&self) -> bool { + self.can_deallocate() && !matches!(self.page_type, PageType::File(_)) } pub fn shared(&self) -> bool { self.map_count() > 1 } + pub fn add_mlocked_vma_ref(&mut self, vma: &Arc) { + if self.vma_set.contains(vma) && self.mlocked_vmas.insert(vma.clone()) { + self.flags.insert(PageFlags::PG_UNEVICTABLE); + } + } + + pub fn remove_mlocked_vma_ref(&mut self, vma: &Arc) { + self.mlocked_vmas.remove(vma); + } + + pub fn clear_unlinked_file_mapping_unevictable(&mut self) { + if matches!(self.page_type, PageType::File(_)) + && self.map_count() == 0 + && !self.has_mlocked_vma_refs() + { + self.flags.remove(PageFlags::PG_UNEVICTABLE); + } + } + + #[inline(always)] + pub fn has_mlocked_vma_refs(&self) -> bool { + !self.mlocked_vmas.is_empty() + } + + pub fn has_unevictable_source(&self) -> bool { + self.intrinsic_unevictable + || self.has_mlocked_vma_refs() + || self + .page_cache() + .map(|page_cache| page_cache.mapping_unevictable()) + .unwrap_or(false) + } + + pub fn clear_mapping_unevictable_source_for_cow(&mut self) { + self.intrinsic_unevictable = false; + if !self.has_unevictable_source() { + 
self.flags.remove(PageFlags::PG_UNEVICTABLE); + } + } + + pub fn add_backing_lifetime_pin(&mut self) { + self.backing_lifetime_pins = self + .backing_lifetime_pins + .checked_add(1) + .expect("backing lifetime pin count overflow"); + } + + pub fn remove_backing_lifetime_pin(&mut self) { + self.backing_lifetime_pins = self + .backing_lifetime_pins + .checked_sub(1) + .expect("backing lifetime pin count underflow"); + } + pub fn page_cache(&self) -> Option> { match &self.page_type { PageType::File(info) => info.page_cache.upgrade(), @@ -1587,7 +1706,11 @@ impl PageMapper { .ok()?; drop(page_manager_guard); let phys = page.phys_address(); - return self.map_phys(virt, phys, flags); + let mapped = self.map_phys(virt, phys, flags); + if mapped.is_none() { + let _ = page_manager_lock().remove_page(&phys); + } + return mapped; } /// 映射一个物理页到指定的虚拟地址 diff --git a/kernel/src/mm/syscall/sys_madvise.rs b/kernel/src/mm/syscall/sys_madvise.rs index 526e9f9c29..c4809fef2b 100644 --- a/kernel/src/mm/syscall/sys_madvise.rs +++ b/kernel/src/mm/syscall/sys_madvise.rs @@ -70,9 +70,7 @@ impl Syscall for SysMadviseHandle { let start_frame = VirtPageFrame::new(start_vaddr); let page_count = PageFrameCount::new(aligned_len / MMArch::PAGE_SIZE); - current_address_space - .madvise_wait(start_frame, page_count, madv_flags) - .map_err(|_| SystemError::EINVAL)?; + current_address_space.madvise_wait(start_frame, page_count, madv_flags)?; return Ok(0); } diff --git a/kernel/src/mm/syscall/sys_mmap.rs b/kernel/src/mm/syscall/sys_mmap.rs index 7e95aedbd3..3c44b1872f 100644 --- a/kernel/src/mm/syscall/sys_mmap.rs +++ b/kernel/src/mm/syscall/sys_mmap.rs @@ -4,7 +4,7 @@ use super::ProtFlags; use crate::arch::{interrupt::TrapFrame, syscall::nr::SYS_MMAP, MMArch}; use crate::mm::syscall::page_align_up; use crate::mm::syscall::MapFlags; -use crate::mm::ucontext::DEFAULT_MMAP_MIN_ADDR; +use crate::mm::ucontext::{check_mmap_min_addr, DEFAULT_MMAP_MIN_ADDR}; use crate::mm::AddressSpace; use 
crate::mm::VirtAddr; use crate::mm::{access_ok, MemoryManagementArch}; @@ -12,7 +12,6 @@ use crate::syscall::table::{FormattedSyscallParam, Syscall}; use log::error; use system_error::SystemError; -use crate::process::{resource::RLimitID, ProcessManager}; use alloc::vec::Vec; /// Handler for the mmap system call, which maps files or devices into memory. @@ -97,24 +96,6 @@ impl Syscall for SysMmapHandle { return Err(SystemError::EINVAL); } - // RLIMIT_AS 检查(粗略:累计 VMA 大小) - let rlim_as = ProcessManager::current_pcb() - .get_rlimit(RLimitID::As) - .rlim_cur as usize; - if rlim_as != usize::MAX { - let vm = AddressSpace::current()?; - let usage = vm.read().vma_usage_bytes(); - // Allow a small one-page slack to mirror Linux rounding behaviour and - // avoid spuriously rejecting near-limit mappings. - let allowance = MMArch::PAGE_SIZE; - if usage - .checked_add(len) - .is_none_or(|v| v > rlim_as.saturating_add(allowance)) - { - return Err(SystemError::ENOMEM); - } - } - // MAP_FIXED 需页对齐 if map_flags.contains(MapFlags::MAP_FIXED) && !start_vaddr.check_aligned(::PAGE_SIZE) @@ -125,11 +106,7 @@ impl Syscall for SysMmapHandle { if start_vaddr < VirtAddr::new(DEFAULT_MMAP_MIN_ADDR) && map_flags.contains(MapFlags::MAP_FIXED) { - error!( - "mmap: MAP_FIXED is not supported for address below {}", - DEFAULT_MMAP_MIN_ADDR - ); - return Err(SystemError::EINVAL); + check_mmap_min_addr(start_vaddr, VirtAddr::new(DEFAULT_MMAP_MIN_ADDR))?; } // 暂时不支持巨页映射 diff --git a/kernel/src/mm/syscall/sys_mprotect.rs b/kernel/src/mm/syscall/sys_mprotect.rs index c961b845bd..784dc153ea 100644 --- a/kernel/src/mm/syscall/sys_mprotect.rs +++ b/kernel/src/mm/syscall/sys_mprotect.rs @@ -38,7 +38,7 @@ impl Syscall for SysMprotectHandle { return Err(SystemError::EINVAL); } if len == 0 { - return Err(SystemError::EINVAL); + return Ok(0); } // 将长度向上对齐,同时检测溢出;超大长度视为 ENOMEM let len_aligned = page_align_up(len); @@ -61,9 +61,7 @@ impl Syscall for SysMprotectHandle { let start_frame = 
VirtPageFrame::new(start_vaddr); let page_count = PageFrameCount::from_bytes(len_aligned).unwrap(); - current_address_space - .mprotect_wait(start_frame, page_count, prot_flags) - .map_err(|_| SystemError::EINVAL)?; + current_address_space.mprotect_wait(start_frame, page_count, prot_flags)?; return Ok(0); } diff --git a/kernel/src/mm/syscall/sys_mremap.rs b/kernel/src/mm/syscall/sys_mremap.rs index f3906c8dc0..8c80d53fd2 100644 --- a/kernel/src/mm/syscall/sys_mremap.rs +++ b/kernel/src/mm/syscall/sys_mremap.rs @@ -1,7 +1,6 @@ //! System call handler for the mremap system call. use crate::arch::{interrupt::TrapFrame, syscall::nr::SYS_MREMAP}; -use crate::mm::syscall::page_align_up; use crate::mm::syscall::sys_munmap::do_munmap; use crate::mm::syscall::MremapFlags; use crate::mm::ucontext::AddressSpace; @@ -38,13 +37,20 @@ impl Syscall for SysMremapHandle { let old_vaddr = VirtAddr::new(Self::old_vaddr(args)); let old_len = Self::old_len(args); let new_len = Self::new_len(args); - let mremap_flags = MremapFlags::from_bits_truncate(Self::mremap_flags(args) as u8); + let mremap_flags_raw = Self::mremap_flags(args); + let allowed_mremap_flags = (MremapFlags::MREMAP_MAYMOVE + | MremapFlags::MREMAP_FIXED + | MremapFlags::MREMAP_DONTUNMAP) + .bits() as usize; + if mremap_flags_raw & !allowed_mremap_flags != 0 { + return Err(SystemError::EINVAL); + } + let mremap_flags = MremapFlags::from_bits(mremap_flags_raw as u8).unwrap(); let new_vaddr = VirtAddr::new(Self::new_vaddr(args)); // 需要重映射到新内存区域的情况下,必须包含MREMAP_MAYMOVE并且指定新地址 if mremap_flags.contains(MremapFlags::MREMAP_FIXED) - && (!mremap_flags.contains(MremapFlags::MREMAP_MAYMOVE) - || new_vaddr == VirtAddr::new(0)) + && !mremap_flags.contains(MremapFlags::MREMAP_MAYMOVE) { return Err(SystemError::EINVAL); } @@ -55,15 +61,22 @@ impl Syscall for SysMremapHandle { { return Err(SystemError::EINVAL); } + if mremap_flags.contains(MremapFlags::MREMAP_DONTUNMAP) + && !new_vaddr.check_aligned(MMArch::PAGE_SIZE) + { + return 
Err(SystemError::EINVAL); + } // 旧内存地址必须对齐 if !old_vaddr.check_aligned(MMArch::PAGE_SIZE) { return Err(SystemError::EINVAL); } - // 将old_len、new_len 对齐页面大小 - let old_len = page_align_up(old_len); - let new_len = page_align_up(new_len); + // Linux PAGE_ALIGN 使用 unsigned wrap 语义;极大长度可能 wrap 到 0, + // 并继续进入 mremap 的 legacy old_len==0 duplicate 分支。这里显式 + // wrapping,避免 Rust debug overflow panic,同时保持 Linux 兼容。 + let old_len = wrapping_page_align_up(old_len); + let new_len = wrapping_page_align_up(new_len); // 不允许重映射内存区域大小为0 if new_len == 0 { @@ -86,7 +99,7 @@ impl Syscall for SysMremapHandle { current_address_space.wait_for_no_reservation_conflict(probe_region); continue; } - return Err(SystemError::EINVAL); + return Err(SystemError::EFAULT); }; let (vm_flags, vma_region) = { let g = vma.lock(); @@ -99,15 +112,16 @@ impl Syscall for SysMremapHandle { // - For expansion, the check is against old_len (not new_len), otherwise all fixed // expansions would spuriously fail. if mremap_flags.contains(MremapFlags::MREMAP_FIXED) { + validate_fixed_target(old_vaddr, old_len, new_vaddr, new_len)?; let span_len = if old_len > new_len { new_len } else { old_len }; let span_end = old_vaddr .data() .checked_add(span_len) .ok_or(SystemError::EINVAL)?; if span_end > vma_region.end().data() { - // Side effects like Linux mremap_to(): - // - unmap destination range first - // - if shrinking, unmap the tail [old+new_len, old+old_len) + // Match Linux mremap_to() ordering: MREMAP_FIXED unmaps the + // destination and, when shrinking, the source tail before the + // resized source span is rejected by vma_to_resize(). do_munmap(new_vaddr, new_len)?; if old_len > new_len { do_munmap(old_vaddr + new_len, old_len - new_len)?; @@ -148,13 +162,6 @@ impl Syscall for SysMremapHandle { vm_flags, )?; - // Unmap the old mapping only if this was a move (i.e. result differs from old_vaddr). - // - old_len==0 is a special duplication request and must never unmap the source. 
- // - DONTUNMAP keeps the source by definition. - if !mremap_flags.contains(MremapFlags::MREMAP_DONTUNMAP) && old_len != 0 && r != old_vaddr { - do_munmap(old_vaddr, old_len)?; - } - return Ok(r.data()); } @@ -193,4 +200,33 @@ impl SysMremapHandle { } } +fn wrapping_page_align_up(len: usize) -> usize { + let mask = MMArch::PAGE_SIZE - 1; + len.wrapping_add(mask) & !mask +} + +fn validate_fixed_target( + old_vaddr: VirtAddr, + old_len: usize, + new_vaddr: VirtAddr, + new_len: usize, +) -> Result<(), SystemError> { + if !new_vaddr.check_aligned(MMArch::PAGE_SIZE) { + return Err(SystemError::EINVAL); + } + let old_end = old_vaddr.data().wrapping_add(old_len); + let new_end = new_vaddr + .data() + .checked_add(new_len) + .ok_or(SystemError::EINVAL)?; + if new_end > MMArch::USER_END_VADDR.data() { + return Err(SystemError::EINVAL); + } + if old_end > new_vaddr.data() && new_end > old_vaddr.data() { + return Err(SystemError::EINVAL); + } + + Ok(()) +} + syscall_table_macros::declare_syscall!(SYS_MREMAP, SysMremapHandle); diff --git a/kernel/src/mm/syscall/sys_msync.rs b/kernel/src/mm/syscall/sys_msync.rs index ff4ff10b05..7bf97d6294 100644 --- a/kernel/src/mm/syscall/sys_msync.rs +++ b/kernel/src/mm/syscall/sys_msync.rs @@ -30,7 +30,7 @@ impl Syscall for SysMsyncHandle { fn handle(&self, args: &[usize], _frame: &mut TrapFrame) -> Result { let start = VirtAddr::new(Self::start_vaddr(args)); let mut len = Self::len(args); - let flags = MsFlags::from_bits_truncate(Self::flags(args)); + let flags = MsFlags::from_bits(Self::flags(args)).ok_or(SystemError::EINVAL)?; // 检查 start 地址是否页对齐 if !start.check_aligned(MMArch::PAGE_SIZE) { diff --git a/kernel/src/mm/ucontext.rs b/kernel/src/mm/ucontext.rs index d81d22e628..5dcdec8b8c 100644 --- a/kernel/src/mm/ucontext.rs +++ b/kernel/src/mm/ucontext.rs @@ -17,7 +17,7 @@ use defer::defer; use hashbrown::HashMap; use hashbrown::HashSet; use ida::IdAllocator; -use log::warn; +use log::{error, warn}; use system_error::SystemError; use 
crate::{ @@ -30,7 +30,7 @@ use crate::{ FileType, InodeId, }, }, - ipc::shm::{ShmFlags, ShmId}, + ipc::shm::SysVShmAttach, libs::{ align::page_align_up, cpumask::CpuMask, @@ -39,8 +39,16 @@ use crate::{ spinlock::SpinLock, wait_queue::WaitQueue, }, - mm::{mmu_gather::MmuGather, page::page_manager_lock, PhysAddr}, - process::{cred::CAPFlags, resource::RLimitID, ProcessManager}, + mm::{ + mmu_gather::MmuGather, + page::{page_manager_lock, page_reclaimer_lock}, + PhysAddr, + }, + process::{ + cred::{capable, CAPFlags}, + resource::RLimitID, + ProcessManager, + }, }; use super::{ @@ -67,6 +75,18 @@ use crate::arch::mm::LockedFrameAllocator; // protection by setting the value to 0. pub const DEFAULT_MMAP_MIN_ADDR: usize = 65536; +/// Linux `security_mmap_addr()`/`cap_mmap_addr()` semantics for low fixed mappings. +/// +/// Mapping below `mmap_min_addr` is denied with `EPERM` unless the caller has +/// `CAP_SYS_RAWIO` in the initial user namespace. Non-fixed hints are rounded +/// by the caller and should not enter this helper. 
+pub fn check_mmap_min_addr(vaddr: VirtAddr, min_vaddr: VirtAddr) -> Result<(), SystemError> { + if vaddr < min_vaddr && !capable(CAPFlags::CAP_SYS_RAWIO) { + return Err(SystemError::EPERM); + } + Ok(()) +} + /// LockedVMA的id分配器 static LOCKEDVMA_ID_ALLOCATOR: SpinLock = SpinLock::new(IdAllocator::new(0, usize::MAX).unwrap()); @@ -79,6 +99,20 @@ pub type MmapReservationId = u64; static MMAP_RESERVATION_ID_ALLOCATOR: AtomicU64 = AtomicU64::new(1); +pub struct FileMappingWithFileArgs { + pub file: Arc, + pub start_vaddr: VirtAddr, + pub len: usize, + pub prot_flags: ProtFlags, + pub map_flags: MapFlags, + pub may_exec: bool, + pub offset: usize, + pub round_to_min: bool, + pub allocate_at_once: bool, + pub sysv_shm: Option>, + pub fixed_noreplace_conflict_error_before_mmap_min: Option, +} + #[derive(Debug)] pub struct AddressSpace { /// 全局唯一的地址空间ID,用于标识不同的地址空间 @@ -313,14 +347,31 @@ impl AddressSpace { }) } + pub fn write_guard_no_reservations(self: &Arc) -> RwSemWriteGuard<'_, InnerAddressSpace> { + self.reservation_wait.wait_until(|| { + let guard = self.write(); + if guard.mappings.first_reservation_region().is_none() { + Some(guard) + } else { + None + } + }) + } + fn wake_reservation_waiters(&self) { self.reservation_wait.wake_all(); } - fn round_mmap_hint(start_vaddr: VirtAddr, round_to_min: bool) -> Option { + fn round_mmap_hint( + start_vaddr: VirtAddr, + round_to_min: bool, + fixed_hint: bool, + ) -> Option { let addr = start_vaddr.data() & (!MMArch::PAGE_OFFSET_MASK); if (addr != 0) && round_to_min && (addr < DEFAULT_MMAP_MIN_ADDR) { Some(VirtAddr::new(page_align_up(DEFAULT_MMAP_MIN_ADDR))) + } else if addr == 0 && fixed_hint { + Some(VirtAddr::new(0)) } else if addr == 0 { None } else { @@ -332,8 +383,10 @@ impl AddressSpace { start_vaddr: VirtAddr, len: usize, round_to_min: bool, + fixed_hint: bool, ) -> Option { - Self::round_mmap_hint(start_vaddr, round_to_min).map(|start| VirtRegion::new(start, len)) + Self::round_mmap_hint(start_vaddr, round_to_min, 
fixed_hint) + .map(|start| VirtRegion::new(start, len)) } #[allow(clippy::too_many_arguments)] @@ -349,7 +402,10 @@ impl AddressSpace { let len = page_align_up(len); loop { let mut guard = self.write(); - if let Some(region) = Self::reservation_region_for_hint(start_vaddr, len, round_to_min) + let fixed_hint = + map_flags.intersects(MapFlags::MAP_FIXED | MapFlags::MAP_FIXED_NOREPLACE); + if let Some(region) = + Self::reservation_region_for_hint(start_vaddr, len, round_to_min, fixed_hint) { if guard.mappings.first_reservation_conflict(region).is_some() { drop(guard); @@ -358,16 +414,43 @@ impl AddressSpace { } } - guard.check_rlimit_as_for_bytes(len)?; - return guard.map_anonymous( + let (page, notifications) = match guard.map_anonymous_collect( start_vaddr, len, prot_flags, map_flags, round_to_min, allocate_at_once, - ); + ) { + Ok(outcome) => outcome, + Err(failure) => { + drop(guard); + InnerAddressSpace::notify_close_notifications(failure.notifications); + return Err(failure.err); + } + }; + drop(guard); + InnerAddressSpace::notify_close_notifications(notifications); + return Ok(page); + } + } + + pub fn has_vma_intersection( + self: &Arc, + start_vaddr: VirtAddr, + len: usize, + ) -> Result { + let end = start_vaddr + .data() + .checked_add(len) + .ok_or(SystemError::EINVAL)?; + if len == 0 || end > MMArch::USER_END_VADDR.data() { + return Err(SystemError::EINVAL); } + + let requested = VirtRegion::new(start_vaddr, len); + let guard = self.read(); + Ok(guard.mappings.has_conflict(requested)) } #[allow(clippy::too_many_arguments)] @@ -413,6 +496,38 @@ impl AddressSpace { round_to_min: bool, allocate_at_once: bool, ) -> Result { + self.file_mapping_with_file_ext(FileMappingWithFileArgs { + file, + start_vaddr, + len, + prot_flags, + map_flags, + may_exec: true, + offset, + round_to_min, + allocate_at_once, + sysv_shm: None, + fixed_noreplace_conflict_error_before_mmap_min: None, + }) + } + + pub fn file_mapping_with_file_ext( + self: &Arc, + args: 
FileMappingWithFileArgs, + ) -> Result { + let FileMappingWithFileArgs { + file, + start_vaddr, + len, + prot_flags, + map_flags, + may_exec, + offset, + round_to_min, + allocate_at_once, + sysv_shm, + fixed_noreplace_conflict_error_before_mmap_min, + } = args; let len = page_align_up(len); if len == 0 { return Err(SystemError::EINVAL); @@ -457,10 +572,46 @@ impl AddressSpace { loop { let mut guard = self.write(); - let page = match Self::round_mmap_hint(start_vaddr, round_to_min) { + let fixed_hint = + map_flags.intersects(MapFlags::MAP_FIXED | MapFlags::MAP_FIXED_NOREPLACE); + let mut close_notifications = VmaCloseNotifications::default(); + macro_rules! map_fail { + ($err:expr) => {{ + drop(guard); + InnerAddressSpace::notify_close_notifications(close_notifications); + return Err($err); + }}; + } + if let Some(conflict_error) = fixed_noreplace_conflict_error_before_mmap_min.as_ref() { + if map_flags.contains(MapFlags::MAP_FIXED_NOREPLACE) { + let end = start_vaddr + .data() + .checked_add(len) + .ok_or(SystemError::EINVAL)?; + if end > MMArch::USER_END_VADDR.data() + || !start_vaddr.check_aligned(MMArch::PAGE_SIZE) + { + map_fail!(SystemError::EINVAL); + } + let requested = VirtRegion::new(start_vaddr, len); + if guard + .mappings + .first_reservation_conflict(requested) + .is_some() + { + drop(guard); + self.wait_for_no_reservation_conflict(requested); + continue; + } + if guard.mappings.has_conflict(requested) { + map_fail!(conflict_error.clone()); + } + } + } + let page = match Self::round_mmap_hint(start_vaddr, round_to_min, fixed_hint) { Some(vaddr) => { let mmap_min = guard.mmap_min; - match guard.find_free_at(mmap_min, vaddr, len, map_flags) { + match guard.find_free_at_prepare(mmap_min, vaddr, len, map_flags) { Ok(region) => VirtPageFrame::new(region.start()), Err(SystemError::EAGAIN_OR_EWOULDBLOCK) => { let region = VirtRegion::new(vaddr, len); @@ -485,7 +636,10 @@ impl AddressSpace { | VmFlags::from(map_flags) | guard.mlock_future | VmFlags::VM_MAYREAD 
- | VmFlags::VM_MAYEXEC; + | VmFlags::VM_NONE; + if may_exec { + vm_flags |= VmFlags::VM_MAYEXEC; + } if may_write { vm_flags |= VmFlags::VM_MAYWRITE; } @@ -498,36 +652,106 @@ impl AddressSpace { } else { SystemError::EAGAIN_OR_EWOULDBLOCK }; - guard.check_mlock_rlimit_for_pages(page_count.data(), error)?; + if let Err(err) = guard.check_mlock_rlimit_for_pages(page_count.data(), error) { + map_fail!(err); + } + } + if let Err(err) = guard.check_rlimit_as_for_region(region, len, map_flags) { + map_fail!(err); + } + + if let Err(err) = file.inode().check_mmap_file(&file, len, offset, vm_flags) { + map_fail!(err); } - guard.check_rlimit_as_for_bytes(len)?; - file.inode().check_mmap_file(&file, len, offset, vm_flags)?; + if map_flags.contains(MapFlags::MAP_FIXED) && guard.mappings.has_conflict(region) { + match guard.munmap_collect( + VirtPageFrame::new(region.start()), + PageFrameCount::from_bytes(region.size()).unwrap(), + ) { + Ok(notifications) => close_notifications.extend(notifications), + Err(err) => map_fail!(err), + } + } - let reservation_id = guard.mappings.reserve_region(region)?; + let reservation_id = match guard.mappings.reserve_region(region) { + Ok(reservation_id) => reservation_id, + Err(err) => map_fail!(err), + }; let entry_flags = EntryFlags::from_prot_flags(prot_flags, true); + let locked_pages_reserved = if vm_flags.contains(VmFlags::VM_LOCKED) { + let new_locked_vm = match guard.locked_vm.checked_add(page_count.data()) { + Some(new_locked_vm) => new_locked_vm, + None => { + if guard.mappings.cancel_reservation(reservation_id).is_some() { + drop(guard); + self.wake_reservation_waiters(); + } else { + drop(guard); + } + InnerAddressSpace::notify_close_notifications(close_notifications); + return Err(SystemError::ENOMEM); + } + }; + guard.locked_vm = new_locked_vm; + true + } else { + false + }; let lazy_vma = if MMArch::PAGE_FAULT_ENABLED { - Some(LockedVMA::new(VMA::new( + let vma = LockedVMA::new(VMA::new( region, vm_flags, entry_flags, 
Some(vma_file.clone()), Some(pgoff), false, - ))) + )); + if let Some(sysv_shm) = sysv_shm.clone() { + vma.lock().set_sysv_shm(Some(sysv_shm)); + } + Some(vma) } else { None }; drop(guard); + InnerAddressSpace::notify_close_notifications(close_notifications); let mut reservation = MmapReservationGuard::new(self.clone(), reservation_id); let hook_result = file.inode() .mmap_file(&file, region.start().data(), len, offset, vm_flags); + let file_mmap_opened = hook_result.is_ok(); let mut guard = self.write(); - - if let Err(err) = hook_result { - if err != SystemError::ENOSYS { + macro_rules! close_file_mmap_if_opened { + () => { + if file_mmap_opened { + InnerAddressSpace::notify_vma_close(VmaCloseNotification { + file: file.clone(), + region, + vm_flags, + }); + } + }; + } + macro_rules! release_locked_pages_if_reserved { + () => { + if locked_pages_reserved { + guard.locked_vm = + guard.locked_vm.checked_sub(page_count.data()).unwrap_or_else(|| { + error!( + "file mmap locked_vm accounting underflow: locked_vm={}, pages={}", + guard.locked_vm, + page_count.data() + ); + 0 + }); + } + }; + } + macro_rules! 
cancel_reservation_and_unlock_pages { + () => {{ + release_locked_pages_if_reserved!(); if guard.mappings.cancel_reservation(reservation_id).is_some() { drop(guard); reservation.disarm(); @@ -536,6 +760,12 @@ impl AddressSpace { drop(guard); reservation.disarm(); } + }}; + } + + if let Err(err) = hook_result { + if err != SystemError::ENOSYS { + cancel_reservation_and_unlock_pages!(); return Err(err); } } @@ -556,58 +786,42 @@ impl AddressSpace { Some(vma_file.clone()), Some(pgoff), ) { - Ok(vma) => vma, - Err(err) => { - if guard.mappings.cancel_reservation(reservation_id).is_some() { - drop(guard); - reservation.disarm(); - self.wake_reservation_waiters(); - } else { - drop(guard); - reservation.disarm(); + Ok(vma) => { + if let Some(sysv_shm) = sysv_shm.clone() { + vma.lock().set_sysv_shm(Some(sysv_shm)); } + vma + } + Err(err) => { + cancel_reservation_and_unlock_pages!(); + close_file_mmap_if_opened!(); return Err(err); } } }; - let new_locked_vm = if vm_flags.contains(VmFlags::VM_LOCKED) { - let error = if map_flags.contains(MapFlags::MAP_LOCKED) - && !InnerAddressSpace::has_mlock_quota() - { - SystemError::EPERM - } else { - SystemError::EAGAIN_OR_EWOULDBLOCK - }; - if let Err(err) = guard.check_mlock_rlimit_for_pages(page_count.data(), error) { - if guard.mappings.cancel_reservation(reservation_id).is_some() { - drop(guard); - reservation.disarm(); - self.wake_reservation_waiters(); - } else { - drop(guard); - reservation.disarm(); - } + let sysv_opened = if let Some(sysv_shm) = sysv_shm.as_ref() { + if let Err(err) = sysv_shm.open_vma() { + cancel_reservation_and_unlock_pages!(); + close_file_mmap_if_opened!(); return Err(err); } - Some( - guard - .locked_vm - .checked_add(page_count.data()) - .ok_or(SystemError::ENOMEM)?, - ) + true } else { - None + false }; if let Err(err) = guard.mappings.commit_reserved_vma(reservation_id, new_vma) { + let sysv_to_close = if sysv_opened { sysv_shm.clone() } else { None }; + release_locked_pages_if_reserved!(); 
drop(guard); + close_file_mmap_if_opened!(); + if let Some(sysv_shm) = sysv_to_close { + sysv_shm.close_vma(); + } return Err(err); } - if let Some(new_locked_vm) = new_locked_vm { - guard.locked_vm = new_locked_vm; - } reservation.disarm(); drop(guard); self.wake_reservation_waiters(); @@ -628,10 +842,22 @@ impl AddressSpace { self.wait_for_no_reservation_conflict(region); continue; } - return guard.munmap(start_page, page_count); + let notifications = guard.munmap_collect(start_page, page_count)?; + drop(guard); + InnerAddressSpace::notify_close_notifications(notifications); + return Ok(()); } } + pub fn detach_sysv_shm_wait(self: &Arc, addr: VirtAddr) -> Result<(), SystemError> { + let notifications = { + let mut guard = self.write_guard_no_reservations(); + guard.detach_sysv_shm(addr)? + }; + InnerAddressSpace::notify_close_notifications(notifications); + Ok(()) + } + pub fn mprotect_wait( self: &Arc, start_page: VirtPageFrame, @@ -698,7 +924,7 @@ impl AddressSpace { loop { let mut guard = self.write(); let mut wait_region = None; - if old_len != 0 { + if old_len != 0 && old_vaddr.data().checked_add(old_len).is_some() { let old_region = VirtRegion::new(old_vaddr, old_len); if guard .mappings @@ -707,13 +933,16 @@ impl AddressSpace { { wait_region = Some(old_region); } else if new_len > old_len { - let grow_region = VirtRegion::new(old_vaddr + old_len, new_len - old_len); - if guard - .mappings - .first_reservation_conflict(grow_region) - .is_some() - { - wait_region = Some(grow_region); + if let Some(grow_start) = old_vaddr.data().checked_add(old_len) { + let grow_region = + VirtRegion::new(VirtAddr::new(grow_start), new_len - old_len); + if guard + .mappings + .first_reservation_conflict(grow_region) + .is_some() + { + wait_region = Some(grow_region); + } } } } @@ -727,6 +956,24 @@ impl AddressSpace { wait_region = Some(new_region); } } + if wait_region.is_none() + && mremap_flags.contains(MremapFlags::MREMAP_DONTUNMAP) + && 
!mremap_flags.contains(MremapFlags::MREMAP_FIXED) + && new_vaddr != VirtAddr::new(0) + && new_vaddr + .data() + .checked_add(new_len) + .is_some_and(|end| end <= MMArch::USER_END_VADDR.data()) + { + let new_region = VirtRegion::new(new_vaddr, new_len); + if guard + .mappings + .first_reservation_conflict(new_region) + .is_some() + { + wait_region = Some(new_region); + } + } if let Some(region) = wait_region { drop(guard); @@ -742,11 +989,23 @@ impl AddressSpace { new_vaddr, vm_flags, ) { - Err(SystemError::EAGAIN_OR_EWOULDBLOCK) => { - let retry_region = if mremap_flags.contains(MremapFlags::MREMAP_FIXED) { + Ok(outcome) => { + drop(guard); + InnerAddressSpace::notify_close_notifications(outcome.notifications); + return Ok(outcome.addr); + } + Err(failure) if failure.err == SystemError::EAGAIN_OR_EWOULDBLOCK => { + let retry_region = if mremap_flags.contains(MremapFlags::MREMAP_FIXED) + || (mremap_flags.contains(MremapFlags::MREMAP_DONTUNMAP) + && new_vaddr != VirtAddr::new(0)) + { VirtRegion::new(new_vaddr, new_len) } else if new_len > old_len { - VirtRegion::new(old_vaddr + old_len, new_len - old_len) + if let Some(grow_start) = old_vaddr.data().checked_add(old_len) { + VirtRegion::new(VirtAddr::new(grow_start), new_len - old_len) + } else { + VirtRegion::new(old_vaddr, old_len.max(MMArch::PAGE_SIZE)) + } } else { VirtRegion::new(old_vaddr, old_len.max(MMArch::PAGE_SIZE)) }; @@ -756,12 +1015,19 @@ impl AddressSpace { .is_some() { drop(guard); + InnerAddressSpace::notify_close_notifications(failure.notifications); self.wait_for_no_reservation_conflict(retry_region); continue; } + drop(guard); + InnerAddressSpace::notify_close_notifications(failure.notifications); return Err(SystemError::EAGAIN_OR_EWOULDBLOCK); } - other => return other, + Err(failure) => { + drop(guard); + InnerAddressSpace::notify_close_notifications(failure.notifications); + return Err(failure.err); + } } } } @@ -949,6 +1215,126 @@ struct VmaCloseNotification { vm_flags: VmFlags, } 
+#[derive(Default)] +struct VmaCloseNotifications { + vma: Vec, + sysv: Vec>, +} + +impl VmaCloseNotifications { + fn is_empty(&self) -> bool { + self.vma.is_empty() && self.sysv.is_empty() + } + + fn extend(&mut self, mut other: VmaCloseNotifications) { + self.vma.append(&mut other.vma); + self.sysv.append(&mut other.sysv); + } +} + +struct MremapOutcome { + addr: VirtAddr, + notifications: VmaCloseNotifications, +} + +struct MremapFailure { + err: SystemError, + notifications: VmaCloseNotifications, +} + +impl From for MremapFailure { + fn from(err: SystemError) -> Self { + Self { + err, + notifications: VmaCloseNotifications::default(), + } + } +} + +struct MmapFailure { + err: SystemError, + notifications: VmaCloseNotifications, +} + +impl From for MmapFailure { + fn from(err: SystemError) -> Self { + Self { + err, + notifications: VmaCloseNotifications::default(), + } + } +} + +struct MunmapVmaPlan { + original_region: VirtRegion, + intersection: VirtRegion, + locked_vm_after_unmap: Option, + split_lifecycle: VmaSplitLifecycle, +} + +struct MprotectVmaPlan { + original_region: VirtRegion, + intersection: VirtRegion, + new_vm_flags: VmFlags, + split_lifecycle: VmaSplitLifecycle, +} + +struct MadviseVmaPlan { + original_region: VirtRegion, + intersection: VirtRegion, + split_lifecycle: VmaSplitLifecycle, +} + +#[derive(Debug)] +struct VmaSplitLifecycle { + sysv_shm: Option>, + open_count: usize, + committed: bool, +} + +impl VmaSplitLifecycle { + fn none() -> Self { + Self { + sysv_shm: None, + open_count: 0, + committed: false, + } + } + + fn commit(mut self) { + self.committed = true; + } + + fn rollback_into(mut self, notifications: &mut VmaCloseNotifications) { + if self.committed { + return; + } + if let Some(sysv_shm) = self.sysv_shm.take() { + for _ in 0..self.open_count { + notifications.sysv.push(sysv_shm.clone()); + } + } + self.open_count = 0; + self.committed = true; + } +} + +impl Drop for VmaSplitLifecycle { + fn drop(&mut self) { + if 
self.committed { + return; + } + error!( + "VmaSplitLifecycle dropped without explicit commit/rollback; falling back to immediate SysV SHM close" + ); + if let Some(sysv_shm) = self.sysv_shm.as_ref() { + for _ in 0..self.open_count { + sysv_shm.close_vma(); + } + } + } +} + impl InnerAddressSpace { /// 当前地址空间已占用的虚拟内存字节数(简单求和所有 VMA 尺寸) pub fn vma_usage_bytes(&self) -> usize { @@ -1029,111 +1415,119 @@ impl InnerAddressSpace { new_guard.start_data = self.start_data; new_guard.end_data = self.end_data; - // 遍历父进程的每个VMA,根据VMA属性进行适当的复制 - // 参考 Linux: https://code.dragonos.org.cn/xref/linux-6.6.21/mm/memory.c#copy_page_range - for vma in self.mappings.vmas.iter() { - // 锁顺序:VMA 锁 -> page_manager -> shm_manager,避免交叉获取导致死锁。 - let vma_guard = vma.lock(); + let mut parent_cow_remaps: Vec<(VirtAddr, EntryFlags)> = Vec::new(); + let clone_result: Result<(), SystemError> = (|| { + // 遍历父进程的每个VMA,根据VMA属性进行适当的复制 + // 参考 Linux: https://code.dragonos.org.cn/xref/linux-6.6.21/mm/memory.c#copy_page_range + for vma in self.mappings.vmas.iter() { + // 锁顺序:VMA 锁 -> page_manager -> shm_manager,避免交叉获取导致死锁。 + let vma_guard = vma.lock(); + + // VM_DONTCOPY: 跳过不复制的VMA (例如 MADV_DONTFORK 标记的) + if vma_guard.vm_flags().contains(VmFlags::VM_DONTCOPY) { + drop(vma_guard); + continue; + } - // VM_DONTCOPY: 跳过不复制的VMA (例如 MADV_DONTFORK 标记的) - if vma_guard.vm_flags().contains(VmFlags::VM_DONTCOPY) { + let vm_flags = *vma_guard.vm_flags(); + let is_shared = vm_flags.contains(VmFlags::VM_SHARED); + let region = *vma_guard.region(); + let page_flags = vma_guard.flags(); + let sysv_shm = vma_guard.sysv_shm(); + + // 创建新的VMA + let mut child_vma = vma_guard.clone_info_only(); + child_vma.vm_flags &= VmFlags::VM_LOCKED_CLEAR_MASK; + let new_vma = LockedVMA::new(child_vma); + new_guard.mappings.insert_vma(new_vma.clone()); drop(vma_guard); - continue; - } - let vm_flags = *vma_guard.vm_flags(); - let is_shared = vm_flags.contains(VmFlags::VM_SHARED); - let region = *vma_guard.region(); - let page_flags = 
vma_guard.flags(); - let shm_id = vma_guard.shm_id; - - // 创建新的VMA - let mut child_vma = vma_guard.clone_info_only(); - child_vma.vm_flags &= VmFlags::VM_LOCKED_CLEAR_MASK; - let new_vma = LockedVMA::new(child_vma); - new_guard.mappings.attach_vma(&new_vma); - new_guard.mappings.vmas.insert(new_vma.clone()); - drop(vma_guard); - - let mut skip_mapping = false; - if let Some(shm_id) = shm_id { - let ipcns = ProcessManager::current_ipcns(); - let mut shm_manager_guard = ipcns.shm.lock(); - match shm_manager_guard.get_mut(&shm_id) { - Some(kernel_shm) => { - // Forked SHM mappings count as new attachments. - kernel_shm.increase_count(); - } - None => { - warn!( - "Fork: SHM segment {:?} no longer exists, skipping VMA clone", - shm_id - ); - skip_mapping = true; + if let Some(sysv_shm) = sysv_shm { + if let Err(err) = sysv_shm.open_vma() { + if let Some(removed) = new_guard.mappings.remove_vma(®ion) { + removed.lock().set_mapped(false); + } + return Err(err); } } - } - - if skip_mapping { - let _ = new_guard.mappings.remove_vma(®ion); - continue; - } - // 根据VMA类型进行不同的页面复制策略 - let start_page = region.start(); - let end_page = region.end(); - let mut current_page = start_page; - - { - let _parent_pt_edit = parent_mm.page_table_edit(); - let old_mapper = &mut self.user_mapper.utable; - let new_mapper = &mut new_guard.user_mapper.utable; - let mut page_manager_guard = page_manager_lock(); + // 根据VMA类型进行不同的页面复制策略 + let start_page = region.start(); + let end_page = region.end(); + let mut current_page = start_page; - while current_page < end_page { - if let Some((phys_addr, old_flags)) = old_mapper.translate(current_page) { - unsafe { - if is_shared { - if new_mapper - .map_phys(current_page, phys_addr, page_flags) - .is_none() - { - warn!("Failed to map shared page at {:?} to phys {:?} in child process (current_pid: {:?})", - current_page, phys_addr, ProcessManager::current_pcb().raw_pid()); - } - } else { - let cow_flags = page_flags.set_write(false); + { + let 
_parent_pt_edit = parent_mm.page_table_edit(); + let old_mapper = &mut self.user_mapper.utable; + let new_mapper = &mut new_guard.user_mapper.utable; + let new_vma_mlocked = new_vma.lock().vm_flags().contains(VmFlags::VM_LOCKED); + let mut page_manager_guard = page_manager_lock(); + + while current_page < end_page { + if let Some((phys_addr, old_flags)) = old_mapper.translate(current_page) { + unsafe { + if is_shared { + if new_mapper + .map_phys(current_page, phys_addr, page_flags) + .is_none() + { + return Err(SystemError::ENOMEM); + } + } else { + let cow_flags = page_flags.set_write(false); + + if old_flags.has_write() { + if let Some(flush) = + old_mapper.remap(current_page, cow_flags) + { + flush.ignore(); + parent_tlb.accumulate_range(current_page); + parent_cow_remaps.push((current_page, old_flags)); + } + } - if old_flags.has_write() { - if let Some(flush) = old_mapper.remap(current_page, cow_flags) { - flush.ignore(); - parent_tlb.accumulate_range(current_page); + if new_mapper + .map_phys(current_page, phys_addr, cow_flags) + .is_none() + { + return Err(SystemError::ENOMEM); } } - - if new_mapper - .map_phys(current_page, phys_addr, cow_flags) - .is_none() - { - warn!("Failed to map COW page at {:?} to phys {:?} in child process (current_pid: {:?})", - current_page, phys_addr, ProcessManager::current_pcb().raw_pid()); + if let Some(page) = page_manager_guard.get(&phys_addr) { + page.write().insert_vma(new_vma.clone(), new_vma_mlocked); } } - if let Some(page) = page_manager_guard.get(&phys_addr) { - page.write().insert_vma(new_vma.clone()); - } } + current_page = VirtAddr::new(current_page.data() + MMArch::PAGE_SIZE); } - current_page = VirtAddr::new(current_page.data() + MMArch::PAGE_SIZE); } } - } + Ok(()) + })(); - drop(new_guard); - // 完成父 mm 的 mm-aware shootdown:INV-3 要求 TLB 生效完成后再继续后续逻辑, - // 此处没有 page 进入 pending_pages,因此实际只触发 flush_tlb_mm_range。 - parent_tlb.finish(); - return Ok(new_addr_space); + if let Err(err) = clone_result { + { + let 
_parent_pt_edit = parent_mm.page_table_edit(); + let old_mapper = &mut self.user_mapper.utable; + for (page, flags) in parent_cow_remaps.into_iter().rev() { + if let Some(flush) = unsafe { old_mapper.remap(page, flags) } { + unsafe { flush.ignore() }; + parent_tlb.accumulate_range(page); + } else { + warn!("fork rollback lost expected parent PTE at {:?}", page); + } + } + } + drop(new_guard); + parent_tlb.finish(); + return Err(err); + } + + drop(new_guard); + // 完成父 mm 的 mm-aware shootdown:INV-3 要求 TLB 生效完成后再继续后续逻辑, + // 此处没有 page 进入 pending_pages,因此实际只触发 flush_tlb_mm_range。 + parent_tlb.finish(); + return Ok(new_addr_space); } /// Check if the stack can be extended @@ -1226,6 +1620,32 @@ impl InnerAddressSpace { } fn check_rlimit_as_for_bytes(&self, len: usize) -> Result<(), SystemError> { + self.check_rlimit_as_for_growth(len) + } + + fn check_rlimit_as_for_region( + &self, + region: VirtRegion, + len: usize, + map_flags: MapFlags, + ) -> Result<(), SystemError> { + let covered = if map_flags.contains(MapFlags::MAP_FIXED) { + self.covered_vma_bytes(region) + } else { + 0 + }; + self.check_rlimit_as_for_growth(len.saturating_sub(covered)) + } + + fn check_rlimit_as_for_growth(&self, growth: usize) -> Result<(), SystemError> { + if growth == 0 { + return Ok(()); + } + + if !ProcessManager::initialized() { + return Ok(()); + } + let rlim_as = ProcessManager::current_pcb() .get_rlimit(RLimitID::As) .rlim_cur as usize; @@ -1233,11 +1653,15 @@ impl InnerAddressSpace { return Ok(()); } - let allowance = MMArch::PAGE_SIZE; - if self - .vma_usage_bytes() - .checked_add(len) - .is_none_or(|v| v > rlim_as.saturating_add(allowance)) + let limit_pages = rlim_as >> MMArch::PAGE_SHIFT; + let used_pages = self.vma_usage_bytes() >> MMArch::PAGE_SHIFT; + let growth_pages = growth + .checked_add(MMArch::PAGE_SIZE - 1) + .ok_or(SystemError::ENOMEM)? 
+ >> MMArch::PAGE_SHIFT; + if used_pages + .checked_add(growth_pages) + .is_none_or(|v| v > limit_pages) { Err(SystemError::ENOMEM) } else { @@ -1245,6 +1669,16 @@ impl InnerAddressSpace { } } + fn covered_vma_bytes(&self, region: VirtRegion) -> usize { + self.mappings + .conflicts(region) + .into_iter() + .filter_map(|vma| vma.lock().region().intersect(®ion)) + .fold(0usize, |total, intersection| { + total.saturating_add(intersection.size()) + }) + } + fn mlock_fault_flags(vm_flags: VmFlags) -> Option { if vm_flags.contains(VmFlags::VM_WRITE) { Some(FaultFlags::FAULT_FLAG_WRITE) @@ -1257,11 +1691,51 @@ impl InnerAddressSpace { } } - fn mark_present_page_unevictable(&mut self, addr: VirtAddr) { + fn add_present_page_mlock_ref(&mut self, addr: VirtAddr, vma: &Arc) { if let Some((paddr, _)) = self.user_mapper.utable.translate(addr) { let mut page_manager_guard = page_manager_lock(); let page = page_manager_guard.get_unwrap(&paddr); - page.write().add_flags(PageFlags::PG_UNEVICTABLE); + page.write().add_mlocked_vma_ref(vma); + } + } + + fn update_present_page_mlock_refs( + &mut self, + vma: &Arc, + start: VirtAddr, + end: VirtAddr, + old_locked: bool, + new_locked: bool, + ) { + if old_locked == new_locked { + return; + } + + let mut pages_to_reclassify = Vec::new(); + let mut vaddr = start; + while vaddr < end { + if let Some((paddr, _)) = self.user_mapper.utable.translate(vaddr) { + let page = { + let mut page_manager_guard = page_manager_lock(); + page_manager_guard.get_unwrap(&paddr) + }; + { + let mut page_guard = page.write(); + if new_locked { + page_guard.add_mlocked_vma_ref(vma); + } else { + page_guard.remove_mlocked_vma_ref(vma); + } + } + if !new_locked { + pages_to_reclassify.push(page); + } + } + vaddr = VirtAddr::new(vaddr.data() + MMArch::PAGE_SIZE); + } + + for page in pages_to_reclassify { + Self::remove_page_unevictable_if_unneeded(&page); } } @@ -1294,7 +1768,7 @@ impl InnerAddressSpace { fault_in_missing: bool, ) -> Result<(), SystemError> { let 
target = Self::checked_user_region(start, len)?; - let mut vmas = self.mappings.conflicts(target).collect::>(); + let mut vmas = self.mappings.conflicts(target); vmas.sort_by_key(|vma| vma.lock().region().start().data()); let mut cursor = target.start(); @@ -1318,7 +1792,7 @@ impl InnerAddressSpace { while addr < intersection.end() { if self.user_mapper.utable.translate(addr).is_some() { if vm_flags.contains(VmFlags::VM_LOCKED) { - self.mark_present_page_unevictable(addr); + self.add_present_page_mlock_ref(addr, &vma); } } else if fault_in_missing { self.populate_vma_page(vma.clone(), addr, fault_flags)?; @@ -1385,25 +1859,52 @@ impl InnerAddressSpace { round_to_min: bool, allocate_at_once: bool, ) -> Result { + let (page, notifications) = match self.map_anonymous_collect( + start_vaddr, + len, + prot_flags, + map_flags, + round_to_min, + allocate_at_once, + ) { + Ok(outcome) => outcome, + Err(failure) => { + debug_assert!( + failure.notifications.is_empty(), + "locked map_anonymous caller must not replace existing VMAs" + ); + if !failure.notifications.is_empty() { + error!("locked map_anonymous failed after replacing existing VMAs"); + return Err(SystemError::EAGAIN_OR_EWOULDBLOCK); + } + return Err(failure.err); + } + }; + debug_assert!( + notifications.is_empty(), + "locked map_anonymous caller must not replace existing VMAs" + ); + if !notifications.is_empty() { + return Err(SystemError::EAGAIN_OR_EWOULDBLOCK); + } + Ok(page) + } + + #[allow(clippy::too_many_arguments)] + fn map_anonymous_collect( + &mut self, + start_vaddr: VirtAddr, + len: usize, + prot_flags: ProtFlags, + map_flags: MapFlags, + round_to_min: bool, + allocate_at_once: bool, + ) -> Result<(VirtPageFrame, VmaCloseNotifications), MmapFailure> { let allocate_at_once = if MMArch::PAGE_FAULT_ENABLED { allocate_at_once } else { true }; - // 用于对齐hint的函数 - let round_hint_to_min = |hint: VirtAddr| { - // 先把hint向下对齐到页边界 - let addr = hint.data() & (!MMArch::PAGE_OFFSET_MASK); - // debug!("map_anonymous: 
hint = {:?}, addr = {addr:#x}", hint); - // 如果hint不是0,且hint小于DEFAULT_MMAP_MIN_ADDR,则对齐到DEFAULT_MMAP_MIN_ADDR - if (addr != 0) && round_to_min && (addr < DEFAULT_MMAP_MIN_ADDR) { - Some(VirtAddr::new(page_align_up(DEFAULT_MMAP_MIN_ADDR))) - } else if addr == 0 { - None - } else { - Some(VirtAddr::new(addr)) - } - }; // debug!("map_anonymous: start_vaddr = {:?}", start_vaddr); // debug!("map_anonymous: len(no align) = {}", len); @@ -1411,8 +1912,9 @@ impl InnerAddressSpace { // debug!("map_anonymous: len = {}", len); - let start_page: VirtPageFrame = self.mmap( - round_hint_to_min(start_vaddr), + let fixed_hint = map_flags.intersects(MapFlags::MAP_FIXED | MapFlags::MAP_FIXED_NOREPLACE); + let (start_page, notifications) = self.mmap_collect( + AddressSpace::round_mmap_hint(start_vaddr, round_to_min, fixed_hint), PageFrameCount::from_bytes(len).unwrap(), prot_flags, map_flags, @@ -1449,7 +1951,7 @@ impl InnerAddressSpace { self.post_map_population(start_page.virt_address(), len, map_flags); - return Ok(start_page); + return Ok((start_page, notifications)); } /// 向进程的地址空间映射页面 @@ -1469,7 +1971,7 @@ impl InnerAddressSpace { /// # Errors /// /// - `EINVAL`:参数错误 - pub fn mmap< + fn mmap_collect< F: FnOnce( VirtPageFrame, PageFrameCount, @@ -1485,25 +1987,12 @@ impl InnerAddressSpace { prot_flags: ProtFlags, map_flags: MapFlags, map_func: F, - ) -> Result { + ) -> Result<(VirtPageFrame, VmaCloseNotifications), MmapFailure> { if page_count == PageFrameCount::new(0) { - return Err(SystemError::EINVAL); + return Err(SystemError::EINVAL.into()); } // debug!("mmap: addr: {addr:?}, page_count: {page_count:?}, prot_flags: {prot_flags:?}, map_flags: {map_flags:?}"); - // 找到未使用的区域 - let region = match addr { - Some(vaddr) => { - self.find_free_at(self.mmap_min, vaddr, page_count.bytes(), map_flags)? 
- } - None => self - .mappings - .find_free(self.mmap_min, page_count.bytes()) - .ok_or(SystemError::ENOMEM)?, - }; - - let page = VirtPageFrame::new(region.start()); - let vm_flags = VmFlags::from(prot_flags) | VmFlags::from(map_flags) | self.mlock_future @@ -1519,8 +2008,64 @@ impl InnerAddressSpace { }; self.check_mlock_rlimit_for_pages(page_count.data(), error)?; } + + let mut notifications = VmaCloseNotifications::default(); + macro_rules! mmap_fail { + ($err:expr) => { + return Err(MmapFailure { + err: $err, + notifications, + }) + }; + } + macro_rules! mmap_try { + ($expr:expr) => { + match $expr { + Ok(value) => value, + Err(err) => mmap_fail!(err), + } + }; + } + + // 先只解析目标区域;MAP_FIXED 的破坏性替换要等前置检查完成后再提交。 + let region = match addr { + Some(vaddr) => { + mmap_try!(self.find_free_at_prepare( + self.mmap_min, + vaddr, + page_count.bytes(), + map_flags, + )) + } + None => self + .mappings + .find_free(self.mmap_min, page_count.bytes()) + .ok_or(SystemError::ENOMEM)?, + }; + + self.check_rlimit_as_for_region(region, page_count.bytes(), map_flags)?; + + if map_flags.contains(MapFlags::MAP_FIXED) && self.mappings.has_conflict(region) { + let close_notifications = mmap_try!(self.munmap_collect( + VirtPageFrame::new(region.start()), + PageFrameCount::from_bytes(region.size()).unwrap(), + )); + notifications.extend(close_notifications); + } + + let page = VirtPageFrame::new(region.start()); // debug!("mmap: page: {:?}, region={region:?}", page.virt_address()); + let new_locked_vm = if vm_flags.contains(VmFlags::VM_LOCKED) { + Some( + self.locked_vm + .checked_add(page_count.data()) + .ok_or(SystemError::ENOMEM)?, + ) + } else { + None + }; + compiler_fence(Ordering::SeqCst); // New mapping: the new region had no prior PTE, no TLB invalidation needed. // Use DeferredFlusher to silently consume internal PageFlush tokens. 
@@ -1530,26 +2075,25 @@ impl InnerAddressSpace { compiler_fence(Ordering::SeqCst); // 映射页面,并将VMA插入到地址空间的VMA列表中 let new_vma = { - let mm = self.outer_addr_space().ok_or(SystemError::EFAULT)?; + let Some(mm) = self.outer_addr_space() else { + mmap_fail!(SystemError::EFAULT); + }; let _pt_edit = mm.page_table_edit(); - map_func( + mmap_try!(map_func( page, page_count, vm_flags, EntryFlags::from_prot_flags(prot_flags, true), &mut self.user_mapper.utable, &mut flusher, - )? + )) }; self.mappings.insert_vma(new_vma); - if vm_flags.contains(VmFlags::VM_LOCKED) { - self.locked_vm = self - .locked_vm - .checked_add(page_count.data()) - .ok_or(SystemError::ENOMEM)?; + if let Some(new_locked_vm) = new_locked_vm { + self.locked_vm = new_locked_vm; } - return Ok(page); + return Ok((page, notifications)); } /// 重映射内存区域 @@ -1570,39 +2114,81 @@ impl InnerAddressSpace { /// # Errors /// /// - `EINVAL`:参数错误 - pub fn mremap( + fn mremap( &mut self, old_vaddr: VirtAddr, - old_len: usize, + mut old_len: usize, new_len: usize, mremap_flags: MremapFlags, new_vaddr: VirtAddr, vm_flags: VmFlags, - ) -> Result { - // 仅在 MREMAP_FIXED 下需要检查 new_vaddr(否则 new_vaddr 参数应被忽略,由内核选择新地址) - if mremap_flags.contains(MremapFlags::MREMAP_FIXED) { - // 检查新内存地址是否对齐 + ) -> Result { + let fixed_new_region = if mremap_flags.contains(MremapFlags::MREMAP_FIXED) { if !new_vaddr.check_aligned(MMArch::PAGE_SIZE) { - return Err(SystemError::EINVAL); + return Err(SystemError::EINVAL.into()); } - - // 检查新、旧内存区域是否冲突 - let old_region = VirtRegion::new(old_vaddr, old_len); - let new_region = VirtRegion::new(new_vaddr, new_len); - if old_region.collide(&new_region) { - return Err(SystemError::EINVAL); + let new_region = Self::checked_user_region(new_vaddr, new_len)?; + let old_end = old_vaddr.data().wrapping_add(old_len); + let new_end = new_vaddr + .data() + .checked_add(new_len) + .ok_or(SystemError::EINVAL)?; + if old_end > new_vaddr.data() && new_end > old_vaddr.data() { + return Err(SystemError::EINVAL.into()); } - 
} - + if old_len != 0 { + let old_region = Self::checked_user_region(old_vaddr, old_len)?; + debug_assert!(!old_region.collide(&new_region)); + } + Some(new_region) + } else { + None + }; // 初始化内存区域保护标志 let prot_flags: ProtFlags = vm_flags.into(); + let mut notifications = VmaCloseNotifications::default(); + macro_rules! mremap_fail { + ($err:expr) => { + return Err(MremapFailure { + err: $err, + notifications, + }) + }; + } + macro_rules! mremap_try { + ($expr:expr) => { + match $expr { + Ok(value) => value, + Err(err) => mremap_fail!(err), + } + }; + } + + if mremap_flags.contains(MremapFlags::MREMAP_FIXED) + && self.mappings.contains(old_vaddr).is_none() + { + mremap_fail!(SystemError::EFAULT); + } + if mremap_flags.contains(MremapFlags::MREMAP_FIXED) { + let start_page = VirtPageFrame::new(new_vaddr); + let page_count = PageFrameCount::from_bytes(new_len).unwrap(); + notifications.extend(mremap_try!(self.munmap_collect(start_page, page_count))); + } + if mremap_flags.contains(MremapFlags::MREMAP_FIXED) && old_len > new_len { + notifications.extend(mremap_try!(self.munmap_collect( + VirtPageFrame::new(old_vaddr + new_len), + PageFrameCount::from_bytes(old_len - new_len).unwrap(), + ))); + old_len = new_len; + } // 读取旧 VMA 的后备信息(file/shared-anon)以及页偏移基址。 - let old_vma = self - .mappings - .contains(old_vaddr) - .ok_or(SystemError::EINVAL)?; - let (old_region, vm_file, shared_anon, base_pgoff) = { + // MREMAP_FIXED 在上方可能已拆掉目标区间以及 shrink tail;重新查询源 + // VMA,避免使用可能被 split 后失效的旧缓存。 + let Some(old_vma) = self.mappings.contains(old_vaddr) else { + mremap_fail!(SystemError::EFAULT); + }; + let (old_region, vm_file, shared_anon, base_pgoff, sysv_shm) = { let g = old_vma.lock(); let region = *g.region(); let vma_start = region.start(); @@ -1612,7 +2198,13 @@ impl InnerAddressSpace { .backing_page_offset() .unwrap_or(0) .saturating_add(off_pages); - (region, g.vm_file(), g.shared_anon.clone(), base) + ( + region, + g.vm_file(), + g.shared_anon.clone(), + base, + 
g.sysv_shm(), + ) }; // 构造目标映射 flags:mremap 需要保留 shared/private 语义,并区分 anon/file。 @@ -1625,11 +2217,39 @@ impl InnerAddressSpace { if vm_file.is_none() { map_flags |= MapFlags::MAP_ANONYMOUS; } + if mremap_flags.contains(MremapFlags::MREMAP_FIXED) { + map_flags |= MapFlags::MAP_FIXED; + } - let dontunmap = mremap_flags.contains(MremapFlags::MREMAP_DONTUNMAP) || old_len == 0; + let dontunmap_flag = mremap_flags.contains(MremapFlags::MREMAP_DONTUNMAP); let locked_source = vm_flags.contains(VmFlags::VM_LOCKED); + let sysv_mremap = sysv_shm.is_some(); + let source_len = old_len; + let Some(max_old_len) = old_region.end().data().checked_sub(old_vaddr.data()) else { + mremap_fail!(SystemError::EINVAL); + }; + if source_len > max_old_len { + mremap_fail!(SystemError::EFAULT); + } + let source_region = VirtRegion::new(old_vaddr, source_len); + if dontunmap_flag { + if vm_flags.intersects(VmFlags::VM_DONTEXPAND | VmFlags::VM_PFNMAP) { + mremap_fail!(SystemError::EINVAL); + } + let Some(old_end) = old_vaddr.data().checked_add(old_len) else { + mremap_fail!(SystemError::EINVAL); + }; + let Some(new_end) = new_vaddr.data().checked_add(new_len) else { + mremap_fail!(SystemError::EINVAL); + }; + if old_end > new_vaddr.data() && new_end > old_vaddr.data() { + mremap_fail!(SystemError::EINVAL); + } + } if locked_source { - let additional_locked_pages = if dontunmap { + let additional_locked_pages = if old_len == 0 { + new_len >> MMArch::PAGE_SHIFT + } else if dontunmap_flag { 0 } else if new_len > old_len { (new_len - old_len) >> MMArch::PAGE_SHIFT @@ -1637,19 +2257,19 @@ impl InnerAddressSpace { 0 }; if additional_locked_pages != 0 { - self.check_mlock_rlimit_for_pages( + mremap_try!(self.check_mlock_rlimit_for_pages( additional_locked_pages, SystemError::EAGAIN_OR_EWOULDBLOCK, - )?; + )); } } - - // 取消新内存区域的原映射 - if mremap_flags.contains(MremapFlags::MREMAP_FIXED) { - map_flags |= MapFlags::MAP_FIXED; - let start_page = VirtPageFrame::new(new_vaddr); - let page_count = 
PageFrameCount::from_bytes(new_len).unwrap(); - self.munmap(start_page, page_count)?; + let as_delta = if old_len == 0 || dontunmap_flag { + new_len + } else { + new_len.saturating_sub(old_len) + }; + if as_delta != 0 { + mremap_try!(self.check_rlimit_as_for_bytes(as_delta)); } // 是否允许移动(Linux: 只有 MAYMOVE / FIXED 才能移动) @@ -1661,59 +2281,141 @@ impl InnerAddressSpace { // - 没有 MAYMOVE/FIXED 时返回 ENOMEM if old_len == 0 { if !vm_flags.intersects(VmFlags::VM_SHARED | VmFlags::VM_MAYSHARE) { - return Err(SystemError::EINVAL); + mremap_fail!(SystemError::EINVAL); } if !can_move { - return Err(SystemError::ENOMEM); + mremap_fail!(SystemError::ENOMEM); + } + } + + if mremap_flags.contains(MremapFlags::MREMAP_FIXED) { + if let Err(err) = check_mmap_min_addr(new_vaddr, self.mmap_min) { + mremap_fail!(err); } } // 不允许移动时,只能尝试原地扩展。 if !can_move { if new_len <= old_len { - return Ok(old_vaddr); + return Ok(MremapOutcome { + addr: old_vaddr, + notifications: VmaCloseNotifications::default(), + }); } - // 仅支持从 VMA 起始地址扩展整个 VMA 的常见场景(符合 gVisor 测例)。 - if old_vaddr != old_region.start() || old_len != old_region.size() { - return Err(SystemError::ENOMEM); + // Linux only allows in-place expansion when the old range reaches the VMA end. 
+ if old_len != max_old_len { + mremap_fail!(SystemError::ENOMEM); } let grow = new_len - old_len; + let Some(grown_region_size) = old_region.size().checked_add(grow) else { + mremap_fail!(SystemError::ENOMEM); + }; + let Some(grown_end) = old_region.start().data().checked_add(grown_region_size) else { + mremap_fail!(SystemError::ENOMEM); + }; + if grown_end > MMArch::USER_END_VADDR.data() { + mremap_fail!(SystemError::ENOMEM); + } + let locked_vm_after_grow = if locked_source { + let Some(locked_vm_after_grow) = + self.locked_vm.checked_add(grow >> MMArch::PAGE_SHIFT) + else { + mremap_fail!(SystemError::ENOMEM); + }; + Some(locked_vm_after_grow) + } else { + None + }; let grow_region = VirtRegion::new(old_vaddr + old_len, grow); - if self.mappings.conflicts(grow_region).next().is_some() { - return Err(SystemError::ENOMEM); + if self.mappings.has_conflict(grow_region) { + mremap_fail!(SystemError::ENOMEM); } - let removed = self - .mappings - .remove_vma(&old_region) - .ok_or(SystemError::EINVAL)?; - removed.lock().set_region_size(new_len); + let Some(removed) = self.mappings.remove_vma(&old_region) else { + mremap_fail!(SystemError::EINVAL); + }; + removed.lock().set_region_size(grown_region_size); self.mappings.insert_vma(removed); - if locked_source { - self.locked_vm = self - .locked_vm - .checked_add(grow >> MMArch::PAGE_SHIFT) - .ok_or(SystemError::ENOMEM)?; + if let Some(locked_vm_after_grow) = locked_vm_after_grow { + self.locked_vm = locked_vm_after_grow; self.best_effort_locked_population(old_vaddr + old_len, grow, vm_flags); } - return Ok(old_vaddr); + return Ok(MremapOutcome { + addr: old_vaddr, + notifications: VmaCloseNotifications::default(), + }); } // 需要创建一个新映射并迁移(FIXED 或 MAYMOVE)。 // 注意:必须避免在持有地址空间写锁时触碰用户地址(会触发缺页递归死锁)。 // Linux 的 mremap 通过移动/复制页表项实现,而不是字节拷贝。 - let new_region: VirtRegion = if mremap_flags.contains(MremapFlags::MREMAP_FIXED) { - VirtRegion::new(new_vaddr, new_len) + let new_region: VirtRegion = if let Some(new_region) = 
fixed_new_region { + new_region + } else if dontunmap_flag { + let (region, close_notifications) = mremap_try!(self.find_free_at_collect( + self.mmap_min, + new_vaddr, + new_len, + map_flags, + )); + notifications.extend(close_notifications); + region } else { - self.mappings - .find_free(self.mmap_min, new_len) - .ok_or(SystemError::ENOMEM)? + let Some(new_region) = self.mappings.find_free(self.mmap_min, new_len) else { + mremap_fail!(SystemError::ENOMEM); + }; + new_region }; let entry_flags = EntryFlags::from_prot_flags(prot_flags, true); + let Some(mm) = self.outer_addr_space() else { + mremap_fail!(SystemError::EFAULT); + }; + let remove_source_vma_on_commit = + !dontunmap_flag && old_len != 0 && new_region.start() != old_vaddr; + let split_source_on_commit = old_len != 0 && source_region != old_region && !dontunmap_flag; + + let locked_vm_after_move_commit = if locked_source { + let new_pages = new_len >> MMArch::PAGE_SHIFT; + let old_pages = source_len >> MMArch::PAGE_SHIFT; + if old_len == 0 { + let Some(locked_vm_after_commit) = self.locked_vm.checked_add(new_pages) else { + mremap_fail!(SystemError::ENOMEM); + }; + Some(locked_vm_after_commit) + } else if dontunmap_flag { + // Linux move_vma() clears VM_LOCKED on the old VMA for + // MREMAP_DONTUNMAP but deliberately leaves mm->locked_vm + // unchanged because the source range is not unmapped. 
+ Some(self.locked_vm) + } else { + let Some(locked_after_add) = self.locked_vm.checked_add(new_pages) else { + mremap_fail!(SystemError::ENOMEM); + }; + let Some(locked_vm_after_commit) = locked_after_add.checked_sub(old_pages) else { + mremap_fail!(SystemError::ENOMEM); + }; + Some(locked_vm_after_commit) + } + } else { + None + }; + let mut source_split_lifecycle = if split_source_on_commit { + Some(mremap_try!(old_vma.prepare_split_lifecycle(source_region))) + } else { + None + }; + if let Some(sysv_shm) = sysv_shm.as_ref() { + if let Err(err) = sysv_shm.open_vma() { + if let Some(lifecycle) = source_split_lifecycle.take() { + lifecycle.rollback_into(&mut notifications); + } + mremap_fail!(err); + } + } // 创建目标 VMA(初始不映射物理页;存在的页表项会在下面被移动/复制)。 let new_vma: Arc = { @@ -1734,81 +2436,202 @@ impl InnerAddressSpace { vg.shared_anon = Some(shared); vg.backing_pgoff = Some(base_pgoff); } - self.mappings.insert_vma(vma.clone()); - if locked_source && !dontunmap { - self.locked_vm = self - .locked_vm - .checked_add(new_len >> MMArch::PAGE_SHIFT) - .ok_or(SystemError::ENOMEM)?; + if let Some(sysv_shm) = sysv_shm.clone() { + vma.lock().set_sysv_shm(Some(sysv_shm)); } vma }; - if let Some(f) = vm_file.as_ref() { - let _ = f.inode().mmap( - new_region.start().data(), - new_len, - base_pgoff * MMArch::PAGE_SIZE, - ); - } - - let source_len = if old_len == 0 { new_len } else { old_len }; + // Linux mremap moves/duplicates an existing VMA; it does not call the + // filesystem mmap hook again. The file mapping was already accepted + // when the source VMA was created. + self.mappings.insert_vma(new_vma.clone()); let move_len = core::cmp::min(source_len, new_len); - // mremap does not free physical pages (old VMA pages are moved to the new VMA, or dual-mapped under DONTUNMAP); + // mremap does not free physical pages; old PTEs are migrated to the new VMA, while + // old_len==0 keeps the legacy duplicate-mapping behavior. 
// using MmuGather here is solely for a unified cross-core TLB shootdown at the end. - let mm = self.outer_addr_space().ok_or(SystemError::EFAULT)?; let mut tlb = MmuGather::gather(&mm); // 迁移/复制已存在的页表映射。 - // - DONTUNMAP:复制映射(旧映射仍保留) - // - 否则:移动映射(旧地址解除映射) + // 阶段 A:先完整安装目标 PTE,不破坏源 PTE;失败时只需删除目标 PTE。 + // 阶段 B:目标 PTE 全部安装成功后,再不可失败地移除源 PTE 并切换 vma_set。 + // Linux 的 MREMAP_DONTUNMAP 保留旧 VMA,但页表仍会迁移;不能长期保留源 PTE。 let mapper = &mut self.user_mapper.utable; let old_vma = old_vma.clone(); + let mut installed_target_pte = false; { let _pt_edit = mm.page_table_edit(); let mut page_manager_guard = page_manager_lock(); + let mut migrated = Vec::new(); + let mut err = None; let mut off = 0usize; while off < move_len { let src = old_vaddr + off; let dst = new_region.start() + off; if let Some((paddr, src_flags)) = mapper.translate(src) { - if !dontunmap { - if let Some((_paddr2, _flags2, flush, freed_tables)) = - unsafe { mapper.unmap_phys_with_freed_tables(src, true) } - { - unsafe { flush.ignore() }; - tlb.accumulate_range(src); - if freed_tables { - tlb.note_pt_table_freed(); - } - } - } + let Some(flush) = (unsafe { mapper.map_phys(dst, paddr, src_flags) }) else { + err = Some(SystemError::ENOMEM); + break; + }; + unsafe { flush.ignore() }; + tlb.accumulate_range(dst); + installed_target_pte = true; + page_manager_guard + .get_unwrap(&paddr) + .write() + .insert_vma(new_vma.clone(), locked_source); + + migrated.push((src, dst, paddr, src_flags)); + } + off += MMArch::PAGE_SIZE; + } - if let Some(flush) = unsafe { mapper.map_phys(dst, paddr, src_flags) } { + if let Some(err) = err { + for (_src, dst, paddr, _src_flags) in migrated.into_iter().rev() { + if let Some((_unmapped_paddr, _flags, flush)) = + unsafe { mapper.unmap_phys_preserve_tables(dst) } + { unsafe { flush.ignore() }; tlb.accumulate_range(dst); + } + if let Some(page) = page_manager_guard.get(&paddr) { + page.write().remove_vma(new_vma.as_ref()); + } + } + + self.mappings.remove_vma(&new_region); + 
drop(page_manager_guard); + tlb.finish(); + if let Some(sysv_shm) = sysv_shm.as_ref() { + notifications.sysv.push(sysv_shm.clone()); + } + if let Some(lifecycle) = source_split_lifecycle.take() { + lifecycle.rollback_into(&mut notifications); + } + mremap_fail!(err); + } + + if old_len != 0 { + for (src, _dst, paddr, _src_flags) in migrated { + if let Some((_paddr2, _flags2, flush)) = + unsafe { mapper.unmap_phys_preserve_tables(src) } + { + unsafe { flush.ignore() }; + tlb.accumulate_range(src); } else { - return Err(SystemError::ENOMEM); + panic!("mremap commit lost expected source PTE at {:?}", src); } - // 更新物理页的 vma_set let page = page_manager_guard.get_unwrap(&paddr); let mut pg = page.write(); - if !dontunmap { - pg.remove_vma(old_vma.as_ref()); + pg.remove_vma(old_vma.as_ref()); + } + } + } + if installed_target_pte { + new_vma.lock().set_mapped(true); + } + + if sysv_mremap || remove_source_vma_on_commit || (locked_source && dontunmap_flag) { + let mut source_vma = old_vma.clone(); + let mut split_before = None; + let mut split_after = None; + + if split_source_on_commit { + let removed = self + .mappings + .remove_vma(&old_region) + .expect("validated mremap source VMA must exist"); + debug_assert!(Arc::ptr_eq(&removed, &old_vma)); + let split_result = removed + .extract(source_region, &self.user_mapper.utable) + .expect("validated mremap source region must split"); + source_vma = split_result.middle; + split_before = split_result.prev; + split_after = split_result.after; + } + + if locked_source && dontunmap_flag { + self.update_present_page_mlock_refs( + &source_vma, + old_region.start(), + old_region.end(), + true, + false, + ); + let clear_locked = |vma: &Arc| { + let mut guard = vma.lock(); + let unlocked_flags = *guard.vm_flags() & VmFlags::VM_LOCKED_CLEAR_MASK; + guard.set_vm_flags(unlocked_flags); + guard.set_flags(); + }; + clear_locked(&source_vma); + } + + if let Some(before) = split_before { + self.mappings.insert_vma(before); + } + if let 
Some(after) = split_after { + self.mappings.insert_vma(after); + } + if let Some(lifecycle) = source_split_lifecycle.take() { + lifecycle.commit(); + } + if remove_source_vma_on_commit { + if split_source_on_commit { + source_vma.unmap(&mut self.user_mapper.utable, &mut tlb); + source_vma.lock().set_mapped(false); + if let Some(notification) = Self::collect_vma_close(&source_vma, source_region) + { + notifications.vma.push(notification); + } + if let Some(notification) = Self::collect_sysv_shm_close(&source_vma) { + notifications.sysv.push(notification); + } + } else { + let removed = self + .mappings + .remove_vma(&old_region) + .expect("validated mremap source VMA must exist"); + removed.unmap(&mut self.user_mapper.utable, &mut tlb); + removed.lock().set_mapped(false); + if let Some(notification) = Self::collect_vma_close(&removed, old_region) { + notifications.vma.push(notification); + } + if let Some(notification) = Self::collect_sysv_shm_close(&removed) { + notifications.sysv.push(notification); } - pg.insert_vma(new_vma.clone()); } - off += MMArch::PAGE_SIZE; } + if split_source_on_commit && !remove_source_vma_on_commit { + self.mappings.insert_vma(source_vma); + } + + if let Some(locked_vm_after_commit) = locked_vm_after_move_commit { + self.locked_vm = locked_vm_after_commit; + } + tlb.finish(); + + if locked_source && new_len > old_len { + self.best_effort_locked_population( + new_region.start() + old_len, + new_len - old_len, + vm_flags, + ); + } + + return Ok(MremapOutcome { + addr: new_region.start(), + notifications, + }); } - tlb.finish(); - if locked_source && dontunmap { - self.apply_vma_lock_flags(old_vaddr, source_len, VmFlags::VM_NONE, false)?; + if let Some(locked_vm_after_commit) = locked_vm_after_move_commit { + self.locked_vm = locked_vm_after_commit; } + tlb.finish(); if locked_source && new_len > old_len { self.best_effort_locked_population( @@ -1818,7 +2641,10 @@ impl InnerAddressSpace { ); } - Ok(new_region.start()) + Ok(MremapOutcome { + 
addr: new_region.start(), + notifications, + }) } /// 取消进程的地址空间中的映射 @@ -1838,19 +2664,31 @@ impl InnerAddressSpace { start_page: VirtPageFrame, page_count: PageFrameCount, ) -> Result<(), SystemError> { + let notifications = self.munmap_collect(start_page, page_count)?; + Self::notify_close_notifications(notifications); + Ok(()) + } + + fn munmap_collect( + &mut self, + start_page: VirtPageFrame, + page_count: PageFrameCount, + ) -> Result { defer!({ compiler_fence(Ordering::SeqCst); }); // 获取取消映射操作关联的 VMAS (用户传入的区域可能横跨多个 VMA) let region_to_unmap = VirtRegion::new(start_page.virt_address(), page_count.bytes()); - let vmas_related: Vec> = - self.mappings.conflicts(region_to_unmap).collect::>(); + let vmas_related: Vec> = self.mappings.conflicts(region_to_unmap); // Use MmuGather: clear PTEs + stash pages first, then unified shootdown, and finally free physical pages (INV-3) let mm = self.outer_addr_space().ok_or(SystemError::EFAULT)?; let mut tlb = MmuGather::gather(&mm); - let mut vma_close_notifications: Vec = Vec::new(); + let mut notifications = VmaCloseNotifications::default(); + let mut plans: Vec = Vec::with_capacity(vmas_related.len()); + let mut unmapped_vmas: Vec> = Vec::with_capacity(vmas_related.len()); + let mut locked_vm_after_commit = self.locked_vm; // 遍历每个相关的 VMA,将当前的 VMA 拆分为可能的三块 VMA,然后删除与需要删除的区域相交的部分。 // 示意图:对每个与 region_to_unmap 相交的 VMA,按交集拆分成三段(before / intersection / after), @@ -1867,30 +2705,69 @@ impl InnerAddressSpace { // keep unmap keep // // 注意:用户传入的 region_to_unmap 可能跨多个 VMA,因此需要对每个相关 VMA 分别处理。 + // + // 第一阶段只做校验和 SysV split side 预打开,不修改 mappings。这样后面的 + // VMA 若因 RMID/引用限制导致 open_vma 失败,前面 VMA 不会已经被删除。 for cur_vma in vmas_related { - let r = cur_vma.lock().region; - let cur_vma = self.mappings.remove_vma(&r).ok_or(SystemError::EFAULT)?; - let intersection = cur_vma - .lock() - .region() - .intersect(&region_to_unmap) - .ok_or(SystemError::EFAULT)?; - if cur_vma.lock().vm_flags().contains(VmFlags::VM_LOCKED) { - self.locked_vm = self -
.locked_vm - .saturating_sub(intersection.size() >> MMArch::PAGE_SHIFT); - } + let (original_region, intersection, locked) = { + let guard = cur_vma.lock(); + let original_region = *guard.region(); + let intersection = original_region + .intersect(&region_to_unmap) + .ok_or(SystemError::EFAULT)?; + ( + original_region, + intersection, + guard.vm_flags().contains(VmFlags::VM_LOCKED), + ) + }; + let locked_vm_after_unmap = if locked { + locked_vm_after_commit = locked_vm_after_commit + .checked_sub(intersection.size() >> MMArch::PAGE_SHIFT) + .ok_or(SystemError::EFAULT)?; + Some(locked_vm_after_commit) + } else { + None + }; + + let split_lifecycle = cur_vma.prepare_split_lifecycle(intersection)?; + + plans.push(MunmapVmaPlan { + original_region, + intersection, + locked_vm_after_unmap, + split_lifecycle, + }); + } + + plans.reverse(); + while let Some(plan) = plans.pop() { + let cur_vma = match self.mappings.remove_vma(&plan.original_region) { + Some(vma) => vma, + None => return Err(SystemError::EFAULT), + }; let (before, after) = { let _pt_edit = mm.page_table_edit(); - let split_result = cur_vma - .extract(intersection, &self.user_mapper.utable) - .ok_or(SystemError::EFAULT)?; + let Some(split_result) = + cur_vma.extract(plan.intersection, &self.user_mapper.utable) + else { + self.mappings.insert_vma(cur_vma.clone()); + return Err(SystemError::EFAULT); + }; + let before = split_result.prev; + let after = split_result.after; + if let Some(locked_vm_after_unmap) = plan.locked_vm_after_unmap { + self.locked_vm = locked_vm_after_unmap; + } cur_vma.unmap(&mut self.user_mapper.utable, &mut tlb); - (split_result.prev, split_result.after) + (before, after) }; - if let Some(notification) = Self::collect_vma_close(&cur_vma, intersection) { - vma_close_notifications.push(notification); + if let Some(notification) = Self::collect_vma_close(&cur_vma, plan.intersection) { + notifications.vma.push(notification); + } + if let Some(notification) =
Self::collect_sysv_shm_close(&cur_vma) { + notifications.sysv.push(notification); } if let Some(before) = before { @@ -1900,16 +2777,156 @@ impl InnerAddressSpace { if let Some(after) = after { self.mappings.insert_vma(after); } + plan.split_lifecycle.commit(); + // Keep the removed VMA alive until after TLB shootdown. Its drop may + // destroy the last shared-anon backing object, which can release + // physical pages that were just unmapped above. + unmapped_vmas.push(cur_vma); + } + + // Shootdown first, then free physical pages + tlb.finish(); + drop(unmapped_vmas); + + Ok(notifications) + } + + fn detach_sysv_shm(&mut self, addr: VirtAddr) -> Result { + if !addr.check_aligned(MMArch::PAGE_SIZE) { + return Err(SystemError::EINVAL); + } + + let mut attach_file = None; + let mut attach_id = None; + let mut segment_size = 0usize; + let mut targets: Vec> = Vec::new(); + + for vma in self.mappings.iter_vmas_starting_at(addr) { + let (region, pgoff, sysv, file) = { + let guard = vma.lock(); + ( + *guard.region(), + guard.backing_page_offset(), + guard.sysv_shm(), + guard.vm_file(), + ) + }; + if !targets.is_empty() + && region.start().data().saturating_sub(addr.data()) >= segment_size + { + break; + } + let Some(sysv) = sysv else { + continue; + }; + if region.start() < addr { + continue; + } + let Some(pgoff) = pgoff else { + continue; + }; + + if file.is_none() { + continue; + }; + let expected_pgoff = (region.start().data() - addr.data()) >> MMArch::PAGE_SHIFT; + let first_match = attach_file.is_none(); + if first_match { + if pgoff != expected_pgoff { + continue; + } + segment_size = page_align_up(sysv.size()); + attach_id = Some(sysv.attach_id()); + attach_file = file; + } else { + if region.end().data().saturating_sub(addr.data()) > segment_size { + break; + } + if attach_id != Some(sysv.attach_id()) { + continue; + } + if pgoff != expected_pgoff { + continue; + } + let Some(expected_file) = attach_file.as_ref() else { + continue; + }; + let Some(file) = file 
else { + continue; + }; + if !Arc::ptr_eq(&file, expected_file) { + continue; + } + } + targets.push(vma.clone()); + } + + if targets.is_empty() { + return Err(SystemError::EINVAL); + } + + let locked_pages = targets.iter().try_fold(0usize, |acc, target| { + let guard = target.lock(); + if guard.vm_flags().contains(VmFlags::VM_LOCKED) { + acc.checked_add(guard.region().size() >> MMArch::PAGE_SHIFT) + .ok_or(SystemError::EOVERFLOW) + } else { + Ok(acc) + } + })?; + let new_locked_vm = self.locked_vm.checked_sub(locked_pages).ok_or_else(|| { + error!( + "shmdt locked_vm accounting underflow: locked_vm={}, pages={}", + self.locked_vm, locked_pages + ); + debug_assert!( + false, + "shmdt locked_vm accounting underflow: locked_vm={}, pages={}", + self.locked_vm, locked_pages + ); + SystemError::EFAULT + })?; + + let mm = self.outer_addr_space().ok_or(SystemError::EFAULT)?; + let _pt_edit = mm.page_table_edit(); + let mut tlb = MmuGather::gather(&mm); + let mut notifications = VmaCloseNotifications::default(); + + for target in targets { + let region = *target.lock().region(); + let vma = self + .mappings + .remove_vma(&region) + .ok_or(SystemError::EFAULT)?; + if let Some(notification) = Self::collect_vma_close(&vma, region) { + notifications.vma.push(notification); + } + if let Some(notification) = Self::collect_sysv_shm_close(&vma) { + notifications.sysv.push(notification); + } + vma.unmap(&mut self.user_mapper.utable, &mut tlb); } - - // Shootdown first, then free physical pages + self.locked_vm = new_locked_vm; tlb.finish(); - for notification in vma_close_notifications { + Ok(notifications) + } + + fn collect_sysv_shm_close(vma: &Arc) -> Option> { + vma.lock().sysv_shm() + } + + fn notify_sysv_shm_close(notification: Arc) { + notification.close_vma(); + } + + fn notify_close_notifications(notifications: VmaCloseNotifications) { + for notification in notifications.vma { Self::notify_vma_close(notification); } - - return Ok(()); + for notification in notifications.sysv
{ + Self::notify_sysv_shm_close(notification); + } } fn collect_vma_close(vma: &Arc, region: VirtRegion) -> Option { @@ -1956,12 +2973,16 @@ impl InnerAddressSpace { let region = VirtRegion::new(start_page.virt_address(), page_count.bytes()); // debug!("mprotect: region: {:?}", region); - let regions = self.mappings.conflicts(region).collect::>(); + let (regions, has_unmapped) = self.mappings.conflicts_with_unmapped(region); + if has_unmapped { + return Err(SystemError::ENOMEM); + } // debug!("mprotect: regions: {:?}", regions); - for r in regions { + let mut plans = Vec::with_capacity(regions.len()); + for r in &regions { // debug!("mprotect: r: {:?}", r); - let (r, new_vm_flags) = { + let (original_region, new_vm_flags) = { let guard = r.lock(); if !guard.can_have_flags(prot_flags) { return Err(SystemError::EACCES); @@ -1969,26 +2990,42 @@ impl InnerAddressSpace { let old_vm_flags = *guard.vm_flags(); let access_flags = VmFlags::VM_READ | VmFlags::VM_WRITE | VmFlags::VM_EXEC; let new_vm_flags = (old_vm_flags & !access_flags) | VmFlags::from(prot_flags); + if new_vm_flags == old_vm_flags { + continue; + } if let Some(file) = guard.vm_file() { file.inode().fs().mprotect(old_vm_flags, new_vm_flags)?; } (*guard.region(), new_vm_flags) }; - let r = self.mappings.remove_vma(&r).unwrap(); + let intersection = original_region.intersect(&region).unwrap(); + let split_lifecycle = r.prepare_split_lifecycle(intersection)?; + plans.push(MprotectVmaPlan { + original_region, + intersection, + new_vm_flags, + split_lifecycle, + }); + } + + for plan in plans { + let r = match self.mappings.remove_vma(&plan.original_region) { + Some(vma) => vma, + None => return Err(SystemError::EFAULT), + }; - let intersection = r.lock().region().intersect(&region).unwrap(); let remap_result: Result = { let _pt_edit = mm.page_table_edit(); let split_result = r - .extract(intersection, mapper) + .extract(plan.intersection, mapper) .expect("Failed to extract VMA"); let mut r_guard = r.lock(); -
r_guard.set_vm_flags(new_vm_flags); + r_guard.set_vm_flags(plan.new_vm_flags); - let new_flags: EntryFlags = MMArch::vm_get_page_prot(new_vm_flags); + let new_flags: EntryFlags = MMArch::vm_get_page_prot(plan.new_vm_flags); - r_guard.remap(new_flags, mapper, &mut tlb)?; + r_guard.remap(new_flags, mapper, &mut tlb); Ok((split_result.prev, split_result.after)) }; let (before, after) = match remap_result { @@ -2006,6 +3043,7 @@ impl InnerAddressSpace { self.mappings.insert_vma(after); } self.mappings.insert_vma(r); + plan.split_lifecycle.commit(); } // Unified shootdown. mprotect does not free physical pages; tlb.finish() mainly flushes the TLB. @@ -2027,7 +3065,7 @@ impl InnerAddressSpace { let mut last_vaddr = start_page.virt_address(); let region = VirtRegion::new(start_page.virt_address(), page_count.bytes()); - let mut vmas = self.mappings.conflicts(region).collect::>(); + let mut vmas = self.mappings.conflicts(region); // 为保证与地址连续性的判断正确,这里按起始地址升序遍历 vmas.sort_by_key(|v| v.lock().region().start().data()); let mut offset = 0; @@ -2059,7 +3097,7 @@ impl InnerAddressSpace { len: usize, ) -> Result { let target = Self::checked_user_region(start, len)?; - let mut vmas = self.mappings.conflicts(target).collect::>(); + let mut vmas = self.mappings.conflicts(target); vmas.sort_by_key(|vma| vma.lock().region().start().data()); let mut cursor = target.start(); @@ -2163,50 +3201,80 @@ impl InnerAddressSpace { self.count_unlocked_pages_for_mlock(start, len)?; let wants_locked = new_flags.contains(VmFlags::VM_LOCKED); - let mut vmas = self.mappings.conflicts(target).collect::>(); + let mut vmas = self.mappings.conflicts(target); vmas.sort_by_key(|vma| vma.lock().region().start().data()); for cur_vma in vmas { - let original_region = *cur_vma.lock().region(); - let cur_vma = self - .mappings - .remove_vma(&original_region) - .ok_or(SystemError::EFAULT)?; - let intersection = cur_vma - .lock() - .region() - .intersect(&target) - .ok_or(SystemError::EFAULT)?; - let old_flags = 
*cur_vma.lock().vm_flags(); + let (original_region, intersection, old_flags) = { + let guard = cur_vma.lock(); + ( + *guard.region(), + guard + .region() + .intersect(&target) + .ok_or(SystemError::EFAULT)?, + *guard.vm_flags(), + ) + }; let old_locked = old_flags.contains(VmFlags::VM_LOCKED); + let committed_flags = (old_flags & VmFlags::VM_LOCKED_CLEAR_MASK) | new_flags; + if committed_flags == old_flags { + continue; + } + let pages = intersection.size() >> MMArch::PAGE_SHIFT; + let locked_vm_after = if wants_locked && !old_locked { + Some( + self.locked_vm + .checked_add(pages) + .ok_or(SystemError::ENOMEM)?, + ) + } else if !wants_locked && old_locked { + Some( + self.locked_vm + .checked_sub(pages) + .ok_or(SystemError::ENOMEM)?, + ) + } else { + None + }; + let split_lifecycle = cur_vma.prepare_split_lifecycle(intersection)?; + let cur_vma = match self.mappings.remove_vma(&original_region) { + Some(vma) => vma, + None => return Err(SystemError::EFAULT), + }; let split_result = cur_vma .extract(intersection, &self.user_mapper.utable) - .ok_or(SystemError::EFAULT)?; + .ok_or_else(|| { + self.mappings.insert_vma(cur_vma.clone()); + SystemError::EFAULT + })?; + let (before, after) = (split_result.prev, split_result.after); - let committed_flags = (old_flags & VmFlags::VM_LOCKED_CLEAR_MASK) | new_flags; { let mut guard = cur_vma.lock(); guard.set_vm_flags(committed_flags); } - let pages = intersection.size() >> MMArch::PAGE_SHIFT; - if wants_locked && !old_locked { - self.locked_vm = self - .locked_vm - .checked_add(pages) - .ok_or(SystemError::ENOMEM)?; - } else if !wants_locked && old_locked { - self.locked_vm = self.locked_vm.saturating_sub(pages); + if let Some(locked_vm_after) = locked_vm_after { + self.locked_vm = locked_vm_after; } + self.update_present_page_mlock_refs( + &cur_vma, + intersection.start(), + intersection.end(), + old_locked, + wants_locked, + ); - if let Some(before) = split_result.prev { + if let Some(before) = before { 
self.mappings.insert_vma(before); } - if let Some(after) = split_result.after { + if let Some(after) = after { self.mappings.insert_vma(after); } self.mappings.insert_vma(cur_vma); + split_lifecycle.commit(); } if wants_locked { @@ -2226,7 +3294,13 @@ impl InnerAddressSpace { if len == 0 { return Err(SystemError::EINVAL); } - start.data().checked_add(len).ok_or(SystemError::EINVAL)?; + if !start.check_aligned(MMArch::PAGE_SIZE) { + return Err(SystemError::EINVAL); + } + let end = start.data().checked_add(len).ok_or(SystemError::EINVAL)?; + if end > MMArch::USER_END_VADDR.data() { + return Err(SystemError::EINVAL); + } Ok(VirtRegion::new(start, len)) } @@ -2239,23 +3313,43 @@ impl InnerAddressSpace { let mut vaddr = start; while vaddr < end { if let Some((paddr, _)) = mapper.translate(vaddr) { - let mut page_manager_guard = page_manager_lock(); - let page = page_manager_guard.get_unwrap(&paddr); - let mut page_guard = page.write(); - if !Self::page_should_remain_unevictable(&page_guard) { - page_guard.remove_flags(PageFlags::PG_UNEVICTABLE); - } + let page = { + let mut page_manager_guard = page_manager_lock(); + page_manager_guard.get_unwrap(&paddr) + }; + Self::remove_page_unevictable_if_unneeded(&page); } vaddr = VirtAddr::new(vaddr.data() + MMArch::PAGE_SIZE); } Ok(()) } - pub(crate) fn page_should_remain_unevictable(page: &crate::mm::page::InnerPage) -> bool { - page.vma_set().iter().any(|vma| { - let guard = vma.lock(); - guard.vm_flags().contains(VmFlags::VM_LOCKED) || guard.shared_anon.is_some() - }) + pub(crate) fn remove_page_unevictable_if_unneeded(page: &Arc) { + let mut page_guard = page.write(); + if !page_guard.flags().contains(PageFlags::PG_UNEVICTABLE) + || page_guard.has_unevictable_source() + { + return; + } + + page_guard.remove_flags(PageFlags::PG_UNEVICTABLE); + let paddr = page.phys_address(); + let should_reclaim = page_guard.flags().contains(PageFlags::PG_LRU); + drop(page_guard); + if should_reclaim { + 
page_reclaimer_lock().insert_page(paddr, page); + } + } + + fn madvise_uses_range_without_vma_split(behavior: MadvFlags) -> bool { + behavior == MadvFlags::MADV_DONTNEED + || behavior == MadvFlags::MADV_DONTNEED_LOCKED + || behavior == MadvFlags::MADV_WILLNEED + || behavior == MadvFlags::MADV_COLD + || behavior == MadvFlags::MADV_PAGEOUT + || behavior == MadvFlags::MADV_FREE + || behavior == MadvFlags::MADV_POPULATE_READ + || behavior == MadvFlags::MADV_POPULATE_WRITE } pub fn madvise( @@ -2270,28 +3364,95 @@ impl InnerAddressSpace { let mapper = &mut self.user_mapper.utable; let region = VirtRegion::new(start_page.virt_address(), page_count.bytes()); - let regions = self.mappings.conflicts(region).collect::>(); + let (regions, has_unmapped) = self.mappings.conflicts_with_unmapped(region); - if behavior == MadvFlags::MADV_DONTNEED { + if behavior == MadvFlags::MADV_DOFORK { for vma in ®ions { - if vma.lock().vm_flags().contains(VmFlags::VM_LOCKED) { + if vma.lock().vm_flags().contains(VmFlags::VM_IO) { return Err(SystemError::EINVAL); } } } + if behavior == MadvFlags::MADV_REMOVE { + return if regions.is_empty() { + Err(SystemError::ENOMEM) + } else { + Err(SystemError::EINVAL) + }; + } - for r in regions { - let r = *r.lock().region(); - let r = self.mappings.remove_vma(&r).unwrap(); + if Self::madvise_uses_range_without_vma_split(behavior) { + for r in regions { + let (original_region, vm_flags) = { + let guard = r.lock(); + (*guard.region(), *guard.vm_flags()) + }; + let intersection = original_region.intersect(®ion).unwrap(); - let intersection = r.lock().region().intersect(®ion).unwrap(); - let (before, after) = { + let _pt_edit = mm.page_table_edit(); + match behavior { + MadvFlags::MADV_DONTNEED | MadvFlags::MADV_DONTNEED_LOCKED => { + if vm_flags.contains(VmFlags::VM_PFNMAP) + || (behavior == MadvFlags::MADV_DONTNEED + && vm_flags.contains(VmFlags::VM_LOCKED)) + { + tlb.finish(); + return Err(SystemError::EINVAL); + } + r.unmap_range(intersection, mapper, 
&mut tlb, UnmapMappingMode::EvenCow); + } + _ => r.do_madvise(behavior, mapper, &mut tlb), + } + } + tlb.finish(); + return if has_unmapped { + Err(SystemError::ENOMEM) + } else { + Ok(()) + }; + } + + let mut plans = Vec::with_capacity(regions.len()); + for r in ®ions { + let (original_region, old_flags) = { + let guard = r.lock(); + (*guard.region(), *guard.vm_flags()) + }; + let Some(new_flags) = r.madvise_updated_flags(behavior)? else { + continue; + }; + if new_flags == old_flags { + continue; + } + let intersection = original_region.intersect(®ion).unwrap(); + let split_lifecycle = r.prepare_split_lifecycle(intersection)?; + plans.push(MadviseVmaPlan { + original_region, + intersection, + split_lifecycle, + }); + } + + for plan in plans { + let r = match self.mappings.remove_vma(&plan.original_region) { + Some(vma) => vma, + None => return Err(SystemError::EFAULT), + }; + + let madvise_result: Result = { let _pt_edit = mm.page_table_edit(); let split_result = r - .extract(intersection, mapper) + .extract(plan.intersection, mapper) .expect("Failed to extract VMA"); - r.do_madvise(behavior, mapper, &mut tlb)?; - (split_result.prev, split_result.after) + r.do_madvise(behavior, mapper, &mut tlb); + Ok((split_result.prev, split_result.after)) + }; + let (before, after) = match madvise_result { + Ok(result) => result, + Err(err) => { + self.mappings.insert_vma(r); + return Err(err); + } }; if let Some(before) = before { self.mappings.insert_vma(before); @@ -2300,9 +3461,14 @@ impl InnerAddressSpace { self.mappings.insert_vma(after); } self.mappings.insert_vma(r); + plan.split_lifecycle.commit(); } tlb.finish(); - Ok(()) + if has_unmapped { + Err(SystemError::ENOMEM) + } else { + Ok(()) + } } /// 取消与指定 inode 关联的文件映射的页表项,保留 VMA 以便后续访问触发缺页并按最新文件大小处理 @@ -2363,12 +3529,17 @@ impl InnerAddressSpace { // Full-mm flush (fullmm); no need to accumulate ranges. 
tlb.set_fullmm(); let mut vma_close_notifications = Vec::new(); + let mut sysv_close_notifications = Vec::new(); for vma in self.mappings.iter_vmas() { + let region = *vma.lock().region(); + if let Some(notification) = Self::collect_vma_close(vma, region) { + vma_close_notifications.push(notification); + } + let sysv_close = Self::collect_sysv_shm_close(vma); + if let Some(notification) = sysv_close { + sysv_close_notifications.push(notification); + } if vma.mapped() { - let region = *vma.lock().region(); - if let Some(notification) = Self::collect_vma_close(vma, region) { - vma_close_notifications.push(notification); - } vma.unmap(&mut self.user_mapper.utable, &mut tlb); } } @@ -2376,6 +3547,9 @@ impl InnerAddressSpace { for notification in vma_close_notifications { Self::notify_vma_close(notification); } + for notification in sysv_close_notifications { + Self::notify_sysv_shm_close(notification); + } } /// 设置进程的堆的内存空间 @@ -2409,8 +3583,13 @@ impl InnerAddressSpace { if new_brk > self.brk { let len = new_brk - self.brk; + let brk_region = VirtRegion::new(self.brk, len); + if self.mappings.has_conflict(brk_region) { + return Err(SystemError::ENOMEM); + } let prot_flags = ProtFlags::PROT_READ | ProtFlags::PROT_WRITE; - let map_flags = MapFlags::MAP_PRIVATE | MapFlags::MAP_ANONYMOUS | MapFlags::MAP_FIXED; + let map_flags = + MapFlags::MAP_PRIVATE | MapFlags::MAP_ANONYMOUS | MapFlags::MAP_FIXED_NOREPLACE; self.map_anonymous(old_brk, len, prot_flags, map_flags, true, false)?; self.brk = new_brk; @@ -2456,28 +3635,69 @@ impl InnerAddressSpace { return self.set_brk(new_brk); } - pub fn find_free_at( + fn find_free_at_prepare( &mut self, min_vaddr: VirtAddr, vaddr: VirtAddr, size: usize, flags: MapFlags, ) -> Result { + self.find_free_at_internal(min_vaddr, vaddr, size, flags, false) + .map(|(region, _)| region) + } + + fn find_free_at_collect( + &mut self, + min_vaddr: VirtAddr, + vaddr: VirtAddr, + size: usize, + flags: MapFlags, + ) -> Result<(VirtRegion, 
VmaCloseNotifications), SystemError> { + self.find_free_at_internal(min_vaddr, vaddr, size, flags, true) + } + + fn find_free_at_internal( + &mut self, + min_vaddr: VirtAddr, + vaddr: VirtAddr, + size: usize, + flags: MapFlags, + unmap_fixed: bool, + ) -> Result<(VirtRegion, VmaCloseNotifications), SystemError> { // 如果没有指定地址,那么就在当前进程的地址空间中寻找一个空闲的虚拟内存范围。 - if vaddr == VirtAddr::new(0) { - return self + if vaddr == VirtAddr::new(0) + && !flags.intersects(MapFlags::MAP_FIXED | MapFlags::MAP_FIXED_NOREPLACE) + { + let region = self .mappings .find_free(min_vaddr, size) - .ok_or(SystemError::ENOMEM); + .ok_or(SystemError::ENOMEM)?; + return Ok((region, VmaCloseNotifications::default())); } - // 如果指定了地址,那么就检查指定的地址是否可用。 - let requested = VirtRegion::new(vaddr, size); - - if requested.end() >= MMArch::USER_END_VADDR || !vaddr.check_aligned(MMArch::PAGE_SIZE) { + let end = vaddr.data().checked_add(size).ok_or(SystemError::EINVAL)?; + if size == 0 + || end > MMArch::USER_END_VADDR.data() + || !vaddr.check_aligned(MMArch::PAGE_SIZE) + { return Err(SystemError::EINVAL); } + if vaddr < min_vaddr { + if flags.intersects(MapFlags::MAP_FIXED | MapFlags::MAP_FIXED_NOREPLACE) { + check_mmap_min_addr(vaddr, min_vaddr)?; + } else { + let region = self + .mappings + .find_free(min_vaddr, size) + .ok_or(SystemError::ENOMEM)?; + return Ok((region, VmaCloseNotifications::default())); + } + } + + // 如果指定了地址,那么就检查指定的地址是否可用。 + let requested = VirtRegion::new(vaddr, size); + if self .mappings .first_reservation_conflict(requested) @@ -2486,7 +3706,7 @@ impl InnerAddressSpace { return Err(SystemError::EAGAIN_OR_EWOULDBLOCK); } - let has_conflict = self.mappings.conflicts(requested).next().is_some(); + let has_conflict = self.mappings.has_conflict(requested); if has_conflict { if flags.contains(MapFlags::MAP_FIXED_NOREPLACE) { // 如果指定了 MAP_FIXED_NOREPLACE 标志,由于所指定的地址无法成功建立映射,则放弃映射,不对地址做修正 @@ -2494,13 +3714,16 @@ impl InnerAddressSpace { } if flags.contains(MapFlags::MAP_FIXED) { + if 
!unmap_fixed { + return Ok((requested, VmaCloseNotifications::default())); + } // Linux mmap_region() unmaps the whole requested range for MAP_FIXED, // because the new mapping may overlap more than the first conflicting VMA. - self.munmap( + let notifications = self.munmap_collect( VirtPageFrame::new(requested.start()), PageFrameCount::from_bytes(requested.size()).unwrap(), )?; - return Ok(requested); + return Ok((requested, notifications)); } // 如果没有指定MAP_FIXED标志,那么就对地址做修正 @@ -2508,10 +3731,10 @@ impl InnerAddressSpace { .mappings .find_free(min_vaddr, size) .ok_or(SystemError::ENOMEM)?; - return Ok(requested); + return Ok((requested, VmaCloseNotifications::default())); } - return Ok(requested); + return Ok((requested, VmaCloseNotifications::default())); } } @@ -2572,6 +3795,8 @@ struct MmapReservation { pub struct UserMappings { /// 当前用户空间的虚拟内存区域 vmas: HashSet>, + /// 按起始地址索引的 VMA,用于地址查找、范围扫描和删除。 + vmas_by_start: BTreeMap>, /// 当前用户空间的VMA空洞 vm_holes: BTreeMap, /// 正在建立、但尚未发布为 VMA 的 mmap 地址预约。 @@ -2584,6 +3809,7 @@ impl UserMappings { pub fn new() -> Self { return Self { vmas: HashSet::new(), + vmas_by_start: BTreeMap::new(), vm_holes: core::iter::once((VirtAddr::new(0), MMArch::USER_END_VADDR.data())) .collect::>(), reservations: BTreeMap::new(), @@ -2627,13 +3853,12 @@ impl UserMappings { /// 如果有,返回包含指定虚拟地址的VMA的Arc指针,否则返回None。 #[allow(dead_code)] pub fn contains(&self, vaddr: VirtAddr) -> Option> { - for v in self.vmas.iter() { - let guard = v.lock(); - if guard.region.contains(vaddr) { - return Some(v.clone()); - } + let (_, vma) = self.vmas_by_start.range(..=vaddr).next_back()?; + if vma.lock().region.contains(vaddr) { + Some(vma.clone()) + } else { + None } - return None; } /// 向下寻找距离虚拟地址最近的VMA @@ -2646,34 +3871,70 @@ impl UserMappings { /// - None: 未找到VMA #[allow(dead_code)] pub fn find_nearest(&self, vaddr: VirtAddr) -> Option> { - let mut nearest: Option> = None; - for v in self.vmas.iter() { - let guard = v.lock(); - if guard.region.contains(vaddr) { - 
return Some(v.clone()); - } - // 向下寻找:选择起始地址大于 vaddr 的 VMA 中,起始地址最小的一个(最近的下一个VMA) - if guard.region.start > vaddr - && if let Some(ref current) = nearest { - guard.region.start < current.lock().region.start - } else { - true - } - { - nearest = Some(v.clone()); + if let Some(vma) = self.contains(vaddr) { + return Some(vma); + } + self.vmas_by_start + .range(vaddr..) + .next() + .map(|(_, vma)| vma.clone()) + } + + /// 获取当前进程的地址空间中,与给定虚拟地址范围有重叠的VMA。 + pub fn conflicts(&self, request: VirtRegion) -> Vec> { + let mut result = Vec::new(); + if let Some((start, vma)) = self.vmas_by_start.range(..=request.start()).next_back() { + if *start < request.start() && vma.lock().region.intersect(&request).is_some() { + result.push(vma.clone()); + } + } + for (_start, vma) in self.vmas_by_start.range(request.start()..request.end()) { + if vma.lock().region.intersect(&request).is_some() { + result.push(vma.clone()); } } - return nearest; + result } - /// 获取当前进程的地址空间中,与给定虚拟地址范围有重叠的VMA的迭代器。 - pub fn conflicts(&self, request: VirtRegion) -> impl Iterator> + '_ { - let r = self - .vmas - .iter() - .filter(move |v| v.lock().region.intersect(&request).is_some()) - .cloned(); - return r; + pub fn has_conflict(&self, request: VirtRegion) -> bool { + if let Some((start, vma)) = self.vmas_by_start.range(..=request.start()).next_back() { + if *start < request.start() && vma.lock().region.intersect(&request).is_some() { + return true; + } + } + self.vmas_by_start + .range(request.start()..request.end()) + .any(|(_start, vma)| vma.lock().region.intersect(&request).is_some()) + } + + pub fn conflicts_with_unmapped(&self, request: VirtRegion) -> (Vec>, bool) { + let conflicts = self.conflicts(request); + let mut cursor = request.start(); + let mut has_unmapped = false; + + for vma in &conflicts { + let vma_region = *vma.lock().region(); + if vma_region.start() > cursor { + has_unmapped = true; + } + if vma_region.end() > cursor { + cursor = cmp::min(vma_region.end(), request.end()); + } + } + if 
cursor < request.end() { + has_unmapped = true; + } + + (conflicts, has_unmapped) + } + + pub fn iter_vmas_starting_at( + &self, + start: VirtAddr, + ) -> impl Iterator> + '_ { + self.vmas_by_start + .range(start..) + .map(|(_start, vma)| vma.clone()) } pub fn first_reservation_conflict(&self, request: VirtRegion) -> Option { @@ -2698,7 +3959,7 @@ impl UserMappings { } fn region_available_for_reservation(&self, region: VirtRegion) -> bool { - self.conflicts(region).next().is_none() && self.first_reservation_conflict(region).is_none() + !self.has_conflict(region) && self.first_reservation_conflict(region).is_none() } fn reserve_region(&mut self, region: VirtRegion) -> Result { @@ -2831,10 +4092,11 @@ impl UserMappings { pub fn insert_vma(&mut self, vma: Arc) { let region = vma.lock().region; // 要求插入的地址范围必须是空闲的,也就是说,当前进程的地址空间中,不能有任何与之重叠的VMA。 - assert!(self.conflicts(region).next().is_none()); + assert!(!self.has_conflict(region)); self.reserve_hole(®ion); self.attach_vma(&vma); + self.vmas_by_start.insert(region.start(), vma.clone()); self.vmas.insert(vma); } @@ -2853,11 +4115,13 @@ impl UserMappings { /// - 会修改vm_holes中的空洞信息 /// pub fn remove_vma(&mut self, region: &VirtRegion) -> Option> { - // 请注意,由于这里会对每个VMA加锁,因此性能很低 - let vma: Arc = self - .vmas - .drain_filter(|vma| vma.lock().region == *region) - .next()?; + let vma = self.vmas_by_start.remove(®ion.start())?; + if vma.lock().region != *region { + self.vmas_by_start.insert(region.start(), vma); + return None; + } + let removed = self.vmas.remove(&vma); + debug_assert!(removed, "vmas_by_start and vmas diverged for {:?}", region); self.unreserve_hole(region); self.detach_vma(&vma); @@ -2883,6 +4147,7 @@ impl Default for UserMappings { pub struct LockedVMA { /// 用于计算哈希值,避免总是获取vma锁来计算哈希值 id: usize, + state_seq: AtomicU64, vma: Mutex, } @@ -2905,6 +4170,7 @@ impl LockedVMA { pub fn new(vma: VMA) -> Arc { let r = Arc::new(Self { id: LOCKEDVMA_ID_ALLOCATOR.lock().alloc().unwrap(), + state_seq: AtomicU64::new(0), vma: 
Mutex::new(vma), }); r.vma.lock().self_ref = Arc::downgrade(&r); @@ -2915,10 +4181,56 @@ impl LockedVMA { self.id } + pub fn state_seq(&self) -> u64 { + self.state_seq.load(Ordering::Acquire) + } + + fn bump_state_seq(&self) { + self.state_seq.fetch_add(1, Ordering::AcqRel); + } + pub fn lock(&self) -> MutexGuard<'_, VMA> { return self.vma.lock(); } + fn prepare_split_lifecycle( + &self, + intersection: VirtRegion, + ) -> Result { + let (original_region, sysv_shm) = { + let guard = self.lock(); + if intersection == *guard.region() { + return Ok(VmaSplitLifecycle::none()); + } + (*guard.region(), guard.sysv_shm()) + }; + let Some(sysv_shm) = sysv_shm else { + return Ok(VmaSplitLifecycle::none()); + }; + + let mut lifecycle = VmaSplitLifecycle { + sysv_shm: Some(sysv_shm.clone()), + open_count: 0, + committed: false, + }; + if original_region.before(&intersection).is_some() { + sysv_shm.open_vma()?; + lifecycle.open_count += 1; + } + if original_region.after(&intersection).is_some() { + if let Err(err) = sysv_shm.open_vma() { + for _ in 0..lifecycle.open_count { + sysv_shm.close_vma(); + } + lifecycle.open_count = 0; + lifecycle.committed = true; + return Err(err); + } + lifecycle.open_count += 1; + } + Ok(lifecycle) + } + /// 调整当前VMA的页面的标志位 /// /// TODO:增加调整虚拟页映射的物理地址的功能 @@ -2954,52 +4266,54 @@ impl LockedVMA { /// performs cross-core TLB shootdown first and then frees physical pages (INV-3). 
pub fn unmap(&self, mapper: &mut PageMapper, tlb: &mut MmuGather<'_>) { // todo: 如果当前vma与文件相关,完善文件相关的逻辑 - let mut self_guard = self.lock(); + let (region, should_wakeup_writeback) = { + let mut self_guard = self.lock(); + let region = *self_guard.region(); + self_guard.mapped = false; + let should_wakeup_writeback = self_guard.vm_file().is_some() + && self_guard + .vm_flags() + .contains(VmFlags::VM_SHARED | VmFlags::VM_WRITE); + (region, should_wakeup_writeback) + }; - // 获取物理页的anon_vma的守卫 - let mut page_manager_guard = page_manager_lock(); + let mut pages_to_reclassify = Vec::new(); + { + let mut page_manager_guard = page_manager_lock(); + for page in region.pages() { + if mapper.translate(page.virt_address()).is_none() { + continue; + } + let (paddr, _, flush, freed_tables) = + unsafe { mapper.unmap_phys_with_freed_tables(page.virt_address(), true) } + .expect("Failed to unmap, beacuse of some page is not mapped"); - // 获取映射的物理地址 - if let Some((paddr, _flags)) = mapper.translate(self_guard.region().start()) { - // 如果是共享页,执行释放操作 - let page = page_manager_guard.get(&paddr).unwrap(); - let _page_guard = page.read(); - if let Some(shm_id) = self_guard.shm_id { - let ipcns = ProcessManager::current_ipcns(); - let mut shm_manager_guard = ipcns.shm.lock(); - if let Some(kernel_shm) = shm_manager_guard.get_mut(&shm_id) { - // 更新最后一次断开连接时间 - kernel_shm.update_dtim(); - - // 映射计数减少 - kernel_shm.decrease_count(); - - // 释放shm_id - if kernel_shm.map_count() == 0 && kernel_shm.mode().contains(ShmFlags::SHM_DEST) - { - shm_manager_guard.free_id(&shm_id); - } + // 从anon_vma中删除当前VMA + let page_arc = page_manager_guard.get_unwrap(&paddr); + { + let mut page_guard = page_arc.write(); + page_guard.remove_vma(self); + } + pages_to_reclassify.push((paddr, page_arc)); + + // Local PTE cleared; no immediate invlpg. Final TLB invalidation is performed uniformly by MmuGather. 
+ unsafe { flush.ignore() }; + tlb.accumulate_range(page.virt_address()); + if freed_tables { + tlb.note_pt_table_freed(); } } } - for page in self_guard.region.pages() { - if mapper.translate(page.virt_address()).is_none() { - continue; - } - let (paddr, _, flush, freed_tables) = - unsafe { mapper.unmap_phys_with_freed_tables(page.virt_address(), true) } - .expect("Failed to unmap, beacuse of some page is not mapped"); - - // 从anon_vma中删除当前VMA - let page_arc = page_manager_guard.get_unwrap(&paddr); - let can_dealloc = { - let mut page_guard = page_arc.write(); - page_guard.remove_vma(self); - // The physical page's VMA list length is 0 and it is not marked as non-reclaimable, so it can be freed. - // TODO: LRU-based physical page reclamation in the future - page_guard.can_deallocate() - }; + for (_, page_arc) in &pages_to_reclassify { + InnerAddressSpace::remove_page_unevictable_if_unneeded(page_arc); + } + + let mut page_manager_guard = page_manager_lock(); + for (paddr, page_arc) in pages_to_reclassify { + // The physical page's VMA list length is 0 and it is not marked as non-reclaimable, so it can be freed. + // TODO: LRU-based physical page reclamation in the future + let can_dealloc = page_arc.read().can_deallocate_after_vma_unmap(); if can_dealloc { // Remove this `Arc` from page_manager, deferring the drop until after TLB shootdown. @@ -3008,22 +4322,10 @@ impl LockedVMA { tlb.stash_page(p); } } - - // Local PTE cleared; no immediate invlpg. Final TLB invalidation is performed uniformly by MmuGather. 
- unsafe { flush.ignore() }; - tlb.accumulate_range(page.virt_address()); - if freed_tables { - tlb.note_pt_table_freed(); - } } - self_guard.mapped = false; // 当vma对应共享文件的写映射时,唤醒脏页回写线程 - if self_guard.vm_file().is_some() - && self_guard - .vm_flags() - .contains(VmFlags::VM_SHARED | VmFlags::VM_WRITE) - { + if should_wakeup_writeback { crate::mm::page::PageReclaimer::wakeup_claim_thread(); } } @@ -3050,55 +4352,68 @@ impl LockedVMA { .and_then(|file| file.inode().page_cache()); drop(self_guard); - let mut page_manager_guard = page_manager_lock(); - for page in intersection.pages() { - let virt = page.virt_address(); - let Some((paddr, _)) = mapper.translate(virt) else { - continue; - }; - - let page_arc = page_manager_guard.get_unwrap(&paddr); - if let Some(page_cache) = file_page_cache.as_ref() { - let Some(base_pgoff) = backing_pgoff else { + let mut pages_to_reclassify = Vec::new(); + { + let mut page_manager_guard = page_manager_lock(); + for page in intersection.pages() { + let virt = page.virt_address(); + let Some((paddr, _)) = mapper.translate(virt) else { continue; }; - let pgoff = base_pgoff + ((virt.data() - vma_start.data()) >> MMArch::PAGE_SHIFT); - let page_guard = page_arc.read(); - let is_target_page = match page_guard.page_type() { - PageType::File(info) if info.index == pgoff => info - .page_cache - .upgrade() - .is_some_and(|mapped_cache| Arc::ptr_eq(&mapped_cache, page_cache)), - // Truncate must also zap private COW pages. For file VMAs those pages are - // represented as normal pages, while shared file mappings remain page-cache - // backed and are covered by the PageType::File branch above. 
- PageType::Normal if mode == UnmapMappingMode::EvenCow => true, - _ => false, - }; - drop(page_guard); - if !is_target_page { + + let page_arc = page_manager_guard.get_unwrap(&paddr); + if let Some(page_cache) = file_page_cache.as_ref() { + let Some(base_pgoff) = backing_pgoff else { + continue; + }; + let pgoff = + base_pgoff + ((virt.data() - vma_start.data()) >> MMArch::PAGE_SHIFT); + let page_guard = page_arc.read(); + let is_target_page = match page_guard.page_type() { + PageType::File(info) if info.index == pgoff => info + .page_cache + .upgrade() + .is_some_and(|mapped_cache| Arc::ptr_eq(&mapped_cache, page_cache)), + // Truncate must also zap private COW pages. For file VMAs those pages are + // represented as normal pages, while shared file mappings remain page-cache + // backed and are covered by the PageType::File branch above. + PageType::Normal if mode == UnmapMappingMode::EvenCow => true, + _ => false, + }; + drop(page_guard); + if !is_target_page { + continue; + } + } + + let Some((paddr, _, flush)) = (unsafe { mapper.unmap_phys_preserve_tables(virt) }) + else { continue; + }; + + { + let mut page_guard = page_arc.write(); + page_guard.remove_vma(self); } + pages_to_reclassify.push((paddr, page_arc)); + + unsafe { flush.ignore() }; + tlb.accumulate_range(virt); } + } - let Some((paddr, _, flush)) = (unsafe { mapper.unmap_phys_preserve_tables(virt) }) - else { - continue; - }; + for (_, page_arc) in &pages_to_reclassify { + InnerAddressSpace::remove_page_unevictable_if_unneeded(page_arc); + } - let can_dealloc = { - let mut page_guard = page_arc.write(); - page_guard.remove_vma(self); - page_guard.can_deallocate() - }; + let mut page_manager_guard = page_manager_lock(); + for (paddr, page_arc) in pages_to_reclassify { + let can_dealloc = page_arc.read().can_deallocate_after_vma_unmap(); if can_dealloc { if let Some(p) = page_manager_guard.remove_page(&paddr) { tlb.stash_page(p); } } - - unsafe { flush.ignore() }; - tlb.accumulate_range(virt); } } @@ 
-3196,6 +4511,7 @@ impl LockedVMA { vma }); + let vma_mlocked = guard.vm_flags().contains(VmFlags::VM_LOCKED); // 重新设置before、after这两个VMA里面的物理页的anon_vma let mut page_manager_guard = page_manager_lock(); if let Some(before) = before.clone() { @@ -3204,7 +4520,7 @@ impl LockedVMA { if let Some((paddr, _)) = utable.translate(frame.virt_address()) { let page = page_manager_guard.get_unwrap(&paddr); let mut page_guard = page.write(); - page_guard.insert_vma(before.clone()); + page_guard.insert_vma(before.clone(), vma_mlocked); page_guard.remove_vma(self); before.lock().mapped = true; } @@ -3216,7 +4532,7 @@ impl LockedVMA { if let Some((paddr, _)) = utable.translate(frame.virt_address()) { let page = page_manager_guard.get_unwrap(&paddr); let mut page_guard = page.write(); - page_guard.insert_vma(after.clone()); + page_guard.insert_vma(after.clone(), vma_mlocked); page_guard.remove_vma(self); after.lock().mapped = true; } @@ -3336,7 +4652,6 @@ pub struct PhysmapParams { pub count: PageFrameCount, pub vm_flags: VmFlags, pub flags: EntryFlags, - pub shm_id: Option, } /// @brief 虚拟内存区域 @@ -3359,8 +4674,8 @@ pub struct VMA { backing_pgoff: Option, provider: Provider, - /// 关联的 SysV SHM 标识(当此 VMA 来自 shmat 时设置) - shm_id: Option, + /// SysV SHM attach 身份,用于 Linux 风格 VMA open/close 生命周期。 + sysv_shm: Option>, /// 共享匿名映射的稳定身份(用于跨进程共享 futex key) pub(crate) shared_anon: Option>, } @@ -3423,8 +4738,7 @@ impl AnonSharedMapping { let mut pm = page_manager_lock(); let mut allocator = LockedFrameAllocator; let page = pm.create_one_page(PageType::Normal, PageFlags::empty(), &mut allocator)?; - // Mark shared-anon pages as unevictable so shrinking/unmapping doesn't drop their contents. 
- page.write().add_flags(PageFlags::PG_UNEVICTABLE); + page.write().add_backing_lifetime_pin(); guard.insert(pgoff, page.phys_address()); Ok(page) } @@ -3442,7 +4756,7 @@ impl Drop for AnonSharedMapping { for paddr in pages { if let Some(page) = pm.get(&paddr) { let mut pg = page.write(); - pg.remove_flags(PageFlags::PG_UNEVICTABLE); + pg.remove_backing_lifetime_pin(); if pg.can_deallocate() { drop(pg); pm.remove_page(&paddr); @@ -3472,7 +4786,7 @@ impl VMA { provider: Provider::Allocated, vm_file: file, backing_pgoff: pgoff, - shm_id: None, + sysv_shm: None, shared_anon: None, } } @@ -3494,7 +4808,13 @@ impl VMA { } pub fn set_vm_flags(&mut self, vm_flags: VmFlags) { + let changed = self.vm_flags != vm_flags; self.vm_flags = vm_flags; + if changed { + if let Some(vma) = self.self_ref.upgrade() { + vma.bump_state_seq(); + } + } } pub fn set_region_size(&mut self, new_region_size: usize) { @@ -3510,8 +4830,13 @@ impl VMA { } #[inline(always)] - pub fn set_shm_id(&mut self, shm: Option) { - self.shm_id = shm; + pub fn set_sysv_shm(&mut self, sysv_shm: Option>) { + self.sysv_shm = sysv_shm; + } + + #[inline(always)] + pub fn sysv_shm(&self) -> Option> { + self.sysv_shm.clone() } /// # 拷贝当前VMA的内容 @@ -3530,7 +4855,7 @@ impl VMA { provider: Provider::Allocated, backing_pgoff: self.backing_pgoff, vm_file: self.vm_file.clone(), - shm_id: self.shm_id, + sysv_shm: self.sysv_shm.clone(), shared_anon: self.shared_anon.clone(), }; } @@ -3546,7 +4871,7 @@ impl VMA { provider: Provider::Allocated, backing_pgoff: self.backing_pgoff, vm_file: self.vm_file.clone(), - shm_id: self.shm_id, + sysv_shm: self.sysv_shm.clone(), shared_anon: self.shared_anon.clone(), }; } @@ -3578,7 +4903,7 @@ impl VMA { flags: EntryFlags, mapper: &mut PageMapper, tlb: &mut MmuGather<'_>, - ) -> Result<(), SystemError> { + ) { let pte_flags = if self.vm_file.is_some() && self .vm_flags @@ -3603,7 +4928,6 @@ impl VMA { // debug!("remap page {:?} done", page.virt_address()); } self.flags = flags; - return 
Ok(()); } /// 检查当前VMA是否可以拥有指定的标志位 @@ -3684,17 +5008,14 @@ impl VMA { None, true, )); - if let Some(id) = params.shm_id { - r.lock().set_shm_id(Some(id)); - } - // 将VMA加入到anon_vma中 let mut page_manager_guard = page_manager_lock(); cur_phy = params.phys; + let vma_mlocked = params.vm_flags.contains(VmFlags::VM_LOCKED); for _ in 0..params.count.data() { let paddr = cur_phy.phys_address(); let page = page_manager_guard.get_unwrap(&paddr); - page.write().insert_vma(r.clone()); + page.write().insert_vma(r.clone(), vma_mlocked); cur_phy = cur_phy.next(); } @@ -3727,6 +5048,7 @@ impl VMA { pgoff: Option, ) -> Result, SystemError> { let mut cur_dest: VirtPageFrame = destination; + let mut mapped_pages = Vec::new(); // debug!( // "VMA::zeroed: page_count = {:?}, destination={destination:?}", // page_count @@ -3736,11 +5058,22 @@ impl VMA { // "VMA::zeroed: cur_dest={cur_dest:?}, vaddr = {:?}", // cur_dest.virt_address() // ); - let r = - unsafe { mapper.map(cur_dest.virt_address(), flags) }.ok_or(SystemError::ENOMEM)?; + let Some(r) = (unsafe { mapper.map(cur_dest.virt_address(), flags) }) else { + let mut page_manager_guard = page_manager_lock(); + for mapped in mapped_pages.into_iter().rev() { + if let Some((paddr, _flags, flush, _freed_tables)) = + unsafe { mapper.unmap_phys_with_freed_tables(mapped, true) } + { + flusher.consume(flush); + let _ = page_manager_guard.remove_page(&paddr); + } + } + return Err(SystemError::ENOMEM); + }; // 稍后再刷新TLB,这里取消刷新 flusher.consume(r); + mapped_pages.push(cur_dest.virt_address()); cur_dest = cur_dest.next(); } let r = LockedVMA::new(VMA::new( @@ -3761,12 +5094,13 @@ impl VMA { let mut page_manager_guard = page_manager_lock(); let virt_iter: VirtPageFrameIter = VirtPageFrameIter::new(destination, destination.add(page_count)); + let vma_mlocked = vm_flags.contains(VmFlags::VM_LOCKED); for frame in virt_iter { let paddr = mapper.translate(frame.virt_address()).unwrap().0; // 将VMA加入到anon_vma let page = 
page_manager_guard.get_unwrap(&paddr); - page.write().insert_vma(r.clone()); + page.write().insert_vma(r.clone(), vma_mlocked); } // debug!("VMA::zeroed: done"); return Ok(r); diff --git a/user/apps/tests/dunitest/suites/normal/mlock_semantics.cc b/user/apps/tests/dunitest/suites/normal/mlock_semantics.cc index a572481e74..a05844921e 100644 --- a/user/apps/tests/dunitest/suites/normal/mlock_semantics.cc +++ b/user/apps/tests/dunitest/suites/normal/mlock_semantics.cc @@ -42,6 +42,10 @@ #define MREMAP_DONTUNMAP 4 #endif +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0x100000 +#endif + namespace { size_t PageSize() { @@ -445,7 +449,81 @@ TEST(Mremap, DontUnmapClearsSourceLock) { EXPECT_EQ(0, munmap(addr, ps)); } -TEST(Mremap, DuplicateOldLenZeroClearsSourceLock) { +TEST(Mremap, DontUnmapPartialRangeClearsWholeSourceLock) { + const size_t ps = PageSize(); + ScopedMemlockLimit lim; + ASSERT_TRUE(lim.valid()); + ASSERT_TRUE(lim.set_bytes(ps * 3)); + + auto* addr = static_cast( + mmap(nullptr, ps * 3, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + ASSERT_NE(MAP_FAILED, addr); + ASSERT_EQ(0, mlock(addr, ps * 3)); + + errno = 0; + void* moved = reinterpret_cast( + syscall(SYS_mremap, addr + ps, ps, ps, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0)); + ASSERT_NE(MAP_FAILED, moved) << "errno=" << errno << " (" << strerror(errno) << ")"; + + errno = 0; + EXPECT_EQ(0, msync(addr, ps, MS_INVALIDATE)) + << "errno=" << errno << " (" << strerror(errno) << ")"; + + errno = 0; + EXPECT_EQ(0, msync(addr + ps, ps, MS_INVALIDATE)) + << "errno=" << errno << " (" << strerror(errno) << ")"; + + errno = 0; + EXPECT_EQ(0, msync(addr + 2 * ps, ps, MS_INVALIDATE)) + << "errno=" << errno << " (" << strerror(errno) << ")"; + + errno = 0; + EXPECT_EQ(-1, msync(moved, ps, MS_INVALIDATE)); + EXPECT_EQ(EBUSY, errno); + + EXPECT_EQ(0, munmap(moved, ps)); + EXPECT_EQ(0, munmap(addr, ps * 3)); +} + +TEST(Mremap, DontUnmapRejectsUnalignedNewAddress) { + const size_t ps = PageSize(); 
+ auto* addr = static_cast( + mmap(nullptr, ps, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + ASSERT_NE(MAP_FAILED, addr); + + errno = 0; + void* moved = reinterpret_cast( + syscall(SYS_mremap, addr, ps, ps, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, addr + 1)); + EXPECT_EQ(MAP_FAILED, moved); + EXPECT_EQ(EINVAL, errno); + + addr[0] = 0x5a; + EXPECT_EQ(0x5a, addr[0]); + EXPECT_EQ(0, munmap(addr, ps)); +} + +TEST(Mremap, DontUnmapUsesNewAddressHintWhenAvailable) { + const size_t ps = PageSize(); + auto* addr = static_cast( + mmap(nullptr, ps, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + ASSERT_NE(MAP_FAILED, addr); + + void* hint = mmap(nullptr, ps, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, hint); + ASSERT_EQ(0, munmap(hint, ps)); + + errno = 0; + void* moved = + reinterpret_cast(syscall(SYS_mremap, addr, ps, ps, + MREMAP_MAYMOVE | MREMAP_DONTUNMAP, hint)); + ASSERT_NE(MAP_FAILED, moved) << "errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_EQ(hint, moved); + + EXPECT_EQ(0, munmap(moved, ps)); + EXPECT_EQ(0, munmap(addr, ps)); +} + +TEST(Mremap, DuplicateOldLenZeroKeepsSourceLock) { const size_t ps = PageSize(); ScopedMemlockLimit lim; ASSERT_TRUE(lim.valid()); @@ -466,17 +544,50 @@ TEST(Mremap, DuplicateOldLenZeroClearsSourceLock) { EXPECT_EQ(0, munlock(dup, ps)) << "errno=" << errno << " (" << strerror(errno) << ")"; - // Linux 6.6 mremap duplicate-mapping semantics clear VM_LOCKED on the - // source VMA, so once the duplicated destination is munlocked the source - // alias must no longer behave as locked. + // Linux 6.6 only clears VM_LOCKED on the old VMA for MREMAP_DONTUNMAP. + // The legacy old_len==0 duplicate path keeps the source VMA locked while + // also creating a locked duplicate. 
errno = 0; - EXPECT_EQ(0, msync(addr, ps, MS_INVALIDATE)) << "errno=" << errno << " (" << strerror(errno) - << ")"; + EXPECT_EQ(-1, msync(addr, ps, MS_INVALIDATE)) << "errno=" << errno << " (" << strerror(errno) + << ")"; + EXPECT_EQ(EBUSY, errno); EXPECT_EQ(0, munmap(dup, ps)); EXPECT_EQ(0, munmap(addr, ps)); } +TEST(Madvise, DontNeedKeepsLinuxOrderedSideEffectsBeforeLockedVmaError) { + const size_t ps = PageSize(); + ScopedMemlockLimit lim; + ASSERT_TRUE(lim.valid()); + ASSERT_TRUE(lim.set_bytes(ps * 2)); + + auto* first = static_cast( + mmap(nullptr, ps, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + ASSERT_NE(MAP_FAILED, first); + memset(first, 0x5a, ps); + + auto* second = static_cast( + mmap(first + ps, ps, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0)); + if (second == MAP_FAILED) { + EXPECT_EQ(0, munmap(first, ps)); + GTEST_SKIP() << "failed to reserve adjacent VMA: errno=" << errno << " (" + << strerror(errno) << ")"; + } + ASSERT_EQ(first + ps, second); + ASSERT_EQ(0, mlock(second, ps)) << "errno=" << errno << " (" << strerror(errno) << ")"; + + errno = 0; + EXPECT_EQ(-1, madvise(first, ps * 2, MADV_DONTNEED)); + EXPECT_EQ(EINVAL, errno); + + EXPECT_EQ('\0', first[0]) + << "Linux applies MADV_DONTNEED to earlier valid VMAs before returning EINVAL for a later locked VMA"; + + EXPECT_EQ(0, munmap(first, ps * 2)); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/user/apps/tests/dunitest/suites/normal/sysv_shm_semantics.cc b/user/apps/tests/dunitest/suites/normal/sysv_shm_semantics.cc new file mode 100644 index 0000000000..cd61861b79 --- /dev/null +++ b/user/apps/tests/dunitest/suites/normal/sysv_shm_semantics.cc @@ -0,0 +1,1909 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef SYS_mremap +#define SYS_mremap 25 +#endif + 
+#ifndef MREMAP_MAYMOVE +#define MREMAP_MAYMOVE 1 +#endif + +#ifndef MREMAP_FIXED +#define MREMAP_FIXED 2 +#endif + +#ifndef MREMAP_DONTUNMAP +#define MREMAP_DONTUNMAP 4 +#endif + +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0x100000 +#endif + +#ifndef SHM_EXEC +#define SHM_EXEC 0100000 +#endif + +#ifndef SHM_STAT +#define SHM_STAT 13 +#endif + +#ifndef SHM_STAT_ANY +#define SHM_STAT_ANY 15 +#endif + +#ifndef IPC_INFO +#define IPC_INFO 3 +#endif + +#ifndef SHM_INFO +#define SHM_INFO 14 +#endif + +#ifndef SHM_LOCKED +#define SHM_LOCKED 02000 +#endif + +namespace { + +size_t PageSize() { + const long ps = sysconf(_SC_PAGESIZE); + return ps > 0 ? static_cast(ps) : 4096; +} + +size_t CurrentVmSizeBytes() { + FILE* fp = fopen("/proc/self/status", "r"); + if (fp == nullptr) { + return 0; + } + + char line[256]; + size_t kb = 0; + while (fgets(line, sizeof(line), fp) != nullptr) { + if (sscanf(line, "VmSize: %zu kB", &kb) == 1) { + break; + } + } + fclose(fp); + return kb * 1024; +} + +size_t SegmentSize() { + return PageSize() * 4; +} + +uintptr_t MmapMinAddr() { + return 65536; +} + +class ScopedMemlockLimit { + public: + ScopedMemlockLimit() : valid_(getrlimit(RLIMIT_MEMLOCK, &saved_) == 0) {} + + ~ScopedMemlockLimit() { + if (valid_) { + (void)setrlimit(RLIMIT_MEMLOCK, &saved_); + } + } + + bool valid() const { + return valid_; + } + + bool set_bytes(size_t bytes) { + if (!valid_) { + return false; + } + rlim_t want = static_cast(bytes); + if (saved_.rlim_max != RLIM_INFINITY && want > saved_.rlim_max) { + return false; + } + struct rlimit lim = saved_; + lim.rlim_cur = want; + return setrlimit(RLIMIT_MEMLOCK, &lim) == 0; + } + + private: + struct rlimit saved_ {}; + bool valid_ = false; +}; + +class ScopedAddressSpaceLimit { + public: + ScopedAddressSpaceLimit() : valid_(getrlimit(RLIMIT_AS, &saved_) == 0) {} + + ~ScopedAddressSpaceLimit() { + if (valid_) { + (void)setrlimit(RLIMIT_AS, &saved_); + } + } + + bool valid() const { + return valid_; + } + + 
bool set_bytes(size_t bytes) { + if (!valid_) { + return false; + } + rlim_t want = static_cast(bytes); + if (saved_.rlim_max != RLIM_INFINITY && want > saved_.rlim_max) { + return false; + } + struct rlimit lim = saved_; + lim.rlim_cur = want; + return setrlimit(RLIMIT_AS, &lim) == 0; + } + + bool restore() { + if (!valid_) { + return false; + } + return setrlimit(RLIMIT_AS, &saved_) == 0; + } + + private: + struct rlimit saved_ {}; + bool valid_ = false; +}; + +key_t UniqueKey() { + static int seq = 0; + return static_cast(0x53000000 ^ (getpid() << 8) ^ (++seq)); +} + +class ShmSegment { + public: + explicit ShmSegment(size_t size, int flags = IPC_CREAT | 0600) { + id_ = shmget(IPC_PRIVATE, size, flags); + } + + ShmSegment(key_t key, size_t size, int flags) { + id_ = shmget(key, size, flags); + } + + ~ShmSegment() { + if (id_ >= 0 && owns_) { + shmctl(id_, IPC_RMID, nullptr); + } + } + + ShmSegment(const ShmSegment&) = delete; + ShmSegment& operator=(const ShmSegment&) = delete; + + bool valid() const { + return id_ >= 0; + } + + int id() const { + return id_; + } + + int release() { + owns_ = false; + return id_; + } + + private: + int id_ = -1; + bool owns_ = true; +}; + +int ShmNattch(int shmid) { + struct shmid_ds ds; + if (shmctl(shmid, IPC_STAT, &ds) != 0) { + return -1; + } + return static_cast(ds.shm_nattch); +} + +void* Attach(int shmid, int flags = 0) { + void* addr = shmat(shmid, nullptr, flags); + return addr == reinterpret_cast(-1) ? 
MAP_FAILED : addr; +} + +void ExpectChildDiesBySignal(int signo, void (*fn)(void*), void* arg) { + const pid_t child = fork(); + ASSERT_GE(child, 0) << "fork failed: errno=" << errno << " (" << strerror(errno) << ")"; + + if (child == 0) { + fn(arg); + _exit(0); + } + + int status = 0; + ASSERT_EQ(child, waitpid(child, &status, 0)) + << "waitpid failed: errno=" << errno << " (" << strerror(errno) << ")"; + ASSERT_TRUE(WIFSIGNALED(status)) << "child exited without signal, status=" << status; + EXPECT_EQ(signo, WTERMSIG(status)) << "unexpected signal, status=" << status; +} + +void WriteFirstByte(void* arg) { + volatile char* p = static_cast(arg); + p[0] = 'x'; +} + +void WaitChildOk(pid_t child) { + int status = 0; + ASSERT_EQ(child, waitpid(child, &status, 0)) + << "waitpid failed: errno=" << errno << " (" << strerror(errno) << ")"; + ASSERT_TRUE(WIFEXITED(status)) << "child did not exit normally, status=" << status; + EXPECT_EQ(0, WEXITSTATUS(status)) << "child failed, status=" << status; +} + +void DropPageCache() { + int fd = open("/proc/sys/vm/drop_caches", O_WRONLY); + ASSERT_GE(fd, 0) << "open(drop_caches) failed: errno=" << errno << " (" << strerror(errno) + << ")"; + const char value[] = "1\n"; + ASSERT_EQ(static_cast(sizeof(value) - 1), write(fd, value, sizeof(value) - 1)) + << "write(drop_caches) failed: errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_EQ(0, close(fd)); +} + +int RunShmExecPermissionScenario() { + int id = shmget(IPC_PRIVATE, SegmentSize(), IPC_CREAT | 0600); + if (id < 0) { + return 1; + } + + errno = 0; + void* denied = shmat(id, nullptr, SHM_EXEC); + if (denied != reinterpret_cast(-1) || errno != EACCES) { + if (denied != reinterpret_cast(-1)) { + shmdt(denied); + } + shmctl(id, IPC_RMID, nullptr); + return 2; + } + + void* addr = shmat(id, nullptr, 0); + if (addr == reinterpret_cast(-1)) { + shmctl(id, IPC_RMID, nullptr); + return 3; + } + if (mprotect(addr, SegmentSize(), PROT_READ | PROT_EXEC) != 0) { + shmdt(addr); + 
shmctl(id, IPC_RMID, nullptr); + return 4; + } + shmdt(addr); + shmctl(id, IPC_RMID, nullptr); + + id = shmget(IPC_PRIVATE, SegmentSize(), IPC_CREAT | 0700); + if (id < 0) { + return 5; + } + void* exec_addr = shmat(id, nullptr, SHM_EXEC); + if (exec_addr == reinterpret_cast(-1)) { + shmctl(id, IPC_RMID, nullptr); + return 6; + } + shmdt(exec_addr); + shmctl(id, IPC_RMID, nullptr); + return 0; +} + +int RunMremapFixedLowAddressOrderScenario() { + const size_t ps = PageSize(); + char* source = static_cast( + mmap(nullptr, ps * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + if (source == MAP_FAILED) { + return 1; + } + source[0] = 'a'; + source[ps] = 'b'; + + void* low_target = reinterpret_cast(ps); + errno = 0; + void* moved = reinterpret_cast( + syscall(SYS_mremap, source, ps * 2, ps, MREMAP_MAYMOVE | MREMAP_FIXED, low_target)); + if (moved != MAP_FAILED || errno != EPERM) { + if (moved != MAP_FAILED) { + munmap(moved, ps); + } else { + munmap(source, ps * 2); + } + return 2; + } + + if (source[0] != 'a') { + munmap(source, ps); + return 3; + } + + void* tail = mmap(source + ps, ps, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0); + if (tail != source + ps) { + munmap(source, ps); + return errno == EEXIST ? 
4 : 5; + } + munmap(tail, ps); + munmap(source, ps); + + char* private_source = static_cast( + mmap(nullptr, ps, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + if (private_source == MAP_FAILED) { + return 6; + } + + errno = 0; + moved = reinterpret_cast(syscall(SYS_mremap, private_source, 0, ps, + MREMAP_MAYMOVE | MREMAP_FIXED, low_target)); + const int saved_errno = errno; + munmap(private_source, ps); + if (moved != MAP_FAILED || saved_errno != EINVAL) { + if (moved != MAP_FAILED) { + munmap(moved, ps); + } + return 7; + } + + return 0; +} + +int RunShmdtReleasesLockedVmScenario() { + ScopedMemlockLimit lim; + if (!lim.valid() || !lim.set_bytes(SegmentSize())) { + return 1; + } + + ShmSegment shm(SegmentSize()); + if (!shm.valid()) { + return 2; + } + + char* addr = static_cast(Attach(shm.id())); + if (addr == MAP_FAILED) { + return 3; + } + if (mlock(addr, SegmentSize()) != 0) { + shmdt(addr); + return 4; + } + if (shmdt(addr) != 0) { + return 5; + } + + void* probe = mmap(nullptr, SegmentSize(), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (probe == MAP_FAILED) { + return 6; + } + const int lock_result = mlock(probe, SegmentSize()); + const int saved_errno = errno; + munlock(probe, SegmentSize()); + munmap(probe, SegmentSize()); + if (lock_result != 0) { + return saved_errno == ENOMEM ? 
7 : 8; + } + return 0; +} + +int RunLockedRmidLastDetachReleasesMemlockScenario() { + ScopedMemlockLimit lim; + if (!lim.valid() || !lim.set_bytes(SegmentSize())) { + return 1; + } + + ShmSegment first(SegmentSize()); + if (!first.valid()) { + return 2; + } + char* addr = static_cast(Attach(first.id())); + if (addr == MAP_FAILED) { + return 3; + } + if (shmctl(first.id(), SHM_LOCK, nullptr) != 0) { + shmdt(addr); + return 4; + } + if (shmctl(first.id(), IPC_RMID, nullptr) != 0) { + shmdt(addr); + return 5; + } + if (shmdt(addr) != 0) { + return 6; + } + + ShmSegment second(SegmentSize()); + if (!second.valid()) { + return 7; + } + if (shmctl(second.id(), SHM_LOCK, nullptr) != 0) { + return errno == ENOMEM ? 8 : 9; + } + if (shmctl(second.id(), SHM_UNLOCK, nullptr) != 0) { + return 10; + } + return 0; +} + +constexpr int kIpcIdIndexMask = (1 << 15) - 1; + +} // namespace + +TEST(SysvShmSemantics, ReusedLowIndexRejectsStaleFullId) { + ShmSegment first(SegmentSize()); + ASSERT_TRUE(first.valid()) << "first shmget failed: errno=" << errno << " (" + << strerror(errno) << ")"; + const int stale_id = first.release(); + const int stale_idx = stale_id & kIpcIdIndexMask; + ASSERT_EQ(0, shmctl(stale_id, IPC_RMID, nullptr)) + << "IPC_RMID failed: errno=" << errno << " (" << strerror(errno) << ")"; + + ShmSegment second(SegmentSize()); + ASSERT_TRUE(second.valid()) << "second shmget failed: errno=" << errno << " (" + << strerror(errno) << ")"; + const int current_id = second.id(); + const int current_idx = current_id & kIpcIdIndexMask; + if (current_idx != stale_idx || current_id == stale_id) { + GTEST_SKIP() << "allocator did not immediately reuse the low IPC index"; + } + + struct shmid_ds ds; + errno = 0; + EXPECT_EQ(-1, shmctl(stale_id, IPC_STAT, &ds)); + EXPECT_EQ(EINVAL, errno); + + errno = 0; + void* stale_attach = shmat(stale_id, nullptr, 0); + EXPECT_EQ(reinterpret_cast(-1), stale_attach); + EXPECT_EQ(EINVAL, errno); + + errno = 0; + EXPECT_EQ(current_id, 
shmctl(current_idx, SHM_STAT, &ds)); + EXPECT_EQ(0, errno); + + errno = 0; + EXPECT_EQ(current_id, shmctl(current_idx, SHM_STAT_ANY, &ds)); + EXPECT_EQ(0, errno); +} + +TEST(SysvShmSemantics, ShmInfoReturnsCurrentMaxIndex) { + ShmSegment first(SegmentSize()); + ASSERT_TRUE(first.valid()) << "first shmget failed: errno=" << errno << " (" + << strerror(errno) << ")"; + ShmSegment second(SegmentSize()); + ASSERT_TRUE(second.valid()) << "second shmget failed: errno=" << errno << " (" + << strerror(errno) << ")"; + + const int expected_min_max_idx = + std::max(first.id() & kIpcIdIndexMask, second.id() & kIpcIdIndexMask); + struct shm_info info; + memset(&info, 0, sizeof(info)); + errno = 0; + const int max_idx = + shmctl(0, SHM_INFO, reinterpret_cast(&info)); + ASSERT_GE(max_idx, 0) << "SHM_INFO failed: errno=" << errno << " (" << strerror(errno) + << ")"; + EXPECT_GE(max_idx, expected_min_max_idx); + EXPECT_GE(info.used_ids, 2); + + char* addr = static_cast(Attach(first.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'r'; + memset(&info, 0, sizeof(info)); + ASSERT_GE(shmctl(0, SHM_INFO, reinterpret_cast(&info)), 0) + << "SHM_INFO after fault failed: errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_GE(info.shm_rss, 1UL); + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, ShmctlRejectsNegativeIdBeforeInfoCommands) { + struct shminfo ipc_info; + memset(&ipc_info, 0, sizeof(ipc_info)); + errno = 0; + EXPECT_EQ(-1, shmctl(-1, IPC_INFO, reinterpret_cast(&ipc_info))); + EXPECT_EQ(EINVAL, errno); + + struct shm_info info; + memset(&info, 0, sizeof(info)); + errno = 0; + EXPECT_EQ(-1, shmctl(-1, SHM_INFO, reinterpret_cast(&info))); + EXPECT_EQ(EINVAL, errno); +} + +TEST(SysvShmSemantics, NegativeKeyRoundTripsThroughIpcStat) { + const key_t key = static_cast(-1); + const int id = shmget(key, SegmentSize(), IPC_CREAT | IPC_EXCL | 0600); + if (id < 0 && errno == EEXIST) { + GTEST_SKIP() << "negative key already exists"; + } + ASSERT_GE(id, 0) << "shmget 
negative key failed: errno=" << errno << " (" << strerror(errno) + << ")"; + + struct shmid_ds ds; + ASSERT_EQ(0, shmctl(id, IPC_STAT, &ds)); + EXPECT_EQ(key, ds.shm_perm.__key); + EXPECT_EQ(0, shmctl(id, IPC_RMID, nullptr)); +} + +TEST(SysvShmSemantics, ShmLockUnlockUpdatesMode) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()) << "shmget failed: errno=" << errno << " (" << strerror(errno) + << ")"; + + struct shmid_ds ds; + ASSERT_EQ(0, shmctl(shm.id(), IPC_STAT, &ds)); + EXPECT_EQ(0u, ds.shm_perm.mode & SHM_LOCKED); + const time_t original_ctime = ds.shm_ctime; + sleep(1); + + ASSERT_EQ(0, shmctl(shm.id(), SHM_LOCK, nullptr)) + << "SHM_LOCK failed: errno=" << errno << " (" << strerror(errno) << ")"; + ASSERT_EQ(0, shmctl(shm.id(), IPC_STAT, &ds)); + EXPECT_NE(0u, ds.shm_perm.mode & SHM_LOCKED); + EXPECT_EQ(original_ctime, ds.shm_ctime); + + EXPECT_EQ(0, shmctl(shm.id(), SHM_LOCK, nullptr)) + << "repeat SHM_LOCK failed: errno=" << errno << " (" << strerror(errno) << ")"; + ASSERT_EQ(0, shmctl(shm.id(), IPC_STAT, &ds)); + EXPECT_NE(0u, ds.shm_perm.mode & SHM_LOCKED); + EXPECT_EQ(original_ctime, ds.shm_ctime); + + ASSERT_EQ(0, shmctl(shm.id(), SHM_UNLOCK, nullptr)) + << "SHM_UNLOCK failed: errno=" << errno << " (" << strerror(errno) << ")"; + ASSERT_EQ(0, shmctl(shm.id(), IPC_STAT, &ds)); + EXPECT_EQ(0u, ds.shm_perm.mode & SHM_LOCKED); + EXPECT_EQ(original_ctime, ds.shm_ctime); + + EXPECT_EQ(0, shmctl(shm.id(), SHM_UNLOCK, nullptr)) + << "repeat SHM_UNLOCK failed: errno=" << errno << " (" << strerror(errno) << ")"; + ASSERT_EQ(0, shmctl(shm.id(), IPC_STAT, &ds)); + EXPECT_EQ(original_ctime, ds.shm_ctime); +} + +TEST(SysvShmSemantics, LockedRmidLastDetachReleasesMemlockAccounting) { + if (geteuid() == 0) { + const pid_t child = fork(); + ASSERT_GE(child, 0) << "fork failed: errno=" << errno << " (" << strerror(errno) << ")"; + if (child == 0) { + if (setgid(1000) != 0 || setuid(1000) != 0) { + _exit(10); + } + 
_exit(RunLockedRmidLastDetachReleasesMemlockScenario()); + } + WaitChildOk(child); + } else { + EXPECT_EQ(0, RunLockedRmidLastDetachReleasesMemlockScenario()); + } +} + +TEST(SysvShmSemantics, ShmNattchTracksAttachDetach) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()) << "shmget failed: errno=" << errno << " (" << strerror(errno) + << ")"; + EXPECT_EQ(0, ShmNattch(shm.id())); + + void* addr1 = Attach(shm.id()); + ASSERT_NE(MAP_FAILED, addr1) << "shmat failed: errno=" << errno << " (" << strerror(errno) + << ")"; + EXPECT_EQ(1, ShmNattch(shm.id())); + + void* addr2 = Attach(shm.id()); + ASSERT_NE(MAP_FAILED, addr2) << "second shmat failed: errno=" << errno << " (" + << strerror(errno) << ")"; + EXPECT_EQ(2, ShmNattch(shm.id())); + + EXPECT_EQ(0, shmdt(addr1)) << "shmdt(addr1) failed: errno=" << errno << " (" + << strerror(errno) << ")"; + EXPECT_EQ(1, ShmNattch(shm.id())); + + EXPECT_EQ(0, shmdt(addr2)) << "shmdt(addr2) failed: errno=" << errno << " (" + << strerror(errno) << ")"; + EXPECT_EQ(0, ShmNattch(shm.id())); +} + +TEST(SysvShmSemantics, DetachedSegmentPersists) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'D'; + ASSERT_EQ(0, shmdt(addr)); + + addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + EXPECT_EQ('D', addr[0]); + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, ShmdtRejectsNonShmMapping) { + const size_t ps = PageSize(); + char* mapping = static_cast( + mmap(nullptr, ps, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + ASSERT_NE(MAP_FAILED, mapping); + + mapping[0] = 'a'; + errno = 0; + EXPECT_EQ(-1, shmdt(mapping)); + EXPECT_EQ(EINVAL, errno); + mapping[0] = 'b'; + + EXPECT_EQ(0, munmap(mapping, ps)); +} + +TEST(SysvShmSemantics, ReadonlyAttachRejectsWrite) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id(), SHM_RDONLY)); + 
ASSERT_NE(MAP_FAILED, addr); + static_cast(addr[0]); + + ExpectChildDiesBySignal(SIGSEGV, WriteFirstByte, addr); + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, ReadonlyAttachMprotectWriteDenied) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + void* addr = Attach(shm.id(), SHM_RDONLY); + ASSERT_NE(MAP_FAILED, addr); + + errno = 0; + EXPECT_EQ(-1, mprotect(addr, SegmentSize(), PROT_READ | PROT_WRITE)); + EXPECT_EQ(EACCES, errno); + + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, MprotectZeroLengthSucceeds) { + void* mapping = mmap(nullptr, PageSize(), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, mapping); + + errno = 0; + EXPECT_EQ(0, mprotect(mapping, 0, PROT_NONE)); + EXPECT_EQ(0, errno); + + EXPECT_EQ(0, munmap(mapping, PageSize())); +} + +TEST(SysvShmSemantics, MprotectAndMadviseReportUnmappedHoles) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + const size_t ps = PageSize(); + addr[0] = 'h'; + addr[ps * 2] = 'i'; + + ASSERT_EQ(0, munmap(addr + ps, ps)); + + errno = 0; + EXPECT_EQ(-1, mprotect(addr, SegmentSize(), PROT_READ)); + EXPECT_EQ(ENOMEM, errno); + + errno = 0; + EXPECT_EQ(-1, madvise(addr, SegmentSize(), MADV_RANDOM)); + EXPECT_EQ(ENOMEM, errno); + + EXPECT_EQ('h', addr[0]); + EXPECT_EQ('i', addr[ps * 2]); + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, PartialUnmapThenShmdt) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'p'; + addr[SegmentSize() - 1] = 'q'; + + const size_t ps = PageSize(); + ASSERT_EQ(0, munmap(addr + ps, ps * 2)); + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, PartialUnmapNattchTracksFragments) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); 
+ EXPECT_EQ(1, ShmNattch(shm.id())); + + const size_t ps = PageSize(); + ASSERT_EQ(0, munmap(addr + ps, ps * 2)); + EXPECT_EQ(2, ShmNattch(shm.id())); + + ASSERT_EQ(0, shmctl(shm.id(), IPC_RMID, nullptr)); + addr[0] = 'x'; + addr[SegmentSize() - 1] = 'y'; + EXPECT_EQ(0, shmdt(addr)); + + shm.release(); +} + +TEST(SysvShmSemantics, RepeatedRmidWhileAttachedDoesNotDestroy) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr1 = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr1); + char* addr2 = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr2); + + const int id = shm.release(); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(0, shmctl(id, IPC_RMID, nullptr)); + } + + addr1[0] = 'a'; + addr2[0] = 'b'; + EXPECT_EQ(0, shmdt(addr1)); + addr2[0] = 'c'; + EXPECT_EQ(0, shmdt(addr2)); +} + +TEST(SysvShmSemantics, AllowsAttachToRemovedSegmentWithRefs) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr1 = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr1); + + const int id = shm.release(); + ASSERT_EQ(0, shmctl(id, IPC_RMID, nullptr)); + + char* addr2 = static_cast(Attach(id)); + ASSERT_NE(MAP_FAILED, addr2) << "attach to removed-but-referenced segment failed: errno=" + << errno << " (" << strerror(errno) << ")"; + addr1[0] = 'r'; + EXPECT_EQ('r', addr2[0]); + + EXPECT_EQ(0, shmdt(addr1)); + EXPECT_EQ(0, shmdt(addr2)); +} + +TEST(SysvShmSemantics, RemovedSegmentsAreNotDiscoverable) { + const key_t key = UniqueKey(); + ShmSegment shm(key, SegmentSize(), IPC_CREAT | IPC_EXCL | 0600); + ASSERT_TRUE(shm.valid()) << "shmget key failed: errno=" << errno << " (" << strerror(errno) + << ")"; + + void* addr = Attach(shm.id()); + ASSERT_NE(MAP_FAILED, addr); + ASSERT_EQ(0, shmctl(shm.id(), IPC_RMID, nullptr)); + shm.release(); + + errno = 0; + EXPECT_EQ(-1, shmget(key, SegmentSize(), 0600)); + EXPECT_EQ(ENOENT, errno); + + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, 
ExistingKeyAndControlsHonorIpcPermissions) { + if (geteuid() != 0) { + GTEST_SKIP() << "requires root to drop credentials"; + } + + const key_t key = UniqueKey(); + ShmSegment shm(key, SegmentSize(), IPC_CREAT | IPC_EXCL | 0600); + ASSERT_TRUE(shm.valid()) << "shmget key failed: errno=" << errno << " (" << strerror(errno) + << ")"; + + const pid_t child = fork(); + ASSERT_GE(child, 0) << "fork failed: errno=" << errno << " (" << strerror(errno) << ")"; + if (child == 0) { + if (setgid(1000) != 0 || setuid(1000) != 0) { + _exit(10); + } + + errno = 0; + if (shmget(key, 0, 0600) != -1 || errno != EACCES) { + _exit(20); + } + + errno = 0; + void* addr = shmat(shm.id(), nullptr, SHM_RDONLY); + if (addr != reinterpret_cast(-1) || errno != EACCES) { + if (addr != reinterpret_cast(-1)) { + shmdt(addr); + } + _exit(21); + } + + errno = 0; + if (shmctl(shm.id(), IPC_RMID, nullptr) != -1 || errno != EPERM) { + _exit(22); + } + _exit(0); + } + WaitChildOk(child); +} + +TEST(SysvShmSemantics, ShmExecRequiresExecutePermissionAtAttach) { + if (geteuid() == 0) { + const pid_t child = fork(); + ASSERT_GE(child, 0) << "fork failed: errno=" << errno << " (" << strerror(errno) << ")"; + if (child == 0) { + if (setgid(1000) != 0 || setuid(1000) != 0) { + _exit(10); + } + _exit(RunShmExecPermissionScenario()); + } + WaitChildOk(child); + } else { + EXPECT_EQ(0, RunShmExecPermissionScenario()); + } +} + +TEST(SysvShmSemantics, ShmatIgnoresNonAttachFlagBitsLikeLinux) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* base = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, base); + base[0] = 'q'; + + constexpr int kUnknownAttachFlag = 040000000; + const int extra_flags[] = { + IPC_CREAT, + IPC_EXCL, + IPC_CREAT | IPC_EXCL, + kUnknownAttachFlag, + SHM_RDONLY | IPC_CREAT | kUnknownAttachFlag, + }; + + for (int flags : extra_flags) { + errno = 0; + char* addr = static_cast(shmat(shm.id(), nullptr, flags)); + ASSERT_NE(reinterpret_cast(-1), addr) + << "shmat 
unexpectedly rejected flags 0" << std::oct << flags << std::dec + << ": errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_EQ('q', addr[0]); + EXPECT_EQ(0, shmdt(addr)); + } + + EXPECT_EQ(0, shmdt(base)); +} + +TEST(SysvShmSemantics, FixedAttachWithoutRemapDoesNotReplaceExistingMapping) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + EXPECT_EQ(0, ShmNattch(shm.id())); + + char* target = static_cast( + mmap(nullptr, SegmentSize(), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + ASSERT_NE(MAP_FAILED, target); + target[0] = 'n'; + + errno = 0; + void* denied = shmat(shm.id(), target, 0); + EXPECT_EQ(reinterpret_cast(-1), denied); + EXPECT_EQ(EINVAL, errno); + EXPECT_EQ('n', target[0]); + EXPECT_EQ(0, ShmNattch(shm.id())); + + EXPECT_EQ(0, munmap(target, SegmentSize())); +} + +TEST(SysvShmSemantics, ShmatLowAddressConflictReturnsEinvalBeforeMmapMin) { + const size_t ps = PageSize(); + void* low = reinterpret_cast(ps); + bool own_guard = false; + char* guard = static_cast(mmap(low, ps, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, + 0)); + if (guard == MAP_FAILED && errno == EPERM) { + GTEST_SKIP() << "caller lacks CAP_SYS_RAWIO for mmap_min_addr"; + } + if (guard == MAP_FAILED && errno == EEXIST) { + GTEST_SKIP() << "low address already unavailable in this process"; + } + ASSERT_EQ(low, guard) << "low guard mmap failed: errno=" << errno << " (" << strerror(errno) + << ")"; + own_guard = true; + + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + errno = 0; + void* denied = shmat(shm.id(), low, 0); + EXPECT_EQ(reinterpret_cast(-1), denied); + EXPECT_EQ(EINVAL, errno); + EXPECT_EQ(0, ShmNattch(shm.id())); + + if (own_guard) { + EXPECT_EQ(0, munmap(guard, ps)); + } +} + +TEST(SysvShmSemantics, FailedLockedMapFixedReplacementKeepsAttachConsistent) { + ScopedMemlockLimit lim; + ASSERT_TRUE(lim.valid()); + if (!lim.set_bytes(0)) { + GTEST_SKIP() << "cannot lower RLIMIT_MEMLOCK"; + } + 
+ ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + ASSERT_EQ(1, ShmNattch(shm.id())); + addr[0] = 'a'; + + errno = 0; + void* replaced = mmap(addr, SegmentSize(), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED | MAP_LOCKED, -1, 0); + if (replaced == MAP_FAILED) { + ASSERT_TRUE(errno == EPERM || errno == EAGAIN) + << "unexpected errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_EQ(1, ShmNattch(shm.id())); + EXPECT_EQ('a', addr[0]); + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(0, ShmNattch(shm.id())); + return; + } + + ASSERT_EQ(addr, replaced); + EXPECT_EQ(0, ShmNattch(shm.id())); + EXPECT_EQ(0, munmap(replaced, SegmentSize())); +} + +TEST(SysvShmSemantics, FailedLockedFileMapFixedReplacementKeepsAttachConsistent) { + ScopedMemlockLimit lim; + ASSERT_TRUE(lim.valid()); + if (!lim.set_bytes(0)) { + GTEST_SKIP() << "cannot lower RLIMIT_MEMLOCK"; + } + + char tmpl[] = "/tmp/dunitest_shm_file_XXXXXX"; + int fd = mkstemp(tmpl); + ASSERT_GE(fd, 0) << "mkstemp failed: errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_EQ(0, unlink(tmpl)); + ASSERT_EQ(0, ftruncate(fd, static_cast(SegmentSize()))) + << "ftruncate failed: errno=" << errno << " (" << strerror(errno) << ")"; + + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + ASSERT_EQ(1, ShmNattch(shm.id())); + addr[0] = 'f'; + + errno = 0; + void* replaced = + mmap(addr, SegmentSize(), PROT_READ, MAP_PRIVATE | MAP_FIXED | MAP_LOCKED, fd, 0); + if (replaced == MAP_FAILED) { + ASSERT_TRUE(errno == EPERM || errno == EAGAIN) + << "unexpected errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_EQ(1, ShmNattch(shm.id())); + EXPECT_EQ('f', addr[0]); + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(0, ShmNattch(shm.id())); + EXPECT_EQ(0, close(fd)); + return; + } + + ASSERT_EQ(addr, replaced); + EXPECT_EQ(0, 
ShmNattch(shm.id())); + EXPECT_EQ(0, munmap(replaced, SegmentSize())); + EXPECT_EQ(0, close(fd)); +} + +TEST(SysvShmSemantics, MapFixedReplacementUsesNetRlimitAsGrowth) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + ASSERT_EQ(1, ShmNattch(shm.id())); + addr[0] = 'r'; + + ScopedAddressSpaceLimit lim; + ASSERT_TRUE(lim.valid()); + const size_t current_vm = CurrentVmSizeBytes(); + if (current_vm == 0) { + GTEST_SKIP() << "cannot read VmSize from /proc/self/status"; + } + if (!lim.set_bytes(current_vm)) { + GTEST_SKIP() << "cannot lower RLIMIT_AS"; + } + + errno = 0; + void* replaced = mmap(addr, SegmentSize(), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + const int saved_errno = errno; + ASSERT_TRUE(lim.restore()) << "restore RLIMIT_AS failed: errno=" << errno << " (" + << strerror(errno) << ")"; + + ASSERT_EQ(addr, replaced) << "MAP_FIXED net-zero replacement failed: errno=" << saved_errno + << " (" << strerror(saved_errno) << ")"; + EXPECT_EQ(0, ShmNattch(shm.id())); + EXPECT_EQ(0, munmap(replaced, SegmentSize())); +} + +TEST(SysvShmSemantics, FailedRlimitAsAnonMapFixedKeepsAttachConsistent) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + ASSERT_EQ(1, ShmNattch(shm.id())); + addr[0] = 'a'; + + ScopedAddressSpaceLimit lim; + ASSERT_TRUE(lim.valid()); + if (!lim.set_bytes(PageSize())) { + GTEST_SKIP() << "cannot lower RLIMIT_AS"; + } + + errno = 0; + void* replaced = mmap(addr, SegmentSize() * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + const int saved_errno = errno; + ASSERT_TRUE(lim.restore()) << "restore RLIMIT_AS failed: errno=" << errno << " (" + << strerror(errno) << ")"; + + ASSERT_EQ(MAP_FAILED, replaced); + EXPECT_EQ(ENOMEM, saved_errno); + EXPECT_EQ(1, ShmNattch(shm.id())); + EXPECT_EQ('a', addr[0]); + 
EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(0, ShmNattch(shm.id())); +} + +TEST(SysvShmSemantics, FailedRlimitAsFileMapFixedKeepsAttachConsistent) { + char tmpl[] = "/tmp/dunitest_shm_rlimit_as_XXXXXX"; + int fd = mkstemp(tmpl); + ASSERT_GE(fd, 0) << "mkstemp failed: errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_EQ(0, unlink(tmpl)); + ASSERT_EQ(0, ftruncate(fd, static_cast(SegmentSize() * 2))) + << "ftruncate failed: errno=" << errno << " (" << strerror(errno) << ")"; + + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + ASSERT_EQ(1, ShmNattch(shm.id())); + addr[0] = 'f'; + + ScopedAddressSpaceLimit lim; + ASSERT_TRUE(lim.valid()); + if (!lim.set_bytes(PageSize())) { + EXPECT_EQ(0, close(fd)); + GTEST_SKIP() << "cannot lower RLIMIT_AS"; + } + + errno = 0; + void* replaced = + mmap(addr, SegmentSize() * 2, PROT_READ, MAP_PRIVATE | MAP_FIXED, fd, 0); + const int saved_errno = errno; + ASSERT_TRUE(lim.restore()) << "restore RLIMIT_AS failed: errno=" << errno << " (" + << strerror(errno) << ")"; + + ASSERT_EQ(MAP_FAILED, replaced); + EXPECT_EQ(ENOMEM, saved_errno); + EXPECT_EQ(1, ShmNattch(shm.id())); + EXPECT_EQ('f', addr[0]); + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(0, ShmNattch(shm.id())); + EXPECT_EQ(0, close(fd)); +} + +TEST(SysvShmSemantics, ExplicitLowAttachIsNeverRoundedToMmapMin) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + void* requested = reinterpret_cast(PageSize()); + errno = 0; + void* attached = shmat(shm.id(), requested, 0); + if (attached != reinterpret_cast(-1)) { + EXPECT_EQ(requested, attached) + << "explicit shmat address was silently rewritten"; + EXPECT_EQ(0, shmdt(attached)); + } else { + EXPECT_NE(0, errno); + } + EXPECT_NE(reinterpret_cast(MmapMinAddr()), attached); +} + +TEST(SysvShmSemantics, LowAddressRemapDoesNotOverwriteMmapMinGuard) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* guard = 
static_cast(mmap(reinterpret_cast(MmapMinAddr()), PageSize(), + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, + 0)); + if (guard == MAP_FAILED) { + GTEST_SKIP() << "mmap_min guard unavailable: errno=" << errno << " (" << strerror(errno) + << ")"; + } + guard[0] = 'g'; + + void* requested = reinterpret_cast(PageSize()); + errno = 0; + void* attached = shmat(shm.id(), requested, SHM_REMAP); + if (attached != reinterpret_cast(-1)) { + EXPECT_EQ(requested, attached) + << "SHM_REMAP low explicit address was silently rewritten"; + EXPECT_EQ(0, shmdt(attached)); + } else { + EXPECT_NE(0, errno); + } + EXPECT_EQ('g', guard[0]) << "SHM_REMAP overwrote the rounded mmap_min address"; + EXPECT_EQ(0, munmap(guard, PageSize())); +} + +TEST(SysvShmSemantics, CapSysRawioAllowsMapFixedNull) { + const size_t ps = PageSize(); + errno = 0; + void* mapped = mmap(nullptr, ps, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (mapped == MAP_FAILED && errno == EPERM) { + GTEST_SKIP() << "caller lacks CAP_SYS_RAWIO for mmap_min_addr"; + } + ASSERT_EQ(nullptr, mapped) << "MAP_FIXED null mmap failed: errno=" << errno << " (" + << strerror(errno) << ")"; + static_cast(mapped)[0] = '0'; + EXPECT_EQ('0', static_cast(mapped)[0]); + EXPECT_EQ(0, munmap(mapped, ps)); +} + +TEST(SysvShmSemantics, CapSysRawioAllowsMremapFixedNull) { + const size_t ps = PageSize(); + char* source = static_cast( + mmap(nullptr, ps, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + ASSERT_NE(MAP_FAILED, source); + source[0] = 'r'; + + errno = 0; + void* moved = reinterpret_cast( + syscall(SYS_mremap, source, ps, ps, MREMAP_MAYMOVE | MREMAP_FIXED, nullptr)); + if (moved == MAP_FAILED && errno == EPERM) { + EXPECT_EQ(0, munmap(source, ps)); + GTEST_SKIP() << "caller lacks CAP_SYS_RAWIO for mmap_min_addr"; + } + ASSERT_EQ(nullptr, moved) << "MREMAP_FIXED null failed: errno=" << errno << " (" + << strerror(errno) << ")"; + EXPECT_EQ('r', 
static_cast(moved)[0]); + EXPECT_EQ(0, munmap(moved, ps)); +} + +TEST(SysvShmSemantics, MremapFixedLowAddressChecksAfterLinuxSideEffects) { + if (geteuid() == 0) { + const pid_t child = fork(); + ASSERT_GE(child, 0) << "fork failed: errno=" << errno << " (" << strerror(errno) << ")"; + if (child == 0) { + if (setgid(1000) != 0 || setuid(1000) != 0) { + _exit(10); + } + _exit(RunMremapFixedLowAddressOrderScenario()); + } + WaitChildOk(child); + } else { + EXPECT_EQ(0, RunMremapFixedLowAddressOrderScenario()); + } +} + +TEST(SysvShmSemantics, ShmRemapReplacesExactHighAddress) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* target = static_cast( + mmap(nullptr, SegmentSize(), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + ASSERT_NE(MAP_FAILED, target); + target[0] = 'r'; + + void* attached = shmat(shm.id(), target, SHM_REMAP); + ASSERT_NE(reinterpret_cast(-1), attached) + << "SHM_REMAP failed: errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_EQ(target, attached); + static_cast(attached)[0] = 's'; + EXPECT_EQ('s', static_cast(attached)[0]); + EXPECT_EQ(0, shmdt(attached)); +} + +TEST(SysvShmSemantics, ShmRemapNullFailsWithoutAttach) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + EXPECT_EQ(0, ShmNattch(shm.id())); + + errno = 0; + void* attached = shmat(shm.id(), nullptr, SHM_REMAP); + EXPECT_EQ(reinterpret_cast(-1), attached); + EXPECT_EQ(EINVAL, errno); + EXPECT_EQ(0, ShmNattch(shm.id())); +} + +TEST(SysvShmSemantics, UnalignedAttachRequiresShmRnd) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* reservation = static_cast( + mmap(nullptr, SegmentSize() * 2, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + ASSERT_NE(MAP_FAILED, reservation); + ASSERT_EQ(0, munmap(reservation, SegmentSize() * 2)); + + char* unaligned = reservation + PageSize() / 2; + errno = 0; + void* denied = shmat(shm.id(), unaligned, 0); + EXPECT_EQ(reinterpret_cast(-1), denied); + EXPECT_EQ(EINVAL, 
errno); + + void* rounded = shmat(shm.id(), unaligned, SHM_RND); + ASSERT_NE(reinterpret_cast(-1), rounded) + << "SHM_RND attach failed: errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_EQ(reservation, rounded); + EXPECT_EQ(0, shmdt(rounded)); +} + +TEST(SysvShmSemantics, MultipleDetachFails) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + void* addr = Attach(shm.id()); + ASSERT_NE(MAP_FAILED, addr); + ASSERT_EQ(0, shmdt(addr)); + + errno = 0; + EXPECT_EQ(-1, shmdt(addr)); + EXPECT_EQ(EINVAL, errno); +} + +TEST(SysvShmSemantics, SegmentsSizeFixedOnCreation) { + const key_t key = UniqueKey(); + ShmSegment shm(key, SegmentSize(), IPC_CREAT | IPC_EXCL | 0600); + ASSERT_TRUE(shm.valid()); + + const int same = shmget(key, SegmentSize() / 2, 0600); + ASSERT_EQ(shm.id(), same); + + errno = 0; + EXPECT_EQ(-1, shmget(key, SegmentSize() * 2, 0600)); + EXPECT_EQ(EINVAL, errno); + + char* addr = static_cast(Attach(same)); + ASSERT_NE(MAP_FAILED, addr); + addr[SegmentSize() - 1] = 's'; + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, TwoAttachSplitDetachIsolation) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr1 = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr1); + char* addr2 = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr2); + + const size_t ps = PageSize(); + ASSERT_EQ(0, mprotect(addr1 + ps, ps, PROT_READ)); + ASSERT_EQ(0, munmap(addr1 + ps * 2, ps)); + ASSERT_EQ(0, shmdt(addr1)); + + addr2[0] = 'i'; + addr2[SegmentSize() - 1] = 'j'; + EXPECT_EQ(0, shmdt(addr2)); +} + +TEST(SysvShmSemantics, MlockSplitNattchTracksFragments) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + EXPECT_EQ(1, ShmNattch(shm.id())); + + const size_t ps = PageSize(); + ASSERT_EQ(0, mlock(addr + ps, ps)); + EXPECT_EQ(3, ShmNattch(shm.id())); + + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(0, ShmNattch(shm.id())); +} + 
+TEST(SysvShmSemantics, RepeatedMlockDoesNotResplitSysvShmVma) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + EXPECT_EQ(1, ShmNattch(shm.id())); + + const size_t ps = PageSize(); + ASSERT_EQ(0, mlock(addr + ps, ps)); + EXPECT_EQ(3, ShmNattch(shm.id())); + + ASSERT_EQ(0, mlock(addr + ps, ps)); + EXPECT_EQ(3, ShmNattch(shm.id())); + + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(0, ShmNattch(shm.id())); +} + +TEST(SysvShmSemantics, ShmdtReleasesLockedVmAccounting) { + if (geteuid() == 0) { + const pid_t child = fork(); + ASSERT_GE(child, 0) << "fork failed: errno=" << errno << " (" << strerror(errno) << ")"; + if (child == 0) { + if (setgid(1000) != 0 || setuid(1000) != 0) { + _exit(10); + } + _exit(RunShmdtReleasesLockedVmScenario()); + } + WaitChildOk(child); + } else { + EXPECT_EQ(0, RunShmdtReleasesLockedVmScenario()); + } +} + +TEST(SysvShmSemantics, MadviseSplitNattchTracksFragments) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + EXPECT_EQ(1, ShmNattch(shm.id())); + + const size_t ps = PageSize(); + ASSERT_EQ(0, madvise(addr + ps, ps, MADV_RANDOM)); + EXPECT_EQ(3, ShmNattch(shm.id())); + + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(0, ShmNattch(shm.id())); +} + +TEST(SysvShmSemantics, RepeatedMadviseDoesNotResplitSysvShmVma) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + EXPECT_EQ(1, ShmNattch(shm.id())); + + const size_t ps = PageSize(); + ASSERT_EQ(0, madvise(addr + ps, ps, MADV_RANDOM)); + EXPECT_EQ(3, ShmNattch(shm.id())); + + ASSERT_EQ(0, madvise(addr + ps, ps, MADV_RANDOM)); + EXPECT_EQ(3, ShmNattch(shm.id())); + + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(0, ShmNattch(shm.id())); +} + +TEST(SysvShmSemantics, MadviseDontneedDoesNotSplitSysvShmVma) { + ShmSegment shm(SegmentSize()); 
+ ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + EXPECT_EQ(1, ShmNattch(shm.id())); + + const size_t ps = PageSize(); + addr[0] = 'a'; + addr[ps] = 'b'; + addr[ps * 2] = 'c'; + + ASSERT_EQ(0, madvise(addr + ps, ps, MADV_DONTNEED)) + << "MADV_DONTNEED failed: errno=" << errno << " (" << strerror(errno) << ")"; + EXPECT_EQ(1, ShmNattch(shm.id())) + << "MADV_DONTNEED must zap pages without splitting the SysV SHM VMA"; + + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(0, ShmNattch(shm.id())); +} + +TEST(SysvShmSemantics, RepeatedMprotectDoesNotResplitSysvShmVma) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + EXPECT_EQ(1, ShmNattch(shm.id())); + + const size_t ps = PageSize(); + ASSERT_EQ(0, mprotect(addr + ps, ps, PROT_READ)); + EXPECT_EQ(3, ShmNattch(shm.id())); + + ASSERT_EQ(0, mprotect(addr + ps, ps, PROT_READ)); + EXPECT_EQ(3, ShmNattch(shm.id())); + + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(0, ShmNattch(shm.id())); +} + +TEST(SysvShmSemantics, SysvShmMremapMoveNetNattch) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'm'; + EXPECT_EQ(1, ShmNattch(shm.id())); + + void* target = mmap(nullptr, SegmentSize(), PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, target); + + void* moved = reinterpret_cast( + syscall(SYS_mremap, addr, SegmentSize(), SegmentSize(), MREMAP_MAYMOVE | MREMAP_FIXED, + target)); + ASSERT_NE(MAP_FAILED, moved) << "mremap failed: errno=" << errno << " (" << strerror(errno) + << ")"; + EXPECT_EQ(target, moved); + EXPECT_EQ('m', static_cast(moved)[0]); + EXPECT_EQ(1, ShmNattch(shm.id())); + ExpectChildDiesBySignal(SIGSEGV, WriteFirstByte, addr); + + EXPECT_EQ(0, shmdt(moved)); +} + +TEST(SysvShmSemantics, MremapFixedWrapTargetFailsWithoutChangingSource) { + ShmSegment 
shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'w'; + EXPECT_EQ(1, ShmNattch(shm.id())); + + const uintptr_t bad_target = ~(static_cast(PageSize()) - 1); + errno = 0; + void* moved = reinterpret_cast( + syscall(SYS_mremap, addr, PageSize(), PageSize() * 2, MREMAP_MAYMOVE | MREMAP_FIXED, + reinterpret_cast(bad_target))); + EXPECT_EQ(MAP_FAILED, moved); + EXPECT_NE(0, errno); + EXPECT_EQ(1, ShmNattch(shm.id())); + EXPECT_EQ('w', addr[0]); + + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, MremapFixedRejectsUnalignedTargetBeforeUnmap) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'u'; + EXPECT_EQ(1, ShmNattch(shm.id())); + + char* target = static_cast( + mmap(nullptr, SegmentSize(), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + ASSERT_NE(MAP_FAILED, target); + target[0] = 't'; + + errno = 0; + void* moved = reinterpret_cast(syscall( + SYS_mremap, addr, SegmentSize() * 2, PageSize(), MREMAP_MAYMOVE | MREMAP_FIXED, + target + 1)); + EXPECT_EQ(MAP_FAILED, moved); + EXPECT_EQ(EINVAL, errno); + EXPECT_EQ(1, ShmNattch(shm.id())); + EXPECT_EQ('u', addr[0]); + EXPECT_EQ('t', target[0]); + + EXPECT_EQ(0, munmap(target, SegmentSize())); + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, MremapFixedReplacesLazyTargetVma) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + const size_t ps = PageSize(); + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'm'; + EXPECT_EQ(1, ShmNattch(shm.id())); + + char* target = static_cast( + mmap(nullptr, ps, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); + ASSERT_NE(MAP_FAILED, target); + + errno = 0; + void* moved = reinterpret_cast( + syscall(SYS_mremap, addr, ps, ps, MREMAP_MAYMOVE | MREMAP_FIXED, target)); + ASSERT_EQ(target, moved) << "mremap 
fixed failed: errno=" << errno << " (" + << strerror(errno) << ")"; + EXPECT_EQ('m', target[0]); + target[0] = 'M'; + + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(0, shmdt(target)); +} + +TEST(SysvShmSemantics, MremapMissingSourceReturnsEfault) { + void* addr = + mmap(nullptr, PageSize(), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, addr); + ASSERT_EQ(0, munmap(addr, PageSize())); + + errno = 0; + void* moved = + reinterpret_cast(syscall(SYS_mremap, addr, PageSize(), PageSize(), 0, nullptr)); + EXPECT_EQ(MAP_FAILED, moved); + EXPECT_EQ(EFAULT, errno); +} + +TEST(SysvShmSemantics, MremapRejectsUnknownFlagsWithoutChangingSource) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'k'; + EXPECT_EQ(1, ShmNattch(shm.id())); + + errno = 0; + void* moved = reinterpret_cast( + syscall(SYS_mremap, addr, SegmentSize(), SegmentSize(), 0x8, nullptr)); + EXPECT_EQ(MAP_FAILED, moved); + EXPECT_EQ(EINVAL, errno); + EXPECT_EQ(1, ShmNattch(shm.id())); + EXPECT_EQ('k', addr[0]); + + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, MremapHugeOldLenFailsWithoutChangingSource) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'h'; + EXPECT_EQ(1, ShmNattch(shm.id())); + + const size_t huge_len = ~(PageSize() - 1); + errno = 0; + void* moved = reinterpret_cast( + syscall(SYS_mremap, addr, huge_len, huge_len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, nullptr)); + EXPECT_EQ(MAP_FAILED, moved); + EXPECT_NE(0, errno); + EXPECT_EQ(1, ShmNattch(shm.id())); + EXPECT_EQ('h', addr[0]); + + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, SysvShmPartialMremapMoveSplitsSource) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + const size_t ps = PageSize(); + 
addr[0] = 'a'; + addr[ps] = 'b'; + addr[ps * 2] = 'c'; + EXPECT_EQ(1, ShmNattch(shm.id())); + + void* target = mmap(nullptr, ps, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, target); + + void* moved = reinterpret_cast( + syscall(SYS_mremap, addr + ps, ps, ps, MREMAP_MAYMOVE | MREMAP_FIXED, target)); + ASSERT_NE(MAP_FAILED, moved) << "partial mremap failed: errno=" << errno << " (" + << strerror(errno) << ")"; + EXPECT_EQ(target, moved); + EXPECT_EQ('b', static_cast(moved)[0]); + EXPECT_EQ('a', addr[0]); + EXPECT_EQ('c', addr[ps * 2]); + EXPECT_EQ(3, ShmNattch(shm.id())); + ExpectChildDiesBySignal(SIGSEGV, WriteFirstByte, addr + ps); + + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(1, ShmNattch(shm.id())); + EXPECT_EQ(0, shmdt(static_cast(moved) - ps)); +} + +TEST(SysvShmSemantics, SysvShmMremapDontunmapNattchPlusOne) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'd'; + EXPECT_EQ(1, ShmNattch(shm.id())); + + void* dup = reinterpret_cast(syscall( + SYS_mremap, addr, SegmentSize(), SegmentSize(), MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0)); + ASSERT_NE(MAP_FAILED, dup) << "mremap DONTUNMAP failed: errno=" << errno << " (" + << strerror(errno) << ")"; + + EXPECT_EQ('d', static_cast(dup)[0]); + static_cast(dup)[0] = 'e'; + EXPECT_EQ('e', addr[0]); + EXPECT_EQ(2, ShmNattch(shm.id())); + + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(1, ShmNattch(shm.id())); + EXPECT_EQ(0, shmdt(dup)); +} + +TEST(SysvShmSemantics, SysvShmPartialMremapDontunmapKeepsSourceVma) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + const size_t ps = PageSize(); + addr[0] = 'x'; + addr[ps] = 'y'; + addr[ps * 2] = 'z'; + EXPECT_EQ(1, ShmNattch(shm.id())); + + void* dup = reinterpret_cast(syscall( + SYS_mremap, addr + ps, ps, ps, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0)); + 
ASSERT_NE(MAP_FAILED, dup) << "partial mremap DONTUNMAP failed: errno=" << errno << " (" + << strerror(errno) << ")"; + + EXPECT_EQ('y', static_cast(dup)[0]); + static_cast(dup)[0] = 'q'; + EXPECT_EQ('q', addr[ps]); + addr[ps] = 'r'; + EXPECT_EQ('r', static_cast(dup)[0]); + EXPECT_EQ('x', addr[0]); + EXPECT_EQ('z', addr[ps * 2]); + EXPECT_EQ(2, ShmNattch(shm.id())); + + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(1, ShmNattch(shm.id())); + EXPECT_EQ(0, shmdt(static_cast(dup) - ps)); +} + +TEST(SysvShmSemantics, LockedPartialMremapDontunmapDoesNotResplitSource) { + ScopedMemlockLimit lim; + ASSERT_TRUE(lim.valid()); + ASSERT_TRUE(lim.set_bytes(SegmentSize() * 2)); + + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + const size_t ps = PageSize(); + addr[ps] = 'l'; + ASSERT_EQ(0, mlock(addr, SegmentSize())); + EXPECT_EQ(1, ShmNattch(shm.id())); + + void* dup = reinterpret_cast(syscall( + SYS_mremap, addr + ps, ps, ps, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0)); + ASSERT_NE(MAP_FAILED, dup) << "locked partial mremap DONTUNMAP failed: errno=" << errno + << " (" << strerror(errno) << ")"; + + EXPECT_EQ('l', static_cast(dup)[0]); + static_cast(dup)[0] = 'L'; + EXPECT_EQ('L', addr[ps]); + EXPECT_EQ(2, ShmNattch(shm.id())) + << "locked MREMAP_DONTUNMAP must not split the SysV SHM source VMA"; + + EXPECT_EQ(0, shmdt(addr)); + EXPECT_EQ(1, ShmNattch(shm.id())); + EXPECT_EQ(0, shmdt(static_cast(dup) - ps)); +} + +TEST(SysvShmSemantics, SysvShmMremapOldLenZeroDuplicate) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'z'; + + void* dup = + reinterpret_cast(syscall(SYS_mremap, addr, 0, SegmentSize(), MREMAP_MAYMOVE)); + ASSERT_NE(MAP_FAILED, dup) << "mremap duplicate failed: errno=" << errno << " (" + << strerror(errno) << ")"; + + EXPECT_EQ('z', static_cast(dup)[0]); + static_cast(dup)[0] = 
'w'; + EXPECT_EQ('w', addr[0]); + EXPECT_EQ(2, ShmNattch(shm.id())); + + EXPECT_EQ(0, shmdt(dup)); + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, SysvShmMremapOldLenZeroKeepsSourceMlockAccounting) { + ScopedMemlockLimit lim; + ASSERT_TRUE(lim.valid()); + ASSERT_TRUE(lim.set_bytes(SegmentSize() * 2)); + + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + ASSERT_EQ(0, mlock(addr, SegmentSize())); + + void* dup = + reinterpret_cast(syscall(SYS_mremap, addr, 0, SegmentSize(), MREMAP_MAYMOVE)); + ASSERT_NE(MAP_FAILED, dup) << "mremap duplicate failed: errno=" << errno << " (" + << strerror(errno) << ")"; + + errno = 0; + EXPECT_EQ(-1, msync(dup, SegmentSize(), MS_INVALIDATE)); + EXPECT_EQ(EBUSY, errno); + + EXPECT_EQ(0, munlock(dup, SegmentSize())); + + errno = 0; + EXPECT_EQ(-1, msync(addr, SegmentSize(), MS_INVALIDATE)); + EXPECT_EQ(EBUSY, errno); + + EXPECT_EQ(0, munlock(addr, SegmentSize())); + + void* probe = mmap(nullptr, SegmentSize() * 2, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, probe); + EXPECT_EQ(0, mlock(probe, SegmentSize() * 2)) + << "SysV old_len=0 duplicate leaked locked_vm accounting, errno=" << errno << " (" + << strerror(errno) << ")"; + EXPECT_EQ(0, munlock(probe, SegmentSize() * 2)); + EXPECT_EQ(0, munmap(probe, SegmentSize() * 2)); + + EXPECT_EQ(0, shmdt(dup)); + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, SysvShmMremapOldLenZeroFromMiddleAllowsLongNewLen) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + const size_t ps = PageSize(); + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[ps] = 'u'; + addr[ps * 2] = 'v'; + + void* dup = + reinterpret_cast(syscall(SYS_mremap, addr + ps, 0, SegmentSize(), MREMAP_MAYMOVE)); + ASSERT_NE(MAP_FAILED, dup) << "mremap duplicate from middle failed: errno=" << errno << " (" + << strerror(errno) << ")"; + + 
EXPECT_EQ('u', static_cast(dup)[0]); + EXPECT_EQ('v', static_cast(dup)[ps]); + static_cast(dup)[0] = 'U'; + EXPECT_EQ('U', addr[ps]); + EXPECT_EQ(2, ShmNattch(shm.id())); + + EXPECT_EQ(0, shmdt(static_cast(dup) - ps)); + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, SysvShmMremapFailureDoesNotLeakAttach) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'f'; + EXPECT_EQ(1, ShmNattch(shm.id())); + + errno = 0; + void* failed = + reinterpret_cast(syscall(SYS_mremap, addr, SegmentSize(), SegmentSize(), + MREMAP_DONTUNMAP, 0)); + EXPECT_EQ(MAP_FAILED, failed); + EXPECT_EQ(1, ShmNattch(shm.id())); + EXPECT_EQ('f', addr[0]); + + EXPECT_EQ(0, shmdt(addr)); +} + +TEST(SysvShmSemantics, IpcRmidKeepsOldIdVisibleUntilLastDetach) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + const int id = shm.release(); + + char* addr = static_cast(Attach(id)); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'r'; + EXPECT_EQ(1, ShmNattch(id)); + + struct shmid_ds ds; + ASSERT_EQ(0, shmctl(id, IPC_STAT, &ds)); + const time_t original_ctime = ds.shm_ctime; + sleep(1); + + ASSERT_EQ(0, shmctl(id, IPC_RMID, nullptr)); + + ASSERT_EQ(0, shmctl(id, IPC_STAT, &ds)); + EXPECT_EQ(1U, ds.shm_nattch); + EXPECT_NE(0, ds.shm_perm.mode & SHM_DEST); + EXPECT_EQ(original_ctime, ds.shm_ctime); + + char* again = static_cast(Attach(id)); + ASSERT_NE(MAP_FAILED, again); + EXPECT_EQ('r', again[0]); + + EXPECT_EQ('r', addr[0]); + addr[0] = 'R'; + EXPECT_EQ('R', again[0]); + EXPECT_EQ(0, shmdt(again)); + EXPECT_EQ(0, shmdt(addr)); + + errno = 0; + void* denied = shmat(id, nullptr, 0); + EXPECT_EQ(reinterpret_cast(-1), denied); + EXPECT_EQ(EINVAL, errno); +} + +TEST(SysvShmSemantics, IpcRmidReleasesKeyWhileOldMappingDetachesByTombstone) { + const key_t key = UniqueKey(); + ShmSegment first(key, SegmentSize(), IPC_CREAT | IPC_EXCL | 0600); + ASSERT_TRUE(first.valid()) << "first shmget failed: 
errno=" << errno << " (" << strerror(errno) + << ")"; + const int old_id = first.release(); + + char* old_addr = static_cast(Attach(old_id)); + ASSERT_NE(MAP_FAILED, old_addr); + old_addr[0] = 'o'; + ASSERT_EQ(0, shmctl(old_id, IPC_RMID, nullptr)); + + ShmSegment second(key, SegmentSize(), IPC_CREAT | IPC_EXCL | 0600); + ASSERT_TRUE(second.valid()) << "key was not released by IPC_RMID: errno=" << errno << " (" + << strerror(errno) << ")"; + EXPECT_NE(old_id, second.id()); + + char* old_again = static_cast(Attach(old_id)); + ASSERT_NE(MAP_FAILED, old_again); + EXPECT_EQ('o', old_again[0]); + + EXPECT_EQ('o', old_addr[0]); + EXPECT_EQ(0, shmdt(old_again)); + EXPECT_EQ(0, shmdt(old_addr)); +} + +TEST(SysvShmSemantics, SysvShmContentSurvivesDropCachesAfterDetach) { + ShmSegment shm(SegmentSize()); + ASSERT_TRUE(shm.valid()); + const size_t ps = PageSize(); + + char* addr = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, addr); + addr[0] = 'a'; + addr[ps] = 'b'; + addr[ps * 2] = 'c'; + ASSERT_EQ(0, msync(addr, SegmentSize(), MS_SYNC)) + << "msync failed: errno=" << errno << " (" << strerror(errno) << ")"; + ASSERT_EQ(0, shmdt(addr)); + + DropPageCache(); + + char* again = static_cast(Attach(shm.id())); + ASSERT_NE(MAP_FAILED, again); + EXPECT_EQ('a', again[0]); + EXPECT_EQ('b', again[ps]); + EXPECT_EQ('c', again[ps * 2]); + EXPECT_EQ(0, shmdt(again)); +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/user/apps/tests/dunitest/whitelist.txt b/user/apps/tests/dunitest/whitelist.txt index 54119d5028..b51e719688 100644 --- a/user/apps/tests/dunitest/whitelist.txt +++ b/user/apps/tests/dunitest/whitelist.txt @@ -21,6 +21,7 @@ normal/proc_self_limits normal/proc_fd_devfs_readlink normal/mlock_semantics normal/mmap_truncate_cow +normal/sysv_shm_semantics normal/sched_affinity normal/sync_file_range normal/splice_concurrent_io