diff --git a/include/kernel/address-space.h b/include/kernel/address-space.h
index 216eaef..cab9354 100644
--- a/include/kernel/address-space.h
+++ b/include/kernel/address-space.h
@@ -105,6 +105,14 @@ extern kern_status_t address_space_release(
 	virt_addr_t base,
 	size_t length);
 
+/* duplicate all of the mappings in `src` within `dest`. the duplication will
+ * use copy-on-write; page data will not be copied until it is written to.
+ * returns KERN_OK on success. if a vm_area or vm_object cannot be allocated an
+ * error status is returned and `dest` may be left partially populated. */
+extern kern_status_t address_space_duplicate(
+	struct address_space *dest,
+	struct address_space *src);
+
 extern bool address_space_validate_access(
 	struct address_space *region,
 	virt_addr_t base,
diff --git a/vm/address-space.c b/vm/address-space.c
index e210d49..92730b0 100644
--- a/vm/address-space.c
+++ b/vm/address-space.c
@@ -1206,6 +1206,283 @@ kern_status_t address_space_release(
 	return status;
 }
 
+/* allocate a new vm_area and copy the protection flags, object offset and
+ * address range of `area` into it. the copy is not attached to any vm_object.
+ * returns NULL if the allocation fails.
+ * NOTE(review): vma_space is not initialised here; confirm whether put_entry()
+ * or the caller is expected to set it. */
+static struct vm_area *area_duplicate(struct vm_area *area)
+{
+	struct vm_area *out = vm_cache_alloc(&vm_area_cache, VM_NORMAL);
+	if (!out) {
+		return NULL;
+	}
+
+	/* duplicate_vmo_areas() treats a non-NULL vma_object as "already
+	 * attached", so do not rely on what the allocator handed back. */
+	out->vma_object = NULL;
+	out->vma_prot = area->vma_prot;
+	out->vma_object_offset = area->vma_object_offset;
+	out->vma_base = area->vma_base;
+	out->vma_limit = area->vma_limit;
+
+	return out;
+}
+
+/* map every page of `area` that is already present in its vm_object as
+ * read-only (VM_PROT_WRITE cleared) in both `src` and `dest`. `area` is the
+ * vm_area that belongs to `src`; the duplicate in `dest` covers the same
+ * address range. offsets for which vm_object_get_page() returns no page are
+ * skipped. always returns KERN_OK.
+ * NOTE(review): step 1.d of the strategy described in
+ * address_space_duplicate() also calls for incrementing p_cow_ref; that is not
+ * done here. confirm whether vm_object_duplicate_cow() takes care of it. */
+static kern_status_t update_area_pte_cow(
+	struct address_space *src,
+	struct address_space *dest,
+	struct vm_area *area)
+{
+	if (!area->vma_object) {
+		return KERN_OK;
+	}
+
+	/* the protection is the same for every page, so compute it once. */
+	vm_prot_t ro_prot = area->vma_prot;
+	ro_prot &= ~VM_PROT_WRITE;
+
+	for (virt_addr_t i = area->vma_base; i < area->vma_limit;
+	     i += VM_PAGE_SIZE) {
+		off_t pg_offset = i - area->vma_base + area->vma_object_offset;
+		struct vm_page *pg = vm_object_get_page(
+			area->vma_object,
+			pg_offset,
+			0,
+			NULL);
+		if (!pg) {
+			continue;
+		}
+
+		pmap_add(
+			src->s_pmap,
+			i,
+			vm_page_get_pfn(pg),
+			ro_prot,
+			PMAP_NORMAL);
+		pmap_add(
+			dest->s_pmap,
+			i,
+			vm_page_get_pfn(pg),
+			ro_prot,
+			PMAP_NORMAL);
+		tracek("PTE %zx -> %zx [%x]",
+		       i,
+		       vm_page_get_paddr(pg),
+		       ro_prot);
+	}
+
+	return KERN_OK;
+}
+
+/* attach a copy-on-write duplicate of `src_vmo` to every vm_area in `dest`
+ * that mirrors a vm_area of `src` mapping `src_vmo`. the duplicate vm_object is
+ * created lazily and at most once per call, so all of those vm_areas share it.
+ * the caller must hold the lock on `src_vmo`.
+ * returns KERN_NO_MEMORY if the duplicate vm_object could not be created. */
+static kern_status_t duplicate_vmo_areas(
+	struct address_space *src,
+	struct address_space *dest,
+	struct vm_object *src_vmo)
+{
+	struct vm_object *dest_vmo = NULL;
+
+	for (struct queue_entry *cur = queue_first(&src_vmo->vo_mappings); cur;
+	     cur = queue_next(cur)) {
+		struct vm_area *src_area = QUEUE_CONTAINER(
+			struct vm_area,
+			vma_object_entry,
+			cur);
+		if (src_area->vma_space != src) {
+			/* belongs to a different address space. */
+			continue;
+		}
+
+		struct vm_area *dest_area = get_entry(
+			&dest->s_mappings,
+			src_area->vma_base,
+			GET_ENTRY_EXACT);
+		if (!dest_area) {
+			/* this shouldn't happen. the duplicate vm_areas were
+			 * already created by address_space_duplicate */
+			panic("duplicate_vmo_areas: corresponding "
+			      "vm_area is missing");
+		}
+
+		if (dest_area->vma_object) {
+			/* already attached by an earlier pass over this vmo. */
+			continue;
+		}
+
+		if (!dest_vmo) {
+			tracek("[%zx-%zx %x] creating COW duplicate of "
+			       "vmo %p",
+			       src_area->vma_base,
+			       src_area->vma_limit,
+			       src_area->vma_prot,
+			       src_vmo);
+			dest_vmo = vm_object_duplicate_cow(src_vmo);
+			tracek("[%zx-%zx %x] created COW duplicate of "
+			       "vmo %p -> %p",
+			       src_area->vma_base,
+			       src_area->vma_limit,
+			       src_area->vma_prot,
+			       src_vmo,
+			       dest_vmo);
+			if (!dest_vmo) {
+				return KERN_NO_MEMORY;
+			}
+		}
+
+		dest_area->vma_object = dest_vmo;
+		kern_status_t status = update_area_pte_cow(src, dest, src_area);
+		if (status != KERN_OK) {
+			return status;
+		}
+	}
+
+	return KERN_OK;
+}
+
+/* for every vm_object mapped by `src`, create a single copy-on-write duplicate
+ * and attach it to the matching vm_areas in `dest` (which must already have
+ * been created by address_space_duplicate()), write-protecting the affected
+ * pages in both address spaces. each source vm_object is locked while it is
+ * processed. returns the first error encountered, or KERN_OK. */
+static kern_status_t prepare_duplicate_areas(
+	struct address_space *src,
+	struct address_space *dest)
+{
+	for (struct btree_node *cur = btree_first(&src->s_mappings); cur;
+	     cur = btree_next(cur)) {
+		struct vm_area *area
+			= BTREE_CONTAINER(struct vm_area, vma_node, cur);
+		struct vm_object *src_vmo = area->vma_object;
+		if (!src_vmo) {
+			continue;
+		}
+
+		vm_object_lock(src_vmo);
+		kern_status_t status = duplicate_vmo_areas(src, dest, src_vmo);
+		vm_object_unlock(src_vmo);
+		if (status != KERN_OK) {
+			return status;
+		}
+	}
+
+	return KERN_OK;
+}
+
+/* duplicate every vm_area of `src` (both mapped and reserved) into `dest`, then
+ * attach copy-on-write duplicates of the backing vm_objects.
+ * returns KERN_NO_MEMORY if a vm_area cannot be allocated, or the status of
+ * prepare_duplicate_areas(). on failure `dest` keeps whatever was duplicated
+ * before the error.
+ * NOTE(review): KERN_NO_MEMORY is assumed to be this kernel's out-of-memory
+ * status; confirm the name against the kern_status_t definitions. */
+kern_status_t address_space_duplicate(
+	struct address_space *dest,
+	struct address_space *src)
+{
+	/* clang-format off
+	 * strategy for COW address space duplication:
+	 * 1. duplicate each vm_area in the address space
+	 *    a. all details except for the vm_object pointer are copied.
+	 *    b. create a duplicate vm_object, where all the details are the
+	 *       same, but don't copy the pages or vm_page pointers.
+	 *    c. if the vm_object is attached to a vm_controller, don't inform
+	 *       the controller yet.
+	 *    d. for both the original and duplicate vm_area, duplicate the PTE
+	 *       entries, changing all of them to read-only. increment the
+	 *       p_cow_ref counters for all committed vm_pages.
+	 *    e. use the vm_object's vm_area list, and the vm_area's vma_space
+	 *       pointer, to ensure that only one duplicate is created for each
+	 *       unique vm-object referenced by an address-space.
+	 * 2. when a page fault occurs:
+	 *    a. find the relevant vm_area as normal.
+	 *    b. if the faulted page is present and the vm_area's prot flags
+	 *       should allow the access, a COW is required.
+	 *    c. if the relevant page is already present in the vm_area's
+	 *       vm_object, this is the original vm_area. otherwise, this is the
+	 *       clone vm_area.
+	 *    d. if this is the source vm_area:
+	 *       i. decrement p_cow_ref in the page. if it is 0, skip to step v.
+	 *       ii. remove the relevant page from the vm_area
+	 *       iii. allocate a new page and copy the data.
+	 *       iv. add the new page to the vm_object at the same offset.
+	 *       v. change the PTE entry to the proper protection flags.
+	 *       vi. resume the faulting task.
+	 *    e. otherwise, if this is the clone vm_area:
+	 *       i. if the vm-object has a controller, send
+	 *          PAGE_REQUEST_DUPLICATE to it. the controller needs to
+	 *          prepare itself to receive page requests from this vm-object,
+	 *          which includes providing it an equeue_key_t.
+	 *       ii. use the physical address stored in the PTE to find the
+	 *           relevant vm_page.
+	 *       iii. decrement p_cow_ref in the page.
+	 *       iv. if p_cow_ref is > 0, allocate a new page and copy the data.
+	 *           otherwise, use the existing page as-is.
+	 *       v. add the page from step iv to the vm_object at the correct
+	 *          offset.
+	 *       vi. change the PTE entry to the proper protection flags.
+	 *       vii. resume the faulting task.
+	 * 3. when destroying a vm_area:
+	 *    a. for pages already present in a vm-object, handle as normal.
+	 *    b. for pages not present in a vm-object, but for which a valid PTE
+	 *       exists, use the PTE physical address to find the vm_page.
+	 *    c. decrement p_cow_ref in this page.
+	 *    d. if p_cow_ref == 0, de-allocate the page.
+	 * clang-format on
+	 */
+	for (struct btree_node *cur = btree_first(&src->s_mappings); cur;
+	     cur = btree_next(cur)) {
+		struct vm_area *src_area
+			= BTREE_CONTAINER(struct vm_area, vma_node, cur);
+		struct vm_area *dest_area = area_duplicate(src_area);
+		if (!dest_area) {
+			return KERN_NO_MEMORY;
+		}
+
+		tracek("duplicated vm_area [%zx-%zx] %p -> %p",
+		       src_area->vma_base,
+		       src_area->vma_limit,
+		       src_area,
+		       dest_area);
+		put_entry(&dest->s_mappings, dest_area);
+	}
+
+	for (struct btree_node *cur = btree_first(&src->s_reserved); cur;
+	     cur = btree_next(cur)) {
+		struct vm_area *src_area
+			= BTREE_CONTAINER(struct vm_area, vma_node, cur);
+		struct vm_area *dest_area = area_duplicate(src_area);
+		if (!dest_area) {
+			return KERN_NO_MEMORY;
+		}
+
+		tracek("duplicated vm_area [r] [%zx-%zx] %p -> %p",
+		       src_area->vma_base,
+		       src_area->vma_limit,
+		       src_area,
+		       dest_area);
+		put_entry(&dest->s_reserved, dest_area);
+	}
+
+	tracek("preparing duplicate areas");
+	kern_status_t status = prepare_duplicate_areas(src, dest);
+	tracek("prepared duplicate areas");
+
+	return status;
+}
+
 bool address_space_validate_access(
 	struct address_space *region,
 	virt_addr_t ptr,