diff --git a/CMakeLists.txt b/CMakeLists.txt index 50a3fedf..6d7d55f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,13 +1,19 @@ cmake_minimum_required(VERSION 3.7) -project(noah VERSION 0.3.9) +project(noah VERSION 0.5.1) configure_file(bin/noah.in ${PROJECT_SOURCE_DIR}/bin/noah @ONLY) configure_file(include/version.h.in ${PROJECT_SOURCE_DIR}/include/version.h) +if (CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 16.0.0) +add_definitions("-DMACOS_PRE_16") +endif() + +find_library(HYPERVISOR_FRAMEWORK Hypervisor) +find_library(PTHREAD_LIBRARY pthread) + set(CMAKE_C_FLAGS "-Wall -Wextra -Wno-unused-parameter -std=gnu11") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0 -g -fsanitize=address -fno-omit-frame-pointer") -set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2") include_directories(include) add_executable(noah @@ -33,10 +39,11 @@ add_executable(noah src/mm/shm.c src/ipc/sem.c ) -target_link_libraries(noah pthread "-framework Hypervisor") +target_link_libraries(noah ${PTHREAD_LIBRARY} ${HYPERVISOR_FRAMEWORK}) install(PROGRAMS bin/noah DESTINATION bin) install(TARGETS noah DESTINATION libexec) +install(FILES man/noah.1 DESTINATION ${CMAKE_INSTALL_PREFIX}/man/man1) enable_testing() add_subdirectory(test) diff --git a/HACKING.md b/HACKING.md index 43c85522..d6effac7 100644 --- a/HACKING.md +++ b/HACKING.md @@ -18,6 +18,15 @@ $ INSTALL_PREFIX/libexec/noah -m ROOT_PATH PATH_TO_INIT ``` where `ROOT_PATH` is a path to the directory that is treated as the root mount point in the Linux box, and `PATH_TO_INIT` is a path to the first command to be run in the boot sequence, like `/bin/bash`. +`noahstrap` helps you set up a Linux environment on your local machine. +It retrieves a ready-to-use distro image from the Internet and extracts it to a specified directory. +`noahstrap` is installed via homebrew. + +```console +$ brew install linux-noah/noah/noahstrap +$ noahstrap --help # prints help message +``` + ## Debugging There are several methods to debug `noah`. 
diff --git a/README.md b/README.md index 166384ac..404a73ea 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,28 @@ -# Noah [![Build Status](http://noah-ci.idylls.jp/job/noah/badge/icon)](http://noah-ci.idylls.jp/job/noah/) +# Noah Noah is a Darwin subsystem for Linux, or "Bash on Ubuntu on Mac OS X". Noah is implemented as a hypervisor that traps linux system calls and translates them into Darwin's system calls. Noah also has an interpreter of ELF files so that binary executables of Linux run directly and flawlessly without any modifications. +__Noah is an experimental product__. Most Linux apps don't work well due to missing system calls. For the technical details, please refer to [its academic paper](https://dl.acm.org/doi/abs/10.1145/3381052.3381327). Currently, this repository is not actively maintained. It might not run on the latest macOS. + + + ## Quick Start -Noah is installed via homebrew. On the first run, noah automatically downloads and installs a comprehensive linux environment in your home directory (by default, ubuntu 16.04 is installed in `~/.noah/tree`). +Noah is installed via Homebrew or MacPorts. On the first run, noah automatically downloads and installs a comprehensive linux environment in your home directory (by default, ubuntu 16.04 is installed in `~/.noah/tree`). +macOS Sierra or higher is required. + +### Homebrew + +```console +$ brew install linux-noah/noah/noah +$ noah +``` +### MacPorts - $ brew tap linux-noah/noah - $ brew install noah - $ noah +```console +$ sudo port install noah +$ noah +``` ## Hacking @@ -16,4 +30,4 @@ See [HACKING.md](HACKING.md). ## LICENSE -Dual MITL/GPL, for all files without explicit notaiton. +Dual MITL/GPL, for all files without explicit notation. 
diff --git a/bin/noah.in b/bin/noah.in index 1f4be49a..b319ea71 100755 --- a/bin/noah.in +++ b/bin/noah.in @@ -54,7 +54,7 @@ if ($> != 0) { my $init = "/bin/bash -i"; if (@ARGV != 0) { - $init = "@ARGV"; + $init = join(" ", map {"'$_'"} @ARGV); } my $opts = ""; diff --git a/images/screenshot.png b/images/screenshot.png new file mode 100755 index 00000000..8786efdc Binary files /dev/null and b/images/screenshot.png differ diff --git a/include/linux/futex.h b/include/linux/futex.h index 970e3773..9abaabbd 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -92,4 +92,51 @@ struct linux_robust_list_head { l_uintptr_t pending_list; }; +#define LINUX_PR_SET_PDEATHSIG 1 +#define LINUX_PR_GET_PDEATHSIG 2 +#define LINUX_PR_GET_DUMPABLE 3 +#define LINUX_PR_SET_DUMPABLE 4 +#define LINUX_PR_GET_UNALIGN 5 +#define LINUX_PR_SET_UNALIGN 6 +#define LINUX_PR_GET_KEEPCAPS 7 +#define LINUX_PR_SET_KEEPCAPS 8 +#define LINUX_PR_GET_FPEMU 9 +#define LINUX_PR_SET_FPEMU 10 +#define LINUX_PR_GET_FPEXC 11 +#define LINUX_PR_SET_FPEXC 12 +#define LINUX_PR_GET_TIMING 13 +#define LINUX_PR_SET_TIMING 14 +#define LINUX_PR_SET_NAME 15 +#define LINUX_PR_GET_NAME 16 + +#define LINIX_PR_GET_ENDIAN 19 +#define LINIX_PR_SET_ENDIAN 20 +#define LINIX_PR_GET_SECCOMP 21 +#define LINIX_PR_SET_SECCOMP 22 +#define LINIX_PR_CAPBSET_READ 23 +#define LINIX_PR_CAPBSET_DROP 24 +#define LINIX_PR_GET_TSC 25 +#define LINIX_PR_SET_TSC 26 +#define LINIX_PR_GET_SECUREBITS 27 +#define LINIX_PR_SET_SECUREBITS 28 +#define LINIX_PR_SET_TIMERSLACK 29 +#define LINIX_PR_GET_TIMERSLACK 30 +#define LINIX_PR_TASK_PERF_EVENTS_DISABLE 31 +#define LINIX_PR_TASK_PERF_EVENTS_ENABLE 32 +#define LINIX_PR_MCE_KILL 33 +#define LINIX_PR_MCE_KILL_GET 34 +#define LINIX_PR_SET_MM 35 +#define LINIX_PR_SET_CHILD_SUBREAPER 36 +#define LINIX_PR_GET_CHILD_SUBREAPER 37 +#define LINIX_PR_SET_NO_NEW_PRIVS 38 +#define LINIX_PR_GET_NO_NEW_PRIVS 39 +#define LINIX_PR_GET_TID_ADDRESS 40 +#define LINIX_PR_SET_THP_DISABLE 41 +#define 
LINIX_PR_GET_THP_DISABLE 42 +#define LINIX_PR_MPX_ENABLE_MANAGEMENT 43 +#define LINIX_PR_MPX_DISABLE_MANAGEMENT 44 +#define LINIX_PR_SET_FP_MODE 45 +#define LINIX_PR_GET_FP_MODE 46 +#define LINIX_PR_CAP_AMBIENT 47 + #endif /* !_LINUX_FUTEX_H */ diff --git a/include/linux/misc.h b/include/linux/misc.h index ed16d404..80c52205 100644 --- a/include/linux/misc.h +++ b/include/linux/misc.h @@ -80,6 +80,8 @@ struct l___sysctl_args #define LINUX_RLIM_NLIMITS 10 +#define LINUX_RLIM_INFINITY (~0UL) + struct l_rlimit { l_ulong rlim_cur; l_ulong rlim_max; diff --git a/include/linux/time.h b/include/linux/time.h index bb7c2ea0..1a4162e1 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -50,6 +50,11 @@ struct l_timezone { typedef uint32_t l_clockid_t; +struct l_itimerval { + struct l_timeval it_interval; + struct l_timeval it_value; +}; + #define LINUX_CLOCK_REALTIME 0 #define LINUX_CLOCK_MONOTONIC 1 #define LINUX_CLOCK_PROCESS_CPUTIME_ID 2 diff --git a/include/noah.h b/include/noah.h index eb82d97e..97cc2fb0 100644 --- a/include/noah.h +++ b/include/noah.h @@ -81,7 +81,7 @@ struct task { struct fdtable { int start; // First fd number of this table int size; // Current table size expressed in number of bits - struct file *files; + struct file **files; uint64_t *open_fds; uint64_t *cloexec_fds; }; diff --git a/include/syscall.h b/include/syscall.h index 71fe9df8..6b5a9581 100644 --- a/include/syscall.h +++ b/include/syscall.h @@ -37,9 +37,9 @@ SYSCALL(33, dup2) \ SYSCALL(34, unimplemented) \ SYSCALL(35, nanosleep) \ - SYSCALL(36, unimplemented) \ + SYSCALL(36, getitimer) \ SYSCALL(37, alarm) \ - SYSCALL(38, unimplemented) \ + SYSCALL(38, setitimer) \ SYSCALL(39, getpid) \ SYSCALL(40, unimplemented) \ SYSCALL(41, socket) \ @@ -134,7 +134,7 @@ SYSCALL(130, rt_sigsuspend) \ SYSCALL(131, sigaltstack) \ SYSCALL(132, utime) \ - SYSCALL(133, unimplemented) \ + SYSCALL(133, mknod) \ SYSCALL(134, uselib) \ SYSCALL(135, unimplemented) \ SYSCALL(136, unimplemented) \ @@ 
-260,7 +260,7 @@ SYSCALL(256, unimplemented) \ SYSCALL(257, openat) \ SYSCALL(258, mkdirat) \ - SYSCALL(259, unimplemented) \ + SYSCALL(259, mknodat) \ SYSCALL(260, fchownat) \ SYSCALL(261, unimplemented) \ SYSCALL(262, newfstatat) \ diff --git a/include/vmm.h b/include/vmm.h index be733b99..b080611e 100644 --- a/include/vmm.h +++ b/include/vmm.h @@ -11,8 +11,8 @@ struct vcpu_snapshot { uint64_t vcpu_reg[NR_X86_REG_LIST]; - uint64_t vmcs[NR_VMCS_FIELD]; - char fpu_states[512] __attribute__((aligned(16))); + uint64_t vmcs[NR_VMCS_FIELD_MASKED]; + char fpu_states[2496] __attribute__((aligned(16))); }; struct vmm_snapshot { diff --git a/include/x86/specialreg.h b/include/x86/specialreg.h index 1e68405c..55e3a266 100644 --- a/include/x86/specialreg.h +++ b/include/x86/specialreg.h @@ -17,3 +17,6 @@ #define MSR_TIME_STAMP_COUNTER 0x00000010 #define MSR_KERNEL_GS_BASE 0xc0000102 #define MSR_TSC_AUX 0xc0000103 + +#define XCR0_SSE_STATE 0x00000002 +#define XCR0_AVX_STATE 0x00000004 diff --git a/include/x86/vm.h b/include/x86/vm.h index 79e746c5..c0553ded 100644 --- a/include/x86/vm.h +++ b/include/x86/vm.h @@ -18,13 +18,13 @@ typedef enum { PAGE_PML4E, } page_type_t; -#define PAGE_SHIFT(page_type) (12 + (page_type) * 9) -#define PAGE_SIZE(page_type) (1ULL << PAGE_SHIFT(page_type)) +#define PAGE_SHIFTOF(page_type) (12 + (page_type) * 9) +#define PAGE_SIZEOF(page_type) (1ULL << PAGE_SHIFTOF(page_type)) #define NR_PAGE_ENTRY 512 static inline int is_page_aligned(void *addr, page_type_t page) { - return ((uint64_t)addr & (PAGE_SIZE(page) - 1)) == 0; + return ((uint64_t)addr & (PAGE_SIZEOF(page) - 1)) == 0; } /* idt */ diff --git a/include/x86/vmx.h b/include/x86/vmx.h index 35b63b7a..1e28ba16 100644 --- a/include/x86/vmx.h +++ b/include/x86/vmx.h @@ -1,171 +1,185 @@ #ifndef NOAH_X86_VMX_H #define NOAH_X86_VMX_H -#define NR_VMCS_FIELD (sizeof(vmcs_field_list) / sizeof(uint32_t) - 1) +#define elementsof(array) ( sizeof(array) / sizeof((array)[0]) ) +#define NR_VMCS_FIELD 
(elementsof(vmcs_field_list) - 1) +#define NR_VMCS_FIELD_MASKED (elementsof(vmcs_field_masked_list) - 1) -#define VMCS_FIELD_ENTRIES \ - VMCS_FIELD(VMCS_VPID) \ - VMCS_FIELD(VMCS_CTRL_POSTED_INT_N_VECTOR) \ - VMCS_FIELD(VMCS_CTRL_EPTP_INDEX) \ - VMCS_FIELD(VMCS_GUEST_ES) \ - VMCS_FIELD(VMCS_GUEST_CS) \ - VMCS_FIELD(VMCS_GUEST_SS) \ - VMCS_FIELD(VMCS_GUEST_DS) \ - VMCS_FIELD(VMCS_GUEST_FS) \ - VMCS_FIELD(VMCS_GUEST_GS) \ - VMCS_FIELD(VMCS_GUEST_LDTR) \ - VMCS_FIELD(VMCS_GUEST_TR) \ - VMCS_FIELD(VMCS_GUEST_INT_STATUS) \ - VMCS_FIELD(VMCS_HOST_ES) \ - VMCS_FIELD(VMCS_HOST_CS) \ - VMCS_FIELD(VMCS_HOST_SS) \ - VMCS_FIELD(VMCS_HOST_DS) \ - VMCS_FIELD(VMCS_HOST_FS) \ - VMCS_FIELD(VMCS_HOST_GS) \ - VMCS_FIELD(VMCS_HOST_TR) \ - VMCS_FIELD(VMCS_CTRL_IO_BITMAP_A) \ - VMCS_FIELD(VMCS_CTRL_IO_BITMAP_B) \ - VMCS_FIELD(VMCS_CTRL_MSR_BITMAPS) \ - VMCS_FIELD(VMCS_CTRL_VMEXIT_MSR_STORE_ADDR) \ - VMCS_FIELD(VMCS_CTRL_VMEXIT_MSR_LOAD_ADDR) \ - VMCS_FIELD(VMCS_CTRL_VMENTRY_MSR_LOAD_ADDR) \ - VMCS_FIELD(VMCS_CTRL_EXECUTIVE_VMCS_PTR) \ - VMCS_FIELD(VMCS_CTRL_TSC_OFFSET) \ - VMCS_FIELD(VMCS_CTRL_VIRTUAL_APIC) \ - VMCS_FIELD(VMCS_CTRL_APIC_ACCESS) \ - VMCS_FIELD(VMCS_CTRL_POSTED_INT_DESC_ADDR) \ - VMCS_FIELD(VMCS_CTRL_VMFUNC_CTRL) \ - VMCS_FIELD(VMCS_CTRL_EPTP) \ - VMCS_FIELD(VMCS_CTRL_EOI_EXIT_BITMAP_0) \ - VMCS_FIELD(VMCS_CTRL_EOI_EXIT_BITMAP_1) \ - VMCS_FIELD(VMCS_CTRL_EOI_EXIT_BITMAP_2) \ - VMCS_FIELD(VMCS_CTRL_EOI_EXIT_BITMAP_3) \ - VMCS_FIELD(VMCS_CTRL_EPTP_LIST_ADDR) \ - VMCS_FIELD(VMCS_CTRL_VMREAD_BITMAP_ADDR) \ - VMCS_FIELD(VMCS_CTRL_VMWRITE_BITMAP_ADDR) \ - VMCS_FIELD(VMCS_CTRL_VIRT_EXC_INFO_ADDR) \ - VMCS_FIELD(VMCS_CTRL_XSS_EXITING_BITMAP) \ - VMCS_FIELD(VMCS_GUEST_PHYSICAL_ADDRESS) \ - VMCS_FIELD(VMCS_GUEST_LINK_POINTER) \ - VMCS_FIELD(VMCS_GUEST_IA32_DEBUGCTL) \ - VMCS_FIELD(VMCS_GUEST_IA32_PAT) \ - VMCS_FIELD(VMCS_GUEST_IA32_EFER) \ - VMCS_FIELD(VMCS_GUEST_IA32_PERF_GLOBAL_CTRL) \ - VMCS_FIELD(VMCS_GUEST_PDPTE0) \ - VMCS_FIELD(VMCS_GUEST_PDPTE1) \ - 
VMCS_FIELD(VMCS_GUEST_PDPTE2) \ - VMCS_FIELD(VMCS_GUEST_PDPTE3) \ - VMCS_FIELD(VMCS_HOST_IA32_PAT) \ - VMCS_FIELD(VMCS_HOST_IA32_EFER) \ - VMCS_FIELD(VMCS_HOST_IA32_PERF_GLOBAL_CTRL) \ - VMCS_FIELD(VMCS_CTRL_PIN_BASED) \ - VMCS_FIELD(VMCS_CTRL_CPU_BASED) \ - VMCS_FIELD(VMCS_CTRL_EXC_BITMAP) \ - VMCS_FIELD(VMCS_CTRL_PF_ERROR_MASK) \ - VMCS_FIELD(VMCS_CTRL_PF_ERROR_MATCH) \ - VMCS_FIELD(VMCS_CTRL_CR3_COUNT) \ - VMCS_FIELD(VMCS_CTRL_VMEXIT_CONTROLS) \ - VMCS_FIELD(VMCS_CTRL_VMEXIT_MSR_STORE_COUNT) \ - VMCS_FIELD(VMCS_CTRL_VMEXIT_MSR_LOAD_COUNT) \ - VMCS_FIELD(VMCS_CTRL_VMENTRY_CONTROLS) \ - VMCS_FIELD(VMCS_CTRL_VMENTRY_MSR_LOAD_COUNT) \ - VMCS_FIELD(VMCS_CTRL_VMENTRY_IRQ_INFO) \ - VMCS_FIELD(VMCS_CTRL_VMENTRY_EXC_ERROR) \ - VMCS_FIELD(VMCS_CTRL_VMENTRY_INSTR_LEN) \ - VMCS_FIELD(VMCS_CTRL_TPR_THRESHOLD) \ - VMCS_FIELD(VMCS_CTRL_CPU_BASED2) \ - VMCS_FIELD(VMCS_CTRL_PLE_GAP) \ - VMCS_FIELD(VMCS_CTRL_PLE_WINDOW) \ - VMCS_FIELD(VMCS_RO_INSTR_ERROR) \ - VMCS_FIELD(VMCS_RO_EXIT_REASON) \ - VMCS_FIELD(VMCS_RO_VMEXIT_IRQ_INFO) \ - VMCS_FIELD(VMCS_RO_VMEXIT_IRQ_ERROR) \ - VMCS_FIELD(VMCS_RO_IDT_VECTOR_INFO) \ - VMCS_FIELD(VMCS_RO_IDT_VECTOR_ERROR) \ - VMCS_FIELD(VMCS_RO_VMEXIT_INSTR_LEN) \ - VMCS_FIELD(VMCS_RO_VMX_INSTR_INFO) \ - VMCS_FIELD(VMCS_GUEST_ES_LIMIT) \ - VMCS_FIELD(VMCS_GUEST_CS_LIMIT) \ - VMCS_FIELD(VMCS_GUEST_SS_LIMIT) \ - VMCS_FIELD(VMCS_GUEST_DS_LIMIT) \ - VMCS_FIELD(VMCS_GUEST_FS_LIMIT) \ - VMCS_FIELD(VMCS_GUEST_GS_LIMIT) \ - VMCS_FIELD(VMCS_GUEST_LDTR_LIMIT) \ - VMCS_FIELD(VMCS_GUEST_TR_LIMIT) \ - VMCS_FIELD(VMCS_GUEST_GDTR_LIMIT) \ - VMCS_FIELD(VMCS_GUEST_IDTR_LIMIT) \ - VMCS_FIELD(VMCS_GUEST_ES_AR) \ - VMCS_FIELD(VMCS_GUEST_CS_AR) \ - VMCS_FIELD(VMCS_GUEST_SS_AR) \ - VMCS_FIELD(VMCS_GUEST_DS_AR) \ - VMCS_FIELD(VMCS_GUEST_FS_AR) \ - VMCS_FIELD(VMCS_GUEST_GS_AR) \ - VMCS_FIELD(VMCS_GUEST_LDTR_AR) \ - VMCS_FIELD(VMCS_GUEST_TR_AR) \ - VMCS_FIELD(VMCS_GUEST_IGNORE_IRQ) \ - VMCS_FIELD(VMCS_GUEST_ACTIVITY_STATE) \ - VMCS_FIELD(VMCS_GUEST_SMBASE) \ - 
VMCS_FIELD(VMCS_GUEST_IA32_SYSENTER_CS) \ - VMCS_FIELD(VMCS_GUEST_VMX_TIMER_VALUE) \ - VMCS_FIELD(VMCS_HOST_IA32_SYSENTER_CS) \ - VMCS_FIELD(VMCS_CTRL_CR0_MASK) \ - VMCS_FIELD(VMCS_CTRL_CR4_MASK) \ - VMCS_FIELD(VMCS_CTRL_CR0_SHADOW) \ - VMCS_FIELD(VMCS_CTRL_CR4_SHADOW) \ - VMCS_FIELD(VMCS_CTRL_CR3_VALUE0) \ - VMCS_FIELD(VMCS_CTRL_CR3_VALUE1) \ - VMCS_FIELD(VMCS_CTRL_CR3_VALUE2) \ - VMCS_FIELD(VMCS_CTRL_CR3_VALUE3) \ - VMCS_FIELD(VMCS_RO_EXIT_QUALIFIC) \ - VMCS_FIELD(VMCS_RO_IO_RCX) \ - VMCS_FIELD(VMCS_RO_IO_RSI) \ - VMCS_FIELD(VMCS_RO_IO_RDI) \ - VMCS_FIELD(VMCS_RO_IO_RIP) \ - VMCS_FIELD(VMCS_RO_GUEST_LIN_ADDR) \ - VMCS_FIELD(VMCS_GUEST_CR0) \ - VMCS_FIELD(VMCS_GUEST_CR3) \ - VMCS_FIELD(VMCS_GUEST_CR4) \ - VMCS_FIELD(VMCS_GUEST_ES_BASE) \ - VMCS_FIELD(VMCS_GUEST_CS_BASE) \ - VMCS_FIELD(VMCS_GUEST_SS_BASE) \ - VMCS_FIELD(VMCS_GUEST_DS_BASE) \ - VMCS_FIELD(VMCS_GUEST_FS_BASE) \ - VMCS_FIELD(VMCS_GUEST_GS_BASE) \ - VMCS_FIELD(VMCS_GUEST_LDTR_BASE) \ - VMCS_FIELD(VMCS_GUEST_TR_BASE) \ - VMCS_FIELD(VMCS_GUEST_GDTR_BASE) \ - VMCS_FIELD(VMCS_GUEST_IDTR_BASE) \ - VMCS_FIELD(VMCS_GUEST_DR7) \ - VMCS_FIELD(VMCS_GUEST_RSP) \ - VMCS_FIELD(VMCS_GUEST_RIP) \ - VMCS_FIELD(VMCS_GUEST_RFLAGS) \ - VMCS_FIELD(VMCS_GUEST_DEBUG_EXC) \ - VMCS_FIELD(VMCS_GUEST_SYSENTER_ESP) \ - VMCS_FIELD(VMCS_GUEST_SYSENTER_EIP) \ - VMCS_FIELD(VMCS_HOST_CR0) \ - VMCS_FIELD(VMCS_HOST_CR3) \ - VMCS_FIELD(VMCS_HOST_CR4) \ - VMCS_FIELD(VMCS_HOST_FS_BASE) \ - VMCS_FIELD(VMCS_HOST_GS_BASE) \ - VMCS_FIELD(VMCS_HOST_TR_BASE) \ - VMCS_FIELD(VMCS_HOST_GDTR_BASE) \ - VMCS_FIELD(VMCS_HOST_IDTR_BASE) \ - VMCS_FIELD(VMCS_HOST_IA32_SYSENTER_ESP) \ - VMCS_FIELD(VMCS_HOST_IA32_SYSENTER_EIP) \ - VMCS_FIELD(VMCS_HOST_RSP) \ - VMCS_FIELD(VMCS_HOST_RIP) \ +#define VMCS_FIELD_ENTRIES \ + MASK( VMCS_FIELD(VMCS_VPID) ) \ + VMCS_FIELD(VMCS_CTRL_POSTED_INT_N_VECTOR) \ + VMCS_FIELD(VMCS_CTRL_EPTP_INDEX) \ + VMCS_FIELD(VMCS_GUEST_ES) \ + VMCS_FIELD(VMCS_GUEST_CS) \ + VMCS_FIELD(VMCS_GUEST_SS) \ + VMCS_FIELD(VMCS_GUEST_DS) \ + 
VMCS_FIELD(VMCS_GUEST_FS) \ + VMCS_FIELD(VMCS_GUEST_GS) \ + VMCS_FIELD(VMCS_GUEST_LDTR) \ + VMCS_FIELD(VMCS_GUEST_TR) \ + VMCS_FIELD(VMCS_GUEST_INT_STATUS) \ + MASK( VMCS_FIELD(VMCS_HOST_ES) ) \ + MASK( VMCS_FIELD(VMCS_HOST_CS) ) \ + MASK( VMCS_FIELD(VMCS_HOST_SS) ) \ + MASK( VMCS_FIELD(VMCS_HOST_DS) ) \ + MASK( VMCS_FIELD(VMCS_HOST_FS) ) \ + MASK( VMCS_FIELD(VMCS_HOST_GS) ) \ + MASK( VMCS_FIELD(VMCS_HOST_TR) ) \ + VMCS_FIELD(VMCS_CTRL_IO_BITMAP_A) \ + VMCS_FIELD(VMCS_CTRL_IO_BITMAP_B) \ + VMCS_FIELD(VMCS_CTRL_MSR_BITMAPS) \ + VMCS_FIELD(VMCS_CTRL_VMEXIT_MSR_STORE_ADDR) \ + VMCS_FIELD(VMCS_CTRL_VMEXIT_MSR_LOAD_ADDR) \ + VMCS_FIELD(VMCS_CTRL_VMENTRY_MSR_LOAD_ADDR) \ + VMCS_FIELD(VMCS_CTRL_EXECUTIVE_VMCS_PTR) \ + VMCS_FIELD(VMCS_CTRL_TSC_OFFSET) \ + VMCS_FIELD(VMCS_CTRL_VIRTUAL_APIC) \ + VMCS_FIELD(VMCS_CTRL_APIC_ACCESS) \ + VMCS_FIELD(VMCS_CTRL_POSTED_INT_DESC_ADDR) \ + VMCS_FIELD(VMCS_CTRL_VMFUNC_CTRL) \ + VMCS_FIELD(VMCS_CTRL_EPTP) \ + VMCS_FIELD(VMCS_CTRL_EOI_EXIT_BITMAP_0) \ + VMCS_FIELD(VMCS_CTRL_EOI_EXIT_BITMAP_1) \ + VMCS_FIELD(VMCS_CTRL_EOI_EXIT_BITMAP_2) \ + VMCS_FIELD(VMCS_CTRL_EOI_EXIT_BITMAP_3) \ + VMCS_FIELD(VMCS_CTRL_EPTP_LIST_ADDR) \ + VMCS_FIELD(VMCS_CTRL_VMREAD_BITMAP_ADDR) \ + VMCS_FIELD(VMCS_CTRL_VMWRITE_BITMAP_ADDR) \ + VMCS_FIELD(VMCS_CTRL_VIRT_EXC_INFO_ADDR) \ + VMCS_FIELD(VMCS_CTRL_XSS_EXITING_BITMAP) \ + MASK( VMCS_FIELD(VMCS_GUEST_PHYSICAL_ADDRESS) ) \ + VMCS_FIELD(VMCS_GUEST_LINK_POINTER) \ + VMCS_FIELD(VMCS_GUEST_IA32_DEBUGCTL) \ + VMCS_FIELD(VMCS_GUEST_IA32_PAT) \ + VMCS_FIELD(VMCS_GUEST_IA32_EFER) \ + VMCS_FIELD(VMCS_GUEST_IA32_PERF_GLOBAL_CTRL) \ + VMCS_FIELD(VMCS_GUEST_PDPTE0) \ + VMCS_FIELD(VMCS_GUEST_PDPTE1) \ + VMCS_FIELD(VMCS_GUEST_PDPTE2) \ + VMCS_FIELD(VMCS_GUEST_PDPTE3) \ + MASK( VMCS_FIELD(VMCS_HOST_IA32_PAT) ) \ + MASK( VMCS_FIELD(VMCS_HOST_IA32_EFER) ) \ + MASK( VMCS_FIELD(VMCS_HOST_IA32_PERF_GLOBAL_CTRL) ) \ + VMCS_FIELD(VMCS_CTRL_PIN_BASED) \ + VMCS_FIELD(VMCS_CTRL_CPU_BASED) \ + VMCS_FIELD(VMCS_CTRL_EXC_BITMAP) \ + 
VMCS_FIELD(VMCS_CTRL_PF_ERROR_MASK) \ + VMCS_FIELD(VMCS_CTRL_PF_ERROR_MATCH) \ + VMCS_FIELD(VMCS_CTRL_CR3_COUNT) \ + VMCS_FIELD(VMCS_CTRL_VMEXIT_CONTROLS) \ + VMCS_FIELD(VMCS_CTRL_VMEXIT_MSR_STORE_COUNT) \ + VMCS_FIELD(VMCS_CTRL_VMEXIT_MSR_LOAD_COUNT) \ + VMCS_FIELD(VMCS_CTRL_VMENTRY_CONTROLS) \ + VMCS_FIELD(VMCS_CTRL_VMENTRY_MSR_LOAD_COUNT) \ + VMCS_FIELD(VMCS_CTRL_VMENTRY_IRQ_INFO) \ + VMCS_FIELD(VMCS_CTRL_VMENTRY_EXC_ERROR) \ + VMCS_FIELD(VMCS_CTRL_VMENTRY_INSTR_LEN) \ + VMCS_FIELD(VMCS_CTRL_TPR_THRESHOLD) \ + VMCS_FIELD(VMCS_CTRL_CPU_BASED2) \ + VMCS_FIELD(VMCS_CTRL_PLE_GAP) \ + VMCS_FIELD(VMCS_CTRL_PLE_WINDOW) \ + MASK( VMCS_FIELD(VMCS_RO_INSTR_ERROR) ) \ + MASK( VMCS_FIELD(VMCS_RO_EXIT_REASON) ) \ + MASK( VMCS_FIELD(VMCS_RO_VMEXIT_IRQ_INFO) ) \ + MASK( VMCS_FIELD(VMCS_RO_VMEXIT_IRQ_ERROR) ) \ + MASK( VMCS_FIELD(VMCS_RO_IDT_VECTOR_INFO) ) \ + MASK( VMCS_FIELD(VMCS_RO_IDT_VECTOR_ERROR) ) \ + MASK( VMCS_FIELD(VMCS_RO_VMEXIT_INSTR_LEN) ) \ + MASK( VMCS_FIELD(VMCS_RO_VMX_INSTR_INFO) ) \ + VMCS_FIELD(VMCS_GUEST_ES_LIMIT) \ + VMCS_FIELD(VMCS_GUEST_CS_LIMIT) \ + VMCS_FIELD(VMCS_GUEST_SS_LIMIT) \ + VMCS_FIELD(VMCS_GUEST_DS_LIMIT) \ + VMCS_FIELD(VMCS_GUEST_FS_LIMIT) \ + VMCS_FIELD(VMCS_GUEST_GS_LIMIT) \ + VMCS_FIELD(VMCS_GUEST_LDTR_LIMIT) \ + VMCS_FIELD(VMCS_GUEST_TR_LIMIT) \ + VMCS_FIELD(VMCS_GUEST_GDTR_LIMIT) \ + VMCS_FIELD(VMCS_GUEST_IDTR_LIMIT) \ + VMCS_FIELD(VMCS_GUEST_ES_AR) \ + VMCS_FIELD(VMCS_GUEST_CS_AR) \ + VMCS_FIELD(VMCS_GUEST_SS_AR) \ + VMCS_FIELD(VMCS_GUEST_DS_AR) \ + VMCS_FIELD(VMCS_GUEST_FS_AR) \ + VMCS_FIELD(VMCS_GUEST_GS_AR) \ + VMCS_FIELD(VMCS_GUEST_LDTR_AR) \ + VMCS_FIELD(VMCS_GUEST_TR_AR) \ + VMCS_FIELD(VMCS_GUEST_IGNORE_IRQ) \ + VMCS_FIELD(VMCS_GUEST_ACTIVITY_STATE) \ + VMCS_FIELD(VMCS_GUEST_SMBASE) \ + VMCS_FIELD(VMCS_GUEST_IA32_SYSENTER_CS) \ + VMCS_FIELD(VMCS_GUEST_VMX_TIMER_VALUE) \ + VMCS_FIELD(VMCS_HOST_IA32_SYSENTER_CS) \ + VMCS_FIELD(VMCS_CTRL_CR0_MASK) \ + VMCS_FIELD(VMCS_CTRL_CR4_MASK) \ + VMCS_FIELD(VMCS_CTRL_CR0_SHADOW) \ + 
VMCS_FIELD(VMCS_CTRL_CR4_SHADOW) \ + VMCS_FIELD(VMCS_CTRL_CR3_VALUE0) \ + VMCS_FIELD(VMCS_CTRL_CR3_VALUE1) \ + VMCS_FIELD(VMCS_CTRL_CR3_VALUE2) \ + VMCS_FIELD(VMCS_CTRL_CR3_VALUE3) \ + MASK( VMCS_FIELD(VMCS_RO_EXIT_QUALIFIC) ) \ + MASK( VMCS_FIELD(VMCS_RO_IO_RCX) ) \ + MASK( VMCS_FIELD(VMCS_RO_IO_RSI) ) \ + MASK( VMCS_FIELD(VMCS_RO_IO_RDI) ) \ + MASK( VMCS_FIELD(VMCS_RO_IO_RIP) ) \ + MASK( VMCS_FIELD(VMCS_RO_GUEST_LIN_ADDR) ) \ + VMCS_FIELD(VMCS_GUEST_CR0) \ + VMCS_FIELD(VMCS_GUEST_CR3) \ + VMCS_FIELD(VMCS_GUEST_CR4) \ + VMCS_FIELD(VMCS_GUEST_ES_BASE) \ + VMCS_FIELD(VMCS_GUEST_CS_BASE) \ + VMCS_FIELD(VMCS_GUEST_SS_BASE) \ + VMCS_FIELD(VMCS_GUEST_DS_BASE) \ + VMCS_FIELD(VMCS_GUEST_FS_BASE) \ + VMCS_FIELD(VMCS_GUEST_GS_BASE) \ + VMCS_FIELD(VMCS_GUEST_LDTR_BASE) \ + VMCS_FIELD(VMCS_GUEST_TR_BASE) \ + VMCS_FIELD(VMCS_GUEST_GDTR_BASE) \ + VMCS_FIELD(VMCS_GUEST_IDTR_BASE) \ + VMCS_FIELD(VMCS_GUEST_DR7) \ + VMCS_FIELD(VMCS_GUEST_RSP) \ + VMCS_FIELD(VMCS_GUEST_RIP) \ + VMCS_FIELD(VMCS_GUEST_RFLAGS) \ + VMCS_FIELD(VMCS_GUEST_DEBUG_EXC) \ + VMCS_FIELD(VMCS_GUEST_SYSENTER_ESP) \ + VMCS_FIELD(VMCS_GUEST_SYSENTER_EIP) \ + MASK( VMCS_FIELD(VMCS_HOST_CR0) ) \ + MASK( VMCS_FIELD(VMCS_HOST_CR3) ) \ + MASK( VMCS_FIELD(VMCS_HOST_CR4) ) \ + MASK( VMCS_FIELD(VMCS_HOST_FS_BASE) ) \ + MASK( VMCS_FIELD(VMCS_HOST_GS_BASE) ) \ + MASK( VMCS_FIELD(VMCS_HOST_TR_BASE) ) \ + MASK( VMCS_FIELD(VMCS_HOST_GDTR_BASE) ) \ + MASK( VMCS_FIELD(VMCS_HOST_IDTR_BASE) ) \ + MASK( VMCS_FIELD(VMCS_HOST_IA32_SYSENTER_ESP) ) \ + MASK( VMCS_FIELD(VMCS_HOST_IA32_SYSENTER_EIP) ) \ + MASK( VMCS_FIELD(VMCS_HOST_RSP) ) \ + MASK( VMCS_FIELD(VMCS_HOST_RIP) ) \ VMCS_FIELD(VMCS_MAX) static const uint32_t vmcs_field_list[] = { +#define MASK(x) x #define VMCS_FIELD(x) x, VMCS_FIELD_ENTRIES #undef VMCS_FIELD +#undef MASK }; static const char *vmcs_field_str[] = { +#define MASK(x) x #define VMCS_FIELD(x) #x, VMCS_FIELD_ENTRIES #undef VMCS_FIELD +#undef MASK +}; + +static const uint32_t vmcs_field_masked_list[] = { +#define 
MASK(x) +#define VMCS_FIELD(x) x, + VMCS_FIELD_ENTRIES +#undef VMCS_FIELD +#undef MASK }; #define NR_X86_REG_LIST (sizeof(x86_reg_list) / sizeof(uint32_t) - 1) diff --git a/lib/vmm.c b/lib/vmm.c index 3469b3be..d3b9ed1a 100644 --- a/lib/vmm.c +++ b/lib/vmm.c @@ -207,8 +207,8 @@ vmm_snapshot_vcpu(struct vcpu_snapshot *snapshot) vmm_read_register(x86_reg_list[i], &snapshot->vcpu_reg[i]); } /* snapshot vmcs */ - for (uint64_t i = 0; i < NR_VMCS_FIELD; i++) { - vmm_read_vmcs(vmcs_field_list[i], &snapshot->vmcs[i]); + for (uint64_t i = 0; i < NR_VMCS_FIELD_MASKED; i++) { + vmm_read_vmcs(vmcs_field_masked_list[i], &snapshot->vmcs[i]); } hv_vcpu_read_fpstate(vcpu->vcpuid, snapshot->fpu_states, sizeof snapshot->fpu_states); } @@ -236,55 +236,8 @@ void vmm_restore_vcpu(struct vcpu_snapshot *snapshot) { /* restore vmcs */ - static const uint32_t restore_mask[] = { - VMCS_VPID, - VMCS_HOST_ES, - VMCS_HOST_CS, - VMCS_HOST_SS, - VMCS_HOST_DS, - VMCS_HOST_FS, - VMCS_HOST_GS, - VMCS_HOST_TR, - VMCS_HOST_IA32_PAT, - VMCS_HOST_IA32_EFER, - VMCS_HOST_IA32_PERF_GLOBAL_CTRL, - VMCS_GUEST_PHYSICAL_ADDRESS, - VMCS_RO_INSTR_ERROR, - VMCS_RO_EXIT_REASON, - VMCS_RO_VMEXIT_IRQ_INFO, - VMCS_RO_VMEXIT_IRQ_ERROR, - VMCS_RO_IDT_VECTOR_INFO, - VMCS_RO_IDT_VECTOR_ERROR, - VMCS_RO_VMEXIT_INSTR_LEN, - VMCS_RO_VMX_INSTR_INFO, - VMCS_RO_EXIT_QUALIFIC, - VMCS_RO_IO_RCX, - VMCS_RO_IO_RSI, - VMCS_RO_IO_RDI, - VMCS_RO_IO_RIP, - VMCS_RO_GUEST_LIN_ADDR, - VMCS_HOST_CR0, - VMCS_HOST_CR3, - VMCS_HOST_CR4, - VMCS_HOST_FS_BASE, - VMCS_HOST_GS_BASE, - VMCS_HOST_TR_BASE, - VMCS_HOST_GDTR_BASE, - VMCS_HOST_IDTR_BASE, - VMCS_HOST_IA32_SYSENTER_ESP, - VMCS_HOST_IA32_SYSENTER_EIP, - VMCS_HOST_RSP, - VMCS_HOST_RIP, - }; - - for (uint64_t i = 0; i < NR_VMCS_FIELD; i++) { - for (uint64_t j = 0; j < sizeof restore_mask / sizeof restore_mask[0]; j++) { - if (restore_mask[j] == vmcs_field_list[i]) { - goto cont; - } - } - vmm_write_vmcs(vmcs_field_list[i], snapshot->vmcs[i]); -cont: ; + for (uint64_t i = 0; i < 
NR_VMCS_FIELD_MASKED; i++) { + vmm_write_vmcs(vmcs_field_masked_list[i], snapshot->vmcs[i]); } /* restore registers */ diff --git a/man/.gitignore b/man/.gitignore new file mode 100644 index 00000000..ec0237e9 --- /dev/null +++ b/man/.gitignore @@ -0,0 +1 @@ +noah.ps diff --git a/man/Makefile b/man/Makefile new file mode 100644 index 00000000..963cb264 --- /dev/null +++ b/man/Makefile @@ -0,0 +1,11 @@ +# Use the remark-man node-js package to generate the documentation off the +# GitHub markdown file. +# See https://github.com/remarkjs/remark-man. +noah.1: noah.md + node noah.js + +noah.ps: noah.1 + groff -tmandoc noah.1 > noah.ps + +clean: + rm -f noah.ps diff --git a/man/noah.1 b/man/noah.1 new file mode 100644 index 00000000..dc787c3e --- /dev/null +++ b/man/noah.1 @@ -0,0 +1,52 @@ +.TH "NOAH" "1" "May 2018" "" "" +.SH "NAME" +\fBnoah\fR - Linux ABI implementation (aka Execution Flavour) for OSX +.SH "SYNOPSIS" +.P +\fBnoah\fR \fB-h\fR | \[lB]\fI-o output_file\fR\[rB] \[lB]\fI-w warning_file\fR\[rB] \[lB]\fI-s strace_file\fR\[rB] \fB-m /virtual/filesystem/root\fR \fBprogram\fR \[lB]\fI...\fR\[rB] +.SH "DESCRIPTION" +.P +Noah implements Linux Application Binary Interface (ABI) for OSX through its Hypervisor Framework based on Intel(R) VT-x technology. +.P +For convenience, a wrapper script written in Perl is provided for launching, as well as debugging the tool (it has to be edited manually). It is that script that is normally executed from path, providing \fI~/.noah/tree\fR as the virtual root. +.P +On the first invocation the script uses noahstrap(1) to populate this virtual root with a default Ubuntu suite, see noahstrap-suites(1). +.SS "Options" +.P + \fI-h\fR, \fI--help\fR output a short help message. +.P + \fI-w file\fR, \fI--warning file\fR optional, specifies the warning capture file. +.P + \fI-o file\fR, \fI--output file\fR optional, specifies the output capture file. +.P + \fI-s file\fR, \fI--strace file\fR optional, specifies the strace capture file. 
+.P + \fI-m /virtual/filesystem/root\fR, \fI--mnt /virtual/filesystem/root\fR mandatory, specifies the virtual filesystem root where the target application, as well as the ELF interpreter and the rest of dynamic libraries reside. +.P + \fIprogram\fR the target program within the virtual filesystem root. +.SH "FILES" +.P + \fI~/.noah/tree\fR +.P +.RS 2 +.nf +Default virtual filesystem root. +.fi +.RE +.SH "REFERENCES:" +.RS 0 +.IP \(bu 4 +\fBxhyve\fR \fI\(lahttps://github.com/mist64/xhyve\(ra\fR +.IP \(bu 4 +\fBLinux Darling project\fR \fI\(lahttp://www.darlinghq.org/source-code/\(ra\fR +.IP \(bu 4 +\fBFreeBSD Linuxolator aka Linux ABI\fR \fI\(lahttps://www.freebsd.org/doc/handbook/linuxemu-advanced.html\(ra\fR +.IP \(bu 4 +\fBBash on Ubuntu on macOS\fR \fI\(lahttps://hagi.is.s.u-tokyo.ac.jp/~yuichi/papers/apsys2017.pdf\(ra\fR +.IP \(bu 4 +\fBNoah Hypervisor-Based Darwin Subsystem for Linux\fR \fI\(lahttp://events17.linuxfoundation.org/sites/events/files/slides/Noah%20Hypervisor-Based%20Darwin%20Subsystem%20for%20Linux-pdf.pdf\(ra\fR +.RE 0 + +.SH "SEE ALSO:" +.P +noahstrap(1), noahstrap-suites(1) diff --git a/man/noah.js b/man/noah.js new file mode 100644 index 00000000..dd93c020 --- /dev/null +++ b/man/noah.js @@ -0,0 +1,13 @@ +var vfile = require('to-vfile'); +var unified = require('unified'); +var markdown = require('remark-parse'); +var man = require('remark-man'); + +unified() + .use(markdown) + .use(man) + .process(vfile.readSync('noah.md'), function (err, file) { + if (err) throw err; + file.extname = '.1'; + vfile.writeSync(file); + }); diff --git a/man/noah.md b/man/noah.md new file mode 100644 index 00000000..9332fee2 --- /dev/null +++ b/man/noah.md @@ -0,0 +1,52 @@ +# noah(1) -- Linux ABI implementation (aka Execution Flavour) for OSX + +## SYNOPSIS + +`noah` `-h` \| \[_-o output_file_] \[_-w warning_file_] \[_-s strace_file_] `-m /virtual/filesystem/root` `program` \[_..._] + +## DESCRIPTION + +Noah implements Linux Application Binary Interface (ABI) for OSX 
through its +Hypervisor Framework based on Intel(R) VT-x technology. + +For convenience, a wrapper script written in Perl is provided for launching, +as well as debugging the tool (it has to be edited manually). It is that script +that is normally executed from path, providing _~/.noah/tree_ as the +virtual root. + +On the first invocation the script uses noahstrap(1) to populate this virtual +root with a default Ubuntu suite, see noahstrap-suites(1). + +#### Options + + _-h_, _--help_ output a short help message. + + _-w file_, _--warning file_ optional, specifies the warning capture file. + + _-o file_, _--output file_ optional, specifies the output capture file. + + _-s file_, _--strace file_ optional, specifies the strace capture file. + + _-m /virtual/filesystem/root_, _--mnt /virtual/filesystem/root_ mandatory, specifies the virtual filesystem root where the target + application, as well as the ELF interpreter and the rest of dynamic libraries + reside. + + _program_ the target program within the virtual filesystem root. + +## FILES + + _~/.noah/tree_ + + Default virtual filesystem root. 
+ +## REFERENCES: + +- [xhyve](https://github.com/mist64/xhyve) +- [Linux Darling project](http://www.darlinghq.org/source-code/) +- [FreeBSD Linuxolator aka Linux ABI](https://www.freebsd.org/doc/handbook/linuxemu-advanced.html) +- [Bash on Ubuntu on macOS](https://hagi.is.s.u-tokyo.ac.jp/~yuichi/papers/apsys2017.pdf) +- [Noah Hypervisor-Based Darwin Subsystem for Linux](http://events17.linuxfoundation.org/sites/events/files/slides/Noah%20Hypervisor-Based%20Darwin%20Subsystem%20for%20Linux-pdf.pdf) + +## SEE ALSO: + +noahstrap(1), noahstrap-suites(1) diff --git a/src/conv.c b/src/conv.c index d3e1754f..f059ffd0 100644 --- a/src/conv.c +++ b/src/conv.c @@ -620,8 +620,8 @@ darwin_to_linux_rlimit(int resource, struct rlimit *darwin_rlimit, struct l_rlim break; default: *linux_rlimit = (struct l_rlimit) { - .rlim_cur = darwin_rlimit->rlim_cur, - .rlim_max = darwin_rlimit->rlim_max + .rlim_cur = darwin_rlimit->rlim_cur == RLIM_INFINITY ? LINUX_RLIM_INFINITY : darwin_rlimit->rlim_cur, + .rlim_max = darwin_rlimit->rlim_max == RLIM_INFINITY ? 
LINUX_RLIM_INFINITY : darwin_rlimit->rlim_max }; } } diff --git a/src/fs/fs.c b/src/fs/fs.c index e4cdb3d8..3cae7abd 100644 --- a/src/fs/fs.c +++ b/src/fs/fs.c @@ -84,11 +84,52 @@ struct file_operations { }; static inline bool in_userfd(int fd); +static const int user_fdtable_initsize = 64; static const int vkern_fdtable_maxsize = 64; +static const int fdtable_alloc_unit = 64; // must be a multiple of 64 static inline void set_fdbit(struct fdtable *table, uint64_t *fdbits, int fd); static inline void clear_fdbit(struct fdtable *table, uint64_t *fdbits, int fd); +static inline int div_ceil(int x, int y) { return (x + y - 1) / y; } + +int +alloc_fdtable(struct fdtable *fdtable, int newsize) +{ + newsize = div_ceil(newsize, fdtable_alloc_unit) * fdtable_alloc_unit; + int oldsize = fdtable->size; + if (newsize <= oldsize) + return 0; + + int newunit = newsize / fdtable_alloc_unit; + int oldunit = oldsize / fdtable_alloc_unit; + fdtable->files = realloc(fdtable->files, sizeof(struct file *) * newunit); + if (fdtable->files == NULL) + return -LINUX_ENOMEM; + for (int i = oldunit; i < newunit; i++) { + fdtable->files[i] = calloc(fdtable_alloc_unit, sizeof(struct file)); + if (fdtable->files[i] == NULL) + return -LINUX_ENOMEM; + } + + int newfdslen = newsize / 8; + fdtable->open_fds = realloc(fdtable->open_fds, newfdslen); + if (fdtable->open_fds == NULL) + return -LINUX_ENOMEM; + fdtable->cloexec_fds = realloc(fdtable->cloexec_fds, newfdslen); + if (fdtable->cloexec_fds == NULL) + return -LINUX_ENOMEM; + + int offset = oldsize / 8; + int size = newfdslen - offset; + + bzero(fdtable->open_fds + oldunit, size); + bzero(fdtable->cloexec_fds + oldunit, size); + + fdtable->size = newsize; + return 0; +} + void init_fileinfo(int rootfd) { @@ -96,24 +137,11 @@ init_fileinfo(int rootfd) struct fileinfo *fileinfo = &proc.fileinfo; getrlimit(RLIMIT_NOFILE, &limit); - fileinfo->vkern_fdtable = (struct fdtable) { - .start = limit.rlim_cur - 64, - .size = 64, - .files = 
malloc(sizeof(struct file) * 64), - .open_fds = malloc(sizeof(uint64_t)), - .cloexec_fds = malloc(sizeof(uint64_t)) - }; - fileinfo->vkern_fdtable.open_fds[0] = 0; - fileinfo->vkern_fdtable.cloexec_fds[0] = 0; - fileinfo->fdtable = (struct fdtable) { - .start = 0, - .size = vkern_fdtable_maxsize, - .files = malloc(sizeof(struct file) * vkern_fdtable_maxsize), - .open_fds = malloc(sizeof(uint64_t)), - .cloexec_fds = malloc(sizeof(uint64_t)) - }; - fileinfo->fdtable.open_fds[0] = 0; - fileinfo->fdtable.cloexec_fds[0] = 0; + fileinfo->vkern_fdtable = (struct fdtable) { 0, 0, NULL, NULL, NULL }; + fileinfo->vkern_fdtable.start = limit.rlim_cur - vkern_fdtable_maxsize; + alloc_fdtable(&fileinfo->vkern_fdtable, vkern_fdtable_maxsize); + fileinfo->fdtable = (struct fdtable) { 0, 0, NULL, NULL, NULL }; + alloc_fdtable(&fileinfo->fdtable, user_fdtable_initsize); for (int i = 0; i < (int) limit.rlim_cur; i++) { if (i == rootfd) { @@ -239,6 +267,18 @@ darwinfs_ioctl(struct file *file, int cmd, uint64_t val0) linux_to_darwin_winsize(&ws, &lws); return syswrap(ioctl(fd, TIOCSWINSZ, &ws)); } + case LINUX_TCXONC: { + int sel; + switch(val0) { + case LINUX_TCOOFF: sel = TCOOFF; break; + case LINUX_TCOON: sel = TCOON; break; + case LINUX_TCIOFF: sel = TCIOFF; break; + case LINUX_TCION: sel = TCION; break; + default: + return -LINUX_EINVAL; + } + return syswrap(tcflow(fd, sel)); + } case LINUX_TCFLSH: { int sel; switch (val0) { @@ -250,6 +290,24 @@ darwinfs_ioctl(struct file *file, int cmd, uint64_t val0) } return syswrap(tcflush(fd, sel)); } + case LINUX_FIONREAD: { + int val; + int r = syswrap(ioctl(fd, FIONREAD, &val)); + if (r < 0) { + return r; + } + if (copy_to_user(val0, &val, sizeof val)) { + return -LINUX_EFAULT; + } + return r; + } + case LINUX_FIONBIO: { + int val; + if (copy_from_user(&val, val0, sizeof val)) { + return -LINUX_EFAULT; + } + return syswrap(ioctl(fd, FIONBIO, &val)); + } case LINUX_FIOCLEX: { pthread_rwlock_wrlock(&proc.fileinfo.fdtable_lock); int r = 
sys_fcntl(fd, LINUX_F_SETFD, 1); @@ -500,7 +558,8 @@ alloc_file(struct fdtable *table, int fd) darwinfs_fchmod, }; - struct file *file = table->files + (fd - table->start); + int offset = fd - table->start; + struct file *file = &table->files[offset / fdtable_alloc_unit][offset % fdtable_alloc_unit]; file->ops = &ops; file->fd = fd; } @@ -518,16 +577,9 @@ register_fd(int fd, bool is_cloexec) } struct fdtable *fdtable = &proc.fileinfo.fdtable; if (proc.fileinfo.fdtable.size <= fd) { - // Expand table - int new_size = roundup(fd, sizeof(uint64_t)); - size_t old_nunits = proc.fileinfo.fdtable.size / 64; - size_t new_nunits = new_size / 64; - fdtable->files = realloc(fdtable->files, new_size * sizeof(struct file)); - fdtable->open_fds = realloc(fdtable->open_fds, sizeof(uint64_t) * new_nunits); - fdtable->cloexec_fds = realloc(fdtable->cloexec_fds, sizeof(uint64_t) * new_nunits); - bzero(fdtable->open_fds + old_nunits, (new_nunits - old_nunits) * sizeof(uint64_t)); - bzero(fdtable->cloexec_fds + old_nunits, (new_nunits - old_nunits) * sizeof(uint64_t)); - fdtable->size = new_size; + int err = alloc_fdtable(fdtable, fd + 1); + if (err < 0) + return err; } set_fdbit(fdtable, fdtable->open_fds, fd); if (is_cloexec) { @@ -570,18 +622,25 @@ vkern_dup_fd(int fd, bool is_cloexec) } struct file * -get_file(int fd) +do_get_file(struct fdtable *table, int fd) { - if (fd < 0 || fd >= proc.fileinfo.fdtable.size) { + if (!test_fdbit(table, table->open_fds, fd)) { return NULL; } + int offset = fd - table->start; + return &table->files[offset / fdtable_alloc_unit][offset % fdtable_alloc_unit]; +} +struct file * +get_file(int fd) +{ struct file *ret = NULL; + struct fdtable *table = &proc.fileinfo.fdtable; pthread_rwlock_rdlock(&proc.fileinfo.fdtable_lock); - if (!test_fdbit(&proc.fileinfo.fdtable, proc.fileinfo.fdtable.open_fds, fd)) { + if (fd < 0 || fd >= table->size) { goto out; } - ret = &proc.fileinfo.fdtable.files[fd - proc.fileinfo.fdtable.start]; + ret = do_get_file(table, 
fd); out: pthread_rwlock_unlock(&proc.fileinfo.fdtable_lock); @@ -924,7 +983,11 @@ darwinfs_statfs(struct fs *fs, struct dir *dir, const char *path, struct l_statf if (dir->fd != AT_FDCWD) { path_to_statfs = full_path; char at_path[PATH_MAX]; - assert(fcntl(dir->fd, F_GETPATH, at_path) == 0); // fd must be a regular directory to which fcntl should succeed + // fd must be a regular directory to which fcntl should succeed + int r = fcntl(dir->fd, F_GETPATH, at_path); + if (r != 0) { + panic("fcntl failed"); + } if (snprintf(full_path, PATH_MAX, "%s/%s", at_path, path) >= PATH_MAX) { return -LINUX_ENAMETOOLONG; } @@ -1161,8 +1224,9 @@ do_close(struct fdtable *table, int fd) if (!test_fdbit(table, table->open_fds, fd)) { return -LINUX_EBADF; } - struct file *file = &table->files[fd - table->start]; - assert(file); + struct file *file = do_get_file(table, fd); + if (file == NULL) + return -LINUX_EBADF; int n = file->ops->close(file); clear_fdbit(table, table->open_fds, fd); clear_fdbit(table, table->cloexec_fds, fd); @@ -1384,10 +1448,10 @@ DEFINE_SYSCALL(renameat, int, oldfd, gstr_t, oldpath_ptr, int, newfd, gstr_t, ne struct path oldpath, newpath; int r; - if ((r = vfs_grab_dir(oldfd, oldname, 0, &oldpath)) < 0) { + if ((r = vfs_grab_dir(oldfd, oldname, LOOKUP_NOFOLLOW, &oldpath)) < 0) { goto out1; } - if ((r = vfs_grab_dir(newfd, newname, 0, &newpath)) < 0) { + if ((r = vfs_grab_dir(newfd, newname, LOOKUP_NOFOLLOW, &newpath)) < 0) { goto out2; } if (oldpath.fs != newpath.fs) { @@ -1414,10 +1478,18 @@ DEFINE_SYSCALL(unlinkat, int, dirfd, gstr_t, path_ptr, int, flags) struct path path; int r; - if ((r = vfs_grab_dir(dirfd, name, 0, &path)) < 0) { + if ((r = vfs_grab_dir(dirfd, name, LOOKUP_NOFOLLOW, &path)) < 0) { return r; } r = path.fs->ops->unlinkat(path.fs, path.dir, path.subpath, flags); + if (r == -LINUX_EPERM) { + struct l_newstat st; + int r2 = path.fs->ops->fstatat(path.fs, path.dir, path.subpath, &st, + LINUX_AT_SYMLINK_NOFOLLOW); + if (r2 == 0 && 
S_ISDIR(st.st_mode)) { + r = -LINUX_EISDIR; + } + } vfs_ungrab_dir(&path); return r; } @@ -1444,7 +1516,6 @@ DEFINE_SYSCALL(linkat, int, oldfd, gstr_t, oldpath_ptr, int, newfd, gstr_t, newp } int lkflag = flags & LINUX_AT_SYMLINK_FOLLOW ? 0 : LOOKUP_NOFOLLOW; - struct path oldpath, newpath; int r; if ((r = vfs_grab_dir(oldfd, oldname, lkflag, &oldpath)) < 0) { @@ -1605,6 +1676,34 @@ DEFINE_SYSCALL(umask, int, mask) return vfs_umask(mask); } +DEFINE_SYSCALL(mknodat, int, dirfd, gaddr_t, path_ptr, l_mode_t, mode, l_dev_t, dev) { + char name[LINUX_PATH_MAX]; + if (strncpy_from_user(name, path_ptr, sizeof name) < 0) + return -LINUX_EFAULT; + + struct path path; + int r = 0; + switch(mode & S_IFMT) { + case S_IFIFO: { + if ((r = vfs_grab_dir(dirfd, name, 0, &path)) < 0) { + goto out; + } + r = syswrap(mkfifo(path.subpath, mode)); + break; + } + default: + warnk("unsupported mknod mode: %d", mode); + return -LINUX_EINVAL; + } + out: + vfs_ungrab_dir(&path); + return r; +} + +DEFINE_SYSCALL(mknod, gaddr_t, path_ptr, l_mode_t, mode, l_dev_t, dev) { + return sys_mknodat(LINUX_AT_FDCWD, path_ptr, mode, dev); +} + /* TODO: functions below are not yet ported to the new vfs archtecture. 
*/ @@ -1795,10 +1894,6 @@ DEFINE_SYSCALL(pwrite64, unsigned int, fd, gstr_t, buf_ptr, size_t, count, off_t if (r < 0) { goto out; } - if (copy_to_user(buf_ptr, buf, r)) { - r = -LINUX_EFAULT; - goto out; - } out: free(buf); return r; diff --git a/src/ipc/futex.c b/src/ipc/futex.c index 8451c663..f2a89eae 100644 --- a/src/ipc/futex.c +++ b/src/ipc/futex.c @@ -117,6 +117,11 @@ do_private_futex(gaddr_t uaddr, int op, uint32_t val, gaddr_t timeout_ptr, gaddr return do_private_futex_wake(uaddr, val, false, 0); } case LINUX_FUTEX_WAIT: { + uint32_t uval; + if (copy_from_user(&uval, uaddr, sizeof uval)) + return -LINUX_EFAULT; + if (uval != val) + return -EWOULDBLOCK; struct timespec ts; if (timeout_ptr != 0) { struct l_timespec timeout; diff --git a/src/main.c b/src/main.c index 7856581f..1724f5c3 100644 --- a/src/main.c +++ b/src/main.c @@ -23,6 +23,25 @@ #include +static int +get_cpuid_count (unsigned int leaf, + unsigned int subleaf, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + __cpuid_count(leaf, subleaf, *eax, *ebx, *ecx, *edx); + return 1; +} + +static bool +is_avx(int instlen, uint64_t rip) +{ + uint8_t op; + if (copy_from_user(&op, rip, sizeof op)) + return false; + return op == 0xc4 || op == 0xc5; +} + static bool is_syscall(int instlen, uint64_t rip) { @@ -70,6 +89,114 @@ task_run() return vmm_run(); } +#define get_bit(integer, n) (int)((integer & ( 1 << n )) >> n) + +#define GET_VMCS(val, var) \ + var = 0;\ + vmm_read_vmcs(val, &var); + +#define GET_MSR(val, var) \ + var = 0;\ + vmm_read_msr(val, &var); + +static void check_vm_entry() +{ + uint64_t tmp; + uint64_t controls, pin_based, cpu_based1, cpu_based2; + uint64_t cr0, cr4; + uint8_t unrestricted_guest, load_debug_controls, ia_32e_mode_guest, + ia_32_perf_global_ctrl, ia_32_pat, ia_32_efer, ia_32_bndcfgs; + + + GET_VMCS(VMCS_CTRL_VMENTRY_CONTROLS, controls); + GET_VMCS(VMCS_CTRL_PIN_BASED, pin_based); + GET_VMCS(VMCS_CTRL_CPU_BASED, cpu_based1); + 
GET_VMCS(VMCS_CTRL_CPU_BASED2, cpu_based2); + + unrestricted_guest = get_bit(cpu_based2, 7); + load_debug_controls = get_bit(controls, 2); + ia_32e_mode_guest = get_bit(controls, 9); + ia_32_perf_global_ctrl = get_bit(controls, 13); + ia_32_pat = get_bit(controls, 14); + ia_32_efer = get_bit(controls, 15); + ia_32_bndcfgs = get_bit(controls, 16); + + GET_VMCS(VMCS_GUEST_CR0, cr0); + GET_VMCS(VMCS_GUEST_CR4, cr4); + + if (!unrestricted_guest) { + assert(!get_bit(cr0, 31) || get_bit(cr0, 0)); + } + + if (load_debug_controls) { + GET_VMCS(VMCS_GUEST_IA32_DEBUGCTL, tmp); + vmm_write_vmcs(VMCS_GUEST_IA32_DEBUGCTL, tmp & 0b1101111111000011); + GET_VMCS(VMCS_GUEST_IA32_DEBUGCTL, tmp); + assert(!get_bit(tmp, 2) + && !get_bit(tmp, 3) + && !get_bit(tmp, 4) + && !get_bit(tmp, 5) + && !get_bit(tmp, 13) + && tmp < 65535); + } + + if (ia_32e_mode_guest) { + assert(get_bit(cr0, 31) && get_bit(cr4, 5)); + } else { + assert(!get_bit(cr4, 17)); + } + + GET_VMCS(VMCS_GUEST_CR3, tmp); + assert(!tmp); // CR3 field must be such that bits 63:52 and + // bits in the range 51:32 beyond the + // processor's physical address width are 0 + + if (load_debug_controls) { + GET_VMCS(VMCS_GUEST_DR7, tmp); + assert(tmp < 0b100000000000000000000000000000000); + } + + warnk("Didn't check IA32_SYSENTER_ESP canonical\n"); + warnk("Didn't check IA32_SYSENTER_EIP canonical\n"); + + + if (ia_32_perf_global_ctrl) { + warnk("IA_32_PERF_GLOBAL_CTRL not tested\n"); + GET_VMCS(VMCS_GUEST_IA32_PERF_GLOBAL_CTRL, tmp); + assert(!tmp); // Too few bits not reserved + } + + if (ia_32_pat) { + warnk("IA_32_PAT not tested\n"); + GET_VMCS(VMCS_GUEST_IA32_PAT, tmp); + for (int i = 0; i < 8; ++i) { + char tmpbyte = tmp & 0xff; + assert(tmpbyte == 0 + || tmpbyte == 1 + || tmpbyte == 4 + || tmpbyte == 5 + || tmpbyte == 6 + || tmpbyte == 7); + tmp >>= 8; + + } + } + + if (ia_32_efer) { + GET_VMCS(VMCS_GUEST_IA32_EFER, tmp); + assert(!tmp); // Too few bits not reserved + assert(get_bit(tmp, 10) == ia_32e_mode_guest); + 
assert(!get_bit(cr0, 31) || (get_bit(tmp, 10) == get_bit(tmp, 8))); + } + + if (ia_32_bndcfgs) { + warnk("Didn't check IA32_BNDCFGS\n"); + } + + printk("EVERYTHING CLEAR SO FAR\n"); + +} + void main_loop(int return_on_sigret) { @@ -131,7 +258,18 @@ main_loop(int return_on_sigret) return; } continue; - } + } else if (is_avx(instlen, rip)) { + uint64_t xcr0; + vmm_read_register(HV_X86_XCR0, &xcr0); + if ((xcr0 & XCR0_AVX_STATE) == 0) { + unsigned int eax, ebx, ecx, edx; + get_cpuid_count(0x0d, 0x0, &eax, &ebx, &ecx, &edx); + if (eax & XCR0_AVX_STATE) { + vmm_write_register(HV_X86_XCR0, xcr0 | XCR0_AVX_STATE); + continue; + } + } + } /* FIXME */ warnk("invalid opcode! (rip = %p): ", (void *) rip); unsigned char inst[instlen]; @@ -239,7 +377,22 @@ main_loop(int return_on_sigret) } default: - printk("other reason: %llu\n", exit_reason); + // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual + // Volume 3B: System Programming Guide, Part 2 + // Order Number: 253669-033US + // December 2009 + // Section 21.9 VM-EXIT INFORMATION FIELDS + // 21.9.1 Basic VM-Exit information + // Exit reason + if (exit_reason & (1<<31)) { + exit_reason ^= (1<<31); + printk("VM-entry failure exit reason: %llx\n", exit_reason); + } else + printk("other exit reason: %llx\n", exit_reason); + if (exit_reason & VMX_REASON_VMENTRY_GUEST) + check_vm_entry(); + vmm_read_vmcs(VMCS_RO_EXIT_QUALIFIC, &qual); + printk("exit qualification: %llx\n", qual); } } @@ -310,6 +463,13 @@ init_regs() { /* set up cpu regs */ vmm_write_register(HV_X86_RFLAGS, 0x2); + unsigned int eax, ebx, ecx, edx; + get_cpuid_count(0x0d, 0x0, &eax, &ebx, &ecx, &edx); + if (eax & XCR0_SSE_STATE) { + uint64_t xcr0; + vmm_read_register(HV_X86_XCR0, &xcr0); + vmm_write_register(HV_X86_XCR0, xcr0 | XCR0_SSE_STATE); + } } void @@ -488,10 +648,11 @@ main(int argc, char *argv[], char **envp) { "strace", required_argument, NULL, 's'}, { "warning", required_argument, NULL, 'w'}, { "mnt", required_argument, NULL, 'm' }, + { 
"help", no_argument, NULL, 'h' }, { 0, 0, 0, 0 } }; - while ((c = getopt_long(argc, argv, "+o:w:s:m:", long_options, NULL)) != -1) { + while ((c = getopt_long(argc, argv, "+ho:w:s:m:", long_options, NULL)) != -1) { switch (c) { case 'o': strncpy(debug_paths[PRINTK_PATH], optarg, PATH_MAX); @@ -509,6 +670,10 @@ main(int argc, char *argv[], char **envp) } argv[optind - 1] = root; break; + case 'h': + default: + printf("Usage: noah -h | [-o output] [-w warning] [-s strace] -m /virtual/filesystem/root executable ...\n"); + exit(0); } } diff --git a/src/meta_strace.c b/src/meta_strace.c index b075fac8..000184d8 100644 --- a/src/meta_strace.c +++ b/src/meta_strace.c @@ -29,7 +29,7 @@ print_gstr(gstr_t str, int maxlen) { fprintf(strace_sink, "\""); for (int i = 0; i < maxlen; i++) { - char c = *((char*)guest_to_host(str) + i); + char c = *((char*)guest_to_host(str + i)); if (c == '\0') { break; } else if (c == '\n') { diff --git a/src/mm/mm.c b/src/mm/mm.c index 675dc9b6..5b21335e 100644 --- a/src/mm/mm.c +++ b/src/mm/mm.c @@ -282,7 +282,7 @@ DEFINE_SYSCALL(munlock, gaddr_t, addr, size_t, length) DEFINE_SYSCALL(brk, unsigned long, brk) { uint64_t ret; - brk = roundup(brk, PAGE_SIZE(PAGE_4KB)); + brk = roundup(brk, PAGE_SIZEOF(PAGE_4KB)); pthread_rwlock_wrlock(&proc.mm->alloc_lock); if (brk < proc.mm->start_brk) { @@ -326,7 +326,7 @@ DEFINE_SYSCALL(get_mempolicy, gaddr_t, policy, gaddr_t, nmask, unsigned long, ma DEFINE_SYSCALL(msync, gaddr_t, addr, size_t, len, int, flags) { struct mm_region *region = find_region(addr, proc.mm); - if (!region || addr - region->gaddr >= len || len + addr - region->gaddr >= region->size) { + if (!region || addr - region->gaddr >= len || len + addr - region->gaddr > region->size) { return -LINUX_ENOMEM; } diff --git a/src/mm/mmap.c b/src/mm/mmap.c index ca5b54c7..d2883d0f 100644 --- a/src/mm/mmap.c +++ b/src/mm/mmap.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -28,7 +29,7 @@ init_mmap(struct mm *mm) gaddr_t 
alloc_region(size_t len) { - len = roundup(len, PAGE_SIZE(PAGE_4KB)); + len = roundup(len, PAGE_SIZEOF(PAGE_4KB)); proc.mm->current_mmap_top += len; return proc.mm->current_mmap_top - len; } @@ -39,7 +40,7 @@ do_munmap(gaddr_t gaddr, size_t size) if (!is_page_aligned((void*)gaddr, PAGE_4KB)) { return -LINUX_EINVAL; } - size = roundup(size, PAGE_SIZE(PAGE_4KB)); // Linux kernel also does this + size = roundup(size, PAGE_SIZEOF(PAGE_4KB)); // Linux kernel also does this struct mm_region *overlapping = find_region_range(gaddr, size, proc.mm); if (overlapping == NULL) { @@ -76,6 +77,7 @@ linux_to_darwin_mflags(int l_flags) if (l_flags & LINUX_MAP_SHARED) d_flags |= MAP_SHARED; if (l_flags & LINUX_MAP_PRIVATE) d_flags |= MAP_PRIVATE; if (l_flags & LINUX_MAP_ANON) d_flags |= MAP_ANON; + if (l_flags & LINUX_MAP_HUGETLB) d_flags |= VM_FLAGS_SUPERPAGE_SIZE_ANY; return d_flags; } @@ -94,9 +96,9 @@ do_mmap(gaddr_t addr, size_t len, int d_prot, int l_prot, int l_flags, int fd, o /* the linux kernel does nothing for LINUX_MAP_STACK */ l_flags &= ~LINUX_MAP_STACK; - len = roundup(len, PAGE_SIZE(PAGE_4KB)); + len = roundup(len, PAGE_SIZEOF(PAGE_4KB)); - if ((l_flags & ~(LINUX_MAP_SHARED | LINUX_MAP_PRIVATE | LINUX_MAP_FIXED | LINUX_MAP_ANON)) != 0) { + if ((l_flags & ~(LINUX_MAP_SHARED | LINUX_MAP_PRIVATE | LINUX_MAP_FIXED | LINUX_MAP_ANON | LINUX_MAP_HUGETLB)) != 0) { warnk("unsupported mmap l_flags: 0x%x\n", l_flags); exit(1); } @@ -110,7 +112,7 @@ do_mmap(gaddr_t addr, size_t len, int d_prot, int l_prot, int l_flags, int fd, o void *ptr = mmap(0, len, d_prot, linux_to_darwin_mflags(l_flags), fd, offset); if (ptr == MAP_FAILED) { - panic("mmap failed. 
addr :0x%llx, len: 0x%lux, prot: %d, l_flags: %d, fd: %d, offset: 0x%llx\n", addr, len, l_prot, l_flags, fd, offset); + return -darwin_to_linux_errno(errno); } do_munmap(addr, len); @@ -148,8 +150,8 @@ DEFINE_SYSCALL(mremap, gaddr_t, old_addr, size_t, old_size, size_t, new_size, in return -LINUX_EINVAL; /* Linux kernel also does these aligning */ - old_size = roundup(old_size, PAGE_SIZE(PAGE_4KB)); - new_size = roundup(new_size, PAGE_SIZE(PAGE_4KB)); + old_size = roundup(old_size, PAGE_SIZEOF(PAGE_4KB)); + new_size = roundup(new_size, PAGE_SIZEOF(PAGE_4KB)); gaddr_t ret = old_addr; @@ -221,7 +223,7 @@ DEFINE_SYSCALL(mprotect, gaddr_t, addr, size_t, len, int, prot) } // TODO check if user is permiited to access the addr - len = roundup(len, PAGE_SIZE(PAGE_4KB)); + len = roundup(len, PAGE_SIZEOF(PAGE_4KB)); gaddr_t end = addr + len; hv_memory_flags_t hvprot = 0; diff --git a/src/mm/shm.c b/src/mm/shm.c index e0eff37b..345d57e9 100644 --- a/src/mm/shm.c +++ b/src/mm/shm.c @@ -48,10 +48,10 @@ DEFINE_SYSCALL(shmat, int, shmid, gaddr_t, addr, int, shmflg) pthread_rwlock_wrlock(&proc.mm->alloc_lock); addr = alloc_region(len); do_munmap(addr, len); - record_region(proc.mm, ptr, addr, len, LINUX_PROT_READ | LINUX_PROT_WRITE | LINUX_PROT_EXEC, LINUX_MAP_PRIVATE | LINUX_MAP_FIXED, -1, 0); - vmm_mmap(addr, len, HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC, ptr); + record_region(proc.mm, ptr, addr, len, LINUX_PROT_READ | LINUX_PROT_WRITE, LINUX_MAP_PRIVATE | LINUX_MAP_FIXED, -1, 0); + vmm_mmap(addr, len, HV_MEMORY_READ | HV_MEMORY_WRITE, ptr); pthread_rwlock_unlock(&proc.mm->alloc_lock); - return (uint64_t) ptr; + return (uint64_t) addr; } DEFINE_SYSCALL(shmctl, int, shmid, int, cmd, gaddr_t, buf_ptr) diff --git a/src/net/net.c b/src/net/net.c index ea96ee0e..11d12ab3 100644 --- a/src/net/net.c +++ b/src/net/net.c @@ -246,21 +246,36 @@ DEFINE_SYSCALL(shutdown, int, socket, int, how) return syswrap(shutdown(socket, how)); } -DEFINE_SYSCALL(sendto, int, socket, gaddr_t, 
buf_ptr, int, length, int, flags, gaddr_t, dest_addr, socklen_t, dest_len) +DEFINE_SYSCALL(sendto, int, socket, gaddr_t, buf_ptr, int, length, int, flags, gaddr_t, addr_ptr, socklen_t, addrlen) { - warnk("sendto: dest_addr is not used! (dest_addr = 0x%llx, dest_len = %d)\n", dest_addr, dest_len); - int r; + int ret; + struct sockaddr *sockaddr = NULL; + struct l_sockaddr l_sockaddr; + + if (addr_ptr != 0) { + if (copy_from_user(&l_sockaddr, addr_ptr, addrlen)) + return -LINUX_EFAULT; + if (linux_to_darwin_sockaddr(&sockaddr, &l_sockaddr, addrlen) < 0) + return -LINUX_EINVAL; + } char *buf = malloc(length); - - if (copy_from_user(buf, buf_ptr, length)) { - r = -LINUX_EFAULT; + if (buf == NULL) { + ret = -LINUX_ENOMEM; + goto err; + } + ret = copy_from_user(buf, buf_ptr, length); + if (ret < 0) { + ret = -LINUX_EFAULT; goto out; } - r = syswrap(sendto(socket, buf, length, flags, NULL, 0)); - -out: + ret = syswrap(sendto(socket, buf, length, flags, sockaddr, addrlen)); + + out: free(buf); - return r; + err: + if (sockaddr) + free(sockaddr); + return ret; } int @@ -301,7 +316,7 @@ linux_to_darwin_msg_flags(l_int flags) } if (flags) { - warnk("unsupported msg_flags: 0x%x", flags); + warnk("unsupported msg_flags: 0x%x\n", flags); return -LINUX_EOPNOTSUPP; } @@ -324,7 +339,7 @@ DEFINE_SYSCALL(recvfrom, int, socket, gaddr_t, buf_ptr, int, length, int, flags, int ret = syswrap(recvfrom(socket, buf, length, flags, sock_ptr, socklen_ptr)); if (ret < 0) return ret; - if (copy_to_user(buf_ptr, buf, length)) + if (copy_to_user(buf_ptr, buf, ret)) return -LINUX_EFAULT; if (addr_ptr != 0) { char addr[sock_ptr->sa_len]; @@ -363,18 +378,34 @@ do_sendmsg(int sockfd, const struct l_msghdr *msg, int flags) if (copy_from_user(hdr.msg_iov[i].iov_base, liov[i].iov_base, hdr.msg_iov[i].iov_len)) return -LINUX_EFAULT; } - hdr.msg_flags = linux_to_darwin_msg_flags(msg->msg_flags); - if (hdr.msg_flags < 0) { - warnk("do_sendmsg: unsupported flags\n"); - return hdr.msg_flags; - } + if 
(LINUX_CMSG_FIRSTHDR(msg) != 0) { warnk("we do not support ancillary data yet\n"); return -LINUX_EINVAL; } hdr.msg_control = NULL; hdr.msg_controllen = 0; - return syswrap(sendmsg(sockfd, &hdr, flags)); + + /* + On Mac OS X MSG_NOSIGNAL is not supported, so we need to set SO_NOSIGPIPE + option on the socket. + See https://lists.apple.com/archives/macnetworkprog/2002/Dec/msg00091.html. + */ + int msg_flags = linux_to_darwin_msg_flags(flags & ~LINUX_MSG_NOSIGNAL); + if (msg_flags < 0) { + warnk("do_sendmsg: unsupported flags\n"); + return hdr.msg_flags; + } + if (flags & LINUX_MSG_NOSIGNAL) { + int val = 1; + int r = syswrap(setsockopt(sockfd, SOL_SOCKET, SO_NOSIGPIPE, + (void*)&val, sizeof(val))); + if (r < 0) { + panic("Noah cannot set SO_NOSIGPIPE option."); + } + } + + return syswrap(sendmsg(sockfd, &hdr, msg_flags)); } DEFINE_SYSCALL(sendmsg, int, sockfd, gaddr_t, msg_ptr, int, flags) @@ -605,10 +636,32 @@ DEFINE_SYSCALL(getpeername, int, sockfd, gaddr_t, addr_ptr, gaddr_t, addrlen_ptr DEFINE_SYSCALL(socketpair, int, family, int, type, int, protocol, gaddr_t, usockvec_ptr) { int fds[2]; - int r = syswrap(socketpair(linux_to_darwin_sa_family(family), type, protocol, fds)); - if (r < 0) - return r; - if (copy_to_user(usockvec_ptr, fds, sizeof fds)) - return -LINUX_EFAULT; - return r; + pthread_rwlock_wrlock(&proc.fileinfo.fdtable_lock); + int ret = syswrap(socketpair(linux_to_darwin_sa_family(family), type, protocol, fds)); + if (ret < 0) + goto err; + int e = register_fd(fds[0], type & LINUX_SOCK_CLOEXEC); + if (e < 0) { + close(fds[0]); + close(fds[1]); + ret = e; + goto err; + } + e = register_fd(fds[1], type & LINUX_SOCK_CLOEXEC); + if (e < 0) { + user_close(fds[0]); + close(fds[1]); + ret = e; + goto err; + } + if (copy_to_user(usockvec_ptr, fds, sizeof fds)) { + user_close(fds[0]); + user_close(fds[1]); + ret = -LINUX_EFAULT; + goto err; + } + +err: + pthread_rwlock_unlock(&proc.fileinfo.fdtable_lock); + return ret; } diff --git a/src/proc/exec.c 
b/src/proc/exec.c index 015ef6e0..d4944a18 100644 --- a/src/proc/exec.c +++ b/src/proc/exec.c @@ -66,10 +66,10 @@ load_elf_interp(const char *path, ulong load_addr) ulong p_vaddr = p[i].p_vaddr + load_addr; - ulong mask = PAGE_SIZE(PAGE_4KB) - 1; + ulong mask = PAGE_SIZEOF(PAGE_4KB) - 1; ulong vaddr = p_vaddr & ~mask; ulong offset = p_vaddr & mask; - ulong size = roundup(p[i].p_memsz + offset, PAGE_SIZE(PAGE_4KB)); + ulong size = roundup(p[i].p_memsz + offset, PAGE_SIZEOF(PAGE_4KB)); int prot = 0; if (p[i].p_flags & PF_X) prot |= LINUX_PROT_EXEC; @@ -81,7 +81,7 @@ load_elf_interp(const char *path, ulong load_addr) copy_to_user(vaddr + offset, data + p[i].p_offset, p[i].p_filesz); - map_top = MAX(map_top, roundup(vaddr + size, PAGE_SIZE(PAGE_4KB))); + map_top = MAX(map_top, roundup(vaddr + size, PAGE_SIZEOF(PAGE_4KB))); } vmm_write_vmcs(VMCS_GUEST_RIP, load_addr + h->e_entry); @@ -128,10 +128,10 @@ load_elf(Elf64_Ehdr *ehdr, int argc, char *argv[], char **envp) ulong p_vaddr = p[i].p_vaddr + global_offset; - ulong mask = PAGE_SIZE(PAGE_4KB) - 1; + ulong mask = PAGE_SIZEOF(PAGE_4KB) - 1; ulong vaddr = p_vaddr & ~mask; ulong offset = p_vaddr & mask; - ulong size = roundup(p[i].p_memsz + offset, PAGE_SIZE(PAGE_4KB)); + ulong size = roundup(p[i].p_memsz + offset, PAGE_SIZEOF(PAGE_4KB)); int prot = 0; if (p[i].p_flags & PF_X) prot |= LINUX_PROT_EXEC; @@ -147,7 +147,7 @@ load_elf(Elf64_Ehdr *ehdr, int argc, char *argv[], char **envp) load_base = p[i].p_vaddr - p[i].p_offset + global_offset; load_base_set = true; } - map_top = MAX(map_top, roundup(vaddr + size, PAGE_SIZE(PAGE_4KB))); + map_top = MAX(map_top, roundup(vaddr + size, PAGE_SIZEOF(PAGE_4KB))); } assert(load_base_set); @@ -301,7 +301,7 @@ init_userstack(int argc, char *argv[], char **envp, uint64_t exe_base, const Elf { AT_PHDR, exe_base + ehdr->e_phoff }, { AT_PHENT, ehdr->e_phentsize }, { AT_PHNUM, ehdr->e_phnum }, - { AT_PAGESZ, PAGE_SIZE(PAGE_4KB) }, + { AT_PAGESZ, PAGE_SIZEOF(PAGE_4KB) }, { AT_RANDOM, 
rand_ptr }, { AT_NULL, 0 }, }; @@ -400,10 +400,14 @@ do_exec(const char *elf_path, int argc, char *argv[], char **envp) return -LINUX_EINVAL; } - prepare_newproc(); - /* Now do exec */ fstat(fd, &st); + if (!S_ISREG(st.st_mode)) { + vkern_close(fd); + return -LINUX_EACCES; + } + + prepare_newproc(); data = mmap(0, st.st_size, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0); diff --git a/src/proc/process.c b/src/proc/process.c index cc12c131..40e50bbf 100644 --- a/src/proc/process.c +++ b/src/proc/process.c @@ -294,11 +294,7 @@ DEFINE_SYSCALL(gettid) return do_gettid(); } -DEFINE_SYSCALL(getrlimit, int, l_resource, gaddr_t, rl_ptr) -{ - struct rlimit rl; - struct l_rlimit l_rl; - +int linux_to_darwin_rlimopts(int l_resource) { int resource = 0; switch (l_resource) { case LINUX_RLIMIT_CPU: resource = RLIMIT_CPU; break; @@ -312,6 +308,19 @@ DEFINE_SYSCALL(getrlimit, int, l_resource, gaddr_t, rl_ptr) case LINUX_RLIMIT_MEMLOCK: resource = RLIMIT_MEMLOCK; break; case LINUX_RLIMIT_AS: resource = RLIMIT_AS; break; } + return resource; +} + +DEFINE_SYSCALL(getrlimit, int, l_resource, gaddr_t, rl_ptr) +{ + struct rlimit rl; + struct l_rlimit l_rl; + + if (l_resource >= LINUX_RLIM_NLIMITS) { + return -LINUX_EINVAL; + } + + int resource = linux_to_darwin_rlimopts(l_resource); int r = syswrap(getrlimit(resource, &rl)); if (r < 0) @@ -404,11 +413,23 @@ DEFINE_SYSCALL(uname, gaddr_t, buf_ptr) return 0; } -DEFINE_SYSCALL(prctl, int, option) +DEFINE_SYSCALL(prctl, int, option, unsigned long, arg1, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { - /* FIXME */ - printk("prctl is not implemented yet\n"); - return -ENOSYS; + switch (option) { + case LINUX_PR_SET_NAME: { + char buf[16]; + if (copy_from_user(buf, (gaddr_t)arg1, sizeof(buf))) { + return -LINUX_EFAULT; + } + // trancate if the legnth of arg1 exceeds 16byte. 
+ buf[15] = '\0'; + pthread_setname_np(buf); + return 0; + } + default: + warnk("unkown prctl cmd: %d\n", option); + return -LINUX_EINVAL; + } } DEFINE_SYSCALL(arch_prctl, int, code, gaddr_t, addr) diff --git a/src/sys/sys.c b/src/sys/sys.c index a338a117..b79f09a4 100644 --- a/src/sys/sys.c +++ b/src/sys/sys.c @@ -33,19 +33,36 @@ DEFINE_SYSCALL(sysinfo, gaddr_t, info_ptr) int64_t memsize; len = sizeof memsize; - if (sysctlbyname("hw.memsize", &memsize, &len, NULL, 0) < 0) exit(1); + if (sysctlbyname("hw.memsize", &memsize, &len, NULL, 0) < 0){ + perror("sysinfo:"); + exit(1); + } info.totalram = memsize; int64_t freepages; len = sizeof freepages; - if (sysctlbyname("vm.page_free_count", &freepages, &len, NULL, 0) < 0) exit(1); + if (sysctlbyname("vm.page_free_count", &freepages, &len, NULL, 0) < 0){ + perror("sysinfo:"); + exit(1); + } info.freeram = freepages * 0x1000; - uint64_t swapinfo[3]; + /* + * sysctlbyname() changed in macos 15. Any older os will leave swapinfo[4] as 0. + */ + + uint64_t swapinfo[4] = {0}; len = sizeof swapinfo; - if (sysctlbyname("vm.swapusage", &swapinfo, &len, NULL, 0) < 0) exit(1); + if (sysctlbyname("vm.swapusage", &swapinfo, &len, NULL, 0) < 0){ + perror("sysinfo:"); + exit(1); + } info.totalswap = swapinfo[0]; - info.freeswap = swapinfo[2]; + + if(swapinfo[3] == 0) + info.freeswap = swapinfo[2]; + else + info.freeswap = swapinfo[1]; /* TODO */ info.sharedram = 0; diff --git a/src/sys/time.c b/src/sys/time.c index 3de79dee..96b31857 100644 --- a/src/sys/time.c +++ b/src/sys/time.c @@ -14,13 +14,42 @@ #include #include -#include #include #include #include #include #include +#if defined(MACOS_PRE_16) +#include +#define TIMER_ABSTIME -1 +#define CLOCK_REALTIME CALENDAR_CLOCK +#define CLOCK_MONOTONIC SYSTEM_CLOCK +#define CLOCK_PROCESS_CPUTIME_ID 2 +#define CLOCK_THREAD_CPUTIME_ID 3 + +typedef int clockid_t; + +int clock_gettime(clockid_t clk_id, struct timespec *t){ + mach_timebase_info_data_t timebase; + mach_timebase_info(&timebase); 
+ uint64_t time; + time = mach_absolute_time(); + double nseconds = ((double)time * (double)timebase.numer)/((double)timebase.denom); + double seconds = ((double)time * (double)timebase.numer)/((double)timebase.denom * 1e9); + t->tv_sec = seconds; + t->tv_nsec = nseconds; + return 0; +} + +int clock_getres(clockid_t clk_id, struct timespec *t){ + return 0; +} +#else +#include +#endif + + DEFINE_SYSCALL(time, gaddr_t, tloc_ptr) { time_t t; @@ -225,3 +254,60 @@ DEFINE_SYSCALL(fdatasync, int, fildes) { return syswrap(fsync(fildes)); } + +void +darwin_to_linux_itimerval(const struct itimerval* d_val, struct l_itimerval* l_val) { + l_val->it_interval.tv_sec = d_val->it_interval.tv_sec; + l_val->it_interval.tv_usec = d_val->it_interval.tv_usec; + l_val->it_value.tv_sec = d_val->it_value.tv_sec; + l_val->it_value.tv_usec = d_val->it_value.tv_usec; +} + +void +linux_to_darwin_itimerval(const struct l_itimerval* l_val, struct itimerval* d_val) { + d_val->it_interval.tv_sec = l_val->it_interval.tv_sec; + d_val->it_interval.tv_usec = l_val->it_interval.tv_usec; + d_val->it_value.tv_sec = l_val->it_value.tv_sec; + d_val->it_value.tv_usec = l_val->it_value.tv_usec; +} + +DEFINE_SYSCALL(getitimer, int, which, gaddr_t, ret_ptr) { + struct itimerval value; + // Darwin's which is compatible with that of Linux + int r = syswrap(getitimer(which, &value)); + if (r < 0) { + return r; + } + struct l_itimerval l_itimerval; + darwin_to_linux_itimerval(&value, &l_itimerval); + if (copy_to_user(ret_ptr, &l_itimerval, sizeof l_itimerval)) { + return -LINUX_EFAULT; + } + return r; +} + + +DEFINE_SYSCALL(setitimer, int, which, gaddr_t, new_ptr, gaddr_t, old_ptr) { + struct l_itimerval l_newvalue, l_oldvalue; + struct itimerval newvalue, oldvalue; + if (new_ptr == 0) { + return sys_getitimer(which, old_ptr); + } + if (copy_from_user(&l_newvalue, new_ptr, sizeof l_newvalue)) { + return -LINUX_EFAULT; + } + linux_to_darwin_itimerval(&l_newvalue, &newvalue); + // Darwin's which is compatible with 
that of Linux + int r = syswrap(setitimer(which, &newvalue, &oldvalue)); + if (r < 0) { + return r; + } + if (old_ptr != 0) { + darwin_to_linux_itimerval(&oldvalue, &l_oldvalue); + if (copy_to_user(old_ptr, &l_oldvalue, sizeof l_oldvalue)) { + return -LINUX_EFAULT; + } + } + return r; +} + diff --git a/util/release-engineering.sh b/util/release-engineering.sh index 266f6c12..a6b299fe 100755 --- a/util/release-engineering.sh +++ b/util/release-engineering.sh @@ -47,8 +47,7 @@ else fi sed -i "" -e "4 s@.*@ url \"$URL\"@g" noah.rb -sed -i "" -e "5 s@.*@ version \"$VERSION\"@g" noah.rb -sed -i "" -e "6 s@.*@ sha256 \"$SHA256\"@g" noah.rb +sed -i "" -e "5 s@.*@ sha256 \"$SHA256\"@g" noah.rb git add noah.rb git commit -m "version $VERSION"