// Copyright 2022 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include "atomic.h" #include "sysmsg.h" // __export_deep_sleep_timeout is the timeout after which the stub thread stops // polling and fall asleep. uint64_t __export_deep_sleep_timeout; // LINT.IfChange #define MAX_GUEST_CONTEXTS (4095) #define MAX_CONTEXT_QUEUE_ENTRIES (MAX_GUEST_CONTEXTS + 1) #define INVALID_CONTEXT_ID 0xfefefefe #define INVALID_THREAD_ID 0xfefefefe // Each element of a context_queue ring buffer is a sum of its index shifted by // CQ_INDEX_SHIFT and context_id. #define CQ_INDEX_SHIFT 32 #define CQ_CONTEXT_MASK ((1UL << CQ_INDEX_SHIFT) - 1) // See systrap/context_queue.go struct context_queue { uint32_t start; uint32_t end; uint32_t num_active_threads; uint32_t num_spinning_threads; uint32_t num_threads_to_wakeup; uint32_t num_active_contexts; uint32_t num_awake_contexts; uint32_t fast_path_disabled; uint32_t used_fast_path; uint64_t ringbuffer[MAX_CONTEXT_QUEUE_ENTRIES]; }; struct context_queue *__export_context_queue_addr; // LINT.ThenChange(../context_queue.go) uint32_t is_empty(struct context_queue *queue) { return atomic_load(&queue->start) == atomic_load(&queue->end); } int32_t queued_contexts(struct context_queue *queue) { return (atomic_load(&queue->end) + MAX_CONTEXT_QUEUE_ENTRIES - atomic_load(&queue->start)) % MAX_CONTEXT_QUEUE_ENTRIES; } #if defined(__x86_64__) static __inline__ unsigned long rdtsc(void) { unsigned h, l; __asm__ __volatile__("rdtsc" : "=a"(l), "=d"(h)); return ((unsigned long)l) | (((unsigned long)h) << 32); } static __inline__ void spinloop(void) { asm("pause"); } #elif defined(__aarch64__) static __inline__ unsigned long rdtsc(void) { long val; asm volatile("mrs %0, cntvct_el0" : "=r"(val)); return val; } static __inline__ void spinloop(void) { asm volatile("yield" : : : "memory"); } #endif void *__export_context_region; static struct thread_context *thread_context_addr(uint32_t tcid) { return (struct thread_context *)(__export_context_region + tcid * ALLOCATED_SIZEOF_THREAD_CONTEXT_STRUCT); } void memcpy(uint8_t *dest, uint8_t *src, size_t n) { for (size_t i = 0; i < n; i += 1) { dest[i] = src[i]; } } // The spinning queue is a queue of spinning threads. It solves the // fragmentation problem. The idea is to minimize the number of threads // processing requests. We can't control how system threads are scheduled, so // can't distribute requests efficiently. The spinning queue emulates virtual // threads sorted by their spinning time. // // This queue is lock-less to be sure that any thread scheduled out // from CPU doesn't block others. // // The size of the queue must be a divisor of 2^32, because queue indexes are // calculated as modules of uint32 values. #define SPINNING_QUEUE_SIZE 256 // MAX_RE_ENQUEUE defines the amount of time a given entry in the spinning queue // needs to reach timeout in order to be removed. Re-enqueuing a timeout is done // in order to mitigate rdtsc inaccuracies. #define MAX_RE_ENQUEUE 2 struct spinning_queue { uint32_t len; uint32_t start; uint32_t end; uint64_t start_times[SPINNING_QUEUE_SIZE]; uint8_t num_times_re_enqueued[SPINNING_QUEUE_SIZE]; }; struct spinning_queue *__export_spinning_queue_addr; // spinning_queue_push adds a new thread to the queue. It returns false if the // queue is full, or if re_enqueue_times has reached MAX_RE_ENQUEUE. static bool spinning_queue_push(uint8_t re_enqueue_times) __attribute__((warn_unused_result)); static bool spinning_queue_push(uint8_t re_enqueue_times) { struct spinning_queue *queue = __export_spinning_queue_addr; uint32_t idx, end, len; BUILD_BUG_ON(sizeof(struct spinning_queue) > SPINNING_QUEUE_MEM_SIZE); if (re_enqueue_times >= MAX_RE_ENQUEUE) { return false; } len = atomic_add(&queue->len, 1); if (len > SPINNING_QUEUE_SIZE) { atomic_sub(&queue->len, 1); return false; } end = atomic_add(&queue->end, 1); idx = end - 1; atomic_store(&queue->num_times_re_enqueued[idx % SPINNING_QUEUE_SIZE], re_enqueue_times); atomic_store(&queue->start_times[idx % SPINNING_QUEUE_SIZE], rdtsc()); return true; } // spinning_queue_pop() removes one thread from a queue that has been spinning // the shortest time. // However it doesn't take into account the spinning re-enqueue. static void spinning_queue_pop() { struct spinning_queue *queue = __export_spinning_queue_addr; atomic_sub(&queue->end, 1); atomic_sub(&queue->len, 1); } // spinning_queue_remove_first removes one thread from a queue that has been // spinning longer than others and longer than a specified timeout. // // If `timeout` is zero, it always removes one element and never returns false. // // Returns true if one thread has been removed from the queue. static bool spinning_queue_remove_first(uint64_t timeout) __attribute__((warn_unused_result)); static bool spinning_queue_remove_first(uint64_t timeout) { struct spinning_queue *queue = __export_spinning_queue_addr; uint64_t ts; uint8_t re_enqueue = 0; while (1) { uint32_t idx, qidx; idx = atomic_load(&queue->start); qidx = idx % SPINNING_QUEUE_SIZE; ts = atomic_load(&queue->start_times[qidx]); if (ts == 0) continue; if (rdtsc() - ts < timeout) return false; if (idx != atomic_load(&queue->start)) continue; // Lose the race. re_enqueue = atomic_load(&queue->num_times_re_enqueued[qidx]); if (atomic_compare_exchange(&queue->start_times[qidx], &ts, 0)) { atomic_add(&queue->start, 1); break; } } atomic_sub(&queue->len, 1); if (timeout == 0) return true; return !spinning_queue_push(re_enqueue + 1); } struct thread_context *queue_get_context(struct sysmsg *sysmsg) { struct context_queue *queue = __export_context_queue_addr; // Indexes should not jump when start or end are overflowed. BUILD_BUG_ON(UINT32_MAX % MAX_CONTEXT_QUEUE_ENTRIES != MAX_CONTEXT_QUEUE_ENTRIES - 1); while (!is_empty(queue)) { uint64_t idx = atomic_load(&queue->start); uint32_t next = idx % MAX_CONTEXT_QUEUE_ENTRIES; uint64_t v = atomic_load(&queue->ringbuffer[next]); // We need to check the index to be sure that a ring buffer hasn't been // recycled. if ((v >> CQ_INDEX_SHIFT) != idx) continue; if (!atomic_compare_exchange(&queue->ringbuffer[next], &v, INVALID_CONTEXT_ID)) { continue; } uint32_t context_id = v & CQ_CONTEXT_MASK; if (context_id == INVALID_CONTEXT_ID) continue; atomic_add(&queue->start, 1); if (context_id > MAX_GUEST_CONTEXTS) { panic(STUB_ERROR_BAD_CONTEXT_ID, context_id); } struct thread_context *ctx = thread_context_addr(context_id); sysmsg->context = ctx; atomic_store(&ctx->acked_time, rdtsc()); atomic_store(&ctx->thread_id, sysmsg->thread_id); return ctx; } return NULL; } // get_context_fast sets nr_active_threads_p only if it deactivates the thread. static struct thread_context *get_context_fast(struct sysmsg *sysmsg, struct context_queue *queue, uint32_t *nr_active_threads_p) { uint32_t nr_active_threads, nr_awake_contexts; if (!spinning_queue_push(0)) return NULL; atomic_store(&queue->used_fast_path, 1); while (1) { struct thread_context *ctx; ctx = queue_get_context(sysmsg); if (ctx) { spinning_queue_pop(); return ctx; } if (atomic_load(&queue->fast_path_disabled) != 0) { if (!spinning_queue_remove_first(0)) panic(STUB_ERROR_SPINNING_QUEUE_DECREF, 0); break; } nr_active_threads = atomic_load(&queue->num_active_threads); nr_awake_contexts = atomic_load(&queue->num_awake_contexts); if (nr_awake_contexts < nr_active_threads) { if (atomic_compare_exchange(&queue->num_active_threads, &nr_active_threads, nr_active_threads - 1)) { nr_active_threads -= 1; if (!spinning_queue_remove_first(0)) panic(STUB_ERROR_SPINNING_QUEUE_DECREF, 0); *nr_active_threads_p = nr_active_threads; break; } } if (spinning_queue_remove_first(__export_deep_sleep_timeout)) { break; } spinloop(); } return NULL; } #define NR_IF_THREAD_IS_ACTIVE (~0) static bool try_to_dec_threads_to_wakeup(struct context_queue *queue) { while (1) { uint32_t nr = atomic_load(&queue->num_threads_to_wakeup); if (nr == 0) { return false; } if (atomic_compare_exchange(&queue->num_threads_to_wakeup, &nr, nr - 1)) { return true; }; } } void init_new_thread() { struct context_queue *queue = __export_context_queue_addr; atomic_add(&queue->num_active_threads, 1); try_to_dec_threads_to_wakeup(queue); } // get_context retrieves a context that is ready to be restored to the user. // This populates sysmsg->thread_context_id. struct thread_context *get_context(struct sysmsg *sysmsg) { struct context_queue *queue = __export_context_queue_addr; uint32_t nr_active_threads; struct thread_context *ctx; for (;;) { atomic_add(&queue->num_spinning_threads, 1); // Change sysmsg thread state just to indicate thread is not asleep. atomic_store(&sysmsg->state, THREAD_STATE_PREP); ctx = queue_get_context(sysmsg); if (ctx) { goto exit; } bool fast_path_enabled = atomic_load(&queue->fast_path_disabled) == 0; nr_active_threads = NR_IF_THREAD_IS_ACTIVE; if (fast_path_enabled) { ctx = get_context_fast(sysmsg, queue, &nr_active_threads); if (ctx) goto exit; } if (nr_active_threads == NR_IF_THREAD_IS_ACTIVE) { nr_active_threads = atomic_sub(&queue->num_active_threads, 1); } atomic_sub(&queue->num_spinning_threads, 1); atomic_store(&sysmsg->state, THREAD_STATE_ASLEEP); uint32_t nr_active_contexts = atomic_load(&queue->num_active_contexts); // We have to make another attempt to get a context here to prevent TOCTTOU // races with waitOnState and kickSysmsgThread. There are two assumptions: // * If the queue isn't empty, one or more threads have to be active. // * A new thread isn't kicked, if the number of active threads are not less // than a number of active contexts. if (nr_active_threads < nr_active_contexts) { ctx = queue_get_context(sysmsg); if (ctx) { atomic_store(&sysmsg->state, THREAD_STATE_PREP); atomic_add(&queue->num_active_threads, 1); return ctx; } } while (1) { if (!try_to_dec_threads_to_wakeup(queue)) { sys_futex(&queue->num_threads_to_wakeup, FUTEX_WAIT, 0, NULL, NULL, 0); continue; } // Mark this thread as being active only if it can get a context. ctx = queue_get_context(sysmsg); if (ctx) { atomic_store(&sysmsg->state, THREAD_STATE_PREP); atomic_add(&queue->num_active_threads, 1); return ctx; } } } exit: atomic_sub(&queue->num_spinning_threads, 1); return ctx; } // switch_context signals the sentry that the old context is ready to be worked // on and retrieves a new context to switch to. struct thread_context *switch_context(struct sysmsg *sysmsg, struct thread_context *ctx, enum context_state new_context_state) { struct context_queue *queue = __export_context_queue_addr; if (ctx) { atomic_sub(&queue->num_active_contexts, 1); atomic_store(&ctx->thread_id, INVALID_THREAD_ID); atomic_store(&ctx->last_thread_id, sysmsg->thread_id); atomic_store(&ctx->state_changed_time, rdtsc()); atomic_store(&ctx->state, new_context_state); if (atomic_load(&ctx->sentry_fast_path) == 0) { int ret = sys_futex(&ctx->state, FUTEX_WAKE, 1, NULL, NULL, 0); if (ret < 0) { panic(STUB_ERROR_FUTEX, ret); } } } return get_context(sysmsg); } void verify_offsets() { BUILD_BUG_ON(offsetof_sysmsg_self != offsetof(struct sysmsg, self)); BUILD_BUG_ON(offsetof_sysmsg_ret_addr != offsetof(struct sysmsg, ret_addr)); BUILD_BUG_ON(offsetof_sysmsg_syshandler != offsetof(struct sysmsg, syshandler)); BUILD_BUG_ON(offsetof_sysmsg_syshandler_stack != offsetof(struct sysmsg, syshandler_stack)); BUILD_BUG_ON(offsetof_sysmsg_app_stack != offsetof(struct sysmsg, app_stack)); BUILD_BUG_ON(offsetof_sysmsg_interrupt != offsetof(struct sysmsg, interrupt)); BUILD_BUG_ON(offsetof_sysmsg_state != offsetof(struct sysmsg, state)); BUILD_BUG_ON(offsetof_sysmsg_context != offsetof(struct sysmsg, context)); BUILD_BUG_ON(offsetof_thread_context_fpstate != offsetof(struct thread_context, fpstate)); BUILD_BUG_ON(offsetof_thread_context_fpstate_changed != offsetof(struct thread_context, fpstate_changed)); BUILD_BUG_ON(offsetof_thread_context_ptregs != offsetof(struct thread_context, ptregs)); BUILD_BUG_ON(kTHREAD_STATE_NONE != THREAD_STATE_NONE); BUILD_BUG_ON(sizeof(struct thread_context) > ALLOCATED_SIZEOF_THREAD_CONTEXT_STRUCT); }