Files

419 lines
14 KiB
C
Raw Permalink Normal View History

2023-02-22 18:18:47 -08:00
// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#define _GNU_SOURCE
#include <errno.h>
#include <linux/futex.h>
#include <linux/unistd.h>
#include <signal.h>
2023-03-21 22:08:06 -07:00
#include <stdbool.h>
2023-02-22 18:18:47 -08:00
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include "atomic.h"
2023-02-22 18:18:47 -08:00
#include "sysmsg.h"
// __export_deep_sleep_timeout is the timeout after which the stub thread stops
// polling and fall asleep.
uint64_t __export_deep_sleep_timeout;
2023-03-14 17:45:21 -07:00
// LINT.IfChange
#define MAX_GUEST_CONTEXTS (4095)
#define MAX_CONTEXT_QUEUE_ENTRIES (MAX_GUEST_CONTEXTS + 1)
#define INVALID_CONTEXT_ID 0xfefefefe
#define INVALID_THREAD_ID 0xfefefefe
// Each element of a context_queue ring buffer is a sum of its index shifted by
// CQ_INDEX_SHIFT and context_id.
#define CQ_INDEX_SHIFT 32
#define CQ_CONTEXT_MASK ((1UL << CQ_INDEX_SHIFT) - 1)
2023-03-14 17:45:21 -07:00
// See systrap/context_queue.go
struct context_queue {
uint32_t start;
uint32_t end;
uint32_t num_active_threads;
2023-10-25 11:53:10 -07:00
uint32_t num_spinning_threads;
uint32_t num_threads_to_wakeup;
uint32_t num_active_contexts;
uint32_t num_awake_contexts;
uint32_t fast_path_disabled;
2023-10-25 11:53:10 -07:00
uint32_t used_fast_path;
uint64_t ringbuffer[MAX_CONTEXT_QUEUE_ENTRIES];
2023-03-14 17:45:21 -07:00
};
2023-03-21 22:08:06 -07:00
struct context_queue *__export_context_queue_addr;
2023-03-14 17:45:21 -07:00
// LINT.ThenChange(../context_queue.go)
uint32_t is_empty(struct context_queue *queue) {
return atomic_load(&queue->start) == atomic_load(&queue->end);
2023-03-14 17:45:21 -07:00
}
int32_t queued_contexts(struct context_queue *queue) {
return (atomic_load(&queue->end) + MAX_CONTEXT_QUEUE_ENTRIES -
atomic_load(&queue->start)) %
2023-03-14 17:45:21 -07:00
MAX_CONTEXT_QUEUE_ENTRIES;
}
2023-02-22 18:18:47 -08:00
#if defined(__x86_64__)
static __inline__ unsigned long rdtsc(void) {
unsigned h, l;
__asm__ __volatile__("rdtsc" : "=a"(l), "=d"(h));
return ((unsigned long)l) | (((unsigned long)h) << 32);
}
static __inline__ void spinloop(void) { asm("pause"); }
#elif defined(__aarch64__)
static __inline__ unsigned long rdtsc(void) {
long val;
asm volatile("mrs %0, cntvct_el0" : "=r"(val));
return val;
}
static __inline__ void spinloop(void) { asm volatile("yield" : : : "memory"); }
#endif
2023-03-22 12:23:24 -07:00
void *__export_context_region;
static struct thread_context *thread_context_addr(uint32_t tcid) {
return (struct thread_context *)(__export_context_region +
tcid *
ALLOCATED_SIZEOF_THREAD_CONTEXT_STRUCT);
}
void memcpy(uint8_t *dest, uint8_t *src, size_t n) {
for (size_t i = 0; i < n; i += 1) {
dest[i] = src[i];
}
}
2023-03-21 22:08:06 -07:00
// The spinning queue is a queue of spinning threads. It solves the
// fragmentation problem. The idea is to minimize the number of threads
// processing requests. We can't control how system threads are scheduled, so
// can't distribute requests efficiently. The spinning queue emulates virtual
// threads sorted by their spinning time.
//
// This queue is lock-less to be sure that any thread scheduled out
// from CPU doesn't block others.
//
// The size of the queue must be a divisor of 2^32, because queue indexes are
// calculated as modules of uint32 values.
#define SPINNING_QUEUE_SIZE 256
2023-10-25 11:53:10 -07:00
// MAX_RE_ENQUEUE defines the amount of time a given entry in the spinning queue
// needs to reach timeout in order to be removed. Re-enqueuing a timeout is done
// in order to mitigate rdtsc inaccuracies.
#define MAX_RE_ENQUEUE 2
2023-03-21 22:08:06 -07:00
struct spinning_queue {
uint32_t len;
2023-03-21 22:08:06 -07:00
uint32_t start;
uint32_t end;
uint64_t start_times[SPINNING_QUEUE_SIZE];
2023-10-25 11:53:10 -07:00
uint8_t num_times_re_enqueued[SPINNING_QUEUE_SIZE];
2023-03-21 22:08:06 -07:00
};
struct spinning_queue *__export_spinning_queue_addr;
// spinning_queue_push adds a new thread to the queue. It returns false if the
2023-10-25 11:53:10 -07:00
// queue is full, or if re_enqueue_times has reached MAX_RE_ENQUEUE.
static bool spinning_queue_push(uint8_t re_enqueue_times)
__attribute__((warn_unused_result));
static bool spinning_queue_push(uint8_t re_enqueue_times) {
2023-03-21 22:08:06 -07:00
struct spinning_queue *queue = __export_spinning_queue_addr;
uint32_t idx, end, len;
2023-03-21 22:08:06 -07:00
BUILD_BUG_ON(sizeof(struct spinning_queue) > SPINNING_QUEUE_MEM_SIZE);
2023-10-25 11:53:10 -07:00
if (re_enqueue_times >= MAX_RE_ENQUEUE) {
return false;
}
2023-03-21 22:08:06 -07:00
len = atomic_add(&queue->len, 1);
if (len > SPINNING_QUEUE_SIZE) {
atomic_sub(&queue->len, 1);
2023-03-21 22:08:06 -07:00
return false;
}
end = atomic_add(&queue->end, 1);
2023-03-21 22:08:06 -07:00
idx = end - 1;
2023-10-25 11:53:10 -07:00
atomic_store(&queue->num_times_re_enqueued[idx % SPINNING_QUEUE_SIZE],
re_enqueue_times);
atomic_store(&queue->start_times[idx % SPINNING_QUEUE_SIZE], rdtsc());
2023-03-21 22:08:06 -07:00
return true;
}
// spinning_queue_pop() removes one thread from a queue that has been spinning
// the shortest time.
2023-10-25 11:53:10 -07:00
// However it doesn't take into account the spinning re-enqueue.
2023-03-21 22:08:06 -07:00
static void spinning_queue_pop() {
struct spinning_queue *queue = __export_spinning_queue_addr;
atomic_sub(&queue->end, 1);
atomic_sub(&queue->len, 1);
2023-03-21 22:08:06 -07:00
}
// spinning_queue_remove_first removes one thread from a queue that has been
// spinning longer than others and longer than a specified timeout.
//
// If `timeout` is zero, it always removes one element and never returns false.
//
2023-03-21 22:08:06 -07:00
// Returns true if one thread has been removed from the queue.
static bool spinning_queue_remove_first(uint64_t timeout)
__attribute__((warn_unused_result));
static bool spinning_queue_remove_first(uint64_t timeout) {
struct spinning_queue *queue = __export_spinning_queue_addr;
uint64_t ts;
2023-10-25 11:53:10 -07:00
uint8_t re_enqueue = 0;
2023-03-21 22:08:06 -07:00
while (1) {
uint32_t idx, qidx;
2023-03-21 22:08:06 -07:00
idx = atomic_load(&queue->start);
qidx = idx % SPINNING_QUEUE_SIZE;
ts = atomic_load(&queue->start_times[qidx]);
if (ts == 0) continue;
if (rdtsc() - ts < timeout) return false;
if (idx != atomic_load(&queue->start)) continue; // Lose the race.
re_enqueue = atomic_load(&queue->num_times_re_enqueued[qidx]);
if (atomic_compare_exchange(&queue->start_times[qidx], &ts, 0)) {
atomic_add(&queue->start, 1);
break;
}
2023-03-21 22:08:06 -07:00
}
atomic_sub(&queue->len, 1);
2023-10-25 11:53:10 -07:00
if (timeout == 0) return true;
return !spinning_queue_push(re_enqueue + 1);
2023-03-21 22:08:06 -07:00
}
struct thread_context *queue_get_context(struct sysmsg *sysmsg) {
struct context_queue *queue = __export_context_queue_addr;
// Indexes should not jump when start or end are overflowed.
BUILD_BUG_ON(UINT32_MAX % MAX_CONTEXT_QUEUE_ENTRIES !=
MAX_CONTEXT_QUEUE_ENTRIES - 1);
2023-03-21 22:08:06 -07:00
while (!is_empty(queue)) {
uint64_t idx = atomic_load(&queue->start);
uint32_t next = idx % MAX_CONTEXT_QUEUE_ENTRIES;
uint64_t v = atomic_load(&queue->ringbuffer[next]);
// We need to check the index to be sure that a ring buffer hasn't been
// recycled.
if ((v >> CQ_INDEX_SHIFT) != idx) continue;
if (!atomic_compare_exchange(&queue->ringbuffer[next], &v,
INVALID_CONTEXT_ID)) {
continue;
}
uint32_t context_id = v & CQ_CONTEXT_MASK;
2023-03-21 22:08:06 -07:00
if (context_id == INVALID_CONTEXT_ID) continue;
atomic_add(&queue->start, 1);
if (context_id > MAX_GUEST_CONTEXTS) {
panic(STUB_ERROR_BAD_CONTEXT_ID, context_id);
2023-03-21 22:08:06 -07:00
}
2023-03-22 12:23:24 -07:00
struct thread_context *ctx = thread_context_addr(context_id);
sysmsg->context = ctx;
2023-10-25 11:53:10 -07:00
atomic_store(&ctx->acked_time, rdtsc());
atomic_store(&ctx->thread_id, sysmsg->thread_id);
2023-03-21 22:08:06 -07:00
return ctx;
}
return NULL;
}
// get_context_fast sets nr_active_threads_p only if it deactivates the thread.
static struct thread_context *get_context_fast(struct sysmsg *sysmsg,
struct context_queue *queue,
uint32_t *nr_active_threads_p) {
uint32_t nr_active_threads, nr_awake_contexts;
2023-10-25 11:53:10 -07:00
if (!spinning_queue_push(0)) return NULL;
atomic_store(&queue->used_fast_path, 1);
while (1) {
struct thread_context *ctx;
ctx = queue_get_context(sysmsg);
if (ctx) {
spinning_queue_pop();
return ctx;
}
if (atomic_load(&queue->fast_path_disabled) != 0) {
if (!spinning_queue_remove_first(0))
panic(STUB_ERROR_SPINNING_QUEUE_DECREF, 0);
break;
}
nr_active_threads = atomic_load(&queue->num_active_threads);
nr_awake_contexts = atomic_load(&queue->num_awake_contexts);
if (nr_awake_contexts < nr_active_threads) {
if (atomic_compare_exchange(&queue->num_active_threads,
&nr_active_threads, nr_active_threads - 1)) {
nr_active_threads -= 1;
if (!spinning_queue_remove_first(0))
panic(STUB_ERROR_SPINNING_QUEUE_DECREF, 0);
*nr_active_threads_p = nr_active_threads;
break;
}
}
if (spinning_queue_remove_first(__export_deep_sleep_timeout)) {
break;
}
spinloop();
}
return NULL;
}
#define NR_IF_THREAD_IS_ACTIVE (~0)
static bool try_to_dec_threads_to_wakeup(struct context_queue *queue) {
while (1) {
uint32_t nr = atomic_load(&queue->num_threads_to_wakeup);
if (nr == 0) {
return false;
}
if (atomic_compare_exchange(&queue->num_threads_to_wakeup, &nr, nr - 1)) {
return true;
};
}
}
void init_new_thread() {
struct context_queue *queue = __export_context_queue_addr;
atomic_add(&queue->num_active_threads, 1);
try_to_dec_threads_to_wakeup(queue);
}
2023-03-14 17:45:21 -07:00
// get_context retrieves a context that is ready to be restored to the user.
// This populates sysmsg->thread_context_id.
struct thread_context *get_context(struct sysmsg *sysmsg) {
2023-03-21 22:08:06 -07:00
struct context_queue *queue = __export_context_queue_addr;
uint32_t nr_active_threads;
2023-03-21 22:08:06 -07:00
2023-10-25 11:53:10 -07:00
struct thread_context *ctx;
2023-03-14 17:45:21 -07:00
for (;;) {
2023-10-25 11:53:10 -07:00
atomic_add(&queue->num_spinning_threads, 1);
2023-03-21 22:08:06 -07:00
2023-03-14 17:45:21 -07:00
// Change sysmsg thread state just to indicate thread is not asleep.
atomic_store(&sysmsg->state, THREAD_STATE_PREP);
2023-03-21 22:08:06 -07:00
ctx = queue_get_context(sysmsg);
if (ctx) {
2023-10-25 11:53:10 -07:00
goto exit;
}
2023-10-25 11:53:10 -07:00
bool fast_path_enabled = atomic_load(&queue->fast_path_disabled) == 0;
nr_active_threads = NR_IF_THREAD_IS_ACTIVE;
if (fast_path_enabled) {
ctx = get_context_fast(sysmsg, queue, &nr_active_threads);
2023-10-25 11:53:10 -07:00
if (ctx) goto exit;
}
if (nr_active_threads == NR_IF_THREAD_IS_ACTIVE) {
nr_active_threads = atomic_sub(&queue->num_active_threads, 1);
2023-03-14 17:45:21 -07:00
}
2023-10-25 11:53:10 -07:00
atomic_sub(&queue->num_spinning_threads, 1);
atomic_store(&sysmsg->state, THREAD_STATE_ASLEEP);
uint32_t nr_active_contexts = atomic_load(&queue->num_active_contexts);
// We have to make another attempt to get a context here to prevent TOCTTOU
// races with waitOnState and kickSysmsgThread. There are two assumptions:
// * If the queue isn't empty, one or more threads have to be active.
// * A new thread isn't kicked, if the number of active threads are not less
// than a number of active contexts.
if (nr_active_threads < nr_active_contexts) {
ctx = queue_get_context(sysmsg);
if (ctx) {
atomic_store(&sysmsg->state, THREAD_STATE_PREP);
atomic_add(&queue->num_active_threads, 1);
return ctx;
}
}
2023-03-14 17:45:21 -07:00
while (1) {
if (!try_to_dec_threads_to_wakeup(queue)) {
sys_futex(&queue->num_threads_to_wakeup, FUTEX_WAIT, 0, NULL, NULL, 0);
continue;
}
// Mark this thread as being active only if it can get a context.
ctx = queue_get_context(sysmsg);
if (ctx) {
atomic_store(&sysmsg->state, THREAD_STATE_PREP);
atomic_add(&queue->num_active_threads, 1);
return ctx;
}
}
2023-03-14 17:45:21 -07:00
}
2023-10-25 11:53:10 -07:00
exit:
atomic_sub(&queue->num_spinning_threads, 1);
return ctx;
2023-03-14 17:45:21 -07:00
}
// switch_context signals the sentry that the old context is ready to be worked
// on and retrieves a new context to switch to.
struct thread_context *switch_context(struct sysmsg *sysmsg,
struct thread_context *ctx,
enum context_state new_context_state) {
struct context_queue *queue = __export_context_queue_addr;
if (ctx) {
atomic_sub(&queue->num_active_contexts, 1);
atomic_store(&ctx->thread_id, INVALID_THREAD_ID);
atomic_store(&ctx->last_thread_id, sysmsg->thread_id);
2023-10-25 11:53:10 -07:00
atomic_store(&ctx->state_changed_time, rdtsc());
atomic_store(&ctx->state, new_context_state);
if (atomic_load(&ctx->sentry_fast_path) == 0) {
int ret = sys_futex(&ctx->state, FUTEX_WAKE, 1, NULL, NULL, 0);
if (ret < 0) {
panic(STUB_ERROR_FUTEX, ret);
}
2023-03-14 17:45:21 -07:00
}
}
2023-03-22 09:55:04 -07:00
return get_context(sysmsg);
2023-03-14 17:45:21 -07:00
}
void verify_offsets() {
BUILD_BUG_ON(offsetof_sysmsg_self != offsetof(struct sysmsg, self));
BUILD_BUG_ON(offsetof_sysmsg_ret_addr != offsetof(struct sysmsg, ret_addr));
BUILD_BUG_ON(offsetof_sysmsg_syshandler !=
offsetof(struct sysmsg, syshandler));
BUILD_BUG_ON(offsetof_sysmsg_syshandler_stack !=
offsetof(struct sysmsg, syshandler_stack));
BUILD_BUG_ON(offsetof_sysmsg_app_stack != offsetof(struct sysmsg, app_stack));
BUILD_BUG_ON(offsetof_sysmsg_interrupt != offsetof(struct sysmsg, interrupt));
BUILD_BUG_ON(offsetof_sysmsg_state != offsetof(struct sysmsg, state));
2023-03-22 12:23:24 -07:00
BUILD_BUG_ON(offsetof_sysmsg_context != offsetof(struct sysmsg, context));
BUILD_BUG_ON(offsetof_thread_context_fpstate !=
offsetof(struct thread_context, fpstate));
BUILD_BUG_ON(offsetof_thread_context_fpstate_changed !=
offsetof(struct thread_context, fpstate_changed));
BUILD_BUG_ON(offsetof_thread_context_ptregs !=
offsetof(struct thread_context, ptregs));
BUILD_BUG_ON(kTHREAD_STATE_NONE != THREAD_STATE_NONE);
BUILD_BUG_ON(sizeof(struct thread_context) >
ALLOCATED_SIZEOF_THREAD_CONTEXT_STRUCT);
}