// Copyright Epic Games, Inc. All Rights Reserved.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;
using HordeCommon;
using Microsoft.Extensions.Diagnostics.HealthChecks;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StackExchange.Redis;
namespace Horde.Server.Server;
/// <summary>
/// Allows reporting the health of a subsystem
/// </summary>
public interface IHealthMonitor
{
/// <summary>
/// Sets the name used for reporting status of this service
/// </summary>
/// <param name="name">Name to report under</param>
void SetName(string name);
/// <summary>
/// Updates the current health of a system
/// </summary>
/// <param name="result">Health status to record</param>
/// <param name="message">Optional message accompanying the status</param>
/// <param name="timestamp">Optional time to associate with the update</param>
Task UpdateAsync(HealthStatus result, string? message = null, DateTimeOffset? timestamp = null);
}
/// <summary>
/// Typed implementation of <see cref="IHealthMonitor"/>
/// </summary>
/// <typeparam name="T">Type of the subsystem</typeparam>
public interface IHealthMonitor<T> : IHealthMonitor
{
}
/// <summary>
/// Health monitor which forwards every update to a <see cref="ServerStatusService"/>
/// </summary>
/// <typeparam name="T">Type of the subsystem. It is passed to the status service as the reporting type.</typeparam>
internal class HealthMonitor<T> : IHealthMonitor<T>
{
    private readonly ServerStatusService _statusService;
    private string _name;

    /// <summary>
    /// Creates a monitor which reports under the short type name of <typeparamref name="T"/>
    /// </summary>
    /// <param name="statusService">Service receiving the reports</param>
    public HealthMonitor(ServerStatusService statusService)
        : this(statusService, typeof(T).Name)
    {
    }

    /// <summary>
    /// Creates a monitor which reports under an explicit name
    /// </summary>
    /// <param name="statusService">Service receiving the reports</param>
    /// <param name="name">Name to report under</param>
    public HealthMonitor(ServerStatusService statusService, string name)
    {
        _statusService = statusService;
        _name = name;
    }

    /// <inheritdoc/>
    public void SetName(string name)
    {
        _name = name;
    }

    /// <inheritdoc/>
    public async Task UpdateAsync(HealthStatus result, string? message = null, DateTimeOffset? timestamp = null)
    {
        // Defaults mirror the interface declaration so calls made through the concrete type behave the same.
        await _statusService.ReportAsync(typeof(T), _name, result, message, timestamp);
    }
}
/// <summary>
/// Represents status of a subsystem inside Horde
/// </summary>
/// <param name="Id">Unique ID</param>
/// <param name="Name">Human-readable name</param>
/// <param name="Updates">List of updates</param>
public record SubsystemStatus(string Id, string Name, List<SubsystemStatusUpdate> Updates)
{
    /// <inheritdoc/>
    public override string ToString()
    {
        // The first entry is rendered as "LastUpdate"; an empty history renders as an empty string.
        string updates = Updates.Count > 0 ? Updates[0].ToString() : "";
        return $"Subsystem(Id={Id} Name={Name} LastUpdate={updates})";
    }
}
/// <summary>
/// An individual status update for a subsystem
/// </summary>
/// <param name="Result">Health status reported by the update</param>
/// <param name="Message">Optional message accompanying the status</param>
/// <param name="UpdatedAt">Time associated with the update</param>
public record SubsystemStatusUpdate(HealthStatus Result, string? Message, DateTimeOffset UpdatedAt);
/// <summary>
/// Tracks health and status of the Horde server itself,
/// such as connectivity to external systems (MongoDB, Redis, Perforce etc).
/// </summary>
public class ServerStatusService : IHostedService
{
    /// <summary>
    /// Max historical status updates to keep
    /// </summary>
    public const int MaxHistoryLength = 10;

    // Interval shared by the MongoDB and Redis health tickers.
    private static readonly TimeSpan s_healthCheckInterval = TimeSpan.FromSeconds(30.0);

    private readonly IClock _clock;
    private readonly MongoService _mongoService;
    private readonly RedisService _redis;
    private readonly ILogger _logger;
    private readonly IHealthMonitor _mongoDbHealth;
    private readonly ITicker _mongoDbHealthTicker;
    private readonly IHealthMonitor _redisHealth;
    private readonly ITicker _redisHealthTicker;

    // Redis hash holding one field per subsystem, keyed by subsystem id.
    private static string RedisHashKey() => "server-status";

    /// <summary>
    /// Constructor
    /// </summary>
    /// <param name="mongoService">MongoDB service whose health is checked periodically</param>
    /// <param name="redisService">Redis service; health-checked periodically and also used to store the status data</param>
    /// <param name="clock">Clock used to create the tickers and to timestamp reports</param>
    /// <param name="logger">Logger passed to the tickers and used for diagnostics</param>
    public ServerStatusService(MongoService mongoService, RedisService redisService, IClock clock, ILogger<ServerStatusService> logger)
    {
        _mongoService = mongoService;
        _redis = redisService;
        _clock = clock;
        _logger = logger;

        // NOTE(review): the HealthMonitor type arguments were missing from the reviewed source and are
        // reconstructed from the service each monitor probes. typeof(T).Name becomes the Redis hash
        // field for the subsystem, so confirm these against existing stored data.
        _mongoDbHealth = new HealthMonitor<MongoService>(this, "MongoDB");
        _mongoDbHealthTicker = clock.AddTicker($"{nameof(ServerStatusService)}.MongoDb", s_healthCheckInterval, UpdateMongoDbHealthAsync, logger);

        _redisHealth = new HealthMonitor<RedisService>(this, "Redis");
        _redisHealthTicker = clock.AddTicker($"{nameof(ServerStatusService)}.Redis", s_healthCheckInterval, UpdateRedisHealthAsync, logger);
    }

    /// <inheritdoc/>
    public async Task StartAsync(CancellationToken cancellationToken)
    {
        await _mongoDbHealthTicker.StartAsync();
        await _redisHealthTicker.StartAsync();
    }

    /// <inheritdoc/>
    public async Task StopAsync(CancellationToken cancellationToken)
    {
        await _mongoDbHealthTicker.StopAsync();
        await _redisHealthTicker.StopAsync();
    }

    /// <summary>
    /// Checks health and connectivity to MongoDB database
    /// </summary>
    /// <param name="cancellationToken">Cancellation token for the health check</param>
    internal async ValueTask UpdateMongoDbHealthAsync(CancellationToken cancellationToken)
    {
        HealthCheckResult result = await _mongoService.CheckHealthAsync(new HealthCheckContext(), cancellationToken);
        await _mongoDbHealth.UpdateAsync(result.Status, result.Description);
    }

    /// <summary>
    /// Checks health and connectivity to Redis database
    /// </summary>
    /// <param name="cancellationToken">Cancellation token for the health check</param>
    internal async ValueTask UpdateRedisHealthAsync(CancellationToken cancellationToken)
    {
        HealthCheckResult result = await _redis.CheckHealthAsync(new HealthCheckContext(), cancellationToken);
        await _redisHealth.UpdateAsync(result.Status, result.Description);
    }

    /// <summary>
    /// Report a status update for a given subsystem
    /// </summary>
    /// <param name="type">Service type reporting health. Its short name is used as the subsystem id.</param>
    /// <param name="name">Human-readable name</param>
    /// <param name="result">Result of the update</param>
    /// <param name="message">Human-readable message</param>
    /// <param name="timestamp">Optional timestamp to be associated with the report. Defaults to the clock's UtcNow</param>
    /// <exception cref="ArgumentNullException"><paramref name="type"/> is null</exception>
    public async Task ReportAsync(Type type, string name, HealthStatus result, string? message = null, DateTimeOffset? timestamp = null)
    {
        ArgumentNullException.ThrowIfNull(type);

        string id = type.Name;
        IDatabase redis = _redis.GetDatabase();

        SubsystemStatus status = await GetSubsystemStatusFromRedisAsync(redis, id, name);
        if (!String.Equals(status.Name, name, StringComparison.Ordinal))
        {
            // A stored record keeps the name it was created with; refresh it so a later rename takes effect.
            status = status with { Name = name };
        }

        SubsystemStatusUpdate update = new(result, message, timestamp ?? _clock.UtcNow);
        status.Updates.Add(update);

        // Keep the history ordered newest first, then drop everything beyond the retention limit.
        status.Updates.Sort((a, b) => b.UpdatedAt.CompareTo(a.UpdatedAt));
        if (status.Updates.Count > MaxHistoryLength)
        {
            status.Updates.RemoveRange(MaxHistoryLength, status.Updates.Count - MaxHistoryLength);
        }

        string data = JsonSerializer.Serialize(status);
        await redis.HashSetAsync(RedisHashKey(), id, data);
    }

    /// <summary>
    /// Reads the stored status for a subsystem, falling back to an empty history when nothing
    /// is stored or the stored value cannot be read.
    /// </summary>
    /// <param name="redis">Redis database to read from</param>
    /// <param name="id">Subsystem id (hash field)</param>
    /// <param name="name">Human-readable name used when a new status has to be created</param>
    /// <returns>The stored status, or a new status with no updates</returns>
    private async Task<SubsystemStatus> GetSubsystemStatusFromRedisAsync(IDatabase redis, string id, string name)
    {
        try
        {
            string? rawJson = await redis.HashGetAsync(RedisHashKey(), id);
            if (rawJson != null)
            {
                return JsonSerializer.Deserialize<SubsystemStatus>(rawJson) ?? throw new JsonException("Unable to parse JSON: " + rawJson);
            }
        }
        catch (Exception ex)
        {
            // Best effort: a failed read must not prevent the new update from being recorded,
            // but the failure is logged rather than silently discarded.
            _logger.LogWarning(ex, "Unable to read stored status for subsystem {SubsystemId}; starting a new history", id);
        }

        return new SubsystemStatus(id, name, []);
    }

    /// <summary>
    /// Get a list of status and updates for each subsystem
    /// </summary>
    /// <remarks>
    /// NOTE(review): the generic return type was missing from the reviewed source; confirm the
    /// collection type against callers.
    /// </remarks>
    /// <returns>A list of statuses. Entries that cannot be parsed are skipped.</returns>
    public async Task<List<SubsystemStatus>> GetSubsystemStatusesAsync()
    {
        HashEntry[] entries = await _redis.GetDatabase().HashGetAllAsync(RedisHashKey());

        List<SubsystemStatus> subsystems = new(entries.Length);
        foreach (HashEntry entry in entries)
        {
            try
            {
                SubsystemStatus status = JsonSerializer.Deserialize<SubsystemStatus>(entry.Value.ToString()) ?? throw new JsonException("Failed parsing JSON");
                subsystems.Add(status);
            }
            catch (JsonException ex)
            {
                // One unreadable entry should not hide the remaining subsystems.
                _logger.LogWarning(ex, "Skipping unreadable status entry for subsystem {SubsystemId}", entry.Name.ToString());
            }
        }

        return subsystems;
    }
}