// Copyright Epic Games, Inc. All Rights Reserved.
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Threading.Tasks;
using EpicGames.Core;
using EpicGames.Horde.Storage;
using EpicGames.Serialization;
namespace Horde.Build.Utilities
{
/// <summary>
/// Information about a blob stored in a blob pack file
/// </summary>
class ObjectPackEntry
{
public IoHash Hash { get; }
public int Offset { get; }
public int Length { get; }
public IoHash[] Refs { get; }
public ObjectPackEntry(IoHash hash, int offset, int length, IoHash[] refs)
{
Hash = hash;
Offset = offset;
Length = length;
Refs = refs;
}
}
/// <summary>
/// Index for a blob pack
/// </summary>
[CbConverter(typeof(ObjectPackIndexConverter))]
class ObjectPackIndex
{
public DateTime Time { get; }
public ObjectPackEntry[] Blobs { get; }
public IoHash DataHash { get; }
public int DataSize { get; }
readonly Dictionary<IoHash, ObjectPackEntry> _hashToInfo;
public ObjectPackIndex(DateTime time, ObjectPackEntry[] blobs, IoHash dataHash, int dataSize)
{
Time = time;
Blobs = blobs;
DataHash = dataHash;
DataSize = dataSize;
_hashToInfo = blobs.ToDictionary(x => x.Hash, x => x);
}
public bool Contains(IoHash hash) => _hashToInfo.ContainsKey(hash);
public bool TryGetEntry(IoHash hash, [NotNullWhen(true)] out ObjectPackEntry? blobInfo) => _hashToInfo.TryGetValue(hash, out blobInfo);
}
/// <summary>
/// Converter for ObjectPackIndex objects
/// </summary>
class ObjectPackIndexConverter : CbConverterBase<ObjectPackIndex>
{
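/// <summary>
/// Serialization layout for the index, written as a compact binary object
/// </summary>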
class EncodeFormat
{
[CbField("time")]
public DateTime Time { get; set; }
[CbField("exports")]
public IoHash[]? Exports { get; set; }
[CbField("lengths")]
public int[]? Lengths { get; set; }
[CbField("refs")]
public IoHash[][]? Refs { get; set; }
[CbField("data")]
public CbBinaryAttachment DataHash { get; set; }
[CbField("size")]
public int DataSize { get; set; }
}
/// <inheritdoc/>
public override ObjectPackIndex Read(CbField field)
{
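// Entry offsets are not stored in the index; they are reconstructed by accumulating the lengths in order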
EncodeFormat format = CbSerializer.Deserialize<EncodeFormat>(field);
ObjectPackEntry[] objects = new ObjectPackEntry[format.Exports!.Length];
int offset = 0;
for (int idx = 0; idx < format.Exports.Length; idx++)
{
objects[idx] = new ObjectPackEntry(format.Exports[idx], offset, format.Lengths![idx], format.Refs![idx]);
offset += format.Lengths[idx];
}
return new ObjectPackIndex(format.Time, objects, format.DataHash, format.DataSize);
}
/// <inheritdoc/>
public override void Write(CbWriter writer, ObjectPackIndex index)
{
writer.BeginObject();
WriteInternal(writer, index);
writer.EndObject();
}
/// <inheritdoc/>
public override void WriteNamed(CbWriter writer, Utf8String name, ObjectPackIndex index)
{
writer.BeginObject(name);
WriteInternal(writer, index);
writer.EndObject();
}
static void WriteInternal(CbWriter writer, ObjectPackIndex index)
{
EncodeFormat format = new EncodeFormat();
format.Time = index.Time;
format.Exports = index.Blobs.Select(x => x.Hash).ToArray();
format.Lengths = index.Blobs.Select(x => x.Length).ToArray();
format.Refs = index.Blobs.Select(x => x.Refs).ToArray();
format.DataHash = index.DataHash;
format.DataSize = index.DataSize;
CbSerializer.Serialize(writer, format);
}
}
/// <summary>
/// Helper class to maintain a set of small objects, re-packing blobs according to a heuristic to balance download performance with churn.
/// </summary>
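/// <remarks>
/// A minimal usage sketch, assuming a storage client, namespace id and payload buffer are already
/// available (the 1MB pack size is illustrative only):
/// <code>
/// ObjectSet objectSet = new ObjectSet(storageClient, namespaceId, 1024 * 1024, DateTime.UtcNow);
/// IoHash hash = objectSet.Add(payload.Length, output => payload.Span.CopyTo(output.Span), new IoHash[0]);
/// objectSet.RootSet.Add(hash);
/// await objectSet.FlushAsync();
/// var data = await objectSet.GetObjectDataAsync(hash);
/// </code>
/// </remarks>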
class ObjectSet
{
readonly ILegacyStorageClient _storageClient;
readonly NamespaceId _namespaceId;
/// <summary>
/// Maximum size of an individual pack file before it is flushed to storage
/// </summary>
public int MaxPackSize { get; }
/// <summary>
/// Set of root objects; only objects reachable from these are retained when packs are merged
/// </summary>
public HashSet<IoHash> RootSet { get; set; } = new HashSet<IoHash>();
DateTime _time;
int _nextPackSize;
byte[] _nextPackData;
readonly List<ObjectPackEntry> _nextPackEntries = new List<ObjectPackEntry>();
readonly Dictionary<IoHash, ObjectPackEntry> _nextPackHashToEntry = new Dictionary<IoHash, ObjectPackEntry>();
/// <summary>
/// Indexes for the pack files that have been written
/// </summary>
public List<ObjectPackIndex> PackIndexes { get; } = new List<ObjectPackIndex>();
readonly List<Task> _writeTasks = new List<Task>();
/// <summary>
/// Constructor
/// </summary>
/// <param name="storageClient">Storage client used to read and write pack data</param>
/// <param name="namespaceId">Namespace to store pack data in</param>
/// <param name="maxPackSize">Maximum size of each pack file</param>
/// <param name="time">The initial update time; used to determine the age of blobs</param>
public ObjectSet(ILegacyStorageClient storageClient, NamespaceId namespaceId, int maxPackSize, DateTime time)
{
_storageClient = storageClient;
_namespaceId = namespaceId;
MaxPackSize = maxPackSize;
_nextPackData = null!;
SetTime(time);
Reset();
}
/// <summary>
/// Reset the current state of the next blob
/// </summary>
void Reset()
{
_nextPackSize = 0;
_nextPackData = new byte[MaxPackSize];
_nextPackEntries.Clear();
_nextPackHashToEntry.Clear();
}
/// <summary>
/// Reset the current timestamp
/// </summary>
/// <param name="time">The new timestamp</param>
public void SetTime(DateTime time)
{
_time = time;
}
/// <summary>
/// Copies an existing entry into storage
/// </summary>
/// <param name="hash">Hash of the data</param>
/// <param name="data">The data buffer</param>
/// <param name="refs">References to other objects</param>
public void Add(IoHash hash, ReadOnlySpan<byte> data, ReadOnlySpan<IoHash> refs)
{
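// Skip objects which are already present in the pack currently being built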
if (!_nextPackHashToEntry.ContainsKey(hash))
{
// Create enough space for the new data
CreateSpace(data.Length);
// Copy the data into the buffer
data.CopyTo(_nextPackData.AsSpan(_nextPackSize));
// Add the blob
ObjectPackEntry entry = new ObjectPackEntry(hash, _nextPackSize, data.Length, refs.ToArray());
_nextPackHashToEntry.Add(hash, entry);
_nextPackEntries.Add(entry);
_nextPackSize += data.Length;
}
}
/// <summary>
/// Adds an item to the packer
/// </summary>
/// <param name="size">Size of the data</param>
/// <param name="readData">Delegate to copy the data into a span</param>
/// <param name="refs">References to other objects</param>
/// <returns>Hash of the data that was added</returns>
public IoHash Add(int size, Action<Memory<byte>> readData, IoHash[] refs)
{
// Get the last blob and make sure there's enough space in it
CreateSpace(size);
// Copy the data into the new blob
Memory<byte> output = _nextPackData.AsMemory(_nextPackSize, size);
readData(output);
// Update the metadata for it
IoHash hash = IoHash.Compute(output.Span);
if (!_nextPackHashToEntry.ContainsKey(hash))
{
ObjectPackEntry entry = new ObjectPackEntry(hash, _nextPackSize, size, refs);
_nextPackHashToEntry.Add(hash, entry);
_nextPackEntries.Add(entry);
_nextPackSize += size;
}
return hash;
}
/// <summary>
/// Finds data for an object with the given hash, from the current pack files
/// </summary>
/// <param name="hash">Hash of the object to find</param>
/// <returns>Data for the object, or an empty memory block if it was not found</returns>
public async Task<ReadOnlyMemory<byte>> GetObjectDataAsync(IoHash hash)
{
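// Wait for any in-flight pack writes to complete, so that their data can be read back from storage below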
await Task.WhenAll(_writeTasks);
ObjectPackEntry? entry;
if (_nextPackHashToEntry.TryGetValue(hash, out entry))
{
return _nextPackData.AsMemory(entry.Offset, entry.Length);
}
foreach (ObjectPackIndex pack in PackIndexes)
{
if (pack.TryGetEntry(hash, out entry))
{
ReadOnlyMemory<byte> packData = await _storageClient.ReadBlobToMemoryAsync(_namespaceId, pack.DataHash);
return packData.Slice(entry.Offset, entry.Length);
}
}
return default;
}
/// <summary>
/// Tries to find an entry for the given hash from the current set of pack files
/// </summary>
/// <param name="hash">Hash of the object to find</param>
/// <param name="entry">Receives the entry that was found</param>
/// <returns>True if an entry was found, false otherwise</returns>
public bool TryGetEntry(IoHash hash, [NotNullWhen(true)] out ObjectPackEntry? entry)
{
ObjectPackEntry? localEntry;
if (_nextPackHashToEntry.TryGetValue(hash, out localEntry))
{
entry = localEntry;
return true;
}
foreach (ObjectPackIndex pack in PackIndexes)
{
if (pack.TryGetEntry(hash, out localEntry))
{
entry = localEntry;
return true;
}
}
entry = null;
return false;
}
/// <summary>
/// Flush any pending blobs to storage
/// </summary>
public async Task FlushAsync()
{
// Find the live set of objects
HashSet<IoHash> liveSet = new HashSet<IoHash>();
foreach(IoHash rootHash in RootSet)
{
FindLiveSet(rootHash, liveSet);
}
// Find the total cost of all the current blobs, then loop through the blobs trying to find a more optimal arrangement
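// Each candidate merge folds the still-live entries of an old pack into the pack currently being built,
// trading the saving from discarding the old pack against agents having to re-download its live data in newly written packs.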
double totalCost = PackIndexes.Sum(x => GetCostHeuristic(x)) + GetCostHeuristic(_nextPackSize, TimeSpan.Zero);
for (; ; )
{
// Exclude any objects that are in the pending blobs, since we will always upload these
HashSet<IoHash> newLiveSet = new HashSet<IoHash>(liveSet);
newLiveSet.ExceptWith(_nextPackHashToEntry.Values.Select(x => x.Hash));
// Get the size and cost of the next blob
double nextBlobCost = GetCostHeuristic(_nextPackSize, TimeSpan.Zero);
// Pass through all the blobs to find the best one to merge in
double mergeCost = totalCost;
ObjectPackIndex? mergePack = null;
for (int idx = PackIndexes.Count - 1; idx >= 0; idx--)
{
ObjectPackIndex packIndex = PackIndexes[idx];
// Try to merge any old blobs with the next blob
if (packIndex.Time < _time)
{
// Calculate the cost of the last blob if we merge this one with it. We remove blobs as we iterate
// through the list, since subsequent blobs will not usefully contribute the same items.
double newTotalCost = totalCost - GetCostHeuristic(packIndex) - nextBlobCost;
int newNextPackSize = _nextPackSize;
foreach (ObjectPackEntry entry in packIndex.Blobs)
{
if (liveSet.Contains(entry.Hash))
{
if (newNextPackSize + entry.Length > MaxPackSize)
{
newTotalCost += GetCostHeuristic(newNextPackSize, TimeSpan.Zero);
newNextPackSize = 0;
}
newNextPackSize += entry.Length;
}
}
newTotalCost += GetCostHeuristic(newNextPackSize, TimeSpan.Zero);
// Compute the potential cost if we replace the partial blob with the useful parts of this blob
if (newTotalCost < mergeCost)
{
mergePack = packIndex;
mergeCost = newTotalCost;
}
}
// Remove any items in this blob from the remaining live set. No other blobs need to include them.
newLiveSet.ExceptWith(packIndex.Blobs.Select(x => x.Hash));
}
// Bail out if we didn't find anything to merge
if (mergePack == null)
{
break;
}
// Get the data for this blob
ReadOnlyMemory<byte> mergeData = await _storageClient.ReadBlobToMemoryAsync(_namespaceId, mergePack.DataHash);
// Add anything that's still part of the live set into the new blobs
int offset = 0;
foreach (ObjectPackEntry blob in mergePack.Blobs)
{
if (liveSet.Contains(blob.Hash))
{
ReadOnlyMemory<byte> data = mergeData.Slice(offset, blob.Length);
Add(blob.Hash, data.Span, blob.Refs);
}
offset += blob.Length;
}
// Discard the old blob
PackIndexes.Remove(mergePack);
totalCost = mergeCost;
}
// Write the current blob
FlushCurrentPack();
// Wait for all the writes to finish
await Task.WhenAll(_writeTasks);
_writeTasks.Clear();
}
/// <summary>
/// Finds the live set for a particular tree by recursing through object references
/// </summary>
/// <param name="hash">Root object to start from</param>
/// <param name="liveSet">Set to receive the hashes of all reachable objects</param>
void FindLiveSet(IoHash hash, HashSet<IoHash> liveSet)
{
if (liveSet.Add(hash))
{
ObjectPackEntry? entry;
if (!TryGetEntry(hash, out entry))
{
throw new Exception($"Missing blob {hash} from working set");
}
foreach (IoHash refHash in entry.Refs)
{
FindLiveSet(refHash, liveSet);
}
}
}
/// <summary>
/// Creates enough space to store the given block of data
/// </summary>
/// <param name="size">Size of the data to be stored</param>
void CreateSpace(int size)
{
// Get the last blob and make sure there's enough space in it
if (_nextPackSize + size > MaxPackSize)
{
FlushCurrentPack();
}
// Resize the next blob buffer if necessary
if (size > _nextPackData.Length)
{
Array.Resize(ref _nextPackData, size);
}
}
/// <summary>
/// Finalize the current blob and start writing it to storage
/// </summary>
void FlushCurrentPack()
{
if (_nextPackSize > 0)
{
// Write the buffer to storage
Array.Resize(ref _nextPackData, _nextPackSize);
ReadOnlyMemory<byte> data = _nextPackData;
IoHash dataHash = IoHash.Compute(data.Span);
_writeTasks.Add(Task.Run(() => _storageClient.WriteBlobFromMemoryAsync(_namespaceId, dataHash, data)));
// Create the new index
ObjectPackIndex index = new ObjectPackIndex(_time, _nextPackEntries.ToArray(), dataHash, _nextPackSize);
PackIndexes.Add(index);
// Clear the next pack buffer
Reset();
}
}
/// <summary>
/// Calculates the cost heuristic for an existing pack index
/// </summary>
/// <param name="index">Index to calculate the heuristic for</param>
public double GetCostHeuristic(ObjectPackIndex index) => GetCostHeuristic(index.DataSize, _time - index.Time);
/// <summary>
/// Heuristic which estimates the cost of a particular blob. This is used to compare scenarios of merging blobs to reduce download
/// size against keeping older blobs which a lot of agents already have.
/// </summary>
/// <param name="size">Size of the blob</param>
/// <param name="age">Age of the blob</param>
/// <returns>Heuristic for the cost of a blob</returns>
public static double GetCostHeuristic(int size, TimeSpan age)
{
// Time overhead to starting a download
const double DownloadInit = 0.1;
// Download speed for agents, in bytes/sec
const double DownloadRate = 1024 * 1024;
// Probability of an agent having to download everything. Prevents bias against keeping a large number of files.
const double CleanSyncProbability = 0.2;
// Average length of time between agents having to update
TimeSpan averageCoherence = TimeSpan.FromHours(4.0);
// Scale the age into a -1.0 -> 1.0 range around AverageCoherence
double scaledAge = (averageCoherence - age).TotalSeconds / averageCoherence.TotalSeconds;
// Get the probability of agents having to sync this blob based on its age. This is modeled as a logistic function (1 / (1 + e^-x))
// with value of 0.5 at AverageCoherence, and MaxInterval at zero.
// Find the scale factor for the 95% interval
// 1 / (1 + e^-x) = MaxInterval
// e^-x = (1 / MaxInterval) - 1
// x = -ln((1 / MaxInterval) - 1)
const double MaxInterval = 0.95;
double sigmoidScale = -Math.Log((1.0 / MaxInterval) - 1.0);
// Find the probability of having to sync this
double param = scaledAge * sigmoidScale;
double probability = 1.0 / (1.0 + Math.Exp(-param));
// Scale the probability against having to do a full sync
probability = CleanSyncProbability + (probability * (1.0 - CleanSyncProbability));
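// For example, with the constants above: a brand new blob (age zero) is synced with probability
// 0.2 + 0.95 * 0.8 = 0.96, a blob exactly averageCoherence old with 0.2 + 0.5 * 0.8 = 0.6, and a
// very old blob tends towards the clean-sync floor of 0.2.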
// Compute the final cost estimate; the amount of time we expect agents to spend downloading the file
return probability * (DownloadInit + (size / DownloadRate));
}
}
}