// NOTE: source-viewer page residue (not part of the original file): Files | 141 lines | 5.7 KiB | C++ | Raw / Permalink / Normal View / History

// Copyright Epic Games, Inc. All Rights Reserved.
#include "RecoveryService.h"
#if CRASH_REPORT_WITH_RECOVERY
#include "HAL/FileManager.h"
#include "CrashReportClient.h" // For CrashReportClientLog
#include "Interfaces/IPluginManager.h"
#include "IMessagingModule.h"
#include "ConcertSettings.h"
#include "ConcertSyncSessionFlags.h"
#include "IConcertServer.h"
#include "IConcertSession.h"
#include "IConcertSyncServer.h"
#include "IConcertSyncServerModule.h"
#include "ConcertMessageData.h"
#include "Runtime/Launch/Resources/Version.h"
#include "ConcertLocalFileSharingService.h"
// Human-readable name of the service, used as the prefix of the startup/shutdown log messages emitted in this file.
static const TCHAR RecoveryServiceName[] = TEXT("Disaster Recovery Service");
bool FRecoveryService::CollectFiles(const FString& DestDir, bool bMetaDataOnly, bool bAnonymizeMetaData)
{
auto LogError = [](const TCHAR* Reason)
{
UE_LOG(CrashReportClientLog, Error, TEXT("Failed to collect recovery session file(s). %s"), Reason);
};
if (!Server)
{
LogError(TEXT("The recovery service is not running."));
return false;
}
else if (!IFileManager::Get().DirectoryExists(*DestDir))
{
LogError(TEXT("The destination folder doesn't exist."));
return false;
}
FGuid ExportedSessionId = GetRecoverySessionId();
if (!ExportedSessionId.IsValid())
{
LogError(TEXT("The session session could not be found."));
return false;
}
FText ErrorMsg;
FConcertSessionFilter Filter;
Filter.bMetaDataOnly = bMetaDataOnly;
if (!Server->GetConcertServer()->ExportSession(ExportedSessionId, Filter, DestDir, bAnonymizeMetaData, ErrorMsg))
{
LogError(TEXT("Server failed to export the session."));
return false;
}
return true;
}
bool FRecoveryService::Startup()
{
#if UE_BUILD_SHIPPING && (!defined(PLATFORM_SUPPORTS_MESSAGEBUS) || !PLATFORM_SUPPORTS_MESSAGEBUS)
#error PLATFORM_SUPPORTS_MESSAGEBUS was explicitly defined in CrashReportClient.Target.cs for shipping configuration. MessageBus is required by Concert. Ensure it is still enabled.
#endif
if (!IMessagingModule::Get().GetDefaultBus())
{
UE_LOG(CrashReportClientLog, Error, TEXT("MessageBus is not enabled in this configuration. Recovery service will be disabled!"));
return false;
}
if (!IConcertSyncServerModule::IsAvailable())
{
UE_LOG(CrashReportClientLog, Error, TEXT("ConcertSyncServer Module is missing. Recovery service will be disabled!"));
return false;
}
TSharedPtr<IPlugin> Plugin = IPluginManager::Get().FindPlugin(TEXT("UdpMessaging"));
if (!Plugin || !Plugin->IsEnabled())
{
// The UdpMessaging plugin should be added to the {appname}.Target.cs build file.
UE_LOG(CrashReportClientLog, Error, TEXT("The 'UDP Messaging' plugin is disabled. The Concert server only supports UDP protocol. Recovery service will be disabled!"));
return false;
}
// Setup the disaster recovery server configuration
UConcertServerConfig* ServerConfig = IConcertSyncServerModule::Get().ParseServerSettings(FCommandLine::Get());
#jira UE-87927 - Disaster Recovery doesn't restore a crash from a restored session - Added the ability to copy and restore a live session, preventing the need to archive it in first place, making the server exist fast (releasing the session lock very quickly) before showing the crash UI and before the next Editor instance could starts. Details: This bug could manifest if various ways. An issue causing this bug was fixed in 11252374. This bug can also be observed if the crash reporting process doesn't release its lock on the crashed session quickly. Archiving a session may takes several minutes (depending on the session size) and while a session is archiving, its database is locked and cannot be restored until the archiving process complets. When the Editor reboots after a crash, it searches for a session to recover, but skip over any session that is mounted/locked assuming the session is concurrently used by a concurrent Editor process, potentially preventing it from restoring. The optimal way to work around this problem is to skip the archiving step. Instead, the live session is never archived (saving a copy), which allows the recovery service to shutdown and release the session lock very quickly ensuring that the session will be unlocked when the Editor restarts. On Editor start, it a crashed session is found and the user decides to restore it, the live session is copied into a new live session. This changelist also affect those other jira in the following ways: #jira UE-87899 - Disaster recovery prevents showing the crash reporting UI in a timely manner if the session is large - This CL changes execution order to shut down the recovery service ASAP to release the lock, but the optimization above make it super fast, so the UI should always be shown in a timely manner. 
#jira UE-87927 - Disaster Recovery doesn't restore a crash from a restored session - This CL ensures the recovery service release the session lock faster than the next instance of the Editor can start. #jira UE-87900 - Disaster Recovery stops recording transactions if the UDP transport layer restarts or auto-repair #jira UE-88517 - Concert Log Spam - (ConcertKeepAlive) discarded - This CL fixes an issues with endpoints timeout logic. #jira UE-81049 - Clean up the DisasterRecovery Intermediate directory - This CL added code to clean up the intermediate directory left over by crashed client. #rb Francis.Hurteau #ROBOMERGE-SOURCE: CL 11632069 in //UE4/Release-4.25/... via CL 11632084 #ROBOMERGE-BOT: RELEASE (Release-4.25Plus -> Main) (v655-11596533) [CL 11632094 by patrick laflamme in Main branch]
2020-02-26 11:18:30 -05:00
ServerConfig->bAutoArchiveOnReboot = false; // Skip archiving, disaster recovery restore a live session by copying it, this saves the step of archiving.
ServerConfig->bAutoArchiveOnShutdown = false; // Skip archiving, this can takes several minutes. It is more efficient to let the session 'live' and 'copy' it when restoring.
ServerConfig->EndpointSettings.RemoteEndpointTimeoutSeconds = 0; // Ensure the endpoints never time out (and are kept alive automatically by Concert).
#jira UE-83339 - Disaster Recovery can fail to recover its session when the project is opened from the Project Browser - Fixed a disaster recovery bug preventing the Editor from recovering a session because another instance of the Editor on another project already locked all the sessions. Problem: On windows, the CrashReportClientEditor (hosting disaster recovery service) is started in the static initialization, before the engine is initialized, not allowing lot of command line configuration. The Editor project browser would start a first CrashReportClientEditor instance, which would load and lock all the available sessions (unless another CrashReportClientEditor was running). When the user selected a project, a new Editor and CrashReportClientEditor were launched before the first one was closed. The second instance could not access the existing sessions because they were still locked by the first instance. Solution: Because CrashReportClientEditor is launch before the engine is initialized, we don't have any context at the launch time. The best the was to delay the moment when the server reloads the existing sessions and enable each clients to store their sessions in different folders (repositories) mounted on demand by the server. Implementation details: - Implemented new RPC API to allow the client to list/create/load/drop specific repositories containing its own sessions on demand. - Updated the Concert server to manage multiples directories where session can be stored/found (session repositories) rather than just one. - Added a settings to allow the user to specify where the disaster recovery sessions should be stored on the disk. Now default in the current project folder. - Added a settings to prevent the Concert server from scanning the sessions in the default location. - Updated disaster recovery to start without any session repository and let the client decide if a new one needs to be created or an existing one be mounted to restore a previous session. 
- Changed the code to let disaster recovery client manage its session history rather than letting the server rotate the old session. Defaulted the history to 0, user has no flow to visualize and pick from the history. #rb Jamie.Dale #ROBOMERGE-SOURCE: CL 10260823 in //UE4/Release-4.24/... #ROBOMERGE-BOT: RELEASE (Release-4.24 -> Main) (v591-10236483) [CL 10260830 by patrick laflamme in Main branch]
2019-11-15 12:55:57 -05:00
ServerConfig->bMountDefaultSessionRepository = false; // Let the client mount its own repository to support concurrent recovery server and prevent them from concurrently accessing non-sharable database files.
ServerConfig->AuthorizedClientKeys.Add(ServerConfig->ServerName); // The disaster recovery client is configured to use the unique server name as key to identify itself.
FConcertSessionFilter AutoArchiveSessionFilter;
AutoArchiveSessionFilter.bIncludeIgnoredActivities = true;
// Start disaster recovery server.
Server = IConcertSyncServerModule::Get().CreateServer(TEXT("DisasterRecovery"), AutoArchiveSessionFilter);
Server->SetFileSharingService(MakeShared<FConcertLocalFileSharingService>(Server->GetConcertServer()->GetRole()));
Server->Startup(ServerConfig, EConcertSyncSessionFlags::Default_DisasterRecoverySession);
UE_LOG(CrashReportClientLog, Display, TEXT("%s Initialized (Name: %s, Version: %d.%d, Role: %s)"), RecoveryServiceName, *Server->GetConcertServer()->GetServerInfo().ServerName, ENGINE_MAJOR_VERSION, ENGINE_MINOR_VERSION, *Server->GetConcertServer()->GetRole());
return true;
}
/**
 * Stops the recovery server, if one is running, and releases this service's reference to it.
 * Does nothing (and logs nothing) when no server is held.
 */
void FRecoveryService::Shutdown()
{
	if (!Server)
	{
		return; // Never started or already shut down.
	}

	Server->Shutdown();
	Server.Reset();

	UE_LOG(CrashReportClientLog, Display, TEXT("%s Shutdown"), RecoveryServiceName);
}
/**
 * Finds the most recently created recovery session hosted by the server.
 *
 * Candidate sessions are those whose name starts with the server name. Among them, the one with the
 * highest sequence number embedded in its name is selected.
 *
 * @return The ID of the selected session, or a default-constructed (invalid) FGuid if the server is
 *         not running or no matching session exists.
 */
FGuid FRecoveryService::GetRecoverySessionId() const
{
	FGuid SessionId;
	if (!Server)
	{
		return SessionId; // No server, no session: report "not found" rather than dereferencing a null pointer.
	}

	// Loop-invariant: every candidate is matched against the same server name.
	const FString ServerName = Server->GetConcertServer()->GetServerInfo().ServerName;

	int32 SessionSeqNum = -1; // Highest sequence number seen so far.

	// The sessions inspected here are the ones the running server currently reports.
	for (const TSharedPtr<IConcertServerSession>& Session : Server->GetConcertServer()->GetSessions())
	{
		// As convention, the disaster recovery session names starts with the server name, followed by a sequence number, the project name and date time. (See RecoveryService::MakeSessionName())
		if (!Session->GetName().StartsWith(ServerName))
		{
			continue;
		}

		// The user may have enabled/disabled the recovery service few times and as result, several live sessions will be available. Need to pick the last one. The highest sequence number
		// in the session name corresponds to the last session created.
		int32 SeqNum = 0;
		RecoveryService::TokenizeSessionName(Session->GetName(), nullptr, &SeqNum, nullptr, nullptr);
		if (SeqNum > SessionSeqNum)
		{
			// Remember the new maximum. Without this update every candidate would pass the comparison and the
			// last session iterated would win regardless of its sequence number.
			SessionSeqNum = SeqNum;
			SessionId = Session->GetId();
		}
	}

	return SessionId; // Uninitialized Guid (invalid) means not found.
}
#endif