2019-12-26 23:01:54 -05:00
// Copyright Epic Games, Inc. All Rights Reserved.
2019-10-03 16:26:48 -04:00
# include "RecoveryService.h"
# if CRASH_REPORT_WITH_RECOVERY
# include "HAL/FileManager.h"
# include "CrashReportClient.h" // For CrashReportClientLog
# include "Interfaces/IPluginManager.h"
# include "IMessagingModule.h"
# include "ConcertSettings.h"
# include "ConcertSyncSessionFlags.h"
# include "IConcertServer.h"
# include "IConcertSession.h"
# include "IConcertSyncServer.h"
# include "IConcertSyncServerModule.h"
# include "ConcertMessageData.h"
# include "Runtime/Launch/Resources/Version.h"
2020-03-10 14:25:48 -04:00
# include "ConcertLocalFileSharingService.h"
2019-10-03 16:26:48 -04:00
// Human-readable name of this service, used as a prefix/argument in the log messages emitted by this file.
static const TCHAR RecoveryServiceName [ ] = TEXT ( " Disaster Recovery Service " ) ;
bool FRecoveryService : : CollectFiles ( const FString & DestDir , bool bMetaDataOnly , bool bAnonymizeMetaData )
{
auto LogError = [ ] ( const TCHAR * Reason )
{
UE_LOG ( CrashReportClientLog , Error , TEXT ( " Failed to collect recovery session file(s). %s " ) , Reason ) ;
} ;
if ( ! Server )
{
LogError ( TEXT ( " The recovery service is not running. " ) ) ;
return false ;
}
else if ( ! IFileManager : : Get ( ) . DirectoryExists ( * DestDir ) )
{
LogError ( TEXT ( " The destination folder doesn't exist. " ) ) ;
return false ;
}
FGuid ExportedSessionId = GetRecoverySessionId ( ) ;
if ( ! ExportedSessionId . IsValid ( ) )
{
LogError ( TEXT ( " The session session could not be found. " ) ) ;
return false ;
}
FText ErrorMsg ;
FConcertSessionFilter Filter ;
Filter . bMetaDataOnly = bMetaDataOnly ;
if ( ! Server - > GetConcertServer ( ) - > ExportSession ( ExportedSessionId , Filter , DestDir , bAnonymizeMetaData , ErrorMsg ) )
{
LogError ( TEXT ( " Server failed to export the session. " ) ) ;
return false ;
}
return true ;
}
bool FRecoveryService : : Startup ( )
{
# if UE_BUILD_SHIPPING && (!defined(PLATFORM_SUPPORTS_MESSAGEBUS) || !PLATFORM_SUPPORTS_MESSAGEBUS)
# error PLATFORM_SUPPORTS_MESSAGEBUS was explicitly defined in CrashReportClient.Target.cs for shipping configuration. MessageBus is required by Concert. Ensure it is still enabled.
# endif
if ( ! IMessagingModule : : Get ( ) . GetDefaultBus ( ) )
{
UE_LOG ( CrashReportClientLog , Error , TEXT ( " MessageBus is not enabled in this configuration. Recovery service will be disabled! " ) ) ;
return false ;
}
if ( ! IConcertSyncServerModule : : IsAvailable ( ) )
{
UE_LOG ( CrashReportClientLog , Error , TEXT ( " ConcertSyncServer Module is missing. Recovery service will be disabled! " ) ) ;
return false ;
}
2021-04-29 19:32:06 -04:00
TSharedPtr < IPlugin > Plugin = IPluginManager : : Get ( ) . FindPlugin ( TEXT ( " UdpMessaging " ) ) ;
2019-10-03 16:26:48 -04:00
if ( ! Plugin | | ! Plugin - > IsEnabled ( ) )
{
// The UdpMessaging plugin should be added to the {appname}.Target.cs build file.
UE_LOG ( CrashReportClientLog , Error , TEXT ( " The 'UDP Messaging' plugin is disabled. The Concert server only supports UDP protocol. Recovery service will be disabled! " ) ) ;
return false ;
}
// Setup the disaster recovery server configuration
UConcertServerConfig * ServerConfig = IConcertSyncServerModule : : Get ( ) . ParseServerSettings ( FCommandLine : : Get ( ) ) ;
#jira UE-87927 - Disaster Recovery doesn't restore a crash from a restored session
- Added the ability to copy and restore a live session, preventing the need to archive it in first place, making the server exist fast (releasing the session lock very quickly) before showing the crash UI and before the next Editor instance could starts.
Details:
This bug could manifest if various ways. An issue causing this bug was fixed in 11252374. This bug can also be observed if the crash reporting process doesn't release its lock on the crashed session quickly. Archiving a session may takes several minutes (depending on the session size) and while a session is archiving, its database is locked and cannot be restored until the archiving process complets. When the Editor reboots after a crash, it searches for a session to recover, but skip over any session that is mounted/locked assuming the session is concurrently used by a concurrent Editor process, potentially preventing it from restoring. The optimal way to work around this problem is to skip the archiving step. Instead, the live session is never archived (saving a copy), which allows the recovery service to shutdown and release the session lock very quickly ensuring that the session will be unlocked when the Editor restarts. On Editor start, it a crashed session is found and the user decides to restore it, the live session is copied into a new live session.
This changelist also affect those other jira in the following ways:
#jira UE-87899 - Disaster recovery prevents showing the crash reporting UI in a timely manner if the session is large
- This CL changes execution order to shut down the recovery service ASAP to release the lock, but the optimization above make it super fast, so the UI should always be shown in a timely manner.
#jira UE-87927 - Disaster Recovery doesn't restore a crash from a restored session
- This CL ensures the recovery service release the session lock faster than the next instance of the Editor can start.
#jira UE-87900 - Disaster Recovery stops recording transactions if the UDP transport layer restarts or auto-repair
#jira UE-88517 - Concert Log Spam - (ConcertKeepAlive) discarded
- This CL fixes an issues with endpoints timeout logic.
#jira UE-81049 - Clean up the DisasterRecovery Intermediate directory
- This CL added code to clean up the intermediate directory left over by crashed client.
#rb Francis.Hurteau
#ROBOMERGE-SOURCE: CL 11632069 in //UE4/Release-4.25/... via CL 11632084
#ROBOMERGE-BOT: RELEASE (Release-4.25Plus -> Main) (v655-11596533)
[CL 11632094 by patrick laflamme in Main branch]
2020-02-26 11:18:30 -05:00
ServerConfig - > bAutoArchiveOnReboot = false ; // Skip archiving, disaster recovery restore a live session by copying it, this saves the step of archiving.
ServerConfig - > bAutoArchiveOnShutdown = false ; // Skip archiving, this can takes several minutes. It is more efficient to let the session 'live' and 'copy' it when restoring.
ServerConfig - > EndpointSettings . RemoteEndpointTimeoutSeconds = 0 ; // Ensure the endpoints never time out (and are kept alive automatically by Concert).
2019-11-15 12:55:57 -05:00
ServerConfig - > bMountDefaultSessionRepository = false ; // Let the client mount its own repository to support concurrent recovery server and prevent them from concurrently accessing non-sharable database files.
2019-11-27 09:00:18 -05:00
ServerConfig - > AuthorizedClientKeys . Add ( ServerConfig - > ServerName ) ; // The disaster recovery client is configured to use the unique server name as key to identify itself.
2019-10-03 16:26:48 -04:00
FConcertSessionFilter AutoArchiveSessionFilter ;
AutoArchiveSessionFilter . bIncludeIgnoredActivities = true ;
// Start disaster recovery server.
Server = IConcertSyncServerModule : : Get ( ) . CreateServer ( TEXT ( " DisasterRecovery " ) , AutoArchiveSessionFilter ) ;
2020-03-10 14:25:48 -04:00
Server - > SetFileSharingService ( MakeShared < FConcertLocalFileSharingService > ( Server - > GetConcertServer ( ) - > GetRole ( ) ) ) ;
2019-10-03 16:26:48 -04:00
Server - > Startup ( ServerConfig , EConcertSyncSessionFlags : : Default_DisasterRecoverySession ) ;
UE_LOG ( CrashReportClientLog , Display , TEXT ( " %s Initialized (Name: %s, Version: %d.%d, Role: %s) " ) , RecoveryServiceName , * Server - > GetConcertServer ( ) - > GetServerInfo ( ) . ServerName , ENGINE_MAJOR_VERSION , ENGINE_MINOR_VERSION , * Server - > GetConcertServer ( ) - > GetRole ( ) ) ;
return true ;
}
void FRecoveryService : : Shutdown ( )
{
if ( Server )
{
Server - > Shutdown ( ) ;
Server . Reset ( ) ;
UE_LOG ( CrashReportClientLog , Display , TEXT ( " %s Shutdown " ) , RecoveryServiceName ) ;
}
}
/**
 * Finds the most recently created recovery session hosted by the server.
 *
 * By convention, disaster recovery session names start with the server name, followed by a sequence
 * number, the project name and a date/time (see RecoveryService::MakeSessionName()). The user may have
 * enabled/disabled the recovery service several times, leaving several live sessions; the one with the
 * highest sequence number is the last one created.
 *
 * @return The id of the matching session with the highest sequence number, or a default-constructed
 *         (invalid) FGuid if the server is not running or no session matches.
 */
FGuid FRecoveryService::GetRecoverySessionId() const
{
	FGuid SessionId; // Stays invalid unless a matching session is found.

	if (!Server)
	{
		return SessionId;
	}

	// Loop-invariant: the prefix every recovery session name of this server starts with.
	const FString ServerName = Server->GetConcertServer()->GetServerInfo().ServerName;

	int32 SessionSeqNum = -1; // Highest sequence number seen so far.
	for (const TSharedPtr<IConcertServerSession>& Session : Server->GetConcertServer()->GetSessions())
	{
		if (!Session->GetName().StartsWith(ServerName))
		{
			continue;
		}

		int32 SeqNum = 0;
		RecoveryService::TokenizeSessionName(Session->GetName(), nullptr, &SeqNum, nullptr, nullptr);
		if (SeqNum > SessionSeqNum)
		{
			// Remember the new maximum so that a later session with a lower sequence number cannot replace it.
			SessionSeqNum = SeqNum;
			SessionId = Session->GetId();
		}
	}

	return SessionId;
}
# endif