From fbe41916cb50c584a744412c2845f44da81ae1af Mon Sep 17 00:00:00 2001
From: Sebastian Lackner <sebastian@fds-team.de>
Date: Thu, 22 Jan 2015 01:05:36 +0100
Subject: [PATCH] Properly wrap CUDA stream callbacks by forwarding them to a
 separate worker thread.

---
 debian/changelog                              |   1 +
 ...wrap-stream-callbacks-by-forwarding-.patch | 209 ++++++++++++++++++
 patches/patchinstall.sh                       |   2 +
 3 files changed, 212 insertions(+)
 create mode 100644 patches/nvcuda-CUDA_Support/0007-nvcuda-Properly-wrap-stream-callbacks-by-forwarding-.patch

diff --git a/debian/changelog b/debian/changelog
index bd47f86b..8539cd53 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -4,6 +4,7 @@ wine-staging (1.7.35) UNRELEASED; urgency=low
   * Automatically enable fallback method to apply patches when running from inside of a git subdirectory.
   * Synchronize CSMT patchset with https://github.com/stefand/wine.
   * Several improvements to make nvcuvid (CUDA video decoding) better compatible with x86_64.
+  * Properly wrap CUDA stream callbacks by forwarding them to a separate worker thread.
   * Added patch to quote program name in ShellExecute[Ex] when it contains spaces.
   * Added patch to implement support for DDS file format in D3DXSaveTextureToFileInMemory.
   * Added patch to avoid appending duplicate NULL character when importing keys with regedit.
diff --git a/patches/nvcuda-CUDA_Support/0007-nvcuda-Properly-wrap-stream-callbacks-by-forwarding-.patch b/patches/nvcuda-CUDA_Support/0007-nvcuda-Properly-wrap-stream-callbacks-by-forwarding-.patch
new file mode 100644
index 00000000..c444334c
--- /dev/null
+++ b/patches/nvcuda-CUDA_Support/0007-nvcuda-Properly-wrap-stream-callbacks-by-forwarding-.patch
@@ -0,0 +1,209 @@
+From 4cb6fdc37124222d6e661d8db89fc832f98da9cb Mon Sep 17 00:00:00 2001
+From: Sebastian Lackner <sebastian@fds-team.de>
+Date: Thu, 22 Jan 2015 01:02:53 +0100
+Subject: nvcuda: Properly wrap stream callbacks by forwarding them to a worker
+ thread.
+
+---
+ dlls/nvcuda/Makefile.in |   1 +
+ dlls/nvcuda/nvcuda.c    | 136 ++++++++++++++++++++++++++++++++++++++++++------
+ 2 files changed, 122 insertions(+), 15 deletions(-)
+
+diff --git a/dlls/nvcuda/Makefile.in b/dlls/nvcuda/Makefile.in
+index 6322fb2..98541b1 100644
+--- a/dlls/nvcuda/Makefile.in
++++ b/dlls/nvcuda/Makefile.in
+@@ -1,4 +1,5 @@
+ MODULE    = nvcuda.dll
++EXTRALIBS = $(PTHREAD_LIBS)
+ 
+ C_SRCS = \
+ 	nvcuda.c \
+diff --git a/dlls/nvcuda/nvcuda.c b/dlls/nvcuda/nvcuda.c
+index 9fefc28..8b356fd 100644
+--- a/dlls/nvcuda/nvcuda.c
++++ b/dlls/nvcuda/nvcuda.c
+@@ -21,11 +21,17 @@
+ #include "wine/port.h"
+ 
+ #include <stdarg.h>
++#include <assert.h>
++
++#ifdef HAVE_PTHREAD_H
++#include <pthread.h>
++#endif
+ 
+ #include "windef.h"
+ #include "winbase.h"
+ #include "wine/library.h"
+ #include "wine/debug.h"
++#include "wine/list.h"
+ #include "wine/wgl.h"
+ #include "cuda.h"
+ #include "nvcuda.h"
+@@ -39,6 +45,30 @@
+ 
+ WINE_DEFAULT_DEBUG_CHANNEL(nvcuda);
+ 
++struct stream_callback_entry
++{
++    struct list entry; /* link in the stream_callbacks queue */
++    enum
++    {
++        STREAM_CALLBACK_ABANDONED, /* registration failed, worker only frees the entry */
++        STREAM_CALLBACK_PENDING,   /* queued, waiting for the worker to run the callback */
++        STREAM_CALLBACK_EXECUTED   /* callback has returned */
++    } status;
++    void (WINAPI *callback)(CUstream hStream, CUresult status, void *userData);
++    struct
++    {
++        CUstream stream;
++        CUresult status;
++        void *userdata;
++    } args; /* values passed to callback by the worker thread */
++};
++
++static struct list stream_callbacks            = LIST_INIT( stream_callbacks ); /* work queue */
++static pthread_mutex_t stream_callback_mutex   = PTHREAD_MUTEX_INITIALIZER; /* guards queue + counter */
++static pthread_cond_t  stream_callback_request = PTHREAD_COND_INITIALIZER; /* entry was queued */
++static pthread_cond_t  stream_callback_reply   = PTHREAD_COND_INITIALIZER; /* callback finished */
++static LONG num_stream_callbacks; /* registered callbacks not yet handled by the worker */
++
+ static CUresult (*pcuArray3DCreate)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
+ static CUresult (*pcuArray3DCreate_v2)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
+ static CUresult (*pcuArray3DGetDescriptor)(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+@@ -1793,40 +1823,116 @@ CUresult WINAPI wine_cuPointerSetAttribute(const void *value, CUpointer_attribut
+     return pcuPointerSetAttribute(value, attribute, ptr);
+ }
+ 
+-struct stream_callback
++static DWORD WINAPI stream_callback_worker_thread(LPVOID parameter)
+ {
+-    void (WINAPI *callback)(CUstream hStream, CUresult status, void *userData);
+-    void *userData;
+-};
++    struct stream_callback_entry *wrapper;
++    struct list *ptr;
++    pthread_mutex_lock(&stream_callback_mutex); /* held while touching the queue or counter */
++
++    for (;;)
++    {
++        while ((ptr = list_head(&stream_callbacks)))
++        {
++            wrapper = LIST_ENTRY(ptr, struct stream_callback_entry, entry);
++            list_remove(&wrapper->entry);
++
++            switch (wrapper->status)
++            {
++                case STREAM_CALLBACK_ABANDONED:
++                    free(wrapper);
++                    break;
++
++                case STREAM_CALLBACK_PENDING:
++                    pthread_mutex_unlock(&stream_callback_mutex); /* never run app code under the lock */
++
++                    TRACE("calling stream callback %p(%p, %d, %p)\n", wrapper->callback,
++                          wrapper->args.stream, wrapper->args.status, wrapper->args.userdata);
++                    wrapper->callback(wrapper->args.stream, wrapper->args.status, wrapper->args.userdata);
++                    TRACE("stream callback %p returned\n", wrapper->callback);
++
++                    pthread_mutex_lock(&stream_callback_mutex); /* lock first: waiter must not miss the wakeup */
++                    wrapper->status = STREAM_CALLBACK_EXECUTED;
++                    pthread_cond_broadcast(&stream_callback_reply);
++                    break;
++
++                default:
++                    assert(0); /* never reached */
++            }
++
++            if (!--num_stream_callbacks) /* nothing left outstanding: let the thread exit */
++                goto end;
++        }
++
++        pthread_cond_wait(&stream_callback_request, &stream_callback_mutex);
++    }
++
++end:
++    pthread_mutex_unlock(&stream_callback_mutex);
++    return 0;
++}
+ 
+ static void stream_callback_wrapper(CUstream hStream, CUresult status, void *userData)
+ {
+-    struct stream_callback *wrapper = userData;
+-    TRACE("(%p, %d, %p)\n", hStream, status, userData);
++    struct stream_callback_entry *wrapper = userData; /* entry allocated by wine_cuStreamAddCallback */
++    wrapper->status         = STREAM_CALLBACK_PENDING;
++    wrapper->args.stream    = hStream;
++    wrapper->args.status    = status;
++    pthread_mutex_lock(&stream_callback_mutex);
+ 
+-    TRACE("calling stream callback %p(%p, %d, %p)\n", wrapper->callback, hStream, status, wrapper->userData);
+-    wrapper->callback(hStream, status, wrapper->userData);
+-    TRACE("stream callback %p returned\n", wrapper->callback);
++    list_add_tail(&stream_callbacks, &wrapper->entry); /* hand the request to the worker thread */
++    pthread_cond_signal(&stream_callback_request);
++    while (wrapper->status == STREAM_CALLBACK_PENDING) /* wait until the worker ran the callback */
++        pthread_cond_wait(&stream_callback_reply, &stream_callback_mutex);
+ 
+-    HeapFree( GetProcessHeap(), 0, wrapper );
++    pthread_mutex_unlock(&stream_callback_mutex);
++    free(wrapper); /* entry was already unlinked by the worker */
+ }
+ 
+ CUresult WINAPI wine_cuStreamAddCallback(CUstream hStream, void *callback, void *userData, unsigned int flags)
+ {
+-    struct stream_callback *wrapper;
++    struct stream_callback_entry *wrapper;
+     CUresult ret;
+ 
+     TRACE("(%p, %p, %p, %u)\n", hStream, callback, userData, flags);
+ 
+-    wrapper = HeapAlloc( GetProcessHeap(), 0, sizeof(*wrapper) );
++    wrapper = malloc(sizeof(*wrapper));
+     if (!wrapper)
+         return CUDA_ERROR_OUT_OF_MEMORY;
++    wrapper->callback       = callback;
++    wrapper->args.userdata  = userData;
+ 
+-    wrapper->callback = callback;
+-    wrapper->userData = userData;
++    /* spawn a new worker thread if necessary */
++    pthread_mutex_lock(&stream_callback_mutex);
++    if (!num_stream_callbacks++)
++    {
++        HANDLE thread = CreateThread(NULL, 0, stream_callback_worker_thread, NULL, 0, NULL);
++        if (!thread)
++        {
++            num_stream_callbacks--;
++            pthread_mutex_unlock(&stream_callback_mutex);
++            free(wrapper);
++            return CUDA_ERROR_OUT_OF_MEMORY; /* FIXME */
++        }
++        CloseHandle(thread); /* the handle is not used afterwards */
++    }
++    pthread_mutex_unlock(&stream_callback_mutex);
+ 
+     ret = pcuStreamAddCallback(hStream, stream_callback_wrapper, wrapper, flags);
+-    if (ret) HeapFree( GetProcessHeap(), 0, wrapper );
++    if (ret)
++    {
++        pthread_mutex_lock(&stream_callback_mutex);
++        if (num_stream_callbacks == 1) /* only our count is left: queue a dummy so the worker exits */
++        {
++            wrapper->status = STREAM_CALLBACK_ABANDONED;
++            list_add_tail(&stream_callbacks, &wrapper->entry);
++            pthread_cond_signal(&stream_callback_request);
++            wrapper = NULL; /* entry is now freed by the worker thread */
++        }
++        else num_stream_callbacks--;
++        pthread_mutex_unlock(&stream_callback_mutex);
++        free(wrapper); /* no-op when the entry was handed to the worker */
++    }
++
+     return ret;
+ }
+ 
+-- 
+2.2.1
+
diff --git a/patches/patchinstall.sh b/patches/patchinstall.sh
index 533e983e..40f6ba52 100755
--- a/patches/patchinstall.sh
+++ b/patches/patchinstall.sh
@@ -2204,6 +2204,7 @@ if test "$enable_nvcuda_CUDA_Support" -eq 1; then
 	patch_apply nvcuda-CUDA_Support/0004-nvcuda-Implement-new-functions-added-in-CUDA-6.5.patch
 	patch_apply nvcuda-CUDA_Support/0005-nvcuda-Properly-wrap-undocumented-ContextStorage-int.patch
 	patch_apply nvcuda-CUDA_Support/0006-nvcuda-Emulate-two-d3d9-initialization-functions.patch
+	patch_apply nvcuda-CUDA_Support/0007-nvcuda-Properly-wrap-stream-callbacks-by-forwarding-.patch
 	(
 		echo '+    { "Sebastian Lackner", "include: Add cuda.h.h.", 1 },';
 		echo '+    { "Sebastian Lackner", "nvcuda: Add stub dll.", 1 },';
@@ -2211,6 +2212,7 @@ if test "$enable_nvcuda_CUDA_Support" -eq 1; then
 		echo '+    { "Sebastian Lackner", "nvcuda: Implement new functions added in CUDA 6.5.", 1 },';
 		echo '+    { "Michael Müller", "nvcuda: Properly wrap undocumented '\''ContextStorage'\'' interface and add tests.", 1 },';
 		echo '+    { "Michael Müller", "nvcuda: Emulate two d3d9 initialization functions.", 1 },';
+		echo '+    { "Sebastian Lackner", "nvcuda: Properly wrap stream callbacks by forwarding them to a worker thread.", 1 },';
 	) >> "$patchlist"
 fi