From 54113f35e048dc27653ca9a1c84fea1ac48ee069 Mon Sep 17 00:00:00 2001 From: shamanDevel Date: Mon, 2 May 2016 14:21:02 +0200 Subject: [PATCH] added alternative versions for kernel launches and resource acquiring that do not return an event. This improves the performance. --- .../src/main/java/com/jme3/opencl/Buffer.java | 38 +++++++++++- .../src/main/java/com/jme3/opencl/Image.java | 38 ++++++++++++ .../src/main/java/com/jme3/opencl/Kernel.java | 62 ++++++++++++++++++- .../java/com/jme3/opencl/package-info.java | 14 +++++ .../opencl/TestVertexBufferSharing.java | 6 +- .../jme3test/opencl/TestWriteToTexture.java | 8 +-- .../java/com/jme3/opencl/jocl/JoclBuffer.java | 14 +++++ .../java/com/jme3/opencl/jocl/JoclImage.java | 14 +++++ .../java/com/jme3/opencl/jocl/JoclKernel.java | 18 ++++++ .../com/jme3/opencl/lwjgl/LwjglBuffer.java | 12 ++++ .../com/jme3/opencl/lwjgl/LwjglImage.java | 13 ++++ .../com/jme3/opencl/lwjgl/LwjglKernel.java | 17 +++++ 12 files changed, 244 insertions(+), 10 deletions(-) diff --git a/jme3-core/src/main/java/com/jme3/opencl/Buffer.java b/jme3-core/src/main/java/com/jme3/opencl/Buffer.java index 66ce0c8f7..ebb7d2bb2 100644 --- a/jme3-core/src/main/java/com/jme3/opencl/Buffer.java +++ b/jme3-core/src/main/java/com/jme3/opencl/Buffer.java @@ -377,6 +377,29 @@ public abstract class Buffer extends AbstractOpenCLObject { * @return the event object */ public abstract Event acquireBufferForSharingAsync(CommandQueue queue); + + /** + * Acquires this buffer object for using. Only call this method if this buffer + * represents a shared object from OpenGL, created with e.g. + * {@link Context#bindVertexBuffer(com.jme3.scene.VertexBuffer, com.jme3.opencl.MemoryAccess) }. + * This method must be called before the buffer is used. After the work is + * done, the buffer must be released by calling + * {@link #releaseBufferForSharingAsync(com.jme3.opencl.CommandQueue) } + * so that OpenGL can use the VertexBuffer again.
+ * + * The generated event object is directly released. + * This brings a performance improvement when the resource is e.g. directly + * used by a kernel afterwards on the same queue (this implicitly waits for + * this action). If you need the event, use + * {@link #acquireBufferForSharingAsync(com.jme3.opencl.CommandQueue) } instead. + * + * @param queue the command queue + */ + public void acquireBufferForSharingNoEvent(CommandQueue queue) { + //default implementation, overwrite for better performance + acquireBufferForSharingAsync(queue).release(); + } + /** * Releases a shared buffer object. * Call this method after the buffer object was acquired by @@ -387,5 +410,18 @@ public abstract class Buffer extends AbstractOpenCLObject { */ public abstract Event releaseBufferForSharingAsync(CommandQueue queue); - //TODO: add variants of the above two methods that don't create the event object, but release the event immediately + /** + * Releases a shared buffer object. + * Call this method after the buffer object was acquired by + * {@link #acquireBufferForSharingAsync(com.jme3.opencl.CommandQueue) } + * to hand the control back to OpenGL. + * The generated event object is directly released, resulting in + * performance improvements. + * @param queue the command queue + */ + public void releaseBufferForSharingNoEvent(CommandQueue queue) { + //default implementation, overwrite for better performance + releaseBufferForSharingAsync(queue).release(); + } + } diff --git a/jme3-core/src/main/java/com/jme3/opencl/Image.java b/jme3-core/src/main/java/com/jme3/opencl/Image.java index e58c20f7e..f9d9d9f28 100644 --- a/jme3-core/src/main/java/com/jme3/opencl/Image.java +++ b/jme3-core/src/main/java/com/jme3/opencl/Image.java @@ -485,6 +485,30 @@ memory layout in which channels are stored in the image. * @return the event object */ public abstract Event acquireImageForSharingAsync(CommandQueue queue); + + /** + * Acquires this image object for using.
Only call this method if this image + * represents a shared object from OpenGL, created with e.g. + * {@link Context#bindImage(com.jme3.texture.Image, com.jme3.texture.Texture.Type, int, com.jme3.opencl.MemoryAccess) } + * or variations. + * This method must be called before the image is used. After the work is + * done, the image must be released by calling + * {@link #releaseImageForSharingAsync(com.jme3.opencl.CommandQueue) } + * so that OpenGL can use the image/texture/renderbuffer again. + * + * The generated event object is directly released. + * This brings a performance improvement when the resource is e.g. directly + * used by a kernel afterwards on the same queue (this implicitly waits for + * this action). If you need the event, use + * {@link #acquireImageForSharingAsync(com.jme3.opencl.CommandQueue) }. + * + * @param queue the command queue + */ + public void acquireImageForSharingNoEvent(CommandQueue queue) { + //Default implementation, overwrite for performance + acquireImageForSharingAsync(queue).release(); + } + /** * Releases a shared image object. * Call this method after the image object was acquired by @@ -495,5 +519,19 @@ memory layout in which channels are stored in the image. */ public abstract Event releaseImageForSharingAsync(CommandQueue queue); + /** + * Releases a shared image object. + * Call this method after the image object was acquired by + * {@link #acquireImageForSharingAsync(com.jme3.opencl.CommandQueue) } + * to hand the control back to OpenGL. + * The generated event object is directly released, resulting in + * performance improvements. 
+ * @param queue the command queue + */ + public void releaseImageForSharingNoEvent(CommandQueue queue) { + //default implementation, overwrite it for performance improvements + releaseImageForSharingAsync(queue).release(); + } + //TODO: add variants of the above two methods that don't create the event object, but release the event immediately } diff --git a/jme3-core/src/main/java/com/jme3/opencl/Kernel.java b/jme3-core/src/main/java/com/jme3/opencl/Kernel.java index 82fbf9018..17525a4bd 100644 --- a/jme3-core/src/main/java/com/jme3/opencl/Kernel.java +++ b/jme3-core/src/main/java/com/jme3/opencl/Kernel.java @@ -299,6 +299,9 @@ public abstract class Kernel extends AbstractOpenCLObject { /** * Launches the kernel with the current global work size, work group size * and arguments. + * If the returned event object is not needed and would otherwise be + * released immediately, {@link #RunNoEvent(com.jme3.opencl.CommandQueue) } + * might bring a better performance. * @param queue the command queue * @return an event object indicating when the kernel is finished * @see #setGlobalWorkSize(com.jme3.opencl.Kernel.WorkSize) @@ -306,6 +309,22 @@ public abstract class Kernel extends AbstractOpenCLObject { * @see #setArg(int, java.lang.Object) */ public abstract Event Run(CommandQueue queue); + + /** + * Launches the kernel with the current global work size, work group size + * and arguments without returning an event object. + * The generated event is directly released. Therefore, the performance + * is better, but there is no way to detect when the kernel execution + * has finished. For this purpose, use {@link #Run(com.jme3.opencl.CommandQueue) }. 
+ * @param queue the command queue + * @see #setGlobalWorkSize(com.jme3.opencl.Kernel.WorkSize) + * @see #setWorkGroupSize(com.jme3.opencl.Kernel.WorkSize) + * @see #setArg(int, java.lang.Object) + */ + public void RunNoEvent(CommandQueue queue) { + //Default implementation, overwrite to not allocate the event object + Run(queue).release(); + } /** * Sets the work sizes and arguments in one call and launches the kernel. @@ -325,6 +344,28 @@ public abstract class Kernel extends AbstractOpenCLObject { setArgs(args); return Run(queue); } + + /** + * Sets the work sizes and arguments in one call and launches the kernel. + * The global work size is set to the specified size. The work group + * size is automatically determined by the driver. + * Each object in the argument array is sent to the kernel by + * {@link #setArg(int, java.lang.Object) }. + * The generated event is directly released. Therefore, the performance + * is better, but there is no way to detect when the kernel execution + * has finished. For this purpose, use + * {@link #Run1(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) }. + * @param queue the command queue + * @param globalWorkSize the global work size + * @param args the kernel arguments + * @see #Run2(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) + */ + public void Run1NoEvent(CommandQueue queue, WorkSize globalWorkSize, Object... args) { + setGlobalWorkSize(globalWorkSize); + setWorkGroupSizeToNull(); + setArgs(args); + RunNoEvent(queue); + } /** * Sets the work sizes and arguments in one call and launches the kernel. @@ -342,8 +383,25 @@ public abstract class Kernel extends AbstractOpenCLObject { return Run(queue); } - //TODO: add variants of the above three methods that don't create the event object, but release the event immediately - + /** + * Sets the work sizes and arguments in one call and launches the kernel. 
+ * The generated event is directly released. Therefore, the performance + * is better, but there is no way to detect when the kernel execution + * has finished. For this purpose, use + * {@link #Run2(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) }. + * @param queue the command queue + * @param globalWorkSize the global work size + * @param workGroupSize the work group size + * @param args the kernel arguments + */ + public void Run2NoEvent(CommandQueue queue, WorkSize globalWorkSize, + WorkSize workGroupSize, Object... args) { + setGlobalWorkSize(globalWorkSize); + setWorkGroupSize(workGroupSize); + setArgs(args); + RunNoEvent(queue); + } + /** * A placeholder for kernel arguments representing local kernel memory. * This defines the size of available shared memory of a {@code __shared} kernel diff --git a/jme3-core/src/main/java/com/jme3/opencl/package-info.java b/jme3-core/src/main/java/com/jme3/opencl/package-info.java index 5b4fcbc0b..c96a026d1 100644 --- a/jme3-core/src/main/java/com/jme3/opencl/package-info.java +++ b/jme3-core/src/main/java/com/jme3/opencl/package-info.java @@ -89,6 +89,16 @@ * These async calls all return {@link com.jme3.opencl.Event} objects. * These events can be used to check (non-blocking) if the action has completed, e.g. a memory copy * is finished, or to block the execution until the action has finished. + *
+ * Some methods have the suffix {@code -NoEvent}. This means that these methods + * don't return an event object even if the OpenCL function would return an event. + * There always exists an alternative version that does return an event. + * These methods exist to increase the performance: since all actions (like multiple kernel calls) + * that are sent to the same command queue are executed in order, there is no + * need for intermediate events. (These intermediate events would be released + * immediately). Therefore, the no-event alternatives increase the performance + * because no additional event object has to be allocated and fewer system calls + * are necessary. * *

* Interoperability between OpenCL and jME3:
@@ -142,6 +152,10 @@ * thrown. The exception always records the error code and error name and the * OpenCL function call where the error was detected. Please check the official * OpenCL specification for the meanings of these errors for that particular function. + *

  • {@code UnsupportedOperationException}: the OpenCL implementation does not + * support some operations. This is currently only an issue for Jogamp's Jogl + * renderer, since Jocl only supports OpenCL 1.1. LWJGL has full support for + * OpenCL 1.2 and 2.0. * */ package com.jme3.opencl; diff --git a/jme3-examples/src/main/java/jme3test/opencl/TestVertexBufferSharing.java b/jme3-examples/src/main/java/jme3test/opencl/TestVertexBufferSharing.java index 0d2257817..78e8e5de1 100644 --- a/jme3-examples/src/main/java/jme3test/opencl/TestVertexBufferSharing.java +++ b/jme3-examples/src/main/java/jme3test/opencl/TestVertexBufferSharing.java @@ -135,15 +135,15 @@ public class TestVertexBufferSharing extends SimpleApplication { time += tpf; //aquire resource - buffer.acquireBufferForSharingAsync(clQueue).release(); + buffer.acquireBufferForSharingNoEvent(clQueue); //no need to wait for the returned event, since the kernel implicitely waits for it (same command queue) //execute kernel float scale = (float) Math.pow(1.1, (1.0 - time%2) / 16.0); - kernel.Run1(clQueue, ws, buffer, scale).release(); + kernel.Run1NoEvent(clQueue, ws, buffer, scale); //release resource - buffer.releaseBufferForSharingAsync(clQueue).release(); + buffer.releaseBufferForSharingNoEvent(clQueue); } } \ No newline at end of file diff --git a/jme3-examples/src/main/java/jme3test/opencl/TestWriteToTexture.java b/jme3-examples/src/main/java/jme3test/opencl/TestWriteToTexture.java index c1f3e612f..5f3143832 100644 --- a/jme3-examples/src/main/java/jme3test/opencl/TestWriteToTexture.java +++ b/jme3-examples/src/main/java/jme3test/opencl/TestWriteToTexture.java @@ -136,15 +136,15 @@ public class TestWriteToTexture extends SimpleApplication implements AnalogListe } private void updateOpenCL(float tpf) { //aquire resource - texCL.acquireImageForSharingAsync(clQueue).release(); + texCL.acquireImageForSharingNoEvent(clQueue); //no need to wait for the returned event, since the kernel implicitely waits for it 
(same command queue) //execute kernel - kernel.Run1(clQueue, new com.jme3.opencl.Kernel.WorkSize(settings.getWidth(), settings.getHeight()), texCL, C, 16) - .release(); + Kernel.WorkSize ws = new Kernel.WorkSize(settings.getWidth(), settings.getHeight()); + kernel.Run1NoEvent(clQueue, ws, texCL, C, 16); //release resource - texCL.releaseImageForSharingAsync(clQueue).release(); + texCL.releaseImageForSharingNoEvent(clQueue); } @Override diff --git a/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclBuffer.java b/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclBuffer.java index 06d417827..23cc4dda9 100644 --- a/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclBuffer.java +++ b/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclBuffer.java @@ -191,6 +191,13 @@ public class JoclBuffer extends Buffer { long event = Utils.pointers[0].get(0); return new JoclEvent(event); } + @Override + public void acquireBufferForSharingNoEvent(CommandQueue queue) { + Utils.pointers[1].rewind(); + Utils.pointers[1].put(0, id); + long q = ((JoclCommandQueue)queue).id; + ((CLGL) cl).clEnqueueAcquireGLObjects(q, 1, Utils.pointers[1], 0, null, null); + } @Override public Event releaseBufferForSharingAsync(CommandQueue queue) { @@ -202,6 +209,13 @@ public class JoclBuffer extends Buffer { long event = Utils.pointers[0].get(0); return new JoclEvent(event); } + @Override + public void releaseBufferForSharingNoEvent(CommandQueue queue) { + Utils.pointers[1].rewind(); + Utils.pointers[1].put(0, id); + long q = ((JoclCommandQueue)queue).id; + ((CLGL) cl).clEnqueueReleaseGLObjects(q, 1, Utils.pointers[1], 0, null, null); + } private static class ReleaserImpl implements ObjectReleaser { private long mem; diff --git a/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclImage.java b/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclImage.java index 0041147f9..252debdf3 100644 --- a/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclImage.java +++ b/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclImage.java 
@@ -502,6 +502,13 @@ public class JoclImage extends Image { return new JoclEvent(event); } @Override + public void acquireImageForSharingNoEvent(CommandQueue queue) { + Utils.pointers[1].rewind(); + Utils.pointers[1].put(0, id); + long q = ((JoclCommandQueue)queue).id; + ((CLGL) cl).clEnqueueAcquireGLObjects(q, 1, Utils.pointers[1], 0, null, null); + } + @Override public Event releaseImageForSharingAsync(CommandQueue queue) { Utils.pointers[0].rewind(); Utils.pointers[1].rewind(); @@ -511,6 +518,13 @@ public class JoclImage extends Image { long event = Utils.pointers[0].get(0); return new JoclEvent(event); } + @Override + public void releaseImageForSharingNoEvent(CommandQueue queue) { + Utils.pointers[1].rewind(); + Utils.pointers[1].put(0, id); + long q = ((JoclCommandQueue)queue).id; + ((CLGL) cl).clEnqueueReleaseGLObjects(q, 1, Utils.pointers[1], 0, null, null); + } private static class ReleaserImpl implements ObjectReleaser { private long mem; diff --git a/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclKernel.java b/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclKernel.java index 43bac1a30..708e226e2 100644 --- a/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclKernel.java +++ b/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclKernel.java @@ -239,6 +239,24 @@ public class JoclKernel extends Kernel { Utils.checkError(ret, "clEnqueueNDRangeKernel"); return new JoclEvent(Utils.pointers[0].get(0)); } + + @Override + public void RunNoEvent(CommandQueue queue) { + Utils.pointers[1].rewind(); + Utils.pointers[1].put(globalWorkSize.getSizes(), 0, globalWorkSize.getSizes().length); + Utils.pointers[1].position(0); + PointerBuffer p2 = null; + if (workGroupSize.getSizes()[0] > 0) { + p2 = Utils.pointers[2].rewind(); + p2.put(workGroupSize.getSizes(), 0, workGroupSize.getSizes().length); + p2.position(0); + } + long q = ((JoclCommandQueue) queue).id; + int ret = cl.clEnqueueNDRangeKernel(q, kernel, + globalWorkSize.getDimension(), null, Utils.pointers[1], + p2, 0, 
null, null); + Utils.checkError(ret, "clEnqueueNDRangeKernel"); + } private static class ReleaserImpl implements ObjectReleaser { private long kernel; diff --git a/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglBuffer.java b/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglBuffer.java index 9a90c6eb1..bbf766df9 100644 --- a/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglBuffer.java +++ b/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglBuffer.java @@ -194,6 +194,12 @@ public class LwjglBuffer extends Buffer { long event = Utils.pointerBuffers[0].get(0); return new LwjglEvent(q.getCLEvent(event)); } + @Override + public void acquireBufferForSharingNoEvent(CommandQueue queue) { + CLCommandQueue q = ((LwjglCommandQueue) queue).getQueue(); + int ret = CL10GL.clEnqueueAcquireGLObjects(q, buffer, null, null); + Utils.checkError(ret, "clEnqueueAcquireGLObjects"); + } @Override public Event releaseBufferForSharingAsync(CommandQueue queue) { @@ -204,6 +210,12 @@ public class LwjglBuffer extends Buffer { long event = Utils.pointerBuffers[0].get(0); return new LwjglEvent(q.getCLEvent(event)); } + @Override + public void releaseBufferForSharingNoEvent(CommandQueue queue) { + CLCommandQueue q = ((LwjglCommandQueue) queue).getQueue(); + int ret = CL10GL.clEnqueueReleaseGLObjects(q, buffer, null, null); + Utils.checkError(ret, "clEnqueueReleaseGLObjects"); + } private static class ReleaserImpl implements ObjectReleaser { private CLMem mem; diff --git a/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglImage.java b/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglImage.java index 46b8b0ff7..b61629ed7 100644 --- a/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglImage.java +++ b/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglImage.java @@ -534,6 +534,13 @@ public class LwjglImage extends Image { long event = Utils.pointerBuffers[0].get(0); return new LwjglEvent(q.getCLEvent(event)); } + @Override + public void acquireImageForSharingNoEvent(CommandQueue 
queue) { + CLCommandQueue q = ((LwjglCommandQueue) queue).getQueue(); + int ret = CL10GL.clEnqueueAcquireGLObjects(q, image, null, null); + Utils.checkError(ret, "clEnqueueAcquireGLObjects"); + } + @Override public Event releaseImageForSharingAsync(CommandQueue queue) { Utils.pointerBuffers[0].rewind(); CLCommandQueue q = ((LwjglCommandQueue) queue).getQueue(); @@ -542,6 +549,12 @@ public class LwjglImage extends Image { long event = Utils.pointerBuffers[0].get(0); return new LwjglEvent(q.getCLEvent(event)); } + @Override + public void releaseImageForSharingNoEvent(CommandQueue queue) { + CLCommandQueue q = ((LwjglCommandQueue) queue).getQueue(); + int ret = CL10GL.clEnqueueReleaseGLObjects(q, image, null, null); + Utils.checkError(ret, "clEnqueueReleaseGLObjects"); + } private static class ReleaserImpl implements ObjectReleaser { private CLMem mem; diff --git a/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglKernel.java b/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglKernel.java index 1897e2501..bc1edafd8 100644 --- a/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglKernel.java +++ b/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglKernel.java @@ -223,6 +223,23 @@ public class LwjglKernel extends Kernel { Utils.checkError(ret, "clEnqueueNDRangeKernel"); return new LwjglEvent(q.getCLEvent(Utils.pointerBuffers[0].get(0))); } + @Override + public void RunNoEvent(CommandQueue queue) { + Utils.pointerBuffers[1].rewind(); + Utils.pointerBuffers[1].put(globalWorkSize.getSizes()); + Utils.pointerBuffers[1].position(0); + PointerBuffer p2 = null; + if (workGroupSize.getSizes()[0] > 0) { + p2 = Utils.pointerBuffers[2].rewind(); + p2.put(workGroupSize.getSizes()); + p2.position(0); + } + CLCommandQueue q = ((LwjglCommandQueue) queue).getQueue(); + int ret = CL10.clEnqueueNDRangeKernel(q, kernel, + globalWorkSize.getDimension(), null, Utils.pointerBuffers[1], + p2, null, null); + Utils.checkError(ret, "clEnqueueNDRangeKernel"); + } @Override public 
ObjectReleaser getReleaser() {