added alternative versions for kernel launches and resource acquiring that do not return an event. This improves the performance.

9 years ago · 54113f35e0
parent 4be6013068
commit 54113f35e0
12 changed files with 244 additions and 10 deletions
--- a/jme3-core/src/main/java/com/jme3/opencl/Buffer.java
+++ b/jme3-core/src/main/java/com/jme3/opencl/Buffer.java
@ -377,6 +377,29 @@ public abstract class Buffer extends AbstractOpenCLObject {
     * @return the event object
     */
    public abstract Event acquireBufferForSharingAsync(CommandQueue queue);
    /**
     * Acquires this buffer object for use. Only call this method if this buffer
     * represents a shared object from OpenGL, created with e.g.
     * {@link Context#bindVertexBuffer(com.jme3.scene.VertexBuffer, com.jme3.opencl.MemoryAccess) }.
     * This method must be called before the buffer is used. After the work is
     * done, the buffer must be released by calling
     * {@link #releaseBufferForSharingAsync(com.jme3.opencl.CommandQueue) }
     * so that OpenGL can use the VertexBuffer again.
     * 
     * The generated event object is directly released.
     * This brings a performance improvement when the resource is e.g. directly
     * used by a kernel afterwards on the same queue (this implicitly waits for
     * this action). If you need the event, use 
     * {@link #acquireBufferForSharingAsync(com.jme3.opencl.CommandQueue) } instead.
     * 
     * @param queue the command queue
     */
    public void acquireBufferForSharingNoEvent(CommandQueue queue) {
        // Fallback for subclasses that do not provide their own event-free path:
        // enqueue the asynchronous acquire, then drop the event it produced.
        final Event acquired = acquireBufferForSharingAsync(queue);
        acquired.release();
    }
    /**
     * Releases a shared buffer object.
     * Call this method after the buffer object was acquired by
@ -387,5 +410,18 @@ public abstract class Buffer extends AbstractOpenCLObject {
     */
    public abstract Event releaseBufferForSharingAsync(CommandQueue queue);
-    //TODO: add variants of the above two methods that don't create the event object, but release the event immediately
+    /**
     * Releases a shared buffer object.
     * Call this method after the buffer object was acquired by
     * {@link #acquireBufferForSharingAsync(com.jme3.opencl.CommandQueue) }
     * to hand the control back to OpenGL.
     * The generated event object is directly released, resulting in 
     * performance improvements.
     * @param queue the command queue
     */
    public void releaseBufferForSharingNoEvent(CommandQueue queue) {
        // Fallback for subclasses that do not provide their own event-free path:
        // enqueue the asynchronous release, then drop the event it produced.
        final Event released = releaseBufferForSharingAsync(queue);
        released.release();
    }
 }
--- a/jme3-core/src/main/java/com/jme3/opencl/Image.java
+++ b/jme3-core/src/main/java/com/jme3/opencl/Image.java
@ -485,6 +485,30 @@ memory layout in which channels are stored in the image.
     * @return the event object
     */
    public abstract Event acquireImageForSharingAsync(CommandQueue queue);
    /**
     * Acquires this image object for use. Only call this method if this image
     * represents a shared object from OpenGL, created with e.g.
     * {@link Context#bindImage(com.jme3.texture.Image, com.jme3.texture.Texture.Type, int, com.jme3.opencl.MemoryAccess) }
     * or variations.
     * This method must be called before the image is used. After the work is
     * done, the image must be released by calling
     * {@link #releaseImageForSharingAsync(com.jme3.opencl.CommandQueue)  }
     * so that OpenGL can use the image/texture/renderbuffer again.
     * 
     * The generated event object is directly released.
     * This brings a performance improvement when the resource is e.g. directly
     * used by a kernel afterwards on the same queue (this implicitly waits for
     * this action). If you need the event, use 
     * {@link #acquireImageForSharingAsync(com.jme3.opencl.CommandQueue) }.
     * 
     * @param queue the command queue
     */
    public void acquireImageForSharingNoEvent(CommandQueue queue) {
        // Fallback for subclasses that do not provide their own event-free path:
        // enqueue the asynchronous acquire, then drop the event it produced.
        final Event acquired = acquireImageForSharingAsync(queue);
        acquired.release();
    }
    /**
     * Releases a shared image object.
     * Call this method after the image object was acquired by
@ -495,5 +519,19 @@ memory layout in which channels are stored in the image.
     */
    public abstract Event releaseImageForSharingAsync(CommandQueue queue);
    /**
     * Releases a shared image object.
     * Call this method after the image object was acquired by
     * {@link #acquireImageForSharingAsync(com.jme3.opencl.CommandQueue) }
     * to hand the control back to OpenGL.
     * The generated event object is directly released, resulting in 
     * performance improvements.
     * @param queue the command queue
     */
    public void releaseImageForSharingNoEvent(CommandQueue queue) {
        // Fallback for subclasses that do not provide their own event-free path:
        // enqueue the asynchronous release, then drop the event it produced.
        final Event released = releaseImageForSharingAsync(queue);
        released.release();
    }
    //NOTE: the -NoEvent variants above are the versions of the two sharing methods that release the generated event immediately
 }
--- a/jme3-core/src/main/java/com/jme3/opencl/Kernel.java
+++ b/jme3-core/src/main/java/com/jme3/opencl/Kernel.java
@ -299,6 +299,9 @@ public abstract class Kernel extends AbstractOpenCLObject {
    /**
     * Launches the kernel with the current global work size, work group size
     * and arguments.
     * If the returned event object is not needed and would otherwise be
     * released immediately, {@link #RunNoEvent(com.jme3.opencl.CommandQueue) }
     * might bring a better performance.
     * @param queue the command queue
     * @return an event object indicating when the kernel is finished
     * @see #setGlobalWorkSize(com.jme3.opencl.Kernel.WorkSize) 
@ -306,6 +309,22 @@ public abstract class Kernel extends AbstractOpenCLObject {
     * @see #setArg(int, java.lang.Object) 
     */
    public abstract Event Run(CommandQueue queue);
    /**
     * Launches the kernel with the current global work size, work group size
     * and arguments without returning an event object.
     * The generated event is directly released. Therefore, the performance
     * is better, but there is no way to detect when the kernel execution
     * has finished. For this purpose, use {@link #Run(com.jme3.opencl.CommandQueue) }.
     * @param queue the command queue
     * @see #setGlobalWorkSize(com.jme3.opencl.Kernel.WorkSize) 
     * @see #setWorkGroupSize(com.jme3.opencl.Kernel.WorkSize) 
     * @see #setArg(int, java.lang.Object) 
     */
    public void RunNoEvent(CommandQueue queue) {
        // Fallback: launch through the event-returning variant and release
        // the resulting event at once. Subclasses may override this to avoid
        // creating the event object at all.
        final Event finished = Run(queue);
        finished.release();
    }
    /**
     * Sets the work sizes and arguments in one call and launches the kernel.
@ -325,6 +344,28 @@ public abstract class Kernel extends AbstractOpenCLObject {
        setArgs(args);
        return Run(queue);
    }
    /**
     * Sets the work sizes and arguments in one call and launches the kernel.
     * The global work size is set to the specified size. The work group
     * size is automatically determined by the driver.
     * Each object in the argument array is sent to the kernel by
     * {@link #setArg(int, java.lang.Object) }.
     * The generated event is directly released. Therefore, the performance
     * is better, but there is no way to detect when the kernel execution
     * has finished. For this purpose, use 
     * {@link #Run1(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) }.
     * @param queue the command queue
     * @param globalWorkSize the global work size
     * @param args the kernel arguments
     * @see #Run2(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) 
     */
    public void Run1NoEvent(CommandQueue queue, WorkSize globalWorkSize, Object... args) {
        // Launch configuration: explicit global size, work group size reset to "null"
        this.setGlobalWorkSize(globalWorkSize);
        this.setWorkGroupSizeToNull();

        // Hand the argument array to the kernel
        this.setArgs(args);

        // Enqueue the kernel; no event is handed back to the caller
        this.RunNoEvent(queue);
    }
    /**
     * Sets the work sizes and arguments in one call and launches the kernel.
@ -342,8 +383,25 @@ public abstract class Kernel extends AbstractOpenCLObject {
        return Run(queue);
    }
-    //TODO: add variants of the above three methods that don't create the event object, but release the event immediately
+    /**
-
+     * Sets the work sizes and arguments in one call and launches the kernel.
     * The generated event is directly released. Therefore, the performance
     * is better, but there is no way to detect when the kernel execution
     * has finished. For this purpose, use 
     * {@link #Run2(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) }.
     * @param queue the command queue
     * @param globalWorkSize the global work size
     * @param workGroupSize the work group size
     * @param args the kernel arguments
     */
    public void Run2NoEvent(
            CommandQueue queue,
            WorkSize globalWorkSize,
            WorkSize workGroupSize,
            Object... args) {
        // Launch configuration: both sizes are taken from the caller
        this.setGlobalWorkSize(globalWorkSize);
        this.setWorkGroupSize(workGroupSize);

        // Hand the argument array to the kernel
        this.setArgs(args);

        // Enqueue the kernel; no event is handed back to the caller
        this.RunNoEvent(queue);
    }
    /**
     * A placeholder for kernel arguments representing local kernel memory.
     * This defines the size of available shared memory of a {@code __shared} kernel
--- a/jme3-core/src/main/java/com/jme3/opencl/package-info.java
+++ b/jme3-core/src/main/java/com/jme3/opencl/package-info.java
@ -89,6 +89,16 @@
 * These async calls all return {@link com.jme3.opencl.Event} objects.
 * These events can be used to check (non-blocking) if the action has completed, e.g. a memory copy
 * is finished, or to block the execution until the action has finished.
 * <br>
 * Some methods have the suffix {@code -NoEvent}. This means that these methods
 * don't return an event object even if the OpenCL function would return an event.
 * There is always an alternative version that does return an event.
 * These methods exist to increase the performance: since all actions (like multiple kernel calls)
 * that are sent to the same command queue are executed in order, there is no
 * need for intermediate events. (These intermediate events would be released
 * immediately). Therefore, the no-event alternatives increase the performance
 * because no additional event object has to be allocated and fewer system calls
 * are necessary.
 * 
 * <p>
 * <b>Interoperability between OpenCL and jME3:</b><br>
@ -142,6 +152,10 @@
 * thrown. The exception always records the error code and error name and the 
 * OpenCL function call where the error was detected. Please check the official
 * OpenCL specification for the meanings of these errors for that particular function.</li>
 * <li>{@code UnsupportedOperationException}: the OpenCL implementation does not
 * support some operations. This is currently only an issue for Jogamp's Jogl
 * renderer, since Jocl only supports OpenCL 1.1. LWJGL has full support for
 * OpenCL 1.2 and 2.0.</li>
 * </ul>
 */
 package com.jme3.opencl;
--- a/jme3-examples/src/main/java/jme3test/opencl/TestVertexBufferSharing.java
+++ b/jme3-examples/src/main/java/jme3test/opencl/TestVertexBufferSharing.java
@ -135,15 +135,15 @@ public class TestVertexBufferSharing extends SimpleApplication {
        time += tpf;
        //acquire resource
-        buffer.acquireBufferForSharingAsync(clQueue).release();
+        buffer.acquireBufferForSharingNoEvent(clQueue);
        //no need to wait for the returned event, since the kernel implicitly waits for it (same command queue)
        //execute kernel
        float scale = (float) Math.pow(1.1, (1.0 - time%2) / 16.0);
-        kernel.Run1(clQueue, ws, buffer, scale).release();
+        kernel.Run1NoEvent(clQueue, ws, buffer, scale);
        //release resource
-        buffer.releaseBufferForSharingAsync(clQueue).release();
+        buffer.releaseBufferForSharingNoEvent(clQueue);
    }
 }
--- a/jme3-examples/src/main/java/jme3test/opencl/TestWriteToTexture.java
+++ b/jme3-examples/src/main/java/jme3test/opencl/TestWriteToTexture.java
@ -136,15 +136,15 @@ public class TestWriteToTexture extends SimpleApplication implements AnalogListe
    }
    private void updateOpenCL(float tpf) {
        //acquire resource
-        texCL.acquireImageForSharingAsync(clQueue).release();
+        texCL.acquireImageForSharingNoEvent(clQueue);
        //no need to wait for the returned event, since the kernel implicitly waits for it (same command queue)
        //execute kernel
-        kernel.Run1(clQueue, new com.jme3.opencl.Kernel.WorkSize(settings.getWidth(), settings.getHeight()), texCL, C, 16)
+        Kernel.WorkSize ws = new Kernel.WorkSize(settings.getWidth(), settings.getHeight());
-                .release();
+        kernel.Run1NoEvent(clQueue, ws, texCL, C, 16);
        //release resource
-        texCL.releaseImageForSharingAsync(clQueue).release();
+        texCL.releaseImageForSharingNoEvent(clQueue);
    }
    @Override
--- a/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclBuffer.java
+++ b/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclBuffer.java
@ -191,6 +191,13 @@ public class JoclBuffer extends Buffer {
        long event = Utils.pointers[0].get(0);
        return new JoclEvent(event);
    }
    /**
     * Enqueues the acquire of this shared buffer without requesting an event.
     * The status code returned by the OpenCL call is passed to
     * {@code Utils.checkError}, so a failed enqueue is reported instead of
     * being silently ignored.
     *
     * @param queue the command queue, must be a {@code JoclCommandQueue}
     */
    @Override
    public void acquireBufferForSharingNoEvent(CommandQueue queue) {
        // Slot 0 of the scratch pointer buffer carries this buffer's handle
        // as the one-element list of memory objects.
        Utils.pointers[1].rewind();
        Utils.pointers[1].put(0, id);
        long q = ((JoclCommandQueue)queue).id;
        // The trailing null is the event out-parameter: no event is requested.
        int ret = ((CLGL) cl).clEnqueueAcquireGLObjects(q, 1, Utils.pointers[1], 0, null, null);
        Utils.checkError(ret, "clEnqueueAcquireGLObjects");
    }
    @Override
    public Event releaseBufferForSharingAsync(CommandQueue queue) {
@ -202,6 +209,13 @@ public class JoclBuffer extends Buffer {
        long event = Utils.pointers[0].get(0);
        return new JoclEvent(event);
    }
    /**
     * Enqueues the release of this shared buffer without requesting an event.
     * The status code returned by the OpenCL call is passed to
     * {@code Utils.checkError}, so a failed enqueue is reported instead of
     * being silently ignored.
     *
     * @param queue the command queue, must be a {@code JoclCommandQueue}
     */
    @Override
    public void releaseBufferForSharingNoEvent(CommandQueue queue) {
        // Slot 0 of the scratch pointer buffer carries this buffer's handle
        // as the one-element list of memory objects.
        Utils.pointers[1].rewind();
        Utils.pointers[1].put(0, id);
        long q = ((JoclCommandQueue)queue).id;
        // The trailing null is the event out-parameter: no event is requested.
        int ret = ((CLGL) cl).clEnqueueReleaseGLObjects(q, 1, Utils.pointers[1], 0, null, null);
        Utils.checkError(ret, "clEnqueueReleaseGLObjects");
    }
    private static class ReleaserImpl implements ObjectReleaser {
        private long mem;
--- a/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclImage.java
+++ b/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclImage.java
@ -502,6 +502,13 @@ public class JoclImage extends Image {
        return new JoclEvent(event);
    }
    /**
     * Enqueues the acquire of this shared image without requesting an event.
     * The status code returned by the OpenCL call is passed to
     * {@code Utils.checkError}, so a failed enqueue is reported instead of
     * being silently ignored.
     *
     * @param queue the command queue, must be a {@code JoclCommandQueue}
     */
    @Override
    public void acquireImageForSharingNoEvent(CommandQueue queue) {
        // Slot 0 of the scratch pointer buffer carries this image's handle
        // as the one-element list of memory objects.
        Utils.pointers[1].rewind();
        Utils.pointers[1].put(0, id);
        long q = ((JoclCommandQueue)queue).id;
        // The trailing null is the event out-parameter: no event is requested.
        int ret = ((CLGL) cl).clEnqueueAcquireGLObjects(q, 1, Utils.pointers[1], 0, null, null);
        Utils.checkError(ret, "clEnqueueAcquireGLObjects");
    }
    @Override
    public Event releaseImageForSharingAsync(CommandQueue queue) {
        Utils.pointers[0].rewind();
        Utils.pointers[1].rewind();
@ -511,6 +518,13 @@ public class JoclImage extends Image {
        long event = Utils.pointers[0].get(0);
        return new JoclEvent(event);
    }
    /**
     * Enqueues the release of this shared image without requesting an event.
     * The status code returned by the OpenCL call is passed to
     * {@code Utils.checkError}, so a failed enqueue is reported instead of
     * being silently ignored.
     *
     * @param queue the command queue, must be a {@code JoclCommandQueue}
     */
    @Override
    public void releaseImageForSharingNoEvent(CommandQueue queue) {
        // Slot 0 of the scratch pointer buffer carries this image's handle
        // as the one-element list of memory objects.
        Utils.pointers[1].rewind();
        Utils.pointers[1].put(0, id);
        long q = ((JoclCommandQueue)queue).id;
        // The trailing null is the event out-parameter: no event is requested.
        int ret = ((CLGL) cl).clEnqueueReleaseGLObjects(q, 1, Utils.pointers[1], 0, null, null);
        Utils.checkError(ret, "clEnqueueReleaseGLObjects");
    }
    private static class ReleaserImpl implements ObjectReleaser {
        private long mem;
--- a/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclKernel.java
+++ b/jme3-jogl/src/main/java/com/jme3/opencl/jocl/JoclKernel.java
@ -239,6 +239,24 @@ public class JoclKernel extends Kernel {
        Utils.checkError(ret, "clEnqueueNDRangeKernel");
        return new JoclEvent(Utils.pointers[0].get(0));
    }
    @Override
    public void RunNoEvent(CommandQueue queue) {
        // Copy the global work size into the scratch buffer in slot 1
        final PointerBuffer globalSizes = Utils.pointers[1];
        globalSizes.rewind();
        globalSizes.put(globalWorkSize.getSizes(), 0, globalWorkSize.getSizes().length);
        globalSizes.position(0);

        // The local size buffer stays null unless the first work group size is positive
        PointerBuffer localSizes = null;
        final boolean hasWorkGroupSize = workGroupSize.getSizes()[0] > 0;
        if (hasWorkGroupSize) {
            localSizes = Utils.pointers[2].rewind();
            localSizes.put(workGroupSize.getSizes(), 0, workGroupSize.getSizes().length);
            localSizes.position(0);
        }

        // Enqueue with an empty wait list and without an event out-parameter
        final long queueId = ((JoclCommandQueue) queue).id;
        final int errorCode = cl.clEnqueueNDRangeKernel(queueId, kernel,
                globalWorkSize.getDimension(), null, globalSizes,
                localSizes, 0, null, null);
        Utils.checkError(errorCode, "clEnqueueNDRangeKernel");
    }
    private static class ReleaserImpl implements ObjectReleaser {
        private long kernel;
--- a/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglBuffer.java
+++ b/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglBuffer.java
@ -194,6 +194,12 @@ public class LwjglBuffer extends Buffer {
        long event = Utils.pointerBuffers[0].get(0);
        return new LwjglEvent(q.getCLEvent(event));
    }
    @Override
    public void acquireBufferForSharingNoEvent(CommandQueue queue) {
        final CLCommandQueue nativeQueue = ((LwjglCommandQueue) queue).getQueue();
        // Both trailing buffers are null: nothing to wait on, no event requested
        final int errorCode = CL10GL.clEnqueueAcquireGLObjects(nativeQueue, buffer, null, null);
        Utils.checkError(errorCode, "clEnqueueAcquireGLObjects");
    }
    @Override
    public Event releaseBufferForSharingAsync(CommandQueue queue) {
@ -204,6 +210,12 @@ public class LwjglBuffer extends Buffer {
        long event = Utils.pointerBuffers[0].get(0);
        return new LwjglEvent(q.getCLEvent(event));
    }
    @Override
    public void releaseBufferForSharingNoEvent(CommandQueue queue) {
        final CLCommandQueue nativeQueue = ((LwjglCommandQueue) queue).getQueue();
        // Both trailing buffers are null: nothing to wait on, no event requested
        final int errorCode = CL10GL.clEnqueueReleaseGLObjects(nativeQueue, buffer, null, null);
        Utils.checkError(errorCode, "clEnqueueReleaseGLObjects");
    }
    private static class ReleaserImpl implements ObjectReleaser {
        private CLMem mem;
--- a/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglImage.java
+++ b/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglImage.java
@ -534,6 +534,13 @@ public class LwjglImage extends Image {
        long event = Utils.pointerBuffers[0].get(0);
        return new LwjglEvent(q.getCLEvent(event));
    }
    @Override
    public void acquireImageForSharingNoEvent(CommandQueue queue) {
        final CLCommandQueue nativeQueue = ((LwjglCommandQueue) queue).getQueue();
        // Both trailing buffers are null: nothing to wait on, no event requested
        final int errorCode = CL10GL.clEnqueueAcquireGLObjects(nativeQueue, image, null, null);
        Utils.checkError(errorCode, "clEnqueueAcquireGLObjects");
    }
    @Override
    public Event releaseImageForSharingAsync(CommandQueue queue) {
        Utils.pointerBuffers[0].rewind();
        CLCommandQueue q = ((LwjglCommandQueue) queue).getQueue();
@ -542,6 +549,12 @@ public class LwjglImage extends Image {
        long event = Utils.pointerBuffers[0].get(0);
        return new LwjglEvent(q.getCLEvent(event));
    }
    @Override
    public void releaseImageForSharingNoEvent(CommandQueue queue) {
        final CLCommandQueue nativeQueue = ((LwjglCommandQueue) queue).getQueue();
        // Both trailing buffers are null: nothing to wait on, no event requested
        final int errorCode = CL10GL.clEnqueueReleaseGLObjects(nativeQueue, image, null, null);
        Utils.checkError(errorCode, "clEnqueueReleaseGLObjects");
    }
    private static class ReleaserImpl implements ObjectReleaser {
        private CLMem mem;
--- a/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglKernel.java
+++ b/jme3-lwjgl/src/main/java/com/jme3/opencl/lwjgl/LwjglKernel.java
@ -223,6 +223,23 @@ public class LwjglKernel extends Kernel {
        Utils.checkError(ret, "clEnqueueNDRangeKernel");
        return new LwjglEvent(q.getCLEvent(Utils.pointerBuffers[0].get(0)));
    }
    @Override
    public void RunNoEvent(CommandQueue queue) {
        // Copy the global work size into the scratch buffer in slot 1
        final PointerBuffer globalSizes = Utils.pointerBuffers[1];
        globalSizes.rewind();
        globalSizes.put(globalWorkSize.getSizes());
        globalSizes.position(0);

        // The local size buffer stays null unless the first work group size is positive
        PointerBuffer localSizes = null;
        final boolean hasWorkGroupSize = workGroupSize.getSizes()[0] > 0;
        if (hasWorkGroupSize) {
            localSizes = Utils.pointerBuffers[2].rewind();
            localSizes.put(workGroupSize.getSizes());
            localSizes.position(0);
        }

        // Enqueue without a wait list and without an event buffer
        final CLCommandQueue nativeQueue = ((LwjglCommandQueue) queue).getQueue();
        final int errorCode = CL10.clEnqueueNDRangeKernel(nativeQueue, kernel,
                globalWorkSize.getDimension(), null, globalSizes,
                localSizes, null, null);
        Utils.checkError(errorCode, "clEnqueueNDRangeKernel");
    }
    @Override
    public ObjectReleaser getReleaser() {