added alternative versions for kernel launches and resource acquiring that do not return an event. This improves the performance.
This commit is contained in:
parent
4be6013068
commit
54113f35e0
@ -377,6 +377,29 @@ public abstract class Buffer extends AbstractOpenCLObject {
|
||||
* @return the event object
|
||||
*/
|
||||
public abstract Event acquireBufferForSharingAsync(CommandQueue queue);
|
||||
|
||||
/**
|
||||
* Aquires this buffer object for using. Only call this method if this buffer
|
||||
* represents a shared object from OpenGL, created with e.g.
|
||||
* {@link Context#bindVertexBuffer(com.jme3.scene.VertexBuffer, com.jme3.opencl.MemoryAccess) }.
|
||||
* This method must be called before the buffer is used. After the work is
|
||||
* done, the buffer must be released by calling
|
||||
* {@link #releaseBufferForSharingAsync(com.jme3.opencl.CommandQueue) }
|
||||
* so that OpenGL can use the VertexBuffer again.
|
||||
*
|
||||
* The generated event object is directly released.
|
||||
* This brings a performance improvement when the resource is e.g. directly
|
||||
* used by a kernel afterwards on the same queue (this implicitly waits for
|
||||
* this action). If you need the event, use
|
||||
* {@link #acquireBufferForSharingAsync(com.jme3.opencl.CommandQueue) } instead.
|
||||
*
|
||||
* @param queue the command queue
|
||||
*/
|
||||
public void acquireBufferForSharingNoEvent(CommandQueue queue) {
|
||||
//default implementation, overwrite for better performance
|
||||
acquireBufferForSharingAsync(queue).release();
|
||||
}
|
||||
|
||||
/**
|
||||
* Releases a shared buffer object.
|
||||
* Call this method after the buffer object was acquired by
|
||||
@ -387,5 +410,18 @@ public abstract class Buffer extends AbstractOpenCLObject {
|
||||
*/
|
||||
public abstract Event releaseBufferForSharingAsync(CommandQueue queue);
|
||||
|
||||
//TODO: add variants of the above two methods that don't create the event object, but release the event immediately
|
||||
/**
|
||||
* Releases a shared buffer object.
|
||||
* Call this method after the buffer object was acquired by
|
||||
* {@link #acquireBufferForSharingAsync(com.jme3.opencl.CommandQueue) }
|
||||
* to hand the control back to OpenGL.
|
||||
* The generated event object is directly released, resulting in
|
||||
* performance improvements.
|
||||
* @param queue the command queue
|
||||
*/
|
||||
public void releaseBufferForSharingNoEvent(CommandQueue queue) {
|
||||
//default implementation, overwrite for better performance
|
||||
releaseBufferForSharingAsync(queue).release();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -485,6 +485,30 @@ memory layout in which channels are stored in the image.
|
||||
* @return the event object
|
||||
*/
|
||||
public abstract Event acquireImageForSharingAsync(CommandQueue queue);
|
||||
|
||||
/**
|
||||
* Aquires this image object for using. Only call this method if this image
|
||||
* represents a shared object from OpenGL, created with e.g.
|
||||
* {@link Context#bindImage(com.jme3.texture.Image, com.jme3.texture.Texture.Type, int, com.jme3.opencl.MemoryAccess) }
|
||||
* or variations.
|
||||
* This method must be called before the image is used. After the work is
|
||||
* done, the image must be released by calling
|
||||
* {@link #releaseImageForSharingAsync(com.jme3.opencl.CommandQueue) }
|
||||
* so that OpenGL can use the image/texture/renderbuffer again.
|
||||
*
|
||||
* The generated event object is directly released.
|
||||
* This brings a performance improvement when the resource is e.g. directly
|
||||
* used by a kernel afterwards on the same queue (this implicitly waits for
|
||||
* this action). If you need the event, use
|
||||
* {@link #acquireImageForSharingAsync(com.jme3.opencl.CommandQueue) }.
|
||||
*
|
||||
* @param queue the command queue
|
||||
*/
|
||||
public void acquireImageForSharingNoEvent(CommandQueue queue) {
|
||||
//Default implementation, overwrite for performance
|
||||
acquireImageForSharingAsync(queue).release();
|
||||
}
|
||||
|
||||
/**
|
||||
* Releases a shared image object.
|
||||
* Call this method after the image object was acquired by
|
||||
@ -495,5 +519,19 @@ memory layout in which channels are stored in the image.
|
||||
*/
|
||||
public abstract Event releaseImageForSharingAsync(CommandQueue queue);
|
||||
|
||||
/**
|
||||
* Releases a shared image object.
|
||||
* Call this method after the image object was acquired by
|
||||
* {@link #acquireImageForSharingAsync(com.jme3.opencl.CommandQueue) }
|
||||
* to hand the control back to OpenGL.
|
||||
* The generated event object is directly released, resulting in
|
||||
* performance improvements.
|
||||
* @param queue the command queue
|
||||
*/
|
||||
public void releaseImageForSharingNoEvent(CommandQueue queue) {
|
||||
//default implementation, overwrite it for performance improvements
|
||||
releaseImageForSharingAsync(queue).release();
|
||||
}
|
||||
|
||||
//TODO: add variants of the above two methods that don't create the event object, but release the event immediately
|
||||
}
|
||||
|
@ -299,6 +299,9 @@ public abstract class Kernel extends AbstractOpenCLObject {
|
||||
/**
|
||||
* Launches the kernel with the current global work size, work group size
|
||||
* and arguments.
|
||||
* If the returned event object is not needed and would otherwise be
|
||||
* released immediately, {@link #RunNoEvent(com.jme3.opencl.CommandQueue) }
|
||||
* might bring a better performance.
|
||||
* @param queue the command queue
|
||||
* @return an event object indicating when the kernel is finished
|
||||
* @see #setGlobalWorkSize(com.jme3.opencl.Kernel.WorkSize)
|
||||
@ -307,6 +310,22 @@ public abstract class Kernel extends AbstractOpenCLObject {
|
||||
*/
|
||||
public abstract Event Run(CommandQueue queue);
|
||||
|
||||
/**
|
||||
* Launches the kernel with the current global work size, work group size
|
||||
* and arguments without returning an event object.
|
||||
* The generated event is directly released. Therefore, the performance
|
||||
* is better, but there is no way to detect when the kernel execution
|
||||
* has finished. For this purpose, use {@link #Run(com.jme3.opencl.CommandQueue) }.
|
||||
* @param queue the command queue
|
||||
* @see #setGlobalWorkSize(com.jme3.opencl.Kernel.WorkSize)
|
||||
* @see #setWorkGroupSize(com.jme3.opencl.Kernel.WorkSize)
|
||||
* @see #setArg(int, java.lang.Object)
|
||||
*/
|
||||
public void RunNoEvent(CommandQueue queue) {
|
||||
//Default implementation, overwrite to not allocate the event object
|
||||
Run(queue).release();
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the work sizes and arguments in one call and launches the kernel.
|
||||
* The global work size is set to the specified size. The work group
|
||||
@ -326,6 +345,28 @@ public abstract class Kernel extends AbstractOpenCLObject {
|
||||
return Run(queue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the work sizes and arguments in one call and launches the kernel.
|
||||
* The global work size is set to the specified size. The work group
|
||||
* size is automatically determined by the driver.
|
||||
* Each object in the argument array is sent to the kernel by
|
||||
* {@link #setArg(int, java.lang.Object) }.
|
||||
* The generated event is directly released. Therefore, the performance
|
||||
* is better, but there is no way to detect when the kernel execution
|
||||
* has finished. For this purpose, use
|
||||
* {@link #Run1(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) }.
|
||||
* @param queue the command queue
|
||||
* @param globalWorkSize the global work size
|
||||
* @param args the kernel arguments
|
||||
* @see #Run2(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...)
|
||||
*/
|
||||
public void Run1NoEvent(CommandQueue queue, WorkSize globalWorkSize, Object... args) {
|
||||
setGlobalWorkSize(globalWorkSize);
|
||||
setWorkGroupSizeToNull();
|
||||
setArgs(args);
|
||||
RunNoEvent(queue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the work sizes and arguments in one call and launches the kernel.
|
||||
* @param queue the command queue
|
||||
@ -342,7 +383,24 @@ public abstract class Kernel extends AbstractOpenCLObject {
|
||||
return Run(queue);
|
||||
}
|
||||
|
||||
//TODO: add variants of the above three methods that don't create the event object, but release the event immediately
|
||||
/**
|
||||
* Sets the work sizes and arguments in one call and launches the kernel.
|
||||
* The generated event is directly released. Therefore, the performance
|
||||
* is better, but there is no way to detect when the kernel execution
|
||||
* has finished. For this purpose, use
|
||||
* {@link #Run2(com.jme3.opencl.CommandQueue, com.jme3.opencl.Kernel.WorkSize, com.jme3.opencl.Kernel.WorkSize, java.lang.Object...) }.
|
||||
* @param queue the command queue
|
||||
* @param globalWorkSize the global work size
|
||||
* @param workGroupSize the work group size
|
||||
* @param args the kernel arguments
|
||||
*/
|
||||
public void Run2NoEvent(CommandQueue queue, WorkSize globalWorkSize,
|
||||
WorkSize workGroupSize, Object... args) {
|
||||
setGlobalWorkSize(globalWorkSize);
|
||||
setWorkGroupSize(workGroupSize);
|
||||
setArgs(args);
|
||||
RunNoEvent(queue);
|
||||
}
|
||||
|
||||
/**
|
||||
* A placeholder for kernel arguments representing local kernel memory.
|
||||
|
@ -89,6 +89,16 @@
|
||||
* These async calls all return {@link com.jme3.opencl.Event} objects.
|
||||
* These events can be used to check (non-blocking) if the action has completed, e.g. a memory copy
|
||||
* is finished, or to block the execution until the action has finished.
|
||||
* <br>
|
||||
* Some methods have the suffix {@code -NoEvent}. This means that these methods
|
||||
* don't return an event object even if the OpenCL function would return an event.
|
||||
 * There is always an alternative version that does return an event.
|
||||
* These methods exist to increase the performance: since all actions (like multiple kernel calls)
|
||||
* that are sent to the same command queue are executed in order, there is no
|
||||
* need for intermediate events. (These intermediate events would be released
|
||||
* immediately). Therefore, the no-event alternatives increase the performance
|
||||
 * because no additional event object has to be allocated and fewer system calls
|
||||
 * are necessary.
|
||||
*
|
||||
* <p>
|
||||
* <b>Interoperability between OpenCL and jME3:</b><br>
|
||||
@ -142,6 +152,10 @@
|
||||
* thrown. The exception always records the error code and error name and the
|
||||
* OpenCL function call where the error was detected. Please check the official
|
||||
* OpenCL specification for the meanings of these errors for that particular function.</li>
|
||||
* <li>{@code UnsupportedOperationException}: the OpenCL implementation does not
|
||||
* support some operations. This is currently only an issue for Jogamp's Jogl
|
||||
* renderer, since Jocl only supports OpenCL 1.1. LWJGL has full support for
|
||||
 * OpenCL 1.2 and 2.0.</li>
|
||||
* </ul>
|
||||
*/
|
||||
package com.jme3.opencl;
|
||||
|
@ -135,15 +135,15 @@ public class TestVertexBufferSharing extends SimpleApplication {
|
||||
time += tpf;
|
||||
|
||||
//acquire resource
|
||||
buffer.acquireBufferForSharingAsync(clQueue).release();
|
||||
buffer.acquireBufferForSharingNoEvent(clQueue);
|
||||
//no need to wait for the returned event, since the kernel implicitly waits for it (same command queue)
|
||||
|
||||
//execute kernel
|
||||
float scale = (float) Math.pow(1.1, (1.0 - time%2) / 16.0);
|
||||
kernel.Run1(clQueue, ws, buffer, scale).release();
|
||||
kernel.Run1NoEvent(clQueue, ws, buffer, scale);
|
||||
|
||||
//release resource
|
||||
buffer.releaseBufferForSharingAsync(clQueue).release();
|
||||
buffer.releaseBufferForSharingNoEvent(clQueue);
|
||||
}
|
||||
|
||||
}
|
@ -136,15 +136,15 @@ public class TestWriteToTexture extends SimpleApplication implements AnalogListe
|
||||
}
|
||||
private void updateOpenCL(float tpf) {
|
||||
//acquire resource
|
||||
texCL.acquireImageForSharingAsync(clQueue).release();
|
||||
texCL.acquireImageForSharingNoEvent(clQueue);
|
||||
//no need to wait for the returned event, since the kernel implicitly waits for it (same command queue)
|
||||
|
||||
//execute kernel
|
||||
kernel.Run1(clQueue, new com.jme3.opencl.Kernel.WorkSize(settings.getWidth(), settings.getHeight()), texCL, C, 16)
|
||||
.release();
|
||||
Kernel.WorkSize ws = new Kernel.WorkSize(settings.getWidth(), settings.getHeight());
|
||||
kernel.Run1NoEvent(clQueue, ws, texCL, C, 16);
|
||||
|
||||
//release resource
|
||||
texCL.releaseImageForSharingAsync(clQueue).release();
|
||||
texCL.releaseImageForSharingNoEvent(clQueue);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -191,6 +191,13 @@ public class JoclBuffer extends Buffer {
|
||||
long event = Utils.pointers[0].get(0);
|
||||
return new JoclEvent(event);
|
||||
}
|
||||
@Override
|
||||
public void acquireBufferForSharingNoEvent(CommandQueue queue) {
|
||||
Utils.pointers[1].rewind();
|
||||
Utils.pointers[1].put(0, id);
|
||||
long q = ((JoclCommandQueue)queue).id;
|
||||
((CLGL) cl).clEnqueueAcquireGLObjects(q, 1, Utils.pointers[1], 0, null, null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Event releaseBufferForSharingAsync(CommandQueue queue) {
|
||||
@ -202,6 +209,13 @@ public class JoclBuffer extends Buffer {
|
||||
long event = Utils.pointers[0].get(0);
|
||||
return new JoclEvent(event);
|
||||
}
|
||||
@Override
|
||||
public void releaseBufferForSharingNoEvent(CommandQueue queue) {
|
||||
Utils.pointers[1].rewind();
|
||||
Utils.pointers[1].put(0, id);
|
||||
long q = ((JoclCommandQueue)queue).id;
|
||||
((CLGL) cl).clEnqueueReleaseGLObjects(q, 1, Utils.pointers[1], 0, null, null);
|
||||
}
|
||||
|
||||
private static class ReleaserImpl implements ObjectReleaser {
|
||||
private long mem;
|
||||
|
@ -502,6 +502,13 @@ public class JoclImage extends Image {
|
||||
return new JoclEvent(event);
|
||||
}
|
||||
@Override
|
||||
public void acquireImageForSharingNoEvent(CommandQueue queue) {
|
||||
Utils.pointers[1].rewind();
|
||||
Utils.pointers[1].put(0, id);
|
||||
long q = ((JoclCommandQueue)queue).id;
|
||||
((CLGL) cl).clEnqueueAcquireGLObjects(q, 1, Utils.pointers[1], 0, null, null);
|
||||
}
|
||||
@Override
|
||||
public Event releaseImageForSharingAsync(CommandQueue queue) {
|
||||
Utils.pointers[0].rewind();
|
||||
Utils.pointers[1].rewind();
|
||||
@ -511,6 +518,13 @@ public class JoclImage extends Image {
|
||||
long event = Utils.pointers[0].get(0);
|
||||
return new JoclEvent(event);
|
||||
}
|
||||
@Override
|
||||
public void releaseImageForSharingNoEvent(CommandQueue queue) {
|
||||
Utils.pointers[1].rewind();
|
||||
Utils.pointers[1].put(0, id);
|
||||
long q = ((JoclCommandQueue)queue).id;
|
||||
((CLGL) cl).clEnqueueReleaseGLObjects(q, 1, Utils.pointers[1], 0, null, null);
|
||||
}
|
||||
|
||||
private static class ReleaserImpl implements ObjectReleaser {
|
||||
private long mem;
|
||||
|
@ -240,6 +240,24 @@ public class JoclKernel extends Kernel {
|
||||
return new JoclEvent(Utils.pointers[0].get(0));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void RunNoEvent(CommandQueue queue) {
|
||||
Utils.pointers[1].rewind();
|
||||
Utils.pointers[1].put(globalWorkSize.getSizes(), 0, globalWorkSize.getSizes().length);
|
||||
Utils.pointers[1].position(0);
|
||||
PointerBuffer p2 = null;
|
||||
if (workGroupSize.getSizes()[0] > 0) {
|
||||
p2 = Utils.pointers[2].rewind();
|
||||
p2.put(workGroupSize.getSizes(), 0, workGroupSize.getSizes().length);
|
||||
p2.position(0);
|
||||
}
|
||||
long q = ((JoclCommandQueue) queue).id;
|
||||
int ret = cl.clEnqueueNDRangeKernel(q, kernel,
|
||||
globalWorkSize.getDimension(), null, Utils.pointers[1],
|
||||
p2, 0, null, null);
|
||||
Utils.checkError(ret, "clEnqueueNDRangeKernel");
|
||||
}
|
||||
|
||||
private static class ReleaserImpl implements ObjectReleaser {
|
||||
private long kernel;
|
||||
private ReleaserImpl(long kernel) {
|
||||
|
@ -194,6 +194,12 @@ public class LwjglBuffer extends Buffer {
|
||||
long event = Utils.pointerBuffers[0].get(0);
|
||||
return new LwjglEvent(q.getCLEvent(event));
|
||||
}
|
||||
@Override
|
||||
public void acquireBufferForSharingNoEvent(CommandQueue queue) {
|
||||
CLCommandQueue q = ((LwjglCommandQueue) queue).getQueue();
|
||||
int ret = CL10GL.clEnqueueAcquireGLObjects(q, buffer, null, null);
|
||||
Utils.checkError(ret, "clEnqueueAcquireGLObjects");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Event releaseBufferForSharingAsync(CommandQueue queue) {
|
||||
@ -204,6 +210,12 @@ public class LwjglBuffer extends Buffer {
|
||||
long event = Utils.pointerBuffers[0].get(0);
|
||||
return new LwjglEvent(q.getCLEvent(event));
|
||||
}
|
||||
@Override
|
||||
public void releaseBufferForSharingNoEvent(CommandQueue queue) {
|
||||
CLCommandQueue q = ((LwjglCommandQueue) queue).getQueue();
|
||||
int ret = CL10GL.clEnqueueReleaseGLObjects(q, buffer, null, null);
|
||||
Utils.checkError(ret, "clEnqueueReleaseGLObjects");
|
||||
}
|
||||
|
||||
private static class ReleaserImpl implements ObjectReleaser {
|
||||
private CLMem mem;
|
||||
|
@ -534,6 +534,13 @@ public class LwjglImage extends Image {
|
||||
long event = Utils.pointerBuffers[0].get(0);
|
||||
return new LwjglEvent(q.getCLEvent(event));
|
||||
}
|
||||
@Override
|
||||
public void acquireImageForSharingNoEvent(CommandQueue queue) {
|
||||
CLCommandQueue q = ((LwjglCommandQueue) queue).getQueue();
|
||||
int ret = CL10GL.clEnqueueAcquireGLObjects(q, image, null, null);
|
||||
Utils.checkError(ret, "clEnqueueAcquireGLObjects");
|
||||
}
|
||||
@Override
|
||||
public Event releaseImageForSharingAsync(CommandQueue queue) {
|
||||
Utils.pointerBuffers[0].rewind();
|
||||
CLCommandQueue q = ((LwjglCommandQueue) queue).getQueue();
|
||||
@ -542,6 +549,12 @@ public class LwjglImage extends Image {
|
||||
long event = Utils.pointerBuffers[0].get(0);
|
||||
return new LwjglEvent(q.getCLEvent(event));
|
||||
}
|
||||
@Override
|
||||
public void releaseImageForSharingNoEvent(CommandQueue queue) {
|
||||
CLCommandQueue q = ((LwjglCommandQueue) queue).getQueue();
|
||||
int ret = CL10GL.clEnqueueReleaseGLObjects(q, image, null, null);
|
||||
Utils.checkError(ret, "clEnqueueReleaseGLObjects");
|
||||
}
|
||||
|
||||
private static class ReleaserImpl implements ObjectReleaser {
|
||||
private CLMem mem;
|
||||
|
@ -223,6 +223,23 @@ public class LwjglKernel extends Kernel {
|
||||
Utils.checkError(ret, "clEnqueueNDRangeKernel");
|
||||
return new LwjglEvent(q.getCLEvent(Utils.pointerBuffers[0].get(0)));
|
||||
}
|
||||
@Override
|
||||
public void RunNoEvent(CommandQueue queue) {
|
||||
Utils.pointerBuffers[1].rewind();
|
||||
Utils.pointerBuffers[1].put(globalWorkSize.getSizes());
|
||||
Utils.pointerBuffers[1].position(0);
|
||||
PointerBuffer p2 = null;
|
||||
if (workGroupSize.getSizes()[0] > 0) {
|
||||
p2 = Utils.pointerBuffers[2].rewind();
|
||||
p2.put(workGroupSize.getSizes());
|
||||
p2.position(0);
|
||||
}
|
||||
CLCommandQueue q = ((LwjglCommandQueue) queue).getQueue();
|
||||
int ret = CL10.clEnqueueNDRangeKernel(q, kernel,
|
||||
globalWorkSize.getDimension(), null, Utils.pointerBuffers[1],
|
||||
p2, null, null);
|
||||
Utils.checkError(ret, "clEnqueueNDRangeKernel");
|
||||
}
|
||||
|
||||
@Override
|
||||
public ObjectReleaser getReleaser() {
|
||||
|
Loading…
x
Reference in New Issue
Block a user