Yes, most references will be based on CUDA (of course, also the ones in the cuda showcase, and there are quite lots of them…). But the concepts of CUDA and OpenCL are basically the same, so it should be relatively easy to port an existing CUDA implementation to OpenCL - but there are rather sophisticated ones: http://cudasw.sourceforge.net/homepage.htm#latest seems to be well-known and associated with many publications, so it should be worth a look.
However, for now, the current state of my tests. Note that this is not “polished”, and still one of the first test results. I still have to see how much time I can invest here. There are many other tasks in my queue, but again: IF (!) it is possible to achieve a good speedup here (with reasonable effort), it COULD become a nice sample…
/*
* JOCL - Java bindings for OpenCL
*
* Copyright 2014 Marco Hutter - http://www.jocl.org/
*/
package org.jocl.smithwaterman;
import static org.jocl.CL.CL_CONTEXT_PLATFORM;
import static org.jocl.CL.CL_MEM_READ_ONLY;
import static org.jocl.CL.CL_TRUE;
import static org.jocl.CL.clBuildProgram;
import static org.jocl.CL.clCreateBuffer;
import static org.jocl.CL.clCreateCommandQueue;
import static org.jocl.CL.clCreateContext;
import static org.jocl.CL.clCreateKernel;
import static org.jocl.CL.clCreateProgramWithSource;
import static org.jocl.CL.clEnqueueNDRangeKernel;
import static org.jocl.CL.clEnqueueReadBuffer;
import static org.jocl.CL.clEnqueueWriteBuffer;
import static org.jocl.CL.clFinish;
import static org.jocl.CL.clGetDeviceIDs;
import static org.jocl.CL.clGetPlatformIDs;
import static org.jocl.CL.clReleaseCommandQueue;
import static org.jocl.CL.clReleaseContext;
import static org.jocl.CL.clReleaseKernel;
import static org.jocl.CL.clReleaseMemObject;
import static org.jocl.CL.clReleaseProgram;
import static org.jocl.CL.clSetKernelArg;
import java.util.Arrays;
import java.util.Random;
import org.jocl.CL;
import org.jocl.Pointer;
import org.jocl.Sizeof;
import org.jocl.cl_command_queue;
import org.jocl.cl_context;
import org.jocl.cl_context_properties;
import org.jocl.cl_device_id;
import org.jocl.cl_kernel;
import org.jocl.cl_mem;
import org.jocl.cl_platform_id;
import org.jocl.cl_program;
/**
* An implementation of the Smith-Waterman algorithm with OpenCL and
* plain Java
*/
public class JOCLSmithWaterman
{
/**
* The platform from which the first device will be used
*/
private static final int platformIndex = 0;
/**
* The device type that will be used
*/
private static final long deviceType = CL.CL_DEVICE_TYPE_ALL;
/**
* The index of the device that will be used
*/
private static final int deviceIndex = 0;
/**
* The main OpenCL context that will be created in {@link #init()}
* and destroyed in {@link #shutdown()}
*/
private static cl_context context;
/**
* The OpenCL device that will be used
*/
private static cl_device_id device;
/**
* The OpenCL command queue for the device
*/
private static cl_command_queue commandQueue;
/**
* The OpenCL program that contains the kernel function
*/
private static cl_program program;
/**
* The OpenCL kernel function
*/
private static cl_kernel kernel;
/**
* The value for a match between the strings
*/
public static final int MATCH_VALUE = 2;
/**
* The value for a mismatch between the strings
*/
public static final int MISMATCH_VALUE = -1;
/**
* The value for an empty prefix
*/
public static final int PREFIX_VALUE = 0;
/**
* The value for an insertion / deletion
*/
public static final int INDEL_VALUE = -1;
/**
* The entry point of this application
*
* @param args Not used
*/
public static void main(String[] args)
{
init();
basicTest();
speedTest();
shutdown();
}
/**
* Performs a basic test: Computes the alignment with OpenCL and
* with Java, and compares the results
*/
private static void basicTest()
{
Random random = new Random(0);
String string0 = Utils.createRandomString(10, random);
String string1 = Utils.createRandomString(10, random);
AlignmentMatrix aHost = computeAlignmentMatrixHost(string0, string1);
AlignmentMatrix aDevice = computeAlignmentMatrixDevice(string0, string1);
System.out.println("Host result");
Utils.print(aHost);
System.out.println("Device result");
Utils.print(aDevice);
boolean equal = Utils.equal(aHost, aDevice);
if (equal)
{
System.out.println("PASSED");
}
}
/**
* Computes the alignment for longer strings, repeatedly, with
* OpenCL and with Java, and prints some basic time measures
*/
private static void speedTest()
{
Random random = new Random(0);
String string0 = Utils.createRandomString(1500, random);
String string1 = Utils.createRandomString(1500, random);
long before = 0;
long after = 0;
long durationHost = 0;
long durationDevice = 0;
int runs = 20;
boolean passed = true;
for (int i=0; i<runs; i++)
{
before = System.nanoTime();
AlignmentMatrix aHost = computeAlignmentMatrixHost(string0, string1);
after = System.nanoTime();
durationHost += (after-before);
before = System.nanoTime();
AlignmentMatrix aDevice = computeAlignmentMatrixDevice(string0, string1);
after = System.nanoTime();
durationDevice += (after-before);
passed &= Utils.equal(aHost, aDevice);
}
System.out.println("passed "+passed);
System.out.println("host "+durationHost /1e6);
System.out.println("device "+durationDevice/1e6);
}
/**
* Initialize the OpenCL context, device, command queue, program and kernel
*/
private static void init()
{
// Enable exceptions and subsequently omit error checks
CL.setExceptionsEnabled(true);
// Obtain the number of platforms
int numPlatformsArray[] = new int[1];
clGetPlatformIDs(0, null, numPlatformsArray);
int numPlatforms = numPlatformsArray[0];
//Obtain a platform ID
cl_platform_id platforms[] = new cl_platform_id[numPlatforms];
clGetPlatformIDs(platforms.length, platforms, null);
cl_platform_id platform = platforms[platformIndex];
// Initialize the context properties
cl_context_properties contextProperties = new cl_context_properties();
contextProperties.addProperty(CL_CONTEXT_PLATFORM, platform);
// Obtain the number of devices for the platform
int numDevicesArray[] = new int[1];
clGetDeviceIDs(platform, deviceType, 0, null, numDevicesArray);
int numDevices = numDevicesArray[0];
// Obtain a device ID
cl_device_id devices[] = new cl_device_id[numDevices];
clGetDeviceIDs(platform, deviceType, numDevices, devices, null);
device = devices[deviceIndex];
// Create a context for the selected device
context = clCreateContext(contextProperties, 1,
new cl_device_id[]{device}, null, null, null);
// Create command-queue for device
long properties = 0;
properties |= CL.CL_QUEUE_PROFILING_ENABLE ;
commandQueue = clCreateCommandQueue(context, device, properties, null);
// Create and build the program and the kernel
String source = Utils.read("kernels/smithWatermanKernel.cl");
program = clCreateProgramWithSource(
context, 1, new String[]{ source }, null, null);
clBuildProgram(program, 0, null, null, null, null);
kernel = clCreateKernel(program, "smithWaterman", null);
}
/**
* Release the OpenCL context, device, command queue, program and kernel
*/
private static void shutdown()
{
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(commandQueue);
clReleaseContext(context);
}
/**
* A simple class representing an alignment matrix
*/
static class AlignmentMatrix
{
int matrix[];
int sizeX;
int sizeY;
byte s0[];
byte s1[];
}
/**
* Compute the alignment matrix for the given strings using OpenCL
*
* @param string0 The first string
* @param string1 The second string
*/
private static AlignmentMatrix computeAlignmentMatrixDevice(
String string0, String string1)
{
int subSize = 10;
int unpaddedSizeX = string0.length() + 1;
int unpaddedSizeY = string1.length() + 1;
int tilesX = (int)Math.ceil((double)unpaddedSizeX / subSize);
int tilesY = (int)Math.ceil((double)unpaddedSizeY / subSize);
// Create the padded matrix, and the padded strings
int sizeX = tilesX * subSize;
int sizeY = tilesY * subSize;
int matrix[] = new int[sizeX*sizeY];
Utils.initMatrix(matrix, sizeX, sizeY);
byte s0[] = Arrays.copyOf(string0.getBytes(), sizeX);
byte s1[] = Arrays.copyOf(string1.getBytes(), sizeY);
// Create memory objects for the padded matrix and
// the padded strings on the device
cl_mem matrixMem = clCreateBuffer(context,
CL.CL_MEM_READ_WRITE | CL.CL_MEM_COPY_HOST_PTR,
Sizeof.cl_int * matrix.length, Pointer.to(matrix), null);
cl_mem s0Mem = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL.CL_MEM_COPY_HOST_PTR,
Sizeof.cl_char * s0.length, Pointer.to(s0), null);
cl_mem s1Mem = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL.CL_MEM_COPY_HOST_PTR ,
Sizeof.cl_char * s1.length, Pointer.to(s1), null);
// Create the array and the memory object that will store
// the x- and y-coordinates where the tiles of the current
// diagonal start. This is done outside of the loop, with
// the maximum possible number of tiles, to avoid frequent
// reallocation of the memory object
int maxTilesOnDiagonal = Math.max(tilesX, tilesY);
int subIndices[] = new int[maxTilesOnDiagonal*2];
cl_mem subIndicesMem = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL.CL_MEM_COPY_HOST_PTR,
Sizeof.cl_int * subIndices.length, Pointer.to(subIndices), null);
// Set the kernel arguments
int a = 0;
clSetKernelArg(kernel, a++, Sizeof.cl_mem, Pointer.to(matrixMem));
clSetKernelArg(kernel, a++, Sizeof.cl_int, Pointer.to(new int[]{sizeX}));
clSetKernelArg(kernel, a++, Sizeof.cl_int, Pointer.to(new int[]{sizeX}));
clSetKernelArg(kernel, a++, Sizeof.cl_mem, Pointer.to(subIndicesMem));
clSetKernelArg(kernel, a++, Sizeof.cl_int, Pointer.to(new int[]{subSize}));
clSetKernelArg(kernel, a++, Sizeof.cl_mem, Pointer.to(s0Mem));
clSetKernelArg(kernel, a++, Sizeof.cl_mem, Pointer.to(s1Mem));
// Walk along all diagonals of tiles of the matrix
int numTileDiagonals = tilesX + tilesY - 1;
for (int d=0; d<numTileDiagonals; d++)
{
// Compute the coordinates of the start- and end tile
// of the current diagonal
int tx0 = 0;
int ty0 = d;
if (d >= tilesY)
{
tx0 = d - tilesY + 1;
ty0 = tilesY - 1;
}
int tx1 = d;
int ty1 = 0;
if (d >= tilesX)
{
tx1 = tilesX - 1;
ty1 = d - tilesX + 1;
}
// Compute the indices of the upper left corners of
// the tiles that should be treated by the kernel
int tilesOnDiagonal = Math.max(tx1-tx0, ty1-ty0) + 1;
int tx = tx0;
int ty = ty0;
for (int i=0; i<tilesOnDiagonal; i++)
{
subIndices[2*i+0] = tx * subSize;
subIndices[2*i+1] = ty * subSize;
tx++;
ty--;
}
// Copy the new subIndices to the device
clEnqueueWriteBuffer(commandQueue, subIndicesMem,
CL_TRUE, 0, tilesOnDiagonal * 2 * Sizeof.cl_int,
Pointer.to(subIndices), 0, null, null);
// Invoke the kernel for all tiles on the current diagonal
long globalWorkSize[] = new long[] { tilesOnDiagonal };
clEnqueueNDRangeKernel(commandQueue, kernel,
1, null, globalWorkSize, null, 0, null, null);
clFinish(commandQueue);
}
// Read the matrix data back to the host
clEnqueueReadBuffer(commandQueue, matrixMem, CL_TRUE, 0,
matrix.length * Sizeof.cl_int, Pointer.to(matrix), 0, null, null);
// Release the memory objects
clReleaseMemObject(matrixMem);
clReleaseMemObject(s0Mem);
clReleaseMemObject(s1Mem);
clReleaseMemObject(subIndicesMem);
AlignmentMatrix alignmentMatrix = new AlignmentMatrix();
alignmentMatrix.matrix = matrix;
alignmentMatrix.sizeX = sizeX;
alignmentMatrix.sizeY = sizeY;
alignmentMatrix.s0 = s0;
alignmentMatrix.s1 = s1;
return alignmentMatrix;
}
/**
* Returns the alignment matrix for the given strings, computed in
* plain Java
*
* @param string0 The first string
* @param string1 The second string
* @return The alignment matrix
*/
private static AlignmentMatrix computeAlignmentMatrixHost(
String string0, String string1)
{
int sizeX = string0.length()+1;
int sizeY = string1.length()+1;
int matrix[] = new int[sizeX*sizeY];
byte s0[] = string0.getBytes();
byte s1[] = string1.getBytes();
Utils.initMatrix(matrix, sizeX, sizeY);
fillAlignmentMatrixHost(matrix, sizeX, sizeY, s0, s1);
AlignmentMatrix alignmentMatrix = new AlignmentMatrix();
alignmentMatrix.matrix = matrix;
alignmentMatrix.sizeX = sizeX;
alignmentMatrix.sizeY = sizeY;
alignmentMatrix.s0 = s0;
alignmentMatrix.s1 = s1;
return alignmentMatrix;
}
/**
* Fill the given alignment matrix by computing the matching between
* the given strings.
*
* @param matrix The matrix
* @param sizeX The size of the matrix
* @param sizeY The size of the matrix
* @param s0 The first string
* @param s1 The second string
*/
private static void fillAlignmentMatrixHost(
int matrix[], int sizeX, int sizeY, byte s0[], byte s1[])
{
for (int xx = 0; xx < sizeX-1; xx++)
{
int x = xx + 1;
for (int yy = 0; yy < sizeY-1; yy++)
{
int y = yy + 1;
int index = x+y*sizeX;
int nw = matrix[xx+yy*sizeX] + match(s0[xx], s1[yy]);
int n = matrix[x +yy*sizeX] + -1;
int w = matrix[xx+y *sizeX] + -1;
int max = 0;
max = nw > max ? nw : max;
max = n > max ? n : max;
max = w > max ? w : max;
// System.out.println("Compute at "+x+" "+y+" with "+(char)s0[xx]+" and "+(char)s1[yy]);
// System.out.println(" "+xx+", "+yy+" value "+nw);
// System.out.println(" "+x +", "+yy+" value "+n );
// System.out.println(" "+xx+", "+y +" value "+ w);
matrix[index] = max;
}
}
}
/**
* Returns the value for a match/mismatch between the given
* characters.
*
* @param b0 The first character
* @param b1 The second character
* @return The match/mismatch value
*/
static int match(byte b0, byte b1)
{
if (b0 == b1)
{
return MATCH_VALUE;
}
return MISMATCH_VALUE;
}
}
/*
* JOCL - Java bindings for OpenCL
*
* Copyright 2014 Marco Hutter - http://www.jocl.org/
*/
package org.jocl.smithwaterman;
import static org.jocl.CL.clGetDeviceInfo;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;
import java.util.Random;
import org.jocl.CL;
import org.jocl.Pointer;
import org.jocl.Sizeof;
import org.jocl.cl_device_id;
import org.jocl.smithwaterman.JOCLSmithWaterman.AlignmentMatrix;
/**
* Utility methods for the Smith-Waterman JOCL sample
*/
class Utils
{
/**
* Create a string with the specified length, consisting of random letters
* from ['A' ... 'Z']
*
* @param length The length of the string
* @param random The random number generator
* @return The string
*/
static String createRandomString(int length, Random random)
{
StringBuilder sb = new StringBuilder();
for (int i=0; i< length; i++)
{
sb.append((char)('A'+random.nextInt(26)));
}
return sb.toString();
}
/**
* Reads the contents of the file with the given name, and returns
* it as a string. If there is any IO error, then an error message
* will be printed and <code>null</code> will be returned.
*
* @param fileName The name of the file to read
* @return The contents of the file, or <code>null</code> if an error
* occurred
*/
static String read(String fileName)
{
InputStream inputStream = null;
try
{
inputStream = new FileInputStream(fileName);
String result = read(inputStream);
return result;
}
catch (IOException e)
{
e.printStackTrace();
return null;
}
finally
{
if (inputStream != null)
{
try
{
inputStream.close();
}
catch (IOException e)
{
e.printStackTrace();
}
}
}
}
/**
* Reads the lines that are contained in the given input stream and
* returns them as a string. The caller is responsible for closing
* the given stream.
*
* @param inputStream The input stream to read.
* @return The contents of the stream
* @throws IOException If an IO error occurs
*/
static String read(InputStream inputStream) throws IOException
{
BufferedReader br = new BufferedReader(
new InputStreamReader(inputStream));
StringBuilder sb = new StringBuilder();
String line = null;
while (true)
{
line = br.readLine();
if (line == null)
{
break;
}
sb.append(line+"
");
}
return sb.toString();
}
/**
* Returns the CL_DEVIVE_MAX_WORK_GROUP_SIZE of the given device
*
* @param device The device
* @return The maximum work group size
*/
static int getMaxWorkGroupSize(cl_device_id device)
{
ByteBuffer buffer = ByteBuffer.
allocate(Sizeof.size_t).
order(ByteOrder.nativeOrder());
clGetDeviceInfo(device, CL.CL_DEVICE_MAX_WORK_GROUP_SIZE,
Sizeof.size_t, Pointer.to(buffer), null);
if (Sizeof.size_t == 4)
{
return buffer.getInt();
}
else
{
return (int)buffer.getLong();
}
}
/**
* Initialize the alignment matrix:
* <ul>
* <li>Fill it with all zeros</li>
* <li>
* Then fill the first row with values 0*PREFIX_VALUE,
* 1*PREFIX_VALUE, ... (sizeX-1)*PREFIX_VALUE
* </li>
* <li>
* Then fill the first column with values 0*PREFIX_VALUE,
* 1*PREFIX_VALUE, ... (sizeY-1)*PREFIX_VALUE
* </li>
* </ul>
*
* @param matrix The matrix
* @param sizeX The size of the matrix
* @param sizeY The size of the matrix
*/
static void initMatrix(int matrix[], int sizeX, int sizeY)
{
Arrays.fill(matrix, 0);
int x = 0;
int y = 0;
for (x=0; x<sizeX; x++)
{
int index = x + y * sizeX;
matrix[index] = x * JOCLSmithWaterman.PREFIX_VALUE;
}
x = 0;
for (y=0; y<sizeY; y++)
{
int index = x + y * sizeX;
matrix[index] = y * JOCLSmithWaterman.PREFIX_VALUE;
}
}
/**
* Print the given alignment matrix
*
* @param alignmentMatrix The alignment matrix
*/
static void print(AlignmentMatrix alignmentMatrix)
{
print(
alignmentMatrix.matrix,
alignmentMatrix.sizeX,
alignmentMatrix.sizeY,
alignmentMatrix.s0,
alignmentMatrix.s1);
}
/**
* Print the alignment matrix for the given strings. The strings are
* assumed to be padded so that s0.length==sizeX and s1.length==sizeY.
*
* @param matrix The matrix
* @param sizeX The size of the matrix
* @param sizeY The size of the matrix
* @param s0 The first string
* @param s1 The second string
*/
static void print(
int[] matrix, int sizeX,
int sizeY, byte s0[], byte s1[])
{
int cellSize = 4;
String format = "%"+cellSize+"s";
System.out.printf(format, "");
for (int x=0; x<sizeX; x++)
{
char c = ' ';
if (x > 0)
{
c = (char) s0[x-1];
if (c == 0)
{
c = '.';
}
}
System.out.printf(format, String.valueOf(c));
}
System.out.println();
for (int y=0; y<sizeY; y++)
{
char c = ' ';
if (y > 0)
{
c = (char) s1[y-1];
if (c == 0)
{
c = '.';
}
}
System.out.printf(format, String.valueOf(c));
for (int x=0; x<sizeX; x++)
{
int index = x + y*sizeX;
int value = matrix[index];
System.out.printf(format, String.valueOf(value));
}
System.out.println();
}
}
/**
* Checks whether the given alignment matrices are equal. That is,
* whether they contain the same values in the range that is present
* in both matrices. (If one of the matrices is padded to be larger
* than the other, then the padded part is ignored)
*
* @param a0 The first matrix
* @param a1 The second matrix
* @return Whether the matrices are equal
*/
static boolean equal(AlignmentMatrix a0, AlignmentMatrix a1)
{
int minSizeX = Math.min(a0.sizeX, a1.sizeX);
int minSizeY = Math.min(a0.sizeY, a1.sizeY);
for (int x=0; x<minSizeX; x++)
{
for (int y=0; y<minSizeY; y++)
{
int index0 = x + y * a0.sizeX;
int index1 = x + y * a1.sizeX;
int value0 = a0.matrix[index0];
int value1 = a1.matrix[index1];
if (value0 != value1)
{
System.out.println("At "+x+" "+y+" found values "+
value0+" and "+value1);
return false;
}
}
}
return true;
}
/**
* Private constructor to prevent instantiation
*/
private Utils()
{
// Private constructor to prevent instantiation
}
}
smithWatermanKernel.cl:
/*
* JOCL - Java bindings for OpenCL
*
* Copyright 2014 Marco Hutter - http://www.jocl.org/
*/
// Originally, this was based on
// https://github.com/l-urence/smith-waterman/blob/master/src/kernel.cl
// but heavily modified
#define MATCH 2
#define MISMATCH -1
#define INDEL -1
int match(char c0, char c1)
{
if (c0 == c1)
{
return MATCH;
}
return MISMATCH;
}
__kernel void smithWaterman (
__global int *matrix,
int sizeX,
int sizeY,
__global int *subIndices,
int subSize,
__global char *s0,
__global char *s1)
{
int gid = get_global_id(0);
int startX = subIndices[gid+gid+0];
int startY = subIndices[gid+gid+1];
for (int xx = startX; xx < startX + subSize; xx++)
{
int x = xx + 1;
if (x < sizeX)
{
for (int yy = startY; yy < startY + subSize; yy++)
{
int y = yy + 1;
if (y < sizeY)
{
int index = x+y*sizeX;
int nw = matrix[xx+yy*sizeX] + match(s0[xx], s1[yy]);
int n = matrix[x+ yy*sizeX] + INDEL;
int w = matrix[xx+y *sizeX] + INDEL;
int max = 0;
max = nw > max ? nw : max;
max = n > max ? n : max;
max = w > max ? w : max;
matrix[index] = max;
}
}
}
}
}