Let’s take our focus off of the external stuff :). That’s distracting from the actual question. Here’s the code without the kernel launcher.

I think my main confusion may be coming from where you use the JIT compiler options versus what you compile on the command line.

```
// Link the compiled device code with the driver-API linker and load the
// resulting image as a module.
JITOptions jitOptions = new JITOptions();
CUlinkState state = new CUlinkState();
cuLinkCreate(jitOptions, state);
// Tried with AND without this.
// NOTE(review): the device runtime library is normally only required when the
// device code itself uses the CUDA device runtime (e.g. launches kernels from
// kernels) - confirm whether these kernels need it at all.
JCudaDriver.cuLinkAddFile(state, CUjitInputType.CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", jitOptions);
// The file produced by nvcc (see the command line at the end of the post).
cuLinkAddFile(state, CUjitInputType.CU_JIT_INPUT_CUBIN, moduleFileName, jitOptions);
// cuLinkComplete writes the image size into sz[0] and points `image` at the
// linked image.
long sz[] = new long[1];
Pointer image = new Pointer();
cuLinkComplete(state, image, sz);
// JCuda's module handle class is CUmodule (lower-case 'm'); CUModule does not exist.
CUmodule module = new CUmodule();
// Load with zero JIT options (empty option and option-value arrays).
cuModuleLoadDataEx(module, image,
        0, new int[0], Pointer.to(new int[0]));
// The linked image is owned by the link state, so the module has to be loaded
// before the state is destroyed - keep this call last.
cuLinkDestroy(state);
```

Here are the cuda kernels:

pairwise_transform.h

```
#include <math.h>
// Element-wise operation hooks applied to x[i] and y[i].
// Only declarations are given in this header; no definition of either
// template is visible here.
//
// Binary form: takes one value from each input plus `params`
// (presumably extra operation arguments - confirm against callers).
template <typename T>
__device__ T op(T d1,T d2,T *params);
// Unary form: takes a single input value plus `params`.
template <typename T>
__device__ T op(T d1,T *params);
/**
 * Applies `op` element-wise over strided inputs and writes the results to
 * `result`.
 *
 * For each logical index i in [0, n):
 *   - incy == 0 : result[i * incz] = op(dx[i * incx], params)
 *                 (unary op; executed serially by thread 0 of block 0 only)
 *   - otherwise : result[i * incz] = op(dx[i * incx], dy[i * incy], params)
 *                 (indices distributed over all threads with a grid-stride loop)
 *
 * The calls to `op` depend on T, so they are resolved against the `op`
 * declarations visible at this point in the header.
 *
 * @param n         number of elements to process
 * @param xOffset   accepted but not read by this function
 * @param yOffset   accepted but not read by this function
 * @param dx        first input
 * @param dy        second input (not read when incy == 0)
 * @param incx      element stride applied to dx
 * @param incy      element stride applied to dy; 0 selects the unary path
 * @param params    forwarded unchanged to op
 * @param result    output, written with stride incz
 * @param incz      element stride applied to result
 * @param blockSize accepted but not read by this function
 *
 * NOTE(review): xOffset / yOffset are ignored here - confirm that callers
 * pass pointers that already include any offset.
 */
template <typename T>
__device__ void transform(int n,int xOffset,int yOffset, T *dx, T *dy,int incx,int incy,T *params,T *result,int incz,int blockSize) {
    int totalThreads = gridDim.x * blockDim.x;
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + tid;

    if (incy == 0) {
        /* unary path: a single thread walks the whole range */
        if ((blockIdx.x == 0) && (tid == 0)) {
            for (; i < n; i++) {
                result[i * incz] = op(dx[i * incx],params);
            }
        }
    } else if ((incx == incy) && (incx > 0)) {
        /* equal, positive, increments */
        if (incx == 1) {
            /* both increments equal to 1: index the inputs directly */
            for (; i < n; i += totalThreads) {
                result[i * incz] = op(dx[i],dy[i],params);
            }
        } else {
            /* equal, positive, non-unit increments. */
            for (; i < n; i += totalThreads) {
                result[i * incz] = op(dx[i * incx],dy[i * incy],params);
            }
        }
    } else {
        /* unequal or nonpositive increments */
        for (; i < n; i += totalThreads) {
            result[i * incz] = op(dx[i * incx],dy[i * incy],params);
        }
    }
}
```

add_strided.cu

```
#include <pairwise_transform.h>
// Definitions of the `op` templates declared in pairwise_transform.h, for
// double and float.
//
// These are written as explicit specializations rather than plain overloads.
// The calls to `op` inside transform<T> are dependent calls whose arguments
// are fundamental types, so ordinary functions declared after the template
// definition are not considered when transform<T> is instantiated. A plain
// overload defined here would leave transform<T> bound to the undefined
// primary template (an unresolved device function at link time).
// Specializing the primary templates supplies the definition the call
// actually refers to.

// Binary op (double): sum of the two inputs; params is not used.
template <>
__device__ double op<double>(double d1,double d2,double *params) {
    return d1 + d2;
}

// Unary op (double): returns the input unchanged; params is not used.
template <>
__device__ double op<double>(double d1,double *params) {
    return d1;
}

// Binary op (float): sum of the two inputs; params is not used.
template <>
__device__ float op<float>(float d1,float d2,float *params) {
    return d1 + d2;
}

// Unary op (float): returns the input unchanged; params is not used.
template <>
__device__ float op<float>(float d1,float *params) {
    return d1;
}
// Kernel entry point for double data: passes every argument through,
// unchanged and in the same order, to transform<double>.
__global__ void add_strided_double(int n,int xOffset,int yOffset, double *dx, double *dy,int incx,int incy,double *params,double *result,int incz,int blockSize) {
    transform<double>(
        n, xOffset, yOffset,
        dx, dy,
        incx, incy,
        params,
        result, incz,
        blockSize);
}
// Kernel entry point for float data: passes every argument through,
// unchanged and in the same order, to transform<float>.
__global__ void add_strided_float(int n,int xOffset,int yOffset, float *dx, float *dy,int incx,int incy,float *params,float *result,int incz,int blockSize) {
    transform<float>(
        n, xOffset, yOffset,
        dx, dy,
        incx, incy,
        params,
        result, incz,
        blockSize);
}
```

I’m compiling it with:

`nvcc -O3 -cubin -rdc=true -gencode arch=compute_52,code=sm_52 -dc add_strided.cu -lcudadevrt -lcudart -o add_strided.cubin`