jcuda.driver.CUdeviceptr Java Examples

Source File: JCudaTestUtils.java From jcuda with MIT License

6 votes

/**
 * Checks whether two device pointers refer to the same memory address.<br>
 * <br>
 * <b>NOTE:</b><br>
 * <br>
 * This is NOT a general-purpose way of comparing arbitrary pointers.
 * Pointer equality is a subtle concept and is, by default, NOT
 * implemented in the pointer classes. This helper exists SOLELY for
 * the test cases that call it.
 * 
 * @param p0 The first pointer
 * @param p1 The second pointer
 * @return Whether both pointers have the same native address
 */
static boolean equal(CUdeviceptr p0, CUdeviceptr p1)
{
    // Local subclass that copies a pointer and re-declares
    // getNativePointer() as public, so that the raw address of the
    // copy can be read from this method.
    final class AddressView extends CUdeviceptr
    {
        AddressView(CUdeviceptr source)
        {
            super(source);
        }

        @Override
        public long getNativePointer()
        {
            return super.getNativePointer();
        }
    }

    AddressView first = new AddressView(p0);
    AddressView second = new AddressView(p1);
    long firstAddress = first.getNativePointer();
    long secondAddress = second.getNativePointer();
    return firstAddress == secondAddress;
}

Source File: GPUHelper.java From Juicebox with MIT License

5 votes

/**
 * Allocates device memory large enough for the given host array and
 * copies the array contents into it.
 * <p>
 * The returned pointer is not freed by this method.
 *
 * @param input The host data to upload
 * @return The device pointer to the uploaded copy of the data
 */
public static CUdeviceptr allocateInput(float[] input) {
    Pointer ptr = Pointer.to(input);
    // One byte count, computed in long arithmetic, is used for both the
    // allocation and the copy so that the two can never disagree and a
    // very large array cannot overflow an int multiplication.
    long numBytes = (long) input.length * Sizeof.FLOAT;
    CUdeviceptr dInput = new CUdeviceptr();
    cuMemAlloc(dInput, numBytes);
    cuMemcpyHtoD(dInput, ptr, numBytes);
    return dInput;
}

Source File: JCudaDriverBasicGraphExample.java From jcuda-samples with MIT License

5 votes

/**
 * Create device data containing the given float value, the given number
 * of times.
 * <p>
 * The data is first built in a host array and then copied to a newly
 * allocated device buffer. The returned pointer is not freed by this
 * method.
 * 
 * @param numElements The number of elements
 * @param value The value of the elements
 * @return The pointer to the data
 */
private static CUdeviceptr createDeviceData(int numElements, float value)
{
    float hostData[] = new float[numElements];
    java.util.Arrays.fill(hostData, value);

    // Byte count in long arithmetic, so that a large element count
    // cannot overflow the int multiplication
    long numBytes = (long) numElements * Sizeof.FLOAT;
    CUdeviceptr deviceData = new CUdeviceptr();
    cuMemAlloc(deviceData, numBytes);
    cuMemcpyHtoD(deviceData, Pointer.to(hostData), numBytes);
    return deviceData;
}

Source File: JCudaReduction.java From jcuda-samples with MIT License

5 votes

/**
 * Sets up the CUDA-side state used by this sample: the context, the
 * module with the compiled kernel, the handle of the "reduce" kernel
 * function and a temporary device buffer.
 */
private static void init()
{
    // Bring up the driver API and create the context on device 0
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    context = new CUcontext();
    cuCtxCreate(context, 0, firstDevice);

    // Have NVCC turn the kernel source into a PTX file
    String kernelSourcePath =
        "src/main/resources/kernels/JCudaReductionKernel.cu";
    String ptxPath = JCudaSamplesUtils.preparePtxFile(kernelSourcePath);

    // Load that PTX file as a module
    module = new CUmodule();
    cuModuleLoad(module, ptxPath);

    // Look up the "reduce" kernel inside the module
    function = new CUfunction();
    cuModuleGetFunction(function, module, "reduce");

    // Temporary memory for the reduction; it has to hold at least
    // numberOfBlocks * Sizeof.FLOAT bytes
    int temporaryBytes = 1024 * Sizeof.FLOAT;
    deviceBuffer = new CUdeviceptr();
    cuMemAlloc(deviceBuffer, temporaryBytes);
}

Source File: GPUHelper.java From JuiceboxLegacy with MIT License

5 votes

/**
 * Allocates device memory large enough for the given host array and
 * copies the array contents into it.
 * <p>
 * The returned pointer is not freed by this method.
 *
 * @param input The host data to upload
 * @return The device pointer to the uploaded copy of the data
 */
public static CUdeviceptr allocateInput(float[] input) {
    Pointer ptr = Pointer.to(input);
    // One byte count, computed in long arithmetic, is used for both the
    // allocation and the copy so that the two can never disagree and a
    // very large array cannot overflow an int multiplication.
    long numBytes = (long) input.length * Sizeof.FLOAT;
    CUdeviceptr dInput = new CUdeviceptr();
    cuMemAlloc(dInput, numBytes);
    cuMemcpyHtoD(dInput, ptr, numBytes);
    return dInput;
}

Source File: CUDAInnerLoop.java From ocular with GNU General Public License v3.0

5 votes

/**
 * Stores the template data and size parameters on this instance and
 * allocates the device buffers: two observation buffers (d_Ow, d_Ob),
 * the score buffer (d_scores) and, for every template width that has
 * at least one index, a device copy of the white and black templates.
 *
 * @param whiteTemplates The white template data, one array per template width
 * @param blackTemplates The black template data, one array per template width
 * @param templateNumIndices The number of indices per template width;
 *        widths with a count of 0 get no device template buffers
 * @param templateIndicesOffsets The per-width index offsets (only stored here)
 * @param minTemplateWidth The smallest template width
 * @param maxTemplateWidth The largest template width
 * @param maxSequenceLength The maximum sequence length
 * @param totalTemplateNumIndices The total number of template indices
 */
public void startup(float[][] whiteTemplates, float[][] blackTemplates, int[] templateNumIndices, int[] templateIndicesOffsets, int minTemplateWidth, int maxTemplateWidth, int maxSequenceLength, int totalTemplateNumIndices) {
	this.whiteTemplates = whiteTemplates;
	this.blackTemplates = blackTemplates;
	this.templateNumIndices = templateNumIndices;
	this.templateIndicesOffsets = templateIndicesOffsets;
	this.maxTemplateWidth = maxTemplateWidth;
	this.minTemplateWidth = minTemplateWidth;
	this.totalTemplateNumIndices = totalTemplateNumIndices;
	
	int numTemplateWidths = (maxTemplateWidth-minTemplateWidth)+1;
	// maxSequenceLength rounded up to the next multiple of BLOCK_SIZE_X*ROLL_X
	int extendedMaxSeqLength = (BLOCK_SIZE_X*ROLL_X) * (int) Math.ceil(((double) maxSequenceLength) / (BLOCK_SIZE_X*ROLL_X));
	
	// All byte counts are computed in long arithmetic: the products of
	// sequence length, index count and element size can exceed the int range.
	long observationBytes = (long) (extendedMaxSeqLength+maxTemplateWidth-1) * CharacterTemplate.LINE_HEIGHT * Sizeof.FLOAT;
	long scoreBytes = (long) maxSequenceLength * totalTemplateNumIndices * Sizeof.FLOAT;
	
	this.d_Ow = new CUdeviceptr();
	cuMemAlloc(d_Ow, observationBytes);
	this.d_Ob = new CUdeviceptr();
	cuMemAlloc(d_Ob, observationBytes);
	this.d_scores = new CUdeviceptr();
	cuMemAlloc(d_scores, scoreBytes);
	this.d_Tw = new CUdeviceptr[numTemplateWidths];
	this.d_Tb = new CUdeviceptr[numTemplateWidths];
	// widthIndex is (template width - minTemplateWidth)
	for (int widthIndex=0; widthIndex<numTemplateWidths; ++widthIndex) {
		if (templateNumIndices[widthIndex] > 0) {
			long whiteBytes = (long) whiteTemplates[widthIndex].length * Sizeof.FLOAT;
			d_Tw[widthIndex] = new CUdeviceptr();
			cuMemAlloc(d_Tw[widthIndex], whiteBytes);
			cuMemcpyHtoD(d_Tw[widthIndex], Pointer.to(whiteTemplates[widthIndex]), whiteBytes);
			
			long blackBytes = (long) blackTemplates[widthIndex].length * Sizeof.FLOAT;
			d_Tb[widthIndex] = new CUdeviceptr();
			cuMemAlloc(d_Tb[widthIndex], blackBytes);
			cuMemcpyHtoD(d_Tb[widthIndex], Pointer.to(blackTemplates[widthIndex]), blackBytes);
		}
	}
}

Source File: Context.java From OSPREY3 with GNU General Public License v2.0

4 votes

/**
 * Releases the given device buffer by passing it to
 * {@code JCudaDriver.cuMemFree}.
 *
 * @param pdBuf The device pointer to free
 */
public void free(CUdeviceptr pdBuf) {
	JCudaDriver.cuMemFree(pdBuf);
}

Source File: JCudaDriverMemRangeTest.java From jcuda with MIT License

4 votes

/**
 * Queries four memory range attributes of a small managed allocation,
 * one call to cuMemRangeGetAttribute per attribute. Exceptions are
 * enabled, so a failing driver call surfaces as an exception; no
 * assertions are made on the returned values.
 */
@Test
public void testMemRangeAttribute()
{
    JCudaDriver.setExceptionsEnabled(true);
    
    cuInit(0);
    CUcontext context = new CUcontext();
    CUdevice device = new CUdevice();
    cuDeviceGet(device, 0);
    cuCtxCreate(context, 0, device);
    
    int size = 64;
    CUdeviceptr deviceData = new CUdeviceptr();
    cuMemAllocManaged(deviceData, size, CU_MEM_ATTACH_HOST);
    
    // Result arrays, pre-filled with a recognizable dummy value
    int readMostly[] = { 12345 };
    int lastPrefetchLocation[] = { 12345 };
    int preferredLocation[] = { 12345 };
    int accessedBy[] = { 12345, 12345, 12345 };
    
    cuMemRangeGetAttribute(Pointer.to(readMostly), Sizeof.INT, 
        CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, deviceData, size);

    cuMemRangeGetAttribute(Pointer.to(lastPrefetchLocation), Sizeof.INT, 
        CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, deviceData, size);

    cuMemRangeGetAttribute(Pointer.to(preferredLocation), Sizeof.INT, 
        CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, deviceData, size);

    cuMemRangeGetAttribute(
        Pointer.to(accessedBy), Sizeof.INT * accessedBy.length, 
        CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, deviceData, size);

    // Debug output, disabled by default
    boolean printResults = false;
    //printResults = true;
    if (printResults)
    {
        // Each label prints the array that was filled for that attribute
        System.out.println("readMostly          : " + 
            Arrays.toString(readMostly));
        System.out.println("lastPrefetchLocation: " + 
            Arrays.toString(lastPrefetchLocation));
        System.out.println("preferredLocation   : " + 
            Arrays.toString(preferredLocation));
        System.out.println("accessedBy          : " + 
            Arrays.toString(accessedBy));
    }
}

Source File: JCudaDriverMemRangeTest.java From jcuda with MIT License

4 votes

/**
 * Queries four memory range attributes of a small managed allocation
 * with a single call to cuMemRangeGetAttributes. Exceptions are
 * enabled, so a failing driver call surfaces as an exception; no
 * assertions are made on the returned values.
 */
@Test
public void testMemRangeAttributes()
{
    JCudaDriver.setExceptionsEnabled(true);
    
    cuInit(0);
    CUcontext context = new CUcontext();
    CUdevice device = new CUdevice();
    cuDeviceGet(device, 0);
    cuCtxCreate(context, 0, device);
    
    int size = 64;
    CUdeviceptr deviceData = new CUdeviceptr();
    cuMemAllocManaged(deviceData, size, CU_MEM_ATTACH_HOST);
    
    // Result arrays, pre-filled with a recognizable dummy value
    int readMostly[] = { 12345 };
    int lastPrefetchLocation[] = { 12345 };
    int preferredLocation[] = { 12345 };
    int accessedBy[] = { 12345, 12345, 12345 };
    
    // The three arrays below are parallel: entry i of each one
    // describes the same attribute query
    Pointer data[] =  
    {
        Pointer.to(readMostly),
        Pointer.to(lastPrefetchLocation),
        Pointer.to(preferredLocation),
        Pointer.to(accessedBy) 
    };
    long dataSizes[] = 
    {
        Sizeof.INT, 
        Sizeof.INT, 
        Sizeof.INT, 
        Sizeof.INT * accessedBy.length
    };
    int attributes[] =  
    {
        CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
        CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
        CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION,
        CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY,
    };
    cuMemRangeGetAttributes(data, dataSizes, 
        attributes, attributes.length, deviceData, size);
    
    // Debug output, disabled by default
    boolean printResults = false;
    //printResults = true;
    if (printResults)
    {
        // Each label prints the array that was filled for that attribute
        System.out.println("readMostly          : " + 
            Arrays.toString(readMostly));
        System.out.println("lastPrefetchLocation: " + 
            Arrays.toString(lastPrefetchLocation));
        System.out.println("preferredLocation   : " + 
            Arrays.toString(preferredLocation));
        System.out.println("accessedBy          : " + 
            Arrays.toString(accessedBy));
    }
}

Source File: GPUTesting.java From JuiceboxLegacy with MIT License

4 votes

/**
 * Compiles a small vector-addition kernel, runs it on two ten-element
 * inputs and prints the result, logging each step to standard output.
 */
public static void test() {
    JCudaDriver.setExceptionsEnabled(true);

    // Kernel source: thread i writes a[i] + b[i] to result[i]
    String sourceCode =
            "extern \"C\"\n"
            + "__global__ void add(float *result, float *a, float *b)\n"
            + "{\n"
            + "    int i = threadIdx.x;\n"
            + "    result[i] = a[i] + b[i];\n"
            + "}";

    // Compile the kernel
    System.out.println("Preparing the KernelLauncher...");
    KernelLauncher launcher = KernelLauncher.compile(sourceCode, "add");

    // Host-side input: both vectors hold 0, 1, 2, ...
    System.out.println("Creating input data...");
    int numElements = 10;
    float hostResult[] = new float[numElements];
    float hostA[] = new float[numElements];
    float hostB[] = new float[numElements];
    for (int k = 0; k < numElements; k++) {
        hostA[k] = k;
        hostB[k] = k;
    }

    // Device buffers: one for the output, plus uploaded copies of
    // the two inputs
    System.out.println("Initializing device memory...");
    CUdeviceptr deviceResult =
            GPUHelper.allocateOutput(numElements, Sizeof.FLOAT);
    CUdeviceptr deviceA = GPUHelper.allocateInput(hostA);
    CUdeviceptr deviceB = GPUHelper.allocateInput(hostB);

    // Launch with one thread per element
    System.out.println("Calling the kernel...");
    launcher.setBlockSize(numElements, 1, 1);
    launcher.call(deviceResult, deviceA, deviceB);

    // Download the output
    System.out.println("Obtaining results...");
    int resultBytes = numElements * Sizeof.FLOAT;
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, resultBytes);

    System.out.println("Result: " + Arrays.toString(hostResult));

    // Release the device buffers
    cuMemFree(deviceA);
    cuMemFree(deviceB);
    cuMemFree(deviceResult);
}

Source File: GPUHelper.java From JuiceboxLegacy with MIT License

4 votes

/**
 * Allocates an uninitialized device buffer for {@code size} elements of
 * {@code typeSize} bytes each.
 * <p>
 * The returned pointer is not freed by this method.
 *
 * @param size The number of elements
 * @param typeSize The size of one element, in bytes
 * @return The device pointer to the allocated buffer
 */
public static CUdeviceptr allocateOutput(int size, int typeSize) {
    CUdeviceptr dOutput = new CUdeviceptr();
    // Multiply in long arithmetic so that large buffers cannot
    // overflow the int range
    cuMemAlloc(dOutput, (long) size * typeSize);
    return dOutput;
}

Source File: GPUHelper.java From JuiceboxLegacy with MIT License

4 votes

/**
 * Frees every device pointer in the given array, in array order.
 *
 * @param pointers The device pointers to pass to cuMemFree
 */
public static void freeUpMemory(CUdeviceptr[] pointers) {
    for (int index = 0; index < pointers.length; index++) {
        cuMemFree(pointers[index]);
    }
}

Source File: Context.java From OSPREY3 with GNU General Public License v2.0

4 votes

/**
 * Allocates a device buffer of the given size via
 * {@code JCudaDriver.cuMemAlloc}.
 *
 * @param numBytes The size of the buffer, in bytes
 * @return The device pointer to the allocated buffer
 */
public CUdeviceptr malloc(long numBytes) {
	CUdeviceptr devicePointer = new CUdeviceptr();
	JCudaDriver.cuMemAlloc(devicePointer, numBytes);
	return devicePointer;
}

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Test the 3D float4 texture access.
 * <p>
 * Uploads the host input into a 3D CUDA array with four float channels,
 * binds the array to the texture reference "texture_float4_3D", runs the
 * kernel "test_float4_3D" with a single thread and compares the four
 * floats read back with the expected values.
 *
 * @return Whether the kernel output matched the expected values exactly
 */
private boolean test_float4_3D()
{
    // Bytes per input row (four floats per texel) and bytes of output
    long rowBytes = sizeX * Sizeof.FLOAT * 4;
    long outputBytes = Sizeof.FLOAT * 4;

    // Create the array on the device
    CUarray array = new CUarray();
    CUDA_ARRAY3D_DESCRIPTOR descriptor = new CUDA_ARRAY3D_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = sizeY;
    descriptor.Depth = sizeZ;
    descriptor.NumChannels = 4;
    cuArray3DCreate(array, descriptor);

    // Copy the host input to the array
    CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
    copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    copy.srcHost = Pointer.to(input_float4_3D);
    copy.srcPitch = rowBytes;
    copy.srcHeight = sizeY;
    copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    copy.dstArray = array;
    // The height is a row count, so it is sizeY like every other height
    // in this copy. NOTE(review): the CUDA documentation lists dstHeight
    // as ignored for array destinations - confirm.
    copy.dstHeight = sizeY;
    copy.WidthInBytes = rowBytes;
    copy.Height = sizeY;
    copy.Depth = sizeZ;
    cuMemcpy3D(copy);

    // Set up the texture reference
    CUtexref texref = new CUtexref();
    cuModuleGetTexRef(texref, module, "texture_float4_3D");
    cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 4);
    cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);

    // Prepare the output device memory
    CUdeviceptr dOutput = new CUdeviceptr();
    cuMemAlloc(dOutput, outputBytes);

    // Obtain the test function
    CUfunction function = new CUfunction();
    cuModuleGetFunction(function, module, "test_float4_3D");

    // Set up the kernel parameters: output pointer and lookup position
    Pointer kernelParameters = Pointer.to(
        Pointer.to(dOutput),
        Pointer.to(new float[]{ posX }),
        Pointer.to(new float[]{ posY }),
        Pointer.to(new float[]{ posZ })
    );

    // Call the kernel function with a single thread and wait for it
    cuLaunchKernel(function, 1, 1, 1, 
        1, 1, 1, 0, null, kernelParameters, null);
    cuCtxSynchronize();

    // Obtain the output on the host
    float hOutput[] = new float[4];
    cuMemcpyDtoH(Pointer.to(hOutput), dOutput, outputBytes);

    // Print the results
    log("Result float4 3D " + Arrays.toString(hOutput));
    float expected[] = new float[]{ 3.5f, 3.5f, 3.5f, 3.5f };
    boolean passed = Arrays.equals(hOutput, expected);
    log("Test   float4 3D " + (passed ? "PASSED" : "FAILED"));

    // Clean up
    cuArrayDestroy(array);
    cuMemFree(dOutput);

    return passed;
}

Source File: Context.java From OSPREY3 with GNU General Public License v2.0

4 votes

/**
 * Copies data from a host buffer to a device buffer by calling
 * {@code JCudaDriver.cuMemcpyHtoDAsync} with the CUDA stream of the
 * given {@code GpuStream}.
 *
 * @param pdBuf The destination device pointer
 * @param phBuf The source host pointer
 * @param numBytes The number of bytes to copy
 * @param stream The stream whose underlying CUDA stream is used for the copy
 */
public void uploadAsync(CUdeviceptr pdBuf, Pointer phBuf, long numBytes, GpuStream stream) {
	JCudaDriver.cuMemcpyHtoDAsync(pdBuf, phBuf, numBytes, stream.getStream());
}

Source File: Context.java From OSPREY3 with GNU General Public License v2.0

4 votes

/**
 * Copies data from a device buffer to a host buffer by calling
 * {@code JCudaDriver.cuMemcpyDtoHAsync} with the CUDA stream of the
 * given {@code GpuStream}.
 *
 * @param phBuf The destination host pointer
 * @param pdBuf The source device pointer
 * @param numBytes The number of bytes to copy
 * @param stream The stream whose underlying CUDA stream is used for the copy
 */
public void downloadAsync(Pointer phBuf, CUdeviceptr pdBuf, long numBytes, GpuStream stream) {
	JCudaDriver.cuMemcpyDtoHAsync(phBuf, pdBuf, numBytes, stream.getStream());
}

Source File: GPUTesting.java From Juicebox with MIT License

4 votes

/**
 * Compiles a small vector-addition kernel, runs it on two ten-element
 * inputs and prints the result, logging each step to standard output.
 */
public static void test() {
    JCudaDriver.setExceptionsEnabled(true);

    // Kernel source: thread i writes a[i] + b[i] to result[i]
    String sourceCode = String.join("\n",
            "extern \"C\"",
            "__global__ void add(float *result, float *a, float *b)",
            "{",
            "    int i = threadIdx.x;",
            "    result[i] = a[i] + b[i];",
            "}");

    // Compile the kernel
    System.out.println("Preparing the KernelLauncher...");
    KernelLauncher launcher = KernelLauncher.compile(sourceCode, "add");

    // Host-side input: both vectors hold 0, 1, 2, ...
    System.out.println("Creating input data...");
    final int count = 10;
    float[] sum = new float[count];
    float[] left = new float[count];
    float[] right = new float[count];
    for (int idx = 0; idx < count; idx++) {
        left[idx] = idx;
        right[idx] = idx;
    }

    // Device buffers: one for the output, plus uploaded copies of
    // the two inputs
    System.out.println("Initializing device memory...");
    CUdeviceptr dSum = GPUHelper.allocateOutput(count, Sizeof.FLOAT);
    CUdeviceptr dLeft = GPUHelper.allocateInput(left);
    CUdeviceptr dRight = GPUHelper.allocateInput(right);

    // Launch with one thread per element
    System.out.println("Calling the kernel...");
    launcher.setBlockSize(count, 1, 1);
    launcher.call(dSum, dLeft, dRight);

    // Download the output
    System.out.println("Obtaining results...");
    cuMemcpyDtoH(Pointer.to(sum), dSum, count * Sizeof.FLOAT);

    System.out.println("Result: " + Arrays.toString(sum));

    // Release the device buffers
    cuMemFree(dLeft);
    cuMemFree(dRight);
    cuMemFree(dSum);
}

Source File: GPUHelper.java From Juicebox with MIT License

4 votes

/**
 * Allocates an uninitialized device buffer for {@code size} elements of
 * {@code typeSize} bytes each.
 * <p>
 * The returned pointer is not freed by this method.
 *
 * @param size The number of elements
 * @param typeSize The size of one element, in bytes
 * @return The device pointer to the allocated buffer
 */
public static CUdeviceptr allocateOutput(int size, int typeSize) {
    CUdeviceptr dOutput = new CUdeviceptr();
    // Multiply in long arithmetic so that large buffers cannot
    // overflow the int range
    cuMemAlloc(dOutput, (long) size * typeSize);
    return dOutput;
}

Source File: GPUHelper.java From Juicebox with MIT License

4 votes

/**
 * Frees every device pointer in the given array, in array order.
 *
 * @param pointers The device pointers to pass to cuMemFree
 */
public static void freeUpMemory(CUdeviceptr[] pointers) {
    int remaining = pointers.length;
    int next = 0;
    while (remaining-- > 0) {
        cuMemFree(pointers[next++]);
    }
}

Source File: VecFloatSample.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of the sample. Computes cos(x)^2 + sin(y)^2 for 50000
 * elements on the device using the VecFloat operations, compares the
 * result with the same computation on the host and prints whether the
 * test passed.
 *
 * @param args Not used
 */
public static void main(String[] args)
{
    // With exceptions enabled, no return codes need to be checked below
    JCudaDriver.setExceptionsEnabled(true);

    // Driver initialization and a context for the first device
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    CUcontext ctx = new CUcontext();
    cuCtxCreate(ctx, 0, firstDevice);

    // The vector library is initialized after the context exists,
    // because it attaches to the current context
    VecFloat.init();

    // Host input: x[i] = y[i] = i
    final int count = 50000;
    final int numBytes = count * Sizeof.FLOAT;
    float[] hostX = new float[count];
    float[] hostY = new float[count];
    for (int k = 0; k < count; k++)
    {
        float asFloat = (float) k;
        hostX[k] = asFloat;
        hostY[k] = asFloat;
    }

    // Device copies of both inputs, and a buffer for the result
    CUdeviceptr devX = new CUdeviceptr();
    cuMemAlloc(devX, numBytes);
    cuMemcpyHtoD(devX, Pointer.to(hostX), numBytes);

    CUdeviceptr devY = new CUdeviceptr();
    cuMemAlloc(devY, numBytes);
    cuMemcpyHtoD(devY, Pointer.to(hostY), numBytes);

    CUdeviceptr devResult = new CUdeviceptr();
    cuMemAlloc(devResult, numBytes);

    // x = cos(x)^2, y = sin(y)^2, result = x + y
    VecFloat.cos(count, devX, devX);
    VecFloat.mul(count, devX, devX, devX);
    VecFloat.sin(count, devY, devY);
    VecFloat.mul(count, devY, devY, devY);
    VecFloat.add(count, devResult, devX, devY);

    // Download the result
    float[] hostResult = new float[count];
    cuMemcpyDtoH(Pointer.to(hostResult), devResult, numBytes);

    // Compare against the host computation; stop at the first mismatch
    int firstMismatch = -1;
    for (int k = 0; k < count && firstMismatch < 0; k++)
    {
        float expected = (float)(
            Math.cos(hostX[k])*Math.cos(hostX[k])+
            Math.sin(hostY[k])*Math.sin(hostY[k]));
        if (Math.abs(hostResult[k] - expected) > 1e-5)
        {
            System.out.println(
                "At index "+k+ " found "+hostResult[k]+
                " but expected "+expected);
            firstMismatch = k;
        }
    }
    boolean passed = firstMismatch < 0;
    System.out.println("Test "+(passed?"PASSED":"FAILED"));

    // Release the device buffers and shut the vector library down
    cuMemFree(devX);
    cuMemFree(devY);
    cuMemFree(devResult);
    VecFloat.shutdown();
}

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Test the 2D float4 texture access.
 * <p>
 * Uploads the host input into a 2D CUDA array with four float channels,
 * binds the array to the texture reference "texture_float4_2D", runs the
 * kernel "test_float4_2D" with a single thread and compares the four
 * floats read back with the expected values.
 *
 * @return Whether the kernel output matched the expected values exactly
 */
private boolean test_float4_2D()
{
    // Device array: sizeX x sizeY texels of four floats each
    CUarray texArray = new CUarray();
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = sizeY;
    descriptor.NumChannels = 4;
    cuArrayCreate(texArray, descriptor);

    // Host -> array upload
    CUDA_MEMCPY2D upload = new CUDA_MEMCPY2D();
    upload.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    upload.srcHost = Pointer.to(input_float4_2D);
    upload.srcPitch = sizeX * Sizeof.FLOAT * 4;
    upload.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    upload.dstArray = texArray;
    upload.WidthInBytes = sizeX * Sizeof.FLOAT * 4;
    upload.Height = sizeY;
    cuMemcpy2D(upload);

    // Texture reference: linear filtering, clamped in both dimensions,
    // normalized coordinates, bound to the array
    CUtexref texture = new CUtexref();
    cuModuleGetTexRef(texture, module, "texture_float4_2D");
    cuTexRefSetFilterMode(texture, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(texture, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(texture, 1, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(texture, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(texture, CU_AD_FORMAT_FLOAT, 4);
    cuTexRefSetArray(texture, texArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory for the four output floats
    CUdeviceptr deviceOut = new CUdeviceptr();
    cuMemAlloc(deviceOut, Sizeof.FLOAT * 4);

    // Kernel handle
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float4_2D");

    // Kernel arguments: output pointer and lookup position
    Pointer outArg = Pointer.to(deviceOut);
    Pointer xArg = Pointer.to(new float[]{ posX });
    Pointer yArg = Pointer.to(new float[]{ posY });
    Pointer kernelArgs = Pointer.to(outArg, xArg, yArg);

    // Single-thread launch, then wait for completion
    cuLaunchKernel(kernel, 1, 1, 1,
        1, 1, 1, 0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Download the output
    float[] hostOut = new float[4];
    cuMemcpyDtoH(Pointer.to(hostOut), deviceOut, Sizeof.FLOAT * 4);

    // Report and evaluate
    log("Result float4 2D " + Arrays.toString(hostOut));
    float[] reference = { 1.5f, 1.5f, 1.5f, 1.5f };
    boolean passed = Arrays.equals(hostOut, reference);
    log("Test   float4 2D " + (passed ? "PASSED" : "FAILED"));

    // Release device resources
    cuArrayDestroy(texArray);
    cuMemFree(deviceOut);

    return passed;
}

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Test the 1D float4 texture access.
 * <p>
 * Uploads the host input into a CUDA array of height 1 with four float
 * channels, binds the array to the texture reference
 * "texture_float4_1D", runs the kernel "test_float4_1D" with a single
 * thread and compares the four floats read back with the expected
 * values.
 *
 * @return Whether the kernel output matched the expected values exactly
 */
private boolean test_float4_1D()
{
    // Device array: sizeX texels of four floats each, height 1
    CUarray texArray = new CUarray();
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = 1;
    descriptor.NumChannels = 4;
    cuArrayCreate(texArray, descriptor);

    // Host -> array upload, starting at offset 0
    Pointer hostInput = Pointer.to(input_float4_1D);
    cuMemcpyHtoA(texArray, 0, hostInput, sizeX * Sizeof.FLOAT * 4);

    // Texture reference: linear filtering, clamped, normalized
    // coordinates, bound to the array
    CUtexref texture = new CUtexref();
    cuModuleGetTexRef(texture, module, "texture_float4_1D");
    cuTexRefSetFilterMode(texture, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(texture, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(texture, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(texture, CU_AD_FORMAT_FLOAT, 4);
    cuTexRefSetArray(texture, texArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory for the four output floats
    CUdeviceptr deviceOut = new CUdeviceptr();
    cuMemAlloc(deviceOut, Sizeof.FLOAT * 4);

    // Kernel handle
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float4_1D");

    // Kernel arguments: output pointer and lookup position
    Pointer outArg = Pointer.to(deviceOut);
    Pointer xArg = Pointer.to(new float[]{ posX });
    Pointer kernelArgs = Pointer.to(outArg, xArg);

    // Single-thread launch, then wait for completion
    cuLaunchKernel(kernel, 1, 1, 1,
        1, 1, 1, 0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Download the output
    float[] hostOut = new float[4];
    cuMemcpyDtoH(Pointer.to(hostOut), deviceOut, Sizeof.FLOAT * 4);

    // Report and evaluate
    log("Result float4 1D " + Arrays.toString(hostOut));
    float[] reference = { 0.5f, 0.5f, 0.5f, 0.5f };
    boolean passed = Arrays.equals(hostOut, reference);
    log("Test   float4 1D " + (passed ? "PASSED" : "FAILED"));

    // Release device resources
    cuArrayDestroy(texArray);
    cuMemFree(deviceOut);

    return passed;
}

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Test the 3D float texture access.
 * <p>
 * Uploads the host input into a 3D CUDA array with one float channel,
 * binds the array to the texture reference "texture_float_3D", runs the
 * kernel "test_float_3D" with a single thread and compares the float
 * read back with the expected value.
 *
 * @return Whether the kernel output matched the expected value exactly
 */
private boolean test_float_3D()
{
    // Bytes per input row (one float per texel) and bytes of output
    long rowBytes = sizeX * Sizeof.FLOAT;
    long outputBytes = Sizeof.FLOAT * 1;

    // Create the array on the device
    CUarray array = new CUarray();
    CUDA_ARRAY3D_DESCRIPTOR descriptor = new CUDA_ARRAY3D_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = sizeY;
    descriptor.Depth = sizeZ;
    descriptor.NumChannels = 1;
    cuArray3DCreate(array, descriptor);

    // Copy the host input to the array
    CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
    copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    copy.srcHost = Pointer.to(input_float_3D);
    copy.srcPitch = rowBytes;
    copy.srcHeight = sizeY;
    copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    copy.dstArray = array;
    // The height is a row count, so it is sizeY like every other height
    // in this copy. NOTE(review): the CUDA documentation lists dstHeight
    // as ignored for array destinations - confirm.
    copy.dstHeight = sizeY;
    copy.WidthInBytes = rowBytes;
    copy.Height = sizeY;
    copy.Depth = sizeZ;
    cuMemcpy3D(copy);

    // Set up the texture reference
    CUtexref texref = new CUtexref();
    cuModuleGetTexRef(texref, module, "texture_float_3D");
    cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);

    // Prepare the output device memory
    CUdeviceptr dOutput = new CUdeviceptr();
    cuMemAlloc(dOutput, outputBytes);

    // Obtain the test function
    CUfunction function = new CUfunction();
    cuModuleGetFunction(function, module, "test_float_3D");

    // Set up the kernel parameters: output pointer and lookup position
    Pointer kernelParameters = Pointer.to(
        Pointer.to(dOutput),
        Pointer.to(new float[]{ posX }),
        Pointer.to(new float[]{ posY }),
        Pointer.to(new float[]{ posZ })
    );

    // Call the kernel function with a single thread and wait for it
    cuLaunchKernel(function, 1, 1, 1, 
        1, 1, 1, 0, null, kernelParameters, null);
    cuCtxSynchronize();

    // Obtain the output on the host
    float hOutput[] = new float[1];
    cuMemcpyDtoH(Pointer.to(hOutput), dOutput, outputBytes);

    // Print the results
    log("Result float  3D " + Arrays.toString(hOutput));
    float expected[] = new float[]{ 3.5f };
    boolean passed = Arrays.equals(hOutput, expected);
    log("Test   float  3D " + (passed ? "PASSED" : "FAILED"));

    // Clean up
    cuArrayDestroy(array);
    cuMemFree(dOutput);

    return passed;
}

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Test the 2D float texture access.
 * <p>
 * Uploads the host input into a 2D CUDA array with one float channel,
 * binds the array to the texture reference "texture_float_2D", runs the
 * kernel "test_float_2D" with a single thread and compares the float
 * read back with the expected value.
 *
 * @return Whether the kernel output matched the expected value exactly
 */
private boolean test_float_2D()
{
    // Device array: sizeX x sizeY texels of one float each
    CUarray texArray = new CUarray();
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = sizeY;
    descriptor.NumChannels = 1;
    cuArrayCreate(texArray, descriptor);

    // Host -> array upload
    CUDA_MEMCPY2D upload = new CUDA_MEMCPY2D();
    upload.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    upload.srcHost = Pointer.to(input_float_2D);
    upload.srcPitch = sizeX * Sizeof.FLOAT;
    upload.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    upload.dstArray = texArray;
    upload.WidthInBytes = sizeX * Sizeof.FLOAT;
    upload.Height = sizeY;
    cuMemcpy2D(upload);

    // Texture reference: linear filtering, clamped in both dimensions,
    // normalized coordinates, bound to the array
    CUtexref texture = new CUtexref();
    cuModuleGetTexRef(texture, module, "texture_float_2D");
    cuTexRefSetFilterMode(texture, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(texture, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(texture, 1, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(texture, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(texture, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetArray(texture, texArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory for the single output float
    CUdeviceptr deviceOut = new CUdeviceptr();
    cuMemAlloc(deviceOut, Sizeof.FLOAT * 1);

    // Kernel handle
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float_2D");

    // Kernel arguments: output pointer and lookup position
    Pointer outArg = Pointer.to(deviceOut);
    Pointer xArg = Pointer.to(new float[]{ posX });
    Pointer yArg = Pointer.to(new float[]{ posY });
    Pointer kernelArgs = Pointer.to(outArg, xArg, yArg);

    // Single-thread launch, then wait for completion
    cuLaunchKernel(kernel, 1, 1, 1,
        1, 1, 1, 0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Download the output
    float[] hostOut = new float[1];
    cuMemcpyDtoH(Pointer.to(hostOut), deviceOut, Sizeof.FLOAT * 1);

    // Report and evaluate
    log("Result float  2D " + Arrays.toString(hostOut));
    float[] reference = { 1.5f };
    boolean passed = Arrays.equals(hostOut, reference);
    log("Test   float  2D " + (passed ? "PASSED" : "FAILED"));

    // Release device resources
    cuArrayDestroy(texArray);
    cuMemFree(deviceOut);

    return passed;
}

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Test the 1D float texture access.
 * <p>
 * Uploads the host input into a CUDA array of height 1 with one float
 * channel, binds the array to the texture reference "texture_float_1D",
 * runs the kernel "test_float_1D" with a single thread and compares the
 * float read back with the expected value.
 *
 * @return Whether the kernel output matched the expected value exactly
 */
private boolean test_float_1D()
{
    // Device array: sizeX texels of one float each, height 1
    CUarray texArray = new CUarray();
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = 1;
    descriptor.NumChannels = 1;
    cuArrayCreate(texArray, descriptor);

    // Host -> array upload, starting at offset 0
    Pointer hostInput = Pointer.to(input_float_1D);
    cuMemcpyHtoA(texArray, 0, hostInput, sizeX * Sizeof.FLOAT);

    // Texture reference: linear filtering, clamped, normalized
    // coordinates, bound to the array
    CUtexref texture = new CUtexref();
    cuModuleGetTexRef(texture, module, "texture_float_1D");
    cuTexRefSetFilterMode(texture, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(texture, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(texture, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(texture, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetArray(texture, texArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory for the single output float
    CUdeviceptr deviceOut = new CUdeviceptr();
    cuMemAlloc(deviceOut, Sizeof.FLOAT * 1);

    // Kernel handle
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float_1D");

    // Kernel arguments: output pointer and lookup position
    Pointer outArg = Pointer.to(deviceOut);
    Pointer xArg = Pointer.to(new float[]{ posX });
    Pointer kernelArgs = Pointer.to(outArg, xArg);

    // Single-thread launch, then wait for completion
    cuLaunchKernel(kernel, 1, 1, 1,
        1, 1, 1, 0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Download the output
    float[] hostOut = new float[1];
    cuMemcpyDtoH(Pointer.to(hostOut), deviceOut, Sizeof.FLOAT * 1);

    // Report and evaluate
    log("Result float  1D " + Arrays.toString(hostOut));
    float[] reference = { 0.5f };
    boolean passed = Arrays.equals(hostOut, reference);
    log("Test   float  1D " + (passed ? "PASSED" : "FAILED"));

    // Release device resources
    cuArrayDestroy(texArray);
    cuMemFree(deviceOut);

    return passed;
}

Source File: JCudaConstantMemoryExample.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of the constant memory sample. Looks up the symbol
 * "constantMemoryData" in the compiled module, fills it with host data,
 * runs "constantMemoryKernel" and checks that the data copied back from
 * the kernel's output buffer equals the data that was uploaded.
 *
 * @param args Not used
 * @throws IOException Declared for the PTX preparation step
 */
public static void main(String[] args) throws IOException
{
    // Enable exceptions, so that the results of the following driver
    // calls do not have to be checked individually
    JCudaDriver.setExceptionsEnabled(true);

    // Bring up the driver API and create a context on device 0
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    CUcontext ctx = new CUcontext();
    cuCtxCreate(ctx, 0, firstDevice);

    // Prepare the PTX file for the kernel source, and load it as a module
    String ptxPath = JCudaSamplesUtils.preparePtxFile(
        "src/main/resources/kernels/JCudaConstantMemoryKernel.cu");
    CUmodule module = new CUmodule();
    cuModuleLoad(module, ptxPath);

    // Query the address and the size (in bytes) of the constant memory
    // symbol. The size is returned in a one-element array.
    CUdeviceptr constantPointer = new CUdeviceptr();
    long[] symbolSize = new long[1];
    cuModuleGetGlobal(constantPointer, symbolSize,
        module, "constantMemoryData");
    int constantMemorySize = (int)symbolSize[0];

    System.out.println("constantMemoryPointer: " + constantPointer);
    System.out.println("constantMemorySize: " + constantMemorySize);

    // Upload the values 0, 1, 2, ... so that they cover the whole symbol
    int numElements = constantMemorySize / Sizeof.FLOAT;
    float[] uploaded = new float[numElements];
    for (int k = 0; k < uploaded.length; k++)
    {
        uploaded[k] = k;
    }
    cuMemcpyHtoD(constantPointer,
        Pointer.to(uploaded), constantMemorySize);

    // Fetch the kernel that will be launched below
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "constantMemoryKernel");

    // Device buffer that is passed to the kernel; it has the same size
    // as the constant memory symbol
    CUdeviceptr deviceBuffer = new CUdeviceptr();
    cuMemAlloc(deviceBuffer, constantMemorySize);

    // Kernel arguments: the device buffer and the element count
    Pointer kernelArguments = Pointer.to(
        Pointer.to(deviceBuffer),
        Pointer.to(new int[]{numElements})
    );

    // Launch one block that contains one thread per element
    cuLaunchKernel(kernel,
        1, 1, 1,            // Grid dimension
        numElements, 1, 1,  // Block dimension
        0, null,            // Shared memory size and stream
        kernelArguments, null
    );
    cuCtxSynchronize();

    // Download the device buffer and compare it with the uploaded data
    float[] downloaded = new float[numElements];
    cuMemcpyDtoH(Pointer.to(downloaded), deviceBuffer, constantMemorySize);

    boolean passed = Arrays.equals(uploaded, downloaded);
    System.out.println("Test " + (passed ? "PASSED" : "FAILED"));
}

Source File: JCudaDriverUnifiedMemory.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of the unified memory sample. Allocates managed memory,
 * fills it on the host through a buffer view, and then passes the same
 * pointer to a JCublas dot product.
 *
 * @param args Not used
 */
public static void main(String[] args)
{
    JCudaDriver.setExceptionsEnabled(true);
    JCublas.setExceptionsEnabled(true);

    // Bring up the driver API and create a context on device 0
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    CUcontext ctx = new CUcontext();
    cuCtxCreate(ctx, 0, firstDevice);

    // Bail out early when the device reports no managed memory support
    int[] managedMemoryAttribute = new int[1];
    cuDeviceGetAttribute(managedMemoryAttribute,
        CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, firstDevice);
    if (managedMemoryAttribute[0] == 0)
    {
        System.err.println("Device does not support managed memory");
        return;
    }

    // Allocate managed memory for 'count' floats, attached to the host
    final int count = 10;
    final long byteSize = count * Sizeof.FLOAT;
    CUdeviceptr managed = new CUdeviceptr();
    cuMemAllocManaged(managed, byteSize, CU_MEM_ATTACH_HOST);

    // View the allocation as a byte buffer. This is supported only
    // for memory that was allocated to be accessible on the host.
    ByteBuffer hostBytes = managed.getByteBuffer(0, byteSize);

    System.out.println("Buffer on host side: " + hostBytes);

    // Write the values 0, 1, 2, ... through a float view that uses
    // the native byte order
    FloatBuffer hostFloats =
        hostBytes.order(ByteOrder.nativeOrder()).asFloatBuffer();
    for (int k = 0; k < count; k++)
    {
        hostFloats.put(k, k);
    }

    // Re-attach the memory with the CU_MEM_ATTACH_GLOBAL flag, and
    // wait until this has completed
    cuStreamAttachMemAsync(null, managed, 0,  CU_MEM_ATTACH_GLOBAL);
    cuStreamSynchronize(null);

    // Use the pointer in a device operation: the dot product of the
    // data with itself, computed with JCublas. The data that was
    // written on the host is now consumed by the device.
    cublasHandle cublas = new cublasHandle();
    cublasCreate(cublas);
    float[] dotProduct = { -1.0f };
    cublasSdot(cublas, count, managed, 1, managed, 1,
        Pointer.to(dotProduct));
    System.out.println("Result: " + dotProduct[0]);
}

Source File: JCudaDynamicParallelism.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of the dynamic parallelism sample. Launches the
 * "parentKernel" function on a device buffer and compares the buffer
 * contents with reference values that are computed on the host.
 *
 * @param args Not used
 */
public static void main(String[] args)
{
    JCudaDriver.setExceptionsEnabled(true);

    // Bring up the driver API and create a context on device 0
    cuInit(0);
    CUcontext ctx = new CUcontext();
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    cuCtxCreate(ctx, 0, firstDevice);

    // Compile the kernel source into a CUBIN file via NVCC. The NVCC
    // parameters are documented at the prepareDefaultCubinFile method.
    String cubinPath = JCudaSamplesUtils.prepareDefaultCubinFile(
        "src/main/resources/kernels/JCudaDynamicParallelismKernel.cu");

    // Load the CUBIN file as a module and look up the parent kernel
    CUmodule module = new CUmodule();
    cuModuleLoad(module, cubinPath);
    CUfunction parentKernel = new CUfunction();
    cuModuleGetFunction(parentKernel, module, "parentKernel");

    // Nesting structure.
    //
    // NOTE: The number of child threads MUST match the value that
    // is used in the kernel, for the childKernel<<<1, 8>>> call!
    //
    final int numParentThreads = 8;
    final int numChildThreads = 8;

    // Device buffer with one float per (parent, child) thread pair
    final int numElements = numParentThreads * numChildThreads;
    final int numBytes = numElements * Sizeof.FLOAT;
    CUdeviceptr deviceBuffer = new CUdeviceptr();
    cuMemAlloc(deviceBuffer, numBytes);

    // Kernel arguments: a pointer to an array of pointers which point
    // to the actual values (element count and device buffer)
    Pointer kernelArguments = Pointer.to(
        Pointer.to(new int[] { numElements }),
        Pointer.to(deviceBuffer)
    );

    // Launch configuration.
    // NOTE(review): the grid size expression adds numElements twice and
    // evaluates to 15 for the values above. It looks like a variation
    // of the usual rounded-up division - TODO confirm against the
    // kernel which grid size is actually intended.
    int blockSizeX = numParentThreads;
    int gridSizeX = (numElements + numElements - 1) / blockSizeX;
    cuLaunchKernel(parentKernel,
        gridSizeX,  1, 1,      // Grid dimension
        blockSizeX, 1, 1,      // Block dimension
        0, null,               // Shared memory size and stream
        kernelArguments, null  // Kernel- and extra parameters
    );
    cuCtxSynchronize();

    // The host array is pre-filled with its own indices and is then
    // overwritten with the contents of the device buffer
    float[] actual = new float[numElements];
    for (int k = 0; k < actual.length; k++)
    {
        actual[k] = k;
    }
    cuMemcpyDtoH(Pointer.to(actual), deviceBuffer, numBytes);

    // Reference values: the element for parent thread i and child
    // thread j is stored at index (i * numChildThreads + j)
    float[] expected = new float[numElements];
    for (int k = 0; k < expected.length; k++)
    {
        int parent = k / numChildThreads;
        int child = k % numChildThreads;
        expected[k] = parent + 0.1f * child;
    }
    System.out.println("Result: "+Arrays.toString(actual));
    boolean passed = Arrays.equals(actual, expected);
    System.out.println(passed ? "PASSED" : "FAILED");

    // Release the device buffer
    cuMemFree(deviceBuffer);
}

Source File: JCudaDriverStreamCallbacks.java From jcuda-samples with MIT License

4 votes

/**
 * Creates one Workload and enqueues all CUDA commands for it on a newly
 * created stream: the upload of the host data, the kernel launch and
 * the download of the result. A stream callback is registered last; it
 * submits a task to the given executor that passes the workload on to
 * {@code finishWorkloadOnHost}.<br>
 * <br>
 * This method is called by multiple host threads.
 *
 * @param index The index of the workload
 * @param executor The executor service that receives the finishing task
 */
private static void createWorkloadOnHost(
    final int index, final ExecutorService executor)
{
    // The calling thread needs the CUDA context to be current
    cuCtxSetCurrent(context);

    // Size of the host and device buffers of one workload, in bytes
    final int numBytes = WORKLOAD_SIZE * Sizeof.INT;

    // Set up the workload object and its stream

    System.out.println(index + ": Initializing workload");
    final Workload job = new Workload();
    job.index = index;
    job.stream = new CUstream();
    cuStreamCreate(job.stream, 0);


    // Allocate the host memory with cuMemHostAlloc, and fill it with
    // the values 0, 1, 2, ... through an int view in native byte order

    System.out.println(index + ": Create host data");
    job.hostData = new Pointer();
    cuMemHostAlloc(job.hostData, numBytes, 0);
    IntBuffer hostValues = job.hostData
        .getByteBuffer(0, numBytes)
        .order(ByteOrder.nativeOrder())
        .asIntBuffer();
    for (int k = 0; k < WORKLOAD_SIZE; k++)
    {
        hostValues.put(k, k);
    }

    // Allocate the device memory of the same size
    job.deviceData = new CUdeviceptr();
    cuMemAlloc(job.deviceData, numBytes);


    // Enqueue the CUDA commands on the stream of the workload:
    // - host-to-device copy
    // - kernel launch
    // - device-to-host copy
    // All of them are the asynchronous/stream variants

    System.out.println(index + ": Execute CUDA commands");

    cuMemcpyHtoDAsync(job.deviceData, job.hostData,
        numBytes, job.stream);

    Pointer kernelArguments = Pointer.to(
        Pointer.to(new int[]{WORKLOAD_SIZE}),
        Pointer.to(job.deviceData)
    );
    // Enough blocks of 256 threads to cover WORKLOAD_SIZE elements
    final int threadsPerBlock = 256;
    final int numBlocks =
        (WORKLOAD_SIZE + threadsPerBlock - 1) / threadsPerBlock;
    cuLaunchKernel(function,
        numBlocks, 1, 1,
        threadsPerBlock, 1, 1,
        0, job.stream,
        kernelArguments, null);

    cuMemcpyDtoHAsync(job.hostData, job.deviceData,
        numBytes, job.stream);


    // Register the callback that is called when all CUDA commands
    // on the stream have finished. The callback only hands the user
    // data (the workload) over to the executor, where it is passed
    // to the "finishWorkloadOnHost" method.
    CUstreamCallback onStreamFinished = new CUstreamCallback()
    {
        @Override
        public void call(
            CUstream finishedStream, int status, final Object userData)
        {
            System.out.println(index + ": Callback was called");
            executor.submit(new Runnable()
            {
                @Override
                public void run()
                {
                    finishWorkloadOnHost(userData);
                }
            });
        }
    };
    cuStreamAddCallback(job.stream, onStreamFinished, job, 0);
}

Source File: JCudaReduction.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of this sample. For a series of input sizes (starting at
 * 100000 and doubling while the size is at most 26500000), a random
 * input array is reduced once with CUDA and once on the host. The
 * timings are printed, and both results are compared with a relative
 * tolerance of 1e-5.
 *
 * @param args Not used
 */
public static void main(String args[])
{
    // Enable exceptions and omit all subsequent error checks
    JCudaDriver.setExceptionsEnabled(true);

    init();
    boolean passed = true;
    for (int n = 100000; n <= 26500000; n *= 2)
    {
        float hostInput[] = createRandomArray(n);

        // Copy the input data to the device
        long timeNs0 = System.nanoTime();
        CUdeviceptr deviceInput = new CUdeviceptr();
        cuMemAlloc(deviceInput, hostInput.length * Sizeof.FLOAT);
        cuMemcpyHtoD(deviceInput, Pointer.to(hostInput), 
            hostInput.length * Sizeof.FLOAT);
        long timeNs1 = System.nanoTime();
        long durationCopyNs = timeNs1 - timeNs0;

        // Execute the reduction with CUDA
        timeNs0 = System.nanoTime();
        float resultJCuda = reduce(deviceInput, hostInput.length);
        timeNs1 = System.nanoTime();
        long durationCompNs = timeNs1 - timeNs0;

        cuMemFree(deviceInput);

        // Execute the reduction with Java
        timeNs0 = System.nanoTime();
        float resultJava = reduceHost(hostInput);
        timeNs1 = System.nanoTime();
        long durationJavaNs = timeNs1 - timeNs0;

        System.out.println("Reduction of " + n + " elements");
        System.out.printf(Locale.ENGLISH,
            "  JCuda: %7.3f ms, result: %f " +
            "(copy: %7.3f ms, comp: %7.3f ms)\n",
            (durationCopyNs + durationCompNs) / 1e6, resultJCuda, 
            durationCopyNs / 1e6, durationCompNs / 1e6);
        System.out.printf(Locale.ENGLISH,
            "  Java : %7.3f ms, result: %f\n", 
            durationJavaNs / 1e6, resultJava);
        
        // Relative comparison against the host result. The tolerance
        // is derived from the magnitude of the reference value, so
        // that a negative reference does not yield a negative
        // tolerance, and the comparison is inclusive, so that an
        // exact match with a reference of zero still passes.
        double tolerance = Math.abs(resultJava) * 1e-5;
        passed &= 
            Math.abs(resultJCuda - resultJava) <= tolerance;
        
    }
    System.out.println("Test " + (passed ? "PASSED" : "FAILED"));

    shutdown();
}

jcuda.driver.CUdeviceptr Java Examples