jcuda.runtime.JCuda Java Examples

The following examples show how to use jcuda.runtime.JCuda. The source file, originating project, and license are listed above each example.
Example #1
Source File: JCusolverSp_LinearSolver_Direct.java    From jcuda-samples with MIT License
public static void main(String[] args)
{
    JCuda.setExceptionsEnabled(true);
    JCusparse.setExceptionsEnabled(true);
    JCusolver.setExceptionsEnabled(true);

    String path = "src/main/resources/data/jcusolver/";
    String fileName = path + "lap2D_5pt_n100.mtx";
    String testFunc = "chol"; // "chol", "lu", "qr"
    String reorder = "symrcm"; // "symrcm", "symamd", null

    runTest(
        "-F="+fileName,
        "-R="+testFunc,
        "-P="+reorder);
}
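
Note: setExceptionsEnabled(true) makes JCuda, JCusparse, and JCusolver throw a CudaException whenever a call does not return a success code, which is why the sample contains no explicit error checks. For contrast, a minimal sketch of manual error checking with exceptions disabled (not part of the sample, assuming the usual jcuda.runtime imports):

    Pointer pointer = new Pointer();
    int status = JCuda.cudaMalloc(pointer, 1024);
    if (status != cudaError.cudaSuccess)
    {
        // With exceptions disabled, every return code must be checked by hand
        System.err.println("cudaMalloc failed: " + cudaError.stringFor(status));
    }
    JCuda.cudaFree(pointer);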
 
Example #2
Source File: CublasUtil.java    From murphy with Apache License 2.0
private static void getrfGetriBatched(List<Matrix> A, List<Matrix> B) {
	Pointer[] Apointers = new Pointer[A.size()];
	Pointer[] Bpointers = new Pointer[B.size()];
	for (int i=0; i<A.size(); ++i) {
		Apointers[i] = A.get(i).data_d;
		Bpointers[i] = B.get(i).data_d;
	}
	Pointer Apointers_d = new Pointer();
	JCuda.cudaMalloc(Apointers_d, A.size() * Sizeof.POINTER);
	JCuda.cudaMemcpy(Apointers_d, Pointer.to(Apointers), A.size() * Sizeof.POINTER, cudaMemcpyKind.cudaMemcpyHostToDevice);
	Pointer Bpointers_d = new Pointer();
	JCuda.cudaMalloc(Bpointers_d, B.size() * Sizeof.POINTER);
	JCuda.cudaMemcpy(Bpointers_d, Pointer.to(Bpointers), B.size() * Sizeof.POINTER, cudaMemcpyKind.cudaMemcpyHostToDevice);
	Pointer info_d = new Pointer();
	JCuda.cudaMalloc(info_d, A.size() * Sizeof.INT);
	Pointer pivots_d = new Pointer();
	JCuda.cudaMalloc(pivots_d, A.get(0).rows * A.size() * Sizeof.INT);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
	
	JCublas2.cublasSgetrfBatched(cublasHandle, A.get(0).rows, Apointers_d, A.get(0).rows, pivots_d, info_d, A.size());
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
	
	JCublas2.cublasSgetriBatched(cublasHandle, A.get(0).rows, Apointers_d, A.get(0).rows, pivots_d, Bpointers_d, B.get(0).rows, info_d, A.size());
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
	
	JCuda.cudaFree(Apointers_d);
	JCuda.cudaFree(Bpointers_d);
	JCuda.cudaFree(info_d);
	JCuda.cudaFree(pivots_d);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}
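
The info_d buffer receives one status value per matrix from cublasSgetrfBatched, but the helper above never inspects it. A hypothetical follow-up (an assumption, and it would have to run before info_d is freed) that copies the statuses back to the host to detect singular matrices:

	// Copy the per-matrix factorization status back to the host
	int[] info = new int[A.size()];
	JCuda.cudaMemcpy(Pointer.to(info), info_d, A.size() * Sizeof.INT, cudaMemcpyKind.cudaMemcpyDeviceToHost);
	for (int i = 0; i < info.length; i++) {
		if (info[i] != 0) {
			// A non-zero value means the LU factorization of matrix i failed
			System.err.println("Matrix " + i + ": cublasSgetrfBatched info = " + info[i]);
		}
	}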
 
Example #3
Source File: MatrixVectorMul.java    From flink with Apache License 2.0
@Override
public void open(Configuration parameters) {
	// When multiple instances of this class and JCuda exist in different class loaders, we will get an UnsatisfiedLinkError.
	// To avoid that, we temporarily override java.io.tmpdir, where JCuda stores its native library, with a random path.
	// For more details, please refer to https://issues.apache.org/jira/browse/FLINK-5408 and the discussion in http://apache-flink-user-mailing-list-archive.2336050.n4.nabble.com/Classloader-and-removal-of-native-libraries-td14808.html
	final String originTempDir = System.getProperty("java.io.tmpdir");
	final String newTempDir = originTempDir + "/jcuda-" + UUID.randomUUID();
	System.setProperty("java.io.tmpdir", newTempDir);

	final Set<ExternalResourceInfo> externalResourceInfos = getRuntimeContext().getExternalResourceInfos(resourceName);
	Preconditions.checkState(!externalResourceInfos.isEmpty(), "The MatrixVectorMul needs at least one GPU device while finding 0 GPU.");
	final Optional<String> firstIndexOptional = externalResourceInfos.iterator().next().getProperty("index");
	Preconditions.checkState(firstIndexOptional.isPresent());

	matrixPointer = new Pointer();
	final float[] matrix = new float[dimension * dimension];
	// Initialize a random matrix
	for (int i = 0; i < dimension * dimension; ++i) {
		matrix[i] = (float) Math.random();
	}

	// Set the CUDA device
	JCuda.cudaSetDevice(Integer.parseInt(firstIndexOptional.get()));

	// Initialize JCublas
	JCublas.cublasInit();

	// Allocate device memory for the matrix
	JCublas.cublasAlloc(dimension * dimension, Sizeof.FLOAT, matrixPointer);
	JCublas.cublasSetVector(dimension * dimension, Sizeof.FLOAT, Pointer.to(matrix), 1, matrixPointer, 1);

	// Change the java.io.tmpdir back to its original value.
	System.setProperty("java.io.tmpdir", originTempDir);
}
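
The open() method allocates device memory and initializes the legacy JCublas runtime, but the matching cleanup is not shown here. A minimal sketch of a corresponding close() method, assuming the same matrixPointer field:

	@Override
	public void close() {
		if (matrixPointer != null) {
			// Release the device memory that open() allocated for the matrix
			JCublas.cublasFree(matrixPointer);
		}
		// Shut down the legacy JCublas runtime initialized in open()
		JCublas.cublasShutdown();
	}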
 
Example #4
Source File: HiCCUPS.java    From JuiceboxLegacy with MIT License
/**
 * todo needs some more development/expansion
 */
private void testGPUInstallation(){
    try {
        jcuda.Pointer pointer = new jcuda.Pointer();
        JCuda.cudaMalloc(pointer, 4);
        JCuda.cudaFree(pointer);
    }
    catch (Exception e) {
        System.err.println("GPU/CUDA Installation Not Detected");
        System.err.println("Exiting HiCCUPS");
        System.exit(24);
    }
}
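
A missing JCuda native library typically surfaces as an UnsatisfiedLinkError, which is an Error rather than an Exception and would therefore slip past the catch block above. One possible expansion (a sketch, not part of the original) that also covers that case:

    private void testGPUInstallation() {
        try {
            jcuda.Pointer pointer = new jcuda.Pointer();
            JCuda.cudaMalloc(pointer, 4);
            JCuda.cudaFree(pointer);
        }
        catch (Throwable t) {
            // Catches CudaException as well as UnsatisfiedLinkError
            System.err.println("GPU/CUDA Installation Not Detected");
            System.err.println("Exiting HiCCUPS");
            System.exit(24);
        }
    }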
 
Example #5
Source File: TestPointerGetByteBuffer.java    From jcuda with MIT License
@Test(expected = ArithmeticException.class)
public void testGetByteBufferWithOverflow()
{
    Pointer pointer = new Pointer();
    JCuda.cudaMallocHost(pointer, 1000);
    pointer.getByteBuffer(Integer.MAX_VALUE - 10, 20);
}
 
Example #6
Source File: TestPointerGetByteBuffer.java    From jcuda with MIT License
@Test(expected = IllegalArgumentException.class)
public void testGetByteBufferWithInvalidSize()
{
    Pointer pointer = new Pointer();
    JCuda.cudaMallocHost(pointer, 1000);
    pointer.getByteBuffer(100, 1000);
}
 
Example #7
Source File: TestPointerGetByteBuffer.java    From jcuda with MIT License
@Test(expected = IllegalArgumentException.class)
public void testGetByteBufferWithInvalidOffset()
{
    Pointer pointer = new Pointer();
    JCuda.cudaMallocHost(pointer, 1000);
    pointer.getByteBuffer(-100, 800);
}
 
Example #8
Source File: TestPointerGetByteBuffer.java    From jcuda with MIT License
@Test
public void testGetByteBufferEndianness()
{
    Pointer pointer = new Pointer();
    JCuda.cudaMallocHost(pointer, 1000);
    ByteBuffer byteBuffer = pointer.getByteBuffer(100, 800);
    assertEquals(ByteOrder.nativeOrder(), byteBuffer.order());
}
 
Example #9
Source File: TestPointerGetByteBuffer.java    From jcuda with MIT License
@Test
public void testGetByteBufferWithOffsetAndSize()
{
    Pointer pointer = new Pointer();
    JCuda.cudaMallocHost(pointer, 1000);
    ByteBuffer byteBuffer = pointer.getByteBuffer(100, 800);
    
    assertNotNull(byteBuffer);
    assertEquals(0, byteBuffer.position());
    assertEquals(800, byteBuffer.limit());
}
 
Example #10
Source File: TestPointerGetByteBuffer.java    From jcuda with MIT License
@Test
public void testGetByteBuffer()
{
    Pointer pointer = new Pointer();
    JCuda.cudaMallocHost(pointer, 1000);
    ByteBuffer byteBuffer = pointer.getByteBuffer();
    
    assertNotNull(byteBuffer);
    assertEquals(0, byteBuffer.position());
    assertEquals(1000, byteBuffer.limit());
}
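
Taken together, Examples #5 through #10 pin down the contract of Pointer.getByteBuffer: the returned buffer is a view of host-accessible memory, starts at position 0, has the requested size as its limit, uses the native byte order, and rejects invalid offsets and sizes (IllegalArgumentException) as well as overflowing ranges (ArithmeticException). A small usage sketch, assuming memory allocated with cudaMallocHost:

    Pointer pointer = new Pointer();
    JCuda.cudaMallocHost(pointer, 1000);
    // View the host-accessible memory as floats and write a value
    FloatBuffer floatBuffer = pointer.getByteBuffer(0, 1000).asFloatBuffer();
    floatBuffer.put(0, 42.0f);
    JCuda.cudaFreeHost(pointer);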
 
Example #11
Source File: CublasUtil.java    From murphy with Apache License 2.0
private static void gemmBatched(float alpha, List<Matrix> A, List<Matrix> B, float beta, List<Matrix> C) {
	Pointer[] Apointers = new Pointer[A.size()];
	Pointer[] Bpointers = new Pointer[B.size()];
	Pointer[] Cpointers = new Pointer[C.size()];
	for (int i=0; i<A.size(); ++i) {
		Apointers[i] = A.get(i).data_d;
		Bpointers[i] = B.get(i).data_d;
		Cpointers[i] = C.get(i).data_d;
	}
	Pointer Apointers_d = new Pointer();
	JCuda.cudaMalloc(Apointers_d, A.size() * Sizeof.POINTER);
	JCuda.cudaMemcpy(Apointers_d, Pointer.to(Apointers), A.size() * Sizeof.POINTER, cudaMemcpyKind.cudaMemcpyHostToDevice);
	Pointer Bpointers_d = new Pointer();
	JCuda.cudaMalloc(Bpointers_d, B.size() * Sizeof.POINTER);
	JCuda.cudaMemcpy(Bpointers_d, Pointer.to(Bpointers), B.size() * Sizeof.POINTER, cudaMemcpyKind.cudaMemcpyHostToDevice);
	Pointer Cpointers_d = new Pointer();
	JCuda.cudaMalloc(Cpointers_d, C.size() * Sizeof.POINTER);
	JCuda.cudaMemcpy(Cpointers_d, Pointer.to(Cpointers), C.size() * Sizeof.POINTER, cudaMemcpyKind.cudaMemcpyHostToDevice);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
	
	JCublas2.cublasSgemmBatched(cublasHandle, cublasOperation.CUBLAS_OP_N, cublasOperation.CUBLAS_OP_N, C.get(0).rows, C.get(0).cols, B.get(0).rows, Pointer.to(new float[] {alpha}), Apointers_d, A.get(0).rows, Bpointers_d, B.get(0).rows, Pointer.to(new float[] {beta}), Cpointers_d, C.get(0).rows, A.size());
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
	
	JCuda.cudaFree(Apointers_d);
	JCuda.cudaFree(Bpointers_d);
	JCuda.cudaFree(Cpointers_d);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}
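
The alpha and beta scalars are passed as host pointers (Pointer.to(new float[] {...})), which relies on the cuBLAS handle being in host pointer mode. That is the default, but it can be made explicit; a one-line sketch (an assumption, not part of the original helper):

	JCublas2.cublasSetPointerMode(cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_HOST);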
 
Example #12
Source File: CublasUtil.java    From murphy with Apache License 2.0
public Matrix(int rows, int cols) {
	this.dontFree = false;
	this.rows = rows;
	this.cols = cols;
	this.data_d = new Pointer();
	JCuda.cudaMalloc(data_d, rows*cols * Sizeof.FLOAT);
	CublasUtil.allocated.add(this);
}
 
Example #13
Source File: JCudnnMnist.java    From jcuda-samples with MIT License
public static void main(String args[])
{
    JCuda.setExceptionsEnabled(true);
    JCudnn.setExceptionsEnabled(true);
    JCublas2.setExceptionsEnabled(true);

    int version = (int) cudnnGetVersion();
    System.out.printf("cudnnGetVersion() : %d , " + 
        "CUDNN_VERSION from cudnn.h : %d\n",
        version, CUDNN_VERSION);

    System.out.println("Creating network and layers...");
    Network mnist = new Network();
    
    System.out.println("Classifying...");
    int i1 = mnist.classifyExample(dataDirectory + first_image);
    int i2 = mnist.classifyExample(dataDirectory + second_image);

    mnist.setConvolutionAlgorithm(CUDNN_CONVOLUTION_FWD_ALGO_FFT);
    int i3 = mnist.classifyExample(dataDirectory + third_image);
    
    System.out.println(
        "\nResult of classification: " + i1 + " " + i2 + " " + i3);
    if (i1 != 1 || i2 != 3 || i3 != 5)
    {
        System.out.println("\nTest failed!\n");
    }
    else
    {
        System.out.println("\nTest passed!\n");
    }
    mnist.destroy();
}
 
Example #14
Source File: JCudaPrintDeviceInfo.java    From jcuda-samples with MIT License
public static void main(String[] args)
{
    JCuda.setExceptionsEnabled(true);
    int deviceCount[] = { 0 };
    cudaGetDeviceCount(deviceCount);
    System.out.println("Found " + deviceCount[0] + " devices");
    for (int device = 0; device < deviceCount[0]; device++)
    {
        System.out.println("Properties of device " + device + ":");
        cudaDeviceProp deviceProperties = new cudaDeviceProp();
        cudaGetDeviceProperties(deviceProperties, device);
        System.out.println(deviceProperties.toFormattedString());
    }
    
}
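
Besides toFormattedString(), individual fields of cudaDeviceProp can be read directly. A short sketch printing two commonly used properties:

    System.out.println("Compute capability: "
        + deviceProperties.major + "." + deviceProperties.minor);
    System.out.println("Total global memory: "
        + deviceProperties.totalGlobalMem + " bytes");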
 
Example #15
Source File: JCurandSample.java    From jcuda-samples with MIT License
public static void main(String args[])
{
    // Enable exceptions and omit all subsequent error checks
    JCuda.setExceptionsEnabled(true);
    JCurand.setExceptionsEnabled(true);

    int n = 100;
    curandGenerator generator = new curandGenerator();

    // Allocate n floats on host 
    float hostData[] = new float[n];

    // Allocate n floats on device 
    Pointer deviceData = new Pointer();
    cudaMalloc(deviceData, n * Sizeof.FLOAT);

    // Create pseudo-random number generator 
    curandCreateGenerator(generator, CURAND_RNG_PSEUDO_DEFAULT);

    // Set seed 
    curandSetPseudoRandomGeneratorSeed(generator, 1234);

    // Generate n floats on device 
    curandGenerateUniform(generator, deviceData, n);

    // Copy device memory to host 
    cudaMemcpy(Pointer.to(hostData), deviceData, 
        n * Sizeof.FLOAT, cudaMemcpyDeviceToHost);

    // Show result
    System.out.println(Arrays.toString(hostData));

    // Cleanup 
    curandDestroyGenerator(generator);
    cudaFree(deviceData);
}
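
The generator is not limited to the uniform distribution. For example, normally distributed values could be requested instead; a one-line sketch (an assumption, reusing the generator and device buffer from above):

    // Generate n normally distributed floats (mean 0.0, standard deviation 1.0)
    curandGenerateNormal(generator, deviceData, n, 0.0f, 1.0f);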
 
Example #16
Source File: JCusolverDn_LinearSolver_Direct.java    From jcuda-samples with MIT License
public static void main(String args[])
{
    JCuda.setExceptionsEnabled(true);
    JCusparse.setExceptionsEnabled(true);
    JCusolver.setExceptionsEnabled(true);

    String path = "src/main/resources/data/jcusolver/";
    String fileName = path + "gr_900_900_crg.mtx";
    String testFunc = "chol"; // "chol", "lu", "qr"

    runTest(
        "-F="+fileName,
        "-R="+testFunc);
}
 
Example #17
Source File: CudaUtil.java    From murphy with Apache License 2.0
public static void shutdown() {
	JCuda.cudaDeviceReset();
}
 
Example #18
Source File: CublasUtil.java    From murphy with Apache License 2.0
public void free() {
	setDontFree(false);
	if (data_d != null) JCuda.cudaFree(data_d);
}
 
Example #19
Source File: LibMatrixCuMatMult.java    From systemds with Apache License 2.0
/**
 * Internal method to invoke the appropriate CuSPARSE kernel for matrix
 * multiplication for the operation C = op(A) * op(B). This assumes B and C are
 * allocated in dense row-major format and A is sparse.
 * 
 * Other than input and output, this method requires additional memory =
 * outRLen * outCLen * sizeOfDataType
 * 
 * @param gCtx
 *            a valid {@link GPUContext}
 * @param instName
 *            name of the invoking instruction to record {@link Statistics}.
 * @param C
 *            output matrix pointer
 * @param A
 *            left matrix pointer
 * @param B
 *            right matrix pointer
 * @param leftNumRows
 *            number of rows of A
 * @param leftNumColumns
 *            number of cols of A
 * @param rightNumRows
 *            number of rows of B
 * @param rightNumColumns
 *            number of cols of B
 * @param outRLen
 *            number of rows of C
 * @param outCLen
 *            number of cols of C
 * @param isLeftTransposed
 *            is op(A) = t(A)
 * @param isRightTransposed
 *            is op(B) = t(B)
 */
static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B,
		long leftNumRows, long leftNumColumns, long rightNumRows, long rightNumColumns, long outRLen, long outCLen,
		boolean isLeftTransposed, boolean isRightTransposed) {
	// t(C) = t(B) %*% t(A)
	Pointer output = null;
	if (outRLen != 1 && outCLen != 1) {
		output = gCtx.allocate(instName, outRLen * outCLen * sizeOfDataType);
	} else {
		// no transpose required for vector output
		output = C;
	}
	CuMatMultParameters params = new CuMatMultParameters(rightNumRows, rightNumColumns, leftNumRows,
			leftNumColumns, !isRightTransposed, !isLeftTransposed);
	denseSparseMatMult(getCusparseHandle(gCtx), instName, output, B, A, params);
	if (outRLen != 1 && outCLen != 1) {
		// Transpose: C = t(output)
		cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
				toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(),
				toInt(outRLen), C, toInt(outCLen));
		if (!DMLScript.EAGER_CUDA_FREE)
			JCuda.cudaDeviceSynchronize();
		gCtx.cudaFreeHelper(instName, output, DMLScript.EAGER_CUDA_FREE);
	}
}
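
As a concrete illustration (hypothetical sizes, not from the project): for a sparse A of size 1000 x 500, a dense B of size 500 x 200, and no transposes, outRLen = 1000 and outCLen = 200. Because the result is not a vector, the helper allocates a temporary buffer of 1000 * 200 * sizeOfDataType bytes, computes t(C) = t(B) %*% t(A) into it, and then uses cublasgeam with alpha = one() and beta = zero() as an out-of-place transpose to obtain C. When either output dimension is 1, the transpose and the temporary buffer are skipped and the result is written to C directly.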
 
Example #20
Source File: JCudaBasicBindingTest.java    From jcuda with MIT License
@Test
public void testJCuda()
{
    assertTrue(BasicBindingTest.testBinding(JCuda.class));
}
 
Example #21
Source File: JCudaMemcpy3DTest.java    From jcuda with MIT License
@Test
public void testMemcpy3D()
{
    JCuda.setExceptionsEnabled(true);
    
    // Define the size of the memory region, 
    // in number of float elements
    int sizeFloatsX = 11;
    int sizeFloatsY = 13;
    int sizeFloatsZ = 17;
    int sizeFloats = sizeFloatsX * sizeFloatsY * sizeFloatsZ;
    cudaExtent extentFloats = 
        new cudaExtent(sizeFloatsX, sizeFloatsY, sizeFloatsZ);

    // Allocate the host input memory, and fill it with
    // consecutive numbers
    ByteBuffer hostInputData = 
        ByteBuffer.allocate(sizeFloats * Sizeof.FLOAT);
    FloatBuffer hostInputBuffer = 
        hostInputData.order(ByteOrder.nativeOrder()).asFloatBuffer();
    for (int i=0; i<hostInputBuffer.capacity(); i++)
    {
        hostInputBuffer.put(i, (float)i);
    }
    
    // Allocate the host output memory
    ByteBuffer hostOutputData = 
        ByteBuffer.allocate(sizeFloats * Sizeof.FLOAT);
    FloatBuffer hostOutputBuffer = 
        hostOutputData.order(ByteOrder.nativeOrder()).asFloatBuffer();
    
    // Run the 3D memory copy
    copy(extentFloats, 
        Pointer.to(hostInputData), 
        Pointer.to(hostOutputData));
    
    // Obtain the input- and output data as arrays, and compare it
    float input[] = new float[hostInputBuffer.capacity()];
    hostInputBuffer.slice().get(input);
    float output[] = new float[hostOutputBuffer.capacity()];
    hostOutputBuffer.slice().get(output);
    assertArrayEquals(input, output, 0.0f);
}
 
Example #22
Source File: JCudaRuntimeUnifiedMemory.java    From jcuda-samples with MIT License
public static void main(String[] args)
{
    JCuda.setExceptionsEnabled(true);
    JCublas.setExceptionsEnabled(true);
    
    // Check if the device supports managed memory
    int supported[] = { 0 };
    cudaDeviceGetAttribute(supported, cudaDevAttrManagedMemory, 0);
    if (supported[0] == 0)
    {
        System.err.println("Device does not support managed memory");
        return;
    }

    // Allocate managed memory that is accessible to the host
    int n = 10;
    long size = n * Sizeof.FLOAT;
    Pointer p = new Pointer();
    cudaMallocManaged(p, size, cudaMemAttachHost);

    // Obtain the byte buffer from the pointer. This is supported only
    // for memory that was allocated to be accessible on the host:
    ByteBuffer bb = p.getByteBuffer(0, size);
    
    System.out.println("Buffer on host side: " + bb);

    // Fill the buffer with sample data
    FloatBuffer fb = bb.order(ByteOrder.nativeOrder()).asFloatBuffer();
    for (int i = 0; i < n; i++)
    {
        fb.put(i, i);
    }

    // Make the buffer accessible to all devices
    cudaStreamAttachMemAsync(null, p, 0, cudaMemAttachGlobal);
    cudaStreamSynchronize(null);

    // Use the pointer in a device operation (here, a dot product with 
    // JCublas, for example). The data that was filled in by the host
    // will now be used by the device.
    cublasHandle handle = new cublasHandle();
    cublasCreate(handle);
    float result[] = { -1.0f };
    cublasSdot(handle, n, p, 1, p, 1, Pointer.to(result));
    System.out.println("Result: " + result[0]);
}
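
The sample ends without releasing its resources; a minimal cleanup sketch (not part of the original) would be:

    // Clean up: destroy the CUBLAS handle and free the managed allocation
    cublasDestroy(handle);
    cudaFree(p);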
 
Example #23
Source File: JCudaRuntimeMappedMemory.java    From jcuda-samples with MIT License
/**
 * Entry point of this sample
 * 
 * @param args Not used
 */
public static void main(String args[])
{
    // Enable exceptions to quickly be informed about errors in this test
    JCuda.setExceptionsEnabled(true);
    JCublas2.setExceptionsEnabled(true);

    // Check if the device supports mapped host memory
    cudaDeviceProp deviceProperties = new cudaDeviceProp();
    cudaGetDeviceProperties(deviceProperties, 0);
    if (deviceProperties.canMapHostMemory == 0)
    {
        System.err.println("This device can not map host memory");
        System.err.println(deviceProperties.toFormattedString());
        return;
    }

    // Set the flag indicating that mapped memory will be used
    cudaSetDeviceFlags(cudaDeviceMapHost);

    // Allocate mappable host memory
    int n = 5;
    Pointer hostPointer = new Pointer();
    cudaHostAlloc(hostPointer, n * Sizeof.FLOAT, cudaHostAllocMapped);

    // Create a device pointer mapping the host memory
    Pointer devicePointer = new Pointer();
    cudaHostGetDevicePointer(devicePointer, hostPointer, 0);

    // Obtain a ByteBuffer for accessing the data in the host
    // pointer. Modifications in this ByteBuffer will be
    // visible in the device memory.
    ByteBuffer byteBuffer = hostPointer.getByteBuffer(0, n * Sizeof.FLOAT);

    // Set the byte order of the ByteBuffer
    byteBuffer.order(ByteOrder.nativeOrder());

    // For convenience, view the ByteBuffer as a FloatBuffer
    // and fill it with some sample data
    FloatBuffer floatBuffer = byteBuffer.asFloatBuffer();
    System.out.print("Input : ");
    for (int i = 0; i < n; i++)
    {
        floatBuffer.put(i, (float) i);
        System.out.print(floatBuffer.get(i) + ", ");
    }
    System.out.println();

    // Apply a CUBLAS routine to the device pointer. This will
    // modify the host data, which was mapped to the device.
    cublasHandle handle = new cublasHandle();
    cublasCreate(handle);
    Pointer two = Pointer.to(new float[] { 2.0f });
    cublasSscal(handle, n, two, devicePointer, 1);
    cublasDestroy(handle);
    cudaDeviceSynchronize();

    // Print the contents of the host memory after the
    // modification via the mapped pointer.
    System.out.print("Output: ");
    for (int i = 0; i < n; i++)
    {
        System.out.print(floatBuffer.get(i) + ", ");
    }
    System.out.println();

    // Clean up
    cudaFreeHost(hostPointer);
}
 
Example #24
Source File: JCublas2PointerModes.java    From jcuda-samples with MIT License
/**
 * Entry point of this sample
 * 
 * @param args Not used
 */
public static void main(String[] args)
{
    // Enable exceptions and omit subsequent error checks
    JCublas2.setExceptionsEnabled(true);
    JCuda.setExceptionsEnabled(true);

    // Create the input data: A vector containing the
    // value 1.0 exactly n times.
    int n = 1000000;
    float hostData[] = new float[n];
    Arrays.fill(hostData,  1.0f);

    // Allocate device memory, and copy the input data to the device
    Pointer deviceData = new Pointer();
    cudaMalloc(deviceData, n * Sizeof.FLOAT);
    cudaMemcpy(deviceData, Pointer.to(hostData), n * Sizeof.FLOAT,
        cudaMemcpyHostToDevice);

    // Create a CUBLAS handle
    cublasHandle handle = new cublasHandle();
    cublasCreate(handle);


    // Execute the 'dot' function in HOST pointer mode:
    // The result will be written to a pointer that
    // points to host memory.

    // Set the pointer mode to HOST
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);

    // Prepare the pointer for the result in HOST memory
    float hostResult[] = { -1.0f };
    Pointer hostResultPointer = Pointer.to(hostResult);

    // Execute the 'dot' function
    long beforeHostCall = System.nanoTime();
    cublasSdot(handle, n, deviceData, 1, deviceData, 1, hostResultPointer);
    long afterHostCall = System.nanoTime();

    // Print the result and timing information
    double hostDuration = (afterHostCall - beforeHostCall) / 1e6;
    System.out.println("Host call duration: " + hostDuration + " ms");
    System.out.println("Result: " + hostResult[0]);


    // Execute the 'dot' function in DEVICE pointer mode:
    // The result will be written to a pointer that
    // points to device memory.

    // Set the pointer mode to DEVICE
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);

    // Prepare the pointer for the result in DEVICE memory
    Pointer deviceResultPointer = new Pointer();
    cudaMalloc(deviceResultPointer, Sizeof.FLOAT);

    // Execute the 'dot' function
    long beforeDeviceCall = System.nanoTime();
    cublasSdot(handle, n, deviceData, 1, deviceData, 1,
        deviceResultPointer);
    long afterDeviceCall = System.nanoTime();

    // Synchronize in order to wait for the result to
    // be available (note that this is done implicitly
    // when cudaMemcpy is called)
    cudaDeviceSynchronize();
    long afterDeviceSync = System.nanoTime();

    // Copy the result from the device to the host
    float deviceResult[] = { -1.0f };
    cudaMemcpy(Pointer.to(deviceResult), deviceResultPointer, 
        Sizeof.FLOAT, cudaMemcpyDeviceToHost);

    // Print the result and timing information
    double deviceCallDuration = (afterDeviceCall - beforeDeviceCall) / 1e6;
    double deviceFullDuration = (afterDeviceSync - beforeDeviceCall) / 1e6;
    System.out.println(
        "Device call duration: " + deviceCallDuration + " ms");
    System.out.println(
        "Device full duration: " + deviceFullDuration + " ms");
    System.out.println("Result: " + deviceResult[0]);

    // Clean up
    cudaFree(deviceData);
    cublasDestroy(handle);
}
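
One small addition (not in the original sample): the device memory allocated for the result in DEVICE pointer mode can be released as well:

    cudaFree(deviceResultPointer);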
 
Example #25
Source File: JCublas2SgemmBatched.java    From jcuda-samples with MIT License
public static void main(String[] args)
{
    JCublas2.setExceptionsEnabled(true);
    JCuda.setExceptionsEnabled(true);
    testSgemmBatched(10, 100);
}
 
Example #26
Source File: LibMatrixCuMatMult.java    From systemds with Apache License 2.0 4 votes vote down vote up
/**
 * Internal method to invoke the appropriate CuSPARSE kernel for matrix
 * multiplication for operation: C = op(A) * op(B) This assumes B and C are
 * allocated in dense row-major format and A is sparse.
 * 
 * Other than input and output, this method requires additional memory =
 * outRLen * outCLen * sizeOfDataType
 * 
 * @param gCtx
 *            a valid {@link GPUContext}
 * @param instName
 *            name of the invoking instruction to record{@link Statistics}.
 * @param C
 *            output matrix pointer
 * @param A
 *            left matrix pointer
 * @param B
 *            right matrix pointer
 * @param leftNumRows
 *            number of rows of A
 * @param leftNumColumns
 *            number of cols of A
 * @param rightNumRows
 *            number of rows of B
 * @param rightNumColumns
 *            number of cols of B
 * @param outRLen
 *            number of rows of C
 * @param outCLen
 *            number of cols of C
 * @param isLeftTransposed
 *            is op(A) = t(A)
 * @param isRightTransposed
 *            is op(B) = t(B)
 */
static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B,
		long leftNumRows, long leftNumColumns, long rightNumRows, long rightNumColumns, long outRLen, long outCLen,
		boolean isLeftTransposed, boolean isRightTransposed) {
	// t(C) = t(B) %*% t(A)
	Pointer output = null;
	if (outRLen != 1 && outCLen != 1) {
		output = gCtx.allocate(instName, outRLen * outCLen * sizeOfDataType);
	} else {
		// no transpose required for vector output
		output = C;
	}
	CuMatMultParameters params = new CuMatMultParameters(rightNumRows, rightNumColumns, leftNumRows,
			leftNumColumns, !isRightTransposed, !isLeftTransposed);
	denseSparseMatMult(getCusparseHandle(gCtx), instName, output, B, A, params);
	if (outRLen != 1 && outCLen != 1) {
		// Transpose: C = t(output)
		cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T,
				toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(),
				toInt(outRLen), C, toInt(outCLen));
		if (!DMLScript.EAGER_CUDA_FREE)
			JCuda.cudaDeviceSynchronize();
		gCtx.cudaFreeHelper(instName, output, DMLScript.EAGER_CUDA_FREE);
	}
}
 
Example #26
Source File: GPUContext.java    From systemds with Apache License 2.0
/**
 * Returns which device is currently being used.
 *
 * @return the current device for the calling host thread
 */
public static int cudaGetDevice() {
	int[] device = new int[1];
	JCuda.cudaGetDevice(device);
	return device[0];
}
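
The matching setter is JCuda.cudaSetDevice, as used in Example #3; a one-line sketch selecting device 0:

	JCuda.cudaSetDevice(0);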
 