jcuda.runtime.JCuda Java Examples

Source File: JCusolverSp_LinearSolver_Direct.java From jcuda-samples with MIT License

6 votes

/**
 * Launches {@code runTest} with a fixed matrix file, factorization
 * method and reordering scheme, after enabling exceptions in the
 * JCuda, JCusparse and JCusolver bindings.
 *
 * @param args Not used
 */
public static void main(String[] args)
{
    JCuda.setExceptionsEnabled(true);
    JCusparse.setExceptionsEnabled(true);
    JCusolver.setExceptionsEnabled(true);

    // Location of the input matrix, relative to the working directory
    final String matrixFile =
        "src/main/resources/data/jcusolver/" + "lap2D_5pt_n100.mtx";
    final String factorization = "chol"; // "chol", "lu", "qr"
    final String reordering = "symrcm"; // "symrcm", "symamd", null

    runTest(
        "-F=" + matrixFile,
        "-R=" + factorization,
        "-P=" + reordering);
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Calls {@code cublasSgetrfBatched} on the matrices of {@code A} and then
 * {@code cublasSgetriBatched} with the matrices of {@code B} as the
 * second pointer array.
 *
 * The row count of the first matrix in each list is used as the matrix
 * order and leading dimension for the whole batch.
 *
 * @param A matrices handed to both batched calls
 * @param B matrices handed to the second batched call
 */
private static void getrfGetriBatched(List<Matrix> A, List<Matrix> B) {
	// Gather the device pointer of every matrix into host-side arrays
	Pointer[] hostPtrsA = new Pointer[A.size()];
	Pointer[] hostPtrsB = new Pointer[B.size()];
	for (int idx = 0; idx < A.size(); ++idx) {
		hostPtrsA[idx] = A.get(idx).data_d;
		hostPtrsB[idx] = B.get(idx).data_d;
	}

	// Copy both pointer arrays to the device
	Pointer devPtrsA = new Pointer();
	JCuda.cudaMalloc(devPtrsA, A.size() * Sizeof.POINTER);
	JCuda.cudaMemcpy(devPtrsA, Pointer.to(hostPtrsA), A.size() * Sizeof.POINTER,
			cudaMemcpyKind.cudaMemcpyHostToDevice);
	Pointer devPtrsB = new Pointer();
	JCuda.cudaMalloc(devPtrsB, B.size() * Sizeof.POINTER);
	JCuda.cudaMemcpy(devPtrsB, Pointer.to(hostPtrsB), B.size() * Sizeof.POINTER,
			cudaMemcpyKind.cudaMemcpyHostToDevice);

	// Scratch buffers: one int per matrix, plus one int per row per matrix
	Pointer devInfo = new Pointer();
	JCuda.cudaMalloc(devInfo, A.size() * Sizeof.INT);
	final int order = A.get(0).rows;
	final int batchCount = A.size();
	Pointer devPivots = new Pointer();
	JCuda.cudaMalloc(devPivots, order * batchCount * Sizeof.INT);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}

	JCublas2.cublasSgetrfBatched(cublasHandle, order, devPtrsA, order, devPivots, devInfo, batchCount);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}

	JCublas2.cublasSgetriBatched(cublasHandle, order, devPtrsA, order, devPivots,
			devPtrsB, B.get(0).rows, devInfo, batchCount);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}

	// Release the temporary device buffers
	JCuda.cudaFree(devPtrsA);
	JCuda.cudaFree(devPtrsB);
	JCuda.cudaFree(devInfo);
	JCuda.cudaFree(devPivots);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}

Source File: MatrixVectorMul.java From flink with Apache License 2.0

5 votes

/**
 * Selects the GPU reported by the external resource info, initializes
 * JCublas and uploads a random {@code dimension x dimension} matrix to
 * device memory.
 *
 * While this runs, {@code java.io.tmpdir} is temporarily pointed at a
 * random sub-directory; the original value is restored in a
 * {@code finally} block so that a failed precondition or CUDA call cannot
 * leave the JVM-wide property modified.
 *
 * @param parameters not read by this method
 */
@Override
public void open(Configuration parameters) {
	// When multiple instances of this class and JCuda exist in different class loaders, then we will get UnsatisfiedLinkError.
	// To avoid that, we need to temporarily override the java.io.tmpdir, where the JCuda store its native library, with a random path.
	// For more details please refer to https://issues.apache.org/jira/browse/FLINK-5408 and the discussion in http://apache-flink-user-mailing-list-archive.2336050.n4.nabble.com/Classloader-and-removal-of-native-libraries-td14808.html
	final String originTempDir = System.getProperty("java.io.tmpdir");
	final String newTempDir = originTempDir + "/jcuda-" + UUID.randomUUID();
	System.setProperty("java.io.tmpdir", newTempDir);

	try {
		final Set<ExternalResourceInfo> externalResourceInfos = getRuntimeContext().getExternalResourceInfos(resourceName);
		Preconditions.checkState(!externalResourceInfos.isEmpty(), "The MatrixVectorMul needs at least one GPU device while finding 0 GPU.");
		final Optional<String> firstIndexOptional = externalResourceInfos.iterator().next().getProperty("index");
		Preconditions.checkState(firstIndexOptional.isPresent());

		matrixPointer = new Pointer();
		final float[] matrix = new float[dimension * dimension];
		// Initialize a random matrix
		for (int i = 0; i < dimension * dimension; ++i) {
			matrix[i] = (float) Math.random();
		}

		// Set the CUDA device
		JCuda.cudaSetDevice(Integer.parseInt(firstIndexOptional.get()));

		// Initialize JCublas
		JCublas.cublasInit();

		// Allocate device memory for the matrix
		JCublas.cublasAlloc(dimension * dimension, Sizeof.FLOAT, matrixPointer);
		JCublas.cublasSetVector(dimension * dimension, Sizeof.FLOAT, Pointer.to(matrix), 1, matrixPointer, 1);
	} finally {
		// Change the java.io.tmpdir back to its original value, even when
		// one of the steps above has thrown.
		System.setProperty("java.io.tmpdir", originTempDir);
	}
}

Source File: HiCCUPS.java From JuiceboxLegacy with MIT License

5 votes

/**
 * Probes for a usable GPU/CUDA installation by allocating and freeing
 * four bytes through JCuda. If the probe throws, an error is printed and
 * the JVM exits with status 24.
 *
 * Besides {@link Exception}, {@link LinkageError} is caught as well:
 * a missing native library surfaces as {@link UnsatisfiedLinkError} (or
 * another linkage error raised during class initialization), which is an
 * {@link Error} and would bypass a plain {@code catch (Exception e)}.
 *
 * todo needs some more development/expansion
 */
private void testGPUInstallation(){
    try {
        jcuda.Pointer pointer = new jcuda.Pointer();
        JCuda.cudaMalloc(pointer, 4);
        JCuda.cudaFree(pointer);
    }
    catch (Exception | LinkageError e) {
        System.err.println("GPU/CUDA Installation Not Detected");
        System.err.println("Exiting HiCCUPS");
        System.exit(24);
    }
}

Source File: TestPointerGetByteBuffer.java From jcuda with MIT License

5 votes

/**
 * Requests a byte buffer whose offset and size add up to more than
 * {@code Integer.MAX_VALUE} and expects an {@link ArithmeticException}.
 */
@Test(expected = ArithmeticException.class)
public void testGetByteBufferWithOverflow()
{
    final Pointer hostMemory = new Pointer();
    JCuda.cudaMallocHost(hostMemory, 1000);

    // Offset + size = Integer.MAX_VALUE + 10
    hostMemory.getByteBuffer(Integer.MAX_VALUE - 10, 20);
}

Source File: TestPointerGetByteBuffer.java From jcuda with MIT License

5 votes

/**
 * Requests 1000 bytes starting at offset 100 of a 1000-byte allocation
 * and expects an {@link IllegalArgumentException}.
 */
@Test(expected = IllegalArgumentException.class)
public void testGetByteBufferWithInvalidSize()
{
    final Pointer hostMemory = new Pointer();
    JCuda.cudaMallocHost(hostMemory, 1000);

    // Offset + size = 1100, which is beyond the 1000 allocated bytes
    hostMemory.getByteBuffer(100, 1000);
}

Source File: TestPointerGetByteBuffer.java From jcuda with MIT License

5 votes

/**
 * Requests a byte buffer with a negative offset and expects an
 * {@link IllegalArgumentException}.
 */
@Test(expected = IllegalArgumentException.class)
public void testGetByteBufferWithInvalidOffset()
{
    final Pointer hostMemory = new Pointer();
    JCuda.cudaMallocHost(hostMemory, 1000);

    // The offset of -100 is the invalid argument here
    hostMemory.getByteBuffer(-100, 800);
}

Source File: TestPointerGetByteBuffer.java From jcuda with MIT License

5 votes

/**
 * Checks that a byte buffer obtained from a host pointer reports the
 * native byte order.
 */
@Test
public void testGetByteBufferEndianness()
{
    final Pointer hostMemory = new Pointer();
    JCuda.cudaMallocHost(hostMemory, 1000);

    final ByteBuffer view = hostMemory.getByteBuffer(100, 800);
    final ByteOrder expectedOrder = ByteOrder.nativeOrder();
    assertEquals(expectedOrder, view.order());
}

Source File: TestPointerGetByteBuffer.java From jcuda with MIT License

5 votes

/**
 * Checks that a byte buffer requested with offset 100 and size 800 is
 * non-null, starts at position 0 and has a limit of 800.
 */
@Test
public void testGetByteBufferWithOffsetAndSize()
{
    final Pointer hostMemory = new Pointer();
    JCuda.cudaMallocHost(hostMemory, 1000);

    final ByteBuffer view = hostMemory.getByteBuffer(100, 800);

    assertNotNull(view);
    assertEquals(0, view.position());
    assertEquals(800, view.limit());
}

Source File: TestPointerGetByteBuffer.java From jcuda with MIT License

5 votes

/**
 * Checks that the byte buffer obtained without explicit offset and size
 * is non-null, starts at position 0 and has a limit equal to the 1000
 * bytes that were allocated.
 */
@Test
public void testGetByteBuffer()
{
    final Pointer hostMemory = new Pointer();
    JCuda.cudaMallocHost(hostMemory, 1000);

    final ByteBuffer view = hostMemory.getByteBuffer();

    assertNotNull(view);
    assertEquals(0, view.position());
    assertEquals(1000, view.limit());
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Calls {@code cublasSgemmBatched} (no transposition on either operand)
 * with the matrices of {@code A}, {@code B} and {@code C} and the scalars
 * {@code alpha} and {@code beta}.
 *
 * Dimensions and leading dimensions are taken from the first matrix of
 * each list; the batch count is {@code A.size()}.
 *
 * @param alpha scalar passed as the alpha argument
 * @param A matrices passed as the first operand array
 * @param B matrices passed as the second operand array
 * @param beta scalar passed as the beta argument
 * @param C matrices passed as the result array
 */
private static void gemmBatched(float alpha, List<Matrix> A, List<Matrix> B, float beta, List<Matrix> C) {
	// Gather the device pointer of every matrix into host-side arrays
	Pointer[] hostPtrsA = new Pointer[A.size()];
	Pointer[] hostPtrsB = new Pointer[B.size()];
	Pointer[] hostPtrsC = new Pointer[C.size()];
	for (int idx = 0; idx < A.size(); ++idx) {
		hostPtrsA[idx] = A.get(idx).data_d;
		hostPtrsB[idx] = B.get(idx).data_d;
		hostPtrsC[idx] = C.get(idx).data_d;
	}

	// Copy the three pointer arrays to the device
	Pointer devPtrsA = new Pointer();
	JCuda.cudaMalloc(devPtrsA, A.size() * Sizeof.POINTER);
	JCuda.cudaMemcpy(devPtrsA, Pointer.to(hostPtrsA), A.size() * Sizeof.POINTER,
			cudaMemcpyKind.cudaMemcpyHostToDevice);
	Pointer devPtrsB = new Pointer();
	JCuda.cudaMalloc(devPtrsB, B.size() * Sizeof.POINTER);
	JCuda.cudaMemcpy(devPtrsB, Pointer.to(hostPtrsB), B.size() * Sizeof.POINTER,
			cudaMemcpyKind.cudaMemcpyHostToDevice);
	Pointer devPtrsC = new Pointer();
	JCuda.cudaMalloc(devPtrsC, C.size() * Sizeof.POINTER);
	JCuda.cudaMemcpy(devPtrsC, Pointer.to(hostPtrsC), C.size() * Sizeof.POINTER,
			cudaMemcpyKind.cudaMemcpyHostToDevice);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}

	// Problem sizes, read from the first matrix of each list
	final int m = C.get(0).rows;
	final int n = C.get(0).cols;
	final int k = B.get(0).rows;
	final Pointer alphaPtr = Pointer.to(new float[] {alpha});
	final int lda = A.get(0).rows;
	final int ldb = B.get(0).rows;
	final Pointer betaPtr = Pointer.to(new float[] {beta});
	final int ldc = C.get(0).rows;
	JCublas2.cublasSgemmBatched(cublasHandle,
			cublasOperation.CUBLAS_OP_N, cublasOperation.CUBLAS_OP_N,
			m, n, k,
			alphaPtr, devPtrsA, lda, devPtrsB, ldb,
			betaPtr, devPtrsC, ldc,
			A.size());
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}

	// Release the temporary device pointer arrays
	JCuda.cudaFree(devPtrsA);
	JCuda.cudaFree(devPtrsB);
	JCuda.cudaFree(devPtrsC);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

public Matrix(int rows, int cols) {
	this.dontFree = false;
	this.rows = rows;
	this.cols = cols;
	this.data_d = new Pointer();
	JCuda.cudaMalloc(data_d, rows*cols * Sizeof.FLOAT);
	CublasUtil.allocated.add(this);
}

Source File: JCudnnMnist.java From jcuda-samples with MIT License

5 votes

/**
 * Prints the cuDNN version, builds a {@code Network}, classifies three
 * example images (the third one after switching the convolution
 * algorithm to {@code CUDNN_CONVOLUTION_FWD_ALGO_FFT}) and reports
 * whether the results are 1, 3 and 5.
 *
 * @param args Not used
 */
public static void main(String args[])
{
    JCuda.setExceptionsEnabled(true);
    JCudnn.setExceptionsEnabled(true);
    JCublas2.setExceptionsEnabled(true);

    final int runtimeVersion = (int) cudnnGetVersion();
    System.out.printf(
        "cudnnGetVersion() : %d , CUDNN_VERSION from cudnn.h : %d\n",
        runtimeVersion, CUDNN_VERSION);

    System.out.println("Creating network and layers...");
    final Network network = new Network();

    System.out.println("Classifying...");
    final int firstLabel = network.classifyExample(dataDirectory + first_image);
    final int secondLabel = network.classifyExample(dataDirectory + second_image);

    // The third image is classified with the FFT convolution algorithm
    network.setConvolutionAlgorithm(CUDNN_CONVOLUTION_FWD_ALGO_FFT);
    final int thirdLabel = network.classifyExample(dataDirectory + third_image);

    System.out.println(
        "\nResult of classification: "
            + firstLabel + " " + secondLabel + " " + thirdLabel);

    final boolean allCorrect =
        firstLabel == 1 && secondLabel == 3 && thirdLabel == 5;
    if (allCorrect)
    {
        System.out.println("\nTest passed!\n");
    }
    else
    {
        System.out.println("\nTest failed!\n");
    }
    network.destroy();
}

Source File: JCudaPrintDeviceInfo.java From jcuda-samples with MIT License

5 votes

/**
 * Prints the number of CUDA devices and, for each device, the formatted
 * string of its {@code cudaDeviceProp}.
 *
 * @param args Not used
 */
public static void main(String[] args)
{
    JCuda.setExceptionsEnabled(true);

    // The device count is written into the first array element
    final int[] countHolder = new int[1];
    cudaGetDeviceCount(countHolder);
    final int numDevices = countHolder[0];
    System.out.println("Found " + numDevices + " devices");

    int deviceIndex = 0;
    while (deviceIndex < numDevices)
    {
        System.out.println("Properties of device " + deviceIndex + ":");
        final cudaDeviceProp properties = new cudaDeviceProp();
        cudaGetDeviceProperties(properties, deviceIndex);
        System.out.println(properties.toFormattedString());
        deviceIndex++;
    }
}

Source File: JCurandSample.java From jcuda-samples with MIT License

5 votes

/**
 * Generates 100 uniformly distributed floats on the device with a
 * pseudo-random CURAND generator (seed 1234), copies them to the host
 * and prints them.
 *
 * @param args Not used
 */
public static void main(String args[])
{
    // With exceptions enabled, return codes are not checked below
    JCuda.setExceptionsEnabled(true);
    JCurand.setExceptionsEnabled(true);

    final int count = 100;
    final int byteCount = count * Sizeof.FLOAT;
    curandGenerator rng = new curandGenerator();

    // Host-side destination for the generated numbers
    float[] results = new float[count];

    // Device-side buffer that the generator writes into
    Pointer deviceBuffer = new Pointer();
    cudaMalloc(deviceBuffer, byteCount);

    // Set up the seeded pseudo-random generator
    curandCreateGenerator(rng, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(rng, 1234);

    // Fill the device buffer and bring the numbers back to the host
    curandGenerateUniform(rng, deviceBuffer, count);
    cudaMemcpy(Pointer.to(results), deviceBuffer,
        byteCount, cudaMemcpyDeviceToHost);

    System.out.println(Arrays.toString(results));

    // Release the generator and the device buffer
    curandDestroyGenerator(rng);
    cudaFree(deviceBuffer);
}

Source File: JCusolverDn_LinearSolver_Direct.java From jcuda-samples with MIT License

5 votes

/**
 * Launches {@code runTest} with a fixed matrix file and factorization
 * method, after enabling exceptions in the JCuda, JCusparse and
 * JCusolver bindings.
 *
 * @param args Not used
 */
public static void main(String args[])
{
    JCuda.setExceptionsEnabled(true);
    JCusparse.setExceptionsEnabled(true);
    JCusolver.setExceptionsEnabled(true);

    // Location of the input matrix, relative to the working directory
    final String matrixFile =
        "src/main/resources/data/jcusolver/" + "gr_900_900_crg.mtx";
    final String factorization = "chol"; // "chol", "lu", "qr"

    runTest(
        "-F=" + matrixFile,
        "-R=" + factorization);
}

Source File: CudaUtil.java From murphy with Apache License 2.0

4 votes

/**
 * Shuts down CUDA usage by calling {@code JCuda.cudaDeviceReset()}.
 * The return value of that call is ignored.
 */
public static void shutdown() {
	JCuda.cudaDeviceReset();
}

Source File: CublasUtil.java From murphy with Apache License 2.0

4 votes

/**
 * Clears the don't-free flag and releases the device memory behind
 * {@code data_d} via {@code JCuda.cudaFree}, if a pointer is present.
 */
public void free() {
	setDontFree(false);
	if (data_d == null) {
		// Nothing was allocated, so there is nothing to release
		return;
	}
	JCuda.cudaFree(data_d);
}

Source File: LibMatrixCuMatMult.java From systemds with Apache License 2.0

4 votes

/**
 * Computes C = op(A) * op(B) for a sparse A and dense B by delegating to
 * {@code denseSparseMatMult} with swapped operands and negated transpose
 * flags, i.e. it evaluates t(C) = t(B) %*% t(A). This assumes B and C are
 * allocated in dense row-major format and A is sparse.
 *
 * Unless the output is a vector (one of its dimensions is 1), the product
 * is first written to a temporary buffer of
 * outRLen * outCLen * sizeOfDataType bytes and then transposed into C.
 *
 * @param gCtx
 *            a valid {@link GPUContext}
 * @param instName
 *            name of the invoking instruction to record {@link Statistics}.
 * @param C
 *            output matrix pointer
 * @param A
 *            left matrix pointer
 * @param B
 *            right matrix pointer
 * @param leftNumRows
 *            number of rows of A
 * @param leftNumColumns
 *            number of cols of A
 * @param rightNumRows
 *            number of rows of B
 * @param rightNumColumns
 *            number of cols of B
 * @param outRLen
 *            number of rows of C
 * @param outCLen
 *            number of cols of C
 * @param isLeftTransposed
 *            is op(A) = t(A)
 * @param isRightTransposed
 *            is op(B) = t(B)
 */
static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B,
		long leftNumRows, long leftNumColumns, long rightNumRows, long rightNumColumns, long outRLen, long outCLen,
		boolean isLeftTransposed, boolean isRightTransposed) {
	// A vector result needs no transpose, so it can be written to C directly
	final boolean needsTranspose = outRLen != 1 && outCLen != 1;
	final Pointer product = needsTranspose
			? gCtx.allocate(instName, outRLen * outCLen * sizeOfDataType)
			: C;

	// t(C) = t(B) %*% t(A): operands are swapped and transpose flags negated
	final CuMatMultParameters swapped = new CuMatMultParameters(rightNumRows, rightNumColumns, leftNumRows,
			leftNumColumns, !isRightTransposed, !isLeftTransposed);
	denseSparseMatMult(getCusparseHandle(gCtx), instName, product, B, A, swapped);

	if (!needsTranspose) {
		return;
	}

	// Transpose: C = t(product), then release the temporary buffer
	final int resultCols = toInt(outCLen);
	final int resultRows = toInt(outRLen);
	cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T,
			cublasOperation.CUBLAS_OP_T, resultCols, resultRows, one(), product, toInt(outRLen), zero(),
			new Pointer(), toInt(outRLen), C, toInt(outCLen));
	if (!DMLScript.EAGER_CUDA_FREE) {
		JCuda.cudaDeviceSynchronize();
	}
	gCtx.cudaFreeHelper(instName, product, DMLScript.EAGER_CUDA_FREE);
}

Source File: JCudaBasicBindingTest.java From jcuda with MIT License

4 votes

/**
 * Asserts that {@code BasicBindingTest.testBinding} succeeds for the
 * {@code JCuda} class.
 */
@Test
public void testJCuda()
{
    assertTrue(BasicBindingTest.testBinding(JCuda.class));
}

Source File: JCudaMemcpy3DTest.java From jcuda with MIT License

4 votes

/**
 * Fills an 11 x 13 x 17 float region with consecutive numbers, passes it
 * through {@code copy} together with an equally sized output region and
 * asserts that the output equals the input exactly.
 */
@Test
public void testMemcpy3D()
{
    JCuda.setExceptionsEnabled(true);

    // Extent of the region, in float elements per dimension
    final int dimX = 11;
    final int dimY = 13;
    final int dimZ = 17;
    final int totalFloats = dimX * dimY * dimZ;
    final cudaExtent extent = new cudaExtent(dimX, dimY, dimZ);

    // Host input: native-order floats 0, 1, 2, ...
    final ByteBuffer inputBytes =
        ByteBuffer.allocate(totalFloats * Sizeof.FLOAT);
    final FloatBuffer inputFloats =
        inputBytes.order(ByteOrder.nativeOrder()).asFloatBuffer();
    final int inputCapacity = inputFloats.capacity();
    for (int index = 0; index < inputCapacity; index++)
    {
        inputFloats.put(index, (float) index);
    }

    // Host output: same size, viewed as native-order floats
    final ByteBuffer outputBytes =
        ByteBuffer.allocate(totalFloats * Sizeof.FLOAT);
    final FloatBuffer outputFloats =
        outputBytes.order(ByteOrder.nativeOrder()).asFloatBuffer();

    // Run the 3D memory copy from the input to the output region
    copy(extent, Pointer.to(inputBytes), Pointer.to(outputBytes));

    // Extract both regions as arrays and require exact equality
    final float[] expected = new float[inputFloats.capacity()];
    inputFloats.slice().get(expected);
    final float[] actual = new float[outputFloats.capacity()];
    outputFloats.slice().get(actual);
    assertArrayEquals(expected, actual, 0.0f);
}

Source File: JCudaRuntimeUnifiedMemory.java From jcuda-samples with MIT License

4 votes

/**
 * Allocates managed memory attached to the host, fills it with the
 * values 0..9 through a byte buffer, attaches it globally and then
 * computes the dot product of the data with itself using CUBLAS.
 *
 * @param args Not used
 */
public static void main(String[] args)
{
    JCuda.setExceptionsEnabled(true);
    JCublas.setExceptionsEnabled(true);

    // Bail out early when device 0 reports no managed memory support
    final int[] managedMemoryFlag = new int[1];
    cudaDeviceGetAttribute(managedMemoryFlag, cudaDevAttrManagedMemory, 0);
    if (managedMemoryFlag[0] == 0)
    {
        System.err.println("Device does not support managed memory");
        return;
    }

    // Managed allocation that is initially attached to the host
    final int count = 10;
    final long byteCount = count * Sizeof.FLOAT;
    final Pointer managed = new Pointer();
    cudaMallocManaged(managed, byteCount, cudaMemAttachHost);

    // A byte buffer can only be obtained for memory that was allocated
    // to be accessible on the host
    final ByteBuffer hostView = managed.getByteBuffer(0, byteCount);

    System.out.println("Buffer on host side: " + hostView);

    // Write 0, 1, 2, ... into the buffer, using native byte order
    final FloatBuffer floats =
        hostView.order(ByteOrder.nativeOrder()).asFloatBuffer();
    for (int index = 0; index < count; index++)
    {
        floats.put(index, index);
    }

    // Attach the memory globally, on the default stream, and wait
    cudaStreamAttachMemAsync(null, managed, 0, cudaMemAttachGlobal);
    cudaStreamSynchronize(null);

    // Use the same pointer as both operands of a CUBLAS dot product,
    // so the data written by the host is consumed by the device
    final cublasHandle blas = new cublasHandle();
    cublasCreate(blas);
    final float[] dot = { -1.0f };
    cublasSdot(blas, count, managed, 1, managed, 1, Pointer.to(dot));
    System.out.println("Result: " + dot[0]);
}

Source File: JCudaRuntimeMappedMemory.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of this sample: allocates mapped host memory, fills it
 * with the values 0..4, scales it by 2 through the mapped device
 * pointer using CUBLAS and prints the data before and after.
 *
 * @param args Not used
 */
public static void main(String args[])
{
    // With exceptions enabled, errors surface immediately
    JCuda.setExceptionsEnabled(true);
    JCublas2.setExceptionsEnabled(true);

    // Bail out when device 0 cannot map host memory
    final cudaDeviceProp props = new cudaDeviceProp();
    cudaGetDeviceProperties(props, 0);
    if (props.canMapHostMemory == 0)
    {
        System.err.println("This device can not map host memory");
        System.err.println(props.toFormattedString());
        return;
    }

    // Announce that mapped memory is going to be used
    cudaSetDeviceFlags(cudaDeviceMapHost);

    // Mappable host allocation for 'count' floats
    final int count = 5;
    final int byteCount = count * Sizeof.FLOAT;
    final Pointer mappedHost = new Pointer();
    cudaHostAlloc(mappedHost, byteCount, cudaHostAllocMapped);

    // Device-side pointer referring to the same host allocation
    final Pointer mappedDevice = new Pointer();
    cudaHostGetDevicePointer(mappedDevice, mappedHost, 0);

    // View the host allocation as native-order floats. Changes made
    // through this view will be visible in the device memory.
    final ByteBuffer bytes = mappedHost.getByteBuffer(0, byteCount);
    bytes.order(ByteOrder.nativeOrder());
    final FloatBuffer floats = bytes.asFloatBuffer();

    // Write the sample data and echo it
    System.out.print("Input : ");
    for (int index = 0; index < count; index++)
    {
        floats.put(index, (float) index);
        System.out.print(floats.get(index) + ", ");
    }
    System.out.println();

    // Scale the data by 2 through the device pointer. This modifies
    // the host data, which was mapped to the device.
    final cublasHandle blas = new cublasHandle();
    cublasCreate(blas);
    final Pointer factor = Pointer.to(new float[] { 2.0f });
    cublasSscal(blas, count, factor, mappedDevice, 1);
    cublasDestroy(blas);
    cudaDeviceSynchronize();

    // Echo the host memory again, after the device-side modification
    System.out.print("Output: ");
    for (int index = 0; index < count; index++)
    {
        System.out.print(floats.get(index) + ", ");
    }
    System.out.println();

    // Release the mapped host allocation
    cudaFreeHost(mappedHost);
}

Source File: JCublas2PointerModes.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of this sample: computes the dot product of a vector of
 * one million ones with itself twice, once with the CUBLAS pointer mode
 * set to HOST and once set to DEVICE, and prints the result and timing
 * of each variant.
 *
 * All device allocations made here, including the single-float result
 * buffer used in DEVICE pointer mode, are released before returning.
 *
 * @param args Not used
 */
public static void main(String[] args)
{
    // Enable exceptions and omit subsequent error checks
    JCublas2.setExceptionsEnabled(true);
    JCuda.setExceptionsEnabled(true);

    // Create the input data: A vector containing the
    // value 1.0 exactly n times.
    int n = 1000000;
    float hostData[] = new float[n];
    Arrays.fill(hostData,  1.0f);

    // Allocate device memory, and copy the input data to the device
    Pointer deviceData = new Pointer();
    cudaMalloc(deviceData, n * Sizeof.FLOAT);
    cudaMemcpy(deviceData, Pointer.to(hostData), n * Sizeof.FLOAT,
        cudaMemcpyHostToDevice);

    // Create a CUBLAS handle
    cublasHandle handle = new cublasHandle();
    cublasCreate(handle);


    // Execute the 'dot' function in HOST pointer mode:
    // The result will be written to a pointer that
    // points to host memory.

    // Set the pointer mode to HOST
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);

    // Prepare the pointer for the result in HOST memory
    float hostResult[] = { -1.0f };
    Pointer hostResultPointer = Pointer.to(hostResult);

    // Execute the 'dot' function
    long beforeHostCall = System.nanoTime();
    cublasSdot(handle, n, deviceData, 1, deviceData, 1, hostResultPointer);
    long afterHostCall = System.nanoTime();

    // Print the result and timing information
    double hostDuration = (afterHostCall - beforeHostCall) / 1e6;
    System.out.println("Host call duration: " + hostDuration + " ms");
    System.out.println("Result: " + hostResult[0]);


    // Execute the 'dot' function in DEVICE pointer mode:
    // The result will be written to a pointer that
    // points to device memory.

    // Set the pointer mode to DEVICE
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);

    // Prepare the pointer for the result in DEVICE memory
    Pointer deviceResultPointer = new Pointer();
    cudaMalloc(deviceResultPointer, Sizeof.FLOAT);

    // Execute the 'dot' function
    long beforeDeviceCall = System.nanoTime();
    cublasSdot(handle, n, deviceData, 1, deviceData, 1,
        deviceResultPointer);
    long afterDeviceCall = System.nanoTime();

    // Synchronize in order to wait for the result to
    // be available (note that this is done implicitly
    // when cudaMemcpy is called)
    cudaDeviceSynchronize();
    long afterDeviceSync = System.nanoTime();

    // Copy the result from the device to the host
    float deviceResult[] = { -1.0f };
    cudaMemcpy(Pointer.to(deviceResult), deviceResultPointer, 
        Sizeof.FLOAT, cudaMemcpyDeviceToHost);

    // Print the result and timing information
    double deviceCallDuration = (afterDeviceCall - beforeDeviceCall) / 1e6;
    double deviceFullDuration = (afterDeviceSync - beforeDeviceCall) / 1e6;
    System.out.println(
        "Device call duration: " + deviceCallDuration + " ms");
    System.out.println(
        "Device full duration: " + deviceFullDuration + " ms");
    System.out.println("Result: " + deviceResult[0]);

    // Clean up: both device allocations (the input vector and the
    // result buffer of the DEVICE pointer mode run) and the handle
    cudaFree(deviceData);
    cudaFree(deviceResultPointer);
    cublasDestroy(handle);
}

Source File: JCublas2SgemmBatched.java From jcuda-samples with MIT License

4 votes

/**
 * Enables exceptions in the JCublas2 and JCuda bindings and runs
 * {@code testSgemmBatched} with the arguments 10 and 100.
 *
 * @param args Not used
 */
public static void main(String[] args)
{
    JCublas2.setExceptionsEnabled(true);
    JCuda.setExceptionsEnabled(true);

    final int firstArgument = 10;
    final int secondArgument = 100;
    testSgemmBatched(firstArgument, secondArgument);
}

Source File: LibMatrixCuMatMult.java From systemds with Apache License 2.0

4 votes

/**
 * Computes C = op(A) * op(B) for a sparse A and dense B by delegating to
 * {@code denseSparseMatMult} with swapped operands and negated transpose
 * flags, i.e. it evaluates t(C) = t(B) %*% t(A). This assumes B and C are
 * allocated in dense row-major format and A is sparse.
 *
 * Unless the output is a vector (one of its dimensions is 1), the product
 * is first written to a temporary buffer of
 * outRLen * outCLen * sizeOfDataType bytes and then transposed into C.
 *
 * @param gCtx
 *            a valid {@link GPUContext}
 * @param instName
 *            name of the invoking instruction to record {@link Statistics}.
 * @param C
 *            output matrix pointer
 * @param A
 *            left matrix pointer
 * @param B
 *            right matrix pointer
 * @param leftNumRows
 *            number of rows of A
 * @param leftNumColumns
 *            number of cols of A
 * @param rightNumRows
 *            number of rows of B
 * @param rightNumColumns
 *            number of cols of B
 * @param outRLen
 *            number of rows of C
 * @param outCLen
 *            number of cols of C
 * @param isLeftTransposed
 *            is op(A) = t(A)
 * @param isRightTransposed
 *            is op(B) = t(B)
 */
static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B,
		long leftNumRows, long leftNumColumns, long rightNumRows, long rightNumColumns, long outRLen, long outCLen,
		boolean isLeftTransposed, boolean isRightTransposed) {
	// A vector result needs no transpose, so it can be written to C directly
	final boolean needsTranspose = outRLen != 1 && outCLen != 1;
	final Pointer product = needsTranspose
			? gCtx.allocate(instName, outRLen * outCLen * sizeOfDataType)
			: C;

	// t(C) = t(B) %*% t(A): operands are swapped and transpose flags negated
	final CuMatMultParameters swapped = new CuMatMultParameters(rightNumRows, rightNumColumns, leftNumRows,
			leftNumColumns, !isRightTransposed, !isLeftTransposed);
	denseSparseMatMult(getCusparseHandle(gCtx), instName, product, B, A, swapped);

	if (!needsTranspose) {
		return;
	}

	// Transpose: C = t(product), then release the temporary buffer
	final int resultCols = toInt(outCLen);
	final int resultRows = toInt(outRLen);
	cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T,
			cublasOperation.CUBLAS_OP_T, resultCols, resultRows, one(), product, toInt(outRLen), zero(),
			new Pointer(), toInt(outRLen), C, toInt(outCLen));
	if (!DMLScript.EAGER_CUDA_FREE) {
		JCuda.cudaDeviceSynchronize();
	}
	gCtx.cudaFreeHelper(instName, product, DMLScript.EAGER_CUDA_FREE);
}

Source File: GPUContext.java From systemds with Apache License 2.0

4 votes

/**
 * Queries {@code JCuda.cudaGetDevice} for the device that is currently
 * in use.
 *
 * @return the current device for the calling host thread
 */
public static int cudaGetDevice() {
	// JCuda writes the device number into the first array element
	final int[] currentDevice = { 0 };
	JCuda.cudaGetDevice(currentDevice);
	final int deviceNumber = currentDevice[0];
	return deviceNumber;
}

Source File: GPUContext.java From systemds with Apache License 2.0

4 votes

/**
 * Queries {@code JCuda.cudaGetDevice} for the device that is currently
 * in use.
 *
 * @return the current device for the calling host thread
 */
public static int cudaGetDevice() {
	// JCuda writes the device number into the first array element
	final int[] currentDevice = { 0 };
	JCuda.cudaGetDevice(currentDevice);
	final int deviceNumber = currentDevice[0];
	return deviceNumber;
}

jcuda.runtime.JCuda Java Examples