jcuda.runtime.JCuda Java Examples
The following examples show how to use
jcuda.runtime.JCuda.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JCusolverSp_LinearSolver_Direct.java From jcuda-samples with MIT License | 6 votes |
public static void main(String[] args) { JCuda.setExceptionsEnabled(true);; JCusparse.setExceptionsEnabled(true); JCusolver.setExceptionsEnabled(true); String path = "src/main/resources/data/jcusolver/"; String fileName = path + "lap2D_5pt_n100.mtx"; String testFunc = "chol"; // "chol", "lu", "qr" String reorder = "symrcm"; // "symrcm", "symamd", null runTest( "-F="+fileName, "-R="+testFunc, "-P="+reorder); }
Example #2
Source File: CublasUtil.java From murphy with Apache License 2.0 | 5 votes |
/**
 * Runs {@code cublasSgetrfBatched} followed by {@code cublasSgetriBatched}
 * on a batch of matrices: the matrices in {@code A} are the input of the
 * factorization, and the matrix at the same index in {@code B} is passed as
 * the output argument of the inversion step.
 *
 * The row count of the first matrix of each list is used as the dimension
 * and leading dimension for the whole batch. An empty batch is a no-op.
 * The temporary device buffers (pointer tables, pivots, info) are released
 * even if one of the CUDA/cuBLAS calls throws.
 *
 * @param A the matrices to factorize; must not be longer than {@code B}
 * @param B the matrices receiving the result, one per matrix in {@code A}
 */
private static void getrfGetriBatched(List<Matrix> A, List<Matrix> B) {
    final int batchSize = A.size();
    if (batchSize == 0) {
        // Nothing to do; also avoids A.get(0) on an empty list below.
        return;
    }
    final int n = A.get(0).rows;

    // Host-side tables of the device addresses of all matrices
    Pointer[] Apointers = new Pointer[batchSize];
    Pointer[] Bpointers = new Pointer[B.size()];
    for (int i = 0; i < batchSize; ++i) {
        Apointers[i] = A.get(i).data_d;
        Bpointers[i] = B.get(i).data_d;
    }

    Pointer Apointers_d = new Pointer();
    Pointer Bpointers_d = new Pointer();
    Pointer info_d = new Pointer();
    Pointer pivots_d = new Pointer();
    try {
        // Byte counts are computed in long arithmetic to avoid int overflow
        JCuda.cudaMalloc(Apointers_d, (long) batchSize * Sizeof.POINTER);
        JCuda.cudaMemcpy(Apointers_d, Pointer.to(Apointers), (long) batchSize * Sizeof.POINTER,
                cudaMemcpyKind.cudaMemcpyHostToDevice);

        JCuda.cudaMalloc(Bpointers_d, (long) B.size() * Sizeof.POINTER);
        JCuda.cudaMemcpy(Bpointers_d, Pointer.to(Bpointers), (long) B.size() * Sizeof.POINTER,
                cudaMemcpyKind.cudaMemcpyHostToDevice);

        JCuda.cudaMalloc(info_d, (long) batchSize * Sizeof.INT);
        JCuda.cudaMalloc(pivots_d, (long) n * batchSize * Sizeof.INT);
        if (DEBUG_SYNC) {
            JCudaDriver.cuCtxSynchronize();
        }

        JCublas2.cublasSgetrfBatched(cublasHandle, n, Apointers_d, n, pivots_d, info_d, batchSize);
        if (DEBUG_SYNC) {
            JCudaDriver.cuCtxSynchronize();
        }

        JCublas2.cublasSgetriBatched(cublasHandle, n, Apointers_d, n, pivots_d, Bpointers_d,
                B.get(0).rows, info_d, batchSize);
        if (DEBUG_SYNC) {
            JCudaDriver.cuCtxSynchronize();
        }
    } finally {
        // Always release the temporaries; a Pointer that was never
        // allocated is still a null device pointer at this point.
        JCuda.cudaFree(Apointers_d);
        JCuda.cudaFree(Bpointers_d);
        JCuda.cudaFree(info_d);
        JCuda.cudaFree(pivots_d);
    }
    if (DEBUG_SYNC) {
        JCudaDriver.cuCtxSynchronize();
    }
}
Example #3
Source File: MatrixVectorMul.java From flink with Apache License 2.0 | 5 votes |
@Override public void open(Configuration parameters) { // When multiple instances of this class and JCuda exist in different class loaders, then we will get UnsatisfiedLinkError. // To avoid that, we need to temporarily override the java.io.tmpdir, where the JCuda store its native library, with a random path. // For more details please refer to https://issues.apache.org/jira/browse/FLINK-5408 and the discussion in http://apache-flink-user-mailing-list-archive.2336050.n4.nabble.com/Classloader-and-removal-of-native-libraries-td14808.html final String originTempDir = System.getProperty("java.io.tmpdir"); final String newTempDir = originTempDir + "/jcuda-" + UUID.randomUUID(); System.setProperty("java.io.tmpdir", newTempDir); final Set<ExternalResourceInfo> externalResourceInfos = getRuntimeContext().getExternalResourceInfos(resourceName); Preconditions.checkState(!externalResourceInfos.isEmpty(), "The MatrixVectorMul needs at least one GPU device while finding 0 GPU."); final Optional<String> firstIndexOptional = externalResourceInfos.iterator().next().getProperty("index"); Preconditions.checkState(firstIndexOptional.isPresent()); matrixPointer = new Pointer(); final float[] matrix = new float[dimension * dimension]; // Initialize a random matrix for (int i = 0; i < dimension * dimension; ++i) { matrix[i] = (float) Math.random(); } // Set the CUDA device JCuda.cudaSetDevice(Integer.parseInt(firstIndexOptional.get())); // Initialize JCublas JCublas.cublasInit(); // Allocate device memory for the matrix JCublas.cublasAlloc(dimension * dimension, Sizeof.FLOAT, matrixPointer); JCublas.cublasSetVector(dimension * dimension, Sizeof.FLOAT, Pointer.to(matrix), 1, matrixPointer, 1); // Change the java.io.tmpdir back to its original value. System.setProperty("java.io.tmpdir", originTempDir); }
Example #4
Source File: HiCCUPS.java From JuiceboxLegacy with MIT License | 5 votes |
/**
 * Probes the CUDA installation by allocating and immediately releasing
 * four bytes of device memory. If the probe fails, a message is printed to
 * {@code System.err} and the JVM is terminated with exit code 24.
 *
 * The probe counts as failed when either call reports a non-zero status,
 * when an exception is thrown, or when the JCuda classes or native
 * libraries cannot be loaded (which surfaces as a {@link LinkageError},
 * not as an {@link Exception}).
 *
 * todo needs some more development/expansion
 */
private void testGPUInstallation() {
    boolean cudaUsable;
    try {
        jcuda.Pointer pointer = new jcuda.Pointer();
        // The status codes are checked explicitly because JCuda only
        // throws when exceptions have been enabled; 0 is cudaSuccess.
        cudaUsable = JCuda.cudaMalloc(pointer, 4) == 0
                && JCuda.cudaFree(pointer) == 0;
    } catch (Exception e) {
        cudaUsable = false;
    } catch (LinkageError e) {
        // e.g. UnsatisfiedLinkError / NoClassDefFoundError when the native
        // library is missing
        cudaUsable = false;
    }

    if (!cudaUsable) {
        System.err.println("GPU/CUDA Installation Not Detected");
        System.err.println("Exiting HiCCUPS");
        System.exit(24);
    }
}
Example #5
Source File: TestPointerGetByteBuffer.java From jcuda with MIT License | 5 votes |
/**
 * Expects {@code getByteBuffer} to throw an {@link ArithmeticException}
 * for offset {@code Integer.MAX_VALUE - 10} and size 20 on a 1000 byte
 * host allocation.
 */
@Test(expected = ArithmeticException.class)
public void testGetByteBufferWithOverflow() {
    Pointer pointer = new Pointer();
    JCuda.cudaMallocHost(pointer, 1000);
    try {
        pointer.getByteBuffer(Integer.MAX_VALUE - 10, 20);
    } finally {
        // Release the host allocation even though the call above is
        // expected to throw; the exception still propagates to JUnit.
        JCuda.cudaFreeHost(pointer);
    }
}
Example #6
Source File: TestPointerGetByteBuffer.java From jcuda with MIT License | 5 votes |
/**
 * Expects {@code getByteBuffer} to throw an
 * {@link IllegalArgumentException} for offset 100 and size 1000 on a
 * 1000 byte host allocation.
 */
@Test(expected = IllegalArgumentException.class)
public void testGetByteBufferWithInvalidSize() {
    Pointer pointer = new Pointer();
    JCuda.cudaMallocHost(pointer, 1000);
    try {
        pointer.getByteBuffer(100, 1000);
    } finally {
        // Release the host allocation even though the call above is
        // expected to throw; the exception still propagates to JUnit.
        JCuda.cudaFreeHost(pointer);
    }
}
Example #7
Source File: TestPointerGetByteBuffer.java From jcuda with MIT License | 5 votes |
/**
 * Expects {@code getByteBuffer} to throw an
 * {@link IllegalArgumentException} for the negative offset -100 (size 800)
 * on a 1000 byte host allocation.
 */
@Test(expected = IllegalArgumentException.class)
public void testGetByteBufferWithInvalidOffset() {
    Pointer pointer = new Pointer();
    JCuda.cudaMallocHost(pointer, 1000);
    try {
        pointer.getByteBuffer(-100, 800);
    } finally {
        // Release the host allocation even though the call above is
        // expected to throw; the exception still propagates to JUnit.
        JCuda.cudaFreeHost(pointer);
    }
}
Example #8
Source File: TestPointerGetByteBuffer.java From jcuda with MIT License | 5 votes |
/**
 * Checks that the byte buffer obtained from a host allocation reports the
 * native byte order.
 */
@Test
public void testGetByteBufferEndianness() {
    Pointer pointer = new Pointer();
    JCuda.cudaMallocHost(pointer, 1000);
    try {
        ByteBuffer byteBuffer = pointer.getByteBuffer(100, 800);
        assertEquals(ByteOrder.nativeOrder(), byteBuffer.order());
    } finally {
        // Free only after the assertions: the buffer is a view on this memory
        JCuda.cudaFreeHost(pointer);
    }
}
Example #9
Source File: TestPointerGetByteBuffer.java From jcuda with MIT License | 5 votes |
/**
 * Checks that a byte buffer requested with offset 100 and size 800 is
 * non-null, positioned at 0 and limited to 800 bytes.
 */
@Test
public void testGetByteBufferWithOffsetAndSize() {
    Pointer pointer = new Pointer();
    JCuda.cudaMallocHost(pointer, 1000);
    try {
        ByteBuffer byteBuffer = pointer.getByteBuffer(100, 800);
        assertNotNull(byteBuffer);
        assertEquals(0, byteBuffer.position());
        assertEquals(800, byteBuffer.limit());
    } finally {
        // Free only after the assertions: the buffer is a view on this memory
        JCuda.cudaFreeHost(pointer);
    }
}
Example #10
Source File: TestPointerGetByteBuffer.java From jcuda with MIT License | 5 votes |
/**
 * Checks that the byte buffer covering a whole 1000 byte host allocation
 * is non-null, positioned at 0 and limited to 1000 bytes.
 */
@Test
public void testGetByteBuffer() {
    Pointer pointer = new Pointer();
    JCuda.cudaMallocHost(pointer, 1000);
    try {
        ByteBuffer byteBuffer = pointer.getByteBuffer();
        assertNotNull(byteBuffer);
        assertEquals(0, byteBuffer.position());
        assertEquals(1000, byteBuffer.limit());
    } finally {
        // Free only after the assertions: the buffer is a view on this memory
        JCuda.cudaFreeHost(pointer);
    }
}
Example #11
Source File: CublasUtil.java From murphy with Apache License 2.0 | 5 votes |
/**
 * Calls {@code cublasSgemmBatched} (no transposition of either operand)
 * with scalars {@code alpha} and {@code beta} and the i-th matrices of
 * {@code A}, {@code B} and {@code C} as the i-th operands of the batch.
 *
 * The dimensions and leading dimensions of the whole batch are taken from
 * the first matrix of each list. An empty batch is a no-op. The temporary
 * device-side pointer tables are released even if a CUDA/cuBLAS call
 * throws.
 *
 * @param alpha scalar passed as {@code alpha} to cuBLAS
 * @param A left operands; must not be longer than {@code B} or {@code C}
 * @param B right operands
 * @param beta scalar passed as {@code beta} to cuBLAS
 * @param C output operands
 */
private static void gemmBatched(float alpha, List<Matrix> A, List<Matrix> B, float beta, List<Matrix> C) {
    final int batchSize = A.size();
    if (batchSize == 0) {
        // Nothing to multiply; also avoids get(0) on empty lists below.
        return;
    }

    // Host-side tables of the device addresses of all matrices
    Pointer[] Apointers = new Pointer[batchSize];
    Pointer[] Bpointers = new Pointer[B.size()];
    Pointer[] Cpointers = new Pointer[C.size()];
    for (int i = 0; i < batchSize; ++i) {
        Apointers[i] = A.get(i).data_d;
        Bpointers[i] = B.get(i).data_d;
        Cpointers[i] = C.get(i).data_d;
    }

    Pointer Apointers_d = new Pointer();
    Pointer Bpointers_d = new Pointer();
    Pointer Cpointers_d = new Pointer();
    try {
        // Byte counts are computed in long arithmetic to avoid int overflow
        JCuda.cudaMalloc(Apointers_d, (long) batchSize * Sizeof.POINTER);
        JCuda.cudaMemcpy(Apointers_d, Pointer.to(Apointers), (long) batchSize * Sizeof.POINTER,
                cudaMemcpyKind.cudaMemcpyHostToDevice);

        JCuda.cudaMalloc(Bpointers_d, (long) B.size() * Sizeof.POINTER);
        JCuda.cudaMemcpy(Bpointers_d, Pointer.to(Bpointers), (long) B.size() * Sizeof.POINTER,
                cudaMemcpyKind.cudaMemcpyHostToDevice);

        JCuda.cudaMalloc(Cpointers_d, (long) C.size() * Sizeof.POINTER);
        JCuda.cudaMemcpy(Cpointers_d, Pointer.to(Cpointers), (long) C.size() * Sizeof.POINTER,
                cudaMemcpyKind.cudaMemcpyHostToDevice);
        if (DEBUG_SYNC) {
            JCudaDriver.cuCtxSynchronize();
        }

        JCublas2.cublasSgemmBatched(cublasHandle,
                cublasOperation.CUBLAS_OP_N, cublasOperation.CUBLAS_OP_N,
                C.get(0).rows, C.get(0).cols, B.get(0).rows,
                Pointer.to(new float[] {alpha}),
                Apointers_d, A.get(0).rows,
                Bpointers_d, B.get(0).rows,
                Pointer.to(new float[] {beta}),
                Cpointers_d, C.get(0).rows,
                batchSize);
        if (DEBUG_SYNC) {
            JCudaDriver.cuCtxSynchronize();
        }
    } finally {
        // Always release the temporaries; a Pointer that was never
        // allocated is still a null device pointer at this point.
        JCuda.cudaFree(Apointers_d);
        JCuda.cudaFree(Bpointers_d);
        JCuda.cudaFree(Cpointers_d);
    }
    if (DEBUG_SYNC) {
        JCudaDriver.cuCtxSynchronize();
    }
}
Example #12
Source File: CublasUtil.java From murphy with Apache License 2.0 | 5 votes |
/**
 * Creates a {@code rows x cols} matrix of floats backed by newly allocated
 * device memory and registers it in {@code CublasUtil.allocated}.
 *
 * @param rows number of rows
 * @param cols number of columns
 */
public Matrix(int rows, int cols) {
    this.dontFree = false;
    this.rows = rows;
    this.cols = cols;
    this.data_d = new Pointer();
    // Compute the byte count in long arithmetic: rows * cols * Sizeof.FLOAT
    // silently overflows int once the matrix exceeds 2^31 bytes.
    JCuda.cudaMalloc(data_d, (long) rows * cols * Sizeof.FLOAT);
    CublasUtil.allocated.add(this);
}
Example #13
Source File: JCudnnMnist.java From jcuda-samples with MIT License | 5 votes |
/**
 * Entry point of the sample: prints the cuDNN version, classifies three
 * images with a {@code Network} (switching the convolution algorithm
 * before the third one) and reports whether the labels are 1, 3 and 5.
 *
 * @param args Not used
 */
public static void main(String args[]) {
    JCuda.setExceptionsEnabled(true);
    JCudnn.setExceptionsEnabled(true);
    JCublas2.setExceptionsEnabled(true);

    final int runtimeVersion = (int) cudnnGetVersion();
    System.out.printf("cudnnGetVersion() : %d , " +
        "CUDNN_VERSION from cudnn.h : %d\n",
        runtimeVersion, CUDNN_VERSION);

    System.out.println("Creating network and layers...");
    final Network mnist = new Network();

    System.out.println("Classifying...");
    final int firstLabel = mnist.classifyExample(dataDirectory + first_image);
    final int secondLabel = mnist.classifyExample(dataDirectory + second_image);

    // The third image is classified with the FFT convolution algorithm
    mnist.setConvolutionAlgorithm(CUDNN_CONVOLUTION_FWD_ALGO_FFT);
    final int thirdLabel = mnist.classifyExample(dataDirectory + third_image);

    System.out.println(
        "\nResult of classification: " + firstLabel + " " + secondLabel + " " + thirdLabel);

    final boolean allAsExpected = firstLabel == 1 && secondLabel == 3 && thirdLabel == 5;
    System.out.println(allAsExpected ? "\nTest passed!\n" : "\nTest failed!\n");

    mnist.destroy();
}
Example #14
Source File: JCudaPrintDeviceInfo.java From jcuda-samples with MIT License | 5 votes |
/**
 * Entry point of the sample: queries the number of CUDA devices and prints
 * the formatted properties of each of them.
 *
 * @param args Not used
 */
public static void main(String[] args) {
    JCuda.setExceptionsEnabled(true);

    // The count is returned through a one-element array
    final int[] countHolder = new int[1];
    cudaGetDeviceCount(countHolder);
    final int numDevices = countHolder[0];
    System.out.println("Found " + numDevices + " devices");

    int index = 0;
    while (index < numDevices) {
        System.out.println("Properties of device " + index + ":");
        final cudaDeviceProp properties = new cudaDeviceProp();
        cudaGetDeviceProperties(properties, index);
        System.out.println(properties.toFormattedString());
        index++;
    }
}
Example #15
Source File: JCurandSample.java From jcuda-samples with MIT License | 5 votes |
public static void main(String args[]) { // Enable exceptions and omit all subsequent error checks JCuda.setExceptionsEnabled(true); JCurand.setExceptionsEnabled(true); int n = 100; curandGenerator generator = new curandGenerator(); // Allocate n floats on host float hostData[] = new float[n]; // Allocate n floats on device Pointer deviceData = new Pointer(); cudaMalloc(deviceData, n * Sizeof.FLOAT); // Create pseudo-random number generator curandCreateGenerator(generator, CURAND_RNG_PSEUDO_DEFAULT); // Set seed curandSetPseudoRandomGeneratorSeed(generator, 1234); // Generate n floats on device curandGenerateUniform(generator, deviceData, n); // Copy device memory to host cudaMemcpy(Pointer.to(hostData), deviceData, n * Sizeof.FLOAT, cudaMemcpyDeviceToHost); // Show result System.out.println(Arrays.toString(hostData)); // Cleanup curandDestroyGenerator(generator); cudaFree(deviceData); }
Example #16
Source File: JCusolverDn_LinearSolver_Direct.java From jcuda-samples with MIT License | 5 votes |
public static void main(String args[]) { JCuda.setExceptionsEnabled(true);; JCusparse.setExceptionsEnabled(true); JCusolver.setExceptionsEnabled(true); String path = "src/main/resources/data/jcusolver/"; String fileName = path + "gr_900_900_crg.mtx"; String testFunc = "chol"; // "chol", "lu", "qr" runTest( "-F="+fileName, "-R="+testFunc); }
Example #17
Source File: CudaUtil.java From murphy with Apache License 2.0 | 4 votes |
/**
 * Shuts CUDA down by calling {@code JCuda.cudaDeviceReset()}.
 * The status code returned by that call is ignored.
 */
public static void shutdown()
{
    JCuda.cudaDeviceReset();
}
Example #18
Source File: CublasUtil.java From murphy with Apache License 2.0 | 4 votes |
/**
 * Clears the "don't free" flag and releases the device memory behind
 * {@code data_d}, if there is any.
 */
public void free() {
    setDontFree(false);
    if (data_d == null) {
        // Nothing was allocated for this matrix
        return;
    }
    JCuda.cudaFree(data_d);
}
Example #19
Source File: LibMatrixCuMatMult.java From systemds with Apache License 2.0 | 4 votes |
/** * Internal method to invoke the appropriate CuSPARSE kernel for matrix * multiplication for operation: C = op(A) * op(B) This assumes B and C are * allocated in dense row-major format and A is sparse. * * Other than input and output, this method requires additional memory = * outRLen * outCLen * sizeOfDataType * * @param gCtx * a valid {@link GPUContext} * @param instName * name of the invoking instruction to record{@link Statistics}. * @param C * output matrix pointer * @param A * left matrix pointer * @param B * right matrix pointer * @param leftNumRows * number of rows of A * @param leftNumColumns * number of cols of A * @param rightNumRows * number of rows of B * @param rightNumColumns * number of cols of B * @param outRLen * number of rows of C * @param outCLen * number of cols of C * @param isLeftTransposed * is op(A) = t(A) * @param isRightTransposed * is op(B) = t(B) */ static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B, long leftNumRows, long leftNumColumns, long rightNumRows, long rightNumColumns, long outRLen, long outCLen, boolean isLeftTransposed, boolean isRightTransposed) { // t(C) = t(B) %*% t(A) Pointer output = null; if (outRLen != 1 && outCLen != 1) { output = gCtx.allocate(instName, outRLen * outCLen * sizeOfDataType); } else { // no transpose required for vector output output = C; } CuMatMultParameters params = new CuMatMultParameters(rightNumRows, rightNumColumns, leftNumRows, leftNumColumns, !isRightTransposed, !isLeftTransposed); denseSparseMatMult(getCusparseHandle(gCtx), instName, output, B, A, params); if (outRLen != 1 && outCLen != 1) { // Transpose: C = t(output) cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T, toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(), toInt(outRLen), C, toInt(outCLen)); if (!DMLScript.EAGER_CUDA_FREE) JCuda.cudaDeviceSynchronize(); 
gCtx.cudaFreeHelper(instName, output, DMLScript.EAGER_CUDA_FREE); } }
Example #20
Source File: JCudaBasicBindingTest.java From jcuda with MIT License | 4 votes |
/**
 * Runs the basic binding test on the {@code JCuda} class and expects it
 * to succeed.
 */
@Test
public void testJCuda()
{
    final boolean bindingOk = BasicBindingTest.testBinding(JCuda.class);
    assertTrue(bindingOk);
}
Example #21
Source File: JCudaMemcpy3DTest.java From jcuda with MIT License | 4 votes |
@Test public void testMemcpy3D() { JCuda.setExceptionsEnabled(true); // Define the size of the memory region, // in number of float elements int sizeFloatsX = 11; int sizeFloatsY = 13; int sizeFloatsZ = 17; int sizeFloats = sizeFloatsX * sizeFloatsY * sizeFloatsZ; cudaExtent extentFloats = new cudaExtent(sizeFloatsX, sizeFloatsY, sizeFloatsZ); // Allocate the host input memory, and fill it with // consecutive numbers ByteBuffer hostInputData = ByteBuffer.allocate(sizeFloats * Sizeof.FLOAT); FloatBuffer hostInputBuffer = hostInputData.order(ByteOrder.nativeOrder()).asFloatBuffer(); for (int i=0; i<hostInputBuffer.capacity(); i++) { hostInputBuffer.put(i, (float)i); } // Allocate the host output memory ByteBuffer hostOutputData = ByteBuffer.allocate(sizeFloats * Sizeof.FLOAT); FloatBuffer hostOutputBuffer = hostOutputData.order(ByteOrder.nativeOrder()).asFloatBuffer(); // Run the 3D memory copy copy(extentFloats, Pointer.to(hostInputData), Pointer.to(hostOutputData)); // Obtain the input- and output data as arrays, and compare it float input[] = new float[hostInputBuffer.capacity()]; hostInputBuffer.slice().get(input); float output[] = new float[hostOutputBuffer.capacity()]; hostOutputBuffer.slice().get(output); assertArrayEquals(input, output, 0.0f); }
Example #22
Source File: JCudaRuntimeUnifiedMemory.java From jcuda-samples with MIT License | 4 votes |
public static void main(String[] args) { JCuda.setExceptionsEnabled(true); JCublas.setExceptionsEnabled(true); // Check if the device supports managed memory int supported[] = { 0 }; cudaDeviceGetAttribute(supported, cudaDevAttrManagedMemory, 0); if (supported[0] == 0) { System.err.println("Device does not support managed memory"); return; } // Allocate managed memory that is accessible to the host int n = 10; long size = n * Sizeof.FLOAT; Pointer p = new Pointer(); cudaMallocManaged(p, size, cudaMemAttachHost); // Obtain the byte buffer from the pointer. This is supported only // for memory that was allocated to be accessible on the host: ByteBuffer bb = p.getByteBuffer(0, size); System.out.println("Buffer on host side: " + bb); // Fill the buffer with sample data FloatBuffer fb = bb.order(ByteOrder.nativeOrder()).asFloatBuffer(); for (int i = 0; i < n; i++) { fb.put(i, i); } // Make the buffer accessible to all devices cudaStreamAttachMemAsync(null, p, 0, cudaMemAttachGlobal); cudaStreamSynchronize(null); // Use the pointer in a device operation (here, a dot product with // JCublas, for example). The data that was filled in by the host // will now be used by the device. cublasHandle handle = new cublasHandle(); cublasCreate(handle); float result[] = { -1.0f }; cublasSdot(handle, n, p, 1, p, 1, Pointer.to(result)); System.out.println("Result: " + result[0]); }
Example #23
Source File: JCudaRuntimeMappedMemory.java From jcuda-samples with MIT License | 4 votes |
/** * Entry point of this sample * * @param args Not used */ public static void main(String args[]) { // Enable exceptions to quickly be informed about errors in this test JCuda.setExceptionsEnabled(true); JCublas2.setExceptionsEnabled(true); // Check if the device supports mapped host memory cudaDeviceProp deviceProperties = new cudaDeviceProp(); cudaGetDeviceProperties(deviceProperties, 0); if (deviceProperties.canMapHostMemory == 0) { System.err.println("This device can not map host memory"); System.err.println(deviceProperties.toFormattedString()); return; } // Set the flag indicating that mapped memory will be used cudaSetDeviceFlags(cudaDeviceMapHost); // Allocate mappable host memory int n = 5; Pointer hostPointer = new Pointer(); cudaHostAlloc(hostPointer, n * Sizeof.FLOAT, cudaHostAllocMapped); // Create a device pointer mapping the host memory Pointer devicePointer = new Pointer(); cudaHostGetDevicePointer(devicePointer, hostPointer, 0); // Obtain a ByteBuffer for accessing the data in the host // pointer. Modifications in this ByteBuffer will be // visible in the device memory. ByteBuffer byteBuffer = hostPointer.getByteBuffer(0, n * Sizeof.FLOAT); // Set the byte order of the ByteBuffer byteBuffer.order(ByteOrder.nativeOrder()); // For convenience, view the ByteBuffer as a FloatBuffer // and fill it with some sample data FloatBuffer floatBuffer = byteBuffer.asFloatBuffer(); System.out.print("Input : "); for (int i = 0; i < n; i++) { floatBuffer.put(i, (float) i); System.out.print(floatBuffer.get(i) + ", "); } System.out.println(); // Apply a CUBLAS routine to the device pointer. This will // modify the host data, which was mapped to the device. cublasHandle handle = new cublasHandle(); cublasCreate(handle); Pointer two = Pointer.to(new float[] { 2.0f }); cublasSscal(handle, n, two, devicePointer, 1); cublasDestroy(handle); cudaDeviceSynchronize(); // Print the contents of the host memory after the // modification via the mapped pointer. 
System.out.print("Output: "); for (int i = 0; i < n; i++) { System.out.print(floatBuffer.get(i) + ", "); } System.out.println(); // Clean up cudaFreeHost(hostPointer); }
Example #24
Source File: JCublas2PointerModes.java From jcuda-samples with MIT License | 4 votes |
/** * Entry point of this sample * * @param args Not used */ public static void main(String[] args) { // Enable exceptions and omit subsequent error checks JCublas2.setExceptionsEnabled(true); JCuda.setExceptionsEnabled(true); // Create the input data: A vector containing the // value 1.0 exactly n times. int n = 1000000; float hostData[] = new float[n]; Arrays.fill(hostData, 1.0f); // Allocate device memory, and copy the input data to the device Pointer deviceData = new Pointer(); cudaMalloc(deviceData, n * Sizeof.FLOAT); cudaMemcpy(deviceData, Pointer.to(hostData), n * Sizeof.FLOAT, cudaMemcpyHostToDevice); // Create a CUBLAS handle cublasHandle handle = new cublasHandle(); cublasCreate(handle); // Execute the 'dot' function in HOST pointer mode: // The result will be written to a pointer that // points to host memory. // Set the pointer mode to HOST cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST); // Prepare the pointer for the result in HOST memory float hostResult[] = { -1.0f }; Pointer hostResultPointer = Pointer.to(hostResult); // Execute the 'dot' function long beforeHostCall = System.nanoTime(); cublasSdot(handle, n, deviceData, 1, deviceData, 1, hostResultPointer); long afterHostCall = System.nanoTime(); // Print the result and timing information double hostDuration = (afterHostCall - beforeHostCall) / 1e6; System.out.println("Host call duration: " + hostDuration + " ms"); System.out.println("Result: " + hostResult[0]); // Execute the 'dot' function in DEVICE pointer mode: // The result will be written to a pointer that // points to device memory. 
// Set the pointer mode to DEVICE cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); // Prepare the pointer for the result in DEVICE memory Pointer deviceResultPointer = new Pointer(); cudaMalloc(deviceResultPointer, Sizeof.FLOAT); // Execute the 'dot' function long beforeDeviceCall = System.nanoTime(); cublasSdot(handle, n, deviceData, 1, deviceData, 1, deviceResultPointer); long afterDeviceCall = System.nanoTime(); // Synchronize in order to wait for the result to // be available (note that this is done implicitly // when cudaMemcpy is called) cudaDeviceSynchronize(); long afterDeviceSync = System.nanoTime(); // Copy the result from the device to the host float deviceResult[] = { -1.0f }; cudaMemcpy(Pointer.to(deviceResult), deviceResultPointer, Sizeof.FLOAT, cudaMemcpyDeviceToHost); // Print the result and timing information double deviceCallDuration = (afterDeviceCall - beforeDeviceCall) / 1e6; double deviceFullDuration = (afterDeviceSync - beforeDeviceCall) / 1e6; System.out .println( "Device call duration: " + deviceCallDuration + " ms"); System.out.println( "Device full duration: " + deviceFullDuration + " ms"); System.out.println("Result: " + deviceResult[0]); // Clean up cudaFree(deviceData); cublasDestroy(handle); }
Example #25
Source File: JCublas2SgemmBatched.java From jcuda-samples with MIT License | 4 votes |
/**
 * Entry point of the sample: enables exceptions for JCublas2 and JCuda and
 * runs {@code testSgemmBatched(10, 100)}.
 *
 * @param args Not used
 */
public static void main(String[] args)
{
    // Report errors as exceptions instead of status codes
    JCublas2.setExceptionsEnabled(true);
    JCuda.setExceptionsEnabled(true);

    testSgemmBatched(10, 100);
}
Example #26
Source File: LibMatrixCuMatMult.java From systemds with Apache License 2.0 | 4 votes |
/** * Internal method to invoke the appropriate CuSPARSE kernel for matrix * multiplication for operation: C = op(A) * op(B) This assumes B and C are * allocated in dense row-major format and A is sparse. * * Other than input and output, this method requires additional memory = * outRLen * outCLen * sizeOfDataType * * @param gCtx * a valid {@link GPUContext} * @param instName * name of the invoking instruction to record{@link Statistics}. * @param C * output matrix pointer * @param A * left matrix pointer * @param B * right matrix pointer * @param leftNumRows * number of rows of A * @param leftNumColumns * number of cols of A * @param rightNumRows * number of rows of B * @param rightNumColumns * number of cols of B * @param outRLen * number of rows of C * @param outCLen * number of cols of C * @param isLeftTransposed * is op(A) = t(A) * @param isRightTransposed * is op(B) = t(B) */ static void sparseDenseMatMult(GPUContext gCtx, String instName, Pointer C, CSRPointer A, Pointer B, long leftNumRows, long leftNumColumns, long rightNumRows, long rightNumColumns, long outRLen, long outCLen, boolean isLeftTransposed, boolean isRightTransposed) { // t(C) = t(B) %*% t(A) Pointer output = null; if (outRLen != 1 && outCLen != 1) { output = gCtx.allocate(instName, outRLen * outCLen * sizeOfDataType); } else { // no transpose required for vector output output = C; } CuMatMultParameters params = new CuMatMultParameters(rightNumRows, rightNumColumns, leftNumRows, leftNumColumns, !isRightTransposed, !isLeftTransposed); denseSparseMatMult(getCusparseHandle(gCtx), instName, output, B, A, params); if (outRLen != 1 && outCLen != 1) { // Transpose: C = t(output) cudaSupportFunctions.cublasgeam(gCtx.getCublasHandle(), cublasOperation.CUBLAS_OP_T, cublasOperation.CUBLAS_OP_T, toInt(outCLen), toInt(outRLen), one(), output, toInt(outRLen), zero(), new Pointer(), toInt(outRLen), C, toInt(outCLen)); if (!DMLScript.EAGER_CUDA_FREE) JCuda.cudaDeviceSynchronize(); 
gCtx.cudaFreeHelper(instName, output, DMLScript.EAGER_CUDA_FREE); } }
Example #27
Source File: GPUContext.java From systemds with Apache License 2.0 | 4 votes |
/**
 * Queries the device that is currently in use.
 *
 * @return the current device for the calling host thread
 */
public static int cudaGetDevice() {
	// JCuda returns the device through a one-element output array
	final int[] deviceHolder = { 0 };
	JCuda.cudaGetDevice(deviceHolder);
	return deviceHolder[0];
}
Example #28
Source File: GPUContext.java From systemds with Apache License 2.0 | 4 votes |
/**
 * Queries the device that is currently in use.
 *
 * @return the current device for the calling host thread
 */
public static int cudaGetDevice() {
	// JCuda returns the device through a one-element output array
	final int[] deviceHolder = { 0 };
	JCuda.cudaGetDevice(deviceHolder);
	return deviceHolder[0];
}