jcuda.jcudnn.JCudnn Java Examples

The following examples show how to use jcuda.jcudnn.JCudnn. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: LibMatrixCuDNN.java    From systemds with Apache License 2.0 6 votes vote down vote up
/**
 * Performs an "softmax" operation on a matrix on the GPU
 * @param ec	execution context
 * @param gCtx a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param in1	input matrix
 * @param outputName	output matrix name
 */
public static void softmax(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
	if(LOG.isTraceEnabled()) {
		LOG.trace("GPU : softmax" + ", GPUContext=" + gCtx);
	}
	cudnnTensorDescriptor tensorDesc = allocateTensorDescriptor(toInt(in1.getNumRows()), toInt(in1.getNumColumns()), 1, 1);
	Pointer srcPointer = getDensePointerForCuDNN(gCtx, in1, instName);
	MatrixObject out = ec.getMatrixObject(outputName);
	ec.allocateGPUMatrixObject(outputName, in1.getNumRows(), in1.getNumColumns());
	out.getGPUObject(gCtx).allocateAndFillDense(0);
	Pointer dstPointer = getDensePointerForCuDNN(gCtx, out, instName);
	JCudnn.cudnnSoftmaxForward(gCtx.getCudnnHandle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, 
               one(), tensorDesc, srcPointer,
               zero(), tensorDesc, dstPointer);
	cudnnDestroyTensorDescriptor(tensorDesc);
}
 
Example #2
Source File: LibMatrixCuDNN.java    From systemds with Apache License 2.0 6 votes vote down vote up
/**
 * Performs an "softmax" operation on a matrix on the GPU
 * @param ec	execution context
 * @param gCtx a valid {@link GPUContext}
 * @param instName the invoking instruction's name for record {@link Statistics}.
 * @param in1	input matrix
 * @param outputName	output matrix name
 */
public static void softmax(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String outputName) {
	if(LOG.isTraceEnabled()) {
		LOG.trace("GPU : softmax" + ", GPUContext=" + gCtx);
	}
	cudnnTensorDescriptor tensorDesc = allocateTensorDescriptor(toInt(in1.getNumRows()), toInt(in1.getNumColumns()), 1, 1);
	Pointer srcPointer = getDensePointerForCuDNN(gCtx, in1, instName);
	MatrixObject out = ec.getMatrixObject(outputName);
	ec.allocateGPUMatrixObject(outputName, in1.getNumRows(), in1.getNumColumns());
	out.getGPUObject(gCtx).allocateAndFillDense(0);
	Pointer dstPointer = getDensePointerForCuDNN(gCtx, out, instName);
	JCudnn.cudnnSoftmaxForward(gCtx.getCudnnHandle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, 
               one(), tensorDesc, srcPointer,
               zero(), tensorDesc, dstPointer);
	cudnnDestroyTensorDescriptor(tensorDesc);
}
 
Example #3
Source File: LibMatrixCuDNN.java    From systemds with Apache License 2.0 5 votes vote down vote up
private static void singleLayerUnidirectionalRNNForward(ExecutionContext ec, GPUContext gCtx, String instName,
		Pointer x, Pointer hx, Pointer cx, Pointer wPointer,  // input
		String outputName, String cyName,  					 // output
		String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
	boolean hasCarry = rnnMode.equalsIgnoreCase("lstm");
	// Get output pointers
	Pointer cudnnYPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
	Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, N*M*sizeOfDataType);
	Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
	// Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);
	
	try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, rnnMode, N, T, M, D, true, wPointer)) {
		JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.xDesc, x, 
				algo.hxDesc, hx, 
				algo.cxDesc, cx, 
				algo.wDesc, wPointer, 
				algo.yDesc, cudnnYPointer, 
				algo.hyDesc, hyPointer, 
				algo.cyDesc, cyPointer, 
				algo.workSpace, algo.sizeInBytes, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
	}
	
	if(return_sequences) {
		gCtx.cudaFreeHelper(instName, hyPointer, DMLScript.EAGER_CUDA_FREE);
		Pointer sysdsYPointer = getDenseOutputPointer(ec, gCtx, instName, outputName, N, T*M);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_output",
				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*M),
				sysdsYPointer, cudnnYPointer, N, T, M, N*T*M);
	}
	gCtx.cudaFreeHelper(instName, cudnnYPointer, DMLScript.EAGER_CUDA_FREE);
}
 
Example #4
Source File: LibMatrixCuDNN.java    From systemds with Apache License 2.0 5 votes vote down vote up
private static void singleLayerUnidirectionalRNNForward(ExecutionContext ec, GPUContext gCtx, String instName,
		Pointer x, Pointer hx, Pointer cx, Pointer wPointer,  // input
		String outputName, String cyName,  					 // output
		String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
	boolean hasCarry = rnnMode.equalsIgnoreCase("lstm");
	// Get output pointers
	Pointer cudnnYPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
	Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, N*M*sizeOfDataType);
	Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
	// Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);
	
	try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, rnnMode, N, T, M, D, true, wPointer)) {
		JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.xDesc, x, 
				algo.hxDesc, hx, 
				algo.cxDesc, cx, 
				algo.wDesc, wPointer, 
				algo.yDesc, cudnnYPointer, 
				algo.hyDesc, hyPointer, 
				algo.cyDesc, cyPointer, 
				algo.workSpace, algo.sizeInBytes, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
	}
	
	if(return_sequences) {
		gCtx.cudaFreeHelper(instName, hyPointer, DMLScript.EAGER_CUDA_FREE);
		Pointer sysdsYPointer = getDenseOutputPointer(ec, gCtx, instName, outputName, N, T*M);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_output",
				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*M),
				sysdsYPointer, cudnnYPointer, N, T, M, N*T*M);
	}
	gCtx.cudaFreeHelper(instName, cudnnYPointer, DMLScript.EAGER_CUDA_FREE);
}
 
Example #5
Source File: JCudnnMnist.java    From jcuda-samples with MIT License 5 votes vote down vote up
public static void main(String args[])
{
    JCuda.setExceptionsEnabled(true);
    JCudnn.setExceptionsEnabled(true);
    JCublas2.setExceptionsEnabled(true);

    int version = (int) cudnnGetVersion();
    System.out.printf("cudnnGetVersion() : %d , " + 
        "CUDNN_VERSION from cudnn.h : %d\n",
        version, CUDNN_VERSION);

    System.out.println("Creating network and layers...");
    Network mnist = new Network();
    
    System.out.println("Classifying...");
    int i1 = mnist.classifyExample(dataDirectory + first_image);
    int i2 = mnist.classifyExample(dataDirectory + second_image);

    mnist.setConvolutionAlgorithm(CUDNN_CONVOLUTION_FWD_ALGO_FFT);
    int i3 = mnist.classifyExample(dataDirectory + third_image);
    
    System.out.println(
        "\nResult of classification: " + i1 + " " + i2 + " " + i3);
    if (i1 != 1 || i2 != 3 || i3 != 5)
    {
        System.out.println("\nTest failed!\n");
    }
    else
    {
        System.out.println("\nTest passed!\n");
    }
    mnist.destroy();
}
 
Example #6
Source File: LibMatrixCuDNN.java    From systemds with Apache License 2.0 4 votes vote down vote up
public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName,
		Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String doutName, String dcyName,  // input
		String dxName, String dwName, String dbName, String dhxName, String dcxName,  	// output
		boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
	// Transform the input dout and prepare them for cudnnRNNBackwardData
	Pointer dy = gCtx.allocate(instName, N*T*M*sizeOfDataType);
	int size = return_sequences ? N*T*M : N*M;
	LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients",
			ExecutionConfig.getConfigForSimpleVectorOperations(size),
			getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? T*M : M),
			dy, N, T, M, size, return_sequences ? 1 : 0);
	ec.releaseMatrixInputForGPUInstruction(doutName);
			
	// Allocate intermediate pointers computed by forward
	Pointer yPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
	try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) {
		JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.xDesc, x, 
				algo.hxDesc, hx, 
				algo.cxDesc, cx, 
				algo.wDesc, wPointer, 
				algo.yDesc, yPointer, 
				algo.hyDesc, new Pointer(), 
				algo.cyDesc, new Pointer(), 
				algo.workSpace, algo.sizeInBytes, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		
		Pointer cudnnDx = gCtx.allocate(instName, N*T*D*LibMatrixCUDA.sizeOfDataType);
		JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.yDesc, yPointer,
				// ----------------------
				// Additional inputs:
				algo.dyDesc, dy, 
				algo.dhyDesc, new Pointer(), 
				algo.dcyDesc, getDenseInputPointer(ec, gCtx, instName, dcyName, N, M),
				// ----------------------
				algo.wDesc, wPointer, 
				algo.hxDesc, hx,
				algo.cxDesc, cx,
				// ----------------------
				// Output:
				algo.dxDesc, cudnnDx, 
				algo.dhxDesc, getDenseOutputPointer(ec, gCtx, instName, dhxName, N, M), 
				algo.dcxDesc, getDenseOutputPointer(ec, gCtx, instName, dcxName, N, M),
				// ----------------------
				algo.workSpace, algo.sizeInBytes, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		gCtx.cudaFreeHelper(instName, dy, DMLScript.EAGER_CUDA_FREE);
		ec.releaseMatrixInputForGPUInstruction(dcyName);
		ec.releaseMatrixOutputForGPUInstruction(dhxName);
		ec.releaseMatrixOutputForGPUInstruction(dcxName);
		
		Pointer smlDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, T*D);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dinput",
				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
				smlDx, cudnnDx, N, D, T*D, N*T*D);
		ec.releaseMatrixOutputForGPUInstruction(dxName);
		gCtx.cudaFreeHelper(instName, cudnnDx, DMLScript.EAGER_CUDA_FREE);
		
		// -------------------------------------------------------------------------------------------
		Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
		JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.xDesc, x, 
				algo.hxDesc, hx, 
				algo.yDesc, yPointer, 
				algo.workSpace, algo.sizeInBytes, 
				algo.dwDesc, cudnnDwPointer, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dweight",
				ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
				getDenseOutputPointer(ec, gCtx, instName, dwName, D+M, 4*M), 
				getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4*M), cudnnDwPointer, D, M);
		gCtx.cudaFreeHelper(instName, cudnnDwPointer, DMLScript.EAGER_CUDA_FREE);
		ec.releaseMatrixOutputForGPUInstruction(dwName);
		ec.releaseMatrixOutputForGPUInstruction(dbName);
		// -------------------------------------------------------------------------------------------
		
		gCtx.cudaFreeHelper(instName, yPointer, DMLScript.EAGER_CUDA_FREE);
	}
}
 
Example #7
Source File: LibMatrixCuDNNRnnAlgorithm.java    From systemds with Apache License 2.0 4 votes vote down vote up
public LibMatrixCuDNNRnnAlgorithm(ExecutionContext ec, GPUContext gCtx, String instName, 
		String rnnMode, int N, int T, int M, int D, boolean isTraining, Pointer w) throws DMLRuntimeException {
	this.gCtx = gCtx;
	this.instName = instName;
	
	// Allocate input/output descriptors
	xDesc = new cudnnTensorDescriptor[T];
	dxDesc = new cudnnTensorDescriptor[T];
	yDesc = new cudnnTensorDescriptor[T];
	dyDesc = new cudnnTensorDescriptor[T];
	for(int t = 0; t < T; t++) {
		xDesc[t] = allocateTensorDescriptorWithStride(N, D, 1);
		dxDesc[t] = allocateTensorDescriptorWithStride(N, D, 1);
		yDesc[t] = allocateTensorDescriptorWithStride(N, M, 1);
		dyDesc[t] = allocateTensorDescriptorWithStride(N, M, 1);
	}
	hxDesc = allocateTensorDescriptorWithStride(1, N, M); 
	dhxDesc = allocateTensorDescriptorWithStride(1, N, M);
	cxDesc = allocateTensorDescriptorWithStride(1, N, M);
	dcxDesc = allocateTensorDescriptorWithStride(1, N, M);
	hyDesc = allocateTensorDescriptorWithStride(1, N, M);
	dhyDesc = allocateTensorDescriptorWithStride(1, N, M);
	cyDesc = allocateTensorDescriptorWithStride(1, N, M);
	dcyDesc = allocateTensorDescriptorWithStride(1, N, M);
	
	// Initial dropout descriptor
	dropoutDesc = new cudnnDropoutDescriptor();
	JCudnn.cudnnCreateDropoutDescriptor(dropoutDesc);
	long [] _dropOutSizeInBytes = {-1};
	JCudnn.cudnnDropoutGetStatesSize(gCtx.getCudnnHandle(), _dropOutSizeInBytes);
	dropOutSizeInBytes = _dropOutSizeInBytes[0];
	dropOutStateSpace = new Pointer();
	if (dropOutSizeInBytes != 0)
		dropOutStateSpace = gCtx.allocate(instName, dropOutSizeInBytes);
	JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes, 12345);
	
	// Initialize RNN descriptor
	rnnDesc = new cudnnRNNDescriptor();
	cudnnCreateRNNDescriptor(rnnDesc);
	JCudnn.cudnnSetRNNDescriptor_v6(gCtx.getCudnnHandle(), rnnDesc, M, 1, dropoutDesc, 
			CUDNN_LINEAR_INPUT, CUDNN_UNIDIRECTIONAL, 
			getCuDNNRnnMode(rnnMode), CUDNN_RNN_ALGO_STANDARD, LibMatrixCUDA.CUDNN_DATA_TYPE);
	
	// Allocate filter descriptor
	int expectedNumWeights = getExpectedNumWeights();
	if(rnnMode.equalsIgnoreCase("lstm") && (D+M+2)*4*M != expectedNumWeights) {
		throw new DMLRuntimeException("Incorrect number of RNN parameters " +  (D+M+2)*4*M + " != " +  expectedNumWeights + ", where numFeatures=" + D + ", hiddenSize=" + M);
	}
	wDesc = allocateFilterDescriptor(expectedNumWeights);
	dwDesc = allocateFilterDescriptor(expectedNumWeights);
	
	// Setup workspace
	workSpace = new Pointer(); reserveSpace = new Pointer();
	sizeInBytes = getWorkspaceSize(T);
	if(sizeInBytes != 0)
		workSpace = gCtx.allocate(instName, sizeInBytes);
	reserveSpaceSizeInBytes = 0;
	if(isTraining) {
		reserveSpaceSizeInBytes = getReservespaceSize(T);
		if (reserveSpaceSizeInBytes != 0) {
			reserveSpace = gCtx.allocate(instName, reserveSpaceSizeInBytes);
		}
	}
}
 
Example #8
Source File: LibMatrixCuDNNRnnAlgorithm.java    From systemds with Apache License 2.0 4 votes vote down vote up
private static cudnnFilterDescriptor allocateFilterDescriptor(int numWeights) {
	cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
	cudnnCreateFilterDescriptor(filterDesc);
	JCudnn.cudnnSetFilterNdDescriptor(filterDesc, LibMatrixCUDA.CUDNN_DATA_TYPE, CUDNN_TENSOR_NCHW, 3, new int[] {numWeights, 1, 1});
	return filterDesc;
}
 
Example #9
Source File: LibMatrixCuDNNRnnAlgorithm.java    From systemds with Apache License 2.0 4 votes vote down vote up
private int getExpectedNumWeights() throws DMLRuntimeException {
	long [] weightSizeInBytesArray = {-1}; // (D+M+2)*4*M
	JCudnn.cudnnGetRNNParamsSize(gCtx.getCudnnHandle(), rnnDesc, xDesc[0], weightSizeInBytesArray, LibMatrixCUDA.CUDNN_DATA_TYPE);
	// check if (D+M+2)*4M == weightsSize / sizeof(dataType) where weightsSize is given by 'cudnnGetRNNParamsSize'.
	return LibMatrixCUDA.toInt(weightSizeInBytesArray[0]/LibMatrixCUDA.sizeOfDataType);
}
 
Example #10
Source File: LibMatrixCuDNNRnnAlgorithm.java    From systemds with Apache License 2.0 4 votes vote down vote up
private long getReservespaceSize(int seqLength) {
	long [] sizeInBytesArray = new long[1];
	JCudnn.cudnnGetRNNTrainingReserveSize(gCtx.getCudnnHandle(), rnnDesc, seqLength, xDesc, sizeInBytesArray);
	return sizeInBytesArray[0];
}
 
Example #11
Source File: LibMatrixCuDNNRnnAlgorithm.java    From systemds with Apache License 2.0 4 votes vote down vote up
private long getWorkspaceSize(int seqLength) {
	long [] sizeInBytesArray = new long[1];
	JCudnn.cudnnGetRNNWorkspaceSize(gCtx.getCudnnHandle(), rnnDesc, seqLength, xDesc, sizeInBytesArray);
	return sizeInBytesArray[0];
}
 
Example #12
Source File: LibMatrixCuDNNRnnAlgorithm.java    From systemds with Apache License 2.0 4 votes vote down vote up
public LibMatrixCuDNNRnnAlgorithm(ExecutionContext ec, GPUContext gCtx, String instName, 
		String rnnMode, int N, int T, int M, int D, boolean isTraining, Pointer w) throws DMLRuntimeException {
	this.gCtx = gCtx;
	this.instName = instName;
	
	// Allocate input/output descriptors
	xDesc = new cudnnTensorDescriptor[T];
	dxDesc = new cudnnTensorDescriptor[T];
	yDesc = new cudnnTensorDescriptor[T];
	dyDesc = new cudnnTensorDescriptor[T];
	for(int t = 0; t < T; t++) {
		xDesc[t] = allocateTensorDescriptorWithStride(N, D, 1);
		dxDesc[t] = allocateTensorDescriptorWithStride(N, D, 1);
		yDesc[t] = allocateTensorDescriptorWithStride(N, M, 1);
		dyDesc[t] = allocateTensorDescriptorWithStride(N, M, 1);
	}
	hxDesc = allocateTensorDescriptorWithStride(1, N, M); 
	dhxDesc = allocateTensorDescriptorWithStride(1, N, M);
	cxDesc = allocateTensorDescriptorWithStride(1, N, M);
	dcxDesc = allocateTensorDescriptorWithStride(1, N, M);
	hyDesc = allocateTensorDescriptorWithStride(1, N, M);
	dhyDesc = allocateTensorDescriptorWithStride(1, N, M);
	cyDesc = allocateTensorDescriptorWithStride(1, N, M);
	dcyDesc = allocateTensorDescriptorWithStride(1, N, M);
	
	// Initial dropout descriptor
	dropoutDesc = new cudnnDropoutDescriptor();
	JCudnn.cudnnCreateDropoutDescriptor(dropoutDesc);
	long [] _dropOutSizeInBytes = {-1};
	JCudnn.cudnnDropoutGetStatesSize(gCtx.getCudnnHandle(), _dropOutSizeInBytes);
	dropOutSizeInBytes = _dropOutSizeInBytes[0];
	dropOutStateSpace = new Pointer();
	if (dropOutSizeInBytes != 0)
		dropOutStateSpace = gCtx.allocate(instName, dropOutSizeInBytes);
	JCudnn.cudnnSetDropoutDescriptor(dropoutDesc, gCtx.getCudnnHandle(), 0, dropOutStateSpace, dropOutSizeInBytes, 12345);
	
	// Initialize RNN descriptor
	rnnDesc = new cudnnRNNDescriptor();
	cudnnCreateRNNDescriptor(rnnDesc);
	JCudnn.cudnnSetRNNDescriptor_v6(gCtx.getCudnnHandle(), rnnDesc, M, 1, dropoutDesc, 
			CUDNN_LINEAR_INPUT, CUDNN_UNIDIRECTIONAL, 
			getCuDNNRnnMode(rnnMode), CUDNN_RNN_ALGO_STANDARD, LibMatrixCUDA.CUDNN_DATA_TYPE);
	
	// Allocate filter descriptor
	int expectedNumWeights = getExpectedNumWeights();
	if(rnnMode.equalsIgnoreCase("lstm") && (D+M+2)*4*M != expectedNumWeights) {
		throw new DMLRuntimeException("Incorrect number of RNN parameters " +  (D+M+2)*4*M + " != " +  expectedNumWeights + ", where numFeatures=" + D + ", hiddenSize=" + M);
	}
	wDesc = allocateFilterDescriptor(expectedNumWeights);
	dwDesc = allocateFilterDescriptor(expectedNumWeights);
	
	// Setup workspace
	workSpace = new Pointer(); reserveSpace = new Pointer();
	sizeInBytes = getWorkspaceSize(T);
	if(sizeInBytes != 0)
		workSpace = gCtx.allocate(instName, sizeInBytes);
	reserveSpaceSizeInBytes = 0;
	if(isTraining) {
		reserveSpaceSizeInBytes = getReservespaceSize(T);
		if (reserveSpaceSizeInBytes != 0) {
			reserveSpace = gCtx.allocate(instName, reserveSpaceSizeInBytes);
		}
	}
}
 
Example #13
Source File: LibMatrixCuDNN.java    From systemds with Apache License 2.0 4 votes vote down vote up
public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName,
		Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String doutName, String dcyName,  // input
		String dxName, String dwName, String dbName, String dhxName, String dcxName,  	// output
		boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
	// Transform the input dout and prepare them for cudnnRNNBackwardData
	Pointer dy = gCtx.allocate(instName, N*T*M*sizeOfDataType);
	int size = return_sequences ? N*T*M : N*M;
	LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients",
			ExecutionConfig.getConfigForSimpleVectorOperations(size),
			getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? T*M : M),
			dy, N, T, M, size, return_sequences ? 1 : 0);
	ec.releaseMatrixInputForGPUInstruction(doutName);
			
	// Allocate intermediate pointers computed by forward
	Pointer yPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
	try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) {
		JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.xDesc, x, 
				algo.hxDesc, hx, 
				algo.cxDesc, cx, 
				algo.wDesc, wPointer, 
				algo.yDesc, yPointer, 
				algo.hyDesc, new Pointer(), 
				algo.cyDesc, new Pointer(), 
				algo.workSpace, algo.sizeInBytes, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		
		Pointer cudnnDx = gCtx.allocate(instName, N*T*D*LibMatrixCUDA.sizeOfDataType);
		JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.yDesc, yPointer,
				// ----------------------
				// Additional inputs:
				algo.dyDesc, dy, 
				algo.dhyDesc, new Pointer(), 
				algo.dcyDesc, getDenseInputPointer(ec, gCtx, instName, dcyName, N, M),
				// ----------------------
				algo.wDesc, wPointer, 
				algo.hxDesc, hx,
				algo.cxDesc, cx,
				// ----------------------
				// Output:
				algo.dxDesc, cudnnDx, 
				algo.dhxDesc, getDenseOutputPointer(ec, gCtx, instName, dhxName, N, M), 
				algo.dcxDesc, getDenseOutputPointer(ec, gCtx, instName, dcxName, N, M),
				// ----------------------
				algo.workSpace, algo.sizeInBytes, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		gCtx.cudaFreeHelper(instName, dy, DMLScript.EAGER_CUDA_FREE);
		ec.releaseMatrixInputForGPUInstruction(dcyName);
		ec.releaseMatrixOutputForGPUInstruction(dhxName);
		ec.releaseMatrixOutputForGPUInstruction(dcxName);
		
		Pointer smlDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, T*D);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dinput",
				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
				smlDx, cudnnDx, N, D, T*D, N*T*D);
		ec.releaseMatrixOutputForGPUInstruction(dxName);
		gCtx.cudaFreeHelper(instName, cudnnDx, DMLScript.EAGER_CUDA_FREE);
		
		// -------------------------------------------------------------------------------------------
		Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
		JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.xDesc, x, 
				algo.hxDesc, hx, 
				algo.yDesc, yPointer, 
				algo.workSpace, algo.sizeInBytes, 
				algo.dwDesc, cudnnDwPointer, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dweight",
				ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
				getDenseOutputPointer(ec, gCtx, instName, dwName, D+M, 4*M), 
				getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4*M), cudnnDwPointer, D, M);
		gCtx.cudaFreeHelper(instName, cudnnDwPointer, DMLScript.EAGER_CUDA_FREE);
		ec.releaseMatrixOutputForGPUInstruction(dwName);
		ec.releaseMatrixOutputForGPUInstruction(dbName);
		// -------------------------------------------------------------------------------------------
		
		gCtx.cudaFreeHelper(instName, yPointer, DMLScript.EAGER_CUDA_FREE);
	}
}
 
Example #14
Source File: LibMatrixCuDNNRnnAlgorithm.java    From systemds with Apache License 2.0 4 votes vote down vote up
private static cudnnFilterDescriptor allocateFilterDescriptor(int numWeights) {
	cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
	cudnnCreateFilterDescriptor(filterDesc);
	JCudnn.cudnnSetFilterNdDescriptor(filterDesc, LibMatrixCUDA.CUDNN_DATA_TYPE, CUDNN_TENSOR_NCHW, 3, new int[] {numWeights, 1, 1});
	return filterDesc;
}
 
Example #15
Source File: LibMatrixCuDNNRnnAlgorithm.java    From systemds with Apache License 2.0 4 votes vote down vote up
private int getExpectedNumWeights() throws DMLRuntimeException {
	long [] weightSizeInBytesArray = {-1}; // (D+M+2)*4*M
	JCudnn.cudnnGetRNNParamsSize(gCtx.getCudnnHandle(), rnnDesc, xDesc[0], weightSizeInBytesArray, LibMatrixCUDA.CUDNN_DATA_TYPE);
	// check if (D+M+2)*4M == weightsSize / sizeof(dataType) where weightsSize is given by 'cudnnGetRNNParamsSize'.
	return LibMatrixCUDA.toInt(weightSizeInBytesArray[0]/LibMatrixCUDA.sizeOfDataType);
}
 
Example #16
Source File: LibMatrixCuDNNRnnAlgorithm.java    From systemds with Apache License 2.0 4 votes vote down vote up
private long getReservespaceSize(int seqLength) {
	long [] sizeInBytesArray = new long[1];
	JCudnn.cudnnGetRNNTrainingReserveSize(gCtx.getCudnnHandle(), rnnDesc, seqLength, xDesc, sizeInBytesArray);
	return sizeInBytesArray[0];
}
 
Example #17
Source File: LibMatrixCuDNNRnnAlgorithm.java    From systemds with Apache License 2.0 4 votes vote down vote up
private long getWorkspaceSize(int seqLength) {
	long [] sizeInBytesArray = new long[1];
	JCudnn.cudnnGetRNNWorkspaceSize(gCtx.getCudnnHandle(), rnnDesc, seqLength, xDesc, sizeInBytesArray);
	return sizeInBytesArray[0];
}