Java Code Examples for jcuda.jcudnn.JCudnn#cudnnRNNForwardTraining()

The following examples show how to use jcuda.jcudnn.JCudnn#cudnnRNNForwardTraining(). You can vote up the examples you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: LibMatrixCuDNN.java    From systemds with Apache License 2.0    5 votes
/**
 * Runs the forward (training) pass of a single-layer, unidirectional RNN through
 * cuDNN's cudnnRNNForwardTraining and writes the result into the SystemDS output
 * matrices resolved by name from the execution context.
 *
 * @param ec execution context used to resolve the named output matrices
 * @param gCtx GPU context providing the cuDNN handle, allocator and custom kernels
 * @param instName instruction name used for allocation/statistics bookkeeping
 * @param x input sequence pointer
 * @param hx initial hidden state pointer
 * @param cx initial cell state pointer (only meaningful for lstm mode)
 * @param wPointer cuDNN-formatted weight pointer
 * @param outputName name of the output matrix: the final hidden state [N, M], or
 *                   all timesteps [N, T*M] when return_sequences is true
 * @param cyName name of the final cell state output (written only for lstm mode)
 * @param rnnMode cuDNN RNN mode string; "lstm" (case-insensitive) enables the cell-state output
 * @param return_sequences if true, emit the hidden state of every timestep instead of only the last
 * @param N batch size
 * @param M hidden size
 * @param D input feature size
 * @param T sequence length
 * @throws DMLRuntimeException if a GPU operation fails
 */
private static void singleLayerUnidirectionalRNNForward(ExecutionContext ec, GPUContext gCtx, String instName,
		Pointer x, Pointer hx, Pointer cx, Pointer wPointer,  // input
		String outputName, String cyName,  					 // output
		String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
	boolean hasCarry = rnnMode.equalsIgnoreCase("lstm");
	// Get output pointers. Widen to long before multiplying so a large N*T*M
	// cannot overflow 32-bit int arithmetic when computing the byte size.
	Pointer cudnnYPointer = gCtx.allocate(instName, (long)N*T*M*sizeOfDataType);
	Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M) : gCtx.allocate(instName, (long)N*M*sizeOfDataType);
	Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
	// Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);
	
	try {
		try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, rnnMode, N, T, M, D, true, wPointer)) {
			JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
					algo.xDesc, x, 
					algo.hxDesc, hx, 
					algo.cxDesc, cx, 
					algo.wDesc, wPointer, 
					algo.yDesc, cudnnYPointer, 
					algo.hyDesc, hyPointer, 
					algo.cyDesc, cyPointer, 
					algo.workSpace, algo.sizeInBytes, 
					algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		}
		
		if(return_sequences) {
			// The scratch hy buffer is not part of the requested output; free it
			// and reshape cuDNN's sequence output into the [N, T*M] output matrix.
			gCtx.cudaFreeHelper(instName, hyPointer, DMLScript.EAGER_CUDA_FREE);
			Pointer sysdsYPointer = getDenseOutputPointer(ec, gCtx, instName, outputName, N, T*M);
			LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_output",
					ExecutionConfig.getConfigForSimpleVectorOperations(N*T*M),
					sysdsYPointer, cudnnYPointer, N, T, M, N*T*M);
		}
	}
	finally {
		// Always release the cuDNN-layout temporary, even when the forward call
		// or the reshape kernel throws (previously this leaked on exception).
		gCtx.cudaFreeHelper(instName, cudnnYPointer, DMLScript.EAGER_CUDA_FREE);
	}
}
 
Example 2
Source File: LibMatrixCuDNN.java    From systemds with Apache License 2.0    5 votes
/**
 * Forward (training) pass of a single-layer, unidirectional RNN backed by
 * cuDNN's cudnnRNNForwardTraining. Depending on return_sequences, either the
 * final hidden state [N, M] or all timesteps [N, T*M] are written to the
 * output matrix named by outputName; for lstm mode the final cell state is
 * additionally written to cyName.
 */
private static void singleLayerUnidirectionalRNNForward(ExecutionContext ec, GPUContext gCtx, String instName,
		Pointer x, Pointer hx, Pointer cx, Pointer wPointer,  // input
		String outputName, String cyName,  					 // output
		String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
	// Only LSTM cells carry a cell state that must be emitted as cy.
	boolean isLstm = rnnMode.equalsIgnoreCase("lstm");

	// Temporary buffer that receives cuDNN's sequence output (cuDNN-internal
	// layout; reshaped below when the caller wants all timesteps).
	Pointer cudnnY = gCtx.allocate(instName, N*T*M*sizeOfDataType);

	// Final hidden state: goes straight into the output matrix unless the full
	// sequence was requested, in which case it is only a scratch buffer.
	Pointer hyOut;
	if(return_sequences)
		hyOut = gCtx.allocate(instName, N*M*sizeOfDataType);
	else
		hyOut = getDenseOutputPointer(ec, gCtx, instName, outputName, N, M);

	// Final cell state output, or a null device pointer for non-LSTM modes.
	Pointer cyOut;
	if(isLstm)
		cyOut = getDenseOutputPointer(ec, gCtx, instName, cyName, N, M);
	else
		cyOut = new Pointer();
	// Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);

	try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, rnnMode, N, T, M, D, true, wPointer)) {
		JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T,
				algo.xDesc, x,
				algo.hxDesc, hx,
				algo.cxDesc, cx,
				algo.wDesc, wPointer,
				algo.yDesc, cudnnY,
				algo.hyDesc, hyOut,
				algo.cyDesc, cyOut,
				algo.workSpace, algo.sizeInBytes,
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
	}

	if(return_sequences) {
		// Scratch hy is not needed; copy/reshape cuDNN's sequence output into
		// the SystemDS [N, T*M] output matrix via the custom kernel.
		gCtx.cudaFreeHelper(instName, hyOut, DMLScript.EAGER_CUDA_FREE);
		Pointer sysdsY = getDenseOutputPointer(ec, gCtx, instName, outputName, N, T*M);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_output",
				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*M),
				sysdsY, cudnnY, N, T, M, N*T*M);
	}
	gCtx.cudaFreeHelper(instName, cudnnY, DMLScript.EAGER_CUDA_FREE);
}
 
Example 3
Source File: LibMatrixCuDNN.java    From systemds with Apache License 2.0    4 votes
/**
 * LSTM backward pass via cuDNN: computes the gradients w.r.t. the input (dx),
 * weights (dw), bias (db), initial hidden state (dhx) and initial cell state
 * (dcx) from the incoming output gradient (doutName) and final-cell-state
 * gradient (dcyName). Because the forward activations are not cached, this
 * re-executes cudnnRNNForwardTraining first to materialize y and the reserve
 * space that cudnnRNNBackwardData/cudnnRNNBackwardWeights require.
 *
 * @param ec execution context used to resolve named matrix inputs/outputs
 * @param gCtx GPU context providing the cuDNN handle, allocator and custom kernels
 * @param instName instruction name used for allocation/statistics bookkeeping
 * @param x input sequence pointer
 * @param hx initial hidden state pointer
 * @param cx initial cell state pointer
 * @param wPointer cuDNN-formatted weight pointer
 * @param doutName name of the gradient w.r.t. the forward output
 * @param dcyName name of the gradient w.r.t. the final cell state
 * @param dxName output name for the input gradient
 * @param dwName output name for the weight gradient
 * @param dbName output name for the bias gradient
 * @param dhxName output name for the initial-hidden-state gradient
 * @param dcxName output name for the initial-cell-state gradient
 * @param return_sequences whether the forward pass emitted all T timesteps
 *                         (dout is then [N, T*M] instead of [N, M])
 * @param N batch size
 * @param M hidden size
 * @param D input feature size
 * @param T sequence length
 * @throws DMLRuntimeException if a GPU operation fails
 */
public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName,
		Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String doutName, String dcyName,  // input
		String dxName, String dwName, String dbName, String dhxName, String dcxName,  	// output
		boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
	// Transform the input dout and prepare them for cudnnRNNBackwardData:
	// dout is [N, T*M] if return_sequences else [N, M]; the kernel expands it
	// into the dense dy buffer laid out as cuDNN expects (last flag selects mode).
	Pointer dy = gCtx.allocate(instName, N*T*M*sizeOfDataType);
	int size = return_sequences ? N*T*M : N*M;
	LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients",
			ExecutionConfig.getConfigForSimpleVectorOperations(size),
			getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? T*M : M),
			dy, N, T, M, size, return_sequences ? 1 : 0);
	ec.releaseMatrixInputForGPUInstruction(doutName);
			
	// Allocate intermediate pointers computed by forward
	Pointer yPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
	try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) {
		// Re-run the forward pass to fill yPointer and algo.reserveSpace;
		// hy/cy are not needed here, so null device pointers are passed.
		JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.xDesc, x, 
				algo.hxDesc, hx, 
				algo.cxDesc, cx, 
				algo.wDesc, wPointer, 
				algo.yDesc, yPointer, 
				algo.hyDesc, new Pointer(), 
				algo.cyDesc, new Pointer(), 
				algo.workSpace, algo.sizeInBytes, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		
		// Data gradients: dx (cuDNN layout, reshaped below), dhx and dcx
		// (written straight into the named output matrices).
		Pointer cudnnDx = gCtx.allocate(instName, N*T*D*LibMatrixCUDA.sizeOfDataType);
		JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.yDesc, yPointer,
				// ----------------------
				// Additional inputs:
				algo.dyDesc, dy, 
				algo.dhyDesc, new Pointer(), // no gradient w.r.t. the final hidden state
				algo.dcyDesc, getDenseInputPointer(ec, gCtx, instName, dcyName, N, M),
				// ----------------------
				algo.wDesc, wPointer, 
				algo.hxDesc, hx,
				algo.cxDesc, cx,
				// ----------------------
				// Output:
				algo.dxDesc, cudnnDx, 
				algo.dhxDesc, getDenseOutputPointer(ec, gCtx, instName, dhxName, N, M), 
				algo.dcxDesc, getDenseOutputPointer(ec, gCtx, instName, dcxName, N, M),
				// ----------------------
				algo.workSpace, algo.sizeInBytes, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		gCtx.cudaFreeHelper(instName, dy, DMLScript.EAGER_CUDA_FREE);
		ec.releaseMatrixInputForGPUInstruction(dcyName);
		ec.releaseMatrixOutputForGPUInstruction(dhxName);
		ec.releaseMatrixOutputForGPUInstruction(dcxName);
		
		// Reshape cuDNN's dx into the SystemDS [N, T*D] input-gradient matrix.
		Pointer smlDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, T*D);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dinput",
				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
				smlDx, cudnnDx, N, D, T*D, N*T*D);
		ec.releaseMatrixOutputForGPUInstruction(dxName);
		gCtx.cudaFreeHelper(instName, cudnnDx, DMLScript.EAGER_CUDA_FREE);
		
		// -------------------------------------------------------------------------------------------
		// Weight gradients: cuDNN packs dw and db into one (D+M+2) x 4M buffer,
		// which is then split into the dw [D+M, 4M] and db [1, 4M] matrices.
		// NOTE(review): cudnnRNNBackwardWeights accumulates into the dw buffer;
		// this assumes the freshly allocated cudnnDwPointer starts zeroed —
		// confirm gCtx.allocate zero-initializes.
		Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
		JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.xDesc, x, 
				algo.hxDesc, hx, 
				algo.yDesc, yPointer, 
				algo.workSpace, algo.sizeInBytes, 
				algo.dwDesc, cudnnDwPointer, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dweight",
				ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
				getDenseOutputPointer(ec, gCtx, instName, dwName, D+M, 4*M), 
				getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4*M), cudnnDwPointer, D, M);
		gCtx.cudaFreeHelper(instName, cudnnDwPointer, DMLScript.EAGER_CUDA_FREE);
		ec.releaseMatrixOutputForGPUInstruction(dwName);
		ec.releaseMatrixOutputForGPUInstruction(dbName);
		// -------------------------------------------------------------------------------------------
		
		gCtx.cudaFreeHelper(instName, yPointer, DMLScript.EAGER_CUDA_FREE);
	}
}
 
Example 4
Source File: LibMatrixCuDNN.java    From systemds with Apache License 2.0    4 votes
/**
 * Computes the LSTM backward pass on the GPU through cuDNN, producing dx, dw,
 * db, dhx and dcx from the output gradient (doutName) and final-cell-state
 * gradient (dcyName). The forward activations are recomputed here via
 * cudnnRNNForwardTraining, since the backward calls need the y activations
 * and the reserve space from a training-mode forward pass.
 *
 * @param ec execution context used to resolve named matrix inputs/outputs
 * @param gCtx GPU context providing the cuDNN handle, allocator and custom kernels
 * @param instName instruction name used for allocation/statistics bookkeeping
 * @param x input sequence pointer
 * @param hx initial hidden state pointer
 * @param cx initial cell state pointer
 * @param wPointer cuDNN-formatted weight pointer
 * @param doutName name of the gradient w.r.t. the forward output
 * @param dcyName name of the gradient w.r.t. the final cell state
 * @param dxName output name for the input gradient
 * @param dwName output name for the weight gradient
 * @param dbName output name for the bias gradient
 * @param dhxName output name for the initial-hidden-state gradient
 * @param dcxName output name for the initial-cell-state gradient
 * @param return_sequences whether the forward pass emitted all T timesteps
 *                         (determines whether dout is [N, T*M] or [N, M])
 * @param N batch size
 * @param M hidden size
 * @param D input feature size
 * @param T sequence length
 * @throws DMLRuntimeException if a GPU operation fails
 */
public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName,
		Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String doutName, String dcyName,  // input
		String dxName, String dwName, String dbName, String dhxName, String dcxName,  	// output
		boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
	// Transform the input dout and prepare them for cudnnRNNBackwardData:
	// expand the [N, T*M] (or [N, M]) dout matrix into the dense dy buffer
	// cuDNN expects; the trailing 1/0 flag tells the kernel which case applies.
	Pointer dy = gCtx.allocate(instName, N*T*M*sizeOfDataType);
	int size = return_sequences ? N*T*M : N*M;
	LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients",
			ExecutionConfig.getConfigForSimpleVectorOperations(size),
			getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? T*M : M),
			dy, N, T, M, size, return_sequences ? 1 : 0);
	ec.releaseMatrixInputForGPUInstruction(doutName);
			
	// Allocate intermediate pointers computed by forward
	Pointer yPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
	try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) {
		// Recompute the forward pass to populate yPointer and algo.reserveSpace;
		// the hy/cy outputs are unused, so null device pointers are passed.
		JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.xDesc, x, 
				algo.hxDesc, hx, 
				algo.cxDesc, cx, 
				algo.wDesc, wPointer, 
				algo.yDesc, yPointer, 
				algo.hyDesc, new Pointer(), 
				algo.cyDesc, new Pointer(), 
				algo.workSpace, algo.sizeInBytes, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		
		// Data gradients: dx lands in a cuDNN-layout temporary (reshaped below);
		// dhx/dcx are written directly into the named output matrices.
		Pointer cudnnDx = gCtx.allocate(instName, N*T*D*LibMatrixCUDA.sizeOfDataType);
		JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.yDesc, yPointer,
				// ----------------------
				// Additional inputs:
				algo.dyDesc, dy, 
				algo.dhyDesc, new Pointer(), // no incoming gradient w.r.t. the final hidden state
				algo.dcyDesc, getDenseInputPointer(ec, gCtx, instName, dcyName, N, M),
				// ----------------------
				algo.wDesc, wPointer, 
				algo.hxDesc, hx,
				algo.cxDesc, cx,
				// ----------------------
				// Output:
				algo.dxDesc, cudnnDx, 
				algo.dhxDesc, getDenseOutputPointer(ec, gCtx, instName, dhxName, N, M), 
				algo.dcxDesc, getDenseOutputPointer(ec, gCtx, instName, dcxName, N, M),
				// ----------------------
				algo.workSpace, algo.sizeInBytes, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		gCtx.cudaFreeHelper(instName, dy, DMLScript.EAGER_CUDA_FREE);
		ec.releaseMatrixInputForGPUInstruction(dcyName);
		ec.releaseMatrixOutputForGPUInstruction(dhxName);
		ec.releaseMatrixOutputForGPUInstruction(dcxName);
		
		// Reshape cuDNN's dx buffer into the SystemDS [N, T*D] input-gradient matrix.
		Pointer smlDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, T*D);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dinput",
				ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
				smlDx, cudnnDx, N, D, T*D, N*T*D);
		ec.releaseMatrixOutputForGPUInstruction(dxName);
		gCtx.cudaFreeHelper(instName, cudnnDx, DMLScript.EAGER_CUDA_FREE);
		
		// -------------------------------------------------------------------------------------------
		// Weight gradients: cuDNN returns one packed (D+M+2) x 4M buffer that is
		// subsequently split into the dw [D+M, 4M] and db [1, 4M] output matrices.
		// NOTE(review): cudnnRNNBackwardWeights accumulates into dw; this relies on
		// the freshly allocated buffer being zero-initialized — verify gCtx.allocate.
		Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
		JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T, 
				algo.xDesc, x, 
				algo.hxDesc, hx, 
				algo.yDesc, yPointer, 
				algo.workSpace, algo.sizeInBytes, 
				algo.dwDesc, cudnnDwPointer, 
				algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dweight",
				ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
				getDenseOutputPointer(ec, gCtx, instName, dwName, D+M, 4*M), 
				getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4*M), cudnnDwPointer, D, M);
		gCtx.cudaFreeHelper(instName, cudnnDwPointer, DMLScript.EAGER_CUDA_FREE);
		ec.releaseMatrixOutputForGPUInstruction(dwName);
		ec.releaseMatrixOutputForGPUInstruction(dbName);
		// -------------------------------------------------------------------------------------------
		
		gCtx.cudaFreeHelper(instName, yPointer, DMLScript.EAGER_CUDA_FREE);
	}
}