Java Code Examples for jcuda.jcudnn.JCudnn#cudnnRNNForwardTraining()
The following examples show how to use jcuda.jcudnn.JCudnn#cudnnRNNForwardTraining().
The examples are drawn from open-source projects; you can go to the original project or source file by following the link above each example, and check out related API usage on the sidebar.
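Before the project examples, here is a minimal sketch of the underlying call pattern, assuming the cuDNN handle, the RNN/tensor/filter descriptors, and the device buffers have already been created and filled elsewhere; the class and method names are illustrative placeholders, not part of JCudnn or SystemDS. It sizes and allocates the workspace and reserve space with cudnnGetRNNWorkspaceSize and cudnnGetRNNTrainingReserveSize, then issues the training-mode forward pass:

import jcuda.Pointer;
import jcuda.jcudnn.JCudnn;
import jcuda.jcudnn.cudnnFilterDescriptor;
import jcuda.jcudnn.cudnnHandle;
import jcuda.jcudnn.cudnnRNNDescriptor;
import jcuda.jcudnn.cudnnTensorDescriptor;
import jcuda.runtime.JCuda;

public class RnnForwardTrainingSketch {
	// Hypothetical helper: runs one training-mode forward pass and returns the
	// cuDNN status code. xDesc/yDesc hold one descriptor per time step.
	static int forwardTraining(cudnnHandle handle, cudnnRNNDescriptor rnnDesc, int seqLength,
			cudnnTensorDescriptor[] xDesc, Pointer x,   // input sequence
			cudnnTensorDescriptor hxDesc, Pointer hx,   // initial hidden state
			cudnnTensorDescriptor cxDesc, Pointer cx,   // initial cell state (LSTM)
			cudnnFilterDescriptor wDesc, Pointer w,     // packed weights and biases
			cudnnTensorDescriptor[] yDesc, Pointer y,   // output sequence
			cudnnTensorDescriptor hyDesc, Pointer hy,   // final hidden state
			cudnnTensorDescriptor cyDesc, Pointer cy) { // final cell state (LSTM)
		// JCudnn reports the required scratch sizes through long[] out-parameters.
		long[] workSpaceSize = new long[1];
		long[] reserveSize = new long[1];
		JCudnn.cudnnGetRNNWorkspaceSize(handle, rnnDesc, seqLength, xDesc, workSpaceSize);
		JCudnn.cudnnGetRNNTrainingReserveSize(handle, rnnDesc, seqLength, xDesc, reserveSize);
		Pointer workSpace = new Pointer();
		Pointer reserveSpace = new Pointer();
		JCuda.cudaMalloc(workSpace, Math.max(workSpaceSize[0], 1));
		JCuda.cudaMalloc(reserveSpace, Math.max(reserveSize[0], 1));
		int status = JCudnn.cudnnRNNForwardTraining(handle, rnnDesc, seqLength,
				xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w,
				yDesc, y, hyDesc, hy, cyDesc, cy,
				workSpace, workSpaceSize[0], reserveSpace, reserveSize[0]);
		// The workspace is pure scratch and can be freed right away; the reserve
		// space must outlive the matching backward calls, so a real caller would
		// keep it (it is freed here only because this sketch stops after forward).
		JCuda.cudaFree(workSpace);
		JCuda.cudaFree(reserveSpace);
		return status;
	}
}

As the examples below show, a fresh new Pointer() (i.e. a null device pointer) can be passed for optional states or outputs that are not needed, such as the cell state in non-LSTM modes.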
Example 1
Source File: LibMatrixCuDNN.java, from systemds, Apache License 2.0
private static void singleLayerUnidirectionalRNNForward(ExecutionContext ec, GPUContext gCtx, String instName,
		Pointer x, Pointer hx, Pointer cx, Pointer wPointer, // input
		String outputName, String cyName,                    // output
		String rnnMode, boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
	boolean hasCarry = rnnMode.equalsIgnoreCase("lstm");
	// Get output pointers
	Pointer cudnnYPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
	Pointer hyPointer = !return_sequences ? getDenseOutputPointer(ec, gCtx, instName, outputName, N, M)
			: gCtx.allocate(instName, N*M*sizeOfDataType);
	Pointer cyPointer = hasCarry ? getDenseOutputPointer(ec, gCtx, instName, cyName, N, M) : new Pointer();
	// Pointer wPointer = getDensePointerForCuDNN(gCtx, w, instName, D+M+2, 4*M);

	try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, rnnMode, N, T, M, D, true, wPointer)) {
		JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T,
			algo.xDesc, x,
			algo.hxDesc, hx,
			algo.cxDesc, cx,
			algo.wDesc, wPointer,
			algo.yDesc, cudnnYPointer,
			algo.hyDesc, hyPointer,
			algo.cyDesc, cyPointer,
			algo.workSpace, algo.sizeInBytes,
			algo.reserveSpace, algo.reserveSpaceSizeInBytes);
	}

	if(return_sequences) {
		gCtx.cudaFreeHelper(instName, hyPointer, DMLScript.EAGER_CUDA_FREE);
		Pointer sysdsYPointer = getDenseOutputPointer(ec, gCtx, instName, outputName, N, T*M);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_output",
			ExecutionConfig.getConfigForSimpleVectorOperations(N*T*M),
			sysdsYPointer, cudnnYPointer, N, T, M, N*T*M);
	}
	gCtx.cudaFreeHelper(instName, cudnnYPointer, DMLScript.EAGER_CUDA_FREE);
}
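In this example, cuDNN writes the output sequence into a temporary buffer (cudnnYPointer), apparently because cuDNN packs y one time step at a time (T blocks of N x M) while SystemDS stores the full sequence as an N x (T*M) matrix; when return_sequences is true, the prepare_lstm_output kernel performs that reordering into the SystemDS output, and otherwise only the final hidden state hy is written straight to the output. The try-with-resources block scopes LibMatrixCuDNNRnnAlgorithm, which owns the descriptors, workspace, and reserve space, so they are released as soon as the forward call returns.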
Example 2
Source File: LibMatrixCuDNN.java, from systemds, Apache License 2.0
public static void lstmBackward(ExecutionContext ec, GPUContext gCtx, String instName,
		Pointer x, Pointer hx, Pointer cx, Pointer wPointer, String doutName, String dcyName, // input
		String dxName, String dwName, String dbName, String dhxName, String dcxName,          // output
		boolean return_sequences, int N, int M, int D, int T) throws DMLRuntimeException {
	// Transform the input dout and prepare them for cudnnRNNBackwardData
	Pointer dy = gCtx.allocate(instName, N*T*M*sizeOfDataType);
	int size = return_sequences ? N*T*M : N*M;
	LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_backward_gradients",
		ExecutionConfig.getConfigForSimpleVectorOperations(size),
		getDenseInputPointer(ec, gCtx, instName, doutName, N, return_sequences ? T*M : M),
		dy, N, T, M, size, return_sequences ? 1 : 0);
	ec.releaseMatrixInputForGPUInstruction(doutName);

	// Allocate intermediate pointers computed by forward
	Pointer yPointer = gCtx.allocate(instName, N*T*M*sizeOfDataType);
	try(LibMatrixCuDNNRnnAlgorithm algo = new LibMatrixCuDNNRnnAlgorithm(ec, gCtx, instName, "lstm", N, T, M, D, true, wPointer)) {
		JCudnn.cudnnRNNForwardTraining(gCtx.getCudnnHandle(), algo.rnnDesc, T,
			algo.xDesc, x,
			algo.hxDesc, hx,
			algo.cxDesc, cx,
			algo.wDesc, wPointer,
			algo.yDesc, yPointer,
			algo.hyDesc, new Pointer(),
			algo.cyDesc, new Pointer(),
			algo.workSpace, algo.sizeInBytes,
			algo.reserveSpace, algo.reserveSpaceSizeInBytes);

		Pointer cudnnDx = gCtx.allocate(instName, N*T*D*LibMatrixCUDA.sizeOfDataType);
		JCudnn.cudnnRNNBackwardData(gCtx.getCudnnHandle(), algo.rnnDesc, T,
			algo.yDesc, yPointer,
			// ----------------------
			// Additional inputs:
			algo.dyDesc, dy,
			algo.dhyDesc, new Pointer(),
			algo.dcyDesc, getDenseInputPointer(ec, gCtx, instName, dcyName, N, M),
			// ----------------------
			algo.wDesc, wPointer,
			algo.hxDesc, hx,
			algo.cxDesc, cx,
			// ----------------------
			// Output:
			algo.dxDesc, cudnnDx,
			algo.dhxDesc, getDenseOutputPointer(ec, gCtx, instName, dhxName, N, M),
			algo.dcxDesc, getDenseOutputPointer(ec, gCtx, instName, dcxName, N, M),
			// ----------------------
			algo.workSpace, algo.sizeInBytes,
			algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		gCtx.cudaFreeHelper(instName, dy, DMLScript.EAGER_CUDA_FREE);
		ec.releaseMatrixInputForGPUInstruction(dcyName);
		ec.releaseMatrixOutputForGPUInstruction(dhxName);
		ec.releaseMatrixOutputForGPUInstruction(dcxName);

		Pointer smlDx = getDenseOutputPointer(ec, gCtx, instName, dxName, N, T*D);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dinput",
			ExecutionConfig.getConfigForSimpleVectorOperations(N*T*D),
			smlDx, cudnnDx, N, D, T*D, N*T*D);
		ec.releaseMatrixOutputForGPUInstruction(dxName);
		gCtx.cudaFreeHelper(instName, cudnnDx, DMLScript.EAGER_CUDA_FREE);

		// -------------------------------------------------------------------------------------------
		Pointer cudnnDwPointer = gCtx.allocate(instName, (D+M+2)*(4*M)*LibMatrixCUDA.sizeOfDataType);
		JCudnn.cudnnRNNBackwardWeights(gCtx.getCudnnHandle(), algo.rnnDesc, T,
			algo.xDesc, x,
			algo.hxDesc, hx,
			algo.yDesc, yPointer,
			algo.workSpace, algo.sizeInBytes,
			algo.dwDesc, cudnnDwPointer,
			algo.reserveSpace, algo.reserveSpaceSizeInBytes);
		LibMatrixCUDA.getCudaKernels(gCtx).launchKernel("prepare_lstm_dweight",
			ExecutionConfig.getConfigForSimpleVectorOperations((D+M+2)*(4*M)),
			getDenseOutputPointer(ec, gCtx, instName, dwName, D+M, 4*M),
			getDenseOutputPointer(ec, gCtx, instName, dbName, 1, 4*M), cudnnDwPointer, D, M);
		gCtx.cudaFreeHelper(instName, cudnnDwPointer, DMLScript.EAGER_CUDA_FREE);
		ec.releaseMatrixOutputForGPUInstruction(dwName);
		ec.releaseMatrixOutputForGPUInstruction(dbName);
		// -------------------------------------------------------------------------------------------

		gCtx.cudaFreeHelper(instName, yPointer, DMLScript.EAGER_CUDA_FREE);
	}
}
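Note why this backward example calls cudnnRNNForwardTraining at all: cudnnRNNBackwardData and cudnnRNNBackwardWeights consume the output sequence y and the reserve space that only a training-mode forward pass produces, so lstmBackward recomputes the forward pass to repopulate yPointer and algo.reserveSpace before taking gradients. The two backward calls are issued in the order cuDNN requires, cudnnRNNBackwardData before cudnnRNNBackwardWeights, sharing the same workspace and reserve space. Finally, the prepare_lstm_dinput and prepare_lstm_dweight kernels convert the gradients back from cuDNN's packed layout, with prepare_lstm_dweight splitting the (D+M+2) x (4M) cuDNN weight gradient into the weight-matrix gradient and the bias gradient that SystemDS expects.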