org.nd4j.jita.conf.CudaEnvironment Java Examples
The following examples show how to use
org.nd4j.jita.conf.CudaEnvironment.
Each example notes the project, source file, and license it was taken from.
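Before the individual examples, here is a minimal sketch of the typical usage pattern: CudaEnvironment is a singleton, and its Configuration is normally adjusted once, before any INDArray is created, so the CUDA backend picks the settings up at initialization. The setter names below all appear in the examples that follow; the specific values and the class name CudaEnvironmentUsageSketch are illustrative assumptions, not recommended settings.

import org.nd4j.jita.conf.CudaEnvironment;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class CudaEnvironmentUsageSketch {
    public static void main(String[] args) {
        // Adjust the CUDA backend configuration before any ND4J work is done.
        // Values here are illustrative only.
        CudaEnvironment.getInstance().getConfiguration()
                .allowMultiGPU(true)                                  // use all visible devices
                .setMaximumDeviceCache(2L * 1024 * 1024 * 1024L)      // cap device-side cache at 2 GB
                .setMaximumHostCache(8L * 1024 * 1024 * 1024L)        // cap host-side cache at 8 GB
                .enableDebug(false)
                .setVerbose(false);

        // Any subsequent ND4J operation runs against the configuration above.
        INDArray array = Nd4j.create(1000, 1000);
        array.addi(1.0f);
        System.out.println("Mean: " + array.meanNumber());
    }
}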
Example #1
Source File: CudaAffinityManager.java From deeplearning4j with Apache License 2.0
/**
 * This method returns device id available. Round-robin balancing used here.
 *
 * @param threadId this parameter can be anything, it's used for logging only.
 * @return
 */
protected Integer getNextDevice(long threadId) {
    Integer device = null;
    if (!CudaEnvironment.getInstance().getConfiguration().isForcedSingleGPU() && getNumberOfDevices() > 0) {
        // simple round-robin here
        synchronized (this) {
            device = CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().get(devPtr.getAndIncrement());

            // We check only for number of entries here, not their actual values
            if (devPtr.get() >= CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().size())
                devPtr.set(0);

            val t = Thread.currentThread();
            val n = t.getId() == threadId ? t.getName() : "N/A";

            logger.debug("Mapping thread [{} - {}] to device [{}], out of [{}] devices...", threadId, n, device,
                    CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().size());
        }
    } else {
        device = CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().get(0);
        logger.debug("Single device is forced, mapping to device [{}]", device);
    }

    return device;
}
Example #2
Source File: AsynchronousFlowControllerTest.java From nd4j with Apache License 2.0
@Before
public void setUp() throws Exception {
    CudaEnvironment.getInstance().getConfiguration()
            .setFirstMemory(AllocationStatus.DEVICE)
            .setExecutionModel(Configuration.ExecutionModel.ASYNCHRONOUS)
            .setAllocationModel(Configuration.AllocationModel.CACHE_ALL)
            .setMaximumSingleDeviceAllocation(1024 * 1024 * 1024L)
            .setMaximumBlockSize(128)
            .allowPreallocation(true)
            .setPreallocationCalls(20)
            .setMaximumGridSize(256)
            .enableDebug(false)
            .setVerbose(false);

    if (allocator == null)
        allocator = AtomicAllocator.getInstance();

    if (controller == null)
        controller = (AsynchronousFlowController) allocator.getFlowController();
}
Example #3
Source File: LimitedContextPool.java From nd4j with Apache License 2.0
public LimitedContextPool() {
    int perDevicePool = CudaEnvironment.getInstance().getConfiguration().getPoolSize();

    for (int i = 0; i < 4; i++) {
        ReferenceQueue<Thread> queue = new ReferenceQueue<>();
        ResourceGarbageCollectorThread collector = new ResourceGarbageCollectorThread(i, queue);
        collector.start();
        collectors.put(i, collector);
        queueMap.put(i, queue);
    }

    fillPoolWithResources(perDevicePool, false);
    currentPoolSize.set(perDevicePool);
}
Example #4
Source File: AtomicAllocator.java From deeplearning4j with Apache License 2.0
public void applyConfiguration() {
    //log.info("Applying CUDA configuration...");
    CudaEnvironment.getInstance().notifyConfigurationApplied();

    NativeOpsHolder.getInstance().getDeviceNativeOps().enableDebugMode(configuration.isDebug());
    //configuration.enableDebug(configuration.isDebug());

    NativeOpsHolder.getInstance().getDeviceNativeOps().enableVerboseMode(configuration.isVerbose());
    //configuration.setVerbose(configuration.isVerbose());

    NativeOpsHolder.getInstance().getDeviceNativeOps().enableP2P(configuration.isCrossDeviceAccessAllowed());
    //configuration.allowCrossDeviceAccess(configuration.isCrossDeviceAccessAllowed());

    NativeOpsHolder.getInstance().getDeviceNativeOps().setGridLimit(configuration.getMaximumGridSize());
    //configuration.setMaximumGridSize(configuration.getMaximumGridSize());

    NativeOpsHolder.getInstance().getDeviceNativeOps().setOmpNumThreads(configuration.getMaximumBlockSize());
    // configuration.setMaximumBlockSize(configuration.getMaximumBlockSize());

    NativeOpsHolder.getInstance().getDeviceNativeOps().setOmpMinThreads(configuration.getMinimumBlockSize());
    // configuration.setMinimumBlockSize(configuration.getMinimumBlockSize());
}
Example #5
Source File: AtomicAllocator.java From nd4j with Apache License 2.0
public void applyConfiguration() {
    //log.info("Applying CUDA configuration...");
    CudaEnvironment.getInstance().notifyConfigurationApplied();

    NativeOpsHolder.getInstance().getDeviceNativeOps().enableDebugMode(configuration.isDebug());
    //configuration.enableDebug(configuration.isDebug());

    NativeOpsHolder.getInstance().getDeviceNativeOps().enableVerboseMode(configuration.isVerbose());
    //configuration.setVerbose(configuration.isVerbose());

    NativeOpsHolder.getInstance().getDeviceNativeOps().enableP2P(configuration.isCrossDeviceAccessAllowed());
    //configuration.allowCrossDeviceAccess(configuration.isCrossDeviceAccessAllowed());

    NativeOpsHolder.getInstance().getDeviceNativeOps().setGridLimit(configuration.getMaximumGridSize());
    //configuration.setMaximumGridSize(configuration.getMaximumGridSize());

    NativeOpsHolder.getInstance().getDeviceNativeOps().setOmpNumThreads(configuration.getMaximumBlockSize());
    // configuration.setMaximumBlockSize(configuration.getMaximumBlockSize());

    NativeOpsHolder.getInstance().getDeviceNativeOps().setOmpMinThreads(configuration.getMinimumBlockSize());
    // configuration.setMinimumBlockSize(configuration.getMinimumBlockSize());
}
Example #6
Source File: PolicyNetService.java From FancyBing with GNU General Public License v3.0
public static void main(String[] args) {
    Nd4j.getMemoryManager().setAutoGcWindow(2000);
    CudaEnvironment.getInstance().getConfiguration()
            .setMaximumDeviceCacheableLength(1024 * 1024 * 1024L)
            .setMaximumDeviceCache(2L * 1024 * 1024 * 1024L)
            .setMaximumHostCacheableLength(1024 * 1024 * 1024L)
            .setMaximumHostCache(8L * 1024 * 1024 * 1024L);

    // Register services, bind services in multi ports for better performance
    Registry registry = null;
    for (int i = 0; i < Global.NETWORK_THREADS_NUM; i++) {
        try {
            registry = LocateRegistry.createRegistry(Global.POLICYNET_RMI_PORT + i);
            PolicyNetService policyNet = new PolicyNetService();
            registry.rebind(Global.NAME + "Policy", policyNet);
            System.out.println("Bind FancyBingPolicy server on " + (Global.POLICYNET_RMI_PORT + i));
            System.out.println("FancyBingPolicy server started.");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Example #7
Source File: CudaAffinityManager.java From nd4j with Apache License 2.0
/**
 * This method returns device id available. Round-robin balancing used here.
 *
 * @param threadId this parameter can be anything, it's used for logging only.
 * @return
 */
protected Integer getNextDevice(long threadId) {
    Integer device = null;
    if (!CudaEnvironment.getInstance().getConfiguration().isForcedSingleGPU() && getNumberOfDevices() > 0) {
        // simple round-robin here
        synchronized (this) {
            device = CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().get(devPtr.getAndIncrement());

            // We check only for number of entries here, not their actual values
            if (devPtr.get() >= CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().size())
                devPtr.set(0);

            logger.debug("Mapping thread [{}] to device [{}], out of [{}] devices...", threadId, device,
                    CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().size());
        }
    } else {
        device = CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().get(0);
        logger.debug("Single device is forced, mapping to device [{}]", device);
    }

    return device;
}
Example #8
Source File: CudaTransformsTests.java From nd4j with Apache License 2.0
@Before
public void setUp() {
    CudaEnvironment.getInstance().getConfiguration()
            .setFirstMemory(AllocationStatus.DEVICE)
            .setExecutionModel(Configuration.ExecutionModel.ASYNCHRONOUS)
            .setAllocationModel(Configuration.AllocationModel.CACHE_ALL)
            .setMaximumSingleDeviceAllocation(1024 * 1024 * 1024L)
            .setMaximumBlockSize(128)
            .setMaximumGridSize(256)
            .enableDebug(false)
            .setVerbose(false);

    System.out.println("Init called");
}
Example #9
Source File: CudnnDropoutHelper.java From deeplearning4j with Apache License 2.0
@Override
public void backprop(INDArray gradAtOutput, INDArray gradAtInput) {
    int[] gradAtOutShape = adaptForTensorDescr(ArrayUtil.toInts(gradAtOutput.shape()));
    int[] gradAtOutStride = adaptForTensorDescr(ArrayUtil.toInts(gradAtOutput.stride()));
    checkCudnn(cudnnSetTensorNdDescriptor(cudnnContext.dyTensorDesc, dataType, gradAtOutShape.length, gradAtOutShape, gradAtOutStride));

    int[] gradAtInShape = adaptForTensorDescr(ArrayUtil.toInts(gradAtInput.shape()));
    int[] gradAtInStride = adaptForTensorDescr(ArrayUtil.toInts(gradAtInput.stride()));
    checkCudnn(cudnnSetTensorNdDescriptor(cudnnContext.dxTensorDesc, dataType, gradAtInShape.length, gradAtInShape, gradAtInStride));

    Allocator allocator = AtomicAllocator.getInstance();
    CudaContext context = allocator.getFlowController().prepareAction(gradAtOutput, gradAtInput);
    Pointer dyPtr = allocator.getPointer(gradAtOutput, context);
    Pointer dxPtr = allocator.getPointer(gradAtInput, context);

    checkCudnn(cudnnDropoutBackward(cudnnContext, cudnnContext.dropoutDesc, cudnnContext.dyTensorDesc, dyPtr,
            cudnnContext.dxTensorDesc, dxPtr, mask, mask.capacity()));

    allocator.registerAction(context, gradAtOutput, gradAtInput);
    if (CudaEnvironment.getInstance().getConfiguration().isDebug())
        context.syncOldStream();
}
Example #10
Source File: WeirdSparkTests.java From nd4j with Apache License 2.0
@Before
public void setUp() {
    CudaEnvironment.getInstance().getConfiguration()
            .enableDebug(false)
            .setVerbose(false)
            .allowPreallocation(false)
            .setAllocationModel(Configuration.AllocationModel.CACHE_ALL)
            .setMemoryModel(Configuration.MemoryModel.IMMEDIATE);
}
Example #11
Source File: CudaReduce3Tests.java From nd4j with Apache License 2.0
@Before
public void setUp() {
    CudaEnvironment.getInstance().getConfiguration()
            .setFirstMemory(AllocationStatus.DEVICE)
            .setAllocationModel(Configuration.AllocationModel.DIRECT)
            .setMaximumBlockSize(32)
            .enableDebug(true)
            .setVerbose(true);

    System.out.println("Init called");
}
Example #12
Source File: DevicesTests.java From nd4j with Apache License 2.0
@Test
public void testOtherDevice1() {
    CudaEnvironment.getInstance().getConfiguration().useDevices(1, 2);

    INDArray array = Nd4j.create(1000000);

    for (int i = 0; i < 10000; i++) {
        array.addi(10f);
    }

    assertEquals(1, AtomicAllocator.getInstance().getAllocationPoint(array).getDeviceId());
}
Example #13
Source File: DevicesTests.java From nd4j with Apache License 2.0
@Test
public void testOtherDevice2() {
    CudaEnvironment.getInstance().getConfiguration().useDevices(0);

    INDArray array = Nd4j.create(1000000);

    for (int i = 0; i < 10000; i++) {
        array.addi(10f);
    }

    assertEquals(0, AtomicAllocator.getInstance().getAllocationPoint(array).getDeviceId());
}
Example #14
Source File: AveragingTests.java From nd4j with Apache License 2.0
@Before
public void setUp() {
    DataTypeUtil.setDTypeForContext(DataBuffer.Type.FLOAT);
    CudaEnvironment.getInstance().getConfiguration()
            .allowMultiGPU(true)
            .allowCrossDeviceAccess(true)
            .enableDebug(true)
            .setMaximumGridSize(512)
            .setMaximumBlockSize(256)
            .setVerbose(true);
}
Example #15
Source File: CudaBroadcastTests.java From nd4j with Apache License 2.0
@Before
public void setUp() {
    CudaEnvironment.getInstance().getConfiguration()
            .setExecutionModel(Configuration.ExecutionModel.SEQUENTIAL)
            .setFirstMemory(AllocationStatus.DEVICE)
            .setMaximumBlockSize(64)
            .setMaximumGridSize(128)
            .enableDebug(true);

    System.out.println("Init called");
}
Example #16
Source File: SporadicTests.java From nd4j with Apache License 2.0
@Test
public void testReduceX() throws Exception {
    CudaEnvironment.getInstance().getConfiguration().setMaximumGridSize(11);

    INDArray x = Nd4j.create(500, 500);
    INDArray exp_0 = Nd4j.linspace(1, 500, 500);
    INDArray exp_1 = Nd4j.create(500).assign(250.5);

    x.addiRowVector(Nd4j.linspace(1, 500, 500));

    assertEquals(exp_0, x.mean(0));
    assertEquals(exp_1, x.mean(1));
    assertEquals(250.5, x.meanNumber().doubleValue(), 1e-5);
}
Example #17
Source File: SporadicTests.java From nd4j with Apache License 2.0
@Test
public void testIndexReduceX() throws Exception {
    CudaEnvironment.getInstance().getConfiguration().setMaximumGridSize(11);

    INDArray x = Nd4j.create(500, 500);
    INDArray exp_0 = Nd4j.create(500).assign(0);
    INDArray exp_1 = Nd4j.create(500).assign(499);

    x.addiRowVector(Nd4j.linspace(1, 500, 500));

    assertEquals(exp_0, Nd4j.argMax(x, 0));
    assertEquals(exp_1, Nd4j.argMax(x, 1));
}
Example #18
Source File: ElementWiseStrideTests.java From nd4j with Apache License 2.0
@Before
public void setUp() {
    CudaEnvironment.getInstance().getConfiguration()
            .setFirstMemory(AllocationStatus.DEVICE)
            .setExecutionModel(Configuration.ExecutionModel.SEQUENTIAL)
            .setAllocationModel(Configuration.AllocationModel.CACHE_ALL)
            .setMaximumBlockSize(128)
            .enableDebug(true)
            .setVerbose(true);

    System.out.println("Init called");
}
Example #19
Source File: EndlessTests.java From nd4j with Apache License 2.0
@Before
public void setUp() {
    CudaEnvironment.getInstance().getConfiguration()
            .setFirstMemory(AllocationStatus.DEVICE)
            .setExecutionModel(Configuration.ExecutionModel.SEQUENTIAL)
            .setAllocationModel(Configuration.AllocationModel.CACHE_ALL)
            .enableDebug(false)
            .setVerbose(false);

    System.out.println("Init called");
}
Example #20
Source File: CudaPairwiseTrainformsTests.java From nd4j with Apache License 2.0
@Before
public void setUp() {
    CudaEnvironment.getInstance().getConfiguration()
            .setExecutionModel(Configuration.ExecutionModel.SEQUENTIAL)
            .setFirstMemory(AllocationStatus.DEVICE)
            .setMaximumBlockSize(256)
            .setMaximumGridSize(64)
            .enableDebug(true)
            .setVerbose(true);

    System.out.println("Init called");
}
Example #21
Source File: DelayedMemoryTest.java From nd4j with Apache License 2.0
@Before
public void setUp() {
    CudaEnvironment.getInstance().getConfiguration()
            .setFirstMemory(AllocationStatus.DEVICE)
            .setMemoryModel(Configuration.MemoryModel.DELAYED)
            .allowMultiGPU(true)
            .enableDebug(true);
}
Example #22
Source File: CudaIndexReduceTests.java From nd4j with Apache License 2.0
@Before
public void setUp() {
    CudaEnvironment.getInstance().getConfiguration()
            .setExecutionModel(Configuration.ExecutionModel.SEQUENTIAL)
            .setFirstMemory(AllocationStatus.DEVICE)
            .setMaximumBlockSize(64)
            .setMaximumGridSize(64)
            .enableDebug(true);

    System.out.println("Init called");
}
Example #23
Source File: DoublesTests.java From nd4j with Apache License 2.0
@Before
public void setUp() throws Exception {
    System.out.println("----------------------");
    DataTypeUtil.setDTypeForContext(DataBuffer.Type.DOUBLE);
    CudaEnvironment.getInstance().getConfiguration().enableDebug(true).setVerbose(true).allowMultiGPU(false);
}
Example #24
Source File: CudaAffinityManager.java From nd4j with Apache License 2.0
/**
 * This method pairs specified thread & device
 *
 * @param threadId
 * @param deviceId
 */
@Override
public void attachThreadToDevice(long threadId, Integer deviceId) {
    List<Integer> devices = new ArrayList<>(CudaEnvironment.getInstance().getConfiguration().getAvailableDevices());

    logger.debug("Manually mapping thread [{}] to device [{}], out of [{}] devices...", threadId, deviceId, devices.size());
    affinityMap.put(threadId, deviceId);
}
Example #25
Source File: CudaCachingZeroProvider.java From nd4j with Apache License 2.0
/**
 * This method provides PointersPair to memory chunk specified by AllocationShape
 *
 * PLEASE NOTE: This method can actually ignore malloc request, and give out previously cached free memory chunk with equal shape.
 *
 * @param shape shape of desired memory chunk
 * @param point target AllocationPoint structure
 * @param location either HOST or DEVICE
 * @return
 */
@Override
public PointersPair malloc(AllocationShape shape, AllocationPoint point, AllocationStatus location) {
    long reqMemory = AllocationUtils.getRequiredMemory(shape);
    if (location == AllocationStatus.HOST
            && reqMemory < CudaEnvironment.getInstance().getConfiguration().getMaximumHostCacheableLength()) {

        CacheHolder cache = zeroCache.get(shape);
        if (cache != null) {
            Pointer pointer = cache.poll();
            if (pointer != null) {
                cacheZeroHit.incrementAndGet();

                // since this memory chunk is going to be used now, remove it's amount from
                zeroCachedAmount.addAndGet(-1 * reqMemory);

                PointersPair pair = new PointersPair();
                pair.setDevicePointer(new CudaPointer(pointer.address()));
                pair.setHostPointer(new CudaPointer(pointer.address()));

                point.setAllocationStatus(AllocationStatus.HOST);
                return pair;
            }
        }
        cacheZeroMiss.incrementAndGet();

        if (CudaEnvironment.getInstance().getConfiguration().isUsePreallocation()
                && zeroCachedAmount.get() < CudaEnvironment.getInstance().getConfiguration().getMaximumHostCache() / 10
                && reqMemory < 16 * 1024 * 1024L) {
            CachePreallocator preallocator = new CachePreallocator(shape, location,
                    CudaEnvironment.getInstance().getConfiguration().getPreallocationCalls());
            preallocator.start();
        }

        cacheZeroMiss.incrementAndGet();
        return super.malloc(shape, point, location);
    }

    return super.malloc(shape, point, location);
}
Example #26
Source File: CudaFullCachingProvider.java From nd4j with Apache License 2.0
/**
 * This method provides PointersPair to memory chunk specified by AllocationShape
 *
 * PLEASE NOTE: This method can actually ignore malloc request, and give out previously cached free memory chunk with equal shape.
 *
 * @param shape shape of desired memory chunk
 * @param point target AllocationPoint structure
 * @param location either HOST or DEVICE
 * @return
 */
@Override
public PointersPair malloc(AllocationShape shape, AllocationPoint point, AllocationStatus location) {
    long reqMemory = AllocationUtils.getRequiredMemory(shape);
    if (location == AllocationStatus.DEVICE
            && reqMemory < CudaEnvironment.getInstance().getConfiguration().getMaximumDeviceAllocation()) {

        int deviceId = AtomicAllocator.getInstance().getDeviceId();
        ensureDeviceCacheHolder(deviceId, shape);

        CacheHolder cache = deviceCache.get(deviceId).get(shape);
        if (cache != null) {
            Pointer pointer = cache.poll();
            if (pointer != null) {
                cacheDeviceHit.incrementAndGet();
                deviceCachedAmount.get(deviceId).addAndGet(-1 * reqMemory);

                PointersPair pair = new PointersPair();
                pair.setDevicePointer(pointer);

                point.setAllocationStatus(AllocationStatus.DEVICE);
                point.setDeviceId(deviceId);
                return pair;
            }
        }
        cacheDeviceMiss.incrementAndGet();
        return super.malloc(shape, point, location);
    }

    return super.malloc(shape, point, location);
}
Example #27
Source File: CudaAccumTests.java From nd4j with Apache License 2.0
@Before
public void setUp() {
    CudaEnvironment.getInstance().getConfiguration()
            .setExecutionModel(Configuration.ExecutionModel.ASYNCHRONOUS)
            .setFirstMemory(AllocationStatus.DEVICE)
            .setMaximumBlockSize(128)
            .setMaximumGridSize(256)
            .enableDebug(false)
            .setVerbose(false);

    System.out.println("Init called");
}
Example #28
Source File: CudaScalarsTests.java From nd4j with Apache License 2.0
@Before
public void setUp() {
    CudaEnvironment.getInstance().getConfiguration()
            .setExecutionModel(Configuration.ExecutionModel.SEQUENTIAL)
            .setFirstMemory(AllocationStatus.DEVICE)
            .setMaximumBlockSize(64)
            .setMaximumGridSize(256)
            .enableDebug(true);

    System.out.println("Init called");
}
Example #29
Source File: LimitedContextPool.java From nd4j with Apache License 2.0
protected synchronized void fillPoolWithResources(int numResources, boolean restoreDevice) {
    List<Integer> devices = CudaEnvironment.getInstance().getConfiguration().getAvailableDevices();

    int cDevice = 0;
    if (restoreDevice) {
        cDevice = AtomicAllocator.getInstance().getDeviceId();
    }

    NativeOps nativeOps = NativeOpsHolder.getInstance().getDeviceNativeOps();

    for (Integer device : devices) {
        nativeOps.setDevice(new CudaPointer(device));
        pool.put(device, new LinkedBlockingQueue<CudaContext>());

        cublasHandle_t handle = createNewCublasHandle();
        cusolverDnHandle_t solverHandle = createNewSolverHandle();
        for (int cnt = 0; cnt < numResources; cnt++) {
            CudaContext context = createNewStream(device);
            context.initOldStream();
            getDeviceBuffers(context, device);
            context.setHandle(handle);
            context.setSolverHandle(solverHandle);

            context.syncOldStream();

            pool.get(device).add(context);
        }
    }

    if (restoreDevice) {
        nativeOps.setDevice(new CudaPointer(cDevice));
    }
}
Example #30
Source File: ProtectedCudaShapeInfoProvider.java From nd4j with Apache License 2.0
@Override
public Pair<DataBuffer, long[]> createShapeInformation(long[] shape, long[] stride, long offset, long elementWiseStride, char order) {
    // We enforce offset to 0 in shapeBuffer, since we need it for cache efficiency + we don't actually use offset value @ native side
    offset = 0;

    Integer deviceId = AtomicAllocator.getInstance().getDeviceId();

    LongShapeDescriptor descriptor = new LongShapeDescriptor(shape, stride, offset, elementWiseStride, order);

    if (!protector.containsDataBuffer(deviceId, descriptor)) {
        Pair<DataBuffer, long[]> buffer = null;
        synchronized (this) {
            if (!protector.containsDataBuffer(deviceId, descriptor)) {
                //log.info("Cache miss: {}", descriptor);
                buffer = super.createShapeInformation(shape, stride, offset, elementWiseStride, order);
                buffer.getFirst().setConstant(true);

                if (CudaEnvironment.getInstance().getConfiguration().getMemoryModel() == Configuration.MemoryModel.IMMEDIATE) {
                    Nd4j.getConstantHandler().moveToConstantSpace(buffer.getFirst());
                }

                //deviceCache.get(deviceId).put(descriptor, buffer);
                protector.persistDataBuffer(deviceId, descriptor, buffer);

                bytes.addAndGet(buffer.getFirst().length() * 4 * 2);

                cacheMiss.incrementAndGet();
            } else {
                buffer = protector.getDataBuffer(deviceId, descriptor);
            }
        }
        return buffer;
    } else {
        // log.info("Cache hit: {}", descriptor);
        cacheHit.incrementAndGet();
    }

    return protector.getDataBuffer(deviceId, descriptor); //deviceCache.get(deviceId).get(descriptor);
}