Java Code Examples for org.apache.spark.api.java.JavaPairRDD#values()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#values().
The examples are drawn from open-source projects; the source file, project, and license are noted above each example.
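Before the project-specific examples, here is a minimal, self-contained sketch of what values() does: it drops the keys of a JavaPairRDD and returns a plain JavaRDD containing only the values. The class name and sample data are invented for illustration; only the Spark API calls themselves are real.

    import java.util.Arrays;
    import java.util.List;

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;

    import scala.Tuple2;

    public class ValuesExample {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf().setAppName("ValuesExample").setMaster("local[2]");
            try (JavaSparkContext sc = new JavaSparkContext(conf)) {
                List<Tuple2<String, Integer>> pairs = Arrays.asList(
                        new Tuple2<>("a", 1),
                        new Tuple2<>("b", 2),
                        new Tuple2<>("c", 3));
                JavaPairRDD<String, Integer> pairRdd = sc.parallelizePairs(pairs, 2);
                // values() discards the keys and yields a JavaRDD of the values only.
                JavaRDD<Integer> values = pairRdd.values();
                System.out.println(values.collect()); // e.g. [1, 2, 3]
            }
        }
    }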
Example 1
Source File: StreamableRDDTest_Failures.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testBasicStream() throws Exception {
    StreamListener<ExecRow> sl = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe()
            .parallelizePairs(tenRows, 2).mapToPair(new FailsFunction(3));
    StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    srdd.submit();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        LOG.trace(execRow);
        count++;
        assertNotNull(execRow);
        assertTrue(execRow.getColumn(1).getInt() < 10);
    }
    assertEquals(10, count);
}
Example 2
Source File: StreamableRDDTest_Failures.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testFailureBoundary() throws Exception {
    StreamListener<ExecRow> sl = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe()
            .parallelizePairs(tenRows, 20).mapToPair(new FailsFunction(4));
    StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    srdd.submit();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        LOG.trace(execRow);
        count++;
        assertNotNull(execRow);
        assertTrue(execRow.getColumn(1).getInt() < 10);
    }
    assertEquals(10, count);
}
Example 3
Source File: StreamableRDDTest.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testBasicStream() throws Exception {
    StreamListener<ExecRow> sl = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(tenRows, 10);
    StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    srdd.submit();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        LOG.trace(execRow);
        count++;
        assertNotNull(execRow);
        assertTrue(execRow.getColumn(1).getInt() < 10);
    }
    assertEquals(10, count);
}
Example 4
Source File: StreamableRDDTest.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testOffset() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>(-1, 60000);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 13);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = 60000;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count + first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(100000 - 60000, count);
}
Example 5
Source File: StreamableRDDTest.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testLimit() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>(400, 0);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 13);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(400, count);
}
Example 6
Source File: StreamableRDDTest.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testOffsetLimit() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>(400, 30000);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 13);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = 30000;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count + first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(400, count);
}
Example 7
Source File: StreamableRDDTest.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testSmallLimit() throws StandardException {
    int limit = 2000;
    int offset = 0;
    int total = 4000;
    int batches = 2;
    int batchSize = 512;
    StreamListener<ExecRow> sl = new StreamListener<>(limit, offset, batches, batchSize);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < total; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 1);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), null, sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort(), batches, batchSize);
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = offset;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count + first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(limit, count);
}
Example 8
Source File: StreamableRDDTest.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testSmallOffsetLimit() throws StandardException {
    int limit = 100;
    int offset = 2000;
    int total = 4000;
    StreamListener<ExecRow> sl = new StreamListener<>(limit, offset);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < total; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 1);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = offset;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count + first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(limit, count);
}
Example 9
Source File: StreamableRDDTest.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testBlockingLargeOddPartitions() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 13);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        count++;
        assertNotNull(execRow);
    }
    assertEquals(100000, count);
}
Example 10
Source File: StreamableRDDTest.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testBlockingLarge() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 12);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        count++;
        assertNotNull(execRow);
    }
    assertEquals(100000, count);
}
Example 11
Source File: StreamableRDDTest.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testBlocking() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < 10000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 6);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                LOG.error(e);
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        count++;
        assertNotNull(execRow);
    }
    assertEquals(10000, count);
}
Example 12
Source File: StreamableRDDTest_Failures.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testFailureAfterRecoveryWarmup() throws StandardException, FileNotFoundException, UnsupportedEncodingException {
    int size = 100000;
    int batches = 2;
    int batchSize = 512;
    StreamListener<ExecRow> sl = new StreamListener<>(-1, 0, batches, batchSize);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < size; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe()
            .parallelizePairs(manyRows, 2).sortByKey().mapToPair(new FailsTwiceFunction(10000, 2000));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), null, sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort(), batches, batchSize);
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        count++;
    }
    assertEquals(size, count);
}
Example 13
Source File: StreamableRDDTest_Failures.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testFailureDuringRecoveryWarmup() throws StandardException, FileNotFoundException, UnsupportedEncodingException {
    int size = 100000;
    int batches = 2;
    int batchSize = 512;
    StreamListener<ExecRow> sl = new StreamListener<>(-1, 0, batches, batchSize);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < size; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe()
            .parallelizePairs(manyRows, 2).sortByKey().mapToPair(new FailsTwiceFunction(10000, 100));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), null, sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort(), batches, batchSize);
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        count++;
    }
    assertEquals(size, count);
}
Example 14
Source File: StreamableRDDTest_Failures.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testFailureAfterLimit() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>(40000, 300);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe()
            .parallelizePairs(manyRows, 13).mapToPair(new FailsFunction(40301));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = 300;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count + first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(40000, count);
}
Example 15
Source File: StreamableRDDTest_Failures.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testFailureBeforeOffset() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>(40000, 300);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe()
            .parallelizePairs(manyRows, 13).mapToPair(new FailsFunction(200));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = 300;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count + first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(40000, count);
}
Example 16
Source File: StreamableRDDTest_Failures.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testFailureBeforeLargeOffset() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>(400, 30000);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe()
            .parallelizePairs(manyRows, 13).mapToPair(new FailsFunction(29500));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = 30000;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count + first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(400, count);
}
Example 17
Source File: StreamableRDDTest_Failures.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testBlockingLarge() throws StandardException, FileNotFoundException, UnsupportedEncodingException {
    int size = 100000;
    int batches = 2;
    int batchSize = 512;
    StreamListener<ExecRow> sl = new StreamListener<>(-1, 0, batches, batchSize);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < size; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe()
            .parallelizePairs(manyRows, 12).sortByKey().mapToPair(new FailsFunction(10000));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), null, sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort(), batches, batchSize);
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        count++;
    }
    assertEquals(size, count);
}
Example 18
Source File: StreamableRDDTest_Failures.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testBlockingMedium() throws StandardException, FileNotFoundException, UnsupportedEncodingException {
    int size = 20000;
    int batches = 2;
    int batchSize = 512;
    StreamListener<ExecRow> sl = new StreamListener<>(-1, 0, batches, batchSize);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < size; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe()
            .parallelizePairs(manyRows, 2).sortByKey().mapToPair(new FailsFunction(5000));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), null, sl.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort(), batches, batchSize);
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        count++;
    }
    assertEquals(size, count);
}
Example 19
Source File: StreamableRDDTest.java From spliceengine with GNU Affero General Public License v3.0
@Test
public void testConcurrentQueries() throws StandardException, ExecutionException, InterruptedException {
    final StreamListener<ExecRow> sl1 = new StreamListener<>();
    final StreamListener<ExecRow> sl2 = new StreamListener<>();
    final StreamListener<ExecRow> sl3 = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl1);
    server.register(sl2);
    server.register(sl3);
    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for (int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 12);
    final StreamableRDD srdd1 = new StreamableRDD(rdd.values(), sl1.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    final StreamableRDD srdd2 = new StreamableRDD(rdd.values().map(new Function<ExecRow,ExecRow>() {
        @Override
        public ExecRow call(ExecRow o) throws Exception {
            o.getColumn(1).setValue(0);
            return o;
        }
    }), sl2.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    final StreamableRDD srdd3 = new StreamableRDD(rdd.values(), sl3.getUuid(),
            hostAndPort.getHostText(), hostAndPort.getPort());
    for (final StreamableRDD srdd : Arrays.asList(srdd1, srdd2, srdd3)) {
        new Thread() {
            @Override
            public void run() {
                try {
                    srdd.submit();
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
        }.start();
    }
    // We collect them asynchronously into memory so we are able to iterate over them at the same time. Otherwise
    // tasks for the third RDD might be blocked by tasks in other RDDs, and we are not consuming elements from the
    // other iterators so they can become unblocked.
    ExecutorService executor = Executors.newFixedThreadPool(3);
    Future<List<ExecRow>> future1 = executor.submit(new Callable<List<ExecRow>>() {
        @Override
        public List<ExecRow> call() throws Exception {
            return IteratorUtils.toList(sl1.getIterator());
        }
    });
    Future<List<ExecRow>> future2 = executor.submit(new Callable<List<ExecRow>>() {
        @Override
        public List<ExecRow> call() throws Exception {
            return IteratorUtils.toList(sl2.getIterator());
        }
    });
    Future<List<ExecRow>> future3 = executor.submit(new Callable<List<ExecRow>>() {
        @Override
        public List<ExecRow> call() throws Exception {
            return IteratorUtils.toList(sl3.getIterator());
        }
    });
    Iterator<ExecRow> it1 = future1.get().iterator();
    Iterator<ExecRow> it2 = future2.get().iterator();
    Iterator<ExecRow> it3 = future3.get().iterator();
    int count = 0;
    while (it1.hasNext()) {
        ExecRow r1 = it1.next();
        ExecRow r2 = it2.next();
        ExecRow r3 = it3.next();
        count++;
        assertNotNull(r1);
        assertNotNull(r2);
        assertNotNull(r3);
        assertEquals(0, r2.getColumn(1).getInt());
        assertEquals(r1.getColumn(1), r3.getColumn(1));
        assertEquals(r1.getColumn(2), r2.getColumn(2));
    }
    assertEquals(100000, count);
}
Example 20
Source File: MLUpdate.java From oryx with Apache License 2.0
@Override
public void runUpdate(JavaSparkContext sparkContext,
                      long timestamp,
                      JavaPairRDD<Object,M> newKeyMessageData,
                      JavaPairRDD<Object,M> pastKeyMessageData,
                      String modelDirString,
                      TopicProducer<String,String> modelUpdateTopic)
        throws IOException, InterruptedException {

    Objects.requireNonNull(newKeyMessageData);

    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

    if (newData != null) {
        newData.cache();
        // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
        // when many workers try to materialize the RDDs at once. Hence the workaround.
        newData.foreachPartition(p -> {});
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(p -> {});
    }

    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(
        getHyperParameterValues(), hyperParamSearch, candidates);

    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));

    FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
    fs.mkdirs(candidatesPath);

    Path bestCandidatePath = findBestCandidatePath(
        sparkContext, newData, pastData, hyperParameterCombos, candidatesPath);

    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        // Move best model into place
        fs.rename(bestCandidatePath, finalPath);
    }
    // Then delete everything else
    fs.delete(candidatesPath, true);

    if (modelUpdateTopic == null) {
        log.info("No update topic configured, not publishing models to a topic");
    } else {
        // Push PMML model onto update topic, if it exists
        Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
        if (fs.exists(bestModelPath)) {
            FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
            PMML bestModel = null;
            boolean modelNeededForUpdates = canPublishAdditionalModelData();
            boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
            if (modelNeededForUpdates || modelNotTooLarge) {
                // Either the model is required for publishAdditionalModelData, or required because it's going to
                // be serialized to Kafka
                try (InputStream in = fs.open(bestModelPath)) {
                    bestModel = PMMLUtils.read(in);
                }
            }
            if (modelNotTooLarge) {
                modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
            } else {
                modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
            }
            if (modelNeededForUpdates) {
                publishAdditionalModelData(
                    sparkContext, bestModel, newData, pastData, finalPath, modelUpdateTopic);
            }
        }
    }

    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}