Java Code Examples for org.apache.spark.api.java.JavaRDD#collect()
The following examples show how to use
org.apache.spark.api.java.JavaRDD#collect() .
You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
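Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing what collect() does: it is an action that materializes every element of the RDD as a java.util.List on the driver, so it is only appropriate when the result comfortably fits in driver memory.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CollectSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CollectSketch").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaRDD<Integer> numbers = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
            JavaRDD<Integer> squares = numbers.map(x -> x * x);    // lazy transformation
            List<Integer> result = squares.collect();              // action: brings all elements to the driver
            result.forEach(System.out::println);                   // prints 1, 4, 9, 16, 25
        }
    }
}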
Example 1
Source File: DependencyParser.java From vn.vitk with GNU General Public License v3.0 | 6 votes |
/**
 * Parses all sentences in an input file, each on a line, and writes the result to
 * the console window containing flattened dependency tuples.
 * @param jsc
 * @param inputFileName
 */
public void parse(JavaSparkContext jsc, String inputFileName) {
    List<String> sentences = jsc.textFile(inputFileName).collect();
    JavaRDD<String> input = jsc.parallelize(sentences);
    JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
    JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
    JavaRDD<String> rows = graphs.map(new Function<DependencyGraph, String>() {
        private static final long serialVersionUID = -6021310762521034121L;
        public String call(DependencyGraph graph) {
            return graph.dependencies();
        }
    });
    for (String s : rows.collect()) {
        System.out.println(s);
    }
}
Example 2
Source File: SparkExport.java From deeplearning4j with Apache License 2.0 | 6 votes |
public static void exportCSVLocal(String outputDir, String baseFileName, int numFiles, String delimiter,
                String quote, JavaRDD<List<Writable>> data, int rngSeed) throws Exception {

    JavaRDD<String> lines = data.map(new WritablesToStringFunction(delimiter, quote));
    double[] split = new double[numFiles];
    for (int i = 0; i < split.length; i++)
        split[i] = 1.0 / numFiles;
    JavaRDD<String>[] splitData = lines.randomSplit(split);

    int count = 0;
    Random r = new Random(rngSeed);
    for (JavaRDD<String> subset : splitData) {
        String path = FilenameUtils.concat(outputDir, baseFileName + (count++) + ".csv");
        List<String> linesList = subset.collect();
        if (!(linesList instanceof ArrayList))
            linesList = new ArrayList<>(linesList);
        Collections.shuffle(linesList, r);
        FileUtils.writeLines(new File(path), linesList);
    }
}
Example 3
Source File: SparkExport.java From deeplearning4j with Apache License 2.0 | 6 votes |
public static void exportCSVLocal(String outputDir, String baseFileName, int numFiles, String delimiter,
                String quote, JavaRDD<List<Writable>> data) throws Exception {

    JavaRDD<String> lines = data.map(new WritablesToStringFunction(delimiter, quote));
    double[] split = new double[numFiles];
    for (int i = 0; i < split.length; i++)
        split[i] = 1.0 / numFiles;
    JavaRDD<String>[] splitData = lines.randomSplit(split);

    int count = 0;
    for (JavaRDD<String> subset : splitData) {
        String path = FilenameUtils.concat(outputDir, baseFileName + (count++) + ".csv");
        // subset.saveAsTextFile(path);
        List<String> linesList = subset.collect();
        FileUtils.writeLines(new File(path), linesList);
    }
}
Example 4
Source File: TextPipelineTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test
@Ignore   //AB 2020/04/20 https://github.com/eclipse/deeplearning4j/issues/8849
public void testCountCumSum() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();

    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();

    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();

    List<Long> sentenceCountCumSumList = sentenceCountCumSumRDD.collect();
    assertTrue(sentenceCountCumSumList.get(0) == 6L);
    assertTrue(sentenceCountCumSumList.get(1) == 9L);

    sc.stop();
}
Example 5
Source File: SparkKafkaDataSetWriter.java From spliceengine with GNU Affero General Public License v3.0 | 6 votes |
@Override
public DataSet<ExecRow> write() throws StandardException {
    long start = System.currentTimeMillis();
    CountFunction countFunction = new CountFunction<>();
    KafkaStreamer kafkaStreamer = new KafkaStreamer(rdd.getNumPartitions(), topicName);
    JavaRDD streamed = rdd.map(countFunction).mapPartitionsWithIndex(kafkaStreamer, true);
    streamed.collect();
    Long count = countFunction.getCount().value();

    if (count == 0) {
        try {
            kafkaStreamer.noData();
        } catch (Exception e) {
            throw StandardException.newException("", e);
        }
    }

    long end = System.currentTimeMillis();
    ValueRow valueRow = new ValueRow(2);
    valueRow.setColumn(1, new SQLLongint(count));
    valueRow.setColumn(2, new SQLLongint(end - start));
    return new SparkDataSet<>(SpliceSpark.getContext().parallelize(Collections.singletonList(valueRow), 1));
}
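In this example collect() is called only as an action that forces the Kafka-writing pipeline to execute; the returned list itself is discarded. When collect() is used purely to trigger side effects, a cheaper action avoids copying every partition back to the driver. The following is a minimal generic sketch of that alternative, not part of the Splice Machine code above; the send method is a stand-in for a real producer call such as KafkaStreamer.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ForceActionSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("ForceActionSketch").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaRDD<String> records = jsc.parallelize(Arrays.asList("a", "b", "c"));

            // Side-effecting work runs on the executors, once per partition;
            // unlike collect(), nothing is copied back to the driver.
            records.foreachPartition(partition -> {
                while (partition.hasNext()) {
                    send(partition.next());   // stand-in for a real producer call
                }
            });

            // If a driver-side trigger or count is still needed, count() is cheaper than collect().
            long processed = records.count();
            System.out.println("processed " + processed + " records");
        }
    }

    // Placeholder for a real side effect (e.g. writing to Kafka).
    private static void send(String record) {
        System.out.println("sending " + record);
    }
}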
Example 6
Source File: HaplotypeCallerSpark.java From gatk-protected with BSD 3-Clause "New" or "Revised" License | 5 votes |
/**
 * WriteVariants, this is currently going to be horribly slow and explosive on a full size file since it performs a collect.
 *
 * This will be replaced by a parallel writer similar to what's done with {@link org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink}
 */
private void writeVariants(JavaRDD<VariantContext> variants) {
    final List<VariantContext> collectedVariants = variants.collect();
    final SAMSequenceDictionary referenceDictionary = getReferenceSequenceDictionary();

    final List<VariantContext> sortedVariants = collectedVariants.stream()
            .sorted((o1, o2) -> IntervalUtils.compareLocatables(o1, o2, referenceDictionary))
            .collect(Collectors.toList());

    final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgs, getHeaderForReads(),
            new ReferenceMultiSourceAdapter(getReference(), getAuthHolder()));
    try (final VariantContextWriter writer = hcEngine.makeVCFWriter(output, getBestAvailableSequenceDictionary())) {
        hcEngine.writeHeader(writer, getHeaderForReads().getSequenceDictionary(), Collections.emptySet());
        sortedVariants.forEach(writer::add);
    }
}
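The javadoc above warns that collecting every variant to the driver is slow and memory-hungry on full-size inputs. One generic way to reduce driver pressure (this is not the parallel writer the GATK comment refers to, just a sketch of the general pattern) is to sort on the cluster and then stream results back one partition at a time with toLocalIterator() instead of collect().

import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class StreamToDriverSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("StreamToDriverSketch").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaRDD<Integer> values = jsc.parallelize(Arrays.asList(5, 3, 9, 1, 7), 3);

            // Sort on the cluster instead of sorting after a collect() on the driver ...
            JavaRDD<Integer> sorted = values.sortBy(x -> x, true, 3);

            // ... then pull results back one partition at a time rather than all at once.
            Iterator<Integer> it = sorted.toLocalIterator();
            while (it.hasNext()) {
                System.out.println(it.next());   // stand-in for a writer call such as writer.add(variant)
            }
        }
    }
}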
Example 7
Source File: MapPartitionsWithIndex.java From SparkDemo with MIT License | 5 votes |
private static void mapPartitionsWithIndex(JavaSparkContext sc) {
    List<String> names = Arrays.asList("张三1", "李四1", "王五1", "张三2", "李四2", "王五2", "张三3", "李四3", "王五3", "张三4");

    // Initialize the RDD with 3 partitions
    JavaRDD<String> namesRDD = sc.parallelize(names, 3);

    JavaRDD<String> mapPartitionsWithIndexRDD = namesRDD
            .mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

                private static final long serialVersionUID = 1L;

                public Iterator<String> call(Integer v1, Iterator<String> v2) throws Exception {
                    List<String> list = new ArrayList<String>();
                    while (v2.hasNext()) {
                        list.add("Partition index: " + v1 + "\t" + v2.next());
                    }
                    return list.iterator();
                }
            }, true);

    // Collect the data from the cluster into local driver memory
    List<String> result = mapPartitionsWithIndexRDD.collect();
    for (String s : result) {
        System.out.println(s);
    }

    sc.close();
}
Example 8
Source File: PersistenceRDDTest.java From hui-bigdata-spark with Apache License 2.0 | 5 votes |
/**
 * Convert an external collection into an RDD.
 */
@Test
public void testParallelize() {
    List<String> stringList = Arrays.asList("1", "2", "3", "4", "5");
    JavaRDD<String> parallelize = sparkContext.parallelize(stringList);
    List<String> collect = parallelize.collect();
    checkResult(collect);
}
Example 9
Source File: ReadsSparkSourceUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
private void doLoadReads(String bam, String referencePathName, ValidationStringency validationStringency) {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final GATKPath inputBamSpecifier = new GATKPath(bam);
    final GATKPath referenceInputPath = referencePathName == null ? null : new GATKPath(referencePathName);

    ReadsSparkSource readSource = new ReadsSparkSource(ctx, validationStringency);
    JavaRDD<GATKRead> rddSerialReads = getSerialReads(ctx, bam, referenceInputPath, validationStringency);
    JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBamSpecifier, referenceInputPath);

    List<GATKRead> serialReads = rddSerialReads.collect();
    List<GATKRead> parallelReads = rddParallelReads.collect();
    Assert.assertEquals(serialReads.size(), parallelReads.size());
}
Example 10
Source File: TestHoodieClientOnCopyOnWriteStorage.java From hudi with Apache License 2.0 | 5 votes |
private Pair<Path, JavaRDD<WriteStatus>> testConsistencyCheck(HoodieTableMetaClient metaClient, String instantTime)
        throws Exception {
    HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false)
            .withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true)
                    .withMaxConsistencyCheckIntervalMs(1).withInitialConsistencyCheckIntervalMs(1).build())
            .build();
    HoodieWriteClient client = getHoodieWriteClient(cfg);

    client.startCommitWithTime(instantTime);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(dataGen.generateInserts(instantTime, 200), 1);
    JavaRDD<WriteStatus> result = client.bulkInsert(writeRecords, instantTime);
    result.collect();

    // Create a dummy marker file to simulate the case that a marker file was created without data file.
    // This should fail the commit
    String partitionPath = Arrays
            .stream(fs.globStatus(new Path(String.format("%s/*/*/*/*", metaClient.getMarkerFolderPath(instantTime))),
                    path -> path.toString().endsWith(HoodieTableMetaClient.MARKER_EXTN)))
            .limit(1).map(status -> status.getPath().getParent().toString()).collect(Collectors.toList()).get(0);
    Path markerFilePath = new Path(String.format("%s/%s", partitionPath,
            FSUtils.makeMarkerFile(instantTime, "1-0-1", UUID.randomUUID().toString())));
    metaClient.getFs().create(markerFilePath);
    LOG.info("Created a dummy marker path=" + markerFilePath);

    Exception e = assertThrows(HoodieCommitException.class, () -> {
        client.commit(instantTime, result);
    }, "Commit should fail due to consistency check");
    assertTrue(e.getCause() instanceof HoodieIOException);
    return Pair.of(markerFilePath, result);
}
Example 11
Source File: ReadsSparkSourceUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
@Test(dataProvider = "loadShardedReads", groups = "spark") public void shardedReadsSparkSourceTest(String expectedBam, String shardedBam, String referencePath) { JavaSparkContext ctx = SparkContextFactory.getTestSparkContext(); final GATKPath referenceInputPath = referencePath == null ? null : new GATKPath(referencePath); final GATKPath shardedBamSpecifier = new GATKPath(shardedBam); ReadsSparkSource readSource = new ReadsSparkSource(ctx); JavaRDD<GATKRead> rddSerialReads = getSerialReads(ctx, expectedBam, referenceInputPath, ReadConstants.DEFAULT_READ_VALIDATION_STRINGENCY); JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(shardedBamSpecifier, referenceInputPath); List<GATKRead> serialReads = rddSerialReads.collect(); List<GATKRead> parallelReads = rddParallelReads.collect(); Assert.assertEquals(parallelReads.size(), serialReads.size()); }
Example 12
Source File: MiniBatchTests.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test
public void testMiniBatches() throws Exception {
    log.info("Setting up Spark Context...");
    JavaRDD<String> lines = sc.textFile(new ClassPathResource("svmLight/iris_svmLight_0.txt")
            .getTempFileFromArchive().toURI().toString()).cache();
    long count = lines.count();
    assertEquals(300, count);

    // gotta map this to a Matrix/INDArray
    RecordReader rr = new SVMLightRecordReader();
    Configuration c = new Configuration();
    c.set(SVMLightRecordReader.NUM_FEATURES, "5");
    rr.setConf(c);

    JavaRDD<DataSet> points = lines.map(new RecordReaderFunction(rr, 4, 3)).cache();
    count = points.count();
    assertEquals(300, count);

    List<DataSet> collect = points.collect();

    points = points.repartition(1);
    JavaRDD<DataSet> miniBatches = new RDDMiniBatches(10, points).miniBatchesJava();
    count = miniBatches.count();
    List<DataSet> list = miniBatches.collect();
    assertEquals(30, count); //Expect exactly 30 from 1 partition... could be more for multiple input partitions

    lines.unpersist();
    points.unpersist();
    miniBatches.map(new DataSetAssertionFunction());
}
Example 13
Source File: LogisticRegressionBridgeTest.java From spark-transformers with Apache License 2.0 | 5 votes |
@Test
public void testLogisticRegression() {
    //prepare data
    String datapath = "src/test/resources/binary_classification_test.libsvm";
    JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

    //Train model in spark
    LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

    //Export this model
    byte[] exportedModel = ModelExporter.export(lrmodel, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //validate predictions
    List<LabeledPoint> testPoints = trainingData.collect();
    for (LabeledPoint i : testPoints) {
        Vector v = i.features();
        double actual = lrmodel.predict(v);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put("features", v.toArray());
        transformer.transform(data);
        double predicted = (double) data.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
Example 14
Source File: TestExport.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test
public void testBatchAndExportDataSetsFunction() throws Exception {
    String baseDir = System.getProperty("java.io.tmpdir");
    baseDir = FilenameUtils.concat(baseDir, "dl4j_spark_testBatchAndExport/");
    baseDir = baseDir.replaceAll("\\\\", "/");
    File f = new File(baseDir);
    if (f.exists())
        FileUtils.deleteDirectory(f);
    f.mkdir();
    f.deleteOnExit();

    int minibatchSize = 5;
    int nIn = 4;
    int nOut = 3;

    List<DataSet> dataSets = new ArrayList<>();
    dataSets.add(new DataSet(Nd4j.create(10, nIn), Nd4j.create(10, nOut))); //Larger than minibatch size -> tests splitting
    for (int i = 0; i < 98; i++) {
        if (i % 2 == 0) {
            dataSets.add(new DataSet(Nd4j.create(5, nIn), Nd4j.create(5, nOut)));
        } else {
            dataSets.add(new DataSet(Nd4j.create(1, nIn), Nd4j.create(1, nOut)));
            dataSets.add(new DataSet(Nd4j.create(1, nIn), Nd4j.create(1, nOut)));
            dataSets.add(new DataSet(Nd4j.create(3, nIn), Nd4j.create(3, nOut)));
        }
    }

    Collections.shuffle(dataSets, new Random(12345));

    JavaRDD<DataSet> rdd = sc.parallelize(dataSets);
    rdd = rdd.repartition(1); //For testing purposes (should get exactly 100 out, but maybe more with more partitions)

    JavaRDD<String> pathsRdd = rdd.mapPartitionsWithIndex(
            new BatchAndExportDataSetsFunction(minibatchSize, "file:///" + baseDir), true);

    List<String> paths = pathsRdd.collect();
    assertEquals(100, paths.size());

    File[] files = f.listFiles();
    assertNotNull(files);

    int count = 0;
    for (File file : files) {
        if (!file.getPath().endsWith(".bin"))
            continue;
        // System.out.println(file);
        DataSet ds = new DataSet();
        ds.load(file);
        assertEquals(minibatchSize, ds.numExamples());
        count++;
    }

    assertEquals(100, count);
    FileUtils.deleteDirectory(f);
}
Example 15
Source File: Tokenizer.java From vn.vitk with GNU General Public License v3.0 | 4 votes |
/**
 * Tokenizes a text file and returns a list of tokens.
 * @param fileName
 * @return a list of tokens.
 */
public List<String> tokenize(String fileName) {
    JavaRDD<String> input = readTextFile(fileName);
    JavaRDD<String> output = tokenize(input);
    return output.collect();
}
Example 16
Source File: RoughAlignmentClient.java From render with GNU General Public License v2.0 | 4 votes |
private void alignTier() throws IOException {

    LOG.info("alignTier: entry");

    final List<HierarchicalStack> stacksToAlign = new ArrayList<>();

    if (! parameters.keepExisting(PipelineStep.ALIGN) || tierZeroStack.requiresAlignment()) {
        stacksToAlign.add(tierZeroStack);
        tierZeroStack.setAlignmentQuality(null);
    }

    if (stacksToAlign.size() == 1) {

        // broadcast EM_aligner tool to ensure that solver is run serially on each node
        final EMAlignerTool solver = new EMAlignerTool(new File(parameters.solverScript),
                                                       new File(parameters.solverParametersTemplate));
        final Broadcast<EMAlignerTool> broadcastEMAlignerTool = sparkContext.broadcast(solver);

        final HierarchicalTierSolveFunction solveStacksFunction =
                new HierarchicalTierSolveFunction(parameters.renderWeb.baseDataUrl,
                                                  parameters.zNeighborDistance,
                                                  broadcastEMAlignerTool);

        // remove any pre-existing alignment results ...
        driverTierRender.deleteStack(tierZeroStack.getAlignedStackId().getStack(), null);

        final JavaRDD<HierarchicalStack> rddTierStacksToAlign = sparkContext.parallelize(stacksToAlign);

        final JavaRDD<HierarchicalStack> rddTierStacksAfterAlignment = rddTierStacksToAlign.map(solveStacksFunction);

        final List<HierarchicalStack> tierStacksAfterAlignment = rddTierStacksAfterAlignment.collect();

        LOG.info("alignTier: processing results");

        final Double alignmentQuality = tierStacksAfterAlignment.get(0).getAlignmentQuality();

        if ((alignmentQuality == null) || (alignmentQuality < 0.0)) {
            throw new IOException("alignment of " + tierZeroStack.getSplitStackId() +
                                  " failed (alignment quality is " + alignmentQuality + ")");
        }

        tierZeroStack.setAlignmentQuality(alignmentQuality);
        persistHierarchicalData(tierZeroStack);

        LOG.info("alignTier: {} has alignment quality {}",
                 tierZeroStack.getAlignedStackId(), tierZeroStack.getAlignmentQuality());

    } else {
        LOG.info("alignTier: all aligned stacks have already been generated");
    }

    LOG.info("alignTier: exit");
}
Example 17
Source File: TransformSectionClient.java From render with GNU General Public License v2.0 | 4 votes |
public void run() throws IOException, URISyntaxException {

    final SparkConf conf = new SparkConf().setAppName("TransformSectionClient");
    final JavaSparkContext sparkContext = new JavaSparkContext(conf);

    final String sparkAppId = sparkContext.getConf().getAppId();
    final String executorsJson = LogUtilities.getExecutorsApiJson(sparkAppId);

    LOG.info("run: appId is {}, executors data is {}", sparkAppId, executorsJson);

    final RenderDataClient sourceDataClient = parameters.renderWeb.getDataClient();

    final List<Double> zValues = sourceDataClient.getStackZValues(parameters.stack,
                                                                  parameters.layerRange.minZ,
                                                                  parameters.layerRange.maxZ);
    if (zValues.size() == 0) {
        throw new IllegalArgumentException("source stack does not contain any matching z values");
    }

    final RenderDataClient targetDataClient = new RenderDataClient(parameters.renderWeb.baseDataUrl,
                                                                   parameters.getTargetOwner(),
                                                                   parameters.getTargetProject());

    final StackMetaData sourceStackMetaData = sourceDataClient.getStackMetaData(parameters.stack);
    targetDataClient.setupDerivedStack(sourceStackMetaData, parameters.getTargetStack());

    final LeafTransformSpec stackTransform = new LeafTransformSpec(parameters.transformId,
                                                                   null,
                                                                   parameters.transformClass,
                                                                   parameters.transformData.replace(',', ' '));

    // make RDD
    final JavaRDD<Double> rddZValues = sparkContext.parallelize(zValues);

    final Function<Double, Integer> transformFunction = (Function<Double, Integer>) z -> {

        LogUtilities.setupExecutorLog4j("z " + z);

        //get the source client
        final RenderDataClient sourceDataClient1 = parameters.renderWeb.getDataClient();

        //get the target client(which can be the same as the source)
        final RenderDataClient targetDataClient1 = new RenderDataClient(parameters.renderWeb.baseDataUrl,
                                                                        parameters.getTargetOwner(),
                                                                        parameters.getTargetProject());

        final ResolvedTileSpecCollection sourceCollection = sourceDataClient1.getResolvedTiles(parameters.stack, z);

        sourceCollection.addTransformSpecToCollection(stackTransform);
        sourceCollection.addReferenceTransformToAllTiles(stackTransform.getId(), false);

        //vs tile spec validation?
        sourceCollection.removeUnreferencedTransforms();

        targetDataClient1.saveResolvedTiles(sourceCollection, parameters.getTargetStack(), z);

        return sourceCollection.getTileCount();
    };

    // assign a transformation to the RDD
    final JavaRDD<Integer> rddTileCounts = rddZValues.map(transformFunction);

    // use an action to get the results
    final List<Integer> tileCountList = rddTileCounts.collect();
    long total = 0;
    for (final Integer tileCount : tileCountList) {
        total += tileCount;
    }

    LOG.info("run: collected stats");
    LOG.info("run: saved {} tiles and transforms", total);

    sparkContext.stop();
}
Example 18
Source File: TestIteratorUtils.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test
public void testRRMDSIJoin() throws Exception {
    ClassPathResource cpr1 = new ClassPathResource("spark/rrmdsi/file1.txt");
    ClassPathResource cpr2 = new ClassPathResource("spark/rrmdsi/file2.txt");

    RecordReader rr1 = new CSVRecordReader();
    rr1.initialize(new FileSplit(cpr1.getFile()));
    RecordReader rr2 = new CSVRecordReader();
    rr2.initialize(new FileSplit(cpr2.getFile()));

    RecordReaderMultiDataSetIterator rrmdsi1 = new RecordReaderMultiDataSetIterator.Builder(1)
            .addReader("r1", rr1)
            .addReader("r2", rr2)
            .addInput("r1", 1, 2)
            .addOutput("r2", 1, 2)
            .build();

    RecordReaderMultiDataSetIterator rrmdsi2 = new RecordReaderMultiDataSetIterator.Builder(1)
            .addReader("r1", new SparkSourceDummyReader(0))
            .addReader("r2", new SparkSourceDummyReader(1))
            .addInput("r1", 1, 2)
            .addOutput("r2", 1, 2)
            .build();

    List<MultiDataSet> expected = new ArrayList<>(3);
    while (rrmdsi1.hasNext()) {
        expected.add(rrmdsi1.next());
    }

    JavaRDD<List<Writable>> rdd1 = sc.textFile(cpr1.getFile().getPath()).coalesce(1)
            .map(new StringToWritablesFunction(new CSVRecordReader()));

    JavaRDD<List<Writable>> rdd2 = sc.textFile(cpr2.getFile().getPath()).coalesce(1)
            .map(new StringToWritablesFunction(new CSVRecordReader()));

    List<JavaRDD<List<Writable>>> list = Arrays.asList(rdd1, rdd2);

    JavaRDD<MultiDataSet> mdsRdd = IteratorUtils.mapRRMDSI(list, null, new int[]{0, 0}, null, false, rrmdsi2);

    List<MultiDataSet> act = mdsRdd.collect();

    expected = new ArrayList<>(expected);
    act = new ArrayList<>(act);

    Comparator<MultiDataSet> comp = new Comparator<MultiDataSet>() {
        @Override
        public int compare(MultiDataSet d1, MultiDataSet d2) {
            return Double.compare(d1.getFeatures(0).getDouble(0), d2.getFeatures(0).getDouble(0));
        }
    };

    Collections.sort(expected, comp);
    Collections.sort(act, comp);

    assertEquals(expected, act);
}
Example 19
Source File: TestSequenceRecordReaderBytesFunction.java From DataVec with Apache License 2.0 | 4 votes |
@Test
public void testRecordReaderBytesFunction() throws Exception {
    //Local file path
    ClassPathResource cpr = new ClassPathResource("/video/shapes_0.mp4");
    String path = cpr.getFile().getAbsolutePath();
    String folder = path.substring(0, path.length() - 12);
    path = folder + "*";

    //Load binary data from local file system, convert to a sequence file:
    //Load and convert
    JavaPairRDD<String, PortableDataStream> origData = sc.binaryFiles(path);
    JavaPairRDD<Text, BytesWritable> filesAsBytes = origData.mapToPair(new FilesAsBytesFunction());
    //Write the sequence file:
    Path p = Files.createTempDirectory("dl4j_rrbytesTest");
    p.toFile().deleteOnExit();
    String outPath = p.toString() + "/out";
    filesAsBytes.saveAsNewAPIHadoopFile(outPath, Text.class, BytesWritable.class, SequenceFileOutputFormat.class);

    //Load data from sequence file, parse via SequenceRecordReader:
    JavaPairRDD<Text, BytesWritable> fromSeqFile = sc.sequenceFile(outPath, Text.class, BytesWritable.class);
    SequenceRecordReader seqRR = new CodecRecordReader();
    Configuration conf = new Configuration();
    conf.set(CodecRecordReader.RAVEL, "true");
    conf.set(CodecRecordReader.START_FRAME, "0");
    conf.set(CodecRecordReader.TOTAL_FRAMES, "25");
    conf.set(CodecRecordReader.ROWS, "64");
    conf.set(CodecRecordReader.COLUMNS, "64");
    Configuration confCopy = new Configuration(conf);
    seqRR.setConf(conf);
    JavaRDD<List<List<Writable>>> dataVecData = fromSeqFile.map(new SequenceRecordReaderBytesFunction(seqRR));

    //Next: do the same thing locally, and compare the results
    InputSplit is = new FileSplit(new File(folder), new String[] {"mp4"}, true);
    SequenceRecordReader srr = new CodecRecordReader();
    srr.initialize(is);
    srr.setConf(confCopy);

    List<List<List<Writable>>> list = new ArrayList<>(4);
    while (srr.hasNext()) {
        list.add(srr.sequenceRecord());
    }
    assertEquals(4, list.size());

    List<List<List<Writable>>> fromSequenceFile = dataVecData.collect();

    assertEquals(4, list.size());
    assertEquals(4, fromSequenceFile.size());

    boolean[] found = new boolean[4];
    for (int i = 0; i < 4; i++) {
        int foundIndex = -1;
        List<List<Writable>> collection = fromSequenceFile.get(i);
        for (int j = 0; j < 4; j++) {
            if (collection.equals(list.get(j))) {
                if (foundIndex != -1)
                    fail(); //Already found this value -> suggests this spark value equals two or more of local version? (Shouldn't happen)
                foundIndex = j;
                if (found[foundIndex])
                    fail(); //One of the other spark values was equal to this one -> suggests duplicates in Spark list
                found[foundIndex] = true; //mark this one as seen before
            }
        }
    }

    int count = 0;
    for (boolean b : found)
        if (b)
            count++;
    assertEquals(4, count); //Expect all 4 and exactly 4 pairwise matches between spark and local versions
}
Example 20
Source File: TestAsyncCompaction.java From hudi with Apache License 2.0 | 4 votes |
private List<HoodieRecord> runNextDeltaCommits(HoodieWriteClient client, final HoodieReadClient readClient,
        List<String> deltaInstants, List<HoodieRecord> records, HoodieWriteConfig cfg, boolean insertFirst,
        List<String> expPendingCompactionInstants) throws Exception {

    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath());
    List<Pair<String, HoodieCompactionPlan>> pendingCompactions = readClient.getPendingCompactions();
    List<String> gotPendingCompactionInstants =
            pendingCompactions.stream().map(pc -> pc.getKey()).sorted().collect(Collectors.toList());
    assertEquals(expPendingCompactionInstants, gotPendingCompactionInstants);

    Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> fgIdToCompactionOperation =
            CompactionUtils.getAllPendingCompactionOperations(metaClient);

    if (insertFirst) {
        // Use first instant for inserting records
        String firstInstant = deltaInstants.get(0);
        deltaInstants = deltaInstants.subList(1, deltaInstants.size());
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
        client.startCommitWithTime(firstInstant);
        JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, firstInstant);
        List<WriteStatus> statusList = statuses.collect();

        if (!cfg.shouldAutoCommit()) {
            client.commit(firstInstant, statuses);
        }
        assertNoWriteErrors(statusList);
        metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath());
        HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
        List<HoodieBaseFile> dataFilesToRead = getCurrentLatestDataFiles(hoodieTable, cfg);
        assertTrue(dataFilesToRead.stream().findAny().isPresent(),
                "should list the parquet files we wrote in the delta commit");
        validateDeltaCommit(firstInstant, fgIdToCompactionOperation, cfg);
    }

    int numRecords = records.size();
    for (String instantTime : deltaInstants) {
        records = dataGen.generateUpdates(instantTime, numRecords);
        metaClient = new HoodieTableMetaClient(hadoopConf, cfg.getBasePath());
        createNextDeltaCommit(instantTime, records, client, metaClient, cfg, false);
        validateDeltaCommit(instantTime, fgIdToCompactionOperation, cfg);
    }
    return records;
}