org.apache.spark.api.java.function.FlatMapFunction Java Examples
The following examples show how to use
org.apache.spark.api.java.function.FlatMapFunction.
You can go to the original project or source file by following the links above each example.
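Before the project examples, here is a minimal, self-contained sketch of how FlatMapFunction is typically used with the Spark 2.x+ Java API, where call returns an Iterator. The application name, master URL, and input strings are illustrative only.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

public class FlatMapFunctionSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("FlatMapFunctionSketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b c", "d e"));
            // Each input line is expanded into zero or more output words.
            JavaRDD<String> words = lines.flatMap(
                    (FlatMapFunction<String, String>) line -> Arrays.asList(line.split(" ")).iterator());
            System.out.println(words.collect()); // [a, b, c, d, e]
        }
    }
}

Note that in Spark 1.x the call method returned an Iterable rather than an Iterator, which is why a few of the examples below (for instance Examples #6 and #27) have a slightly different signature.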
Example #1
Source File: KafkaStreaming.java From sparkResearch with Apache License 2.0 | 8 votes |
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // Set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap(
            (FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 ->
                    Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // Count the words
    JavaPairDStream<String, Integer> result = words
            .mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1))
            .reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #2
Source File: ParameterAveragingTrainingMaster.java From deeplearning4j with Apache License 2.0 | 7 votes |
protected void doIteration(SparkComputationGraph graph, JavaRDD<MultiDataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
            splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<MultiDataSet> splitData = split;

    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
            numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = split.partitions().size();

    FlatMapFunction<Iterator<MultiDataSet>, ParameterAveragingTrainingResult> function =
            new ExecuteWorkerMultiDataSetFlatMap<>(getWorkerInstance(graph));

    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(null, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
Example #3
Source File: ChronixSparkContext.java From chronix.spark with Apache License 2.0 | 6 votes |
/**
 * Low-level chunked query.
 *
 * @param query Solr query
 * @param zkHost Zookeeper host
 * @param collection the Solr collection of chronix time series data
 * @param chronixStorage a ChronixSolrCloudStorage instance
 * @return ChronixRDD of time series (chunks)
 * @throws SolrServerException
 */
public ChronixRDD queryChronixChunks(
        final SolrQuery query,
        final String zkHost,
        final String collection,
        final ChronixSolrCloudStorage chronixStorage) throws SolrServerException, IOException {

    // first get a list of replicas to query for this collection
    List<String> shards = chronixStorage.getShardList(zkHost, collection);

    // parallelize the requests to the shards
    JavaRDD<MetricTimeSeries> docs = jsc.parallelize(shards, shards.size()).flatMap(
            (FlatMapFunction<String, MetricTimeSeries>) shardUrl -> chronixStorage.streamFromSingleNode(
                    zkHost, collection, shardUrl, query, new MetricTimeSeriesConverter()).iterator());
    return new ChronixRDD(docs);
}
Example #4
Source File: RemoveOrphanFilesAction.java From iceberg with Apache License 2.0 | 6 votes |
private static FlatMapFunction<Iterator<String>, String> listDirsRecursively(
        Broadcast<SerializableConfiguration> conf,
        long olderThanTimestamp) {

    return (FlatMapFunction<Iterator<String>, String>) dirs -> {
        List<String> subDirs = Lists.newArrayList();
        List<String> files = Lists.newArrayList();

        Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

        int maxDepth = 2000;
        int maxDirectSubDirs = Integer.MAX_VALUE;

        dirs.forEachRemaining(dir -> {
            listDirRecursively(dir, predicate, conf.value().value(), maxDepth, maxDirectSubDirs, subDirs, files);
        });

        if (!subDirs.isEmpty()) {
            throw new RuntimeException("Could not list subdirectories, reached maximum subdirectory depth: " + maxDepth);
        }

        return files.iterator();
    };
}
Example #5
Source File: StructureAligner.java From mmtf-spark with Apache License 2.0 | 6 votes |
/**
 * Creates an RDD of all n*(n-1)/2 unique pairs for pairwise structural alignments.
 * @param sc spark context
 * @param n number of protein chains
 * @return
 */
private static JavaRDD<Tuple2<Integer, Integer>> getPairs(JavaSparkContext sc, int n) {
    // create a list of integers from 0 - n-1
    List<Integer> range = IntStream.range(0, n).boxed().collect(Collectors.toList());
    JavaRDD<Integer> pRange = sc.parallelize(range, NUM_TASKS * sc.defaultParallelism());

    // flatmap this list of integers into all unique pairs
    // (0,1),(0,2),...(0,n-1), (1,2)(1,3),..,(1,n-1), (2,3),(2,4),...
    return pRange.flatMap(new FlatMapFunction<Integer, Tuple2<Integer, Integer>>() {
        private static final long serialVersionUID = -432662341173300339L;

        @Override
        public Iterator<Tuple2<Integer, Integer>> call(Integer t) throws Exception {
            List<Tuple2<Integer, Integer>> pairs = new ArrayList<>();
            for (int i = 0; i < t; i++) {
                pairs.add(new Tuple2<Integer, Integer>(i, t));
            }
            return pairs.iterator();
        }
        // The partitions generated here are not well balanced, which would lead to an
        // unbalanced workload. Here we repartition the pairs for efficient processing.
    }).repartition(NUM_TASKS * sc.defaultParallelism());
}
Example #6
Source File: SparkDistributor.java From DataGenerator with Apache License 2.0 | 6 votes |
@Override
public void distribute(final List<Frontier> frontierList) {
    JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("dg-spark").setMaster(masterURL));

    generatedMaps = sc
            .parallelize(frontierList)
            .flatMap(new FlatMapFunction<Frontier, Map<String, String>>() {
                @Override
                public Iterable<Map<String, String>> call(Frontier frontier) {
                    LinkedList<Map<String, String>> storage = new LinkedList<>();
                    frontier.searchForScenarios(new CatchAndStoreProcessing(storage), searchExitFlag);
                    return storage;
                }
            })
            .flatMap(new FlatMapFunction<Map<String, String>, Map<String, String>>() {
                @Override
                public Iterable<Map<String, String>> call(Map<String, String> initialVars) {
                    return SparkDistributor.dataConsumer.transformAndReturn(initialVars);
                }
            });
}
Example #7
Source File: MapPartitions.java From SparkDemo with MIT License | 6 votes |
private static void mapPartitions(JavaSparkContext sc) {
    List<String> names = Arrays.asList("张三1", "李四1", "王五1", "张三2", "李四2", "王五2", "张三3", "李四3", "王五3", "张三4");

    JavaRDD<String> namesRDD = sc.parallelize(names, 3);

    JavaRDD<String> mapPartitionsRDD = namesRDD.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
        int count = 0;

        @Override
        public Iterator<String> call(Iterator<String> stringIterator) throws Exception {
            List<String> list = new ArrayList<String>();
            while (stringIterator.hasNext()) {
                list.add("count:" + count++ + "\t" + stringIterator.next());
            }
            return list.iterator();
        }
    });

    // Collect the data from the cluster into local memory
    List<String> result = mapPartitionsRDD.collect();
    for (String s : result) {
        System.out.println(s);
    }

    sc.close();
}
Example #8
Source File: FlatMap.java From SparkDemo with MIT License | 6 votes |
private static void flatMap(JavaSparkContext sc) {
    List<String> data = Arrays.asList("aa,bb,cc", "cxf,spring,struts2", "java,C++,javaScript");
    JavaRDD<String> rddData = sc.parallelize(data);

    FlatMapFunction<String, String> flatMapFunction = new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String s) throws Exception {
            List<String> list = Arrays.asList(s.split(","));
            return list.iterator();
        }
    };
    JavaRDD<String> flatMapData = rddData.flatMap(flatMapFunction);

    flatMapData.foreach(new VoidFunction<String>() {
        @Override
        public void call(String v) throws Exception {
            System.out.println(v);
        }
    });

    sc.close();
}
Example #9
Source File: SparkExtensionTest.java From component-runtime with Apache License 2.0 | 6 votes |
public static void main(final String[] args) {
    final SparkConf conf =
            new SparkConf().setAppName(SparkClusterRuleTest.SubmittableMain.class.getName()).setMaster(args[0]);
    final JavaSparkContext context = new JavaSparkContext(conf);
    context
            .parallelize(singletonList("a b"))
            .flatMap((FlatMapFunction<String, String>) text -> asList(text.split(" ")).iterator())
            .mapToPair(word -> new Tuple2<>(word, 1))
            .reduceByKey((a, b) -> a + b)
            .foreach(result -> {
                try (final FileWriter writer = new FileWriter(args[1], true)) {
                    writer.write(result._1 + " -> " + result._2 + '\n');
                }
            });
}
Example #10
Source File: ChronixRDD.java From chronix.spark with Apache License 2.0 | 6 votes |
/**
 * Transformation: Transforms the ChronixRDD into a RDD of MetricObservations (pair of timestamp & value + dimensions).
 *
 * @return RDD of MetricObservations
 */
public JavaRDD<MetricObservation> toObservations() {
    return this.flatMap((FlatMapFunction<MetricTimeSeries, MetricObservation>) ts -> ts.points().map(point -> {
        // null-safe read of dimensional values
        String host = ts.attributes().get(MetricDimension.HOST) == null ? null
                : ts.attributes().get(MetricDimension.HOST).toString();
        String series = ts.attributes().get(MetricDimension.MEASUREMENT_SERIES) == null ? null
                : ts.attributes().get(MetricDimension.MEASUREMENT_SERIES).toString();
        String process = ts.attributes().get(MetricDimension.PROCESS) == null ? null
                : ts.attributes().get(MetricDimension.PROCESS).toString();
        String group = ts.attributes().get(MetricDimension.METRIC_GROUP) == null ? null
                : ts.attributes().get(MetricDimension.METRIC_GROUP).toString();
        String ag = ts.attributes().get(MetricDimension.AGGREGATION_LEVEL) == null ? null
                : ts.attributes().get(MetricDimension.AGGREGATION_LEVEL).toString();
        // convert Point/MetricTimeSeries to MetricObservation
        return new MetricObservation(
                ts.getMetric(),
                host, series, process, group, ag,
                point.getTimestamp(),
                point.getValue()
        );
    }).iterator());
}
Example #11
Source File: LocusWalkerSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
/**
 * Return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
 *
 * @param referenceFileName the name of the reference file added via {@code SparkContext#addFile()}
 * @param bFeatureManager the feature manager broadcast
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param header the reads header
 * @param downsamplingInfo the downsampling method for the reads
 * @return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
 */
private static FlatMapFunction<Shard<GATKRead>, LocusWalkerContext> getAlignmentsFunction(
        String referenceFileName, Broadcast<FeatureManager> bFeatureManager,
        SAMSequenceDictionary sequenceDictionary, SAMFileHeader header,
        LIBSDownsamplingInfo downsamplingInfo, boolean isEmitEmptyLoci) {
    return (FlatMapFunction<Shard<GATKRead>, LocusWalkerContext>) shardedRead -> {
        SimpleInterval interval = shardedRead.getInterval();
        Iterator<GATKRead> readIterator = shardedRead.iterator();
        ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager fm = bFeatureManager == null ? null : bFeatureManager.getValue();

        final AlignmentContextIteratorBuilder alignmentContextIteratorBuilder = new AlignmentContextIteratorBuilder();
        alignmentContextIteratorBuilder.setDownsamplingInfo(downsamplingInfo);
        alignmentContextIteratorBuilder.setEmitEmptyLoci(isEmitEmptyLoci);
        alignmentContextIteratorBuilder.setKeepUniqueReadListInLibs(false);
        alignmentContextIteratorBuilder.setIncludeNs(false);

        final Iterator<AlignmentContext> alignmentContextIterator = alignmentContextIteratorBuilder.build(
                readIterator, header, Collections.singletonList(interval), sequenceDictionary, true);

        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(alignmentContextIterator, 0), false).map(alignmentContext -> {
            final SimpleInterval alignmentInterval = new SimpleInterval(alignmentContext);
            return new LocusWalkerContext(alignmentContext, new ReferenceContext(reference, alignmentInterval), new FeatureContext(fm, alignmentInterval));
        }).iterator();
    };
}
Example #12
Source File: VariantsSparkSink.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
private static void writeVariantsSingle(
        final JavaSparkContext ctx, final String outputFile, final JavaRDD<VariantContext> variants,
        final VCFHeader header, final boolean writeGvcf, final List<Number> gqPartitions, final int defaultPloidy,
        final int numReducers, final boolean writeTabixIndex, final boolean sortVariantsToHeader) throws IOException {

    //TODO remove me when https://github.com/broadinstitute/gatk/issues/4303 is fixed
    if (outputFile.endsWith(FileExtensions.BCF) || outputFile.endsWith(FileExtensions.BCF + ".gz")) {
        throw new UserException.UnimplementedFeature("It is currently not possible to write a BCF file on spark. See https://github.com/broadinstitute/gatk/issues/4303 for more details .");
    }
    final JavaRDD<VariantContext> sortedVariants = sortVariantsToHeader ? sortVariants(variants, header, numReducers) : variants;
    final JavaRDD<VariantContext> variantsToSave;
    if (writeGvcf) {
        GVCFBlockCombiner gvcfBlockCombiner = new GVCFBlockCombiner(gqPartitions, defaultPloidy, false);
        gvcfBlockCombiner.addRangesToHeader(header);
        variantsToSave = sortedVariants.mapPartitions(
                (FlatMapFunction<Iterator<VariantContext>, VariantContext>) v -> new GVCFBlockCombiningIterator(v, gqPartitions, defaultPloidy));
    } else {
        variantsToSave = sortedVariants;
    }

    TabixIndexWriteOption tabixIndexWriteOption = TabixIndexWriteOption.fromBoolean(writeTabixIndex);
    HtsjdkVariantsRdd htsjdkVariantsRdd = new HtsjdkVariantsRdd(header, variantsToSave);
    HtsjdkVariantsRddStorage.makeDefault(ctx)
            .write(htsjdkVariantsRdd, outputFile, tabixIndexWriteOption);
}
Example #13
Source File: FindAssemblyRegionsSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
private static FlatMapFunction<Tuple2<String, Iterable<ActivityProfileStateRange>>, ReadlessAssemblyRegion> getReadlessAssemblyRegionsFunction(
        final SAMFileHeader header,
        final AssemblyRegionArgumentCollection assemblyRegionArgs) {
    return (FlatMapFunction<Tuple2<String, Iterable<ActivityProfileStateRange>>, ReadlessAssemblyRegion>) iter ->
            Iterators.transform(
                    new AssemblyRegionFromActivityProfileStateIterator(
                            ActivityProfileStateRange.toIteratorActivityProfileState(iter._2.iterator()),
                            header,
                            assemblyRegionArgs.minAssemblyRegionSize,
                            assemblyRegionArgs.maxAssemblyRegionSize,
                            assemblyRegionArgs.assemblyRegionPadding,
                            assemblyRegionArgs.activeProbThreshold,
                            assemblyRegionArgs.maxProbPropagationDistance),
                    new com.google.common.base.Function<AssemblyRegion, ReadlessAssemblyRegion>() {
                        @Nullable
                        @Override
                        public ReadlessAssemblyRegion apply(@Nullable AssemblyRegion input) {
                            return new ReadlessAssemblyRegion(input);
                        }
                    });
}
Example #14
Source File: VariantWalkerSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
private static FlatMapFunction<Shard<VariantContext>, VariantWalkerContext> getVariantsFunction(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager) {
    return (FlatMapFunction<Shard<VariantContext>, VariantWalkerContext>) shard -> {
        ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();

        return StreamSupport.stream(shard.spliterator(), false)
                .filter(v -> v.getStart() >= shard.getStart() && v.getStart() <= shard.getEnd()) // only include variants that start in the shard
                .map(v -> {
                    final SimpleInterval variantInterval = new SimpleInterval(v);
                    return new VariantWalkerContext(v,
                            new ReadsContext(), // empty
                            new ReferenceContext(reference, variantInterval),
                            new FeatureContext(features, variantInterval));
                }).iterator();
    };
}
Example #15
Source File: HaplotypeCallerSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
private static FlatMapFunction<Iterator<AssemblyRegionWalkerContext>, VariantContext> assemblyFunction(
        final SAMFileHeader header,
        final String referenceFileName,
        final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast,
        final Broadcast<AssemblyRegionArgumentCollection> assemblyRegionArgsBroadcast,
        final Broadcast<VariantAnnotatorEngine> annotatorEngineBroadcast) {
    return (FlatMapFunction<Iterator<AssemblyRegionWalkerContext>, VariantContext>) contexts -> {
        // HaplotypeCallerEngine isn't serializable but is expensive to instantiate, so construct and reuse one for every partition
        final ReferenceSequenceFile taskReferenceSequenceFile = taskReferenceSequenceFile(referenceFileName);
        final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgsBroadcast.value(), assemblyRegionArgsBroadcast.value(),
                false, false, header, taskReferenceSequenceFile, annotatorEngineBroadcast.getValue());
        Iterator<Iterator<VariantContext>> iterators = Utils.stream(contexts).map(context -> {
            AssemblyRegion region = context.getAssemblyRegion();
            FeatureContext featureContext = context.getFeatureContext();
            return hcEngine.callRegion(region, featureContext, context.getReferenceContext()).iterator();
        }).iterator();

        return Iterators.concat(iterators);
    };
}
Example #16
Source File: ParameterAveragingTrainingMaster.java From deeplearning4j with Apache License 2.0 | 6 votes |
protected void doIteration(SparkDl4jMultiLayer network, JavaRDD<DataSet> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
            splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<DataSet> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
            numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();

    FlatMapFunction<Iterator<DataSet>, ParameterAveragingTrainingResult> function =
            new ExecuteWorkerFlatMap<>(getWorkerInstance(network));
    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(network, null, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
Example #17
Source File: ParameterAveragingTrainingMaster.java From deeplearning4j with Apache License 2.0 | 6 votes |
protected void doIterationPDS_MDS(SparkComputationGraph graph, JavaRDD<PortableDataStream> split, int splitNum, int numSplits) {
    log.info("Starting training of split {} of {}. workerMiniBatchSize={}, averagingFreq={}, Configured for {} workers",
            splitNum, numSplits, batchSizePerWorker, averagingFrequency, numWorkers);
    if (collectTrainingStats)
        stats.logMapPartitionsStart();

    JavaRDD<PortableDataStream> splitData = split;
    if (collectTrainingStats)
        stats.logRepartitionStart();
    splitData = SparkUtils.repartition(splitData, repartition, repartitionStrategy,
            numObjectsEachWorker(rddDataSetNumExamples), numWorkers);
    int nPartitions = splitData.partitions().size();
    if (collectTrainingStats && repartition != Repartition.Never)
        stats.logRepartitionEnd();

    FlatMapFunction<Iterator<PortableDataStream>, ParameterAveragingTrainingResult> function =
            new ExecuteWorkerPDSMDSFlatMap<>(getWorkerInstance(graph));

    JavaRDD<ParameterAveragingTrainingResult> result = splitData.mapPartitions(function);
    processResults(null, graph, result, splitNum, numSplits);

    if (collectTrainingStats)
        stats.logMapPartitionsEnd(nPartitions);
}
Example #18
Source File: HaplotypeCallerSpark.java From gatk-protected with BSD 3-Clause "New" or "Revised" License | 5 votes |
/**
 * @return an RDD of {@link Tuple2<AssemblyRegion, SimpleInterval>} which pairs each AssemblyRegion with the
 *         interval it was generated in
 */
private static FlatMapFunction<Iterator<Shard<GATKRead>>, Tuple2<AssemblyRegion, SimpleInterval>> shardsToAssemblyRegions(
        final AuthHolder authHolder,
        final Broadcast<ReferenceMultiSource> reference,
        final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast,
        final ShardingArgumentCollection assemblyArgs,
        final SAMFileHeader header) {
    return shards -> {
        final ReferenceMultiSource referenceMultiSource = reference.value();
        final ReferenceMultiSourceAdapter referenceSource = new ReferenceMultiSourceAdapter(referenceMultiSource, authHolder);
        final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgsBroadcast.value(), header, referenceSource);
        return iteratorToStream(shards).flatMap(shardToRegion(assemblyArgs, header, referenceSource, hcEngine)).iterator();
    };
}
Example #19
Source File: CollectAllelicCountsSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
private static FlatMapFunction<Iterator<LocusWalkerContext>, AllelicCountCollector> distributedCount(
        final Broadcast<SampleLocatableMetadata> sampleMetadataBroadcast,
        final int minimumBaseQuality) {
    return (FlatMapFunction<Iterator<LocusWalkerContext>, AllelicCountCollector>) contextIterator -> {
        final AllelicCountCollector result = new AllelicCountCollector(sampleMetadataBroadcast.getValue());

        contextIterator.forEachRemaining(ctx -> {
            final byte refAsByte = ctx.getReferenceContext().getBase();
            result.collectAtLocus(Nucleotide.decode(refAsByte), ctx.getAlignmentContext().getBasePileup(),
                    ctx.getAlignmentContext().getLocation(), minimumBaseQuality);
        });
        return Collections.singletonList(result).iterator();
    };
}
Example #20
Source File: FindAssemblyRegionsSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
private static FlatMapFunction<Iterator<AssemblyRegion>, AssemblyRegionWalkerContext> getAssemblyRegionWalkerContextFunction(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager) {

    return (FlatMapFunction<Iterator<AssemblyRegion>, AssemblyRegionWalkerContext>) assemblyRegionIter -> {
        final ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        final FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        return Utils.stream(assemblyRegionIter).map(assemblyRegion ->
                new AssemblyRegionWalkerContext(assemblyRegion,
                        new ReferenceContext(reference, assemblyRegion.getPaddedSpan()),
                        new FeatureContext(features, assemblyRegion.getPaddedSpan()))).iterator();
    };
}
Example #21
Source File: FindAssemblyRegionsSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
private static FlatMapFunction<Iterator<Shard<GATKRead>>, ActivityProfileStateRange> getActivityProfileStatesFunction(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager,
        final SAMFileHeader header,
        final Broadcast<Supplier<AssemblyRegionEvaluator>> supplierBroadcast,
        final AssemblyRegionArgumentCollection assemblyRegionArgs) {
    return (FlatMapFunction<Iterator<Shard<GATKRead>>, ActivityProfileStateRange>) shardedReadIterator -> {
        final ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        final FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        final AssemblyRegionEvaluator assemblyRegionEvaluator = supplierBroadcast.getValue().get(); // one AssemblyRegionEvaluator instance per Spark partition

        return Utils.stream(shardedReadIterator)
                .map(shardedRead -> {
                    final ReadsDownsampler readsDownsampler = assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ?
                            new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, header) : null;
                    return new ShardToMultiIntervalShardAdapter<>(
                            new DownsampleableSparkReadShard(
                                    new ShardBoundary(shardedRead.getInterval(), shardedRead.getPaddedInterval()), shardedRead, readsDownsampler));
                })
                .map(shardedRead -> {
                    final Iterator<ActivityProfileState> activityProfileStateIter = new ActivityProfileStateIterator(
                            new ShardToMultiIntervalShardAdapter<>(shardedRead),
                            header, reference, features, assemblyRegionEvaluator
                    );
                    return new ActivityProfileStateRange(shardedRead, activityProfileStateIter);
                }).iterator();
    };
}
Example #22
Source File: FindAssemblyRegionsSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
private static FlatMapFunction<Iterator<Shard<GATKRead>>, AssemblyRegionWalkerContext> getAssemblyRegionsFunctionFast(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager,
        final SAMFileHeader header,
        final Broadcast<Supplier<AssemblyRegionEvaluator>> supplierBroadcast,
        final AssemblyRegionArgumentCollection assemblyRegionArgs) {

    return (FlatMapFunction<Iterator<Shard<GATKRead>>, AssemblyRegionWalkerContext>) shardedReadIterator -> {
        final ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        final FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        final AssemblyRegionEvaluator assemblyRegionEvaluator = supplierBroadcast.getValue().get(); // one AssemblyRegionEvaluator instance per Spark partition
        final ReadsDownsampler readsDownsampler = assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ?
                new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, header) : null;

        Iterator<Iterator<AssemblyRegionWalkerContext>> iterators = Utils.stream(shardedReadIterator)
                .map(shardedRead -> new ShardToMultiIntervalShardAdapter<>(
                        new DownsampleableSparkReadShard(
                                new ShardBoundary(shardedRead.getInterval(), shardedRead.getPaddedInterval()), shardedRead, readsDownsampler)))
                .map(downsampledShardedRead -> {
                    final Iterator<AssemblyRegion> assemblyRegionIter = new AssemblyRegionIterator(
                            new ShardToMultiIntervalShardAdapter<>(downsampledShardedRead),
                            header, reference, features, assemblyRegionEvaluator, assemblyRegionArgs);
                    return Utils.stream(assemblyRegionIter).map(assemblyRegion ->
                            new AssemblyRegionWalkerContext(assemblyRegion,
                                    new ReferenceContext(reference, assemblyRegion.getPaddedSpan()),
                                    new FeatureContext(features, assemblyRegion.getPaddedSpan()))).iterator();
                }).iterator();
        return Iterators.concat(iterators);
    };
}
Example #23
Source File: ReadWalkerSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
private static FlatMapFunction<Iterator<GATKRead>, ReadWalkerContext> getReadsFunction(
        String referenceFileName, Broadcast<FeatureManager> bFeatureManager) {
    return readIterator -> {
        ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        return Iterators.transform(readIterator, new Function<GATKRead, ReadWalkerContext>() {
            @Nullable
            @Override
            public ReadWalkerContext apply(@Nullable GATKRead r) {
                final SimpleInterval readInterval = getReadInterval(r);
                return new ReadWalkerContext(r, new ReferenceContext(reference, readInterval), new FeatureContext(features, readInterval));
            }
        });
    };
}
Example #24
Source File: SparkFrontendUtils.java From incubator-nemo with Apache License 2.0 | 5 votes |
/**
 * Converts a {@link Function1} to a corresponding {@link FlatMapFunction}.
 *
 * @param scalaFunction the scala function to convert.
 * @param <I>           the type of input.
 * @param <O>           the type of output.
 * @return the converted Java function.
 */
public static <I, O> FlatMapFunction<I, O> toJavaFlatMapFunction(
        final Function1<I, TraversableOnce<O>> scalaFunction) {
    return new FlatMapFunction<I, O>() {
        @Override
        public Iterator<O> call(final I i) throws Exception {
            return JavaConverters.asJavaIteratorConverter(scalaFunction.apply(i).toIterator()).asJava();
        }
    };
}
Example #25
Source File: SparkUtilsUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
@Test(dataProvider = "readPairsAndPartitions")
public void testPutReadsWithSameNameInSamePartition(int numPairs, int numPartitions, int numReadsInPair, int[] expectedReadsPerPartition) {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader();
    header.setSortOrder(SAMFileHeader.SortOrder.queryname);
    JavaRDD<GATKRead> reads = ctx.parallelize(createPairedReads(header, numPairs, numReadsInPair), numPartitions);
    JavaRDD<GATKRead> pairedReads = SparkUtils.putReadsWithTheSameNameInTheSamePartition(header, reads, ctx);
    List<List<GATKRead>> partitions = pairedReads.mapPartitions((FlatMapFunction<Iterator<GATKRead>, List<GATKRead>>) it ->
            Iterators.singletonIterator(Lists.newArrayList(it))).collect();
    assertEquals(partitions.size(), numPartitions);
    for (int i = 0; i < numPartitions; i++) {
        assertEquals(partitions.get(i).size(), expectedReadsPerPartition[i]);
    }
    assertEquals(Arrays.stream(expectedReadsPerPartition).sum(), numPairs * numReadsInPair);
}
Example #26
Source File: TranslationUtils.java From beam with Apache License 2.0 | 5 votes |
/**
 * A utility method that adapts {@link Function} to a {@link FlatMapFunction} with an {@link
 * Iterator} input. This is particularly useful because it allows functions written for map
 * transformations to be reused in flatmap transformations.
 *
 * @param func the {@link Function} to adapt.
 * @param <InputT> the input type.
 * @param <OutputT> the output type.
 * @return a {@link FlatMapFunction} that accepts an {@link Iterator} as an input and applies the
 *     {@link Function} on every element.
 */
public static <InputT, OutputT> FlatMapFunction<Iterator<InputT>, OutputT> functionToFlatMapFunction(
        final Function<InputT, OutputT> func) {
    return itr ->
            Iterators.transform(
                    itr,
                    t -> {
                        try {
                            return func.call(t);
                        } catch (Exception e) {
                            throw new RuntimeException(e);
                        }
                    });
}
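A hypothetical usage sketch of the adapter above: it assumes a JavaRDD<Integer> named input is already in scope, and relies on mapPartitions expecting exactly the Iterator-input FlatMapFunction that functionToFlatMapFunction produces.

// Hypothetical: reuse a per-element Function with mapPartitions via the adapter.
JavaRDD<Integer> doubled = input.mapPartitions(
        TranslationUtils.functionToFlatMapFunction((Function<Integer, Integer>) x -> x * 2));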
Example #27
Source File: BlurRDD.java From incubator-retired-blur with Apache License 2.0 | 5 votes |
public <T> JavaRDD<T> executeStream(JavaSparkContext context, String table, StreamFunction<T> streamFunction) {
    User user = UserContext.getUser();
    List<BlurSparkSplit> splits = getSplits(table, user, CLASS_LOADER_ID);
    return context.parallelize(splits).flatMap(new FlatMapFunction<BlurSparkSplit, T>() {
        @Override
        public Iterable<T> call(BlurSparkSplit t) throws Exception {
            return new Iterable<T>() {
                @Override
                public Iterator<T> iterator() {
                    Closer closer = Closer.create();
                    try {
                        String host = t.getHost();
                        int port = t.getPort();
                        int timeout = t.getTimeout();
                        StreamClient streamClient = closer.register(new StreamClient(host, port, timeout));
                        String classLoaderId = t.getClassLoaderId();
                        if (!streamClient.isClassLoaderAvailable(classLoaderId)) {
                            streamClient.loadJars(classLoaderId, _jars);
                        }
                        return wrapClose(closer, streamClient.executeStream(t, streamFunction).iterator());
                    } catch (IOException e) {
                        IOUtils.closeQuietly(closer);
                        throw new RuntimeException(e);
                    }
                }
            };
        }
    });
}
Example #28
Source File: TestMorphlineUtils.java From envelope with Apache License 2.0 | 5 votes |
@Test (expected = RuntimeException.class)
public void morphlineMapperNoSchema(
    final @Mocked MorphlineUtils.Pipeline pipeline,
    final @Mocked Row row,
    final @Mocked StructType schema
) throws Exception {

    new Expectations(MorphlineUtils.class) {{
        MorphlineUtils.getPipeline("file", "id"); result = pipeline; times = 1;
        row.schema(); result = null;
    }};

    FlatMapFunction<Row, Row> function = MorphlineUtils.morphlineMapper("file", "id", schema, true);
    function.call(row);
}
Example #29
Source File: TestMorphlineUtils.java From envelope with Apache License 2.0 | 5 votes |
@Test
public void morphlineMapperNoPipeline(
    final @Mocked MorphlineUtils.Pipeline pipeline,
    final @Mocked Row row,
    final @Mocked StructType schema
) throws Exception {

    new Expectations(MorphlineUtils.class) {{
        MorphlineUtils.getPipeline("file", "id"); result = null; times = 1;
        MorphlineUtils.setPipeline("file", "id", (MorphlineUtils.Collector) any, true); result = pipeline; times = 1;
        MorphlineUtils.executePipeline(pipeline, (Record) any, true); result = Lists.newArrayList(); times = 1;
        row.schema(); result = schema;
        row.get(anyInt); returns("val1", "val2"); times = 2;
        schema.fieldNames(); result = new String[] { "one", "two" };
    }};

    FlatMapFunction<Row, Row> function = MorphlineUtils.morphlineMapper("file", "id", schema, true);
    Iterator<Row> results = function.call(row);

    assertEquals("Invalid number of Rows returned", 0, Lists.newArrayList(results).size());

    new Verifications() {{
        Record record;
        MorphlineUtils.executePipeline(pipeline, record = withCapture(), true);
        assertEquals(2, record.getFields().size());
        assertEquals("val1", record.get("one").get(0));
    }};
}
Example #30
Source File: HaplotypeCallerSpark.java From gatk-protected with BSD 3-Clause "New" or "Revised" License | 5 votes |
/**
 * Call variants from Tuples of AssemblyRegion and Simple Interval
 * The interval should be the non-padded shard boundary for the shard that the corresponding AssemblyRegion was
 * created in, it's used to eliminate redundant variant calls at the edge of shard boundaries.
 */
private static FlatMapFunction<Iterator<Tuple2<AssemblyRegion, SimpleInterval>>, VariantContext> callVariantsFromAssemblyRegions(
        final AuthHolder authHolder,
        final SAMFileHeader header,
        final Broadcast<ReferenceMultiSource> referenceBroadcast,
        final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast) {
    return regionAndIntervals -> {
        // HaplotypeCallerEngine isn't serializable but is expensive to instantiate, so construct and reuse one for every partition
        final ReferenceMultiSourceAdapter referenceReader = new ReferenceMultiSourceAdapter(referenceBroadcast.getValue(), authHolder);
        final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgsBroadcast.value(), header, referenceReader);
        return iteratorToStream(regionAndIntervals).flatMap(regionToVariants(hcEngine)).iterator();
    };
}