Java Code Examples for org.apache.spark.api.java.JavaRDD#aggregate()
The following examples show how to use
org.apache.spark.api.java.JavaRDD#aggregate().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: AnalyzeSpark.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Analyzes {@code data} against {@code schema} using two {@code aggregate} passes and
 * returns the merged per-column results wrapped in a {@link DataAnalysis}.
 *
 * @param schema              schema whose column types drive the counter conversion
 * @param data                records to analyze; {@code cache()} is called on it first
 * @param maxHistogramBuckets forwarded to the histogram add function
 * @return a {@code DataAnalysis} built from {@code schema} and the per-column analysis list
 */
public static DataAnalysis analyze(Schema schema, JavaRDD<List<Writable>> data, int maxHistogramBuckets) {
    // The RDD is aggregated twice below; presumably cached to avoid recomputing it.
    data.cache();
    /*
     * TODO: Some care should be given to add histogramBuckets and histogramBucketCounts to this in the future
     */
    List<ColumnType> types = schema.getColumnTypes();

    // First pass: per-column counters.
    AnalysisAddFunction counterAdd = new AnalysisAddFunction(schema);
    AnalysisCombineFunction counterCombine = new AnalysisCombineFunction();
    List<AnalysisCounter> columnCounters = data.aggregate(null, counterAdd, counterCombine);

    // One [2]-element row per counter; handed to convertCounters and then to the
    // histogram add function (presumably filled with min/max by convertCounters - confirm).
    double[][] minMaxPerColumn = new double[columnCounters.size()][2];
    List<ColumnAnalysis> analyses = DataVecAnalysisUtils.convertCounters(columnCounters, minMaxPerColumn, types);

    // Second pass: histogram counters, merged into the analysis list.
    HistogramAddFunction histogramAdd = new HistogramAddFunction(maxHistogramBuckets, schema, minMaxPerColumn);
    HistogramCombineFunction histogramCombine = new HistogramCombineFunction();
    List<HistogramCounter> histograms = data.aggregate(null, histogramAdd, histogramCombine);
    DataVecAnalysisUtils.mergeCounters(analyses, histograms);

    return new DataAnalysis(schema, analyses);
}
Example 2
Source File: FlagStatSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
/**
 * Aggregates every read returned by {@code getReads()} into a single {@code FlagStatus},
 * prints it to standard output and, when {@code out} is non-null, also prints it to the
 * file created for {@code out}.
 *
 * @param ctx the Spark context passed in by the caller; not referenced in this body
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final FlagStatus flagStatus = getReads().aggregate(new FlagStatus(), FlagStatus::add, FlagStatus::merge);
    System.out.println(flagStatus);

    // No output location configured: the console print above is all there is to do.
    if (out == null) {
        return;
    }

    try (final PrintStream stream = new PrintStream(BucketUtils.createFile(out))) {
        stream.print(flagStatus);
    }
}
Example 3
Source File: MeanQualityByCycleSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
/**
 * Computes the MeanQualityByCycle. Creates a metrics file with relevant histograms.
 *
 * <p>Reads are first filtered with a {@code MetricsReadFilter} built from
 * {@code pfReadsOnly} and {@code alignedReadsOnly}, then folded into a single
 * {@code HistogramGeneratorPair} whose two generators are handed to {@code finish}.
 *
 * @param reads the reads to summarize
 * @return the metrics file returned by {@code finish}
 */
public MetricsFile<?, Integer> calculateMeanQualityByCycle(final JavaRDD<GATKRead> reads) {
    final MetricsReadFilter readFilter = new MetricsReadFilter(this.pfReadsOnly, this.alignedReadsOnly);

    final HistogramGeneratorPair totals = reads
            .filter(candidate -> readFilter.test(candidate))
            .aggregate(
                    new HistogramGeneratorPair(),
                    (accumulator, candidate) -> accumulator.addRead(candidate),
                    (left, right) -> left.merge(right));

    return finish(totals.useQuals, totals.useOrigQuals);
}
Example 4
Source File: QualityScoreDistributionSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
/**
 * Filters the reads from {@code getReads()} with a {@code MetricsReadFilter} built from
 * {@code pfReadsOnly} and {@code alignedReadsOnly}, aggregates the survivors into a
 * {@code Counts}, converts that to a metrics file via {@code makeMetrics} and hands it to
 * {@code saveResults} together with the reads header and read source name.
 *
 * @param ctx the Spark context passed in by the caller; not referenced in this body
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final JavaRDD<GATKRead> allReads = getReads();
    final MetricsReadFilter readFilter = new MetricsReadFilter(this.pfReadsOnly, this.alignedReadsOnly);

    final Counts totals = allReads
            .filter(candidate -> readFilter.test(candidate))
            .aggregate(
                    new Counts(includeNoCalls),
                    (accumulator, candidate) -> accumulator.addRead(candidate),
                    (left, right) -> left.merge(right));

    final MetricsFile<?, Byte> metricsFile = makeMetrics(totals);
    saveResults(metricsFile, getHeaderForReads(), getReadSourceName());
}
Example 5
Source File: CollectBaseDistributionByCycleSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
/**
 * Computes the base distribution by cycle and records it in a metrics file.
 *
 * <p>Reads are filtered with a {@code MetricsReadFilter} built from {@code pfReadsOnly}
 * and {@code alignedReadsOnly}, folded into a single {@code HistogramGenerator}, and that
 * generator is then asked to add itself to the file obtained from {@code getMetricsFile()}.
 *
 * @param reads the reads to summarize
 * @return the metrics file after the aggregated histogram has been added to it
 */
public MetricsFile<BaseDistributionByCycleMetrics, Integer> calculateBaseDistributionByCycle(final JavaRDD<GATKRead> reads) {
    final MetricsReadFilter readFilter = new MetricsReadFilter(this.pfReadsOnly, this.alignedReadsOnly);

    final HistogramGenerator histogram = reads
            .filter(candidate -> readFilter.test(candidate))
            .aggregate(
                    new HistogramGenerator(),
                    (accumulator, candidate) -> accumulator.addRead(candidate),
                    (left, right) -> left.merge(right));

    // The metrics file is obtained only after the aggregation, as in the original ordering.
    final MetricsFile<BaseDistributionByCycleMetrics, Integer> result = getMetricsFile();
    histogram.addToMetricsFile(result);
    return result;
}
Example 6
Source File: AnalyzeSpark.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Analyze the data quality of data - provides a report on missing values, values that don't comply with schema, etc
 *
 * <p>The data is aggregated into a list of {@code QualityAnalysisState}; the
 * {@code ColumnQuality} of each state is collected, in order, into the result.
 *
 * @param schema Schema for data
 * @param data   Data to analyze
 * @return DataQualityAnalysis object
 */
public static DataQualityAnalysis analyzeQuality(final Schema schema, final JavaRDD<List<Writable>> data) {
    int columnCount = schema.numColumns();

    List<QualityAnalysisState> aggregated = data.aggregate(
            null,
            new BiFunctionAdapter<>(new QualityAnalysisAddFunction(schema)),
            new BiFunctionAdapter<>(new QualityAnalysisCombineFunction()));

    // Pre-sized from the schema's column count.
    List<ColumnQuality> qualities = new ArrayList<>(columnCount);
    for (QualityAnalysisState state : aggregated) {
        qualities.add(state.getColumnQuality());
    }

    return new DataQualityAnalysis(schema, qualities);
}
Example 7
Source File: AnalyzeSpark.java From DataVec with Apache License 2.0 | 4 votes |
/**
 * Analyzes the quality of {@code data}: the records are aggregated into a list of
 * {@code QualityAnalysisState}, and the {@code ColumnQuality} of each state is collected,
 * in order, into the returned {@code DataQualityAnalysis}.
 *
 * @param schema schema passed to the add function and to the returned analysis
 * @param data   records to analyze; {@code cache()} is called on it before aggregating
 * @return a {@code DataQualityAnalysis} built from {@code schema} and the collected column qualities
 */
public static DataQualityAnalysis analyzeQuality(final Schema schema, final JavaRDD<List<Writable>> data) {
    data.cache();
    int nColumns = schema.numColumns();

    List<QualityAnalysisState> states = data.aggregate(
            null,
            new QualityAnalysisAddFunction(schema),
            new QualityAnalysisCombineFunction());

    // Pre-sized from the schema's column count.
    List<ColumnQuality> list = new ArrayList<>(nColumns);
    for (QualityAnalysisState qualityState : states) {
        list.add(qualityState.getColumnQuality());
    }

    return new DataQualityAnalysis(schema, list);
}
Example 8
Source File: AnalyzeSpark.java From DataVec with Apache License 2.0 | 3 votes |
/**
 * Get a list of unique values from the specified columns.
 * For sequence data, use {@link #getUniqueSequence(String, Schema, JavaRDD)}
 *
 * @param columnNames Names of the columns to get unique values from
 * @param schema      Data schema
 * @param data        Data to get unique values from
 * @return List of unique values, for each of the specified columns
 */
public static Map<String,List<Writable>> getUnique(List<String> columnNames, Schema schema, JavaRDD<List<Writable>> data){
    Map<String,Set<Writable>> m = data.aggregate(null, new UniqueAddFunction(columnNames, schema), new UniqueMergeFunction());

    // Copy each per-column set into a list; entrySet() avoids a second lookup per key.
    Map<String,List<Writable>> out = new HashMap<>();
    for (Map.Entry<String,Set<Writable>> entry : m.entrySet()) {
        out.put(entry.getKey(), new ArrayList<>(entry.getValue()));
    }
    return out;
}
Example 9
Source File: AnalyzeSpark.java From deeplearning4j with Apache License 2.0 | 3 votes |
/**
 * Get a list of unique values from the specified columns.
 * For sequence data, use {@link #getUniqueSequence(String, Schema, JavaRDD)}
 *
 * @param columnNames Names of the columns to get unique values from
 * @param schema      Data schema
 * @param data        Data to get unique values from
 * @return List of unique values, for each of the specified columns
 */
public static Map<String,List<Writable>> getUnique(List<String> columnNames, Schema schema, JavaRDD<List<Writable>> data){
    Map<String,Set<Writable>> uniquePerColumn = data.aggregate(null, new UniqueAddFunction(columnNames, schema), new UniqueMergeFunction());

    // Walk the entries directly rather than keySet() + get(), which looks each key up twice.
    Map<String,List<Writable>> out = new HashMap<>();
    for (Map.Entry<String,Set<Writable>> columnEntry : uniquePerColumn.entrySet()) {
        out.put(columnEntry.getKey(), new ArrayList<>(columnEntry.getValue()));
    }
    return out;
}