org.apache.spark.api.java.JavaSparkContext Java Examples
The following examples show how to use
org.apache.spark.api.java.JavaSparkContext.
The examples are drawn from open-source projects; the source file, originating project, and license are noted above each example.
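Before the project examples, here is a minimal sketch of the basic JavaSparkContext lifecycle (configure, create, use, stop). The master URL, application name, and sample data are placeholders for illustration and are not taken from any of the projects below; the sketch assumes the usual imports (org.apache.spark.SparkConf, org.apache.spark.api.java.JavaSparkContext, org.apache.spark.api.java.JavaRDD, java.util.Arrays).
public static void main(String[] args) {
    // Configure and create the context
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("JavaSparkContextMinimal");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Use the context: parallelize a small collection and run an action
    JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
    long evenCount = numbers.filter(n -> n % 2 == 0).count();
    System.out.println("Even numbers: " + evenCount);

    // Stop the context when done
    sc.stop();
}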
Example #1
Source File: SparkUtils.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * List of the files in the given directory (path), as a {@code JavaRDD<String>}
 *
 * @param sc                Spark context
 * @param path              Path to list files in
 * @param recursive         Whether to walk the directory tree recursively (i.e., include subdirectories)
 * @param allowedExtensions If null: all files will be accepted. If non-null: only files with the specified extension will be allowed.
 *                          Exclude the extension separator - i.e., use "txt" not ".txt" here.
 * @param config            Hadoop configuration to use. Must not be null.
 * @return Paths in the directory
 * @throws IOException If error occurs getting directory contents
 */
public static JavaRDD<String> listPaths(@NonNull JavaSparkContext sc, String path, boolean recursive,
                                        Set<String> allowedExtensions, @NonNull Configuration config) throws IOException {
    List<String> paths = new ArrayList<>();
    FileSystem hdfs = FileSystem.get(URI.create(path), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(path), recursive);

    while (fileIter.hasNext()) {
        String filePath = fileIter.next().getPath().toString();
        if (allowedExtensions == null) {
            paths.add(filePath);
        } else {
            String ext = FilenameUtils.getExtension(filePath);  // filter on the file's own extension
            if (allowedExtensions.contains(ext)) {
                paths.add(filePath);
            }
        }
    }
    return sc.parallelize(paths);
}
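A usage sketch (not part of the deeplearning4j sources): the directory, the "csv" extension filter, and the method name are illustrative assumptions, and the snippet assumes the same imports as the method above plus SparkConf and the java.util collection classes.
// Hypothetical caller of SparkUtils.listPaths; paths and names are made up for illustration.
public static void listCsvFilesExample() throws IOException {
    JavaSparkContext sc = new JavaSparkContext(new SparkConf().setMaster("local[*]").setAppName("ListPathsDemo"));
    Set<String> allowedExtensions = new HashSet<>(Arrays.asList("csv"));  // "csv" without the dot, as the javadoc requires
    JavaRDD<String> csvFiles = SparkUtils.listPaths(sc, "hdfs:///data/input", true, allowedExtensions, sc.hadoopConfiguration());
    System.out.println("Found " + csvFiles.count() + " matching files");
    sc.stop();
}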
Example #2
Source File: PSFilterTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
@Test(groups = "spark")
public void testDoSetPairFlags() {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final SAMSequenceDictionary seq = new SAMSequenceDictionary();
    seq.addSequence(new SAMSequenceRecord("test_seq", 1000));
    final SAMFileHeader header = new SAMFileHeader(seq);
    final List<GATKRead> readList = makeReadSet(header);
    final JavaRDD<GATKRead> reads = ctx.parallelize(readList);
    final List<GATKRead> result = PSFilter.setPairFlags(reads, 100).collect();
    Assert.assertEquals(result.size(), 6);
    for (final GATKRead read : result) {
        if (read.getName().equals("paired_1") || read.getName().equals("paired_2")) {
            Assert.assertTrue(read.isPaired());
        } else {
            Assert.assertFalse(read.isPaired());
        }
    }
}
Example #3
Source File: SparkUtilsUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
@Test
public void testPathExists() throws Exception {
    MiniClusterUtils.runOnIsolatedMiniCluster(
        cluster -> {
            // use the HDFS on the mini cluster
            final Path workingDirectory = MiniClusterUtils.getWorkingDir(cluster);
            final Path tempPath = new Path(workingDirectory, "testFileExists.txt");
            final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
            Assert.assertFalse(SparkUtils.hadoopPathExists(ctx, tempPath.toUri()));
            final FileSystem fs = tempPath.getFileSystem(ctx.hadoopConfiguration());
            final FSDataOutputStream fsOutStream = fs.create(tempPath);
            fsOutStream.close();
            fs.deleteOnExit(tempPath);
            Assert.assertTrue(SparkUtils.hadoopPathExists(ctx, tempPath.toUri()));
        });
}
Example #4
Source File: CassandraDependenciesJob.java From spark-dependencies with Apache License 2.0 | 6 votes |
public void run(String peerServiceTag) {
    long microsLower = day.toInstant().toEpochMilli() * 1000;
    long microsUpper = day.plus(Period.ofDays(1)).toInstant().toEpochMilli() * 1000 - 1;

    log.info("Running Dependencies job for {}: {} ≤ Span.timestamp {}", day, microsLower, microsUpper);

    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        JavaPairRDD<String, Iterable<Span>> traces = javaFunctions(sc)
            .cassandraTable(keyspace, "traces", mapRowTo(CassandraSpan.class))
            .where("start_time < ? AND start_time > ?", microsUpper, microsLower)
            .mapToPair(span -> new Tuple2<>(span.getTraceId(), span))
            .mapValues(span -> (Span) span)
            .groupByKey();

        List<Dependency> dependencyLinks = DependenciesSparkHelper.derive(traces, peerServiceTag);
        store(sc, dependencyLinks);
        log.info("Done, {} dependency objects created", dependencyLinks.size());
    } finally {
        sc.stop();
    }
}
Example #5
Source File: CoveragePoNQCUtilsUnitTest.java From gatk-protected with BSD 3-Clause "New" or "Revised" License | 6 votes |
@Test
public void testIdentifySamplesWithSuspiciousContigsDelsWithSpark() {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final Set<String> gtBlacklistSamples = new HashSet<>();
    gtBlacklistSamples.add("sample_1");
    gtBlacklistSamples.add("sample_2");
    gtBlacklistSamples.add("sample_3");

    ReadCountCollection allCoverageProfiles = null;
    try {
        allCoverageProfiles = ReadCountCollectionUtils.parse(TEST_FILE_DEL);
    } catch (final IOException ioe) {
        Assert.fail("Could not load test file: " + TEST_FILE_DEL, ioe);
    }
    final JavaRDD<ReadCountCollection> allSampleTangentNormalizedReadCounts =
            CoveragePoNQCUtils.createParallelIndividualReadCountCollections(allCoverageProfiles, ctx);

    // By the time we are here, input is assumed to have been tangent normalized.
    final List<String> blacklistSamples = CoveragePoNQCUtils.identifySamplesWithSuspiciousContigs(
            allSampleTangentNormalizedReadCounts, ctx, CoveragePoNQCUtils.getContigToMedianCRMap(allCoverageProfiles));

    final Set<String> resultSamples = new HashSet<>(blacklistSamples);

    Assert.assertEquals(resultSamples.size(), gtBlacklistSamples.size());
    Assert.assertEquals(Sets.difference(resultSamples, gtBlacklistSamples).size(), 0);
}
Example #6
Source File: PdbRedoToMmtf.java From mmtf-spark with Apache License 2.0 | 6 votes |
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage: " + PdbRedoToMmtf.class.getSimpleName() + " <pdb-redo-path> <mmtf-path>");
        System.exit(1);
    }

    long start = System.nanoTime();

    // instantiate Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PdbRedoToMmtf.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // import PDB-REDO from a local copy
    JavaPairRDD<String, StructureDataInterface> pdbredo = MmtfImporter.importPdbRedo(args[0], sc);

    // save PDB-REDO as an MMTF-Hadoop Sequence file
    MmtfWriter.writeSequenceFile(args[1], sc, pdbredo);

    long end = System.nanoTime();
    System.out.println("time: " + (end - start) / 1E9 + " sec.");

    // close Spark
    sc.close();
}
Example #7
Source File: SparkWordCount.java From Apache-Spark-2x-for-Java-Developers with MIT License | 6 votes |
public static void main(String[] args) throws Exception {
    System.out.println(System.getProperty("hadoop.home.dir"));
    String inputPath = args[0];
    String outputPath = args[1];
    FileUtils.deleteQuietly(new File(outputPath));

    JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

    JavaRDD<String> rdd = sc.textFile(inputPath);

    JavaPairRDD<String, Integer> counts = rdd
        .flatMap(x -> Arrays.asList(x.split(" ")).iterator())
        .mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
        .reduceByKey((x, y) -> x + y);

    counts.saveAsTextFile(outputPath);
    sc.close();
}
Example #8
Source File: CpxVariantReInterpreterSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
@Override
protected void runTool(final JavaSparkContext ctx) {

    // TODO: 5/9/18 getback sample name in output files
    final SAMFileHeader headerForReads = getHeaderForReads();
    final Set<VCFHeaderLine> defaultToolVCFHeaderLines = getDefaultToolVCFHeaderLines();
    final SvDiscoveryInputMetaData svDiscoveryInputMetaData =
            new SvDiscoveryInputMetaData(ctx, discoverStageArgs, nonCanonicalChromosomeNamesFile, derivedSimpleVCFPrefix,
                    null, null, null, null,
                    headerForReads, getReference(), defaultToolVCFHeaderLines, localLogger);
    final JavaRDD<VariantContext> complexVariants = new VariantsSparkSource(ctx)
            .getParallelVariantContexts(complexVCF, getIntervals());
    final JavaRDD<GATKRead> assemblyRawAlignments = getReads();

    final SegmentedCpxVariantSimpleVariantExtractor.ExtractedSimpleVariants extract =
            SegmentedCpxVariantSimpleVariantExtractor.extract(complexVariants, svDiscoveryInputMetaData, assemblyRawAlignments);

    final String derivedOneSegmentSimpleVCF = derivedSimpleVCFPrefix + "_1_seg.vcf";
    final String derivedMultiSegmentSimpleVCF = derivedSimpleVCFPrefix + "_multi_seg.vcf";
    final VCFHeader vcfHeader = VariantsSparkSource.getHeader(complexVCF);
    SVVCFWriter.writeVCF(extract.getReInterpretZeroOrOneSegmentCalls(), derivedOneSegmentSimpleVCF,
            vcfHeader.getSequenceDictionary(), defaultToolVCFHeaderLines, logger);
    SVVCFWriter.writeVCF(extract.getReInterpretMultiSegmentsCalls(), derivedMultiSegmentSimpleVCF,
            vcfHeader.getSequenceDictionary(), defaultToolVCFHeaderLines, logger);
}
Example #9
Source File: WordCount.java From Apache-Spark-2x-for-Java-Developers with MIT License | 6 votes |
public static void wordCountJava8(String filename) {
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Work Count App");

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile(filename);

    // Java 8 with lambdas: split the input string into words
    // TODO here a change has happened
    JavaRDD<String> words = input.flatMap(s -> Arrays.asList(s.split(" ")).iterator());

    // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
    JavaPairRDD<Object, Object> counts = words.mapToPair(t -> new Tuple2(t, 1)).reduceByKey((x, y) -> (int) x + (int) y);

    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile("output");
}
Example #10
Source File: VariantsSparkSinkUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
private void assertSingleShardedWritingWorks(String vcf, String outputPath, boolean writeTabixIndex) throws IOException {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    VariantsSparkSource variantsSparkSource = new VariantsSparkSource(ctx);
    JavaRDD<VariantContext> variants = variantsSparkSource.getParallelVariantContexts(vcf, null);
    if (variants.getNumPartitions() == 1) {
        variants = variants.repartition(3); // repartition to more than 1 partition
    }
    VCFHeader header = getHeader(vcf);

    VariantsSparkSink.writeVariants(ctx, outputPath, variants, header, writeTabixIndex);

    checkFileExtensionConsistentWithContents(outputPath, writeTabixIndex);

    JavaRDD<VariantContext> variants2 = variantsSparkSource.getParallelVariantContexts(outputPath, null);
    final List<VariantContext> writtenVariants = variants2.collect();

    VariantContextTestUtils.assertEqualVariants(readVariants(vcf), writtenVariants);
}
Example #11
Source File: JavaKernelDensityEstimationExample.java From SparkDemo with MIT License | 6 votes |
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // an RDD of sample data
    JavaRDD<Double> data = jsc.parallelize(
        Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));

    // Construct the density estimator with the sample data
    // and a standard deviation for the Gaussian kernels
    KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0);

    // Find density estimates for the given values
    double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0});

    System.out.println(Arrays.toString(densities));
    // $example off$

    jsc.stop();
}
Example #12
Source File: Grep.java From flink-perf with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
    String master = args[0];
    String inFile = args[1];
    String outFile = args[2];

    String patterns[] = new String[args.length - 3];
    System.arraycopy(args, 3, patterns, 0, args.length - 3);
    System.err.println("Starting spark with master=" + master + " in=" + inFile);
    System.err.println("Using patterns: " + Arrays.toString(patterns));

    SparkConf conf = new SparkConf().setAppName("Grep job").setMaster(master).set("spark.hadoop.validateOutputSpecs", "false");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<String> file = sc.textFile(inFile);
    for (int p = 0; p < patterns.length; p++) {
        final String pattern = patterns[p];
        JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
            private static final long serialVersionUID = 1L;
            Pattern p = Pattern.compile(pattern);

            @Override
            public Boolean call(String value) throws Exception {
                if (value == null || value.length() == 0) {
                    return false;
                }
                final Matcher m = p.matcher(value);
                if (m.find()) {
                    return true;
                }
                return false;
            }
        });
        res.saveAsTextFile(outFile + "_" + pattern);
    }
}
Example #13
Source File: JDBCDataSource.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    // SparkConf conf = new SparkConf().setAppName("JDBCDataSource").setMaster("local");
    JavaSparkContext sc = SparkUtils.getRemoteSparkContext(JDBCDataSource.class);
    SQLContext sqlContext = new SQLContext(sc);

    Map<String, String> options = new HashMap<String, String>();
    options.put("url", "jdbc:mysql://192.168.2.129:3306/hive");
    options.put("dbtable", "t_user");
    options.put("user", "root");
    options.put("password", "666666");

    // Load the JDBC data source configuration; this does not connect to the database immediately
    Dataset<Row> dataset1 = sqlContext.read().format("jdbc").options(options).load();

    // options.put("dbtable", "tb_item");
    // DataFrame dataFrame2 = sqlContext.read().format("jdbc").options(options).load();

    // Read the JDBC table data
    dataset1.javaRDD().foreach(new VoidFunction<Row>() {
        @Override
        public void call(Row row) throws Exception {
            System.out.println(row);
        }
    });

    // Store the RDD data into MySQL
    saveToMysql(sqlContext, options);

    sc.close();
}
Example #14
Source File: CacheTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Test checks how the cache candidates map is populated by the runner when evaluating the
 * pipeline.
 */
@Test
public void cacheCandidatesUpdaterTest() {
    SparkPipelineOptions options = createOptions();
    Pipeline pipeline = Pipeline.create(options);
    PCollection<String> pCollection = pipeline.apply(Create.of("foo", "bar"));

    // First use of pCollection.
    pCollection.apply(Count.globally());
    // Second use of pCollection.
    PCollectionView<List<String>> view = pCollection.apply(View.asList());

    // Internally View.asList() creates a PCollection that underlies the PCollectionView, that
    // PCollection should not be cached as the SparkRunner does not access that PCollection to
    // access the PCollectionView.
    pipeline
        .apply(Create.of("foo", "baz"))
        .apply(
            ParDo.of(
                    new DoFn<String, String>() {
                        @ProcessElement
                        public void processElement(ProcessContext processContext) {
                            if (processContext.sideInput(view).contains(processContext.element())) {
                                processContext.output(processContext.element());
                            }
                        }
                    })
                .withSideInputs(view));

    JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
    EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
    SparkRunner.CacheVisitor cacheVisitor =
        new SparkRunner.CacheVisitor(new TransformTranslator.Translator(), ctxt);
    pipeline.traverseTopologically(cacheVisitor);

    assertEquals(2L, (long) ctxt.getCacheCandidates().get(pCollection));
    assertEquals(1L, ctxt.getCacheCandidates().values().stream().filter(l -> l > 1).count());
}
Example #15
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 5 votes |
public static JavaPairRDD<MatrixIndexes, MatrixBlock> toMatrixJavaPairRDD(JavaSparkContext sc, MatrixBlock src,
        int blen, int numParts, boolean inclEmpty) {
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    List<Tuple2<MatrixIndexes, MatrixBlock>> list = null;

    if (src.getNumRows() <= blen && src.getNumColumns() <= blen) {
        list = Arrays.asList(new Tuple2<>(new MatrixIndexes(1, 1), src));
    }
    else {
        MatrixCharacteristics mc = new MatrixCharacteristics(
            src.getNumRows(), src.getNumColumns(), blen, src.getNonZeros());
        list = LongStream.range(0, mc.getNumBlocks()).parallel()
            .mapToObj(i -> createIndexedMatrixBlock(src, mc, i))
            .filter(kv -> inclEmpty || !kv._2.isEmptyBlock(false))
            .collect(Collectors.toList());
    }

    JavaPairRDD<MatrixIndexes, MatrixBlock> result = (numParts > 1) ?
        sc.parallelizePairs(list, numParts) : sc.parallelizePairs(list);

    if (DMLScript.STATISTICS) {
        Statistics.accSparkParallelizeTime(System.nanoTime() - t0);
        Statistics.incSparkParallelizeCount(1);
    }

    return result;
}
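Note that this example uses parallelizePairs rather than parallelize: it is the JavaSparkContext method that turns a local List of Tuple2 key/value pairs into a JavaPairRDD directly, optionally with an explicit number of partitions, as the numParts branch above shows.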
Example #16
Source File: JavaTC.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaTC")
        .getOrCreate();

    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    Integer slices = (args.length > 0) ? Integer.parseInt(args[0]) : 2;
    JavaPairRDD<Integer, Integer> tc = jsc.parallelizePairs(generateGraph(), slices).cache();

    // Linear transitive closure: each round grows paths by one edge,
    // by joining the graph's edges with the already-discovered paths.
    // e.g. join the path (y, z) from the TC with the edge (x, y) from
    // the graph to obtain the path (x, z).

    // Because join() joins on keys, the edges are stored in reversed order.
    JavaPairRDD<Integer, Integer> edges = tc.mapToPair(
        new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
            @Override
            public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
                return new Tuple2<>(e._2(), e._1());
            }
        });

    long oldCount;
    long nextCount = tc.count();
    do {
        oldCount = nextCount;
        // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
        // then project the result to obtain the new (x, z) paths.
        tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
        nextCount = tc.count();
    } while (nextCount != oldCount);

    System.out.println("TC has " + tc.count() + " edges.");
    spark.stop();
}
Example #17
Source File: FindBreakpointEvidenceSpark.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
/**
 * Kmerize reads having template names in a given set,
 * filter out low complexity kmers and kmers that appear too often in the genome to be helpful in localizing reads,
 * kill intervals that have too few surviving kmers.
 * The return is a Tuple2 in which
 * _1 describes the intervals that have been killed for having too few kmers (as a map from intervalId onto an explanatory string),
 * and _2 describes the good kmers that we want to use in local assemblies (as a multimap from kmer onto intervalId).
 */
private static Tuple2<List<AlignedAssemblyOrExcuse>, HopscotchUniqueMultiMap<SVKmer, Integer, KmerAndInterval>> getKmerAndIntervalsSet(
        final FindBreakpointEvidenceSparkArgumentCollection params,
        final ReadMetadata readMetadata,
        final JavaSparkContext ctx,
        final HopscotchUniqueMultiMap<String, Integer, QNameAndInterval> qNamesMultiMap,
        final int nIntervals,
        final JavaRDD<GATKRead> unfilteredReads,
        final SVReadFilter filter,
        final Logger logger) {
    final Set<SVKmer> kmerKillSet = SVFileUtils.readKmersFile(params.kmersToIgnoreFile, params.kSize);
    if (params.adapterSequence != null) {
        SVKmerizer.stream(params.adapterSequence, params.kSize, 0, new SVKmerLong())
                .forEach(kmer -> kmerKillSet.add(kmer.canonical(params.kSize)));
    }
    log("Ignoring " + kmerKillSet.size() + " genomically common kmers.", logger);

    final Tuple2<List<AlignedAssemblyOrExcuse>, List<KmerAndInterval>> kmerIntervalsAndDispositions =
            getKmerIntervals(params, readMetadata, ctx, qNamesMultiMap, nIntervals, kmerKillSet,
                    unfilteredReads, filter, logger);
    final HopscotchUniqueMultiMap<SVKmer, Integer, KmerAndInterval> kmerMultiMap =
            new HopscotchUniqueMultiMap<>(kmerIntervalsAndDispositions._2());
    log("Discovered " + kmerMultiMap.size() + " kmers.", logger);

    return new Tuple2<>(kmerIntervalsAndDispositions._1(), kmerMultiMap);
}
Example #18
Source File: BulkInsertPreppedDeltaCommitActionExecutor.java From hudi with Apache License 2.0 | 5 votes |
public BulkInsertPreppedDeltaCommitActionExecutor(JavaSparkContext jsc,
    HoodieWriteConfig config, HoodieTable table,
    String instantTime, JavaRDD<HoodieRecord<T>> preppedInputRecordRdd,
    Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
  super(jsc, config, table, instantTime, WriteOperationType.BULK_INSERT);
  this.preppedInputRecordRdd = preppedInputRecordRdd;
  this.bulkInsertPartitioner = bulkInsertPartitioner;
}
Example #19
Source File: JavaEmbeddedIgniteRDDSelfTest.java From ignite with Apache License 2.0 | 5 votes |
/**
 * @throws Exception If failed.
 */
@Test
public void testStoreDataToIgnite() throws Exception {
    JavaSparkContext sc = createContext();

    JavaIgniteContext<String, String> ic = null;

    try {
        ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider(), false);

        ic.fromCache(PARTITIONED_CACHE_NAME)
            .savePairs(sc.parallelize(F.range(0, KEYS_CNT), GRID_CNT).mapToPair(TO_PAIR_F), true, false);

        Ignite ignite = ic.ignite();

        IgniteCache<String, String> cache = ignite.cache(PARTITIONED_CACHE_NAME);

        for (int i = 0; i < KEYS_CNT; i++) {
            String val = cache.get(String.valueOf(i));

            assertNotNull("Value was not put to cache for key: " + i, val);
            assertEquals("Invalid value stored for key: " + i, "val" + i, val);
        }
    }
    finally {
        if (ic != null)
            ic.close(true);

        sc.stop();
    }
}
Example #20
Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License | 5 votes |
private static void runJsonDatasetExample(SparkSession spark) {
    // $example on:json_dataset$
    // A JSON dataset is pointed to by path.
    // The path can be either a single text file or a directory storing text files
    Dataset<Row> people = spark.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    // The inferred schema can be visualized using the printSchema() method
    people.printSchema();
    // root
    //  |-- age: long (nullable = true)
    //  |-- name: string (nullable = true)

    // Creates a temporary view using the DataFrame
    people.createOrReplaceTempView("people");

    // SQL statements can be run by using the sql methods provided by spark
    Dataset<Row> namesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");
    namesDF.show();
    // +------+
    // |  name|
    // +------+
    // |Justin|
    // +------+

    // Alternatively, a DataFrame can be created for a JSON dataset represented by
    // an RDD[String] storing one JSON object per string.
    List<String> jsonData = Arrays.asList(
            "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
    JavaRDD<String> anotherPeopleRDD = new JavaSparkContext(spark.sparkContext()).parallelize(jsonData);
    Dataset<Row> anotherPeople = spark.read().json(anotherPeopleRDD);
    anotherPeople.show();
    // +---------------+----+
    // |        address|name|
    // +---------------+----+
    // |[Columbus,Ohio]| Yin|
    // +---------------+----+
    // $example off:json_dataset$
}
Example #21
Source File: PSScorerTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
@Test(dataProvider = "mapPairs", groups = "spark")
public void testMapGroupedReadsToTax(final int readLength, final List<Integer> NM1, final List<Integer> NM2,
                                     final List<Integer> clip1, final List<Integer> clip2,
                                     final List<Integer> insert1, final List<Integer> insert2,
                                     final List<Integer> delete1, final List<Integer> delete2,
                                     final List<String> contig1, final List<String> contig2,
                                     final List<Integer> truthTax) {

    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final Broadcast<PSTaxonomyDatabase> taxonomyDatabaseBroadcast = ctx.broadcast(taxonomyDatabase);

    // Test with alternate alignments assigned to the XA tag
    final List<Iterable<GATKRead>> readListXA = new ArrayList<>();
    readListXA.add(generateReadPair(readLength, NM1, NM2, clip1, clip2, insert1, insert2, delete1, delete2, contig1, contig2, "XA"));
    final JavaRDD<Iterable<GATKRead>> pairsXA = ctx.parallelize(readListXA);
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultXA =
            PSScorer.mapGroupedReadsToTax(pairsXA, MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast);
    final PSPathogenAlignmentHit infoXA = resultXA.first()._2;

    Assert.assertNotNull(infoXA);
    Assert.assertEquals(infoXA.taxIDs.size(), truthTax.size());
    Assert.assertTrue(infoXA.taxIDs.containsAll(truthTax));
    Assert.assertEquals(infoXA.numMates, 2);

    // Test SA tag
    final List<Iterable<GATKRead>> readListSA = new ArrayList<>();
    readListSA.add(generateReadPair(readLength, NM1, NM2, clip1, clip2, insert1, insert2, delete1, delete2, contig1, contig2, "SA"));
    final JavaRDD<Iterable<GATKRead>> pairsSA = ctx.parallelize(readListSA);
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultSA =
            PSScorer.mapGroupedReadsToTax(pairsSA, MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast);
    final PSPathogenAlignmentHit infoSA = resultSA.first()._2;

    Assert.assertNotNull(infoSA);
    Assert.assertEquals(infoSA.taxIDs.size(), truthTax.size());
    Assert.assertTrue(infoSA.taxIDs.containsAll(truthTax));
    Assert.assertEquals(infoSA.numMates, 2);
}
Example #22
Source File: SparkSegmentTarPushJob.java From incubator-pinot with Apache License 2.0 | 5 votes |
@Override
public void run() throws Exception {
    if (!_enableParallelPush) {
        super.run();
    } else {
        List<Path> segmentPathsToPush = getDataFilePaths(_segmentPattern);
        retainRecentFiles(segmentPathsToPush, _lookBackPeriod);
        List<String> segmentsToPush = new ArrayList<>();
        segmentPathsToPush.forEach(path -> {
            segmentsToPush.add(path.toString());
        });
        JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(SparkContext.getOrCreate());
        if (_pushJobParallelism == -1) {
            _pushJobParallelism = segmentsToPush.size();
        }
        JavaRDD<String> pathRDD = sparkContext.parallelize(segmentsToPush, _pushJobParallelism);
        pathRDD.foreach(segmentTarPath -> {
            try (ControllerRestApi controllerRestApi = getControllerRestApi()) {
                FileSystem fileSystem = FileSystem.get(new Path(segmentTarPath).toUri(), new Configuration());
                // TODO: Deal with invalid prefixes in the future
                List<String> currentSegments = controllerRestApi.getAllSegments("OFFLINE");
                controllerRestApi.pushSegments(fileSystem, Arrays.asList(new Path(segmentTarPath)));
                if (_deleteExtraSegments) {
                    controllerRestApi
                        .deleteSegmentUris(getSegmentsToDelete(currentSegments, Arrays.asList(new Path(segmentTarPath))));
                }
            }
        });
    }
}
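Unlike most of the examples above, this job does not construct its own context: it wraps the SparkContext that is already running via JavaSparkContext.fromSparkContext(SparkContext.getOrCreate()), the usual pattern when the code executes inside an existing Spark application and should not manage the context lifecycle itself.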
Example #23
Source File: KMeansUpdate.java From oryx with Apache License 2.0 | 5 votes |
/**
 * @param sparkContext    active Spark Context
 * @param trainData       training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model
 * @param candidatePath   directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
    int numClusters = (Integer) hyperParameters.get(0);
    Preconditions.checkArgument(numClusters > 1);
    log.info("Building KMeans Model with {} clusters", numClusters);

    JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
    KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations,
                                           initializationStrategy);

    return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}
Example #24
Source File: TestStreamingStep.java From envelope with Apache License 2.0 | 5 votes |
public JavaRDD<String> generateRDD() {
    Random values = new Random();
    values.setSeed(System.currentTimeMillis());
    List<String> list = Lists.newLinkedList();
    for (int i = 0; i < batchSize; i++) {
        list.add(String.valueOf(values.nextLong()));
    }
    SparkContext sc = Contexts.getSparkSession().sparkContext();
    JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
    return jsc.parallelize(list, this.partitions);
}
Example #25
Source File: CNLOHCaller.java From gatk-protected with BSD 3-Clause "New" or "Revised" License | 5 votes |
private double[] calcNewRhos(final List<ACNVModeledSegment> segments,
                             final List<double[][][]> responsibilitiesBySeg,
                             final double lambda, final double[] rhos, final int[] mVals, final int[] nVals,
                             final JavaSparkContext ctx) {

    // Since, we pass in the entire responsibilities matrix, we need the correct index for each rho. That, and the
    //  fact that this is a univariate objective function, means we need to create an instance for each rho. And
    //  then we blast across Spark.
    final List<Pair<? extends Function<Double, Double>, SearchInterval>> objectives = IntStream.range(0, rhos.length)
            .mapToObj(i -> new Pair<>(
                    new Function<Double, Double>() {
                        @Override
                        public Double apply(Double rho) {
                            return calculateESmnObjective(rho, segments, responsibilitiesBySeg, mVals, nVals, lambda, i);
                        }
                    },
                    new SearchInterval(0.0, 1.0, rhos[i])))
            .collect(Collectors.toList());

    final JavaRDD<Pair<? extends Function<Double, Double>, SearchInterval>> objectivesRDD = ctx.parallelize(objectives);

    final List<Double> resultsAsDouble = objectivesRDD
            .map(objective -> optimizeIt(objective.getFirst(), objective.getSecond()))
            .collect();

    return resultsAsDouble.stream().mapToDouble(Double::doubleValue).toArray();
}
Example #26
Source File: SparkUtils.java From deeplearning4j with Apache License 2.0 | 5 votes |
public static Broadcast<byte[]> asByteArrayBroadcast(JavaSparkContext sc, INDArray array) {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    try {
        Nd4j.write(array, new DataOutputStream(baos));
    } catch (IOException e) {
        throw new RuntimeException(e);  // Should never happen
    }
    byte[] paramBytes = baos.toByteArray();
    // See docs in EvaluationRunner for why we use byte[] instead of INDArray (thread locality etc)
    return sc.broadcast(paramBytes);
}
Example #27
Source File: TraverseStructureHierarchy.java From mmtf-spark with Apache License 2.0 | 5 votes |
public static void main(String args[]) {

    // instantiate Spark. Each Spark application needs these two lines of code.
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ReadMmtfReduced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // List<String> pdbIds = Arrays.asList("5UTV"); // multiple models
    // List<String> pdbIds = Arrays.asList("1BZ1"); // multiple protein chains
    // List<String> pdbIds = Arrays.asList("1STP"); // single protein chain
    List<String> pdbIds = Arrays.asList("1HV4"); // structure with 2 bioassemblies
    // List<String> pdbIds = Arrays.asList("2NBK"); // single protein chain

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();

    pdb.foreach(t -> TraverseStructureHierarchy.printAll(t._2));
}
Example #28
Source File: CassandraDependenciesJob.java From spark-dependencies with Apache License 2.0 | 5 votes |
private String dependenciesTable(JavaSparkContext sc) {
    try {
        javaFunctions(sc)
            .cassandraTable(keyspace, "dependencies_v2")
            .limit(1L).collect();
    } catch (Exception ex) {
        return "dependencies";
    }
    return "dependencies_v2";
}
Example #29
Source File: TransformationRDDTest.java From hui-bigdata-spark with Apache License 2.0 | 5 votes |
/**
 * Initialization.
 *
 * @throws Exception the exception
 * @since hui_project 1.0.0
 */
@Before
public void before() throws Exception {
    sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    // sparkConf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer");
    sparkContext = new JavaSparkContext(sparkConf);
}
Example #30
Source File: TestEarlyStoppingSpark.java From deeplearning4j with Apache License 2.0 | 5 votes |
private JavaRDD<DataSet> getIris() {
    JavaSparkContext sc = getContext();
    IrisDataSetIterator iter = new IrisDataSetIterator(irisBatchSize(), 150);
    List<DataSet> list = new ArrayList<>(150);
    while (iter.hasNext())
        list.add(iter.next());
    return sc.parallelize(list);
}