Java Code Examples for org.apache.spark.api.java.JavaSparkContext#close()
The following examples show how to use org.apache.spark.api.java.JavaSparkContext#close().
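Because JavaSparkContext implements java.io.Closeable, close() can also be invoked implicitly through try-with-resources, which releases the context even if the job throws. The sketch below is a minimal illustration of that pattern, not taken from any of the projects listed here; the application name and input path are placeholders.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class CloseWithTryWithResources {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("CloseWithTryWithResources");

        // JavaSparkContext implements Closeable, so close() runs automatically
        // when the try block exits, whether normally or with an exception.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            long count = sc.textFile("data/input.txt").count(); // hypothetical input path
            System.out.println("# lines: " + count);
        }
    }
}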
Example 1
Source File: SparkWordCount.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws Exception {
    System.out.println(System.getProperty("hadoop.home.dir"));
    String inputPath = args[0];
    String outputPath = args[1];
    FileUtils.deleteQuietly(new File(outputPath));

    JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

    JavaRDD<String> rdd = sc.textFile(inputPath);

    JavaPairRDD<String, Integer> counts = rdd
            .flatMap(x -> Arrays.asList(x.split(" ")).iterator())
            .mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
            .reduceByKey((x, y) -> x + y);

    counts.saveAsTextFile(outputPath);

    sc.close();
}
Example 2
Source File: Union.java From SparkDemo with MIT License
static void union(JavaSparkContext sc) {
    List<String> datas1 = Arrays.asList("张三", "李四");
    List<String> datas2 = Arrays.asList("tom", "gim");

    JavaRDD<String> data1RDD = sc.parallelize(datas1);
    JavaRDD<String> data2RDD = sc.parallelize(datas2);

    /**
     * ====================================================================
     * Merge two RDDs without removing duplicates; both RDDs must contain
     * elements of the same type.
     * ====================================================================
     */
    JavaRDD<String> unionRDD = data1RDD.union(data2RDD);

    unionRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });

    sc.close();
}
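Since VoidFunction is a single-method interface, the anonymous class above can be replaced by a lambda on Java 8+. A minimal sketch of the same traversal, assuming the surrounding method is otherwise unchanged:

// equivalent to the anonymous VoidFunction above, written as a lambda
unionRDD.foreach(t -> System.out.println(t));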
Example 3
Source File: FilterByRFree.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfReducedPath();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByRFree.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // here the methods are chained together
    long count = MmtfReader
            .readSequenceFile(path, sc)
            .filter(new Resolution(0.0, 2.0))
            .count();

    System.out.println("# structures: " + count);

    sc.close();
}
Example 4
Source File: PolyPeptideChainStatistics.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PolyPeptideChainStatistics.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaDoubleRDD chainLengths = MmtfReader
            .readReducedSequenceFile(sc) // read PDB from MMTF-Hadoop sequence file
            .flatMapToPair(new StructureToPolymerChains(false, true)) // split (flatmap) into unique polymer chains
            .filter(new PolymerComposition(PolymerComposition.AMINO_ACIDS_20)) // only consider chains that contain the 20 standard amino acids
            .mapToDouble(t -> t._2.getNumGroups()); // get the number of groups (residues) in each chain using a lambda expression

    System.out.println("Protein chains length statistics for proteins in the PDB with the 20 standard amino acids:");
    System.out.println(chainLengths.stats());

    sc.close();
}
Example 5
Source File: ReadMmtfFull.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {
    long start = System.nanoTime();

    // instantiate Spark. Each Spark application needs these two lines of code.
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ReadMmtfFull.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read all PDB entries from a local Hadoop sequence file
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readFullSequenceFile(sc);

    System.out.println("# structures: " + pdb.count());

    // close Spark
    sc.close();

    long end = System.nanoTime();
    System.out.println((end-start)/1E9 + " sec.");
}
Example 6
Source File: FilterExclusivelyByLProteins.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfReducedPath();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterExclusivelyByLProteins.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    boolean exclusive = true;

    long count = MmtfReader
            .readSequenceFile(path, sc) // read MMTF hadoop sequence file
            // retain pdb entries that exclusively (flag set to true) contain L-protein chains
            .filter(new ContainsLProteinChain(exclusive))
            .count();

    System.out.println("# L-proteins: " + count);

    sc.close();
}
Example 7
Source File: SpecifyFormatLoadSave.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("SpecifyFormatLoadSave").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // create a DataFrame and read the JSON file
    SQLContext sqlContext = new SQLContext(sc);
    DataFrameReader dataFrameReader = sqlContext.read();

    // Parquet is the format used here for local data storage
    Dataset<Row> dataset = dataFrameReader.format("json").load(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    // write and save the selected column through the DataFrameWriter
    DataFrameWriter write = dataset.select("name").write();
    write.format("parquet").save("tmp/people.parquet");

    sc.close();
}
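In Spark 2.x, SQLContext is superseded by SparkSession, and the same load/save flow can be written against it, with stop() taking the place of JavaSparkContext#close(). A minimal sketch under that assumption; the class name and file paths are placeholders, not part of the SparkDemo project:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SpecifyFormatLoadSaveSession {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("SpecifyFormatLoadSave")
                .master("local")
                .getOrCreate();

        // read JSON into a Dataset<Row>
        Dataset<Row> dataset = spark.read().format("json").load("data/resources/people.json");

        // save a single column as Parquet
        dataset.select("name").write().format("parquet").save("tmp/people.parquet");

        // stop() releases the underlying SparkContext, analogous to sc.close()
        spark.stop();
    }
}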
Example 8
Source File: MapToBioJava.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(MapToBioJava.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    long count = MmtfReader
            .readReducedSequenceFile(sc) // read MMTF-Hadoop sequence file
            .flatMapToPair(new StructureToPolymerChains())
            .mapValues(new StructureToBioJava())
            .count();

    System.out.println("Number of polymer chains: " + count);

    sc.close();
}
Example 9
Source File: BuildDataFrameFromScratch2.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder()
            .appName("Build a DataFrame from Scratch")
            .master("local[*]")
            .getOrCreate();

    List<String[]> stringAsList = new ArrayList<>();
    stringAsList.add(new String[] { "bar1.1", "bar2.1" });
    stringAsList.add(new String[] { "bar1.2", "bar2.2" });

    JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

    JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList)
            .map((String[] row) -> RowFactory.create(row));

    // Creates schema
    StructType schema = DataTypes.createStructType(new StructField[] {
            DataTypes.createStructField("foe1", DataTypes.StringType, false),
            DataTypes.createStructField("foe2", DataTypes.StringType, false) });

    Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

    log.debug("** Schema: ");
    df.printSchema();

    log.debug("** Data: ");
    df.show();

    sparkContext.close();
}
Example 10
Source File: BuildDataFrameFromScratch.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder()
            .appName("Build a DataFrame from Scratch")
            .master("local[*]")
            .getOrCreate();

    List<String> stringAsList = new ArrayList<>();
    stringAsList.add("bar");

    JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

    JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList)
            .map((String row) -> RowFactory.create(row));

    // Creates schema
    StructType schema = DataTypes.createStructType(
            new StructField[] { DataTypes.createStructField("foe", DataTypes.StringType, false) });

    Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

    log.debug("** Schema: ");
    df.printSchema();

    log.debug("** Data: ");
    df.show();

    sparkContext.close();
}
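In both of these examples the JavaSparkContext is only a thin Java wrapper around the SparkContext owned by the SparkSession, so sparkContext.close() also ends that session. Stopping the session directly is an equivalent way to release the resources; a minimal sketch, assuming the spark variable from the example above:

// the wrapper shares the session's SparkContext, so stopping the session
// is equivalent to calling sparkContext.close() here
spark.stop();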
Example 11
Source File: MmcifToMmtfFull.java From mmtf-spark with Apache License 2.0
/**
 * Converts a directory containing .cif files into an MMTF-Hadoop Sequence file.
 * The input directory is traversed recursively to find PDB files.
 *
 * @param args args[0] <input-path-to-cif_files>, args[1] <output-path-to-mmtf-hadoop-file>
 *
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {
    if (args.length != 2) {
        System.out.println("Usage: MmcifToMmtfFull <input-path-to-cif_files> <output-path-to-mmtf-hadoop-file>");
    }

    // path to input directory
    String cifPath = args[0];
    // path to output directory
    String mmtfPath = args[1];

    // instantiate Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("MmcifToMmtfFull");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read cif files recursively starting from the specified top level directory
    JavaPairRDD<String, StructureDataInterface> structures = MmtfImporter.importMmcifFiles(cifPath, sc);

    // save as an MMTF-Hadoop Sequence File
    MmtfWriter.writeSequenceFile(mmtfPath, sc, structures);

    System.out.println(structures.count() + " structures written to: " + mmtfPath);

    // close Spark
    sc.close();
}
Example 12
Source File: DataFrameOperation.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("DataFrameOperation").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);

    // read the data source into a DataFrame, which can be thought of as a table carrying both data and schema information
    Dataset<Row> dataset = sqlContext.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    // print the table in a formatted way
    dataset.show();

    // print the metadata (schema)
    dataset.printSchema();

    // select columns and compute on them
    dataset.select("name").show();
    dataset.select(dataset.col("name"), dataset.col("age").plus(1)).show();

    // filter rows
    dataset.filter(dataset.col("age").gt(20)).show();

    // group by a column and count
    dataset.groupBy("age").count().show();

    sc.close();
}
Example 13
Source File: WriteMmtfCustom.java From mmtf-spark with Apache License 2.0
/**
 * @param args
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WriteMmtfCustom.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read a 20% random sample of the PDB
    double fraction = 0.2;
    long seed = 123;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, fraction, seed, sc);

    // retain high resolution X-ray structures
    pdb = pdb
            .filter(new ExperimentalMethods(ExperimentalMethods.X_RAY_DIFFRACTION))
            .filter(new Resolution(0, 2.0))
            .filter(new Rfree(0, 0.2));

    // coalesce this into 8 partitions to avoid creating many small files
    pdb = pdb.coalesce(8);

    // save this subset in a Hadoop Sequence file
    MmtfWriter.writeSequenceFile(path + "_xray", sc, pdb);

    System.out.println("# structures in custom set: " + pdb.count());

    long end = System.nanoTime();
    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
Example 14
Source File: JDBCDataSource.java From SparkDemo with MIT License
public static void main(String[] args) {
    // SparkConf conf = new SparkConf().setAppName("JDBCDataSource").setMaster("local");
    JavaSparkContext sc = SparkUtils.getRemoteSparkContext(JDBCDataSource.class);
    SQLContext sqlContext = new SQLContext(sc);

    Map<String, String> options = new HashMap<String, String>();
    options.put("url", "jdbc:mysql://192.168.2.129:3306/hive");
    options.put("dbtable", "t_user");
    options.put("user", "root");
    options.put("password", "666666");

    // load the JDBC configuration; this does not connect to the database yet
    Dataset<Row> dataset1 = sqlContext.read().format("jdbc").options(options).load();

    // options.put("dbtable", "tb_item");
    // DataFrame dataFrame2 = sqlContext.read().format("jdbc").options(options).load();

    // read the JDBC table data
    dataset1.javaRDD().foreach(new VoidFunction<Row>() {
        @Override
        public void call(Row row) throws Exception {
            System.out.println(row);
        }
    });

    // store the RDD data in MySQL
    saveToMysql(sqlContext, options);

    sc.close();
}
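The saveToMysql helper is defined elsewhere in the source file and is not shown here. A write back to MySQL could be sketched with the standard DataFrameWriter#jdbc call as below; the target table name and connection properties are assumptions for illustration, not the project's actual implementation.

// hypothetical sketch of a saveToMysql-style helper; requires org.apache.spark.sql.SaveMode
private static void saveToMysql(SQLContext sqlContext, Map<String, String> options) {
    Dataset<Row> dataset = sqlContext.read().format("jdbc").options(options).load();

    java.util.Properties props = new java.util.Properties();
    props.setProperty("user", options.get("user"));
    props.setProperty("password", options.get("password"));

    // append the rows to a (hypothetical) target table
    dataset.write()
           .mode(SaveMode.Append)
           .jdbc(options.get("url"), "t_user_copy", props);
}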
Example 15
Source File: WildTypeQuery.java From mmtf-spark with Apache License 2.0
/**
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WildTypeQuery.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    boolean includeExpressionTags = true;
    int sequenceCoverage = 95;

    long count = MmtfReader.readReducedSequenceFile(sc)
            .filter(new WildType(includeExpressionTags, sequenceCoverage))
            .count();

    System.out.println(count);

    sc.close();
}
Example 16
Source File: SecondaryStructureElementDemo.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(CustomReportDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    List<String> pdbIds = Arrays.asList("1STP"); // single protein chain
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadReducedMmtfFiles(pdbIds, sc).cache();

    pdb = pdb
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new ContainsLProteinChain());

    Dataset<Row> ds = SecondaryStructureElementExtractor.getDataset(pdb, "E", 6);

    // show the top 50 rows of this dataset
    ds.show(50, false);

    long end = System.nanoTime();
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");

    sc.close();
}
Example 17
Source File: InteractionAnalysisAdvanced.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // get non-redundant subset
    pdb = pdb.filter(new Pisces(40, 2.5));

    // find Zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
            .groupBy("residue2")
            .count();

    topGroups
            .sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("residue2", "atom2")
            .count();

    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .filter("frequency > 0.01") // filter out occurrences < 1 %
            .sort(col("frequency").desc()) // sort descending
            .show(20);

    // TODO print the top 10 interacting elements
    System.out.println("Top interacting elements");

    Dataset<Row> topElements = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("element2")
            .count();

    topElements
            .withColumn("frequency", col("count").divide(n))
            .filter("frequency > 0.01") // filter out occurrences < 1 %
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    interactions
            .groupBy("element2")
            .avg("distance")
            .sort("avg(distance)")
            .show(10);

    // Aggregate multiple statistics
    // Note: import static org.apache.spark.sql.functions.* required!
    // e.g. org.apache.spark.sql.functions.avg
    // for a list of all available functions
    interactions
            .groupBy("element2")
            .agg(count("distance"), avg("distance"), min("distance"), max("distance"), kurtosis("distance"))
            .show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
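The col, count, avg, min, max, and kurtosis calls in the example above rely on the static import mentioned in the comment. A minimal sketch of the relevant import line, assuming no other SQL functions are needed:

// brings col(), count(), avg(), min(), max(), kurtosis() and the other SQL functions into scope
import static org.apache.spark.sql.functions.*;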
Example 18
Source File: FilterByReleaseDate.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfReducedPath();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByReleaseDate.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    long count = MmtfReader
            .readSequenceFile(path, sc)
            .filter(new ReleaseDate("2000-01-28", "2017-02-28"))
            .count();

    System.out.println("Structures released between 2000-01-28 and 2017-02-28 " + count);

    sc.close();
}
Example 19
Source File: SparkGraphXKickoff.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException {
    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    final SparkConf conf = new SparkConf()
            .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
            .setAppName(APPLICATION_NAME);

    JavaSparkContext javaSparkContext = new JavaSparkContext(conf);

    List<Tuple2<Object, String>> listOfVertex = new ArrayList<>();
    listOfVertex.add(new Tuple2<>(1L, "James"));
    listOfVertex.add(new Tuple2<>(2L, "Andy"));
    listOfVertex.add(new Tuple2<>(3L, "Ed"));
    listOfVertex.add(new Tuple2<>(4L, "Roger"));
    listOfVertex.add(new Tuple2<>(5L, "Tony"));

    List<Edge<String>> listOfEdge = new ArrayList<>();
    listOfEdge.add(new Edge<>(2, 1, "Friend"));
    listOfEdge.add(new Edge<>(3, 1, "Friend"));
    listOfEdge.add(new Edge<>(3, 2, "Colleague"));
    listOfEdge.add(new Edge<>(3, 5, "Partner"));
    listOfEdge.add(new Edge<>(4, 3, "Boss"));
    listOfEdge.add(new Edge<>(5, 2, "Partner"));

    JavaRDD<Tuple2<Object, String>> vertexRDD = javaSparkContext.parallelize(listOfVertex);
    JavaRDD<Edge<String>> edgeRDD = javaSparkContext.parallelize(listOfEdge);

    ClassTag<String> stringTag = scala.reflect.ClassTag$.MODULE$.apply(String.class);

    Graph<String, String> graph = Graph.apply(
            vertexRDD.rdd(),
            edgeRDD.rdd(),
            "",
            StorageLevel.MEMORY_ONLY(),
            StorageLevel.MEMORY_ONLY(),
            stringTag,
            stringTag);

    // apply specific algorithms, such as PageRank

    graph.vertices()
            .saveAsTextFile(VERTICES_FOLDER_PATH);

    graph.edges()
            .saveAsTextFile(EDGES_FOLDER_PATH);

    javaSparkContext.close();
}
Example 20
Source File: ReducedRedundancyLocatorExampleMain.java From s3-inventory-usage-examples with Apache License 2.0
public static void main(String[] args) throws Exception {
    String srcBucketName;
    String scrBucketKey;
    String destBucketName;
    String destPrefix;

    ArgumentParser argumentParser = new ArgumentParser();
    AmazonS3 s3Client = new AmazonS3Client();

    try {
        BucketKey location = argumentParser.parseArguments(args);
        srcBucketName = location.getSrcBucket();
        scrBucketKey = location.getSrcKey();
        destBucketName = location.getDestBucket();
        destPrefix = location.getDestPrefix();
    } catch (ParseException e) {
        LOG.info(PARSE_ERROR_MSG);
        throw new IllegalArgumentException("Parser throw a parse Exception", e);
    }

    // Obtain the original manifest files
    InventoryManifestRetriever inventoryManifestRetriever =
            new InventoryManifestRetriever(s3Client, srcBucketName, scrBucketKey);
    InventoryManifest manifest = inventoryManifestRetriever.getInventoryManifest();

    // Check if the inventory report includes the StorageClass column
    String fileSchema = manifest.getFileSchema();
    String filterColumn = "storageClass";
    if (!StringUtils.containsIgnoreCase(fileSchema, filterColumn)) {
        throw new StorageClassNotIncludedException();
    }

    // Create Spark Context
    SparkConf sparkConf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    Broadcast<CachedS3ClientFactory> clientFactory = sc.broadcast(new CachedS3ClientFactory());

    // Get the inventory report, split it into lines, parse each line to a POJO,
    // filter, and write a new csv file to S3
    JavaRDD<InventoryManifest.Locator> locatorRDD = sc.parallelize(manifest.getLocators());
    List<InventoryManifest.Locator> newLocatorList = locatorRDD
            .map(new InventoryReportLineRetriever(clientFactory, manifest))
            .flatMap(new InventoryReportMapper(manifest))
            .filter(new ReducedRedundancyStorageClassFilter())
            .mapPartitions(new WriteNewInventoryReportFunc(clientFactory, srcBucketName, manifest,
                    destBucketName, destPrefix))
            .collect();

    // Generate new manifest files including new locators, and send them back to S3
    new ManifestWriter(s3Client, destBucketName, destPrefix, srcBucketName, manifest)
            .writeManifest(newLocatorList);

    sc.close();
}
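If any of the S3 or Spark calls in this last example throws, sc.close() is never reached and the application relies on JVM shutdown to release the context. A minimal sketch of guarding the close in a finally block, with a placeholder job body rather than the example's actual inventory-processing pipeline:

JavaSparkContext sc = new JavaSparkContext(new SparkConf());
try {
    // run the Spark job here (placeholder for the pipeline above)
    sc.parallelize(java.util.Arrays.asList(1, 2, 3)).count();
} finally {
    // always release the context, even when the job throws
    sc.close();
}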