Java Code Examples for org.apache.spark.api.java.JavaSparkContext#close()
The following examples show how to use org.apache.spark.api.java.JavaSparkContext#close().
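Because JavaSparkContext implements java.io.Closeable, close() can also be invoked implicitly through try-with-resources, which releases the context even if the job throws. The sketch below is a minimal illustration of that pattern, not taken from any of the projects listed here; the application name and input path are placeholders.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class CloseWithTryWithResources {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("CloseWithTryWithResources");

        // JavaSparkContext implements Closeable, so close() runs automatically
        // when the try block exits, whether normally or with an exception.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            long count = sc.textFile("data/input.txt").count(); // hypothetical input path
            System.out.println("# lines: " + count);
        }
    }
}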
Example 1
Source File: SparkWordCount.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws Exception {
    System.out.println(System.getProperty("hadoop.home.dir"));
    String inputPath = args[0];
    String outputPath = args[1];
    FileUtils.deleteQuietly(new File(outputPath));

    JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

    JavaRDD<String> rdd = sc.textFile(inputPath);

    JavaPairRDD<String, Integer> counts = rdd
            .flatMap(x -> Arrays.asList(x.split(" ")).iterator())
            .mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
            .reduceByKey((x, y) -> x + y);

    counts.saveAsTextFile(outputPath);

    sc.close();
}
Example 2
Source File: Union.java From SparkDemo with MIT License
static void union(JavaSparkContext sc) {
    List<String> datas1 = Arrays.asList("张三", "李四");
    List<String> datas2 = Arrays.asList("tom", "gim");

    JavaRDD<String> data1RDD = sc.parallelize(datas1);
    JavaRDD<String> data2RDD = sc.parallelize(datas2);

    /**
     * ====================================================================
     * Merge two RDDs without removing duplicates; both RDDs must contain
     * elements of the same type.
     * ====================================================================
     */
    JavaRDD<String> unionRDD = data1RDD.union(data2RDD);

    unionRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });

    sc.close();
}
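Since VoidFunction is a single-method interface, the anonymous class above can be replaced by a lambda on Java 8+. A minimal sketch of the same traversal, assuming the surrounding method is otherwise unchanged:

// equivalent to the anonymous VoidFunction above, written as a lambda
unionRDD.foreach(t -> System.out.println(t));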
Example 3
Source File: FilterByRFree.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfReducedPath();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByRFree.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // here the methods are chained together
    long count = MmtfReader
            .readSequenceFile(path, sc)
            .filter(new Resolution(0.0, 2.0))
            .count();

    System.out.println("# structures: " + count);

    sc.close();
}
Example 4
Source File: PolyPeptideChainStatistics.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PolyPeptideChainStatistics.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaDoubleRDD chainLengths = MmtfReader
            .readReducedSequenceFile(sc) // read PDB from MMTF-Hadoop sequence file
            .flatMapToPair(new StructureToPolymerChains(false, true)) // split (flatmap) into unique polymer chains
            .filter(new PolymerComposition(PolymerComposition.AMINO_ACIDS_20)) // only consider chains that contain the 20 standard amino acids
            .mapToDouble(t -> t._2.getNumGroups()); // get the number of groups (residues) in each chain using a lambda expression

    System.out.println("Protein chains length statistics for proteins in the PDB with the 20 standard amino acids:");
    System.out.println(chainLengths.stats());

    sc.close();
}
Example 5
Source File: ReadMmtfFull.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {
    long start = System.nanoTime();

    // instantiate Spark. Each Spark application needs these two lines of code.
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ReadMmtfFull.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read all PDB entries from a local Hadoop sequence file
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readFullSequenceFile(sc);

    System.out.println("# structures: " + pdb.count());

    // close Spark
    sc.close();

    long end = System.nanoTime();
    System.out.println((end-start)/1E9 + " sec.");
}
Example 6
Source File: FilterExclusivelyByLProteins.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfReducedPath();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterExclusivelyByLProteins.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    boolean exclusive = true;

    long count = MmtfReader
            .readSequenceFile(path, sc) // read MMTF hadoop sequence file
            // retain pdb entries that exclusively (flag set to true) contain L-protein chains
            .filter(new ContainsLProteinChain(exclusive))
            .count();

    System.out.println("# L-proteins: " + count);

    sc.close();
}
Example 7
Source File: SpecifyFormatLoadSave.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("SpecifyFormatLoadSave").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // create a DataFrame and read the JSON file
    SQLContext sqlContext = new SQLContext(sc);
    DataFrameReader dataFrameReader = sqlContext.read();

    // Parquet is the format used here for local data storage
    Dataset<Row> dataset = dataFrameReader.format("json").load(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    // write and save the selected column through the DataFrameWriter
    DataFrameWriter write = dataset.select("name").write();
    write.format("parquet").save("tmp/people.parquet");

    sc.close();
}
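In Spark 2.x, SQLContext is superseded by SparkSession, and the same load/save flow can be written against it, with stop() taking the place of JavaSparkContext#close(). A minimal sketch under that assumption; the class name and file paths are placeholders, not part of the SparkDemo project:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SpecifyFormatLoadSaveSession {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("SpecifyFormatLoadSave")
                .master("local")
                .getOrCreate();

        // read JSON into a Dataset<Row>
        Dataset<Row> dataset = spark.read().format("json").load("data/resources/people.json");

        // save a single column as Parquet
        dataset.select("name").write().format("parquet").save("tmp/people.parquet");

        // stop() releases the underlying SparkContext, analogous to sc.close()
        spark.stop();
    }
}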
Example 8
Source File: MapToBioJava.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(MapToBioJava.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    long count = MmtfReader
            .readReducedSequenceFile(sc) // read MMTF-Hadoop sequence file
            .flatMapToPair(new StructureToPolymerChains())
            .mapValues(new StructureToBioJava())
            .count();

    System.out.println("Number of polymer chains: " + count);

    sc.close();
}
Example 9
Source File: BuildDataFrameFromScratch2.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder()
            .appName("Build a DataFrame from Scratch")
            .master("local[*]")
            .getOrCreate();

    List<String[]> stringAsList = new ArrayList<>();
    stringAsList.add(new String[] { "bar1.1", "bar2.1" });
    stringAsList.add(new String[] { "bar1.2", "bar2.2" });

    JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

    JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList)
            .map((String[] row) -> RowFactory.create(row));

    // Creates schema
    StructType schema = DataTypes.createStructType(new StructField[] {
            DataTypes.createStructField("foe1", DataTypes.StringType, false),
            DataTypes.createStructField("foe2", DataTypes.StringType, false) });

    Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

    log.debug("** Schema: ");
    df.printSchema();

    log.debug("** Data: ");
    df.show();

    sparkContext.close();
}
Example 10
Source File: BuildDataFrameFromScratch.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder()
            .appName("Build a DataFrame from Scratch")
            .master("local[*]")
            .getOrCreate();

    List<String> stringAsList = new ArrayList<>();
    stringAsList.add("bar");

    JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

    JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList)
            .map((String row) -> RowFactory.create(row));

    // Creates schema
    StructType schema = DataTypes.createStructType(
            new StructField[] { DataTypes.createStructField("foe", DataTypes.StringType, false) });

    Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

    log.debug("** Schema: ");
    df.printSchema();

    log.debug("** Data: ");
    df.show();

    sparkContext.close();
}
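In both of these examples the JavaSparkContext is only a thin Java wrapper around the SparkContext owned by the SparkSession, so sparkContext.close() also ends that session. Stopping the session directly is an equivalent way to release the resources; a minimal sketch, assuming the spark variable from the example above:

// the wrapper shares the session's SparkContext, so stopping the session
// is equivalent to calling sparkContext.close() here
spark.stop();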
Example 11
Source File: MmcifToMmtfFull.java From mmtf-spark with Apache License 2.0
/**
 * Converts a directory containing .cif files into an MMTF-Hadoop Sequence file.
 * The input directory is traversed recursively to find PDB files.
 *
 * @param args args[0] <input-path-to-cif_files>, args[1] <output-path-to-mmtf-hadoop-file>
 *
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {
    if (args.length != 2) {
        System.out.println("Usage: MmcifToMmtfFull <input-path-to-cif_files> <output-path-to-mmtf-hadoop-file>");
    }

    // path to input directory
    String cifPath = args[0];
    // path to output directory
    String mmtfPath = args[1];

    // instantiate Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("MmcifToMmtfFull");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read cif files recursively starting from the specified top level directory
    JavaPairRDD<String, StructureDataInterface> structures = MmtfImporter.importMmcifFiles(cifPath, sc);

    // save as an MMTF-Hadoop Sequence File
    MmtfWriter.writeSequenceFile(mmtfPath, sc, structures);

    System.out.println(structures.count() + " structures written to: " + mmtfPath);

    // close Spark
    sc.close();
}
Example 12
Source File: DataFrameOperation.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("DataFrameOperation").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);

    // read the data source into a DataFrame, which can be thought of as a table carrying both data and schema information
    Dataset<Row> dataset = sqlContext.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    // print the table in a formatted way
    dataset.show();

    // print the metadata (schema)
    dataset.printSchema();

    // select columns and compute on them
    dataset.select("name").show();
    dataset.select(dataset.col("name"), dataset.col("age").plus(1)).show();

    // filter rows
    dataset.filter(dataset.col("age").gt(20)).show();

    // group by a column and count
    dataset.groupBy("age").count().show();

    sc.close();
}
Example 13
Source File: WriteMmtfCustom.java From mmtf-spark with Apache License 2.0
/**
 * @param args
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WriteMmtfCustom.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read a 20% random sample of the PDB
    double fraction = 0.2;
    long seed = 123;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, fraction, seed, sc);

    // retain high resolution X-ray structures
    pdb = pdb
            .filter(new ExperimentalMethods(ExperimentalMethods.X_RAY_DIFFRACTION))
            .filter(new Resolution(0, 2.0))
            .filter(new Rfree(0, 0.2));

    // coalesce this into 8 partitions to avoid creating many small files
    pdb = pdb.coalesce(8);

    // save this subset in a Hadoop Sequence file
    MmtfWriter.writeSequenceFile(path + "_xray", sc, pdb);

    System.out.println("# structures in custom set: " + pdb.count());

    long end = System.nanoTime();
    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
Example 14
Source File: JDBCDataSource.java From SparkDemo with MIT License
public static void main(String[] args) {
    // SparkConf conf = new SparkConf().setAppName("JDBCDataSource").setMaster("local");
    JavaSparkContext sc = SparkUtils.getRemoteSparkContext(JDBCDataSource.class);
    SQLContext sqlContext = new SQLContext(sc);

    Map<String, String> options = new HashMap<String, String>();
    options.put("url", "jdbc:mysql://192.168.2.129:3306/hive");
    options.put("dbtable", "t_user");
    options.put("user", "root");
    options.put("password", "666666");

    // load the JDBC configuration; this does not connect to the database yet
    Dataset<Row> dataset1 = sqlContext.read().format("jdbc").options(options).load();

    // options.put("dbtable", "tb_item");
    // DataFrame dataFrame2 = sqlContext.read().format("jdbc").options(options).load();

    // read the JDBC table data
    dataset1.javaRDD().foreach(new VoidFunction<Row>() {
        @Override
        public void call(Row row) throws Exception {
            System.out.println(row);
        }
    });

    // store the RDD data in MySQL
    saveToMysql(sqlContext, options);

    sc.close();
}
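The saveToMysql helper is defined elsewhere in the source file and is not shown here. A write back to MySQL could be sketched with the standard DataFrameWriter#jdbc call as below; the target table name and connection properties are assumptions for illustration, not the project's actual implementation.

// hypothetical sketch of a saveToMysql-style helper; requires org.apache.spark.sql.SaveMode
private static void saveToMysql(SQLContext sqlContext, Map<String, String> options) {
    Dataset<Row> dataset = sqlContext.read().format("jdbc").options(options).load();

    java.util.Properties props = new java.util.Properties();
    props.setProperty("user", options.get("user"));
    props.setProperty("password", options.get("password"));

    // append the rows to a (hypothetical) target table
    dataset.write()
           .mode(SaveMode.Append)
           .jdbc(options.get("url"), "t_user_copy", props);
}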
Example 15
Source File: WildTypeQuery.java From mmtf-spark with Apache License 2.0
/**
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WildTypeQuery.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    boolean includeExpressionTags = true;
    int sequenceCoverage = 95;

    long count = MmtfReader.readReducedSequenceFile(sc)
            .filter(new WildType(includeExpressionTags, sequenceCoverage))
            .count();

    System.out.println(count);

    sc.close();
}
Example 16
Source File: SecondaryStructureElementDemo.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(CustomReportDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    List<String> pdbIds = Arrays.asList("1STP"); // single protein chain
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadReducedMmtfFiles(pdbIds, sc).cache();

    pdb = pdb
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new ContainsLProteinChain());

    Dataset<Row> ds = SecondaryStructureElementExtractor.getDataset(pdb, "E", 6);

    // show the top 50 rows of this dataset
    ds.show(50, false);

    long end = System.nanoTime();
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");

    sc.close();
}
Example 17
Source File: InteractionAnalysisAdvanced.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // get non-redundant subset
    pdb = pdb.filter(new Pisces(40, 2.5));

    // find Zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
            .groupBy("residue2")
            .count();

    topGroups
            .sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("residue2", "atom2")
            .count();

    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .filter("frequency > 0.01") // filter out occurrences < 1 %
            .sort(col("frequency").desc()) // sort descending
            .show(20);

    // TODO print the top 10 interacting elements
    System.out.println("Top interacting elements");

    Dataset<Row> topElements = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("element2")
            .count();

    topElements
            .withColumn("frequency", col("count").divide(n))
            .filter("frequency > 0.01") // filter out occurrences < 1 %
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    interactions
            .groupBy("element2")
            .avg("distance")
            .sort("avg(distance)")
            .show(10);

    // Aggregate multiple statistics
    // Note: import static org.apache.spark.sql.functions.* required!
    // e.g. org.apache.spark.sql.functions.avg
    // for a list of all available functions
    interactions
            .groupBy("element2")
            .agg(count("distance"), avg("distance"), min("distance"), max("distance"), kurtosis("distance"))
            .show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
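The col, count, avg, min, max, and kurtosis calls in the example above rely on the static import mentioned in the comment. A minimal sketch of the relevant import line, assuming no other SQL functions are needed:

// brings col(), count(), avg(), min(), max(), kurtosis() and the other SQL functions into scope
import static org.apache.spark.sql.functions.*;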
Example 18
Source File: FilterByReleaseDate.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {
    String path = MmtfReader.getMmtfReducedPath();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByReleaseDate.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    long count = MmtfReader
            .readSequenceFile(path, sc)
            .filter(new ReleaseDate("2000-01-28", "2017-02-28"))
            .count();

    System.out.println("Structures released between 2000-01-28 and 2017-02-28 " + count);

    sc.close();
}
Example 19
Source File: SparkGraphXKickoff.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException {
    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    final SparkConf conf = new SparkConf()
            .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
            .setAppName(APPLICATION_NAME);

    JavaSparkContext javaSparkContext = new JavaSparkContext(conf);

    List<Tuple2<Object, String>> listOfVertex = new ArrayList<>();
    listOfVertex.add(new Tuple2<>(1L, "James"));
    listOfVertex.add(new Tuple2<>(2L, "Andy"));
    listOfVertex.add(new Tuple2<>(3L, "Ed"));
    listOfVertex.add(new Tuple2<>(4L, "Roger"));
    listOfVertex.add(new Tuple2<>(5L, "Tony"));

    List<Edge<String>> listOfEdge = new ArrayList<>();
    listOfEdge.add(new Edge<>(2, 1, "Friend"));
    listOfEdge.add(new Edge<>(3, 1, "Friend"));
    listOfEdge.add(new Edge<>(3, 2, "Colleague"));
    listOfEdge.add(new Edge<>(3, 5, "Partner"));
    listOfEdge.add(new Edge<>(4, 3, "Boss"));
    listOfEdge.add(new Edge<>(5, 2, "Partner"));

    JavaRDD<Tuple2<Object, String>> vertexRDD = javaSparkContext.parallelize(listOfVertex);
    JavaRDD<Edge<String>> edgeRDD = javaSparkContext.parallelize(listOfEdge);

    ClassTag<String> stringTag = scala.reflect.ClassTag$.MODULE$.apply(String.class);

    Graph<String, String> graph = Graph.apply(
            vertexRDD.rdd(),
            edgeRDD.rdd(),
            "",
            StorageLevel.MEMORY_ONLY(),
            StorageLevel.MEMORY_ONLY(),
            stringTag,
            stringTag);

    // apply specific algorithms, such as PageRank

    graph.vertices()
            .saveAsTextFile(VERTICES_FOLDER_PATH);

    graph.edges()
            .saveAsTextFile(EDGES_FOLDER_PATH);

    javaSparkContext.close();
}
Example 20
Source File: ReducedRedundancyLocatorExampleMain.java From s3-inventory-usage-examples with Apache License 2.0
public static void main(String[] args) throws Exception {
    String srcBucketName;
    String scrBucketKey;
    String destBucketName;
    String destPrefix;

    ArgumentParser argumentParser = new ArgumentParser();
    AmazonS3 s3Client = new AmazonS3Client();

    try {
        BucketKey location = argumentParser.parseArguments(args);
        srcBucketName = location.getSrcBucket();
        scrBucketKey = location.getSrcKey();
        destBucketName = location.getDestBucket();
        destPrefix = location.getDestPrefix();
    } catch (ParseException e) {
        LOG.info(PARSE_ERROR_MSG);
        throw new IllegalArgumentException("Parser throw a parse Exception", e);
    }

    // Obtain the original manifest files
    InventoryManifestRetriever inventoryManifestRetriever =
            new InventoryManifestRetriever(s3Client, srcBucketName, scrBucketKey);
    InventoryManifest manifest = inventoryManifestRetriever.getInventoryManifest();

    // Check if the inventory report includes the StorageClass column
    String fileSchema = manifest.getFileSchema();
    String filterColumn = "storageClass";
    if (!StringUtils.containsIgnoreCase(fileSchema, filterColumn)) {
        throw new StorageClassNotIncludedException();
    }

    // Create Spark Context
    SparkConf sparkConf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    Broadcast<CachedS3ClientFactory> clientFactory = sc.broadcast(new CachedS3ClientFactory());

    // Get the inventory report, split it into lines, parse each line to a POJO,
    // filter, and write a new csv file to S3
    JavaRDD<InventoryManifest.Locator> locatorRDD = sc.parallelize(manifest.getLocators());
    List<InventoryManifest.Locator> newLocatorList = locatorRDD
            .map(new InventoryReportLineRetriever(clientFactory, manifest))
            .flatMap(new InventoryReportMapper(manifest))
            .filter(new ReducedRedundancyStorageClassFilter())
            .mapPartitions(new WriteNewInventoryReportFunc(clientFactory, srcBucketName, manifest,
                    destBucketName, destPrefix))
            .collect();

    // Generate new manifest files including new locators, and send them back to S3
    new ManifestWriter(s3Client, destBucketName, destPrefix, srcBucketName, manifest)
            .writeManifest(newLocatorList);

    sc.close();
}
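If any of the S3 or Spark calls in this last example throws, sc.close() is never reached and the application relies on JVM shutdown to release the context. A minimal sketch of guarding the close in a finally block, with a placeholder job body rather than the example's actual inventory-processing pipeline:

JavaSparkContext sc = new JavaSparkContext(new SparkConf());
try {
    // run the Spark job here (placeholder for the pipeline above)
    sc.parallelize(java.util.Arrays.asList(1, 2, 3)).count();
} finally {
    // always release the context, even when the job throws
    sc.close();
}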