Java Code Examples for org.apache.spark.sql.SparkSession#createDataset()
The following examples show how to use org.apache.spark.sql.SparkSession#createDataset(). Each example is taken from an open-source project; the source file, project, and license are noted above the code.
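Before the project examples, here is a minimal, self-contained sketch of the most common call pattern, createDataset(List<T>, Encoder<T>), using only standard Spark classes. The class name and app name are illustrative and do not come from any of the projects below.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class CreateDatasetSketch {
    public static void main(String[] args) {
        // local session, just for this sketch
        SparkSession spark = SparkSession.builder()
            .appName("CreateDatasetSketch")
            .master("local")
            .getOrCreate();

        // build a Dataset<String> from an in-memory list and a built-in encoder
        List<String> data = Arrays.asList("a", "b", "c");
        Dataset<String> ds = spark.createDataset(data, Encoders.STRING());
        ds.show();

        spark.stop();
    }
}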
Example 1
Source File: MyVariantDataset.java From mmtf-spark with Apache License 2.0
/**
 * Returns a dataset of missense variations for a list of Uniprot Ids and a MyVariant.info query.
 * See <a href="http://myvariant.info/docs/">query syntax</a>.
 * <p> Example:
 * <pre>
 * String query = "clinvar.rcv.clinical_significance:pathogenic "
 *              + "OR clinvar.rcv.clinical_significance:likely pathogenic";
 * </pre>
 *
 * @param uniprotIds list of Uniprot Ids
 * @param query MyVariant.info query string
 * @return dataset with variation Ids and Uniprot Ids or null if no data are found
 * @throws IOException
 */
public static Dataset<Row> getVariations(List<String> uniprotIds, String query) throws IOException {
    // get a spark context
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc will be closed elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // download data in parallel
    JavaRDD<String> data = sc.parallelize(uniprotIds).flatMap(m -> getData(m, query));

    // convert from JavaRDD to Dataset
    Dataset<String> jsonData = spark.createDataset(JavaRDD.toRDD(data), Encoders.STRING());

    // parse json strings and return as a dataset
    Dataset<Row> dataset = spark.read().json(jsonData);

    // return null if dataset contains no results
    if (!Arrays.asList(dataset.columns()).contains("hits")) {
        System.out.println("MyVariantDataset: no matches found");
        return null;
    }

    return flattenDataset(dataset);
}
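This example and Example 3 below share the same core pattern: collect JSON documents into a JavaRDD<String>, wrap that RDD in a Dataset<String> with createDataset(), and let spark.read().json(...) infer a schema while parsing. The following stripped-down sketch shows just that pattern; the hard-coded jsonStrings list is a placeholder for the data the real code downloads from a web service.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class JsonRddToDatasetSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local").getOrCreate();
        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        // placeholder JSON records; in the real examples these are fetched from a web service
        List<String> jsonStrings = Arrays.asList(
            "{\"id\":1,\"name\":\"alpha\"}",
            "{\"id\":2,\"name\":\"beta\"}");
        JavaRDD<String> jsonRdd = sc.parallelize(jsonStrings);

        // wrap the RDD of JSON strings in a Dataset<String> ...
        Dataset<String> jsonDataset = spark.createDataset(JavaRDD.toRDD(jsonRdd), Encoders.STRING());

        // ... and let Spark infer the schema while parsing it
        Dataset<Row> parsed = spark.read().json(jsonDataset);
        parsed.show();

        spark.stop();
    }
}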
Example 2
Source File: DataSetApplication.java From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();
    Person person = new Person("spark", 10);
    Encoder<Person> encoder = Encoders.bean(Person.class);

    Dataset<Person> dataset = sparkSession.createDataset(Collections.singletonList(person), encoder);
    dataset.show();
    // final output: {name: spark, age: 10}

    /* encoders for common types */
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> integerDataset = sparkSession.createDataset(Arrays.asList(1, 2), integerEncoder);
    Dataset<Integer> result = integerDataset.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) {
            return value + 1;
        }
    }, integerEncoder);
    result.collect();
    // final output: [2, 3]

    /* by providing a class, a DataFrame can be converted to a Dataset; mapping is based on field names */
    String url = "/usr/local/text.json";
    Dataset<Person> personDataset = sparkSession.read().json(url).as(encoder);
    personDataset.show();
    // final output: name: ... age: ...
}
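Example 2 above and Example 9 below rely on a Person JavaBean that is not shown on this page. A minimal bean compatible with Encoders.bean(Person.class) and with the calls in both examples would look roughly like this sketch; the field and accessor names are inferred from the usage, not copied from either project.

import java.io.Serializable;

public class Person implements Serializable {
    private String name;
    private int age;

    // no-arg constructor required by the bean encoder
    public Person() {
    }

    // convenience constructor as used in Example 2
    public Person(String name, int age) {
        this.name = name;
        this.age = age;
    }

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public int getAge() { return age; }
    public void setAge(int age) { this.age = age; }
}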
Example 3
Source File: G2SDataset.java From mmtf-spark with Apache License 2.0
/**
 * Downloads PDB residue mappings for a list of genomic variations.
 * @param variationIds genomic variation ids (e.g. chr7:g.140449103A>C)
 * @param structureId specific PDB structure used for mapping
 * @param chainId specific chain used for mapping
 * @return dataset with PDB mapping information
 * @throws IOException
 */
private static Dataset<Row> getDataset(List<String> variationIds, String structureId, String chainId) throws IOException {
    // get a spark context
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc will be closed elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // download data in parallel
    JavaRDD<String> data = sc.parallelize(variationIds).flatMap(m -> getData(m, structureId, chainId));

    // convert from JavaRDD to Dataset
    Dataset<String> jsonData = spark.createDataset(JavaRDD.toRDD(data), Encoders.STRING());

    // parse json strings and return as a dataset
    Dataset<Row> dataset = spark.read().json(jsonData);
    dataset.show();

    // return null if dataset is empty
    if (dataset.columns().length == 0) {
        System.out.println("G2SDataset: no matches found");
        return null;
    }

    dataset = standardizeData(dataset);
    return flattenDataset(dataset);
}
Example 4
Source File: JavaBean.java From learning-spark-with-java with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("Dataset-JavaBean")
        .master("local[4]")
        .getOrCreate();

    //
    // The Java API requires you to explicitly instantiate an encoder for
    // any JavaBean you want to use for schema inference
    //
    Encoder<Number> numberEncoder = Encoders.bean(Number.class);
    //
    // Create a container of the JavaBean instances
    //
    List<Number> data = Arrays.asList(
            new Number(1, "one", "un"),
            new Number(2, "two", "deux"),
            new Number(3, "three", "trois"));
    //
    // Use the encoder and the container of JavaBean instances to create a
    // Dataset
    //
    Dataset<Number> ds = spark.createDataset(data, numberEncoder);

    System.out.println("*** here is the schema inferred from the bean");
    ds.printSchema();

    System.out.println("*** here is the data");
    ds.show();

    // Use the convenient bean-inferred column names to query
    System.out.println("*** filter by one column and fetch others");
    ds.where(col("i").gt(2)).select(col("english"), col("french")).show();

    spark.stop();
}
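The Number JavaBean used above is likewise not shown on this page. Judging from the constructor calls and the columns queried (i, english, french), a compatible bean might look like the following sketch; it is reconstructed from usage rather than taken from the learning-spark-with-java repository.

import java.io.Serializable;

// note: within its own package this class shadows java.lang.Number, as the example's usage implies
public class Number implements Serializable {
    private int i;
    private String english;
    private String french;

    // no-arg constructor required by Encoders.bean()
    public Number() {
    }

    public Number(int i, String english, String french) {
        this.i = i;
        this.english = english;
        this.french = french;
    }

    public int getI() { return i; }
    public void setI(int i) { this.i = i; }
    public String getEnglish() { return english; }
    public void setEnglish(String english) { this.english = english; }
    public String getFrench() { return french; }
    public void setFrench(String french) { this.french = french; }
}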
Example 5
Source File: EncoderHelpersTest.java From beam with Apache License 2.0
@Test
public void beamCoderToSparkEncoderTest() {
    SparkSession sparkSession = SparkSession.builder()
        .appName("beamCoderToSparkEncoderTest")
        .master("local[4]")
        .getOrCreate();

    List<Integer> data = Arrays.asList(1, 2, 3);
    Dataset<Integer> dataset =
        sparkSession.createDataset(data, EncoderHelpers.fromBeamCoder(VarIntCoder.of()));
    assertEquals(data, dataset.collectAsList());
}
Example 6
Source File: ArrayToDatasetApp.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Array to Dataset<String>")
        .master("local")
        .getOrCreate();

    String[] l = new String[] { "a", "b", "c", "d" };
    List<String> data = Arrays.asList(l);

    Dataset<String> df = spark.createDataset(data, Encoders.STRING());
    df.show();
}
Example 7
Source File: ReducerApp.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder().master("local").getOrCreate();

    List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    Dataset<Integer> df = spark.createDataset(data, Encoders.INT());
    df.show();
    df.printSchema();

    Integer sumByReduce = df.reduce(new SumByReduce());
    System.out.println("Sum should be 55 and it is... " + sumByReduce);
}
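The SumByReduce helper referenced above is not shown here. Dataset#reduce takes an org.apache.spark.api.java.function.ReduceFunction, so a plausible reconstruction (not necessarily the project's actual class) is:

import org.apache.spark.api.java.function.ReduceFunction;

public class SumByReduce implements ReduceFunction<Integer> {
    @Override
    public Integer call(Integer left, Integer right) {
        // pairwise sum; folding 1..10 this way yields 55
        return left + right;
    }
}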
Example 8
Source File: SparkTableUtil.java From iceberg with Apache License 2.0
/**
 * Import files from given partitions to an Iceberg table.
 *
 * @param spark a Spark session
 * @param partitions partitions to import
 * @param targetTable an Iceberg table where to import the data
 * @param spec a partition spec
 * @param stagingDir a staging directory to store temporary manifest files
 */
public static void importSparkPartitions(
    SparkSession spark, List<SparkPartition> partitions, Table targetTable, PartitionSpec spec,
    String stagingDir) {
  Configuration conf = spark.sessionState().newHadoopConf();
  SerializableConfiguration serializableConf = new SerializableConfiguration(conf);
  int parallelism = Math.min(
      partitions.size(), spark.sessionState().conf().parallelPartitionDiscoveryParallelism());
  int numShufflePartitions = spark.sessionState().conf().numShufflePartitions();
  MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties());

  JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
  JavaRDD<SparkPartition> partitionRDD = sparkContext.parallelize(partitions, parallelism);

  Dataset<SparkPartition> partitionDS = spark.createDataset(
      partitionRDD.rdd(),
      Encoders.javaSerialization(SparkPartition.class));

  List<ManifestFile> manifests = partitionDS
      .flatMap((FlatMapFunction<SparkPartition, DataFile>) sparkPartition ->
              listPartition(sparkPartition, spec, serializableConf, metricsConfig).iterator(),
          Encoders.javaSerialization(DataFile.class))
      .repartition(numShufflePartitions)
      .map((MapFunction<DataFile, Tuple2<String, DataFile>>) file ->
              Tuple2.apply(file.path().toString(), file),
          Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class)))
      .orderBy(col("_1"))
      .mapPartitions(
          (MapPartitionsFunction<Tuple2<String, DataFile>, ManifestFile>) fileTuple ->
              buildManifest(serializableConf, spec, stagingDir, fileTuple),
          Encoders.javaSerialization(ManifestFile.class))
      .collectAsList();

  try {
    boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean(
        targetTable.properties(),
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED,
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT);

    AppendFiles append = targetTable.newAppend();
    manifests.forEach(append::appendManifest);
    append.commit();

    if (!snapshotIdInheritanceEnabled) {
      // delete original manifests as they were rewritten before the commit
      deleteManifests(targetTable.io(), manifests);
    }
  } catch (Throwable e) {
    deleteManifests(targetTable.io(), manifests);
    throw e;
  }
}
Example 9
Source File: JavaSparkSQLExample.java From SparkDemo with MIT License
private static void runDatasetCreationExample(SparkSession spark) {
    // $example on:create_ds$
    // Create an instance of a Bean class
    Person person = new Person();
    person.setName("Andy");
    person.setAge(32);

    // Encoders are created for Java beans
    Encoder<Person> personEncoder = Encoders.bean(Person.class);
    Dataset<Person> javaBeanDS = spark.createDataset(
        Collections.singletonList(person),
        personEncoder
    );
    javaBeanDS.show();
    // +---+----+
    // |age|name|
    // +---+----+
    // | 32|Andy|
    // +---+----+

    // Encoders for most common types are provided in class Encoders
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
    Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) throws Exception {
            return value + 1;
        }
    }, integerEncoder);
    transformedDS.collect(); // Returns [2, 3, 4]

    // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
    String path = Constant.LOCAL_FILE_PREX + "/data/resources/people.json";
    Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
    peopleDS.show();
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |null|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+
    // $example off:create_ds$
}
Example 10
Source File: MockValueSets.java From bunsen with Apache License 2.0
/**
 * Convenience method to create a MockValueSets instance with some test data.
 */
public static MockValueSets createWithTestValue(SparkSession spark,
    SparkRowConverter valueSetRowConverter) {

  Dataset<UrlAndVersion> urlAndVersion = spark.createDataset(
      ImmutableList.of(
          new UrlAndVersion(
              "http://hl7.org/fhir/us/core/ValueSet/us-core-encounter-type",
              "1.1.0"),
          new UrlAndVersion(
              "http://hl7.org/fhir/ValueSet/v3-ActPriority",
              "2017-04-19")),
      AbstractValueSets.getUrlAndVersionEncoder());

  Dataset<Row> valueSet = valueSetRowConverter.toDataFrame(spark,
      ImmutableList.of(
          new ValueSet()
              .setUrl("http://hl7.org/fhir/us/core/ValueSet/us-core-encounter-type")
              .setVersion("1.1.0"),
          new ValueSet()
              .setUrl("http://hl7.org/fhir/ValueSet/v3-ActPriority")
              .setVersion("2017-04-19")))
      .withColumn("timestamp", lit("20180101120000").cast("timestamp"));

  Dataset<Value> values = spark.createDataset(
      ImmutableList.of(
          new Value(
              "http://hl7.org/fhir/us/core/ValueSet/us-core-encounter-type",
              "1.1.0",
              "http://www.ama-assn.org/go/cpt",
              "0.0.1",
              "99200"),
          new Value(
              "http://hl7.org/fhir/ValueSet/v3-ActPriority",
              "2017-04-19",
              "http://hl7.org/fhir/v3/ActPriority",
              "2017-04-19",
              "EM")),
      AbstractValueSets.getValueEncoder());

  return new MockValueSets(spark, urlAndVersion, valueSet, values, valueSetRowConverter);
}
Example 11
Source File: Basic.java From learning-spark-with-java with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("Dataset-Basic")
        .master("local[4]")
        .getOrCreate();

    List<Integer> data = Arrays.asList(10, 11, 12, 13, 14, 15);
    Dataset<Integer> ds = spark.createDataset(data, Encoders.INT());

    System.out.println("*** only one column, and it always has the same name");
    ds.printSchema();
    ds.show();

    System.out.println("*** values > 12");
    // the harder way to filter
    Dataset<Integer> ds2 = ds.filter((Integer value) -> value > 12);
    ds2.show();

    List<Tuple3<Integer, String, String>> tuples = Arrays.asList(
        new Tuple3<>(1, "one", "un"),
        new Tuple3<>(2, "two", "deux"),
        new Tuple3<>(3, "three", "trois"));
    Encoder<Tuple3<Integer, String, String>> encoder =
        Encoders.tuple(Encoders.INT(), Encoders.STRING(), Encoders.STRING());
    Dataset<Tuple3<Integer, String, String>> tupleDS = spark.createDataset(tuples, encoder);

    System.out.println("*** Tuple Dataset types");
    tupleDS.printSchema();

    // the tuple columns have unfriendly names, but you can use them to query
    System.out.println("*** filter by one column and fetch another");
    tupleDS.where(col("_1").gt(2)).select(col("_2"), col("_3")).show();

    spark.stop();
}