Java Code Examples for org.apache.spark.sql.SparkSession#createDataFrame()
The following examples show how to use
org.apache.spark.sql.SparkSession#createDataFrame() .
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: DatasetBalancerTest.java From mmtf-spark with Apache License 2.0 | 6 votes |
@Test public void test() { List<Row> rows = Arrays.asList( RowFactory.create("a", 1), RowFactory.create("a", 2), RowFactory.create("b", 1), RowFactory.create("b", 2), RowFactory.create("b", 3), RowFactory.create("c", 1), RowFactory.create("c", 2), RowFactory.create("c", 3), RowFactory.create("c", 4)); SparkSession spark = SparkSession.builder().master("local[1]").getOrCreate(); StructType schema = new StructType( new StructField[] { DataTypes.createStructField("key", DataTypes.StringType, false), DataTypes.createStructField("value", DataTypes.IntegerType, false) }); Dataset<Row> data = spark.createDataFrame(rows, schema); long seed = 19; Dataset<Row> balancedData = DatasetBalancer.downsample(data, "key", seed); assertTrue(balancedData.count() > 0); spark.close(); }
Example 2
Source File: GraphLoader.java From tutorials with MIT License | 6 votes |
public GraphFrame getGraphFrameUserRelationship() throws IOException { Path temp = Files.createTempDirectory("sparkGraphFrames"); SparkSession session = SparkSession.builder() .appName("SparkGraphFrameSample") .config("spark.sql.warehouse.dir", temp.toString()) .sparkContext(getSparkContext().sc()) .master("local[*]") .getOrCreate(); List<User> users = loadUsers(); Dataset<Row> userDataset = session.createDataFrame(users, User.class); List<Relationship> relationshipsList = getRelations(); Dataset<Row> relationshipDataset = session.createDataFrame(relationshipsList, Relationship.class); GraphFrame graphFrame = new GraphFrame(userDataset, relationshipDataset); return graphFrame; }
Example 3
Source File: Bundles.java From bunsen with Apache License 2.0 | 6 votes |
/** * Extracts the given resource type from the RDD of bundles and returns * it as a Dataset of that type, including any declared resources contained * to the parent resource. * * @param spark the spark session * @param bundles the RDD of FHIR Bundles * @param resourceTypeUrl the url of the resource * @param containedClassesUrls the list of urls of the resources contained to the parent resource * @return a dataset of the given resource */ public Dataset<Row> extractEntry(SparkSession spark, JavaRDD<BundleContainer> bundles, String resourceTypeUrl, List<String> containedClassesUrls) { FhirContext context = FhirContexts.contextFor(fhirVersion); SparkRowConverter converter = SparkRowConverter .forResource(context, resourceTypeUrl, containedClassesUrls); ToResourceRow resourceToRowConverter = new ToResourceRow(converter.getResourceType(), resourceTypeUrl, fhirVersion, converter, containedClassesUrls); JavaRDD<Row> resourceRdd = bundles.flatMap(resourceToRowConverter); return spark.createDataFrame(resourceRdd.rdd(), converter.getSchema()); }
Example 4
Source File: StructureAligner.java From mmtf-spark with Apache License 2.0 | 6 votes |
/** * Calculates all vs. all structural alignments of protein chains using the * specified alignment algorithm. The input structures must contain single * protein chains. * * @param targets structures containing single protein chains * @param alignmentAlgorithm name of the algorithm * @return dataset with alignment metrics */ public static Dataset<Row> getAllVsAllAlignments(JavaPairRDD<String, StructureDataInterface> targets, String alignmentAlgorithm) { SparkSession session = SparkSession.builder().getOrCreate(); JavaSparkContext sc = new JavaSparkContext(session.sparkContext()); // create a list of chainName/ C Alpha coordinates List<Tuple2<String, Point3d[]>> chains = targets.mapValues( s -> new ColumnarStructureX(s,true).getcAlphaCoordinates()).collect(); // create an RDD of all pair indices (0,1), (0,2), ..., (1,2), (1,3), ... JavaRDD<Tuple2<Integer, Integer>> pairs = getPairs(sc, chains.size()); // calculate structural alignments for all pairs. // broadcast (copy) chains to all worker nodes for efficient processing. // for each pair there can be zero or more solutions, therefore we flatmap the pairs. JavaRDD<Row> rows = pairs.flatMap(new StructuralAlignmentMapper(sc.broadcast(chains), alignmentAlgorithm)); // convert rows to a dataset return session.createDataFrame(rows, getSchema()); }
Example 5
Source File: JavaSQLTransformerExample.java From SparkDemo with MIT License | 6 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaSQLTransformerExample") .getOrCreate(); // $example on$ List<Row> data = Arrays.asList( RowFactory.create(0, 1.0, 3.0), RowFactory.create(2, 2.0, 5.0) ); StructType schema = new StructType(new StructField [] { new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()), new StructField("v2", DataTypes.DoubleType, false, Metadata.empty()) }); Dataset<Row> df = spark.createDataFrame(data, schema); SQLTransformer sqlTrans = new SQLTransformer().setStatement( "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"); sqlTrans.transform(df).show(); // $example off$ spark.stop(); }
Example 6
Source File: RDDConverterUtilsExt.java From systemds with Apache License 2.0 | 5 votes |
/** * Add element indices as new column to DataFrame * * @param df input data frame * @param sparkSession the Spark Session * @param nameOfCol name of index column * @return new data frame */ public static Dataset<Row> addIDToDataFrame(Dataset<Row> df, SparkSession sparkSession, String nameOfCol) { StructField[] oldSchema = df.schema().fields(); StructField[] newSchema = new StructField[oldSchema.length + 1]; for(int i = 0; i < oldSchema.length; i++) { newSchema[i] = oldSchema[i]; } newSchema[oldSchema.length] = DataTypes.createStructField(nameOfCol, DataTypes.DoubleType, false); // JavaRDD<Row> newRows = df.rdd().toJavaRDD().map(new AddRowID()); JavaRDD<Row> newRows = df.rdd().toJavaRDD().zipWithIndex().map(new AddRowID()); return sparkSession.createDataFrame(newRows, new StructType(newSchema)); }
Example 7
Source File: FromRowsAndSchema.java From learning-spark-with-java with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("DataFrame-FromRowsAndSchema") .master("local[4]") .getOrCreate(); List<Row> customerRows = Arrays.asList( RowFactory.create(1, "Widget Co", 120000.00, 0.00, "AZ"), RowFactory.create(2, "Acme Widgets", 410500.00, 500.00, "CA"), RowFactory.create(3, "Widgetry", 410500.00, 200.00, "CA"), RowFactory.create(4, "Widgets R Us", 410500.00, 0.0, "CA"), RowFactory.create(5, "Ye Olde Widgete", 500.00, 0.0, "MA") ); List<StructField> fields = Arrays.asList( DataTypes.createStructField("id", DataTypes.IntegerType, true), DataTypes.createStructField("name", DataTypes.StringType, true), DataTypes.createStructField("sales", DataTypes.DoubleType, true), DataTypes.createStructField("discount", DataTypes.DoubleType, true), DataTypes.createStructField("state", DataTypes.StringType, true) ); StructType customerSchema = DataTypes.createStructType(fields); Dataset<Row> customerDF = spark.createDataFrame(customerRows, customerSchema); System.out.println("*** the schema created"); customerDF.printSchema(); System.out.println("*** the data"); customerDF.show(); System.out.println("*** just the rows from CA"); customerDF.filter(col("state").equalTo("CA")).show(); spark.stop(); }
Example 8
Source File: QuaternaryStructureDataset.java From mmtf-spark with Apache License 2.0 | 5 votes |
/** * Returns a dataset with quaternary structure info * * @param structure * @return dataset quaternary structure info */ public static Dataset<Row> getDataset(JavaPairRDD<String, StructureDataInterface> structure) { JavaRDD<Row> rows = structure.flatMap(t -> getQuaternaryStructure(t)); StructType schema = new StructType(new StructField[]{ new StructField("structureId", DataTypes.StringType, false, Metadata.empty()), new StructField("bioAssemblyId", DataTypes.StringType, false, Metadata.empty()), new StructField("proteinStoichiometry", DataTypes.StringType, true, Metadata.empty()), new StructField("dnaStoichiometry", DataTypes.StringType, true, Metadata.empty()), new StructField("rnaStoichiometry", DataTypes.StringType, true, Metadata.empty()), }); SparkSession spark = SparkSession.builder().getOrCreate(); return spark.createDataFrame(rows, schema); }
Example 9
Source File: JavaCountVectorizerExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaCountVectorizerExample") .getOrCreate(); // $example on$ // Input data: Each row is a bag of words from a sentence or document. List<Row> data = Arrays.asList( RowFactory.create(Arrays.asList("a", "b", "c")), RowFactory.create(Arrays.asList("a", "b", "b", "c", "a")) ); StructType schema = new StructType(new StructField [] { new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty()) }); Dataset<Row> df = spark.createDataFrame(data, schema); // fit a CountVectorizerModel from the corpus CountVectorizerModel cvModel = new CountVectorizer() .setInputCol("text") .setOutputCol("feature") .setVocabSize(3) .setMinDF(2) .fit(df); // alternatively, define CountVectorizerModel with a-priori vocabulary CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"}) .setInputCol("text") .setOutputCol("feature"); cvModel.transform(df).show(false); // $example off$ spark.stop(); }
Example 10
Source File: GroupInteractionExtractor.java From mmtf-spark with Apache License 2.0 | 5 votes |
/** * Returns a dataset of interactions that satisfy the criteria of * the {@link InteractionFilter}. Each atom and its interacting neighbor atoms * are represented as a row in a Dataset. In addition, geometric features * of the interactions, such as distances, angles, and orientational order * parameters are returned in each row (see {@link edu.sdsc.mm.dev.utils.CoordinationGeometry}). * * @param structures a set of PDB structures * @return filter criteria for determining noncovalent interactions * @see edu.sdsc.mmtf.spark.interactions.InteractionFilter * @see edu.sdsc.mm.dev.utils.CoordinationGeometry */ public static Dataset<Row> getInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) { SparkSession spark = SparkSession.builder().getOrCreate(); @SuppressWarnings("resource") // sc cannot be closed here, it's still required elsewhere JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); // calculate interactions boolean pairwise = false; JavaRDD<Row> rows = structures.flatMap(new StructureToAtomInteractions(sc.broadcast(filter), pairwise)); // convert JavaRDD to Dataset return spark.createDataFrame(rows, AtomInteraction.getSchema(filter.getMaxInteractions())); }
Example 11
Source File: GroupInteractionExtractor.java From mmtf-spark with Apache License 2.0 | 5 votes |
/** * Returns a Dataset of pairwise interactions that satisfy the criteria of * the {@link InteractionFilter}. Each atom, its interacting neighbor atom, and * the interaction distance is represented as a row. * * @param structures a set of PDB structures * @return filter criteria for determining noncovalent interactions * @see edu.sdsc.mmtf.spark.interactions.InteractionFilter */ public static Dataset<Row> getPairInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) { SparkSession spark = SparkSession.builder().getOrCreate(); @SuppressWarnings("resource") // sc cannot be closed here, it's still required elsewhere JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); // calculate interactions boolean pairwise = true; JavaRDD<Row> rows = structures.flatMap(new StructureToAtomInteractions(sc.broadcast(filter), pairwise)); // convert JavaRDD to Dataset return spark.createDataFrame(rows, AtomInteraction.getPairInteractionSchema()); }
Example 12
Source File: TestSuite.java From stocator with Apache License 2.0 | 5 votes |
public void test10(SparkSession spark, String path) throws Exception { System.out.println("*********************************"); System.out.println("T10: Partition test - start"); try { List<Square> squares = new ArrayList<Square>(); for (int value = 1; value <= 10; value++) { Square square = new Square(); square.setValue(value); square.setSquare(value * value); squares.add(square); } Dataset<Row> squaresDF = spark.createDataFrame(squares, Square.class); squaresDF.write().mode("overwrite").partitionBy("value").parquet(path); Dataset<Row> readDF = spark.read().parquet(path); readDF.printSchema(); long dfCount = readDF.count(); long schemaLength = readDF.schema().length(); if (dfCount != 10) { throw new Exception("T10: failed. Read " + dfCount + ", expected 10"); } if (schemaLength != 2) { throw new Exception("T10: failed. Schema length" + schemaLength + ", expected 2"); } System.out.println("T10: Partition test - completed"); } catch (Exception e) { throw e; } finally { deleteData(path, spark.sparkContext().hadoopConfiguration(), dataCreate); } }
Example 13
Source File: SparkStreamingSqlAnalyse.java From sylph with Apache License 2.0 | 5 votes |
/** * 预编译sql 而不是等到运行时,才发现错误 * Precompiled sql instead of waiting for the runtime to find the error */ private static void checkDStream( SparkSession spark, String sourceTableName, StructType sourceSchema, List<Consumer<SparkSession>> handlers ) { RDD<Row> rdd = spark.sparkContext().<Row>emptyRDD(ClassTag$.MODULE$.<Row>apply(Row.class)); Dataset<Row> df = spark.createDataFrame(rdd, sourceSchema); df.createOrReplaceTempView(sourceTableName); handlers.forEach(x -> x.accept(spark)); spark.sql("drop view " + sourceTableName); }
Example 14
Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License | 5 votes |
private static void runParquetSchemaMergingExample(SparkSession spark) { // $example on:schema_merging$ List<Square> squares = new ArrayList<>(); for (int value = 1; value <= 5; value++) { Square square = new Square(); square.setValue(value); square.setSquare(value * value); squares.add(square); } // Create a simple DataFrame, store into a partition directory Dataset<Row> squaresDF = spark.createDataFrame(squares, Square.class); squaresDF.write().parquet("data/test_table/key=1"); List<Cube> cubes = new ArrayList<>(); for (int value = 6; value <= 10; value++) { Cube cube = new Cube(); cube.setValue(value); cube.setCube(value * value * value); cubes.add(cube); } // Create another DataFrame in a new partition directory, // adding a new column and dropping an existing column Dataset<Row> cubesDF = spark.createDataFrame(cubes, Cube.class); cubesDF.write().parquet("data/test_table/key=2"); // Read the partitioned table Dataset<Row> mergedDF = spark.read().option("mergeSchema", true).parquet("data/test_table"); mergedDF.printSchema(); // The final schema consists of all 3 columns in the Parquet files together // with the partitioning column appeared in the partition directory paths // root // |-- value: int (nullable = true) // |-- square: int (nullable = true) // |-- cube: int (nullable = true) // |-- key: int (nullable = true) // $example off:schema_merging$ }
Example 15
Source File: JavaMaxAbsScalerExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaMaxAbsScalerExample") .getOrCreate(); // $example on$ List<Row> data = Arrays.asList( RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)), RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)), RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0)) ); StructType schema = new StructType(new StructField[]{ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()) }); Dataset<Row> dataFrame = spark.createDataFrame(data, schema); MaxAbsScaler scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaledFeatures"); // Compute summary statistics and generate MaxAbsScalerModel MaxAbsScalerModel scalerModel = scaler.fit(dataFrame); // rescale each feature to range [-1, 1]. Dataset<Row> scaledData = scalerModel.transform(dataFrame); scaledData.select("features", "scaledFeatures").show(); // $example off$ spark.stop(); }
Example 16
Source File: JavaVectorSlicerExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaVectorSlicerExample") .getOrCreate(); // $example on$ Attribute[] attrs = new Attribute[]{ NumericAttribute.defaultAttr().withName("f1"), NumericAttribute.defaultAttr().withName("f2"), NumericAttribute.defaultAttr().withName("f3") }; AttributeGroup group = new AttributeGroup("userFeatures", attrs); List<Row> data = Lists.newArrayList( RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})), RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0)) ); Dataset<Row> dataset = spark.createDataFrame(data, (new StructType()).add(group.toStructField())); VectorSlicer vectorSlicer = new VectorSlicer() .setInputCol("userFeatures").setOutputCol("features"); vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"}); // or slicer.setIndices(new int[]{1, 2}), or slicer.setNames(new String[]{"f2", "f3"}) Dataset<Row> output = vectorSlicer.transform(dataset); output.show(false); // $example off$ spark.stop(); }
Example 17
Source File: CustomDataFrame.java From sparkResearch with Apache License 2.0 | 5 votes |
public static void main(String[] args) { SparkSession sparkSession = SparkSession.builder() .master("local") .appName("spark app") .getOrCreate(); //创建普通的JavaRDD JavaRDD<String> javaRDD = sparkSession.sparkContext().textFile("URL", 1).toJavaRDD(); //字符串编码的模式 String schema = "name age"; //根据模式的字符串生成模式 List<StructField> structFieldList = new ArrayList<>(); for (String fieldName : schema.split(" ")) { StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true); structFieldList.add(structField); } StructType structType = DataTypes.createStructType(structFieldList); JavaRDD<Row> rowJavaRDD = javaRDD.map(new Function<String, Row>() { @Override public Row call(String v1) { String[] attirbutes = v1.split(","); return RowFactory.create(attirbutes[0], attirbutes[1].trim()); } }); //将模式应用于RDD Dataset<Row> dataset = sparkSession.createDataFrame(rowJavaRDD, structType); //创建临时视图 dataset.createOrReplaceTempView("user"); Dataset<Row> result = sparkSession.sql("select * from user"); result.show(); }
Example 18
Source File: ProcessVendorTrasactions.java From aws-big-data-blog with Apache License 2.0 | 4 votes |
public static void run(String jobInputParam) throws Exception{ List<StructField> schemaFields = new ArrayList<StructField>(); schemaFields.add(DataTypes.createStructField("vendor_id", DataTypes.StringType, true)); schemaFields.add(DataTypes.createStructField("trans_amount", DataTypes.StringType, true)); schemaFields.add(DataTypes.createStructField("trans_type", DataTypes.StringType, true)); schemaFields.add(DataTypes.createStructField("item_id", DataTypes.StringType, true)); schemaFields.add(DataTypes.createStructField("trans_date", DataTypes.StringType, true)); StructType schema = DataTypes.createStructType(schemaFields); SparkConf conf = new SparkConf().setAppName("Spark Redshift No Access-Keys"); SparkSession spark = SparkSession.builder().config(conf).getOrCreate(); JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); String redshiftJDBCURL=props.getProperty("redshift.jdbc.url"); String s3TempPath = props.getProperty("s3.temp.path"); System.out.println("props"+props); JavaRDD<Row> salesRDD = sc.textFile(jobInputParam). map(new Function<String,Row>(){public Row call(String saleRec){ String[] fields = saleRec.split(","); return RowFactory.create(fields[0], fields[1],fields[2],fields[3],fields[4]);}}); Dataset<Row> salesDF = spark.createDataFrame(salesRDD,schema); Dataset<Row> vendorItemSaleAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("4")).groupBy(salesDF.col("vendor_id"),salesDF.col("item_id"),salesDF.col("trans_date")).agg(ImmutableMap.of("trans_amount", "sum")); Dataset<Row> vendorItemTaxAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("5")).groupBy(salesDF.col("vendor_id"),salesDF.col("item_id"),salesDF.col("trans_date")).agg(ImmutableMap.of("trans_amount", "sum")); Dataset<Row> vendorItemDiscountAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("6")).groupBy(salesDF.col("vendor_id"),salesDF.col("item_id"),salesDF.col("trans_date")).agg(ImmutableMap.of("trans_amount", "sum")); String[] joinColArray = {"vendor_id","item_id","trans_date"}; vendorItemSaleAmountDF.printSchema(); Seq<String> commonJoinColumns = scala.collection.JavaConversions.asScalaBuffer(Arrays.asList(joinColArray)).seq(); Dataset<Row> vendorAggregatedDF = vendorItemSaleAmountDF.join(vendorItemTaxAmountDF,commonJoinColumns,"left_outer") .join(vendorItemDiscountAmountDF,commonJoinColumns,"left_outer") .toDF("vendor_id","item_id","trans_date","sale_amount","tax_amount","discount_amount"); vendorAggregatedDF.printSchema(); DefaultAWSCredentialsProviderChain provider = new DefaultAWSCredentialsProviderChain(); AWSSessionCredentials creds = (AWSSessionCredentials) provider.getCredentials(); String appendix=new StringBuilder(String.valueOf(System.currentTimeMillis())).append("_").append(String.valueOf(new Random().nextInt(10)+1)).toString(); String vendorTransSummarySQL = new StringBuilder("begin transaction;delete from vendortranssummary using vendortranssummary_temp") .append(appendix) .append(" where vendortranssummary.vendor_id=vendortranssummary_temp") .append(appendix) .append(".vendor_id and vendortranssummary.item_id=vendortranssummary_temp") .append(appendix) .append(".item_id and vendortranssummary.trans_date = vendortranssummary_temp") .append(appendix) .append(".trans_date;") .append("insert into vendortranssummary select * from vendortranssummary_temp") .append(appendix) .append(";drop table vendortranssummary_temp") .append(appendix) .append(";end transaction;").toString(); vendorAggregatedDF.write().format("com.databricks.spark.redshift").option("url", redshiftJDBCURL) .option("dbtable", "vendortranssummary_temp"+appendix) .option("usestagingtable","false") .option("postactions",vendorTransSummarySQL) .option("temporary_aws_access_key_id", creds.getAWSAccessKeyId()) .option("temporary_aws_secret_access_key",creds.getAWSSecretKey()) .option("temporary_aws_session_token", creds.getSessionToken()) .option("tempdir", s3TempPath).mode(SaveMode.Overwrite).save(); }
Example 19
Source File: DatasetOperations.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
public static void main(String[] args) throws AnalysisException { //Window Specific property if Hadoop is not instaalled or HADOOP_HOME is not set System.setProperty("hadoop.home.dir", "E:\\hadoop"); //Build a Spark Session SparkSession sparkSession = SparkSession .builder() .master("local") .config("spark.sql.warehouse.dir","file:///E:/hadoop/warehouse") .appName("DatasetOperations") //.enableHiveSupport() .getOrCreate(); Logger rootLogger = LogManager.getRootLogger(); rootLogger.setLevel(Level.WARN); //Create a RDD JavaRDD<String> deptRDD = sparkSession.sparkContext() .textFile("src/main/resources/dept.txt", 1) .toJavaRDD(); //Convert the RDD to RDD<Rows> JavaRDD<Row> deptRows = deptRDD.filter(str-> !str.contains("deptno")).map(new Function<String, Row>() { private static final long serialVersionUID = 1L; @Override public Row call(String rowString) throws Exception { String[] cols = rowString.split(","); return RowFactory.create(cols[0].trim(), cols[1].trim(),cols[2].trim()); } }); //Create schema String[] schemaArr=deptRDD.first().split(","); List<StructField> structFieldList = new ArrayList<>(); for (String fieldName : schemaArr) { StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true); structFieldList.add(structField); } StructType schema = DataTypes.createStructType(structFieldList); Dataset<Row> deptDf = sparkSession.createDataFrame(deptRows, schema); deptDf.printSchema(); deptDf.show(); deptDf.createOrReplaceTempView("dept"); Dataset<Row> result = sparkSession.sql("select loc,count(loc) from dept where deptno > 10 group by loc" ); result.show(); // sparkSession.newSession().sql("SELECT * FROM dept").show(); deptDf.createGlobalTempView("dept_global_view"); sparkSession.newSession().sql("SELECT deptno,dname,loc, rank() OVER (PARTITION BY loc ORDER BY deptno ) FROM global_temp.dept_global_view").show(); // sparkSession.newSession().sql("SELECT * FROM dept_global_view").show(); deptDf.write().mode(SaveMode.Overwrite).json("src/main/resources/output/dept"); deptDf.write().mode(SaveMode.Overwrite).format("csv").save("src/main/resources/output/deptText"); deptDf.write().mode("overwrite").format("csv").save("src/main/resources/output/deptText"); deptDf.write().mode(SaveMode.Overwrite).format("csv").saveAsTable("Department"); deptDf.write().mode(SaveMode.Overwrite).format("csv").option("path", "file:///E:/hadoop/bin").saveAsTable("Department"); // Read the CSV data Dataset<Row> emp_ds = sparkSession.read() .format("csv") .option("header", "true") .option("inferSchema", "true") .load("src/main/resources/employee.txt"); emp_ds.printSchema(); emp_ds.show(); emp_ds.select("empName" ,"empId").show(); emp_ds.select(col("empName").name("Employee Name") ,col("empId").cast(DataTypes.IntegerType).name("Employee Id")).show(); emp_ds.sort(col("empId").asc()).filter(col("salary").gt("2500")); emp_ds.select("job").groupBy(col("job")).count().show(); //emp_ds.as("A").join(deptDf.as("B"),col("deptno"),"left").printSchema(); emp_ds.as("A").join(deptDf.as("B"),emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"left").select("A.empId","A.empName","A.job","A.manager","A.hiredate","A.salary","A.comm","A.deptno","B.dname","B.loc").show(); emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").show(); emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").logicalPlan(); emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").explain(); sparkSession.sql("show functions").show(false); sparkSession.sql("DESCRIBE FUNCTION add_months").show(false); sparkSession.sql("DESCRIBE FUNCTION EXTENDED add_months").show(false); }
Example 20
Source File: InteractionFingerprinter.java From mmtf-spark with Apache License 2.0 | 4 votes |
/** * Returns a dataset of ligand - macromolecule interaction information. * * Criteria to select interactions are specified by the * {@link InteractionFilter} * * <p>The dataset contains the following columns: * <pre> * structureChainId - pdbId.chainName for which the interaction data are listed * queryChainId - name of chain that interacts with target chain * targetChainId - name of chain for which the interaction data are listed * groupNumbers - array of residue numbers of interacting groups including insertion code (e.g., 101A) * sequenceIndices - array of zero-based index of interaction groups (residues) mapped onto target sequence * sequence - target polymer sequence * </pre> * * @param structures a set of PDB structures * @param filter interaction criteria * @return dataset with interacting residue information */ public static Dataset<Row> getPolymerInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) { // find all interactions JavaRDD<Row> rows = structures.flatMap(new PolymerInteractionFingerprint(filter)); // convert RDD to a Dataset with the following columns boolean nullable = false; StructField[] fields = { DataTypes.createStructField("structureChainId", DataTypes.StringType, nullable), DataTypes.createStructField("queryChainId", DataTypes.StringType, nullable), DataTypes.createStructField("targetChainId", DataTypes.StringType, nullable), DataTypes.createStructField("groupNumbers", DataTypes.createArrayType(DataTypes.StringType), nullable), DataTypes.createStructField("sequenceIndices", DataTypes.createArrayType(DataTypes.IntegerType), nullable), DataTypes.createStructField("sequence", DataTypes.StringType, nullable) }; SparkSession spark = SparkSession.builder().getOrCreate(); return spark.createDataFrame(rows, new StructType(fields)); }