org.apache.spark.sql.SparkSession#createDataFrame

Source File: DatasetBalancerTest.java From mmtf-spark with Apache License 2.0

6 votes

/**
 * Checks that {@code DatasetBalancer.downsample} returns a non-empty dataset
 * for an input whose keys are unevenly represented (2 x "a", 3 x "b", 4 x "c").
 */
@Test
public void test() {
	List<Row> rows = Arrays.asList(
			RowFactory.create("a", 1), RowFactory.create("a", 2), 
			RowFactory.create("b", 1), RowFactory.create("b", 2), RowFactory.create("b", 3), 
			RowFactory.create("c", 1), RowFactory.create("c", 2), RowFactory.create("c", 3), RowFactory.create("c", 4));

	StructType schema = new StructType(
			new StructField[] { DataTypes.createStructField("key", DataTypes.StringType, false),
					DataTypes.createStructField("value", DataTypes.IntegerType, false) });

	SparkSession spark = SparkSession.builder().master("local[1]").getOrCreate();
	try {
		Dataset<Row> data = spark.createDataFrame(rows, schema);

		long seed = 19;
		Dataset<Row> balancedData = DatasetBalancer.downsample(data, "key", seed);
		assertTrue(balancedData.count() > 0);
	} finally {
		// close the local session even when the job or the assertion fails,
		// so a failing test does not leak the session into later tests
		spark.close();
	}
}

Source File: GraphLoader.java From tutorials with MIT License

6 votes

/**
 * Builds a {@link GraphFrame} that uses the loaded users as vertices and the
 * relationships between them as edges.
 *
 * @return graph frame created from the user and relationship datasets
 * @throws IOException e.g. when the temporary warehouse directory cannot be created
 */
public GraphFrame getGraphFrameUserRelationship() throws IOException {
    // the Spark SQL warehouse is pointed at a fresh temporary directory
    String warehouseDir = Files.createTempDirectory("sparkGraphFrames").toString();

    SparkSession session = SparkSession.builder()
        .appName("SparkGraphFrameSample")
        .config("spark.sql.warehouse.dir", warehouseDir)
        .sparkContext(getSparkContext().sc())
        .master("local[*]")
        .getOrCreate();

    // vertices: one row per User bean
    Dataset<Row> vertices = session.createDataFrame(loadUsers(), User.class);

    // edges: one row per Relationship bean
    Dataset<Row> edges = session.createDataFrame(getRelations(), Relationship.class);

    return new GraphFrame(vertices, edges);
}

Source File: Bundles.java From bunsen with Apache License 2.0

6 votes

/**
 * Extracts the resources of the given type from the RDD of bundles and
 * returns them as a dataset of rows, including any declared resources
 * contained to the parent resource.
 *
 * @param spark the spark session
 * @param bundles the RDD of FHIR Bundles
 * @param resourceTypeUrl the url of the resource
 * @param containedClassesUrls the list of urls of the resources contained to the parent resource
 * @return a dataset of the given resource, using the schema supplied by the row converter
 */
public Dataset<Row> extractEntry(SparkSession spark, JavaRDD<BundleContainer> bundles,
    String resourceTypeUrl, List<String> containedClassesUrls) {

  FhirContext fhirContext = FhirContexts.contextFor(fhirVersion);

  // the converter provides both the row encoding and the schema of the result
  SparkRowConverter rowConverter =
      SparkRowConverter.forResource(fhirContext, resourceTypeUrl, containedClassesUrls);

  ToResourceRow toRow = new ToResourceRow(
      rowConverter.getResourceType(),
      resourceTypeUrl,
      fhirVersion,
      rowConverter,
      containedClassesUrls);

  // a bundle may yield any number of rows, hence flatMap
  return spark.createDataFrame(bundles.flatMap(toRow).rdd(), rowConverter.getSchema());
}

Source File: StructureAligner.java From mmtf-spark with Apache License 2.0

6 votes

/**
 * Computes all vs. all structural alignments of protein chains with the
 * specified alignment algorithm. The input structures must contain single
 * protein chains.
 *
 * @param targets structures containing single protein chains
 * @param alignmentAlgorithm name of the algorithm
 * @return dataset with alignment metrics
 */
public static Dataset<Row> getAllVsAllAlignments(JavaPairRDD<String, StructureDataInterface> targets,
		String alignmentAlgorithm) {

	SparkSession spark = SparkSession.builder().getOrCreate();
	JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

	// collect (chain name, C-alpha coordinates) for every target on the driver
	List<Tuple2<String, Point3d[]>> chainCoordinates = targets
			.mapValues(structure -> new ColumnarStructureX(structure, true).getcAlphaCoordinates())
			.collect();

	// RDD of all pair indices (0,1), (0,2), ..., (1,2), (1,3), ...
	JavaRDD<Tuple2<Integer, Integer>> indexPairs = getPairs(jsc, chainCoordinates.size());

	// The chain list is broadcast to the workers so each pair can be aligned there.
	// A pair can produce zero or more alignment solutions, hence flatMap.
	JavaRDD<Row> alignmentRows = indexPairs.flatMap(
			new StructuralAlignmentMapper(jsc.broadcast(chainCoordinates), alignmentAlgorithm));

	return spark.createDataFrame(alignmentRows, getSchema());
}

Source File: JavaSQLTransformerExample.java From SparkDemo with MIT License

6 votes

/**
 * Example: applies a {@code SQLTransformer} that derives two columns
 * (v1 + v2 and v1 * v2) from a small in-memory DataFrame and prints the result.
 *
 * @param args unused
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession.builder().appName("JavaSQLTransformerExample").getOrCreate();

  // $example on$
  StructField[] columns = {
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("v2", DataTypes.DoubleType, false, Metadata.empty())
  };
  StructType schema = new StructType(columns);

  List<Row> data = Arrays.asList(
    RowFactory.create(0, 1.0, 3.0),
    RowFactory.create(2, 2.0, 5.0)
  );
  Dataset<Row> df = spark.createDataFrame(data, schema);

  // __THIS__ stands for the input dataset inside the statement
  SQLTransformer sqlTrans = new SQLTransformer();
  sqlTrans.setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__");

  Dataset<Row> transformed = sqlTrans.transform(df);
  transformed.show();
  // $example off$

  spark.stop();
}

Source File: RDDConverterUtilsExt.java From systemds with Apache License 2.0

5 votes

/**
 * Add element indices as new column to DataFrame.
 * The new column is appended after the existing columns and is declared as a
 * non-nullable double; its values are produced by {@code AddRowID} from the
 * result of {@code zipWithIndex()}.
 *
 * @param df input data frame
 * @param sparkSession the Spark Session
 * @param nameOfCol name of index column
 * @return new data frame
 */
public static Dataset<Row> addIDToDataFrame(Dataset<Row> df, SparkSession sparkSession, String nameOfCol) {
	StructField[] oldSchema = df.schema().fields();
	StructField[] newSchema = new StructField[oldSchema.length + 1];
	// keep the existing columns in place and append the index column at the end
	System.arraycopy(oldSchema, 0, newSchema, 0, oldSchema.length);
	newSchema[oldSchema.length] = DataTypes.createStructField(nameOfCol, DataTypes.DoubleType, false);
	JavaRDD<Row> newRows = df.rdd().toJavaRDD().zipWithIndex().map(new AddRowID());
	return sparkSession.createDataFrame(newRows, new StructType(newSchema));
}

Source File: FromRowsAndSchema.java From learning-spark-with-java with MIT License

5 votes

/**
 * Example: builds a DataFrame from a list of rows plus an explicit schema,
 * then prints the schema, the data, and the rows whose state is "CA".
 *
 * @param args unused
 */
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("DataFrame-FromRowsAndSchema")
        .master("local[4]")
        .getOrCreate();

    // customer records: id, name, sales, discount, state
    List<Row> customerRows = Arrays.asList(
        RowFactory.create(1, "Widget Co", 120000.00, 0.00, "AZ"),
        RowFactory.create(2, "Acme Widgets", 410500.00, 500.00, "CA"),
        RowFactory.create(3, "Widgetry", 410500.00, 200.00, "CA"),
        RowFactory.create(4, "Widgets R Us", 410500.00, 0.0, "CA"),
        RowFactory.create(5, "Ye Olde Widgete", 500.00, 0.0, "MA")
    );

    // all columns are declared nullable
    StructType customerSchema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("id", DataTypes.IntegerType, true),
        DataTypes.createStructField("name", DataTypes.StringType, true),
        DataTypes.createStructField("sales", DataTypes.DoubleType, true),
        DataTypes.createStructField("discount", DataTypes.DoubleType, true),
        DataTypes.createStructField("state", DataTypes.StringType, true)
    });

    Dataset<Row> customerDF = spark.createDataFrame(customerRows, customerSchema);

    System.out.println("*** the schema created");
    customerDF.printSchema();

    System.out.println("*** the data");
    customerDF.show();

    System.out.println("*** just the rows from CA");
    Dataset<Row> californiaOnly = customerDF.filter(col("state").equalTo("CA"));
    californiaOnly.show();

    spark.stop();
}

Source File: QuaternaryStructureDataset.java From mmtf-spark with Apache License 2.0

5 votes

/**
 * Returns a dataset with quaternary structure info. Each row has the columns
 * structureId, bioAssemblyId, proteinStoichiometry, dnaStoichiometry and
 * rnaStoichiometry (all strings; only the two ids are non-nullable).
 *
 * @param structure structures to extract the quaternary structure info from
 * @return dataset with quaternary structure info
 */
  public static Dataset<Row> getDataset(JavaPairRDD<String, StructureDataInterface> structure) {
      // one structure can contribute any number of rows
      JavaRDD<Row> quaternaryRows = structure.flatMap(entry -> getQuaternaryStructure(entry));

      StructField[] columns = {
              new StructField("structureId", DataTypes.StringType, false, Metadata.empty()),
              new StructField("bioAssemblyId", DataTypes.StringType, false, Metadata.empty()),
              new StructField("proteinStoichiometry", DataTypes.StringType, true, Metadata.empty()),
              new StructField("dnaStoichiometry", DataTypes.StringType, true, Metadata.empty()),
              new StructField("rnaStoichiometry", DataTypes.StringType, true, Metadata.empty()),
      };

      return SparkSession.builder().getOrCreate()
              .createDataFrame(quaternaryRows, new StructType(columns));
  }

Source File: JavaCountVectorizerExample.java From SparkDemo with MIT License

5 votes

/**
 * Example: fits a {@code CountVectorizerModel} on a tiny corpus and prints the
 * transformed DataFrame; also shows how to build a model from a fixed vocabulary.
 *
 * @param args unused
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession.builder().appName("JavaCountVectorizerExample").getOrCreate();

  // $example on$
  // Single non-nullable "text" column holding an array of strings.
  StructField textField =
    new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty());
  StructType schema = new StructType(new StructField [] { textField });

  // Input data: Each row is a bag of words from a sentence or document.
  List<Row> data = Arrays.asList(
    RowFactory.create(Arrays.asList("a", "b", "c")),
    RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
  );
  Dataset<Row> df = spark.createDataFrame(data, schema);

  // fit a CountVectorizerModel from the corpus
  CountVectorizer vectorizer = new CountVectorizer()
    .setInputCol("text")
    .setOutputCol("feature")
    .setVocabSize(3)
    .setMinDF(2);
  CountVectorizerModel cvModel = vectorizer.fit(df);

  // alternatively, define CountVectorizerModel with a-priori vocabulary
  // (shown for illustration only; this model is not used below)
  CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
    .setInputCol("text")
    .setOutputCol("feature");

  Dataset<Row> vectorized = cvModel.transform(df);
  vectorized.show(false);
  // $example off$

  spark.stop();
}

Source File: GroupInteractionExtractor.java From mmtf-spark with Apache License 2.0

5 votes

/**
 * Returns a dataset of interactions that satisfy the criteria of
 * the {@link InteractionFilter}. Each atom and its interacting neighbor atoms
 * are represented as a row in a Dataset. In addition, geometric features
 * of the interactions, such as distances, angles, and orientational order
 * parameters are returned in each row (see {@link edu.sdsc.mm.dev.utils.CoordinationGeometry}).
 *
 * @param structures a set of PDB structures
 * @param filter filter criteria for determining noncovalent interactions
 * @return dataset of interactions; the schema depends on the filter's maximum number of interactions
 * @see edu.sdsc.mmtf.spark.interactions.InteractionFilter
 * @see edu.sdsc.mm.dev.utils.CoordinationGeometry
 */
public static Dataset<Row> getInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) {
	SparkSession session = SparkSession.builder().getOrCreate();
	@SuppressWarnings("resource") // the context cannot be closed here, it's still required elsewhere
	JavaSparkContext context = new JavaSparkContext(session.sparkContext());

	// the filter is broadcast to the workers; "false" selects the non-pairwise mode
	JavaRDD<Row> interactionRows =
			structures.flatMap(new StructureToAtomInteractions(context.broadcast(filter), false));

	// convert JavaRDD to Dataset
	return session.createDataFrame(interactionRows, AtomInteraction.getSchema(filter.getMaxInteractions()));
}

Source File: GroupInteractionExtractor.java From mmtf-spark with Apache License 2.0

5 votes

/**
 * Returns a Dataset of pairwise interactions that satisfy the criteria of
 * the {@link InteractionFilter}. Each atom, its interacting neighbor atom, and
 * the interaction distance is represented as a row.
 *
 * @param structures a set of PDB structures
 * @param filter filter criteria for determining noncovalent interactions
 * @return dataset of pairwise interactions
 * @see edu.sdsc.mmtf.spark.interactions.InteractionFilter
 */
public static Dataset<Row> getPairInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) {
	SparkSession session = SparkSession.builder().getOrCreate();
	@SuppressWarnings("resource") // the context cannot be closed here, it's still required elsewhere
	JavaSparkContext context = new JavaSparkContext(session.sparkContext());

	// the filter is broadcast to the workers; "true" selects the pairwise mode
	JavaRDD<Row> pairRows =
			structures.flatMap(new StructureToAtomInteractions(context.broadcast(filter), true));

	// convert JavaRDD to Dataset
	return session.createDataFrame(pairRows, AtomInteraction.getPairInteractionSchema());
}

Source File: TestSuite.java From stocator with Apache License 2.0

5 votes

/**
 * T10: writes ten {@code Square} beans as Parquet partitioned by "value",
 * reads them back and checks the row count (10) and the schema length (2).
 * The data under {@code path} is removed afterwards, whether or not the
 * checks pass.
 *
 * @param spark the Spark session used for writing and reading
 * @param path location the Parquet data is written to and read from
 * @throws Exception if a check fails or the write/read itself fails
 */
public void test10(SparkSession spark, String path) throws Exception {
  System.out.println("*********************************");
  System.out.println("T10: Partition test - start");
  try {
    List<Square> squares = new ArrayList<Square>();
    for (int value = 1; value <= 10; value++) {
      Square square = new Square();
      square.setValue(value);
      square.setSquare(value * value);
      squares.add(square);
    }
    Dataset<Row> squaresDF = spark.createDataFrame(squares, Square.class);
    squaresDF.write().mode("overwrite").partitionBy("value").parquet(path);

    Dataset<Row> readDF = spark.read().parquet(path);
    readDF.printSchema();
    long dfCount = readDF.count();
    long schemaLength = readDF.schema().length();
    if (dfCount != 10) {
      throw new Exception("T10: failed. Read " + dfCount + ", expected 10");
    }
    if (schemaLength != 2) {
      throw new Exception("T10: failed. Schema length " + schemaLength + ", expected 2");
    }
    System.out.println("T10: Partition test - completed");
  } finally {
    // exceptions propagate unchanged; the cleanup runs in either case
    deleteData(path, spark.sparkContext().hadoopConfiguration(), dataCreate);
  }

}

Source File: SparkStreamingSqlAnalyse.java From sylph with Apache License 2.0

5 votes

/**
 * Precompiles the SQL instead of waiting for the runtime to find the error.
 *
 * <p>An empty DataFrame with the source schema is registered as a temporary
 * view under the source table name, every handler is run against the session,
 * and the view is dropped again afterwards, including when a handler throws.
 *
 * @param spark the Spark session the handlers are run against
 * @param sourceTableName name under which the empty source view is registered
 * @param sourceSchema schema of the source table
 * @param handlers actions to run against the session while the view exists
 */
private static void checkDStream(
        SparkSession spark,
        String sourceTableName,
        StructType sourceSchema,
        List<Consumer<SparkSession>> handlers
)
{
    RDD<Row> rdd = spark.sparkContext().<Row>emptyRDD(ClassTag$.MODULE$.<Row>apply(Row.class));
    Dataset<Row> df = spark.createDataFrame(rdd, sourceSchema);
    df.createOrReplaceTempView(sourceTableName);
    try {
        handlers.forEach(x -> x.accept(spark));
    }
    finally {
        // a failing handler is the expected way to report a bad statement;
        // do not leave the placeholder view behind in that case
        spark.sql("drop view " + sourceTableName);
    }
}

Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License

5 votes

/**
 * Example of Parquet schema merging: writes two DataFrames with different
 * columns into two partition directories of the same table, reads the table
 * back with the "mergeSchema" option and prints the merged schema.
 *
 * @param spark the Spark session to use
 */
private static void runParquetSchemaMergingExample(SparkSession spark) {
  // $example on:schema_merging$
  List<Square> squares = new ArrayList<>();
  for (int n = 1; n < 6; n++) {
    Square sq = new Square();
    sq.setValue(n);
    sq.setSquare(n * n);
    squares.add(sq);
  }

  // Create a simple DataFrame, store into a partition directory
  spark.createDataFrame(squares, Square.class)
    .write()
    .parquet("data/test_table/key=1");

  List<Cube> cubes = new ArrayList<>();
  for (int n = 6; n < 11; n++) {
    Cube cb = new Cube();
    cb.setValue(n);
    cb.setCube(n * n * n);
    cubes.add(cb);
  }

  // Create another DataFrame in a new partition directory,
  // adding a new column and dropping an existing column
  spark.createDataFrame(cubes, Cube.class)
    .write()
    .parquet("data/test_table/key=2");

  // Read the partitioned table
  Dataset<Row> mergedDF = spark.read()
    .option("mergeSchema", true)
    .parquet("data/test_table");
  mergedDF.printSchema();

  // The final schema consists of all 3 columns in the Parquet files together
  // with the partitioning column appeared in the partition directory paths
  // root
  //  |-- value: int (nullable = true)
  //  |-- square: int (nullable = true)
  //  |-- cube: int (nullable = true)
  //  |-- key: int (nullable = true)
  // $example off:schema_merging$
}

Source File: JavaMaxAbsScalerExample.java From SparkDemo with MIT License

5 votes

/**
 * Example: fits a {@code MaxAbsScaler} on a small DataFrame of dense vectors
 * and prints the original and the rescaled feature columns.
 *
 * @param args unused
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession.builder().appName("JavaMaxAbsScalerExample").getOrCreate();

  // $example on$
  StructField[] columns = {
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty())
  };
  StructType schema = new StructType(columns);

  List<Row> data = Arrays.asList(
      RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
      RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
      RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
  );
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  MaxAbsScaler scaler = new MaxAbsScaler();
  scaler.setInputCol("features");
  scaler.setOutputCol("scaledFeatures");

  // Compute summary statistics and generate MaxAbsScalerModel
  MaxAbsScalerModel scalerModel = scaler.fit(dataFrame);

  // rescale each feature to range [-1, 1].
  scalerModel.transform(dataFrame)
    .select("features", "scaledFeatures")
    .show();
  // $example off$

  spark.stop();
}

Source File: JavaVectorSlicerExample.java From SparkDemo with MIT License

5 votes

/**
 * Example: selects features from a vector column with a {@code VectorSlicer},
 * once by index and once by attribute name, and prints the result.
 *
 * @param args unused
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession.builder().appName("JavaVectorSlicerExample").getOrCreate();

  // $example on$
  // attribute names f1..f3 make the features addressable by name
  AttributeGroup group = new AttributeGroup("userFeatures", new Attribute[]{
    NumericAttribute.defaultAttr().withName("f1"),
    NumericAttribute.defaultAttr().withName("f2"),
    NumericAttribute.defaultAttr().withName("f3")
  });

  List<Row> data = Lists.newArrayList(
    RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
    RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
  );

  StructType schema = (new StructType()).add(group.toStructField());
  Dataset<Row> dataset = spark.createDataFrame(data, schema);

  VectorSlicer vectorSlicer = new VectorSlicer();
  vectorSlicer.setInputCol("userFeatures");
  vectorSlicer.setOutputCol("features");

  vectorSlicer.setIndices(new int[]{1});
  vectorSlicer.setNames(new String[]{"f3"});
  // or vectorSlicer.setIndices(new int[]{1, 2}), or vectorSlicer.setNames(new String[]{"f2", "f3"})

  Dataset<Row> output = vectorSlicer.transform(dataset);
  output.show(false);
  // $example off$

  spark.stop();
}

Source File: CustomDataFrame.java From sparkResearch with Apache License 2.0

5 votes

/**
 * Example: reads comma-separated text lines, applies a schema that is
 * described by a string ("name age", both columns as strings), registers the
 * result as the temporary view "user" and prints its content.
 *
 * @param args unused
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("spark app")
            .getOrCreate();

    try {
        // Create a plain JavaRDD of text lines
        // NOTE(review): "URL" looks like a placeholder for the real input path - confirm
        JavaRDD<String> javaRDD = sparkSession.sparkContext().textFile("URL", 1).toJavaRDD();
        // The schema, encoded as a string
        String schema = "name age";

        // Generate the schema from the schema string
        List<StructField> structFieldList = new ArrayList<>();
        for (String fieldName : schema.split(" ")) {
            StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
            structFieldList.add(structField);
        }
        StructType structType = DataTypes.createStructType(structFieldList);

        // Each line is expected to hold at least two comma-separated values
        JavaRDD<Row> rowJavaRDD = javaRDD.map(new Function<String, Row>() {
            @Override
            public Row call(String v1) {
                String[] attributes = v1.split(",");
                return RowFactory.create(attributes[0], attributes[1].trim());
            }
        });

        // Apply the schema to the RDD
        Dataset<Row> dataset = sparkSession.createDataFrame(rowJavaRDD, structType);

        // Create a temporary view
        dataset.createOrReplaceTempView("user");
        Dataset<Row> result = sparkSession.sql("select * from user");
        result.show();
    } finally {
        // release the session whether or not the job succeeded
        sparkSession.stop();
    }
}

Source File: ProcessVendorTrasactions.java From aws-big-data-blog with Apache License 2.0

4 votes

/**
 * Aggregates vendor transactions read from a comma-separated text input and
 * writes the per-vendor/item/date summary to Redshift.
 *
 * <p>Steps: read the input as rows of five string columns, sum
 * {@code trans_amount} separately for transaction types "4", "5" and "6",
 * left-join the three aggregates on (vendor_id, item_id, trans_date), write
 * the result to a uniquely named temporary Redshift table and let the
 * "postactions" SQL merge that table into {@code vendortranssummary}.
 *
 * <p>The Redshift JDBC URL and the S3 temp path are read from {@code props},
 * which is defined outside this method.
 *
 * @param jobInputParam path of the text input passed to {@code textFile}
 * @throws Exception declared by the method; the visible code does not catch anything
 */
public static void run(String jobInputParam) throws Exception{
	
   	// input schema: every column is read as a nullable string
   	List<StructField> schemaFields = new ArrayList<StructField>();
   	schemaFields.add(DataTypes.createStructField("vendor_id", DataTypes.StringType, true));
   	schemaFields.add(DataTypes.createStructField("trans_amount", DataTypes.StringType, true));
   	schemaFields.add(DataTypes.createStructField("trans_type", DataTypes.StringType, true));
   	schemaFields.add(DataTypes.createStructField("item_id", DataTypes.StringType, true));
   	schemaFields.add(DataTypes.createStructField("trans_date", DataTypes.StringType, true));
   	StructType schema = DataTypes.createStructType(schemaFields);

   	SparkConf conf = new SparkConf().setAppName("Spark Redshift No Access-Keys");
   	SparkSession spark = SparkSession.builder().config(conf).getOrCreate();	
	JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
			
	String redshiftJDBCURL=props.getProperty("redshift.jdbc.url");
	String s3TempPath = props.getProperty("s3.temp.path");
	System.out.println("props"+props);
	
	// one Row per input line; a line with fewer than five comma-separated
	// fields makes the mapper fail with an ArrayIndexOutOfBoundsException
	JavaRDD<Row> salesRDD = sc.textFile(jobInputParam).
			map(new Function<String,Row>(){public Row call(String saleRec){ String[] fields = saleRec.split(",");
		      return RowFactory.create(fields[0], fields[1],fields[2],fields[3],fields[4]);}});
	Dataset<Row> salesDF = spark.createDataFrame(salesRDD,schema);
	// sum of trans_amount per (vendor_id, item_id, trans_date) for each transaction type;
	// judging by the variable names, type "4" is a sale, "5" a tax and "6" a discount
	Dataset<Row> vendorItemSaleAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("4")).groupBy(salesDF.col("vendor_id"),salesDF.col("item_id"),salesDF.col("trans_date")).agg(ImmutableMap.of("trans_amount", "sum"));
	Dataset<Row> vendorItemTaxAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("5")).groupBy(salesDF.col("vendor_id"),salesDF.col("item_id"),salesDF.col("trans_date")).agg(ImmutableMap.of("trans_amount", "sum"));
	Dataset<Row> vendorItemDiscountAmountDF = salesDF.filter(salesDF.col("trans_type").equalTo("6")).groupBy(salesDF.col("vendor_id"),salesDF.col("item_id"),salesDF.col("trans_date")).agg(ImmutableMap.of("trans_amount", "sum"));
	String[] joinColArray = {"vendor_id","item_id","trans_date"};
	vendorItemSaleAmountDF.printSchema();
	// the join API used below takes the join columns as a Scala Seq
	Seq<String> commonJoinColumns = scala.collection.JavaConversions.asScalaBuffer(Arrays.asList(joinColArray)).seq();

	// left outer joins keep every sale aggregate even without matching tax/discount rows;
	// toDF renames the three sum columns
	Dataset<Row> vendorAggregatedDF = vendorItemSaleAmountDF.join(vendorItemTaxAmountDF,commonJoinColumns,"left_outer")
							 .join(vendorItemDiscountAmountDF,commonJoinColumns,"left_outer")
							 .toDF("vendor_id","item_id","trans_date","sale_amount","tax_amount","discount_amount");
	
	vendorAggregatedDF.printSchema();
	DefaultAWSCredentialsProviderChain provider = new DefaultAWSCredentialsProviderChain();
	// NOTE(review): this cast fails with a ClassCastException if the provider chain
	// returns credentials that are not session credentials - confirm the deployment
	// always runs with temporary credentials
	AWSSessionCredentials creds  = (AWSSessionCredentials) provider.getCredentials();
	
	// suffix for the temporary table name: current time in millis, "_", random number 1..10
	String appendix=new StringBuilder(String.valueOf(System.currentTimeMillis())).append("_").append(String.valueOf(new Random().nextInt(10)+1)).toString();
	// SQL passed as "postactions": inside one transaction, delete the summary rows
	// that match the temporary table on (vendor_id, item_id, trans_date), insert all
	// rows of the temporary table, then drop the temporary table
	String vendorTransSummarySQL = new StringBuilder("begin transaction;delete from vendortranssummary using vendortranssummary_temp")
			 .append(appendix)
			 .append(" where vendortranssummary.vendor_id=vendortranssummary_temp")
			 .append(appendix)
			 .append(".vendor_id and vendortranssummary.item_id=vendortranssummary_temp")
			 .append(appendix)
			 .append(".item_id and vendortranssummary.trans_date = vendortranssummary_temp")
			 .append(appendix)
			 .append(".trans_date;")
			 .append("insert into vendortranssummary select * from vendortranssummary_temp")
			 .append(appendix)
			 .append(";drop table vendortranssummary_temp")
			 .append(appendix)
			 .append(";end transaction;").toString();
	// write the aggregate to the temporary table, passing the credentials obtained above
	vendorAggregatedDF.write().format("com.databricks.spark.redshift").option("url", redshiftJDBCURL)
    .option("dbtable", "vendortranssummary_temp"+appendix)
    .option("usestagingtable","false")
    .option("postactions",vendorTransSummarySQL)
    .option("temporary_aws_access_key_id", creds.getAWSAccessKeyId())
    .option("temporary_aws_secret_access_key",creds.getAWSSecretKey())
    .option("temporary_aws_session_token", creds.getSessionToken())
    .option("tempdir", s3TempPath).mode(SaveMode.Overwrite).save();
		
}

Source File: DatasetOperations.java From Apache-Spark-2x-for-Java-Developers with MIT License

4 votes

/**
 * Walk-through of common Dataset operations: builds a department DataFrame
 * from a text file with a header-derived schema, queries it through temporary
 * and global temporary views, writes it out in several formats, then loads an
 * employee CSV and demonstrates selects, grouping and joins.
 *
 * @param args unused
 * @throws AnalysisException declared for the view/table operations used below
 */
public static void main(String[] args) throws AnalysisException {
	//Windows specific property, needed if Hadoop is not installed or HADOOP_HOME is not set
	 System.setProperty("hadoop.home.dir", "E:\\hadoop");
	
	 //Build a Spark Session	
      SparkSession sparkSession = SparkSession
      .builder()
      .master("local")
	  .config("spark.sql.warehouse.dir","file:///E:/hadoop/warehouse")
      .appName("DatasetOperations")
      //.enableHiveSupport()
      .getOrCreate();
      Logger rootLogger = LogManager.getRootLogger();
	  rootLogger.setLevel(Level.WARN); 
	  //Create a RDD
	  JavaRDD<String> deptRDD = sparkSession.sparkContext()
			  .textFile("src/main/resources/dept.txt", 1)
			  .toJavaRDD();

	  //Convert the RDD to RDD<Rows>; lines containing "deptno" (the header) are skipped
	 JavaRDD<Row> deptRows = deptRDD.filter(str-> !str.contains("deptno")).map(new Function<String, Row>() {
		private static final long serialVersionUID = 1L;
		@Override
		public Row call(String rowString) throws Exception {
			String[] cols = rowString.split(",");
		    return RowFactory.create(cols[0].trim(), cols[1].trim(),cols[2].trim());
		}
	});
	  
	  //Create schema from the first line of the file; every column is a nullable string
	  String[] schemaArr=deptRDD.first().split(",");
	  List<StructField> structFieldList = new ArrayList<>();
	  for (String fieldName : schemaArr) {
	    StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
	    structFieldList.add(structField);
	  }
	  StructType schema = DataTypes.createStructType(structFieldList);
	  
	  Dataset<Row> deptDf = sparkSession.createDataFrame(deptRows, schema);
	  deptDf.printSchema();
	  deptDf.show();
	  
	  //Temporary view "dept" is registered on this session
	  deptDf.createOrReplaceTempView("dept");	
	  
	  Dataset<Row> result = sparkSession.sql("select loc,count(loc) from dept  where deptno > 10 group by loc" );
	  result.show();
	  
	  
	 // sparkSession.newSession().sql("SELECT * FROM dept").show();
	  
	  
        //Global temporary view, queried below from a new session through the global_temp database
        deptDf.createGlobalTempView("dept_global_view");
	  
	  sparkSession.newSession().sql("SELECT deptno,dname,loc, rank() OVER (PARTITION BY loc ORDER BY deptno ) FROM global_temp.dept_global_view").show();
	 
	//  sparkSession.newSession().sql("SELECT * FROM dept_global_view").show();
	  
	  //Write the departments as JSON and CSV; the two CSV writes differ only in how the save mode is given
	  deptDf.write().mode(SaveMode.Overwrite).json("src/main/resources/output/dept");
	  deptDf.write().mode(SaveMode.Overwrite).format("csv").save("src/main/resources/output/deptText");
	  deptDf.write().mode("overwrite").format("csv").save("src/main/resources/output/deptText");
	 
  
	  //Save as table "Department", first without and then with an explicit "path" option
	  deptDf.write().mode(SaveMode.Overwrite).format("csv").saveAsTable("Department");
	  deptDf.write().mode(SaveMode.Overwrite).format("csv").option("path", "file:///E:/hadoop/bin").saveAsTable("Department");
	  
	// Read the CSV data
		 Dataset<Row> emp_ds = sparkSession.read()
				 .format("csv")
   		         .option("header", "true")
   		         .option("inferSchema", "true")
   		         .load("src/main/resources/employee.txt");    
		 
		 emp_ds.printSchema();
		 emp_ds.show();
		 
		emp_ds.select("empName" ,"empId").show();
		
		emp_ds.select(col("empName").name("Employee Name") ,col("empId").cast(DataTypes.IntegerType).name("Employee Id")).show();
		
		// NOTE(review): the result is discarded and no action is called on it,
		// so this line only builds a query plan
		emp_ds.sort(col("empId").asc()).filter(col("salary").gt("2500"));
		
		emp_ds.select("job").groupBy(col("job")).count().show();
		
		//emp_ds.as("A").join(deptDf.as("B"),col("deptno"),"left").printSchema();

		// left join on deptno, selecting columns through the aliases A (employees) and B (departments)
		emp_ds.as("A").join(deptDf.as("B"),emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"left").select("A.empId","A.empName","A.job","A.manager","A.hiredate","A.salary","A.comm","A.deptno","B.dname","B.loc").show();
		
		emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").show();			
		// NOTE(review): the returned logical plan is not used
		emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").logicalPlan();
		
		emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").explain();
		 
          // list the available SQL functions and describe one of them
          sparkSession.sql("show functions").show(false);
          sparkSession.sql("DESCRIBE FUNCTION add_months").show(false);
          sparkSession.sql("DESCRIBE FUNCTION EXTENDED add_months").show(false);
          
         
}

Source File: InteractionFingerprinter.java From mmtf-spark with Apache License 2.0

4 votes

/**
 * Returns a dataset of ligand - macromolecule interaction information.
 * Criteria to select interactions are specified by the
 * {@link InteractionFilter}.
 *
 * <p>The dataset contains the following columns:
 * <pre>
 *    structureChainId - pdbId.chainName for which the interaction data are listed
 *    queryChainId - name of chain that interacts with target chain
 *    targetChainId - name of chain for which the interaction data are listed
 *    groupNumbers - array of residue numbers of interacting groups including insertion code (e.g., 101A)
 *    sequenceIndices - array of zero-based index of interaction groups (residues) mapped onto target sequence
 *    sequence - target polymer sequence
 * </pre>
 *
 * @param structures a set of PDB structures
 * @param filter interaction criteria
 * @return dataset with interacting residue information
 */
public static Dataset<Row> getPolymerInteractions(JavaPairRDD<String, StructureDataInterface> structures, InteractionFilter filter) {
    // find all interactions
    JavaRDD<Row> interactionRows = structures.flatMap(new PolymerInteractionFingerprint(filter));

    // every column of the resulting dataset is declared non-nullable
    boolean allowNulls = false;
    StructType schema = new StructType(new StructField[] {
            DataTypes.createStructField("structureChainId", DataTypes.StringType, allowNulls),
            DataTypes.createStructField("queryChainId", DataTypes.StringType, allowNulls),
            DataTypes.createStructField("targetChainId", DataTypes.StringType, allowNulls),
            DataTypes.createStructField("groupNumbers", DataTypes.createArrayType(DataTypes.StringType), allowNulls),
            DataTypes.createStructField("sequenceIndices", DataTypes.createArrayType(DataTypes.IntegerType), allowNulls),
            DataTypes.createStructField("sequence", DataTypes.StringType, allowNulls)
    });

    // convert the RDD to a Dataset
    return SparkSession.builder().getOrCreate().createDataFrame(interactionRows, schema);
}

Java Code Examples for org.apache.spark.sql.SparkSession#createDataFrame()