Java Code Examples for org.apache.spark.sql.Dataset#createOrReplaceTempView()
The following examples show how to use org.apache.spark.sql.Dataset#createOrReplaceTempView().
Each example comes from an open-source project; the source file, project, and license are noted above the code.
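Before the project-specific examples, a minimal, self-contained sketch of the pattern they all share may help: build a Dataset<Row>, register it with createOrReplaceTempView(), and query the view through Spark SQL. This sketch is not taken from any project below; the class name TempViewSketch, the view name "numbers", and the generated rows are illustrative assumptions.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class TempViewSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("temp-view-sketch")
                .getOrCreate();

        // Any Dataset<Row> works; a trivial one built from a range keeps the sketch self-contained.
        Dataset<Row> df = spark.range(1, 6).toDF("id");

        // Register the Dataset as a temporary view scoped to this SparkSession.
        // Calling createOrReplaceTempView again with the same name replaces the
        // existing view instead of throwing, unlike createTempView.
        df.createOrReplaceTempView("numbers");

        // The view can now be referenced from Spark SQL.
        Dataset<Row> evens = spark.sql("SELECT id FROM numbers WHERE id % 2 = 0");
        evens.show();

        spark.stop();
    }
}

Unlike createTempView, the "or replace" variant silently overwrites an existing view with the same name, which is why the examples below can call it repeatedly without first dropping the view.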
Example 1
Source File: JavaShakespeare.java From spark-bigquery-connector with Apache License 2.0
public static void main(String[] args) {
  SparkSession spark = SparkSession.builder()
      .appName("spark-bigquery-demo")
      .getOrCreate();

  // Use the Cloud Storage bucket for temporary BigQuery export data used
  // by the connector. This assumes the Cloud Storage connector for
  // Hadoop is configured.
  String bucket = spark.sparkContext().hadoopConfiguration().get("fs.gs.system.bucket");
  spark.conf().set("temporaryGcsBucket", bucket);

  // Load data in from BigQuery.
  Dataset<Row> wordsDF = spark.read().format("bigquery")
      .option("table", "bigquery-public-data.samples.shakespeare")
      .load()
      .cache();
  wordsDF.show();
  wordsDF.printSchema();
  wordsDF.createOrReplaceTempView("words");

  // Perform word count.
  Dataset<Row> wordCountDF = spark.sql(
      "SELECT word, SUM(word_count) AS word_count FROM words GROUP BY word");

  // Saving the data to BigQuery
  wordCountDF.write().format("bigquery")
      .option("table", "wordcount_dataset.wordcount_output")
      .save();
}
Example 2
Source File: JavaIgniteDataFrameExample.java From ignite with Apache License 2.0
/** */
private static void nativeSparkSqlExample(SparkSession spark) {
    System.out.println("Querying using Spark SQL.");

    Dataset<Row> df = spark.read()
        .format(IgniteDataFrameSettings.FORMAT_IGNITE())               // Data source type.
        .option(IgniteDataFrameSettings.OPTION_TABLE(), "person")      // Table to read.
        .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG)  // Ignite config.
        .load();

    // Registering DataFrame as Spark view.
    df.createOrReplaceTempView("person");

    // Selecting data from Ignite through Spark SQL Engine.
    Dataset<Row> igniteDF = spark.sql("SELECT * FROM person WHERE id >= 2 AND name = 'Mary Major'");

    System.out.println("Result schema:");
    igniteDF.printSchema(); // Printing query schema to console.

    System.out.println("Result content:");
    igniteDF.show(); // Printing query results to console.
}
Example 3
Source File: TestSuite.java From stocator with Apache License 2.0
public void test3(SparkSession spark, Dataset<Row> schemaFlights, String outCSV1) throws Exception {
  try {
    System.out.println("*********************************");
    System.out.println("T3: Run SQL query on the dataset");
    schemaFlights.createOrReplaceTempView("flights");
    Dataset<Row> results = spark.sql(
        "select year, month, count(*) as Count from flights WHERE cancellationCode like 'B' GROUP BY year, month ORDER By year, month");
    System.out.println("*********************************");
    System.out.println("T3: Save query result as CSV object: ");
    System.out.println("T3 " + outCSV1);
    results.write().mode("overwrite").format("com.databricks.spark.csv").save(outCSV1);
    // read and compare
    System.out.println("*********************************");
    System.out.println("T3: Read query CSV object and compare");
    JavaRDD<String> csvReadBack = spark.read().textFile(outCSV1).javaRDD();
    countAndCompare(results, csvReadBack.count(), "T3");
  } catch (Exception e) {
    throw e;
  } finally {
    deleteData(outCSV1, spark.sparkContext().hadoopConfiguration(), dataCreate);
  }
}
Example 4
Source File: CustomDataFrame.java From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("spark app")
            .getOrCreate();

    // Create a plain JavaRDD
    JavaRDD<String> javaRDD = sparkSession.sparkContext().textFile("URL", 1).toJavaRDD();

    // The schema encoded as a string
    String schema = "name age";

    // Generate the schema from the schema string
    List<StructField> structFieldList = new ArrayList<>();
    for (String fieldName : schema.split(" ")) {
        StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        structFieldList.add(structField);
    }
    StructType structType = DataTypes.createStructType(structFieldList);

    JavaRDD<Row> rowJavaRDD = javaRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String v1) {
            String[] attributes = v1.split(",");
            return RowFactory.create(attributes[0], attributes[1].trim());
        }
    });

    // Apply the schema to the RDD
    Dataset<Row> dataset = sparkSession.createDataFrame(rowJavaRDD, structType);

    // Create a temporary view
    dataset.createOrReplaceTempView("user");

    Dataset<Row> result = sparkSession.sql("select * from user");
    result.show();
}
Example 5
Source File: CustomReportService.java From mmtf-spark with Apache License 2.0
/**
 * Concatenates structureId and chainId fields into a single key if chainId
 * field is present
 *
 * @param spark
 * @param dataset
 * @return
 */
private static Dataset<Row> concatIds(SparkSession spark, Dataset<Row> dataset, String[] columnNames) {
    if (Arrays.asList(dataset.columns()).contains("chainId")) {
        dataset.createOrReplaceTempView("table");

        String sql = "SELECT CONCAT(structureId,'.',chainId) as structureChainId,"
                + "structureId,chainId," + columNamesString(columnNames) + " from table";

        dataset = spark.sql(sql);
    }
    return dataset;
}
Example 6
Source File: CustomReportDemo.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException if custom report web service fails
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(CustomReportDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // retrieve PDB annotation: Binding affinities (Ki, Kd),
    // group name of the ligand (hetId), and the
    // Enzyme Classification number (ecNo)
    Dataset<Row> ds = CustomReportService.getDataset("Ki", "Kd", "hetId", "ecNo");

    // show the schema of this dataset
    ds.printSchema();

    // select structures that either have a Ki or Kd value(s) and
    // are protein-serine/threonine kinases (EC 2.7.1.*):

    // A. by using dataset operations
    ds = ds.filter("(Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    ds.show(10);

    // B. by creating a temporary view and running SQL
    // (assign the query result so the following show() prints it)
    ds.createOrReplaceTempView("table");
    ds = ds.sparkSession().sql("SELECT * from table WHERE (Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    ds.show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    sc.close();
}
Example 7
Source File: Loader.java From AWS-MIMIC-IIItoOMOP with Apache License 2.0
public void loadSourceFiles(Mapping mapping) {
    for (String file : mapping.getSourceFiles()) {
        // Derive a table name from the file path (strip the trailing path segment or the extension)
        String table = file.contains("/")
                ? file.substring(0, file.lastIndexOf("/"))
                : file.substring(0, file.lastIndexOf("."));

        // Read the CSV source file from the configured bucket
        Dataset<Row> frame = spark.read().format("com.databricks.spark.csv")
                .option("header", "true")
                .option("inferSchema", "false")
                .option("nullValue", "")
                .option("treatEmptyValuesAsNulls", "true")
                .load(configuration.getFileSystem() + "://" + configuration.getSourceBucket() + "/" + file);

        // Register the file as a temporary view, normalizing the name for SQL
        frame.createOrReplaceTempView(table.replace("/", "_").replace("-", "_"));
    }
}
Example 8
Source File: SparkSessionHeloWorld.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("CSV Read Example")
            .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse")
            .getOrCreate();

    Dataset<Row> csv = sparkSession.read().format("com.databricks.spark.csv")
            .option("header", "true")
            .load("C:\\Users\\sgulati\\Documents\\my_docs\\book\\testdata\\emp.csv");

    csv.createOrReplaceTempView("test");
    Dataset<Row> sql = sparkSession.sql("select * from test");
    sql.collectAsList();
}
Example 9
Source File: SparkStreamingSqlAnalyse.java From sylph with Apache License 2.0
/**
 * Precompile the SQL instead of waiting for the runtime to surface the error.
 */
private static void checkDStream(
        SparkSession spark,
        String sourceTableName,
        StructType sourceSchema,
        List<Consumer<SparkSession>> handlers)
{
    RDD<Row> rdd = spark.sparkContext().<Row>emptyRDD(ClassTag$.MODULE$.<Row>apply(Row.class));
    Dataset<Row> df = spark.createDataFrame(rdd, sourceSchema);
    df.createOrReplaceTempView(sourceTableName);
    handlers.forEach(x -> x.accept(spark));
    spark.sql("drop view " + sourceTableName);
}
Example 10
Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License
private static void runJsonDatasetExample(SparkSession spark) {
  // $example on:json_dataset$
  // A JSON dataset is pointed to by path.
  // The path can be either a single text file or a directory storing text files
  Dataset<Row> people = spark.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

  // The inferred schema can be visualized using the printSchema() method
  people.printSchema();
  // root
  //  |-- age: long (nullable = true)
  //  |-- name: string (nullable = true)

  // Creates a temporary view using the DataFrame
  people.createOrReplaceTempView("people");

  // SQL statements can be run by using the sql methods provided by spark
  Dataset<Row> namesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");
  namesDF.show();
  // +------+
  // |  name|
  // +------+
  // |Justin|
  // +------+

  // Alternatively, a DataFrame can be created for a JSON dataset represented by
  // an RDD[String] storing one JSON object per string.
  List<String> jsonData = Arrays.asList(
          "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
  JavaRDD<String> anotherPeopleRDD = new JavaSparkContext(spark.sparkContext()).parallelize(jsonData);
  Dataset anotherPeople = spark.read().json(anotherPeopleRDD);
  anotherPeople.show();
  // +---------------+----+
  // |        address|name|
  // +---------------+----+
  // |[Columbus,Ohio]| Yin|
  // +---------------+----+
  // $example off:json_dataset$
}
Example 11
Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License
private static void runBasicParquetExample(SparkSession spark) {
  // $example on:basic_parquet_example$
  Dataset<Row> peopleDF = spark.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

  // DataFrames can be saved as Parquet files, maintaining the schema information
  peopleDF.write().parquet("people.parquet");

  // Read in the Parquet file created above.
  // Parquet files are self-describing so the schema is preserved
  // The result of loading a parquet file is also a DataFrame
  Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");

  // Parquet files can also be used to create a temporary view and then used in SQL statements
  parquetFileDF.createOrReplaceTempView("parquetFile");
  Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
  Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
    public String call(Row row) {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+
  // $example off:basic_parquet_example$
}
Example 12
Source File: ReadAndWriteTablesTest.java From iceberg with Apache License 2.0
@Test
public void readFromPartitionedTableWithFilter() {
  table = tables.create(schema, pathToTable.toString());

  Dataset<Row> results = spark.read()
      .format("iceberg")
      .load(pathToTable.toString())
      .filter("data != \"b\"");

  results.createOrReplaceTempView("table");
  spark.sql("SELECT * FROM table").show();
}
Example 13
Source File: SparkDataSet.java From spliceengine with GNU Affero General Public License v3.0
@Override
@SuppressWarnings({ "unchecked", "rawtypes" })
public void pin(ExecRow template, long conglomId) throws StandardException {
    Dataset<Row> pinDF = SpliceSpark.getSession().createDataFrame(
            rdd.map(new LocatedRowToRowFunction()),
            template.schema());
    pinDF.createOrReplaceTempView("SPLICE_" + conglomId);
    SpliceSpark.getSession().catalog().cacheTable("SPLICE_" + conglomId);
}
Example 14
Source File: DataFrameLoader.java From ExecDashboard with Apache License 2.0
public static void loadDataFrame(String collectionName, JavaSparkContext javaSparkContext) {
    if (StringUtils.isEmpty(collectionName) || (javaSparkContext == null)) {
        return;
    }
    Map<String, String> readOverrides = new HashMap<>();
    readOverrides.put("collection", collectionName);
    ReadConfig readConfig = ReadConfig.create(javaSparkContext).withOptions(readOverrides);
    Dataset<Row> frame = MongoSpark.load(javaSparkContext, readConfig).toDF();
    frame.createOrReplaceTempView(collectionName);
}
Example 15
Source File: AbstractJavaEsSparkSQLTest.java From elasticsearch-hadoop with Apache License 2.0
@Test
public void testBasicRead() throws Exception {
    Dataset<Row> dataset = artistsAsDataset();
    assertTrue(dataset.count() > 300);
    dataset.createOrReplaceTempView("datfile");
    assertEquals(5, ((Object[]) dataset.take(5)).length);
    Dataset<Row> results = sqc
            .sql("SELECT name FROM datfile WHERE id >= 1 AND id <= 10");
    assertEquals(10, ((Object[]) results.take(10)).length);
}
Example 16
Source File: JavaSparkSQLExample.java From SparkDemo with MIT License
private static void runProgrammaticSchemaExample(SparkSession spark) {
  // $example on:programmatic_schema$
  // Create an RDD
  JavaRDD<String> peopleRDD = spark.sparkContext()
      .textFile(Constant.LOCAL_FILE_PREX + "/data/resources/people.txt", 1)
      .toJavaRDD();

  // The schema is encoded in a string
  String schemaString = "name age";

  // Generate the schema based on the string of schema
  List<StructField> fields = new ArrayList<>();
  for (String fieldName : schemaString.split(" ")) {
    StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
    fields.add(field);
  }
  StructType schema = DataTypes.createStructType(fields);

  // Convert records of the RDD (people) to Rows
  JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
    @Override
    public Row call(String record) throws Exception {
      String[] attributes = record.split(",");
      return RowFactory.create(attributes[0], attributes[1].trim());
    }
  });

  // Apply the schema to the RDD
  Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

  // Creates a temporary view using the DataFrame
  peopleDataFrame.createOrReplaceTempView("people");

  // SQL can be run over a temporary view created using DataFrames
  Dataset<Row> results = spark.sql("SELECT name FROM people");

  // The results of SQL queries are DataFrames and support all the normal RDD operations
  // The columns of a row in the result can be accessed by field index or by field name
  Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +-------------+
  // |        value|
  // +-------------+
  // |Name: Michael|
  // |   Name: Andy|
  // | Name: Justin|
  // +-------------+
  // $example off:programmatic_schema$
}
Example 17
Source File: DatasetOperations.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws AnalysisException {
    // Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    // Build a Spark Session
    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/hadoop/warehouse")
            .appName("DatasetOperations")
            //.enableHiveSupport()
            .getOrCreate();

    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    // Create an RDD
    JavaRDD<String> deptRDD = sparkSession.sparkContext()
            .textFile("src/main/resources/dept.txt", 1)
            .toJavaRDD();

    // Convert the RDD to RDD<Row>
    JavaRDD<Row> deptRows = deptRDD.filter(str -> !str.contains("deptno")).map(new Function<String, Row>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Row call(String rowString) throws Exception {
            String[] cols = rowString.split(",");
            return RowFactory.create(cols[0].trim(), cols[1].trim(), cols[2].trim());
        }
    });

    // Create the schema
    String[] schemaArr = deptRDD.first().split(",");
    List<StructField> structFieldList = new ArrayList<>();
    for (String fieldName : schemaArr) {
        StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        structFieldList.add(structField);
    }
    StructType schema = DataTypes.createStructType(structFieldList);

    Dataset<Row> deptDf = sparkSession.createDataFrame(deptRows, schema);
    deptDf.printSchema();
    deptDf.show();

    deptDf.createOrReplaceTempView("dept");

    Dataset<Row> result = sparkSession.sql("select loc,count(loc) from dept where deptno > 10 group by loc");
    result.show();

    // sparkSession.newSession().sql("SELECT * FROM dept").show();

    deptDf.createGlobalTempView("dept_global_view");

    sparkSession.newSession().sql("SELECT deptno,dname,loc, rank() OVER (PARTITION BY loc ORDER BY deptno ) FROM global_temp.dept_global_view").show();

    // sparkSession.newSession().sql("SELECT * FROM dept_global_view").show();

    deptDf.write().mode(SaveMode.Overwrite).json("src/main/resources/output/dept");
    deptDf.write().mode(SaveMode.Overwrite).format("csv").save("src/main/resources/output/deptText");
    deptDf.write().mode("overwrite").format("csv").save("src/main/resources/output/deptText");

    deptDf.write().mode(SaveMode.Overwrite).format("csv").saveAsTable("Department");
    deptDf.write().mode(SaveMode.Overwrite).format("csv").option("path", "file:///E:/hadoop/bin").saveAsTable("Department");

    // Read the CSV data
    Dataset<Row> emp_ds = sparkSession.read()
            .format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load("src/main/resources/employee.txt");

    emp_ds.printSchema();
    emp_ds.show();

    emp_ds.select("empName", "empId").show();

    emp_ds.select(col("empName").name("Employee Name"),
            col("empId").cast(DataTypes.IntegerType).name("Employee Id")).show();

    emp_ds.sort(col("empId").asc()).filter(col("salary").gt("2500"));

    emp_ds.select("job").groupBy(col("job")).count().show();

    // emp_ds.as("A").join(deptDf.as("B"),col("deptno"),"left").printSchema();
    emp_ds.as("A").join(deptDf.as("B"), emp_ds.col("deptno").equalTo(deptDf.col("deptno")), "left")
            .select("A.empId", "A.empName", "A.job", "A.manager", "A.hiredate", "A.salary", "A.comm", "A.deptno", "B.dname", "B.loc")
            .show();

    emp_ds.join(deptDf, emp_ds.col("deptno").equalTo(deptDf.col("deptno")), "right").show();
    emp_ds.join(deptDf, emp_ds.col("deptno").equalTo(deptDf.col("deptno")), "right").logicalPlan();
    emp_ds.join(deptDf, emp_ds.col("deptno").equalTo(deptDf.col("deptno")), "right").explain();

    sparkSession.sql("show functions").show(false);
    sparkSession.sql("DESCRIBE FUNCTION add_months").show(false);
    sparkSession.sql("DESCRIBE FUNCTION EXTENDED add_months").show(false);
}
Example 18
Source File: JavaSparkSQLExample.java From SparkDemo with MIT License
private static void runBasicDataFrameExample(SparkSession spark) throws AnalysisException {
  // $example on:create_df$
  Dataset<Row> df = spark.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

  // Displays the content of the DataFrame to stdout
  df.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:create_df$

  // $example on:untyped_ops$
  // Print the schema in a tree format
  df.printSchema();
  // root
  //  |-- age: long (nullable = true)
  //  |-- name: string (nullable = true)

  // Select only the "name" column
  df.select("name").show();
  // +-------+
  // |   name|
  // +-------+
  // |Michael|
  // |   Andy|
  // | Justin|
  // +-------+

  // Select everybody, but increment the age by 1
  df.select(col("name"), col("age").plus(1)).show();
  // +-------+---------+
  // |   name|(age + 1)|
  // +-------+---------+
  // |Michael|     null|
  // |   Andy|       31|
  // | Justin|       20|
  // +-------+---------+

  // Select people older than 21
  df.filter(col("age").gt(21)).show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 30|Andy|
  // +---+----+

  // Count people by age
  df.groupBy("age").count().show();
  // +----+-----+
  // | age|count|
  // +----+-----+
  // |  19|    1|
  // |null|    1|
  // |  30|    1|
  // +----+-----+
  // $example off:untyped_ops$

  // $example on:run_sql$
  // Register the DataFrame as a SQL temporary view
  df.createOrReplaceTempView("people");

  Dataset<Row> sqlDF = spark.sql("SELECT * FROM people");
  sqlDF.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:run_sql$

  // $example on:global_temp_view$
  // Register the DataFrame as a global temporary view
  df.createGlobalTempView("people");

  // Global temporary view is tied to a system preserved database `global_temp`
  spark.sql("SELECT * FROM global_temp.people").show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+

  // Global temporary view is cross-session
  spark.newSession().sql("SELECT * FROM global_temp.people").show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:global_temp_view$
}
Example 19
Source File: JavaSparkHiveExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on:spark_hive$
  // warehouseLocation points to the default location for managed databases and tables
  String warehouseLocation = "spark-warehouse";
  SparkSession spark = SparkSession
      .builder()
      .appName("Java Spark Hive Example")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .enableHiveSupport()
      .getOrCreate();

  spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
  spark.sql("LOAD DATA LOCAL INPATH 'data/resources/kv1.txt' INTO TABLE src");

  // Queries are expressed in HiveQL
  spark.sql("SELECT * FROM src").show();
  // +---+-------+
  // |key|  value|
  // +---+-------+
  // |238|val_238|
  // | 86| val_86|
  // |311|val_311|
  // ...

  // Aggregation queries are also supported.
  spark.sql("SELECT COUNT(*) FROM src").show();
  // +--------+
  // |count(1)|
  // +--------+
  // |     500|
  // +--------+

  // The results of SQL queries are themselves DataFrames and support all normal functions.
  Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

  // The items in DataFrames are of type Row, which lets you access each column by ordinal.
  Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Key: " + row.get(0) + ", Value: " + row.get(1);
    }
  }, Encoders.STRING());
  stringsDS.show();
  // +--------------------+
  // |               value|
  // +--------------------+
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // ...

  // You can also use DataFrames to create temporary views within a SparkSession.
  List<Record> records = new ArrayList<>();
  for (int key = 1; key < 100; key++) {
    Record record = new Record();
    record.setKey(key);
    record.setValue("val_" + key);
    records.add(record);
  }
  Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
  recordsDF.createOrReplaceTempView("records");

  // Queries can then join DataFrames data with data stored in Hive.
  spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
  // +---+------+---+------+
  // |key| value|key| value|
  // +---+------+---+------+
  // |  2| val_2|  2| val_2|
  // |  2| val_2|  2| val_2|
  // |  4| val_4|  4| val_4|
  // ...
  // $example off:spark_hive$

  spark.stop();
}
Example 20
Source File: StructuredStreamingExample.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws StreamingQueryException {
  System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

  SparkSession sparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("structured Streaming Example")
      .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse")
      .getOrCreate();

  Dataset<Row> inStream = sparkSession.readStream()
      .format("socket")
      .option("host", "10.204.136.223")
      .option("port", 9999)
      .load();

  Dataset<FlightDetails> dsFlightDetails = inStream.as(Encoders.STRING()).map(x -> {
    ObjectMapper mapper = new ObjectMapper();
    return mapper.readValue(x, FlightDetails.class);
  }, Encoders.bean(FlightDetails.class));

  dsFlightDetails.createOrReplaceTempView("flight_details");

  Dataset<Row> avdFlightDetails = sparkSession.sql(
      "select flightId, avg(temperature) from flight_details group by flightId");

  StreamingQuery query = avdFlightDetails.writeStream()
      .outputMode("complete")
      .format("console")
      .start();

  query.awaitTermination();
}