Java Code Examples for org.apache.spark.sql.Dataset#createOrReplaceTempView()
The following examples show how to use org.apache.spark.sql.Dataset#createOrReplaceTempView().
Each example comes from an open-source project; the source file, project, and license are noted above the code.
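Before the project-specific examples, a minimal, self-contained sketch of the pattern they all share may help: build a Dataset<Row>, register it with createOrReplaceTempView(), and query the view through Spark SQL. This sketch is not taken from any project below; the class name TempViewSketch, the view name "numbers", and the generated rows are illustrative assumptions.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class TempViewSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("temp-view-sketch")
                .getOrCreate();

        // Any Dataset<Row> works; a trivial one built from a range keeps the sketch self-contained.
        Dataset<Row> df = spark.range(1, 6).toDF("id");

        // Register the Dataset as a temporary view scoped to this SparkSession.
        // Calling createOrReplaceTempView again with the same name replaces the
        // existing view instead of throwing, unlike createTempView.
        df.createOrReplaceTempView("numbers");

        // The view can now be referenced from Spark SQL.
        Dataset<Row> evens = spark.sql("SELECT id FROM numbers WHERE id % 2 = 0");
        evens.show();

        spark.stop();
    }
}

Unlike createTempView, the "or replace" variant silently overwrites an existing view with the same name, which is why the examples below can call it repeatedly without first dropping the view.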
Example 1
Source File: JavaShakespeare.java From spark-bigquery-connector with Apache License 2.0
public static void main(String[] args) {
  SparkSession spark = SparkSession.builder()
      .appName("spark-bigquery-demo")
      .getOrCreate();

  // Use the Cloud Storage bucket for temporary BigQuery export data used
  // by the connector. This assumes the Cloud Storage connector for
  // Hadoop is configured.
  String bucket = spark.sparkContext().hadoopConfiguration().get("fs.gs.system.bucket");
  spark.conf().set("temporaryGcsBucket", bucket);

  // Load data in from BigQuery.
  Dataset<Row> wordsDF = spark.read().format("bigquery")
      .option("table", "bigquery-public-data.samples.shakespeare")
      .load()
      .cache();
  wordsDF.show();
  wordsDF.printSchema();
  wordsDF.createOrReplaceTempView("words");

  // Perform word count.
  Dataset<Row> wordCountDF = spark.sql(
      "SELECT word, SUM(word_count) AS word_count FROM words GROUP BY word");

  // Saving the data to BigQuery
  wordCountDF.write().format("bigquery")
      .option("table", "wordcount_dataset.wordcount_output")
      .save();
}
Example 2
Source File: JavaIgniteDataFrameExample.java From ignite with Apache License 2.0
/** */
private static void nativeSparkSqlExample(SparkSession spark) {
    System.out.println("Querying using Spark SQL.");

    Dataset<Row> df = spark.read()
        .format(IgniteDataFrameSettings.FORMAT_IGNITE())               // Data source type.
        .option(IgniteDataFrameSettings.OPTION_TABLE(), "person")      // Table to read.
        .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG)  // Ignite config.
        .load();

    // Registering DataFrame as Spark view.
    df.createOrReplaceTempView("person");

    // Selecting data from Ignite through Spark SQL Engine.
    Dataset<Row> igniteDF = spark.sql("SELECT * FROM person WHERE id >= 2 AND name = 'Mary Major'");

    System.out.println("Result schema:");
    igniteDF.printSchema(); // Printing query schema to console.

    System.out.println("Result content:");
    igniteDF.show(); // Printing query results to console.
}
Example 3
Source File: TestSuite.java From stocator with Apache License 2.0
public void test3(SparkSession spark, Dataset<Row> schemaFlights, String outCSV1) throws Exception {
  try {
    System.out.println("*********************************");
    System.out.println("T3: Run SQL query on the dataset");
    schemaFlights.createOrReplaceTempView("flights");
    Dataset<Row> results = spark.sql(
        "select year, month, count(*) as Count from flights WHERE cancellationCode like 'B' GROUP BY year, month ORDER By year, month");
    System.out.println("*********************************");
    System.out.println("T3: Save query result as CSV object: ");
    System.out.println("T3 " + outCSV1);
    results.write().mode("overwrite").format("com.databricks.spark.csv").save(outCSV1);
    // read and compare
    System.out.println("*********************************");
    System.out.println("T3: Read query CSV object and compare");
    JavaRDD<String> csvReadBack = spark.read().textFile(outCSV1).javaRDD();
    countAndCompare(results, csvReadBack.count(), "T3");
  } catch (Exception e) {
    throw e;
  } finally {
    deleteData(outCSV1, spark.sparkContext().hadoopConfiguration(), dataCreate);
  }
}
Example 4
Source File: CustomDataFrame.java From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("spark app")
            .getOrCreate();

    // Create a plain JavaRDD
    JavaRDD<String> javaRDD = sparkSession.sparkContext().textFile("URL", 1).toJavaRDD();

    // The schema encoded as a string
    String schema = "name age";

    // Generate the schema from the schema string
    List<StructField> structFieldList = new ArrayList<>();
    for (String fieldName : schema.split(" ")) {
        StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        structFieldList.add(structField);
    }
    StructType structType = DataTypes.createStructType(structFieldList);

    JavaRDD<Row> rowJavaRDD = javaRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String v1) {
            String[] attributes = v1.split(",");
            return RowFactory.create(attributes[0], attributes[1].trim());
        }
    });

    // Apply the schema to the RDD
    Dataset<Row> dataset = sparkSession.createDataFrame(rowJavaRDD, structType);

    // Create a temporary view
    dataset.createOrReplaceTempView("user");

    Dataset<Row> result = sparkSession.sql("select * from user");
    result.show();
}
Example 5
Source File: CustomReportService.java From mmtf-spark with Apache License 2.0
/**
 * Concatenates structureId and chainId fields into a single key if chainId
 * field is present
 *
 * @param spark
 * @param dataset
 * @return
 */
private static Dataset<Row> concatIds(SparkSession spark, Dataset<Row> dataset, String[] columnNames) {
    if (Arrays.asList(dataset.columns()).contains("chainId")) {
        dataset.createOrReplaceTempView("table");

        String sql = "SELECT CONCAT(structureId,'.',chainId) as structureChainId,"
                + "structureId,chainId," + columNamesString(columnNames) + " from table";

        dataset = spark.sql(sql);
    }
    return dataset;
}
Example 6
Source File: CustomReportDemo.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException if custom report web service fails
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(CustomReportDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // retrieve PDB annotation: Binding affinities (Ki, Kd),
    // group name of the ligand (hetId), and the
    // Enzyme Classification number (ecNo)
    Dataset<Row> ds = CustomReportService.getDataset("Ki", "Kd", "hetId", "ecNo");

    // show the schema of this dataset
    ds.printSchema();

    // select structures that either have a Ki or Kd value(s) and
    // are protein-serine/threonine kinases (EC 2.7.1.*):

    // A. by using dataset operations
    ds = ds.filter("(Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    ds.show(10);

    // B. by creating a temporary view and running SQL
    // (assign the query result so the following show() prints it)
    ds.createOrReplaceTempView("table");
    ds = ds.sparkSession().sql("SELECT * from table WHERE (Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    ds.show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    sc.close();
}
Example 7
Source File: Loader.java From AWS-MIMIC-IIItoOMOP with Apache License 2.0
public void loadSourceFiles(Mapping mapping) {
    for (String file : mapping.getSourceFiles()) {
        // Derive a table name from the file path (strip the trailing path segment or the extension)
        String table = file.contains("/")
                ? file.substring(0, file.lastIndexOf("/"))
                : file.substring(0, file.lastIndexOf("."));

        // Read the CSV source file from the configured bucket
        Dataset<Row> frame = spark.read().format("com.databricks.spark.csv")
                .option("header", "true")
                .option("inferSchema", "false")
                .option("nullValue", "")
                .option("treatEmptyValuesAsNulls", "true")
                .load(configuration.getFileSystem() + "://" + configuration.getSourceBucket() + "/" + file);

        // Register the file as a temporary view, normalizing the name for SQL
        frame.createOrReplaceTempView(table.replace("/", "_").replace("-", "_"));
    }
}
Example 8
Source File: SparkSessionHeloWorld.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("CSV Read Example")
            .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse")
            .getOrCreate();

    Dataset<Row> csv = sparkSession.read().format("com.databricks.spark.csv")
            .option("header", "true")
            .load("C:\\Users\\sgulati\\Documents\\my_docs\\book\\testdata\\emp.csv");

    csv.createOrReplaceTempView("test");
    Dataset<Row> sql = sparkSession.sql("select * from test");
    sql.collectAsList();
}
Example 9
Source File: SparkStreamingSqlAnalyse.java From sylph with Apache License 2.0
/**
 * Precompile the SQL instead of waiting for the runtime to surface the error.
 */
private static void checkDStream(
        SparkSession spark,
        String sourceTableName,
        StructType sourceSchema,
        List<Consumer<SparkSession>> handlers)
{
    RDD<Row> rdd = spark.sparkContext().<Row>emptyRDD(ClassTag$.MODULE$.<Row>apply(Row.class));
    Dataset<Row> df = spark.createDataFrame(rdd, sourceSchema);
    df.createOrReplaceTempView(sourceTableName);
    handlers.forEach(x -> x.accept(spark));
    spark.sql("drop view " + sourceTableName);
}
Example 10
Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License
private static void runJsonDatasetExample(SparkSession spark) {
  // $example on:json_dataset$
  // A JSON dataset is pointed to by path.
  // The path can be either a single text file or a directory storing text files
  Dataset<Row> people = spark.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

  // The inferred schema can be visualized using the printSchema() method
  people.printSchema();
  // root
  //  |-- age: long (nullable = true)
  //  |-- name: string (nullable = true)

  // Creates a temporary view using the DataFrame
  people.createOrReplaceTempView("people");

  // SQL statements can be run by using the sql methods provided by spark
  Dataset<Row> namesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");
  namesDF.show();
  // +------+
  // |  name|
  // +------+
  // |Justin|
  // +------+

  // Alternatively, a DataFrame can be created for a JSON dataset represented by
  // an RDD[String] storing one JSON object per string.
  List<String> jsonData = Arrays.asList(
          "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
  JavaRDD<String> anotherPeopleRDD = new JavaSparkContext(spark.sparkContext()).parallelize(jsonData);
  Dataset anotherPeople = spark.read().json(anotherPeopleRDD);
  anotherPeople.show();
  // +---------------+----+
  // |        address|name|
  // +---------------+----+
  // |[Columbus,Ohio]| Yin|
  // +---------------+----+
  // $example off:json_dataset$
}
Example 11
Source File: JavaSQLDataSourceExample.java From SparkDemo with MIT License
private static void runBasicParquetExample(SparkSession spark) {
  // $example on:basic_parquet_example$
  Dataset<Row> peopleDF = spark.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

  // DataFrames can be saved as Parquet files, maintaining the schema information
  peopleDF.write().parquet("people.parquet");

  // Read in the Parquet file created above.
  // Parquet files are self-describing so the schema is preserved
  // The result of loading a parquet file is also a DataFrame
  Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");

  // Parquet files can also be used to create a temporary view and then used in SQL statements
  parquetFileDF.createOrReplaceTempView("parquetFile");
  Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
  Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
    public String call(Row row) {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+
  // $example off:basic_parquet_example$
}
Example 12
Source File: ReadAndWriteTablesTest.java From iceberg with Apache License 2.0
@Test
public void readFromPartitionedTableWithFilter() {
  table = tables.create(schema, pathToTable.toString());

  Dataset<Row> results = spark.read()
      .format("iceberg")
      .load(pathToTable.toString())
      .filter("data != \"b\"");

  results.createOrReplaceTempView("table");
  spark.sql("SELECT * FROM table").show();
}
Example 13
Source File: SparkDataSet.java From spliceengine with GNU Affero General Public License v3.0
@Override
@SuppressWarnings({ "unchecked", "rawtypes" })
public void pin(ExecRow template, long conglomId) throws StandardException {
    Dataset<Row> pinDF = SpliceSpark.getSession().createDataFrame(
            rdd.map(new LocatedRowToRowFunction()),
            template.schema());
    pinDF.createOrReplaceTempView("SPLICE_" + conglomId);
    SpliceSpark.getSession().catalog().cacheTable("SPLICE_" + conglomId);
}
Example 14
Source File: DataFrameLoader.java From ExecDashboard with Apache License 2.0
public static void loadDataFrame(String collectionName, JavaSparkContext javaSparkContext) {
    if (StringUtils.isEmpty(collectionName) || (javaSparkContext == null)) {
        return;
    }
    Map<String, String> readOverrides = new HashMap<>();
    readOverrides.put("collection", collectionName);
    ReadConfig readConfig = ReadConfig.create(javaSparkContext).withOptions(readOverrides);
    Dataset<Row> frame = MongoSpark.load(javaSparkContext, readConfig).toDF();
    frame.createOrReplaceTempView(collectionName);
}
Example 15
Source File: AbstractJavaEsSparkSQLTest.java From elasticsearch-hadoop with Apache License 2.0
@Test
public void testBasicRead() throws Exception {
    Dataset<Row> dataset = artistsAsDataset();
    assertTrue(dataset.count() > 300);
    dataset.createOrReplaceTempView("datfile");
    assertEquals(5, ((Object[]) dataset.take(5)).length);
    Dataset<Row> results = sqc
            .sql("SELECT name FROM datfile WHERE id >= 1 AND id <= 10");
    assertEquals(10, ((Object[]) results.take(10)).length);
}
Example 16
Source File: JavaSparkSQLExample.java From SparkDemo with MIT License
private static void runProgrammaticSchemaExample(SparkSession spark) {
  // $example on:programmatic_schema$
  // Create an RDD
  JavaRDD<String> peopleRDD = spark.sparkContext()
      .textFile(Constant.LOCAL_FILE_PREX + "/data/resources/people.txt", 1)
      .toJavaRDD();

  // The schema is encoded in a string
  String schemaString = "name age";

  // Generate the schema based on the string of schema
  List<StructField> fields = new ArrayList<>();
  for (String fieldName : schemaString.split(" ")) {
    StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
    fields.add(field);
  }
  StructType schema = DataTypes.createStructType(fields);

  // Convert records of the RDD (people) to Rows
  JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
    @Override
    public Row call(String record) throws Exception {
      String[] attributes = record.split(",");
      return RowFactory.create(attributes[0], attributes[1].trim());
    }
  });

  // Apply the schema to the RDD
  Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

  // Creates a temporary view using the DataFrame
  peopleDataFrame.createOrReplaceTempView("people");

  // SQL can be run over a temporary view created using DataFrames
  Dataset<Row> results = spark.sql("SELECT name FROM people");

  // The results of SQL queries are DataFrames and support all the normal RDD operations
  // The columns of a row in the result can be accessed by field index or by field name
  Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +-------------+
  // |        value|
  // +-------------+
  // |Name: Michael|
  // |   Name: Andy|
  // | Name: Justin|
  // +-------------+
  // $example off:programmatic_schema$
}
Example 17
Source File: DatasetOperations.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws AnalysisException {
    // Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    // Build a Spark Session
    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/hadoop/warehouse")
            .appName("DatasetOperations")
            //.enableHiveSupport()
            .getOrCreate();

    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    // Create an RDD
    JavaRDD<String> deptRDD = sparkSession.sparkContext()
            .textFile("src/main/resources/dept.txt", 1)
            .toJavaRDD();

    // Convert the RDD to RDD<Row>
    JavaRDD<Row> deptRows = deptRDD.filter(str -> !str.contains("deptno")).map(new Function<String, Row>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Row call(String rowString) throws Exception {
            String[] cols = rowString.split(",");
            return RowFactory.create(cols[0].trim(), cols[1].trim(), cols[2].trim());
        }
    });

    // Create the schema
    String[] schemaArr = deptRDD.first().split(",");
    List<StructField> structFieldList = new ArrayList<>();
    for (String fieldName : schemaArr) {
        StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        structFieldList.add(structField);
    }
    StructType schema = DataTypes.createStructType(structFieldList);

    Dataset<Row> deptDf = sparkSession.createDataFrame(deptRows, schema);
    deptDf.printSchema();
    deptDf.show();

    deptDf.createOrReplaceTempView("dept");

    Dataset<Row> result = sparkSession.sql("select loc,count(loc) from dept where deptno > 10 group by loc");
    result.show();

    // sparkSession.newSession().sql("SELECT * FROM dept").show();

    deptDf.createGlobalTempView("dept_global_view");

    sparkSession.newSession().sql("SELECT deptno,dname,loc, rank() OVER (PARTITION BY loc ORDER BY deptno ) FROM global_temp.dept_global_view").show();

    // sparkSession.newSession().sql("SELECT * FROM dept_global_view").show();

    deptDf.write().mode(SaveMode.Overwrite).json("src/main/resources/output/dept");
    deptDf.write().mode(SaveMode.Overwrite).format("csv").save("src/main/resources/output/deptText");
    deptDf.write().mode("overwrite").format("csv").save("src/main/resources/output/deptText");

    deptDf.write().mode(SaveMode.Overwrite).format("csv").saveAsTable("Department");
    deptDf.write().mode(SaveMode.Overwrite).format("csv").option("path", "file:///E:/hadoop/bin").saveAsTable("Department");

    // Read the CSV data
    Dataset<Row> emp_ds = sparkSession.read()
            .format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load("src/main/resources/employee.txt");

    emp_ds.printSchema();
    emp_ds.show();

    emp_ds.select("empName", "empId").show();

    emp_ds.select(col("empName").name("Employee Name"),
            col("empId").cast(DataTypes.IntegerType).name("Employee Id")).show();

    emp_ds.sort(col("empId").asc()).filter(col("salary").gt("2500"));

    emp_ds.select("job").groupBy(col("job")).count().show();

    // emp_ds.as("A").join(deptDf.as("B"),col("deptno"),"left").printSchema();
    emp_ds.as("A").join(deptDf.as("B"), emp_ds.col("deptno").equalTo(deptDf.col("deptno")), "left")
            .select("A.empId", "A.empName", "A.job", "A.manager", "A.hiredate", "A.salary", "A.comm", "A.deptno", "B.dname", "B.loc")
            .show();

    emp_ds.join(deptDf, emp_ds.col("deptno").equalTo(deptDf.col("deptno")), "right").show();
    emp_ds.join(deptDf, emp_ds.col("deptno").equalTo(deptDf.col("deptno")), "right").logicalPlan();
    emp_ds.join(deptDf, emp_ds.col("deptno").equalTo(deptDf.col("deptno")), "right").explain();

    sparkSession.sql("show functions").show(false);
    sparkSession.sql("DESCRIBE FUNCTION add_months").show(false);
    sparkSession.sql("DESCRIBE FUNCTION EXTENDED add_months").show(false);
}
Example 18
Source File: JavaSparkSQLExample.java From SparkDemo with MIT License
private static void runBasicDataFrameExample(SparkSession spark) throws AnalysisException {
  // $example on:create_df$
  Dataset<Row> df = spark.read().json(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

  // Displays the content of the DataFrame to stdout
  df.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:create_df$

  // $example on:untyped_ops$
  // Print the schema in a tree format
  df.printSchema();
  // root
  //  |-- age: long (nullable = true)
  //  |-- name: string (nullable = true)

  // Select only the "name" column
  df.select("name").show();
  // +-------+
  // |   name|
  // +-------+
  // |Michael|
  // |   Andy|
  // | Justin|
  // +-------+

  // Select everybody, but increment the age by 1
  df.select(col("name"), col("age").plus(1)).show();
  // +-------+---------+
  // |   name|(age + 1)|
  // +-------+---------+
  // |Michael|     null|
  // |   Andy|       31|
  // | Justin|       20|
  // +-------+---------+

  // Select people older than 21
  df.filter(col("age").gt(21)).show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 30|Andy|
  // +---+----+

  // Count people by age
  df.groupBy("age").count().show();
  // +----+-----+
  // | age|count|
  // +----+-----+
  // |  19|    1|
  // |null|    1|
  // |  30|    1|
  // +----+-----+
  // $example off:untyped_ops$

  // $example on:run_sql$
  // Register the DataFrame as a SQL temporary view
  df.createOrReplaceTempView("people");

  Dataset<Row> sqlDF = spark.sql("SELECT * FROM people");
  sqlDF.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:run_sql$

  // $example on:global_temp_view$
  // Register the DataFrame as a global temporary view
  df.createGlobalTempView("people");

  // Global temporary view is tied to a system preserved database `global_temp`
  spark.sql("SELECT * FROM global_temp.people").show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+

  // Global temporary view is cross-session
  spark.newSession().sql("SELECT * FROM global_temp.people").show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:global_temp_view$
}
Example 19
Source File: JavaSparkHiveExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on:spark_hive$
  // warehouseLocation points to the default location for managed databases and tables
  String warehouseLocation = "spark-warehouse";
  SparkSession spark = SparkSession
      .builder()
      .appName("Java Spark Hive Example")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .enableHiveSupport()
      .getOrCreate();

  spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
  spark.sql("LOAD DATA LOCAL INPATH 'data/resources/kv1.txt' INTO TABLE src");

  // Queries are expressed in HiveQL
  spark.sql("SELECT * FROM src").show();
  // +---+-------+
  // |key|  value|
  // +---+-------+
  // |238|val_238|
  // | 86| val_86|
  // |311|val_311|
  // ...

  // Aggregation queries are also supported.
  spark.sql("SELECT COUNT(*) FROM src").show();
  // +--------+
  // |count(1)|
  // +--------+
  // |     500|
  // +--------+

  // The results of SQL queries are themselves DataFrames and support all normal functions.
  Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

  // The items in DataFrames are of type Row, which lets you access each column by ordinal.
  Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Key: " + row.get(0) + ", Value: " + row.get(1);
    }
  }, Encoders.STRING());
  stringsDS.show();
  // +--------------------+
  // |               value|
  // +--------------------+
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // ...

  // You can also use DataFrames to create temporary views within a SparkSession.
  List<Record> records = new ArrayList<>();
  for (int key = 1; key < 100; key++) {
    Record record = new Record();
    record.setKey(key);
    record.setValue("val_" + key);
    records.add(record);
  }
  Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
  recordsDF.createOrReplaceTempView("records");

  // Queries can then join DataFrames data with data stored in Hive.
  spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
  // +---+------+---+------+
  // |key| value|key| value|
  // +---+------+---+------+
  // |  2| val_2|  2| val_2|
  // |  2| val_2|  2| val_2|
  // |  4| val_4|  4| val_4|
  // ...
  // $example off:spark_hive$

  spark.stop();
}
Example 20
Source File: StructuredStreamingExample.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws StreamingQueryException {
  System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

  SparkSession sparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("structured Streaming Example")
      .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse")
      .getOrCreate();

  Dataset<Row> inStream = sparkSession.readStream()
      .format("socket")
      .option("host", "10.204.136.223")
      .option("port", 9999)
      .load();

  Dataset<FlightDetails> dsFlightDetails = inStream.as(Encoders.STRING()).map(x -> {
    ObjectMapper mapper = new ObjectMapper();
    return mapper.readValue(x, FlightDetails.class);
  }, Encoders.bean(FlightDetails.class));

  dsFlightDetails.createOrReplaceTempView("flight_details");

  Dataset<Row> avdFlightDetails = sparkSession.sql(
      "select flightId, avg(temperature) from flight_details group by flightId");

  StreamingQuery query = avdFlightDetails.writeStream()
      .outputMode("complete")
      .format("console")
      .start();

  query.awaitTermination();
}