org.apache.spark.sql.streaming.StreamingQueryException Java Examples
The following examples show how to use
org.apache.spark.sql.streaming.StreamingQueryException.
The original project, source file, and license for each snippet are noted above it.
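All of the examples below follow the same basic pattern: StreamingQueryException is the checked exception Structured Streaming throws when a running query fails, and it surfaces from blocking calls such as StreamingQuery.awaitTermination() (or processAllAvailable() in tests), so callers either declare it on main or catch it around the wait. The minimal sketch below illustrates that pattern before the real-world examples; the class name, socket host, and port are illustrative assumptions and are not taken from any of the projects listed here.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

public class StreamingQueryExceptionSketch {

    public static void main(String[] args) throws Exception {
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("StreamingQueryException sketch")
            .getOrCreate();

        // Hypothetical source: a text socket stream on localhost:9999.
        Dataset<Row> lines = spark.readStream()
            .format("socket")
            .option("host", "localhost")
            .option("port", 9999)
            .load();

        StreamingQuery query = lines.writeStream()
            .format("console")
            .outputMode("append")
            .start();

        try {
            // awaitTermination() rethrows any failure of the running query
            // as a StreamingQueryException wrapping the root cause.
            query.awaitTermination();
        } catch (StreamingQueryException e) {
            // Inspect the message and the underlying cause before deciding
            // whether to restart the query or fail the application.
            System.err.println("Streaming query failed: " + e.getMessage());
            throw e;
        }
    }
}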
Example #1
Source File: SparkMLHouses.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException, StreamingQueryException {

    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    // * the schema can be written on disk, and read from disk
    // * the schema is not mandatory to be complete, it can contain only the needed fields
    StructType HOUSES_SCHEMA = new StructType()
        .add("House", LongType, true)
        .add("Taxes", LongType, true)
        .add("Bedrooms", LongType, true)
        .add("Baths", FloatType, true)
        .add("Quadrant", LongType, true)
        .add("NW", StringType, true)
        .add("Price($)", LongType, false)
        .add("Size(sqft)", LongType, false)
        .add("lot", LongType, true);

    final SparkConf conf = new SparkConf()
        .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
        .setAppName(APPLICATION_NAME)
        .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    SparkSession sparkSession = SparkSession.builder()
        .config(conf)
        .getOrCreate();

    Dataset<Row> housesDF = sparkSession.read()
        .schema(HOUSES_SCHEMA)
        .json(HOUSES_FILE_PATH);

    // Gathering Data
    Dataset<Row> gatheredDF = housesDF.select(
        col("Taxes"), col("Bedrooms"), col("Baths"),
        col("Size(sqft)"), col("Price($)"));

    // Data Preparation
    Dataset<Row> labelDF = gatheredDF.withColumnRenamed("Price($)", "label");

    Imputer imputer = new Imputer()
        // .setMissingValue(1.0d)
        .setInputCols(new String[] { "Baths" })
        .setOutputCols(new String[] { "~Baths~" });

    VectorAssembler assembler = new VectorAssembler()
        .setInputCols(new String[] { "Taxes", "Bedrooms", "~Baths~", "Size(sqft)" })
        .setOutputCol("features");

    // Choosing a Model
    LinearRegression linearRegression = new LinearRegression();
    linearRegression.setMaxIter(1000);

    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[] { imputer, assembler, linearRegression });

    // Training The Data
    Dataset<Row>[] splitDF = labelDF.randomSplit(new double[] { 0.8, 0.2 });
    Dataset<Row> trainDF = splitDF[0];
    Dataset<Row> evaluationDF = splitDF[1];

    PipelineModel pipelineModel = pipeline.fit(trainDF);

    // Evaluation
    Dataset<Row> predictionsDF = pipelineModel.transform(evaluationDF);

    predictionsDF.show(false);

    Dataset<Row> forEvaluationDF = predictionsDF.select(col("label"), col("prediction"));

    RegressionEvaluator evaluteR2 = new RegressionEvaluator().setMetricName("r2");
    RegressionEvaluator evaluteRMSE = new RegressionEvaluator().setMetricName("rmse");

    double r2 = evaluteR2.evaluate(forEvaluationDF);
    double rmse = evaluteRMSE.evaluate(forEvaluationDF);

    logger.info("---------------------------");
    logger.info("R2 =" + r2);
    logger.info("RMSE =" + rmse);
    logger.info("---------------------------");
}
Example #2
Source File: TestForwardCompatibility.java From iceberg with Apache License 2.0
@Test
public void testSparkStreamingWriteFailsUnknownTransform() throws IOException {
    File parent = temp.newFolder("avro");
    File location = new File(parent, "test");
    File dataFolder = new File(location, "data");
    dataFolder.mkdirs();
    File checkpoint = new File(parent, "checkpoint");
    checkpoint.mkdirs();

    HadoopTables tables = new HadoopTables(CONF);
    tables.create(SCHEMA, UNKNOWN_SPEC, location.toString());

    MemoryStream<Integer> inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT());
    StreamingQuery query = inputStream.toDF()
        .selectExpr("value AS id", "CAST (value AS STRING) AS data")
        .writeStream()
        .outputMode("append")
        .format("iceberg")
        .option("checkpointLocation", checkpoint.toString())
        .option("path", location.toString())
        .start();

    List<Integer> batch1 = Lists.newArrayList(1, 2);
    send(batch1, inputStream);

    AssertHelpers.assertThrows("Should reject streaming write with unsupported transform",
        StreamingQueryException.class, "Cannot write using unsupported transforms: zero",
        query::processAllAvailable);
}
Example #3
Source File: TranslationContext.java From beam with Apache License 2.0
/** Starts the pipeline. */
public void startPipeline() {
    try {
        SparkStructuredStreamingPipelineOptions options =
            serializablePipelineOptions.get().as(SparkStructuredStreamingPipelineOptions.class);
        int datasetIndex = 0;
        for (Dataset<?> dataset : leaves) {
            if (options.isStreaming()) {
                // TODO: deal with Beam Discarding, Accumulating and Accumulating & Retracting
                // output modes with DataStreamWriter.outputMode
                DataStreamWriter<?> dataStreamWriter = dataset.writeStream();
                // spark sets a default checkpoint dir if not set.
                if (options.getCheckpointDir() != null) {
                    dataStreamWriter =
                        dataStreamWriter.option("checkpointLocation", options.getCheckpointDir());
                }
                // TODO: Do not await termination here.
                dataStreamWriter.foreach(new NoOpForeachWriter<>()).start().awaitTermination();
            } else {
                if (options.getTestMode()) {
                    LOG.debug("**** dataset {} catalyst execution plans ****", ++datasetIndex);
                    dataset.explain(true);
                }
                // apply a dummy fn just to apply a foreach action that will trigger the pipeline
                // run in spark
                dataset.foreach((ForeachFunction) t -> {});
            }
        }
    } catch (StreamingQueryException e) {
        throw new RuntimeException("Pipeline execution failed: " + e);
    }
}
Example #4
Source File: ReadLinesFromMultipleFileStreams.java From net.jgp.labs.spark with Apache License 2.0
private void start() throws TimeoutException {
    log.debug("-> start()");

    SparkSession spark = SparkSession.builder()
        .appName("Read lines over a file stream").master("local")
        .getOrCreate();

    Dataset<Row> df = spark
        .readStream()
        .format("text")
        .load(StreamingUtils.getInputDirectory());

    StreamingQuery query = df
        .writeStream()
        .outputMode(OutputMode.Update())
        .format("console").start();

    try {
        query.awaitTermination();
    } catch (StreamingQueryException e) {
        log.error("Exception while waiting for query to end {}.", e.getMessage(), e);
    }

    // In this case everything is a string
    df.show();
    df.printSchema();
}
Example #5
Source File: ReadLinesFromFileStream.java From net.jgp.labs.spark with Apache License 2.0
private void start() throws TimeoutException {
    log.debug("-> start()");

    SparkSession spark = SparkSession.builder()
        .appName("Read lines over a file stream")
        .master("local")
        .getOrCreate();

    Dataset<Row> df = spark
        .readStream()
        .format("text")
        .load(StreamingUtils.getInputDirectory());

    StreamingQuery query = df
        .writeStream()
        .outputMode(OutputMode.Update())
        .format("console")
        .start();

    try {
        query.awaitTermination();
    } catch (StreamingQueryException e) {
        log.error(
            "Exception while waiting for query to end {}.",
            e.getMessage(),
            e);
    }

    // Never executed
    df.show();
    df.printSchema();
}
Example #6
Source File: SparkStructuredStreaming.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException, StreamingQueryException {

    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    final SparkConf conf = new SparkConf()
        .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
        .setAppName(APPLICATION_NAME)
        .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    SparkSession sparkSession = SparkSession.builder()
        .config(conf)
        .getOrCreate();

    Dataset<Row> meetupDF = sparkSession.readStream()
        .format(STREAM_FORMAT)
        .option("kafka.bootstrap.servers", KAFKA_BROKERS)
        .option("subscribe", KAFKA_TOPIC)
        .load();

    meetupDF.printSchema();

    Dataset<Row> rsvpAndTimestampDF = meetupDF
        .select(col("timestamp"),
            from_json(col("value").cast("string"), RSVP_SCHEMA)
                .alias("rsvp"))
        .alias("meetup")
        .select("meetup.*");

    rsvpAndTimestampDF.printSchema();

    Dataset<Row> window = rsvpAndTimestampDF
        .withWatermark("timestamp", "1 minute")
        .groupBy(
            window(col("timestamp"), "4 minutes", "2 minutes"),
            col("rsvp.guests"))
        .count();

    StreamingQuery query = window.writeStream()
        .outputMode("complete")
        .format("console")
        .option("checkpointLocation", CHECKPOINT_LOCATION)
        .option("truncate", false)
        .start();

    query.awaitTermination();
}
Example #7
Source File: SparkMLScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException, StreamingQueryException {

    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    // * the schema can be written on disk, and read from disk
    // * the schema is not mandatory to be complete, it can contain only the needed fields
    StructType RSVP_SCHEMA = new StructType()
        .add("event", new StructType()
            .add("event_id", StringType, true)
            .add("event_name", StringType, true)
            .add("event_url", StringType, true)
            .add("time", LongType, true))
        .add("group", new StructType()
            .add("group_city", StringType, true)
            .add("group_country", StringType, true)
            .add("group_id", LongType, true)
            .add("group_lat", DoubleType, true)
            .add("group_lon", DoubleType, true)
            .add("group_name", StringType, true)
            .add("group_state", StringType, true)
            .add("group_topics", DataTypes.createArrayType(
                new StructType()
                    .add("topicName", StringType, true)
                    .add("urlkey", StringType, true)), true)
            .add("group_urlname", StringType, true))
        .add("guests", LongType, true)
        .add("member", new StructType()
            .add("member_id", LongType, true)
            .add("member_name", StringType, true)
            .add("photo", StringType, true))
        .add("mtime", LongType, true)
        .add("response", StringType, true)
        .add("rsvp_id", LongType, true)
        .add("venue", new StructType()
            .add("lat", DoubleType, true)
            .add("lon", DoubleType, true)
            .add("venue_id", LongType, true)
            .add("venue_name", StringType, true))
        .add("visibility", StringType, true);

    final SparkConf conf = new SparkConf()
        .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
        .setAppName(APPLICATION_NAME)
        .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    SparkSession spark = SparkSession
        .builder()
        .config(conf)
        .getOrCreate();

    PipelineModel pipelineModel = PipelineModel.load(MODEL_FOLDER_PATH);

    Dataset<Row> meetupStream = spark.readStream()
        .format(KAFKA_FORMAT)
        .option("kafka.bootstrap.servers", KAFKA_BROKERS)
        .option("subscribe", KAFKA_TOPIC)
        .load();

    Dataset<Row> gatheredDF = meetupStream.select(
            (from_json(col("value").cast("string"), RSVP_SCHEMA))
                .alias("rsvp"))
        .alias("meetup")
        .select("meetup.*");

    Dataset<Row> filteredDF = gatheredDF.filter(e -> !e.anyNull());

    Dataset<Row> preparedDF = filteredDF.select(
        col("rsvp.group.group_city"),
        col("rsvp.group.group_lat"),
        col("rsvp.group.group_lon"),
        col("rsvp.response"));

    preparedDF.printSchema();

    Dataset<Row> predictionDF = pipelineModel.transform(preparedDF);

    StreamingQuery query = predictionDF.writeStream()
        .format(JSON_FORMAT)
        .option("path", RESULT_FOLDER_PATH)
        .option("checkpointLocation", CHECKPOINT_LOCATION)
        .trigger(Trigger.ProcessingTime(QUERY_INTERVAL_SECONDS))
        .option("truncate", false)
        .start();

    query.awaitTermination();
}
Example #8
Source File: StructuredDemo.java From structured-streaming-avro-demo with BSD 3-Clause "New" or "Revised" License
public static void main(String[] args) throws StreamingQueryException, TimeoutException {
    //set log4j programmatically
    LogManager.getLogger("org.apache.spark").setLevel(Level.WARN);
    LogManager.getLogger("org.apache.kafka").setLevel(Level.WARN);
    LogManager.getLogger("akka").setLevel(Level.ERROR);

    //on windows we may need to configure winutils if hadoop_home is not set
    //System.setProperty("hadoop.home.dir", "c:/app/hadoop");

    //configure Spark
    SparkConf conf = new SparkConf()
        .setAppName("kafka-structured")
        .set("spark.driver.bindAddress", "localhost")
        .setMaster("local[*]");

    //initialize spark session
    SparkSession sparkSession = SparkSession
        .builder()
        .config(conf)
        .getOrCreate();

    //reduce task number
    sparkSession.sqlContext().setConf("spark.sql.shuffle.partitions", "3");

    //data stream from kafka
    Dataset<Row> ds1 = sparkSession
        .readStream()
        .format("kafka")
        .option("kafka.bootstrap.servers", "localhost:9092")
        .option("subscribe", "mytopic")
        .option("startingOffsets", "earliest")
        .load();

    //print kafka schema
    ds1.printSchema();

    //start the streaming query
    Dataset<Row> ds2 = ds1
        .select(from_avro(col("value"), USER_SCHEMA).as("rows"))
        .select("rows.*");

    //print avro schema converted to dataframe :)
    ds2.printSchema();

    StreamingQuery query1 = ds2
        .groupBy("str1")
        .count()
        .writeStream()
        .queryName("Test query")
        .outputMode("complete")
        .format("console")
        .start();

    query1.awaitTermination();
}
Example #9
Source File: StructuredStreamingExample.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws StreamingQueryException {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

    SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("structured Streaming Example")
        .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();

    Dataset<Row> inStream = sparkSession.readStream().format("socket").option("host", "10.204.136.223")
        .option("port", 9999).load();

    Dataset<FlightDetails> dsFlightDetails = inStream.as(Encoders.STRING()).map(x -> {
        ObjectMapper mapper = new ObjectMapper();
        return mapper.readValue(x, FlightDetails.class);
    }, Encoders.bean(FlightDetails.class));

    dsFlightDetails.createOrReplaceTempView("flight_details");

    Dataset<Row> avdFlightDetails = sparkSession.sql(
        "select flightId, avg(temperature) from flight_details group by flightId");

    StreamingQuery query = avdFlightDetails.writeStream()
        .outputMode("complete")
        .format("console")
        .start();

    query.awaitTermination();
}