org.apache.spark.sql.streaming.StreamingQueryException Java Examples
The following examples show how to use
org.apache.spark.sql.streaming.StreamingQueryException.
The original project, source file, and license for each snippet are noted above it.
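All of the examples below follow the same basic pattern: StreamingQueryException is the checked exception Structured Streaming throws when a running query fails, and it surfaces from blocking calls such as StreamingQuery.awaitTermination() (or processAllAvailable() in tests), so callers either declare it on main or catch it around the wait. The minimal sketch below illustrates that pattern before the real-world examples; the class name, socket host, and port are illustrative assumptions and are not taken from any of the projects listed here.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

public class StreamingQueryExceptionSketch {

    public static void main(String[] args) throws Exception {
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("StreamingQueryException sketch")
            .getOrCreate();

        // Hypothetical source: a text socket stream on localhost:9999.
        Dataset<Row> lines = spark.readStream()
            .format("socket")
            .option("host", "localhost")
            .option("port", 9999)
            .load();

        StreamingQuery query = lines.writeStream()
            .format("console")
            .outputMode("append")
            .start();

        try {
            // awaitTermination() rethrows any failure of the running query
            // as a StreamingQueryException wrapping the root cause.
            query.awaitTermination();
        } catch (StreamingQueryException e) {
            // Inspect the message and the underlying cause before deciding
            // whether to restart the query or fail the application.
            System.err.println("Streaming query failed: " + e.getMessage());
            throw e;
        }
    }
}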
Example #1
Source File: SparkMLHouses.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException, StreamingQueryException {

    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    // * the schema can be written on disk, and read from disk
    // * the schema is not mandatory to be complete, it can contain only the needed fields
    StructType HOUSES_SCHEMA = new StructType()
        .add("House", LongType, true)
        .add("Taxes", LongType, true)
        .add("Bedrooms", LongType, true)
        .add("Baths", FloatType, true)
        .add("Quadrant", LongType, true)
        .add("NW", StringType, true)
        .add("Price($)", LongType, false)
        .add("Size(sqft)", LongType, false)
        .add("lot", LongType, true);

    final SparkConf conf = new SparkConf()
        .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
        .setAppName(APPLICATION_NAME)
        .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    SparkSession sparkSession = SparkSession.builder()
        .config(conf)
        .getOrCreate();

    Dataset<Row> housesDF = sparkSession.read()
        .schema(HOUSES_SCHEMA)
        .json(HOUSES_FILE_PATH);

    // Gathering Data
    Dataset<Row> gatheredDF = housesDF.select(
        col("Taxes"), col("Bedrooms"), col("Baths"),
        col("Size(sqft)"), col("Price($)"));

    // Data Preparation
    Dataset<Row> labelDF = gatheredDF.withColumnRenamed("Price($)", "label");

    Imputer imputer = new Imputer()
        // .setMissingValue(1.0d)
        .setInputCols(new String[] { "Baths" })
        .setOutputCols(new String[] { "~Baths~" });

    VectorAssembler assembler = new VectorAssembler()
        .setInputCols(new String[] { "Taxes", "Bedrooms", "~Baths~", "Size(sqft)" })
        .setOutputCol("features");

    // Choosing a Model
    LinearRegression linearRegression = new LinearRegression();
    linearRegression.setMaxIter(1000);

    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[] { imputer, assembler, linearRegression });

    // Training The Data
    Dataset<Row>[] splitDF = labelDF.randomSplit(new double[] { 0.8, 0.2 });
    Dataset<Row> trainDF = splitDF[0];
    Dataset<Row> evaluationDF = splitDF[1];

    PipelineModel pipelineModel = pipeline.fit(trainDF);

    // Evaluation
    Dataset<Row> predictionsDF = pipelineModel.transform(evaluationDF);

    predictionsDF.show(false);

    Dataset<Row> forEvaluationDF = predictionsDF.select(col("label"), col("prediction"));

    RegressionEvaluator evaluteR2 = new RegressionEvaluator().setMetricName("r2");
    RegressionEvaluator evaluteRMSE = new RegressionEvaluator().setMetricName("rmse");

    double r2 = evaluteR2.evaluate(forEvaluationDF);
    double rmse = evaluteRMSE.evaluate(forEvaluationDF);

    logger.info("---------------------------");
    logger.info("R2 =" + r2);
    logger.info("RMSE =" + rmse);
    logger.info("---------------------------");
}
Example #2
Source File: TestForwardCompatibility.java From iceberg with Apache License 2.0
@Test
public void testSparkStreamingWriteFailsUnknownTransform() throws IOException {
    File parent = temp.newFolder("avro");
    File location = new File(parent, "test");
    File dataFolder = new File(location, "data");
    dataFolder.mkdirs();
    File checkpoint = new File(parent, "checkpoint");
    checkpoint.mkdirs();

    HadoopTables tables = new HadoopTables(CONF);
    tables.create(SCHEMA, UNKNOWN_SPEC, location.toString());

    MemoryStream<Integer> inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT());
    StreamingQuery query = inputStream.toDF()
        .selectExpr("value AS id", "CAST (value AS STRING) AS data")
        .writeStream()
        .outputMode("append")
        .format("iceberg")
        .option("checkpointLocation", checkpoint.toString())
        .option("path", location.toString())
        .start();

    List<Integer> batch1 = Lists.newArrayList(1, 2);
    send(batch1, inputStream);

    AssertHelpers.assertThrows("Should reject streaming write with unsupported transform",
        StreamingQueryException.class, "Cannot write using unsupported transforms: zero",
        query::processAllAvailable);
}
Example #3
Source File: TranslationContext.java From beam with Apache License 2.0
/** Starts the pipeline. */
public void startPipeline() {
    try {
        SparkStructuredStreamingPipelineOptions options =
            serializablePipelineOptions.get().as(SparkStructuredStreamingPipelineOptions.class);
        int datasetIndex = 0;
        for (Dataset<?> dataset : leaves) {
            if (options.isStreaming()) {
                // TODO: deal with Beam Discarding, Accumulating and Accumulating & Retracting
                // output modes with DataStreamWriter.outputMode
                DataStreamWriter<?> dataStreamWriter = dataset.writeStream();
                // spark sets a default checkpoint dir if not set.
                if (options.getCheckpointDir() != null) {
                    dataStreamWriter =
                        dataStreamWriter.option("checkpointLocation", options.getCheckpointDir());
                }
                // TODO: Do not await termination here.
                dataStreamWriter.foreach(new NoOpForeachWriter<>()).start().awaitTermination();
            } else {
                if (options.getTestMode()) {
                    LOG.debug("**** dataset {} catalyst execution plans ****", ++datasetIndex);
                    dataset.explain(true);
                }
                // apply a dummy fn just to apply a foreach action that will trigger the pipeline
                // run in spark
                dataset.foreach((ForeachFunction) t -> {});
            }
        }
    } catch (StreamingQueryException e) {
        throw new RuntimeException("Pipeline execution failed: " + e);
    }
}
Example #4
Source File: ReadLinesFromMultipleFileStreams.java From net.jgp.labs.spark with Apache License 2.0
private void start() throws TimeoutException {
    log.debug("-> start()");

    SparkSession spark = SparkSession.builder()
        .appName("Read lines over a file stream").master("local")
        .getOrCreate();

    Dataset<Row> df = spark
        .readStream()
        .format("text")
        .load(StreamingUtils.getInputDirectory());

    StreamingQuery query = df
        .writeStream()
        .outputMode(OutputMode.Update())
        .format("console").start();

    try {
        query.awaitTermination();
    } catch (StreamingQueryException e) {
        log.error("Exception while waiting for query to end {}.", e.getMessage(), e);
    }

    // In this case everything is a string
    df.show();
    df.printSchema();
}
Example #5
Source File: ReadLinesFromFileStream.java From net.jgp.labs.spark with Apache License 2.0
private void start() throws TimeoutException {
    log.debug("-> start()");

    SparkSession spark = SparkSession.builder()
        .appName("Read lines over a file stream")
        .master("local")
        .getOrCreate();

    Dataset<Row> df = spark
        .readStream()
        .format("text")
        .load(StreamingUtils.getInputDirectory());

    StreamingQuery query = df
        .writeStream()
        .outputMode(OutputMode.Update())
        .format("console")
        .start();

    try {
        query.awaitTermination();
    } catch (StreamingQueryException e) {
        log.error(
            "Exception while waiting for query to end {}.",
            e.getMessage(),
            e);
    }

    // Never executed
    df.show();
    df.printSchema();
}
Example #6
Source File: SparkStructuredStreaming.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException, StreamingQueryException {

    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    final SparkConf conf = new SparkConf()
        .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
        .setAppName(APPLICATION_NAME)
        .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    SparkSession sparkSession = SparkSession.builder()
        .config(conf)
        .getOrCreate();

    Dataset<Row> meetupDF = sparkSession.readStream()
        .format(STREAM_FORMAT)
        .option("kafka.bootstrap.servers", KAFKA_BROKERS)
        .option("subscribe", KAFKA_TOPIC)
        .load();

    meetupDF.printSchema();

    Dataset<Row> rsvpAndTimestampDF = meetupDF
        .select(col("timestamp"),
            from_json(col("value").cast("string"), RSVP_SCHEMA)
                .alias("rsvp"))
        .alias("meetup")
        .select("meetup.*");

    rsvpAndTimestampDF.printSchema();

    Dataset<Row> window = rsvpAndTimestampDF
        .withWatermark("timestamp", "1 minute")
        .groupBy(
            window(col("timestamp"), "4 minutes", "2 minutes"),
            col("rsvp.guests"))
        .count();

    StreamingQuery query = window.writeStream()
        .outputMode("complete")
        .format("console")
        .option("checkpointLocation", CHECKPOINT_LOCATION)
        .option("truncate", false)
        .start();

    query.awaitTermination();
}
Example #7
Source File: SparkMLScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException, StreamingQueryException {

    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    // * the schema can be written on disk, and read from disk
    // * the schema is not mandatory to be complete, it can contain only the needed fields
    StructType RSVP_SCHEMA = new StructType()
        .add("event", new StructType()
            .add("event_id", StringType, true)
            .add("event_name", StringType, true)
            .add("event_url", StringType, true)
            .add("time", LongType, true))
        .add("group", new StructType()
            .add("group_city", StringType, true)
            .add("group_country", StringType, true)
            .add("group_id", LongType, true)
            .add("group_lat", DoubleType, true)
            .add("group_lon", DoubleType, true)
            .add("group_name", StringType, true)
            .add("group_state", StringType, true)
            .add("group_topics", DataTypes.createArrayType(
                new StructType()
                    .add("topicName", StringType, true)
                    .add("urlkey", StringType, true)), true)
            .add("group_urlname", StringType, true))
        .add("guests", LongType, true)
        .add("member", new StructType()
            .add("member_id", LongType, true)
            .add("member_name", StringType, true)
            .add("photo", StringType, true))
        .add("mtime", LongType, true)
        .add("response", StringType, true)
        .add("rsvp_id", LongType, true)
        .add("venue", new StructType()
            .add("lat", DoubleType, true)
            .add("lon", DoubleType, true)
            .add("venue_id", LongType, true)
            .add("venue_name", StringType, true))
        .add("visibility", StringType, true);

    final SparkConf conf = new SparkConf()
        .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
        .setAppName(APPLICATION_NAME)
        .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    SparkSession spark = SparkSession
        .builder()
        .config(conf)
        .getOrCreate();

    PipelineModel pipelineModel = PipelineModel.load(MODEL_FOLDER_PATH);

    Dataset<Row> meetupStream = spark.readStream()
        .format(KAFKA_FORMAT)
        .option("kafka.bootstrap.servers", KAFKA_BROKERS)
        .option("subscribe", KAFKA_TOPIC)
        .load();

    Dataset<Row> gatheredDF = meetupStream.select(
            (from_json(col("value").cast("string"), RSVP_SCHEMA))
                .alias("rsvp"))
        .alias("meetup")
        .select("meetup.*");

    Dataset<Row> filteredDF = gatheredDF.filter(e -> !e.anyNull());

    Dataset<Row> preparedDF = filteredDF.select(
        col("rsvp.group.group_city"),
        col("rsvp.group.group_lat"),
        col("rsvp.group.group_lon"),
        col("rsvp.response"));

    preparedDF.printSchema();

    Dataset<Row> predictionDF = pipelineModel.transform(preparedDF);

    StreamingQuery query = predictionDF.writeStream()
        .format(JSON_FORMAT)
        .option("path", RESULT_FOLDER_PATH)
        .option("checkpointLocation", CHECKPOINT_LOCATION)
        .trigger(Trigger.ProcessingTime(QUERY_INTERVAL_SECONDS))
        .option("truncate", false)
        .start();

    query.awaitTermination();
}
Example #8
Source File: StructuredDemo.java From structured-streaming-avro-demo with BSD 3-Clause "New" or "Revised" License
public static void main(String[] args) throws StreamingQueryException, TimeoutException {
    //set log4j programmatically
    LogManager.getLogger("org.apache.spark").setLevel(Level.WARN);
    LogManager.getLogger("org.apache.kafka").setLevel(Level.WARN);
    LogManager.getLogger("akka").setLevel(Level.ERROR);

    //on windows we may need to configure winutils if hadoop_home is not set
    //System.setProperty("hadoop.home.dir", "c:/app/hadoop");

    //configure Spark
    SparkConf conf = new SparkConf()
        .setAppName("kafka-structured")
        .set("spark.driver.bindAddress", "localhost")
        .setMaster("local[*]");

    //initialize spark session
    SparkSession sparkSession = SparkSession
        .builder()
        .config(conf)
        .getOrCreate();

    //reduce task number
    sparkSession.sqlContext().setConf("spark.sql.shuffle.partitions", "3");

    //data stream from kafka
    Dataset<Row> ds1 = sparkSession
        .readStream()
        .format("kafka")
        .option("kafka.bootstrap.servers", "localhost:9092")
        .option("subscribe", "mytopic")
        .option("startingOffsets", "earliest")
        .load();

    //print kafka schema
    ds1.printSchema();

    //start the streaming query
    Dataset<Row> ds2 = ds1
        .select(from_avro(col("value"), USER_SCHEMA).as("rows"))
        .select("rows.*");

    //print avro schema converted to dataframe :)
    ds2.printSchema();

    StreamingQuery query1 = ds2
        .groupBy("str1")
        .count()
        .writeStream()
        .queryName("Test query")
        .outputMode("complete")
        .format("console")
        .start();

    query1.awaitTermination();
}
Example #9
Source File: StructuredStreamingExample.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws StreamingQueryException {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

    SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("structured Streaming Example")
        .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();

    Dataset<Row> inStream = sparkSession.readStream().format("socket").option("host", "10.204.136.223")
        .option("port", 9999).load();

    Dataset<FlightDetails> dsFlightDetails = inStream.as(Encoders.STRING()).map(x -> {
        ObjectMapper mapper = new ObjectMapper();
        return mapper.readValue(x, FlightDetails.class);
    }, Encoders.bean(FlightDetails.class));

    dsFlightDetails.createOrReplaceTempView("flight_details");

    Dataset<Row> avdFlightDetails = sparkSession.sql(
        "select flightId, avg(temperature) from flight_details group by flightId");

    StreamingQuery query = avdFlightDetails.writeStream()
        .outputMode("complete")
        .format("console")
        .start();

    query.awaitTermination();
}