org.apache.spark.rdd.RDD Java Examples
The following examples show how to use
org.apache.spark.rdd.RDD.
Each example is taken from an open-source project; follow the link above each example to view the original source file.
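The common thread across the examples below is converting between the Scala-facing org.apache.spark.rdd.RDD and the Java-friendly JavaRDD wrapper. Before diving in, here is a minimal, self-contained sketch of those conversions (the app name and local master are placeholders, not taken from any project below):

import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;

public class RddConversionSketch {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext("local[*]", "rdd-conversion-sketch");
    List<String> lines = Arrays.asList("a", "b", "c");

    // Build the Java-friendly wrapper first ...
    JavaRDD<String> javaRdd = jsc.parallelize(lines);

    // ... unwrap to the Scala RDD when an API requires org.apache.spark.rdd.RDD ...
    RDD<String> scalaRdd = JavaRDD.toRDD(javaRdd);   // equivalent to javaRdd.rdd()

    // ... and wrap it back when Java-style transformations are more convenient.
    JavaRDD<String> roundTripped = scalaRdd.toJavaRDD();

    System.out.println(roundTripped.count());   // 3
    jsc.stop();
  }
}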
Example #1
Source File: MLMetricsSupporter.java From DDF with Apache License 2.0 | 6 votes |
@Override
public DDF residuals() throws DDFException {
  SparkDDF predictionDDF = (SparkDDF) this.getDDF();
  JavaRDD<double[]> predictionRDD = predictionDDF.getJavaRDD(double[].class);

  JavaRDD<double[]> result = predictionRDD.map(new MetricsMapperResiduals());

  if (result == null) mLog.error(">> javaRDD result of MetricMapper residuals is null");
  if (predictionDDF.getManager() == null) mLog.error(">> predictionDDF.getManager() is null");
  if (result.rdd() == null) mLog.error(">> result.rdd() is null");
  if (predictionDDF.getSchema() == null) mLog.error(">> predictionDDF.getSchema() is null");
  if (predictionDDF.getName() == null) mLog.error(">> predictionDDF.getName() is null");

  Schema schema = new Schema("residuals double");
  DDFManager manager = this.getDDF().getManager();
  DDF residualDDF = manager.newDDF(manager, result.rdd(),
      new Class<?>[] { RDD.class, double[].class }, null, schema);

  if (residualDDF == null) mLog.error(">>>>>>>>>>>.residualDDF is null");
  return residualDDF;
}
Example #2
Source File: CollectedGroupConverter.java From spork with Apache License 2.0 | 6 votes |
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POCollectedGroup physicalOperator)
    throws IOException {
  SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
  RDD<Tuple> rdd = predecessors.get(0);
  // return predecessors.get(0);
  RDD<Tuple> rdd2 = rdd.coalesce(1, false, null);
  long count = 0;
  try {
    count = rdd2.count();
  } catch (Exception e) {
  }
  CollectedGroupFunction collectedGroupFunction = new CollectedGroupFunction(physicalOperator, count);
  return rdd.toJavaRDD().mapPartitions(collectedGroupFunction, true).rdd();
}
Example #3
Source File: Evaluation.java From oryx with Apache License 2.0 | 6 votes |
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
      testData.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> testUserProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
  JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
  double mse = predictions.mapToPair(
      rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
  ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
    double diff = valuePrediction._1() - valuePrediction._2();
    return diff * diff;
  }).mean();
  return Math.sqrt(mse);
}
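The double cast above is the usual way to hand typed JavaPairRDD keys to an MLlib method whose Java view of the Scala signature erases the tuple elements to Object. A minimal sketch of the idiom in isolation, assuming an existing JavaPairRDD<Tuple2<Integer,Integer>,Double> named pairs:

// Generics are erased at runtime, so casting through RDD<?> only silences the compiler;
// the downstream Scala code receives the same underlying tuple objects.
@SuppressWarnings("unchecked")
RDD<Tuple2<Object, Object>> userProducts =
    (RDD<Tuple2<Object, Object>>) (RDD<?>) pairs.keys().rdd();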
Example #4
Source File: MLContextConversionUtil.java From systemds with Apache License 2.0 | 6 votes |
/**
 * Convert a {@code MatrixObject} to a {@code RDD<String>} in IJV format.
 *
 * @param matrixObject
 *            the {@code MatrixObject}
 * @return the {@code MatrixObject} converted to a {@code RDD<String>}
 */
public static RDD<String> matrixObjectToRDDStringIJV(MatrixObject matrixObject) {
  // NOTE: The following works when called from Java but does not
  // currently work when called from Spark Shell (when you call
  // collect() on the RDD<String>).
  //
  // JavaRDD<String> javaRDD = jsc.parallelize(list);
  // RDD<String> rdd = JavaRDD.toRDD(javaRDD);
  //
  // Therefore, we call parallelize() on the SparkContext rather than
  // the JavaSparkContext to produce the RDD<String> for Scala.

  List<String> list = matrixObjectToListStringIJV(matrixObject);
  ClassTag<String> tag = scala.reflect.ClassTag$.MODULE$.apply(String.class);
  return sc().parallelize(JavaConversions.asScalaBuffer(list), sc().defaultParallelism(), tag);
}
Example #5
Source File: DeepSparkContext.java From deep-spark with Apache License 2.0 | 6 votes |
/**
 * Returns a Cells RDD from HDFS.
 * @param config HDFS ExtractorConfig.
 * @return Cells RDD.
 */
public RDD<Cells> createHDFSRDD(ExtractorConfig<Cells> config) {

  Serializable host = config.getValues().get(ExtractorConstants.HOST);
  Serializable port = config.getValues().get(ExtractorConstants.PORT);
  Serializable path = config.getValues().get(ExtractorConstants.FS_FILE_PATH);

  final TextFileDataTable textFileDataTable = UtilFS.createTextFileMetaDataFromConfig(config, this);

  String filePath = path.toString();
  if (config.getExtractorImplClassName().equals(ExtractorConstants.HDFS)) {
    filePath = ExtractorConstants.HDFS_PREFIX + host.toString() + ":" + port + path.toString();
  }

  return createRDDFromFilePath(filePath, textFileDataTable);
}
Example #6
Source File: MLContextConversionUtil.java From systemds with Apache License 2.0 | 6 votes |
/**
 * Convert a {@code FrameObject} to a {@code RDD<String>} in IJV format.
 *
 * @param frameObject
 *            the {@code FrameObject}
 * @return the {@code FrameObject} converted to a {@code RDD<String>}
 */
public static RDD<String> frameObjectToRDDStringIJV(FrameObject frameObject) {
  // NOTE: The following works when called from Java but does not
  // currently work when called from Spark Shell (when you call
  // collect() on the RDD<String>).
  //
  // JavaRDD<String> javaRDD = jsc.parallelize(list);
  // RDD<String> rdd = JavaRDD.toRDD(javaRDD);
  //
  // Therefore, we call parallelize() on the SparkContext rather than
  // the JavaSparkContext to produce the RDD<String> for Scala.

  List<String> list = frameObjectToListStringIJV(frameObject);
  ClassTag<String> tag = scala.reflect.ClassTag$.MODULE$.apply(String.class);
  return sc().parallelize(JavaConversions.asScalaBuffer(list), sc().defaultParallelism(), tag);
}
Example #7
Source File: LoadConverter.java From spork with Apache License 2.0 | 6 votes |
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessorRdds, POLoad poLoad) throws IOException {
  // if (predecessors.size()!=0) {
  //     throw new RuntimeException("Should not have predecessors for Load. Got : "+predecessors);
  // }
  JobConf loadJobConf = SparkUtil.newJobConf(pigContext);
  configureLoader(physicalPlan, poLoad, loadJobConf);

  // don't know why but just doing this cast for now
  RDD<Tuple2<Text, Tuple>> hadoopRDD = sparkContext.newAPIHadoopFile(
      poLoad.getLFile().getFileName(), PigInputFormatSpark.class,
      Text.class, Tuple.class, loadJobConf);

  registerUdfFiles();

  // map to get just RDD<Tuple>
  return hadoopRDD.map(TO_TUPLE_FUNCTION, SparkUtil.getManifest(Tuple.class));
}
Example #8
Source File: ExtractorTest.java From deep-spark with Apache License 2.0 | 6 votes |
/**
 * Test filter EQ.
 *
 * @param <W> the type parameter
 */
@Test(alwaysRun = true, dependsOnGroups = { "FunctionalTests" })
protected <W> void testFilterEQ() {
  DeepSparkContext context = getDeepSparkContext();
  try {
    Filter[] filters = null;
    Filter filter = new Filter("id", FilterType.EQ, "TestDataSet");
    filters = new Filter[] { filter };
    ExtractorConfig<W> inputConfigEntity2 = getFilterConfig(filters);

    RDD<W> inputRDDEntity2 = context.createRDD(inputConfigEntity2);
    assertEquals(inputRDDEntity2.count(), 1);
  } finally {
    context.stop();
  }
}
Example #9
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
@Test
public void testRDDGoodMetadataDML() {
  System.out.println("MLContextTest - RDD<String> good metadata DML");

  List<String> list = new ArrayList<>();
  list.add("1,1,1");
  list.add("2,2,2");
  list.add("3,3,3");
  JavaRDD<String> javaRDD = sc.parallelize(list);
  RDD<String> rdd = JavaRDD.toRDD(javaRDD);

  MatrixMetadata mm = new MatrixMetadata(3, 3, 9);

  Script script = dml("print('sum: ' + sum(M));").in("M", rdd, mm);
  setExpectedStdOut("sum: 18.0");
  ml.execute(script);
}
Example #10
Source File: MLContextConversionUtil.java From systemds with Apache License 2.0 | 6 votes |
/**
 * Convert a {@code MatrixObject} to a {@code RDD<String>} in IJV format.
 *
 * @param matrixObject
 *            the {@code MatrixObject}
 * @return the {@code MatrixObject} converted to a {@code RDD<String>}
 */
public static RDD<String> matrixObjectToRDDStringIJV(MatrixObject matrixObject) {
  // NOTE: The following works when called from Java but does not
  // currently work when called from Spark Shell (when you call
  // collect() on the RDD<String>).
  //
  // JavaRDD<String> javaRDD = jsc.parallelize(list);
  // RDD<String> rdd = JavaRDD.toRDD(javaRDD);
  //
  // Therefore, we call parallelize() on the SparkContext rather than
  // the JavaSparkContext to produce the RDD<String> for Scala.

  List<String> list = matrixObjectToListStringIJV(matrixObject);
  ClassTag<String> tag = scala.reflect.ClassTag$.MODULE$.apply(String.class);
  return sc().parallelize(JavaConversions.asScalaBuffer(list), sc().defaultParallelism(), tag);
}
Example #11
Source File: RankConverter.java From spork with Apache License 2.0 | 6 votes |
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, PORank poRank) throws IOException {
  SparkUtil.assertPredecessorSize(predecessors, poRank, 1);
  RDD<Tuple> rdd = predecessors.get(0);
  JavaPairRDD<Integer, Long> javaPairRdd = rdd.toJavaRDD()
      .mapToPair(new ToPairRdd());
  JavaPairRDD<Integer, Iterable<Long>> groupedByIndex = javaPairRdd
      .groupByKey();
  JavaPairRDD<Integer, Long> countsByIndex = groupedByIndex
      .mapToPair(new IndexCounters());
  JavaPairRDD<Integer, Long> sortedCountsByIndex = countsByIndex
      .sortByKey(true);
  Map<Integer, Long> counts = sortedCountsByIndex.collectAsMap();
  JavaRDD<Tuple> finalRdd = rdd.toJavaRDD()
      .map(new RankFunction(new HashMap<Integer, Long>(counts)));
  return finalRdd.rdd();
}
Example #12
Source File: SparkDatasetBoundedSourceVertex.java From incubator-nemo with Apache License 2.0 | 6 votes |
/**
 * Constructor.
 *
 * @param sparkSession sparkSession to recreate on each executor.
 * @param dataset      Dataset to read data from.
 */
public SparkDatasetBoundedSourceVertex(final SparkSession sparkSession, final Dataset<T> dataset) {
  this.readables = new ArrayList<>();
  final RDD rdd = dataset.sparkRDD();
  final Partition[] partitions = rdd.getPartitions();
  for (int i = 0; i < partitions.length; i++) {
    readables.add(new SparkDatasetBoundedSourceReadable(
        partitions[i],
        sparkSession.getDatasetCommandsList(),
        sparkSession.getInitialConf(),
        i));
  }
  this.estimatedByteSize = dataset.javaRDD()
      .map(o -> (long) o.toString().getBytes("UTF-8").length)
      .reduce((a, b) -> a + b);
}
Example #13
Source File: AerospikeCellExtractorFT.java From deep-spark with Apache License 2.0 | 6 votes |
@Test
@Override
public void testDataSet() {
  DeepSparkContext context = new DeepSparkContext("local", "deepSparkContextTest");

  try {
    ExtractorConfig<Cells> inputConfigEntity = new ExtractorConfig(Cells.class);
    inputConfigEntity.putValue(ExtractorConstants.HOST, AerospikeJavaRDDFT.HOST)
        .putValue(ExtractorConstants.PORT, AerospikeJavaRDDFT.PORT)
        .putValue(ExtractorConstants.NAMESPACE, AerospikeJavaRDDFT.NAMESPACE_CELL)
        .putValue(ExtractorConstants.SET, ExtractorTest.BOOK_INPUT);
    inputConfigEntity.setExtractorImplClass(AerospikeCellExtractor.class);

    RDD<Cells> inputRDDEntity = context.createRDD(inputConfigEntity);

    // Import dataSet was OK and we could read it
    assertEquals(inputRDDEntity.count(), 1, "Expected read entity count is 1");
  } finally {
    context.stop();
  }
}
Example #14
Source File: DistinctConverter.java From spork with Apache License 2.0 | 6 votes |
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, PODistinct poDistinct) throws IOException {
  SparkUtil.assertPredecessorSize(predecessors, poDistinct, 1);
  RDD<Tuple> rdd = predecessors.get(0);

  ClassTag<Tuple2<Tuple, Object>> tuple2ClassManifest = SparkUtil
      .<Tuple, Object> getTuple2Manifest();

  RDD<Tuple2<Tuple, Object>> rddPairs = rdd.map(TO_KEY_VALUE_FUNCTION, tuple2ClassManifest);
  PairRDDFunctions<Tuple, Object> pairRDDFunctions = new PairRDDFunctions<Tuple, Object>(
      rddPairs, SparkUtil.getManifest(Tuple.class),
      SparkUtil.getManifest(Object.class), null);
  int parallelism = SparkUtil.getParallelism(predecessors, poDistinct);
  return pairRDDFunctions.reduceByKey(MERGE_VALUES_FUNCTION, parallelism)
      .map(TO_VALUE_FUNCTION, SparkUtil.getManifest(Tuple.class));
}
Example #15
Source File: DeepSparkContextTest.java From deep-spark with Apache License 2.0 | 6 votes |
@Test
public void createHDFSRDDTest() throws Exception {
  deepSparkContext = createDeepSparkContext();
  DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
  SQLContext sqlContext = mock(SQLContext.class);
  Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
  Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
  RDD<String> rdd = mock(RDD.class);
  JavaRDD<String> javaRdd = mock(JavaRDD.class);
  when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
  doReturn(javaRdd).when(deepSparkContextSpy).textFile(anyString());
  when(rdd.toJavaRDD()).thenReturn(javaRdd);
  when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

  ExtractorConfig<Cells> config = createHDFSDeepJobConfig();

  RDD rddReturn = deepSparkContextSpy.createHDFSRDD(config);

  verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());
  verify(javaRdd, times(1)).map(any(Function.class));
}
Example #16
Source File: JdbcEntityExtractorFT.java From deep-spark with Apache License 2.0 | 6 votes |
@Test
@Override
public void testDataSet() {
  DeepSparkContext context = new DeepSparkContext("local", "deepSparkContextTest");

  try {
    ExtractorConfig<MessageTestEntity> inputConfigEntity = getReadExtractorConfig();

    RDD<MessageTestEntity> inputRDDEntity = context.createRDD(inputConfigEntity);

    // Import dataSet was OK and we could read it
    assertEquals(inputRDDEntity.count(), 1, "Expected read entity count is 1");
  } finally {
    context.stop();
  }
}
Example #17
Source File: SparkUtils.java From spliceengine with GNU Affero General Public License v3.0 | 6 votes |
@SuppressWarnings("rawtypes")
// TODO (wjk): remove this when we have a better way to change name of RDDs implicitly created within spark
private static void setAncestorRDDNames(org.apache.spark.rdd.RDD rdd, int levels,
                                        String[] newNames, String[] checkNames) {
  assert levels > 0;
  org.apache.spark.rdd.RDD currentRDD = rdd;
  for (int i = 0; i < levels && currentRDD != null; i++) {
    org.apache.spark.rdd.RDD rddAnc =
        ((org.apache.spark.Dependency) currentRDD.dependencies().head()).rdd();
    if (rddAnc != null) {
      if (checkNames == null || checkNames[i] == null)
        rddAnc.setName(newNames[i]);
      else if (rddAnc.name().equals(checkNames[i]))
        rddAnc.setName(newNames[i]);
    }
    currentRDD = rddAnc;
  }
}
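A hypothetical call site for a helper like this (the variable and names below are illustrative, not taken from spliceengine) shows how the levels, newNames, and checkNames parameters line up:

// Rename the two RDDs Spark created underneath `outputRdd` so the Spark UI DAG reads better.
// Passing null for checkNames skips the "only rename if the current name matches" guard.
setAncestorRDDNames(outputRdd.rdd(), 2,
    new String[] { "Decoded rows", "Raw input scan" },
    null);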
Example #18
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
@Test
public void testRDDSumIJVDML() {
  System.out.println("MLContextTest - RDD<String> IJV sum DML");

  List<String> list = new ArrayList<>();
  list.add("1 1 1");
  list.add("2 1 2");
  list.add("1 2 3");
  list.add("3 3 4");
  JavaRDD<String> javaRDD = sc.parallelize(list);
  RDD<String> rdd = JavaRDD.toRDD(javaRDD);

  MatrixMetadata mm = new MatrixMetadata(MatrixFormat.IJV, 3, 3);

  Script script = dml("print('sum: ' + sum(M));").in("M", rdd, mm);
  setExpectedStdOut("sum: 10.0");
  ml.execute(script);
}
Example #19
Source File: DeepSparkContextTest.java From deep-spark with Apache License 2.0 | 5 votes |
@Test
public void textFileHDFSTest() throws Exception {
  deepSparkContext = createDeepSparkContext();
  DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
  SQLContext sqlContext = mock(SQLContext.class);
  Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
  Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
  RDD<Cells> result = mock(RDD.class);
  ExtractorConfig<Cells> config = createHDFSDeepJobConfig();
  PowerMockito.doReturn(result).when(deepSparkContextSpy).createHDFSRDD(config);

  deepSparkContextSpy.textFile(config);

  verify(deepSparkContextSpy, times(1)).createHDFSRDD(config);
}
Example #20
Source File: MLUpdate.java From oryx with Apache License 2.0 | 5 votes |
/**
 * Default implementation which randomly splits new data into train/test sets.
 * This handles the case where {@link #getTestFraction()} is not 0 or 1.
 *
 * @param newData data that has arrived in the current input batch
 * @return a {@link Pair} of train, test {@link RDD}s.
 */
protected Pair<JavaRDD<M>,JavaRDD<M>> splitNewDataToTrainTest(JavaRDD<M> newData) {
  RDD<M>[] testTrainRDDs = newData.rdd().randomSplit(
      new double[]{1.0 - testFraction, testFraction},
      RandomManager.getRandom().nextLong());
  return new Pair<>(newData.wrapRDD(testTrainRDDs[0]),
                    newData.wrapRDD(testTrainRDDs[1]));
}
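Callers that do not need to stay at the Scala RDD level can express the same split directly on a JavaRDD. A minimal sketch, assuming an existing JavaRDD<String> named newData, a 10% test fraction, and a fixed seed (all placeholders):

double testFraction = 0.1;

// randomSplit on JavaRDD returns JavaRDD[] directly, so no wrapRDD() round-trip is needed.
JavaRDD<String>[] splits = newData.randomSplit(
    new double[] { 1.0 - testFraction, testFraction }, 42L);

JavaRDD<String> train = splits[0];
JavaRDD<String> test = splits[1];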
Example #21
Source File: SourceDStream.java From beam with Apache License 2.0 | 5 votes |
@Override
public scala.Option<RDD<Tuple2<Source<T>, CheckpointMarkT>>> compute(Time validTime) {
  RDD<Tuple2<Source<T>, CheckpointMarkT>> rdd =
      new SourceRDD.Unbounded<>(
          ssc().sparkContext(), options, createMicrobatchSource(), numPartitions);
  return scala.Option.apply(rdd);
}
Example #22
Source File: FilterConverter.java From spork with Apache License 2.0 | 5 votes |
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POFilter physicalOperator) {
  SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
  RDD<Tuple> rdd = predecessors.get(0);
  FilterFunction filterFunction = new FilterFunction(physicalOperator);
  return rdd.filter(filterFunction);
}
Example #23
Source File: ForEachConverter.java From spork with Apache License 2.0 | 5 votes |
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POForEach physicalOperator) {
  SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
  RDD<Tuple> rdd = predecessors.get(0);
  ForEachFunction forEachFunction = new ForEachFunction(physicalOperator, this.confBytes);
  return rdd.toJavaRDD().mapPartitions(forEachFunction, true).rdd();
}
Example #24
Source File: CounterConverter.java From spork with Apache License 2.0 | 5 votes |
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POCounter poCounter) throws IOException {
  SparkUtil.assertPredecessorSize(predecessors, poCounter, 1);
  RDD<Tuple> rdd = predecessors.get(0);
  CounterConverterFunction f = new CounterConverterFunction(poCounter);
  JavaRDD<Tuple> jRdd = rdd.toJavaRDD().mapPartitionsWithIndex(f, true);
  // jRdd = jRdd.cache();
  return jRdd.rdd();
}
Example #25
Source File: MaprStreamsOffsetManagerImpl.java From datacollector with Apache License 2.0 | 5 votes |
@Override
@SuppressWarnings("unchecked")
public void saveOffsets(RDD<?> rdd) {
  Map<Integer, Long> offset = getOffsetToSave(((HasOffsetRanges) rdd).offsetRanges());
  if (!offset.isEmpty()) {
    SparkStreamingBinding.offsetHelper.saveOffsets(offset);
  } else {
    LOG.trace("Offset is empty");
  }
}
Example #26
Source File: SparkDDF.java From DDF with Apache License 2.0 | 5 votes |
public <T> SparkDDF(DDFManager manager, RDD<?> rdd, Class<T> unitType, String name, Schema schema)
    throws DDFException {
  super(manager);
  if (rdd == null) throw new DDFException("Non-null RDD is required to instantiate a new SparkDDF");
  this.initialize(manager, rdd, new Class<?>[] { RDD.class, unitType }, name, schema);
}
Example #27
Source File: SqlHandler.java From DDF with Apache License 2.0 | 5 votes |
@Override
public DDF sql2ddf(String command, Schema schema, DataSourceDescriptor dataSource,
    DataFormat dataFormat) throws DDFException {
  // TableRDD tableRdd = null;
  // RDD<Row> rddRow = null;

  DataFrame rdd = this.getHiveContext().sql(command);
  if (schema == null) schema = SchemaHandler.getSchemaFromDataFrame(rdd);
  DDF ddf = this.getManager().newDDF(this.getManager(), rdd,
      new Class<?>[] { DataFrame.class }, null, schema);
  ddf.getRepresentationHandler().cache(false);
  ddf.getRepresentationHandler().get(new Class<?>[] { RDD.class, Row.class });
  return ddf;
}
Example #28
Source File: DeepSparkContextTest.java From deep-spark with Apache License 2.0 | 5 votes |
@Test
public void createS3RDDTest() throws Exception {
  deepSparkContext = createDeepSparkContext();
  Configuration hadoopConf = mock(Configuration.class);
  when(sparkContext.hadoopConfiguration()).thenReturn(hadoopConf);
  DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
  SQLContext sqlContext = mock(SQLContext.class);
  Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
  Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
  RDD<String> rdd = mock(RDD.class);
  JavaRDD<String> javaRDD = mock(JavaRDD.class);
  when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
  doReturn(javaRDD).when(deepSparkContextSpy).textFile(anyString());
  when(rdd.toJavaRDD()).thenReturn(javaRDD);
  when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

  ExtractorConfig<Cells> config = createS3DeepJobConfig();
  deepSparkContextSpy.createS3RDD(config);

  verify(hadoopConf, times(1)).set("fs.s3n.awsAccessKeyId", config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
  verify(hadoopConf, times(1)).set("fs.s3n.awsSecretAccessKey", config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));
  verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());
  verify(javaRDD, times(1)).map(any(Function.class));
}
Example #29
Source File: SparkStreamingSqlAnalyse.java From sylph with Apache License 2.0 | 5 votes |
/**
 * Pre-compile the SQL instead of waiting until runtime to discover errors.
 */
private static void checkDStream(
    SparkSession spark,
    String sourceTableName,
    StructType sourceSchema,
    List<Consumer<SparkSession>> handlers
) {
  RDD<Row> rdd = spark.sparkContext().<Row>emptyRDD(ClassTag$.MODULE$.<Row>apply(Row.class));
  Dataset<Row> df = spark.createDataFrame(rdd, sourceSchema);
  df.createOrReplaceTempView(sourceTableName);
  handlers.forEach(x -> x.accept(spark));
  spark.sql("drop view " + sourceTableName);
}
Example #30
Source File: SparkSession.java From nemo with Apache License 2.0 | 5 votes |
@Override
public Dataset<Row> createDataFrame(final RDD<?> rdd, final Class<?> beanClass) {
  final boolean userTriggered = initializeFunction(rdd, beanClass);
  final Dataset<Row> result = Dataset.from(super.createDataFrame(rdd, beanClass));
  this.setIsUserTriggered(userTriggered);
  return result;
}
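A caller-side sketch of building a DataFrame from a bean RDD with this kind of createDataFrame overload, assuming an existing SparkSession named spark and a simple Person Java bean with name/age getters and setters (both the bean and the sample data are illustrative):

// Build a bean RDD, then let Spark derive the schema from the bean class.
List<Person> people = Arrays.asList(new Person("Ada", 36), new Person("Grace", 45));
JavaRDD<Person> personRdd = new JavaSparkContext(spark.sparkContext()).parallelize(people);

// createDataFrame also accepts a JavaRDD of beans and infers the columns from the bean properties.
Dataset<Row> df = spark.createDataFrame(personRdd, Person.class);
df.show();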