org.apache.spark.api.java.JavaPairRDD#fromJavaRDD

Source File: AbstractJavaEsSparkTest.java From elasticsearch-hadoop with Apache License 2.0

6 votes

/**
 * Saves two map-based documents through {@code JavaEsSpark.saveToEsWithMeta}, using a
 * per-document metadata map (ID and TTL entries) as the key of each pair, and then verifies
 * that both documents were indexed under the ids given in that metadata.
 *
 * @throws Exception if the write or any of the verification calls fails
 */
public void testEsRDDWriteWithDynamicMappingBasedOnMaps() throws Exception {
    Map<String, ?> doc1 = ImmutableMap.of("one", 1, "two", 2, "number", 1);
    Map<String, ?> doc2 = ImmutableMap.of("OTP", "Otopeni", "SFO", "San Fran", "number", 2);

    String target = "spark-test-java-dyn-map-id-write/data";
    Map<Metadata, Object> header1 = ImmutableMap.<Metadata, Object> of(ID, 1, TTL, "1d");
    Map<Metadata, Object> header2 = ImmutableMap.<Metadata, Object> of(ID, "2", TTL, "2d");

    // Tuples and the pair RDD are fully parameterized (no raw types) so the compiler
    // checks the element types instead of emitting unchecked warnings.
    JavaRDD<Tuple2<Object, Object>> tupleRdd = sc.parallelize(ImmutableList.<Tuple2<Object, Object>> of(
            new Tuple2<Object, Object>(header1, doc1),
            new Tuple2<Object, Object>(header2, doc2)));
    JavaPairRDD<Object, Object> pairRDD = JavaPairRDD.fromJavaRDD(tupleRdd);
    // eliminate with static import
    JavaEsSpark.saveToEsWithMeta(pairRDD, target);

    // Both documents must be readable back and addressable by the ids from the metadata.
    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(target + "/1"));
    assertTrue(RestUtils.exists(target + "/2"));
    String results = RestUtils.get(target + "/_search?");
    assertThat(results, containsString("SFO"));
}

Source File: TransformationRDD.java From hui-bigdata-spark with Apache License 2.0

5 votes

/**
 * Test co group.
 * Demo goal: cogroup three (student, subject) data sets by student, yielding
 * student -> ([subjects with excellent grades], [subjects with average grades], [subjects with poor grades]).
 *
 * @since hui_project 1.0.0
 */
public void testCoGroup() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    try {
        // students with excellent grades + subject
        JavaRDD<Tuple2<String, String>> scoreDetails1 = sparkContext.parallelize(Arrays.asList(
                new Tuple2<String, String>("xiaoming", "语文")
                , new Tuple2<String, String>("xiaoming", "数学")
                , new Tuple2<String, String>("lihua", "数学")
                , new Tuple2<String, String>("xiaofeng", "艺术")));
        // students with average grades + subject
        JavaRDD<Tuple2<String, String>> scoreDetails2 = sparkContext.parallelize(Arrays.asList(
                new Tuple2<String, String>("xiaoming", "艺术")
                , new Tuple2<String, String>("lihua", "艺术")
                , new Tuple2<String, String>("xiaofeng", "语文")));
        // students with poor grades + subject
        JavaRDD<Tuple2<String, String>> scoreDetails3 = sparkContext.parallelize(Arrays.asList(
                new Tuple2<String, String>("xiaoming", "英语")
                , new Tuple2<String, String>("lihua", "英语")
                , new Tuple2<String, String>("lihua", "数学")
                , new Tuple2<String, String>("xiaofeng", "数学")
                , new Tuple2<String, String>("xiaofeng", "英语")));

        // Turn the tuple RDDs into pair RDDs keyed by student name.
        JavaPairRDD<String, String> scoreMapRDD1 = JavaPairRDD.fromJavaRDD(scoreDetails1);
        JavaPairRDD<String, String> scoreMapRDD2 = JavaPairRDD.fromJavaRDD(scoreDetails2);
        JavaPairRDD<String, String> scoreMapRDD3 = JavaPairRDD.fromJavaRDD(scoreDetails3);

        JavaPairRDD<String, Tuple3<Iterable<String>, Iterable<String>, Iterable<String>>> cogroupRDD =
                scoreMapRDD1.cogroup(scoreMapRDD2, scoreMapRDD3);
        checkResult(cogroupRDD.collect());
    } finally {
        // The context is created locally, so it must be released here even when the
        // job or the result check fails; otherwise it leaks.
        sparkContext.stop();
    }
}

Source File: TransformationRDDTest.java From hui-bigdata-spark with Apache License 2.0

5 votes

/**
 * Test co group.
 * Demo goal: cogroup three (student, subject) data sets by student, yielding
 * student -> ([subjects with excellent grades], [subjects with average grades], [subjects with poor grades]).
 * @since hui_project 1.0.0
 */
@Test
public void testCoGroup() {
    // students with excellent grades + subject
    // (tuples are explicitly parameterized; raw Tuple2 would compile only via unchecked conversion)
    JavaRDD<Tuple2<String, String>> scoreDetails1 = sparkContext.parallelize(Arrays.asList(
            new Tuple2<String, String>("xiaoming", "语文")
            , new Tuple2<String, String>("xiaoming", "数学")
            , new Tuple2<String, String>("lihua", "数学")
            , new Tuple2<String, String>("xiaofeng", "艺术")));
    // students with average grades + subject
    JavaRDD<Tuple2<String, String>> scoreDetails2 = sparkContext.parallelize(Arrays.asList(
            new Tuple2<String, String>("xiaoming", "艺术")
            , new Tuple2<String, String>("lihua", "艺术")
            , new Tuple2<String, String>("xiaofeng", "语文")));
    // students with poor grades + subject
    JavaRDD<Tuple2<String, String>> scoreDetails3 = sparkContext.parallelize(Arrays.asList(
            new Tuple2<String, String>("xiaoming", "英语")
            , new Tuple2<String, String>("lihua", "英语")
            , new Tuple2<String, String>("lihua", "数学")
            , new Tuple2<String, String>("xiaofeng", "数学")
            , new Tuple2<String, String>("xiaofeng", "英语")));

    // Turn the tuple RDDs into pair RDDs keyed by student name.
    JavaPairRDD<String, String> scoreMapRDD1 = JavaPairRDD.fromJavaRDD(scoreDetails1);
    JavaPairRDD<String, String> scoreMapRDD2 = JavaPairRDD.fromJavaRDD(scoreDetails2);
    JavaPairRDD<String, String> scoreMapRDD3 = JavaPairRDD.fromJavaRDD(scoreDetails3);

    JavaPairRDD<String, Tuple3<Iterable<String>, Iterable<String>, Iterable<String>>> cogroupRDD =
            scoreMapRDD1.cogroup(scoreMapRDD2, scoreMapRDD3);
    checkResult(cogroupRDD.collect());
}

Source File: Model.java From predictionio-template-java-ecom-recommender with Apache License 2.0

5 votes

/**
 * Rebuilds a {@code Model} from the object files stored under {@code /tmp/<id>/}.
 *
 * @param id     model identifier; used as the directory name below {@code /tmp}
 * @param params algorithm parameters (not read by this method)
 * @param sc     Spark context used to read the object files
 * @return a model assembled from the loaded RDDs and the loaded item map
 */
public static Model load(String id, Params params, SparkContext sc) {
    final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
    // All parts of the model live in the same per-model directory.
    final String modelDir = "/tmp/" + id;

    final JavaRDD<Tuple2<Integer, double[]>> userFeatureTuples = jsc.objectFile(modelDir + "/userFeatures");
    final JavaPairRDD<Integer, double[]> userFeatures = JavaPairRDD.fromJavaRDD(userFeatureTuples);

    final JavaRDD<Tuple2<Integer, Tuple2<String, double[]>>> indexItemFeatureTuples =
            jsc.objectFile(modelDir + "/indexItemFeatures");
    final JavaPairRDD<Integer, Tuple2<String, double[]>> indexItemFeatures =
            JavaPairRDD.fromJavaRDD(indexItemFeatureTuples);

    final JavaRDD<Tuple2<String, Integer>> userIndexTuples = jsc.objectFile(modelDir + "/userIndex");
    final JavaPairRDD<String, Integer> userIndex = JavaPairRDD.fromJavaRDD(userIndexTuples);

    final JavaRDD<Tuple2<String, Integer>> itemIndexTuples = jsc.objectFile(modelDir + "/itemIndex");
    final JavaPairRDD<String, Integer> itemIndex = JavaPairRDD.fromJavaRDD(itemIndexTuples);

    final JavaRDD<ItemScore> itemPopularityScore = jsc.objectFile(modelDir + "/itemPopularityScore");

    // The item map is read from an RDD; only the first collected element is used.
    final JavaRDD<Map<String, Item>> itemMaps = jsc.objectFile(modelDir + "/items");
    final Map<String, Item> items = itemMaps.collect().get(0);

    logger.info("loaded model");
    return new Model(userFeatures, indexItemFeatures, userIndex, itemIndex, itemPopularityScore, items);
}

Source File: PersistedInputRDD.java From tinkerpop with Apache License 2.0

5 votes

/**
 * Reads a graph RDD that was previously persisted in the Spark context.
 *
 * @param configuration must contain {@code Constants.GREMLIN_HADOOP_INPUT_LOCATION}
 * @param sparkContext  the Java Spark context whose underlying context is handed to {@code Spark.create}
 * @return a pair RDD over the persisted RDD when a graph location is resolved, otherwise a pair
 *         RDD over an empty RDD
 * @throws IllegalArgumentException if the input location is missing from the configuration
 */
@Override
public JavaPairRDD<Object, VertexWritable> readGraphRDD(final Configuration configuration, final JavaSparkContext sparkContext) {
    if (!configuration.containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION)) {
        throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_INPUT_LOCATION + " to read the persisted RDD from");
    }
    Spark.create(sparkContext.sc());

    final String inputLocation = configuration.getString(Constants.GREMLIN_HADOOP_INPUT_LOCATION);
    final Optional<String> graphLocation = Constants.getSearchGraphLocation(inputLocation, SparkContextStorage.open());
    if (!graphLocation.isPresent()) {
        // Nothing found at the configured location: hand back an empty pair RDD.
        return JavaPairRDD.fromJavaRDD(sparkContext.emptyRDD());
    }
    // The raw cast is required because the stored RDD's element type is not known statically here.
    return JavaPairRDD.fromJavaRDD((JavaRDD) Spark.getRDD(graphLocation.get()).toJavaRDD());
}

Source File: CollabFilterCassandra8.java From Spark-Cassandra-Collabfiltering with Apache License 2.0

5 votes

/**
 * Computes the root-mean-square error between predicted ratings and validation ratings,
 * matching the two on their (user, product) pair.
 *
 * @param predictionJavaRdd  predicted ratings
 * @param validationsCassRdd Cassandra rows holding the validation ratings (user, product and
 *                           rating columns named by {@code RatingDO})
 * @return the square root of the mean squared difference over all joined pairs
 */
public double validate(JavaRDD<Rating> predictionJavaRdd, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
	// Key the predictions by (user, product) so they can be joined with the validations.
	JavaRDD<Tuple2<Tuple2<Integer, Integer>, Double>> predictionTuples = predictionJavaRdd.map(p -> new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(p.user(), p.product()), p.rating()));
	JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsByUserProduct = JavaPairRDD.fromJavaRDD(predictionTuples);

	// Convert each Cassandra row into a Rating, then key it the same way.
	JavaRDD<Rating> validationRatings = validationsCassRdd.map(row -> new Rating(row.getInt(RatingDO.USER_COL), row.getInt(RatingDO.PRODUCT_COL), row.getInt(RatingDO.RATING_COL)));
	JavaRDD<Tuple2<Tuple2<Integer, Integer>, Double>> validationTuples = validationRatings.map(v -> new Tuple2<Tuple2<Integer, Integer>, Double>(new Tuple2<Integer, Integer>(v.user(), v.product()), v.rating()));
	JavaPairRDD<Tuple2<Integer, Integer>, Double> validationsByUserProduct = JavaPairRDD.fromJavaRDD(validationTuples);

	// After the join each value is a (validation rating, predicted rating) pair.
	JavaRDD<Tuple2<Double, Double>> validationAndPredictions = validationsByUserProduct.join(predictionsByUserProduct).values();

	// JavaDoubleRDD.fromRDD expects an RDD of Object, hence the explicit cast of the squared error.
	JavaRDD<Object> squaredErrors = validationAndPredictions.map(pair -> {
		double delta = pair._1() - pair._2();
		return (Object) (delta * delta);
	});
	double meanSquaredError = JavaDoubleRDD.fromRDD(squaredErrors.rdd()).mean();
	return Math.sqrt(meanSquaredError);
}

Source File: CollabFilterCassandra7.java From Spark-Cassandra-Collabfiltering with Apache License 2.0

5 votes

/**
 * Computes the root-mean-square error between predicted ratings and validation ratings,
 * matching the two on their (user, product) pair. Written with anonymous function classes
 * rather than lambdas.
 *
 * @param predictionJavaRdd  predicted ratings
 * @param validationsCassRdd Cassandra rows holding the validation ratings (user, product and
 *                           rating columns named by {@code RatingDO})
 * @return the square root of the mean squared difference over all joined pairs
 */
public double validate(JavaRDD<Rating> predictionJavaRdd, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
	// Re-keys a rating as ((user, product), rating); used for both predictions and validations.
	org.apache.spark.api.java.function.Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>> keyByUserAndProduct = new org.apache.spark.api.java.function.Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
		@Override
		public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating rating) throws Exception {
			Tuple2<Integer, Integer> userAndProduct = new Tuple2<Integer, Integer>(rating.user(), rating.product());
			return new Tuple2<Tuple2<Integer, Integer>, Double>(userAndProduct, rating.rating());
		}
	};
	// Converts a Cassandra row into a Rating.
	org.apache.spark.api.java.function.Function<CassandraRow, Rating> rowToRating = new org.apache.spark.api.java.function.Function<CassandraRow, Rating>() {
		@Override
		public Rating call(CassandraRow row) throws Exception {
			return new Rating(row.getInt(RatingDO.USER_COL), row.getInt(RatingDO.PRODUCT_COL), row.getInt(RatingDO.RATING_COL));
		}
	};
	// Squares the difference of a (validation, prediction) pair. The result type is Object
	// because JavaDoubleRDD.fromRDD expects an RDD of Object.
	org.apache.spark.api.java.function.Function<Tuple2<Double, Double>, Object> squaredError = new org.apache.spark.api.java.function.Function<Tuple2<Double, Double>, Object>() {
		@Override
		public Object call(Tuple2<Double, Double> pair) throws Exception {
			double delta = pair._1() - pair._2();
			return (Object) (delta * delta);
		}
	};

	JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsByUserProduct = JavaPairRDD.fromJavaRDD(predictionJavaRdd.map(keyByUserAndProduct));
	JavaRDD<Rating> validationRatings = validationsCassRdd.map(rowToRating);
	JavaPairRDD<Tuple2<Integer, Integer>, Double> validationsByUserProduct = JavaPairRDD.fromJavaRDD(validationRatings.map(keyByUserAndProduct));

	// After the join each value is a (validation rating, predicted rating) pair.
	JavaRDD<Tuple2<Double, Double>> validationAndPredictions = validationsByUserProduct.join(predictionsByUserProduct).values();

	double meanSquaredError = JavaDoubleRDD.fromRDD(validationAndPredictions.map(squaredError).rdd()).mean();
	return Math.sqrt(meanSquaredError);
}

Source File: JavaLatentDirichletAllocationExample.java From SparkDemo with MIT License

4 votes

/**
 * Example: parses a text file of space-separated numeric rows into vectors, trains a
 * three-topic LDA model on them, prints the topic matrix, then saves and reloads the model.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaKLatentDirichletAllocationExample");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    // $example on$
    // Load and parse the data: each line becomes one dense vector
    String inputPath = "data/mllib/sample_lda_data.txt";
    JavaRDD<String> lines = jsc.textFile(inputPath);
    JavaRDD<Vector> parsedData = lines.map(
      new Function<String, Vector>() {
        @Override
        public Vector call(String line) {
          String[] tokens = line.trim().split(" ");
          double[] values = new double[tokens.length];
          int idx = 0;
          for (String token : tokens) {
            values[idx++] = Double.parseDouble(token);
          }
          return Vectors.dense(values);
        }
      }
    );
    // Index documents with unique IDs: (vector, index) is swapped to (index, vector)
    JavaRDD<Tuple2<Long, Vector>> indexedDocs = parsedData.zipWithIndex().map(
      new Function<Tuple2<Vector, Long>, Tuple2<Long, Vector>>() {
        @Override
        public Tuple2<Long, Vector> call(Tuple2<Vector, Long> docAndId) {
          return docAndId.swap();
        }
      }
    );
    JavaPairRDD<Long, Vector> corpus = JavaPairRDD.fromJavaRDD(indexedDocs);
    corpus.cache();

    // Cluster the documents into three topics using LDA
    LDAModel ldaModel = new LDA().setK(3).run(corpus);

    // Output topics. Each is a distribution over words (matching word count vectors)
    int vocabSize = ldaModel.vocabSize();
    System.out.println("Learned topics (as distributions over vocab of " + vocabSize
      + " words):");
    Matrix topics = ldaModel.topicsMatrix();
    for (int topic = 0; topic < 3; topic++) {
      System.out.print("Topic " + topic + ":");
      for (int word = 0; word < vocabSize; word++) {
        System.out.print(" " + topics.apply(word, topic));
      }
      System.out.println();
    }

    // Persist the model and read it back from the same location
    String modelPath = "target/org/apache/spark/JavaLatentDirichletAllocationExample/LDAModel";
    ldaModel.save(jsc.sc(), modelPath);
    DistributedLDAModel sameModel = DistributedLDAModel.load(jsc.sc(), modelPath);
    // $example off$

    jsc.stop();
}

Source File: Basic.java From learning-spark-with-java with MIT License

4 votes

/**
 * Walks through the basic per-key operations on a pair RDD: collecting as a map, reducing,
 * folding and combining by key, and computing a per-key average in two different ways.
 * Each intermediate result is printed to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
  SparkSession session =
      SparkSession.builder().appName("Pairs-Basic").master("local[4]").getOrCreate();
  JavaSparkContext jsc = new JavaSparkContext(session.sparkContext());

  List<Tuple2<String, Integer>> input =
      Arrays.asList(
          new Tuple2<>("1",9), new Tuple2<>("1",2), new Tuple2<>("1",1),
          new Tuple2<>("2",3), new Tuple2<>("2",4), new Tuple2<>("3",1),
          new Tuple2<>("3",5), new Tuple2<>("6",2), new Tuple2<>("6",1),
          new Tuple2<>("6",4), new Tuple2<>("8",1));

  // Distribute the pairs over four partitions.
  JavaPairRDD<String, Integer> pairsRDD = jsc.parallelizePairs(input, 4);

  System.out.println("*** the original pairs");
  pairsRDD.foreach(pair -> System.out.println(pair));

  // A pair RDD can be collected as a Map, but that is only meaningful when keys are
  // unique. They are not unique here, so only one value per key ends up in the map.
  Map<String, Integer> rawMap = pairsRDD.collectAsMap();
  System.out.println("*** the pretty useless map");
  System.out.println(rawMap);

  // Keep only the minimum value per key. Reducing takes a single function that merges
  // two values of a key; the result has the same type as the values.
  JavaPairRDD<String, Integer> minPerKey = pairsRDD.reduceByKey((a, b) -> Math.min(a, b));

  System.out.println("*** the reduced pairs");
  minPerKey.foreach(pair -> System.out.println(pair));

  // Keys are unique after reducing, so collecting to a map now works much better.
  Map<String, Integer> minMap = minPerKey.collectAsMap();
  System.out.println("*** the reduced pairs as a map");
  System.out.println(minMap);

  // Folding is a little more general: the identity value is specified explicitly,
  // e.g. 0 for adding and 1 for multiplying (used here).
  JavaPairRDD<String, Integer> productPerKey =
      pairsRDD.foldByKey(1, (acc, value) -> acc * value);

  System.out.println("*** the folded pairs");
  productPerKey.foreach(pair -> System.out.println(pair));

  // Combining is more general still: the result may have a different type than the values.
  // Three functions are supplied: the first turns a single value into the new type, the
  // second folds one more value into an existing result, and the third merges two
  // intermediate results. Here they build a (sum, count) pair per key, the classic first
  // step of a per-key average.
  JavaPairRDD<String, Tuple2<Integer, Integer>> sumAndCountPerKey =
      pairsRDD.combineByKey(
          first -> new Tuple2<>(first, 1),
          (acc, next) -> new Tuple2<>(acc._1() + next, acc._2() + 1),
          (left, right) ->
              new Tuple2<>(left._1() + right._1(), left._2() + right._2())
      );

  // Divide sum by count to get the average for each key.
  JavaPairRDD<String, Double> averagePerKey =
      sumAndCountPerKey.mapValues(sc -> (double) sc._1() / sc._2());

  System.out.println("*** the average pairs");
  averagePerKey.foreach(pair -> System.out.println(pair));

  // The same division done with a plain map: in Java this needs conversions between the
  // two kinds of RDD (pair RDD -> tuple RDD -> pair RDD) and is far more cumbersome.
  JavaRDD<Tuple2<String, Tuple2<Integer, Integer>>> sumAndCountTuples =
      JavaRDD.fromRDD(sumAndCountPerKey.rdd(), sumAndCountPerKey.classTag());
  JavaRDD<Tuple2<String, Double>> averageTuples = sumAndCountTuples.map(entry ->
      new Tuple2<>(entry._1(), (double) entry._2()._1() / entry._2()._2()));
  JavaPairRDD<String, Double> averagePerKeyHardWay = JavaPairRDD.fromJavaRDD(averageTuples);

  // The output order is not guaranteed to match the previous print, so the two results
  // may not look identical at first glance.
  System.out.println("*** the average pairs the hard way");
  averagePerKeyHardWay.foreach(pair -> System.out.println(pair));

  session.stop();
}

Source File: PersistedInputRDD.java From tinkerpop with Apache License 2.0

4 votes

/**
 * Reads the persisted RDD stored for the given memory key.
 *
 * @param configuration must contain {@code Constants.GREMLIN_HADOOP_INPUT_LOCATION}
 * @param memoryKey     key combined with the input location to derive the memory location
 * @param sparkContext  the Java Spark context (not read by this method)
 * @return a pair RDD wrapping the RDD registered under the derived memory location
 * @throws IllegalArgumentException if the input location is missing from the configuration
 */
@Override
public <K, V> JavaPairRDD<K, V> readMemoryRDD(final Configuration configuration, final String memoryKey, final JavaSparkContext sparkContext) {
    if (!configuration.containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION)) {
        throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_INPUT_LOCATION + " to read the persisted RDD from");
    }
    final String inputLocation = configuration.getString(Constants.GREMLIN_HADOOP_INPUT_LOCATION);
    // The raw cast is required because the stored RDD's element type is not known statically here.
    final JavaRDD persisted = (JavaRDD) Spark.getRDD(Constants.getMemoryLocation(inputLocation, memoryKey)).toJavaRDD();
    return JavaPairRDD.fromJavaRDD(persisted);
}

Source File: GeoWaveRDDLoader.java From geowave with Apache License 2.0

4 votes

/**
 * Loads features from a GeoWave data store into a pair RDD keyed by {@code GeoWaveInputKey}.
 *
 * <p>Split counts come from {@code rddOpts} when either its min or max split value is greater
 * than -1; otherwise the {@code spark.default.parallelism} setting is used for both when it is
 * set, and no split counts are configured when it is not.
 *
 * @param sc           Spark context; supplies the base Hadoop configuration
 * @param storeOptions data store to read from
 * @param rddOpts      query and split options
 * @return the loaded pair RDD, or {@code null} (after logging an error) if any argument is null
 * @throws IOException declared by the signature; the call that raises it is not visible here
 */
public static JavaPairRDD<GeoWaveInputKey, SimpleFeature> loadRawRDD(
    final SparkContext sc,
    final DataStorePluginOptions storeOptions,
    final RDDOptions rddOpts) throws IOException {
  // Argument validation: each missing argument is logged and reported via a null return.
  if (sc == null) {
    LOGGER.error("Must supply a valid Spark Context. Please set SparkContext and try again.");
    return null;
  }
  if (storeOptions == null) {
    LOGGER.error("Must supply input store to load. Please set storeOptions and try again.");
    return null;
  }
  if (rddOpts == null) {
    LOGGER.error("Must supply valid RDDOptions to load a rdd.");
    return null;
  }

  // Work on a copy so the Spark context's own Hadoop configuration is left untouched.
  final Configuration hadoopConf = new Configuration(sc.hadoopConfiguration());
  GeoWaveInputFormat.setStoreOptions(hadoopConf, storeOptions);

  if (rddOpts.getQuery() != null) {
    GeoWaveInputFormat.setQuery(
        hadoopConf,
        rddOpts.getQuery(),
        storeOptions.createAdapterStore(),
        storeOptions.createInternalAdapterStore(),
        storeOptions.createIndexStore());
  }

  final boolean splitsRequested =
      (rddOpts.getMinSplits() > -1) || (rddOpts.getMaxSplits() > -1);
  if (splitsRequested) {
    GeoWaveInputFormat.setMinimumSplitCount(hadoopConf, rddOpts.getMinSplits());
    GeoWaveInputFormat.setMaximumSplitCount(hadoopConf, rddOpts.getMaxSplits());
  } else {
    // Try Spark's default partition count and split the data along that; if it is not
    // set, leave the split counts unconfigured so the index strategy's default applies.
    final int sparkParallelism = sc.getConf().getInt("spark.default.parallelism", -1);
    if (sparkParallelism != -1) {
      GeoWaveInputFormat.setMinimumSplitCount(hadoopConf, sparkParallelism);
      GeoWaveInputFormat.setMaximumSplitCount(hadoopConf, sparkParallelism);
    }
  }

  final RDD<Tuple2<GeoWaveInputKey, SimpleFeature>> rawRdd =
      sc.newAPIHadoopRDD(
          hadoopConf,
          GeoWaveInputFormat.class,
          GeoWaveInputKey.class,
          SimpleFeature.class);

  return JavaPairRDD.fromJavaRDD(rawRdd.toJavaRDD());
}

Source File: GeoWaveRDDLoader.java From geowave with Apache License 2.0

4 votes

/**
 * Loads raster data from a GeoWave data store into a pair RDD keyed by {@code GeoWaveInputKey}.
 *
 * <p>Split counts come from {@code minSplits}/{@code maxSplits} when either is non-null and
 * greater than -1; otherwise the {@code spark.default.parallelism} setting is used for both
 * when it is set, and no split counts are configured when it is not.
 *
 * @param sc           Spark context; supplies the base Hadoop configuration
 * @param storeOptions data store to read from
 * @param indexName    index to query, or {@code null} to set no query
 * @param minSplits    minimum split count; may be {@code null}
 * @param maxSplits    maximum split count; may be {@code null}
 * @return the loaded pair RDD, or {@code null} (after logging an error) if {@code sc} or
 *         {@code storeOptions} is null
 * @throws IOException declared by the signature; the call that raises it is not visible here
 */
public static JavaPairRDD<GeoWaveInputKey, GridCoverage> loadRawRasterRDD(
    final SparkContext sc,
    final DataStorePluginOptions storeOptions,
    final String indexName,
    final Integer minSplits,
    final Integer maxSplits) throws IOException {
  // Argument validation: each missing argument is logged and reported via a null return.
  if (sc == null) {
    LOGGER.error("Must supply a valid Spark Context. Please set SparkContext and try again.");
    return null;
  }
  if (storeOptions == null) {
    LOGGER.error("Must supply input store to load. Please set storeOptions and try again.");
    return null;
  }

  // Work on a copy so the Spark context's own Hadoop configuration is left untouched.
  final Configuration hadoopConf = new Configuration(sc.hadoopConfiguration());
  GeoWaveInputFormat.setStoreOptions(hadoopConf, storeOptions);

  if (indexName != null) {
    GeoWaveInputFormat.setQuery(
        hadoopConf,
        QueryBuilder.newBuilder().indexName(indexName).build(),
        storeOptions.createAdapterStore(),
        storeOptions.createInternalAdapterStore(),
        storeOptions.createIndexStore());
  }

  final boolean splitsRequested =
      ((minSplits != null) && (minSplits > -1)) || ((maxSplits != null) && (maxSplits > -1));
  if (splitsRequested) {
    // Both values are passed through as given, even when only one of them triggered this branch.
    GeoWaveInputFormat.setMinimumSplitCount(hadoopConf, minSplits);
    GeoWaveInputFormat.setMaximumSplitCount(hadoopConf, maxSplits);
  } else {
    // Try Spark's default partition count and split the data along that; if it is not
    // set, leave the split counts unconfigured so the index strategy's default applies.
    final int sparkParallelism = sc.getConf().getInt("spark.default.parallelism", -1);
    if (sparkParallelism != -1) {
      GeoWaveInputFormat.setMinimumSplitCount(hadoopConf, sparkParallelism);
      GeoWaveInputFormat.setMaximumSplitCount(hadoopConf, sparkParallelism);
    }
  }

  final RDD<Tuple2<GeoWaveInputKey, GridCoverage>> rawRdd =
      sc.newAPIHadoopRDD(
          hadoopConf,
          GeoWaveInputFormat.class,
          GeoWaveInputKey.class,
          GridCoverage.class);

  return JavaPairRDD.fromJavaRDD(rawRdd.toJavaRDD());
}

Java Code Examples for org.apache.spark.api.java.JavaPairRDD#fromJavaRDD()