Java Code Examples for org.apache.spark.api.java.JavaRDD#flatMap()

The following examples show how to use org.apache.spark.api.java.JavaRDD#flatMap(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File:    From tutorials with MIT License 6 votes vote down vote up
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage: JavaWordCount <file>");
    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount")
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    JavaRDD<String> words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)).iterator());
    JavaPairRDD<String, Integer> wordAsTuple = words.mapToPair(word -> new Tuple2<>(word, 1));
    JavaPairRDD<String, Integer> wordWithCount = wordAsTuple.reduceByKey((Integer i1, Integer i2)->i1 + i2);
    List<Tuple2<String, Integer>> output = wordWithCount.collect();
    for (Tuple2<?, ?> tuple : output) {
         System.out.println(tuple._1() + ": " + tuple._2());
Example 2
Source File:    From bunsen with Apache License 2.0 6 votes vote down vote up
 * Extracts the given resource type from the RDD of bundles and returns
 * it as a Dataset of that type, including any declared resources contained
 * to the parent resource.
 * @param spark the spark session
 * @param bundles the RDD of FHIR Bundles
 * @param resourceTypeUrl the url of the resource
 * @param containedClassesUrls the list of urls of the resources contained to the parent resource
 * @return a dataset of the given resource
public Dataset<Row> extractEntry(SparkSession spark, JavaRDD<BundleContainer> bundles,
    String resourceTypeUrl, List<String> containedClassesUrls) {

  FhirContext context = FhirContexts.contextFor(fhirVersion);

  SparkRowConverter converter = SparkRowConverter
      .forResource(context, resourceTypeUrl, containedClassesUrls);

  ToResourceRow resourceToRowConverter = new ToResourceRow(converter.getResourceType(),

  JavaRDD<Row> resourceRdd = bundles.flatMap(resourceToRowConverter);

  return spark.createDataFrame(resourceRdd.rdd(), converter.getSchema());
Example 3
Source File:    From SparkDemo with MIT License 6 votes vote down vote up
private static void flatMap(JavaSparkContext sc) {
	List<String> data = Arrays.asList("aa,bb,cc", "cxf,spring,struts2", "java,C++,javaScript");
	JavaRDD<String> rddData = sc.parallelize(data);

	FlatMapFunction<String, String> flatMapFunction=new FlatMapFunction<String, String>() {
		public Iterator<String> call(String s) throws Exception {
			List<String> list = Arrays.asList(s.split(","));
			return list.iterator();
	JavaRDD<String> flatMapData = rddData.flatMap(flatMapFunction);

	flatMapData.foreach(new VoidFunction<String>() {
		public void call(String v) throws Exception {

Example 4
Source File:    From SparkDemo with MIT License 5 votes vote down vote up
public static void main(String[] args) {
	JavaSparkContext sc = SparkUtils.getLocalSparkContext(MapTest.class);

	List<String> list = Arrays.asList("hello,bjsxt", "hello,xuruyun");

	JavaRDD<String> linesRDD = sc.parallelize(list);

	JavaRDD<Object> mapRDD = Function<String, Object>() {

		public Object call(String v1) throws Exception {
			return v1.split(",");

	JavaRDD<String> flatMapRDD = linesRDD.flatMap(new FlatMapFunction<String, String>() {

		public Iterator<String> call(String t) throws Exception {
			// TODO Auto-generated method stub
			return Arrays.asList(t.split(",")).iterator();

	List<Object> collect = mapRDD.collect(); // Action算子 触发执行
	for (Object obj : collect) {

	List<String> collect2 = flatMapRDD.collect(); // Action算子 触发执行
	for (String s : collect2) {
Example 5
Source File:    From bunsen with Apache License 2.0 5 votes vote down vote up
 * Returns a new ValueSets instance that includes the given value sets.
 * @param valueSets the value sets to add to the returned collection.
 * @return a new ValueSets instance with the added value sets.
public ValueSets withValueSets(Dataset<Row> valueSets) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(valueSets);

  // Ensure that there are no duplicates among the value sets
  if (hasDuplicateUrlAndVersions(newMembers) || valueSets.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add value sets having duplicate valueSetUri and valueSetVersion");

  JavaRDD<Row> valueSetsRdd = valueSets.javaRDD();

  // The value set concepts will be stored in the values table for persistence, so we remove
  // them from the individual value sets. This can be done most easily by setting concepts to an
  // empty list.
  JavaRDD<Row> withoutConceptsRdd = RemoveConcepts(fhirVersion));

  Dataset<Row> withoutConcepts = spark.createDataFrame(withoutConceptsRdd,

  JavaRDD<Value> newValuesRdd = valueSetsRdd.flatMap(new ExtractValues(fhirVersion));

  Dataset<Value> newValues = spark.createDataset(newValuesRdd.rdd(), getValueEncoder());

  return withValueSets(withoutConcepts, newValues);
Example 6
Source File:    From sparkResearch with Apache License 2.0 5 votes vote down vote up
 * flatMap分割字符串
public void flatMap(JavaSparkContext sparkContext){
    JavaRDD<String> lines = sparkContext.parallelize(Arrays.asList("hello world", "hi"));

    JavaRDD<String> flatMapResult  = lines.flatMap(new FlatMapFunction<String, String>() {
        public Iterator<String> call(String s) throws Exception {
            return Arrays.asList(PATTERN.split(s)).iterator();


Example 7
Source File:    From hui-bigdata-spark with Apache License 2.0 5 votes vote down vote up
 * 元素转换. 参数->数组参数
 * demo计算目的:获取地铁站信息切分后 获取数组信息1.出发站 2.终点站 3.经历站点数 4.距离
 * @since hui_project 1.0.0
public void testFlatMap() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
    JavaRDD<String> splitRDD = textRDD
            .flatMap(x -> Arrays.asList(x.split(",")).iterator());
Example 8
Source File:    From beam with Apache License 2.0 5 votes vote down vote up
private static <K, V> void translateGroupByKey(
    PTransformNode transformNode, RunnerApi.Pipeline pipeline, SparkTranslationContext context) {

  RunnerApi.Components components = pipeline.getComponents();
  String inputId = getInputId(transformNode);
  Dataset inputDataset = context.popDataset(inputId);
  JavaRDD<WindowedValue<KV<K, V>>> inputRdd = ((BoundedDataset<KV<K, V>>) inputDataset).getRDD();
  WindowedValueCoder<KV<K, V>> inputCoder = getWindowedValueCoder(inputId, components);
  KvCoder<K, V> inputKvCoder = (KvCoder<K, V>) inputCoder.getValueCoder();
  Coder<K> inputKeyCoder = inputKvCoder.getKeyCoder();
  Coder<V> inputValueCoder = inputKvCoder.getValueCoder();
  WindowingStrategy windowingStrategy = getWindowingStrategy(inputId, components);
  WindowFn<Object, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
  WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(inputValueCoder, windowFn.windowCoder());

  JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKeyAndWindow;
  Partitioner partitioner = getPartitioner(context);
  if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
    // we can have a memory sensitive translation for non-merging windows
    groupedByKeyAndWindow =
            inputRdd, inputKeyCoder, inputValueCoder, windowingStrategy, partitioner);
  } else {
    JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly =
        GroupCombineFunctions.groupByKeyOnly(inputRdd, inputKeyCoder, wvCoder, partitioner);
    // for batch, GroupAlsoByWindow uses an in-memory StateInternals.
    groupedByKeyAndWindow =
            new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                new TranslationUtils.InMemoryStateInternalsFactory<>(),
  context.pushDataset(getOutputId(transformNode), new BoundedDataset<>(groupedByKeyAndWindow));
Example 9
Source File:    From beam with Apache License 2.0 4 votes vote down vote up
private static <K, V, W extends BoundedWindow> TransformEvaluator<GroupByKey<K, V>> groupByKey() {
  return new TransformEvaluator<GroupByKey<K, V>>() {
    public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
      JavaRDD<WindowedValue<KV<K, V>>> inRDD =
          ((BoundedDataset<KV<K, V>>) context.borrowDataset(transform)).getRDD();
      final KvCoder<K, V> coder = (KvCoder<K, V>) context.getInput(transform).getCoder();
      final WindowingStrategy<?, W> windowingStrategy =
          (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
      final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();

      // --- coders.
      final Coder<K> keyCoder = coder.getKeyCoder();
      final WindowedValue.WindowedValueCoder<V> wvCoder =
          WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());

      JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKey;
      Partitioner partitioner = getPartitioner(context);
      if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
        // we can have a memory sensitive translation for non-merging windows
        groupedByKey =
                inRDD, keyCoder, coder.getValueCoder(), windowingStrategy, partitioner);
      } else {
        // --- group by key only.
        JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly =
            GroupCombineFunctions.groupByKeyOnly(inRDD, keyCoder, wvCoder, partitioner);

        // --- now group also by window.
        // for batch, GroupAlsoByWindow uses an in-memory StateInternals.
        groupedByKey =
                new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                    new TranslationUtils.InMemoryStateInternalsFactory<>(),
      context.putDataset(transform, new BoundedDataset<>(groupedByKey));

    public String toNativeString() {
      return "groupByKey()";
Example 10
Source File:    From rdf2x with Apache License 2.0 4 votes vote down vote up
 * Reduce a RDD of {@link Instance}s into a map of [type index -&gt; list of its {@link Predicate}s and their properties (occurrences, is multiple)]
 * @param instances  a RDD of {@link Instance}s
 * @param typeCounts map of type indexes to counts of their instances
 * @return map of [type index -&gt; list of its {@link Predicate}s and their properties (occurrences, is multiple)]
private Map<Integer, List<EntityProperty>> getDistinctEntityProperties(JavaRDD<Instance> instances, Map<Integer, Long> typeCounts) {

    // all triples of (instance type, instance predicate, is multiple valued predicate)
    JavaRDD<Tuple3<Integer, Predicate, Boolean>> typePredicates = instances.flatMap(instance -> {
        Set<Predicate> predicates = instance.getLiteralPredicates();
        return instance.getTypes().stream()
                .flatMap(typeInt ->
                        .map(predicate -> new Tuple3<>(
                                typeInt, // type index
                                predicate, // predicate
                                instance.getLiteralValue(predicate) instanceof Set // is multiple valued

    return typePredicates
            .mapToPair(typePredicate -> new Tuple2<>(
                            new Tuple2<>(typePredicate._1(), typePredicate._2()), // predicate in type
                            new Tuple2<>(1L, typePredicate._3()) // count, is multiple valued
            // get properties of each predicate in a specific type (will become a column)
            .reduceByKey((a, b) -> new Tuple2<>(
                    a._1() + b._1(), // sum counts
                    a._2() || b._2() // is multiple if it is multiple in any instance
            // collect to Java list
            // group by type -> list of predicates and their properties
                    typePredicate -> typePredicate._1()._1(),
                            typePredicate -> new EntityProperty(
                                    typePredicate._1()._2(), // predicate index
                                    typePredicate._2()._2(), // is multiple
                                    typePredicate._2()._1() / ((double) typeCounts.get(typePredicate._1()._1())) // non-null ratio

Example 11
Source File:    From rdf2x with Apache License 2.0 4 votes vote down vote up
 * Persist the Entity Attribute Value table
 * @param entitySchema entity schema
 * @param instances    RDD of {@link Instance}s
public void writeEntityAttributeValueTable(EntitySchema entitySchema, JavaRDD<Instance> instances) {

    IndexMap<String> typeIndex = rdfSchema.getTypeIndex();
    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ID_COLUMN_NAME, DataTypes.LongType, false));
    fields.add(DataTypes.createStructField(PREDICATE_COLUMN_NAME, DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField(EAV_DATATYPE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_LANGUAGE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_VALUE_COLUMN_NAME, DataTypes.StringType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, ID_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, PREDICATE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_DATATYPE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_LANGUAGE_COLUMN_NAME));

    // get map of type index -> set of attributes
    Map<Integer, Set<Predicate>> typeEavPredicates = entitySchema.getTables().stream()
                    table -> typeIndex.getIndex(table.getTypeURI()),
                    table -> table.getAttributes().stream()

    // get all entity attribute values
    JavaRDD<Row> rowRDD = instances.flatMap(instance ->
                    // filter predicates that are in the EAV set of at least one of the instance types
                    .filter(predicate -> instance.getTypes().stream().anyMatch(type ->
                            typeEavPredicates.containsKey(type) && // type could have been removed (not enough rows, ...)
                    // map to row of values
                    .flatMap(predicate -> {
                                Object value = instance.getLiteralValue(predicate);
                                if (value instanceof Set) {
                                    // return a row for each single value
                                    return ((Set<Object>) value).stream().map(val -> getAttributeRow(instance, predicate, val));
                                return Stream.of(getAttributeRow(instance, predicate, value));//getAttributeRow(instance, predicate, value)

    int predicateCount = typeEavPredicates.values().stream().collect(Collectors.summingInt(Set::size));

    // create and write the dataframe"Writing EAV table of {} predicates", predicateCount);
    DataFrame df = sql.createDataFrame(rowRDD, schema);
    persistor.writeDataFrame(EAV_TABLE_NAME, df);"Creating indexes for EAV table");
Example 12
Source File:    From deeplearning4j with Apache License 2.0 3 votes vote down vote up
 * Convert the given sequence data set to a DataFrame.<br>
 * <b>Note</b>: The resulting DataFrame has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN} - an index (integer, starting at 0) for the position
 * of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(Dataset<Row>)}
 * @param schema Schema for the data
 * @param data   Sequence data to convert to a DataFrame
 * @return The dataframe object
public static Dataset<Row> toDataFrameSequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());

    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.flatMap(new SequenceToRows(schema));
    return sqlContext.createDataFrame(rows, fromSchemaSequence(schema));
Example 13
Source File:    From DataVec with Apache License 2.0 3 votes vote down vote up
 * Convert the given sequence data set to a DataFrame.<br>
 * <b>Note</b>: The resulting DataFrame has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN} - an index (integer, starting at 0) for the position
 * of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(DataRowsFacade)}
 * @param schema Schema for the data
 * @param data   Sequence data to convert to a DataFrame
 * @return The dataframe object
public static DataRowsFacade toDataFrameSequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());

    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.flatMap(new SequenceToRows(schema));
    return dataRows(sqlContext.createDataFrame(rows, fromSchemaSequence(schema)));
Example 14
Source File:    From deeplearning4j with Apache License 2.0 2 votes vote down vote up
 * Randomly sample values from a single column, in all sequences.
 * Values may be taken from any sequence (i.e., sequence order is not preserved)
 * @param count         Number of values to sample
 * @param columnName    Name of the column to sample from
 * @param schema        Schema
 * @param sequenceData  Data to sample from
 * @return              A list of random samples
public static List<Writable> sampleFromColumnSequence(int count, String columnName, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return sampleFromColumn(count, columnName, schema, flattenedSequence);
Example 15
Source File:    From deeplearning4j with Apache License 2.0 2 votes vote down vote up
 * Get a list of unique values from the specified column of a sequence
 * @param columnName      Name of the column to get unique values from
 * @param schema          Data schema
 * @param sequenceData    Sequence data to get unique values from
 * @return
public static List<Writable> getUniqueSequence(String columnName, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return getUnique(columnName, schema, flattenedSequence);
Example 16
Source File:    From deeplearning4j with Apache License 2.0 2 votes vote down vote up
 * Analyze the data quality of sequence data - provides a report on missing values, values that don't comply with schema, etc
 * @param schema Schema for data
 * @param data   Data to analyze
 * @return DataQualityAnalysis object
public static DataQualityAnalysis analyzeQualitySequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
    JavaRDD<List<Writable>> fmSeq = data.flatMap(new SequenceFlatMapFunction());
    return analyzeQuality(schema, fmSeq);
Example 17
Source File:    From vn.vitk with GNU General Public License v3.0 2 votes vote down vote up
 * Extracts a RDD of labeled contexts from a RDD of rows where each row 
 * has two string cells containing a word sequence and a tag sequence. 
 * @param dataset
 * @return a RDD of labeled contexts
public JavaRDD<LabeledContext> extract(JavaRDD<Row> dataset) {
	return dataset.flatMap(new RowToContextFunction()); 
Example 18
Source File:    From DataVec with Apache License 2.0 2 votes vote down vote up
 * Randomly sample values from a single column, in all sequences.
 * Values may be taken from any sequence (i.e., sequence order is not preserved)
 * @param count         Number of values to sample
 * @param columnName    Name of the column to sample from
 * @param schema        Schema
 * @param sequenceData  Data to sample from
 * @return              A list of random samples
public static List<Writable> sampleFromColumnSequence(int count, String columnName, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return sampleFromColumn(count, columnName, schema, flattenedSequence);
Example 19
Source File:    From DataVec with Apache License 2.0 2 votes vote down vote up
 * Get a list of unique values from the specified column of a sequence
 * @param columnName      Name of the column to get unique values from
 * @param schema          Data schema
 * @param sequenceData    Sequence data to get unique values from
 * @return
public static List<Writable> getUniqueSequence(String columnName, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return getUnique(columnName, schema, flattenedSequence);
Example 20
Source File:    From DataVec with Apache License 2.0 2 votes vote down vote up
 * Get a list of unique values from the specified columns of a sequence
 * @param columnNames     Name of the columns to get unique values from
 * @param schema          Data schema
 * @param sequenceData    Sequence data to get unique values from
 * @return
public static Map<String,List<Writable>> getUniqueSequence(List<String> columnNames, Schema schema,
                                               JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return getUnique(columnNames, schema, flattenedSequence);