Java Code Examples for org.apache.spark.api.java.JavaRDD#flatMap()
The following examples show how to use
org.apache.spark.api.java.JavaRDD#flatMap().
Each snippet is taken from an open-source project; the source file and license are noted above each example.
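For orientation, the snippet below is a minimal, self-contained sketch of the call pattern the examples share: flatMap takes a FlatMapFunction that maps one input element to zero or more output elements. It assumes the Spark 2.x Java API, where the function returns an Iterator (in Spark 1.x it returned an Iterable, which is why some of the older examples below return a collection directly); the class name, input data, and master setting are illustrative only.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class FlatMapSketch {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("flatMapSketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b", "c d e"));

            // Split each line into words; flatMap flattens the per-line lists
            // into a single RDD of words.
            JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());

            List<String> collected = words.collect(); // [a, b, c, d, e]
            collected.forEach(System.out::println);
        }
    }
}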
Example 1
Source File: WordCount.java From tutorials with MIT License | 6 votes |
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage: JavaWordCount <file>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount")
        .setMaster("local");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    JavaRDD<String> words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)).iterator());
    JavaPairRDD<String, Integer> wordAsTuple = words.mapToPair(word -> new Tuple2<>(word, 1));
    JavaPairRDD<String, Integer> wordWithCount = wordAsTuple.reduceByKey((Integer i1, Integer i2) -> i1 + i2);

    List<Tuple2<String, Integer>> output = wordWithCount.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + ": " + tuple._2());
    }
    ctx.stop();
}
Example 2
Source File: Bundles.java From bunsen with Apache License 2.0 | 6 votes |
/**
 * Extracts the given resource type from the RDD of bundles and returns
 * it as a Dataset of that type, including any declared resources contained
 * to the parent resource.
 *
 * @param spark the spark session
 * @param bundles the RDD of FHIR Bundles
 * @param resourceTypeUrl the url of the resource
 * @param containedClassesUrls the list of urls of the resources contained to the parent resource
 * @return a dataset of the given resource
 */
public Dataset<Row> extractEntry(SparkSession spark,
    JavaRDD<BundleContainer> bundles,
    String resourceTypeUrl,
    List<String> containedClassesUrls) {

  FhirContext context = FhirContexts.contextFor(fhirVersion);

  SparkRowConverter converter = SparkRowConverter
      .forResource(context, resourceTypeUrl, containedClassesUrls);

  ToResourceRow resourceToRowConverter = new ToResourceRow(converter.getResourceType(),
      resourceTypeUrl,
      fhirVersion,
      converter,
      containedClassesUrls);

  JavaRDD<Row> resourceRdd = bundles.flatMap(resourceToRowConverter);

  return spark.createDataFrame(resourceRdd.rdd(), converter.getSchema());
}
Example 3
Source File: FlatMap.java From SparkDemo with MIT License | 6 votes |
private static void flatMap(JavaSparkContext sc) {
    List<String> data = Arrays.asList("aa,bb,cc", "cxf,spring,struts2", "java,C++,javaScript");
    JavaRDD<String> rddData = sc.parallelize(data);

    FlatMapFunction<String, String> flatMapFunction = new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String s) throws Exception {
            List<String> list = Arrays.asList(s.split(","));
            return list.iterator();
        }
    };
    JavaRDD<String> flatMapData = rddData.flatMap(flatMapFunction);

    flatMapData.foreach(new VoidFunction<String>() {
        @Override
        public void call(String v) throws Exception {
            System.out.println(v);
        }
    });

    sc.close();
}
Example 4
Source File: MapTest.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    JavaSparkContext sc = SparkUtils.getLocalSparkContext(MapTest.class);

    List<String> list = Arrays.asList("hello,bjsxt", "hello,xuruyun");
    JavaRDD<String> linesRDD = sc.parallelize(list);

    // map: each input line yields exactly one output element (here a String[]),
    // so the resulting RDD still has one element per line
    JavaRDD<Object> mapRDD = linesRDD.map(new Function<String, Object>() {
        @Override
        public Object call(String v1) throws Exception {
            return v1.split(",");
        }
    });

    // flatMap: each input line is expanded into the individual tokens it contains
    JavaRDD<String> flatMapRDD = linesRDD.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String t) throws Exception {
            return Arrays.asList(t.split(",")).iterator();
        }
    });

    List<Object> collect = mapRDD.collect(); // collect() is an action and triggers execution
    for (Object obj : collect) {
        System.out.println(obj);
    }

    List<String> collect2 = flatMapRDD.collect(); // collect() is an action and triggers execution
    for (String s : collect2) {
        System.out.println(s);
    }
}
Example 5
Source File: ValueSets.java From bunsen with Apache License 2.0 | 5 votes |
/**
 * Returns a new ValueSets instance that includes the given value sets.
 *
 * @param valueSets the value sets to add to the returned collection.
 * @return a new ValueSets instance with the added value sets.
 */
@Override
public ValueSets withValueSets(Dataset<Row> valueSets) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(valueSets);

  // Ensure that there are no duplicates among the value sets
  if (hasDuplicateUrlAndVersions(newMembers) || valueSets.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add value sets having duplicate valueSetUri and valueSetVersion");
  }

  JavaRDD<Row> valueSetsRdd = valueSets.javaRDD();

  // The value set concepts will be stored in the values table for persistence, so we remove
  // them from the individual value sets. This can be done most easily by setting concepts to an
  // empty list.
  JavaRDD<Row> withoutConceptsRdd = valueSetsRdd.map(new RemoveConcepts(fhirVersion));

  Dataset<Row> withoutConcepts = spark.createDataFrame(withoutConceptsRdd,
      valueSetRowConverter.getSchema());

  JavaRDD<Value> newValuesRdd = valueSetsRdd.flatMap(new ExtractValues(fhirVersion));

  Dataset<Value> newValues = spark.createDataset(newValuesRdd.rdd(), getValueEncoder());

  return withValueSets(withoutConcepts, newValues);
}
Example 6
Source File: Chapter4.java From sparkResearch with Apache License 2.0 | 5 votes |
/**
 * Use flatMap to split strings.
 */
public void flatMap(JavaSparkContext sparkContext) {
    JavaRDD<String> lines = sparkContext.parallelize(Arrays.asList("hello world", "hi"));
    JavaRDD<String> flatMapResult = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String s) throws Exception {
            return Arrays.asList(PATTERN.split(s)).iterator();
        }
    });
    flatMapResult.first();
    // result: hello
}
Example 7
Source File: TransformationRDD.java From hui-bigdata-spark with Apache License 2.0 | 5 votes |
/**
 * Element transformation: a single element is mapped to an array of elements.
 * Purpose of the demo: split each subway-station record to obtain the array fields
 * 1. departure station 2. destination station 3. number of stations passed 4. distance.
 *
 * @since hui_project 1.0.0
 */
public void testFlatMap() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
    JavaRDD<String> splitRDD = textRDD
            .flatMap(x -> Arrays.asList(x.split(",")).iterator());
    checkResult(splitRDD.collect());
}
Example 8
Source File: SparkBatchPortablePipelineTranslator.java From beam with Apache License 2.0 | 5 votes |
private static <K, V> void translateGroupByKey(
    PTransformNode transformNode, RunnerApi.Pipeline pipeline, SparkTranslationContext context) {

  RunnerApi.Components components = pipeline.getComponents();
  String inputId = getInputId(transformNode);
  Dataset inputDataset = context.popDataset(inputId);
  JavaRDD<WindowedValue<KV<K, V>>> inputRdd = ((BoundedDataset<KV<K, V>>) inputDataset).getRDD();
  WindowedValueCoder<KV<K, V>> inputCoder = getWindowedValueCoder(inputId, components);
  KvCoder<K, V> inputKvCoder = (KvCoder<K, V>) inputCoder.getValueCoder();
  Coder<K> inputKeyCoder = inputKvCoder.getKeyCoder();
  Coder<V> inputValueCoder = inputKvCoder.getValueCoder();
  WindowingStrategy windowingStrategy = getWindowingStrategy(inputId, components);
  WindowFn<Object, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
  WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(inputValueCoder, windowFn.windowCoder());

  JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKeyAndWindow;
  Partitioner partitioner = getPartitioner(context);
  if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
    // we can have a memory sensitive translation for non-merging windows
    groupedByKeyAndWindow =
        GroupNonMergingWindowsFunctions.groupByKeyAndWindow(
            inputRdd, inputKeyCoder, inputValueCoder, windowingStrategy, partitioner);
  } else {
    JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly =
        GroupCombineFunctions.groupByKeyOnly(inputRdd, inputKeyCoder, wvCoder, partitioner);
    // for batch, GroupAlsoByWindow uses an in-memory StateInternals.
    groupedByKeyAndWindow =
        groupedByKeyOnly.flatMap(
            new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                windowingStrategy,
                new TranslationUtils.InMemoryStateInternalsFactory<>(),
                SystemReduceFn.buffering(inputValueCoder),
                context.serializablePipelineOptions));
  }
  context.pushDataset(getOutputId(transformNode), new BoundedDataset<>(groupedByKeyAndWindow));
}
Example 9
Source File: TransformTranslator.java From beam with Apache License 2.0 | 4 votes |
private static <K, V, W extends BoundedWindow> TransformEvaluator<GroupByKey<K, V>> groupByKey() {
  return new TransformEvaluator<GroupByKey<K, V>>() {
    @Override
    public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked")
      JavaRDD<WindowedValue<KV<K, V>>> inRDD =
          ((BoundedDataset<KV<K, V>>) context.borrowDataset(transform)).getRDD();
      final KvCoder<K, V> coder = (KvCoder<K, V>) context.getInput(transform).getCoder();
      @SuppressWarnings("unchecked")
      final WindowingStrategy<?, W> windowingStrategy =
          (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();

      // --- coders.
      final Coder<K> keyCoder = coder.getKeyCoder();
      final WindowedValue.WindowedValueCoder<V> wvCoder =
          WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());

      JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKey;
      Partitioner partitioner = getPartitioner(context);
      if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
        // we can have a memory sensitive translation for non-merging windows
        groupedByKey =
            GroupNonMergingWindowsFunctions.groupByKeyAndWindow(
                inRDD, keyCoder, coder.getValueCoder(), windowingStrategy, partitioner);
      } else {
        // --- group by key only.
        JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly =
            GroupCombineFunctions.groupByKeyOnly(inRDD, keyCoder, wvCoder, partitioner);

        // --- now group also by window.
        // for batch, GroupAlsoByWindow uses an in-memory StateInternals.
        groupedByKey =
            groupedByKeyOnly.flatMap(
                new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                    windowingStrategy,
                    new TranslationUtils.InMemoryStateInternalsFactory<>(),
                    SystemReduceFn.buffering(coder.getValueCoder()),
                    context.getSerializableOptions()));
      }
      context.putDataset(transform, new BoundedDataset<>(groupedByKey));
    }

    @Override
    public String toNativeString() {
      return "groupByKey()";
    }
  };
}
Example 10
Source File: EntitySchemaCollector.java From rdf2x with Apache License 2.0 | 4 votes |
/**
 * Reduce a RDD of {@link Instance}s into a map of [type index -> list of its {@link Predicate}s
 * and their properties (occurrences, is multiple)]
 *
 * @param instances  a RDD of {@link Instance}s
 * @param typeCounts map of type indexes to counts of their instances
 * @return map of [type index -> list of its {@link Predicate}s and their properties (occurrences, is multiple)]
 */
private Map<Integer, List<EntityProperty>> getDistinctEntityProperties(JavaRDD<Instance> instances, Map<Integer, Long> typeCounts) {
    // all triples of (instance type, instance predicate, is multiple valued predicate)
    JavaRDD<Tuple3<Integer, Predicate, Boolean>> typePredicates = instances.flatMap(instance -> {
        Set<Predicate> predicates = instance.getLiteralPredicates();
        return instance.getTypes().stream()
                .flatMap(typeInt -> predicates.stream()
                        .map(predicate -> new Tuple3<>(
                                typeInt, // type index
                                predicate, // predicate
                                instance.getLiteralValue(predicate) instanceof Set // is multiple valued
                        ))
                ).collect(Collectors.toList());
    });

    return typePredicates
            .mapToPair(typePredicate -> new Tuple2<>(
                    new Tuple2<>(typePredicate._1(), typePredicate._2()), // predicate in type
                    new Tuple2<>(1L, typePredicate._3()) // count, is multiple valued
            ))
            // get properties of each predicate in a specific type (will become a column)
            .reduceByKey((a, b) -> new Tuple2<>(
                    a._1() + b._1(), // sum counts
                    a._2() || b._2() // is multiple if it is multiple in any instance
            ))
            // collect to Java list
            .collect().stream()
            // group by type -> list of predicates and their properties
            .collect(Collectors.groupingBy(
                    typePredicate -> typePredicate._1()._1(),
                    Collectors.mapping(
                            typePredicate -> new EntityProperty(
                                    typePredicate._1()._2(), // predicate index
                                    typePredicate._2()._2(), // is multiple
                                    typePredicate._2()._1() / ((double) typeCounts.get(typePredicate._1()._1())) // non-null ratio
                            ),
                            Collectors.toList())
            ));
}
Example 11
Source File: InstanceRelationWriter.java From rdf2x with Apache License 2.0 | 4 votes |
/**
 * Persist the Entity Attribute Value table
 *
 * @param entitySchema entity schema
 * @param instances    RDD of {@link Instance}s
 */
public void writeEntityAttributeValueTable(EntitySchema entitySchema, JavaRDD<Instance> instances) {
    IndexMap<String> typeIndex = rdfSchema.getTypeIndex();

    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ID_COLUMN_NAME, DataTypes.LongType, false));
    fields.add(DataTypes.createStructField(PREDICATE_COLUMN_NAME, DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField(EAV_DATATYPE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_LANGUAGE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_VALUE_COLUMN_NAME, DataTypes.StringType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, ID_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, PREDICATE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_DATATYPE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_LANGUAGE_COLUMN_NAME));

    // get map of type index -> set of attributes
    Map<Integer, Set<Predicate>> typeEavPredicates = entitySchema.getTables().stream()
            .collect(Collectors.toMap(
                    table -> typeIndex.getIndex(table.getTypeURI()),
                    table -> table.getAttributes().stream()
                            .map(EntityProperty::getPredicate)
                            .collect(Collectors.toSet())
            ));

    // get all entity attribute values
    JavaRDD<Row> rowRDD = instances.flatMap(instance -> instance.getLiteralPredicates().stream()
            // filter predicates that are in the EAV set of at least one of the instance types
            .filter(predicate -> instance.getTypes().stream().anyMatch(type ->
                    typeEavPredicates.containsKey(type) && // type could have been removed (not enough rows, ...)
                            typeEavPredicates.get(type).contains(predicate)
            ))
            // map to row of values
            .flatMap(predicate -> {
                Object value = instance.getLiteralValue(predicate);
                if (value instanceof Set) {
                    // return a row for each single value
                    return ((Set<Object>) value).stream().map(val -> getAttributeRow(instance, predicate, val));
                }
                return Stream.of(getAttributeRow(instance, predicate, value));
            })
            .collect(Collectors.toList())
    );

    int predicateCount = typeEavPredicates.values().stream().collect(Collectors.summingInt(Set::size));

    // create and write the dataframe
    log.info("Writing EAV table of {} predicates", predicateCount);
    DataFrame df = sql.createDataFrame(rowRDD, schema);
    persistor.writeDataFrame(EAV_TABLE_NAME, df);
    log.info("Creating indexes for EAV table");
    persistor.createIndexes(indexes);
    df.unpersist();
}
Example 12
Source File: DataFrames.java From deeplearning4j with Apache License 2.0 | 3 votes |
/**
 * Convert the given sequence data set to a DataFrame.<br>
 * <b>Note</b>: The resulting DataFrame has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN}) - an index (integer, starting at 0) for the position
 * of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(Dataset<Row>)}
 *
 * @param schema Schema for the data
 * @param data   Sequence data to convert to a DataFrame
 * @return The dataframe object
 */
public static Dataset<Row> toDataFrameSequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());
    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.flatMap(new SequenceToRows(schema));
    return sqlContext.createDataFrame(rows, fromSchemaSequence(schema));
}
Example 13
Source File: DataFrames.java From DataVec with Apache License 2.0 | 3 votes |
/**
 * Convert the given sequence data set to a DataFrame.<br>
 * <b>Note</b>: The resulting DataFrame has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN}) - an index (integer, starting at 0) for the position
 * of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(DataRowsFacade)}
 *
 * @param schema Schema for the data
 * @param data   Sequence data to convert to a DataFrame
 * @return The dataframe object
 */
public static DataRowsFacade toDataFrameSequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());
    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.flatMap(new SequenceToRows(schema));
    return dataRows(sqlContext.createDataFrame(rows, fromSchemaSequence(schema)));
}
Example 14
Source File: AnalyzeSpark.java From deeplearning4j with Apache License 2.0 | 2 votes |
/**
 * Randomly sample values from a single column, in all sequences.
 * Values may be taken from any sequence (i.e., sequence order is not preserved)
 *
 * @param count        Number of values to sample
 * @param columnName   Name of the column to sample from
 * @param schema       Schema
 * @param sequenceData Data to sample from
 * @return A list of random samples
 */
public static List<Writable> sampleFromColumnSequence(int count, String columnName, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return sampleFromColumn(count, columnName, schema, flattenedSequence);
}
Example 15
Source File: AnalyzeSpark.java From deeplearning4j with Apache License 2.0 | 2 votes |
/**
 * Get a list of unique values from the specified column of a sequence
 *
 * @param columnName   Name of the column to get unique values from
 * @param schema       Data schema
 * @param sequenceData Sequence data to get unique values from
 * @return List of unique values for the column
 */
public static List<Writable> getUniqueSequence(String columnName, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return getUnique(columnName, schema, flattenedSequence);
}
Example 16
Source File: AnalyzeSpark.java From deeplearning4j with Apache License 2.0 | 2 votes |
/**
 * Analyze the data quality of sequence data - provides a report on missing values, values that don't comply
 * with the schema, etc
 *
 * @param schema Schema for data
 * @param data   Data to analyze
 * @return DataQualityAnalysis object
 */
public static DataQualityAnalysis analyzeQualitySequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
    JavaRDD<List<Writable>> fmSeq = data.flatMap(new SequenceFlatMapFunction());
    return analyzeQuality(schema, fmSeq);
}
Example 17
Source File: ContextExtractor.java From vn.vitk with GNU General Public License v3.0 | 2 votes |
/**
 * Extracts a RDD of labeled contexts from a RDD of rows where each row
 * has two string cells containing a word sequence and a tag sequence.
 *
 * @param dataset
 * @return a RDD of labeled contexts
 */
public JavaRDD<LabeledContext> extract(JavaRDD<Row> dataset) {
    return dataset.flatMap(new RowToContextFunction());
}
Example 18
Source File: AnalyzeSpark.java From DataVec with Apache License 2.0 | 2 votes |
/**
 * Randomly sample values from a single column, in all sequences.
 * Values may be taken from any sequence (i.e., sequence order is not preserved)
 *
 * @param count        Number of values to sample
 * @param columnName   Name of the column to sample from
 * @param schema       Schema
 * @param sequenceData Data to sample from
 * @return A list of random samples
 */
public static List<Writable> sampleFromColumnSequence(int count, String columnName, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return sampleFromColumn(count, columnName, schema, flattenedSequence);
}
Example 19
Source File: AnalyzeSpark.java From DataVec with Apache License 2.0 | 2 votes |
/**
 * Get a list of unique values from the specified column of a sequence
 *
 * @param columnName   Name of the column to get unique values from
 * @param schema       Data schema
 * @param sequenceData Sequence data to get unique values from
 * @return List of unique values for the column
 */
public static List<Writable> getUniqueSequence(String columnName, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return getUnique(columnName, schema, flattenedSequence);
}
Example 20
Source File: AnalyzeSpark.java From DataVec with Apache License 2.0 | 2 votes |
/**
 * Get a list of unique values from the specified columns of a sequence
 *
 * @param columnNames  Names of the columns to get unique values from
 * @param schema       Data schema
 * @param sequenceData Sequence data to get unique values from
 * @return Map of column name to list of unique values
 */
public static Map<String, List<Writable>> getUniqueSequence(List<String> columnNames, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return getUnique(columnNames, schema, flattenedSequence);
}