Java Code Examples for org.apache.spark.api.java.JavaRDD#flatMap()
The following examples show how to use
org.apache.spark.api.java.JavaRDD#flatMap().
Each snippet is taken from an open-source project; the source file and license are noted above each example.
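For orientation, the snippet below is a minimal, self-contained sketch of the call pattern the examples share: flatMap takes a FlatMapFunction that maps one input element to zero or more output elements. It assumes the Spark 2.x Java API, where the function returns an Iterator (in Spark 1.x it returned an Iterable, which is why some of the older examples below return a collection directly); the class name, input data, and master setting are illustrative only.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class FlatMapSketch {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("flatMapSketch").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b", "c d e"));

            // Split each line into words; flatMap flattens the per-line lists
            // into a single RDD of words.
            JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());

            List<String> collected = words.collect(); // [a, b, c, d, e]
            collected.forEach(System.out::println);
        }
    }
}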
Example 1
Source File: WordCount.java From tutorials with MIT License | 6 votes |
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage: JavaWordCount <file>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount")
        .setMaster("local");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    JavaRDD<String> words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)).iterator());
    JavaPairRDD<String, Integer> wordAsTuple = words.mapToPair(word -> new Tuple2<>(word, 1));
    JavaPairRDD<String, Integer> wordWithCount = wordAsTuple.reduceByKey((Integer i1, Integer i2) -> i1 + i2);

    List<Tuple2<String, Integer>> output = wordWithCount.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + ": " + tuple._2());
    }
    ctx.stop();
}
Example 2
Source File: Bundles.java From bunsen with Apache License 2.0 | 6 votes |
/**
 * Extracts the given resource type from the RDD of bundles and returns
 * it as a Dataset of that type, including any declared resources contained
 * to the parent resource.
 *
 * @param spark the spark session
 * @param bundles the RDD of FHIR Bundles
 * @param resourceTypeUrl the url of the resource
 * @param containedClassesUrls the list of urls of the resources contained to the parent resource
 * @return a dataset of the given resource
 */
public Dataset<Row> extractEntry(SparkSession spark,
    JavaRDD<BundleContainer> bundles,
    String resourceTypeUrl,
    List<String> containedClassesUrls) {

  FhirContext context = FhirContexts.contextFor(fhirVersion);

  SparkRowConverter converter = SparkRowConverter
      .forResource(context, resourceTypeUrl, containedClassesUrls);

  ToResourceRow resourceToRowConverter = new ToResourceRow(converter.getResourceType(),
      resourceTypeUrl,
      fhirVersion,
      converter,
      containedClassesUrls);

  JavaRDD<Row> resourceRdd = bundles.flatMap(resourceToRowConverter);

  return spark.createDataFrame(resourceRdd.rdd(), converter.getSchema());
}
Example 3
Source File: FlatMap.java From SparkDemo with MIT License | 6 votes |
private static void flatMap(JavaSparkContext sc) {
    List<String> data = Arrays.asList("aa,bb,cc", "cxf,spring,struts2", "java,C++,javaScript");
    JavaRDD<String> rddData = sc.parallelize(data);

    FlatMapFunction<String, String> flatMapFunction = new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String s) throws Exception {
            List<String> list = Arrays.asList(s.split(","));
            return list.iterator();
        }
    };
    JavaRDD<String> flatMapData = rddData.flatMap(flatMapFunction);

    flatMapData.foreach(new VoidFunction<String>() {
        @Override
        public void call(String v) throws Exception {
            System.out.println(v);
        }
    });

    sc.close();
}
Example 4
Source File: MapTest.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    JavaSparkContext sc = SparkUtils.getLocalSparkContext(MapTest.class);

    List<String> list = Arrays.asList("hello,bjsxt", "hello,xuruyun");
    JavaRDD<String> linesRDD = sc.parallelize(list);

    // map: each input line yields exactly one output element (here a String[]),
    // so the resulting RDD still has one element per line
    JavaRDD<Object> mapRDD = linesRDD.map(new Function<String, Object>() {
        @Override
        public Object call(String v1) throws Exception {
            return v1.split(",");
        }
    });

    // flatMap: each input line is expanded into the individual tokens it contains
    JavaRDD<String> flatMapRDD = linesRDD.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String t) throws Exception {
            return Arrays.asList(t.split(",")).iterator();
        }
    });

    List<Object> collect = mapRDD.collect(); // collect() is an action and triggers execution
    for (Object obj : collect) {
        System.out.println(obj);
    }

    List<String> collect2 = flatMapRDD.collect(); // collect() is an action and triggers execution
    for (String s : collect2) {
        System.out.println(s);
    }
}
Example 5
Source File: ValueSets.java From bunsen with Apache License 2.0 | 5 votes |
/**
 * Returns a new ValueSets instance that includes the given value sets.
 *
 * @param valueSets the value sets to add to the returned collection.
 * @return a new ValueSets instance with the added value sets.
 */
@Override
public ValueSets withValueSets(Dataset<Row> valueSets) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(valueSets);

  // Ensure that there are no duplicates among the value sets
  if (hasDuplicateUrlAndVersions(newMembers) || valueSets.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add value sets having duplicate valueSetUri and valueSetVersion");
  }

  JavaRDD<Row> valueSetsRdd = valueSets.javaRDD();

  // The value set concepts will be stored in the values table for persistence, so we remove
  // them from the individual value sets. This can be done most easily by setting concepts to an
  // empty list.
  JavaRDD<Row> withoutConceptsRdd = valueSetsRdd.map(new RemoveConcepts(fhirVersion));

  Dataset<Row> withoutConcepts = spark.createDataFrame(withoutConceptsRdd,
      valueSetRowConverter.getSchema());

  JavaRDD<Value> newValuesRdd = valueSetsRdd.flatMap(new ExtractValues(fhirVersion));

  Dataset<Value> newValues = spark.createDataset(newValuesRdd.rdd(), getValueEncoder());

  return withValueSets(withoutConcepts, newValues);
}
Example 6
Source File: Chapter4.java From sparkResearch with Apache License 2.0 | 5 votes |
/**
 * Use flatMap to split strings.
 */
public void flatMap(JavaSparkContext sparkContext) {
    JavaRDD<String> lines = sparkContext.parallelize(Arrays.asList("hello world", "hi"));
    JavaRDD<String> flatMapResult = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String s) throws Exception {
            return Arrays.asList(PATTERN.split(s)).iterator();
        }
    });
    flatMapResult.first();
    // result: hello
}
Example 7
Source File: TransformationRDD.java From hui-bigdata-spark with Apache License 2.0 | 5 votes |
/**
 * Element transformation: a single element is mapped to an array of elements.
 * Purpose of the demo: split each subway-station record to obtain the array fields
 * 1. departure station 2. destination station 3. number of stations passed 4. distance.
 *
 * @since hui_project 1.0.0
 */
public void testFlatMap() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
    JavaRDD<String> splitRDD = textRDD
            .flatMap(x -> Arrays.asList(x.split(",")).iterator());
    checkResult(splitRDD.collect());
}
Example 8
Source File: SparkBatchPortablePipelineTranslator.java From beam with Apache License 2.0 | 5 votes |
private static <K, V> void translateGroupByKey(
    PTransformNode transformNode, RunnerApi.Pipeline pipeline, SparkTranslationContext context) {

  RunnerApi.Components components = pipeline.getComponents();
  String inputId = getInputId(transformNode);
  Dataset inputDataset = context.popDataset(inputId);
  JavaRDD<WindowedValue<KV<K, V>>> inputRdd = ((BoundedDataset<KV<K, V>>) inputDataset).getRDD();
  WindowedValueCoder<KV<K, V>> inputCoder = getWindowedValueCoder(inputId, components);
  KvCoder<K, V> inputKvCoder = (KvCoder<K, V>) inputCoder.getValueCoder();
  Coder<K> inputKeyCoder = inputKvCoder.getKeyCoder();
  Coder<V> inputValueCoder = inputKvCoder.getValueCoder();
  WindowingStrategy windowingStrategy = getWindowingStrategy(inputId, components);
  WindowFn<Object, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
  WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(inputValueCoder, windowFn.windowCoder());

  JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKeyAndWindow;
  Partitioner partitioner = getPartitioner(context);
  if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
    // we can have a memory sensitive translation for non-merging windows
    groupedByKeyAndWindow =
        GroupNonMergingWindowsFunctions.groupByKeyAndWindow(
            inputRdd, inputKeyCoder, inputValueCoder, windowingStrategy, partitioner);
  } else {
    JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly =
        GroupCombineFunctions.groupByKeyOnly(inputRdd, inputKeyCoder, wvCoder, partitioner);
    // for batch, GroupAlsoByWindow uses an in-memory StateInternals.
    groupedByKeyAndWindow =
        groupedByKeyOnly.flatMap(
            new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                windowingStrategy,
                new TranslationUtils.InMemoryStateInternalsFactory<>(),
                SystemReduceFn.buffering(inputValueCoder),
                context.serializablePipelineOptions));
  }
  context.pushDataset(getOutputId(transformNode), new BoundedDataset<>(groupedByKeyAndWindow));
}
Example 9
Source File: TransformTranslator.java From beam with Apache License 2.0 | 4 votes |
private static <K, V, W extends BoundedWindow> TransformEvaluator<GroupByKey<K, V>> groupByKey() {
  return new TransformEvaluator<GroupByKey<K, V>>() {
    @Override
    public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked")
      JavaRDD<WindowedValue<KV<K, V>>> inRDD =
          ((BoundedDataset<KV<K, V>>) context.borrowDataset(transform)).getRDD();
      final KvCoder<K, V> coder = (KvCoder<K, V>) context.getInput(transform).getCoder();
      @SuppressWarnings("unchecked")
      final WindowingStrategy<?, W> windowingStrategy =
          (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();

      // --- coders.
      final Coder<K> keyCoder = coder.getKeyCoder();
      final WindowedValue.WindowedValueCoder<V> wvCoder =
          WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());

      JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKey;
      Partitioner partitioner = getPartitioner(context);
      if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
        // we can have a memory sensitive translation for non-merging windows
        groupedByKey =
            GroupNonMergingWindowsFunctions.groupByKeyAndWindow(
                inRDD, keyCoder, coder.getValueCoder(), windowingStrategy, partitioner);
      } else {
        // --- group by key only.
        JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly =
            GroupCombineFunctions.groupByKeyOnly(inRDD, keyCoder, wvCoder, partitioner);

        // --- now group also by window.
        // for batch, GroupAlsoByWindow uses an in-memory StateInternals.
        groupedByKey =
            groupedByKeyOnly.flatMap(
                new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                    windowingStrategy,
                    new TranslationUtils.InMemoryStateInternalsFactory<>(),
                    SystemReduceFn.buffering(coder.getValueCoder()),
                    context.getSerializableOptions()));
      }
      context.putDataset(transform, new BoundedDataset<>(groupedByKey));
    }

    @Override
    public String toNativeString() {
      return "groupByKey()";
    }
  };
}
Example 10
Source File: EntitySchemaCollector.java From rdf2x with Apache License 2.0 | 4 votes |
/**
 * Reduce a RDD of {@link Instance}s into a map of [type index -> list of its {@link Predicate}s
 * and their properties (occurrences, is multiple)]
 *
 * @param instances  a RDD of {@link Instance}s
 * @param typeCounts map of type indexes to counts of their instances
 * @return map of [type index -> list of its {@link Predicate}s and their properties (occurrences, is multiple)]
 */
private Map<Integer, List<EntityProperty>> getDistinctEntityProperties(JavaRDD<Instance> instances, Map<Integer, Long> typeCounts) {
    // all triples of (instance type, instance predicate, is multiple valued predicate)
    JavaRDD<Tuple3<Integer, Predicate, Boolean>> typePredicates = instances.flatMap(instance -> {
        Set<Predicate> predicates = instance.getLiteralPredicates();
        return instance.getTypes().stream()
                .flatMap(typeInt -> predicates.stream()
                        .map(predicate -> new Tuple3<>(
                                typeInt, // type index
                                predicate, // predicate
                                instance.getLiteralValue(predicate) instanceof Set // is multiple valued
                        ))
                ).collect(Collectors.toList());
    });

    return typePredicates
            .mapToPair(typePredicate -> new Tuple2<>(
                    new Tuple2<>(typePredicate._1(), typePredicate._2()), // predicate in type
                    new Tuple2<>(1L, typePredicate._3()) // count, is multiple valued
            ))
            // get properties of each predicate in a specific type (will become a column)
            .reduceByKey((a, b) -> new Tuple2<>(
                    a._1() + b._1(), // sum counts
                    a._2() || b._2() // is multiple if it is multiple in any instance
            ))
            // collect to Java list
            .collect().stream()
            // group by type -> list of predicates and their properties
            .collect(Collectors.groupingBy(
                    typePredicate -> typePredicate._1()._1(),
                    Collectors.mapping(
                            typePredicate -> new EntityProperty(
                                    typePredicate._1()._2(), // predicate index
                                    typePredicate._2()._2(), // is multiple
                                    typePredicate._2()._1() / ((double) typeCounts.get(typePredicate._1()._1())) // non-null ratio
                            ),
                            Collectors.toList())
            ));
}
Example 11
Source File: InstanceRelationWriter.java From rdf2x with Apache License 2.0 | 4 votes |
/**
 * Persist the Entity Attribute Value table
 *
 * @param entitySchema entity schema
 * @param instances    RDD of {@link Instance}s
 */
public void writeEntityAttributeValueTable(EntitySchema entitySchema, JavaRDD<Instance> instances) {
    IndexMap<String> typeIndex = rdfSchema.getTypeIndex();

    // create the schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(ID_COLUMN_NAME, DataTypes.LongType, false));
    fields.add(DataTypes.createStructField(PREDICATE_COLUMN_NAME, DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField(EAV_DATATYPE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_LANGUAGE_COLUMN_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(EAV_VALUE_COLUMN_NAME, DataTypes.StringType, false));
    StructType schema = DataTypes.createStructType(fields);

    List<Tuple2<String, String>> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, ID_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, PREDICATE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_DATATYPE_COLUMN_NAME));
    indexes.add(new Tuple2<>(EAV_TABLE_NAME, EAV_LANGUAGE_COLUMN_NAME));

    // get map of type index -> set of attributes
    Map<Integer, Set<Predicate>> typeEavPredicates = entitySchema.getTables().stream()
            .collect(Collectors.toMap(
                    table -> typeIndex.getIndex(table.getTypeURI()),
                    table -> table.getAttributes().stream()
                            .map(EntityProperty::getPredicate)
                            .collect(Collectors.toSet())
            ));

    // get all entity attribute values
    JavaRDD<Row> rowRDD = instances.flatMap(instance -> instance.getLiteralPredicates().stream()
            // filter predicates that are in the EAV set of at least one of the instance types
            .filter(predicate -> instance.getTypes().stream().anyMatch(type ->
                    typeEavPredicates.containsKey(type) && // type could have been removed (not enough rows, ...)
                            typeEavPredicates.get(type).contains(predicate)
            ))
            // map to row of values
            .flatMap(predicate -> {
                Object value = instance.getLiteralValue(predicate);
                if (value instanceof Set) {
                    // return a row for each single value
                    return ((Set<Object>) value).stream().map(val -> getAttributeRow(instance, predicate, val));
                }
                return Stream.of(getAttributeRow(instance, predicate, value));
            })
            .collect(Collectors.toList())
    );

    int predicateCount = typeEavPredicates.values().stream().collect(Collectors.summingInt(Set::size));

    // create and write the dataframe
    log.info("Writing EAV table of {} predicates", predicateCount);
    DataFrame df = sql.createDataFrame(rowRDD, schema);
    persistor.writeDataFrame(EAV_TABLE_NAME, df);
    log.info("Creating indexes for EAV table");
    persistor.createIndexes(indexes);
    df.unpersist();
}
Example 12
Source File: DataFrames.java From deeplearning4j with Apache License 2.0 | 3 votes |
/**
 * Convert the given sequence data set to a DataFrame.<br>
 * <b>Note</b>: The resulting DataFrame has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN}) - an index (integer, starting at 0) for the position
 * of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(Dataset<Row>)}
 *
 * @param schema Schema for the data
 * @param data   Sequence data to convert to a DataFrame
 * @return The dataframe object
 */
public static Dataset<Row> toDataFrameSequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());
    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.flatMap(new SequenceToRows(schema));
    return sqlContext.createDataFrame(rows, fromSchemaSequence(schema));
}
Example 13
Source File: DataFrames.java From DataVec with Apache License 2.0 | 3 votes |
/**
 * Convert the given sequence data set to a DataFrame.<br>
 * <b>Note</b>: The resulting DataFrame has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN}) - an index (integer, starting at 0) for the position
 * of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(DataRowsFacade)}
 *
 * @param schema Schema for the data
 * @param data   Sequence data to convert to a DataFrame
 * @return The dataframe object
 */
public static DataRowsFacade toDataFrameSequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());
    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.flatMap(new SequenceToRows(schema));
    return dataRows(sqlContext.createDataFrame(rows, fromSchemaSequence(schema)));
}
Example 14
Source File: AnalyzeSpark.java From deeplearning4j with Apache License 2.0 | 2 votes |
/**
 * Randomly sample values from a single column, in all sequences.
 * Values may be taken from any sequence (i.e., sequence order is not preserved)
 *
 * @param count        Number of values to sample
 * @param columnName   Name of the column to sample from
 * @param schema       Schema
 * @param sequenceData Data to sample from
 * @return A list of random samples
 */
public static List<Writable> sampleFromColumnSequence(int count, String columnName, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return sampleFromColumn(count, columnName, schema, flattenedSequence);
}
Example 15
Source File: AnalyzeSpark.java From deeplearning4j with Apache License 2.0 | 2 votes |
/**
 * Get a list of unique values from the specified column of a sequence
 *
 * @param columnName   Name of the column to get unique values from
 * @param schema       Data schema
 * @param sequenceData Sequence data to get unique values from
 * @return List of unique values for the column
 */
public static List<Writable> getUniqueSequence(String columnName, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return getUnique(columnName, schema, flattenedSequence);
}
Example 16
Source File: AnalyzeSpark.java From deeplearning4j with Apache License 2.0 | 2 votes |
/**
 * Analyze the data quality of sequence data - provides a report on missing values, values that don't comply
 * with the schema, etc
 *
 * @param schema Schema for data
 * @param data   Data to analyze
 * @return DataQualityAnalysis object
 */
public static DataQualityAnalysis analyzeQualitySequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
    JavaRDD<List<Writable>> fmSeq = data.flatMap(new SequenceFlatMapFunction());
    return analyzeQuality(schema, fmSeq);
}
Example 17
Source File: ContextExtractor.java From vn.vitk with GNU General Public License v3.0 | 2 votes |
/**
 * Extracts a RDD of labeled contexts from a RDD of rows where each row
 * has two string cells containing a word sequence and a tag sequence.
 *
 * @param dataset
 * @return a RDD of labeled contexts
 */
public JavaRDD<LabeledContext> extract(JavaRDD<Row> dataset) {
    return dataset.flatMap(new RowToContextFunction());
}
Example 18
Source File: AnalyzeSpark.java From DataVec with Apache License 2.0 | 2 votes |
/**
 * Randomly sample values from a single column, in all sequences.
 * Values may be taken from any sequence (i.e., sequence order is not preserved)
 *
 * @param count        Number of values to sample
 * @param columnName   Name of the column to sample from
 * @param schema       Schema
 * @param sequenceData Data to sample from
 * @return A list of random samples
 */
public static List<Writable> sampleFromColumnSequence(int count, String columnName, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return sampleFromColumn(count, columnName, schema, flattenedSequence);
}
Example 19
Source File: AnalyzeSpark.java From DataVec with Apache License 2.0 | 2 votes |
/**
 * Get a list of unique values from the specified column of a sequence
 *
 * @param columnName   Name of the column to get unique values from
 * @param schema       Data schema
 * @param sequenceData Sequence data to get unique values from
 * @return List of unique values for the column
 */
public static List<Writable> getUniqueSequence(String columnName, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return getUnique(columnName, schema, flattenedSequence);
}
Example 20
Source File: AnalyzeSpark.java From DataVec with Apache License 2.0 | 2 votes |
/**
 * Get a list of unique values from the specified columns of a sequence
 *
 * @param columnNames  Names of the columns to get unique values from
 * @param schema       Data schema
 * @param sequenceData Sequence data to get unique values from
 * @return Map of column name to list of unique values
 */
public static Map<String, List<Writable>> getUniqueSequence(List<String> columnNames, Schema schema,
                JavaRDD<List<List<Writable>>> sequenceData) {
    JavaRDD<List<Writable>> flattenedSequence = sequenceData.flatMap(new SequenceFlatMapFunction());
    return getUnique(columnNames, schema, flattenedSequence);
}