Java Code Examples for org.apache.spark.api.java.JavaRDD#foreach()
The following examples show how to use org.apache.spark.api.java.JavaRDD#foreach().
Each example notes the project it was taken from, the source file, and the project's license.
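Before the project examples, here is a minimal, self-contained sketch of JavaRDD#foreach() itself; the class name, app name, and input values are placeholders chosen for illustration and do not come from any of the projects below.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ForeachSketch {

    public static void main(String[] args) {
        // Local master and app name are placeholders for illustration only.
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("ForeachSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> words = sc.parallelize(Arrays.asList("spark", "java", "rdd"));

        // foreach is an action: the function runs on the executors for every element,
        // so on a cluster the println output appears in the executor logs rather than
        // the driver console.
        words.foreach(word -> System.out.println(word));

        sc.close();
    }
}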
Example 1
Source File: TestSparkContextProvider.java From rdf2x with Apache License 2.0 | 6 votes |
public <T> void assertRDDEquals(String message, JavaRDD<T> expected, JavaRDD<T> result) {
    Option<Tuple3<T, Integer, Integer>> diff = JavaRDDComparisons.compareRDD(expected, result);
    if (diff.isDefined()) {
        log.error("EXPECTED");
        expected.foreach(row -> log.error(row.toString()));
        log.error("RESULT");
        result.foreach(row -> log.error(row.toString()));
        log.error("FIRST DIFF");
        Tuple3<T, Integer, Integer> diffTriple = diff.get();
        log.error(diffTriple.toString());
        if (diffTriple._2() == 0) {
            log.error("(row not expected but present in result {} times)", diffTriple._3());
        }
        if (diffTriple._3() == 0) {
            log.error("(row expected {} times but not present)", diffTriple._2());
        }
        throw new AssertionError(message);
    }
}
Example 2
Source File: InterleaveMulti.java From ViraPipe with MIT License | 6 votes |
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    Path fqpath = new Path(fqPath);
    String fqname = fqpath.getName();
    String[] ns = fqname.split("\\.");
    //TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

    splitRDD.foreach(split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
    });
}
Example 3
Source File: SparkOperatorProfiler.java From rheem with Apache License 2.0 | 6 votes |
/**
 * Helper method to generate data quanta and provide them as a cached {@link JavaRDD}.
 */
protected <T> JavaRDD<T> prepareInputRddInDriver(long cardinality, int inputIndex) {
    @SuppressWarnings("unchecked")
    final Supplier<T> supplier = (Supplier<T>) this.dataQuantumGenerators.get(inputIndex);
    JavaRDD<T> finalInputRdd = null;

    // Create batches, parallelize them, and union them.
    long remainder = cardinality;
    do {
        int batchSize = (int) Math.min(remainder, this.dataQuantumGeneratorBatchSize);
        List<T> batch = new ArrayList<>(batchSize);
        while (batch.size() < batchSize) {
            batch.add(supplier.get());
        }
        final JavaRDD<T> batchRdd = this.sparkExecutor.sc.parallelize(batch);
        finalInputRdd = finalInputRdd == null ? batchRdd : finalInputRdd.union(batchRdd);
        remainder -= batchSize;
    } while (remainder > 0);

    // Shuffle and cache the RDD.
    final JavaRDD<T> cachedInputRdd = this.partition(finalInputRdd).cache();
    cachedInputRdd.foreach(dataQuantum -> {
    });

    return cachedInputRdd;
}
Example 4
Source File: Intersection.java From SparkDemo with MIT License | 6 votes |
static void intersection(JavaSparkContext sc) {
    List<String> datas1 = Arrays.asList("张三", "李四", "tom");
    List<String> datas2 = Arrays.asList("tom", "gim");

    // Returns the intersection of the two RDDs.
    JavaRDD<String> intersectionRDD = sc.parallelize(datas1).intersection(sc.parallelize(datas2));

    intersectionRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });
}
Example 5
Source File: SampleAndTake.java From SparkDemo with MIT License | 6 votes |
static void sample(JavaSparkContext sc) {
    List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

    JavaRDD<Integer> dataRDD = sc.parallelize(datas);

    // Random sampling: when withReplacement is true, sampled elements are put back and can be
    // drawn multiple times; false means sampling without replacement. fraction is the sampling
    // ratio and seed is the random number seed.
    JavaRDD<Integer> sampleRDD = dataRDD.sample(false, 0.5, System.currentTimeMillis());

    // TODO dataRDD.takeSample(false, 3);
    // TODO dataRDD.take(3)

    sampleRDD.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer t) throws Exception {
            System.out.println(t);
        }
    });

    sc.close();
}
Example 6
Source File: Distinct.java From SparkDemo with MIT License | 6 votes |
private static void distinct(JavaSparkContext sc) {
    List<String> datas = Arrays.asList("张三", "李四", "tom", "张三");

    // Removes duplicate elements; this involves a shuffle operation.
    JavaRDD<String> distinctRDD = sc.parallelize(datas).distinct();

    distinctRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });
}
Example 7
Source File: Filter.java From SparkDemo with MIT License | 6 votes |
private static void filter(JavaSparkContext sc) {
    List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

    JavaRDD<Integer> rddData = sc.parallelize(datas);
    JavaRDD<Integer> filterRDD = rddData.filter(
            // jdk1.8
            // v1 -> v1 >= 3
            new Function<Integer, Boolean>() {
                public Boolean call(Integer v) throws Exception {
                    // Filter out numbers less than 4
                    return v >= 4;
                }
            });

    filterRDD.foreach(
            // jdk1.8
            // v -> System.out.println(v)
            new VoidFunction<Integer>() {
                @Override
                public void call(Integer integer) throws Exception {
                    System.out.println(integer);
                }
            });
    sc.close();
}
Example 8
Source File: FlatMap.java From SparkDemo with MIT License | 6 votes |
private static void flatMap(JavaSparkContext sc) {
    List<String> data = Arrays.asList("aa,bb,cc", "cxf,spring,struts2", "java,C++,javaScript");
    JavaRDD<String> rddData = sc.parallelize(data);

    FlatMapFunction<String, String> flatMapFunction = new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String s) throws Exception {
            List<String> list = Arrays.asList(s.split(","));
            return list.iterator();
        }
    };
    JavaRDD<String> flatMapData = rddData.flatMap(flatMapFunction);

    flatMapData.foreach(new VoidFunction<String>() {
        @Override
        public void call(String v) throws Exception {
            System.out.println(v);
        }
    });

    sc.close();
}
Example 9
Source File: Union.java From SparkDemo with MIT License | 6 votes |
static void union(JavaSparkContext sc) {
    List<String> datas1 = Arrays.asList("张三", "李四");
    List<String> datas2 = Arrays.asList("tom", "gim");

    JavaRDD<String> data1RDD = sc.parallelize(datas1);
    JavaRDD<String> data2RDD = sc.parallelize(datas2);

    // Merges the two RDDs without removing duplicates; both RDDs must have the same element type.
    JavaRDD<String> unionRDD = data1RDD.union(data2RDD);

    unionRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            System.out.println(t);
        }
    });

    sc.close();
}
Example 10
Source File: UtilHelpers.java From hudi with Apache License 2.0 | 5 votes |
public static int handleErrors(JavaSparkContext jsc, String instantTime, JavaRDD<WriteStatus> writeResponse) {
    Accumulator<Integer> errors = jsc.accumulator(0);
    writeResponse.foreach(writeStatus -> {
        if (writeStatus.hasErrors()) {
            errors.add(1);
            LOG.error(String.format("Error processing records :writeStatus:%s", writeStatus.getStat().toString()));
        }
    });
    if (errors.value() == 0) {
        LOG.info(String.format("Table imported into hoodie with %s instant time.", instantTime));
        return 0;
    }
    LOG.error(String.format("Import failed with %d errors.", errors.value()));
    return -1;
}
Example 11
Source File: SparkOperatorProfiler.java From rheem with Apache License 2.0 | 5 votes |
/**
 * Helper method to generate data quanta and provide them as a cached {@link JavaRDD}.
 */
protected <T> JavaRDD<T> prepareInputRddInWorker(long cardinality, int inputIndex) {
    // Create batches, parallelize them, and union them.
    final List<Integer> batchSizes = new LinkedList<>();
    int numFullBatches = (int) (cardinality / this.dataQuantumGeneratorBatchSize);
    for (int i = 0; i < numFullBatches; i++) {
        batchSizes.add(this.dataQuantumGeneratorBatchSize);
    }
    batchSizes.add((int) (cardinality % this.dataQuantumGeneratorBatchSize));

    @SuppressWarnings("unchecked")
    final Supplier<T> supplier = (Supplier<T>) this.dataQuantumGenerators.get(inputIndex);
    JavaRDD<T> finalInputRdd = this.sparkExecutor.sc
            .parallelize(batchSizes, 1) // Single partition to ensure the same data generator.
            .flatMap(batchSize -> {
                List<T> list = new ArrayList<>(batchSize);
                for (int i = 0; i < batchSize; i++) {
                    list.add(supplier.get());
                }
                return list.iterator();
            });

    // Shuffle and cache the RDD.
    final JavaRDD<T> cachedInputRdd = this.partition(finalInputRdd).cache();
    cachedInputRdd.foreach(dataQuantum -> {
    });

    return cachedInputRdd;
}
Example 12
Source File: SparkSegmentTarPushJob.java From incubator-pinot with Apache License 2.0 | 5 votes |
@Override
public void run() throws Exception {
    if (!_enableParallelPush) {
        super.run();
    } else {
        List<Path> segmentPathsToPush = getDataFilePaths(_segmentPattern);
        retainRecentFiles(segmentPathsToPush, _lookBackPeriod);
        List<String> segmentsToPush = new ArrayList<>();
        segmentPathsToPush.forEach(path -> {
            segmentsToPush.add(path.toString());
        });
        JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(SparkContext.getOrCreate());
        if (_pushJobParallelism == -1) {
            _pushJobParallelism = segmentsToPush.size();
        }
        JavaRDD<String> pathRDD = sparkContext.parallelize(segmentsToPush, _pushJobParallelism);
        pathRDD.foreach(segmentTarPath -> {
            try (ControllerRestApi controllerRestApi = getControllerRestApi()) {
                FileSystem fileSystem = FileSystem.get(new Path(segmentTarPath).toUri(), new Configuration());
                // TODO: Deal with invalid prefixes in the future
                List<String> currentSegments = controllerRestApi.getAllSegments("OFFLINE");
                controllerRestApi.pushSegments(fileSystem, Arrays.asList(new Path(segmentTarPath)));
                if (_deleteExtraSegments) {
                    controllerRestApi
                        .deleteSegmentUris(getSegmentsToDelete(currentSegments, Arrays.asList(new Path(segmentTarPath))));
                }
            }
        });
    }
}
Example 13
Source File: Interleave.java From ViraPipe with MIT License | 5 votes |
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    Path fqpath = new Path(fqPath);
    String fqname = fqpath.getName();
    String[] ns = fqname.split("\\.");

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

    splitRDD.foreach(split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
    });
}
Example 14
Source File: Decompress.java From ViraPipe with MIT License | 5 votes |
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    Path fqpath = new Path(fqPath);
    String fqname = fqpath.getName();
    String[] ns = fqname.split("\\.");

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

    splitRDD.foreach(split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
    });
}
Example 15
Source File: ActionRDD.java From hui-bigdata-spark with Apache License 2.0 | 5 votes |
/**
 * Iterate over each element.
 *
 * @since hui_project 1.0.0
 */
public void testForEach() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);
    stringJavaRDD.foreach(x -> {
        System.out.println(x);
    });
}
Example 16
Source File: PersistExample.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
/**
 * @param args
 */
public static void main(String[] args) {
    //C:\Users\sumit.kumar\Downloads\bin\warehouse
    //System.setProperty("hadoop.home.dir", "C:\\Users\\sumit.kumar\\Downloads");
    String logFile = "src/main/resources/Apology_by_Plato.txt"; // Should be some file on your system
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);
    SparkConf conf = new SparkConf().setMaster("local").setAppName("ActionExamples").set("spark.hadoop.validateOutputSpecs", "false");
    JavaSparkContext sparkContext = new JavaSparkContext(conf);
    JavaRDD<Integer> rdd = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5), 3).cache();
    JavaRDD<Integer> evenRDD = rdd.filter(new org.apache.spark.api.java.function.Function<Integer, Boolean>() {
        @Override
        public Boolean call(Integer v1) throws Exception {
            return ((v1 % 2) == 0) ? true : false;
        }
    });

    evenRDD.persist(StorageLevel.MEMORY_AND_DISK());
    evenRDD.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer t) throws Exception {
            System.out.println("The value of RDD are :" + t);
        }
    });
    //unpersisting the RDD
    evenRDD.unpersist();
    rdd.unpersist();

    /* JavaRDD<String> lines = spark.read().textFile(logFile).javaRDD().cache();
    System.out.println("DEBUG: \n" + lines.toDebugString());
    long word = lines.count();
    JavaRDD<String> distinctLines = lines.distinct();
    System.out.println("DEBUG: \n" + distinctLines.toDebugString());
    JavaRDD<String> finalRdd = lines.subtract(distinctLines);
    System.out.println("DEBUG: \n" + finalRdd.toDebugString());
    System.out.println("The count is " + word);
    System.out.println("The count is " + distinctLines.count());
    System.out.println("The count is " + finalRdd.count());
    finalRdd.foreach(new VoidFunction<String>() {
        @Override
        public void call(String t) throws Exception {
            // TODO Auto-generated method stub
            System.out.println(t);
        }
    });
    */

    /*SparkConf conf = new SparkConf().setAppName("Simple Application");
    JavaSparkContext sc = new JavaSparkContext(conf);
    StorageLevel newLevel;
    JavaRDD<String> logData = sc.textFile(logFile).cache();
    long numAs = logData.filter(new Function(logFile, logFile, logFile, logFile, false) {
        public Boolean call(String s) {
            return s.contains("a");
        }
    }).count();
    long numBs = logData.filter(new Function(logFile, logFile, logFile, logFile, false) {
        public Boolean call(String s) {
            return s.contains("b");
        }
    }).count();
    System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs);
    sc.stop();*/
}
Example 17
Source File: TestNd4jKryoSerialization.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test
public void testSerialization() {
    Tuple2<INDArray, INDArray> t2 = new Tuple2<>(Nd4j.linspace(1, 10, 10, DataType.FLOAT),
            Nd4j.linspace(10, 20, 10, DataType.FLOAT));

    Broadcast<Tuple2<INDArray, INDArray>> b = sc.broadcast(t2);

    List<INDArray> list = new ArrayList<>();
    for (int i = 0; i < 100; i++) {
        list.add(Nd4j.ones(5));
    }

    JavaRDD<INDArray> rdd = sc.parallelize(list);

    rdd.foreach(new AssertFn(b));
}
Example 18
Source File: JsonFileOperations.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
            .appName("JavaALSExample")
            .getOrCreate();

    RDD<String> textFile = sparkSession.sparkContext().textFile("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json", 2);

    JavaRDD<PersonDetails> mapParser = textFile.toJavaRDD().map(v1 -> new ObjectMapper().readValue(v1, PersonDetails.class));

    mapParser.foreach(t -> System.out.println(t));

    Dataset<Row> anotherPeople = sparkSession.read().json(textFile);
    anotherPeople.printSchema();
    anotherPeople.show();

    Dataset<Row> json_rec = sparkSession.read().json("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_json.json");
    json_rec.printSchema();
    json_rec.show();

    StructType schema = new StructType(new StructField[] {
            DataTypes.createStructField("cid", DataTypes.IntegerType, true),
            DataTypes.createStructField("county", DataTypes.StringType, true),
            DataTypes.createStructField("firstName", DataTypes.StringType, true),
            DataTypes.createStructField("sex", DataTypes.StringType, true),
            DataTypes.createStructField("year", DataTypes.StringType, true),
            DataTypes.createStructField("dateOfBirth", DataTypes.TimestampType, true)
    });

    /* StructType pep = new StructType(new StructField[] {
            new StructField("Count", DataTypes.StringType, true, Metadata.empty()),
            new StructField("County", DataTypes.StringType, true, Metadata.empty()),
            new StructField("First Name", DataTypes.StringType, true, Metadata.empty()),
            new StructField("Sex", DataTypes.StringType, true, Metadata.empty()),
            new StructField("Year", DataTypes.StringType, true, Metadata.empty()),
            new StructField("timestamp", DataTypes.TimestampType, true, Metadata.empty())
    }); */

    Dataset<Row> person_mod = sparkSession.read().schema(schema).json(textFile);

    person_mod.printSchema();
    person_mod.show();

    person_mod.write().format("json").mode("overwrite").save("C:/Users/sumit.kumar/git/learning/src/main/resources/pep_out.json");
}
Example 19
Source File: TestHoodieIndex.java From hudi with Apache License 2.0 | 4 votes |
@ParameterizedTest
@EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE"})
public void testSimpleTagLocationAndUpdate(IndexType indexType) throws Exception {
    setUp(indexType);
    String newCommitTime = "001";
    int totalRecords = 10 + random.nextInt(20);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, totalRecords);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);

    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable hoodieTable = HoodieTable.create(metaClient, config, jsc.hadoopConfiguration());

    // Test tagLocation without any entries in index
    JavaRDD<HoodieRecord> javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
    assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);

    // Insert totalRecords records
    writeClient.startCommitWithTime(newCommitTime);
    JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
    assertNoWriteErrors(writeStatues.collect());

    // Now tagLocation for these records, index should not tag them since it was a failed commit
    javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
    assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);

    // Now commit this & update location of records inserted and validate no errors
    writeClient.commit(newCommitTime, writeStatues);

    // Now tagLocation for these records, index should tag them correctly
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieTable.create(metaClient, config, jsc.hadoopConfiguration());
    javaRDD = index.tagLocation(writeRecords, jsc, hoodieTable);
    Map<String, String> recordKeyToPartitionPathMap = new HashMap();
    List<HoodieRecord> hoodieRecords = writeRecords.collect();
    hoodieRecords.forEach(entry -> recordKeyToPartitionPathMap.put(entry.getRecordKey(), entry.getPartitionPath()));

    assertEquals(totalRecords, javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size());
    assertEquals(totalRecords, javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count());
    assertEquals(totalRecords, javaRDD.filter(record -> (record.getCurrentLocation() != null
        && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
    javaRDD.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry.getRecordKey()), entry.getPartitionPath(), "PartitionPath mismatch"));

    JavaRDD<HoodieKey> hoodieKeyJavaRDD = writeRecords.map(entry -> entry.getKey());
    JavaPairRDD<HoodieKey, Option<Pair<String, String>>> recordLocations = index.fetchRecordLocation(hoodieKeyJavaRDD, jsc, hoodieTable);
    List<HoodieKey> hoodieKeys = hoodieKeyJavaRDD.collect();
    assertEquals(totalRecords, recordLocations.collect().size());
    assertEquals(totalRecords, recordLocations.map(record -> record._1).distinct().count());
    recordLocations.foreach(entry -> assertTrue(hoodieKeys.contains(entry._1), "Missing HoodieKey"));
    recordLocations.foreach(entry -> assertEquals(recordKeyToPartitionPathMap.get(entry._1.getRecordKey()), entry._1.getPartitionPath(), "PartitionPath mismatch"));
}
Example 20
Source File: DecompressInterleave.java From ViraPipe with MIT License | 3 votes |
private static void splitFastq(FileStatus fst, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    //TODO: Handle also compressed files
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, new Configuration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

    splitRDD.foreach(split -> {
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
        writeFastqFile(fqreader, new Configuration(), splitDir + "/" + split.getPath().getName() + "_" + split.getStart() + ".fq");
    });
}