Java Code Examples for org.apache.spark.api.java.JavaPairRDD#filter()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#filter(). Each example is taken from an open-source project; the source file, project, and license are noted above the code.
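Before the project examples, here is a minimal, self-contained sketch of JavaPairRDD#filter(): it builds a small pair RDD and keeps only the entries whose value exceeds a threshold. The class name MinimalFilterExample and the sample data are illustrative and are not taken from any of the projects below.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class MinimalFilterExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("MinimalFilterExample");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // build a small pair RDD of (word, count) tuples
        JavaPairRDD<String, Integer> counts = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("spark", 3),
                new Tuple2<>("java", 1),
                new Tuple2<>("filter", 5)));

        // filter() keeps only the pairs for which the predicate returns true;
        // here: keep words that occur more than twice
        JavaPairRDD<String, Integer> frequent = counts.filter(pair -> pair._2() > 2);

        System.out.println(frequent.collect()); // [(spark,3), (filter,5)]

        sc.close();
    }
}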
Example 1
Source File: BroadCastParam.java From sparkResearch with Apache License 2.0
/**
 * Broadcast variable test.
 * @param args
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // initialize the SparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // assume this list is the data to broadcast
    // remember: broadcast variables are read-only
    final List<String> broadcastList = Arrays.asList("190099HJLL", "98392QUEYY", "561788LLKK");
    // create the broadcast variable and ship it to the executors
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(broadcastList);
    // define the data
    JavaPairRDD<String, String> pairRDD =
            javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    JavaPairRDD<String, String> resultPairRDD =
            pairRDD.filter((Function<Tuple2<String, String>, Boolean>) v1 -> broadcast.value().contains(v1._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
Example 2
Source File: WordCount.java From spark-on-spring-boot with Apache License 2.0
public void count() {
    JavaRDD<String> tokenized = javaSparkContext.textFile(inputFile)
            .flatMap((s1) -> Arrays.asList(s1.split(" ")));

    // count the occurrence of each word
    JavaPairRDD<String, Integer> counts = tokenized
            .mapToPair(s -> new Tuple2<>(s, 1))
            .reduceByKey((i1, i2) -> i1 + i2);

    // filter out words with less than threshold occurrences
    JavaPairRDD<String, Integer> filtered = counts.filter(tup -> tup._2() >= threshold);

    // count characters
    JavaPairRDD<Character, Integer> charCounts = filtered.flatMap(
            s -> {
                Collection<Character> chars = new ArrayList<>(s._1().length());
                for (char c : s._1().toCharArray()) {
                    chars.add(c);
                }
                return chars;
            })
            .mapToPair(c -> new Tuple2<>(c, 1))
            .reduceByKey((i1, i2) -> i1 + i2);

    System.out.println(charCounts.collect());
}
Example 3
Source File: InteractionAnalysisSimple.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException if MmtfReader fails
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisSimple.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // use only representative structures
    int sequenceIdentity = 40;
    double resolution = 2.5;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // list the top 10 residue types that interact with Zn
    interactions.printSchema();
    interactions.show(20);

    System.out.println("# interactions: " + interactions.count());

    // show the top 10 interacting groups
    interactions
            .groupBy(col("residue2"))
            .count()
            .sort(col("count").desc())
            .show(10);

    long end = System.nanoTime();

    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
Example 4
Source File: ALSUpdate.java From oryx with Apache License 2.0
/**
 * @param parsedRDD parsed input as {@code String[]}
 * @return {@link Rating}s ordered by timestamp
 */
private JavaRDD<Rating> parsedToRatingRDD(JavaRDD<String[]> parsedRDD,
                                          Broadcast<? extends Map<String,Integer>> bUserIDToIndex,
                                          Broadcast<? extends Map<String,Integer>> bItemIDToIndex) {
    JavaPairRDD<Long,Rating> timestampRatingRDD = parsedRDD.mapToPair(tokens -> {
        try {
            return new Tuple2<>(
                    Long.valueOf(tokens[3]),
                    new Rating(bUserIDToIndex.value().get(tokens[0]),
                               bItemIDToIndex.value().get(tokens[1]),
                               // Empty value means 'delete'; propagate as NaN
                               tokens[2].isEmpty() ? Double.NaN : Double.parseDouble(tokens[2])));
        } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
            log.warn("Bad input: {}", Arrays.toString(tokens));
            throw e;
        }
    });

    if (decayFactor < 1.0) {
        double factor = decayFactor;
        long now = System.currentTimeMillis();
        timestampRatingRDD = timestampRatingRDD.mapToPair(timestampRating -> {
            long timestamp = timestampRating._1();
            return new Tuple2<>(timestamp,
                                decayRating(timestampRating._2(), timestamp, now, factor));
        });
    }

    if (decayZeroThreshold > 0.0) {
        double theThreshold = decayZeroThreshold;
        timestampRatingRDD = timestampRatingRDD.filter(
                timestampRating -> timestampRating._2().rating() > theThreshold);
    }

    return timestampRatingRDD.sortByKey().values();
}
Example 5
Source File: ALSUpdate.java From oryx with Apache License 2.0
/**
 * Combines {@link Rating}s with the same user/item into one, with score as the sum of
 * all of the scores.
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
    JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples = original.mapToPair(
            rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));

    JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated;
    if (implicit) {
        // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
        // they don't guarantee the delete elements are properly handled
        aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
    } else {
        // For non-implicit, last wins.
        aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
    }

    JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN =
            aggregated.filter(kv -> !Double.isNaN(kv._2()));

    if (logStrength) {
        return noNaN.map(userProductScore -> new Rating(
                userProductScore._1()._1(),
                userProductScore._1()._2(),
                Math.log1p(userProductScore._2() / epsilon)));
    } else {
        return noNaN.map(userProductScore -> new Rating(
                userProductScore._1()._1(),
                userProductScore._1()._2(),
                userProductScore._2()));
    }
}
Example 6
Source File: TieredSpatialJoin.java From geowave with Apache License 2.0
private JavaPairRDD<GeoWaveInputKey, ByteArray> joinAndCompareTiers(
        final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> leftTier,
        final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> rightTier,
        final Broadcast<GeomFunction> geomPredicate,
        final int highestPartitionCount,
        final HashPartitioner partitioner) {
    // Cogroup groups on same tier ByteArrayId and pairs them into Iterable sets.
    JavaPairRDD<ByteArray, Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>> joinedTiers =
            leftTier.cogroup(rightTier, partitioner);

    // Filter only the pairs that have data on both sides; the bucket strategy
    // should have been accounted for by this point.
    // We need to go through the pairs and test each feature against each other.
    // End with a combined RDD for that tier.
    joinedTiers = joinedTiers.filter(t -> t._2._1.iterator().hasNext() && t._2._2.iterator().hasNext());

    final JavaPairRDD<GeoWaveInputKey, ByteArray> finalMatches = joinedTiers.flatMapValues(
            (Function<Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>, Iterable<GeoWaveInputKey>>) t -> {
                final GeomFunction predicate = geomPredicate.value();
                final HashSet<GeoWaveInputKey> results = Sets.newHashSet();
                for (final Tuple2<GeoWaveInputKey, Geometry> leftTuple : t._1) {
                    for (final Tuple2<GeoWaveInputKey, Geometry> rightTuple : t._2) {
                        if (predicate.call(leftTuple._2, rightTuple._2)) {
                            results.add(leftTuple._1);
                            results.add(rightTuple._1);
                        }
                    }
                }
                return results;
            })
            .mapToPair(Tuple2::swap)
            .reduceByKey(partitioner, (id1, id2) -> id1)
            .persist(StorageLevel.MEMORY_ONLY_SER());

    return finalMatches;
}
Example 7
Source File: AggregateUnarySPInstruction.java From systemds with Apache License 2.0
private void processMatrixAggregate(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    DataCharacteristics mc = sec.getDataCharacteristics(input1.getName());

    //get input
    JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = in;

    //filter input blocks for trace
    if( getOpcode().equalsIgnoreCase("uaktrace") )
        out = out.filter(new FilterDiagMatrixBlocksFunction());

    //execute unary aggregate operation
    AggregateUnaryOperator auop = (AggregateUnaryOperator)_optr;
    AggregateOperator aggop = _aop;

    //perform aggregation if necessary and put output into symbol table
    if( _aggtype == SparkAggType.SINGLE_BLOCK ) {
        if( auop.sparseSafe )
            out = out.filter(new FilterNonEmptyBlocksFunction());

        JavaRDD<MatrixBlock> out2 = out.map(
            new RDDUAggFunction2(auop, mc.getBlocksize()));
        MatrixBlock out3 = RDDAggregateUtils.aggStable(out2, aggop);

        //drop correction after aggregation
        out3.dropLastRowsOrColumns(aggop.correction);

        //put output block into symbol table (no lineage because single block)
        //this also includes implicit maintenance of matrix characteristics
        sec.setMatrixOutput(output.getName(), out3);
    }
    else { //MULTI_BLOCK or NONE
        if( _aggtype == SparkAggType.NONE ) {
            //in case of no block aggregation, we always drop the correction as well as
            //use a partitioning-preserving mapvalues
            out = out.mapValues(new RDDUAggValueFunction(auop, mc.getBlocksize()));
        }
        else if( _aggtype == SparkAggType.MULTI_BLOCK ) {
            //in case of multi-block aggregation, we always keep the correction
            out = out.mapToPair(new RDDUAggFunction(auop, mc.getBlocksize()));
            out = RDDAggregateUtils.aggByKeyStable(out, aggop, false);

            //drop correction after aggregation if required (aggbykey creates
            //partitioning, drop correction via partitioning-preserving mapvalues)
            if( auop.aggOp.existsCorrection() )
                out = out.mapValues( new AggregateDropCorrectionFunction(aggop) );
        }

        //put output RDD handle into symbol table
        updateUnaryAggOutputDataCharacteristics(sec, auop.indexFn);
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    }
}
Example 8
Source File: InteractionAnalysisAdvanced.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // get non-redundant subset
    pdb = pdb.filter(new Pisces(40, 2.5));

    // find Zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
            .groupBy("residue2")
            .count();

    topGroups
            .sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("residue2","atom2")
            .count();

    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .filter("frequency > 0.01") // filter out occurrences < 1 %
            .sort(col("frequency").desc()) // sort descending
            .show(20);

    // TODO print the top 10 interacting elements
    System.out.println("Top interacting elements");

    Dataset<Row> topElements = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("element2")
            .count();

    topElements
            .withColumn("frequency", col("count").divide(n))
            .filter("frequency > 0.01") // filter out occurrences < 1 %
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    interactions
            .groupBy("element2")
            .avg("distance")
            .sort("avg(distance)")
            .show(10);

    // Aggregate multiple statistics
    // Note: import static org.apache.spark.sql.functions.* required!
    // e.g. org.apache.spark.sql.functions.avg
    // for a list of all available functions
    interactions
            .groupBy("element2")
            .agg(count("distance"), avg("distance"), min("distance"), max("distance"), kurtosis("distance"))
            .show(10);

    long end = System.nanoTime();

    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
Example 9
Source File: AtpInteractionAnalysis.java From mmtf-spark with Apache License 2.0
/**
 * @param args input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(AtpInteractionAnalysis.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // filter by sequence identity subset
    int sequenceIdentity = 20;
    double resolution = 2.0;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    // find ATP interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ATP", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // TODO add a line to only analyze interactions
    // with the oxygens in the terminal phosphate group of ATP
    // (O1G, O2G, O3G)
    // Tip: Google SQL LIKE
    interactions = interactions.filter("atom1 LIKE('O%G')");

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
            .groupBy("residue2")
            .count();

    topGroups
            .sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
            .groupBy("residue2","atom2")
            .count();

    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    long end = System.nanoTime();

    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
Example 10
Source File: FilterBySequenceRegex.java From mmtf-spark with Apache License 2.0
/**
 * @param args
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {

    String path = MmtfReader.getMmtfReducedPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterBySequenceRegex.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // find structures containing a Zinc finger motif
    pdb = pdb.filter(new ContainsSequenceRegex("C.{2,4}C.{12}H.{3,5}H"));

    System.out.println("Number of PDB entries containing a Zinc finger motif: " + pdb.count());

    long end = System.nanoTime();

    System.out.println("Time: " + (end-start)/1E9 + " sec.");

    sc.close();
}
Example 11
Source File: FilterByResolution.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

    String path = MmtfReader.getMmtfReducedPath();

    long start = System.nanoTime();

    // instantiate Spark. Each Spark application needs these two lines of code.
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByResolution.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read entire PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // filter PDB entries by resolution. Entries without resolution values,
    // e.g., NMR structures, will be filtered out as well.
    pdb = pdb.filter(new Resolution(0.0, 2.0));

    System.out.println("# structures: " + pdb.count());

    // close Spark
    sc.close();

    long end = System.nanoTime();
    System.out.println((end-start)/1E9 + " sec.");
}
Example 12
Source File: TieredSpatialJoin.java From geowave with Apache License 2.0
private JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> filterTier(
        final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> indexedRDD,
        final byte tierId) {
    return indexedRDD.filter(v1 -> v1._1().getBytes()[0] == tierId);
}