Java Code Examples for org.apache.spark.api.java.JavaPairRDD#filter()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#filter(). Each example is taken from an open-source project; the source file, project, and license are noted above the code.
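Before the project examples, here is a minimal, self-contained sketch of JavaPairRDD#filter(): it builds a small pair RDD and keeps only the entries whose value exceeds a threshold. The class name MinimalFilterExample and the sample data are illustrative and are not taken from any of the projects below.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class MinimalFilterExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("MinimalFilterExample");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // build a small pair RDD of (word, count) tuples
        JavaPairRDD<String, Integer> counts = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("spark", 3),
                new Tuple2<>("java", 1),
                new Tuple2<>("filter", 5)));

        // filter() keeps only the pairs for which the predicate returns true;
        // here: keep words that occur more than twice
        JavaPairRDD<String, Integer> frequent = counts.filter(pair -> pair._2() > 2);

        System.out.println(frequent.collect()); // [(spark,3), (filter,5)]

        sc.close();
    }
}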
Example 1
Source File: BroadCastParam.java From sparkResearch with Apache License 2.0
/**
 * Broadcast variable test.
 * @param args
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // initialize the SparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // assume this list is the data to broadcast
    // remember: broadcast variables are read-only
    final List<String> broadcastList = Arrays.asList("190099HJLL", "98392QUEYY", "561788LLKK");
    // create the broadcast variable and ship it to the executors
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(broadcastList);
    // define the data
    JavaPairRDD<String, String> pairRDD =
            javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    JavaPairRDD<String, String> resultPairRDD =
            pairRDD.filter((Function<Tuple2<String, String>, Boolean>) v1 -> broadcast.value().contains(v1._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
Example 2
Source File: WordCount.java From spark-on-spring-boot with Apache License 2.0
public void count() {
    JavaRDD<String> tokenized = javaSparkContext.textFile(inputFile)
            .flatMap((s1) -> Arrays.asList(s1.split(" ")));

    // count the occurrence of each word
    JavaPairRDD<String, Integer> counts = tokenized
            .mapToPair(s -> new Tuple2<>(s, 1))
            .reduceByKey((i1, i2) -> i1 + i2);

    // filter out words with less than threshold occurrences
    JavaPairRDD<String, Integer> filtered = counts.filter(tup -> tup._2() >= threshold);

    // count characters
    JavaPairRDD<Character, Integer> charCounts = filtered.flatMap(
            s -> {
                Collection<Character> chars = new ArrayList<>(s._1().length());
                for (char c : s._1().toCharArray()) {
                    chars.add(c);
                }
                return chars;
            })
            .mapToPair(c -> new Tuple2<>(c, 1))
            .reduceByKey((i1, i2) -> i1 + i2);

    System.out.println(charCounts.collect());
}
Example 3
Source File: InteractionAnalysisSimple.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException if MmtfReader fails
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisSimple.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // use only representative structures
    int sequenceIdentity = 40;
    double resolution = 2.5;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // list the top 10 residue types that interact with Zn
    interactions.printSchema();
    interactions.show(20);

    System.out.println("# interactions: " + interactions.count());

    // show the top 10 interacting groups
    interactions
            .groupBy(col("residue2"))
            .count()
            .sort(col("count").desc())
            .show(10);

    long end = System.nanoTime();

    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
Example 4
Source File: ALSUpdate.java From oryx with Apache License 2.0
/**
 * @param parsedRDD parsed input as {@code String[]}
 * @return {@link Rating}s ordered by timestamp
 */
private JavaRDD<Rating> parsedToRatingRDD(JavaRDD<String[]> parsedRDD,
                                          Broadcast<? extends Map<String,Integer>> bUserIDToIndex,
                                          Broadcast<? extends Map<String,Integer>> bItemIDToIndex) {
    JavaPairRDD<Long,Rating> timestampRatingRDD = parsedRDD.mapToPair(tokens -> {
        try {
            return new Tuple2<>(
                    Long.valueOf(tokens[3]),
                    new Rating(bUserIDToIndex.value().get(tokens[0]),
                               bItemIDToIndex.value().get(tokens[1]),
                               // Empty value means 'delete'; propagate as NaN
                               tokens[2].isEmpty() ? Double.NaN : Double.parseDouble(tokens[2])));
        } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
            log.warn("Bad input: {}", Arrays.toString(tokens));
            throw e;
        }
    });

    if (decayFactor < 1.0) {
        double factor = decayFactor;
        long now = System.currentTimeMillis();
        timestampRatingRDD = timestampRatingRDD.mapToPair(timestampRating -> {
            long timestamp = timestampRating._1();
            return new Tuple2<>(timestamp,
                                decayRating(timestampRating._2(), timestamp, now, factor));
        });
    }

    if (decayZeroThreshold > 0.0) {
        double theThreshold = decayZeroThreshold;
        timestampRatingRDD = timestampRatingRDD.filter(
                timestampRating -> timestampRating._2().rating() > theThreshold);
    }

    return timestampRatingRDD.sortByKey().values();
}
Example 5
Source File: ALSUpdate.java From oryx with Apache License 2.0
/**
 * Combines {@link Rating}s with the same user/item into one, with score as the sum of
 * all of the scores.
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
    JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples = original.mapToPair(
            rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));

    JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated;
    if (implicit) {
        // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
        // they don't guarantee the delete elements are properly handled
        aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
    } else {
        // For non-implicit, last wins.
        aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
    }

    JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN =
            aggregated.filter(kv -> !Double.isNaN(kv._2()));

    if (logStrength) {
        return noNaN.map(userProductScore -> new Rating(
                userProductScore._1()._1(),
                userProductScore._1()._2(),
                Math.log1p(userProductScore._2() / epsilon)));
    } else {
        return noNaN.map(userProductScore -> new Rating(
                userProductScore._1()._1(),
                userProductScore._1()._2(),
                userProductScore._2()));
    }
}
Example 6
Source File: TieredSpatialJoin.java From geowave with Apache License 2.0
private JavaPairRDD<GeoWaveInputKey, ByteArray> joinAndCompareTiers(
        final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> leftTier,
        final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> rightTier,
        final Broadcast<GeomFunction> geomPredicate,
        final int highestPartitionCount,
        final HashPartitioner partitioner) {
    // Cogroup groups on same tier ByteArrayId and pairs them into Iterable sets.
    JavaPairRDD<ByteArray, Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>> joinedTiers =
            leftTier.cogroup(rightTier, partitioner);

    // Filter only the pairs that have data on both sides; the bucket strategy
    // should have been accounted for by this point.
    // We need to go through the pairs and test each feature against each other.
    // End with a combined RDD for that tier.
    joinedTiers = joinedTiers.filter(t -> t._2._1.iterator().hasNext() && t._2._2.iterator().hasNext());

    final JavaPairRDD<GeoWaveInputKey, ByteArray> finalMatches = joinedTiers.flatMapValues(
            (Function<Tuple2<Iterable<Tuple2<GeoWaveInputKey, Geometry>>, Iterable<Tuple2<GeoWaveInputKey, Geometry>>>, Iterable<GeoWaveInputKey>>) t -> {
                final GeomFunction predicate = geomPredicate.value();
                final HashSet<GeoWaveInputKey> results = Sets.newHashSet();
                for (final Tuple2<GeoWaveInputKey, Geometry> leftTuple : t._1) {
                    for (final Tuple2<GeoWaveInputKey, Geometry> rightTuple : t._2) {
                        if (predicate.call(leftTuple._2, rightTuple._2)) {
                            results.add(leftTuple._1);
                            results.add(rightTuple._1);
                        }
                    }
                }
                return results;
            })
            .mapToPair(Tuple2::swap)
            .reduceByKey(partitioner, (id1, id2) -> id1)
            .persist(StorageLevel.MEMORY_ONLY_SER());

    return finalMatches;
}
Example 7
Source File: AggregateUnarySPInstruction.java From systemds with Apache License 2.0
private void processMatrixAggregate(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    DataCharacteristics mc = sec.getDataCharacteristics(input1.getName());

    //get input
    JavaPairRDD<MatrixIndexes,MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = in;

    //filter input blocks for trace
    if( getOpcode().equalsIgnoreCase("uaktrace") )
        out = out.filter(new FilterDiagMatrixBlocksFunction());

    //execute unary aggregate operation
    AggregateUnaryOperator auop = (AggregateUnaryOperator)_optr;
    AggregateOperator aggop = _aop;

    //perform aggregation if necessary and put output into symbol table
    if( _aggtype == SparkAggType.SINGLE_BLOCK ) {
        if( auop.sparseSafe )
            out = out.filter(new FilterNonEmptyBlocksFunction());

        JavaRDD<MatrixBlock> out2 = out.map(
            new RDDUAggFunction2(auop, mc.getBlocksize()));
        MatrixBlock out3 = RDDAggregateUtils.aggStable(out2, aggop);

        //drop correction after aggregation
        out3.dropLastRowsOrColumns(aggop.correction);

        //put output block into symbol table (no lineage because single block)
        //this also includes implicit maintenance of matrix characteristics
        sec.setMatrixOutput(output.getName(), out3);
    }
    else { //MULTI_BLOCK or NONE
        if( _aggtype == SparkAggType.NONE ) {
            //in case of no block aggregation, we always drop the correction as well as
            //use a partitioning-preserving mapvalues
            out = out.mapValues(new RDDUAggValueFunction(auop, mc.getBlocksize()));
        }
        else if( _aggtype == SparkAggType.MULTI_BLOCK ) {
            //in case of multi-block aggregation, we always keep the correction
            out = out.mapToPair(new RDDUAggFunction(auop, mc.getBlocksize()));
            out = RDDAggregateUtils.aggByKeyStable(out, aggop, false);

            //drop correction after aggregation if required (aggbykey creates
            //partitioning, drop correction via partitioning-preserving mapvalues)
            if( auop.aggOp.existsCorrection() )
                out = out.mapValues( new AggregateDropCorrectionFunction(aggop) );
        }

        //put output RDD handle into symbol table
        updateUnaryAggOutputDataCharacteristics(sec, auop.indexFn);
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    }
}
Example 8
Source File: InteractionAnalysisAdvanced.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // get non-redundant subset
    pdb = pdb.filter(new Pisces(40, 2.5));

    // find Zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
            .groupBy("residue2")
            .count();

    topGroups
            .sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("residue2","atom2")
            .count();

    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .filter("frequency > 0.01") // filter out occurrences < 1 %
            .sort(col("frequency").desc()) // sort descending
            .show(20);

    // TODO print the top 10 interacting elements
    System.out.println("Top interacting elements");

    Dataset<Row> topElements = interactions
            .filter("element2 != 'C'") // exclude carbon interactions
            .groupBy("element2")
            .count();

    topElements
            .withColumn("frequency", col("count").divide(n))
            .filter("frequency > 0.01") // filter out occurrences < 1 %
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    interactions
            .groupBy("element2")
            .avg("distance")
            .sort("avg(distance)")
            .show(10);

    // Aggregate multiple statistics
    // Note: import static org.apache.spark.sql.functions.* required!
    // e.g. org.apache.spark.sql.functions.avg
    // for a list of all available functions
    interactions
            .groupBy("element2")
            .agg(count("distance"), avg("distance"), min("distance"), max("distance"), kurtosis("distance"))
            .show(10);

    long end = System.nanoTime();

    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
Example 9
Source File: AtpInteractionAnalysis.java From mmtf-spark with Apache License 2.0
/**
 * @param args input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(AtpInteractionAnalysis.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // filter by sequence identity subset
    int sequenceIdentity = 20;
    double resolution = 2.0;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    // find ATP interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ATP", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // TODO add a line to only analyze interactions
    // with the oxygens in the terminal phosphate group of ATP
    // (O1G, O2G, O3G)
    // Tip: Google SQL LIKE
    interactions = interactions.filter("atom1 LIKE('O%G')");

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
            .groupBy("residue2")
            .count();

    topGroups
            .sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
            .groupBy("residue2","atom2")
            .count();

    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    long end = System.nanoTime();

    System.out.println("Time: " + (end-start)/1E9 + "sec.");

    sc.close();
}
Example 10
Source File: FilterBySequenceRegex.java From mmtf-spark with Apache License 2.0
/**
 * @param args
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {

    String path = MmtfReader.getMmtfReducedPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterBySequenceRegex.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // find structures containing a Zinc finger motif
    pdb = pdb.filter(new ContainsSequenceRegex("C.{2,4}C.{12}H.{3,5}H"));

    System.out.println("Number of PDB entries containing a Zinc finger motif: " + pdb.count());

    long end = System.nanoTime();

    System.out.println("Time: " + (end-start)/1E9 + " sec.");

    sc.close();
}
Example 11
Source File: FilterByResolution.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

    String path = MmtfReader.getMmtfReducedPath();

    long start = System.nanoTime();

    // instantiate Spark. Each Spark application needs these two lines of code.
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByResolution.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read entire PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // filter PDB entries by resolution. Entries without resolution values,
    // e.g., NMR structures, will be filtered out as well.
    pdb = pdb.filter(new Resolution(0.0, 2.0));

    System.out.println("# structures: " + pdb.count());

    // close Spark
    sc.close();

    long end = System.nanoTime();
    System.out.println((end-start)/1E9 + " sec.");
}
Example 12
Source File: TieredSpatialJoin.java From geowave with Apache License 2.0
private JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> filterTier(
        final JavaPairRDD<ByteArray, Tuple2<GeoWaveInputKey, Geometry>> indexedRDD,
        final byte tierId) {
    return indexedRDD.filter(v1 -> v1._1().getBytes()[0] == tierId);
}