org.apache.spark.api.java.function.Function2 Java Examples
The following examples show how to use
org.apache.spark.api.java.function.Function2.
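Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; class and data names are illustrative only) of what Function2 looks like in use. It receives two inputs and returns one result, and it is most commonly passed to reduce or reduceByKey, where all three type parameters are the same.

import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class Function2Sketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[2]", "Function2Sketch");
        JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4));

        // Anonymous-class form, as used in most of the examples below
        Integer sum = numbers.reduce(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer a, Integer b) {
                return a + b;
            }
        });

        // Equivalent lambda form (Java 8+), since Function2 has a single abstract method, call
        Integer sumLambda = numbers.reduce((a, b) -> a + b);

        System.out.println(sum + " " + sumLambda); // 10 10
        sc.close();
    }
}

The anonymous-class and lambda forms are interchangeable; the examples below show both styles.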
Example #1
Source File: KafkaStreaming.java From sparkResearch with Apache License 2.0 | 8 votes |
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // Set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");

    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap(
            (FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 ->
                    Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // Count the words
    JavaPairDStream<String, Integer> result = words
            .mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1))
            .reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #2
Source File: Reduce.java From SparkDemo with MIT License | 6 votes |
private static void reduce(JavaSparkContext sc) {
    List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    JavaRDD<Integer> javaRDD = sc.parallelize(numberList);

    /**
     * =====================================================
     * |                  Cumulative sum                   |
     * =====================================================
     */
    Integer num = javaRDD.reduce(new Function2<Integer, Integer, Integer>() {
        /**
         * @param num1 the value returned by the previous call
         * @param num2 the current value
         */
        @Override
        public Integer call(Integer num1, Integer num2) throws Exception {
            // System.out.println(num1 + "======" + num2);
            return num1 + num2;
        }
    });

    System.out.println(num);

    sc.close();
}
Example #3
Source File: SparkCubingByLayer.java From kylin-on-parquet-v2 with Apache License 2.0 | 6 votes |
private Long getRDDCountSum(JavaPairRDD<ByteArray, Object[]> rdd, final int countMeasureIndex) {
    final ByteArray ONE = new ByteArray();
    Long count = rdd.mapValues(new Function<Object[], Long>() {
        @Override
        public Long call(Object[] objects) throws Exception {
            return (Long) objects[countMeasureIndex];
        }
    }).reduce(new Function2<Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>>() {
        @Override
        public Tuple2<ByteArray, Long> call(Tuple2<ByteArray, Long> longTuple2, Tuple2<ByteArray, Long> longTuple22)
                throws Exception {
            return new Tuple2<>(ONE, longTuple2._2() + longTuple22._2());
        }
    })._2();
    return count;
}
Example #4
Source File: NGlobalDictionaryV2Test.java From kylin-on-parquet-v2 with Apache License 2.0 | 6 votes |
private void runWithSparkBuildGlobalDict(NGlobalDictionaryV2 dict, List<String> stringSet) throws IOException {
    KylinConfig config = KylinConfig.getInstanceFromEnv();
    dict.prepareWrite();
    List<Row> rowList = Lists.newLinkedList();
    for (String str : stringSet) {
        rowList.add(RowFactory.create(str));
    }
    Dataset<Row> ds = ss.createDataFrame(rowList,
            new StructType(new StructField[] { DataTypes.createStructField("col1", DataTypes.StringType, true) }));
    ds.toJavaRDD().mapToPair((PairFunction<Row, String, String>) row -> {
        if (row.get(0) == null)
            return new Tuple2<>(null, null);
        return new Tuple2<>(row.get(0).toString(), null);
    }).sortByKey().partitionBy(new HashPartitioner(BUCKET_SIZE)).mapPartitionsWithIndex(
            (Function2<Integer, Iterator<Tuple2<String, String>>, Iterator<Object>>) (bucketId, tuple2Iterator) -> {
                NBucketDictionary bucketDict = dict.loadBucketDictionary(bucketId);
                while (tuple2Iterator.hasNext()) {
                    Tuple2<String, String> tuple2 = tuple2Iterator.next();
                    bucketDict.addRelativeValue(tuple2._1);
                }
                bucketDict.saveBucketDict(bucketId);
                return Lists.newArrayList().iterator();
            }, true).count();

    dict.writeMetaDict(BUCKET_SIZE, config.getGlobalDictV2MaxVersions(), config.getGlobalDictV2VersionTTL());
}
Example #5
Source File: BlurBulkLoadSparkProcessor.java From incubator-retired-blur with Apache License 2.0 | 6 votes |
@Override
protected Function2<JavaPairRDD<String, RowMutation>, Time, Void> getFunction() {
    return new Function2<JavaPairRDD<String, RowMutation>, Time, Void>() {
        // Blur Thrift Client
        @Override
        public Void call(JavaPairRDD<String, RowMutation> rdd, Time time) throws Exception {
            Iface client = getBlurClient();
            for (Tuple2<String, RowMutation> tuple : rdd.collect()) {
                if (tuple != null) {
                    try {
                        RowMutation rm = tuple._2;
                        // Index using enqueue mutate call
                        client.enqueueMutate(rm);
                    } catch (Exception ex) {
                        LOG.error("Unknown error while trying to call enqueueMutate.", ex);
                        throw ex;
                    }
                }
            }
            return null;
        }
    };
}
Example #6
Source File: ReduceTransform.java From incubator-nemo with Apache License 2.0 | 6 votes |
/**
 * Reduce the iterator elements into a single object.
 *
 * @param elements the iterator of elements.
 * @param func     function to apply for reduction.
 * @param <T>      type of the elements.
 * @return the reduced element.
 */
@Nullable
public static <T> T reduceIterator(final Iterator<T> elements, final Function2<T, T, T> func) {
    if (!elements.hasNext()) { // nothing to be done
        return null;
    }

    T res = elements.next();
    while (elements.hasNext()) {
        try {
            res = func.call(res, elements.next());
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
    return res;
}
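A quick usage sketch for reduceIterator (hypothetical, not part of the Nemo sources; it assumes the ReduceTransform class above is importable): summing an integer iterator with a Function2 lambda.

import java.util.Arrays;
import org.apache.spark.api.java.function.Function2;

public class ReduceIteratorSketch {
    public static void main(String[] args) {
        // Hypothetical call; the iterator data is made up.
        Integer sum = ReduceTransform.reduceIterator(
                Arrays.asList(1, 2, 3, 4).iterator(),
                (Function2<Integer, Integer, Integer>) (a, b) -> a + b);
        System.out.println(sum); // prints 10
    }
}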
Example #7
Source File: SparkCubingByLayer.java From kylin with Apache License 2.0 | 6 votes |
private Long getRDDCountSum(JavaPairRDD<ByteArray, Object[]> rdd, final int countMeasureIndex) {
    final ByteArray ONE = new ByteArray();
    Long count = rdd.mapValues(new Function<Object[], Long>() {
        @Override
        public Long call(Object[] objects) throws Exception {
            return (Long) objects[countMeasureIndex];
        }
    }).reduce(new Function2<Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>, Tuple2<ByteArray, Long>>() {
        @Override
        public Tuple2<ByteArray, Long> call(Tuple2<ByteArray, Long> longTuple2, Tuple2<ByteArray, Long> longTuple22)
                throws Exception {
            return new Tuple2<>(ONE, longTuple2._2() + longTuple22._2());
        }
    })._2();
    return count;
}
Example #8
Source File: Tokenizer.java From vn.vitk with GNU General Public License v3.0 | 6 votes |
/**
 * Counts the number of non-space characters in this data set. This utility method
 * is used to check the tokenization result.
 * @param lines
 * @return number of characters
 */
int numCharacters(JavaRDD<String> lines) {
    JavaRDD<Integer> lengths = lines.map(new Function<String, Integer>() {
        private static final long serialVersionUID = -2189399343462982586L;

        @Override
        public Integer call(String line) throws Exception {
            line = line.replaceAll("[\\s_]+", "");
            return line.length();
        }
    });
    return lengths.reduce(new Function2<Integer, Integer, Integer>() {
        private static final long serialVersionUID = -8438072946884289401L;

        @Override
        public Integer call(Integer e0, Integer e1) throws Exception {
            return e0 + e1;
        }
    });
}
Example #9
Source File: ReduceTransform.java From nemo with Apache License 2.0 | 6 votes |
/**
 * Reduce the iterator elements into a single object.
 * @param elements the iterator of elements.
 * @param func function to apply for reduction.
 * @param <T> type of the elements.
 * @return the reduced element.
 */
@Nullable
public static <T> T reduceIterator(final Iterator<T> elements, final Function2<T, T, T> func) {
    if (!elements.hasNext()) { // nothing to be done
        return null;
    }

    T res = elements.next();
    while (elements.hasNext()) {
        try {
            res = func.call(res, elements.next());
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
    return res;
}
Example #10
Source File: SparkJavaPairRDD.java From incubator-nemo with Apache License 2.0 | 5 votes |
@Override
public <U> SparkJavaPairRDD<K, U> aggregateByKey(final U zeroValue,
                                                 final Partitioner partitioner,
                                                 final Function2<U, V, U> seqFunc,
                                                 final Function2<U, U, U> combFunc) {
    throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
Example #11
Source File: MLSupporter.java From DDF with Apache License 2.0 | 5 votes |
@Override
public long[][] getConfusionMatrix(IModel model, double threshold) throws DDFException {
    SparkDDF ddf = (SparkDDF) this.getDDF();
    SparkDDF predictions = (SparkDDF) ddf.ML.applyModel(model, true, false);

    // Now get the underlying RDD to compute
    JavaRDD<double[]> yTrueYPred = (JavaRDD<double[]>) predictions.getJavaRDD(double[].class);
    final double threshold1 = threshold;
    long[] cm = yTrueYPred.map(new Function<double[], long[]>() {
        @Override
        public long[] call(double[] params) {
            byte isPos = toByte(params[0] > threshold1);
            byte predPos = toByte(params[1] > threshold1);
            long[] result = new long[] { 0L, 0L, 0L, 0L };
            result[isPos << 1 | predPos] = 1L;
            return result;
        }
    }).reduce(new Function2<long[], long[], long[]>() {
        @Override
        public long[] call(long[] a, long[] b) {
            return new long[] { a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3] };
        }
    });

    return new long[][] { new long[] { cm[3], cm[2] }, new long[] { cm[1], cm[0] } };
}
Example #12
Source File: FunctionCompiler.java From rheem with Apache License 2.0 | 5 votes |
/**
 * Create an appropriate {@link Function} for deploying the given {@link ReduceDescriptor}
 * on Apache Spark.
 */
public <T> Function2<T, T, T> compile(ReduceDescriptor<T> descriptor,
                                      SparkExecutionOperator operator,
                                      OptimizationContext.OperatorContext operatorContext,
                                      ChannelInstance[] inputs) {
    final BinaryOperator<T> javaImplementation = descriptor.getJavaImplementation();
    if (javaImplementation instanceof FunctionDescriptor.ExtendedSerializableBinaryOperator) {
        return new ExtendedBinaryOperatorAdapter<>(
                (FunctionDescriptor.ExtendedSerializableBinaryOperator<T>) javaImplementation,
                new SparkExecutionContext(operator, inputs, operatorContext.getOptimizationContext().getIterationNumber())
        );
    } else {
        return new BinaryOperatorAdapter<>(javaImplementation);
    }
}
Example #13
Source File: CountLines.java From examples with Apache License 2.0 | 5 votes |
@SuppressWarnings("serial") public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkGetExample ").setMaster("local[2]"); JavaSparkContext jsc = new JavaSparkContext(sparkConf); JavaRDD<String> textFile = jsc.textFile("hdfs://localhost/user/cloudera/data.txt"); JavaPairRDD<String, Integer> pairs = textFile.mapToPair(new PairFunction<String, String, Integer>() { public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s.substring(0, s.indexOf("|")), 1); } }); JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() { public Integer call(Integer a, Integer b) { return a + b; } }); System.out.println ("We have generaged " + counts.count() + " users"); jsc.close(); }
Example #14
Source File: SparkJavaPairRDD.java From incubator-nemo with Apache License 2.0 | 5 votes |
@Override
public SparkJavaPairRDD<K, V> reduceByKey(final Function2<V, V, V> func) {
    // Explicit conversion
    final PairRDDFunctions<K, V> pairRdd = RDD.rddToPairRDDFunctions(
            rdd, ClassTag$.MODULE$.apply(Object.class), ClassTag$.MODULE$.apply(Object.class), null);
    final RDD<Tuple2<K, V>> reducedRdd = pairRdd.reduceByKey(func);
    return SparkJavaPairRDD.fromRDD(reducedRdd);
}
Example #15
Source File: SparkJavaPairRDD.java From incubator-nemo with Apache License 2.0 | 5 votes |
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final int numPartitions) {
    throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
Example #16
Source File: SparkJavaPairRDD.java From incubator-nemo with Apache License 2.0 | 5 votes |
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final Partitioner partitioner) {
    throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
Example #17
Source File: SparkJavaPairRDD.java From incubator-nemo with Apache License 2.0 | 5 votes |
@Override
public <C> SparkJavaPairRDD<K, C> combineByKey(final Function<V, C> createCombiner,
                                               final Function2<C, V, C> mergeValue,
                                               final Function2<C, C, C> mergeCombiners,
                                               final Partitioner partitioner,
                                               final boolean mapSideCombine,
                                               final Serializer serializer) {
    throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
Example #18
Source File: SparkJavaPairRDD.java From incubator-nemo with Apache License 2.0 | 5 votes |
@Override
public <U> SparkJavaPairRDD<K, U> aggregateByKey(final U zeroValue,
                                                 final int numPartitions,
                                                 final Function2<U, V, U> seqFunc,
                                                 final Function2<U, U, U> combFunc) {
    throw new UnsupportedOperationException(NOT_YET_SUPPORTED);
}
Example #19
Source File: JavaPairRDD.java From nemo with Apache License 2.0 | 5 votes |
@Override
public JavaPairRDD<K, V> reduceByKey(final Function2<V, V, V> func) {
    final DAGBuilder<IRVertex, IREdge> builder = new DAGBuilder<>(dag);

    final IRVertex reduceByKeyVertex = new OperatorVertex(new ReduceByKeyTransform<K, V>(func));
    builder.addVertex(reduceByKeyVertex, loopVertexStack);

    final IREdge newEdge = new IREdge(getEdgeCommunicationPattern(lastVertex, reduceByKeyVertex),
            lastVertex, reduceByKeyVertex, new SparkCoder(serializer));
    newEdge.setProperty(KeyExtractorProperty.of(new SparkKeyExtractor()));
    builder.connectVertices(newEdge);

    return new JavaPairRDD<>(this.sparkContext, builder.buildWithoutSourceSinkCheck(), reduceByKeyVertex);
}
Example #20
Source File: JavaLogQuery.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    SparkSession spark = SparkSession
            .builder()
            .appName("JavaLogQuery")
            .getOrCreate();

    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    JavaRDD<String> dataSet = (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs);

    JavaPairRDD<Tuple3<String, String, String>, Stats> extracted =
            dataSet.mapToPair(new PairFunction<String, Tuple3<String, String, String>, Stats>() {
                @Override
                public Tuple2<Tuple3<String, String, String>, Stats> call(String s) {
                    return new Tuple2<>(extractKey(s), extractStats(s));
                }
            });

    JavaPairRDD<Tuple3<String, String, String>, Stats> counts =
            extracted.reduceByKey(new Function2<Stats, Stats, Stats>() {
                @Override
                public Stats call(Stats stats, Stats stats2) {
                    return stats.merge(stats2);
                }
            });

    List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect();
    for (Tuple2<?, ?> t : output) {
        System.out.println(t._1() + "\t" + t._2());
    }
    spark.stop();
}
Example #21
Source File: JavaCustomReceiver.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaCustomReceiver <hostname> <port>");
        System.exit(1);
    }

    StreamingExamples.setStreamingLogLevels();

    // Create the context with a 1 second batch size
    SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000));

    // Create an input stream with the custom receiver on target ip:port and count the
    // words in input stream of \n delimited text (eg. generated by 'nc')
    JavaReceiverInputDStream<String> lines = ssc.receiverStream(
            new JavaCustomReceiver(args[0], Integer.parseInt(args[1])));
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(SPACE.split(x)).iterator();
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
            new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) {
                    return new Tuple2<>(s, 1);
                }
            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.print();
    ssc.start();
    ssc.awaitTermination();
}
Example #22
Source File: MapPartitionsWithIndex.java From SparkDemo with MIT License | 5 votes |
private static void mapPartitionsWithIndex(JavaSparkContext sc) {
    List<String> names = Arrays.asList("张三1", "李四1", "王五1", "张三2", "李四2", "王五2", "张三3", "李四3", "王五3", "张三4");

    // Initialize the RDD with 3 partitions
    JavaRDD<String> namesRDD = sc.parallelize(names, 3);

    JavaRDD<String> mapPartitionsWithIndexRDD = namesRDD
            .mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

                private static final long serialVersionUID = 1L;

                public Iterator<String> call(Integer v1, Iterator<String> v2) throws Exception {
                    List<String> list = new ArrayList<String>();
                    while (v2.hasNext()) {
                        list.add("Partition index: " + v1 + "\t" + v2.next());
                    }
                    return list.iterator();
                }
            }, true);

    // Collect the data from the cluster into local memory
    List<String> result = mapPartitionsWithIndexRDD.collect();
    for (String s : result) {
        System.out.println(s);
    }

    sc.close();
}
Example #23
Source File: Functions.java From spark-streaming-direct-kafka with Apache License 2.0 | 5 votes |
/**
 * @return a function that returns the second of two values
 * @param <T> element type
 */
public static <T> Function2<T, T, T> last() {
    return new Function2<T, T, T>() {
        @Override
        public T call(T current, T next) {
            return next;
        }
    };
}
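A hypothetical usage sketch for the last() helper above (the RDD contents and class names are made up, and a single partition is used so the reduction order is deterministic): handing it to reduceByKey keeps only the most recently seen value per key.

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class LastValueSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[2]", "LastValueSketch");
        // One partition keeps the encounter order deterministic for this illustration
        JavaPairRDD<String, String> records = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("user1", "pageA"),
                new Tuple2<>("user1", "pageB")), 1);
        JavaPairRDD<String, String> latest = records.reduceByKey(Functions.<String>last());
        latest.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2())); // user1 -> pageB
        sc.close();
    }
}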
Example #24
Source File: MizoBuilder.java From mizo with Apache License 2.0 | 4 votes |
@Override
public Function2<MizoVertex, String, Boolean> parseVertexProperty() {
    return parseVertexProperty;
}
Example #25
Source File: MizoBuilder.java From mizo with Apache License 2.0 | 4 votes |
public MizoBuilder parseInEdge(Function2<MizoVertex, String, Boolean> predicate) {
    this.parseInEdge = predicate;
    return this;
}
Example #26
Source File: MizoBuilder.java From mizo with Apache License 2.0 | 4 votes |
@Override
public Function2<MizoVertex, String, Boolean> parseOutEdge() {
    return parseOutEdge;
}
Example #27
Source File: CoverageModelEMWorkspace.java From gatk-protected with BSD 3-Clause "New" or "Revised" License | 4 votes |
/**
 * A generic function for broadcasting an object to all compute blocks
 *
 * If Spark is enabled:
 *
 *      A {@link Broadcast} will be created from {@param obj} and will be "received" by the compute nodes by calling
 *      {@param pusher}. A reference to the updated RDD will replace the old RDD.
 *
 * If Spark is disabled:
 *
 *      The {@param pusher} function will be called together with {@param obj} and {@link #localComputeBlock}
 *
 * @param obj the object to broadcast
 * @param pusher a map from (V, {@link CoverageModelEMComputeBlock}) -> {@link CoverageModelEMComputeBlock} that
 *               updates the compute block with the broadcasted value
 * @param <V> the type of the broadcasted object
 */
@UpdatesRDD
private <V> void pushToWorkers(@Nonnull final V obj,
                               @Nonnull final Function2<V, CoverageModelEMComputeBlock, CoverageModelEMComputeBlock> pusher) {
    if (sparkContextIsAvailable) {
        final Broadcast<V> broadcastedObj = ctx.broadcast(obj);
        final Function<CoverageModelEMComputeBlock, CoverageModelEMComputeBlock> mapper =
                cb -> pusher.call(broadcastedObj.value(), cb);
        mapWorkers(mapper);
    } else {
        try {
            localComputeBlock = pusher.call(obj, localComputeBlock);
        } catch (final Exception ex) {
            throw new RuntimeException("Can not apply the map function to the local compute block", ex);
        }
    }
}
Example #28
Source File: MizoBuilder.java From mizo with Apache License 2.0 | 4 votes |
@Override
public Function2<MizoVertex, String, Boolean> parseInEdge() {
    return parseInEdge;
}
Example #29
Source File: MizoBuilder.java From mizo with Apache License 2.0 | 4 votes |
@Override
public Function2<MizoEdge, String, Boolean> parseEdgeProperty() {
    return parseEdgeProperty;
}
Example #30
Source File: MizoBuilder.java From mizo with Apache License 2.0 | 4 votes |
public MizoBuilder parseOutEdge(Function2<MizoVertex, String, Boolean> predicate) {
    this.parseOutEdge = predicate;
    return this;
}