org.apache.spark.api.java.function.Function Java Examples
The following examples show how to use
org.apache.spark.api.java.function.Function.
You can go to the original project or source file by following the link above each example.
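As a quick orientation before the examples: Function<T, R> is Spark's serializable single-argument function interface, and it can be passed to operations such as JavaRDD.map and JavaRDD.filter either as an anonymous inner class (the pre-Java-8 style used in most examples below) or as a Java 8 lambda. The following is a minimal, self-contained sketch, not taken from any of the projects below; the local master, app name, and sample data are illustrative assumptions.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class FunctionUsageSketch {
    public static void main(String[] args) {
        // Illustrative local setup; master and app name are assumptions for this sketch.
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("FunctionUsageSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

        // Anonymous inner class implementing Function<Integer, Integer>.
        JavaRDD<Integer> doubled = numbers.map(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer v1) throws Exception {
                return v1 * 2;
            }
        });

        // Java 8 lambda expressing the same kind of Function more compactly.
        JavaRDD<Boolean> isEven = numbers.map(v1 -> v1 % 2 == 0);

        List<Integer> doubledValues = doubled.collect();   // [2, 4, 6, 8, 10]
        List<Boolean> evenFlags = isEven.collect();        // [false, true, false, true, false]
        System.out.println(doubledValues);
        System.out.println(evenFlags);

        sc.close();
    }
}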
Example #1
Source File: SparkCombineFn.java From beam with Apache License 2.0 | 6 votes |
SparkCombineFn(
    boolean global,
    Function<InputT, ValueT> toValue,
    CombineWithContext.CombineFnWithContext<ValueT, AccumT, OutputT> combineFn,
    SerializablePipelineOptions options,
    Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> sideInputs,
    WindowingStrategy<?, ?> windowingStrategy) {
  this(
      global,
      toValue,
      combineFn,
      options,
      sideInputs,
      windowingStrategy,
      WindowedAccumulator.Type.EXPLODE_WINDOWS);
}
Example #2
Source File: SparkStreamDemo.java From sparkResearch with Apache License 2.0 | 6 votes |
public static void main(String[] args) {
    // Create a local context with two worker threads and a batch interval of 1 second
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("sparkStreamIng");
    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(conf, Durations.seconds(1));
    // Create a DStream connected to localhost:8080
    JavaReceiverInputDStream<String> dStream = javaStreamingContext.socketTextStream("localhost", 8080);
    JavaDStream<String> errorLine = dStream.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String v1) throws Exception {
            return v1.contains("error");
        }
    });
    // Print the lines containing "error"
    errorLine.print();
    try {
        // Start the computation
        javaStreamingContext.start();
        // Wait for the computation to terminate
        javaStreamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #3
Source File: FunctionCompiler.java From rheem with Apache License 2.0 | 6 votes |
/**
 * Create an appropriate {@link Function}-based predicate for deploying the given {@link PredicateDescriptor}
 * on Apache Spark.
 *
 * @param predicateDescriptor describes the function
 * @param operator            that executes the {@link Function}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 * @param operatorContext     contains optimization information for the {@code operator}
 * @param inputs              that feed the {@code operator}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 */
public <Type> Function<Type, Boolean> compile(
        PredicateDescriptor<Type> predicateDescriptor,
        SparkExecutionOperator operator,
        OptimizationContext.OperatorContext operatorContext,
        ChannelInstance[] inputs) {
    final Predicate<Type> javaImplementation = predicateDescriptor.getJavaImplementation();
    if (javaImplementation instanceof PredicateDescriptor.ExtendedSerializablePredicate) {
        return new ExtendedPredicateAdapater<>(
                (PredicateDescriptor.ExtendedSerializablePredicate<Type>) javaImplementation,
                new SparkExecutionContext(operator, inputs, operatorContext.getOptimizationContext().getIterationNumber())
        );
    } else {
        return new PredicateAdapter<>(javaImplementation);
    }
}
Example #4
Source File: LogError.java From sparkResearch with Apache License 2.0 | 6 votes |
/**
 * Apply a transformation and an action to the log file.
 */
public void log(JavaSparkContext sparkContext) {
    JavaRDD<String> inputRDD = sparkContext.textFile("/usr/local/log");
    JavaRDD<String> errorRDD = inputRDD.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String v1) throws Exception {
            // keep only the lines that contain "error"
            return v1.contains("error");
        }
    });
    long errorRDDCount = errorRDD.count();
    System.out.println("errorRDD count is " + errorRDDCount);
    for (String rddLine : errorRDD.take(10)) {
        System.out.println("errorRDD data is " + rddLine);
    }
}
Example #5
Source File: Tagger.java From vn.vitk with GNU General Public License v3.0 | 6 votes |
private JavaRDD<String> toTaggedSentence(DataFrame output) {
    return output.javaRDD().map(new Function<Row, String>() {
        private static final long serialVersionUID = 4208643510231783579L;

        @Override
        public String call(Row row) throws Exception {
            String[] tokens = row.getString(0).trim().split("\\s+");
            String[] tags = row.getString(1).trim().split("\\s+");
            if (tokens.length != tags.length) {
                System.err.println("Incompatible lengths!");
                return null;
            }
            StringBuilder sb = new StringBuilder(64);
            for (int j = 0; j < tokens.length; j++) {
                sb.append(tokens[j]);
                sb.append('/');
                sb.append(tags[j]);
                sb.append(' ');
            }
            return sb.toString().trim();
        }
    });
}
Example #6
Source File: SparkSqlApplication.java From sparkResearch with Apache License 2.0 | 6 votes |
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();

    Dataset<Row> dataset = sparkSession.read().json("D:\\a.txt");

    // Return only the "name" column
    dataset.select("name").show();
    // Return two columns, adding 1 to every "age" value
    dataset.select(col("name"), col("age").plus(1)).show();
    // Select people older than 21
    dataset.filter(col("age").gt(21)).show();
    // Group by age and count
    dataset.groupBy("age").count().show();
    // Show the dataset
    dataset.show();

    /* Run SQL queries programmatically */
    // Register a temporary view
    dataset.createOrReplaceTempView("user");
    Dataset<Row> users = sparkSession.sql("SELECT * FROM user");
    JavaRDD<Object> toText = users.toJavaRDD().map((Function<Row, Object>) v1 -> v1.getString(0));
    users.show();
}
Example #7
Source File: BroadCastParam.java From sparkResearch with Apache License 2.0 | 6 votes |
/**
 * Broadcast variable test.
 * @param args
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // Initialize the sparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // Assume this list is the data to broadcast
    // As noted earlier, broadcast variables are read-only
    final List<String> broadcastList = Arrays.asList("190099HJLL", "98392QUEYY", "561788LLKK");
    // Create the broadcast variable and send it out
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(broadcastList);
    // Define the data
    JavaPairRDD<String, String> pairRDD = javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    JavaPairRDD<String, String> resultPairRDD = pairRDD.filter((Function<Tuple2<String, String>, Boolean>) v1 -> broadcast.value().contains(v1._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
Example #8
Source File: DependencyParser.java From vn.vitk with GNU General Public License v3.0 | 6 votes |
/**
 * Parses all sentences in an input file, one per line, and writes the
 * flattened dependency tuples to the console window.
 * @param jsc
 * @param inputFileName
 */
public void parse(JavaSparkContext jsc, String inputFileName) {
    List<String> sentences = jsc.textFile(inputFileName).collect();
    JavaRDD<String> input = jsc.parallelize(sentences);
    JavaRDD<Sentence> sents = input.map(new TaggedLineToSentenceFunction());
    JavaRDD<DependencyGraph> graphs = sents.map(new ParsingFunction());
    JavaRDD<String> rows = graphs.map(new Function<DependencyGraph, String>() {
        private static final long serialVersionUID = -6021310762521034121L;

        public String call(DependencyGraph graph) {
            return graph.dependencies();
        }
    });
    for (String s : rows.collect()) {
        System.out.println(s);
    }
}
Example #9
Source File: NGramBuilder.java From vn.vitk with GNU General Public License v3.0 | 6 votes |
/**
 * Creates an n-gram data frame from text lines.
 * @param lines
 * @return an n-gram data frame.
 */
DataFrame createNGramDataFrame(JavaRDD<String> lines) {
    JavaRDD<Row> rows = lines.map(new Function<String, Row>() {
        private static final long serialVersionUID = -4332903997027358601L;

        @Override
        public Row call(String line) throws Exception {
            return RowFactory.create(Arrays.asList(line.split("\\s+")));
        }
    });
    StructType schema = new StructType(new StructField[] {
            new StructField("words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
    });
    DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema);
    // build a bigram language model
    NGram transformer = new NGram().setInputCol("words")
            .setOutputCol("ngrams").setN(2);
    DataFrame ngramDF = transformer.transform(wordDF);
    ngramDF.show(10, false);
    return ngramDF;
}
Example #10
Source File: SparkUtil.java From kylin-on-parquet-v2 with Apache License 2.0 | 6 votes |
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
    SparkSession sparkSession = SparkSession.builder().sparkContext(HiveUtils.withHiveExternalCatalog(sc.sc()))
            .config(sc.getConf()).enableHiveSupport().getOrCreate();
    final Dataset intermediateTable = sparkSession.table(hiveTable);
    return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
        @Override
        public String[] call(Row row) throws Exception {
            String[] result = new String[row.size()];
            for (int i = 0; i < row.size(); i++) {
                final Object o = row.get(i);
                if (o != null) {
                    result[i] = o.toString();
                } else {
                    result[i] = null;
                }
            }
            return result;
        }
    });
}
Example #11
Source File: IteratorUtilsTest.java From kylin-on-parquet-v2 with Apache License 2.0 | 6 votes |
private static ArrayList<Tuple2<Integer, Integer>> getResult(List<Tuple2<Integer, Integer>> list) {
    return Lists.newArrayList(IteratorUtils.merge(list.iterator(), new Comparator<Integer>() {
        @Override
        public int compare(Integer o1, Integer o2) {
            return o1 - o2;
        }
    }, new Function<Iterable<Integer>, Integer>() {
        @Override
        public Integer call(Iterable<Integer> v1) throws Exception {
            int sum = 0;
            for (Integer integer : v1) {
                sum += integer;
            }
            return sum;
        }
    }));
}
Example #12
Source File: SparkCombineFn.java From beam with Apache License 2.0 | 6 votes |
/** Create concrete accumulator for given type. */
static <InputT, ValueT, AccumT> WindowedAccumulator<InputT, ValueT, AccumT, ?> create(
    Function<InputT, ValueT> toValue,
    Type type,
    Iterable<WindowedValue<AccumT>> values,
    Comparator<BoundedWindow> windowComparator) {
  switch (type) {
    case MERGING:
      return MergingWindowedAccumulator.from(toValue, values, windowComparator);
    case NON_MERGING:
      return NonMergingWindowedAccumulator.from(toValue, values);
    case SINGLE_WINDOW:
    case EXPLODE_WINDOWS:
      Iterator<WindowedValue<AccumT>> iter = values.iterator();
      if (iter.hasNext()) {
        return SingleWindowWindowedAccumulator.create(toValue, iter.next());
      }
      return SingleWindowWindowedAccumulator.create(toValue);
    default:
      throw new IllegalArgumentException("Unknown type: " + type);
  }
}
Example #13
Source File: Filter.java From SparkDemo with MIT License | 6 votes |
private static void filter(JavaSparkContext sc) {
    List<Integer> datas = Arrays.asList(1, 2, 3, 7, 4, 5, 8);

    JavaRDD<Integer> rddData = sc.parallelize(datas);

    JavaRDD<Integer> filterRDD = rddData.filter(
            // jdk1.8
            // v1 -> v1 >= 3
            new Function<Integer, Boolean>() {
                public Boolean call(Integer v) throws Exception {
                    // filter out numbers smaller than 4
                    return v >= 4;
                }
            });

    filterRDD.foreach(
            // jdk1.8
            // v -> System.out.println(v)
            new VoidFunction<Integer>() {
                @Override
                public void call(Integer integer) throws Exception {
                    System.out.println(integer);
                }
            });

    sc.close();
}
Example #14
Source File: SparkCombineFn.java From beam with Apache License 2.0 | 5 votes |
SingleWindowWindowedAccumulator(
    Function<InputT, ValueT> toValue, WindowedValue<AccumT> accumulator) {
  this.toValue = toValue;
  this.windowAccumulator = accumulator.getValue();
  this.accTimestamp =
      accumulator.getTimestamp().equals(BoundedWindow.TIMESTAMP_MIN_VALUE)
          ? null
          : accumulator.getTimestamp();
  this.accWindow = getWindow(accumulator);
}
Example #15
Source File: CustomDataFrame.java From sparkResearch with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("spark app")
            .getOrCreate();

    // Create an ordinary JavaRDD
    JavaRDD<String> javaRDD = sparkSession.sparkContext().textFile("URL", 1).toJavaRDD();
    // The schema encoded as a string
    String schema = "name age";

    // Generate the schema from the schema string
    List<StructField> structFieldList = new ArrayList<>();
    for (String fieldName : schema.split(" ")) {
        StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        structFieldList.add(structField);
    }
    StructType structType = DataTypes.createStructType(structFieldList);

    JavaRDD<Row> rowJavaRDD = javaRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String v1) {
            String[] attributes = v1.split(",");
            return RowFactory.create(attributes[0], attributes[1].trim());
        }
    });

    // Apply the schema to the RDD
    Dataset<Row> dataset = sparkSession.createDataFrame(rowJavaRDD, structType);
    // Create a temporary view
    dataset.createOrReplaceTempView("user");
    Dataset<Row> result = sparkSession.sql("select * from user");
    result.show();
}
Example #16
Source File: BatchProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
private static Function<Row, IoTData> getRowIoTDataFunction() {
    return row -> new IoTData(
            row.getString(6),
            row.getString(7),
            row.getString(3),
            row.getString(1),
            row.getString(2),
            row.getDate(5),
            row.getDouble(4),
            row.getDouble(0)
    );
}
Example #17
Source File: SparkUtil.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
private static JavaRDD<String[]> getSequenceFormatHiveInput(JavaSparkContext sc, String inputPath) {
    return sc.sequenceFile(inputPath, BytesWritable.class, Text.class).values()
            .map(new Function<Text, String[]>() {
                @Override
                public String[] call(Text text) throws Exception {
                    String s = Bytes.toString(text.getBytes(), 0, text.getLength());
                    return s.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER, -1);
                }
            });
}
Example #18
Source File: SparkFrontendUtils.java From incubator-nemo with Apache License 2.0 | 5 votes |
/**
 * Converts a {@link PairFunction} to a plain map {@link Function}.
 *
 * @param pairFunction the pair function to convert.
 * @param <T>          the type of original element.
 * @param <K>          the type of converted key.
 * @param <V>          the type of converted value.
 * @return the converted map function.
 */
public static <T, K, V> Function<T, Tuple2<K, V>> pairFunctionToPlainFunction(
    final PairFunction<T, K, V> pairFunction) {
  return new Function<T, Tuple2<K, V>>() {
    @Override
    public Tuple2<K, V> call(final T elem) throws Exception {
      return pairFunction.call(elem);
    }
  };
}
Example #19
Source File: Functions.java From spark-streaming-direct-kafka with Apache License 2.0 | 5 votes |
public static <T> Function<T,T> identity() {
    return new Function<T,T>() {
        @Override
        public T call(T t) {
            return t;
        }
    };
}
Example #20
Source File: FunctionCompiler.java From rheem with Apache License 2.0 | 5 votes |
/**
 * Create an appropriate {@link Function} for deploying the given {@link MapPartitionsDescriptor}
 * on Apache Spark's {@link JavaRDD#mapPartitions(FlatMapFunction)}.
 *
 * @param descriptor      describes the function
 * @param operator        that executes the {@link Function}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 * @param operatorContext contains optimization information for the {@code operator}
 * @param inputs          that feed the {@code operator}; only required if the {@code descriptor} describes an {@link ExtendedFunction}
 */
public <I, O> FlatMapFunction<Iterator<I>, O> compile(MapPartitionsDescriptor<I, O> descriptor,
                                                      SparkExecutionOperator operator,
                                                      OptimizationContext.OperatorContext operatorContext,
                                                      ChannelInstance[] inputs) {
    final java.util.function.Function<Iterable<I>, Iterable<O>> javaImplementation =
            descriptor.getJavaImplementation();
    if (javaImplementation instanceof FunctionDescriptor.ExtendedSerializableFunction) {
        return new ExtendedMapPartitionsFunctionAdapter<>(
                (FunctionDescriptor.ExtendedSerializableFunction<Iterable<I>, Iterable<O>>) javaImplementation,
                new SparkExecutionContext(operator, inputs, operatorContext.getOptimizationContext().getIterationNumber())
        );
    } else {
        return new MapPartitionsFunctionAdapter<>(javaImplementation);
    }
}
Example #21
Source File: CaseWhenTest.java From BigDataPlatform with GNU General Public License v3.0 | 5 votes |
public static void main(String[] args) {
    SparkConf conf = new SparkConf()
            .setMaster("local")
            .setAppName("CaseWhenTest");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc.sc());

    List<Integer> grades = Arrays.asList(85, 90, 60, 73);
    JavaRDD<Integer> gradesRDD = sc.parallelize(grades);
    JavaRDD<Row> gradeRowsRDD = gradesRDD.map(new Function<Integer, Row>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Row call(Integer grade) throws Exception {
            return RowFactory.create(grade);
        }
    });

    StructType schema = DataTypes.createStructType(Arrays.asList(
            DataTypes.createStructField("grade", DataTypes.IntegerType, true)));
    Dataset<Row> gradesDF = sqlContext.createDataFrame(gradeRowsRDD, schema);
    gradesDF.registerTempTable("grades");

    Dataset<Row> gradeLevelDF = sqlContext.sql(
            "SELECT CASE "
                    + "WHEN grade>=90 THEN 'A' "
                    + "WHEN grade>=80 THEN 'B' "
                    + "WHEN grade>=70 THEN 'C' "
                    + "WHEN grade>=60 THEN 'D' "
                    + "ELSE 'E' "
                    + "END gradeLevel "
                    + "FROM grades");

    gradeLevelDF.show();

    sc.close();
}
Example #22
Source File: SparkCombineFn.java From beam with Apache License 2.0 | 5 votes |
static <InputT, ValueT, AccumT> WindowedAccumulator<InputT, ValueT, AccumT, ?> create(
    Function<InputT, ValueT> toValue, Type type, Comparator<BoundedWindow> windowComparator) {
  switch (type) {
    case MERGING:
      return MergingWindowedAccumulator.create(toValue, windowComparator);
    case NON_MERGING:
      return NonMergingWindowedAccumulator.create(toValue);
    case SINGLE_WINDOW:
    case EXPLODE_WINDOWS:
      return SingleWindowWindowedAccumulator.create(toValue);
    default:
      throw new IllegalArgumentException("Unknown type: " + type);
  }
}
Example #23
Source File: ScalaTest.java From Java-Data-Science-Cookbook with MIT License | 5 votes |
public static void main(String[] args) {
    String inputFile = "data/dummy.txt";
    SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("My App");
    JavaSparkContext sparkContext = new JavaSparkContext(configuration);
    JavaRDD<String> logData = sparkContext.textFile(inputFile).cache();
    long numberA = logData.filter(new Function<String, Boolean>() {
        private static final long serialVersionUID = 1L;

        public Boolean call(String s) {
            return s.length() == 0;
        }
    }).count();
    sparkContext.close();
    System.out.println("Empty Lines: " + numberA);
}
Example #24
Source File: WordCountTransformOpEx.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) throws Exception {
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
    JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

    JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(
            "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);

    JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1))
            .reduceByKey((count1, count2) -> count1 + count2);

    wordCounts.print();

    JavaPairDStream<String, Integer> joinedDstream = wordCounts
            .transformToPair(new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
                @Override
                public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
                    JavaPairRDD<String, Integer> modRDD = rdd.join(initialRDD).mapToPair(
                            new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
                                @Override
                                public Tuple2<String, Integer> call(
                                        Tuple2<String, Tuple2<Integer, Integer>> joinedTuple) throws Exception {
                                    return new Tuple2<>(joinedTuple._1(), (joinedTuple._2()._1() + joinedTuple._2()._2()));
                                }
                            });
                    return modRDD;
                }
            });

    joinedDstream.print();

    streamingContext.start();
    streamingContext.awaitTermination();
}
Example #25
Source File: WordCountSocketJava8Ex.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) throws Exception {
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
    JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

    JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(
            "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);

    JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1))
            .reduceByKey((count1, count2) -> count1 + count2);

    wordCounts.print();

    JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair(
            new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
                @Override
                public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
                    // return the joined counts instead of discarding them and returning the input RDD
                    return rdd.join(initialRDD).mapToPair(
                            new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
                                @Override
                                public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple)
                                        throws Exception {
                                    return new Tuple2<>(joinedTuple._1(), (joinedTuple._2()._1() + joinedTuple._2()._2()));
                                }
                            });
                }
            });

    joinedDstream.print();

    streamingContext.start();
    streamingContext.awaitTermination();
}
Example #26
Source File: MapTest.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    JavaSparkContext sc = SparkUtils.getLocalSparkContext(MapTest.class);

    List<String> list = Arrays.asList("hello,bjsxt", "hello,xuruyun");

    JavaRDD<String> linesRDD = sc.parallelize(list);

    JavaRDD<Object> mapRDD = linesRDD.map(new Function<String, Object>() {
        @Override
        public Object call(String v1) throws Exception {
            return v1.split(",");
        }
    });

    JavaRDD<String> flatMapRDD = linesRDD.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String t) throws Exception {
            return Arrays.asList(t.split(",")).iterator();
        }
    });

    List<Object> collect = mapRDD.collect(); // Action operator: triggers execution
    for (Object obj : collect) {
        System.out.println(obj);
    }

    List<String> collect2 = flatMapRDD.collect(); // Action operator: triggers execution
    for (String s : collect2) {
        System.out.println(s);
    }
}
Example #27
Source File: JavaGaussianMixtureExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaGaussianMixtureExample"); JavaSparkContext jsc = new JavaSparkContext(conf); // $example on$ // Load and parse data String path = "data/mllib/gmm_data.txt"; JavaRDD<String> data = jsc.textFile(path); JavaRDD<Vector> parsedData = data.map( new Function<String, Vector>() { public Vector call(String s) { String[] sarray = s.trim().split(" "); double[] values = new double[sarray.length]; for (int i = 0; i < sarray.length; i++) { values[i] = Double.parseDouble(sarray[i]); } return Vectors.dense(values); } } ); parsedData.cache(); // Cluster the data into two classes using GaussianMixture GaussianMixtureModel gmm = new GaussianMixture().setK(2).run(parsedData.rdd()); // Save and load GaussianMixtureModel gmm.save(jsc.sc(), "target/org/apache/spark/JavaGaussianMixtureExample/GaussianMixtureModel"); GaussianMixtureModel sameModel = GaussianMixtureModel.load(jsc.sc(), "target/org.apache.spark.JavaGaussianMixtureExample/GaussianMixtureModel"); // Output the parameters of the mixture model for (int j = 0; j < gmm.k(); j++) { System.out.printf("weight=%f\nmu=%s\nsigma=\n%s\n", gmm.weights()[j], gmm.gaussians()[j].mu(), gmm.gaussians()[j].sigma()); } // $example off$ jsc.stop(); }
Example #28
Source File: JavaFlumeEventCount.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: JavaFlumeEventCount <host> <port>");
        System.exit(1);
    }

    StreamingExamples.setStreamingLogLevels();

    String host = args[0];
    int port = Integer.parseInt(args[1]);

    Duration batchInterval = new Duration(2000);
    SparkConf sparkConf = new SparkConf().setAppName("JavaFlumeEventCount");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, batchInterval);
    JavaReceiverInputDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(ssc, host, port);

    flumeStream.count();

    flumeStream.count().map(new Function<Long, String>() {
        @Override
        public String call(Long in) {
            return "Received " + in + " flume events.";
        }
    }).print();

    ssc.start();
    ssc.awaitTermination();
}
Example #29
Source File: JavaLogisticRegressionWithLBFGSExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample"); SparkContext sc = new SparkContext(conf); // $example on$ String path = "data/mllib/sample_libsvm_data.txt"; JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD(); // Split initial RDD into two... [60% training data, 40% testing data]. JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L); JavaRDD<LabeledPoint> training = splits[0].cache(); JavaRDD<LabeledPoint> test = splits[1]; // Run training algorithm to build the model. final LogisticRegressionModel model = new LogisticRegressionWithLBFGS() .setNumClasses(10) .run(training.rdd()); // Compute raw scores on the test set. JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map( new Function<LabeledPoint, Tuple2<Object, Object>>() { public Tuple2<Object, Object> call(LabeledPoint p) { Double prediction = model.predict(p.features()); return new Tuple2<Object, Object>(prediction, p.label()); } } ); // Get evaluation metrics. MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); double accuracy = metrics.accuracy(); System.out.println("Accuracy = " + accuracy); // Save and load model model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel"); LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel"); // $example off$ sc.stop(); }
Example #30
Source File: JavaSimpleFPGrowth.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    JavaSparkContext sc = SparkUtils.getLocalSparkContext(JavaSimpleFPGrowth.class);

    // $example on$
    JavaRDD<String> data = sc.textFile(Constant.LOCAL_FILE_PREX + "/data/mllib/sample_fpgrowth.txt");

    JavaRDD<List<String>> transactions = data.map(
            new Function<String, List<String>>() {
                public List<String> call(String line) {
                    String[] parts = line.split(" ");
                    return Arrays.asList(parts);
                }
            }
    );

    FPGrowth fpg = new FPGrowth()
            .setMinSupport(0.2)
            .setNumPartitions(10);
    FPGrowthModel<String> model = fpg.run(transactions);

    for (FPGrowth.FreqItemset<String> itemset : model.freqItemsets().toJavaRDD().collect()) {
        System.out.println("[" + itemset.javaItems() + "], " + itemset.freq());
    }

    double minConfidence = 0.8;
    for (AssociationRules.Rule<String> rule : model.generateAssociationRules(minConfidence).toJavaRDD().collect()) {
        System.out.println(
                rule.javaAntecedent() + " => " + rule.javaConsequent() + ", " + rule.confidence());
    }
    // $example off$

    sc.stop();
}