org.apache.spark.mllib.clustering.KMeansModel Java Examples
The following examples show how to use
org.apache.spark.mllib.clustering.KMeansModel.
You can vote up the ones you like or vote down the ones you don't like,
and navigate to the original project or source file by following the links above each example. You may also check out the related API usage examples in the sidebar.
Example #1
Source File: KMeansClusteringMlib.java From Java-Data-Science-Cookbook with MIT License | 5 votes |
public static void main( String[] args ){ SparkConf conf = new SparkConf().setMaster("local[4]").setAppName("K-means Example"); JavaSparkContext sc = new JavaSparkContext(conf); // Load and parse data String path = "data/km-data.txt"; JavaRDD<String> data = sc.textFile(path); JavaRDD<Vector> parsedData = data.map( new Function<String, Vector>() { public Vector call(String s) { String[] sarray = s.split(" "); double[] values = new double[sarray.length]; for (int i = 0; i < sarray.length; i++) values[i] = Double.parseDouble(sarray[i]); return Vectors.dense(values); } } ); parsedData.cache(); // Cluster the data into two classes using KMeans int numClusters = 2; int numIterations = 20; KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations); // Evaluate clustering by computing Within Set Sum of Squared Errors double WSSSE = clusters.computeCost(parsedData.rdd()); System.out.println("Within Set Sum of Squared Errors = " + WSSSE); }
Example #2
Source File: KMeansTest.java From DDF with Apache License 2.0 | 5 votes |
@Test public void TestKMeans() throws DDFException { createTableAirline(); DDF ddf = manager.sql2ddf("select deptime, arrtime, distance, depdelay, arrdelay from airline", false); KMeansModel kmeansModel = (KMeansModel) ddf.ML.KMeans(5, 5, 2, "random").getRawModel(); Assert.assertEquals(5, kmeansModel.clusterCenters().length); // Assert.assertTrue(kmeansModel.computeCost((RDD<double[]>)ddf.getRepresentationHandler().get( // RDD_ARR_DOUBLE().getTypeSpecsString())) > 0); // Assert.assertTrue(kmeansModel.predict(Vectors.dense(new double[] { 1232, 1341, 389, 7, 1 })) > -1); // Assert.assertTrue(kmeansModel.predict(new double[] { 1232, 1341, 389, 7, 1 }) < 5); }
Example #3
Source File: KMeansUpdate.java From oryx with Apache License 2.0 | 5 votes |
/** * @param sparkContext active Spark Context * @param trainData training data on which to build a model * @param hyperParameters ordered list of hyper parameter values to use in building model * @param candidatePath directory where additional model files can be written * @return a {@link PMML} representation of a model trained on the given data */ @Override public PMML buildModel(JavaSparkContext sparkContext, JavaRDD<String> trainData, List<?> hyperParameters, Path candidatePath) { int numClusters = (Integer) hyperParameters.get(0); Preconditions.checkArgument(numClusters > 1); log.info("Building KMeans Model with {} clusters", numClusters); JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN)); KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations, initializationStrategy); return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel)); }
Example #4
Source File: KMeansUpdate.java From oryx with Apache License 2.0 | 5 votes |
/** * @param model {@link KMeansModel} to translate to PMML * @return PMML representation of a KMeans cluster model */ private PMML kMeansModelToPMML(KMeansModel model, Map<Integer,Long> clusterSizesMap) { ClusteringModel clusteringModel = pmmlClusteringModel(model, clusterSizesMap); PMML pmml = PMMLUtils.buildSkeletonPMML(); pmml.setDataDictionary(AppPMMLUtils.buildDataDictionary(inputSchema, null)); pmml.addModels(clusteringModel); return pmml; }
Example #5
Source File: KMeansUpdate.java From oryx with Apache License 2.0 | 5 votes |
private ClusteringModel pmmlClusteringModel(KMeansModel model, Map<Integer,Long> clusterSizesMap) { Vector[] clusterCenters = model.clusterCenters(); List<ClusteringField> clusteringFields = new ArrayList<>(); for (int i = 0; i < inputSchema.getNumFeatures(); i++) { if (inputSchema.isActive(i)) { FieldName fieldName = FieldName.create(inputSchema.getFeatureNames().get(i)); ClusteringField clusteringField = new ClusteringField(fieldName).setCenterField(ClusteringField.CenterField.TRUE); clusteringFields.add(clusteringField); } } List<Cluster> clusters = new ArrayList<>(clusterCenters.length); for (int i = 0; i < clusterCenters.length; i++) { clusters.add(new Cluster().setId(Integer.toString(i)) .setSize(clusterSizesMap.get(i).intValue()) .setArray(AppPMMLUtils.toArray(clusterCenters[i].toArray()))); } return new ClusteringModel( MiningFunction.CLUSTERING, ClusteringModel.ModelClass.CENTER_BASED, clusters.size(), AppPMMLUtils.buildMiningSchema(inputSchema), new ComparisonMeasure(ComparisonMeasure.Kind.DISTANCE, new SquaredEuclidean()), clusteringFields, clusters); }
Example #6
Source File: KMeansHullGenerator.java From geowave with Apache License 2.0 | 5 votes |
public static JavaPairRDD<Integer, Iterable<Vector>> groupByIndex( final JavaRDD<Vector> inputPoints, final KMeansModel clusterModel) { // Group the input points by their kmeans centroid index return inputPoints.groupBy(point -> { return clusterModel.predict(point); }); }
Example #7
Source File: JavaKMeansExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("JavaKMeansExample"); JavaSparkContext jsc = new JavaSparkContext(conf); // $example on$ // Load and parse data String path = "data/mllib/kmeans_data.txt"; JavaRDD<String> data = jsc.textFile(path); JavaRDD<Vector> parsedData = data.map( new Function<String, Vector>() { public Vector call(String s) { String[] sarray = s.split(" "); double[] values = new double[sarray.length]; for (int i = 0; i < sarray.length; i++) { values[i] = Double.parseDouble(sarray[i]); } return Vectors.dense(values); } } ); parsedData.cache(); // Cluster the data into two classes using KMeans int numClusters = 2; int numIterations = 20; KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations); System.out.println("Cluster centers:"); for (Vector center: clusters.clusterCenters()) { System.out.println(" " + center); } double cost = clusters.computeCost(parsedData.rdd()); System.out.println("Cost: " + cost); // Evaluate clustering by computing Within Set Sum of Squared Errors double WSSSE = clusters.computeCost(parsedData.rdd()); System.out.println("Within Set Sum of Squared Errors = " + WSSSE); // Save and load model clusters.save(jsc.sc(), "target/org/apache/spark/JavaKMeansExample/KMeansModel"); KMeansModel sameModel = KMeansModel.load(jsc.sc(), "target/org/apache/spark/JavaKMeansExample/KMeansModel"); // $example off$ jsc.stop(); }
Example #8
Source File: KMeansUtils.java From geowave with Apache License 2.0 | 4 votes |
/**
 * Writes each k-means cluster center as a point feature ("Centroid-&lt;index&gt;")
 * into the given output store and returns the feature adapter used.
 *
 * Assumes each center carries longitude at ordinate 0 and latitude at
 * ordinate 1; when a scaled temporal range is supplied and a third ordinate
 * exists, it is decoded back into a timestamp attribute.
 *
 * @param clusterModel trained model whose centers are persisted
 * @param outputDataStore destination store for the centroid features
 * @param centroidAdapterName feature type name for the centroids
 * @param scaledRange optional time scaling; null to omit the Time attribute
 * @return the feature adapter the centroids were written under
 */
public static DataTypeAdapter writeClusterCentroids(
    final KMeansModel clusterModel,
    final DataStorePluginOptions outputDataStore,
    final String centroidAdapterName,
    final ScaledTemporalRange scaledRange) {
  // Describe the centroid feature type: geometry, optional time, cluster index.
  final SimpleFeatureTypeBuilder typeBuilder = new SimpleFeatureTypeBuilder();
  typeBuilder.setName(centroidAdapterName);
  typeBuilder.setNamespaceURI(BasicFeatureTypes.DEFAULT_NAMESPACE);
  try {
    typeBuilder.setCRS(CRS.decode("EPSG:4326", true));
  } catch (final FactoryException e) {
    // Non-fatal: the type is still built, just without an explicit CRS.
    LOGGER.error(e.getMessage(), e);
  }

  final AttributeTypeBuilder attrBuilder = new AttributeTypeBuilder();
  typeBuilder.add(
      attrBuilder.binding(Geometry.class).nillable(false).buildDescriptor(
          Geometry.class.getName().toString()));
  if (scaledRange != null) {
    typeBuilder.add(attrBuilder.binding(Date.class).nillable(false).buildDescriptor("Time"));
  }
  typeBuilder.add(
      attrBuilder.binding(Integer.class).nillable(false).buildDescriptor("ClusterIndex"));

  final SimpleFeatureType featureType = typeBuilder.buildFeatureType();
  final SimpleFeatureBuilder featureBuilder = new SimpleFeatureBuilder(featureType);
  final FeatureDataAdapter featureAdapter = new FeatureDataAdapter(featureType);

  // Register the type with a spatial index on the output store.
  final DataStore featureStore = outputDataStore.createDataStore();
  final Index featureIndex =
      new SpatialDimensionalityTypeProvider().createIndex(new SpatialOptions());
  featureStore.addType(featureAdapter, featureIndex);

  // One feature per cluster center.
  try (Writer writer = featureStore.createWriter(featureAdapter.getTypeName())) {
    for (final Vector center : clusterModel.clusterCenters()) {
      final int index = clusterModel.predict(center);
      final double lon = center.apply(0);
      final double lat = center.apply(1);
      featureBuilder.set(
          Geometry.class.getName(),
          GeometryUtils.GEOMETRY_FACTORY.createPoint(new Coordinate(lon, lat)));
      if ((scaledRange != null) && (center.size() > 2)) {
        final double timeVal = center.apply(2);
        final Date time = scaledRange.valueToTime(timeVal);
        featureBuilder.set("Time", time);
        LOGGER.warn("Write time: " + time);
      }
      featureBuilder.set("ClusterIndex", index);
      final SimpleFeature feature = featureBuilder.buildFeature("Centroid-" + index);
      writer.write(feature);
    }
  }
  return featureAdapter;
}
Example #9
Source File: KMeansRunner.java From geowave with Apache License 2.0 | 4 votes |
public KMeansModel getOutputModel() { return outputModel; }
Example #10
Source File: GeoWaveSparkKMeansIT.java From geowave with Apache License 2.0 | 4 votes |
/**
 * Integration test: ingests hail shapefile data, runs the k-means Spark
 * runner writing centroids and hulls back to the store, then queries both
 * feature types back and verifies counts match the trained model.
 */
@Test
public void testKMeansRunner() throws Exception {
  // Load data
  TestUtils.testLocalIngest(inputDataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

  // Create the runner
  long mark = System.currentTimeMillis();
  final KMeansRunner runner = new KMeansRunner();
  runner.setSparkSession(SparkTestEnvironment.getInstance().defaultSession);
  runner.setInputDataStore(inputDataStore);
  runner.setTypeName("hail");
  runner.setCqlFilter(CQL_FILTER);
  runner.setUseTime(true);
  // Set output params to write centroids + hulls to store.
  runner.setOutputDataStore(inputDataStore);
  runner.setCentroidTypeName("kmeans-centroids-test");
  runner.setGenerateHulls(true);
  runner.setComputeHullData(true);
  runner.setHullTypeName("kmeans-hulls-test");

  // Run kmeans
  try {
    runner.run();
  } catch (final IOException e) {
    throw new RuntimeException("Failed to execute: " + e.getMessage());
  }

  // Create the output
  final KMeansModel clusterModel = runner.getOutputModel();
  long dur = (System.currentTimeMillis() - mark);
  LOGGER.warn("KMeans duration: " + dur + " ms.");

  // Write out the centroid features
  final short centroidInternalAdapterId =
      inputDataStore.createInternalAdapterStore().getAdapterId("kmeans-centroids-test");
  final DataTypeAdapter centroidAdapter =
      inputDataStore.createAdapterStore().getAdapter(centroidInternalAdapterId);

  // Query back from the new adapter; expect one feature per cluster center.
  mark = System.currentTimeMillis();
  queryFeatures(centroidAdapter, clusterModel.clusterCenters().length);
  dur = (System.currentTimeMillis() - mark);
  LOGGER.warn("Centroid verify: " + dur + " ms.");

  // Generate the hulls
  final JavaPairRDD<Integer, Iterable<Vector>> groupByRDD =
      KMeansHullGenerator.groupByIndex(runner.getInputCentroids(), clusterModel);
  final JavaPairRDD<Integer, Geometry> hullsRDD =
      KMeansHullGenerator.generateHullsRDD(groupByRDD);
  Assert.assertTrue(
      "centroids from the model should match the hull count",
      clusterModel.clusterCenters().length == hullsRDD.count());

  System.out.println("KMeans cluster hulls:");
  for (final Tuple2<Integer, Geometry> hull : hullsRDD.collect()) {
    System.out.println("> Hull size (verts): " + hull._2.getNumPoints());
    System.out.println("> Hull centroid: " + hull._2.getCentroid().toString());
  }

  final short hullInternalAdapterId =
      inputDataStore.createInternalAdapterStore().getAdapterId("kmeans-hulls-test");
  // Write out the hull features w/ metadata
  final DataTypeAdapter hullAdapter =
      inputDataStore.createAdapterStore().getAdapter(hullInternalAdapterId);

  mark = System.currentTimeMillis();
  // Query back from the new adapter
  queryFeatures(hullAdapter, clusterModel.clusterCenters().length);
  dur = (System.currentTimeMillis() - mark);
  LOGGER.warn("Hull verify: " + dur + " ms.");

  // Clean up ingested + generated data.
  TestUtils.deleteAll(inputDataStore);
}
Example #11
Source File: KMeansUpdate.java From oryx with Apache License 2.0 | 2 votes |
/** * @param trainPointData data to cluster * @param model trained KMeans Model * @return map of ClusterId, count of points associated with the clusterId */ private static Map<Integer,Long> fetchClusterCountsFromModel(JavaRDD<? extends Vector> trainPointData, KMeansModel model) { return trainPointData.map(model::predict).countByValue(); }