org.apache.spark.mllib.clustering.KMeansModel Java Examples

The following examples show how to use org.apache.spark.mllib.clustering.KMeansModel. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: KMeansClusteringMlib.java    From Java-Data-Science-Cookbook with MIT License 5 votes vote down vote up
/**
 * Loads whitespace-separated numeric vectors from a local text file, clusters
 * them into two groups with MLlib k-means, and prints the Within Set Sum of
 * Squared Errors (WSSSE) as a clustering-quality measure.
 *
 * @param args unused command-line arguments
 */
public static void main( String[] args ){
    SparkConf conf = new SparkConf().setMaster("local[4]").setAppName("K-means Example");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
      // Load and parse data: each line is a space-separated list of doubles.
      String path = "data/km-data.txt";
      JavaRDD<String> data = sc.textFile(path);
      JavaRDD<Vector> parsedData = data.map(
        new Function<String, Vector>() {
          public Vector call(String s) {
            String[] sarray = s.split(" ");
            double[] values = new double[sarray.length];
            for (int i = 0; i < sarray.length; i++) {
              values[i] = Double.parseDouble(sarray[i]);
            }
            return Vectors.dense(values);
          }
        }
      );
      // Cache: the RDD is traversed more than once (training + cost evaluation).
      parsedData.cache();

      // Cluster the data into two classes using KMeans
      int numClusters = 2;
      int numIterations = 20;
      KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations);

      // Evaluate clustering by computing Within Set Sum of Squared Errors
      double WSSSE = clusters.computeCost(parsedData.rdd());
      System.out.println("Within Set Sum of Squared Errors = " + WSSSE);
    } finally {
      // Fix: the original never stopped the context, leaking Spark resources.
      sc.stop();
    }
}
 
Example #2
Source File: KMeansTest.java    From DDF with Apache License 2.0 5 votes vote down vote up
/** Trains a 5-cluster k-means model on numeric airline columns and checks the center count. */
@Test
public void TestKMeans() throws DDFException {
  createTableAirline();

  // Project only the numeric columns used as clustering features.
  DDF airlineDdf =
      manager.sql2ddf("select deptime, arrtime, distance, depdelay, arrdelay from airline", false);

  // 5 clusters, 5 iterations, 2 runs, "random" centroid initialization.
  Object rawModel = airlineDdf.ML.KMeans(5, 5, 2, "random").getRawModel();
  KMeansModel model = (KMeansModel) rawModel;

  Assert.assertEquals(5, model.clusterCenters().length);
}
 
Example #3
Source File: KMeansUpdate.java    From oryx with Apache License 2.0 5 votes vote down vote up
/**
 * Trains a k-means model on the parsed training data and renders it as PMML.
 *
 * @param sparkContext    active Spark Context
 * @param trainData       training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model
 * @param candidatePath   directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  // The first hyperparameter is the cluster count; k-means needs at least 2.
  int k = (Integer) hyperParameters.get(0);
  Preconditions.checkArgument(k > 1);
  log.info("Building KMeans Model with {} clusters", k);

  JavaRDD<Vector> vectors = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
  KMeansModel model = KMeans.train(vectors.rdd(), k, maxIterations, initializationStrategy);

  Map<Integer,Long> clusterSizes = fetchClusterCountsFromModel(vectors, model);
  return kMeansModelToPMML(model, clusterSizes);
}
 
Example #4
Source File: KMeansUpdate.java    From oryx with Apache License 2.0 5 votes vote down vote up
/**
 * @param model {@link KMeansModel} to translate to PMML
 * @param clusterSizesMap per-cluster point counts keyed by cluster id
 * @return PMML representation of a KMeans cluster model
 */
private PMML kMeansModelToPMML(KMeansModel model, Map<Integer,Long> clusterSizesMap) {
  // Start from the shared skeleton, then attach the dictionary and the model.
  PMML result = PMMLUtils.buildSkeletonPMML();
  result.setDataDictionary(AppPMMLUtils.buildDataDictionary(inputSchema, null));
  result.addModels(pmmlClusteringModel(model, clusterSizesMap));
  return result;
}
 
Example #5
Source File: KMeansUpdate.java    From oryx with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the PMML {@link ClusteringModel} element for a trained k-means model:
 * one clustering field per active schema feature and one cluster per model
 * center, compared with squared-Euclidean distance.
 *
 * @param model trained k-means model whose centers are exported
 * @param clusterSizesMap per-cluster point counts keyed by cluster id; ids
 *   absent from the map are treated as empty clusters (size 0)
 * @return center-based PMML clustering model
 */
private ClusteringModel pmmlClusteringModel(KMeansModel model,
                                            Map<Integer,Long> clusterSizesMap) {
  Vector[] clusterCenters = model.clusterCenters();

  // One ClusteringField per active input feature, in schema order.
  List<ClusteringField> clusteringFields = new ArrayList<>();
  for (int i = 0; i < inputSchema.getNumFeatures(); i++) {
    if (inputSchema.isActive(i)) {
      FieldName fieldName = FieldName.create(inputSchema.getFeatureNames().get(i));
      ClusteringField clusteringField =
          new ClusteringField(fieldName).setCenterField(ClusteringField.CenterField.TRUE);
      clusteringFields.add(clusteringField);
    }
  }

  List<Cluster> clusters = new ArrayList<>(clusterCenters.length);
  for (int i = 0; i < clusterCenters.length; i++) {
    // Fix: the size map comes from countByValue(), which omits cluster ids that
    // received no points — get(i) would NPE on .intValue(); default to 0 instead.
    clusters.add(new Cluster().setId(Integer.toString(i))
                     .setSize(clusterSizesMap.getOrDefault(i, 0L).intValue())
                     .setArray(AppPMMLUtils.toArray(clusterCenters[i].toArray())));
  }

  return new ClusteringModel(
      MiningFunction.CLUSTERING,
      ClusteringModel.ModelClass.CENTER_BASED,
      clusters.size(),
      AppPMMLUtils.buildMiningSchema(inputSchema),
      new ComparisonMeasure(ComparisonMeasure.Kind.DISTANCE, new SquaredEuclidean()),
      clusteringFields,
      clusters);
}
 
Example #6
Source File: KMeansHullGenerator.java    From geowave with Apache License 2.0 5 votes vote down vote up
/**
 * Groups the input points by the k-means centroid each one is predicted into.
 *
 * @param inputPoints points to partition
 * @param clusterModel trained k-means model used to assign each point a centroid index
 * @return pairs of (centroid index, points assigned to that centroid)
 */
public static JavaPairRDD<Integer, Iterable<Vector>> groupByIndex(
    final JavaRDD<Vector> inputPoints,
    final KMeansModel clusterModel) {
  // Key each point by its predicted kmeans centroid index.
  return inputPoints.groupBy(clusterModel::predict);
}
 
Example #7
Source File: JavaKMeansExample.java    From SparkDemo with MIT License 4 votes vote down vote up
/**
 * End-to-end MLlib k-means example: parse vectors from a text file, train a
 * 2-cluster model, print the cluster centers and the training cost (WSSSE),
 * then round-trip the model through save/load.
 *
 * @param args unused command-line arguments
 */
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKMeansExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse data: each line is a space-separated list of doubles.
    String path = "data/mllib/kmeans_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    // Cache: the RDD is reused for training and for cost evaluation.
    parsedData.cache();

    // Cluster the data into two classes using KMeans
    int numClusters = 2;
    int numIterations = 20;
    KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations);

    System.out.println("Cluster centers:");
    for (Vector center: clusters.clusterCenters()) {
      System.out.println(" " + center);
    }

    // Evaluate clustering by computing Within Set Sum of Squared Errors.
    // Fix: the original called computeCost twice for the same value, running a
    // redundant Spark job; compute once and print under both labels.
    double cost = clusters.computeCost(parsedData.rdd());
    System.out.println("Cost: " + cost);
    System.out.println("Within Set Sum of Squared Errors = " + cost);

    // Save and load model
    clusters.save(jsc.sc(), "target/org/apache/spark/JavaKMeansExample/KMeansModel");
    KMeansModel sameModel = KMeansModel.load(jsc.sc(),
      "target/org/apache/spark/JavaKMeansExample/KMeansModel");
    // $example off$

    jsc.stop();
}
 
Example #8
Source File: KMeansUtils.java    From geowave with Apache License 2.0 4 votes vote down vote up
/**
 * Writes the centroids of a trained k-means model into a GeoWave data store as
 * point features (with optional time attribute) and returns the feature
 * adapter used to write them.
 *
 * @param clusterModel trained model whose cluster centers are exported
 * @param outputDataStore destination store for the centroid features
 * @param centroidAdapterName feature-type name for the centroid adapter
 * @param scaledRange optional mapping from a scaled numeric value back to a
 *   Date; when non-null and centers carry a third dimension, a "Time"
 *   attribute is written
 * @return the {@link FeatureDataAdapter} the centroids were written with
 */
public static DataTypeAdapter writeClusterCentroids(
    final KMeansModel clusterModel,
    final DataStorePluginOptions outputDataStore,
    final String centroidAdapterName,
    final ScaledTemporalRange scaledRange) {
  // Build the centroid feature type: geometry (+ optional time) + cluster index.
  final SimpleFeatureTypeBuilder typeBuilder = new SimpleFeatureTypeBuilder();
  typeBuilder.setName(centroidAdapterName);
  typeBuilder.setNamespaceURI(BasicFeatureTypes.DEFAULT_NAMESPACE);

  try {
    // WGS84; a decode failure is logged but does not abort the export.
    typeBuilder.setCRS(CRS.decode("EPSG:4326", true));
  } catch (final FactoryException fex) {
    LOGGER.error(fex.getMessage(), fex);
  }

  final AttributeTypeBuilder attrBuilder = new AttributeTypeBuilder();

  // Geometry attribute is named by the Geometry class's fully-qualified name.
  typeBuilder.add(
      attrBuilder.binding(Geometry.class).nillable(false).buildDescriptor(
          Geometry.class.getName().toString()));

  if (scaledRange != null) {
    typeBuilder.add(attrBuilder.binding(Date.class).nillable(false).buildDescriptor("Time"));
  }

  typeBuilder.add(
      attrBuilder.binding(Integer.class).nillable(false).buildDescriptor("ClusterIndex"));

  final SimpleFeatureType sfType = typeBuilder.buildFeatureType();
  final SimpleFeatureBuilder sfBuilder = new SimpleFeatureBuilder(sfType);

  final FeatureDataAdapter featureAdapter = new FeatureDataAdapter(sfType);

  // Register the type under a spatial index before writing.
  final DataStore featureStore = outputDataStore.createDataStore();
  final Index featureIndex =
      new SpatialDimensionalityTypeProvider().createIndex(new SpatialOptions());
  featureStore.addType(featureAdapter, featureIndex);
  // try-with-resources: the writer is flushed/closed even if a write fails.
  try (Writer writer = featureStore.createWriter(featureAdapter.getTypeName())) {
    for (final Vector center : clusterModel.clusterCenters()) {
      // A center predicts into its own cluster, yielding its index.
      final int index = clusterModel.predict(center);

      // assumes center dimensions are ordered (lon, lat[, time]) — TODO confirm
      // against how the input vectors were built upstream.
      final double lon = center.apply(0);
      final double lat = center.apply(1);

      sfBuilder.set(
          Geometry.class.getName(),
          GeometryUtils.GEOMETRY_FACTORY.createPoint(new Coordinate(lon, lat)));

      // Only emit Time when a range was supplied AND the center has a 3rd dim.
      if ((scaledRange != null) && (center.size() > 2)) {
        final double timeVal = center.apply(2);

        final Date time = scaledRange.valueToTime(timeVal);

        sfBuilder.set("Time", time);

        LOGGER.warn("Write time: " + time);
      }

      sfBuilder.set("ClusterIndex", index);

      // Feature id encodes the cluster index, e.g. "Centroid-3".
      final SimpleFeature sf = sfBuilder.buildFeature("Centroid-" + index);

      writer.write(sf);
    }
  }

  return featureAdapter;
}
 
Example #9
Source File: KMeansRunner.java    From geowave with Apache License 2.0 4 votes vote down vote up
/** @return the k-means model held by this runner ({@code outputModel}). */
public KMeansModel getOutputModel() {
  return this.outputModel;
}
 
Example #10
Source File: GeoWaveSparkKMeansIT.java    From geowave with Apache License 2.0 4 votes vote down vote up
/**
 * Integration test for {@code KMeansRunner}: ingests local shapefile data,
 * runs k-means with centroid + hull output enabled, then verifies that the
 * written centroid and hull features match the trained model's cluster count.
 *
 * @throws Exception on ingest, run, or query failure
 */
@Test
public void testKMeansRunner() throws Exception {

  // Load data
  TestUtils.testLocalIngest(inputDataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

  // Create the runner
  long mark = System.currentTimeMillis();
  final KMeansRunner runner = new KMeansRunner();
  runner.setSparkSession(SparkTestEnvironment.getInstance().defaultSession);
  runner.setInputDataStore(inputDataStore);
  runner.setTypeName("hail");
  runner.setCqlFilter(CQL_FILTER);
  runner.setUseTime(true);
  // Set output params to write centroids + hulls to store.
  runner.setOutputDataStore(inputDataStore);
  runner.setCentroidTypeName("kmeans-centroids-test");

  runner.setGenerateHulls(true);
  runner.setComputeHullData(true);
  runner.setHullTypeName("kmeans-hulls-test");

  // Run kmeans; IOException is rewrapped so the test fails with context.
  try {
    runner.run();
  } catch (final IOException e) {
    throw new RuntimeException("Failed to execute: " + e.getMessage());
  }

  // Create the output
  final KMeansModel clusterModel = runner.getOutputModel();

  long dur = (System.currentTimeMillis() - mark);
  LOGGER.warn("KMeans duration: " + dur + " ms.");
  // Write out the centroid features

  // Look up the adapter the runner registered for the centroid output type.
  final short centroidInternalAdapterId =
      inputDataStore.createInternalAdapterStore().getAdapterId("kmeans-centroids-test");

  final DataTypeAdapter centroidAdapter =
      inputDataStore.createAdapterStore().getAdapter(centroidInternalAdapterId);

  // Query back from the new adapter: expect one feature per cluster center.
  mark = System.currentTimeMillis();
  queryFeatures(centroidAdapter, clusterModel.clusterCenters().length);
  dur = (System.currentTimeMillis() - mark);
  LOGGER.warn("Centroid verify: " + dur + " ms.");

  // Generate the hulls independently and compare against the model.
  final JavaPairRDD<Integer, Iterable<Vector>> groupByRDD =
      KMeansHullGenerator.groupByIndex(runner.getInputCentroids(), clusterModel);
  final JavaPairRDD<Integer, Geometry> hullsRDD =
      KMeansHullGenerator.generateHullsRDD(groupByRDD);

  Assert.assertTrue(
      "centroids from the model should match the hull count",
      clusterModel.clusterCenters().length == hullsRDD.count());

  System.out.println("KMeans cluster hulls:");
  for (final Tuple2<Integer, Geometry> hull : hullsRDD.collect()) {
    System.out.println("> Hull size (verts): " + hull._2.getNumPoints());

    System.out.println("> Hull centroid: " + hull._2.getCentroid().toString());
  }

  final short hullInternalAdapterId =
      inputDataStore.createInternalAdapterStore().getAdapterId("kmeans-hulls-test");
  // Write out the hull features w/ metadata
  final DataTypeAdapter hullAdapter =
      inputDataStore.createAdapterStore().getAdapter(hullInternalAdapterId);

  mark = System.currentTimeMillis();
  // Query back from the new adapter: expect one hull feature per cluster.
  queryFeatures(hullAdapter, clusterModel.clusterCenters().length);
  dur = (System.currentTimeMillis() - mark);
  LOGGER.warn("Hull verify: " + dur + " ms.");

  // Clean up so subsequent tests start from an empty store.
  TestUtils.deleteAll(inputDataStore);
}
 
Example #11
Source File: KMeansUpdate.java    From oryx with Apache License 2.0 2 votes vote down vote up
/**
 * @param trainPointData data to cluster
 * @param model trained KMeans Model
 * @return map of ClusterId, count of points associated with the clusterId
 */
private static Map<Integer,Long> fetchClusterCountsFromModel(JavaRDD<? extends Vector> trainPointData,
                                                             KMeansModel model) {
  // Assign each point to its nearest centroid, then tally points per cluster id.
  return trainPointData.map(point -> model.predict(point)).countByValue();
}