scala.collection.GenSeq Scala Examples

The following examples show how to use scala.collection.GenSeq. They are extracted from the Clustering4Ever project (Apache License 2.0); each example's heading names the source file it comes from.
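Before the project examples, a minimal sketch (not from Clustering4Ever) of what GenSeq buys you on Scala 2.12 and earlier: the same function can accept a sequential Seq or a parallel ParSeq, since both extend scala.collection.GenSeq.

import scala.collection.GenSeq

// Hypothetical helper: one implementation serves sequential and parallel sequences.
def sumOfSquares(xs: GenSeq[Double]): Double =
  xs.aggregate(0.0)((acc, x) => acc + x * x, _ + _)

val sequential = Seq(1.0, 2.0, 3.0)
val parallelSeq = sequential.par        // a ParSeq, which is also a GenSeq

sumOfSquares(sequential)                // 14.0
sumOfSquares(parallelSeq)               // 14.0, folded in parallel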
Example 1
Source File: UtilSpark.scala (from Clustering4Ever, Apache License 2.0)
package org.clustering4ever.sparktools

import scala.language.higherKinds
import org.apache.spark.rdd.RDD
import org.apache.spark.HashPartitioner
import scala.reflect.runtime.universe.TypeTag
import scala.util.Random
import scala.reflect.ClassTag
import scala.collection.{GenSeq, mutable}
import org.clustering4ever.preprocessing.Preprocessable
import org.clustering4ever.hashing.HashingScalar
import org.clustering4ever.vectors.{GVector, ScalarVector}

object UtilSpark {

	type IndexPartition = Int
	type HasConverged = Boolean
	type IsOriginalDot = Boolean


	// Replicates every point of a partition into its neighbouring partitions so that
	// locality-sensitive computations can later run per partition without shuffling.
	final def generateDataLocalityOnHashsedDS[
		O,
		Pz[B, C <: GVector[C]] <: Preprocessable[B, C, Pz]
	](
		rddToPartitioned: RDD[Pz[O, ScalarVector]],
		nbblocs1: Int,
		nbBucketRange: Int
	): RDD[(IndexPartition, (Pz[O, ScalarVector], IsOriginalDot, HasConverged))] = {
		val isOriginalPoint = true
		val hasConverged = true
		val bucketRange = 1 to nbBucketRange

		val lshRDD = rddToPartitioned.map((_, isOriginalPoint, !hasConverged))

		val localityPerPartitionRDD = lshRDD.mapPartitionsWithIndex{ (idx, it) =>
			val ar = it.toList
			// Copies of every point sent to the nbBucketRange partitions on each side,
			// flagged as non-original and non-converged
			def rightNeighbourhood = ar.flatMap{ case (cz, _, _) => bucketRange.collect{ case i if(idx + i < nbblocs1) => (idx + i, (cz, !isOriginalPoint, !hasConverged)) } }
			def leftNeighbourhood = ar.flatMap{ case (cz, _, _) => bucketRange.collect{ case i if(idx - i >= 0) => (idx - i, (cz, !isOriginalPoint, !hasConverged)) } }
			val composing = if(idx == 0) ar.map((idx, _)) ::: rightNeighbourhood
				else if(idx == nbblocs1 - 1) ar.map((idx, _)) ::: leftNeighbourhood
				else ar.map((idx, _)) ::: leftNeighbourhood ::: rightNeighbourhood

			composing.toIterator

		}.partitionBy(new HashPartitioner(nbblocs1))

		localityPerPartitionRDD
	}

	// Sorts the RDD by the hash of each vector so that close points land in the same or
	// neighbouring partitions, then applies the locality replication above.
	final def generateDataLocalityLD[
		O,
		Pz[B, C <: GVector[C]] <: Preprocessable[B, C, Pz],
		Hasher <: HashingScalar
	](
		rddToPartitioned: RDD[Pz[O, ScalarVector]],
		hashing: Hasher,
		nbblocs1: Int,
		nbBucketRange: Int
	): RDD[(IndexPartition, (Pz[O, ScalarVector], IsOriginalDot, HasConverged))] = {
		val hashedRDD = rddToPartitioned.sortBy(cz => hashing.hf(cz.v), ascending = true, nbblocs1)
		generateDataLocalityOnHashsedDS(hashedRDD, nbblocs1, nbBucketRange)
	}

} 
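The replication scheme inside generateDataLocalityOnHashsedDS can be sketched without Spark. The helper below is hypothetical and only illustrates the idea, not the project's API: every point of partition idx is also copied, flagged as a non-original point, into the nbBucketRange partitions on each side.

// Minimal sketch (no Spark, hypothetical names) of the locality replication scheme.
def replicateToNeighbours[T](
  partitioned: Seq[(Int, T)],          // (partition index, point)
  nbPartitions: Int,
  nbBucketRange: Int
): Seq[(Int, (T, Boolean))] = {
  partitioned.flatMap { case (idx, point) =>
    val original = Seq((idx, (point, true)))
    val copies = (1 to nbBucketRange).flatMap { i =>
      Seq(idx - i, idx + i).collect {
        case j if j >= 0 && j < nbPartitions => (j, (point, false))  // non-original copy
      }
    }
    original ++ copies
  }
}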
Example 2
Source File: K-Modes.scala (from Clustering4Ever, Apache License 2.0)
package org.clustering4ever.clustering.kcenters.scala

	// Excerpt: the enclosing companion object, its imports and the binaryToClusterizable
	// helper are omitted here.
	final def fit[D <: BinaryDistance, GS[Y] <: GenSeq[Y]](
		data: GS[Array[Int]],
		k: Int,
		metric: D,
		maxIterations: Int,
		minShift: Double
	): KModesModel[D] = {
		KModes(k, metric, minShift, maxIterations).fit(binaryToClusterizable(data))
	}
} 
Example 3
Source File: KPPInitializer.scala (from Clustering4Ever, Apache License 2.0)
package org.clustering4ever.clustering.kcenters.scala

	// Excerpt: the enclosing object and its imports (scala.util.Random,
	// scala.collection.{GenSeq, immutable, mutable} and project types) are omitted.
	// k-means++ style seeding of the k initial centers.
	final def kppInit[
		O,
		V <: GVector[V],
		Cz[Y, Z <: GVector[Z]] <: Clusterizable[Y, Z, Cz],
		D <: Distance[V]
	](data: GenSeq[Cz[O, V]], metric: D, k: Int): immutable.HashMap[Int, V] = {

		val centers = mutable.ArrayBuffer(data(Random.nextInt(data.size)).v)

		def obtainNearestCenter(v: V): V = centers.minBy(metric.d(_, v))

		@annotation.tailrec
		def go(i: Int): Unit = {
			val preprocessed = data.map{ cz =>
				// Squared distance from each point to its nearest already-chosen center
				val toPow2 = metric.d(cz.v, obtainNearestCenter(cz.v))
				(cz.v, toPow2 * toPow2)
			}
			// Normalise the squared distances into a probability distribution and draw the
			// next center proportionally to it (the k-means++ rule)
			val phi = preprocessed.aggregate(0D)((agg, e) => agg + e._2, _ + _)
			val probabilities = preprocessed.map{ case (v, toPow2) => (v, toPow2 / phi) }.seq
			val shuffled = Random.shuffle(probabilities)
			centers += Stats.obtainMedianFollowingWeightedDistribution[V](shuffled)
			if(i < k - 2) go(i + 1)
		}

		go(0)
		
		immutable.HashMap(centers.zipWithIndex.map{ case (center, clusterID) => (clusterID, center) }:_*)

	}
} 
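The same seeding rule can be shown without the Clusterizable machinery. Below is a self-contained sketch on one-dimensional data (a hypothetical helper, not part of Clustering4Ever): each new center is drawn with probability proportional to its squared distance to the nearest center already chosen, and any GenSeq, sequential or parallel, works as input.

import scala.collection.GenSeq
import scala.util.Random

// Hypothetical 1-D illustration of k-means++ seeding over any GenSeq.
def kppSeed(data: GenSeq[Double], k: Int): Vector[Double] = {
  // Draw one value with probability proportional to its (unnormalised) weight.
  def drawWeighted(weighted: Seq[(Double, Double)]): Double = {
    val p = Random.nextDouble * weighted.map(_._2).sum
    val cumulative = weighted.scanLeft(0D)(_ + _._2).tail
    val i = cumulative.indexWhere(_ >= p)
    weighted(if (i < 0) weighted.size - 1 else i)._1
  }
  (1 until k).foldLeft(Vector(data(Random.nextInt(data.size)))) { (centers, _) =>
    val weighted = data.map { x =>
      val d = centers.map(c => math.abs(x - c)).min  // distance to nearest chosen center
      (x, d * d)
    }.seq
    centers :+ drawWeighted(weighted)
  }
}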
Example 4
Source File: K-Means.scala (from Clustering4Ever, Apache License 2.0)
package org.clustering4ever.clustering.kcenters.scala

	// Excerpt: the enclosing companion object, its imports and the scalarToClusterizable
	// helper are omitted here.
	final def fit[D <: ContinuousDistance, GS[Y] <: GenSeq[Y]](
		data: GS[Array[Double]],
		k: Int,
		metric: D,
		minShift: Double,
		maxIterations: Int
	): KMeansModel[D] = {
		KMeans(k, metric, minShift, maxIterations, immutable.HashMap.empty[Int, ScalarVector]).fit(scalarToClusterizable(data))
	}
} 
Example 5
Source File: Statistics.scala (from Clustering4Ever, Apache License 2.0)
package org.clustering4ever.stats

	// Draws one element from a weighted distribution: p is uniform over [0, total weight)
	// and the cumulative weights are walked until they reach p.
	final def obtainMedianFollowingWeightedDistribution[V](distribution: Seq[(V, Double)]): V = {
		val p = scala.util.Random.nextDouble * distribution.foldLeft(0D)((agg, e) => agg + e._2)
		@annotation.tailrec
		def go(accum: Double, i: Int): Int = {
			if(accum < p) go(accum + distribution(i)._2, i + 1)
			else i
		}
		val cpt = go(0D, 0)
		if(cpt == 0) distribution.head._1 else distribution(cpt - 1)._1
	}
} 
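A quick usage sketch, assuming the method above lives in the Stats object of the org.clustering4ever.stats package, as the call in Example 3 suggests. Weights need not be normalised; an element is returned with probability proportional to its weight.

import org.clustering4ever.stats.Stats  // assumed location of the object shown above

val distribution = Seq(("low", 1.0), ("mid", 3.0), ("high", 6.0))
val drawn = Stats.obtainMedianFollowingWeightedDistribution(distribution)
// "high" comes back roughly 60% of the time over repeated calls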
Example 6
Source File: SortingTools.scala (from Clustering4Ever, Apache License 2.0)
package org.clustering4ever.utils

import scala.collection.{mutable, parallel}

    // Excerpt: the enclosing object declaration is omitted. Values are spread over b
    // buckets by magnitude, each bucket is sorted, then the buckets are concatenated
    // (values are assumed to be non-negative).
    final def bucketSort(toSort: Array[Double], b: Int) = {
      val buckets = parallel.mutable.ParArray.fill(b)(mutable.ArrayBuffer.empty[Double])
      val m = toSort.max
      @annotation.tailrec
      def go(i: Int) : Unit = {
        if(i < toSort.size) {
            buckets((toSort(i) / m * (b - 1)).toInt) += toSort(i)
            go(i + 1)
        }
      }
      go(0)
      buckets.flatMap(_.sorted)
    }

}
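A usage sketch, assuming the enclosing object of bucketSort is named SortingTools as the file name suggests. The result is a ParArray with the values in ascending order.

import org.clustering4ever.utils.SortingTools  // assumed name of the enclosing object

val sorted = SortingTools.bucketSort(Array(0.3, 0.9, 0.1, 0.7), b = 2)
// ParArray(0.1, 0.3, 0.7, 0.9)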