org.apache.commons.math3.distribution.EnumeratedIntegerDistribution Java Examples

Source File: SampleUtil.java From JavaBase with MIT License

6 votes

/**
 * Draws {@code count} elements from {@code list} using the supplied sampling strategy.
 *
 * <p>Every element is keyed by its position in {@code list}. The strategy returns positions,
 * which are then translated back into elements.
 *
 * @param list candidate elements; {@code null} or empty yields an empty result
 * @param count number of elements requested; when it exceeds {@code list.size()} a warning
 *     is logged and an empty list is returned
 * @param function strategy that turns the index distribution and {@code count} into the
 *     chosen indexes
 * @return the elements at the chosen indexes, in the order the strategy returned them
 */
private static <T extends SampleAble> List<T> sampleResult(List<T> list, int count,
    BiFunction<EnumeratedIntegerDistribution, Integer, List<Integer>> function) {
  if (list == null || list.isEmpty()) {
    return new ArrayList<>();
  }

  final int available = list.size();
  if (count > available) {
    log.warn("data less than count: data size={} count={}", available, count);
    return new ArrayList<>();
  }

  // position -> element lookup, shared with the distribution builder
  final Map<Integer, T> byIndex = IntStream.range(0, available).boxed()
      .collect(Collectors.toMap(idx -> idx, list::get));

  final List<Integer> chosen = function.apply(generateEnumerated(list, byIndex), count);

  final List<T> picked = new ArrayList<>(chosen.size());
  for (Integer idx : chosen) {
    picked.add(byIndex.get(idx));
  }
  return picked;
}

Source File: SampleUtil.java From JavaBase with MIT License

6 votes

/**
 * Draws {@code size} distinct values from the distribution by sampling until enough
 * different values have been collected.
 *
 * <p>The loop only ends once {@code size} distinct values have been seen, so the distribution
 * must be able to produce at least that many. The result is copied out of a {@link HashSet},
 * so its order is the set's iteration order rather than the order of the draws.
 *
 * @param distribution source of samples; {@code null} yields an empty list
 * @param size number of distinct values wanted; values {@code <= 0} yield an empty list
 * @return a new list holding the distinct values drawn
 */
private static List<Integer> sampleWithNoRepeated(EnumeratedIntegerDistribution distribution,
    int size) {
  final boolean usable = distribution != null && size > 0;
  if (!usable) {
    return new ArrayList<>();
  }

  final Set<Integer> drawn = new HashSet<>(size);
  int attempts = 0;
  for (; drawn.size() < size; attempts++) {
    drawn.add(distribution.sample());
  }

  log.debug("loop: count={}", attempts);
  return new ArrayList<>(drawn);
}

Source File: MarkovChainEvaluator.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Builds a Markov chain from a square transition matrix.
 *
 * <p>Row {@code i} of the matrix supplies the probabilities of the distribution over the
 * states {@code 0..n-1} that is stored for state {@code i}. Only the first row's length is
 * compared against the number of rows here.
 *
 * @param matrix transition matrix; must be non-empty with as many columns as rows
 * @param state initial state; any negative value selects a random initial state
 * @throws IOException if the matrix is empty or not square
 */
public MarkovChain(Matrix matrix, int state) throws IOException {
  double[][] data = matrix.getData();

  // Reject an empty matrix explicitly: data[0] below would otherwise fail with an
  // ArrayIndexOutOfBoundsException rather than the declared IOException.
  if(data.length == 0) {
    throw new IOException("markovChain must be initialized with a non-empty square matrix.");
  }

  if(data.length != data[0].length) {
    throw new IOException("markovChain must be initialized with a square matrix.");
  }

  this.distributions = new EnumeratedIntegerDistribution[data.length];

  if(state > -1) {
    this.state = state;
  } else {
    this.state = new Random().nextInt(data.length);
  }

  //The states array needed by the enumerated distribution is the same for every row,
  //so it is created once instead of once per iteration
  int[] states = MathArrays.sequence(data.length, 0, 1);

  for(int i=0; i<data.length; i++) {
    double[] probabilities = data[i];
    distributions[i] = new EnumeratedIntegerDistribution(states, probabilities);
  }
}

Source File: SampleUtil.java From JavaBase with MIT License

5 votes

/**
 * Picks one element through the distribution built by {@code generateEnumerated} and removes
 * it from {@code list}, so a later call on the same list cannot pick it again.
 *
 * @param list mutable candidate list; the chosen element is removed from it
 * @return the element that was removed
 */
private static <T extends SampleAble> T sampleOneWithNoReturn(List<T> list) {
  final Map<Integer, T> byIndex = IntStream.range(0, list.size()).boxed()
      .collect(Collectors.toMap(idx -> idx, list::get));

  final int chosen = generateEnumerated(list, byIndex).sample();

  // Map.remove hands back the mapped element while dropping the entry
  final T picked = byIndex.remove(chosen);
  // chosen is an int, so this is the positional List.remove(int)
  list.remove(chosen);
  return picked;
}

Source File: SampleUtil.java From JavaBase with MIT License

5 votes

/**
 * Draws {@code size} values independently from the distribution, so the same value may
 * appear more than once.
 *
 * @param distribution source of samples
 * @param size number of draws; zero or negative yields an empty list
 * @return a new list with the drawn values in draw order
 */
private static List<Integer> sampleWithRepeated(EnumeratedIntegerDistribution distribution,
    int size) {
  return IntStream.range(0, size)
      .mapToObj(ignored -> distribution.sample())
      .collect(Collectors.toCollection(ArrayList::new));
}

Source File: SampleUtil.java From JavaBase with MIT License

5 votes

/**
 * Builds a distribution over the keys of {@code tempMap} in which the {@code i}-th smallest
 * key is drawn with probability proportional to the weight of {@code list.get(i)}.
 *
 * @param list weighted elements, in the order their probabilities are laid out
 * @param tempMap index-to-element lookup whose keys become the values of the distribution;
 *     callers in this file pass the positions {@code 0..list.size()-1}
 * @return distribution whose samples are keys of {@code tempMap}
 */
private static <T extends SampleAble> EnumeratedIntegerDistribution generateEnumerated(
    List<T> list, Map<Integer, T> tempMap) {

  // Work in double from the start so that many large weights cannot overflow an int total.
  double[] probabilities =
      list.stream().mapToInt(SampleAble::getWeight).asDoubleStream().toArray();

  double sum = 0;
  for (double weight : probabilities) {
    sum += weight;
  }
  for (int i = 0; i < probabilities.length; i++) {
    probabilities[i] /= sum;
  }

  // Sort the keys explicitly. A Map gives no guarantee about key iteration order (a HashMap
  // does not iterate large Integer keys in ascending order), and an unsorted key array would
  // pair positions with the probability of a different element.
  int[] positions = tempMap.keySet().stream().mapToInt(Integer::intValue).sorted().toArray();

  return new EnumeratedIntegerDistribution(positions, probabilities);
}

Source File: PartitionTest.java From sequence-mining with GNU General Public License v3.0

5 votes

/**
 * Samples a large number of transactions from four sequences that each occur exactly once
 * and checks that the number of distinct transactions equals {@code modP} of the sequences.
 */
@Test
public void testInterleavingGenerator() {

	// Fixed seeds for all three random sources
	final Random firstRng = new Random(1);
	final Random secondRng = new Random(10);
	final RandomGenerator countRng = new JDKRandomGenerator();
	countRng.setSeed(100);

	final Multiset<Sequence> interesting = HashMultiset.create();
	interesting.add(new Sequence(1, 2, 3));
	interesting.add(new Sequence(4, 5));
	interesting.add(new Sequence(6));
	interesting.add(new Sequence(7));

	// Every distinct sequence gets the value 1.0
	final HashMap<Sequence, Double> sequenceValues = new HashMap<>();
	for (final Sequence sequence : interesting.elementSet()) {
		sequenceValues.put(sequence, 1.0);
	}

	// Distribution that always yields 1, shared by all sequences
	final EnumeratedIntegerDistribution alwaysOne = new EnumeratedIntegerDistribution(countRng, new int[] { 1 },
			new double[] { 1.0 });
	final Map<Sequence, EnumeratedIntegerDistribution> repeatDists = new HashMap<>();
	final Sequence[] keys = { new Sequence(1, 2, 3), new Sequence(4, 5), new Sequence(6), new Sequence(7) };
	for (final Sequence key : keys) {
		repeatDists.put(key, alwaysOne);
	}

	final HashSet<Transaction> distinctTransactions = new HashSet<>();
	int remaining = 700000;
	while (remaining-- > 0) {
		distinctTransactions.add(TransactionGenerator.sampleFromDistribution(firstRng, sequenceValues,
				repeatDists, new HashMap<>(), secondRng));
	}
	// Note that upper bound is exact when there are no repetitions
	assertEquals(distinctTransactions.size(), modP(interesting.iterator()), EPS);
}

Source File: BM.java From pyramid with Apache License 2.0

5 votes

/**
 * Samples a vector from the mixture distribution: a cluster is drawn according to
 * {@code mixtureCoefficients}, then each dimension is drawn from that cluster's
 * distribution for the dimension.
 * @return a new vector with {@code dimension} sampled entries
 */
public Vector sample(){
    Vector result = new DenseVector(dimension);

    // step 1: choose the cluster
    EnumeratedIntegerDistribution clusterPicker = new EnumeratedIntegerDistribution(
            IntStream.range(0,numClusters).toArray(), mixtureCoefficients);
    int chosen = clusterPicker.sample();

    // step 2: fill every dimension from the chosen cluster
    int d = 0;
    while (d<dimension){
        result.set(d, distributions[chosen][d].sample());
        d++;
    }
    return result;
}

Source File: KMeansPlusPlus.java From pyramid with Apache License 2.0

5 votes

/**
 * Selects the initial centers until {@code numComponents} of them exist.
 *
 * <p>The first center is the row picked by {@code Sampling.intUniform} over all row indices.
 * Each further center is sampled from a distribution over all row indices whose weights are
 * the values of {@code distances} after {@code updateDistance()}, scaled in place to sum to one.
 *
 * @param print whether to report progress on standard output
 */
public void initialize(boolean print){
    if (print){
        System.out.println("initialize");
    }

    final int firstId = Sampling.intUniform(0,dataSet.getNumDataPoints()-1);
    centers.add(dataSet.getRow(firstId));
    pickedIds.add(firstId);
    if (print){
        System.out.println("randomly pick instance "+(firstId+1)+" as the initial centroid for cluster "+centers.size());
    }

    while(centers.size()<numComponents){
        updateDistance();

        // scale the distances in place so that they sum to one
        final double total = MathUtil.arraySum(distances);
        int pos = 0;
        while (pos<distances.length){
            distances[pos] = distances[pos] / total;
            pos++;
        }

        final EnumeratedIntegerDistribution picker = new EnumeratedIntegerDistribution(
                IntStream.range(0, dataSet.getNumDataPoints()).toArray(), distances);
        final int nextId = picker.sample();
        centers.add(dataSet.getRow(nextId));
        pickedIds.add(nextId);
        if (print){
            System.out.println("randomly pick instance "+(nextId+1)+" as the initial centroid for cluster "+centers.size());
        }

    }
}

Source File: ConsumerVerifier.java From hermes with Apache License 2.0

4 votes

/**
 * Prepares {@code nackRnd}: the value 0 carries probability 0.05 and the value 1 carries
 * probability 0.95.
 */
@Before
public void before() {
	nackRnd = new EnumeratedIntegerDistribution(new int[] { 0, 1 }, new double[] { 0.05, 0.95 });
}

Source File: ClassifierWeightedSampling.java From AILibs with GNU Affero General Public License v3.0

4 votes

/**
 * Gives a weighted random selection of instances the threshold 1 and all other instances
 * the threshold 0.
 *
 * <p>Each instance receives a sampling weight computed from the pilot's prediction for it;
 * instances whose prediction fails receive weight 0. Distinct indices are then drawn from
 * the resulting distribution until the sample size is reached (or until every index with a
 * positive weight has been drawn, whichever comes first).
 *
 * @param dataset instances to compute thresholds for
 * @param pilot trained classifier used to predict each instance
 * @return one pair per instance of {@code dataset}, in dataset order, holding the instance
 *     and its threshold (1 if selected, 0 otherwise)
 */
@Override
public List<Pair<ILabeledInstance, Double>> calculateAcceptanceThresholdsWithTrainedPilot(final D dataset, final IClassifier pilot) {

	/* compute mean value and base values the instances must have */
	double mid = this.getMean(dataset);
	double baseValue = 10 * mid + 1; // arbitrary value, there most likely be better one
	double addForRightClassification = baseValue + 2 * mid; // like baseValue

	/* determine probability for each index to be chosen */
	double[] weights = new double[dataset.size()];
	int drawable = 0; // number of indices with a positive weight, i.e. that can be drawn at all
	for (int i = 0; i < weights.length; i++) {
		try {
			IPrediction prediction = pilot.predict(dataset.get(i));
			/* compare labels by value; == would only compare object identity */
			if (java.util.Objects.equals(prediction.getLabelWithHighestProbability(), dataset.get(i).getLabel())) {
				weights[i] = addForRightClassification - prediction.getProbabilityOfLabel(dataset.get(i).getLabel());
			} else {
				weights[i] = baseValue + prediction.getProbabilityOfLabel(prediction.getLabelWithHighestProbability());
			}
		} catch (Exception e) {
			/* best effort: an instance that cannot be predicted is never drawn */
			weights[i] = 0;
		}
		if (weights[i] > 0) {
			drawable++;
		}
	}
	/* one index per weight, so both arrays always have the same length */
	int[] indices = IntStream.range(0, weights.length).toArray();
	EnumeratedIntegerDistribution finalDistribution = new EnumeratedIntegerDistribution(indices, weights);
	finalDistribution.reseedRandomGenerator(this.rand.nextLong());

	/* now draw <number of samples> many indices whose threshold will be set to 1;
	 * capped at the number of drawable indices, otherwise the rejection loop below could never finish */
	int n = Math.min(this.getSampleSize(), drawable);
	Set<Integer> consideredIndices = new HashSet<>();
	for (int i = 0; i < n; i++) {
		int index;
		do {
			index = finalDistribution.sample();
		} while (consideredIndices.contains(index));
		consideredIndices.add(index);
	}

	/* now create the list of pairs */
	List<Pair<ILabeledInstance, Double>> thresholds = new ArrayList<>();
	int m = dataset.size();
	for (int i = 0; i < m; i++) {
		ILabeledInstance inst = dataset.get(i);
		double threshold = consideredIndices.contains(i) ? 1 : 0;
		thresholds.add(new Pair<>(inst, threshold));
	}
	return thresholds;
}

Source File: TransactionGenerator.java From sequence-mining with GNU General Public License v3.0

4 votes

/**
 * Generate transactions from set of interesting sequences and write them to a file.
 *
 * <p>Each non-empty transaction becomes one line: every item is followed by {@code " -1 "}
 * and the line ends with {@code "-2"}. Sampling continues until {@code noTransactions}
 * non-empty transactions have been written. When {@code VERBOSE} is set, the finished file
 * is echoed to standard output.
 *
 * @param sequences
 *            sequences to sample from; passed on to {@code sampleFromDistribution}
 * @param probabilities
 *            for each sequence (row), the values (column) and their probabilities used to
 *            build that sequence's distribution
 * @param noTransactions
 *            number of non-empty transactions to write
 * @param outFile
 *            output file, written in UTF-8
 * @return set of sequences added to transaction
 * @throws IOException
 *             if the file cannot be written or read back
 */
public static HashMap<Sequence, Double> generateTransactionDatabase(final Map<Sequence, Double> sequences,
		final Table<Sequence, Integer, Double> probabilities, final int noTransactions, final File outFile)
				throws IOException {

	// Set random number seeds
	final Random random = new Random(1);
	final Random randomI = new Random(10);
	final RandomGenerator randomC = new JDKRandomGenerator();
	randomC.setSeed(100);

	// Storage for sequences actually added
	final HashMap<Sequence, Double> addedSequences = new HashMap<>();

	// Add to distribution class for easy sampling
	final Map<Sequence, EnumeratedIntegerDistribution> dists = new HashMap<>();
	for (final Sequence seq : sequences.keySet()) {
		final List<Integer> singletons = new ArrayList<>();
		final List<Double> probs = new ArrayList<>();
		for (final Entry<Integer, Double> entry : probabilities.row(seq).entrySet()) {
			singletons.add(entry.getKey());
			probs.add(entry.getValue());
		}
		final EnumeratedIntegerDistribution dist = new EnumeratedIntegerDistribution(randomC,
				Ints.toArray(singletons), Doubles.toArray(probs));
		dists.put(seq, dist);
	}

	// Generate transaction database; try-with-resources closes the output file even when
	// sampling throws
	try (PrintWriter out = new PrintWriter(outFile, "UTF-8")) {
		int count = 0;
		while (count < noTransactions) {

			// Generate transaction from distribution
			final Transaction transaction = sampleFromDistribution(random, sequences, dists, addedSequences,
					randomI);
			for (final int item : transaction) {
				out.print(item + " -1 ");
			}
			if (!transaction.isEmpty()) {
				out.print("-2");
				out.println();
				count++;
			}

		}
	}

	// Print file to screen
	if (VERBOSE) {
		// Read back with the charset the file was written in, and always release the reader
		try (java.io.Reader reader = new java.io.InputStreamReader(new java.io.FileInputStream(outFile),
				"UTF-8")) {
			final LineIterator it = new LineIterator(reader);
			while (it.hasNext()) {
				System.out.println(it.nextLine());
			}
		}
	}

	return addedSequences;
}

Source File: MultiLabelSynthesizer.java From pyramid with Apache License 2.0

4 votes

/**
 * Builds a synthetic data set with 2 features and 4 classes, then flips one label per row.
 *
 * <p>Class weight vectors:
 * y0: w=(0,1)
 * y1: w=(1,1)
 * y2: w=(1,0)
 * y3: w=(1,-1)
 *
 * <p>Features come from {@code Sampling.doubleUniform(-1, 1)}. A row gets label k when its
 * dot product with the weight vector of class k is non-negative. Afterwards, for every row,
 * one class is drawn with probabilities (0.4, 0.2, 0.2, 0.2) and that label is toggled.
 * @param numData number of data points to generate
 * @return the generated data set
 */
public static MultiLabelClfDataSet flipOneNonUniform(int numData){
    final int numClass = 4;
    final int numFeature = 2;

    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder().numFeatures(numFeature)
            .numClasses(numClass)
            .numDataPoints(numData)
            .build();

    // generate weights: one row of the table per class
    final double[][] weightTable = {
            {0, 1},
            {1, 1},
            {1, 0},
            {1, -1}
    };
    Vector[] weights = new Vector[numClass];
    for (int k=0;k<numClass;k++){
        weights[k] = new DenseVector(numFeature);
    }
    for (int k=0;k<numClass;k++){
        for (int j=0;j<numFeature;j++){
            weights[k].set(j, weightTable[k][j]);
        }
    }

    // generate features
    for (int row=0;row<numData;row++){
        for (int col=0;col<numFeature;col++){
            dataSet.setFeatureValue(row,col,Sampling.doubleUniform(-1, 1));
        }
    }

    // assign labels
    for (int row=0;row<numData;row++){
        for (int k=0;k<numClass;k++){
            if (weights[k].dot(dataSet.getRow(row))>=0){
                dataSet.addLabel(row,k);
            }
        }
    }

    IntegerDistribution flipPicker = new EnumeratedIntegerDistribution(
            new int[]{0,1,2,3}, new double[]{0.4,0.2,0.2,0.2});

    // flip: toggle the drawn class on every row
    for (int row=0;row<numData;row++){
        int flipped = flipPicker.sample();
        MultiLabel rowLabels = dataSet.getMultiLabels()[row];
        if (!rowLabels.matchClass(flipped)){
            rowLabels.addLabel(flipped);
        } else {
            rowLabels.removeLabel(flipped);
        }
    }

    return dataSet;
}

Source File: MultiLabelSynthesizer.java From pyramid with Apache License 2.0

4 votes

/**
 * Builds a synthetic data set of 10000 points with 2 features and 2 classes, where each
 * row's labels are decided by the weight vectors of a randomly drawn cluster.
 *
 * <p>Weight vectors per cluster and class:
 * C0, y0: w=(0,1)
 * C0, y1: w=(1,1)
 * C1, y0: w=(1,0)
 * C1, y1: w=(1,-1)
 *
 * <p>Clusters 0 and 1 are drawn with proportions 0.4 and 0.6. The drawn cluster, the row,
 * the weight vector and the dot product are printed to standard output while labelling.
 * @return the generated data set
 */
public static MultiLabelClfDataSet sampleFromMix(){
    final int numData = 10000;
    final int numClass = 2;
    final int numFeature = 2;
    final int numClusters = 2;
    double[] proportions = {0.4,0.6};
    int[] indices = {0,1};

    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder()
            .numFeatures(numFeature)
            .numClasses(numClass)
            .numDataPoints(numData)
            .build();

    // generate weights: weightTable[cluster][class] lists the vector entries
    final double[][][] weightTable = {
            {{0, 1}, {1, 1}},
            {{1, 0}, {1, -1}}
    };
    Vector[][] weights = new Vector[numClusters][numClass];
    for (int c=0;c<numClusters;c++){
        for (int l=0;l<numClass;l++){
            weights[c][l] = new DenseVector(numFeature);
        }
    }
    for (int c=0;c<numClusters;c++){
        for (int l=0;l<numClass;l++){
            for (int j=0;j<numFeature;j++){
                weights[c][l].set(j, weightTable[c][l][j]);
            }
        }
    }

    // generate features
    for (int row=0;row<numData;row++){
        for (int col=0;col<numFeature;col++){
            dataSet.setFeatureValue(row,col,Sampling.doubleUniform(-1, 1));
        }
    }

    IntegerDistribution clusterPicker = new EnumeratedIntegerDistribution(indices,proportions);
    // assign labels using the weights of the cluster drawn for each row
    for (int row=0;row<numData;row++){
        int cluster = clusterPicker.sample();
        System.out.println("cluster "+cluster);
        for (int l=0;l<numClass;l++){
            System.out.println("row = "+dataSet.getRow(row));
            System.out.println("weight = "+ weights[cluster][l]);
            double dot = weights[cluster][l].dot(dataSet.getRow(row));
            System.out.println("dot = "+dot);
            if (!(dot<0)){
                dataSet.addLabel(row,l);
            }
        }
    }

    return dataSet;
}

Source File: SamplingPrediction.java From pyramid with Apache License 2.0

4 votes

/**
 * Picks one candidate at random, where the candidate at position {@code i} is weighted by
 * {@code probabilities[i]}.
 *
 * @param probabilities weight of each candidate position
 * @param candidates labels to choose from; indexed by the sampled position
 * @return the candidate at the sampled position
 */
public static MultiLabel predict(double[] probabilities, List<MultiLabel> candidates){
    EnumeratedIntegerDistribution picker = new EnumeratedIntegerDistribution(
            IntStream.range(0, probabilities.length).toArray(), probabilities);
    return candidates.get(picker.sample());
}

org.apache.commons.math3.distribution.EnumeratedIntegerDistribution Java Examples