weka.core.Instances#sort

Source File: BinC45Split.java From tsml with GNU General Public License v3.0

6 votes

/**
 * Creates a C4.5-type split on the given data, using the attribute at
 * index m_attIndex.
 *
 * The split statistics are reset first. A nominal attribute is handed
 * straight to the enumerated handler; any other attribute is treated as
 * numeric, in which case the supplied data is sorted in place on that
 * attribute before the numeric handler runs.
 *
 * @param trainInstances the training data; reordered when the attribute
 *                       is not nominal
 * @exception Exception if something goes wrong
 */
public void buildClassifier(Instances trainInstances)
     throws Exception {

  // Reset the state left over from any previous evaluation.
  m_numSubsets = 0;
  m_splitPoint = Double.MAX_VALUE;
  m_infoGain = 0;
  m_gainRatio = 0;

  // Nominal attributes need no ordering of the data.
  if (trainInstances.attribute(m_attIndex).isNominal()) {
    handleEnumeratedAttribute(trainInstances);
    return;
  }

  // Numeric case: order the data on the attribute, then evaluate it.
  trainInstances.sort(trainInstances.attribute(m_attIndex));
  handleNumericAttribute(trainInstances);
}

Source File: NBTreeSplit.java From tsml with GNU General Public License v3.0

6 votes

/**
 * Creates a NBTree-type split on the given data, using the attribute at
 * index m_attIndex. Assumes that none of the class values is missing.
 *
 * The error count starts at zero, or at the value reported by the global
 * model when one is set. For a non-nominal attribute the supplied data is
 * sorted in place on that attribute before it is evaluated.
 *
 * @param trainInstances the training data; reordered when the attribute
 *                       is not nominal
 * @exception Exception if something goes wrong
 */
public void buildClassifier(Instances trainInstances) 
     throws Exception {

  // Reset the state left over from any previous evaluation.
  m_numSubsets = 0;
  m_splitPoint = Double.MAX_VALUE;
  m_errors = 0;
  if (m_globalNB != null) {
    m_errors = m_globalNB.getErrors();
  }

  // Nominal attribute: complexity equals its number of values.
  if (trainInstances.attribute(m_attIndex).isNominal()) {
    m_complexityIndex = trainInstances.attribute(m_attIndex).numValues();
    handleEnumeratedAttribute(trainInstances);
    return;
  }

  // Numeric attribute: complexity is fixed at 2; order the data on the
  // attribute before evaluating it.
  m_complexityIndex = 2;
  trainInstances.sort(trainInstances.attribute(m_attIndex));
  handleNumericAttribute(trainInstances);
}

Source File: C45Split.java From tsml with GNU General Public License v3.0

6 votes

/**
 * Creates a C4.5-type split on the given data, using the attribute at
 * index m_attIndex. Assumes that none of the class values is missing.
 *
 * For a nominal attribute both m_complexityIndex and m_index are set to
 * its number of values. Otherwise they are set to 2 and 0 respectively,
 * and the supplied data is sorted in place on the attribute before it is
 * evaluated.
 *
 * @param trainInstances the training data; reordered when the attribute
 *                       is not nominal
 * @exception Exception if something goes wrong
 */
public void buildClassifier(Instances trainInstances) 
     throws Exception {

  // Reset the state left over from any previous evaluation.
  m_numSubsets = 0;
  m_splitPoint = Double.MAX_VALUE;
  m_infoGain = 0;
  m_gainRatio = 0;

  // Nominal attribute: handled without ordering the data.
  if (trainInstances.attribute(m_attIndex).isNominal()) {
    m_complexityIndex = trainInstances.attribute(m_attIndex).numValues();
    m_index = m_complexityIndex;
    handleEnumeratedAttribute(trainInstances);
    return;
  }

  // Numeric attribute: order the data on it first.
  m_complexityIndex = 2;
  m_index = 0;
  trainInstances.sort(trainInstances.attribute(m_attIndex));
  handleNumericAttribute(trainInstances);
}

Source File: Discretize.java From tsml with GNU General Public License v3.0

6 votes

/**
 * Set cutpoints for a single attribute using MDL.
 *
 * The data is sorted in place on the attribute, and only the leading run
 * of instances whose value for the attribute is present is passed on to
 * cutPointsForSubset. The result is stored in m_CutPoints[index].
 *
 * @param index the index of the attribute to set cutpoints for
 * @param data the data to work with; reordered by this call
 */
protected void calculateCutPointsByMDL(int index,
			 Instances data) {

  // Order the instances on the attribute of interest.
  data.sort(data.attribute(index));

  // Advance to the first instance with a missing value; if there is
  // none, the position ends up equal to the number of instances.
  int total = data.numInstances();
  int firstMissing = 0;
  while (firstMissing < total
         && !data.instance(firstMissing).isMissing(index)) {
    firstMissing++;
  }

  m_CutPoints[index] = cutPointsForSubset(data, index, 0, firstMissing);
}

Source File: Sampling.java From tsml with GNU General Public License v3.0

5 votes

/** 
 * Reorder the dataset by class size, so that the instances of each class
 * are contiguous in the result.
 * <p>
 * The classes are emitted from the last to the first entry of the size
 * array after it has been sorted by QuickSort.sort; assuming that sort is
 * ascending and permutes the label array alongside (TODO confirm against
 * QuickSort), the largest class comes first.
 * <p>
 * Note that {@code data} itself is sorted in place on its class attribute.
 *
 * @param data the dataset to reorder; its class attribute must provide
 *             nominal counts
 * @return a new dataset containing the instances of {@code data} grouped
 *         by class
 */
public static Instances orderByLargestClass(Instances data) {
	Instances newData = new Instances(data, data.numInstances());
	
	// get the number of class in the data
	int nbClass = data.numClasses();
	int[] instancePerClass = new int[nbClass];
	int[] labels = new int[nbClass];
	int[] classIndex = new int[nbClass];
	
	// sort the data base on its class
	data.sort(data.classAttribute());
	
	// The class statistics do not change inside the loop below, so they
	// are computed once instead of once per class.
	int[] classCounts = data.attributeStats(data.classIndex()).nominalCounts;
	
	// record the size of every class and the offset of its first
	// instance in the sorted data
	for (int i = 0; i < nbClass; i++) {
		instancePerClass[i] = classCounts[i];
		labels[i] = i;
		if (i > 0)
			classIndex[i] = classIndex[i-1] + instancePerClass[i-1];
	}
	// after this call instancePerClass[i] is the size of class labels[i]
	QuickSort.sort(instancePerClass, labels);
	
	// copy the classes out, walking the sorted sizes from back to front
	for (int i = nbClass-1; i >=0 ; i--) {
		for (int j = 0; j < instancePerClass[i]; j++) {
			newData.add(data.instance(classIndex[labels[i]] + j));
		}
	}
	
	return newData;
}

Source File: Segmenter.java From gsn with GNU General Public License v3.0

5 votes

/**
 * Builds a segmented classifier on the given data and records its errors.
 *
 * The classifier is obtained from Tools for the configured model and is
 * wrapped together with a DummyFilter. After training, the data is sorted
 * in place on attribute 0 and the errors returned by Tools.get_errors are
 * stored in Pred_errors.
 *
 * @param i the data to train and evaluate on; reordered by this call
 * @param seg the segment values handed to the SegmentedClassifier
 * @return the trained segmented classifier
 * @throws Exception if the filter set-up, training or evaluation fails
 */
public SegmentedClassifier computeErrors(Instances i,Double[] seg) throws Exception{
	Classifier base = Tools.getClassifierById(model);

	Filter filter = new DummyFilter();
	filter.setInputFormat(i);

	SegmentedClassifier segmented = new SegmentedClassifier(base, 1, seg, filter);
	segmented.buildClassifier(i);

	// The errors are computed on the data ordered by its first attribute.
	i.sort(0);
	Pred_errors = Tools.get_errors(segmented, i);

	return segmented;
}

Source File: Sampling.java From tsml with GNU General Public License v3.0

4 votes

/** 
 * Reorder the data by compactness of each class using Euclidean distance.
 * <p>
 * For every class the compactness is the average, over its instances, of
 * the squared Euclidean distance to the class mean minus the squared
 * difference in attribute 0 (presumably attribute 0 is the class
 * attribute, so its contribution is removed -- TODO confirm).
 * <p>
 * The classes are emitted from the last to the first entry of the
 * compactness array after it has been sorted by QuickSort.sort; assuming
 * that sort is ascending and permutes the label array alongside (TODO
 * confirm against QuickSort), the class with the largest value comes first.
 * <p>
 * Note that {@code data} itself is sorted in place on its class attribute.
 *
 * @param data the dataset to reorder; its class attribute must provide
 *             nominal counts
 * @return a new dataset containing the instances of {@code data} grouped
 *         by class
 */
public static Instances orderByCompactClass(Instances data) {
	Instances newData = new Instances(data, data.numInstances());
	
	// get the number of class in the data
	int nbClass = data.numClasses();
	int[] instancePerClass = new int[nbClass];
	int[] labels = new int[nbClass];
	int[] classIndex = new int[nbClass];
	double[] compactness = new double[nbClass];
	
	// sort the data base on its class
	data.sort(data.classAttribute());
	
	// The class statistics do not change inside the loop below, so they
	// are computed once instead of once per class.
	int[] classCounts = data.attributeStats(data.classIndex()).nominalCounts;
	
	int start = 0;
	for (int i = 0; i < nbClass; i++) {
		// size of the class and offset of its first instance
		instancePerClass[i] = classCounts[i];
		labels[i] = i;
		if (i > 0) 
			classIndex[i] = classIndex[i-1] + instancePerClass[i-1];
		int end = start + instancePerClass[i];
		
		// Gather the raw values of this class. Only the outer array is
		// allocated: every row is supplied by toDoubleArray().
		double[][] dataPerClass = new double[instancePerClass[i]][];
		for (int j = start; j < end; j++) {
			dataPerClass[j - start] = data.instance(j).toDoubleArray();
		}
		
		// accumulate the squared distance to the mean, leaving out the
		// part contributed by attribute 0
		double[] mean = arithmeticMean(dataPerClass);
		double d = 0;
		for (int j = 0; j < instancePerClass[i]; j++) {
			double temp = euclideanDistance(mean, dataPerClass[j]);
			temp *= temp;
			temp -= (mean[0] - dataPerClass[j][0]) * (mean[0] - dataPerClass[j][0]);
			d += temp;
		}
		compactness[i] = d / instancePerClass[i];
		start = end;
	}
	
	// after this call compactness[i] belongs to class labels[i];
	// instancePerClass and classIndex are still indexed by class label
	QuickSort.sort(compactness, labels);
	
	// copy the classes out, walking the sorted values from back to front
	for (int i = nbClass-1; i >=0 ; i--) {
		for (int j = 0; j < instancePerClass[labels[i]]; j++) {
			newData.add(data.instance(classIndex[labels[i]] + j));
		}
	}
	
	return newData;
}

Source File: ResidualSplit.java From tsml with GNU General Public License v3.0

4 votes

/**
  * Selects split point for numeric attribute.
  *
  * Candidate split points are the midpoints between consecutive distinct
  * values of the attribute in a sorted copy of m_data. Each candidate is
  * assigned to m_splitPoint in turn and scored with entropyGain(); the
  * candidate with the strictly greatest gain is kept (the earliest one
  * wins on ties).
  *
  * @return true if a split point was selected and stored in m_splitPoint,
  *         false if no candidate was selected
  * @throws Exception if sorting or the gain computation fails
  */
 protected boolean getSplitPoint() throws Exception{

   double[] candidates = new double[m_numInstances];
   int numCandidates = 0;

   // Sort a copy so that m_data keeps its original order.
   Instances ordered = new Instances(m_data);
   ordered.sort(ordered.attribute(m_attIndex));

   // Collect a midpoint wherever the attribute value changes.
   double previous = ordered.instance(0).value(m_attIndex);
   for (int pos = 1; pos < m_numInstances; pos++) {
     double next = ordered.instance(pos).value(m_attIndex);
     if (!Utils.eq(next, previous)) {
       candidates[numCandidates] = (previous + next) / 2.0;
       numCandidates++;
     }
     previous = next;
   }

   // Score every candidate and remember the best one seen so far.
   int winner = -1;
   double winnerGain = -Double.MAX_VALUE;
   for (int c = 0; c < numCandidates; c++) {
     m_splitPoint = candidates[c];
     double gain = entropyGain();
     if (gain > winnerGain) {
       winnerGain = gain;
       winner = c;
     }
   }

   if (winner < 0) {
     return false;
   }

   m_splitPoint = candidates[winner];
   return true;
 }

Source File: IsotonicRegression.java From tsml with GNU General Public License v3.0

4 votes

/**
 * Does the actual regression for one attribute and one direction.
 *
 * The instances are sorted in place on the attribute and grouped into
 * blocks of equal attribute value. Adjacent blocks that violate the
 * requested direction are merged repeatedly until none remain. The
 * resulting cut points and block predictions are installed in m_attribute,
 * m_cuts and m_values, the model is evaluated on the same instances, and
 * the new model is kept only if its error is lower than m_minMsq.
 *
 * @param attribute the attribute to regress on
 * @param insts the training instances; reordered by this call
 * @param ascending true to require predictions that increase with the
 *                  attribute value, false to require decreasing ones
 * @throws Exception if the evaluation fails
 */
protected void regress(Attribute attribute, Instances insts, boolean ascending) 
  throws Exception {

  // Sort values according to current attribute
  insts.sort(attribute);
  
  // Initialize arrays: one slot per block of equal attribute value.
  // values holds the sum of class values, weights the sum of instance
  // weights, and cuts the midpoints between neighbouring blocks.
  double[] values = new double[insts.numInstances()];
  double[] weights = new double[insts.numInstances()];
  double[] cuts = new double[insts.numInstances() - 1];
  int size = 0;
  values[0] = insts.instance(0).classValue();
  weights[0] = insts.instance(0).weight();
  for (int i = 1; i < insts.numInstances(); i++) {
    // A strictly larger attribute value starts a new block
    if (insts.instance(i).value(attribute) >
        insts.instance(i - 1).value(attribute)) {
      cuts[size] = (insts.instance(i).value(attribute) +
                    insts.instance(i - 1).value(attribute)) / 2;
      size++;
    }
    values[size] += insts.instance(i).classValue();
    weights[size] += insts.instance(i).weight();
  }
  // size was the index of the last block; make it the block count
  size++;
  
  // While there is a pair of adjacent violators
  boolean violators;
  do {
    violators = false;
    
    // Initialize arrays
    double[] tempValues = new double[size];
    double[] tempWeights = new double[size];
    double[] tempCuts = new double[size - 1];
    
    // Merge adjacent violators
    int newSize = 0;
    tempValues[0] = values[0];
    tempWeights[0] = weights[0];
    for (int j = 1; j < size; j++) {
      // Block j stays separate only if its ratio values/weights is
      // strictly beyond that of the current merged block in the
      // requested direction
      if ((ascending && (values[j] / weights[j] > 
                         tempValues[newSize] / tempWeights[newSize])) ||
          (!ascending && (values[j] / weights[j] < 
                          tempValues[newSize] / tempWeights[newSize]))) {
        tempCuts[newSize] = cuts[j - 1];
        newSize++;
        tempValues[newSize] = values[j];
        tempWeights[newSize] = weights[j];
      } else {
        // Otherwise fold block j into the current merged block and
        // schedule another pass
        tempWeights[newSize] += weights[j];
        tempValues[newSize] += values[j];
        violators = true;
      }
    }
    newSize++;
    
    // Copy references
    values = tempValues;
    weights = tempWeights;
    cuts = tempCuts;
    size = newSize;
  } while (violators);
  
  // Compute actual predictions (sum of class values over sum of weights)
  for (int i = 0; i < size; i++) {
    values[i] /= weights[i];
  }
  
  // Backup best instance variables
  Attribute attributeBackedup = m_attribute;
  double[] cutsBackedup = m_cuts;
  double[] valuesBackedup = m_values;
  
  // Set instance variables to values computed for this attribute
  // (presumably read by the prediction code that the evaluation below
  // invokes on this object -- not visible here)
  m_attribute = attribute;
  m_cuts = cuts;
  m_values = values;
  
  // Evaluate this model on the training instances. Note that, despite
  // the variable name, the figure used is the root mean squared error.
  Evaluation eval = new Evaluation(insts);
  eval.evaluateModel(this, insts);
  double msq = eval.rootMeanSquaredError();
  
  // Check whether this is the best attribute; if not, restore the backup
  if (msq < m_minMsq) {
    m_minMsq = msq;
  } else {
    m_attribute = attributeBackedup;
    m_cuts = cutsBackedup;
    m_values = valuesBackedup;
  }
}

Source File: Discretize.java From tsml with GNU General Public License v3.0

4 votes

/**
  * Set cutpoints for a single attribute so that the intervals carry
  * roughly equal instance weight.
  *
  * Works on a sorted copy of the input format. The target weight per
  * interval is the desired weight if one is configured (> 0), otherwise
  * the total weight of the non-missing instances divided by m_NumBins.
  * The cut points found are stored in m_CutPoints[index], or null is
  * stored if there are none.
  *
  * @param index the index of the attribute to set cutpoints for
  */
 protected void calculateCutPointsByEqualFrequencyBinning(int index) {

   // Copy data so that it can be sorted
   Instances data = new Instances(getInputFormat());

   // Sort input data
   data.sort(index);

   // Compute weight of instances without missing values
   // (stops at the first missing value, i.e. relies on the sort having
   // placed missing values after all present ones)
   double sumOfWeights = 0;
   for (int i = 0; i < data.numInstances(); i++) {
     if (data.instance(i).isMissing(index)) {
break;
     } else {
sumOfWeights += data.instance(i).weight();
     }
   }
   // freq is the target weight per interval; the size of cutPoints is
   // the maximum number of cut points. NOTE(review): the initial
   // allocation below is overwritten in both branches.
   double freq;
   double[] cutPoints = new double[m_NumBins - 1];
   if (getDesiredWeightOfInstancesPerInterval() > 0) {
     freq = getDesiredWeightOfInstancesPerInterval();
     cutPoints = new double[(int)(sumOfWeights / freq)];
   } else {
     freq = sumOfWeights / m_NumBins;
     cutPoints = new double[m_NumBins - 1];
   }

   // Compute break points
   // counter: weight accumulated since the last cut point
   // last / lastIndex: value of counter and position at the most recent
   //   potential breakpoint that was not used (lastIndex == -1 if none)
   // cpindex: number of cut points stored so far
   double counter = 0, last = 0;
   int cpindex = 0, lastIndex = -1;
   for (int i = 0; i < data.numInstances() - 1; i++) {

     // Stop if value missing
     if (data.instance(i).isMissing(index)) {
break;
     }
     counter += data.instance(i).weight();
     // from here on sumOfWeights is the weight not yet visited
     sumOfWeights -= data.instance(i).weight();

     // Do we have a potential breakpoint?
     // (only between two instances whose values differ)
     if (data.instance(i).value(index) <
  data.instance(i + 1).value(index)) {

// Have we passed the ideal size?
if (counter >= freq) {

  // Is this break point worse than the last one?
  // (the remembered one undershoots freq by less than this one
  // overshoots it)
  if (((freq - last) < (counter - freq)) && (lastIndex != -1)) {
    cutPoints[cpindex] = (data.instance(lastIndex).value(index) +
			  data.instance(lastIndex + 1).value(index)) / 2;
    // weight after the chosen cut carries over into the next interval,
    // and the current position becomes the remembered breakpoint
    counter -= last;
    last = counter;
    lastIndex = i;
  } else {
    cutPoints[cpindex] = (data.instance(i).value(index) +
			  data.instance(i + 1).value(index)) / 2;
    counter = 0;
    last = 0;
    lastIndex = -1;
  }
  cpindex++;
  // spread the remaining weight evenly over the remaining intervals
  freq = (sumOfWeights + counter) / ((cutPoints.length + 1) - cpindex);
} else {
  // not enough weight yet: remember this breakpoint as a fallback
  lastIndex = i;
  last = counter;
}
     }
   }

   // Check whether there was another possibility for a cut point
   if ((cpindex < cutPoints.length) && (lastIndex != -1)) {
     cutPoints[cpindex] = (data.instance(lastIndex).value(index) +
		    data.instance(lastIndex + 1).value(index)) / 2;
     cpindex++;
   }

   // Did we find any cutpoints?
   if (cpindex == 0) {
     m_CutPoints[index] = null;
   } else {
     // keep only the entries that were actually filled in
     double[] cp = new double[cpindex];
     for (int i = 0; i < cpindex; i++) {
cp[i] = cutPoints[i];
     }
     m_CutPoints[index] = cp;
   }
 }

Source File: PropositionalToMultiInstance.java From tsml with GNU General Public License v3.0

4 votes

/**
 * Signify that this batch of input to the filter is finished. 
 * If the filter requires all instances prior to filtering,
 * output() may now be called to retrieve the filtered instances.
 *
 * The pending input is sorted on attribute 0 (the bag ID) and consecutive
 * instances with the same bag ID are collected into one bag. Each bag is
 * handed to addBag together with the class value of its first instance
 * and the summed weight of its instances. The output is optionally
 * randomized before it is pushed to the output queue.
 *
 * @return true if there are instances pending output
 * @throws IllegalStateException if no input structure has been defined
 */
public boolean batchFinished() {

  Instances input = getInputFormat();
  if (input == null) {
    throw new IllegalStateException("No input instance format defined");
  }

  input.sort(0);   // make sure that bagID is sorted
  Instances output = getOutputFormat();
  Instances bagInsts = output.attribute(1).relation();

  // Scratch instance that is refilled for every input row before being
  // added to the current bag.
  Instance inst = new DenseInstance(bagInsts.numAttributes());
  inst.setDataset(bagInsts);

  // State of the bag currently being assembled.
  Instance first = input.instance(0);
  double bagIndex   = first.value(0);
  double classValue = first.classValue(); 
  double bagWeight  = 0.0;

  // Every attribute except the bag ID (first) and the last one is copied.
  int numCopied = input.numAttributes() - 2;

  for (int i = 0; i < input.numInstances(); i++) {
    Instance current = input.instance(i);
    double currentBagIndex = current.value(0);

    for (int j = 0; j < numCopied; j++) {
      inst.setValue(j, current.value(j + 1));
    }
    inst.setWeight(current.weight());

    if (currentBagIndex != bagIndex) {
      // A new bag ID: emit the finished bag and start a fresh one.
      addBag(input, output, bagInsts, (int) bagIndex, classValue, bagWeight);

      bagInsts = bagInsts.stringFreeStructure();  
      bagInsts.add(inst);
      bagIndex = currentBagIndex;
      classValue = current.classValue();
      bagWeight = inst.weight();
    } else {
      // Same bag ID: extend the current bag.
      bagInsts.add(inst);
      bagWeight += inst.weight();
    }
  }

  // The last bag is still open once the input is exhausted.
  addBag(input, output, bagInsts, (int) bagIndex, classValue, bagWeight);

  if (getRandomize()) {
    output.randomize(new Random(getSeed()));
  }

  for (int i = 0; i < output.numInstances(); i++) {
    push(output.instance(i));
  }

  // Free memory
  flushInput();

  m_NewBatch = true;
  m_FirstBatchDone = true;

  return (numPendingOutput() != 0);
}

Source File: SubSample.java From gsn with GNU General Public License v3.0

4 votes

/**
 * Sub-samples the given instances.
 *
 * The input is sorted in place on the attribute m_index and then copied.
 * If it holds no more than m_ratio instances the copy is returned whole.
 * Otherwise only the instances whose 1-based position in the sorted
 * order is a multiple of m_ratio are kept.
 *
 * @param instances the instances to sub-sample; reordered by this call
 * @return a new, sub-sampled set of instances
 * @throws Exception declared by the overridden method
 */
@Override
protected Instances process(Instances instances) throws Exception {

	instances.sort(m_index);
	Instances output = new Instances(instances);

	if (instances.numInstances() > m_ratio) {
		// Walk from the back so that a deletion never shifts an
		// instance that still has to be examined.
		int position = output.numInstances();
		while (position > 0) {
			if (position % m_ratio != 0) {
				output.delete(position - 1);
			}
			position--;
		}
	}

	return output;
}

Java Code Examples for weka.core.Instances#sort()