Java Code Examples for weka.core.Utils#log2()
The following examples show how to use weka.core.Utils#log2().
Each example lists its source file, originating project, and license.
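For reference, Utils.log2(a) returns the base-2 logarithm of a; in the Weka source it is implemented as Math.log(a) / Math.log(2). A minimal usage sketch, assuming only weka.jar on the classpath (the class name Log2Demo is made up for illustration):

import weka.core.Utils;

public class Log2Demo {
  public static void main(String[] args) {
    System.out.println(Utils.log2(8.0)); // 3.0
    System.out.println(Utils.log2(1.0)); // 0.0
    // Entropy of a fair coin in bits: -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0
    System.out.println(-2 * 0.5 * Utils.log2(0.5));
  }
}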
Example 1
Source File: Evaluation.java From tsml with GNU General Public License v3.0
/**
 * Calculate the entropy of the prior distribution.
 *
 * @return the entropy of the prior distribution
 * @throws Exception if the class is not nominal
 */
public final double priorEntropy() throws Exception {

  if (!m_ClassIsNominal) {
    throw new Exception("Can't compute entropy of class prior: "
      + "class numeric!");
  }

  if (m_NoPriors)
    return Double.NaN;

  double entropy = 0;
  for (int i = 0; i < m_NumClasses; i++) {
    entropy -= m_ClassPriors[i] / m_ClassPriorsSum
      * Utils.log2(m_ClassPriors[i] / m_ClassPriorsSum);
  }
  return entropy;
}
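As a sanity check on the loop above: a uniform prior over four classes should give log2(4) = 2 bits. A standalone sketch of the same computation, with a made-up counts array (drop it into any main method):

double[] classPriors = {1, 1, 1, 1}; // hypothetical uniform class counts
double priorsSum = 4;
double entropy = 0;
for (double prior : classPriors) {
  entropy -= prior / priorsSum * weka.core.Utils.log2(prior / priorsSum);
}
System.out.println(entropy); // 2.0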
Example 2
Source File: Id3.java From tsml with GNU General Public License v3.0
/**
 * Computes the entropy of a dataset.
 *
 * @param data the data for which entropy is to be computed
 * @return the entropy of the data's class distribution
 * @throws Exception if computation fails
 */
private double computeEntropy(Instances data) throws Exception {

  double[] classCounts = new double[data.numClasses()];
  Enumeration instEnum = data.enumerateInstances();
  while (instEnum.hasMoreElements()) {
    Instance inst = (Instance) instEnum.nextElement();
    classCounts[(int) inst.classValue()]++;
  }

  double entropy = 0;
  for (int j = 0; j < data.numClasses(); j++) {
    if (classCounts[j] > 0) {
      entropy -= classCounts[j] * Utils.log2(classCounts[j]);
    }
  }
  entropy /= (double) data.numInstances();
  return entropy + Utils.log2(data.numInstances());
}
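Note the algebraic shortcut here: instead of normalizing each count first, the loop accumulates -sum(count_j * log2(count_j)), divides by the number of instances N, and adds log2(N). This equals the usual entropy -sum((count_j/N) * log2(count_j/N)), because log2(count_j/N) = log2(count_j) - log2(N) and the counts sum to N.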
Example 3
Source File: CollectiveTree.java From collective-classification-weka-package with GNU General Public License v3.0
/**
 * Performs the initialization and building; possible iterations happen here.
 *
 * @throws Exception if something goes wrong
 */
@Override
protected void build() throws Exception {
  // determine the number of features to be selected
  m_KValue = getNumFeatures();
  if (m_KValue < 1)
    m_KValue = (int) Utils.log2(m_Trainset.numAttributes()) + 1;

  // make sure the K value is in range
  if (m_KValue > m_Trainset.numAttributes() - 1)
    m_KValue = m_Trainset.numAttributes() - 1;

  // build the classifier
  m_Random = m_Trainset.getRandomNumberGenerator(getSeed());
  buildClassifier();
}
Example 4
Source File: Evaluation.java From tsml with GNU General Public License v3.0
/**
 * Updates stats for conditional density estimator based on current test
 * instance.
 *
 * @param classifier the conditional density estimator
 * @param classMissing the instance for which density is to be computed,
 *          without a class value
 * @param classValue the class value of this instance
 * @throws Exception if density could not be computed successfully
 */
protected void updateStatsForConditionalDensityEstimator(
  ConditionalDensityEstimator classifier, Instance classMissing,
  double classValue) throws Exception {

  if (m_PriorEstimator == null) {
    setNumericPriorsFromBuffer();
  }
  m_SumSchemeEntropy -= classifier.logDensity(classMissing, classValue)
    * classMissing.weight() / Utils.log2;
  m_SumPriorEntropy -= m_PriorEstimator.logDensity(classValue)
    * classMissing.weight() / Utils.log2;
}
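Note that this example uses Utils.log2 without parentheses: that is the public constant in weka.core.Utils holding Math.log(2), not the log2(double) method. Dividing by it converts the estimator's natural-log densities into base-2 logarithms, i.e. into bits.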
Example 5
Source File: InfoGainSplitMetric.java From tsml with GNU General Public License v3.0
@Override
public double getMetricRange(Map<String, WeightMass> preDist) {
  int numClasses = preDist.size();
  if (numClasses < 2) {
    numClasses = 2;
  }
  return Utils.log2(numClasses);
}
Example 6
Source File: BFTree.java From tsml with GNU General Public License v3.0
/**
 * Compute and return the entropy for a given class distribution of a node.
 *
 * @param dist the class distribution
 * @param total the total weight of the class distribution
 * @return the entropy of the class distribution
 */
protected double computeEntropy(double[] dist, double total) {
  if (total == 0)
    return 0;
  double entropy = 0;
  for (int i = 0; i < dist.length; i++) {
    if (dist[i] != 0)
      entropy -= dist[i] / total * Utils.log2(dist[i] / total);
  }
  return entropy;
}
Example 7
Source File: RuleStats.java From tsml with GNU General Public License v3.0
/**
 * The description length of the theory for a given rule. Computed as:
 *   0.5 * [||k|| + S(t, k, k/t)]
 * where k is the number of antecedents of the rule, t is the total number
 * of possible antecedents that could appear in a rule, ||k|| is the
 * universal prior for k, log2*(k), and
 * S(t, k, p) = -k*log2(p) - (t-k)*log2(1-p) is the subset encoding length.
 *
 * For details, see Quinlan: "MDL and categorical theories (Continued)", ML95.
 *
 * @param index the index of the given rule (assuming correct)
 * @return the theory DL, weighted if weight != 1.0
 */
public double theoryDL(int index) {

  double k = ((Rule) m_Ruleset.elementAt(index)).size();

  if (k == 0)
    return 0.0;

  double tdl = Utils.log2(k);
  if (k > 1)                       // approximation
    tdl += 2.0 * Utils.log2(tdl);  // of log2 star
  tdl += subsetDL(m_Total, k, k / m_Total);

  return MDL_THEORY_WEIGHT * REDUNDANCY_FACTOR * tdl;
}
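The two lines tagged "approximation of log2 star" compute the universal prior ||k|| as roughly log2(k) + 2 * log2(log2(k)) for k > 1. This appears to stand in for Rissanen's log2*(k), which in its exact form sums the positive terms of log2(k) + log2(log2(k)) + ...; the code doubles the second term as a cheap substitute for the remaining tail.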
Example 8
Source File: RuleStats.java From tsml with GNU General Public License v3.0
/**
 * The description length of the data given the parameters of the data
 * based on the ruleset.
 *
 * For details, see Quinlan: "MDL and categorical theories (Continued)", ML95.
 *
 * @param expFPOverErr the expected FP/(FP+FN)
 * @param cover the coverage
 * @param uncover the uncoverage
 * @param fp the number of false positives
 * @param fn the number of false negatives
 * @return the description length
 */
public static double dataDL(double expFPOverErr, double cover,
                            double uncover, double fp, double fn) {
  double totalBits = Utils.log2(cover + uncover + 1.0); // how much data?
  double coverBits, uncoverBits; // what's the error?
  double expErr;                 // expected FP or FN

  if (Utils.gr(cover, uncover)) {
    expErr = expFPOverErr * (fp + fn);
    coverBits = subsetDL(cover, fp, expErr / cover);
    uncoverBits = Utils.gr(uncover, 0.0)
      ? subsetDL(uncover, fn, fn / uncover) : 0.0;
  } else {
    expErr = (1.0 - expFPOverErr) * (fp + fn);
    coverBits = Utils.gr(cover, 0.0)
      ? subsetDL(cover, fp, fp / cover) : 0.0;
    uncoverBits = subsetDL(uncover, fn, expErr / uncover);
  }

  return totalBits + coverBits + uncoverBits;
}
Example 9
Source File: TunedRandomForest.java From tsml with GNU General Public License v3.0
@Override
public void setParametersFromIndex(int x) {
  tuneParameters = false;
  // Three parameters, evenly distributed, 1 to maxPerPara.
  // Note that if maxPerPara > numFeaturesInProblem we have a problem,
  // so an exception will be thrown later.
  paras = new int[3];
  if (x < 1 || x > maxPerPara * maxPerPara * maxPerPara) // invalid range
    throw new UnsupportedOperationException(
      "ERROR parameter index " + x + " out of range for TunedRandomForest");
  int numLevelsIndex = (x - 1) / (maxPerPara * maxPerPara);
  int numFeaturesIndex = ((x - 1) / maxPerPara) % maxPerPara;
  int numTreesIndex = x % maxPerPara;
  // Need to know the number of attributes
  if (numFeaturesInProblem == 0)
    throw new RuntimeException("Error in TunedRandomForest in "
      + "setParametersFromIndex: we do not know the number of attributes; "
      + "setNumFeaturesInProblem must be called before this");
  // Parameter 1: maximum tree depth, m_MaxDepth
  if (numLevelsIndex == 0)
    paras[0] = 0;
  else
    paras[0] = numLevelsIndex * (numFeaturesInProblem / maxPerPara);
  // Parameter 2: number of features
  if (numFeaturesIndex == 0)
    paras[1] = (int) Math.sqrt(numFeaturesInProblem);
  else if (numFeaturesIndex == 1)
    paras[1] = (int) Utils.log2(numFeaturesInProblem) + 1;
  else
    paras[1] = ((numFeaturesIndex - 1) * numFeaturesInProblem) / maxPerPara;
  // Parameter 3: number of trees
  if (numTreesIndex == 0)
    paras[2] = 10; // Weka default
  else
    paras[2] = 100 * numTreesIndex;
  setMaxDepth(paras[0]);
  setNumFeaturesForEachTree(paras[1]);
  setNumTrees(paras[2]);
  if (m_Debug)
    System.out.println("Index = " + x
      + " Num Features In Problem = " + numFeaturesInProblem
      + " Max Depth = " + paras[0]
      + " Num Features = " + paras[1]
      + " Num Trees = " + paras[2]);
}
Example 10
Source File: TSF.java From tsml with GNU General Public License v3.0
/**
 * Parses a given list of options to set the parameters of the classifier.
 * We use this for the tuning mechanism, setting parameters through setOptions.
 * <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -T
 *  Number of trees.</pre>
 *
 * <pre> -I
 *  Number of intervals to fit.</pre>
 * <!-- options-end -->
 *
 * @param options the list of options as an array of strings
 * @throws Exception if an option is not supported
 */
@Override
public void setOptions(String[] options) throws Exception {
  String numTreesString = Utils.getOption('T', options);
  if (numTreesString.length() != 0) {
    numClassifiers = Integer.parseInt(numTreesString);
  }

  // Options here are a double between 0 and 1 (proportion of features),
  // the text string "sqrt" or "log", or an integer number
  String numFeaturesString = Utils.getOption('I', options);
  if (numFeaturesString.length() != 0) {
    try {
      if (numFeaturesString.equals("sqrt"))
        numIntervalsFinder = (numAtts) -> (int) Math.sqrt(numAtts);
      else if (numFeaturesString.equals("log"))
        numIntervalsFinder = (numAtts) -> (int) Utils.log2(numAtts) + 1;
      else {
        double d = Double.parseDouble(numFeaturesString);
        if (d <= 0)
          throw new Exception("proportion of features out of range 0 to 1");
        if (d <= 1)
          numIntervalsFinder = (numAtts) -> (int) (d * numAtts);
        else
          numIntervalsFinder = (numAtts) -> (int) d;
      }
    } catch (Exception e) {
      System.err.print("Error: invalid parameter passed to TSF setOptions "
        + "for the number of intervals. Setting to default. ");
      System.err.print("Value " + numFeaturesString
        + ". Permissible values: sqrt, log, or a double in the range 0...1");
      numIntervalsFinder = (numAtts) -> (int) Math.sqrt(numAtts);
    }
  } else {
    numIntervalsFinder = (numAtts) -> (int) Math.sqrt(numAtts);
  }
}
Example 11
Source File: C45Split.java From tsml with GNU General Public License v3.0
/**
 * Returns the coding cost for the split (used in the rule learner).
 */
public final double codingCost() {
  return Utils.log2(m_index);
}
Example 12
Source File: Discretize.java From tsml with GNU General Public License v3.0
/**
 * Test using Kononenko's MDL criterion.
 *
 * @param priorCounts the class counts prior to splitting
 * @param bestCounts the class counts of the best split
 * @param numInstances the number of instances
 * @param numCutPoints the number of candidate cut points
 * @return true if the split is acceptable
 */
private boolean KononenkosMDL(double[] priorCounts, double[][] bestCounts,
                              double numInstances, int numCutPoints) {

  double distPrior, instPrior, distAfter = 0, sum, instAfter = 0;
  double before, after;
  int numClassesTotal;

  // Number of classes occurring in the set
  numClassesTotal = 0;
  for (int i = 0; i < priorCounts.length; i++) {
    if (priorCounts[i] > 0) {
      numClassesTotal++;
    }
  }

  // Encode the distribution prior to the split
  distPrior = SpecialFunctions.log2Binomial(numInstances + numClassesTotal - 1,
    numClassesTotal - 1);

  // Encode the instances prior to the split
  instPrior = SpecialFunctions.log2Multinomial(numInstances, priorCounts);

  before = instPrior + distPrior;

  // Encode the distributions and instances after the split
  for (int i = 0; i < bestCounts.length; i++) {
    sum = Utils.sum(bestCounts[i]);
    distAfter += SpecialFunctions.log2Binomial(sum + numClassesTotal - 1,
      numClassesTotal - 1);
    instAfter += SpecialFunctions.log2Multinomial(sum, bestCounts[i]);
  }

  // Coding cost after the split
  after = Utils.log2(numCutPoints) + distAfter + instAfter;

  // Check if the split is to be accepted
  return (before > after);
}
Example 13
Source File: Discretize.java From tsml with GNU General Public License v3.0
/**
 * Test using Fayyad and Irani's MDL criterion.
 *
 * @param priorCounts the class counts prior to splitting
 * @param bestCounts the class counts of the best split
 * @param numInstances the number of instances
 * @param numCutPoints the number of candidate cut points
 * @return true if the split is acceptable
 */
private boolean FayyadAndIranisMDL(double[] priorCounts,
                                   double[][] bestCounts,
                                   double numInstances, int numCutPoints) {

  double priorEntropy, entropy, gain;
  double entropyLeft, entropyRight, delta;
  int numClassesTotal, numClassesRight, numClassesLeft;

  // Compute the entropy before the split
  priorEntropy = ContingencyTables.entropy(priorCounts);

  // Compute the entropy after the split
  entropy = ContingencyTables.entropyConditionedOnRows(bestCounts);

  // Compute the information gain
  gain = priorEntropy - entropy;

  // Number of classes occurring in the set
  numClassesTotal = 0;
  for (int i = 0; i < priorCounts.length; i++) {
    if (priorCounts[i] > 0) {
      numClassesTotal++;
    }
  }

  // Number of classes occurring in the left subset
  numClassesLeft = 0;
  for (int i = 0; i < bestCounts[0].length; i++) {
    if (bestCounts[0][i] > 0) {
      numClassesLeft++;
    }
  }

  // Number of classes occurring in the right subset
  numClassesRight = 0;
  for (int i = 0; i < bestCounts[1].length; i++) {
    if (bestCounts[1][i] > 0) {
      numClassesRight++;
    }
  }

  // Entropy of the left and right subsets
  entropyLeft = ContingencyTables.entropy(bestCounts[0]);
  entropyRight = ContingencyTables.entropy(bestCounts[1]);

  // Compute the terms for the MDL formula
  delta = Utils.log2(Math.pow(3, numClassesTotal) - 2)
    - (((double) numClassesTotal * priorEntropy)
       - (numClassesRight * entropyRight)
       - (numClassesLeft * entropyLeft));

  // Check if the split is to be accepted
  return (gain > (Utils.log2(numCutPoints) + delta) / (double) numInstances);
}
Example 14
Source File: CollectiveForest.java From collective-classification-weka-package with GNU General Public License v3.0
/**
 * Performs the initialization and building; possible iterations happen here.
 *
 * @throws Exception if something goes wrong
 */
@Override
protected void build() throws Exception {
  AttributeStats stats;
  int i;

  // determine the number of features to be selected
  m_KValue = getNumFeatures();
  if (m_KValue < 1)
    m_KValue = (int) Utils.log2(m_Trainset.numAttributes()) + 1;

  // determine the class distribution
  m_ClassDistribution = new double[2];
  stats = m_Trainset.attributeStats(m_Trainset.classIndex());
  for (i = 0; i < 2; i++) {
    if (stats.totalCount > 0)
      m_ClassDistribution[i] = stats.nominalCounts[i] / stats.totalCount;
    else
      m_ClassDistribution[i] = 0;
  }

  // the number of instances added to the training set in each iteration
  m_InstancesPerIteration = (double) m_Testset.numInstances() / getFolds();
  if (getDebug())
    System.out.println("InstancesPerIteration: " + m_InstancesPerIteration);

  // build the list of sorted test instances
  m_List = new RankedList(m_Testset, m_ClassDistribution);

  // build the classifier
  m_Random = new Random(getSeed());
  for (i = 0; i <= getFolds(); i++) {
    if (getVerbose()) {
      if (getCutOff() > 0)
        System.out.println("\nFold " + i + "/" + getFolds()
          + " (CutOff at " + getCutOff() + ")");
      else
        System.out.println("\nFold " + i + "/" + getFolds());
    }

    buildTrainSet(i);
    buildClassifier();

    // cut-off of folds reached?
    if ((i > 0) && (i == getCutOff()))
      break;
  }
}
Example 15
Source File: RuleStats.java From tsml with GNU General Public License v3.0
/**
 * Subset description length:
 *   S(t,k,p) = -k*log2(p) - (t-k)*log2(1-p)
 *
 * For details, see Quinlan: "MDL and categorical theories (Continued)", ML95.
 *
 * @param t the number of elements in a known set
 * @param k the number of elements in a subset
 * @param p the expected proportion of the subset known by the recipient
 * @return the subset description length
 */
public static double subsetDL(double t, double k, double p) {
  double rt = Utils.gr(p, 0.0) ? (-k * Utils.log2(p)) : 0.0;
  rt -= (t - k) * Utils.log2(1 - p);
  return rt;
}
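A quick worked example, with made-up values: for t = 4 known elements, k = 2 in the subset, and p = 0.5, S(4, 2, 0.5) = -2*log2(0.5) - (4-2)*log2(0.5) = 2 + 2 = 4 bits. Since the method is public static, this is checkable directly (assuming RuleStats is on the classpath):

System.out.println(RuleStats.subsetDL(4, 2, 0.5)); // 4.0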
Example 16
Source File: PMILexiconExpander.java From AffectiveTweets with GNU General Public License v3.0
@Override
protected Instances process(Instances instances) throws Exception {
  Instances result = getOutputFormat();

  this.calculateWordCounts(instances);

  String[] sortedWords = this.wordInfo.keySet().toArray(new String[0]);
  Arrays.sort(sortedWords);

  for (String word : sortedWords) {
    WordCount wordCount = this.wordInfo.get(word);
    if (wordCount.posCount + wordCount.negCount >= this.minFreq) {
      double posProb = wordCount.posCount / posCount;
      double negProb = wordCount.negCount / negCount;
      double semanticOrientation = Utils.log2(posProb) - Utils.log2(negProb);

      double[] values = new double[result.numAttributes()];

      int wordNameIndex = result.attribute("WORD_NAME").index();
      values[wordNameIndex] = result.attribute(wordNameIndex)
        .addStringValue(word);
      values[result.numAttributes() - 1] = semanticOrientation;

      Instance inst = new DenseInstance(1, values);
      inst.setDataset(result);
      result.add(inst);
    }
  }
  return result;
}
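The semanticOrientation value computed above is a pointwise-mutual-information-style score: log2(posProb) - log2(negProb) = log2(posProb / negProb), so a word that is proportionally more frequent in positive tweets scores positive, and vice versa. Note that a word with a zero count in either class yields an infinite score; the minFreq threshold on the combined count only partially guards against this.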