Java Code Examples for org.apache.mahout.math.Vector#Element
The following examples show how to use org.apache.mahout.math.Vector#Element.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: BlockwiseCD.java From pyramid with Apache License 2.0 | 6 votes |
/**
 * Hessian contribution for class l and feature m.
 * m == -1 denotes the bias term (implicit feature value 1 for every data point);
 * otherwise only the non-zero entries of feature column m contribute.
 * NOTE(review): relies on this.classProbMatrix, numData and dataSet being
 * populated by the enclosing optimizer.
 */
private double calHessiansForFeature(int l, int m) {
    double sum = 0.0;
    if (m == -1) {
        // bias: p^2 - p summed over all data points
        for (int n = 0; n < numData; n++) {
            double p = this.classProbMatrix[n][l];
            sum += Math.pow(p, 2) - p;
        }
        return sum;
    }
    Vector featureColumn = dataSet.getColumn(m);
    for (Vector.Element entry : featureColumn.nonZeroes()) {
        int row = entry.index();
        double x = entry.get();
        double p = this.classProbMatrix[row][l];
        // (p*x)^2 - p*x^2, kept in the original expression form
        sum += Math.pow(p * x, 2) - p * Math.pow(x, 2);
    }
    return sum;
}
Example 2
Source File: DataSetUtil.java From pyramid with Apache License 2.0 | 6 votes |
/** * * @param dataSet * @param numClasses for new dataset * @return */ public static ClfDataSet changeLabels(ClfDataSet dataSet, int numClasses){ ClfDataSet dataSet1; int numDataPoints = dataSet.getNumDataPoints(); int numFeatures = dataSet.getNumFeatures(); boolean missingValue = dataSet.hasMissingValue(); if (dataSet.isDense()){ dataSet1 = new DenseClfDataSet(numDataPoints,numFeatures,missingValue,numClasses); } else { dataSet1 = new SparseClfDataSet(numDataPoints,numFeatures,missingValue,numClasses); } for (int i=0;i<numDataPoints;i++){ //only copy non-zero elements Vector vector = dataSet.getRow(i); for (Vector.Element element: vector.nonZeroes()){ int featureIndex = element.index(); double value = element.get(); if (featureIndex<numFeatures){ dataSet1.setFeatureValue(i,featureIndex,value); } } } return dataSet1; }
Example 3
Source File: KLLoss.java From pyramid with Apache License 2.0 | 6 votes |
/**
 * Empirical count for one parameter: for the bias (featureIndex == -1) the
 * sum of the class's target marginals over all data points; otherwise the
 * feature-weighted sum of target marginals over the column's non-zeroes.
 */
private double calEmpiricalCountForFeature(int parameterIndex) {
    int classIndex = parameterToClass[parameterIndex];
    int featureIndex = parameterToFeature[parameterIndex];
    double total = 0.0;
    if (featureIndex == -1){
        // bias term: every data point contributes its marginal
        for (int n = 0; n < dataSet.getNumDataPoints(); n++) {
            total += targetMarginals[n][classIndex];
        }
        return total;
    }
    // sparse-friendly column scan
    for (Vector.Element entry : dataSet.getColumn(featureIndex).nonZeroes()){
        total += entry.get() * targetMarginals[entry.index()][classIndex];
    }
    return total;
}
Example 4
Source File: Trec2Matlab.java From pyramid with Apache License 2.0 | 6 votes |
/**
 * Converts a TREC-format classification dataset into a sparse Matlab triplet
 * file: one "row\tcol\tvalue" line per non-zero entry, with 1-based indices.
 * Config keys: input.trecFile, output.matlabFile.
 */
public static void main(String[] args) throws Exception{
    Config config = new Config(args[0]);
    File trecFile = new File(config.getString("input.trecFile"));
    ClfDataSet dataSet = TRECFormat.loadClfDataSet(trecFile, DataSetType.CLF_SPARSE,false);
    File matlabFile = new File(config.getString("output.matlabFile"));
    matlabFile.getParentFile().mkdirs();
    try (BufferedWriter writer = new BufferedWriter(new FileWriter(matlabFile))){
        for (int row = 0; row < dataSet.getNumDataPoints(); row++){
            for (Vector.Element entry : dataSet.getRow(row).nonZeroes()){
                // Matlab uses 1-based indexing
                writer.write((row + 1) + "\t" + (entry.index() + 1) + "\t" + entry.get());
                writer.newLine();
            }
        }
    }
}
Example 5
Source File: DataSetUtil.java From pyramid with Apache License 2.0 | 6 votes |
/** * create a subset with the indices * it's fine to have duplicate indices * idTranslator is not saved in sampleData as we may have duplicate extIds * @param dataSet * @param indices * @return */ public static RegDataSet sampleData(RegDataSet dataSet, List<Integer> indices){ RegDataSet sample; sample = RegDataSetBuilder.getBuilder().numDataPoints(indices.size()) .numFeatures(dataSet.getNumFeatures()) .missingValue(dataSet.hasMissingValue()) .dense(dataSet.isDense()) .build(); double[] labels = dataSet.getLabels(); for (int i=0;i<indices.size();i++){ int indexInOld = indices.get(i); Vector oldVector = dataSet.getRow(indexInOld); double label = labels[indexInOld]; //copy label sample.setLabel(i,label); //copy row feature values, optimized for sparse vector for (Vector.Element element: oldVector.nonZeroes()){ sample.setFeatureValue(i,element.index(),element.get()); } } sample.setFeatureList(dataSet.getFeatureList()); //ignore idTranslator as we may have duplicate extIds return sample; }
Example 6
Source File: DataSetUtil.java From pyramid with Apache License 2.0 | 6 votes |
public static Pair<DataSet, double[][]> sampleData(DataSet dataSet, double[][] targetDistribution, List<Integer> indices){ DataSet sample; int numClasses = targetDistribution[0].length; double[][] sampledTargets = new double[indices.size()][numClasses]; sample = DataSetBuilder.getBuilder().dense(dataSet.isDense()).missingValue(dataSet.hasMissingValue()) .numDataPoints(indices.size()).numFeatures(dataSet.getNumFeatures()).build(); for (int i=0;i<indices.size();i++){ int indexInOld = indices.get(i); Vector oldVector = dataSet.getRow(indexInOld); double[] targets = targetDistribution[indexInOld]; //copy label sampledTargets[i] = Arrays.copyOf(targets,targets.length); //copy row feature values, optimized for sparse vector for (Vector.Element element: oldVector.nonZeroes()){ sample.setFeatureValue(i,element.index(),element.get()); } } sample.setFeatureList(dataSet.getFeatureList()); //ignore idTranslator as we may have duplicate extIds return new Pair<>(sample, sampledTargets); }
Example 7
Source File: BlockwiseCD.java From pyramid with Apache License 2.0 | 6 votes |
/**
 * Empirical count for one parameter of the multi-label model.
 * Bias (featureIndex == -1): number of data points whose label set contains
 * the class. Otherwise: sum of the feature value over matching data points,
 * scanning only the column's non-zero entries.
 */
private double calEmpiricalCountForFeature(int parameterIndex) {
    int classIndex = parameterToClass[parameterIndex];
    int featureIndex = parameterToFeature[parameterIndex];
    double total = 0.0;
    if (featureIndex == -1){
        for (int n = 0; n < dataSet.getNumDataPoints(); n++) {
            if (dataSet.getMultiLabels()[n].matchClass(classIndex)) {
                total += 1;
            }
        }
        return total;
    }
    MultiLabel[] multiLabels = dataSet.getMultiLabels();
    for (Vector.Element entry : dataSet.getColumn(featureIndex).nonZeroes()){
        if (multiLabels[entry.index()].matchClass(classIndex)){
            total += entry.get();
        }
    }
    return total;
}
Example 8
Source File: DataSetUtil.java From pyramid with Apache License 2.0 | 6 votes |
public static void normalize(DataSet dataSet, double[] normalizationConstants){ for (int j=0;j<dataSet.getNumFeatures();j++){ Vector column = dataSet.getColumn(j); List<Integer> indices = new ArrayList<>(); List<Double> values = new ArrayList<>(); for (Vector.Element nonzero: column.nonZeroes()){ indices.add(nonzero.index()); values.add(nonzero.get()); } for (int i=0;i<indices.size();i++){ int dataId = indices.get(i); double old = values.get(i); // if normalization constant is 0, use 0 as the normalized value dataSet.setFeatureValue(dataId,j, SafeDivide.divide(old,old/normalizationConstants[j],0.0)); } } }
Example 9
Source File: MLLogisticLoss.java From pyramid with Apache License 2.0 | 6 votes |
private double calEmpricalCount(int parameterIndex){ int classIndex = mlLogisticRegression.getWeights().getClassIndex(parameterIndex); MultiLabel[] labels = dataSet.getMultiLabels(); int featureIndex = mlLogisticRegression.getWeights().getFeatureIndex(parameterIndex); double count = 0; //bias if (featureIndex == -1){ for (int i=0;i<dataSet.getNumDataPoints();i++){ if (labels[i].matchClass(classIndex)){ count +=1; } } } else { Vector featureColumn = dataSet.getColumn(featureIndex); for (Vector.Element element: featureColumn.nonZeroes()){ int dataPointIndex = element.index(); double featureValue = element.get(); MultiLabel label = labels[dataPointIndex]; if (label.matchClass(classIndex)){ count += featureValue; } } } return count; }
Example 10
Source File: AugmentedLRLoss.java From pyramid with Apache License 2.0 | 5 votes |
/**
 * Empirical count for feature d's weight: the sum of feature d over data
 * points whose binary label is positive (== 1).
 */
private double calEmpiricalCountFeatureWeight(int d){
    double total = 0;
    for (Vector.Element entry : dataSet.getColumn(d).nonZeroes()){
        if (binaryLabels[entry.index()] == 1){
            total += entry.get();
        }
    }
    return total;
}
Example 11
Source File: Vectors.java From pyramid with Apache License 2.0 | 5 votes |
/**
 * Densifies a Mahout vector into a primitive array of the same size.
 * Positions without a non-zero entry remain 0.0.
 */
public static double[] toArray(Vector vector){
    double[] dense = new double[vector.size()];
    for (Vector.Element entry : vector.nonZeroes()){
        dense[entry.index()] = entry.get();
    }
    return dense;
}
Example 12
Source File: AbstractRobustCBMOptimizer.java From pyramid with Apache License 2.0 | 5 votes |
/**
 * Soft count of positives for (component, label): each data point carrying
 * the label contributes its component responsibility (gamma) scaled by its
 * noise label weight.
 */
private double effectivePositives(int componentIndex, int labelIndex){
    double total = 0;
    for (Vector.Element entry : labelMatrix.getColumn(labelIndex).nonZeroes()){
        int dataIndex = entry.index();
        total += gammas[dataIndex][componentIndex] * noiseLabelWeights[dataIndex][labelIndex];
    }
    return total;
}
Example 13
Source File: CRFLoss.java From pyramid with Apache License 2.0 | 5 votes |
private double calGradientForFeature(int parameterIndex) { double count = 0.0; int classIndex = parameterToClass[parameterIndex]; int featureIndex = parameterToFeature[parameterIndex]; if (featureIndex == -1) { for (int i=0; i<dataSet.getNumDataPoints(); i++) { count += this.classProbMatrix[i][classIndex]; } } else { Vector featureColumn = dataSet.getColumn(featureIndex); for (Vector.Element element: featureColumn.nonZeroes()) { int dataPointIndex = element.index(); double featureValue = element.get(); count += this.classProbMatrix[dataPointIndex][classIndex] * featureValue; } } count -= this.empiricalCounts[parameterIndex]; // regularize if (regularizeAll){ count += cmlcrf.getWeights().getWeightForIndex(parameterIndex)/gaussianPriorVariance; } else { if (featureIndex != -1) { count += cmlcrf.getWeights().getWeightForIndex(parameterIndex)/gaussianPriorVariance; } } return count; }
Example 14
Source File: AugmentedLRLoss.java From pyramid with Apache License 2.0 | 5 votes |
/**
 * Model-predicted count for feature d's weight: the sum over non-zero
 * entries of feature value times the model's expected probability for
 * that data point.
 */
private double calPredictedCountFeatureWeight(int d){
    double total = 0;
    for (Vector.Element entry : dataSet.getColumn(d).nonZeroes()){
        total += entry.get() * expectedProbs[entry.index()];
    }
    return total;
}
Example 15
Source File: DataSetUtil.java From pyramid with Apache License 2.0 | 5 votes |
/** * merge to binary dataset * k=positive (1), others = negative(0) * @param dataSet * @param k * @return */ public static ClfDataSet toBinary(MultiLabelClfDataSet dataSet, int k){ int numDataPoints = dataSet.getNumDataPoints(); int numFeatures = dataSet.getNumFeatures(); boolean missingValue = dataSet.hasMissingValue(); ClfDataSet clfDataSet; if (dataSet.isDense()){ clfDataSet = new DenseClfDataSet(numDataPoints,numFeatures,missingValue, 2); } else { clfDataSet = new SparseClfDataSet(numDataPoints,numFeatures,missingValue, 2); } for (int i=0;i<numDataPoints;i++){ //only copy non-zero elements Vector vector = dataSet.getRow(i); for (Vector.Element element: vector.nonZeroes()){ int featureIndex = element.index(); double value = element.get(); clfDataSet.setFeatureValue(i,featureIndex,value); } if (dataSet.getMultiLabels()[i].matchClass(k)){ clfDataSet.setLabel(i,1); } else { clfDataSet.setLabel(i,0); } } List<String> extLabels = new ArrayList<>(); String extLabel = dataSet.getLabelTranslator().toExtLabel(k); extLabels.add("NOT "+extLabel); extLabels.add(extLabel); LabelTranslator labelTranslator = new LabelTranslator(extLabels); clfDataSet.setLabelTranslator(labelTranslator); clfDataSet.setFeatureList(dataSet.getFeatureList()); return clfDataSet; }
Example 16
Source File: DataSetUtil.java From pyramid with Apache License 2.0 | 5 votes |
/** * create a subset with the indices * it's fine to have duplicate indices * @param dataSet * @param indices * @return */ public static MultiLabelClfDataSet sampleData(MultiLabelClfDataSet dataSet, List<Integer> indices){ MultiLabelClfDataSet sample; sample = MLClfDataSetBuilder.getBuilder() .numClasses(dataSet.getNumClasses()) .numDataPoints(indices.size()) .numFeatures(dataSet.getNumFeatures()) .missingValue(dataSet.hasMissingValue()) .density(dataSet.density()) .build(); MultiLabel[] labels = dataSet.getMultiLabels(); IdTranslator idTranslator = new IdTranslator(); for (int i=0;i<indices.size();i++){ int indexInOld = indices.get(i); String extId = dataSet.getIdTranslator().toExtId(indexInOld); idTranslator.addData(i, extId); Vector oldVector = dataSet.getRow(indexInOld); Set<Integer> label = labels[indexInOld].getMatchedLabels(); //copy label sample.addLabels(i,label); //copy row feature values, optimized for sparse vector for (Vector.Element element: oldVector.nonZeroes()){ sample.setFeatureValue(i,element.index(),element.get()); } } sample.setFeatureList(dataSet.getFeatureList()); sample.setIdTranslator(idTranslator); sample.setLabelTranslator(dataSet.getLabelTranslator()); return sample; }
Example 17
Source File: AbstractCBMOptimizer.java From pyramid with Apache License 2.0 | 5 votes |
/**
 * Soft count of positives for (component, label): the sum of component
 * responsibilities (gammas) over data points that carry the label.
 */
private double effectivePositives(int componentIndex, int labelIndex){
    double total = 0;
    for (Vector.Element entry : labelMatrix.getColumn(labelIndex).nonZeroes()){
        total += gammas[entry.index()][componentIndex];
    }
    return total;
}
Example 18
Source File: Vectors.java From pyramid with Apache License 2.0 | 5 votes |
/**
 * Dot product specialized for a dense/sparse pair: iterates only the sparse
 * vector's non-zero entries and probes the dense vector with getQuick
 * (no bounds/structure checks).
 */
private static double dotDenseSparse(Vector denseVector, Vector sparseVector){
    double dot = 0;
    for (Vector.Element entry : sparseVector.nonZeroes()){
        dot += entry.get() * denseVector.getQuick(entry.index());
    }
    return dot;
}
Example 19
Source File: LogisticRegressionInspector.java From pyramid with Apache License 2.0 | 4 votes |
/**
 * Builds a human-readable report on how ngram features are used by a trained
 * logistic regression: per ngram length n, the candidate count, the selected
 * count (features with a non-zero weight in any class), how many of each can
 * be composed from the selected unigram "seeds", and the resulting ratios.
 * NOTE(review): the ratio sections divide by counts that may be zero,
 * producing NaN/Infinity in the report — confirm this output is acceptable.
 */
public static String checkNgramUsage(LogisticRegression logisticRegression){
    StringBuilder sb = new StringBuilder();
    FeatureList featureList = logisticRegression.getFeatureList();
    // indices of features with a non-zero weight in at least one class
    Set<Integer> usedFeatures = new HashSet<>();
    for (int k=0;k<logisticRegression.getNumClasses();k++){
        Vector vector = logisticRegression.getWeights().getWeightsWithoutBiasForClass(k);
        for (Vector.Element element: vector.nonZeroes()){
            usedFeatures.add(element.index());
        }
    }
    // ngram features actually selected by the model
    List<Ngram> selected = usedFeatures.stream().map(featureList::get).filter(feature -> feature instanceof Ngram)
            .map(feature -> (Ngram)feature).collect(Collectors.toList());
    // all ngram features available as candidates
    List<Ngram> candidates = featureList.getAll().stream()
            .filter(feature -> feature instanceof Ngram)
            .map(feature -> (Ngram)feature).collect(Collectors.toList());
    // getAsInt() throws NoSuchElementException if there are no ngram candidates
    int maxLength = candidates.stream().mapToInt(Ngram::getN).max().getAsInt();
    // histogram of candidates by ngram length; index n-1 holds the n-gram count
    int[] numberCandidates = new int[maxLength];
    candidates.stream().forEach(ngram -> numberCandidates[ngram.getN() - 1] += 1);
    sb.append("number of ngram candidates: ");
    for (int n=1;n<=maxLength;n++){
        sb.append(n+"-grams = "+numberCandidates[n-1]);
        sb.append("; ");
    }
    sb.append("\n");
    // histogram of selected ngrams by length
    int[] numberSelected = new int[maxLength];
    selected.stream().forEach(ngram -> numberSelected[ngram.getN() - 1] += 1);
    sb.append("number of selected ngram: ");
    for (int n=1;n<=maxLength;n++){
        // the leading unary + on n is a no-op
        sb.append(+n+"-grams = "+numberSelected[n-1]);
        sb.append("; ");
    }
    sb.append("\n");
    int[] easyCandidates = new int[maxLength];
    int[] easySelected = new int[maxLength];
    // selected unigrams act as "seeds" longer ngrams can be composed from
    Set<String> unigrams = selected.stream().filter(ngram -> ngram.getN() == 1)
            .map(Ngram::getNgram).collect(Collectors.toSet());
    candidates.stream().filter(ngram -> isComposedOf(ngram.getNgram(), unigrams))
            .forEach(ngram -> easyCandidates[ngram.getN() - 1] += 1);
    sb.append("number of ngram candidates that can be constructed from seeds: ");
    for (int n=1;n<=maxLength;n++){
        sb.append(n+"-grams = "+easyCandidates[n-1]);
        sb.append("; ");
    }
    sb.append("\n");
    selected.stream().filter(ngram -> isComposedOf(ngram.getNgram(), unigrams))
            .forEach(ngram -> easySelected[ngram.getN() - 1] += 1);
    sb.append("number of selected ngrams that can be constructed from seeds: ");
    for (int n=1;n<=maxLength;n++){
        sb.append(n+"-grams = "+easySelected[n-1]);
        sb.append("; ");
    }
    sb.append("\n");
    // ratio sections: division by a zero count yields NaN/Infinity in the text
    sb.append("percentage of selected ngrams that can be constructed from seeds: ");
    for (int n=1;n<=maxLength;n++){
        sb.append(n+"-grams = "+(double)easySelected[n-1]/numberSelected[n-1]);
        sb.append("; ");
    }
    sb.append("\n");
    sb.append("feature selection ratio: ");
    for (int n=1;n<=maxLength;n++){
        sb.append(n+"-grams = "+(double)numberSelected[n-1]/numberCandidates[n-1]);
        sb.append("; ");
    }
    sb.append("\n");
    sb.append("feature selection ratio based on seeds: ");
    for (int n=1;n<=maxLength;n++){
        sb.append(n+"-grams = "+(double)easySelected[n-1]/easyCandidates[n-1]);
        sb.append("; ");
    }
    return sb.toString();
}
Example 20
Source File: DataSetUtil.java From pyramid with Apache License 2.0 | 4 votes |
/** * only keep the selected featureList * @param dataSet * @return */ public static MultiLabelClfDataSet sampleFeatures(MultiLabelClfDataSet dataSet, List<Integer> columnsToKeep){ MultiLabelClfDataSet trimmed ; boolean missingValue = dataSet.hasMissingValue(); int numClasses = dataSet.getNumClasses(); // keep density if (dataSet.isDense()) { trimmed = new DenseMLClfDataSet(dataSet.getNumDataPoints(), columnsToKeep.size(), missingValue, numClasses); } else{ trimmed = new SparseMLClfDataSet(dataSet.getNumDataPoints(),columnsToKeep.size(), missingValue, numClasses); } for (int j=0;j<trimmed.getNumFeatures();j++){ int oldColumnIndex = columnsToKeep.get(j); Vector vector = dataSet.getColumn(oldColumnIndex); for (Vector.Element element: vector.nonZeroes()){ int dataPointIndex = element.index(); double value = element.get(); trimmed.setFeatureValue(dataPointIndex,j,value); } } //copy labels MultiLabel[] multiLabels = dataSet.getMultiLabels(); for (int i=0;i<trimmed.getNumDataPoints();i++){ trimmed.addLabels(i,multiLabels[i].getMatchedLabels()); } //just copy settings trimmed.setLabelTranslator(dataSet.getLabelTranslator()); trimmed.setIdTranslator(dataSet.getIdTranslator()); List<Feature> oldFeatures = dataSet.getFeatureList().getAll(); List<Feature> newFeatures = columnsToKeep.stream().map(oldFeatures::get).collect(Collectors.toList()); for (int i=0;i<newFeatures.size();i++){ newFeatures.get(i).setIndex(i); } trimmed.setFeatureList(new FeatureList(newFeatures)); return trimmed; }