Java Code Examples for org.apache.mahout.math.Vector#Element

The following examples show how to use org.apache.mahout.math.Vector#Element. All of them come from the open source pyramid project (Apache License 2.0); the source file for each example is listed above its code.
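Before the project-specific examples, here is a minimal, self-contained sketch of the pattern that all of the examples below rely on: iterate a Mahout vector's non-zero entries with nonZeroes() and read each Vector.Element's position with index() and its value with get(). This snippet is not taken from pyramid; the class name VectorElementSketch and the sample values are illustrative only.

import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;

public class VectorElementSketch {
    public static void main(String[] args) {
        // a sparse vector of length 10 with three stored entries
        Vector vector = new RandomAccessSparseVector(10);
        vector.set(1, 2.5);
        vector.set(4, -1.0);
        vector.set(7, 0.75);

        // nonZeroes() visits the stored entries; each Vector.Element
        // exposes its position via index() and its value via get()
        double sum = 0;
        for (Vector.Element element : vector.nonZeroes()) {
            System.out.println("index=" + element.index() + ", value=" + element.get());
            sum += element.get();
        }
        System.out.println("sum of non-zero values = " + sum);
    }
}

On a sparse vector this loop touches only the stored entries, which is why the pyramid examples below use it when scanning dataset rows and columns.
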
Example 1
Source File: BlockwiseCD.java    From pyramid with Apache License 2.0
private double calHessiansForFeature(int l, int m) {
    double count = 0.0;
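    // m == -1 corresponds to the bias term, which has no feature column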
    if (m == -1) {
        for (int i=0; i<numData; i++) {
            count += (Math.pow(this.classProbMatrix[i][l],2) - this.classProbMatrix[i][l]);
        }
    } else {
        Vector featureColumn = dataSet.getColumn(m);
        for (Vector.Element element : featureColumn.nonZeroes()) {
            int dataPointIndex = element.index();
            double featureValue = element.get();
            count += (Math.pow(this.classProbMatrix[dataPointIndex][l]*featureValue, 2) -
                    this.classProbMatrix[dataPointIndex][l] * Math.pow(featureValue,2));
        }
    }
    return count;
}
 
Example 2
Source File: DataSetUtil.java    From pyramid with Apache License 2.0
/**
 * copy the feature values of a dataset into a new ClfDataSet with a different number of classes
 * only non-zero feature values are copied; labels are not
 * @param dataSet the original dataset
 * @param numClasses number of classes for the new dataset
 * @return a new dataset with the given number of classes
 */
public static ClfDataSet changeLabels(ClfDataSet dataSet, int numClasses){
    ClfDataSet dataSet1;
    int numDataPoints = dataSet.getNumDataPoints();
    int numFeatures = dataSet.getNumFeatures();
    boolean missingValue = dataSet.hasMissingValue();
    if (dataSet.isDense()){
        dataSet1 = new DenseClfDataSet(numDataPoints,numFeatures,missingValue,numClasses);
    } else {
        dataSet1 = new SparseClfDataSet(numDataPoints,numFeatures,missingValue,numClasses);
    }
    for (int i=0;i<numDataPoints;i++){
        //only copy non-zero elements
        Vector vector = dataSet.getRow(i);
        for (Vector.Element element: vector.nonZeroes()){
            int featureIndex = element.index();
            double value = element.get();
            if (featureIndex<numFeatures){
                dataSet1.setFeatureValue(i,featureIndex,value);
            }
        }
    }

    return dataSet1;
}
 
Example 3
Source File: KLLoss.java    From pyramid with Apache License 2.0
private double calEmpiricalCountForFeature(int parameterIndex) {
    double empiricalCount = 0.0;
    int classIndex = parameterToClass[parameterIndex];
    int featureIndex = parameterToFeature[parameterIndex];
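    // featureIndex == -1 corresponds to the bias term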
    if (featureIndex==-1){
        for (int i=0; i<dataSet.getNumDataPoints(); i++) {
            empiricalCount += targetMarginals[i][classIndex];
        }
    } else{
        Vector column = dataSet.getColumn(featureIndex);
        for (Vector.Element element: column.nonZeroes()){
            int dataIndex = element.index();
            double featureValue = element.get();
            empiricalCount += featureValue*targetMarginals[dataIndex][classIndex];
        }
    }
    return empiricalCount;
}
 
Example 4
Source File: Trec2Matlab.java    From pyramid with Apache License 2.0
public static void main(String[] args) throws Exception{
    Config config = new Config(args[0]);
    File trecFile = new File(config.getString("input.trecFile"));
    ClfDataSet dataSet = TRECFormat.loadClfDataSet(trecFile, DataSetType.CLF_SPARSE,false);
    File matlabFile = new File(config.getString("output.matlabFile"));
    matlabFile.getParentFile().mkdirs();
    try(BufferedWriter bw = new BufferedWriter(new FileWriter(matlabFile))
    ){
        for (int i=0;i<dataSet.getNumDataPoints();i++){
            Vector vector = dataSet.getRow(i);
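            // write each non-zero entry as a tab-separated (row, column, value) triplet, shifting to 1-based indices for Matlab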
            for (Vector.Element element: vector.nonZeroes()){
                int j= element.index();
                double value = element.get();
                bw.write(""+(i+1));
                bw.write("\t");
                bw.write(""+(j+1));
                bw.write("\t");
                bw.write(""+value);
                bw.newLine();
            }
        }
    }
}
 
Example 5
Source File: DataSetUtil.java    From pyramid with Apache License 2.0
/**
 * create a subset of the dataset containing the given row indices
 * duplicate indices are allowed
 * the idTranslator is not copied into the sample, since duplicate indices may produce duplicate extIds
 * @param dataSet the original dataset
 * @param indices row indices to include (duplicates allowed)
 * @return the sampled dataset
 */
public static RegDataSet sampleData(RegDataSet dataSet, List<Integer> indices){
    RegDataSet sample;
    sample = RegDataSetBuilder.getBuilder().numDataPoints(indices.size())
            .numFeatures(dataSet.getNumFeatures())
            .missingValue(dataSet.hasMissingValue())
            .dense(dataSet.isDense())
            .build();
    double[] labels = dataSet.getLabels();
    for (int i=0;i<indices.size();i++){
        int indexInOld = indices.get(i);
        Vector oldVector = dataSet.getRow(indexInOld);
        double label = labels[indexInOld];
        //copy label
        sample.setLabel(i,label);
        //copy row feature values, optimized for sparse vector
        for (Vector.Element element: oldVector.nonZeroes()){
            sample.setFeatureValue(i,element.index(),element.get());
        }
    }
    sample.setFeatureList(dataSet.getFeatureList());

    //ignore idTranslator as we may have duplicate extIds
    return sample;
}
 
Example 6
Source File: DataSetUtil.java    From pyramid with Apache License 2.0
public static Pair<DataSet, double[][]> sampleData(DataSet dataSet, double[][] targetDistribution, List<Integer> indices){
    DataSet sample;
    int numClasses = targetDistribution[0].length;
    double[][] sampledTargets = new double[indices.size()][numClasses];
    sample = DataSetBuilder.getBuilder().dense(dataSet.isDense()).missingValue(dataSet.hasMissingValue())
            .numDataPoints(indices.size()).numFeatures(dataSet.getNumFeatures()).build();

    for (int i=0;i<indices.size();i++){
        int indexInOld = indices.get(i);
        Vector oldVector = dataSet.getRow(indexInOld);
        double[] targets = targetDistribution[indexInOld];
        //copy label
        sampledTargets[i] = Arrays.copyOf(targets,targets.length);
        //copy row feature values, optimized for sparse vector
        for (Vector.Element element: oldVector.nonZeroes()){
            sample.setFeatureValue(i,element.index(),element.get());
        }

    }

    sample.setFeatureList(dataSet.getFeatureList());

    //ignore idTranslator as we may have duplicate extIds
    return new Pair<>(sample, sampledTargets);
}
 
Example 7
Source File: BlockwiseCD.java    From pyramid with Apache License 2.0
private double calEmpiricalCountForFeature(int parameterIndex) {
    double empiricalCount = 0.0;
    int classIndex = parameterToClass[parameterIndex];
    int featureIndex = parameterToFeature[parameterIndex];
    if (featureIndex==-1){
        for (int i=0; i<dataSet.getNumDataPoints(); i++) {
            if (dataSet.getMultiLabels()[i].matchClass(classIndex)) {
                empiricalCount += 1;
            }
        }
    } else{
        Vector column = dataSet.getColumn(featureIndex);
        MultiLabel[] multiLabels = dataSet.getMultiLabels();
        for (Vector.Element element: column.nonZeroes()){
            int dataIndex = element.index();
            double featureValue = element.get();
            if (multiLabels[dataIndex].matchClass(classIndex)){
                empiricalCount += featureValue;
            }
        }
    }
    return empiricalCount;
}
 
Example 8
Source File: DataSetUtil.java    From pyramid with Apache License 2.0
public static void normalize(DataSet dataSet, double[] normalizationConstants){
    for (int j=0;j<dataSet.getNumFeatures();j++){
        Vector column = dataSet.getColumn(j);
        List<Integer> indices = new ArrayList<>();
        List<Double> values = new ArrayList<>();
        for (Vector.Element nonzero: column.nonZeroes()){
            indices.add(nonzero.index());
            values.add(nonzero.get());
        }

        for (int i=0;i<indices.size();i++){
            int dataId = indices.get(i);
            double old = values.get(i);
            // divide by the column's normalization constant; if the constant is 0, use 0 as the normalized value
            dataSet.setFeatureValue(dataId,j, SafeDivide.divide(old, normalizationConstants[j], 0.0));
        }
    }
}
 
Example 9
Source File: MLLogisticLoss.java    From pyramid with Apache License 2.0
private double calEmpricalCount(int parameterIndex){
    int classIndex = mlLogisticRegression.getWeights().getClassIndex(parameterIndex);
    MultiLabel[] labels = dataSet.getMultiLabels();
    int featureIndex = mlLogisticRegression.getWeights().getFeatureIndex(parameterIndex);
    double count = 0;
    //bias
    if (featureIndex == -1){
        for (int i=0;i<dataSet.getNumDataPoints();i++){
            if (labels[i].matchClass(classIndex)){
                count +=1;
            }
        }
    } else {
        Vector featureColumn = dataSet.getColumn(featureIndex);
        for (Vector.Element element: featureColumn.nonZeroes()){
            int dataPointIndex = element.index();
            double featureValue = element.get();
            MultiLabel label = labels[dataPointIndex];
            if (label.matchClass(classIndex)){
                count += featureValue;
            }
        }
    }
    return count;
}
 
Example 10
Source File: AugmentedLRLoss.java    From pyramid with Apache License 2.0
private double calEmpiricalCountFeatureWeight(int d){
    Vector featureColumn = dataSet.getColumn(d);
    double sum = 0;
    for (Vector.Element element: featureColumn.nonZeroes()){
        int dataIndex = element.index();
        double feature = element.get();
        if (binaryLabels[dataIndex]==1){
            sum += feature;
        }
    }
    return sum;
}
 
Example 11
Source File: Vectors.java    From pyramid with Apache License 2.0
public static double[] toArray(Vector vector){
    double[] arr = new double[vector.size()];
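    // copy only the non-zero entries; the remaining positions keep the default 0.0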
    for (Vector.Element nonZero: vector.nonZeroes()){
        int index = nonZero.index();
        double v = nonZero.get();
        arr[index] = v;
    }
    return arr;
}
 
Example 12
Source File: AbstractRobustCBMOptimizer.java    From pyramid with Apache License 2.0
private double effectivePositives(int componentIndex, int labelIndex){
    double sum = 0;
    Vector labelColumn = labelMatrix.getColumn(labelIndex);
    for (Vector.Element element: labelColumn.nonZeroes()){
        int dataIndex = element.index();
        sum += gammas[dataIndex][componentIndex] * noiseLabelWeights[dataIndex][labelIndex];
    }
    return sum;
}
 
Example 13
Source File: CRFLoss.java    From pyramid with Apache License 2.0
private double calGradientForFeature(int parameterIndex) {
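    // gradient = expected feature count under the model - empirical count + Gaussian prior (L2) term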
    double count = 0.0;
    int classIndex = parameterToClass[parameterIndex];
    int featureIndex = parameterToFeature[parameterIndex];

    if (featureIndex == -1) {
        for (int i=0; i<dataSet.getNumDataPoints(); i++) {
            count += this.classProbMatrix[i][classIndex];
        }
    } else {
        Vector featureColumn = dataSet.getColumn(featureIndex);
        for (Vector.Element element: featureColumn.nonZeroes()) {
            int dataPointIndex = element.index();
            double featureValue = element.get();
            count += this.classProbMatrix[dataPointIndex][classIndex] * featureValue;
        }
    }

    count -= this.empiricalCounts[parameterIndex];

    // regularize
    if (regularizeAll){
        count += cmlcrf.getWeights().getWeightForIndex(parameterIndex)/gaussianPriorVariance;
    } else {
        if (featureIndex != -1) {
            count += cmlcrf.getWeights().getWeightForIndex(parameterIndex)/gaussianPriorVariance;
        }
    }
    return count;
}
 
Example 14
Source File: AugmentedLRLoss.java    From pyramid with Apache License 2.0
private double calPredictedCountFeatureWeight(int d){
    Vector featureColumn = dataSet.getColumn(d);
    double sum = 0;
    for (Vector.Element element: featureColumn.nonZeroes()){
        int dataIndex = element.index();
        double feature = element.get();
        sum += feature* expectedProbs[dataIndex];
    }
    return sum;
}
 
Example 15
Source File: DataSetUtil.java    From pyramid with Apache License 2.0
/**
 * merge a multi-label dataset into a binary dataset:
 * class k becomes positive (1), all other classes become negative (0)
 * @param dataSet the multi-label dataset
 * @param k the class to treat as positive
 * @return the binary classification dataset
 */
public static ClfDataSet toBinary(MultiLabelClfDataSet dataSet, int k){
    int numDataPoints = dataSet.getNumDataPoints();
    int numFeatures = dataSet.getNumFeatures();
    boolean missingValue = dataSet.hasMissingValue();
    ClfDataSet clfDataSet;
    if (dataSet.isDense()){
        clfDataSet = new DenseClfDataSet(numDataPoints,numFeatures,missingValue, 2);
    } else {
        clfDataSet = new SparseClfDataSet(numDataPoints,numFeatures,missingValue, 2);
    }

    for (int i=0;i<numDataPoints;i++){
        //only copy non-zero elements
        Vector vector = dataSet.getRow(i);
        for (Vector.Element element: vector.nonZeroes()){
            int featureIndex = element.index();
            double value = element.get();
            clfDataSet.setFeatureValue(i,featureIndex,value);
        }
        if (dataSet.getMultiLabels()[i].matchClass(k)){
            clfDataSet.setLabel(i,1);
        } else {
            clfDataSet.setLabel(i,0);
        }
    }

    List<String> extLabels = new ArrayList<>();
    String extLabel = dataSet.getLabelTranslator().toExtLabel(k);
    extLabels.add("NOT "+extLabel);
    extLabels.add(extLabel);
    LabelTranslator labelTranslator = new LabelTranslator(extLabels);
    clfDataSet.setLabelTranslator(labelTranslator);
    clfDataSet.setFeatureList(dataSet.getFeatureList());


    return clfDataSet;
}
 
Example 16
Source File: DataSetUtil.java    From pyramid with Apache License 2.0
/**
 * create a subset of the dataset containing the given row indices
 * duplicate indices are allowed
 * @param dataSet the original dataset
 * @param indices row indices to include (duplicates allowed)
 * @return the sampled dataset
 */
public static MultiLabelClfDataSet sampleData(MultiLabelClfDataSet dataSet, List<Integer> indices){
    MultiLabelClfDataSet sample;
    sample = MLClfDataSetBuilder.getBuilder()
            .numClasses(dataSet.getNumClasses())
            .numDataPoints(indices.size())
            .numFeatures(dataSet.getNumFeatures())
            .missingValue(dataSet.hasMissingValue())
            .density(dataSet.density())
            .build();
    MultiLabel[] labels = dataSet.getMultiLabels();
    IdTranslator idTranslator = new IdTranslator();
    for (int i=0;i<indices.size();i++){
        int indexInOld = indices.get(i);
        String extId = dataSet.getIdTranslator().toExtId(indexInOld);
        idTranslator.addData(i, extId);
        Vector oldVector = dataSet.getRow(indexInOld);
        Set<Integer> label = labels[indexInOld].getMatchedLabels();
        //copy label
        sample.addLabels(i,label);
        //copy row feature values, optimized for sparse vector
        for (Vector.Element element: oldVector.nonZeroes()){
            sample.setFeatureValue(i,element.index(),element.get());
        }
    }
    sample.setFeatureList(dataSet.getFeatureList());
    sample.setIdTranslator(idTranslator);
    sample.setLabelTranslator(dataSet.getLabelTranslator());
    return sample;
}
 
Example 17
Source File: AbstractCBMOptimizer.java    From pyramid with Apache License 2.0
private double effectivePositives(int componentIndex, int labelIndex){
    double sum = 0;
    Vector labelColumn = labelMatrix.getColumn(labelIndex);
    for (Vector.Element element: labelColumn.nonZeroes()){
        int dataIndex = element.index();
        sum += gammas[dataIndex][componentIndex];
    }
    return sum;
}
 
Example 18
Source File: Vectors.java    From pyramid with Apache License 2.0
private static double dotDenseSparse(Vector denseVector, Vector sparseVector){
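    // iterate only the sparse vector's stored entries, so the cost is proportional to its number of non-zeros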
    double sum = 0;
    for (Vector.Element element: sparseVector.nonZeroes()){
        int index = element.index();
        double value = element.get();
        sum += value*denseVector.getQuick(index);
    }
    return sum;
}
 
Example 19
Source File: LogisticRegressionInspector.java    From pyramid with Apache License 2.0
public static String checkNgramUsage(LogisticRegression logisticRegression){
    StringBuilder sb = new StringBuilder();
    FeatureList featureList = logisticRegression.getFeatureList();
    Set<Integer> usedFeatures = new HashSet<>();
    for (int k=0;k<logisticRegression.getNumClasses();k++){
        Vector vector = logisticRegression.getWeights().getWeightsWithoutBiasForClass(k);
        for (Vector.Element element: vector.nonZeroes()){
            usedFeatures.add(element.index());
        }
    }

    List<Ngram> selected = usedFeatures.stream().map(featureList::get).filter(feature -> feature instanceof Ngram)
            .map(feature -> (Ngram)feature).collect(Collectors.toList());

    List<Ngram> candidates = featureList.getAll().stream()
            .filter(feature -> feature instanceof Ngram)
            .map(feature -> (Ngram)feature).collect(Collectors.toList());
    int maxLength = candidates.stream().mapToInt(Ngram::getN).max().getAsInt();
    int[] numberCandidates = new int[maxLength];
    candidates.stream().forEach(ngram -> numberCandidates[ngram.getN() - 1] += 1);
    sb.append("number of ngram candidates: ");
    for (int n=1;n<=maxLength;n++){
        sb.append(n+"-grams = "+numberCandidates[n-1]);
        sb.append("; ");
    }
    sb.append("\n");

    int[] numberSelected = new int[maxLength];
    selected.stream().forEach(ngram -> numberSelected[ngram.getN() - 1] += 1);
    sb.append("number of selected ngram: ");
    for (int n=1;n<=maxLength;n++){
        sb.append(n+"-grams = "+numberSelected[n-1]);
        sb.append("; ");
    }
    sb.append("\n");

    int[] easyCandidates = new int[maxLength];
    int[] easySelected = new int[maxLength];
    Set<String> unigrams = selected.stream().filter(ngram -> ngram.getN() == 1)
            .map(Ngram::getNgram).collect(Collectors.toSet());

    candidates.stream().filter(ngram -> isComposedOf(ngram.getNgram(), unigrams))
            .forEach(ngram -> easyCandidates[ngram.getN() - 1] += 1);
    sb.append("number of ngram candidates that can be constructed from seeds: ");
    for (int n=1;n<=maxLength;n++){
        sb.append(n+"-grams = "+easyCandidates[n-1]);
        sb.append("; ");
    }
    sb.append("\n");

    selected.stream().filter(ngram -> isComposedOf(ngram.getNgram(), unigrams))
            .forEach(ngram -> easySelected[ngram.getN() - 1] += 1);
    sb.append("number of selected ngrams that can be constructed from seeds: ");
    for (int n=1;n<=maxLength;n++){
        sb.append(n+"-grams = "+easySelected[n-1]);
        sb.append("; ");
    }
    sb.append("\n");

    sb.append("percentage of selected ngrams that can be constructed from seeds: ");
    for (int n=1;n<=maxLength;n++){
        sb.append(n+"-grams = "+(double)easySelected[n-1]/numberSelected[n-1]);
        sb.append("; ");
    }
    sb.append("\n");

    sb.append("feature selection ratio: ");
    for (int n=1;n<=maxLength;n++){
        sb.append(n+"-grams = "+(double)numberSelected[n-1]/numberCandidates[n-1]);
        sb.append("; ");
    }
    sb.append("\n");

    sb.append("feature selection ratio based on seeds: ");
    for (int n=1;n<=maxLength;n++){
        sb.append(n+"-grams = "+(double)easySelected[n-1]/easyCandidates[n-1]);
        sb.append("; ");
    }

    return sb.toString();

}
 
Example 20
Source File: DataSetUtil.java    From pyramid with Apache License 2.0
/**
 * keep only the selected feature columns
 * @param dataSet the original dataset
 * @param columnsToKeep indices of the feature columns to keep
 * @return the trimmed dataset
 */
public static MultiLabelClfDataSet sampleFeatures(MultiLabelClfDataSet dataSet, List<Integer> columnsToKeep){
    MultiLabelClfDataSet trimmed ;
    boolean missingValue = dataSet.hasMissingValue();
    int numClasses = dataSet.getNumClasses();
    // keep density
    if (dataSet.isDense()) {
        trimmed = new DenseMLClfDataSet(dataSet.getNumDataPoints(), columnsToKeep.size(), missingValue, numClasses);
    } else{
        trimmed = new SparseMLClfDataSet(dataSet.getNumDataPoints(),columnsToKeep.size(), missingValue, numClasses);
    }


    for (int j=0;j<trimmed.getNumFeatures();j++){
        int oldColumnIndex = columnsToKeep.get(j);
        Vector vector = dataSet.getColumn(oldColumnIndex);
        for (Vector.Element element: vector.nonZeroes()){
            int dataPointIndex = element.index();
            double value = element.get();
            trimmed.setFeatureValue(dataPointIndex,j,value);
        }
    }
    //copy labels
    MultiLabel[] multiLabels = dataSet.getMultiLabels();

    for (int i=0;i<trimmed.getNumDataPoints();i++){
        trimmed.addLabels(i,multiLabels[i].getMatchedLabels());
    }
    //just copy settings


    trimmed.setLabelTranslator(dataSet.getLabelTranslator());
    trimmed.setIdTranslator(dataSet.getIdTranslator());
    List<Feature> oldFeatures = dataSet.getFeatureList().getAll();
    List<Feature> newFeatures = columnsToKeep.stream().map(oldFeatures::get).collect(Collectors.toList());
    for (int i=0;i<newFeatures.size();i++){
        newFeatures.get(i).setIndex(i);
    }
    trimmed.setFeatureList(new FeatureList(newFeatures));

    return trimmed;
}