gnu.trove.TIntArrayList#sort

Source File: BestAutomaticNegativesChooser.java From jatecs with GNU General Public License v3.0

6 votes

/**
 * Gathers the document IDs held in {@code _best} for the given category and
 * returns them through an iterator after sorting the ID list.
 *
 * @param category name of the category, resolved to an ID via the category DB
 * @return iterator over the sorted list of document IDs
 */
public TIntArrayListIterator selectNegatives(String category) {
    short categoryId = _index.getCategoryDB().getCategory(category);
    TreeSet<DocumentSimilar> candidates = _best.get(categoryId);

    TIntArrayList negativeIds = new TIntArrayList();
    for (DocumentSimilar candidate : candidates) {
        negativeIds.add(candidate.docID);
    }

    // The set is iterated in its own order; re-sort by document ID before wrapping.
    negativeIds.sort();
    return new TIntArrayListIterator(negativeIds);
}

Source File: BestKNNNegativesChooser.java From jatecs with GNU General Public License v3.0

6 votes

/**
 * Gathers the document IDs held in {@code _best} for the given category and
 * returns them through an iterator after sorting the ID list.
 *
 * @param category name of the category, resolved to an ID via the category DB
 * @return iterator over the sorted list of document IDs
 */
public TIntArrayListIterator selectNegatives(String category) {
    short categoryId = _index.getCategoryDB().getCategory(category);

    TreeSet<Item> candidates = _best.get(categoryId);
    assert (candidates != null);

    TIntArrayList negativeIds = new TIntArrayList();
    for (Item candidate : candidates) {
        // With assertions enabled, verify that no document ID is collected twice.
        assert (!negativeIds.contains(candidate.docID));
        negativeIds.add(candidate.docID);
    }

    // The set is iterated in its own order; re-sort by document ID before wrapping.
    negativeIds.sort();
    return new TIntArrayListIterator(negativeIds);
}

Source File: TreeBoostCMCLearner.java From jatecs with GNU General Public License v3.0

5 votes

/**
 * Produces a reduced clone of {@code training}: categories not returned by
 * {@code getAllChildsCategoriesFor} are removed, and afterwards every document
 * left without any category is removed as well.
 *
 * @param catID    category whose child categories are retained
 * @param training index to clone and reduce
 * @return the reduced clone
 */
protected IIndex selectPositives(short catID, IIndex training) {
    // Operate on a clone of the training index.
    IIndex reduced = training.cloneIndex();

    IShortIterator keptCats = getAllChildsCategoriesFor(reduced, catID);
    short pendingKept = keptCats.hasNext() ? keptCats.next() : Short.MIN_VALUE;

    // A category is kept only when it matches the ID currently pending from
    // the child iterator; every other category is queued for removal.
    TShortArrayList catsToDrop = new TShortArrayList();
    for (short cat = 0; cat < training.getCategoryDB().getCategoriesCount(); cat++) {
        if (cat != pendingKept) {
            catsToDrop.add(cat);
        } else if (keptCats.hasNext()) {
            pendingKept = keptCats.next();
        }
    }
    catsToDrop.sort();
    reduced.removeCategories(new TShortArrayListIterator(catsToDrop));

    // Queue every document that has no category left in the reduced index.
    TIntArrayList orphanDocs = new TIntArrayList();
    for (IIntIterator docIt = reduced.getDocumentDB().getDocuments(); docIt.hasNext();) {
        int doc = docIt.next();
        IShortIterator docCats = reduced.getClassificationDB()
                .getDocumentCategories(doc);
        if (!docCats.hasNext()) {
            orphanDocs.add(doc);
        }
    }
    orphanDocs.sort();
    reduced.removeDocuments(new TIntArrayListIterator(orphanDocs), false);

    return reduced;
}

Source File: TrigramBuilderTest.java From consulo with Apache License 2.0

5 votes

/**
 * Runs the trigram builder over "String$CharData" and checks both the reported
 * trigram count and the sorted list of emitted trigram values.
 */
public void testBuilder() {
  final Ref<Integer> reportedCount = new Ref<Integer>();
  final TIntArrayList trigrams = new TIntArrayList();

  // Collector that records every emitted trigram and the final count.
  TrigramBuilder.TrigramProcessor collector = new TrigramBuilder.TrigramProcessor() {
    @Override
    public boolean execute(int value) {
      trigrams.add(value);
      return true;
    }

    @Override
    public boolean consumeTrigramsCount(int count) {
      reportedCount.set(count);
      return true;
    }
  };
  TrigramBuilder.processTrigrams("String$CharData", collector);

  // Sort so the values can be compared against the ordered expectation below.
  trigrams.sort();

  Integer count = reportedCount.get();
  assertNotNull(count);

  final int expectedCount = 13;
  assertEquals(expectedCount, (int)count);
  assertEquals(expectedCount, trigrams.size());

  int[] expected = {
    buildTrigram("$Ch"), buildTrigram("arD"), buildTrigram("ata"),
    6514785, 6578548, 6759523, 6840690, 6909543,
    7235364, 7496801, 7498094, 7566450, 7631465,
  };
  for (int idx = 0; idx < expectedCount; idx++) {
    assertEquals(expected[idx], trigrams.getQuick(idx));
  }
}

Source File: TreeRecommenderLearner.java From jatecs with GNU General Public License v3.0

5 votes

/**
 * Produces a reduced clone of {@code training}: categories not returned by
 * {@code getAllChildsCategoriesFor} are removed, and afterwards every document
 * left without any category is removed as well.
 *
 * @param catID    category whose child categories are retained
 * @param training index to clone and reduce
 * @return the reduced clone
 */
protected IIndex selectPositives(short catID, IIndex training) {
    // Operate on a clone of the training index.
    IIndex reduced = training.cloneIndex();

    IShortIterator keptCats = getAllChildsCategoriesFor(reduced, catID);
    short pendingKept = keptCats.hasNext() ? keptCats.next() : Short.MIN_VALUE;

    // A category is kept only when it matches the ID currently pending from
    // the child iterator; every other category is queued for removal.
    TShortArrayList catsToDrop = new TShortArrayList();
    for (short cat = 0; cat < training.getCategoryDB().getCategoriesCount(); cat++) {
        if (cat != pendingKept) {
            catsToDrop.add(cat);
        } else if (keptCats.hasNext()) {
            pendingKept = keptCats.next();
        }
    }
    catsToDrop.sort();
    reduced.removeCategories(new TShortArrayListIterator(catsToDrop));

    // Queue every document that has no category left in the reduced index.
    TIntArrayList orphanDocs = new TIntArrayList();
    for (IIntIterator docIt = reduced.getDocumentDB().getDocuments(); docIt.hasNext();) {
        int doc = docIt.next();
        IShortIterator docCats = reduced.getClassificationDB()
                .getDocumentCategories(doc);
        if (!docCats.hasNext()) {
            orphanDocs.add(doc);
        }
    }
    orphanDocs.sort();
    reduced.removeDocuments(new TIntArrayListIterator(orphanDocs), false);

    return reduced;
}

Source File: TroveDomainDB.java From jatecs with GNU General Public License v3.0

5 votes

/**
 * Records the given features in the list stored in {@code _categoriesFeatures}
 * for {@code category}, skipping features that are already listed.
 *
 * <p>The list is searched with {@code binarySearch}, so it must stay sorted at
 * all times. Each new feature is therefore inserted at its sorted position
 * rather than appended: appending would leave the list unsorted for the
 * following lookups, which could then miss existing entries and add duplicates.
 *
 * @param category        category whose feature list is updated
 * @param removedFeatures features to record for the category
 */
public void removeCategoryFeatures(short category,
                                   IIntIterator removedFeatures) {
    // NOTE(review): assumes the stored list is already sorted on entry, as the
    // binary search requires — confirm against the other writers of this list.
    TIntArrayList feats = _categoriesFeatures.get(category);
    while (removedFeatures.hasNext()) {
        int feature = removedFeatures.next();
        int pos = feats.binarySearch(feature);
        if (pos < 0) {
            // A negative result encodes -(insertionPoint + 1).
            feats.insert(-(pos + 1), feature);
        }
    }
    _hasLocalRepresentation = _hasLocalRepresentation || feats.size() > 0;
}

Source File: SingleLabelTreeBoostLearner.java From jatecs with GNU General Public License v3.0

5 votes

/**
 * Produces a reduced clone of {@code training}: categories not returned by
 * {@code getAllChildsCategoriesFor} are removed, and afterwards every document
 * left without any category is removed as well.
 *
 * @param catID    category whose child categories are retained
 * @param training index to clone and reduce
 * @return the reduced clone
 */
protected IIndex selectPositives(short catID, IIndex training) {
    // Operate on a clone of the training index.
    IIndex reduced = training.cloneIndex();

    IShortIterator keptCats = getAllChildsCategoriesFor(reduced, catID);
    short pendingKept = keptCats.hasNext() ? keptCats.next() : Short.MIN_VALUE;

    // A category is kept only when it matches the ID currently pending from
    // the child iterator; every other category is queued for removal.
    TShortArrayList catsToDrop = new TShortArrayList();
    for (short cat = 0; cat < training.getCategoryDB().getCategoriesCount(); cat++) {
        if (cat != pendingKept) {
            catsToDrop.add(cat);
        } else if (keptCats.hasNext()) {
            pendingKept = keptCats.next();
        }
    }
    catsToDrop.sort();
    reduced.removeCategories(new TShortArrayListIterator(catsToDrop));

    // Queue every document that has no category left in the reduced index.
    TIntArrayList orphanDocs = new TIntArrayList();
    for (IIntIterator docIt = reduced.getDocumentDB().getDocuments(); docIt.hasNext();) {
        int doc = docIt.next();
        IShortIterator docCats = reduced.getClassificationDB()
                .getDocumentCategories(doc);
        if (!docCats.hasNext()) {
            orphanDocs.add(doc);
        }
    }
    orphanDocs.sort();
    reduced.removeDocuments(new TIntArrayListIterator(orphanDocs), false);

    return reduced;
}

Source File: TreeBoostLearner.java From jatecs with GNU General Public License v3.0

5 votes

/**
 * Produces a reduced clone of {@code training}: categories not returned by
 * {@code getAllChildsCategoriesFor} are removed, and afterwards every document
 * left without any category is removed as well.
 *
 * @param catID    category whose child categories are retained
 * @param training index to clone and reduce
 * @return the reduced clone
 */
protected IIndex selectPositives(short catID, IIndex training) {
    // Operate on a clone of the training index.
    IIndex reduced = training.cloneIndex();

    IShortIterator keptCats = getAllChildsCategoriesFor(reduced, catID);
    short pendingKept = keptCats.hasNext() ? keptCats.next() : Short.MIN_VALUE;

    // A category is kept only when it matches the ID currently pending from
    // the child iterator; every other category is queued for removal.
    TShortArrayList catsToDrop = new TShortArrayList();
    for (short cat = 0; cat < training.getCategoryDB().getCategoriesCount(); cat++) {
        if (cat != pendingKept) {
            catsToDrop.add(cat);
        } else if (keptCats.hasNext()) {
            pendingKept = keptCats.next();
        }
    }
    catsToDrop.sort();
    reduced.removeCategories(new TShortArrayListIterator(catsToDrop));

    // Queue every document that has no category left in the reduced index.
    TIntArrayList orphanDocs = new TIntArrayList();
    for (IIntIterator docIt = reduced.getDocumentDB().getDocuments(); docIt.hasNext();) {
        int doc = docIt.next();
        IShortIterator docCats = reduced.getClassificationDB()
                .getDocumentCategories(doc);
        if (!docCats.hasNext()) {
            orphanDocs.add(doc);
        }
    }
    orphanDocs.sort();
    reduced.removeDocuments(new TIntArrayListIterator(orphanDocs), false);

    return reduced;
}

Source File: RegexTSR.java From jatecs with GNU General Public License v3.0

5 votes

/**
 * Removes from the index every feature whose name does not fully match
 * {@code regexPatternMatching}, reporting progress on a textual progress bar.
 *
 * <p>The regular expression is compiled once before the loop;
 * {@code String.matches} would recompile it for every feature.
 *
 * @param index index whose features are filtered in place
 */
@Override
public void computeTSR(IIndex index) {
    TextualProgressBar bar = new TextualProgressBar(
            "Compute TSR with by using regex matcher");
    int total = index.getFeatureDB().getFeaturesCount();
    int step = 0;

    // Loop-invariant: compile the pattern a single time.
    java.util.regex.Pattern pattern =
            java.util.regex.Pattern.compile(regexPatternMatching);

    TIntArrayList toRemove = new TIntArrayList();

    IIntIterator it = index.getFeatureDB().getFeatures();
    while (it.hasNext()) {
        int featID = it.next();
        String featName = index.getFeatureDB().getFeatureName(featID);
        // Same full-string match semantics as String.matches.
        if (!pattern.matcher(featName).matches()) {
            toRemove.add(featID);
        }

        step++;
        bar.signal((step * 100) / total);
    }

    bar.signal(100);

    // Sort the IDs before handing them to removeFeatures.
    toRemove.sort();

    // Remove the non-matching features.
    JatecsLogger.status().print("Removing worst features...");

    index.removeFeatures(new TIntArrayListIterator(toRemove));

    JatecsLogger.status().println(
            "done. Now the DB contains "
                    + index.getFeatureDB().getFeaturesCount()
                    + " feature(s).");

}

Source File: GlobalThresholdTSR.java From jatecs with GNU General Public License v3.0

5 votes

/**
 * Scores every feature with {@code _function} (category argument fixed to 0)
 * and removes from the index all features scoring below {@code _threshold}.
 *
 * @param index index whose features are filtered in place
 */
public void computeTSR(IIndex index) {
    String barTitle = "Compute global threshold ("
            + Os.generateDoubleString(_threshold, 3)
            + ") TSR with " + _function.getClass().getName();
    TextualProgressBar bar = new TextualProgressBar(barTitle);
    int total = index.getFeatureDB().getFeaturesCount();
    int processed = 0;

    TIntArrayList belowThreshold = new TIntArrayList();

    for (IIntIterator featIt = index.getFeatureDB().getFeatures(); featIt.hasNext();) {
        int featID = featIt.next();

        double score = _function.compute((short) 0, featID, index);
        if (score < _threshold) {
            belowThreshold.add(featID);
        }

        processed++;
        bar.signal((processed * 100) / total);
    }

    bar.signal(100);

    // Sort the IDs before handing them to removeFeatures.
    belowThreshold.sort();

    // Drop the features that did not reach the threshold.
    JatecsLogger.status().print(
            "Removing " + belowThreshold.size() + " features...");
    index.removeFeatures(new TIntArrayListIterator(belowThreshold));
    int remaining = index.getFeatureDB().getFeaturesCount();
    JatecsLogger.status().println(
            "done. Now the DB contains "
                    + remaining
                    + " feature(s).");
}

Source File: StringPrefixTSR.java From jatecs with GNU General Public License v3.0

5 votes

/**
 * Removes from the index every feature whose name does not fully match
 * {@code regexPatternMatching}, reporting progress on a textual progress bar.
 *
 * <p>The regular expression is compiled once before the loop;
 * {@code String.matches} would recompile it for every feature.
 *
 * @param index index whose features are filtered in place
 */
@Override
public void computeTSR(IIndex index) {
	TextualProgressBar bar = new TextualProgressBar(
			"Compute TSR with by using regex matcher");
	int total = index.getFeatureDB().getFeaturesCount();
	int step = 0;

	// Loop-invariant: compile the pattern a single time.
	java.util.regex.Pattern pattern =
			java.util.regex.Pattern.compile(regexPatternMatching);

	TIntArrayList toRemove = new TIntArrayList();

	IIntIterator it = index.getFeatureDB().getFeatures();
	while (it.hasNext()) {
		int featID = it.next();
		String featName = index.getFeatureDB().getFeatureName(featID);
		// Same full-string match semantics as String.matches.
		if (!pattern.matcher(featName).matches()) {
			toRemove.add(featID);
		}

		step++;
		bar.signal((step * 100) / total);
	}

	bar.signal(100);

	// Sort the IDs before handing them to removeFeatures.
	toRemove.sort();

	// Remove the non-matching features.
	JatecsLogger.status().print("Removing worst features...");
	TIntArrayListIterator toRemoveIT = new TIntArrayListIterator(toRemove);
	index.removeFeatures(toRemoveIT);
	JatecsLogger.status().println(
			"done. Now the DB contains "
					+ index.getFeatureDB().getFeaturesCount()
					+ " feature(s).");
}

Source File: SvmDDagSingleLabelLearner.java From jatecs with GNU General Public License v3.0

4 votes

/**
 * Builds a local index for a binary classifier from a clone of
 * {@code trainingIndex}.
 *
 * <p>Steps: remove every category not listed in {@code catsGood}; remove
 * documents left without any category; if {@code tsrType} is set, apply it and
 * then the configured weighting; finally remove the second category of
 * {@code catsGood} so only the first one remains.
 *
 * @param trainingIndex index to clone
 * @param catsGood      exactly two category IDs to retain
 * @return the reduced index
 * @throws RuntimeException if {@code catsGood} does not contain exactly two entries
 */
private IIndex buildBinaryLocalIndex(IIndex trainingIndex,
                                     ArrayList<Short> catsGood) {
    if (catsGood.size() != 2)
        throw new RuntimeException("The set of valid categories must be 2");

    // First create a new index.
    IIndex idx = trainingIndex.cloneIndex();

    // Remove unwanted categories.
    TShortArrayList toRemove = new TShortArrayList();
    IShortIterator allCats = idx.getCategoryDB().getCategories();
    while (allCats.hasNext()) {
        short catID = allCats.next();
        if (catsGood.contains(catID))
            continue;

        toRemove.add(catID);
    }
    toRemove.sort();
    idx.removeCategories(new TShortArrayListIterator(toRemove));

    // Remove unwanted documents (those without any remaining category).
    TIntArrayList docsToRemove = new TIntArrayList();
    IIntIterator docs = idx.getDocumentDB().getDocuments();
    while (docs.hasNext()) {
        int docID = docs.next();
        IShortIterator curCats = idx.getClassificationDB()
                .getDocumentCategories(docID);
        if (!curCats.hasNext())
            docsToRemove.add(docID);
    }

    docsToRemove.sort();
    idx.removeDocuments(new TIntArrayListIterator(docsToRemove), true);

    // If the case, apply TSR.
    if (tsrType != null) {
        tsrType.computeTSR(idx);

        // Apply weighting.
        IWeighting weighting = null;
        if (weightingType == WeightingType.TF_IDF) {
            weighting = new TfNormalizedIdf(idx);
        } else if (weightingType == WeightingType.BM25) {
            weighting = new BM25(idx);
        }

        // Any other weighting type leaves the index unweighted instead of
        // failing with a NullPointerException.
        if (weighting != null) {
            idx = weighting.computeWeights(idx);
        }
    }

    // Remove 2nd category to make an index for a binary classifier.
    // The name is looked up in the original index and resolved again in idx.
    toRemove.clear();
    String catNameToRemove = trainingIndex.getCategoryDB()
            .getCategoryName(catsGood.get(1));
    toRemove.add(idx.getCategoryDB().getCategory(catNameToRemove));
    idx.removeCategories(new TShortArrayListIterator(toRemove));

    return idx;
}

Source File: DCSSingleLabelKnnFoldValidator.java From jatecs with GNU General Public License v3.0

4 votes

/**
 * Splits {@code index} into a training and a validation index for one fold.
 *
 * <p>Documents are classified by their position in the document iteration: a
 * contiguous window of positions, wrapping around the end, goes to training and
 * the rest goes to validation.
 *
 * @param step               zero-based fold number
 * @param index              index to split; it is cloned, not modified directly
 * @param numValidationSteps requested number of folds
 * @return pair (training index, validation index), or {@code null} when
 *         {@code step} is not below min(document count, numValidationSteps)
 */
public static Pair<IIndex, IIndex> splitIndex(int step, IIndex index,
                                              int numValidationSteps) {
    int docCount = index.getDocumentDB().getDocumentsCount();

    int foldCount = Math.min(docCount, numValidationSteps);
    if (step >= foldCount) {
        return null;
    }

    int foldSize = docCount / foldCount;
    int trainingSize = docCount - foldSize;
    int firstTrainingPos = foldSize * step;
    int lastTrainingPos = firstTrainingPos + trainingSize - 1;

    // Positions belonging to the training window; the map is used as a set.
    TIntIntHashMap trainingPositions = new TIntIntHashMap();
    for (int pos = firstTrainingPos; pos <= lastTrainingPos; pos++) {
        int wrapped = pos % docCount;
        trainingPositions.put(wrapped, wrapped);
    }

    TIntArrayList tr = new TIntArrayList();
    TIntArrayList va = new TIntArrayList();

    int position = 0;
    for (IIntIterator docIt = index.getDocumentDB().getDocuments(); docIt.hasNext(); position++) {
        int docID = docIt.next();
        TIntArrayList target = trainingPositions.containsKey(position) ? tr : va;
        target.add(docID);
    }

    tr.sort();
    va.sort();

    // The training index is the clone minus validation documents, and vice versa.
    IIndex trIndex = index.cloneIndex();
    trIndex.removeDocuments(new TIntArrayListIterator(va), false);

    IIndex vaIndex = index.cloneIndex();
    vaIndex.removeDocuments(new TIntArrayListIterator(tr), false);

    JatecsLogger.status().println(
            "done. The training contains " + tr.size()
                    + " document(s) and the validation contains "
                    + va.size() + " document(s).");

    return new Pair<IIndex, IIndex>(trIndex, vaIndex);

}

Source File: CRWMVSingleLabelKnnFoldValidator.java From jatecs with GNU General Public License v3.0

4 votes

/**
 * Splits {@code index} into a training and a validation index for one fold.
 *
 * <p>Documents are classified by their position in the document iteration: a
 * contiguous window of positions, wrapping around the end, goes to training and
 * the rest goes to validation.
 *
 * @param step               zero-based fold number
 * @param index              index to split; it is cloned, not modified directly
 * @param numValidationSteps requested number of folds
 * @return pair (training index, validation index), or {@code null} when
 *         {@code step} is not below min(document count, numValidationSteps)
 */
public static Pair<IIndex, IIndex> splitIndex(int step, IIndex index,
                                              int numValidationSteps) {
    int docCount = index.getDocumentDB().getDocumentsCount();

    int foldCount = Math.min(docCount, numValidationSteps);
    if (step >= foldCount) {
        return null;
    }

    int foldSize = docCount / foldCount;
    int trainingSize = docCount - foldSize;
    int firstTrainingPos = foldSize * step;
    int lastTrainingPos = firstTrainingPos + trainingSize - 1;

    // Positions belonging to the training window; the map is used as a set.
    TIntIntHashMap trainingPositions = new TIntIntHashMap();
    for (int pos = firstTrainingPos; pos <= lastTrainingPos; pos++) {
        int wrapped = pos % docCount;
        trainingPositions.put(wrapped, wrapped);
    }

    TIntArrayList tr = new TIntArrayList();
    TIntArrayList va = new TIntArrayList();

    int position = 0;
    for (IIntIterator docIt = index.getDocumentDB().getDocuments(); docIt.hasNext(); position++) {
        int docID = docIt.next();
        TIntArrayList target = trainingPositions.containsKey(position) ? tr : va;
        target.add(docID);
    }

    tr.sort();
    va.sort();

    // The training index is the clone minus validation documents, and vice versa.
    IIndex trIndex = index.cloneIndex();
    trIndex.removeDocuments(new TIntArrayListIterator(va), false);

    IIndex vaIndex = index.cloneIndex();
    vaIndex.removeDocuments(new TIntArrayListIterator(tr), false);

    JatecsLogger.status().println(
            "done. The training contains " + tr.size()
                    + " document(s) and the validation contains "
                    + va.size() + " document(s).");

    return new Pair<IIndex, IIndex>(trIndex, vaIndex);

}

Source File: SingleLabelKnnFoldValidator.java From jatecs with GNU General Public License v3.0

4 votes

/**
 * Splits {@code index} into a training and a validation index for one fold.
 *
 * <p>Documents are classified by their position in the document iteration: a
 * contiguous window of positions, wrapping around the end, goes to training and
 * the rest goes to validation.
 *
 * @param step               zero-based fold number
 * @param index              index to split; it is cloned, not modified directly
 * @param numValidationSteps requested number of folds
 * @return pair (training index, validation index), or {@code null} when
 *         {@code step} is not below min(document count, numValidationSteps)
 */
public static Pair<IIndex, IIndex> splitIndex(int step, IIndex index,
                                              int numValidationSteps) {
    int docCount = index.getDocumentDB().getDocumentsCount();

    int foldCount = Math.min(docCount, numValidationSteps);
    if (step >= foldCount) {
        return null;
    }

    int foldSize = docCount / foldCount;
    int trainingSize = docCount - foldSize;
    int firstTrainingPos = foldSize * step;
    int lastTrainingPos = firstTrainingPos + trainingSize - 1;

    // Positions belonging to the training window; the map is used as a set.
    TIntIntHashMap trainingPositions = new TIntIntHashMap();
    for (int pos = firstTrainingPos; pos <= lastTrainingPos; pos++) {
        int wrapped = pos % docCount;
        trainingPositions.put(wrapped, wrapped);
    }

    TIntArrayList tr = new TIntArrayList();
    TIntArrayList va = new TIntArrayList();

    int position = 0;
    for (IIntIterator docIt = index.getDocumentDB().getDocuments(); docIt.hasNext(); position++) {
        int docID = docIt.next();
        TIntArrayList target = trainingPositions.containsKey(position) ? tr : va;
        target.add(docID);
    }

    tr.sort();
    va.sort();

    // The training index is the clone minus validation documents, and vice versa.
    IIndex trIndex = index.cloneIndex();
    trIndex.removeDocuments(new TIntArrayListIterator(va), false);

    IIndex vaIndex = index.cloneIndex();
    vaIndex.removeDocuments(new TIntArrayListIterator(tr), false);

    JatecsLogger.status().println(
            "done. The training contains " + tr.size()
                    + " document(s) and the validation contains "
                    + va.size() + " document(s).");

    return new Pair<IIndex, IIndex>(trIndex, vaIndex);

}

Source File: WMVSingleLabelKnnFoldValidator.java From jatecs with GNU General Public License v3.0

4 votes

/**
 * Splits {@code index} into a training and a validation index for one fold.
 *
 * <p>Documents are classified by their position in the document iteration: a
 * contiguous window of positions, wrapping around the end, goes to training and
 * the rest goes to validation.
 *
 * @param step               zero-based fold number
 * @param index              index to split; it is cloned, not modified directly
 * @param numValidationSteps requested number of folds
 * @return pair (training index, validation index), or {@code null} when
 *         {@code step} is not below min(document count, numValidationSteps)
 */
public static Pair<IIndex, IIndex> splitIndex(int step, IIndex index, int numValidationSteps) {
    int docCount = index.getDocumentDB().getDocumentsCount();

    int foldCount = Math.min(docCount, numValidationSteps);
    if (step >= foldCount) {
        return null;
    }

    int foldSize = docCount / foldCount;
    int trainingSize = docCount - foldSize;
    int firstTrainingPos = foldSize * step;
    int lastTrainingPos = firstTrainingPos + trainingSize - 1;

    // Positions belonging to the training window; the map is used as a set.
    TIntIntHashMap trainingPositions = new TIntIntHashMap();
    for (int pos = firstTrainingPos; pos <= lastTrainingPos; pos++) {
        int wrapped = pos % docCount;
        trainingPositions.put(wrapped, wrapped);
    }

    TIntArrayList tr = new TIntArrayList();
    TIntArrayList va = new TIntArrayList();

    int position = 0;
    for (IIntIterator docIt = index.getDocumentDB().getDocuments(); docIt.hasNext(); position++) {
        int docID = docIt.next();
        TIntArrayList target = trainingPositions.containsKey(position) ? tr : va;
        target.add(docID);
    }

    tr.sort();
    va.sort();

    // The training index is the clone minus validation documents, and vice versa.
    IIndex trIndex = index.cloneIndex();
    trIndex.removeDocuments(new TIntArrayListIterator(va), false);

    IIndex vaIndex = index.cloneIndex();
    vaIndex.removeDocuments(new TIntArrayListIterator(tr), false);

    JatecsLogger.status().println("done. The training contains " + tr.size() + " document(s) and the validation contains " + va.size() + " document(s).");

    return new Pair<IIndex, IIndex>(trIndex, vaIndex);

}

Source File: CRDCSSingleLabelKnnFoldValidator.java From jatecs with GNU General Public License v3.0

4 votes

/**
 * Splits {@code index} into a training and a validation index for one fold.
 *
 * <p>Documents are classified by their position in the document iteration: a
 * contiguous window of positions, wrapping around the end, goes to training and
 * the rest goes to validation.
 *
 * @param step               zero-based fold number
 * @param index              index to split; it is cloned, not modified directly
 * @param numValidationSteps requested number of folds
 * @return pair (training index, validation index), or {@code null} when
 *         {@code step} is not below min(document count, numValidationSteps)
 */
public static Pair<IIndex, IIndex> splitIndex(int step, IIndex index,
                                              int numValidationSteps) {
    int docCount = index.getDocumentDB().getDocumentsCount();

    int foldCount = Math.min(docCount, numValidationSteps);
    if (step >= foldCount) {
        return null;
    }

    int foldSize = docCount / foldCount;
    int trainingSize = docCount - foldSize;
    int firstTrainingPos = foldSize * step;
    int lastTrainingPos = firstTrainingPos + trainingSize - 1;

    // Positions belonging to the training window; the map is used as a set.
    TIntIntHashMap trainingPositions = new TIntIntHashMap();
    for (int pos = firstTrainingPos; pos <= lastTrainingPos; pos++) {
        int wrapped = pos % docCount;
        trainingPositions.put(wrapped, wrapped);
    }

    TIntArrayList tr = new TIntArrayList();
    TIntArrayList va = new TIntArrayList();

    int position = 0;
    for (IIntIterator docIt = index.getDocumentDB().getDocuments(); docIt.hasNext(); position++) {
        int docID = docIt.next();
        TIntArrayList target = trainingPositions.containsKey(position) ? tr : va;
        target.add(docID);
    }

    tr.sort();
    va.sort();

    // The training index is the clone minus validation documents, and vice versa.
    IIndex trIndex = index.cloneIndex();
    trIndex.removeDocuments(new TIntArrayListIterator(va), false);

    IIndex vaIndex = index.cloneIndex();
    vaIndex.removeDocuments(new TIntArrayListIterator(tr), false);

    JatecsLogger.status().println(
            "done. The training contains " + tr.size()
                    + " document(s) and the validation contains "
                    + va.size() + " document(s).");

    return new Pair<IIndex, IIndex>(trIndex, vaIndex);

}

Source File: GlobalTSR.java From jatecs with GNU General Public License v3.0

4 votes

/**
 * Computes a global score for every feature and removes from the index the
 * features evicted from a bounded "best" set of at most {@code _numBestFeature}
 * entries.
 *
 * <p>For each feature, a per-category score is computed with {@code _func}
 * (0 where the domain DB reports no such category feature) and the scores are
 * combined by {@code _global}.
 *
 * @param index index whose features are filtered in place
 */
public void computeTSR(IIndex index) {
    TextualProgressBar bar = new TextualProgressBar(
            "Compute global TSR with " + _func.getClass().getName());
    int total = index.getFeatureDB().getFeaturesCount();
    int processed = 0;

    TreeSet<FeatureEntry> best = new TreeSet<FeatureEntry>();
    TIntArrayList evicted = new TIntArrayList();

    for (IIntIterator featIt = index.getFeatureDB().getFeatures(); featIt.hasNext();) {
        int featID = featIt.next();

        double[] scores = new double[index.getCategoryDB()
                .getCategoriesCount()];
        for (short catID = 0; catID < scores.length; catID++) {
            boolean present = index.getDomainDB().hasCategoryFeature(catID, featID);
            scores[catID] = present ? _func.compute(catID, featID, index) : 0;
        }

        // Combine the per-category scores into the feature's global value.
        FeatureEntry entry = new FeatureEntry();
        entry.featureID = featID;
        entry.score = _global.compute(scores, index);
        best.add(entry);

        // Over capacity: evict the set's first entry and queue it for removal.
        if (best.size() > _numBestFeature) {
            FeatureEntry lowest = best.first();
            evicted.add(lowest.featureID);
            best.remove(lowest);
        }

        processed++;
        bar.signal((processed * 100) / total);
    }

    bar.signal(100);

    // Sort the IDs before handing them to removeFeatures.
    evicted.sort();

    // Remove the evicted features.
    JatecsLogger.status().print("Removing worst features...");
    index.removeFeatures(new TIntArrayListIterator(evicted));

    int remaining = index.getFeatureDB().getFeaturesCount();
    JatecsLogger.status().println(
            "done. Now the DB contains "
                    + remaining
                    + " feature(s).");
}

Java Code Examples for gnu.trove.TIntArrayList#sort()