Example 1
Source File:    From ambiverse-nlu with Apache License 2.0 6 votes vote down vote up
 * Will return numElements integers from the input elements. If numElements
 * is larger than elements.size(), everything will be returned.
 * @param elements    Elements to choose from.
 * @param numElements Number of elements to choose.
 * @return numElements random integers from elements.
// Draws up to numElements values out of `elements` and returns them as a new set.
// NOTE(review): this listing has lost lines (closing braces and several statements);
// comments below only describe what is still visible.
private TIntSet getRandomElements(TIntSet elements, int numElements) {
  // Copy the set into an index-addressable list so a random position can be drawn.
  TIntList source = new TIntArrayList(elements.toArray());
  TIntSet randomElements = new TIntHashSet();
  for (int i = 0; i < numElements; ++i) {
    // Guard for an exhausted source list. The body of this check is not visible
    // here; presumably it leaves the loop so nextInt(0) is never called — TODO confirm.
    if (source.size() == 0) {
    // TODO: this is not efficient, as deleting from the ArrayList
    // will copy ... make this more efficient when necessary.
    // Choose a position in [0, source.size()) and read the value stored there.
    int elementPosition = random_.nextInt(source.size());
    int element = source.get(elementPosition);
    // NOTE(review): the statements that store `element` in randomElements and
    // delete it from source (referred to by the TODO above) are missing from this listing.
  return randomElements;
Example 2
Source File:    From JedAIToolkit with Apache License 2.0 6 votes vote down vote up
// Walks lists a and b with two cursors (posa, posb) and returns `count`, or -1
// when the early-exit test below fires.
// NOTE(review): the branch bodies and closing braces are missing from this listing;
// presumably both lists are sorted ascending and `count` is the number of equal
// elements found so far — confirm against the original file.
private int check_overlap(TIntList a, TIntList b, int overlap) {
    int posa = 0, posb = 0, count = 0;
    while (posa < (int) a.size() && posb < (int) b.size()) {
        // Early exit: count plus the shorter of the two remaining tails is an upper
        // bound on the final count; if that bound is below `overlap`, give up with -1.
        if (count + Math.min((int) a.size() - posa, (int) b.size() - posb) < overlap) {
            return -1;
        // Three-way comparison of the current elements. The bodies (not visible here)
        // presumably advance one or both cursors and bump count on equality — TODO confirm.
        if (a.get(posa) == b.get(posb)) {
        } else if (a.get(posa) < b.get(posb)) {
        } else {
    return count;
Example 3
Source File:    From JedAIToolkit with Apache License 2.0 5 votes vote down vote up
// Scans p1 and p2 in parallel with cursors i and j and returns
// p1.size() + p2.size() - updates.
// NOTE(review): the branch bodies and closing braces are missing from this listing,
// so how `updates` is incremented cannot be seen here; presumably both lists are
// sorted ascending — confirm against the original file.
private int checkEditDistance(TIntList p1, TIntList p2) {
    int i = 0, j = 0, updates = 0;
    while (i < p1.size() && j < p2.size()) {
        // Three-way comparison of the current elements; the bodies that advance
        // i / j and adjust `updates` are not visible in this listing.
        if (p1.get(i) == p2.get(j)) {
        } else if (p1.get(i) < p2.get(j)) {
        } else {
    // The result is the combined length of both lists minus the accumulated `updates`.
    return p1.size() + p2.size() - updates;
Example 4
Source File:    From ambiverse-nlu with Apache License 2.0 4 votes vote down vote up
/**
 * Resolves a list of token ids to a single unit id.
 *
 * @param unitTokens token ids that make up the unit; may be null or empty
 * @param id2word    passed through to UnitBuilder.buildUnit together with unitTokens
 * @param word2id    map queried with the string returned by UnitBuilder.buildUnit
 * @return 0 for a null or empty list; the sole token for a one-element list;
 *         otherwise whatever word2id.get returns for the built unit string
 *         (the value for an unknown key depends on how word2id was configured
 *         — not visible here)
 */
// NOTE(review): the closing brace of this method is missing from this listing.
public static int getUnitId(TIntList unitTokens, TIntObjectHashMap<String> id2word, TObjectIntHashMap<String> word2id) {
  // No tokens: there is nothing to look up.
  if (unitTokens == null || unitTokens.size() == 0) return 0;
  // A single token is returned directly, without consulting word2id.
  if (unitTokens.size() == 1) return unitTokens.get(0);
  // Multi-token unit: build its string form and look that up.
  return word2id.get(UnitBuilder.buildUnit(unitTokens, id2word));
Example 5
Source File:    From JedAIToolkit with Apache License 2.0 4 votes vote down vote up
// Iterates over records[0..noOfEntities), probes a token -> ListItemPPJ index to
// count shared tokens per candidate record, filters the candidates, and returns
// the executedComparisons list.
// NOTE(review): this listing has lost its closing braces and several statements
// (skip/continue bodies, the cursor advance, and the statement that stores a
// comparison in the result list); comments describe only what is visible.
private List<Comparison> performJoin() {
    final List<Comparison> executedComparisons = new ArrayList<>();
    // Maps a token id to the ListItemPPJ holding the (record, position) pairs indexed for it.
    final TIntObjectMap<ListItemPPJ> index = new TIntObjectHashMap<>();
    for (int k = 0; k < noOfEntities; k++) {
        final TIntList record = records[k];

        // Length bounds for this record, all derived from its size by helper methods.
        int minLength = minPossibleLength(record.size());
        int probeLength = probeLength(record.size());
        int indexLength = indexLength(record.size());

        // requireOverlaps[l] holds requireOverlap(record.size(), l) for every
        // length l in [minLength, record.size()]; lower slots keep the default 0.
        final int[] requireOverlaps = new int[record.size() + 1];
        for (int l = minLength; l <= record.size(); l++) {
            requireOverlaps[l] = requireOverlap(record.size(), l);

        // Per-candidate counter: how many probed tokens of record k hit that candidate.
        final TIntIntMap occurances = new TIntIntHashMap();
        for (int t = 0; t < probeLength; t++) {
            int token = record.get(t);

            // Fetch the index entry for this token, creating it on first sight.
            ListItemPPJ item = index.get(token);
            if (item == null) {
                item = new ListItemPPJ();
                index.put(token, item);

            int pos = item.getPos();
            final List<IntPair> ids = item.getIds();
            int noOfIds = ids.size();
            // Entries whose record is shorter than minLength are passed over. The loop
            // body is not visible; presumably it advances pos (and may store it back
            // on item) — TODO confirm.
            while (pos < noOfIds && records[ids.get(pos).getKey()].size() < minLength) {

            // Every remaining entry for this token counts one more hit for its record.
            for (int p = pos; p < noOfIds; p++) {
                int candId = ids.get(p).getKey();
                int oldValue = occurances.get(candId);
                occurances.put(candId, (oldValue + 1));

            // Only the first indexLength tokens of record k are added to the index.
            if (t < indexLength) {
                ids.add(new IntPair(k, t));

        for (int cand : occurances.keys()) {
            // Self-match check; the body is not visible, presumably it skips the candidate.
            if (k == cand) {

            // In the clean-clean setting, pairs from the same dataset are tested for
            // below; the bodies are not visible, presumably they skip the pair — TODO confirm.
            if (isCleanCleanER) {
                if (originalId[k] < datasetDelimiter && originalId[cand] < datasetDelimiter) { // both belong to dataset 1

                if (datasetDelimiter <= originalId[k] && datasetDelimiter <= originalId[cand]) { // both belong to dataset 2

            // Despite its name, noOfCandidates is the size of the candidate record.
            int noOfCandidates = records[cand].size();
            int newindexLength = indexLength(noOfCandidates);
            // Compare the last indexed token of the candidate with the last probed token
            // of record k, then test whether hits so far plus the unseen tail of one of
            // the records is still below the required overlap. The bodies of the inner
            // ifs are not visible; presumably they skip the candidate — TODO confirm.
            // NOTE(review): requireOverlaps is sized by record k but indexed by the
            // candidate's size; presumably candidates are never longer than record k — verify.
            if (records[cand].get(newindexLength - 1) < records[k].get(probeLength - 1)) {
                if (occurances.get(cand) + noOfCandidates - newindexLength < requireOverlaps[noOfCandidates]) {
            } else {
                if (occurances.get(cand) + records[k].size() - probeLength < requireOverlaps[noOfCandidates]) {

            // Verify the surviving candidate; getOverlap signals failure with -1.
            int realOverlap = getOverlap(k, cand, requireOverlaps[noOfCandidates]);
            if (realOverlap != -1) {
                float jaccardSim = calcSimilarity(records[k].size(), noOfCandidates, realOverlap);
                // Pairs at or above the threshold get a Comparison carrying the similarity.
                if (jaccardSim >= threshold) {
                    final Comparison currentComp = getComparison(originalId[k], originalId[cand]);
                    currentComp.setUtilityMeasure(jaccardSim); // is this correct?
                    // NOTE(review): the statement adding currentComp to executedComparisons
                    // is not visible in this listing.
    return executedComparisons;
Example 6
Source File:    From JedAIToolkit with Apache License 2.0 4 votes vote down vote up
// Builds a Category for length `len`: derives the length bounds and partition
// counts, allocates the signature maps, and fills the range_start / range_end tables.
// NOTE(review): this listing has lost its closing braces and several loop/branch
// bodies, and the block reaches the end of the visible chunk, so the constructor
// may continue beyond what is shown. Comments describe only what is visible.
public Category(int len, float threshold, int categoryN) {
    // THRESHOLD and N are written through the class name, so each constructor
    // call overwrites the values seen by every Category.
    Category.THRESHOLD = threshold;
    Category.N = categoryN;
    s_len = len;
    // Upper length bound: s_len divided by the threshold, truncated to int.
    e_len = (int) ((float) (s_len / THRESHOLD));
    K = (int) (2 * (1 - THRESHOLD) / (1 + THRESHOLD) * (float) e_len);
    N1 = K + 1;
    N2 = 2;

    // With N1 set to K + 1 just above, this quotient is taken over equal operands.
    K2 = (K + 1) / N1 - 1;

    // important fix
    // Body not visible in this listing; presumably it adjusts K2 when the division
    // above is not exact — TODO confirm.
    if ((K + 1) % N1 != 0) {

    // Parameter sanity check; the body is not visible in this listing.
    if (N1 > K + 1 || N1 * N2 <= K + 1) {

    subs = new ArrayList<>();
    int n = N2;
    int k = N2 - K2;
    TIntList sub = new TIntArrayList();
    int s;
    // NOTE(review): the bodies of the loops below are partly missing. From what is
    // visible this looks like an enumeration of k-element index selections out of
    // n, with each selection presumably appended to subs — confirm against the original.
    for (s = 0; s < k; s++) {

    while (sub.get(0) < n - k) {
        // Search from the right for a slot that can still be incremented.
        for (s = 0; s < k; s++) {
            if (sub.get(k - s - 1) < n - s - 1) {
        // Translate the right-based offset into a list index and bump that slot.
        s = k - s - 1;
        sub.set(s, sub.get(s) + 1);
        // Each visited slot is reset to its left neighbour's value plus one.
        for (; s < k; s++) {
            sub.set(s, sub.get(s - 1) + 1);

    // One signature map per (partition, selection) pair.
    sig_len = N1 * subs.size();
    //System.out.println("ss "+sig_len);
    sig_map = new HashMap[sig_len];
    for (int is = 0; is < sig_len; is++) {
        sig_map[is] = new HashMap<>();

    // As shown, t is freshly allocated and never written, so these copies store 0;
    // the values are overwritten by the final loop below for every [i][j].
    int[] t = new int[N1 * N2];
    range_start = new int[N1][N2];
    for (int kk = 0; kk < N1; kk++) {
        range_start[kk][0] = t[kk * N2];

    t = new int[N1 * N2];
    range_end = new int[N1][N2];
    for (int kk = 0; kk < N1; kk++) {
        range_end[kk][0] = t[kk * N2];

    //System.out.println("n1 n2 "+N+" "+N1+" "+N2);
    // Slot number N2 * i + j receives the range from N * slot / N1 / N2 to
    // N * (slot + 1) / N1 / N2, so each slot's end is the next slot's start.
    for (int i = 0; i < N1; i++) {
        for (int j = 0; j < N2; j++) {
            range_start[i][j] = N * (N2 * i + j) / N1 / N2;
            range_end[i][j] = N * (N2 * i + j + 1) / N1 / N2;
            /*System.out.println("rs "+range_start[i][j]);
            System.out.println("re "+range_end[i][j]);*/