org.apache.commons.lang.StringUtils#getLevenshteinDistance

Source File: ValueDataUtil.java From pentaho-kettle with Apache License 2.0

5 votes

/**
 * Computes the Levenshtein distance (LD) between the string forms of two values. LD is a measure of the
 * similarity between a source string (s) and a target string (t): the number of deletions, insertions, or
 * substitutions required to transform s into t.
 *
 * @param metaA metadata of the first value (not read by this method)
 * @param dataA first value; compared through its {@code toString()} form
 * @param metaB metadata of the second value (not read by this method)
 * @param dataB second value; compared through its {@code toString()} form
 * @return the distance as a {@code Long}, or {@code null} when either value is {@code null}
 */
public static Long getLevenshtein_Distance( ValueMetaInterface metaA, Object dataA, ValueMetaInterface metaB,
  Object dataB ) {
  if ( dataA == null || dataB == null ) {
    return null;
  }
  // Long.valueOf replaces the deprecated Long(long) constructor and can reuse cached instances.
  long distance = StringUtils.getLevenshteinDistance( dataA.toString(), dataB.toString() );
  return Long.valueOf( distance );
}

Source File: DuplicateDataDetector.java From rya with Apache License 2.0

5 votes

/**
 * Decides whether two IRIs are approximately equal under the configured tolerance.
 * The comparison is made on the IRIs' string values: a case-insensitive match always
 * counts as equal, otherwise the Levenshtein distance is checked against the tolerance.
 *
 * @param lhs the left-hand IRI (may be {@code null})
 * @param rhs the right-hand IRI (may be {@code null})
 * @return {@code true} if the two IRIs are considered approximately equal
 */
@Override
public boolean areObjectsApproxEquals(final IRI lhs, final IRI rhs) {
    if (isOnlyOneNull(lhs, rhs)) {
        return false;
    }
    if (Objects.equals(lhs, rhs)) {
        return true;
    }

    final String left = lhs.stringValue();
    final String right = rhs.stringValue();

    // Same text ignoring case: no need to measure a distance.
    if (StringUtils.equalsIgnoreCase(left, right)) {
        return true;
    }
    // The strings differ and no deviation is allowed.
    if (tolerance.getValue() == 0) {
        return false;
    }

    final int editDistance = StringUtils.getLevenshteinDistance(left, right);
    switch (tolerance.getToleranceType()) {
        case PERCENTAGE: {
            // The ratio is taken relative to the left-hand string only.
            final int baseLength = left.length();
            if (baseLength == 0) {
                return right.isEmpty();
            }
            if (tolerance.getValue() >= 1) {
                return true;
            }
            final double ratio = (double) editDistance / baseLength;
            return ratio <= tolerance.getValue();
        }
        case DIFFERENCE:
        default:
            return editDistance <= tolerance.getValue();
    }
}

Source File: DuplicateDataDetector.java From rya with Apache License 2.0

5 votes

/**
 * Decides whether two strings are approximately equal under the configured tolerance.
 * A case-insensitive match or a configured equivalent term counts as equal; otherwise
 * the Levenshtein distance is checked against the tolerance.
 *
 * @param lhs the left-hand string (may be {@code null})
 * @param rhs the right-hand string (may be {@code null})
 * @return {@code true} if the two strings are considered approximately equal
 */
@Override
public boolean areObjectsApproxEquals(final String lhs, final String rhs) {
    if (isOnlyOneNull(lhs, rhs)) {
        return false;
    }
    // Same text ignoring case: no need to measure a distance.
    if (StringUtils.equalsIgnoreCase(lhs, rhs)) {
        return true;
    }
    // The strings differ and no deviation is allowed.
    if (tolerance.getValue() == 0) {
        return false;
    }

    // Equivalence is looked up from lhs to rhs only; terms are not
    // bi-directionally equivalent unless the map says so.
    final List<String> equivalents = equivalentTermsMap.get(lhs);
    final boolean declaredEquivalent = equivalents != null && equivalents.contains(rhs);
    if (declaredEquivalent) {
        return true;
    }

    final int editDistance = StringUtils.getLevenshteinDistance(lhs, rhs);
    switch (tolerance.getToleranceType()) {
        case PERCENTAGE: {
            // The ratio is taken relative to the left-hand string only.
            final int baseLength = lhs.length();
            if (baseLength == 0) {
                return rhs.isEmpty();
            }
            if (tolerance.getValue() >= 1) {
                return true;
            }
            final double ratio = (double) editDistance / baseLength;
            return ratio <= tolerance.getValue();
        }
        case DIFFERENCE:
        default:
            return editDistance <= tolerance.getValue();
    }
}

Source File: MCRAbstractMerger.java From mycore with GNU General Public License v3.0

5 votes

/**
 *  Two abstracts are regarded probably same
 *  if their levenshtein distance is less than a configured percentage of the text length.
 *  The shorter of the two texts is used as the reference length.
 *
 *  @param other the merger to compare with
 *  @return true if other is an MCRAbstractMerger whose text is close enough to this one;
 *          when at least one text is empty, true only if both are empty
 */
@Override
public boolean isProbablySameAs(MCRMerger other) {
    if (!(other instanceof MCRAbstractMerger)) {
        return false;
    }

    String textOther = ((MCRAbstractMerger) other).text;
    int length = Math.min(text.length(), textOther.length());
    if (length == 0) {
        // Avoid dividing by zero: an empty text can only match another empty text.
        return text.length() == textOther.length();
    }

    int distance = StringUtils.getLevenshteinDistance(text, textOther);
    // Widen to long so distance * 100 cannot overflow for very long texts.
    return ((long) distance * 100 / length) < MAX_DISTANCE_PERCENT;
}

Source File: ISimilarityMatcher.java From xtext-eclipse with Eclipse Public License 2.0

5 votes

/**
 * Two non-empty strings are similar when their Levenshtein distance is at most 1.
 * If either string is empty according to Strings.isEmpty, the result is false.
 */
@Override
public boolean isSimilar(String s0, String s1) {
	boolean bothPresent = !Strings.isEmpty(s0) && !Strings.isEmpty(s1);
	return bothPresent && StringUtils.getLevenshteinDistance(s0, s1) <= 1;
}

Source File: ValueDataUtil.java From hop with Apache License 2.0

5 votes

/**
 * Computes the Levenshtein distance (LD) between the string forms of two values. LD is a measure of the
 * similarity between a source string (s) and a target string (t): the number of deletions, insertions, or
 * substitutions required to transform s into t.
 *
 * @param metaA metadata of the first value (not read by this method)
 * @param dataA first value; compared through its {@code toString()} form
 * @param metaB metadata of the second value (not read by this method)
 * @param dataB second value; compared through its {@code toString()} form
 * @return the distance as a {@code Long}, or {@code null} when either value is {@code null}
 */
public static Long getLevenshtein_Distance( IValueMeta metaA, Object dataA, IValueMeta metaB,
                                            Object dataB ) {
  if ( dataA == null || dataB == null ) {
    return null;
  }
  // Long.valueOf replaces the deprecated Long(long) constructor and can reuse cached instances.
  long distance = StringUtils.getLevenshteinDistance( dataA.toString(), dataB.toString() );
  return Long.valueOf( distance );
}

Source File: NameMatcher.java From Pushjet-Android with BSD 2-Clause "Simplified" License

4 votes

/**
 * Searches the candidate items for the single best match of the given pattern.
 * Exact hits win outright; otherwise case-insensitive equality, case-sensitive
 * camel-case matches and case-insensitive camel-case matches are tried in that
 * order of preference. Items that match none of the camel-case forms but lie
 * within a small edit distance are recorded as candidates.
 *
 * @return The match if exactly 1 match found, null if no matches or multiple matches.
 */
public String find(String pattern, Collection<String> items) {
    this.pattern = pattern;
    matches.clear();
    candidates.clear();

    // An exact hit needs no further analysis.
    if (items.contains(pattern)) {
        matches.add(pattern);
        return pattern;
    }

    if (pattern.length() == 0) {
        return null;
    }

    Pattern exactCasePattern = getPatternForName(pattern);
    Pattern anyCasePattern = Pattern.compile(exactCasePattern.pattern(), Pattern.CASE_INSENSITIVE);
    String upperPattern = pattern.toUpperCase();
    // Near misses may differ by at most 3 edits, and by no more than half the pattern length.
    int maxEdits = Math.min(3, pattern.length() / 2);

    Set<String> sameIgnoringCase = new TreeSet<String>();
    Set<String> camelCaseExact = new TreeSet<String>();
    Set<String> camelCaseAnyCase = new TreeSet<String>();

    for (String item : items) {
        // Recorded independently of the camel-case classification below.
        if (item.equalsIgnoreCase(pattern)) {
            sameIgnoringCase.add(item);
        }
        if (exactCasePattern.matcher(item).matches()) {
            camelCaseExact.add(item);
        } else if (anyCasePattern.matcher(item).lookingAt()) {
            camelCaseAnyCase.add(item);
        } else if (StringUtils.getLevenshteinDistance(upperPattern, item.toUpperCase()) <= maxEdits) {
            candidates.add(item);
        }
    }

    // Pick the most specific non-empty group.
    Set<String> preferred;
    if (!sameIgnoringCase.isEmpty()) {
        preferred = sameIgnoringCase;
    } else if (!camelCaseExact.isEmpty()) {
        preferred = camelCaseExact;
    } else {
        preferred = camelCaseAnyCase;
    }
    matches.addAll(preferred);

    return matches.size() == 1 ? matches.first() : null;
}

Source File: FuzzyMatch.java From pentaho-kettle with Apache License 2.0

4 votes

/**
 * Compares the main field of the given row with every cached lookup row using the configured
 * distance algorithm and fills a result row from the cached rows whose distance lies within
 * [data.minimalDistance, data.maximalDistance].
 *
 * In "closer value" mode only the closest accepted cached row is kept (optionally with its
 * distance and additional fields); otherwise all accepted cached keys are joined with
 * data.valueSeparator in the first result column.
 */
private Object[] doDistance( Object[] row ) throws KettleValueException {
  // Result row with room for all output fields
  Object[] result = buildEmptyRow();

  Iterator<Object[]> cacheIterator = data.look.iterator();

  // Best distance accepted so far; -1 means nothing has been accepted yet
  long bestDistance = -1;

  String lookupValue = getInputRowMeta().getString( row, data.indexOfMainField );

  while ( cacheIterator.hasNext() ) {
    Object[] cachedRow = cacheIterator.next();
    // The key to compare against is stored in the first column
    String cachedKey = (String) cachedRow[0];

    // Compare on lower-cased copies when the match is case-insensitive
    boolean ignoreCase = !meta.isCaseSensitive();
    String left = ignoreCase ? cachedKey.toLowerCase() : cachedKey;
    String right = ignoreCase ? lookupValue.toLowerCase() : lookupValue;

    int current;
    switch ( meta.getAlgorithmType() ) {
      case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN:
        current = Utils.getDamerauLevenshteinDistance( left, right );
        break;
      case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH:
        current = Math.abs( (int) new NeedlemanWunsch().score( left, right ) );
        break;
      default:
        current = StringUtils.getLevenshteinDistance( left, right );
        break;
    }

    // Skip cached rows outside the accepted distance window
    if ( current < data.minimalDistance || current > data.maximalDistance ) {
      continue;
    }

    if ( !meta.isGetCloserValue() ) {
      // Collect every accepted key, joined by the value separator
      Object joined = result[0];
      result[0] = ( joined == null ) ? cachedKey : (String) joined + data.valueSeparator + cachedKey;
      continue;
    }

    // Closer-value mode: keep this row only if it beats the best one so far
    if ( bestDistance != -1 && current >= bestDistance ) {
      continue;
    }

    bestDistance = current;
    int next = 0;
    result[next++] = cachedKey;
    // Optionally expose the distance itself
    if ( data.addValueFieldName ) {
      result[next++] = bestDistance;
    }
    // Optionally copy the additional return values that follow the key in the cached row
    if ( data.addAdditionalFields ) {
      for ( int offset = 0; offset < meta.getValue().length; offset++ ) {
        result[next + offset] = cachedRow[offset + 1];
      }
    }
  }

  return result;
}

Source File: RevisedLesk.java From lesk-wsd-dsm with GNU General Public License v3.0

4 votes

/**
 * Computes a similarity score from the Levenshtein distance of two strings,
 * normalised by the length of the longer one: 1 - distance / maxLength.
 *
 * @param s1 first string
 * @param s2 second string
 * @return 1 for identical strings (including two empty strings), decreasing as the strings differ more
 */
private float computeLDscore(String s1, String s2) {
    int maxLength = Math.max(s1.length(), s2.length());
    if (maxLength == 0) {
        // Both strings are empty: they are identical, and 0/0 would otherwise yield NaN.
        return 1f;
    }
    float ld = (float) StringUtils.getLevenshteinDistance(s1, s2);
    return 1 - ld / maxLength;
}

Source File: NameMatcher.java From Pushjet-Android with BSD 2-Clause "Simplified" License

4 votes

/**
 * Searches the candidate items for the single best match of the given pattern.
 * Exact hits win outright; otherwise case-insensitive equality, case-sensitive
 * camel-case matches and case-insensitive camel-case matches are tried in that
 * order of preference. Items that match none of the camel-case forms but lie
 * within a small edit distance are recorded as candidates.
 *
 * @return The match if exactly 1 match found, null if no matches or multiple matches.
 */
public String find(String pattern, Collection<String> items) {
    this.pattern = pattern;
    matches.clear();
    candidates.clear();

    // An exact hit needs no further analysis.
    if (items.contains(pattern)) {
        matches.add(pattern);
        return pattern;
    }

    if (pattern.length() == 0) {
        return null;
    }

    Pattern exactCasePattern = getPatternForName(pattern);
    Pattern anyCasePattern = Pattern.compile(exactCasePattern.pattern(), Pattern.CASE_INSENSITIVE);
    String upperPattern = pattern.toUpperCase();
    // Near misses may differ by at most 3 edits, and by no more than half the pattern length.
    int maxEdits = Math.min(3, pattern.length() / 2);

    Set<String> sameIgnoringCase = new TreeSet<String>();
    Set<String> camelCaseExact = new TreeSet<String>();
    Set<String> camelCaseAnyCase = new TreeSet<String>();

    for (String item : items) {
        // Recorded independently of the camel-case classification below.
        if (item.equalsIgnoreCase(pattern)) {
            sameIgnoringCase.add(item);
        }
        if (exactCasePattern.matcher(item).matches()) {
            camelCaseExact.add(item);
        } else if (anyCasePattern.matcher(item).lookingAt()) {
            camelCaseAnyCase.add(item);
        } else if (StringUtils.getLevenshteinDistance(upperPattern, item.toUpperCase()) <= maxEdits) {
            candidates.add(item);
        }
    }

    // Pick the most specific non-empty group.
    Set<String> preferred;
    if (!sameIgnoringCase.isEmpty()) {
        preferred = sameIgnoringCase;
    } else if (!camelCaseExact.isEmpty()) {
        preferred = camelCaseExact;
    } else {
        preferred = camelCaseAnyCase;
    }
    matches.addAll(preferred);

    return matches.size() == 1 ? matches.first() : null;
}

Source File: JavaTypeQuickfixes.java From xtext-eclipse with Eclipse Public License 2.0

4 votes

/**
 * Two type names are similar when their Levenshtein distance is at most 3.
 */
protected boolean isSimilarTypeName(String s0, String s1) {
	int editDistance = StringUtils.getLevenshteinDistance(s0, s1);
	return editDistance <= 3;
}

Source File: ShapeDistanceCollectiveAnswerScorer.java From bioasq with Apache License 2.0

4 votes

/**
 * Computes the Levenshtein distance of two texts, normalised by the length of the longer text.
 *
 * @param text1 first text
 * @param text2 second text
 * @return a value in [0, 1]; 0.0 for identical texts, including two empty texts
 */
private double getDistance(String text1, String text2) {
  int maxLength = Math.max(text1.length(), text2.length());
  if (maxLength == 0) {
    // Both texts are empty: they are identical, and 0.0 / 0 would otherwise yield NaN.
    return 0.0;
  }
  int distance = StringUtils.getLevenshteinDistance(text1, text2);
  return (double) distance / maxLength;
}

Source File: EditDistanceCollectiveAnswerScorer.java From bioasq with Apache License 2.0

4 votes

/**
 * Computes the Levenshtein distance of two texts, normalised by the length of the longer text.
 *
 * @param text1 first text
 * @param text2 second text
 * @return a value in [0, 1]; 0.0 for identical texts, including two empty texts
 */
private double getDistance(String text1, String text2) {
  int maxLength = Math.max(text1.length(), text2.length());
  if (maxLength == 0) {
    // Both texts are empty: they are identical, and 0.0 / 0 would otherwise yield NaN.
    return 0.0;
  }
  int distance = StringUtils.getLevenshteinDistance(text1, text2);
  return (double) distance / maxLength;
}

Source File: NameMatcher.java From pushfish-android with BSD 2-Clause "Simplified" License

4 votes

/**
 * Searches the candidate items for the single best match of the given pattern.
 * Exact hits win outright; otherwise case-insensitive equality, case-sensitive
 * camel-case matches and case-insensitive camel-case matches are tried in that
 * order of preference. Items that match none of the camel-case forms but lie
 * within a small edit distance are recorded as candidates.
 *
 * @return The match if exactly 1 match found, null if no matches or multiple matches.
 */
public String find(String pattern, Collection<String> items) {
    this.pattern = pattern;
    matches.clear();
    candidates.clear();

    // An exact hit needs no further analysis.
    if (items.contains(pattern)) {
        matches.add(pattern);
        return pattern;
    }

    if (pattern.length() == 0) {
        return null;
    }

    Pattern exactCasePattern = getPatternForName(pattern);
    Pattern anyCasePattern = Pattern.compile(exactCasePattern.pattern(), Pattern.CASE_INSENSITIVE);
    String upperPattern = pattern.toUpperCase();
    // Near misses may differ by at most 3 edits, and by no more than half the pattern length.
    int maxEdits = Math.min(3, pattern.length() / 2);

    Set<String> sameIgnoringCase = new TreeSet<String>();
    Set<String> camelCaseExact = new TreeSet<String>();
    Set<String> camelCaseAnyCase = new TreeSet<String>();

    for (String item : items) {
        // Recorded independently of the camel-case classification below.
        if (item.equalsIgnoreCase(pattern)) {
            sameIgnoringCase.add(item);
        }
        if (exactCasePattern.matcher(item).matches()) {
            camelCaseExact.add(item);
        } else if (anyCasePattern.matcher(item).lookingAt()) {
            camelCaseAnyCase.add(item);
        } else if (StringUtils.getLevenshteinDistance(upperPattern, item.toUpperCase()) <= maxEdits) {
            candidates.add(item);
        }
    }

    // Pick the most specific non-empty group.
    Set<String> preferred;
    if (!sameIgnoringCase.isEmpty()) {
        preferred = sameIgnoringCase;
    } else if (!camelCaseExact.isEmpty()) {
        preferred = camelCaseExact;
    } else {
        preferred = camelCaseAnyCase;
    }
    matches.addAll(preferred);

    return matches.size() == 1 ? matches.first() : null;
}

Source File: NameMatcher.java From pushfish-android with BSD 2-Clause "Simplified" License

4 votes

/**
 * Searches the candidate items for the single best match of the given pattern.
 * Exact hits win outright; otherwise case-insensitive equality, case-sensitive
 * camel-case matches and case-insensitive camel-case matches are tried in that
 * order of preference. Items that match none of the camel-case forms but lie
 * within a small edit distance are recorded as candidates.
 *
 * @return The match if exactly 1 match found, null if no matches or multiple matches.
 */
public String find(String pattern, Collection<String> items) {
    this.pattern = pattern;
    matches.clear();
    candidates.clear();

    // An exact hit needs no further analysis.
    if (items.contains(pattern)) {
        matches.add(pattern);
        return pattern;
    }

    if (pattern.length() == 0) {
        return null;
    }

    Pattern exactCasePattern = getPatternForName(pattern);
    Pattern anyCasePattern = Pattern.compile(exactCasePattern.pattern(), Pattern.CASE_INSENSITIVE);
    String upperPattern = pattern.toUpperCase();
    // Near misses may differ by at most 3 edits, and by no more than half the pattern length.
    int maxEdits = Math.min(3, pattern.length() / 2);

    Set<String> sameIgnoringCase = new TreeSet<String>();
    Set<String> camelCaseExact = new TreeSet<String>();
    Set<String> camelCaseAnyCase = new TreeSet<String>();

    for (String item : items) {
        // Recorded independently of the camel-case classification below.
        if (item.equalsIgnoreCase(pattern)) {
            sameIgnoringCase.add(item);
        }
        if (exactCasePattern.matcher(item).matches()) {
            camelCaseExact.add(item);
        } else if (anyCasePattern.matcher(item).lookingAt()) {
            camelCaseAnyCase.add(item);
        } else if (StringUtils.getLevenshteinDistance(upperPattern, item.toUpperCase()) <= maxEdits) {
            candidates.add(item);
        }
    }

    // Pick the most specific non-empty group.
    Set<String> preferred;
    if (!sameIgnoringCase.isEmpty()) {
        preferred = sameIgnoringCase;
    } else if (!camelCaseExact.isEmpty()) {
        preferred = camelCaseExact;
    } else {
        preferred = camelCaseAnyCase;
    }
    matches.addAll(preferred);

    return matches.size() == 1 ? matches.first() : null;
}

Source File: FuzzyMatch.java From hop with Apache License 2.0

4 votes

/**
 * Compares the main field of the given row with every cached lookup row using the configured
 * distance algorithm and fills a result row from the cached rows whose distance lies within
 * [data.minimalDistance, data.maximalDistance].
 *
 * In "closer value" mode only the closest accepted cached row is kept (optionally with its
 * distance and additional fields); otherwise all accepted cached keys are joined with
 * data.valueSeparator in the first result column.
 */
private Object[] doDistance( Object[] row ) throws HopValueException {
  // Result row with room for all output fields
  Object[] result = buildEmptyRow();

  Iterator<Object[]> cacheIterator = data.look.iterator();

  // Best distance accepted so far; -1 means nothing has been accepted yet
  long bestDistance = -1;

  String lookupValue = getInputRowMeta().getString( row, data.indexOfMainField );

  while ( cacheIterator.hasNext() ) {
    Object[] cachedRow = cacheIterator.next();
    // The key to compare against is stored in the first column
    String cachedKey = (String) cachedRow[ 0 ];

    // Compare on lower-cased copies when the match is case-insensitive
    boolean ignoreCase = !meta.isCaseSensitive();
    String left = ignoreCase ? cachedKey.toLowerCase() : cachedKey;
    String right = ignoreCase ? lookupValue.toLowerCase() : lookupValue;

    int current;
    switch ( meta.getAlgorithmType() ) {
      case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN:
        current = Utils.getDamerauLevenshteinDistance( left, right );
        break;
      case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH:
        current = Math.abs( (int) new NeedlemanWunsch().score( left, right ) );
        break;
      default:
        current = StringUtils.getLevenshteinDistance( left, right );
        break;
    }

    // Skip cached rows outside the accepted distance window
    if ( current < data.minimalDistance || current > data.maximalDistance ) {
      continue;
    }

    if ( !meta.isGetCloserValue() ) {
      // Collect every accepted key, joined by the value separator
      Object joined = result[ 0 ];
      result[ 0 ] = ( joined == null ) ? cachedKey : (String) joined + data.valueSeparator + cachedKey;
      continue;
    }

    // Closer-value mode: keep this row only if it beats the best one so far
    if ( bestDistance != -1 && current >= bestDistance ) {
      continue;
    }

    bestDistance = current;
    int next = 0;
    result[ next++ ] = cachedKey;
    // Optionally expose the distance itself
    if ( data.addValueFieldName ) {
      result[ next++ ] = bestDistance;
    }
    // Optionally copy the additional return values that follow the key in the cached row
    if ( data.addAdditionalFields ) {
      for ( int offset = 0; offset < meta.getValue().length; offset++ ) {
        result[ next + offset ] = cachedRow[ offset + 1 ];
      }
    }
  }

  return result;
}

Source File: SpellCheckedMetadata.java From anthelion with Apache License 2.0

3 votes

/**
 * Resolves a metadata attribute name to its well-known form. The name is
 * first normalized and looked up directly; if that fails, the known
 * normalized names are scanned in order and the first one whose Levenshtein
 * distance to the searched name is below a length-dependent threshold is
 * used. The matching is therefore error tolerant. For instance,
 * <ul>
 * <li>content-type gives Content-Type</li>
 * <li>CoNtEntType gives Content-Type</li>
 * <li>ConTnTtYpe gives Content-Type</li>
 * </ul>
 * If no well-known metadata name matches, the original name is returned.
 *
 * @param name
 *          Name to normalize
 * @return normalized name
 */
public static String getNormalizedName(final String name) {
  final String searched = normalize(name);

  // Direct hit on the index of well-known names.
  final String exact = NAMES_IDX.get(searched);
  if (exact != null) {
    return exact;
  }
  if (normalized == null) {
    return name;
  }

  // Longer names tolerate more edits.
  final int threshold = searched.length() / TRESHOLD_DIVIDER;
  for (final String known : normalized) {
    if (StringUtils.getLevenshteinDistance(searched, known) < threshold) {
      final String resolved = NAMES_IDX.get(known);
      if (resolved != null) {
        return resolved;
      }
    }
  }
  return name;
}

Source File: SpellCheckedMetadata.java From nutch-htmlunit with Apache License 2.0

3 votes

/**
 * Resolves a metadata attribute name to its well-known form. The name is
 * first normalized and looked up directly; if that fails, the known
 * normalized names are scanned in order and the first one whose Levenshtein
 * distance to the searched name is below a length-dependent threshold is
 * used. The matching is therefore error tolerant. For instance,
 * <ul>
 * <li>content-type gives Content-Type</li>
 * <li>CoNtEntType gives Content-Type</li>
 * <li>ConTnTtYpe gives Content-Type</li>
 * </ul>
 * If no well-known metadata name matches, the original name is returned.
 *
 * @param name
 *          Name to normalize
 * @return normalized name
 */
public static String getNormalizedName(final String name) {
  final String searched = normalize(name);

  // Direct hit on the index of well-known names.
  final String exact = NAMES_IDX.get(searched);
  if (exact != null) {
    return exact;
  }
  if (normalized == null) {
    return name;
  }

  // Longer names tolerate more edits.
  final int threshold = searched.length() / TRESHOLD_DIVIDER;
  for (final String known : normalized) {
    if (StringUtils.getLevenshteinDistance(searched, known) < threshold) {
      final String resolved = NAMES_IDX.get(known);
      if (resolved != null) {
        return resolved;
      }
    }
  }
  return name;
}

Java Code Examples for org.apache.commons.lang.StringUtils#getLevenshteinDistance()