org.apache.lucene.search.similarities.DefaultSimilarity Java Examples

Source File: MoreLikeThisQuery.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Rewrites this query by configuring an {@link XMoreLikeThis} helper from
 * this query's settings and handing it to {@code createQuery}.
 *
 * @param reader the index reader passed to the {@link XMoreLikeThis} helper
 * @return the query produced by {@code createQuery}
 * @throws IOException if one of the underlying calls fails
 */
@Override
public Query rewrite(IndexReader reader) throws IOException {
    // Fall back to a fresh DefaultSimilarity when no similarity was configured.
    final XMoreLikeThis moreLikeThis;
    if (similarity != null) {
        moreLikeThis = new XMoreLikeThis(reader, similarity);
    } else {
        moreLikeThis = new XMoreLikeThis(reader, new DefaultSimilarity());
    }

    // Copy this query's settings onto the helper.
    moreLikeThis.setFieldNames(moreLikeFields);
    moreLikeThis.setAnalyzer(analyzer);
    moreLikeThis.setMinTermFreq(minTermFrequency);
    moreLikeThis.setMinDocFreq(minDocFreq);
    moreLikeThis.setMaxDocFreq(maxDocFreq);
    moreLikeThis.setMaxQueryTerms(maxQueryTerms);
    moreLikeThis.setMinWordLen(minWordLen);
    moreLikeThis.setMaxWordLen(maxWordLen);
    moreLikeThis.setStopWords(stopWords);
    moreLikeThis.setBoost(boostTerms);
    moreLikeThis.setBoostFactor(boostTermsFactor);

    // Only involve the "unlike" handling when at least one unlike input is set.
    final boolean nothingToUnlike = this.unlikeText == null && this.unlikeFields == null;
    if (!nothingToUnlike) {
        handleUnlike(moreLikeThis, this.unlikeText, this.unlikeFields);
    }

    return createQuery(moreLikeThis);
}

Source File: ContextAnalyzerIndex.java From modernmt with Apache License 2.0

6 votes

/**
 * Opens (creating if necessary) the index stored in the given directory.
 * <p>
 * The writer is configured with a {@link CorpusAnalyzer}, the
 * {@code CREATE_OR_APPEND} open mode and a {@link DefaultSimilarity} whose
 * {@code lengthNorm} always returns {@code 1}.
 *
 * @param directory the directory holding the index
 * @param rescorer  the rescorer stored on this instance
 * @throws IOException if the index writer cannot be opened or committed
 */
public ContextAnalyzerIndex(Directory directory, Rescorer rescorer) throws IOException {
    this.indexDirectory = directory;
    this.analyzer = new CorpusAnalyzer();
    this.rescorer = rescorer;

    // Similarity that reports a constant length norm of 1 for every field.
    final DefaultSimilarity constantLengthNorm = new DefaultSimilarity() {
        @Override
        public float lengthNorm(FieldInvertState state) {
            return 1.0f;
        }
    };

    // Index writer setup
    final IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_4_10_4, this.analyzer);
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    writerConfig.setSimilarity(constantLengthNorm);

    this.indexWriter = new IndexWriter(this.indexDirectory, writerConfig);

    // Ensure index exists: commit once when the directory holds no index yet.
    final boolean indexPresent = DirectoryReader.indexExists(directory);
    if (!indexPresent) {
        this.indexWriter.commit();
    }
}

Source File: TermVectorsFilter.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Creates a filter over the given term vectors.
 *
 * @param termVectorsByField the term vectors, stored as {@code fields}
 * @param topLevelFields     the top level fields
 * @param selectedFields     the names of the selected fields
 * @param dfs                aggregated document frequencies; may be {@code null}
 */
public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set<String> selectedFields, @Nullable AggregatedDfs dfs) {
    // Inputs supplied by the caller.
    this.dfs = dfs;
    this.selectedFields = selectedFields;
    this.topLevelFields = topLevelFields;
    this.fields = termVectorsByField;

    // Fresh internal state owned by this instance.
    this.similarity = new DefaultSimilarity();
    this.sizes = AtomicLongMap.create();
    this.scoreTerms = new HashMap<>();
}

Source File: DefaultSimilarityProvider.java From Elasticsearch with Apache License 2.0

4 votes

/**
 * {@inheritDoc}
 * <p>
 * Returns the {@link DefaultSimilarity} instance held by this provider.
 */
@Override
public DefaultSimilarity get() {
    return this.similarity;
}

Source File: XMoreLikeThis.java From Elasticsearch with Apache License 2.0

4 votes

/**
 * Creates an instance for the given reader, delegating to the two-argument
 * constructor with a new {@link DefaultSimilarity}.
 *
 * @param ir the index reader to use
 */
public XMoreLikeThis(final IndexReader ir) {
    this(ir, new DefaultSimilarity());
}

Source File: FullTextIndexTupleSerializer.java From database with GNU General Public License v2.0

4 votes

/**
 * Decodes an index tuple into a term/document entry.
 * <p>
 * Working back from the end of the key this reads: the fieldId (only when
 * {@code fieldsEnabled}; {@code Bytes.SIZEOF_INT} bytes), the docId
 * ({@code Bytes.SIZEOF_LONG} bytes) and the single byte immediately before
 * the docId, which carries the compact term weight. The compact weight is
 * expanded with {@code DefaultSimilarity.decodeNormValue}.
 *
 * @param tuple
 *            The tuple whose key buffer is decoded.
 * @param keyOnly
 *            When <code>true</code> a {@link ReadOnlyTermDocKey} is
 *            returned; otherwise a {@link ReadOnlyTermDocRecord} with a
 *            <code>null</code> token is returned.
 *
 * @return The decoded entry. Its fieldId is <code>-1</code> when fields are
 *         not enabled.
 */
protected ITermDocKey<V> deserialize(final ITuple tuple,
        final boolean keyOnly) {

    final ByteArrayBuffer kbuf = tuple.getKeyBuffer();

    /*
     * The byte offset of the docId in the key.
     *
     * Note: Everything before this offset is the unicode sort key for the
     * term followed by the compact term weight, whose final byte sits
     * directly in front of the docId.
     */
    final int docIdOffset = kbuf.limit() - Bytes.SIZEOF_LONG /* docId */
            - (fieldsEnabled ? Bytes.SIZEOF_INT/* fieldId */: 0);

    // The docId is stored as a long; V is expected to be Long-compatible.
    @SuppressWarnings("unchecked")
    final V docId = (V) (Object) Long.valueOf(KeyBuilder.decodeLong(
            kbuf.array(), docIdOffset));

    /*
     * Decode field when present.
     *
     * The fieldId occupies the trailing SIZEOF_INT bytes of the key (the
     * same width that is subtracted when computing docIdOffset), so all
     * four bytes must be decoded. Decoding only a short here would read
     * just the two high-order bytes of the field.
     */
    final int fieldId;
    if (fieldsEnabled) {
        fieldId = KeyBuilder.decodeInt(kbuf.array(), kbuf.limit()
                - Bytes.SIZEOF_INT);
    } else {
        fieldId = -1;
    }

    final int termWeightOffset = docIdOffset - Bytes.SIZEOF_BYTE;

    final byte termWeightCompact = kbuf.getByte(termWeightOffset);

    /*
     * See: http://lucene.apache.org/core/5_1_0/core/org/apache/lucene/search/similarities/DefaultSimilarity.html
     *
     * For more information on the round-trip of normalized term weight.
     */
    final DefaultSimilarity similarity = new DefaultSimilarity();

    final double termWeight = similarity.decodeNormValue(termWeightCompact);

    if (keyOnly) {

        return new ReadOnlyTermDocKey(docId, fieldId, termWeight);

    }

    /*
     * Note: The term frequency and the full precision term weight are no
     * longer read from the tuple's value; only the compact weight embedded
     * in the key is reported.
     */
    return new ReadOnlyTermDocRecord<V>(null/* token */, docId, fieldId,
            /* termFreq, */ termWeight);

}

Source File: RDFFullTextIndexTupleSerializer.java From database with GNU General Public License v2.0

4 votes

/**
 * Decodes an index tuple into a term/document entry.
 * <p>
 * The byte length of the docId IV is unpacked (as a short) from the
 * tuple's value stream. That length locates the IV at the tail of the key,
 * and the single byte in front of the IV carries the compact term weight,
 * which is expanded with {@code DefaultSimilarity.decodeNormValue}.
 *
 * @param tuple
 *            The tuple to decode.
 * @param keyOnly
 *            When <code>true</code> a {@link ReadOnlyTermDocKey} is
 *            returned; otherwise a {@link ReadOnlyTermDocRecord} with a
 *            <code>null</code> token is returned.
 *
 * @return The decoded entry; the field is always <code>NO_FIELD</code>.
 *
 * @throws RuntimeException
 *             wrapping any {@link IOException} raised while reading the
 *             value stream.
 */
protected ITermDocKey deserialize(final ITuple tuple, final boolean keyOnly) {

    final ByteArrayBuffer keyBuffer = tuple.getKeyBuffer();

    // The byte length of the docId IV, as recorded in the tuple's value.
    final int ivByteLength;
    try {
        final DataInput valueInput = (DataInput) tuple.getValueStream();
        ivByteLength = ShortPacker.unpackShort(valueInput);
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }

    // The IV occupies the last ivByteLength bytes of the key.
    final int ivOffset = keyBuffer.limit() - ivByteLength;
    final IV docId = (IV) IVUtility.decodeFromOffset(keyBuffer.array(),
            ivOffset);

    // The compact term weight is the byte directly in front of the IV.
    final byte packedWeight = keyBuffer.getByte(ivOffset
            - Bytes.SIZEOF_BYTE);

    /*
     * See: http://lucene.apache.org/core/old_versioned_docs/versions/3_0_2/api/all/org/apache/lucene/search/Similarity.html
     *
     * For more information on the round-trip of normalized term weight.
     */
    final double termWeight = new DefaultSimilarity()
            .decodeNormValue(packedWeight);

    /*
     * Note: The term frequency and the full precision term weight are no
     * longer read from the tuple's value; only the compact weight embedded
     * in the key is reported.
     */
    if (!keyOnly) {
        return new ReadOnlyTermDocRecord(null/* token */, docId, NO_FIELD,
                /* termFreq, */ termWeight);
    }

    return new ReadOnlyTermDocKey(docId, NO_FIELD, termWeight);

}

Source File: FullTextIndexTupleSerializer.java From database with GNU General Public License v2.0

2 votes

/**
 * Builds the key for a term/document entry.
 * <p>
 * The key is assembled in this order: the unicode sort key of the token
 * text, the compact (norm-encoded) term weight appended as a
 * <code>long</code>, the docId and, when <code>fieldsEnabled</code>, the
 * fieldId.
 *
 * @param obj
 *            The entry to serialize; it is cast to {@link ITermDocKey}.
 *
 * @return The generated key.
 */
@Override
public byte[] serializeKey(final Object obj) {

    @SuppressWarnings("unchecked")
    final ITermDocKey<V> termDoc = (ITermDocKey<V>) obj;

    final String token = termDoc.getToken();

    /*
     * See: http://lucene.apache.org/core/5_1_0/core/org/apache/lucene/search/similarities/DefaultSimilarity.html
     *
     * For more information on the round-trip of normalized term weight.
     */
    final long packedWeight = new DefaultSimilarity()
            .encodeNormValue((float) termDoc.getLocalTermWeight());

    final V docId = termDoc.getDocId();

    final IKeyBuilder builder = getKeyBuilder();
    builder.reset();

    // the token text (or its successor as desired).
    builder.appendText(token, true/* unicode */, false/* successor */);

    // the compact term weight, then the document identifier.
    builder.append(packedWeight);
    builder.append((V) docId);

    // the field identifier is only part of the key when fields are enabled.
    if (fieldsEnabled) {
        builder.append(termDoc.getFieldId());
    }

    final byte[] key = builder.getKey();

    if (log.isDebugEnabled()) {
        final String fieldSuffix = fieldsEnabled ? "," + termDoc.getFieldId() : "";
        log.debug("{" + token + "," + docId + fieldSuffix + "}, key="
                + BytesUtil.toString(key));
    }

    return key;

}

Source File: RDFFullTextIndexTupleSerializer.java From database with GNU General Public License v2.0

2 votes

/**
 * Builds the key for a term/document entry.
 * <p>
 * The key is assembled in this order: the unicode sort key of the token
 * text, the compact (norm-encoded) term weight appended as a
 * <code>long</code>, and the docId IV as written by
 * {@link IVUtility#encode}.
 *
 * @param obj
 *            The entry to serialize; it is cast to {@link ITermDocKey}.
 *
 * @return The generated key.
 */
@Override
public byte[] serializeKey(final Object obj) {

    final ITermDocKey termDoc = (ITermDocKey) obj;

    final String token = termDoc.getToken();

    /*
     * See: http://lucene.apache.org/core/old_versioned_docs/versions/3_0_2/api/all/org/apache/lucene/search/Similarity.html
     *
     * For more information on the round-trip of normalized term weight.
     */
    final long packedWeight = new DefaultSimilarity()
            .encodeNormValue((float) termDoc.getLocalTermWeight());

    final IV docId = (IV) termDoc.getDocId();

    final IKeyBuilder builder = getKeyBuilder();
    builder.reset();

    // the token text (or its successor as desired).
    builder.appendText(token, true/* unicode */, false/* successor */);

    // the compact term weight, then the document identifier (IV).
    builder.append(packedWeight);
    IVUtility.encode(builder, docId);

    final byte[] key = builder.getKey();

    if (log.isDebugEnabled()) {
        final String keyText = BytesUtil.toString(key);
        log.debug("{" + token + "," + docId + "}, key=" + keyText);
    }

    return key;

}

org.apache.lucene.search.similarities.DefaultSimilarity Java Examples