Java Code Examples for org.apache.lucene.document.Document#get()
The following examples show how to use
org.apache.lucene.document.Document#get() .
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: LuceneResultSet.java From orientdb-lucene with Apache License 2.0 | 6 votes |
/**
 * Returns the next matching record identifier from the Lucene result set,
 * fetching a further page of ScoreDocs once the current page is exhausted.
 */
@Override public OIdentifiable next() {
    // Current page fully consumed: rewind the page cursor and pull more hits.
    if (localIndex == array.length) {
        localIndex = 0;
        fetchMoreResult();
    }
    final ScoreDoc score = array[localIndex++];
    Document ret = null;
    OContextualRecordId res = null;
    try {
        // Load the stored Lucene document and read the record id (RID) field it carries.
        ret = queryContext.searcher.doc(score.doc);
        String rId = ret.get(OLuceneIndexManagerAbstract.RID);
        res = new OContextualRecordId(rId);
        // Callback lets the manager attach score/context metadata to the result.
        manager.onRecordAddedToResultSet(queryContext, res, ret, score);
    } catch (IOException e) {
        // NOTE(review): the IOException is swallowed and next() then returns null —
        // callers receive a null element instead of an error; consider rethrowing.
        e.printStackTrace();
    }
    index++;
    return res;
}
Example 2
Source File: LtrQueryTests.java From elasticsearch-learning-to-rank with Apache License 2.0 | 6 votes |
/**
 * Asserts that the score the ranker query produced for one hit matches the
 * expected model score, and (for non-TF/IDF similarities) that explain()
 * agrees with score() within a few ULPs.
 */
private void assertScoresMatch(List<PrebuiltFeature> features, float[] scores, RankerQuery ltrQuery, ScoreDoc scoreDoc) throws IOException {
    Document d = searcherUnderTest.doc(scoreDoc.doc);
    // The stored "id" field doubles as the index into the expected scores array.
    String idVal = d.get("id");
    int docId = Integer.decode(idVal);
    float modelScore = scores[docId];
    float queryScore = scoreDoc.score;
    // Tolerance is a small multiple of the ULP at the expected value.
    assertEquals("Scores match with similarity " + similarity.getClass(), modelScore, queryScore, SCORE_NB_ULP_PREC *Math.ulp(modelScore));
    if (!(similarity instanceof TFIDFSimilarity)) {
        // There are precision issues with these similarities when using explain
        // It produces 0.56103003 for feat:0 in doc1 using score() but 0.5610301 using explain
        Explanation expl = searcherUnderTest.explain(ltrQuery, docId);
        assertEquals("Explain scores match with similarity " + similarity.getClass(), expl.getValue().floatValue(), queryScore, 5 * Math.ulp(modelScore));
        checkFeatureNames(expl, features);
    }
}
Example 3
Source File: ExampleStatsApp.java From lucene4ir with Apache License 2.0 | 6 votes |
public void iterateThroughDocList() throws IOException { int n = reader.maxDoc(); if (n>100) { n = 100; } for (int i = 0; i < n; i++) { Document doc = reader.document(i); // the doc.get pulls out the values stored - ONLY if you store the fields String docnum = doc.get("docnum"); String title = doc.get("title"); System.out.println("ID: " + i); System.out.println("docnum and title: " + docnum + " " + title); //System.out.println(doc.get("content")); iterateThroughDocTermVector(i); } }
Example 4
Source File: ExampleStatsApp.java From lucene4ir with Apache License 2.0 | 6 votes |
public void countFieldData() throws IOException { int n = reader.maxDoc(); int nt = 0; int nc = 0; for (int i = 0; i < n; i++) { Document doc = reader.document(i); // the doc.get pulls out the values stored - ONLY if you store the fields String title = doc.get(Lucene4IRConstants.FIELD_TITLE); String content = doc.get(Lucene4IRConstants.FIELD_CONTENT); if (title.length()>0){ nt++; } if (content.length()>0){ nc++; } } System.out.println("Num Docs: " +n + " Docs with Title text: " + nt + " Docs with Contents text: "+ nc); }
Example 5
Source File: SearchEngineIndexer.java From gravitee-management-rest-api with Apache License 2.0 | 6 votes |
/**
 * Removes a document from the Lucene index, matching on both its id and type
 * stored fields.
 *
 * @param document the indexed document whose id/type identify what to delete
 * @throws TechnicalException if the underlying index delete fails
 */
public void remove(Document document) throws TechnicalException {
    String type = document.get(TYPE_FIELD);
    String id = document.get(ID_FIELD);
    logger.debug("Removing document type[{}] ID[{}]", type, id);
    // Both clauses are MUST so only the exact (id, type) pair is deleted.
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    bq.add(new TermQuery(new Term(ID_FIELD, id)), BooleanClause.Occur.MUST);
    bq.add(new TermQuery(new Term(TYPE_FIELD, type)), BooleanClause.Occur.MUST);
    try {
        writer.deleteDocuments(bq.build());
    } catch (IOException ioe) {
        // BUG FIX: the original messages said "Fail to index" — copy/paste from
        // the index() method. This is a delete operation.
        logger.error("Fail to delete document with ID: {}", id, ioe);
        throw new TechnicalException("Fail to delete document with ID: " + id, ioe);
    }
}
Example 6
Source File: TestFuzzyQuery.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testSingleQueryExactMatchScoresHighest() throws Exception { //See issue LUCENE-329 - IDF shouldn't wreck similarity ranking Directory directory = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), directory); addDoc("smith", writer); addDoc("smith", writer); addDoc("smith", writer); addDoc("smith", writer); addDoc("smith", writer); addDoc("smith", writer); addDoc("smythe", writer); addDoc("smdssasd", writer); IndexReader reader = writer.getReader(); IndexSearcher searcher = newSearcher(reader); searcher.setSimilarity(new ClassicSimilarity()); //avoid randomisation of similarity algo by test framework writer.close(); String searchTerms[] = { "smith", "smythe", "smdssasd" }; for (String searchTerm : searchTerms) { FuzzyQuery query = new FuzzyQuery(new Term("field", searchTerm), 2, 1); ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs; Document bestDoc = searcher.doc(hits[0].doc); assertTrue(hits.length > 0); String topMatch = bestDoc.get("field"); assertEquals(searchTerm, topMatch); if (hits.length > 1) { Document worstDoc = searcher.doc(hits[hits.length - 1].doc); String worstMatch = worstDoc.get("field"); assertNotSame(searchTerm, worstMatch); } } reader.close(); directory.close(); }
Example 7
Source File: OlatFullIndexer.java From olat with Apache License 2.0 | 5 votes |
/**
 * Increments the per-document-type counter for the given indexed document.
 *
 * @param document the indexed document whose type field keys the counter map
 */
private void incrementDocumentTypeCounter(final Document document) {
    final String documentType = document.get(AbstractOlatDocument.DOCUMENTTYPE_FIELD_NAME);
    int intValue = 0;
    // Single lookup instead of containsKey()+get(); a null value means "not counted yet".
    final Integer docCounter = documentCounters.get(documentType);
    if (docCounter != null) {
        intValue = docCounter.intValue();
    }
    intValue++;
    // Integer.valueOf uses the cache; the deprecated new Integer(...) always allocated.
    documentCounters.put(documentType, Integer.valueOf(intValue));
}
Example 8
Source File: SearchResultsImpl.java From olat with Apache License 2.0 | 5 votes |
/**
 * Creates a result document for one search hit. Returns null if the identity
 * does not have sufficient privileges to see the document.
 *
 * @param doc the matching Lucene document
 * @param pos the hit's position in the result list
 * @param query the executed query (used for highlighting)
 * @param analyzer analyzer used for highlighting
 * @param doHighlight whether to compute highlighted excerpts
 * @param identity the searching user
 * @param roles the searching user's roles
 * @return the result document, or null when access is denied
 * @throws IOException if highlighting fails reading the index
 */
private ResultDocument createResultDocument(final Document doc, final int pos, final Query query, final Analyzer analyzer, final boolean doHighlight, final Identity identity, final Roles roles) throws IOException {
    boolean hasAccess = false;
    if (roles.isOLATAdmin()) {
        // Admins see everything — skip the per-resource access check.
        hasAccess = true;
    } else {
        String resourceUrl = doc.get(AbstractOlatDocument.RESOURCEURL_FIELD_NAME);
        if (resourceUrl == null) {
            resourceUrl = "";
        }
        // Rebuild the business path stored with the document and ask the main
        // indexer whether this identity/role combination may access it.
        final BusinessControl businessControl = BusinessControlFactory.getInstance().createFromString(resourceUrl);
        hasAccess = mainIndexer.checkAccess(null, businessControl, identity, roles);
    }
    ResultDocument resultDoc;
    if (hasAccess) {
        resultDoc = new ResultDocument(doc, pos);
        if (doHighlight) {
            doHighlight(query, analyzer, doc, resultDoc);
        }
    } else {
        resultDoc = null;
    }
    return resultDoc;
}
Example 9
Source File: TripleIndexContext.java From AGDISTIS with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Runs the given boolean query against the context index and returns at most
 * the top 500 triples (context, URI, URI count), sorted.
 *
 * @param maxNumberOfResults upper bound passed to the searcher
 * @param bq the query to execute
 * @return up to 500 sorted triples; empty list when nothing matched
 * @throws IOException if the index cannot be read
 */
private List<Triple> getFromIndex(int maxNumberOfResults, BooleanQuery bq) throws IOException {
    log.debug("\t start asking index by context...");
    ScoreDoc[] hits = isearcher.search(bq, null, maxNumberOfResults).scoreDocs;
    if (hits.length == 0) {
        return new ArrayList<Triple>();
    }
    List<Triple> triples = new ArrayList<Triple>();
    for (int i = 0; i < hits.length; i++) {
        Document hitDoc = isearcher.doc(hits[i].doc);
        // Stored fields map onto the triple as: s = context, p = URI, o = URI count.
        String s = hitDoc.get(FIELD_NAME_CONTEXT);
        String p = hitDoc.get(FIELD_NAME_URI);
        String o = hitDoc.get(FIELD_NAME_URI_COUNT);
        triples.add(new Triple(s, p, o));
    }
    log.debug("\t finished asking index...");
    Collections.sort(triples);
    // Cap at the 500 best triples. The original's two branches were equivalent
    // to this single expression (subList(0, size()) is the whole list).
    return triples.subList(0, Math.min(triples.size(), 500));
}
Example 10
Source File: CourseServiceImpl.java From TinyMooc with Apache License 2.0 | 5 votes |
public List<Course> getCourses(String query) { try { List<Course> qlist = new ArrayList<Course>(); IndexSearcher indexSearcher = new IndexSearcher(INDEXPATH); long begin = new Date().getTime(); //下面的是进行title,content 两个范围内进行收索. SHOULD 表示OR BooleanClause.Occur[] clauses = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD}; Query queryOBJ = MultiFieldQueryParser.parse(query, new String[]{"courseIntro", "courseTitle"}, clauses, new StandardAnalyzer());//parser.parse(query); Filter filter = null; //################# 搜索相似度最高的记录 ################### TopDocs topDocs = indexSearcher.search(queryOBJ, filter, 1000); Course course = null; //输出结果 for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document targetDoc = indexSearcher.doc(scoreDoc.doc); course = new Course(); String courseIntro = targetDoc.get("courseIntro"); String courseTitle = targetDoc.get("courseTitle"); String courseId = targetDoc.get("courseId"); TokenStream contentTokenStream = analyzer.tokenStream("courseIntro", new StringReader(courseIntro)); TokenStream titleTokenStream = analyzer.tokenStream("courseTitle", new StringReader(courseTitle)); course.setCourseIntro(courseIntro); course.setCourseTitle(courseTitle); course.setCourseId(courseId); course.setType(targetDoc.get("type")); course.setCourseState(targetDoc.get("courseState")); qlist.add(course); } indexSearcher.close(); return qlist; } catch (Exception e) { logger.error("getCourses error."); return null; } }
Example 11
Source File: ExampleStatsApp.java From lucene4ir with Apache License 2.0 | 5 votes |
public void iterateThroughDocListAll() throws IOException { int n = reader.maxDoc(); for (int i = 0; i < n; i++) { Document doc = reader.document(i); // the doc.get pulls out the values stored - ONLY if you store the fields String docnum = doc.get("docnum"); String all = doc.get(Lucene4IRConstants.FIELD_ALL).trim(); if (all.length() == 0) { System.out.println("docnum: " + docnum); } } }
Example 12
Source File: OperatorGlobalSearchGUIProvider.java From rapidminer-studio with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Builds drag-and-drop support for the operator referenced by the given search
 * document, or null when the document has no unique id or the operator cannot
 * be instantiated.
 */
@Override
public DragGestureListener getDragAndDropSupport(final Document document) {
    DragGestureListener gesture = null;
    final String operatorKey = document.get(GlobalSearchUtilities.FIELD_UNIQUE_ID);
    if (operatorKey != null) {
        try {
            gesture = new OperatorDragGesture(OperatorService.getOperatorDescription(operatorKey).createOperatorInstance());
        } catch (OperatorCreationException creationFailure) {
            // Operator could not be instantiated — no drag support for this entry.
            gesture = null;
        }
    } else {
        LogService.getRoot().log(Level.WARNING, "com.rapidminer.gui.processeditor.global_search.OperatorSearchManager.error.no_key");
    }
    return gesture;
}
Example 13
Source File: SearchEngineIndexer.java From gravitee-management-rest-api with Apache License 2.0 | 5 votes |
/**
 * Indexes (creates or replaces) the given document in the Lucene index and
 * commits the change.
 *
 * @param document the document to write; its id field identifies the entry
 * @return the writer sequence number of the update
 * @throws TechnicalException if the underlying index write fails
 */
public long index(Document document) throws TechnicalException {
    logger.debug("Updating a document into the Lucene index");
    final String id = document.get(ID_FIELD);
    try {
        // updateDocument atomically deletes any existing document matching the
        // id term and adds the new version.
        final long sequenceNumber = writer.updateDocument(new Term(ID_FIELD, id), document);
        writer.commit();
        return sequenceNumber;
    } catch (IOException ioe) {
        logger.error("Fail to index document with ID: {}", id, ioe);
        throw new TechnicalException("Fail to index document with ID: " + id, ioe);
    }
}
Example 14
Source File: NGramTestSetup.java From uyuni with GNU General Public License v2.0 | 5 votes |
/**
 * Logs each hit's rank, score, and stored name/description fields.
 *
 * @param hits the search hits to display
 * @throws IOException if reading a hit's document fails
 */
protected void displayHits(Hits hits) throws IOException {
    final int hitCount = hits.length();
    for (int rank = 0; rank < hitCount; rank++) {
        final Document doc = hits.doc(rank);
        final String name = doc.get("name");
        final String description = doc.get("description");
        log.info("Hit<" + rank + "> Score< " + hits.score(rank) + "> name = <" + name + "> description = <" + description + ">");
    }
}
Example 15
Source File: DocumentBuilder.java From modernmt with Apache License 2.0 | 4 votes |
/**
 * Reads the stored document-id field from a Lucene document.
 *
 * @param self the document to read from
 * @return the stored id value, or null if the field was not stored
 */
public static String getId(Document self) {
    final String documentId = self.get(DOC_ID_FIELD);
    return documentId;
}
Example 16
Source File: DumpTermsApp.java From lucene4ir with Apache License 2.0 | 4 votes |
public void extractBigramsFromStoredText() throws IOException { HashMap<String, Integer> hmap = new HashMap<String, Integer>(); int n = reader.maxDoc(); for (int i = 0; i < n; i++) { Document doc = reader.document(i); String all = doc.get(lucene4ir.Lucene4IRConstants.FIELD_ALL); Analyzer a = new StandardAnalyzer(); TokenStream ts = a.tokenStream(null, all); ts.reset(); String w1 = ""; String w2 = ""; while (ts.incrementToken()) { w1 = w2; w2 = ts.getAttribute(CharTermAttribute.class).toString(); if (w1 != "") { //System.out.println(w1 + " " + w2); String key = w1 + " " + w2; if (hmap.containsKey(key)==true) { int v = hmap.get(key); hmap.put(key,v+1); } else { hmap.put(key, 1); } } } } Set set = hmap.entrySet(); Iterator iterator = set.iterator(); while(iterator.hasNext()) { Map.Entry me = (Map.Entry)iterator.next(); if ((int)me.getValue() > 2) { System.out.print(me.getKey() + ": "); System.out.println(me.getValue()); } } }
Example 17
Source File: LumongoSegment.java From lumongo with Apache License 2.0 | 4 votes |
/**
 * Builds a ScoredResult for the i-th search hit: loads the appropriate stored
 * fields for the requested fetch type, optionally attaches the stored
 * document (with highlighting/analysis), and fills in score, id, timestamp,
 * and segment metadata.
 *
 * @throws Exception when highlighting or analysis is requested without a FULL fetch,
 *         or when reading/processing the document fails
 */
private ScoredResult.Builder handleDocResult(IndexSearcher is, SortRequest sortRequest, boolean sorting, ScoreDoc[] results, int i, FetchType resultFetchType, List<String> fieldsToReturn, List<String> fieldsToMask, List<LumongoHighlighter> highlighterList, List<AnalysisHandler> analysisHandlerList) throws Exception {
    int docId = results[i].doc;
    // Choose which stored fields to load: full document, metadata, or the default set.
    Set<String> fieldsToFetch = fetchSet;
    if (indexConfig.getIndexSettings().getStoreDocumentInIndex()) {
        if (FetchType.FULL.equals(resultFetchType)) {
            fieldsToFetch = fetchSetWithDocument;
        } else if (FetchType.META.equals(resultFetchType)) {
            fieldsToFetch = fetchSetWithMeta;
        }
    }
    Document d = is.doc(docId, fieldsToFetch);
    IndexableField f = d.getField(LumongoConstants.TIMESTAMP_FIELD);
    long timestamp = f.numericValue().longValue();
    ScoredResult.Builder srBuilder = ScoredResult.newBuilder();
    String uniqueId = d.get(LumongoConstants.ID_FIELD);
    // Highlighting and analysis both need the complete stored document.
    if (!highlighterList.isEmpty() && !FetchType.FULL.equals(resultFetchType)) {
        throw new Exception("Highlighting requires a full fetch of the document");
    }
    if (!analysisHandlerList.isEmpty() && !FetchType.FULL.equals(resultFetchType)) {
        throw new Exception("Analysis requires a full fetch of the document");
    }
    if (!FetchType.NONE.equals(resultFetchType)) {
        handleStoredDoc(srBuilder, uniqueId, d, resultFetchType, fieldsToReturn, fieldsToMask, highlighterList, analysisHandlerList);
    }
    srBuilder.setScore(results[i].score);
    srBuilder.setUniqueId(uniqueId);
    srBuilder.setTimestamp(timestamp);
    srBuilder.setDocId(docId);
    srBuilder.setSegment(segmentNumber);
    srBuilder.setIndexName(indexName);
    srBuilder.setResultIndex(i);
    if (sorting) {
        handleSortValues(sortRequest, results[i], srBuilder);
    }
    return srBuilder;
}
Example 18
Source File: MtasDocumentIndex.java From inception with Apache License 2.0 | 4 votes |
/**
 * Counts, per index segment, the span-query matches that are visible to the
 * requesting user: live documents only, optionally limited to one source
 * document, preferring annotation documents over their source documents, and
 * excluding other users' annotation documents.
 *
 * @return the number of matching span positions, or -1 when processing a
 *         segment failed
 */
private long doCountResults(IndexSearcher searcher, SearchQueryRequest aRequest, MtasSpanQuery q) throws IOException {
    ListIterator<LeafReaderContext> leafReaderContextIterator = searcher.getIndexReader().leaves().listIterator();
    Map<Long, Long> annotatableDocuments = listAnnotatableDocuments(aRequest.getProject(), aRequest.getUser());
    final float boost = 0;
    SpanWeight spanweight = q.rewrite(searcher.getIndexReader()).createWeight(searcher, false, boost);
    long numResults = 0;
    while (leafReaderContextIterator.hasNext()) {
        LeafReaderContext leafReaderContext = leafReaderContextIterator.next();
        try {
            Spans spans = spanweight.getSpans(leafReaderContext, SpanWeight.Postings.POSITIONS);
            SegmentReader segmentReader = (SegmentReader) leafReaderContext.reader();
            if (spans != null) {
                while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
                    // Skip deleted documents: either the segment has no deletions
                    // at all, or this particular doc must still be live.
                    if (segmentReader.numDocs() == segmentReader.maxDoc() || segmentReader.getLiveDocs().get(spans.docID())) {
                        Document document = segmentReader.document(spans.docID());
                        // Retrieve the owning user and document ids stored with the hit.
                        String user = document.get(FIELD_USER);
                        String rawSourceDocumentId = document.get(FIELD_SOURCE_DOCUMENT_ID);
                        String rawAnnotationDocumentId = document.get(FIELD_ANNOTATION_DOCUMENT_ID);
                        if (rawSourceDocumentId == null || rawAnnotationDocumentId == null) {
                            log.trace("Indexed document lacks source/annotation document IDs" + " - source: {}, annotation: {}", rawSourceDocumentId, rawAnnotationDocumentId);
                            continue;
                        }
                        long sourceDocumentId = Long.valueOf(rawSourceDocumentId);
                        long annotationDocumentId = Long.valueOf(rawAnnotationDocumentId);
                        // If the query is limited to a given document, skip any
                        // results which are not in the given document.
                        Optional<SourceDocument> limitedToDocument = aRequest.getLimitedToDocument();
                        if (limitedToDocument.isPresent() && !Objects.equals(limitedToDocument.get().getId(), sourceDocumentId)) {
                            log.trace("Query limited to document {}, skipping results for " + "document {}", limitedToDocument.get().getId(), sourceDocumentId);
                            continue;
                        }
                        if (annotatableDocuments.containsKey(sourceDocumentId) && annotationDocumentId == -1) {
                            // Exclude result if the retrieved document is a source
                            // document (annotationDocument == -1) AND it has a
                            // corresponding annotation document for this user.
                            log.trace("Skipping results from indexed source document {} in" + "favor of results from the corresponding annotation " + "document", sourceDocumentId);
                            continue;
                        } else if (annotationDocumentId != -1 && !aRequest.getUser().getUsername().equals(user)) {
                            // Exclude annotation documents belonging to other users.
                            log.trace("Skipping results from annotation document for user {} " + "which does not match the requested user {}", user, aRequest.getUser().getUsername());
                            continue;
                        }
                        // Each start position within the doc is one result.
                        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
                            numResults++;
                        }
                    }
                }
            }
        } catch (Exception e) {
            // NOTE(review): the loop continues after setting -1, so a later
            // segment can increment from -1 and mask the failure — confirm intent.
            log.error("Unable to process query results", e);
            numResults = -1;
        }
    }
    return numResults;
}
Example 19
Source File: ExampleStatsApp.java From lucene4ir with Apache License 2.0 | 4 votes |
public void extractBigramsFromStoredText() throws IOException { HashMap<String, Integer> hmap = new HashMap<String, Integer>(); int n = reader.maxDoc(); for (int i = 0; i < n; i++) { Document doc = reader.document(i); String all = doc.get(Lucene4IRConstants.FIELD_ALL); //String[] words = all.split(" "); //for(String w: words ){ // System.out.println(w); //} // int n = words.length; // for (int i=1; i<n; i++){ // System.out.println(words[i-1].toLowerCase().trim() + " " + words[i].toLowerCase().trim()); // } Analyzer a = new StandardAnalyzer(); TokenStream ts = a.tokenStream(null, all); ts.reset(); String w1 = ""; String w2 = ""; while (ts.incrementToken()) { w1 = w2; w2 = ts.getAttribute(CharTermAttribute.class).toString(); if (w1 != "") { //System.out.println(w1 + " " + w2); String key = w1 + " " + w2; if (hmap.containsKey(key)==true) { int v = hmap.get(key); hmap.put(key,v+1); } else { hmap.put(key, 1); } } } } Set set = hmap.entrySet(); Iterator iterator = set.iterator(); while(iterator.hasNext()) { Map.Entry me = (Map.Entry)iterator.next(); if ((int)me.getValue() > 2) { System.out.print(me.getKey() + ": "); System.out.println(me.getValue()); } } }
Example 20
Source File: RetrievalAppQueryExpansion.java From lucene4ir with Apache License 2.0 | 4 votes |
/**
 * Reads the query file (one "qno term term ..." per line), runs each query,
 * and writes the top results in TREC run format to the result file.
 */
public void processQueryFile() {
    /*
      Assumes the query file contains a qno followed by the query terms.
      One query per line, e.g.:
        Q1 hello world
        Q2 hello hello
        Q3 hello
    */
    // try-with-resources closes both streams on every path; the original
    // leaked the reader if the FileWriter constructor threw, because the
    // close() calls sat in a finally attached only to the inner try.
    try (BufferedReader br = new BufferedReader(new FileReader(p.queryFile));
         FileWriter fw = new FileWriter(new File(p.resultFile))) {
        String line = br.readLine();
        while (line != null) {
            String[] parts = line.split(" ");
            String qno = parts[0];
            // StringBuilder avoids repeated string reallocation in the loop.
            StringBuilder queryTerms = new StringBuilder();
            for (int i = 1; i < parts.length; i++) {
                queryTerms.append(" ").append(parts[i]);
            }
            ScoreDoc[] scored = runQuery(qno, queryTerms.toString());
            int n = Math.min(p.maxResults, scored.length);
            for (int i = 0; i < n; i++) {
                Document doc = searcher.doc(scored[i].doc);
                String docno = doc.get("docnum");
                // NOTE(review): standard TREC run format uses "Q0", not "QO" —
                // output kept as-is for compatibility; confirm before changing.
                fw.write(qno + " QO " + docno + " " + (i + 1) + " " + scored[i].score + " " + p.runTag);
                fw.write(System.lineSeparator());
            }
            line = br.readLine();
        }
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}