org.apache.lucene.analysis.KeywordAnalyzer Java Examples

Source File: DocumentUtil.java From netbeans with Apache License 2.0

5 votes

/**
 * Creates the analyzer used for index documents.
 * A {@link KeywordAnalyzer} is handed to the {@link PerFieldAnalyzerWrapper}
 * constructor and dedicated analyzers are registered for the identifier fields.
 * @return the configured per field analyzer
 */
public static Analyzer createAnalyzer() {
    final KeywordAnalyzer fallback = new KeywordAnalyzer();
    final PerFieldAnalyzerWrapper perField = new PerFieldAnalyzerWrapper(fallback);
    // Each identifier field gets its own WhitespaceAnalyzer instance.
    perField.addAnalyzer(DocumentUtil.FIELD_IDENTS, new WhitespaceAnalyzer());
    perField.addAnalyzer(DocumentUtil.FIELD_FEATURE_IDENTS, new WhitespaceAnalyzer());
    // The case insensitive field uses the DocumentUtil specific analyzer.
    perField.addAnalyzer(
            DocumentUtil.FIELD_CASE_INSENSITIVE_FEATURE_IDENTS,
            new DocumentUtil.LCWhitespaceAnalyzer());
    return perField;
}

Source File: IndexManager.java From netbeans with Apache License 2.0

5 votes

/**
 * Creates a transactional document based index backed by a transactional
 * index created in the given folder with a {@link KeywordAnalyzer}.
 * The returned {@link DocumentIndex} is not cached, every call (even with the same arguments)
 * returns a different instance. Caching of the returned {@link DocumentIndex} is up to the caller.
 * @param cacheFolder the folder in which the index should be stored, must not be null
 * @param cache the document caching provider, must not be null
 * @return the document based index
 * @throws IOException if thrown while creating the underlying index or its document wrapper
 * @since 2.19
 */
@NonNull
public static DocumentIndex.Transactional createTransactionalDocumentIndex (
        final @NonNull File cacheFolder,
        final @NonNull DocumentIndexCache cache) throws IOException {
    // Validate the arguments before anything gets created.
    Parameters.notNull("cacheFolder", cacheFolder);     //NOI18N
    Parameters.notNull("cache", cache);                 //NOI18N
    final KeywordAnalyzer analyzer = new KeywordAnalyzer();
    return createTransactionalDocumentIndex(
            createTransactionalIndex(cacheFolder, analyzer),
            cache);
}

Source File: IndexTransactionTest.java From netbeans with Apache License 2.0

5 votes

/**
 * Prepares a fresh Lucene index for a test: clears the work dir, creates the
 * "cache" folder inside it and opens a {@link LuceneIndex} on that folder.
 */
private void setupLuceneIndex() throws Exception {
    clearWorkDir();
    // The index lives in a "cache" sub folder of the (now cleared) work dir.
    cache = new File(getWorkDir(), "cache");
    cache.mkdirs();
    index = LuceneIndex.create(cache, new KeywordAnalyzer());
}

Source File: AsyncCloseTest.java From netbeans with Apache License 2.0

5 votes

/**
 * Runs a store on a background thread, closes the index from the test thread
 * and verifies that the background store finished without an exception.
 */
public void testAsyncClose() throws Exception {
    final AtomicReference<Exception> failure = new AtomicReference<Exception>();
    // resume: counted down by the test thread; notified: awaited by the test thread.
    // Both are handed to TestInsertConvertor - presumably it reports via "notified"
    // and blocks on "resume"; verify against TestInsertConvertor.
    final CountDownLatch resume = new CountDownLatch(1);
    final CountDownLatch notified = new CountDownLatch(1);
    final CountDownLatch finished = new CountDownLatch(1);

    final Index idx = IndexManager.createTransactionalIndex(indexFolder, new KeywordAnalyzer());
    final Runnable storeTask = new Runnable() {
        @Override
        public void run() {
            try {
                idx.store(
                   new ArrayList<String>(Arrays.asList("foo")), //NOI18N
                   Collections.<String>emptySet(),
                   new TestInsertConvertor(resume, notified),
                   new TestDeleteConvertor(),
                   true);
            } catch (Exception ex) {
                // Remember the failure so the test thread can assert on it.
                failure.set(ex);
            } finally {
                finished.countDown();
            }
        }
    };
    final Thread storeThread = new Thread(storeTask);
    storeThread.start();

    notified.await();
    resume.countDown();
    idx.close();
    // Wait for the background store to end before checking its outcome.
    finished.await();
    assertNull(failure.get());
}

Source File: LayeredDocumentIndex.java From netbeans with Apache License 2.0

5 votes

/**
 * Returns the overlay index, creating it on first use as a document index
 * on top of a memory index with a {@link KeywordAnalyzer}.
 * @return the lazily created overlay
 * @throws IOException if thrown while creating the overlay index
 */
@NonNull
private synchronized DocumentIndex2 getOverlay() throws IOException {
    if (overlay != null) {
        // Already created by an earlier call.
        return overlay;
    }
    overlay = (DocumentIndex2) IndexManager.createDocumentIndex(
            IndexManager.createMemoryIndex(new KeywordAnalyzer()));
    return overlay;
}

Source File: IndexBuilder.java From exhibitor with Apache License 2.0

5 votes

/**
 * Opens the index writer on {@code directory}, creating the directory first
 * when it does not exist yet. The writer is configured with
 * {@link IndexWriterConfig.OpenMode#CREATE}.
 *
 * @throws Exception an {@link IOException} when the directory cannot be created,
 *                   or whatever the directory/writer construction throws
 */
public void open() throws Exception
{
    // mkdirs() is attempted only when the directory is missing (short-circuit).
    boolean     directoryAvailable = directory.exists() || directory.mkdirs();
    if ( !directoryAvailable )
    {
        throw new IOException("Could not make: " + directory);
    }

    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, new KeywordAnalyzer());
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

    niofsDirectory = new NIOFSDirectory(directory, new SingleInstanceLockFactory());
    writer = new IndexWriter(niofsDirectory, conf);
}

Source File: LuceneIndexTest.java From netbeans with Apache License 2.0

4 votes

/**
 * Walks a {@link LuceneIndex} through several states and checks the status
 * reported by {@code getStatus(true)} in each of them: freshly created (EMPTY),
 * after a store (VALID), after {@code createLock(index)} (INVALID), after another
 * store (VALID) and after overwriting one of the index files (INVALID).
 * For both INVALID states the test also expects the cache folder to contain no files.
 */
public void testIsValid() throws Exception {
    final File wd = getWorkDir();
    final File cache = new File(wd,"cache");
    cache.mkdirs();
    final LuceneIndex index = LuceneIndex.create(cache, new KeywordAnalyzer());
    //Empty index => status EMPTY
    assertEquals(Index.Status.EMPTY, index.getStatus(true));

    // clearValidityCache is a test helper defined elsewhere - presumably it resets
    // the status cached inside the index so the next getStatus call re-checks it.
    clearValidityCache(index);
    List<String> refs = new ArrayList<String>();
    refs.add("A");
    Set<String> toDel = new HashSet<String>();
    index.store(
            refs,
            toDel,
            new StrToDocConvertor("resources"),
            new StrToQueryCovertor("resource"),
            true);
    //Existing index => valid
    assertEquals(Index.Status.VALID, index.getStatus(true));
    // The store must have produced at least one file in the cache folder.
    assertTrue(cache.listFiles().length>0);

    clearValidityCache(index);
    createLock(index);
    //Index with orphan lock => invalid
    assertEquals(Index.Status.INVALID, index.getStatus(true));
    // After the INVALID status is reported the cache folder is expected to be empty.
    assertTrue(cache.listFiles().length==0);

    // Store again (now with "A" and "B") to get back to a valid index.
    refs.add("B");
    clearValidityCache(index);
    index.store(
            refs,
            toDel,
            new StrToDocConvertor("resources"),
            new StrToQueryCovertor("resource"),
            true);
    assertEquals(Index.Status.VALID, index.getStatus(true));
    assertTrue(cache.listFiles().length>0);

    //Broken index => invalid
    clearValidityCache(index);
    File bt = null;
    for (File file : cache.listFiles()) {
        // either compound file or fields information must be present
        if (file.getName().endsWith(".cfs") || file.getName().endsWith(".fnm")) {
            bt = file;
            break;
        }
    }
    assertNotNull(bt);
    // Corrupt the chosen file: opening the stream truncates it and ten zero bytes
    // are written as its new content.
    FileOutputStream out = new FileOutputStream(bt);
    try {
        out.write(new byte[] {0,0,0,0,0,0,0,0,0,0}, 0, 10);
    } finally {
        out.close();
    }
    assertEquals(Index.Status.INVALID, index.getStatus(true));
    assertTrue(cache.listFiles().length==0);
    
}

Source File: AsyncCloseTest.java From netbeans with Apache License 2.0

4 votes

/**
 * Stores "a", then starts a second store of "b" on a background thread and,
 * while that thread has signalled but has not yet been released, queries the index
 * from the test thread. The query is expected to see exactly the single key "a".
 * Finally the background store is released and must end without an exception.
 */
public void testConcurrentReadWrite() throws Exception {
    final Index idx = IndexManager.createTransactionalIndex(indexFolder, new KeywordAnalyzer());
    idx.store(
        new ArrayList<String>(Arrays.asList("a")), //NOI18N
        Collections.<String>emptySet(),
        new TestInsertConvertor(),
        new TestDeleteConvertor(),
        true);

    // resume: counted down by the test thread; notified: awaited by the test thread.
    // Presumably TestInsertConvertor counts down "notified" and waits on "resume";
    // verify against TestInsertConvertor.
    final CountDownLatch resume = new CountDownLatch(1);
    final CountDownLatch notified = new CountDownLatch(1);
    final CountDownLatch finished = new CountDownLatch(1);
    final AtomicReference<Exception> failure = new AtomicReference<Exception>();

    final Runnable storeTask = new Runnable() {
        @Override
        public void run() {
            try {
                idx.store(
                       new ArrayList<String>(Arrays.asList("b")), //NOI18N
                       Collections.<String>emptySet(),
                       new TestInsertConvertor(resume, notified),
                       new TestDeleteConvertor(),
                       true);
            } catch (Exception e) {
                // Keep the failure for the assertion on the test thread.
                failure.set(e);
            } finally {
                finished.countDown();
            }
        }
    };
    final Thread storeThread = new Thread(storeTask);

    storeThread.start();
    notified.await();

    // Read all keys while the second store has not been released yet.
    final Collection<String> keys = new ArrayList<String>();
    final Convertor<Document,String> toKey = new Convertor<Document,String>(){
        @Override
        public String convert(Document p) {
            return p.get(FLD_KEY);
        }
    };
    final AtomicBoolean cancel = new AtomicBoolean();
    final PrefixQuery everyKey = new PrefixQuery(new Term(FLD_KEY,"")); //NOI18N
    idx.query(
        keys,
        toKey,
        null,
        cancel,
        everyKey);
    assertEquals(1, keys.size());
    assertEquals("a", keys.iterator().next());  //NOI18N

    // Let the background store continue and wait for it to end.
    resume.countDown();
    finished.await();
    assertNull(failure.get());
}

Source File: AnchorIndexer.java From tagme with Apache License 2.0

4 votes

/**
 * Builds the anchor index in {@code workingDir}.
 * Every anchor delivered by the {@link AnchorIterator} is turned into one Lucene
 * document holding an id, the anchor text, the serialized {@link Anchor} object,
 * one FIELD_WID field per page of the anchor and one FIELD_ORIGINAL field per
 * original form. Anchors for which {@code Anchor.build} returns null are skipped.
 *
 * @param lang language passed to the anchor parser, the people dataset and the Wikipedia index
 * @param workingDir directory the new index is written to
 * @throws IOException propagated from the parsing/indexing calls
 */
@Override
	public void makeIndex(String lang, File workingDir) throws IOException
	{
		log.info("Loading support datasets...");
		
		File all_anchors = new WikipediaAnchorParser(lang).getFile();
		// Line count of the anchors file, used only as the end value of progress counter 0.
		long numAnchors = ExternalSortUtils.wcl(all_anchors);
		AnchorIterator iterator = new AnchorIterator(all_anchors);
		
		IntSet people = new PeopleWIDs(lang).getDataset();
		
//		IndexSearcher articles = Indexes.getSearcher(RepositoryDirs.WIKIPEDIA.getPath(lang));
		// NOTE(review): this searcher is never closed in this method - confirm who owns it.
		IndexSearcher articles = openWikipediaIndex(lang);
		//QueryParser queryParser = new QueryParser(Version.LUCENE_34, WikipediaIndexer.FIELD_BODY, new WhitespaceAnalyzer(Version.LUCENE_34));
		// StandardAnalyzer is given an empty stop word set.
		QueryParser queryParser = new QueryParser(Version.LUCENE_34, WikipediaIndexer.FIELD_BODY, new StandardAnalyzer(Version.LUCENE_34, new HashSet<String>()));
		
		// NOTE(review): index and iterator are closed only on the success path; an exception
		// inside the loop leaves both open.
		IndexWriter index = new IndexWriter(FSDirectory.open(workingDir.getAbsoluteFile()), new IndexWriterConfig(Version.LUCENE_34, new KeywordAnalyzer()));
		// One Document and its three fixed fields are reused for every anchor;
		// only the field values are replaced inside the loop.
		Document doc = new Document();
		Field fId = new Field(FIELD_ID, "", Store.YES, Index.NOT_ANALYZED);
		Field fText = new Field(FIELD_TEXT, "", Store.YES, Index.NOT_ANALYZED);
		Field fObject = new Field(FIELD_OBJECT, "", Store.YES, Index.NO);
		
		doc.add(fId);
		doc.add(fText);
		doc.add(fObject);
		
//		Field fOriginal = new Field(FIELD_ORIGINAL, "", Store.YES, Index.ANALYZED);
//		Field fWID = new Field(FIELD_WID, "", Store.NO, Index.ANALYZED);
		
		// Progress counters by position: 0 "lines", 1 "anchors", 2 "searches",
		// 3 "indexed", 4 "0-freq", 5 "dropped".
		PLogger plog = new PLogger(log, Step.TEN_MINUTES, "lines", "anchors", "searches", "indexed", "0-freq","dropped");
		plog.setEnd(0, numAnchors);
		plog.start("Support datasets loaded, now parsing...");
		int id=0;
		while(iterator.next())
		{
			plog.update(0, iterator.scroll);
			plog.update(1);
			String anchorText = iterator.anchor;
			
			int freq = freq(iterator.originals, articles, queryParser);
			plog.update(2, iterator.originals.size());
			if (freq == 0) plog.update(4);
			
			// NOTE(review): Anchor.build receives the id BEFORE the increment below, while
			// the stored FIELD_ID is ++id, i.e. one higher - confirm this offset is intended.
			Anchor anchorObj = Anchor.build(id, iterator.links, freq, people);
			if (anchorObj == null){
				// Dropped anchors do not consume an id.
				plog.update(5);
				continue;
			}
			
			String anchorSerial = Anchor.serialize(anchorObj);
			fId.setValue(Integer.toString(++id));
			fText.setValue(anchorText);
			fObject.setValue(anchorSerial);
			
			// Per-anchor multi-valued fields: added here, removed again after addDocument.
			for(int page : anchorObj){
				Field fWID = new Field(FIELD_WID, Integer.toString(page), Store.YES, Index.NOT_ANALYZED);
//				fWID.setBoost(iterator.links.get(page));
				doc.add(fWID);
			}
			for(String original : iterator.originals) {
				doc.add(new Field(FIELD_ORIGINAL, original, Store.YES, Index.NOT_ANALYZED));
			}
			
			index.addDocument(doc);
			plog.update(3);
			
			// Reset the reused document so the next anchor starts with the three fixed fields only.
			doc.removeFields(FIELD_ORIGINAL);
			doc.removeFields(FIELD_WID);
		}
		plog.stop();
		iterator.close();
		
		log.info("Now optimizing...");
		index.optimize();
		
		index.close();
		log.info("Done.");
	}

Source File: TopicIndexer.java From tagme with Apache License 2.0

4 votes

/**
 * Builds the topic index in {@code workingDir} from the Wikipedia index.
 * Every document of the Wikipedia index whose type is {@code PageType.TOPIC} is copied
 * into a new document holding its WID, title, abstract, best anchor (empty string when
 * none is known) and one FIELD_CAT field per category.
 *
 * @param lang language used to locate the Wikipedia index and the best anchors dataset
 * @param workingDir directory the new index is written to
 * @throws IOException propagated from the index reading/writing calls
 */
@Override
	public void makeIndex(String lang, File workingDir) throws IOException
	{
		
		// NOTE(review): the reader is not closed in this method - presumably Indexes owns it; confirm.
		IndexReader articles = Indexes.getReader(RepositoryDirs.WIKIPEDIA.getPath(lang));
		Int2ObjectMap<String> bestAnchorMap = new BestAnchors(lang).getDataset();
		
		// NOTE(review): the writer is closed only on the success path; an exception in the
		// loop leaves it open.
		IndexWriter index = new IndexWriter(new SimpleFSDirectory(workingDir), new IndexWriterConfig(Version.LUCENE_34, new KeywordAnalyzer()));
		// One Document and its four fixed fields are reused for every topic;
		// only the field values are replaced inside the loop.
		Document doc = new Document();
		Field fWID = new Field(FIELD_WID, "", Store.YES, Index.NOT_ANALYZED);
		Field fTitle = new Field(FIELD_TITLE, "", Store.YES, Index.NOT_ANALYZED);
		Field fAbstract = new Field(FIELD_ABSTRACT, "", Store.YES, Index.NO);
		Field fBestAnchor = new Field(FIELD_BEST_ANCHOR, "", Store.YES, Index.NO);
		doc.add(fWID);
		doc.add(fTitle);
		doc.add(fAbstract);
		doc.add(fBestAnchor);
				
		
		int max = articles.maxDoc();
		// Progress counters by position: 0 "pages", 1 "indexed", 2 "noBest".
		PLogger plog = new PLogger(log, Step.TEN_MINUTES, "pages", "indexed", "noBest");
		plog.setEnd(max);
		plog.start("Start indexing...");
		
		// NOTE(review): iterates all doc ids up to maxDoc without a deleted-document check;
		// presumably the source index has no deletions - confirm.
		for(int i=0; i<max; i++)
		{
			plog.update(0);
			Document oldDoc = articles.document(i);
			PageType type = PageType.valueOf(oldDoc.get(WikipediaIndexer.FIELD_TYPE));
			if (type == PageType.TOPIC)
			{
				// The numeric WID is only needed as the key into bestAnchorMap.
				int wid = Integer.parseInt(oldDoc.get(WikipediaIndexer.FIELD_WID));
				fWID.setValue(oldDoc.get(WikipediaIndexer.FIELD_WID));
				fAbstract.setValue(oldDoc.get(WikipediaIndexer.FIELD_ABSTRACT));
				fTitle.setValue(oldDoc.get(WikipediaIndexer.FIELD_TITLE));
				
				String bestAnchor = bestAnchorMap.get(wid);
				// Missing or empty best anchor is counted as "noBest" and stored as "".
				if (bestAnchor == null || bestAnchor.length() == 0) plog.update(2);
				fBestAnchor.setValue(bestAnchor==null?"":bestAnchor);
				
				// Per-topic multi-valued field: added here, removed again after addDocument.
				String[] cats = oldDoc.getValues(WikipediaIndexer.FIELD_CAT);
				if (cats != null) {
					for (int j=0; j<cats.length; j++)
						doc.add(new Field(FIELD_CAT, cats[j], Store.YES, Index.NOT_ANALYZED));
				}
				
				index.addDocument(doc);
				plog.update(1);
				
				// Reset the reused document so the next topic starts with the four fixed fields only.
				doc.removeFields(FIELD_CAT);
			}
		}
		
		plog.stop();
		
		log.info("Now optimizing...");
		index.optimize();
		
		index.close();
		
		//we cannot call this because the index is still in the temporary dir
		//so TopicDocs will be created using old index
//		log.info("Index Done, now creating WID->DOC_ID map");
//		
//		TopicDocs td = new TopicDocs(lang);
//		td.forceParsing();
		
		log.info("Done.");
	}

Source File: IndexManager.java From netbeans with Apache License 2.0

3 votes

/**
 * Creates a document based index backed by an index created in the given
 * folder with a {@link KeywordAnalyzer}.
 * The returned {@link DocumentIndex} is not cached, every call (even with the same arguments)
 * returns a different instance. Caching of the returned {@link DocumentIndex} is up to the caller.
 * @param cacheFolder the folder in which the index should be stored, must not be null
 * @param cache the document caching provider, must not be null
 * @return the document based index
 * @throws IOException if thrown while creating the underlying index or its document wrapper
 * @since 2.18.0
 */
public static DocumentIndex createDocumentIndex (
        final @NonNull File cacheFolder,
        final @NonNull DocumentIndexCache cache) throws IOException {
    // Validate the arguments before anything gets created.
    Parameters.notNull("cacheFolder", cacheFolder);     //NOI18N
    Parameters.notNull("cache", cache);                 //NOI18N
    final KeywordAnalyzer analyzer = new KeywordAnalyzer();
    return createDocumentIndex(
            createIndex(cacheFolder, analyzer),
            cache);
}

Source File: IndexManager.java From netbeans with Apache License 2.0

2 votes

/**
 * Creates a document based index backed by an index created in the given
 * folder with a {@link KeywordAnalyzer} and the requested writability.
 * The returned {@link DocumentIndex} is not cached, every call (even with the same arguments)
 * returns a different instance. Caching of the returned {@link DocumentIndex} is up to the caller.
 * @param cacheFolder the folder in which the index should be stored, must not be null
 * @param isWritable <code>false</code> if it is read only index
 * @return the document based index
 * @throws IOException if thrown while creating the underlying index or its document wrapper
 * @since 2.27.1
 */
public static DocumentIndex createDocumentIndex (final @NonNull File cacheFolder, boolean isWritable) throws IOException {
    Parameters.notNull("cacheFolder", cacheFolder);     //NOI18N
    final KeywordAnalyzer analyzer = new KeywordAnalyzer();
    return createDocumentIndex(
            createIndex(cacheFolder, analyzer, isWritable));
}

Source File: IndexManager.java From netbeans with Apache License 2.0

2 votes

/**
 * Creates a document based index backed by an index created in the given
 * folder with a {@link KeywordAnalyzer}.
 * The returned {@link DocumentIndex} is not cached, every call (even with the same arguments)
 * returns a different instance. Caching of the returned {@link DocumentIndex} is up to the caller.
 * @param cacheFolder the folder in which the index should be stored, must not be null
 * @return the document based index
 * @throws IOException if thrown while creating the underlying index or its document wrapper
 * @since 1.1
 */
public static DocumentIndex createDocumentIndex (final @NonNull File cacheFolder) throws IOException {
    Parameters.notNull("cacheFolder", cacheFolder);     //NOI18N
    final KeywordAnalyzer analyzer = new KeywordAnalyzer();
    return createDocumentIndex(
            createIndex(cacheFolder, analyzer));
}

org.apache.lucene.analysis.KeywordAnalyzer Java Examples