org.apache.lucene.analysis.util.TokenizerFactory Java Examples
The following examples show how to use org.apache.lucene.analysis.util.TokenizerFactory. They are taken from several open-source projects and illustrate the common patterns: looking a factory up by its SPI name, instantiating it with a map of configuration parameters, reloading the SPI registries against a new classloader, and wiring factories into analyzer chains.
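Before the individual examples, here is a minimal sketch of the lookup-and-create pattern most of them build on: resolve a factory through the SPI registry, create a Tokenizer, and walk the token stream. The SPI name "standard" and the sample text are illustrative assumptions; any name returned by TokenizerFactory.availableTokenizers() could be substituted.

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.TokenizerFactory;

public class TokenizerFactorySketch {
    public static void main(String[] args) throws Exception {
        // Assumes lucene-analyzers-common, which registers the "standard" tokenizer via SPI,
        // is on the classpath. The map carries factory-specific parameters and must be modifiable.
        Map<String, String> params = new HashMap<>();
        TokenizerFactory factory = TokenizerFactory.forName("standard", params);

        // Create a Tokenizer, attach input, and iterate the token stream.
        Tokenizer tokenizer = factory.create();
        tokenizer.setReader(new StringReader("Hello TokenizerFactory examples"));
        tokenizer.reset();
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}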
Example #1
Source File: HanLpTokenizerFactoryTestCase.java From jstarcraft-nlp with Apache License 2.0
@Test
public void testCreate() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLpTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);
    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前"
            + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有"
            + "突出外表、百變髮型及正面的形象,以至自己"
            + "品牌的男士香水等商品,及長期擔任運動品牌"
            + "Adidas的代言人,因此對大眾傳播媒介和時尚界"
            + "等方面都具很大的影響力,在足球圈外所獲得的"
            + "認受程度可謂前所未見。"));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // token type (part of speech)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n",
                offsetAtt.startOffset(), offsetAtt.endOffset(),
                positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #2
Source File: ResourceLoaderTest.java From lucene-solr with Apache License 2.0
public void testCacheWrongType() throws Exception {
    clearCache();

    SolrResourceLoader loader = new SolrResourceLoader();
    @SuppressWarnings({"rawtypes"})
    Class[] params = { Map.class };
    Map<String, String> args = Map.of("minGramSize", "1", "maxGramSize", "2");

    final String className = "solr.NGramTokenizerFactory";

    // We could fail here since the class name and expected type don't match,
    // but instead we try to infer what the user actually meant.
    TokenFilterFactory tff = loader.newInstance(className, TokenFilterFactory.class,
        new String[0], params, new Object[]{ new HashMap<>(args) });
    assertNotNull("Did not load TokenFilter when asking for corresponding Tokenizer", tff);

    // This should work, but won't if the earlier call succeeded and corrupted the cache.
    TokenizerFactory tf = loader.newInstance(className, TokenizerFactory.class,
        new String[0], params, new Object[]{ new HashMap<>(args) });
    assertNotNull("Did not load Tokenizer after bad call earlier", tf);
    loader.close();
}
Example #3
Source File: AnalysisImpl.java From lucene-solr with Apache License 2.0
@Override
public void addExternalJars(List<String> jarFiles) {
    List<URL> urls = new ArrayList<>();

    for (String jarFile : jarFiles) {
        Path path = FileSystems.getDefault().getPath(jarFile);
        if (!Files.exists(path) || !jarFile.endsWith(".jar")) {
            throw new LukeException(String.format(Locale.ENGLISH, "Invalid jar file path: %s", jarFile));
        }

        try {
            URL url = path.toUri().toURL();
            urls.add(url);
        } catch (IOException e) {
            throw new LukeException(e.getMessage(), e);
        }
    }

    // reload available tokenizers, charfilters, and tokenfilters
    URLClassLoader classLoader = new URLClassLoader(
        urls.toArray(new URL[0]), this.getClass().getClassLoader());
    CharFilterFactory.reloadCharFilters(classLoader);
    TokenizerFactory.reloadTokenizers(classLoader);
    TokenFilterFactory.reloadTokenFilters(classLoader);
}
Example #4
Source File: SolrUtil.java From ambari-logsearch with Apache License 2.0
@SuppressWarnings("unchecked")
private static boolean checkTokenizer(Class<? extends TokenizerFactory> tokenizerFactoryClass,
                                      Map<String, Object> fieldTypeInfoMap) {
    HashMap<String, Object> analyzer = (HashMap<String, Object>) fieldTypeInfoMap.get("analyzer");
    HashMap<String, Object> tokenizerMap = (HashMap<String, Object>) MapUtils.getObject(analyzer, "tokenizer");
    if (tokenizerMap != null) {
        String tokenizerClass = (String) tokenizerMap.get("class");
        if (StringUtils.isNotEmpty(tokenizerClass)) {
            tokenizerClass = tokenizerClass.replace("solr.", "");
            return tokenizerClass.equalsIgnoreCase(tokenizerFactoryClass.getSimpleName());
        }
    }
    return false;
}
Example #5
Source File: AnalysisFactoryTestCase.java From crate with Apache License 2.0
public void testTokenizers() {
    Set<String> missing = new TreeSet<String>();
    missing.addAll(org.apache.lucene.analysis.util.TokenizerFactory.availableTokenizers()
        .stream().map(key -> key.toLowerCase(Locale.ROOT)).collect(Collectors.toSet()));
    missing.removeAll(getTokenizers().keySet());
    assertTrue("new tokenizers found, please update KNOWN_TOKENIZERS: " + missing.toString(),
        missing.isEmpty());
}
Example #6
Source File: PluginsService.java From crate with Apache License 2.0
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after the new classloader has been created to
 * register the services for use.
 */
static void reloadLuceneSPI(ClassLoader loader) {
    // do NOT change the order of these method calls!
    // Codecs:
    PostingsFormat.reloadPostingsFormats(loader);
    DocValuesFormat.reloadDocValuesFormats(loader);
    Codec.reloadCodecs(loader);
    // Analysis:
    CharFilterFactory.reloadCharFilters(loader);
    TokenFilterFactory.reloadTokenFilters(loader);
    TokenizerFactory.reloadTokenizers(loader);
}
Example #7
Source File: MMSegTokenizerFactoryTest.java From mmseg4j-solr with Apache License 2.0
private Dictionary getDictionaryByFieldType(String fieldTypeName) {
    FieldType ft = h.getCore().getLatestSchema().getFieldTypeByName(fieldTypeName);
    Analyzer a = ft.getIndexAnalyzer();
    Assert.assertEquals(a.getClass(), TokenizerChain.class);

    TokenizerChain tc = (TokenizerChain) a;
    TokenizerFactory tf = tc.getTokenizerFactory();
    Assert.assertEquals(tf.getClass(), MMSegTokenizerFactory.class);

    MMSegTokenizerFactory mtf = (MMSegTokenizerFactory) tf;

    Assert.assertNotNull(mtf.dic);
    return mtf.dic;
}
Example #8
Source File: HanLPTokenizerFactoryTest.java From hanlp-lucene-plugin with Apache License 2.0
public void testCreate() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    TokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create(null);
    tokenizer.setReader(new StringReader("大衛貝克漢不僅僅是名著名球員,球場以外,其妻為前"
            + "辣妹合唱團成員維多利亞·碧咸,亦由於他擁有"
            + "突出外表、百變髮型及正面的形象,以至自己"
            + "品牌的男士香水等商品,及長期擔任運動品牌"
            + "Adidas的代言人,因此對大眾傳播媒介和時尚界"
            + "等方面都具很大的影響力,在足球圈外所獲得的"
            + "認受程度可謂前所未見。"));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // token type (part of speech)
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n",
                offsetAtt.startOffset(), offsetAtt.endOffset(),
                positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
Example #9
Source File: FieldAnalysisRequestHandlerTest.java From lucene-solr with Apache License 2.0
@Test // See SOLR-8460
public void testCustomAttribute() throws Exception {
    FieldAnalysisRequest request = new FieldAnalysisRequest();
    request.addFieldType("skutype1");
    request.setFieldValue("hi, 3456-12 a Test");
    request.setShowMatch(false);
    FieldType fieldType = new TextField();
    Analyzer analyzer = new TokenizerChain(
        new TokenizerFactory(Collections.emptyMap()) {
            @Override
            public Tokenizer create(AttributeFactory factory) {
                return new CustomTokenizer(factory);
            }
        },
        new TokenFilterFactory[] {
            new TokenFilterFactory(Collections.emptyMap()) {
                @Override
                public TokenStream create(TokenStream input) {
                    return new CustomTokenFilter(input);
                }
            }
        }
    );
    fieldType.setIndexAnalyzer(analyzer);

    @SuppressWarnings({"rawtypes"})
    NamedList<NamedList> result = handler.analyzeValues(request, fieldType, "fieldNameUnused");
    // just test that we see "900" in the flags attribute here
    @SuppressWarnings({"unchecked", "rawtypes"})
    List<NamedList> tokenInfoList = (List<NamedList>) result.findRecursive("index",
        CustomTokenFilter.class.getName());
    // '1' from CustomTokenFilter plus 900 from CustomFlagsAttributeImpl.
    assertEquals(901, tokenInfoList.get(0).get("org.apache.lucene.analysis.tokenattributes.FlagsAttribute#flags"));
}
Example #10
Source File: TokenizerChain.java From lucene-solr with Apache License 2.0
/**
 * Creates a new TokenizerChain.
 *
 * @param charFilters Factories for the CharFilters to use, if any - if null, will be treated as if empty.
 * @param tokenizer   Factory for the Tokenizer to use, must not be null.
 * @param filters     Factories for the TokenFilters to use, if any - if null, will be treated as if empty.
 */
public TokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
    charFilters = null == charFilters ? EMPTY_CHAR_FITLERS : charFilters;
    filters = null == filters ? EMPTY_TOKEN_FITLERS : filters;
    if (null == tokenizer) {
        throw new NullPointerException("TokenizerFactory must not be null");
    }

    this.charFilters = charFilters;
    this.tokenizer = tokenizer;
    this.filters = filters;
}
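As a point of reference, below is a hedged sketch of how this constructor might be called to assemble a small chain (HTML-strip char filter, whitespace tokenizer, lowercase filter). The specific factories and the empty argument maps are assumptions for illustration, not taken from the Solr source above.

import java.util.HashMap;

import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.solr.analysis.TokenizerChain;

public class TokenizerChainSketch {
    public static TokenizerChain buildChain() {
        // Each factory takes a modifiable map of its configuration parameters; empty here.
        CharFilterFactory[] charFilters =
            new CharFilterFactory[] { new HTMLStripCharFilterFactory(new HashMap<>()) };
        TokenizerFactory tokenizer = new WhitespaceTokenizerFactory(new HashMap<>());
        TokenFilterFactory[] filters =
            new TokenFilterFactory[] { new LowerCaseFilterFactory(new HashMap<>()) };

        // Passing null for charFilters or filters would be treated as empty arrays;
        // a null tokenizer would trigger the NullPointerException shown above.
        return new TokenizerChain(charFilters, tokenizer, filters);
    }
}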
Example #11
Source File: TestFactories.java From lucene-solr with Apache License 2.0
private void doTestTokenizer(String tokenizer) throws IOException {
    Class<? extends TokenizerFactory> factoryClazz = TokenizerFactory.lookupClass(tokenizer);
    TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
    if (factory != null) {
        // we managed to fully create an instance. check a few more things:
        if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
            // beast it just a little, it shouldn't throw exceptions:
            // (it should have thrown them in initialize)
            Analyzer a = new FactoryAnalyzer(factory, null, null);
            checkRandomData(random(), a, 3, 20, false, false);
            a.close();
        }
    }
}
Example #12
Source File: TestFactories.java From lucene-solr with Apache License 2.0
public void test() throws IOException {
    for (String tokenizer : TokenizerFactory.availableTokenizers()) {
        doTestTokenizer(tokenizer);
    }
    for (String tokenFilter : TokenFilterFactory.availableTokenFilters()) {
        doTestTokenFilter(tokenFilter);
    }
    for (String charFilter : CharFilterFactory.availableCharFilters()) {
        doTestCharFilter(charFilter);
    }
}
Example #13
Source File: TestCustomAnalyzer.java From lucene-solr with Apache License 2.0
public void testNullTokenizerFactory() throws Exception {
    expectThrows(NullPointerException.class, () -> {
        CustomAnalyzer.builder()
            .withTokenizer((Class<TokenizerFactory>) null)
            .build();
    });
}
Example #14
Source File: CustomAnalyzer.java From lucene-solr with Apache License 2.0
/**
 * Uses the given tokenizer.
 * @param name   is used to look up the factory with {@link TokenizerFactory#forName(String, Map)}.
 *               The list of possible names can be looked up with {@link TokenizerFactory#availableTokenizers()}.
 * @param params the map of parameters to be passed to factory. The map must be modifiable.
 */
public Builder withTokenizer(String name, Map<String, String> params) throws IOException {
    Objects.requireNonNull(name, "Tokenizer name may not be null");
    tokenizer.set(applyResourceLoader(TokenizerFactory.forName(name, applyDefaultParams(params))));
    componentsAdded = true;
    return this;
}
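For completeness, a small sketch of this builder method in use. The SPI names "standard" and "lowercase" are assumed to be registered on the classpath (they ship with the common analysis module), and the analyzer is closed immediately just to keep the example self-contained.

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

public class CustomAnalyzerSketch {
    public static void main(String[] args) throws Exception {
        // Parameter maps must be modifiable because the factories consume entries from them.
        Map<String, String> tokenizerParams = new HashMap<>();

        Analyzer analyzer = CustomAnalyzer.builder()
            .withTokenizer("standard", tokenizerParams)   // resolved via TokenizerFactory.forName(...)
            .addTokenFilter("lowercase", new HashMap<>()) // resolved via TokenFilterFactory.forName(...)
            .build();

        // The analyzer can now be used for indexing or ad-hoc token inspection.
        analyzer.close();
    }
}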
Example #15
Source File: CustomAnalyzer.java From lucene-solr with Apache License 2.0
/**
 * Uses the given tokenizer.
 * @param factory class that is used to create the tokenizer.
 * @param params  the map of parameters to be passed to factory. The map must be modifiable.
 */
public Builder withTokenizer(Class<? extends TokenizerFactory> factory, Map<String, String> params) throws IOException {
    Objects.requireNonNull(factory, "Tokenizer factory may not be null");
    tokenizer.set(applyResourceLoader(newFactoryClassInstance(factory, applyDefaultParams(params))));
    componentsAdded = true;
    return this;
}
Example #16
Source File: CustomAnalyzer.java From lucene-solr with Apache License 2.0
CustomAnalyzer(Version defaultMatchVersion, CharFilterFactory[] charFilters, TokenizerFactory tokenizer,
               TokenFilterFactory[] tokenFilters, Integer posIncGap, Integer offsetGap) {
    this.charFilters = charFilters;
    this.tokenizer = tokenizer;
    this.tokenFilters = tokenFilters;
    this.posIncGap = posIncGap;
    this.offsetGap = offsetGap;
    if (defaultMatchVersion != null) {
        setVersion(defaultMatchVersion);
    }
}
Example #17
Source File: PluginsService.java From Elasticsearch with Apache License 2.0
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after the new classloader has been created to
 * register the services for use.
 */
static void reloadLuceneSPI(ClassLoader loader) {
    // do NOT change the order of these method calls!
    // Codecs:
    PostingsFormat.reloadPostingsFormats(loader);
    DocValuesFormat.reloadDocValuesFormats(loader);
    Codec.reloadCodecs(loader);
    // Analysis:
    CharFilterFactory.reloadCharFilters(loader);
    TokenFilterFactory.reloadTokenFilters(loader);
    TokenizerFactory.reloadTokenizers(loader);
}
Example #18
Source File: SynonymFilterFactory.java From lucene-solr with Apache License 2.0
private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname) throws IOException {
    Class<? extends TokenizerFactory> clazz = loader.findClass(cname, TokenizerFactory.class);
    try {
        TokenizerFactory tokFactory = clazz.getConstructor(Map.class).newInstance(tokArgs);
        if (tokFactory instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) tokFactory).inform(loader);
        }
        return tokFactory;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Example #19
Source File: SynonymGraphFilterFactory.java From lucene-solr with Apache License 2.0
private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname) throws IOException {
    Class<? extends TokenizerFactory> clazz = loader.findClass(cname, TokenizerFactory.class);
    try {
        TokenizerFactory tokFactory = clazz.getConstructor(Map.class).newInstance(tokArgs);
        if (tokFactory instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) tokFactory).inform(loader);
        }
        return tokFactory;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Example #20
Source File: AlfrescoFieldType.java From SearchServices with GNU Lesser General Public License v3.0
public void add(Object current) {
    if (!(current instanceof MultiTermAwareComponent)) return;
    AbstractAnalysisFactory newComponent = ((MultiTermAwareComponent) current).getMultiTermComponent();
    if (newComponent instanceof TokenFilterFactory) {
        if (filters == null) {
            filters = new ArrayList<TokenFilterFactory>(2);
        }
        filters.add((TokenFilterFactory) newComponent);
    } else if (newComponent instanceof TokenizerFactory) {
        tokenizer = (TokenizerFactory) newComponent;
    } else if (newComponent instanceof CharFilterFactory) {
        if (charFilters == null) {
            charFilters = new ArrayList<CharFilterFactory>(1);
        }
        charFilters.add((CharFilterFactory) newComponent);
    } else {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
            "Unknown analysis component from MultiTermAwareComponent: " + newComponent);
    }
}
Example #21
Source File: AnalyzerFactoryTask.java From lucene-solr with Apache License 2.0
/**
 * This method looks up a class with its fully qualified name (FQN), or a short-name
 * class-simplename, or with a package suffix, assuming "org.apache.lucene.analysis."
 * as the package prefix (e.g. "standard.ClassicTokenizerFactory" ->
 * "org.apache.lucene.analysis.standard.ClassicTokenizerFactory").
 *
 * If className contains a period, the class is first looked up as-is, assuming that it
 * is an FQN. If this fails, lookup is retried after prepending the Lucene analysis
 * package prefix to the class name.
 *
 * If className does not contain a period, the analysis SPI *Factory.lookupClass()
 * methods are used to find the class.
 *
 * @param className The name or the short name of the class.
 * @param expectedType The superclass className is expected to extend.
 * @return the loaded class.
 * @throws ClassNotFoundException if lookup fails
 */
public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType)
    throws ClassNotFoundException {
    if (className.contains(".")) {
        try {
            // First, try className == FQN
            return Class.forName(className).asSubclass(expectedType);
        } catch (ClassNotFoundException e) {
            try {
                // Second, retry lookup after prepending the Lucene analysis package prefix
                return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType);
            } catch (ClassNotFoundException e1) {
                throw new ClassNotFoundException("Can't find class '" + className
                    + "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'");
            }
        }
    }
    // No dot - use analysis SPI lookup
    final String analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst("");
    if (CharFilterFactory.class.isAssignableFrom(expectedType)) {
        return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
    } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) {
        return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
    } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) {
        return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
    }
    throw new ClassNotFoundException("Can't find class '" + className + "'");
}
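To make the three accepted name forms concrete, the sketch below performs equivalent lookups directly against the reflection and SPI calls this method delegates to. WhitespaceTokenizerFactory is used purely as an assumed example of a factory present on the classpath.

import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;

public class LookupFormsSketch {
    public static void main(String[] args) throws Exception {
        // 1) Fully qualified name: plain reflection.
        Class<? extends TokenizerFactory> byFqn =
            Class.forName("org.apache.lucene.analysis.core.WhitespaceTokenizerFactory")
                 .asSubclass(TokenizerFactory.class);

        // 2) Package-relative name: the task prepends "org.apache.lucene.analysis." on retry.
        String prefix = "org.apache.lucene.analysis.";
        Class<? extends TokenizerFactory> byShortFqn =
            Class.forName(prefix + "core.WhitespaceTokenizerFactory")
                 .asSubclass(TokenizerFactory.class);

        // 3) SPI short name: no dot, so the analysis SPI lookup is used instead.
        Class<? extends TokenizerFactory> bySpiName =
            TokenizerFactory.lookupClass("whitespace");

        // All three forms are expected to resolve to the same factory class.
        assert byFqn == WhitespaceTokenizerFactory.class
            && byShortFqn == WhitespaceTokenizerFactory.class
            && bySpiName == WhitespaceTokenizerFactory.class;
    }
}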
Example #22
Source File: AnalyzerFactory.java From lucene-solr with Apache License 2.0
public AnalyzerFactory(List<CharFilterFactory> charFilterFactories,
                       TokenizerFactory tokenizerFactory,
                       List<TokenFilterFactory> tokenFilterFactories) {
    this.charFilterFactories = charFilterFactories;
    assert null != tokenizerFactory;
    this.tokenizerFactory = tokenizerFactory;
    this.tokenFilterFactories = tokenFilterFactories;
}
Example #23
Source File: TestHMMChineseTokenizerFactory.java From lucene-solr with Apache License 2.0
/** Test showing the behavior */
public void testSimple() throws Exception {
    Reader reader = new StringReader("我购买了道具和服装。");
    TokenizerFactory factory = new HMMChineseTokenizerFactory(new HashMap<String, String>());
    Tokenizer tokenizer = factory.create(newAttributeFactory());
    tokenizer.setReader(reader);
    // TODO: fix smart chinese to not emit punctuation tokens
    // at the moment: you have to clean up with WDF, or use the stoplist, etc
    assertTokenStreamContents(tokenizer,
        new String[] { "我", "购买", "了", "道具", "和", "服装", "," });
}
Example #24
Source File: CustomAnalyzer.java From lucene-solr with Apache License 2.0
/** Returns the tokenizer that is used in this analyzer. */
public TokenizerFactory getTokenizerFactory() {
    return tokenizer;
}
Example #25
Source File: AnalysisChainDialogFactory.java From lucene-solr with Apache License 2.0
private JPanel analysisChain() {
    JPanel panel = new JPanel(new GridBagLayout());
    panel.setOpaque(false);

    GridBagConstraints c = new GridBagConstraints();
    c.fill = GridBagConstraints.HORIZONTAL;
    c.insets = new Insets(5, 5, 5, 5);

    c.gridx = 0;
    c.gridy = 0;
    c.weightx = 0.1;
    c.weighty = 0.5;
    panel.add(new JLabel(MessageUtils.getLocalizedMessage("analysis.dialog.chain.label.charfilters")), c);

    String[] charFilters = analyzer.getCharFilterFactories().stream()
        .map(f -> CharFilterFactory.findSPIName(f.getClass()))
        .toArray(String[]::new);
    JList<String> charFilterList = new JList<>(charFilters);
    charFilterList.setVisibleRowCount(charFilters.length == 0 ? 1 : Math.min(charFilters.length, 5));
    c.gridx = 1;
    c.gridy = 0;
    c.weightx = 0.5;
    c.weighty = 0.5;
    panel.add(new JScrollPane(charFilterList), c);

    c.gridx = 0;
    c.gridy = 1;
    c.weightx = 0.1;
    c.weighty = 0.1;
    panel.add(new JLabel(MessageUtils.getLocalizedMessage("analysis.dialog.chain.label.tokenizer")), c);

    String tokenizer = TokenizerFactory.findSPIName(analyzer.getTokenizerFactory().getClass());
    JTextField tokenizerTF = new JTextField(tokenizer);
    tokenizerTF.setColumns(30);
    tokenizerTF.setEditable(false);
    tokenizerTF.setPreferredSize(new Dimension(300, 25));
    tokenizerTF.setBorder(BorderFactory.createLineBorder(Color.gray));
    c.gridx = 1;
    c.gridy = 1;
    c.weightx = 0.5;
    c.weighty = 0.1;
    panel.add(tokenizerTF, c);

    c.gridx = 0;
    c.gridy = 2;
    c.weightx = 0.1;
    c.weighty = 0.5;
    panel.add(new JLabel(MessageUtils.getLocalizedMessage("analysis.dialog.chain.label.tokenfilters")), c);

    String[] tokenFilters = analyzer.getTokenFilterFactories().stream()
        .map(f -> TokenFilterFactory.findSPIName(f.getClass()))
        .toArray(String[]::new);
    JList<String> tokenFilterList = new JList<>(tokenFilters);
    tokenFilterList.setVisibleRowCount(tokenFilters.length == 0 ? 1 : Math.min(tokenFilters.length, 5));
    tokenFilterList.setMinimumSize(new Dimension(300, 25));
    c.gridx = 1;
    c.gridy = 2;
    c.weightx = 0.5;
    c.weighty = 0.5;
    panel.add(new JScrollPane(tokenFilterList), c);

    return panel;
}
Example #26
Source File: AnalysisImpl.java From lucene-solr with Apache License 2.0
@Override
public Collection<String> getAvailableTokenizers() {
    return TokenizerFactory.availableTokenizers().stream().sorted().collect(Collectors.toList());
}
Example #27
Source File: TokenizerChain.java From lucene-solr with Apache License 2.0
/** @return the TokenizerFactory in use, will never be null */
public TokenizerFactory getTokenizerFactory() {
    return tokenizer;
}
Example #28
Source File: TestFactories.java From lucene-solr with Apache License 2.0
FactoryAnalyzer(TokenizerFactory tokenizer, TokenFilterFactory tokenfilter, CharFilterFactory charFilter) {
    assert tokenizer != null;
    this.tokenizer = tokenizer;
    this.charFilter = charFilter;
    this.tokenfilter = tokenfilter;
}