Java Code Examples for org.apache.lucene.analysis.tokenattributes.CharTermAttribute#toString()
The following examples show how to use org.apache.lucene.analysis.tokenattributes.CharTermAttribute#toString().
You can go to the original project or source file by following the reference above each example.
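All of the examples share one consume pattern: obtain a TokenStream from an Analyzer, add or get a CharTermAttribute, call reset(), loop over incrementToken() and read termAtt.toString() for each token, then call end() and close(). The following minimal, self-contained sketch shows that pattern in isolation; the StandardAnalyzer, the field name "body", and the sample text are illustrative assumptions, not taken from any of the projects below.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CharTermAttributeDemo {
    public static void main(String[] args) throws IOException {
        List<String> terms = new ArrayList<>();
        // "body" and the sample text are arbitrary placeholders for this sketch
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("body", "Hello, token stream world!")) {
            // the attribute instance is created once and reused for every token
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                        // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                terms.add(termAtt.toString()); // snapshot the current term as a new String
            }
            ts.end();                          // consume end-of-stream attributes
        }                                      // try-with-resources closes the stream and analyzer
        System.out.println(terms);             // e.g. [hello, token, stream, world]
    }
}

Note that toString() copies the attribute's current term buffer into a new String on every call; the attribute object itself is reused across tokens, which is why every example reads it inside the loop rather than caching the result.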
Example 1
Source File: Tokenizers.java From ache with Apache License 2.0
public List<String> tokenize(String cleanText) {
    try {
        TokenStream ts = analyzer.tokenStream("cleanText", cleanText);
        CharTermAttribute cattr = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        List<String> tokens = new ArrayList<String>();
        while (ts.incrementToken()) {
            String token = cattr.toString();
            tokens.add(token);
        }
        ts.close();
        return tokens;
    } catch (IOException e) {
        throw new RuntimeException(
                "Shingle tokenization failed for string: " + cleanText, e);
    }
}
Example 2
Source File: MinHash.java From minhash with Apache License 2.0
/**
 * Calculates MinHash value.
 *
 * @param analyzer analyzer to parse a text
 * @param text a target text
 * @return MinHash value
 * @throws IOException
 */
public static byte[] calculate(final Analyzer analyzer, final String text) throws IOException {
    byte[] value = null;
    try (TokenStream stream = analyzer.tokenStream("minhash", text)) {
        final CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        if (stream.incrementToken()) {
            final String minhashValue = termAtt.toString();
            value = BaseEncoding.base64().decode(minhashValue);
        }
        stream.end();
    }
    return value;
}
Example 3
Source File: TestStopAnalyzer.java From lucene-solr with Apache License 2.0
public void testStopList() throws IOException {
    CharArraySet stopWordsSet = new CharArraySet(asSet("good", "test", "analyzer"), false);
    StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
    try (TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer")) {
        assertNotNull(stream);
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String text = termAtt.toString();
            assertFalse(stopWordsSet.contains(text));
        }
        stream.end();
    }
    newStop.close();
}
Example 4
Source File: ShingleAnalyzerWrapperTest.java From lucene-solr with Apache License 2.0
public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
    BooleanQuery.Builder q = new BooleanQuery.Builder();
    try (TokenStream ts = analyzer.tokenStream("content", "test sentence")) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String termText = termAtt.toString();
            q.add(new TermQuery(new Term("content", termText)), BooleanClause.Occur.SHOULD);
        }
        ts.end();
    }
    ScoreDoc[] hits = searcher.search(q.build(), 1000).scoreDocs;
    int[] ranks = new int[] { 1, 2, 0 };
    compareRanks(hits, ranks);
}
Example 5
Source File: BooleanPerceptronClassifier.java From lucene-solr with Apache License 2.0
@Override
public ClassificationResult<Boolean> assignClass(String text) throws IOException {
    Long output = 0L;
    try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, text)) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String s = charTermAttribute.toString();
            Long d = Util.get(fst, new BytesRef(s));
            if (d != null) {
                output += d;
            }
        }
        tokenStream.end();
    }
    double score = 1 - Math.exp(-1 * Math.abs(bias - output.doubleValue()) / bias);
    return new ClassificationResult<>(output >= bias, score);
}
Example 6
Source File: FeatureExtractorUtilities.java From samantha with MIT License
static public Map<String, Integer> getTermFreq(Analyzer analyzer, String text, String termField) {
    TokenStream ts = analyzer.tokenStream(termField, text);
    CharTermAttribute cattr = ts.addAttribute(CharTermAttribute.class);
    Map<String, Integer> termFreq = new HashMap<>();
    try {
        ts.reset();
        while (ts.incrementToken()) {
            String term = cattr.toString();
            int cnt = termFreq.getOrDefault(
                    FeatureExtractorUtilities.composeKey(termField, term), 0);
            termFreq.put(term, cnt + 1);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error("{}", e.getMessage());
        throw new BadRequestException(e);
    }
    return termFreq;
}
Example 7
Source File: LuceneUtil.java From jasperreports with GNU Lesser General Public License v3.0
protected String displayTokens(String text, String elementId) throws IOException {
    Analyzer analyzer = new LuceneSimpleAnalyzer(isCaseSensitive, removeAccents);
    StringBuilder sb = new StringBuilder();
    sb.append(elementId).append(": ").append(text).append(": ");
    TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String term = charTermAttribute.toString();
        sb.append("[" + term + "](" + startOffset + "," + endOffset + ") ");
    }
    return sb.toString();
}
Example 8
Source File: AbstractSearchTest.java From database with GNU General Public License v2.0
private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
    TokenStream s = a.tokenStream(null, new StringReader(text));
    int ix = 0;
    s.reset();
    while (s.incrementToken()) {
        final CharTermAttribute term = s.getAttribute(CharTermAttribute.class);
        final String word = term.toString();
        assertTrue(ix < expected.length);
        assertEquals(expected[ix++], word);
    }
    s.close();
    assertEquals(ix, expected.length);
}
Example 9
Source File: TextParseUtils.java From SimpleTextSearch with MIT License
public List<String> tokenize(String rawText) {
    List<String> retVal = new ArrayList<>();
    if (StringUtils.isEmpty(rawText)) {
        return retVal;
    }
    try (TokenStream ts = analyzer.tokenStream(null, rawText)) {
        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String str = term.toString();
            if (str == null) {
                continue;
            }
            str = str.replaceAll("[^a-zA-Z ]", "");
            if (str.isEmpty()) {
                continue;
            }
            retVal.add(str);
        }
        ts.end();
    } catch (IOException ex) {
        // a tokenization failure is swallowed and treated as "no tokens"
    }
    return retVal;
}
Example 10
Source File: XmlInterpolationTest.java From SolrTextTagger with Apache License 2.0
private int[] analyzeTagOne(String docText, String start, String end) {
    int[] result = {-1, -1};
    Reader filter = new HTMLStripCharFilter(new StringReader(docText));
    WhitespaceTokenizer ts = new WhitespaceTokenizer();
    final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
    try {
        ts.setReader(filter);
        ts.reset();
        while (ts.incrementToken()) {
            final String termString = termAttribute.toString();
            if (termString.equals(start))
                result[0] = offsetAttribute.startOffset();
            if (termString.equals(end)) {
                result[1] = offsetAttribute.endOffset();
                return result;
            }
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeQuietly(ts);
    }
    return result;
}
Example 11
Source File: ChineseMatcher.java From zxl with Apache License 2.0
public double oneWayMatch(String text1, String text2) {
    try {
        Set<String> set = new HashSet<String>(10);
        TokenStream tokenStream = smartChineseAnalyzer.tokenStream("field", text1);
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            set.add(charTermAttribute.toString());
        }
        int originalCount = set.size();
        tokenStream.end();
        tokenStream.close();
        tokenStream = smartChineseAnalyzer.tokenStream("field", text2);
        charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        int smallWeightWordsCount = 0;
        int denominator = 0;
        while (tokenStream.incrementToken()) {
            denominator++;
            String word = charTermAttribute.toString();
            int tempSize = set.size();
            set.add(word);
            if (tempSize + 1 == set.size() && smallWeightWords.contains(word)) {
                smallWeightWordsCount++;
            }
        }
        int numerator = set.size() - originalCount;
        double unmatchRate = (smallWeightWordsCount * smallWeight
                + numerator - ((double) smallWeightWordsCount)) / denominator;
        tokenStream.end();
        tokenStream.close();
        return unmatchRate;
    } catch (IOException e) {
        return 1D;
    }
}
Example 12
Source File: KuromojiUDF.java From incubator-hivemall with Apache License 2.0
private static void analyzeTokens(@Nonnull final TokenStream stream, @Nonnull final List<Text> tokenResult,
        @Nonnull final List<Text> posResult) throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    PartOfSpeechAttribute posAttr = stream.addAttribute(PartOfSpeechAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAttr.toString();
        tokenResult.add(new Text(term));
        String pos = posAttr.getPartOfSpeech();
        posResult.add(new Text(pos));
    }
}
Example 13
Source File: KuromojiUDF.java From incubator-hivemall with Apache License 2.0
private static void analyzeTokens(@Nonnull final TokenStream stream, @Nonnull final List<Text> tokens)
        throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAttr.toString();
        tokens.add(new Text(term));
    }
}
Example 14
Source File: SmartcnUDF.java From incubator-hivemall with Apache License 2.0
private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results) throws IOException {
    // instantiate an attribute placeholder once
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAttr.toString();
        results.add(new Text(term));
    }
}
Example 15
Source File: XmlInterpolationTest.java From lucene-solr with Apache License 2.0
private int[] analyzeTagOne(String docText, String start, String end) {
    int[] result = {-1, -1};
    Reader filter = new HTMLStripCharFilter(new StringReader(docText));
    WhitespaceTokenizer ts = new WhitespaceTokenizer();
    final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
    try {
        ts.setReader(filter);
        ts.reset();
        while (ts.incrementToken()) {
            final String termString = termAttribute.toString();
            if (termString.equals(start))
                result[0] = offsetAttribute.startOffset();
            if (termString.equals(end)) {
                result[1] = offsetAttribute.endOffset();
                return result;
            }
        }
        ts.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeQuietly(ts);
    }
    return result;
}
Example 16
Source File: MoreLikeThis.java From lucene-solr with Apache License 2.0
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param perFieldTermFrequencies a Map of terms and their frequencies per field
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Map<String, Int>> perFieldTermFrequencies, String fieldName)
        throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException("To use MoreLikeThis without "
                + "term vectors, you must provide an Analyzer");
    }
    Map<String, Int> termFreqMap = perFieldTermFrequencies.computeIfAbsent(fieldName, k -> new HashMap<>());
    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        TermFrequencyAttribute tfAtt = ts.addAttribute(TermFrequencyAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int(tfAtt.getTermFrequency()));
            } else {
                cnt.x += tfAtt.getTermFrequency();
            }
        }
        ts.end();
    }
}
Example 17
Source File: XMoreLikeThis.java From Elasticsearch with Apache License 2.0
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException("To use MoreLikeThis without "
                + "term vectors, you must provide an Analyzer");
    }
    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            if (isSkipTerm(fieldName, word)) {
                continue;
            }
            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    }
}
Example 18
Source File: ReSearcherUtils.java From solr-researcher with Apache License 2.0
/**
 * Separates tokens from query. Treats each quote as a separate token, since that makes it easier to examine the query.
 *
 * @param queryString the query to tokenize
 * @param tokens output list that receives the tokens
 * @return number of quotes in the query
 */
public static int tokenizeQueryString(String queryString, List<String> tokens) {
    int countOfQuotes = 0;
    try {
        // first tokenize words and treat each quote as a separate token
        Map<String, String> args = new HashMap<String, String>();
        args.put(WhitespaceTokenizerFactory.LUCENE_MATCH_VERSION_PARAM, Version.LUCENE_6_3_0.toString());
        WhitespaceTokenizerFactory f = new WhitespaceTokenizerFactory(args);
        WhitespaceTokenizer s = (WhitespaceTokenizer) f.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
        s.setReader(new StringReader(queryString));
        s.reset();
        while (true) {
            CharTermAttribute t = s.getAttribute(CharTermAttribute.class);
            if (t == null) {
                break;
            }
            String tokenText = t.toString();
            if (tokenText.equals("\"")) {
                tokens.add("\"");
                countOfQuotes++;
            } else if (tokenText.startsWith("\"")) {
                tokens.add("\"");
                countOfQuotes++;
                if (tokenText.endsWith("\"")) {
                    tokens.add(tokenText.substring(1, tokenText.length() - 1));
                    tokens.add("\"");
                    countOfQuotes++;
                } else {
                    tokens.add(tokenText.substring(1));
                }
            } else if (tokenText.endsWith("\"")) {
                tokens.add(tokenText.substring(0, tokenText.length() - 1));
                tokens.add("\"");
                countOfQuotes++;
            } else if (!tokenText.trim().equals("")) {
                // take into account only if different than empty string
                tokens.add(tokenText);
            }
            if (!s.incrementToken()) {
                break;
            }
        }
        s.end();
        s.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return countOfQuotes;
}
Example 19
Source File: Zemberek2DeASCIIfyFilterFactory.java From lucene-solr-analysis-turkish with Apache License 2.0
public static void main(String[] args) throws IOException {
    StringReader reader = new StringReader("kus asisi ortaklar çekişme masali");
    Map<String, String> map = new HashMap<>();
    Zemberek2DeASCIIfyFilterFactory factory = new Zemberek2DeASCIIfyFilterFactory(map);
    WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
    whitespaceTokenizer.setReader(reader);
    TokenStream stream = factory.create(whitespaceTokenizer);
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAttribute.toString();
        System.out.println(term);
    }
    stream.end();
    reader.close();
}
Example 20
Source File: Zemberek2StemFilterFactory.java From lucene-solr-analysis-turkish with Apache License 2.0
public static void main(String[] args) throws IOException {
    StringReader reader = new StringReader("elması utansın ortaklar çekişme ile");
    Map<String, String> map = new HashMap<>();
    map.put("strategy", "frequency");
    Zemberek2StemFilterFactory factory = new Zemberek2StemFilterFactory(map);
    WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
    whitespaceTokenizer.setReader(reader);
    TokenStream stream = factory.create(whitespaceTokenizer);
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAttribute.toString();
        System.out.println(term);
    }
    stream.end();
    reader.close();
}