org.apache.lucene.util.UnicodeUtil Java Examples
The following examples show how to use org.apache.lucene.util.UnicodeUtil. They are drawn from open-source projects; the originating project, source file, and license are noted above each example.
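Before the project examples, here is a minimal round-trip sketch written for this page (the class name and test string are ours, and it assumes a Lucene version where UTF16toUTF8 accepts a CharSequence and maxUTF8Length is available, as in the examples below):

import org.apache.lucene.util.UnicodeUtil;

public class UnicodeUtilRoundTrip {
  public static void main(String[] args) {
    String s = "clef: \uD834\uDD22"; // U+1D122, encoded as a surrogate pair in UTF-16

    // Worst case: each UTF-16 unit may expand to MAX_UTF8_BYTES_PER_CHAR bytes.
    byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())];
    int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);

    // Decoding never yields more UTF-16 units than there are UTF-8 bytes.
    char[] utf16 = new char[utf8Len];
    int utf16Len = UnicodeUtil.UTF8toUTF16(utf8, 0, utf8Len, utf16);

    System.out.println(s.equals(new String(utf16, 0, utf16Len))); // prints true
  }
}

The allocate-worst-case-then-trim pattern in this sketch recurs throughout the examples that follow.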
Example #1
Source File: UpperFunction.java From Elasticsearch with Apache License 2.0
@Override
public BytesRef evaluate(Input<Object>... args) {
  Object stringValue = args[0].value();
  if (stringValue == null) {
    return null;
  }
  BytesRef inputByteRef = BytesRefs.toBytesRef(stringValue);

  char[] ref = new char[inputByteRef.length];
  int len = UnicodeUtil.UTF8toUTF16(inputByteRef.bytes, inputByteRef.offset, inputByteRef.length, ref);
  charUtils.toUpperCase(ref, 0, len);

  byte[] res = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * len];
  len = UnicodeUtil.UTF16toUTF8(ref, 0, len, res);
  return new BytesRef(res, 0, len);
}
Example #2
Source File: StringEncoding.java From spliceengine with GNU Affero General Public License v3.0
/**
 * Wraps the Lucene UnicodeUtil.UTF16toUTF8 bytes serialization...
 */
public static byte[] toBytes(String value, boolean desc) {
  if (value == null) return Encoding.EMPTY_BYTE_ARRAY;
  if (value.isEmpty()) {
    if (desc) return new byte[]{(byte) (0x01 ^ 0xff)};
    else return new byte[]{0x01};
  }
  // convert to UTF-8 encoding
  BytesRef result = new BytesRef();
  UnicodeUtil.UTF16toUTF8(value, 0, value.length(), result);
  byte[] returnArray = new byte[result.length];
  for (int i = 0; i < result.length; i++) {
    byte newD = (byte) (result.bytes[i + result.offset] + 2);
    if (desc)
      newD ^= 0xff; // reverse the sign bit so that data is reversed in 2's complement
    returnArray[i] = newD;
  }
  return returnArray;
}
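A note on the encoding above (an inference from the code, not documented upstream): shifting each UTF-8 byte up by 2 keeps the values 0x00 and 0x01 out of the payload, leaving 0x01 free to mark the empty string and the low bytes free as separators, while XOR-ing with 0xff inverts the byte-wise sort order for descending columns.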
Example #3
Source File: FacetFieldProcessorByArrayUIF.java From lucene-solr with Apache License 2.0
@Override
protected void findStartAndEndOrds() throws IOException {
  uif = UnInvertedField.getUnInvertedField(freq.field, fcontext.searcher);
  te = uif.getOrdTermsEnum(fcontext.searcher.getSlowAtomicReader()); // "te" can be null

  startTermIndex = 0;
  endTermIndex = uif.numTerms(); // one past the end

  if (prefixRef != null && te != null) {
    if (te.seekCeil(prefixRef.get()) == TermsEnum.SeekStatus.END) {
      startTermIndex = uif.numTerms();
    } else {
      startTermIndex = (int) te.ord();
    }
    prefixRef.append(UnicodeUtil.BIG_TERM);
    if (te.seekCeil(prefixRef.get()) == TermsEnum.SeekStatus.END) {
      endTermIndex = uif.numTerms();
    } else {
      endTermIndex = (int) te.ord();
    }
  }

  nTerms = endTermIndex - startTermIndex;
}
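The UnicodeUtil.BIG_TERM idiom here deserves a comment: BIG_TERM is a run of 0xff bytes that compares greater than any valid UTF-8 term, so appending it to the prefix and seeking again lands the enum one past the last term sharing that prefix — the exclusive end of the range. Example #27 below applies the same trick to facet prefixes.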
Example #4
Source File: ByteBuffersDataOutput.java From lucene-solr with Apache License 2.0
@Override
public void writeString(String v) {
  try {
    final int MAX_CHARS_PER_WINDOW = 1024;
    if (v.length() <= MAX_CHARS_PER_WINDOW) {
      final BytesRef utf8 = new BytesRef(v);
      writeVInt(utf8.length);
      writeBytes(utf8.bytes, utf8.offset, utf8.length);
    } else {
      writeVInt(UnicodeUtil.calcUTF16toUTF8Length(v, 0, v.length()));

      final byte[] buf = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW];
      UTF16toUTF8(v, 0, v.length(), buf, (len) -> {
        writeBytes(buf, 0, len);
      });
    }
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  }
}
Example #5
Source File: DaciukMihovAutomatonBuilder.java From lucene-solr with Apache License 2.0
/**
 * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
 * strings in UTF-8. These strings must be binary-sorted.
 */
public static Automaton build(Collection<BytesRef> input) {
  final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

  char[] chars = new char[0];
  CharsRef ref = new CharsRef();
  for (BytesRef b : input) {
    chars = ArrayUtil.grow(chars, b.length);
    final int len = UnicodeUtil.UTF8toUTF16(b, chars);
    ref.chars = chars;
    ref.length = len;
    builder.add(ref);
  }

  Automaton.Builder a = new Automaton.Builder();
  convert(a, builder.complete(), new IdentityHashMap<State,Integer>());

  return a.finish();
}
Example #6
Source File: FuzzyAutomatonBuilder.java From lucene-solr with Apache License 2.0
FuzzyAutomatonBuilder(String term, int maxEdits, int prefixLength, boolean transpositions) {
  if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE
        + ", inclusive; got: " + maxEdits);
  }
  if (prefixLength < 0) {
    throw new IllegalArgumentException("prefixLength cannot be less than 0");
  }
  this.term = term;
  this.maxEdits = maxEdits;
  int[] codePoints = stringToUTF32(term);
  this.termLength = codePoints.length;
  prefixLength = Math.min(prefixLength, codePoints.length);
  int[] suffix = new int[codePoints.length - prefixLength];
  System.arraycopy(codePoints, prefixLength, suffix, 0, suffix.length);
  this.levBuilder = new LevenshteinAutomata(suffix, Character.MAX_CODE_POINT, transpositions);
  this.prefix = UnicodeUtil.newString(codePoints, 0, prefixLength);
}
Example #7
Source File: TestMappingCharFilter.java From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("aa", "a");
  builder.add("bbb", "b");
  builder.add("cccc", "cc");

  builder.add("h", "i");
  builder.add("j", "jj");
  builder.add("k", "kkk");
  builder.add("ll", "llll");

  builder.add("empty", "");

  // non-BMP (surrogate pair):
  builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");

  builder.add("\uff01", "full-width-exclamation");

  normMap = builder.build();
}
Example #8
Source File: TestJapaneseTokenizer.java From lucene-solr with Apache License 2.0
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
  int numIterations = atLeast(500);
  for (int i = 0; i < numIterations; i++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + i);
    }
    String s = TestUtil.randomUnicodeString(random(), 100);
    try (TokenStream ts = analyzer.tokenStream("foo", s)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(termAtt));
      }
      ts.end();
    }
  }
}
Example #9
Source File: LowerFunction.java From Elasticsearch with Apache License 2.0
@Override
public BytesRef evaluate(Input<Object>... args) {
  Object stringValue = args[0].value();
  if (stringValue == null) {
    return null;
  }
  BytesRef inputByteRef = BytesRefs.toBytesRef(stringValue);

  char[] ref = new char[inputByteRef.length];
  int len = UnicodeUtil.UTF8toUTF16(inputByteRef.bytes, inputByteRef.offset, inputByteRef.length, ref);
  charUtils.toLowerCase(ref, 0, len);

  byte[] res = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * len];
  len = UnicodeUtil.UTF16toUTF8(ref, 0, len, res);
  return new BytesRef(res, 0, len);
}
Example #10
Source File: LineContext.java From crate with Apache License 2.0
@Nullable
String sourceAsString() {
  if (rawSource != null) {
    char[] chars = new char[rawSource.length];
    int len = UnicodeUtil.UTF8toUTF16(rawSource, 0, rawSource.length, chars);
    return new String(chars, 0, len);
  }
  return null;
}
Example #11
Source File: PartitionName.java From crate with Apache License 2.0
/**
 * Read utf8 bytes for bwc, with 0 as `null` indicator
 */
private static String readValueFrom(StreamInput in) throws IOException {
  int length = in.readVInt() - 1;
  if (length == -1) {
    return null;
  }
  if (length == 0) {
    return "";
  }
  byte[] bytes = new byte[length];
  in.readBytes(bytes, 0, length);
  char[] chars = new char[length];
  int len = UnicodeUtil.UTF8toUTF16(bytes, 0, length, chars);
  return new String(chars, 0, len);
}
Example #12
Source File: AutomatonTestUtil.java From lucene-solr with Apache License 2.0
/** Returns random string, including full unicode range. */
public static String randomRegexp(Random r) {
  while (true) {
    String regexp = randomRegexpString(r);
    // we will also generate some undefined unicode queries
    if (!UnicodeUtil.validUTF16String(regexp)) continue;
    try {
      new RegExp(regexp, RegExp.NONE);
      return regexp;
    } catch (Exception e) {
      // not a parseable regexp: loop and try again
    }
  }
}
Example #13
Source File: FSTTester.java From lucene-solr with Apache License 2.0
static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
  if (!isValidUnicode) {
    return term.toString();
  } else if (inputMode == 0) {
    // utf8
    return toBytesRef(term).utf8ToString() + " " + term;
  } else {
    // utf32
    return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term;
  }
}
Example #14
Source File: UTF8TaxonomyWriterCache.java From lucene-solr with Apache License 2.0
private BytesRef toBytes(FacetLabel label) {
  BytesRefBuilder bytes = this.bytes.get();
  bytes.clear();
  for (int i = 0; i < label.length; i++) {
    String part = label.components[i];
    if (i > 0) {
      bytes.append(DELIM_CHAR);
    }
    bytes.grow(bytes.length() + UnicodeUtil.maxUTF8Length(part.length()));
    bytes.setLength(UnicodeUtil.UTF16toUTF8(part, 0, part.length(), bytes.bytes(), bytes.length()));
  }
  return bytes.get();
}
Example #15
Source File: StringEncoding.java From spliceengine with GNU Affero General Public License v3.0
public static int toBytes(String value, boolean desc, byte[] buffer, int offset) {
  if (value == null || value.isEmpty()) return 0;
  // convert to UTF-8 encoding
  BytesRef result = new BytesRef();
  UnicodeUtil.UTF16toUTF8(value, 0, value.length(), result);
  for (int i = 0; i < result.length; i++) {
    byte newD = (byte) (result.bytes[i + result.offset] + 2);
    if (desc)
      newD ^= 0xff; // reverse the sign bit so that data is reversed in 2's complement
    buffer[offset + i] = newD;
  }
  // note: returns the char count, which equals the bytes written only for single-byte (ASCII) content
  return value.length();
}
Example #16
Source File: LabelledCharArrayMatcher.java From lucene-solr with Apache License 2.0
/**
 * Returns a representation of the automaton that matches char[] instead of byte[]
 */
static LabelledCharArrayMatcher wrap(String label, ByteRunAutomaton runAutomaton) {
  return wrap(label, (chars, offset, length) -> {
    int state = 0;
    final int maxIdx = offset + length;
    for (int i = offset; i < maxIdx; i++) {
      final int code = chars[i];
      int b;
      // UTF16 to UTF8 (inlined logic from UnicodeUtil.UTF16toUTF8)
      if (code < 0x80) {
        state = runAutomaton.step(state, code);
        if (state == -1) return false;
      } else if (code < 0x800) {
        b = (0xC0 | (code >> 6));
        state = runAutomaton.step(state, b);
        if (state == -1) return false;
        b = (0x80 | (code & 0x3F));
        state = runAutomaton.step(state, b);
        if (state == -1) return false;
      } else {
        // more complex
        byte[] utf8Bytes = new byte[4 * (maxIdx - i)];
        int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes);
        for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) {
          state = runAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF);
          if (state == -1) return false;
        }
        break;
      }
    }
    return runAutomaton.isAccept(state);
  });
}
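The branching above is a deliberate performance choice: the one- and two-byte UTF-8 cases are inlined so common characters step the automaton with no allocation, and only when a character needs three or more bytes (including surrogate pairs) does the lambda fall back to UnicodeUtil.UTF16toUTF8 for the remainder of the input.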
Example #17
Source File: TestIndexWriterUnicode.java From lucene-solr with Apache License 2.0
public void testAllUnicodeChars() throws Throwable {
  CharsRefBuilder utf16 = new CharsRefBuilder();
  char[] chars = new char[2];
  for (int ch = 0; ch < 0x0010FFFF; ch++) {
    if (ch == 0xd800)
      // Skip invalid code points
      ch = 0xe000;

    int len = 0;
    if (ch <= 0xffff) {
      chars[len++] = (char) ch;
    } else {
      chars[len++] = (char) (((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
      chars[len++] = (char) (((ch - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
    }

    BytesRef utf8 = new BytesRef(CharBuffer.wrap(chars, 0, len));

    String s1 = new String(chars, 0, len);
    String s2 = new String(utf8.bytes, 0, utf8.length, StandardCharsets.UTF_8);
    assertEquals("codepoint " + ch, s1, s2);

    utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
    assertEquals("codepoint " + ch, s1, utf16.toString());

    byte[] b = s1.getBytes(StandardCharsets.UTF_8);
    assertEquals(utf8.length, b.length);
    for (int j = 0; j < utf8.length; j++)
      assertEquals(utf8.bytes[j], b[j]);
  }
}
Example #18
Source File: TestExtendedMode.java From lucene-solr with Apache License 2.0
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
  int numIterations = atLeast(500);
  for (int i = 0; i < numIterations; i++) {
    String s = TestUtil.randomUnicodeString(random(), 100);
    try (TokenStream ts = analyzer.tokenStream("foo", s)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(termAtt));
      }
      ts.end();
    }
  }
}
Example #19
Source File: IndexSizeEstimator.java From lucene-solr with Apache License 2.0
/** Process a string field. */
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
  // trim the value if needed
  int len = value != null ? UnicodeUtil.calcUTF16toUTF8Length(value, 0, value.length()) : 0;
  if (value.length() > maxLength) {
    value = value.substring(0, maxLength);
  }
  countItem(fieldInfo.name, value, len);
}
Example #20
Source File: TestUTF32ToUTF8.java From lucene-solr with Apache License 2.0
private void assertAutomaton(Automaton automaton) throws Exception {
  CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
  ByteRunAutomaton bra = new ByteRunAutomaton(automaton);
  final AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);

  int num = atLeast(1000);
  for (int i = 0; i < num; i++) {
    final String string;
    if (random().nextBoolean()) {
      // likely not accepted
      string = TestUtil.randomUnicodeString(random());
    } else {
      // will be accepted
      int[] codepoints = ras.getRandomAcceptedString(random());
      try {
        string = UnicodeUtil.newString(codepoints, 0, codepoints.length);
      } catch (Exception e) {
        System.out.println(codepoints.length + " codepoints:");
        for (int j = 0; j < codepoints.length; j++) {
          System.out.println("  " + Integer.toHexString(codepoints[j]));
        }
        throw e;
      }
    }
    byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
    assertEquals(cra.run(string), bra.run(bytes, 0, bytes.length));
  }
}
Example #21
Source File: PHPSerializedResponseWriter.java From lucene-solr with Apache License 2.0
@Override
public void writeStr(String name, String val, boolean needsEscaping) throws IOException {
  // serialized PHP strings don't need to be escaped at all, however the
  // string size reported needs to be the number of bytes rather than chars.
  utf8 = ArrayUtil.grow(utf8, val.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
  final int nBytes = UnicodeUtil.UTF16toUTF8(val, 0, val.length(), utf8);

  writer.write("s:");
  writer.write(Integer.toString(nBytes));
  writer.write(":\"");
  writer.write(val);
  writer.write("\";");
}
Example #22
Source File: TestIndexWriterUnicode.java From lucene-solr with Apache License 2.0
public void testTermUTF16SortOrder() throws Throwable {
  Random rnd = random();
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(rnd, dir);
  Document d = new Document();
  // Single segment
  Field f = newStringField("f", "", Field.Store.NO);
  d.add(f);
  char[] chars = new char[2];
  final Set<String> allTerms = new HashSet<>();

  int num = atLeast(200);

  for (int i = 0; i < num; i++) {
    final String s;
    if (rnd.nextBoolean()) {
      // Single char
      if (rnd.nextBoolean()) {
        // Above surrogates
        chars[0] = (char) getInt(rnd, 1 + UnicodeUtil.UNI_SUR_LOW_END, 0xffff);
      } else {
        // Below surrogates
        chars[0] = (char) getInt(rnd, 0, UnicodeUtil.UNI_SUR_HIGH_START - 1);
      }
      s = new String(chars, 0, 1);
    } else {
      // Surrogate pair
      chars[0] = (char) getInt(rnd, UnicodeUtil.UNI_SUR_HIGH_START, UnicodeUtil.UNI_SUR_HIGH_END);
      assertTrue(((int) chars[0]) >= UnicodeUtil.UNI_SUR_HIGH_START && ((int) chars[0]) <= UnicodeUtil.UNI_SUR_HIGH_END);
      chars[1] = (char) getInt(rnd, UnicodeUtil.UNI_SUR_LOW_START, UnicodeUtil.UNI_SUR_LOW_END);
      s = new String(chars, 0, 2);
    }
    allTerms.add(s);
    f.setStringValue(s);

    writer.addDocument(d);

    if ((1 + i) % 42 == 0) {
      writer.commit();
    }
  }

  IndexReader r = writer.getReader();

  // Test each sub-segment
  for (LeafReaderContext ctx : r.leaves()) {
    checkTermsOrder(ctx.reader(), allTerms, false);
  }
  checkTermsOrder(r, allTerms, true);

  // Test multi segment
  r.close();

  writer.forceMerge(1);

  // Test single segment
  r = writer.getReader();
  checkTermsOrder(r, allTerms, true);
  r.close();

  writer.close();
  dir.close();
}
Example #23
Source File: SerializerUtil.java From incubator-retired-blur with Apache License 2.0
public static void writeString(String s, DataOutput out) throws IOException {
  BytesRef bytes = new BytesRef();
  UnicodeUtil.UTF16toUTF8(s, 0, s.length(), bytes);
  writeBytesRef(bytes, out);
}
Example #24
Source File: RegexMatcher.java From Elasticsearch with Apache License 2.0
private static void UTF8toUTF16(BytesRef bytes, CharsRef charsRef) {
  if (charsRef.chars.length < bytes.length) {
    charsRef.chars = new char[bytes.length];
  }
  charsRef.length = UnicodeUtil.UTF8toUTF16(bytes, charsRef.chars);
}
Example #25
Source File: TestUTF32ToUTF8.java From lucene-solr with Apache License 2.0
private static boolean isSurrogate(int code) {
  return code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_LOW_END;
}
Example #26
Source File: TestUTF32ToUTF8.java From lucene-solr with Apache License 2.0
private boolean matches(ByteRunAutomaton a, int code) {
  char[] chars = Character.toChars(code);
  byte[] b = new byte[UnicodeUtil.maxUTF8Length(chars.length)];
  final int len = UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, b);
  return a.run(b, 0, len);
}
Example #27
Source File: TermGroupFacetCollector.java From lucene-solr with Apache License 2.0
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
  if (segmentFacetCounts != null) {
    segmentResults.add(createSegmentResult());
  }

  groupFieldTermsIndex = DocValues.getSorted(context.reader(), groupField);
  facetFieldDocTermOrds = DocValues.getSortedSet(context.reader(), facetField);
  facetFieldNumTerms = (int) facetFieldDocTermOrds.getValueCount();
  if (facetFieldNumTerms == 0) {
    facetOrdTermsEnum = null;
  } else {
    facetOrdTermsEnum = facetFieldDocTermOrds.termsEnum();
  }
  // [facetFieldNumTerms() + 1] for all possible facet values and docs not containing facet field
  segmentFacetCounts = new int[facetFieldNumTerms + 1];
  segmentTotalCount = 0;

  segmentGroupedFacetHits.clear();
  for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
    int groupOrd = groupedFacetHit.groupValue == null ? -1 : groupFieldTermsIndex.lookupTerm(groupedFacetHit.groupValue);
    if (groupedFacetHit.groupValue != null && groupOrd < 0) {
      continue;
    }

    int facetOrd;
    if (groupedFacetHit.facetValue != null) {
      if (facetOrdTermsEnum == null || !facetOrdTermsEnum.seekExact(groupedFacetHit.facetValue)) {
        continue;
      }
      facetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      facetOrd = facetFieldNumTerms;
    }

    // (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not containing facet field
    int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd;
    segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
  }

  if (facetPrefix != null) {
    TermsEnum.SeekStatus seekStatus;
    if (facetOrdTermsEnum != null) {
      seekStatus = facetOrdTermsEnum.seekCeil(facetPrefix);
    } else {
      seekStatus = TermsEnum.SeekStatus.END;
    }

    if (seekStatus != TermsEnum.SeekStatus.END) {
      startFacetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      startFacetOrd = 0;
      endFacetOrd = 0;
      return;
    }

    BytesRefBuilder facetEndPrefix = new BytesRefBuilder();
    facetEndPrefix.append(facetPrefix);
    facetEndPrefix.append(UnicodeUtil.BIG_TERM);
    seekStatus = facetOrdTermsEnum.seekCeil(facetEndPrefix.get());
    if (seekStatus != TermsEnum.SeekStatus.END) {
      endFacetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      endFacetOrd = facetFieldNumTerms; // Don't include null...
    }
  } else {
    startFacetOrd = 0;
    endFacetOrd = facetFieldNumTerms + 1;
  }
}
Example #28
Source File: FuzzyTermsEnum.java From lucene-solr with Apache License 2.0
@Override
public BytesRef next() throws IOException {
  if (queuedBottom != null) {
    bottomChanged(queuedBottom);
    queuedBottom = null;
  }

  BytesRef term;

  term = actualEnum.next();
  if (term == null) {
    // end
    return null;
  }

  int ed = maxEdits;

  // we know the outer DFA always matches.
  // now compute exact edit distance
  while (ed > 0) {
    if (matches(term, ed - 1)) {
      ed--;
    } else {
      break;
    }
  }

  if (ed == 0) { // exact match
    boostAtt.setBoost(1.0F);
  } else {
    final int codePointCount = UnicodeUtil.codePointCount(term);
    int minTermLength = Math.min(codePointCount, termLength);

    float similarity = 1.0f - (float) ed / (float) minTermLength;
    boostAtt.setBoost(similarity);
  }

  final float bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
  final BytesRef bottomTerm = maxBoostAtt.getCompetitiveTerm();
  if (bottom != this.bottom || bottomTerm != this.bottomTerm) {
    this.bottom = bottom;
    this.bottomTerm = bottomTerm;
    // clone the term before potentially doing something with it
    // this is a rare but wonderful occurrence anyway
    // We must delay bottomChanged until the next next() call otherwise we mess up docFreq(), etc., for the current term:
    queuedBottom = BytesRef.deepCopyOf(term);
  }

  return term;
}
Example #29
Source File: TestMappingCharFilter.java From lucene-solr with Apache License 2.0
public void testNonBMPChar() throws Exception {
  CharFilter cs = new MappingCharFilter(normMap, new StringReader(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1)));
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2);
}
Example #30
Source File: TokenInfoDictionaryTest.java From lucene-solr with Apache License 2.0
/** enumerates the entire FST/lookup data and just does basic sanity checks */
public void testEnumerateAll() throws Exception {
  // just for debugging
  int numTerms = 0;
  int numWords = 0;
  int lastWordId = -1;
  int lastSourceId = -1;
  TokenInfoDictionary tid = TokenInfoDictionary.getInstance();
  ConnectionCosts matrix = ConnectionCosts.getInstance();
  FST<Long> fst = tid.getFST().getInternalFST();
  IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
  IntsRefFSTEnum.InputOutput<Long> mapping;
  IntsRef scratch = new IntsRef();
  while ((mapping = fstEnum.next()) != null) {
    numTerms++;
    IntsRef input = mapping.input;
    char[] chars = new char[input.length];
    for (int i = 0; i < chars.length; i++) {
      chars[i] = (char) input.ints[input.offset + i];
    }
    assertTrue(UnicodeUtil.validUTF16String(new String(chars)));

    Long output = mapping.output;
    int sourceId = output.intValue();
    // we walk in order, terms, sourceIds, and wordIds should always be increasing
    assertTrue(sourceId > lastSourceId);
    lastSourceId = sourceId;
    tid.lookupWordIds(sourceId, scratch);
    for (int i = 0; i < scratch.length; i++) {
      numWords++;
      int wordId = scratch.ints[scratch.offset + i];
      assertTrue(wordId > lastWordId);
      lastWordId = wordId;

      String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
      assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));

      String inflectionForm = tid.getInflectionForm(wordId);
      assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm));
      if (inflectionForm != null) {
        // check that it's actually an ipadic inflection form
        assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
      }

      String inflectionType = tid.getInflectionType(wordId);
      assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType));
      if (inflectionType != null) {
        // check that it's actually an ipadic inflection type
        assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
      }

      int leftId = tid.getLeftId(wordId);
      int rightId = tid.getRightId(wordId);

      matrix.get(rightId, leftId);

      tid.getWordCost(wordId);

      String pos = tid.getPartOfSpeech(wordId);
      assertNotNull(pos);
      assertTrue(UnicodeUtil.validUTF16String(pos));
      // check that it's actually an ipadic pos tag
      assertNotNull(ToStringUtil.getPOSTranslation(pos));

      String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
      assertNotNull(pronunciation);
      assertTrue(UnicodeUtil.validUTF16String(pronunciation));

      String reading = tid.getReading(wordId, chars, 0, chars.length);
      assertNotNull(reading);
      assertTrue(UnicodeUtil.validUTF16String(reading));
    }
  }
  if (VERBOSE) {
    System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
  }
}