Java Code Examples for org.apache.lucene.util.UnicodeUtil#newString()
The following examples show how to use
org.apache.lucene.util.UnicodeUtil#newString().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: FuzzyAutomatonBuilder.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Prepares the pieces needed to build fuzzy automata for {@code term}.
 *
 * <p>The term is converted to an {@code int[]} via {@code stringToUTF32}; the first
 * {@code prefixLength} entries (clamped to the array length) are kept verbatim as the
 * prefix string, and the remaining entries are handed to a {@link LevenshteinAutomata}.
 *
 * @param term the term to match approximately
 * @param maxEdits maximum edit distance; must lie in
 *     {@code 0..LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE}, inclusive
 * @param prefixLength number of leading entries that are not subject to edits; must be
 *     non-negative, and is capped at the term's length
 * @param transpositions forwarded unchanged to the {@link LevenshteinAutomata} constructor
 * @throws IllegalArgumentException if {@code maxEdits} is out of range or
 *     {@code prefixLength} is negative
 */
FuzzyAutomatonBuilder(String term, int maxEdits, int prefixLength, boolean transpositions) {
  final boolean editsInRange =
      maxEdits >= 0 && maxEdits <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
  if (!editsInRange) {
    throw new IllegalArgumentException(
        "max edits must be 0.."
            + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE
            + ", inclusive; got: "
            + maxEdits);
  }
  if (prefixLength < 0) {
    throw new IllegalArgumentException("prefixLength cannot be less than 0");
  }

  this.term = term;
  this.maxEdits = maxEdits;

  final int[] termCodePoints = stringToUTF32(term);
  final int totalLength = termCodePoints.length;
  this.termLength = totalLength;

  // A requested prefix longer than the term simply means "the whole term is prefix".
  final int effectivePrefixLength = Math.min(prefixLength, totalLength);

  // Everything after the fixed prefix is what the Levenshtein automata operate on.
  final int[] editableTail = new int[totalLength - effectivePrefixLength];
  System.arraycopy(termCodePoints, effectivePrefixLength, editableTail, 0, editableTail.length);

  this.levBuilder =
      new LevenshteinAutomata(editableTail, Character.MAX_CODE_POINT, transpositions);
  this.prefix = UnicodeUtil.newString(termCodePoints, 0, effectivePrefixLength);
}
Example 2
Source File: FSTTester.java From lucene-solr with Apache License 2.0 | 5 votes |
static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) { if (!isValidUnicode) { return term.toString(); } else if (inputMode == 0) { // utf8 return toBytesRef(term).utf8ToString() + " " + term; } else { // utf32 return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term; } }
Example 3
Source File: TestUTF32ToUTF8.java From lucene-solr with Apache License 2.0 | 5 votes |
private void assertAutomaton(Automaton automaton) throws Exception { CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton); ByteRunAutomaton bra = new ByteRunAutomaton(automaton); final AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton); int num = atLeast(1000); for (int i = 0; i < num; i++) { final String string; if (random().nextBoolean()) { // likely not accepted string = TestUtil.randomUnicodeString(random()); } else { // will be accepted int[] codepoints = ras.getRandomAcceptedString(random()); try { string = UnicodeUtil.newString(codepoints, 0, codepoints.length); } catch (Exception e) { System.out.println(codepoints.length + " codepoints:"); for(int j=0;j<codepoints.length;j++) { System.out.println(" " + Integer.toHexString(codepoints[j])); } throw e; } } byte bytes[] = string.getBytes(StandardCharsets.UTF_8); assertEquals(cra.run(string), bra.run(bytes, 0, bytes.length)); } }
Example 4
Source File: TestMappingCharFilter.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Feeds a single supplementary code point (U+1D122, above U+FFFF) through a
 * {@link MappingCharFilter} configured with {@code normMap} and expects exactly one token,
 * {@code "fclef"}.
 *
 * <p>NOTE(review): the trailing arguments to {@code assertTokenStreamContents} are
 * presumably start offsets, end offsets, and the final offset; confirm against that
 * helper's signature.
 */
public void testNonBMPChar() throws Exception {
  final int[] nonBmpCodePoint = {0x1D122};
  final String input = UnicodeUtil.newString(nonBmpCodePoint, 0, 1);

  final CharFilter filtered = new MappingCharFilter(normMap, new StringReader(input));
  final TokenStream tokens = whitespaceMockTokenizer(filtered);

  final String[] expectedTerms = {"fclef"};
  final int[] expectedFirstInts = {0};
  final int[] expectedSecondInts = {2};
  assertTokenStreamContents(tokens, expectedTerms, expectedFirstInts, expectedSecondInts, 2);
}