org.apache.lucene.util.UnicodeUtil Java Examples
The following examples show how to use org.apache.lucene.util.UnicodeUtil. They are drawn from open-source projects; the originating project, source file, and license are noted above each example.
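Before the project examples, here is a minimal round-trip sketch written for this page (the class name and test string are ours, and it assumes a Lucene version where UTF16toUTF8 accepts a CharSequence and maxUTF8Length is available, as in the examples below):

import org.apache.lucene.util.UnicodeUtil;

public class UnicodeUtilRoundTrip {
  public static void main(String[] args) {
    String s = "clef: \uD834\uDD22"; // U+1D122, encoded as a surrogate pair in UTF-16

    // Worst case: each UTF-16 unit may expand to MAX_UTF8_BYTES_PER_CHAR bytes.
    byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())];
    int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);

    // Decoding never yields more UTF-16 units than there are UTF-8 bytes.
    char[] utf16 = new char[utf8Len];
    int utf16Len = UnicodeUtil.UTF8toUTF16(utf8, 0, utf8Len, utf16);

    System.out.println(s.equals(new String(utf16, 0, utf16Len))); // prints true
  }
}

The allocate-worst-case-then-trim pattern in this sketch recurs throughout the examples that follow.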
Example #1
Source File: UpperFunction.java From Elasticsearch with Apache License 2.0
@Override
public BytesRef evaluate(Input<Object>... args) {
  Object stringValue = args[0].value();
  if (stringValue == null) {
    return null;
  }
  BytesRef inputByteRef = BytesRefs.toBytesRef(stringValue);

  char[] ref = new char[inputByteRef.length];
  int len = UnicodeUtil.UTF8toUTF16(inputByteRef.bytes, inputByteRef.offset, inputByteRef.length, ref);
  charUtils.toUpperCase(ref, 0, len);

  byte[] res = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * len];
  len = UnicodeUtil.UTF16toUTF8(ref, 0, len, res);
  return new BytesRef(res, 0, len);
}
Example #2
Source File: StringEncoding.java From spliceengine with GNU Affero General Public License v3.0
/**
 * Wraps the Lucene UnicodeUtil.UTF16toUTF8 bytes serialization...
 */
public static byte[] toBytes(String value, boolean desc) {
  if (value == null) return Encoding.EMPTY_BYTE_ARRAY;
  if (value.isEmpty()) {
    if (desc) return new byte[]{(byte) (0x01 ^ 0xff)};
    else return new byte[]{0x01};
  }
  // convert to UTF-8 encoding
  BytesRef result = new BytesRef();
  UnicodeUtil.UTF16toUTF8(value, 0, value.length(), result);
  byte[] returnArray = new byte[result.length];
  for (int i = 0; i < result.length; i++) {
    byte newD = (byte) (result.bytes[i + result.offset] + 2);
    if (desc)
      newD ^= 0xff; // reverse the sign bit so that data is reversed in 2's complement
    returnArray[i] = newD;
  }
  return returnArray;
}
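A note on the encoding above (an inference from the code, not documented upstream): shifting each UTF-8 byte up by 2 keeps the values 0x00 and 0x01 out of the payload, leaving 0x01 free to mark the empty string and the low bytes free as separators, while XOR-ing with 0xff inverts the byte-wise sort order for descending columns.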
Example #3
Source File: FacetFieldProcessorByArrayUIF.java From lucene-solr with Apache License 2.0
@Override
protected void findStartAndEndOrds() throws IOException {
  uif = UnInvertedField.getUnInvertedField(freq.field, fcontext.searcher);
  te = uif.getOrdTermsEnum(fcontext.searcher.getSlowAtomicReader()); // "te" can be null

  startTermIndex = 0;
  endTermIndex = uif.numTerms(); // one past the end

  if (prefixRef != null && te != null) {
    if (te.seekCeil(prefixRef.get()) == TermsEnum.SeekStatus.END) {
      startTermIndex = uif.numTerms();
    } else {
      startTermIndex = (int) te.ord();
    }
    prefixRef.append(UnicodeUtil.BIG_TERM);
    if (te.seekCeil(prefixRef.get()) == TermsEnum.SeekStatus.END) {
      endTermIndex = uif.numTerms();
    } else {
      endTermIndex = (int) te.ord();
    }
  }

  nTerms = endTermIndex - startTermIndex;
}
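The UnicodeUtil.BIG_TERM idiom here deserves a comment: BIG_TERM is a run of 0xff bytes that compares greater than any valid UTF-8 term, so appending it to the prefix and seeking again lands the enum one past the last term sharing that prefix — the exclusive end of the range. Example #27 below applies the same trick to facet prefixes.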
Example #4
Source File: ByteBuffersDataOutput.java From lucene-solr with Apache License 2.0
@Override
public void writeString(String v) {
  try {
    final int MAX_CHARS_PER_WINDOW = 1024;
    if (v.length() <= MAX_CHARS_PER_WINDOW) {
      final BytesRef utf8 = new BytesRef(v);
      writeVInt(utf8.length);
      writeBytes(utf8.bytes, utf8.offset, utf8.length);
    } else {
      writeVInt(UnicodeUtil.calcUTF16toUTF8Length(v, 0, v.length()));

      final byte[] buf = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW];
      UTF16toUTF8(v, 0, v.length(), buf, (len) -> {
        writeBytes(buf, 0, len);
      });
    }
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  }
}
Example #5
Source File: DaciukMihovAutomatonBuilder.java From lucene-solr with Apache License 2.0
/**
 * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
 * strings in UTF-8. These strings must be binary-sorted.
 */
public static Automaton build(Collection<BytesRef> input) {
  final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

  char[] chars = new char[0];
  CharsRef ref = new CharsRef();
  for (BytesRef b : input) {
    chars = ArrayUtil.grow(chars, b.length);
    final int len = UnicodeUtil.UTF8toUTF16(b, chars);
    ref.chars = chars;
    ref.length = len;
    builder.add(ref);
  }

  Automaton.Builder a = new Automaton.Builder();
  convert(a, builder.complete(), new IdentityHashMap<State,Integer>());

  return a.finish();
}
Example #6
Source File: FuzzyAutomatonBuilder.java From lucene-solr with Apache License 2.0
FuzzyAutomatonBuilder(String term, int maxEdits, int prefixLength, boolean transpositions) {
  if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE
        + ", inclusive; got: " + maxEdits);
  }
  if (prefixLength < 0) {
    throw new IllegalArgumentException("prefixLength cannot be less than 0");
  }
  this.term = term;
  this.maxEdits = maxEdits;
  int[] codePoints = stringToUTF32(term);
  this.termLength = codePoints.length;
  prefixLength = Math.min(prefixLength, codePoints.length);
  int[] suffix = new int[codePoints.length - prefixLength];
  System.arraycopy(codePoints, prefixLength, suffix, 0, suffix.length);
  this.levBuilder = new LevenshteinAutomata(suffix, Character.MAX_CODE_POINT, transpositions);
  this.prefix = UnicodeUtil.newString(codePoints, 0, prefixLength);
}
Example #7
Source File: TestMappingCharFilter.java From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("aa", "a");
  builder.add("bbb", "b");
  builder.add("cccc", "cc");

  builder.add("h", "i");
  builder.add("j", "jj");
  builder.add("k", "kkk");
  builder.add("ll", "llll");

  builder.add("empty", "");

  // non-BMP (surrogate pair):
  builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");

  builder.add("\uff01", "full-width-exclamation");

  normMap = builder.build();
}
Example #8
Source File: TestJapaneseTokenizer.java From lucene-solr with Apache License 2.0
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
  int numIterations = atLeast(500);
  for (int i = 0; i < numIterations; i++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + i);
    }
    String s = TestUtil.randomUnicodeString(random(), 100);
    try (TokenStream ts = analyzer.tokenStream("foo", s)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(termAtt));
      }
      ts.end();
    }
  }
}
Example #9
Source File: LowerFunction.java From Elasticsearch with Apache License 2.0
@Override
public BytesRef evaluate(Input<Object>... args) {
  Object stringValue = args[0].value();
  if (stringValue == null) {
    return null;
  }
  BytesRef inputByteRef = BytesRefs.toBytesRef(stringValue);

  char[] ref = new char[inputByteRef.length];
  int len = UnicodeUtil.UTF8toUTF16(inputByteRef.bytes, inputByteRef.offset, inputByteRef.length, ref);
  charUtils.toLowerCase(ref, 0, len);

  byte[] res = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * len];
  len = UnicodeUtil.UTF16toUTF8(ref, 0, len, res);
  return new BytesRef(res, 0, len);
}
Example #10
Source File: LineContext.java From crate with Apache License 2.0
@Nullable
String sourceAsString() {
  if (rawSource != null) {
    char[] chars = new char[rawSource.length];
    int len = UnicodeUtil.UTF8toUTF16(rawSource, 0, rawSource.length, chars);
    return new String(chars, 0, len);
  }
  return null;
}
Example #11
Source File: PartitionName.java From crate with Apache License 2.0
/**
 * Read utf8 bytes for bwc, with 0 as `null` indicator
 */
private static String readValueFrom(StreamInput in) throws IOException {
  int length = in.readVInt() - 1;
  if (length == -1) {
    return null;
  }
  if (length == 0) {
    return "";
  }
  byte[] bytes = new byte[length];
  in.readBytes(bytes, 0, length);
  char[] chars = new char[length];
  int len = UnicodeUtil.UTF8toUTF16(bytes, 0, length, chars);
  return new String(chars, 0, len);
}
Example #12
Source File: AutomatonTestUtil.java From lucene-solr with Apache License 2.0
/** Returns random string, including full unicode range. */
public static String randomRegexp(Random r) {
  while (true) {
    String regexp = randomRegexpString(r);
    // we will also generate some undefined unicode queries
    if (!UnicodeUtil.validUTF16String(regexp)) continue;
    try {
      new RegExp(regexp, RegExp.NONE);
      return regexp;
    } catch (Exception e) {
      // not a parseable regexp: loop and try again
    }
  }
}
Example #13
Source File: FSTTester.java From lucene-solr with Apache License 2.0
static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
  if (!isValidUnicode) {
    return term.toString();
  } else if (inputMode == 0) {
    // utf8
    return toBytesRef(term).utf8ToString() + " " + term;
  } else {
    // utf32
    return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term;
  }
}
Example #14
Source File: UTF8TaxonomyWriterCache.java From lucene-solr with Apache License 2.0
private BytesRef toBytes(FacetLabel label) {
  BytesRefBuilder bytes = this.bytes.get();
  bytes.clear();
  for (int i = 0; i < label.length; i++) {
    String part = label.components[i];
    if (i > 0) {
      bytes.append(DELIM_CHAR);
    }
    bytes.grow(bytes.length() + UnicodeUtil.maxUTF8Length(part.length()));
    bytes.setLength(UnicodeUtil.UTF16toUTF8(part, 0, part.length(), bytes.bytes(), bytes.length()));
  }
  return bytes.get();
}
Example #15
Source File: StringEncoding.java From spliceengine with GNU Affero General Public License v3.0
public static int toBytes(String value, boolean desc, byte[] buffer, int offset) {
  if (value == null || value.isEmpty()) return 0;
  // convert to UTF-8 encoding
  BytesRef result = new BytesRef();
  UnicodeUtil.UTF16toUTF8(value, 0, value.length(), result);
  for (int i = 0; i < result.length; i++) {
    byte newD = (byte) (result.bytes[i + result.offset] + 2);
    if (desc)
      newD ^= 0xff; // reverse the sign bit so that data is reversed in 2's complement
    buffer[offset + i] = newD;
  }
  // note: returns the char count, which equals the bytes written only for single-byte (ASCII) content
  return value.length();
}
Example #16
Source File: LabelledCharArrayMatcher.java From lucene-solr with Apache License 2.0
/**
 * Returns a representation of the automaton that matches char[] instead of byte[]
 */
static LabelledCharArrayMatcher wrap(String label, ByteRunAutomaton runAutomaton) {
  return wrap(label, (chars, offset, length) -> {
    int state = 0;
    final int maxIdx = offset + length;
    for (int i = offset; i < maxIdx; i++) {
      final int code = chars[i];
      int b;
      // UTF16 to UTF8 (inlined logic from UnicodeUtil.UTF16toUTF8)
      if (code < 0x80) {
        state = runAutomaton.step(state, code);
        if (state == -1) return false;
      } else if (code < 0x800) {
        b = (0xC0 | (code >> 6));
        state = runAutomaton.step(state, b);
        if (state == -1) return false;
        b = (0x80 | (code & 0x3F));
        state = runAutomaton.step(state, b);
        if (state == -1) return false;
      } else {
        // more complex
        byte[] utf8Bytes = new byte[4 * (maxIdx - i)];
        int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes);
        for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) {
          state = runAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF);
          if (state == -1) return false;
        }
        break;
      }
    }
    return runAutomaton.isAccept(state);
  });
}
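The branching above is a deliberate performance choice: the one- and two-byte UTF-8 cases are inlined so common characters step the automaton with no allocation, and only when a character needs three or more bytes (including surrogate pairs) does the lambda fall back to UnicodeUtil.UTF16toUTF8 for the remainder of the input.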
Example #17
Source File: TestIndexWriterUnicode.java From lucene-solr with Apache License 2.0
public void testAllUnicodeChars() throws Throwable {
  CharsRefBuilder utf16 = new CharsRefBuilder();
  char[] chars = new char[2];
  for (int ch = 0; ch < 0x0010FFFF; ch++) {
    if (ch == 0xd800)
      // Skip invalid code points
      ch = 0xe000;

    int len = 0;
    if (ch <= 0xffff) {
      chars[len++] = (char) ch;
    } else {
      chars[len++] = (char) (((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
      chars[len++] = (char) (((ch - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
    }

    BytesRef utf8 = new BytesRef(CharBuffer.wrap(chars, 0, len));

    String s1 = new String(chars, 0, len);
    String s2 = new String(utf8.bytes, 0, utf8.length, StandardCharsets.UTF_8);
    assertEquals("codepoint " + ch, s1, s2);

    utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
    assertEquals("codepoint " + ch, s1, utf16.toString());

    byte[] b = s1.getBytes(StandardCharsets.UTF_8);
    assertEquals(utf8.length, b.length);
    for (int j = 0; j < utf8.length; j++)
      assertEquals(utf8.bytes[j], b[j]);
  }
}
Example #18
Source File: TestExtendedMode.java From lucene-solr with Apache License 2.0
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
  int numIterations = atLeast(500);
  for (int i = 0; i < numIterations; i++) {
    String s = TestUtil.randomUnicodeString(random(), 100);
    try (TokenStream ts = analyzer.tokenStream("foo", s)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(termAtt));
      }
      ts.end();
    }
  }
}
Example #19
Source File: IndexSizeEstimator.java From lucene-solr with Apache License 2.0
/** Process a string field. */
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
  // trim the value if needed
  int len = value != null ? UnicodeUtil.calcUTF16toUTF8Length(value, 0, value.length()) : 0;
  if (value.length() > maxLength) {
    value = value.substring(0, maxLength);
  }
  countItem(fieldInfo.name, value, len);
}
Example #20
Source File: TestUTF32ToUTF8.java From lucene-solr with Apache License 2.0
private void assertAutomaton(Automaton automaton) throws Exception {
  CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
  ByteRunAutomaton bra = new ByteRunAutomaton(automaton);
  final AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);

  int num = atLeast(1000);
  for (int i = 0; i < num; i++) {
    final String string;
    if (random().nextBoolean()) {
      // likely not accepted
      string = TestUtil.randomUnicodeString(random());
    } else {
      // will be accepted
      int[] codepoints = ras.getRandomAcceptedString(random());
      try {
        string = UnicodeUtil.newString(codepoints, 0, codepoints.length);
      } catch (Exception e) {
        System.out.println(codepoints.length + " codepoints:");
        for (int j = 0; j < codepoints.length; j++) {
          System.out.println("  " + Integer.toHexString(codepoints[j]));
        }
        throw e;
      }
    }
    byte[] bytes = string.getBytes(StandardCharsets.UTF_8);
    assertEquals(cra.run(string), bra.run(bytes, 0, bytes.length));
  }
}
Example #21
Source File: PHPSerializedResponseWriter.java From lucene-solr with Apache License 2.0
@Override
public void writeStr(String name, String val, boolean needsEscaping) throws IOException {
  // serialized PHP strings don't need to be escaped at all, however the
  // string size reported needs to be the number of bytes rather than chars.
  utf8 = ArrayUtil.grow(utf8, val.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
  final int nBytes = UnicodeUtil.UTF16toUTF8(val, 0, val.length(), utf8);

  writer.write("s:");
  writer.write(Integer.toString(nBytes));
  writer.write(":\"");
  writer.write(val);
  writer.write("\";");
}
Example #22
Source File: TestIndexWriterUnicode.java From lucene-solr with Apache License 2.0
public void testTermUTF16SortOrder() throws Throwable {
  Random rnd = random();
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(rnd, dir);
  Document d = new Document();
  // Single segment
  Field f = newStringField("f", "", Field.Store.NO);
  d.add(f);
  char[] chars = new char[2];
  final Set<String> allTerms = new HashSet<>();

  int num = atLeast(200);

  for (int i = 0; i < num; i++) {
    final String s;
    if (rnd.nextBoolean()) {
      // Single char
      if (rnd.nextBoolean()) {
        // Above surrogates
        chars[0] = (char) getInt(rnd, 1 + UnicodeUtil.UNI_SUR_LOW_END, 0xffff);
      } else {
        // Below surrogates
        chars[0] = (char) getInt(rnd, 0, UnicodeUtil.UNI_SUR_HIGH_START - 1);
      }
      s = new String(chars, 0, 1);
    } else {
      // Surrogate pair
      chars[0] = (char) getInt(rnd, UnicodeUtil.UNI_SUR_HIGH_START, UnicodeUtil.UNI_SUR_HIGH_END);
      assertTrue(((int) chars[0]) >= UnicodeUtil.UNI_SUR_HIGH_START && ((int) chars[0]) <= UnicodeUtil.UNI_SUR_HIGH_END);
      chars[1] = (char) getInt(rnd, UnicodeUtil.UNI_SUR_LOW_START, UnicodeUtil.UNI_SUR_LOW_END);
      s = new String(chars, 0, 2);
    }
    allTerms.add(s);
    f.setStringValue(s);

    writer.addDocument(d);

    if ((1 + i) % 42 == 0) {
      writer.commit();
    }
  }

  IndexReader r = writer.getReader();

  // Test each sub-segment
  for (LeafReaderContext ctx : r.leaves()) {
    checkTermsOrder(ctx.reader(), allTerms, false);
  }
  checkTermsOrder(r, allTerms, true);

  // Test multi segment
  r.close();

  writer.forceMerge(1);

  // Test single segment
  r = writer.getReader();
  checkTermsOrder(r, allTerms, true);
  r.close();

  writer.close();
  dir.close();
}
Example #23
Source File: SerializerUtil.java From incubator-retired-blur with Apache License 2.0
public static void writeString(String s, DataOutput out) throws IOException {
  BytesRef bytes = new BytesRef();
  UnicodeUtil.UTF16toUTF8(s, 0, s.length(), bytes);
  writeBytesRef(bytes, out);
}
Example #24
Source File: RegexMatcher.java From Elasticsearch with Apache License 2.0
private static void UTF8toUTF16(BytesRef bytes, CharsRef charsRef) {
  if (charsRef.chars.length < bytes.length) {
    charsRef.chars = new char[bytes.length];
  }
  charsRef.length = UnicodeUtil.UTF8toUTF16(bytes, charsRef.chars);
}
Example #25
Source File: TestUTF32ToUTF8.java From lucene-solr with Apache License 2.0
private static boolean isSurrogate(int code) {
  return code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_LOW_END;
}
Example #26
Source File: TestUTF32ToUTF8.java From lucene-solr with Apache License 2.0
private boolean matches(ByteRunAutomaton a, int code) {
  char[] chars = Character.toChars(code);
  byte[] b = new byte[UnicodeUtil.maxUTF8Length(chars.length)];
  final int len = UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, b);
  return a.run(b, 0, len);
}
Example #27
Source File: TermGroupFacetCollector.java From lucene-solr with Apache License 2.0
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
  if (segmentFacetCounts != null) {
    segmentResults.add(createSegmentResult());
  }

  groupFieldTermsIndex = DocValues.getSorted(context.reader(), groupField);
  facetFieldDocTermOrds = DocValues.getSortedSet(context.reader(), facetField);
  facetFieldNumTerms = (int) facetFieldDocTermOrds.getValueCount();
  if (facetFieldNumTerms == 0) {
    facetOrdTermsEnum = null;
  } else {
    facetOrdTermsEnum = facetFieldDocTermOrds.termsEnum();
  }
  // [facetFieldNumTerms() + 1] for all possible facet values and docs not containing facet field
  segmentFacetCounts = new int[facetFieldNumTerms + 1];
  segmentTotalCount = 0;

  segmentGroupedFacetHits.clear();
  for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
    int groupOrd = groupedFacetHit.groupValue == null ? -1 : groupFieldTermsIndex.lookupTerm(groupedFacetHit.groupValue);
    if (groupedFacetHit.groupValue != null && groupOrd < 0) {
      continue;
    }

    int facetOrd;
    if (groupedFacetHit.facetValue != null) {
      if (facetOrdTermsEnum == null || !facetOrdTermsEnum.seekExact(groupedFacetHit.facetValue)) {
        continue;
      }
      facetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      facetOrd = facetFieldNumTerms;
    }

    // (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not containing facet field
    int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd;
    segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
  }

  if (facetPrefix != null) {
    TermsEnum.SeekStatus seekStatus;
    if (facetOrdTermsEnum != null) {
      seekStatus = facetOrdTermsEnum.seekCeil(facetPrefix);
    } else {
      seekStatus = TermsEnum.SeekStatus.END;
    }

    if (seekStatus != TermsEnum.SeekStatus.END) {
      startFacetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      startFacetOrd = 0;
      endFacetOrd = 0;
      return;
    }

    BytesRefBuilder facetEndPrefix = new BytesRefBuilder();
    facetEndPrefix.append(facetPrefix);
    facetEndPrefix.append(UnicodeUtil.BIG_TERM);
    seekStatus = facetOrdTermsEnum.seekCeil(facetEndPrefix.get());
    if (seekStatus != TermsEnum.SeekStatus.END) {
      endFacetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      endFacetOrd = facetFieldNumTerms; // Don't include null...
    }
  } else {
    startFacetOrd = 0;
    endFacetOrd = facetFieldNumTerms + 1;
  }
}
Example #28
Source File: FuzzyTermsEnum.java From lucene-solr with Apache License 2.0
@Override
public BytesRef next() throws IOException {
  if (queuedBottom != null) {
    bottomChanged(queuedBottom);
    queuedBottom = null;
  }

  BytesRef term;

  term = actualEnum.next();
  if (term == null) {
    // end
    return null;
  }

  int ed = maxEdits;

  // we know the outer DFA always matches.
  // now compute exact edit distance
  while (ed > 0) {
    if (matches(term, ed - 1)) {
      ed--;
    } else {
      break;
    }
  }

  if (ed == 0) { // exact match
    boostAtt.setBoost(1.0F);
  } else {
    final int codePointCount = UnicodeUtil.codePointCount(term);
    int minTermLength = Math.min(codePointCount, termLength);

    float similarity = 1.0f - (float) ed / (float) minTermLength;
    boostAtt.setBoost(similarity);
  }

  final float bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
  final BytesRef bottomTerm = maxBoostAtt.getCompetitiveTerm();
  if (bottom != this.bottom || bottomTerm != this.bottomTerm) {
    this.bottom = bottom;
    this.bottomTerm = bottomTerm;
    // clone the term before potentially doing something with it
    // this is a rare but wonderful occurrence anyway
    // We must delay bottomChanged until the next next() call otherwise we mess up docFreq(), etc., for the current term:
    queuedBottom = BytesRef.deepCopyOf(term);
  }

  return term;
}
Example #29
Source File: TestMappingCharFilter.java From lucene-solr with Apache License 2.0
public void testNonBMPChar() throws Exception {
  CharFilter cs = new MappingCharFilter(normMap, new StringReader(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1)));
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2);
}
Example #30
Source File: TokenInfoDictionaryTest.java From lucene-solr with Apache License 2.0
/** enumerates the entire FST/lookup data and just does basic sanity checks */
public void testEnumerateAll() throws Exception {
  // just for debugging
  int numTerms = 0;
  int numWords = 0;
  int lastWordId = -1;
  int lastSourceId = -1;
  TokenInfoDictionary tid = TokenInfoDictionary.getInstance();
  ConnectionCosts matrix = ConnectionCosts.getInstance();
  FST<Long> fst = tid.getFST().getInternalFST();
  IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
  IntsRefFSTEnum.InputOutput<Long> mapping;
  IntsRef scratch = new IntsRef();
  while ((mapping = fstEnum.next()) != null) {
    numTerms++;
    IntsRef input = mapping.input;
    char[] chars = new char[input.length];
    for (int i = 0; i < chars.length; i++) {
      chars[i] = (char) input.ints[input.offset + i];
    }
    assertTrue(UnicodeUtil.validUTF16String(new String(chars)));

    Long output = mapping.output;
    int sourceId = output.intValue();
    // we walk in order, terms, sourceIds, and wordIds should always be increasing
    assertTrue(sourceId > lastSourceId);
    lastSourceId = sourceId;
    tid.lookupWordIds(sourceId, scratch);
    for (int i = 0; i < scratch.length; i++) {
      numWords++;
      int wordId = scratch.ints[scratch.offset + i];
      assertTrue(wordId > lastWordId);
      lastWordId = wordId;

      String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
      assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));

      String inflectionForm = tid.getInflectionForm(wordId);
      assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm));
      if (inflectionForm != null) {
        // check that it's actually an ipadic inflection form
        assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
      }

      String inflectionType = tid.getInflectionType(wordId);
      assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType));
      if (inflectionType != null) {
        // check that it's actually an ipadic inflection type
        assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
      }

      int leftId = tid.getLeftId(wordId);
      int rightId = tid.getRightId(wordId);

      matrix.get(rightId, leftId);

      tid.getWordCost(wordId);

      String pos = tid.getPartOfSpeech(wordId);
      assertNotNull(pos);
      assertTrue(UnicodeUtil.validUTF16String(pos));
      // check that it's actually an ipadic pos tag
      assertNotNull(ToStringUtil.getPOSTranslation(pos));

      String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
      assertNotNull(pronunciation);
      assertTrue(UnicodeUtil.validUTF16String(pronunciation));

      String reading = tid.getReading(wordId, chars, 0, chars.length);
      assertNotNull(reading);
      assertTrue(UnicodeUtil.validUTF16String(reading));
    }
  }
  if (VERBOSE) {
    System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
  }
}