com.ibm.icu.text.UnicodeSetIterator Java Examples

Source File: CollationDataBuilder.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Replaces context-sensitive mappings with context-free ones for the code points in {@code set}.
 *
 * <p>Does nothing for an empty set. Otherwise walks the set's code points, stopping at the
 * first string element, and for each code point:
 * <ul>
 *   <li>if the trie holds {@code Collation.FALLBACK_CE32} and the final base CE32 has context,
 *       stores a copy of the base mapping made without context;</li>
 *   <li>if the trie holds a builder context CE32, stores the {@code ce32} of its
 *       ConditionalCE32 instead and removes the code point from {@code contextChars}.</li>
 * </ul>
 * Sets {@code modified} whenever the set was non-empty.
 *
 * @param set the code points to process
 */
void suppressContractions(UnicodeSet set) {
    if (set.isEmpty()) {
        return;
    }
    for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next(); ) {
        if (it.codepoint == UnicodeSetIterator.IS_STRING) {
            // Same stop condition as the loop header would apply: end at the first string.
            break;
        }
        final int codePoint = it.codepoint;
        final int current = trie.get(codePoint);
        if (current == Collation.FALLBACK_CE32) {
            final int baseCE32 = base.getFinalCE32(base.getCE32(codePoint));
            if (Collation.ce32HasContext(baseCE32)) {
                final int contextFree =
                        copyFromBaseCE32(codePoint, baseCE32, false /* without context */);
                trie.set(codePoint, contextFree);
            }
        } else if (isBuilderContextCE32(current)) {
            // Simply abandon the list of ConditionalCE32.
            // The caller will copy this builder in the end,
            // eliminating unreachable data.
            final int defaultCE32 = getConditionalCE32ForCE32(current).ce32;
            trie.set(codePoint, defaultCE32);
            contextChars.remove(codePoint);
        }
    }
    modified = true;
}

Source File: CollationDataBuilder.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Re-tags the mappings of all {@code [:Nd:]} code points with {@code Collation.DIGIT_TAG}.
 *
 * <p>For each such code point whose trie value is neither {@code Collation.FALLBACK_CE32}
 * nor {@code Collation.UNASSIGNED_CE32}, the current CE32 is stored via {@code addCE32()}
 * and the trie entry is replaced by a digit-tag CE32 built from the returned index and
 * {@code UCharacter.digit(c)}.
 *
 * @throws IndexOutOfBoundsException if the index returned by {@code addCE32()} exceeds
 *         {@code Collation.MAX_INDEX}
 */
protected void setDigitTags() {
    final UnicodeSet decimalDigits = new UnicodeSet("[:Nd:]");
    for (UnicodeSetIterator it = new UnicodeSetIterator(decimalDigits); it.next(); ) {
        assert it.codepoint != UnicodeSetIterator.IS_STRING;
        final int digitChar = it.codepoint;
        final int existing = trie.get(digitChar);
        if (existing == Collation.FALLBACK_CE32 || existing == Collation.UNASSIGNED_CE32) {
            // Nothing of our own stored for this digit; leave it untouched.
            continue;
        }
        final int index = addCE32(existing);
        if (index > Collation.MAX_INDEX) {
            // BufferOverflowException is a better fit
            // but cannot be constructed with a message string.
            throw new IndexOutOfBoundsException("too many mappings");
        }
        final int tagged = Collation.makeCE32FromTagIndexAndLength(
                Collation.DIGIT_TAG, index, UCharacter.digit(digitChar));  // u_charDigitValue(c)
        trie.set(digitChar, tagged);
    }
}

Source File: CollationDataBuilder.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Rebuilds the context data for every code point in {@code contextChars}.
 *
 * <p>Abandoned lists and the cached builtCE32 are ignored: {@code contexts} is emptied
 * first and every context is built from scratch. Each code point's trie entry is
 * replaced by the value returned from {@code buildContext()}.
 *
 * @throws AssertionError if a code point in {@code contextChars} does not map to a
 *         builder context CE32
 */
protected void buildContexts() {
    contexts.setLength(0);
    for (UnicodeSetIterator it = new UnicodeSetIterator(contextChars); it.next(); ) {
        assert it.codepoint != UnicodeSetIterator.IS_STRING;
        final int codePoint = it.codepoint;
        final int contextCE32 = trie.get(codePoint);
        if (isBuilderContextCE32(contextCE32)) {
            final ConditionalCE32 head = getConditionalCE32ForCE32(contextCE32);
            final int built = buildContext(head);
            trie.set(codePoint, built);
        } else {
            throw new AssertionError("Impossible: No context data for c in contextChars.");
        }
    }
}

Source File: CollationBuilder.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Adds mappings for the code points in {@code COMPOSITES} where needed.
 *
 * <p>For each composite, fetches the CEs of its NFD decomposition into the shared
 * {@code ces}/{@code cesLength} fields and passes them to {@code addIfDifferent()} with an
 * empty prefix. Composites whose decomposition yields more than
 * {@code Collation.MAX_EXPANSION_LENGTH} CEs are skipped.
 */
private void closeOverComposites() {
    final String noPrefix = "";  // empty
    final UnicodeSetIterator compositeIter = new UnicodeSetIterator(COMPOSITES);
    while (compositeIter.next()) {
        assert compositeIter.codepoint != UnicodeSetIterator.IS_STRING;
        final String decomposition = nfd.getDecomposition(compositeIter.codepoint);
        cesLength = dataBuilder.getCEs(decomposition, ces, 0);
        // Too many CEs from the decomposition (unusual): ignore this composite.
        // We could add a capacity parameter to getCEs() and reallocate if necessary.
        // However, this can only really happen in contrived cases.
        if (cesLength <= Collation.MAX_EXPANSION_LENGTH) {
            final String compositeString = compositeIter.getString();
            addIfDifferent(noPrefix, compositeString, ces, cesLength, Collation.UNASSIGNED_CE32);
        }
    }
}

Source File: UTR30DataFileGenerator.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

6 votes

/**
 * Expands one {@code leftHandSide > rightHandSide} rule into explicit mapping lines.
 *
 * <p>{@code leftHandSide} is parsed as a UnicodeSet pattern (spaces ignored) and iterated
 * range by range. When {@code rightHandSide} matches {@code NUMERIC_VALUE_PATTERN}, one line
 * per code point is written, mapping it to {@code 0x30 + UCharacter.getNumericValue(cp)} and
 * annotating it with the character name. Otherwise one line per range is written, mapping
 * the range to {@code rightHandSide} verbatim. String elements of the set are logged as
 * errors and skipped.
 *
 * @param builder       receives the generated lines
 * @param leftHandSide  UnicodeSet pattern naming the source characters
 * @param rightHandSide mapping target text
 */
private static void expandSingleRule(StringBuilder builder, String leftHandSide, String rightHandSide) {
    final UnicodeSet sources = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
    final boolean mapsToNumericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();
    final UnicodeSetIterator ranges = new UnicodeSetIterator(sources);
    while (ranges.nextRange()) {
        if (ranges.codepoint == UnicodeSetIterator.IS_STRING) {
            logger.error("ERROR: String '" + ranges.getString() + "' found in UnicodeSet");
            continue;
        }
        if (mapsToNumericValue) {
            // One output line per code point, since each one has its own numeric value.
            for (int cp = ranges.codepoint; cp <= ranges.codepointEnd; cp++) {
                builder.append(String.format(Locale.ROOT, "%04X", cp))
                        .append('>')
                        .append(String.format(Locale.ROOT, "%04X", 0x30 + UCharacter.getNumericValue(cp)))
                        .append("   # ")
                        .append(UCharacter.getName(cp))
                        .append("\n");
            }
        } else {
            // One output line per range: "XXXX>rhs" or "XXXX..YYYY>rhs".
            builder.append(String.format(Locale.ROOT, "%04X", ranges.codepoint));
            if (ranges.codepointEnd > ranges.codepoint) {
                builder.append("..").append(String.format(Locale.ROOT, "%04X", ranges.codepointEnd));
            }
            builder.append('>').append(rightHandSide).append("\n");
        }
    }
}

Source File: CollationDataBuilder.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Copies base mappings into this builder for the code points in {@code set}.
 *
 * <p>Does nothing for an empty set. Otherwise walks the set's code points, stopping at the
 * first string element. Each code point whose trie value is {@code Collation.FALLBACK_CE32}
 * gets the result of {@code copyFromBaseCE32(c, finalBaseCE32, true)} stored in the trie.
 * Sets {@code modified} whenever the set was non-empty.
 *
 * @param set the code points to process
 */
void optimize(UnicodeSet set) {
    if (set.isEmpty()) {
        return;
    }
    for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next(); ) {
        if (it.codepoint == UnicodeSetIterator.IS_STRING) {
            break;
        }
        final int codePoint = it.codepoint;
        if (trie.get(codePoint) != Collation.FALLBACK_CE32) {
            // This builder already has its own value for the code point.
            continue;
        }
        final int baseCE32 = base.getFinalCE32(base.getCE32(codePoint));
        final int copied = copyFromBaseCE32(codePoint, baseCE32, true);
        trie.set(codePoint, copied);
    }
    modified = true;
}

Source File: CollationDataBuilder.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Discards built context data.
 *
 * <p>Empties {@code contexts} and resets {@code builtCE32} to {@code Collation.NO_CE32} on
 * the ConditionalCE32 reached from the trie value of each code point in {@code contextChars}.
 */
protected void clearContexts() {
    contexts.setLength(0);
    for (UnicodeSetIterator it = new UnicodeSetIterator(contextChars); it.next(); ) {
        assert it.codepoint != UnicodeSetIterator.IS_STRING;
        final int contextCE32 = trie.get(it.codepoint);
        assert isBuilderContextCE32(contextCE32);
        getConditionalCE32ForCE32(contextCE32).builtCE32 = Collation.NO_CE32;
    }
}

Source File: GenerateUTR30DataFiles.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Expands one {@code leftHandSide > rightHandSide} rule into explicit mapping lines.
 *
 * <p>{@code leftHandSide} is parsed as a UnicodeSet pattern (spaces ignored) and iterated
 * range by range. When {@code rightHandSide} matches {@code NUMERIC_VALUE_PATTERN}, one line
 * per code point is written, mapping it to {@code 0x30 + UCharacter.getNumericValue(cp)} and
 * annotating it with the character name. Otherwise one line per range is written, mapping
 * the range to {@code rightHandSide} verbatim. If the set contains a string element, an
 * error is printed to standard error and the JVM exits with status 1.
 *
 * @param builder       receives the generated lines
 * @param leftHandSide  UnicodeSet pattern naming the source characters
 * @param rightHandSide mapping target text
 * @throws IllegalArgumentException presumably when {@code leftHandSide} is not a valid
 *         UnicodeSet pattern (verify against the UnicodeSet constructor)
 */
private static void expandSingleRule
    (StringBuilder builder, String leftHandSide, String rightHandSide)
    throws IllegalArgumentException {
  final UnicodeSet sourceSet = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
  final boolean targetIsNumericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();
  final UnicodeSetIterator rangeIter = new UnicodeSetIterator(sourceSet);
  while (rangeIter.nextRange()) {
    if (rangeIter.codepoint == UnicodeSetIterator.IS_STRING) {
      System.err.println("ERROR: String '" + rangeIter.getString() + "' found in UnicodeSet");
      System.exit(1);
    } else if (targetIsNumericValue) {
      // One output line per code point, since each one has its own numeric value.
      for (int cp = rangeIter.codepoint; cp <= rangeIter.codepointEnd; cp++) {
        builder.append(String.format(Locale.ROOT, "%04X", cp))
            .append('>')
            .append(String.format(Locale.ROOT, "%04X", 0x30 + UCharacter.getNumericValue(cp)))
            .append("   # ")
            .append(UCharacter.getName(cp))
            .append("\n");
      }
    } else {
      // One output line per range: "XXXX>rhs" or "XXXX..YYYY>rhs".
      builder.append(String.format(Locale.ROOT, "%04X", rangeIter.codepoint));
      if (rangeIter.codepointEnd > rangeIter.codepoint) {
        builder.append("..").append(String.format(Locale.ROOT, "%04X", rangeIter.codepointEnd));
      }
      builder.append('>').append(rightHandSide).append("\n");
    }
  }
}

Source File: CollationBuilder.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Adds mappings for strings formed by merging a composite into the tail of {@code nfdString}.
 *
 * <p>Locates the last code point in {@code nfdString} with combining class 0 (the "last
 * starter"). Returns without doing anything if there is none, if it is a Hangul Jamo L,
 * or if {@code nfcImpl.getCanonStartSet()} reports no composites for it. Otherwise, for
 * each composite in that set, tries to merge the composite into the string; when the merge
 * succeeds and the resulting CEs fit into {@code Collation.MAX_EXPANSION_LENGTH}, the new
 * string is added via {@code addIfDifferent()}, and if that actually added a mapping, the
 * closure for the new NFD string is added via {@code addOnlyClosure()}.
 *
 * @param nfdPrefix the prefix passed through to the CE lookup and the add calls
 * @param nfdString the string whose tail is examined; the parameter name suggests it is
 *                  expected to be in NFD form (not checked here)
 */
private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) {
    // Look for the last starter in the NFD string.
    int lastStarter;
    int indexAfterLastStarter = nfdString.length();
    // Scan backwards by code point until one with combining class 0 is found.
    for(;;) {
        if(indexAfterLastStarter == 0) { return; }  // no starter at all
        lastStarter = Character.codePointBefore(nfdString, indexAfterLastStarter);
        if(nfd.getCombiningClass(lastStarter) == 0) { break; }
        indexAfterLastStarter -= Character.charCount(lastStarter);
    }
    // No closure to Hangul syllables since we decompose them on the fly.
    if(Hangul.isJamoL(lastStarter)) { return; }

    // Are there any composites whose decomposition starts with the lastStarter?
    // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
    // We might find some more equivalent mappings here if it did.
    UnicodeSet composites = new UnicodeSet();
    if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }

    // Scratch objects shared across all iterations of the loop below.
    StringBuilder newNFDString = new StringBuilder(), newString = new StringBuilder();
    long[] newCEs = new long[Collation.MAX_EXPANSION_LENGTH];
    UnicodeSetIterator iter = new UnicodeSetIterator(composites);
    while(iter.next()) {
        assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
        int composite = iter.codepoint;
        String decomp = nfd.getDecomposition(composite);
        // Skip composites that cannot be merged into the tail of nfdString.
        if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
                newNFDString, newString)) {
            continue;
        }
        int newCEsLength = dataBuilder.getCEs(nfdPrefix, newNFDString, newCEs, 0);
        if(newCEsLength > Collation.MAX_EXPANSION_LENGTH) {
            // Ignore mappings that we cannot store.
            continue;
        }
        // Note: It is possible that the newCEs do not make use of the mapping
        // for which we are adding the tail composites, in which case we might be adding
        // unnecessary mappings.
        // For example, when we add tail composites for ae^ (^=combining circumflex),
        // UCA discontiguous-contraction matching does not find any matches
        // for ae_^ (_=any combining diacritic below) *unless* there is also
        // a contraction mapping for ae.
        // Thus, if there is no ae contraction, then the ae^ mapping is ignored
        // while fetching the newCEs for ae_^.
        // TODO: Try to detect this effectively.
        // (Alternatively, print a warning when prefix contractions are missing.)

        // We do not need an explicit mapping for the NFD strings.
        // It is fine if the NFD input collates like this via a sequence of mappings.
        // It also saves a little bit of space, and may reduce the set of characters with contractions.
        int ce32 = addIfDifferent(nfdPrefix, newString,
                                      newCEs, newCEsLength, Collation.UNASSIGNED_CE32);
        if(ce32 != Collation.UNASSIGNED_CE32) {
            // was different, was added
            addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32);
        }
    }
}

com.ibm.icu.text.UnicodeSetIterator Java Examples