com.ibm.icu.lang.UCharacter#getType

Source File: SpoofChecker.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Collects into {@code result} one representative per kind of decimal digit
 * found in {@code input}, according to UTS 39 section 5.3.
 *
 * @param input  text to scan, one code point at a time
 * @param result set that is cleared first and then receives the zero digit
 *               belonging to every decimal digit seen in {@code input}
 */
private void getNumerics(String input, UnicodeSet result) {
    result.clear();

    int offset = 0;
    while (offset < input.length()) {
        final int cp = Character.codePointAt(input, offset);
        offset += Character.charCount(cp);

        if (UCharacter.getType(cp) != UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
            continue;
        }

        // Unicode guarantees the zero of a digit's range is (digit - numeric value),
        // so that zero stands in for the whole range when comparing.
        final int zeroDigit = cp - UCharacter.getNumericValue(cp);
        result.add(zeroDigit);
    }
}

Source File: UCharacterName.java From fitnotifications with Apache License 2.0

6 votes

/**
* Determines the extended type of a code point.
* @param ch code point to classify
* @return NON_CHARACTER_ for noncharacters, LEAD_SURROGATE_ or TRAIL_SURROGATE_
*         for surrogates, otherwise the value of UCharacter.getType(ch)
*/
private static int getType(int ch)
{
    // Noncharacters get their own pseudo-category.
    if (UCharacterUtility.isNonCharacter(ch)) {
        return NON_CHARACTER_;
    }

    final int category = UCharacter.getType(ch);
    if (category != UCharacterCategory.SURROGATE) {
        return category;
    }

    // Split the single SURROGATE category into lead and trail.
    return (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) ? LEAD_SURROGATE_ : TRAIL_SURROGATE_;
}

Source File: AlphabeticIndex.java From fitnotifications with Apache License 2.0

6 votes

/**
 * Returns the first character of each script. Only exposed for testing.
 *
 * @return list of first characters in each script
 * @throws UnsupportedOperationException if the primary-only collator yields no
 *         contractions starting with U+FDD1
 * @internal
 * @deprecated This API is ICU internal, only for testing.
 */
@Deprecated
public List<String> getFirstCharactersInScripts() {
    // The script-first-primary contractions are defined in the root collator
    // and every one of them begins with U+FDD1.
    UnicodeSet contractions = new UnicodeSet();
    collatorPrimaryOnly.internalAddContractions(0xFDD1, contractions);
    if (contractions.isEmpty()) {
        throw new UnsupportedOperationException(
                "AlphabeticIndex requires script-first-primary contractions");
    }

    final int acceptedCategories = GC_L_MASK | GC_CN_MASK;
    List<String> firstCharacters = new ArrayList<String>(200);
    for (String contraction : contractions) {
        // The sample character follows the leading U+FDD1.
        int sampleCategoryBit = 1 << UCharacter.getType(contraction.codePointAt(1));
        // Keep only boundaries for "real scripts" (sample character is a Letter)
        // and the one for unassigned implicit weights (Cn); boundaries of the
        // special reordering groups are dropped.
        if ((sampleCategoryBit & acceptedCategories) != 0) {
            firstCharacters.add(contraction);
        }
    }
    return firstCharacters;
}

Source File: UCharacterName.java From trekarta with GNU General Public License v3.0

6 votes

/**
* Classifies a code point into its extended character type.
* @param ch code point to be tested
* @return NON_CHARACTER_ when ch is a noncharacter; LEAD_SURROGATE_ or
*         TRAIL_SURROGATE_ when UCharacter.getType(ch) reports SURROGATE;
*         otherwise the UCharacter.getType(ch) value itself
*/
private static int getType(int ch)
{
    if (UCharacterUtility.isNonCharacter(ch)) {
        // not a character, so an out-of-band category value is returned
        return NON_CHARACTER_;
    }

    final int generalCategory = UCharacter.getType(ch);
    final boolean isSurrogate = generalCategory == UCharacterCategory.SURROGATE;
    if (isSurrogate) {
        if (ch > UTF16.LEAD_SURROGATE_MAX_VALUE) {
            return TRAIL_SURROGATE_;
        }
        return LEAD_SURROGATE_;
    }
    return generalCategory;
}

Source File: ScriptIterator.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0

5 votes

/**
 * Advances to the next script run.
 *
 * @return false when the previous run already reached {@code limit};
 *         true once {@code scriptStart}, {@code scriptLimit} and
 *         {@code scriptCode} describe the next run.
 */
boolean next() {
    if (scriptLimit >= limit) {
        return false;
    }

    // Every run starts out as COMMON, beginning where the previous run ended.
    scriptCode = UScript.COMMON;
    scriptStart = scriptLimit;

    while (index < limit) {
        final int codePoint = UTF16.charAt(text, start, limit, index - start);
        final int script = getScript(codePoint);

        /*
         * From UTR #24: Implementations that determine the boundaries between
         * characters of given scripts should never break between a non-spacing
         * mark and its base character. Thus for boundary determinations and
         * similar sorts of processing, a non-spacing mark — whatever its script
         * value — should inherit the script value of its base character.
         */
        final boolean belongsToRun = isSameScript(scriptCode, script)
                || UCharacter.getType(codePoint) == ECharacterCategory.NON_SPACING_MARK;
        if (!belongsToRun) {
            break;
        }

        index += UTF16.getCharCount(codePoint);

        // Inherited or Common becomes the script code of the surrounding text.
        if (scriptCode <= UScript.INHERITED && script > UScript.INHERITED) {
            scriptCode = script;
        }
    }

    scriptLimit = index;
    return true;
}

Source File: Character.java From juniversal with MIT License

5 votes

/**
 * Returns the general Unicode category of a code point.
 *
 * @param codePoint
 *            the Unicode code point to classify.
 * @return the Unicode category of {@code codePoint}.
 */
public static int getType(int codePoint) {
    // Code points 1..999 are answered straight from the lookup table.
    if (codePoint > 0 && codePoint < 1000) {
        return typeValuesCache[codePoint];
    }

    final int icuType = UCharacter.getType(codePoint);

    // The type values returned by UCharacter are not compatible with what
    // the spec says: the RI's Character type values skip the value 17, so
    // everything above FORMAT is shifted up by one.
    return (icuType > Character.FORMAT) ? (icuType + 1) : icuType;
}

Source File: UCharacterProperty.java From fitnotifications with Apache License 2.0

5 votes

/**
 * "Horizontal space" test: TAB or SPACE for code points up to U+009F,
 * otherwise general category Zs.
 */
@Override
boolean contains(int c) {
    if (c > 0x9f) {
        /* Zs */
        return UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR;
    }
    /* TAB or SPACE */
    return c == 0x20 || c == 9;
}

Source File: UCharacterProperty.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Checks if the code point is in \p{graph}\p{blank} - \p{cntrl}.
 */
@Override
boolean contains(int c) {
    /*
     * The only cntrl character in graph+blank is TAB (in blank).
     * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
     */
    if (UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR) {
        return true;
    }
    return isgraphPOSIX(c);
}

Source File: UCharacterProperty.java From fitnotifications with Apache License 2.0

5 votes

/**
 * Hex-digit test: ASCII or Fullwidth ASCII letters a-f/A-F, or any code point
 * whose general category is DECIMAL_DIGIT_NUMBER.
 */
@Override
boolean contains(int c) {
    /* ASCII A-F (0x41..0x46) or a-f (0x61..0x66) */
    final boolean asciiHexLetter =
        (c >= 0x41 && c <= 0x66) && (c <= 0x46 || c >= 0x61);
    /* Fullwidth A-F (0xff21..0xff26) or a-f (0xff41..0xff46) */
    final boolean fullwidthHexLetter =
        (c >= 0xff21 && c <= 0xff46) && (c <= 0xff26 || c >= 0xff41);

    if (asciiHexLetter || fullwidthHexLetter) {
        return true;
    }
    return UCharacter.getType(c) == UCharacter.DECIMAL_DIGIT_NUMBER;
}

Source File: ScriptIterator.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Moves on to the next script run.
 * 
 * @return false if the previous run already ended at or beyond {@code limit};
 *         true after {@code scriptStart}, {@code scriptLimit} and
 *         {@code scriptCode} have been set for the next run.
 */
boolean next() {
  if (scriptLimit >= limit) {
    return false;
  }

  // A run starts as COMMON at the position where the last run stopped.
  scriptCode = UScript.COMMON;
  scriptStart = scriptLimit;

  while (index < limit) {
    final int current = UTF16.charAt(text, start, limit, index - start);
    final int currentScript = getScript(current);

    /*
     * From UTR #24: Implementations that determine the boundaries between
     * characters of given scripts should never break between a non-spacing
     * mark and its base character. Thus for boundary determinations and
     * similar sorts of processing, a non-spacing mark — whatever its script
     * value — should inherit the script value of its base character.
     */
    final boolean staysInRun = isSameScript(scriptCode, currentScript)
        || UCharacter.getType(current) == ECharacterCategory.NON_SPACING_MARK;
    if (!staysInRun) {
      break;
    }

    index += UTF16.getCharCount(current);

    /*
     * Inherited or Common becomes the script code of the surrounding text.
     */
    if (scriptCode <= UScript.INHERITED && currentScript > UScript.INHERITED) {
      scriptCode = currentScript;
    }
  }

  scriptLimit = index;
  return true;
}

Source File: UCharacterProperty.java From trekarta with GNU General Public License v3.0

5 votes

/**
 * Returns true for ASCII and Fullwidth ASCII hex letters (a-f, A-F) and for
 * code points of general category DECIMAL_DIGIT_NUMBER.
 */
@Override
boolean contains(int c) {
    /* ASCII range 'A'..'f', excluding the gap between 'F' and 'a' */
    if (c >= 0x41 && c <= 0x66) {
        if (c <= 0x46 || c >= 0x61) {
            return true;
        }
    }
    /* Fullwidth range 'A'..'f', excluding the gap between 'F' and 'a' */
    if (c >= 0xff21 && c <= 0xff46) {
        if (c <= 0xff26 || c >= 0xff41) {
            return true;
        }
    }
    return UCharacter.getType(c) == UCharacter.DECIMAL_DIGIT_NUMBER;
}

Source File: UCharacterProperty.java From trekarta with GNU General Public License v3.0

5 votes

/**
 * Tests for "horizontal space": only TAB and SPACE qualify up to U+009F;
 * above that, membership is general category Zs.
 */
@Override
boolean contains(int c) {
    final boolean inLowRange = c <= 0x9f;
    return inLowRange
            ? (c == 9 || c == 0x20) /* TAB or SPACE */
            : UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR; /* Zs */
}

Source File: UCharacterProperty.java From trekarta with GNU General Public License v3.0

5 votes

/**
 * Tests membership in \p{graph}\p{blank} - \p{cntrl}.
 */
@Override
boolean contains(int c) {
    /*
     * The only cntrl character in graph+blank is TAB (in blank).
     * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
     */
    final boolean isSpaceSeparator = UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR;
    return isSpaceSeparator || isgraphPOSIX(c);
}

Source File: UnicodeSet.java From trekarta with GNU General Public License v3.0

4 votes

/**
 * Returns true when the bit for the general category of {@code ch} is set in {@code mask}.
 */
@Override
public boolean contains(int ch) {
    final int categoryBit = 1 << UCharacter.getType(ch);
    return (categoryBit & mask) != 0;
}

Source File: UnicodeData.java From es6draft with MIT License

4 votes

/**
 * Returns true when {@code value}, read as a bit mask, has the bit for the
 * general category of {@code codePoint} set.
 */
@Override
public boolean has(int codePoint, int value) {
    final int categoryBit = 1 << UCharacter.getType(codePoint);
    return (categoryBit & value) != 0;
}

Source File: UTS46.java From trekarta with GNU General Public License v3.0

4 votes

/** Returns a mask with the single bit numbered by the general category of {@code c}. */
private static int U_GET_GC_MASK(int c) {
    final int generalCategory = UCharacter.getType(c);
    return 1 << generalCategory;
}

Source File: UTS46.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Builds the one-bit mask for the general category of a code point.
 *
 * @param c code point to look up
 * @return {@code 1} shifted left by {@code UCharacter.getType(c)}
 */
private static int U_GET_GC_MASK(int c) {
    final int shift = UCharacter.getType(c);
    return 1 << shift;
}

Source File: UnicodeSet.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Tests whether the general category of {@code ch} is selected by {@code mask}.
 *
 * @param ch code point to test
 * @return true if bit number {@code UCharacter.getType(ch)} is set in {@code mask}
 */
@Override
public boolean contains(int ch) {
    final int type = UCharacter.getType(ch);
    final int selected = (1 << type) & mask;
    return selected != 0;
}

Source File: BreakTransliterator.java From fitnotifications with Apache License 2.0

4 votes

/**
 * Inserts {@code insertion} at each break-iterator boundary below
 * {@code pos.limit} whose neighbouring code points on both sides have a
 * general category contained in {@code LETTER_OR_MARK_MASK}, then shifts the
 * position fields by the total inserted length.
 *
 * @param text        text that is modified in place
 * @param pos         range to process; {@code contextLimit}, {@code limit}
 *                    and {@code start} are updated before returning
 * @param incremental if true, {@code pos.start} is set just past the last
 *                    boundary found (plus the inserted length); otherwise it
 *                    is set to the new {@code pos.limit}
 */
@Override
protected synchronized void handleTransliterate(Replaceable text, Position pos, boolean incremental) {
    boundaryCount = 0;
    int boundary = 0;
    getBreakIterator(); // Lazy-create it if necessary
    bi.setText(new ReplaceableCharacterIterator(text, pos.start, pos.limit, pos.start));
    // TODO: fix clumsy workaround used below.
    /*
    char[] tempBuffer = new char[text.length()];
    text.getChars(0, text.length(), tempBuffer, 0);
    bi.setText(new StringCharacterIterator(new String(tempBuffer), pos.start, pos.limit, pos.start));
    */
    // end debugging

    // To make things much easier, we will stack the boundaries, and then insert at the end.
    // generally, we won't need too many, since we will be filtered.

    for(boundary = bi.first(); boundary != BreakIterator.DONE && boundary < pos.limit; boundary = bi.next()) {
        // Offset 0 has no preceding character to inspect.
        if (boundary == 0) continue;
        // HACK: Check to see that preceding item was a letter

        int cp = UTF16.charAt(text, boundary-1);
        int type = UCharacter.getType(cp);
        //System.out.println(Integer.toString(cp,16) + " (before): " + type);
        if (((1<<type) & LETTER_OR_MARK_MASK) == 0) continue;

        // The code point right after the boundary must pass the same mask test.
        cp = UTF16.charAt(text, boundary);
        type = UCharacter.getType(cp);
        //System.out.println(Integer.toString(cp,16) + " (after): " + type);
        if (((1<<type) & LETTER_OR_MARK_MASK) == 0) continue;

        if (boundaryCount >= boundaries.length) {       // realloc if necessary
            // Double the capacity and carry the recorded boundaries over.
            int[] temp = new int[boundaries.length * 2];
            System.arraycopy(boundaries, 0, temp, 0, boundaries.length);
            boundaries = temp;
        }

        boundaries[boundaryCount++] = boundary;
        //System.out.println(boundary);
    }

    // delta: total number of chars inserted; lastBoundary: highest recorded offset (pre-insertion).
    int delta = 0;
    int lastBoundary = 0;

    if (boundaryCount != 0) { // if we found something, adjust
        delta = boundaryCount * insertion.length();
        lastBoundary = boundaries[boundaryCount-1];

        // we do this from the end backwards, so that we don't have to keep updating.

        while (boundaryCount > 0) {
            boundary = boundaries[--boundaryCount];
            // Zero-length replace, i.e. a pure insertion at this offset.
            text.replace(boundary, boundary, insertion);
        }
    }

    // Now fix up the return values
    pos.contextLimit += delta;
    pos.limit += delta;
    pos.start = incremental ? lastBoundary + delta : pos.limit;
}

Source File: Characters.java From es6draft with MIT License

2 votes

/**
 * Tests for Unicode category "Zs" (space separator).
 * 
 * @param c
 *            the code point to test
 * @return {@code true} if the general category of {@code c} is space separator
 */
public static boolean isSpaceSeparator(int c) {
    final int generalCategory = UCharacter.getType(c);
    return generalCategory == UCharacterCategory.SPACE_SEPARATOR;
}

Java Code Examples for com.ibm.icu.lang.UCharacter#getType()