Java Code Examples for com.ibm.icu.lang.UCharacter#getType()
The following examples show how to use
com.ibm.icu.lang.UCharacter#getType().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SpoofChecker.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Computes the set of numerics for a string, according to UTS 39 section 5.3. */ private void getNumerics(String input, UnicodeSet result) { result.clear(); for (int utf16Offset = 0; utf16Offset < input.length();) { int codePoint = Character.codePointAt(input, utf16Offset); utf16Offset += Character.charCount(codePoint); // Store a representative character for each kind of decimal digit if (UCharacter.getType(codePoint) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) { // Store the zero character as a representative for comparison. // Unicode guarantees it is codePoint - value result.add(codePoint - UCharacter.getNumericValue(codePoint)); } } }
Example 2
Source File: UCharacterName.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Gets the character extended type * @param ch character to be tested * @return extended type it is associated with */ private static int getType(int ch) { if (UCharacterUtility.isNonCharacter(ch)) { // not a character we return a invalid category count return NON_CHARACTER_; } int result = UCharacter.getType(ch); if (result == UCharacterCategory.SURROGATE) { if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { result = LEAD_SURROGATE_; } else { result = TRAIL_SURROGATE_; } } return result; }
Example 3
Source File: AlphabeticIndex.java From fitnotifications with Apache License 2.0 | 6 votes |
/** * Return a list of the first character in each script. Only exposed for testing. * * @return list of first characters in each script * @internal * @deprecated This API is ICU internal, only for testing. */ @Deprecated public List<String> getFirstCharactersInScripts() { List<String> dest = new ArrayList<String>(200); // Fetch the script-first-primary contractions which are defined in the root collator. // They all start with U+FDD1. UnicodeSet set = new UnicodeSet(); collatorPrimaryOnly.internalAddContractions(0xFDD1, set); if (set.isEmpty()) { throw new UnsupportedOperationException( "AlphabeticIndex requires script-first-primary contractions"); } for (String boundary : set) { int gcMask = 1 << UCharacter.getType(boundary.codePointAt(1)); if ((gcMask & (GC_L_MASK | GC_CN_MASK)) == 0) { // Ignore boundaries for the special reordering groups. // Take only those for "real scripts" (where the sample character is a Letter, // and the one for unassigned implicit weights (Cn). continue; } dest.add(boundary); } return dest; }
Example 4
Source File: UCharacterName.java From trekarta with GNU General Public License v3.0 | 6 votes |
/** * Gets the character extended type * @param ch character to be tested * @return extended type it is associated with */ private static int getType(int ch) { if (UCharacterUtility.isNonCharacter(ch)) { // not a character we return a invalid category count return NON_CHARACTER_; } int result = UCharacter.getType(ch); if (result == UCharacterCategory.SURROGATE) { if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { result = LEAD_SURROGATE_; } else { result = TRAIL_SURROGATE_; } } return result; }
Example 5
Source File: ScriptIterator.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Advances to the next script run.
 *
 * <p>On success {@code scriptStart}, {@code scriptLimit} and {@code scriptCode}
 * describe the run that was found.
 *
 * @return true if there is another script run, false otherwise.
 */
boolean next() {
    if (scriptLimit >= limit) {
        return false;
    }

    scriptCode = UScript.COMMON;
    scriptStart = scriptLimit;

    while (index < limit) {
        final int codePoint = UTF16.charAt(text, start, limit, index - start);
        final int script = getScript(codePoint);

        // From UTR #24: implementations that determine the boundaries between
        // characters of given scripts should never break between a non-spacing
        // mark and its base character. For boundary determination a non-spacing
        // mark, whatever its script value, inherits the script of its base.
        final boolean continuesRun = isSameScript(scriptCode, script)
                || UCharacter.getType(codePoint) == ECharacterCategory.NON_SPACING_MARK;
        if (!continuesRun) {
            break;
        }

        index += UTF16.getCharCount(codePoint);

        // Inherited or Common becomes the script code of the surrounding text.
        if (scriptCode <= UScript.INHERITED && script > UScript.INHERITED) {
            scriptCode = script;
        }
    }

    scriptLimit = index;
    return true;
}
Example 6
Source File: Character.java From juniversal with MIT License | 5 votes |
/** * Gets the general Unicode category of the specified code point. * * @param codePoint * the Unicode code point to get the category of. * @return the Unicode category of {@code codePoint}. */ public static int getType(int codePoint) { if (codePoint < 1000 && codePoint > 0) { return typeValuesCache[codePoint]; } int type = UCharacter.getType(codePoint); // the type values returned by UCharacter are not compatible with what // the spec says.RI's Character type values skip the value 17. if (type <= Character.FORMAT) { return type; } return (type + 1); }
Example 7
Source File: UCharacterProperty.java From fitnotifications with Apache License 2.0 | 5 votes |
@Override boolean contains(int c) { // "horizontal space" if(c<=0x9f) { return c==9 || c==0x20; /* TAB or SPACE */ } else { /* Zs */ return UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR; } }
Example 8
Source File: UCharacterProperty.java From fitnotifications with Apache License 2.0 | 5 votes |
/**
 * Checks if the code point is in \p{graph}\p{blank} - \p{cntrl}.
 *
 * <p>The only cntrl character in graph+blank is TAB (in blank), so this
 * implements (blank-TAB)=Zs instead of calling u_isblank().
 */
@Override
boolean contains(int c) {
    if (UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR) {
        return true;
    }
    return isgraphPOSIX(c);
}
Example 9
Source File: UCharacterProperty.java From fitnotifications with Apache License 2.0 | 5 votes |
/**
 * Tests for a hexadecimal digit: ASCII or Fullwidth ASCII a-f/A-F, or any
 * code point whose general category is decimal digit.
 */
@Override
boolean contains(int c) {
    /* ASCII A-F (0x41..0x46) and a-f (0x61..0x66) */
    final boolean asciiLetter =
            c >= 0x41 && c <= 0x66 && (c <= 0x46 || c >= 0x61);
    /* Fullwidth ASCII A-F (0xff21..0xff26) and a-f (0xff41..0xff46) */
    final boolean fullwidthLetter =
            c >= 0xff21 && c <= 0xff46 && (c <= 0xff26 || c >= 0xff41);

    if (asciiLetter || fullwidthLetter) {
        return true;
    }
    return UCharacter.getType(c) == UCharacter.DECIMAL_DIGIT_NUMBER;
}
Example 10
Source File: ScriptIterator.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Moves on to the next script run, if any.
 *
 * <p>When a run is found, {@code scriptStart} and {@code scriptLimit} delimit it
 * and {@code scriptCode} holds its script.
 *
 * @return true if there is another script run, false otherwise.
 */
boolean next() {
    if (scriptLimit >= limit) {
        return false;
    }

    scriptStart = scriptLimit;
    scriptCode = UScript.COMMON;

    while (index < limit) {
        final int cp = UTF16.charAt(text, start, limit, index - start);
        final int cpScript = getScript(cp);

        // From UTR #24: never break between a non-spacing mark and its base
        // character. For boundary determination a non-spacing mark, whatever its
        // script value, inherits the script value of its base character.
        final boolean sameRun = isSameScript(scriptCode, cpScript)
                || UCharacter.getType(cp) == ECharacterCategory.NON_SPACING_MARK;
        if (!sameRun) {
            break;
        }

        index += UTF16.getCharCount(cp);

        // Inherited or Common becomes the script code of the surrounding text.
        final boolean adoptScript =
                scriptCode <= UScript.INHERITED && cpScript > UScript.INHERITED;
        if (adoptScript) {
            scriptCode = cpScript;
        }
    }

    scriptLimit = index;
    return true;
}
Example 11
Source File: UCharacterProperty.java From trekarta with GNU General Public License v3.0 | 5 votes |
/**
 * Reports whether the code point is a hexadecimal digit: one of a-f/A-F in
 * ASCII or Fullwidth ASCII, or a code point of category decimal digit.
 */
@Override
boolean contains(int c) {
    /* check ASCII a-fA-F */
    if (c >= 0x41 && c <= 0x66 && (c <= 0x46 || c >= 0x61)) {
        return true;
    }
    /* check Fullwidth ASCII a-fA-F */
    if (c >= 0xff21 && c <= 0xff46 && (c <= 0xff26 || c >= 0xff41)) {
        return true;
    }
    return UCharacter.getType(c) == UCharacter.DECIMAL_DIGIT_NUMBER;
}
Example 12
Source File: UCharacterProperty.java From trekarta with GNU General Public License v3.0 | 5 votes |
@Override boolean contains(int c) { // "horizontal space" if(c<=0x9f) { return c==9 || c==0x20; /* TAB or SPACE */ } else { /* Zs */ return UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR; } }
Example 13
Source File: UCharacterProperty.java From trekarta with GNU General Public License v3.0 | 5 votes |
/**
 * Reports whether the code point is in \p{graph}\p{blank} - \p{cntrl}.
 *
 * <p>TAB (in blank) is the only cntrl character in graph+blank, so the test
 * is implemented as (blank-TAB)=Zs rather than by calling u_isblank().
 */
@Override
boolean contains(int c) {
    final boolean spaceSeparator = UCharacter.getType(c) == UCharacter.SPACE_SEPARATOR;
    return spaceSeparator || isgraphPOSIX(c);
}
Example 14
Source File: UnicodeSet.java From trekarta with GNU General Public License v3.0 | 4 votes |
/**
 * Reports whether the general category of {@code ch} is selected by {@code mask},
 * which holds one bit per category value.
 */
@Override
public boolean contains(int ch) {
    final int categoryBit = 1 << UCharacter.getType(ch);
    return (categoryBit & mask) != 0;
}
Example 15
Source File: UnicodeData.java From es6draft with MIT License | 4 votes |
/**
 * Reports whether the general category of {@code codePoint} is among those
 * selected by {@code value}, interpreted as a bit mask with one bit per category.
 */
@Override
public boolean has(int codePoint, int value) {
    final int categoryBit = 1 << UCharacter.getType(codePoint);
    return (value & categoryBit) != 0;
}
Example 16
Source File: UTS46.java From trekarta with GNU General Public License v3.0 | 4 votes |
/**
 * Returns a mask with the single bit set that corresponds to the general
 * category of {@code c}.
 */
private static int U_GET_GC_MASK(int c) {
    final int category = UCharacter.getType(c);
    return 1 << category;
}
Example 17
Source File: UTS46.java From fitnotifications with Apache License 2.0 | 4 votes |
/**
 * Converts the general category of {@code c} into a one-bit mask
 * ({@code 1 << category}).
 */
private static int U_GET_GC_MASK(int c) {
    final int generalCategory = UCharacter.getType(c);
    final int mask = 1 << generalCategory;
    return mask;
}
Example 18
Source File: UnicodeSet.java From fitnotifications with Apache License 2.0 | 4 votes |
/**
 * Tests the general category of {@code ch} against {@code mask}: the result is
 * true when the bit for that category is set in the mask.
 */
@Override
public boolean contains(int ch) {
    final int category = UCharacter.getType(ch);
    final int selected = mask & (1 << category);
    return selected != 0;
}
Example 19
Source File: BreakTransliterator.java From fitnotifications with Apache License 2.0 | 4 votes |
/**
 * Inserts {@code insertion} at every break-iterator boundary inside
 * {@code [pos.start, pos.limit)} that has a letter or mark on both sides.
 *
 * <p>Boundaries are first collected in {@code boundaries} and then applied from
 * the end backwards, so earlier offsets stay valid while the text grows.
 * Afterwards {@code pos} is adjusted by the total inserted length.
 *
 * @param text        the text to modify
 * @param pos         the range to process; updated to account for the insertions
 * @param incremental if true, {@code pos.start} is set just past the last
 *                    insertion point instead of to {@code pos.limit}
 */
@Override
protected synchronized void handleTransliterate(Replaceable text, Position pos, boolean incremental) {
    boundaryCount = 0;
    getBreakIterator(); // Lazy-create it if necessary
    bi.setText(new ReplaceableCharacterIterator(text, pos.start, pos.limit, pos.start));
    // TODO: fix clumsy workaround used below.

    // To make things much easier, stack the boundaries and insert at the end.
    // Generally there won't be too many, since they are filtered.
    int boundary = bi.first();
    while (boundary != BreakIterator.DONE && boundary < pos.limit) {
        // HACK: only keep a boundary whose preceding and following items are
        // both letters or marks; a boundary at offset 0 is never kept.
        final boolean keep = boundary != 0
                && ((1 << UCharacter.getType(UTF16.charAt(text, boundary - 1)))
                        & LETTER_OR_MARK_MASK) != 0
                && ((1 << UCharacter.getType(UTF16.charAt(text, boundary)))
                        & LETTER_OR_MARK_MASK) != 0;

        if (keep) {
            if (boundaryCount >= boundaries.length) {
                // realloc if necessary
                int[] grown = new int[boundaries.length * 2];
                System.arraycopy(boundaries, 0, grown, 0, boundaries.length);
                boundaries = grown;
            }
            boundaries[boundaryCount++] = boundary;
        }

        boundary = bi.next();
    }

    int delta = 0;
    int lastBoundary = 0;

    if (boundaryCount != 0) {
        // Something was found: compute the adjustment.
        delta = boundaryCount * insertion.length();
        lastBoundary = boundaries[boundaryCount - 1];

        // Work from the end backwards, so that offsets need no updating.
        while (boundaryCount > 0) {
            final int insertAt = boundaries[--boundaryCount];
            text.replace(insertAt, insertAt, insertion);
        }
    }

    // Now fix up the return values.
    pos.contextLimit += delta;
    pos.limit += delta;
    pos.start = incremental ? lastBoundary + delta : pos.limit;
}
Example 20
Source File: Characters.java From es6draft with MIT License | 2 votes |
/**
 * Tests for Unicode category "Zs" (space separator).
 *
 * @param c
 *            the character
 * @return {@code true} if the character is a space separator
 */
public static boolean isSpaceSeparator(int c) {
    final int category = UCharacter.getType(c);
    return category == UCharacterCategory.SPACE_SEPARATOR;
}