Java Code Examples for org.jsoup.nodes.Entities#getCharacterByName()
The following examples show how to use
org.jsoup.nodes.Entities#getCharacterByName().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Tokeniser.java From astor with GNU General Public License v2.0 | 4 votes |
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { if (reader.isEmpty()) return null; if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) return null; if (reader.matchesAnySorted(notCharRefCharsSorted)) return null; final char[] charRef = charRefHolder; reader.mark(); if (reader.matchConsume("#")) { // numbered boolean isHexMode = reader.matchConsumeIgnoreCase("X"); String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); if (numRef.length() == 0) { // didn't match anything characterReferenceError("numeric reference with no numerals"); reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi int charval = -1; try { int base = isHexMode ? 16 : 10; charval = Integer.valueOf(numRef, base); } catch (NumberFormatException e) { } // skip if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { characterReferenceError("character outside of valid range"); charRef[0] = replacementChar; return charRef; } else { // todo: implement number replacement table // todo: check for extra illegal unicode points as parse errors if (charval < Character.MIN_SUPPLEMENTARY_CODE_POINT) { charRef[0] = (char) charval; return charRef; } else return Character.toChars(charval); } } else { // named // get as many letters as possible, and look for matching entities. String nameRef = reader.consumeLetterThenDigitSequence(); boolean looksLegit = reader.matches(';'); // found if a base named entity without a ;, or an extended entity with the ;. 
boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)); if (!found) { reader.rewindToMark(); if (looksLegit) // named with semicolon characterReferenceError(String.format("invalid named referenece '%s'", nameRef)); return null; } if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { // don't want that to match reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi charRef[0] = Entities.getCharacterByName(nameRef); return charRef; } }
Example 2
Source File: Tokeniser.java From jsoup-learning with MIT License | 4 votes |
char[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) { if (reader.isEmpty()) return null; if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) return null; if (reader.matchesAny('\t', '\n', '\r', '\f', ' ', '<', '&')) return null; reader.mark(); if (reader.matchConsume("#")) { // numbered boolean isHexMode = reader.matchConsumeIgnoreCase("X"); String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); if (numRef.length() == 0) { // didn't match anything characterReferenceError("numeric reference with no numerals"); reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi int charval = -1; try { int base = isHexMode ? 16 : 10; charval = Integer.valueOf(numRef, base); } catch (NumberFormatException e) { } // skip if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) { characterReferenceError("character outside of valid range"); return new char[]{replacementChar}; } else { // todo: implement number replacement table // todo: check for extra illegal unicode points as parse errors return Character.toChars(charval); } } else { // named // get as many letters as possible, and look for matching entities. String nameRef = reader.consumeLetterThenDigitSequence(); boolean looksLegit = reader.matches(';'); // found if a base named entity without a ;, or an extended entity with the ;. 
boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)); if (!found) { reader.rewindToMark(); if (looksLegit) // named with semicolon characterReferenceError(String.format("invalid named referenece '%s'", nameRef)); return null; } if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { // don't want that to match reader.rewindToMark(); return null; } if (!reader.matchConsume(";")) characterReferenceError("missing semicolon"); // missing semi return new char[]{Entities.getCharacterByName(nameRef)}; } }