java.nio.charset.CharsetDecoder#onMalformedInput

Source File: Text.java From hadoop-gpu with Apache License 2.0

6 votes

/**
 * Decodes the remaining bytes of {@code utf8} into a string using the decoder
 * supplied by {@code DECODER_FACTORY}.
 *
 * @param utf8 buffer holding the bytes to decode
 * @param replace if {@code true}, malformed and unmappable input is replaced
 *        instead of being reported as an error
 * @return the decoded string
 * @throws CharacterCodingException if the input cannot be decoded while the
 *         decoder is reporting errors
 */
private static String decode(ByteBuffer utf8, boolean replace) 
  throws CharacterCodingException {
  CharsetDecoder decoder = DECODER_FACTORY.get();
  if (replace) {
    decoder.onMalformedInput(CodingErrorAction.REPLACE);
    decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
  }
  try {
    return decoder.decode(utf8).toString();
  } finally {
    // The decoder is obtained from DECODER_FACTORY rather than created here,
    // so it is presumably reused by later calls (confirm against the factory).
    // Restore REPORT even when decode() throws, otherwise a later strict
    // decode would silently replace bad input.
    if (replace) {
      decoder.onMalformedInput(CodingErrorAction.REPORT);
      decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
    }
  }
}

Source File: UnicodeHelper.java From p4ic4idea with Apache License 2.0

6 votes

/**
 * Heuristically checks whether the first {@code bytesRead} bytes of
 * {@code bytes} are decodable with {@code clientCharset}.
 * <p>
 * The bytes are run through a strict decoder (malformed and unmappable input
 * is reported). The answer is {@code false} only when the decoder reports an
 * error; a {@code true} result therefore means "probably", not "certainly".
 * When no charset is given the method answers {@code true}.
 * <p>
 * Note this has been heavily changed since inception and will almost
 * certainly disappear in the 10.x timeframe -- HR.
 *
 * @param bytes the raw bytes to inspect
 * @param bytesRead number of leading bytes of {@code bytes} to consider
 * @param clientCharset the candidate charset; may be {@code null}
 * @return {@code false} if decoding reported an error, {@code true} otherwise
 */
public static boolean inferCharset(byte[] bytes, int bytesRead, Charset clientCharset) {
	final ByteBuffer input = ByteBuffer.wrap(bytes, 0, bytesRead);
	final CharBuffer output = CharBuffer.allocate(input.capacity() * 2);

	if (clientCharset == null) {
		return true;
	}

	// endOfInput is false: a multi-byte sequence cut off at the end of the
	// sample is not treated as an error.
	final CoderResult outcome = clientCharset.newDecoder()
			.onMalformedInput(CodingErrorAction.REPORT)
			.onUnmappableCharacter(CodingErrorAction.REPORT)
			.decode(input, output, false);

	return outcome == null || !outcome.isError();
}

Source File: CharsetUtil.java From android-netty with Apache License 2.0

6 votes

/**
 * Returns a {@link CharsetDecoder} for the given charset, taken from the map
 * held in {@code decoders} or created and stored there on first use.
 * <p>
 * The returned decoder is always configured to replace malformed and
 * unmappable input; a decoder found in the map is additionally reset.
 *
 * @param charset the charset to decode; must not be {@code null}
 * @return a decoder for {@code charset}
 * @throws NullPointerException if {@code charset} is {@code null}
 */
public static CharsetDecoder getDecoder(Charset charset) {
    if (charset == null) {
        throw new NullPointerException("charset");
    }

    final Map<Charset, CharsetDecoder> cache = decoders.get();
    final CharsetDecoder cached = cache.get(charset);
    final boolean cacheMiss = (cached == null);

    // A cached decoder may carry state from its previous use, so reset it.
    final CharsetDecoder decoder = cacheMiss ? charset.newDecoder() : cached.reset();
    decoder.onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);

    if (cacheMiss) {
        cache.put(charset, decoder);
    }
    return decoder;
}

Source File: Text.java From hadoop with Apache License 2.0

6 votes

/**
 * Converts the remaining bytes of {@code utf8} to a string with the decoder
 * provided by {@code DECODER_FACTORY}.
 *
 * @param utf8 buffer containing the encoded bytes
 * @param replace {@code true} to substitute malformed or unmappable input
 *        rather than fail on it
 * @return the decoded text
 * @throws CharacterCodingException if decoding fails while errors are being
 *         reported
 */
private static String decode(ByteBuffer utf8, boolean replace) 
  throws CharacterCodingException {
  CharsetDecoder decoder = DECODER_FACTORY.get();
  if (replace) {
    decoder.onMalformedInput(CodingErrorAction.REPLACE);
    decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
  }
  try {
    return decoder.decode(utf8).toString();
  } finally {
    // Put the decoder back to REPORT on every exit path, including failures:
    // it comes from DECODER_FACTORY and is presumably handed out again
    // (confirm), so lenient mode must not leak into the next call.
    if (replace) {
      decoder.onMalformedInput(CodingErrorAction.REPORT);
      decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
    }
  }
}

Source File: TracingManagedHttpClientConnectionFactory.java From caravan with Apache License 2.0

6 votes

/**
 * Creates a {@code TracingManagedHttpClientConnection} configured from
 * {@code config}, or from {@code ConnectionConfig.DEFAULT} when {@code config}
 * is {@code null}.
 * <p>
 * A charset decoder/encoder pair is built only when the configuration names a
 * charset; both use the configured error actions, defaulting to
 * {@code REPORT} where the configuration leaves them unset.
 *
 * @param route the route the connection is created for (not used here)
 * @param config connection settings; may be {@code null}
 * @return a new connection with a unique "http-outgoing-N" id
 */
@Override
public ManagedHttpClientConnection create(final HttpRoute route, final ConnectionConfig config) {
    final ConnectionConfig settings = (config == null) ? ConnectionConfig.DEFAULT : config;
    final Charset charset = settings.getCharset();

    CodingErrorAction onMalformed = settings.getMalformedInputAction();
    if (onMalformed == null) {
        onMalformed = CodingErrorAction.REPORT;
    }
    CodingErrorAction onUnmappable = settings.getUnmappableInputAction();
    if (onUnmappable == null) {
        onUnmappable = CodingErrorAction.REPORT;
    }

    CharsetDecoder decoder = null;
    CharsetEncoder encoder = null;
    if (charset != null) {
        decoder = charset.newDecoder()
                .onMalformedInput(onMalformed)
                .onUnmappableCharacter(onUnmappable);
        encoder = charset.newEncoder()
                .onMalformedInput(onMalformed)
                .onUnmappableCharacter(onUnmappable);
    }

    final String id = "http-outgoing-" + COUNTER.getAndIncrement();
    return new TracingManagedHttpClientConnection(
            id,
            settings.getBufferSize(),
            settings.getFragmentSizeHint(),
            decoder,
            encoder,
            settings.getMessageConstraints(),
            incomingContentStrategy,
            outgoingContentStrategy,
            requestWriterFactory,
            responseParserFactory,
            logFunc);
}

Source File: NetStringUtil.java From cronet with BSD 3-Clause "New" or "Revised" License

6 votes

/**
 * Decodes text in the named character set into a Unicode string. Any
 * malformed or unmappable input is replaced with U+FFFD.
 *
 * @param text ByteBuffer containing the encoded bytes to convert.
 * @param charsetName Name of the character set the bytes are encoded in.
 * @return the decoded string on success; {@code null} if the character set
 *         is not recognized or any other exception occurs while decoding.
 */
@CalledByNative
private static String convertToUnicodeWithSubstitutions(ByteBuffer text, String charsetName) {
    try {
        // TODO(mmenke):  Investigate if Charset.decode() can be used
        // instead.  The question is whether it uses the proper replace
        // character.  JDK CharsetDecoder docs say U+FFFD is the default,
        // but Charset.decode() docs say it uses the "charset's default
        // replacement byte array".
        return Charset.forName(charsetName)
                .newDecoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE)
                .replaceWith("\uFFFD")
                .decode(text)
                .toString();
    } catch (Exception e) {
        // Every failure is reported to the caller as null.
        return null;
    }
}

Source File: Text.java From RDFS with Apache License 2.0

6 votes

/**
 * Decodes the remaining content of {@code utf8} into a string, using the
 * decoder returned by {@code DECODER_FACTORY}.
 *
 * @param utf8 the bytes to decode
 * @param replace when {@code true}, malformed and unmappable input is
 *        replaced; when {@code false}, the decoder's current error actions
 *        apply
 * @return the decoded string
 * @throws CharacterCodingException if the decoder reports a coding error
 */
private static String decode(ByteBuffer utf8, boolean replace) 
  throws CharacterCodingException {
  CharsetDecoder decoder = DECODER_FACTORY.get();
  if (replace) {
    decoder.onMalformedInput(CodingErrorAction.REPLACE);
    decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
  }
  try {
    return decoder.decode(utf8).toString();
  } finally {
    // Restore REPORT in a finally block so that a failed decode cannot leave
    // the factory-provided decoder (presumably reused; confirm) stuck in
    // REPLACE mode.
    if (replace) {
      decoder.onMalformedInput(CodingErrorAction.REPORT);
      decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
    }
  }
}

Source File: UnicodeHelper.java From p4ic4idea with Apache License 2.0

6 votes

/**
 * Guesses whether the leading {@code bytesRead} bytes of {@code bytes} are
 * encoded in {@code clientCharset}.
 * <p>
 * The guess is made by strictly decoding the bytes: a decoder error means
 * "no", anything else (including a {@code null} charset) means "probably
 * yes". The heuristic is deliberately simple and inefficient.
 * <p>
 * Note this has been heavily changed since inception and will almost
 * certainly disappear in the 10.x timeframe -- HR.
 *
 * @param bytes buffer containing the sample
 * @param bytesRead how many bytes at the start of {@code bytes} are valid
 * @param clientCharset charset to test against; may be {@code null}
 * @return {@code false} when decoding reports an error, otherwise {@code true}
 */
public static boolean inferCharset(byte[] bytes, int bytesRead, Charset clientCharset) {
	ByteBuffer source = ByteBuffer.wrap(bytes, 0, bytesRead);
	CharBuffer sink = CharBuffer.allocate(2 * source.capacity());

	boolean plausible = true;
	if (clientCharset != null) {
		CharsetDecoder strictDecoder = clientCharset.newDecoder();
		strictDecoder.onMalformedInput(CodingErrorAction.REPORT);
		strictDecoder.onUnmappableCharacter(CodingErrorAction.REPORT);

		// Not flagged as end of input, so a trailing partial sequence is tolerated.
		CoderResult status = strictDecoder.decode(source, sink, false);
		if (status != null && status.isError()) {
			// Wasn't this one...
			plausible = false;
		}
	}
	return plausible;
}

Source File: Text.java From Canova with Apache License 2.0

6 votes

/**
 * Decodes the remaining bytes of {@code utf8} into a string using the decoder
 * obtained from {@code DECODER_FACTORY}.
 *
 * @param utf8 buffer holding the bytes to decode
 * @param replace if {@code true}, malformed and unmappable input is replaced
 *        instead of being reported
 * @return the decoded string
 * @throws CharacterCodingException if decoding fails while errors are reported
 */
private static String decode(ByteBuffer utf8, boolean replace)
        throws CharacterCodingException {
    CharsetDecoder decoder = DECODER_FACTORY.get();
    if (replace) {
        decoder.onMalformedInput(CodingErrorAction.REPLACE);
        decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    }
    try {
        return decoder.decode(utf8).toString();
    } finally {
        // Always hand the decoder back in REPORT mode, even if decode() threw.
        // It originates from DECODER_FACTORY and is presumably reused
        // (confirm), so leftover REPLACE settings would affect later callers.
        if (replace) {
            decoder.onMalformedInput(CodingErrorAction.REPORT);
            decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
        }
    }
}

Source File: StringUtils.java From Pydev with Eclipse Public License 1.0

6 votes

/**
 * Decodes {@code b} into a string without propagating exceptions.
 * <p>
 * With no charset name the bytes are read as ISO-8859-1. Otherwise the named
 * charset is used; if that throws, a second attempt is made with a decoder
 * that ignores malformed and unmappable input. If the second attempt throws
 * as well, the failure is logged and a fixed error text is returned.
 *
 * @param b the bytes to decode
 * @param baseCharset name of the charset to use, or {@code null} for ISO-8859-1
 * @return the decoded text, or a fixed error message if both attempts fail
 */
public static String safeDecodeByteArray(byte[] b, String baseCharset) {
    try {
        return (baseCharset != null)
                ? new String(b, baseCharset)
                : new String(b, StandardCharsets.ISO_8859_1);
    } catch (Exception ignored) {
        // Fall through to the lenient decoder below.
    }

    try {
        //If it fails, go for something which shouldn't fail!
        CharsetDecoder lenient = Charset.forName(baseCharset).newDecoder();
        lenient.onMalformedInput(CodingErrorAction.IGNORE);
        lenient.onUnmappableCharacter(CodingErrorAction.IGNORE);
        return lenient.decode(ByteBuffer.wrap(b, 0, b.length)).toString();
    } catch (Exception fallbackFailure) {
        Log.log(fallbackFailure);
        //Shouldn't ever happen!
        return new String("Unable to decode bytearray from Python.");
    }
}

Source File: Text.java From Bats with Apache License 2.0

6 votes

/**
 * Turns the remaining bytes of {@code utf8} into a string with the decoder
 * supplied by {@code DECODER_FACTORY}.
 *
 * @param utf8 buffer with the bytes to decode
 * @param replace {@code true} to replace malformed and unmappable input
 *        instead of reporting it
 * @return the decoded string
 * @throws CharacterCodingException if the decoder reports a coding error
 */
private static String decode(ByteBuffer utf8, boolean replace)
    throws CharacterCodingException {
  CharsetDecoder decoder = DECODER_FACTORY.get();
  if (replace) {
    decoder.onMalformedInput(CodingErrorAction.REPLACE);
    decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
  }
  try {
    return decoder.decode(utf8).toString();
  } finally {
    // Reset to REPORT on all exit paths. The decoder is not created here but
    // fetched from DECODER_FACTORY (presumably reused; confirm), so a failed
    // decode must not leave it in REPLACE mode.
    if (replace) {
      decoder.onMalformedInput(CodingErrorAction.REPORT);
      decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
    }
  }
}

Source File: LuceneUtil.java From localization_nifi with Apache License 2.0

5 votes

/**
 * Truncate a single field so that it does not exceed Lucene's byte size limit on indexed terms.
 * <p>
 * The limit is applied to the UTF-8 encoding of the field, which is the
 * encoding Lucene measures term length in. If the cut falls inside a
 * multi-byte character, that partial character is dropped.
 *
 * @param field the string to be indexed; may be {@code null}
 * @return {@code field} itself if it is {@code null} or already within the limit, otherwise a
 *         truncated copy that fits; {@code null} if decoding the truncated bytes fails
 */
public static String truncateIndexField(String field) {
    if (field == null) {
        return field;
    }

    // Always measure in UTF-8: the platform default charset can produce a
    // different byte count and can lose characters it cannot encode.
    final Charset charset = java.nio.charset.StandardCharsets.UTF_8;
    final byte[] bytes = field.getBytes(charset);
    if (bytes.length <= IndexWriter.MAX_TERM_LENGTH) {
        return field;
    }

    // chop the field to maximum allowed byte length
    final ByteBuffer bbuf = ByteBuffer.wrap(bytes, 0, IndexWriter.MAX_TERM_LENGTH);

    try {
        // Decode the chopped bytes back to text; IGNORE drops a trailing
        // multi-byte sequence that was cut in half.
        return charset.newDecoder()
                .onMalformedInput(CodingErrorAction.IGNORE)
                .onUnmappableCharacter(CodingErrorAction.IGNORE)
                .decode(bbuf)
                .toString();
    } catch (CharacterCodingException shouldNotHappen) {
        // Not expected with both error actions set to IGNORE; keep the
        // documented "null if anything goes wrong" contract.
        return null;
    }
}

Source File: AbstractMatcher.java From netbeans with Apache License 2.0

5 votes

/**
 * Creates a decoder for {@code charset} whose error handling depends on the
 * {@code strict} flag of this object.
 * <p>
 * In strict mode both malformed and unmappable input are reported. Otherwise
 * malformed input is ignored and unmappable characters are replaced.
 *
 * @param charset the charset to create a decoder for
 * @return a newly created, configured decoder
 */
public CharsetDecoder prepareDecoder(Charset charset) {
    final CharsetDecoder configured = charset.newDecoder();
    final boolean reportErrors = strict;

    final CodingErrorAction malformedAction =
            reportErrors ? CodingErrorAction.REPORT : CodingErrorAction.IGNORE;
    final CodingErrorAction unmappableAction =
            reportErrors ? CodingErrorAction.REPORT : CodingErrorAction.REPLACE;

    return configured
            .onMalformedInput(malformedAction)
            .onUnmappableCharacter(unmappableAction);
}

Source File: Message.java From SI with BSD 2-Clause "Simplified" License

5 votes

/**
 * Builds a short, human-readable rendering of the payload for tracing.
 * <p>
 * If the payload contains no control bytes other than tab, line feed and
 * carriage return, and it decodes as UTF-8 without error, the (possibly
 * truncated) text is returned in quotes. Otherwise the result of
 * {@code Utils.toHexText(payload, 256)} is returned.
 *
 * @return {@code "no payload"} for a missing or empty payload, the quoted
 *         text (with a byte count when truncated to 24 chars), or the hex
 *         rendering
 */
public String getPayloadTracingString() {
	if (null == payload || 0 == payload.length)
		return "no payload";
	boolean text = true;
	for (byte b : payload) {
		// Compare as unsigned: Java bytes are signed, so the bytes >= 0x80 that
		// make up non-ASCII UTF-8 characters would otherwise look like control
		// bytes and force the hex rendering for every non-ASCII text.
		boolean control = (b & 0xFF) < ' ';
		boolean allowedWhitespace = b == '\t' || b == '\n' || b == '\r';
		if (control && !allowedWhitespace) {
			text = false;
			break;
		}
	}
	if (text) {
		// Strict decoding: invalid UTF-8 falls through to the hex rendering.
		CharsetDecoder decoder = CoAP.UTF8_CHARSET.newDecoder();
		decoder.onMalformedInput(CodingErrorAction.REPORT);
		decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
		ByteBuffer in = ByteBuffer.wrap(payload);
		// At most 24 chars are shown; longer text yields OVERFLOW below.
		CharBuffer out = CharBuffer.allocate(24);
		CoderResult result = decoder.decode(in, out, true);
		decoder.flush(out);
		out.flip();
		if (CoderResult.OVERFLOW == result) {
			return "\"" + out +  "\".. " + payload.length + " bytes";
		} else if (!result.isError()){
			return "\"" + out + "\"" ;
		}
	}
	return Utils.toHexText(payload, 256);
}

Source File: JsonReader.java From jsondb-core with MIT License

5 votes

/**
 * Opens {@code collectionFile} for reading under an exclusive file lock.
 * <p>
 * A lock file named after the collection is created (if missing) in a
 * "lock" directory next to the collection file and locked before the
 * collection file is opened. The collection is read with a strict decoder
 * for the configured charset, so malformed or unmappable bytes are reported
 * rather than replaced.
 *
 * @param dbConfig configuration supplying the charset of the collection file
 * @param collectionFile the collection file to read
 * @throws IOException if the lock file or the collection file cannot be opened
 * @throws JsonFileLockException if the file lock cannot be obtained
 */
public JsonReader(JsonDBConfig dbConfig, File collectionFile) throws IOException {
  this.collectionFile = collectionFile;
  this.lockFilesLocation = new File(collectionFile.getParentFile(), "lock");
  this.fileLockLocation = new File(lockFilesLocation, collectionFile.getName() + ".lock");
  
  if(!lockFilesLocation.exists()) {
    lockFilesLocation.mkdirs();
  }
  if(!fileLockLocation.exists()) {
    fileLockLocation.createNewFile();
  }

  CharsetDecoder decoder = dbConfig.getCharset().newDecoder();
  decoder.onMalformedInput(CodingErrorAction.REPORT);
  decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
  
  raf = new RandomAccessFile(fileLockLocation, "rw");
  channel = raf.getChannel();
  try {
    lock = channel.lock();
  } catch (IOException | OverlappingFileLockException e) {
    try {
      channel.close();
      raf.close();
    } catch (IOException e1) {
      logger.error("Failed while closing RandomAccessFile for collection file {}", collectionFile.getName());
    }
    throw new JsonFileLockException("JsonReader failed to obtain a file lock for file " + fileLockLocation, e);
  }

  try {
    fis = new FileInputStream(collectionFile);
    isr = new InputStreamReader(fis, decoder);
    reader = new BufferedReader(isr);
  } catch (IOException | RuntimeException e) {
    // The constructor is failing, so the caller never gets an object it could
    // close. Release the lock file here; closing the channel also releases
    // the file lock held on it.
    try {
      channel.close();
      raf.close();
    } catch (IOException e1) {
      logger.error("Failed while closing RandomAccessFile for collection file {}", collectionFile.getName());
    }
    throw e;
  }
}

Source File: InputStreamReaderTest.java From j2objc with Apache License 2.0

5 votes

/**
 * Verifies how {@code InputStreamReader.read()} handles a truncated UTF-8
 * sequence (bytes -32, -96) depending on where its decoder comes from:
 * readers that build their own decoder replace the bad input, while a
 * decoder passed in by the caller keeps its configured error action.
 */
public void test_read_1() throws IOException {
    final byte[] truncatedSequence = { -32, -96 };
    final Charset utf8 = Charset.forName("UTF-8");

    // if the decoder is constructed by InputStreamReader itself, the
    // decoder's default error action is REPLACE
    InputStreamReader byCharsetName = new InputStreamReader(
            new ByteArrayInputStream(truncatedSequence), "UTF-8");
    assertEquals("read() return incorrect value", 65533, byCharsetName.read());

    InputStreamReader byCharset = new InputStreamReader(
            new ByteArrayInputStream(truncatedSequence), utf8);
    assertEquals("read() return incorrect value", 65533, byCharset.read());

    // if the decoder is passed in, keep its status intact
    CharsetDecoder reporting = utf8.newDecoder();
    reporting.onMalformedInput(CodingErrorAction.REPORT);
    InputStreamReader reportingReader = new InputStreamReader(
            new ByteArrayInputStream(truncatedSequence), reporting);
    try {
        reportingReader.read();
        fail("Should throw MalformedInputException");
    } catch (MalformedInputException e) {
        // expected
    }

    CharsetDecoder ignoring = utf8.newDecoder();
    ignoring.onMalformedInput(CodingErrorAction.IGNORE);
    InputStreamReader ignoringReader = new InputStreamReader(
            new ByteArrayInputStream(truncatedSequence), ignoring);
    assertEquals("read() return incorrect value", -1, ignoringReader.read());

    CharsetDecoder replacing = utf8.newDecoder();
    replacing.onMalformedInput(CodingErrorAction.REPLACE);
    InputStreamReader replacingReader = new InputStreamReader(
            new ByteArrayInputStream(truncatedSequence), replacing);
    assertEquals("read() return incorrect value", 65533, replacingReader.read());
}

Source File: CharacterEncodingExamples.java From tutorials with MIT License

5 votes

/**
 * Encodes {@code input} as UTF-8 and decodes the first line of the result
 * with {@code charset}, applying {@code codingErrorAction} to malformed input.
 *
 * @param input the text to round-trip
 * @param charset the charset used to decode the UTF-8 bytes
 * @param codingErrorAction what the decoder does with malformed input
 * @return the first decoded line, or {@code null} if the input is empty
 * @throws IOException if the decoder reports an error while reading
 */
static String decodeText(String input, Charset charset, CodingErrorAction codingErrorAction) throws IOException {
    CharsetDecoder charsetDecoder = charset.newDecoder();
    charsetDecoder.onMalformedInput(codingErrorAction);
    // Encode explicitly as UTF-8 so the result does not depend on the
    // platform default charset.
    byte[] encoded = input.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    try (BufferedReader reader = new BufferedReader(
      new InputStreamReader(
        new ByteArrayInputStream(encoded), charsetDecoder))) {
        return reader.readLine();
    }
}

Source File: AbstractTextParser.java From CloverETL-Engine with GNU Lesser General Public License v2.1

4 votes

/**
 * Switches the given decoder to lenient mode: both malformed input and
 * unmappable characters are replaced instead of being reported.
 *
 * @param decoder the decoder to reconfigure
 */
protected static void setLenientDecoder(CharsetDecoder decoder) {
	decoder
			.onMalformedInput(CodingErrorAction.REPLACE)
			.onUnmappableCharacter(CodingErrorAction.REPLACE);
}

Source File: NonBlockingFetcher.java From SEAL with Apache License 2.0

4 votes

/**
 * Decodes every fetched buffer in {@code works} into text and stores the
 * result, with the HTTP header removed, in {@code documents}.
 * <p>
 * Each buffer is first decoded with the class-level {@code charset}. If
 * {@code CHARSET_PAT} finds an encoding in that text which differs from
 * {@code DEFAULT_ENCODING}, the buffer is decoded a second time with the
 * declared encoding. Entries that are {@code null} or fail to decode keep a
 * {@code null} placeholder in {@code documents}.
 */
private static void processDocuments() {
    final CharsetDecoder firstPass = charset.newDecoder();
    firstPass.onMalformedInput(CodingErrorAction.IGNORE);
    firstPass.onUnmappableCharacter(CodingErrorAction.IGNORE);

    // perform 1 to 2-pass decoding on every document
    for (int idx = 0; idx < works.length; idx++) {
        // Placeholder first; it is overwritten below only on success.
        // (Assumes documents is empty on entry so indices line up -- confirm.)
        documents.add(null);
        if (works[idx] == null) {
            continue;
        }

        final URL url = works[idx].url;
        log.debug("[" + (idx + 1) + "/" + urls.length + "] Processing: " + url);
        final ByteBuffer raw = works[idx].buffer;
        raw.flip();

        String text;
        try {
            // try to use default encoding to decode the document
            text = firstPass.decode(raw).toString();

            // identify encoding by looking into the <meta> tag
            String encoding = DEFAULT_ENCODING;
            final Matcher declared = CHARSET_PAT.matcher(text);
            if (declared.find()) {
                encoding = declared.group(1).toUpperCase();
                log.debug("Encoding identified as: " + encoding);
            } else {
                log.debug("Encoding could not be identified! Using the default: " + DEFAULT_ENCODING);
            }

            // decode again only when the identified encoding differs from the default
            if (!encoding.equals(DEFAULT_ENCODING)) {
                final CharsetDecoder secondPass = Charset.forName(encoding).newDecoder();
                secondPass.onUnmappableCharacter(CodingErrorAction.IGNORE);
                secondPass.onMalformedInput(CodingErrorAction.IGNORE);
                raw.flip();
                text = secondPass.decode(raw).toString();
            }
        } catch (Exception e) {
            log.error("Character coding error: " + e);
            continue;
        }
        documents.set(idx, removeHTTPHeader(text));
    }
}

Source File: ConvertCharacterSet.java From localization_nifi with Apache License 2.0

4 votes

/**
 * Re-encodes the content of the next FlowFile from the configured input
 * character set to the configured output character set.
 * <p>
 * Input that is malformed or unmappable, and characters the output charset
 * cannot represent, are replaced with "?". On success the FlowFile is
 * transferred to {@code REL_SUCCESS}; any failure is rethrown as a
 * {@link ProcessException}.
 *
 * @param context supplies the INPUT_CHARSET and OUTPUT_CHARSET properties
 * @param session the session used to read, rewrite and transfer the FlowFile
 */
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final ComponentLog logger = getLogger();

    final Charset inputCharset = Charset.forName(context.getProperty(INPUT_CHARSET).evaluateAttributeExpressions(flowFile).getValue());
    final Charset outputCharset = Charset.forName(context.getProperty(OUTPUT_CHARSET).evaluateAttributeExpressions(flowFile).getValue());
    final CharBuffer charBuffer = CharBuffer.allocate(MAX_BUFFER_SIZE);

    final CharsetDecoder decoder = inputCharset.newDecoder();
    decoder.onMalformedInput(CodingErrorAction.REPLACE);
    decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    decoder.replaceWith("?");

    final CharsetEncoder encoder = outputCharset.newEncoder();
    encoder.onMalformedInput(CodingErrorAction.REPLACE);
    encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    encoder.replaceWith("?".getBytes(outputCharset));

    try {
        final StopWatch stopWatch = new StopWatch(true);
        flowFile = session.write(flowFile, new StreamCallback() {
            @Override
            public void process(final InputStream rawIn, final OutputStream rawOut) throws IOException {
                try (final BufferedReader reader = new BufferedReader(new InputStreamReader(rawIn, decoder), MAX_BUFFER_SIZE);
                        final BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(rawOut, encoder), MAX_BUFFER_SIZE)) {
                    int charsRead;
                    while ((charsRead = reader.read(charBuffer)) != -1) {
                        charBuffer.flip();
                        writer.write(charBuffer.array(), 0, charsRead);
                        // Restore the full capacity for the next read. Without
                        // this the flipped limit sticks, so every later read is
                        // capped at the size of the smallest read so far.
                        charBuffer.clear();
                    }

                    writer.flush();
                }
            }
        });

        session.getProvenanceReporter().modifyContent(flowFile, stopWatch.getElapsed(TimeUnit.MILLISECONDS));
        logger.info("successfully converted characters from {} to {} for {}",
                new Object[]{inputCharset, outputCharset, flowFile});
        session.transfer(flowFile, REL_SUCCESS);
    } catch (final Exception e) {
        throw new ProcessException(e);
    }
}

Java Code Examples for java.nio.charset.CharsetDecoder#onMalformedInput()