org.apache.commons.io.ByteOrderMark Java Examples

Source File: EncodingSniffer.java From htmlunit with Apache License 2.0

6 votes

/**
 * Determines the charset announced by a Unicode
 * <a href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a>
 * at the beginning of the given byte array. The marks are tested in the order
 * UTF-8, UTF-16BE, UTF-16LE.
 *
 * @param bytes the bytes to inspect for a Byte Order Mark; may be {@code null}
 * @return the charset of the first mark that matches, or {@code null} if {@code bytes}
 *         is {@code null} or none of the tested marks matches
 */
static Charset sniffEncodingFromUnicodeBom(final byte[] bytes) {
    if (bytes == null) {
        return null;
    }

    final Charset detected;
    if (startsWith(bytes, ByteOrderMark.UTF_8)) {
        detected = UTF_8;
    }
    else if (startsWith(bytes, ByteOrderMark.UTF_16BE)) {
        detected = UTF_16BE;
    }
    else if (startsWith(bytes, ByteOrderMark.UTF_16LE)) {
        detected = UTF_16LE;
    }
    else {
        detected = null;
    }

    if (detected == null) {
        return null;
    }

    // Only a successful detection is reported, and only at debug level.
    if (LOG.isDebugEnabled()) {
        LOG.debug("Encoding found in Unicode Byte Order Mark: '" + detected + "'.");
    }
    return detected;
}

Source File: HtmlScript2Test.java From htmlunit with Apache License 2.0

6 votes

/**
 * Serves an external script that starts with a UTF-8 Byte Order Mark while the
 * {@code script} tag declares an ISO-8859-1 charset, and expects the alert to show
 * the original (non-Latin) text unchanged.
 *
 * @throws Exception if the test fails
 */
@Test
@Alerts("\u0623\u0647\u0644\u0627\u064b\u0623\u0647\u0644\u0627"
        + "\u064b\u0623\u0647\u0644\u0627\u064b\u0623\u0647\u0644\u0627\u064b")
public void incorrectCharset() throws Exception {
    final String html
        = "<html><head>\n"
        + "  <script src='" + URL_SECOND + "' charset='" + ISO_8859_1 + "'></script>\n"
        + "</head>\n"
        + "<body></body>\n"
        + "</html>";

    // Decode the BOM bytes explicitly as UTF-8 so they become U+FEFF on every platform.
    // The charset-less String constructor uses the platform default charset, which on a
    // non-UTF-8 platform turns the three BOM bytes into three unrelated characters and the
    // response would then no longer start with a BOM.
    final String script = new String(ByteOrderMark.UTF_8.getBytes(), UTF_8)
            + "alert('" + "\u0623\u0647\u0644\u0627\u064b\u0623\u0647\u0644\u0627"
                        + "\u064b\u0623\u0647\u0644\u0627\u064b\u0623\u0647\u0644\u0627\u064b" + "');";
    getMockWebConnection().setResponse(URL_SECOND, script, MimeType.APPLICATION_JAVASCRIPT, UTF_8);
    loadPageWithAlerts2(html);
}

Source File: TextInput.java From dremio-oss with Apache License 2.0

6 votes

/**
 * Tests whether the bytes at the current read position equal the given byte order mark.
 * The read position ({@code bufferPtr}) is not modified by this method.
 *
 * @param bom the byte order mark to compare against
 * @return {@code true} if every byte of {@code bom} matches; {@code false} if a byte
 *         differs or the length check against {@code length} fails
 */
private final boolean checkBom(ByteOrderMark bom) {
  final int markLength = bom.length();
  if (bufferPtr + markLength >= length) {
    // The mark would not fit between the current position and the end of the buffer
    return false;
  }
  if (BoundsChecking.BOUNDS_CHECKING_ENABLED) {
    buffer.checkBytes(bufferPtr - 1, bufferPtr + markLength);
  }

  final byte[] expected = bom.getBytes();
  int offset = 0;
  while (offset < markLength) {
    final byte actual = PlatformDependent.getByte(bStartMinus1 + bufferPtr + offset);
    if (actual != expected[offset]) {
      // Mismatch: this is not the requested BOM
      return false;
    }
    offset++;
  }
  return true;
}

Source File: EncodingSniffer.java From HtmlUnit-Android with Apache License 2.0

6 votes

/**
 * Determines the charset announced by a Unicode
 * <a href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a>
 * at the beginning of the given byte array. The marks are tested in the order
 * UTF-8, UTF-16BE, UTF-16LE.
 *
 * @param bytes the bytes to inspect for a Byte Order Mark; may be {@code null}
 * @return the charset of the first mark that matches, or {@code null} if {@code bytes}
 *         is {@code null} or none of the tested marks matches
 */
static Charset sniffEncodingFromUnicodeBom(final byte[] bytes) {
    if (bytes == null) {
        return null;
    }

    final Charset detected;
    if (startsWith(bytes, ByteOrderMark.UTF_8)) {
        detected = UTF_8;
    }
    else if (startsWith(bytes, ByteOrderMark.UTF_16BE)) {
        detected = UTF_16BE;
    }
    else if (startsWith(bytes, ByteOrderMark.UTF_16LE)) {
        detected = UTF_16LE;
    }
    else {
        detected = null;
    }

    if (detected == null) {
        return null;
    }

    // Only a successful detection is reported, and only at debug level.
    if (LOG.isDebugEnabled()) {
        LOG.debug("Encoding found in Unicode Byte Order Mark: '" + detected + "'.");
    }
    return detected;
}

Source File: TestNewTextReader.java From dremio-oss with Apache License 2.0

6 votes

/**
 * Writes a small .csv file that starts with a UTF-8 BOM and expects the text reader
 * to return its single data row under the header columns A and B.
 */
@Test
public void testBomUtf8() throws Exception {
  // Simple .csv file with a UTF-8 BOM. Should read successfully
  File testFolder = tempDir.newFolder("testUtf8Folder");
  File testFile = new File(testFolder, "utf8.csv");
  // try-with-resources guarantees the file handle is released before the query runs,
  // even if writing the fixture fails part-way
  try (PrintStream p = new PrintStream(testFile)) {
    p.write(ByteOrderMark.UTF_8.getBytes(), 0, ByteOrderMark.UTF_8.length());
    p.print("A,B\n");
    p.print("5,7\n");
  }

  testBuilder()
    .sqlQuery(String.format("select * from table(dfs.\"%s\" (type => 'text', " +
      "fieldDelimiter => ',', lineDelimiter => '\n', extractHeader => true))",
      testFile.getAbsolutePath()))
    .unOrdered()
    .baselineColumns("A","B")
    .baselineValues("5", "7")
    .go();
}

Source File: TestNewTextReader.java From dremio-oss with Apache License 2.0

6 votes

/**
 * Writes a small .csv file that starts with a UTF-16LE BOM and expects the query to
 * fail with a DATA_READ user exception stating that UTF-16 files are not supported.
 */
@Test
public void testErrorBomUtf16() throws Exception {
  // UTF-16 BOM should cause a dataReadError user exception
  File testFolder = tempDir.newFolder("testUtf16Folder");
  File testFile = new File(testFolder, "utf16.csv");
  // try-with-resources guarantees the file handle is released before the query runs,
  // even if writing the fixture fails part-way
  try (PrintStream p = new PrintStream(testFile)) {
    p.write(ByteOrderMark.UTF_16LE.getBytes(), 0, ByteOrderMark.UTF_16LE.length());
    p.print("A,B\n");
    p.print("5,7\n");
  }

  thrownException.expect(new UserExceptionMatcher(UserBitShared.DremioPBError.ErrorType.DATA_READ,
    "DATA_READ ERROR: UTF-16 files not supported"));
  // NB: using test() instead of testBuilder() because it unwraps the thrown RpcException and re-throws the
  // underlying UserException (which is then matched with the UserExceptionMatcher)
  test(String.format("select * from table(dfs.\"%s\" (type => 'text', " +
      "fieldDelimiter => ',', lineDelimiter => '\n', extractHeader => true))",
    testFile.getAbsolutePath()));
}

Source File: SubmitAndSyncUtf16FileTypeTest.java From p4ic4idea with Apache License 2.0

5 votes

/**
 * Reads the given file with a leading UTF-16LE/UTF-16BE byte order mark excluded,
 * converts the remaining bytes from {@code utf16} to UTF-8 and returns the limit of
 * the converted buffer.
 *
 * @param testResourceFile the file to read
 * @param utf16 the source charset handed to the converter
 * @return the limit of the buffer produced by the conversion
 * @throws Exception if reading or converting fails
 */
private long getUtf16FileSizeAfterRemoveBomAndEncodedByUtf8(File testResourceFile, Charset utf16) throws Exception {
    try (BOMInputStream bomStripped = new BOMInputStream(
            new FileInputStream(testResourceFile),
            false,
            ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE)) {
        ByteBuffer utf16Content = ByteBuffer.wrap(IOUtils.toByteArray(bomStripped));
        CharsetConverter toUtf8 = new CharsetConverter(utf16, CharsetDefs.UTF8);
        return toUtf8.convert(utf16Content).limit();
    }
}

Source File: BOMInputStream.java From lams with GNU General Public License v2.0

5 votes

/**
 * Returns the Byte Order Mark found at the start of the stream. The leading bytes are
 * read and matched on the first call only; later calls return the stored result.
 * 
 * @return the detected BOM, or {@code null} if none matched
 * @throws IOException
 *             if reading the first bytes of the stream fails
 */
public ByteOrderMark getBOM() throws IOException {
    if (firstBytes != null) {
        // Detection already ran
        return byteOrderMark;
    }

    fbLength = 0;
    // boms is sorted longest first, so element 0 gives the number of bytes to buffer
    final int maxBomSize = boms.get(0).length();
    firstBytes = new int[maxBomSize];
    for (int pos = 0; pos < firstBytes.length; pos++) {
        final int next = in.read();
        firstBytes[pos] = next;
        // a negative (end-of-stream) value is stored and counted as well
        fbLength++;
        if (next < 0) {
            break;
        }
    }

    byteOrderMark = find();
    if (byteOrderMark != null && !include) {
        if (byteOrderMark.length() < firstBytes.length) {
            // skip the BOM but keep the extra buffered bytes available
            fbIndex = byteOrderMark.length();
        } else {
            // the buffer holds nothing but the BOM
            fbLength = 0;
        }
    }
    return byteOrderMark;
}

Source File: BOMInputStream.java From lams with GNU General Public License v2.0

5 votes

/**
 * Looks through the configured BOMs, in list order, for the first one accepted by
 * {@code matches}.
 * 
 * @return the first matching BOM, or {@code null} if none matches
 */
private ByteOrderMark find() {
    ByteOrderMark found = null;
    for (final ByteOrderMark candidate : boms) {
        if (matches(candidate)) {
            found = candidate;
            break;
        }
    }
    return found;
}

Source File: BOMInputStream.java From lams with GNU General Public License v2.0

5 votes

/**
 * Compares the buffered first bytes of the stream against the given BOM.
 * 
 * @param bom
 *            the BOM to compare with
 * @return {@code true} if the leading buffered bytes equal the BOM's bytes, otherwise {@code false}
 */
private boolean matches(final ByteOrderMark bom) {
    // The number of buffered bytes is deliberately not required to equal the BOM length:
    // firstBytes may be bigger than the BOM bytes, so only the BOM's own length is compared.
    final int bomLength = bom.length();
    int pos = 0;
    while (pos < bomLength) {
        if (bom.get(pos) != firstBytes[pos]) {
            return false;
        }
        pos++;
    }
    return true;
}

Source File: TextRecordWriter.java From dremio-oss with Apache License 2.0

5 votes

/**
 * Starts writing a new partition: closes the current output if a partition is already
 * open, creates the next output file, writes a UTF-8 byte order mark and then the
 * header line built from the column names.
 *
 * @param partition the partition to write into
 * @throws Exception if closing the previous output fails; a failure to create the new
 *         file is rethrown as a data-write {@code UserException}
 */
@Override
public void startPartition(WritePartition partition) throws Exception {

  if(this.partition != null){
    close();
  }

  this.partition = partition;
  // open a new file for writing data with new schema
  try {
    this.path = fs.canonicalizePath(partition.qualified(location, prefix + "_" + index + "." + extension));
    dos = new DataOutputStream(fs.create(path));
    // The file starts with a UTF-8 BOM, so the text written after it must be UTF-8 too.
    // A PrintStream built without a charset encodes with the platform default, which
    // contradicts the BOM on non-UTF-8 platforms.
    stream = new PrintStream(dos, false, "UTF-8");
    stream.write(ByteOrderMark.UTF_8.getBytes(), 0, ByteOrderMark.UTF_8.length());
    logger.debug("Created file: {}", path);
  } catch (IOException e) {
    throw UserException.dataWriteError(e)
      .message("Failure while attempting to write file %s.", path)
      .build(logger);
  }
  index++;

  String columns = Joiner.on(fieldDelimiter).join(columnNames);
  stream.print(columns);
  stream.print(lineDelimiter);

}

Source File: TextInput.java From dremio-oss with Apache License 2.0

5 votes

/**
 * Advances the read position past a UTF-8 byte order mark if one is present.
 * A UTF-16 (LE or BE) byte order mark is rejected with a data-read error.
 *
 * @throws IOException declared by the signature
 */
private final void skipOptionalBOM() throws IOException {
  if (checkBom(ByteOrderMark.UTF_8)) {
    bufferPtr += ByteOrderMark.UTF_8.length();
    return;
  }

  final boolean utf16Detected = checkBom(ByteOrderMark.UTF_16LE) || checkBom(ByteOrderMark.UTF_16BE);
  if (utf16Detected) {
    throw UserException.dataReadError()
      .message("UTF-16 files not supported")
      .build(logger);
  }
}

Source File: XMLUtils.java From modernmt with Apache License 2.0

5 votes

/**
 * Creates an {@link XMLEventReader} over the given stream. If the stream starts with a
 * UTF-8, UTF-16BE or UTF-16LE byte order mark, the BOM's charset is used for decoding;
 * otherwise the charset returned by {@code UTF8Charset.get()} is used.
 *
 * @param stream the XML input
 * @return an event reader over the decoded stream
 * @throws XMLStreamException if the BOM cannot be read or the reader cannot be created
 */
public static XMLEventReader createEventReader(InputStream stream) throws XMLStreamException {
    final Charset fallback = UTF8Charset.get();

    final BOMInputStream bomAware = new BOMInputStream(stream, false,
            ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE);

    final Charset effective;
    try {
        effective = bomAware.hasBOM()
                ? Charset.forName(bomAware.getBOMCharsetName())
                : fallback;
    } catch (IOException e) {
        throw new XMLStreamException(e);
    }

    // NOTE(review): the factory is used with its default settings; if this stream can carry
    // untrusted XML, consider disabling DTD / external-entity support - confirm with callers.
    final XMLInputFactory xmlFactory = XMLInputFactory.newInstance();
    return xmlFactory.createXMLEventReader(new XMLFixInputStreamReader(bomAware, effective));
}

Source File: MD5Digester.java From p4ic4idea with Apache License 2.0

5 votes

/**
 * Feeds the digest with the UTF-8 form of a stream encoded in {@code charset}.
 * A leading UTF-8/UTF-16/UTF-32 byte order mark is excluded, the rest is decoded chunk by
 * chunk, re-encoded as UTF-8 and, when requested, passed through the line-ending
 * replacement helper before being handed to {@code update}.
 *
 * @param inStream the encoded input
 * @param charset the charset used to decode {@code inStream}
 * @param isRequireLineEndingConvert whether each chunk goes through the line-ending helper
 * @param clientLineEnding the client line ending handed to that helper
 * @throws IOException if reading fails, or if encoding reports malformed or unmappable input
 */
private void digestEncodedStreamToUtf8(@Nonnull InputStream inStream, @Nonnull Charset charset,
                                       boolean isRequireLineEndingConvert, @Nullable ClientLineEnding clientLineEnding)
		throws IOException {

	try (BOMInputStream bomStrippedStream = new BOMInputStream(inStream, false,
			ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
			ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);

	     InputStreamReader decodedReader = new InputStreamReader(bomStrippedStream,
			     charset)) {
		CharsetEncoder utf8Encoder = CharsetDefs.UTF8.newEncoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);

		char[] chunk = new char[bufferSize];
		for (int count = decodedReader.read(chunk); count > 0; count = decodedReader.read(chunk)) {
			// Convert encoded stream to UTF8 since server digest is UTF8
			ByteBuffer utf8Bytes = utf8Encoder.encode(CharBuffer.wrap(chunk, 0, count));

			// Either the converted chunk or the plain UTF-8 chunk is digested
			ByteBuffer digestInput = isRequireLineEndingConvert
					? findAndReplaceEncodedClientLineEndingIfRequireLineEndingCovert(
							decodedReader, utf8Encoder, utf8Bytes, clientLineEnding)
					: utf8Bytes;

			update(digestInput.array(), digestInput.arrayOffset(), digestInput.limit());
		}
	}
}

Source File: SubmitAndSyncUtf16FileTypeTest.java From p4ic4idea with Apache License 2.0

5 votes

/**
 * Reads the given file with a leading UTF-16LE/UTF-16BE byte order mark excluded,
 * converts the remaining bytes from {@code utf16} to UTF-8 and returns the limit of
 * the converted buffer.
 *
 * @param testResourceFile the file to read
 * @param utf16 the source charset handed to the converter
 * @return the limit of the buffer produced by the conversion
 * @throws Exception if reading or converting fails
 */
private long getUtf16FileSizeAfterRemoveBomAndEncodedByUtf8(File testResourceFile, Charset utf16) throws Exception {
    try (BOMInputStream bomStripped = new BOMInputStream(
            new FileInputStream(testResourceFile),
            false,
            ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE)) {
        ByteBuffer utf16Content = ByteBuffer.wrap(IOUtils.toByteArray(bomStripped));
        CharsetConverter toUtf8 = new CharsetConverter(utf16, CharsetDefs.UTF8);
        return toUtf8.convert(utf16Content).limit();
    }
}

Source File: MD5Digester.java From p4ic4idea with Apache License 2.0

5 votes

/**
 * Feeds the digest with the UTF-8 form of a stream encoded in {@code charset}.
 * A leading UTF-8/UTF-16/UTF-32 byte order mark is excluded, the rest is decoded chunk by
 * chunk, re-encoded as UTF-8 and, when requested, passed through the line-ending
 * replacement helper before being handed to {@code update}.
 *
 * @param inStream the encoded input
 * @param charset the charset used to decode {@code inStream}
 * @param isRequireLineEndingConvert whether each chunk goes through the line-ending helper
 * @param clientLineEnding the client line ending handed to that helper
 * @throws IOException if reading fails, or if encoding reports malformed or unmappable input
 */
private void digestEncodedStreamToUtf8(@Nonnull InputStream inStream, @Nonnull Charset charset,
                                       boolean isRequireLineEndingConvert, @Nullable ClientLineEnding clientLineEnding)
		throws IOException {

	try (BOMInputStream bomStrippedStream = new BOMInputStream(inStream, false,
			ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
			ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);

	     InputStreamReader decodedReader = new InputStreamReader(bomStrippedStream,
			     charset)) {
		CharsetEncoder utf8Encoder = CharsetDefs.UTF8.newEncoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);

		char[] chunk = new char[bufferSize];
		for (int count = decodedReader.read(chunk); count > 0; count = decodedReader.read(chunk)) {
			// Convert encoded stream to UTF8 since server digest is UTF8
			ByteBuffer utf8Bytes = utf8Encoder.encode(CharBuffer.wrap(chunk, 0, count));

			// Either the converted chunk or the plain UTF-8 chunk is digested
			ByteBuffer digestInput = isRequireLineEndingConvert
					? findAndReplaceEncodedClientLineEndingIfRequireLineEndingCovert(
							decodedReader, utf8Encoder, utf8Bytes, clientLineEnding)
					: utf8Bytes;

			update(digestInput.array(), digestInput.arrayOffset(), digestInput.limit());
		}
	}
}

Source File: WebResponse.java From htmlunit with Apache License 2.0

5 votes

/**
 * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
 *
 * Converts the response content into a string using the given charset instead of the
 * charset/encoding announced by the server response. A BOM header found in the content
 * takes precedence over the {@code encoding} parameter.
 * @param encoding the charset/encoding to use to convert the response content into a string
 * @param ignoreUtf8Bom if true a UTF-8 BOM header does not override {@code encoding}
 * @return the response content as a string, or null if there is no response data or
 *         reading the content failed
 */
public String getContentAsString(final Charset encoding, final boolean ignoreUtf8Bom) {
    if (responseData_ == null) {
        return null;
    }

    try (InputStream in = responseData_.getInputStreamWithBomIfApplicable(BOM_HEADERS)) {
        if (!(in instanceof BOMInputStream)) {
            // no BOM handling applies to this stream
            return IOUtils.toString(in, encoding);
        }

        try (BOMInputStream bomIn = (BOMInputStream) in) {
            // there seems to be a bug in BOMInputStream
            // we have to call this before hasBOM(ByteOrderMark)
            if (bomIn.hasBOM()) {
                if (!ignoreUtf8Bom && bomIn.hasBOM(ByteOrderMark.UTF_8)) {
                    return IOUtils.toString(bomIn, UTF_8);
                }
                else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
                    return IOUtils.toString(bomIn, UTF_16BE);
                }
                else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
                    return IOUtils.toString(bomIn, UTF_16LE);
                }
            }
            return IOUtils.toString(bomIn, encoding);
        }
    }
    catch (final IOException e) {
        LOG.warn(e.getMessage(), e);
        return null;
    }
}

Source File: MD5Digester.java From p4ic4idea with Apache License 2.0

5 votes

/**
 * Feeds the digest with the UTF-8 form of a stream encoded in {@code charset}.
 * A leading UTF-8/UTF-16/UTF-32 byte order mark is excluded, the rest is decoded chunk by
 * chunk, re-encoded as UTF-8 and, when requested, passed through the line-ending
 * replacement helper before being handed to {@code update}.
 *
 * @param inStream the encoded input
 * @param charset the charset used to decode {@code inStream}
 * @param isRequireLineEndingConvert whether each chunk goes through the line-ending helper
 * @param clientLineEnding the client line ending handed to that helper
 * @throws IOException if reading fails, or if encoding reports malformed or unmappable input
 */
private void digestEncodedStreamToUtf8(@Nonnull InputStream inStream, @Nonnull Charset charset,
                                       boolean isRequireLineEndingConvert, @Nullable ClientLineEnding clientLineEnding)
		throws IOException {

	try (BOMInputStream bomStrippedStream = new BOMInputStream(inStream, false,
			ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
			ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);

	     InputStreamReader decodedReader = new InputStreamReader(bomStrippedStream,
			     charset)) {
		CharsetEncoder utf8Encoder = CharsetDefs.UTF8.newEncoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);

		char[] chunk = new char[bufferSize];
		for (int count = decodedReader.read(chunk); count > 0; count = decodedReader.read(chunk)) {
			// Convert encoded stream to UTF8 since server digest is UTF8
			ByteBuffer utf8Bytes = utf8Encoder.encode(CharBuffer.wrap(chunk, 0, count));

			// Either the converted chunk or the plain UTF-8 chunk is digested
			ByteBuffer digestInput = isRequireLineEndingConvert
					? findAndReplaceEncodedClientLineEndingIfRequireLineEndingCovert(
							decodedReader, utf8Encoder, utf8Bytes, clientLineEnding)
					: utf8Bytes;

			update(digestInput.array(), digestInput.arrayOffset(), digestInput.limit());
		}
	}
}

Source File: SubmitAndSyncUtf16FileTypeTest.java From p4ic4idea with Apache License 2.0

5 votes

/**
 * Reads the given file with a leading UTF-16LE/UTF-16BE byte order mark excluded,
 * converts the remaining bytes from {@code utf16} to UTF-8 and returns the limit of
 * the converted buffer.
 *
 * @param testResourceFile the file to read
 * @param utf16 the source charset handed to the converter
 * @return the limit of the buffer produced by the conversion
 * @throws Exception if reading or converting fails
 */
private long getUtf16FileSizeAfterRemoveBomAndEncodedByUtf8(File testResourceFile, Charset utf16) throws Exception {
    try (BOMInputStream bomStripped = new BOMInputStream(
            new FileInputStream(testResourceFile),
            false,
            ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE)) {
        ByteBuffer utf16Content = ByteBuffer.wrap(IOUtils.toByteArray(bomStripped));
        CharsetConverter toUtf8 = new CharsetConverter(utf16, CharsetDefs.UTF8);
        return toUtf8.convert(utf16Content).limit();
    }
}

Source File: MD5Digester.java From p4ic4idea with Apache License 2.0

5 votes

/**
 * Feeds the digest with the UTF-8 form of a stream encoded in {@code charset}.
 * A leading UTF-8/UTF-16/UTF-32 byte order mark is excluded, the rest is decoded chunk by
 * chunk, re-encoded as UTF-8 and, when requested, passed through the line-ending
 * replacement helper before being handed to {@code update}.
 *
 * @param inStream the encoded input
 * @param charset the charset used to decode {@code inStream}
 * @param isRequireLineEndingConvert whether each chunk goes through the line-ending helper
 * @param clientLineEnding the client line ending handed to that helper
 * @throws IOException if reading fails, or if encoding reports malformed or unmappable input
 */
private void digestEncodedStreamToUtf8(@Nonnull InputStream inStream, @Nonnull Charset charset,
                                       boolean isRequireLineEndingConvert, @Nullable ClientLineEnding clientLineEnding)
		throws IOException {

	try (BOMInputStream bomStrippedStream = new BOMInputStream(inStream, false,
			ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
			ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);

	     InputStreamReader decodedReader = new InputStreamReader(bomStrippedStream,
			     charset)) {
		CharsetEncoder utf8Encoder = CharsetDefs.UTF8.newEncoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);

		char[] chunk = new char[bufferSize];
		for (int count = decodedReader.read(chunk); count > 0; count = decodedReader.read(chunk)) {
			// Convert encoded stream to UTF8 since server digest is UTF8
			ByteBuffer utf8Bytes = utf8Encoder.encode(CharBuffer.wrap(chunk, 0, count));

			// Either the converted chunk or the plain UTF-8 chunk is digested
			ByteBuffer digestInput = isRequireLineEndingConvert
					? findAndReplaceEncodedClientLineEndingIfRequireLineEndingCovert(
							decodedReader, utf8Encoder, utf8Bytes, clientLineEnding)
					: utf8Bytes;

			update(digestInput.array(), digestInput.arrayOffset(), digestInput.limit());
		}
	}
}

Source File: SubmitAndSyncUtf16FileTypeTest.java From p4ic4idea with Apache License 2.0

5 votes

/**
 * Reads the given file with a leading UTF-16LE/UTF-16BE byte order mark excluded,
 * converts the remaining bytes from {@code utf16} to UTF-8 and returns the limit of
 * the converted buffer.
 *
 * @param testResourceFile the file to read
 * @param utf16 the source charset handed to the converter
 * @return the limit of the buffer produced by the conversion
 * @throws Exception if reading or converting fails
 */
private long getUtf16FileSizeAfterRemoveBomAndEncodedByUtf8(File testResourceFile, Charset utf16) throws Exception {
    try (BOMInputStream bomStripped = new BOMInputStream(
            new FileInputStream(testResourceFile),
            false,
            ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE)) {
        ByteBuffer utf16Content = ByteBuffer.wrap(IOUtils.toByteArray(bomStripped));
        CharsetConverter toUtf8 = new CharsetConverter(utf16, CharsetDefs.UTF8);
        return toUtf8.convert(utf16Content).limit();
    }
}

Source File: StreamDecoder.java From batfish with Apache License 2.0

5 votes

/**
 * Wraps the given stream in a {@link BOMInputStream} configured to detect the UTF-8,
 * UTF-16 (BE/LE) and UTF-32 (BE/LE) byte order marks.
 */
private static @Nonnull BOMInputStream bomInputStream(@Nonnull InputStream inputStream) {
  final ByteOrderMark[] detectable = {
    ByteOrderMark.UTF_8,
    ByteOrderMark.UTF_16BE,
    ByteOrderMark.UTF_16LE,
    ByteOrderMark.UTF_32BE,
    ByteOrderMark.UTF_32LE
  };
  return new BOMInputStream(inputStream, detectable);
}

Source File: ChakraTest.java From es6draft with MIT License

5 votes

/**
 * Maps the byte order mark reported by the stream to a charset.
 *
 * @param bis the stream to query for its BOM
 * @return UTF-16LE or UTF-16BE for the corresponding BOM; UTF-8 for a UTF-8 BOM, for any
 *         other BOM and when there is no BOM
 * @throws IOException if the BOM cannot be read
 */
private static Charset charsetFor(BOMInputStream bis) throws IOException {
    final ByteOrderMark detected = bis.getBOM();
    if (ByteOrderMark.UTF_16LE.equals(detected)) {
        return StandardCharsets.UTF_16LE;
    }
    if (ByteOrderMark.UTF_16BE.equals(detected)) {
        return StandardCharsets.UTF_16BE;
    }
    // A UTF-8 BOM and the no-match default lead to the same answer
    return StandardCharsets.UTF_8;
}

Source File: StreamUtil.java From iaf with Apache License 2.0

5 votes

/**
 * Creates a Reader that decodes the InputStream with the character set named by its BOM
 * (UTF-8, UTF-16LE or UTF-16BE are checked). When no BOM is found, the supplied default
 * character set is used instead.
 */
public static Reader getCharsetDetectingInputStreamReader(InputStream inputStream, String defaultCharset) throws IOException {
	BOMInputStream bomStream = new BOMInputStream(inputStream, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
	ByteOrderMark detected = bomStream.getBOM();

	String charsetName;
	if (detected != null) {
		charsetName = detected.getCharsetName();
	} else {
		charsetName = defaultCharset;
	}
	return new InputStreamReader(new BufferedInputStream(bomStream), charsetName);
}

Source File: CharsetIdentification.java From storm-crawler with Apache License 2.0

5 votes

/**
 * Detects a UTF-8, UTF-16LE or UTF-16BE BOM at the start of the data and returns the
 * name of the corresponding charset.
 *
 * @param byteData the raw content to inspect
 * @return the charset name of the detected BOM, or {@code null} if no such BOM is
 *         present or the data cannot be read
 */
private static String getCharsetFromBOM(final byte[] byteData) {
    // The BOMs are listed explicitly: the single-argument BOMInputStream constructor
    // only looks for a UTF-8 BOM, so UTF-16 marks would go unnoticed.
    try (BOMInputStream bomIn = new BOMInputStream(
            new ByteArrayInputStream(byteData),
            ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE)) {
        ByteOrderMark bom = bomIn.getBOM();
        if (bom != null) {
            return bom.getCharsetName();
        }
    } catch (IOException e) {
        return null;
    }
    return null;
}

Source File: BOMInputStream.java From aion-germany with GNU General Public License v3.0

5 votes

/**
 * Orders byte order marks by length, longest first.
 *
 * @return -1 if {@code bom1} is longer, 1 if {@code bom2} is longer, 0 for equal lengths
 */
public int compare(ByteOrderMark bom1, ByteOrderMark bom2) {
    // Arguments are swapped on purpose to obtain descending order by length
    return Integer.compare(bom2.length(), bom1.length());
}

Source File: CsvInput.java From hop with Apache License 2.0

5 votes

/**
 * Reads the first line of the given file and derives the field names from it.
 * A leading UTF-8/UTF-16 byte order mark is handled by the BOM-aware stream, the line is
 * split with the variable-resolved delimiter and enclosure, enclosures are removed
 * from the names and the names are trimmed.
 *
 * @param fileName the file to read the header line from
 * @param csvInputMeta the CSV settings (delimiter, enclosure, encoding, escape character)
 * @return the field names found on the first line
 * @throws HopException if the file cannot be read
 */
String[] readFieldNamesFromFile( String fileName, CsvInputMeta csvInputMeta ) throws HopException {
  String delimiter = environmentSubstitute( csvInputMeta.getDelimiter() );
  String enclosure = environmentSubstitute( csvInputMeta.getEnclosure() );
  String realEncoding = environmentSubstitute( csvInputMeta.getEncoding() );

  try ( FileObject fileObject = HopVfs.getFileObject( fileName );
        BOMInputStream inputStream =
          new BOMInputStream( HopVfs.getInputStream( fileObject ), ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE ) ) {
    // Without a configured encoding the reader uses the platform default charset
    InputStreamReader reader = Utils.isEmpty( realEncoding )
      ? new InputStreamReader( inputStream )
      : new InputStreamReader( inputStream, realEncoding );
    EncodingType encodingType = EncodingType.guessEncodingType( reader.getEncoding() );
    String line =
      TextFileInput.getLine( log, reader, encodingType, TextFileInputMeta.FILE_FORMAT_UNIX, new StringBuilder(
        1000 ) );
    String[] fieldNames = TextFileLineUtil.guessStringsFromLine( log, line, delimiter, enclosure, csvInputMeta.getEscapeCharacter() );
    // Strip with the same resolved enclosure that was used to split the line; the raw meta
    // value may still be an unresolved variable expression that never occurs in the data.
    if ( !Utils.isEmpty( enclosure ) ) {
      removeEnclosure( fieldNames, enclosure );
    }
    trimFieldNames( fieldNames );
    return fieldNames;
  } catch ( IOException e ) {
    throw new HopFileException( BaseMessages.getString( PKG, "CsvInput.Exception.CreateFieldMappingError" ), e );
  }
}

Source File: WebResponse.java From HtmlUnit-Android with Apache License 2.0

5 votes

/**
 * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
 *
 * Returns the response content as a string, using the specified charset,
 * rather than the charset/encoding specified in the server response.
 * If there is a bom header the charset parameter will be overwritten by the bom.
 * @param encoding the charset/encoding to use to convert the response content into a string
 * @param ignoreUtf8Bom if true utf8 bom header will be ignored
 * @return the response content as a string or null if there is no response data, no
 *         input stream, or the content retrieval was failing
 */
public String getContentAsString(final Charset encoding, final boolean ignoreUtf8Bom) {
    if (responseData_ != null) {
        try (InputStream in = responseData_.getInputStream()) {
            if (in != null) {
                try (BOMInputStream bomIn = new BOMInputStream(in, BOM_HEADERS)) {
                    // there seems to be a bug in BOMInputStream
                    // we have to call this before hasBOM(ByteOrderMark)
                    if (bomIn.hasBOM()) {
                        if (!ignoreUtf8Bom && bomIn.hasBOM(ByteOrderMark.UTF_8)) {
                            return IOUtils.toString(bomIn, UTF_8);
                        }
                        if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
                            return IOUtils.toString(bomIn, UTF_16BE);
                        }
                        if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
                            return IOUtils.toString(bomIn, UTF_16LE);
                        }
                    }
                    return IOUtils.toString(bomIn, encoding);
                }
            }
        }
        catch (final IOException e) {
            // Pass the exception as the throwable argument so the stack trace is logged;
            // handing it over as the message object records only its string form.
            LOG.warn(e.getMessage(), e);
        }
    }
    return null;
}

Source File: BOMInputStream.java From aion-germany with GNU General Public License v3.0

5 votes

/**
 * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
 * 
 * @param delegate
 *            the InputStream to delegate to
 * @param include
 *            true to include the specified BOMs or false to exclude them
 * @param boms
 *            The BOMs to detect and optionally exclude
 */
public BOMInputStream(InputStream delegate, boolean include, ByteOrderMark... boms) {
    super(delegate);
    if (boms == null || boms.length == 0) {
        throw new IllegalArgumentException("No BOMs specified");
    }
    this.include = include;
    // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
    Arrays.sort(boms, ByteOrderMarkLengthComparator);
    this.boms = Arrays.asList(boms);

}

Source File: BOMInputStream.java From aion-germany with GNU General Public License v3.0

5 votes

/**
 * Returns the Byte Order Mark found at the start of the stream. The leading bytes are
 * read and matched on the first call only; later calls return the stored result.
 * 
 * @return the detected BOM, or {@code null} if none matched
 * @throws IOException
 *             if reading the first bytes of the stream fails
 */
public ByteOrderMark getBOM() throws IOException {
    if (firstBytes != null) {
        // Detection already ran
        return byteOrderMark;
    }

    fbLength = 0;
    // boms is sorted longest first, so element 0 gives the number of bytes to buffer
    final int maxBomSize = boms.get(0).length();
    firstBytes = new int[maxBomSize];
    for (int pos = 0; pos < firstBytes.length; pos++) {
        final int next = in.read();
        firstBytes[pos] = next;
        // a negative (end-of-stream) value is stored and counted as well
        fbLength++;
        if (next < 0) {
            break;
        }
    }

    byteOrderMark = find();
    if (byteOrderMark != null && !include) {
        if (byteOrderMark.length() < firstBytes.length) {
            // skip the BOM but keep the extra buffered bytes available
            fbIndex = byteOrderMark.length();
        } else {
            // the buffer holds nothing but the BOM
            fbLength = 0;
        }
    }
    return byteOrderMark;
}

org.apache.commons.io.ByteOrderMark Java Examples