org.apache.commons.io.input.BOMInputStream#hasBOM

Source File: URLRespectsRobots.java From BUbiNG with Apache License 2.0

6 votes

/** Parses a <code>robots.txt</code> file contained in a {@link URIResponse} and
 * returns the corresponding filter as an array of sorted prefixes. HTTP statuses
 * different from 2xx are {@linkplain Logger#info(String) logged}. HTTP statuses of class 4xx and 5xx
 * generate an empty filter. HTTP statuses 2xx/3xx cause the tentative parsing of the
 * response content. In the remaining cases we return {@code null}.
 *
 * <p>A leading UTF-8 byte-order mark, if present, is dropped before parsing; the content
 * is always decoded as UTF-8.
 *
 * @param robotsResponse the response containing <code>robots.txt</code>.
 * @param userAgent the string representing the user agent of interest.
 * @return an array of character arrays, which are prefixes of the URLs not to follow, in sorted order,
 * or {@code null}
 * @throws IOException if obtaining or reading the response content fails.
 */
public static char[][] parseRobotsResponse(final URIResponse robotsResponse, final String userAgent) throws IOException {
	final int status = robotsResponse.response().getStatusLine().getStatusCode();
	final int statusClass = status / 100;
	if (statusClass != 2) LOGGER.info("Got status " + status + " while fetching robots: URL was " + robotsResponse.uri());
	if (statusClass == 4 || statusClass == 5) return EMPTY_ROBOTS_FILTER; // For status 4xx and 5xx, we consider everything allowed.
	if (statusClass != 2 && statusClass != 3) return null; // For status 2xx and 3xx we parse the content. For the rest, we consider everything forbidden.
	// Let the stream itself exclude a leading UTF-8 BOM (include == false) instead of
	// including it and skipping it by hand, which ignored the value returned by skip().
	final BOMInputStream bomInputStream = new BOMInputStream(robotsResponse.response().getEntity().getContent(), false);
	// Parse robots (BOM is ignored, robots are UTF-8, as suggested by https://developers.google.com/search/reference/robots_txt
	final char[][] result = parseRobotsReader(new InputStreamReader(bomInputStream, Charsets.UTF_8), userAgent);
	if (LOGGER.isDebugEnabled()) LOGGER.debug("Robots for {} successfully got with status {}: {}", robotsResponse.uri(), Integer.valueOf(status), toString(result));
	return result;
}

Source File: CommandHelper.java From mojito with Apache License 2.0

6 votes

/**
 * Get content from {@link java.nio.file.Path}. The content is decoded with the charset
 * of the byte-order mark found at the start of the file (among those configured in
 * {@code boms}), or with UTF-8 when no such BOM is present. The BOM itself is not part
 * of the returned string.
 *
 * @param path path of the file to read
 * @return the decoded file content
 * @throws UncheckedIOException if the file cannot be opened, read or closed
 */
public String getFileContent(Path path) {
    // try-with-resources closes the BOM stream (and the file stream it wraps) on every
    // exit path; the stream was previously left open.
    try (BOMInputStream inputStream = new BOMInputStream(FileUtils.openInputStream(path.toFile()), false, boms)) {
        if (inputStream.hasBOM()) {
            return IOUtils.toString(inputStream, inputStream.getBOMCharsetName());
        }
        return IOUtils.toString(inputStream, StandardCharsets.UTF_8);
    } catch (IOException e) {
        throw new UncheckedIOException("Cannot get file content for path: " + path.toString(), e);
    }
}

Source File: CommandHelper.java From mojito with Apache License 2.0

5 votes

/**
 * Writes the content into a file using same format as source file: when the source
 * file starts with one of the byte-order marks configured in {@code boms}, that BOM is
 * written first and the content is encoded with the BOM's charset; otherwise the
 * content is written as UTF-8 without a BOM.
 *
 * @param content         content to be written
 * @param path            path to the file
 * @param sourceFileMatch source file whose BOM/encoding is replicated in the output
 * @throws CommandException if the source file cannot be inspected or the output cannot be written
 */
public void writeFileContent(String content, Path path, FileMatch sourceFileMatch) throws CommandException {
    File outputFile = path.toFile();
    File sourceFile = sourceFileMatch.getPath().toFile();
    // The source file is only opened to sniff its BOM; try-with-resources makes sure the
    // handle is released (it was previously never closed).
    try (BOMInputStream inputStream = new BOMInputStream(FileUtils.openInputStream(sourceFile), false, boms)) {
        if (inputStream.hasBOM()) {
            // First call truncates/creates the file with the BOM, second call appends the payload.
            FileUtils.writeByteArrayToFile(outputFile, inputStream.getBOM().getBytes());
            FileUtils.writeByteArrayToFile(outputFile, content.getBytes(inputStream.getBOMCharsetName()), true);
        } else {
            FileUtils.writeStringToFile(outputFile, content, StandardCharsets.UTF_8);
        }
    } catch (IOException e) {
        throw new CommandException("Cannot write file content in path: " + path.toString(), e);
    }
}

Source File: XMLUtils.java From modernmt with Apache License 2.0

5 votes

/**
 * Creates an {@link XMLEventReader} over the given stream. The charset is taken from a
 * leading UTF-8 / UTF-16BE / UTF-16LE byte-order mark when one is present, and from
 * {@code UTF8Charset.get()} otherwise. The BOM is excluded from the data handed to the reader.
 *
 * @param stream the raw XML input
 * @return an event reader over the decoded content
 * @throws XMLStreamException if BOM detection fails with an I/O error or the reader cannot be created
 */
public static XMLEventReader createEventReader(InputStream stream) throws XMLStreamException {
    final Charset fallback = UTF8Charset.get();
    final BOMInputStream input = new BOMInputStream(
            stream, false, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE);

    Charset encoding;
    try {
        encoding = input.hasBOM() ? Charset.forName(input.getBOMCharsetName()) : fallback;
    } catch (IOException e) {
        // Surface I/O problems during BOM sniffing through the method's declared exception type.
        throw new XMLStreamException(e);
    }

    final XMLFixInputStreamReader decoded = new XMLFixInputStreamReader(input, encoding);
    return XMLInputFactory.newInstance().createXMLEventReader(decoded);
}

Source File: ReferenceCCDAValidationService.java From reference-ccda-validator with BSD 2-Clause "Simplified" License

4 votes

/**
 * Reads the uploaded C-CDA file (as UTF-8, with any BOM removed by {@link BOMInputStream})
 * and runs the validators over its content, collecting their results in one list.
 * MDHT validation always runs. Vocabulary validation runs only when the MDHT results
 * contain no schema error and the objective allows it; Content validation runs only
 * when Vocabulary validation ran and the objective allows it.
 *
 * @param validationObjective objective passed to every validator and used to decide which validators run
 * @param referenceFileName   reference file name passed to every validator
 * @param ccdaFile            uploaded file to validate
 * @param curesUpdate         flag forwarded to Content validation
 * @param vocabularyConfig    vocabulary configuration; {@code null} or empty falls back to
 *                            {@code VocabularyConstants.Config.DEFAULT}
 * @param severityLevel       severity level forwarded to every validator
 * @return the combined validator results, possibly empty
 * @throws RuntimeException if the file content cannot be read
 */
private List<RefCCDAValidationResult> runValidators(String validationObjective, String referenceFileName,
		MultipartFile ccdaFile, boolean curesUpdate, String vocabularyConfig, SeverityLevel severityLevel)
		throws SAXException, Exception {
	List<RefCCDAValidationResult> validatorResults = new ArrayList<>();
	InputStream ccdaFileInputStream = null;
	try {
		ccdaFileInputStream = ccdaFile.getInputStream();
		BOMInputStream bomInputStream = new BOMInputStream(ccdaFileInputStream);
		if (bomInputStream.hasBOM()) {
			logger.warn(
					"The C-CDA file has a BOM which is supposed to be removed by BOMInputStream - encoding w/o BOM: "
							+ bomInputStream.getBOMCharsetName());
		}
		String ccdaFileContents = IOUtils.toString(bomInputStream, "UTF-8");

		List<RefCCDAValidationResult> mdhtResults = doMDHTValidation(validationObjective, referenceFileName,
				ccdaFileContents, severityLevel);
		if (mdhtResults != null && !mdhtResults.isEmpty()) {
			logger.info("Adding MDHT results");
			validatorResults.addAll(mdhtResults);
		}

		boolean isSchemaErrorInMdhtResults = mdhtResultsHaveSchemaError(mdhtResults);
		boolean isObjectiveAllowingVocabularyValidation = objectiveAllowsVocabularyValidation(validationObjective);
		if (!isSchemaErrorInMdhtResults && isObjectiveAllowingVocabularyValidation) {
			if (vocabularyConfig == null || vocabularyConfig.isEmpty()) {
				// The ternary must be parenthesized: '+' binds tighter than '!=', so without
				// the parentheses the whole concatenation was compared to null and only the
				// bare (null or empty) vocabularyConfig was logged.
				logger.warn("Invalid vocabularyConfig of '" + (vocabularyConfig != null ? vocabularyConfig : "null")
						+ "' " + "received. Assigned default config of '"
						+ VocabularyConstants.Config.DEFAULT + "'.");
				vocabularyConfig = VocabularyConstants.Config.DEFAULT;
			}
			List<RefCCDAValidationResult> vocabResults = doVocabularyValidation(validationObjective,
					referenceFileName, ccdaFileContents, vocabularyConfig, severityLevel);
			if (vocabResults != null && !vocabResults.isEmpty()) {
				logger.info("Adding Vocabulary results");
				validatorResults.addAll(vocabResults);
			}
			if (objectiveAllowsContentValidation(validationObjective)) {
				List<RefCCDAValidationResult> contentResults = doContentValidation(validationObjective,
						referenceFileName, ccdaFileContents, curesUpdate, severityLevel);
				if (contentResults != null && !contentResults.isEmpty()) {
					logger.info("Adding Content results");
					validatorResults.addAll(contentResults);
				}
			} else {
				logger.info("Skipping Content validation due to: " + "validationObjective ("
						+ (validationObjective != null ? validationObjective : "null objective")
						+ ") is not relevant or valid for Content validation");
			}
		} else {
			// " and " joins the two reasons only when both apply.
			String separator = !isObjectiveAllowingVocabularyValidation && isSchemaErrorInMdhtResults ? " and "
					: "";
			logger.info("Skipping Vocabulary (and thus Content) validation due to: "
					+ (isObjectiveAllowingVocabularyValidation ? ""
							: "validationObjective POSTed: "
									+ (validationObjective != null ? validationObjective : "null objective")
									+ separator)
					+ (isSchemaErrorInMdhtResults ? "C-CDA Schema error(s) found" : ""));
		}
	} catch (IOException e) {
		throw new RuntimeException("Error getting CCDA contents from provided file", e);
	} finally {
		closeFileInputStream(ccdaFileInputStream);
	}
	return validatorResults;
}

Source File: RpcInputStream.java From p4ic4idea with Apache License 2.0

4 votes

public RpcInputStream(RpcPerforceFile file, Charset fromCharset) throws IOException, FileEncoderException {
	super(file);
	if (file == null) {
		throw new NullPointerError(
				"Null RpcPerforceFile passed to RpcInputStream constructor");
	}

	this.file = file;
	this.fileType = this.file.getFileType();
	this.lineEnding = this.file.getLineEnding();

	if (this.lineEnding == null) {
		this.lineEnding = ClientLineEnding.FST_L_LOCAL;
	}

	if (this.fileType == null) {
		this.fileType = RpcPerforceFileType.FST_TEXT;
	}

	if (isTextType(this.fileType)) {

		if (this.fileType == RpcPerforceFileType.FST_TEXT || this.fileType == RpcPerforceFileType.FST_XTEXT) {
			this.lineEndStream = new BufferedInputStream(new FileInputStream(file));
		} else {
			BOMInputStream bis = new BOMInputStream(new FileInputStream(file), UTF_8, UTF_16LE, UTF_16BE);
			if (fromCharset == CharsetDefs.UTF16) {
				fromCharset = bis.hasBOM() ? Charset.forName(bis.getBOMCharsetName())
						: ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN) ?
						Charset.forName("UTF-16BE") : Charset.forName("UTF-16LE");
			}
			this.lineEndStream = new BufferedInputStream(bis);
		}

		boolean doLineCvt = ClientLineEnding.needsLineEndFiltering(this.lineEnding);
		if (fromCharset != null && fromCharset != CharsetDefs.UTF8) {
			this.lineEndStream = new CharsetConverterStream(lineEndStream, fromCharset, doLineCvt);
		}

		if (doLineCvt) {
			this.lineEndStream = new RpcLineEndFilterInputStream(
					new BufferedInputStream(lineEndStream), this.lineEnding);
		}
	}
}

Source File: RpcInputStream.java From p4ic4idea with Apache License 2.0

4 votes

public RpcInputStream(RpcPerforceFile file, Charset fromCharset) throws IOException, FileEncoderException {
	super(file);
	if (file == null) {
		throw new NullPointerError(
				"Null RpcPerforceFile passed to RpcInputStream constructor");
	}

	this.file = file;
	this.fileType = this.file.getFileType();
	this.lineEnding = this.file.getLineEnding();

	if (this.lineEnding == null) {
		this.lineEnding = ClientLineEnding.FST_L_LOCAL;
	}

	if (this.fileType == null) {
		this.fileType = RpcPerforceFileType.FST_TEXT;
	}

	if (isTextType(this.fileType)) {

		if (this.fileType == RpcPerforceFileType.FST_TEXT || this.fileType == RpcPerforceFileType.FST_XTEXT) {
			this.lineEndStream = new BufferedInputStream(new FileInputStream(file));
		} else {
			BOMInputStream bis = new BOMInputStream(new FileInputStream(file), UTF_8, UTF_16LE, UTF_16BE);
			if (fromCharset == CharsetDefs.UTF16) {
				fromCharset = bis.hasBOM() ? Charset.forName(bis.getBOMCharsetName())
						: ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN) ?
						Charset.forName("UTF-16BE") : Charset.forName("UTF-16LE");
			}
			this.lineEndStream = new BufferedInputStream(bis);
		}

		boolean doLineCvt = ClientLineEnding.needsLineEndFiltering(this.lineEnding);
		if (fromCharset != null && fromCharset != CharsetDefs.UTF8) {
			this.lineEndStream = new CharsetConverterStream(lineEndStream, fromCharset, doLineCvt);
		}

		if (doLineCvt) {
			this.lineEndStream = new RpcLineEndFilterInputStream(
					new BufferedInputStream(lineEndStream), this.lineEnding);
		}
	}
}

Source File: RpcInputStream.java From p4ic4idea with Apache License 2.0

4 votes

public RpcInputStream(RpcPerforceFile file, Charset fromCharset) throws IOException, FileEncoderException {
	super(file);
	if (file == null) {
		throw new NullPointerError(
				"Null RpcPerforceFile passed to RpcInputStream constructor");
	}

	this.file = file;
	this.fileType = this.file.getFileType();
	this.lineEnding = this.file.getLineEnding();

	if (this.lineEnding == null) {
		this.lineEnding = ClientLineEnding.FST_L_LOCAL;
	}

	if (this.fileType == null) {
		this.fileType = RpcPerforceFileType.FST_TEXT;
	}

	if (isTextType(this.fileType)) {

		if (this.fileType == RpcPerforceFileType.FST_TEXT || this.fileType == RpcPerforceFileType.FST_XTEXT) {
			this.lineEndStream = new BufferedInputStream(new FileInputStream(file));
		} else {
			BOMInputStream bis = new BOMInputStream(new FileInputStream(file), UTF_8, UTF_16LE, UTF_16BE);
			if (fromCharset == CharsetDefs.UTF16) {
				fromCharset = bis.hasBOM() ? Charset.forName(bis.getBOMCharsetName())
						: ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN) ?
						Charset.forName("UTF-16BE") : Charset.forName("UTF-16LE");
			}
			this.lineEndStream = new BufferedInputStream(bis);
		}

		boolean doLineCvt = ClientLineEnding.needsLineEndFiltering(this.lineEnding);
		if (fromCharset != null && fromCharset != CharsetDefs.UTF8) {
			this.lineEndStream = new CharsetConverterStream(lineEndStream, fromCharset, doLineCvt);
		}

		if (doLineCvt) {
			this.lineEndStream = new RpcLineEndFilterInputStream(
					new BufferedInputStream(lineEndStream), this.lineEnding);
		}
	}
}

Source File: RpcInputStream.java From p4ic4idea with Apache License 2.0

4 votes

public RpcInputStream(RpcPerforceFile file, Charset fromCharset) throws IOException, FileEncoderException {
	super(file);
	if (file == null) {
		throw new NullPointerError(
				"Null RpcPerforceFile passed to RpcInputStream constructor");
	}

	this.file = file;
	this.fileType = this.file.getFileType();
	this.lineEnding = this.file.getLineEnding();

	if (this.lineEnding == null) {
		this.lineEnding = ClientLineEnding.FST_L_LOCAL;
	}

	if (this.fileType == null) {
		this.fileType = RpcPerforceFileType.FST_TEXT;
	}

	if (isTextType(this.fileType)) {

		if (this.fileType == RpcPerforceFileType.FST_TEXT || this.fileType == RpcPerforceFileType.FST_XTEXT) {
			this.lineEndStream = new BufferedInputStream(new FileInputStream(file));
		} else {
			BOMInputStream bis = new BOMInputStream(new FileInputStream(file), UTF_8, UTF_16LE, UTF_16BE);
			if (fromCharset == CharsetDefs.UTF16) {
				fromCharset = bis.hasBOM() ? Charset.forName(bis.getBOMCharsetName())
						: ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN) ?
						Charset.forName("UTF-16BE") : Charset.forName("UTF-16LE");
			}
			this.lineEndStream = new BufferedInputStream(bis);
		}

		boolean doLineCvt = ClientLineEnding.needsLineEndFiltering(this.lineEnding);
		if (fromCharset != null && fromCharset != CharsetDefs.UTF8) {
			this.lineEndStream = new CharsetConverterStream(lineEndStream, fromCharset, doLineCvt);
		}

		if (doLineCvt) {
			this.lineEndStream = new RpcLineEndFilterInputStream(
					new BufferedInputStream(lineEndStream), this.lineEnding);
		}
	}
}

Java Code Examples for org.apache.commons.io.input.BOMInputStream#hasBOM()