org.apache.tika.metadata.TikaMetadataKeys Java Examples
The following examples show how to use
org.apache.tika.metadata.TikaMetadataKeys.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ExecUtil.java From ctsms with GNU Lesser General Public License v2.1 | 6 votes |
/**
 * Detects the MIME type of an in-memory payload.
 *
 * <p>The supplied file name is stored in the metadata under
 * {@code TikaMetadataKeys.RESOURCE_NAME_KEY} before the bytes are handed to
 * the {@code detector} declared elsewhere in this class.
 *
 * @param data     the raw bytes to inspect
 * @param fileName the name recorded as the resource name in the metadata
 * @return the string form of the detected media type
 * @throws Throwable whatever stream creation or detection throws; failures
 *                   while closing the stream are ignored
 */
public static String getMimeType(byte[] data, String fileName) throws Throwable {
    Metadata metadata = new Metadata();
    metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);

    TikaInputStream tikaStream = null;
    try {
        tikaStream = TikaInputStream.get(data, metadata);
        MediaType detected = detector.detect(tikaStream, metadata);
        return detected.toString();
    } finally {
        if (tikaStream != null) {
            try {
                tikaStream.close();
            } catch (IOException ignored) {
                // Best-effort close: a failure here must not mask the result
                // or the original exception.
            }
        }
    }
}
Example #2
Source File: ExecUtil.java From ctsms with GNU Lesser General Public License v2.1 | 6 votes |
/**
 * Detects the MIME type of a file.
 *
 * <p>The file's simple name is stored in the metadata under
 * {@code TikaMetadataKeys.RESOURCE_NAME_KEY} before the content is handed to
 * the {@code detector} declared elsewhere in this class.
 *
 * @param file the file to inspect
 * @return the string form of the detected media type
 * @throws Throwable whatever opening the file or detection throws; failures
 *                   while closing the stream are ignored
 */
public static String getMimeType(File file) throws Throwable {
    Metadata metadata = new Metadata();
    metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, file.getName());

    TikaInputStream tikaStream = null;
    try {
        tikaStream = TikaInputStream.get(file, metadata);
        MediaType detected = detector.detect(tikaStream, metadata);
        return detected.toString();
    } finally {
        if (tikaStream != null) {
            try {
                tikaStream.close();
            } catch (IOException ignored) {
                // Best-effort close: a failure here must not mask the result
                // or the original exception.
            }
        }
    }
}
Example #3
Source File: NodeTika.java From node-tika with MIT License | 6 votes |
public static String detectContentType(String uri) throws FileNotFoundException, IOException, TikaException { final Detector detector = config.getDetector(); final TikaInputStream inputStream = createInputStream(uri); final Metadata metadata = new Metadata(); // Set the file name. This provides some level of type-hinting. metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName()); // Detect the content type. String contentType = detector.detect(inputStream, metadata).toString(); inputStream.close(); // Return the default content-type if undetermined. if (contentType == null || contentType.isEmpty()) { return MediaType.OCTET_STREAM.toString(); } return contentType; }
Example #4
Source File: AbstractFessFileTransformer.java From fess with Apache License 2.0 | 6 votes |
/**
 * Builds the parameter map handed to the extractor for a crawled response.
 *
 * <p>The result starts as a copy of the crawling config's
 * {@code ConfigName.CONFIG} parameters and is then populated with the
 * resource name, content type, content encoding and URL taken from
 * {@code responseData}. When the config defines a non-blank
 * {@code Config.KEEP_ORIGINAL_BODY}, {@code TikaExtractor.NORMALIZE_TEXT} is
 * set to its logical inverse.
 *
 * <p>The config map is looked up once and a {@code null} map is treated as
 * empty, so the null check guarding the keep-original-body handling is
 * actually reachable.
 *
 * @param responseData   the crawled response supplying name, MIME type, charset and URL
 * @param crawlingConfig the crawling configuration supplying config parameters
 * @return a new mutable map of extraction parameters
 */
protected Map<String, String> createExtractParams(final ResponseData responseData, final CrawlingConfig crawlingConfig) {
    final Map<String, String> configParam = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);

    final Map<String, String> params = new HashMap<>();
    if (configParam != null) {
        params.putAll(configParam);
    }

    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    params.put(HttpHeaders.CONTENT_TYPE, responseData.getMimeType());
    params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
    params.put(ExtractData.URL, responseData.getUrl());

    if (configParam != null) {
        final String keepOriginalBody = configParam.get(Config.KEEP_ORIGINAL_BODY);
        if (StringUtil.isNotBlank(keepOriginalBody)) {
            // Keeping the original body means text normalization must be off, and vice versa.
            params.put(TikaExtractor.NORMALIZE_TEXT,
                    Constants.TRUE.equalsIgnoreCase(keepOriginalBody) ? Constants.FALSE : Constants.TRUE);
        }
    }
    return params;
}
Example #5
Source File: RegexRulesPasswordProvider.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Resolves the password for a document.
 *
 * <p>An explicit password, when set, is returned first. Otherwise, if the
 * password map has entries, the resource name stored in {@code meta} is
 * passed to {@code lookupPasswordFromMap}.
 *
 * @param meta metadata of the document; its resource name is used for the lookup
 * @return the explicit password, the result of the map lookup, or
 *         {@code null} when there is no explicit password and the map is empty
 */
@Override
public String getPassword(Metadata meta) {
    if (getExplicitPassword() != null) {
        return getExplicitPassword();
    }

    if (passwordMap.size() > 0) {
        final String resourceName = meta.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
        return lookupPasswordFromMap(resourceName);
    }

    return null;
}
Example #6
Source File: NodeTika.java From node-tika with MIT License | 5 votes |
private static void fillMetadata(Metadata metadata, String contentType, String uri) { // Set the file name. if (uri != null) { metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName()); } // Normalise the content-type. contentType = normalizeContentType(contentType); // Set the content-type. if (contentType != null) { metadata.add(HttpHeaders.CONTENT_TYPE, contentType); } }
Example #7
Source File: NodeTika.java From node-tika with MIT License | 5 votes |
public static String detectContentTypeAndCharset(String uri) throws FileNotFoundException, IOException, TikaException { final Detector detector = config.getDetector(); final TikaInputStream inputStream = createInputStream(uri); final Metadata metadata = new Metadata(); // Set the file name. This provides some level of type-hinting. metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName()); // Detect the content type. String contentType = detector.detect(inputStream, metadata).toString(); // Use metadata to provide type-hinting to the AutoDetectReader. fillMetadata(metadata, contentType, uri); // Detect the character set. final AutoDetectReader reader = new AutoDetectReader(inputStream, metadata); String charset = reader.getCharset().toString(); inputStream.close(); // Return the default content-type if undetermined. if (contentType == null || contentType.isEmpty()) { return MediaType.OCTET_STREAM.toString(); } // Append the charset if the content-type was determined. if (charset != null && !charset.isEmpty()) { return contentType + "; charset=" + charset; } return contentType; }
Example #8
Source File: IdentifyMimeType.java From localization_nifi with Apache License 2.0 | 4 votes |
@Override public void onTrigger(final ProcessContext context, final ProcessSession session) { FlowFile flowFile = session.get(); if (flowFile == null) { return; } final ComponentLog logger = getLogger(); final AtomicReference<String> mimeTypeRef = new AtomicReference<>(null); final String filename = flowFile.getAttribute(CoreAttributes.FILENAME.key()); session.read(flowFile, new InputStreamCallback() { @Override public void process(final InputStream stream) throws IOException { try (final InputStream in = new BufferedInputStream(stream)) { TikaInputStream tikaStream = TikaInputStream.get(in); Metadata metadata = new Metadata(); // Add filename if it exists if (filename != null) { metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename); } // Get mime type MediaType mediatype = detector.detect(tikaStream, metadata); mimeTypeRef.set(mediatype.toString()); } } }); String mimeType = mimeTypeRef.get(); String extension = ""; try { MimeType mimetype; mimetype = config.getMimeRepository().forName(mimeType); extension = mimetype.getExtension(); } catch (MimeTypeException ex) { logger.warn("MIME type extension lookup failed: {}", new Object[]{ex}); } // Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563 if (mimeType != null && mimeType.equals("application/gzip") && extension.equals(".tgz")) { extension = ".gz"; } if (mimeType == null) { flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream"); flowFile = session.putAttribute(flowFile, "mime.extension", ""); logger.info("Unable to identify MIME Type for {}; setting to application/octet-stream", new Object[]{flowFile}); } else { flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeType); flowFile = session.putAttribute(flowFile, "mime.extension", extension); logger.info("Identified {} as having MIME Type {}", new Object[]{flowFile, mimeType}); } session.getProvenanceReporter().modifyAttributes(flowFile); 
session.transfer(flowFile, REL_SUCCESS); }
Example #9
Source File: IdentifyMimeType.java From nifi with Apache License 2.0 | 4 votes |
@Override public void onTrigger(final ProcessContext context, final ProcessSession session) { FlowFile flowFile = session.get(); if (flowFile == null) { return; } final ComponentLog logger = getLogger(); final AtomicReference<String> mimeTypeRef = new AtomicReference<>(null); final String filename = flowFile.getAttribute(CoreAttributes.FILENAME.key()); session.read(flowFile, new InputStreamCallback() { @Override public void process(final InputStream stream) throws IOException { try (final InputStream in = new BufferedInputStream(stream)) { TikaInputStream tikaStream = TikaInputStream.get(in); Metadata metadata = new Metadata(); if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) { metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename); } // Get mime type MediaType mediatype = detector.detect(tikaStream, metadata); mimeTypeRef.set(mediatype.toString()); } } }); String mimeType = mimeTypeRef.get(); String extension = ""; try { MimeType mimetype; mimetype = mimeTypes.forName(mimeType); extension = mimetype.getExtension(); } catch (MimeTypeException ex) { logger.warn("MIME type extension lookup failed: {}", new Object[]{ex}); } // Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563 if (mimeType != null && mimeType.equals("application/gzip") && extension.equals(".tgz")) { extension = ".gz"; } if (mimeType == null) { flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream"); flowFile = session.putAttribute(flowFile, "mime.extension", ""); logger.info("Unable to identify MIME Type for {}; setting to application/octet-stream", new Object[]{flowFile}); } else { flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeType); flowFile = session.putAttribute(flowFile, "mime.extension", extension); logger.info("Identified {} as having MIME Type {}", new Object[]{flowFile, mimeType}); } session.getProvenanceReporter().modifyAttributes(flowFile); 
session.transfer(flowFile, REL_SUCCESS); }