org.apache.tika.config.TikaConfig Java Examples
The following examples show how to use
org.apache.tika.config.TikaConfig.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DirectoryManifest.java From genie with Apache License 2.0 | 6 votes |
ManifestVisitor( final Path root, final ImmutableMap.Builder<String, ManifestEntry> builder, final boolean checksumFiles, final Filter filter ) throws IOException { this.root = root; this.builder = builder; this.checksumFiles = checksumFiles; this.filter = filter; this.metadata = new Metadata(); try { this.tikaConfig = new TikaConfig(); } catch (final TikaException te) { log.error("Unable to create Tika Configuration due to error", te); throw new IOException(te); } }
Example #2
Source File: ExtractingDocumentLoader.java From lucene-solr with Apache License 2.0 | 6 votes |
public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor, TikaConfig config, ParseContextConfig parseContextConfig, SolrContentHandlerFactory factory) { this.params = req.getParams(); this.core = req.getCore(); this.config = config; this.parseContextConfig = parseContextConfig; this.processor = processor; templateAdd = new AddUpdateCommand(req); templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true); templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1); //this is lightweight autoDetectParser = new AutoDetectParser(config); this.factory = factory; ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false); }
Example #3
Source File: TikaDocumentItemProcessor.java From CogStack-Pipeline with Apache License 2.0 | 6 votes |
@PostConstruct public void init() throws IOException, SAXException, TikaException{ setFieldName(tikaFieldName); // load tika configuration tikaConfig = new TikaConfig(this.getClass().getClassLoader() .getResourceAsStream("tika-config.xml")); // load tesseract ocr configuration tesseractConfig = new TesseractOCRConfig(); if (tesseractTimeout > 0) { tesseractConfig.setTimeout(tesseractTimeout); } // load image magick configuration -- used for tiff conversion imgConfig = new ImageMagickConfig(); if (convertTimeout > 0) { imgConfig.setTimeout(convertTimeout); } parser = new AutoDetectParser(tikaConfig); }
Example #4
Source File: TikaAutoMetadataExtracterTest.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
@Override public void setUp() throws Exception { super.setUp(); TikaConfig config = (TikaConfig)ctx.getBean("tikaConfig"); extracter = new TikaAutoMetadataExtracter(config); extracter.setDictionaryService(dictionaryService); extracter.register(); // Attach some extra mappings, using the Tika // metadata keys namespace // These will be tested later HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>( extracter.getMapping() ); Set<QName> tlaSet = new HashSet<QName>(); tlaSet.add(TIKA_MIMETYPE_TEST_PROPERTY); newMap.put( Metadata.CONTENT_TYPE, tlaSet ); extracter.setMapping(newMap); }
Example #5
Source File: TikaAutoMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig) { config = tikaConfig; parser = new AutoDetectParser(config); SUPPORTED_MIMETYPES = new ArrayList<String>(); for(MediaType mt : parser.getParsers().keySet()) { // Add the canonical mime type SUPPORTED_MIMETYPES.add( mt.toString() ); // And add any aliases of the mime type too - Alfresco uses some // non canonical forms of various mimetypes, so we need all of them for(MediaType alias : config.getMediaTypeRegistry().getAliases(mt)) { SUPPORTED_MIMETYPES.add( alias.toString() ); } } return SUPPORTED_MIMETYPES; }
Example #6
Source File: TikaIO.java From beam with Apache License 2.0 | 5 votes |
/**
 * Loads the Tika configuration when a configuration path has been specified.
 * The configuration file is opened through {@code FileSystems}, and the stream
 * (and with it the underlying channel) is closed as soon as the configuration
 * has been parsed.
 *
 * @throws Exception if the path cannot be matched or opened, or the configuration
 *                   cannot be parsed
 */
@Setup
public void setup() throws Exception {
  if (spec.getTikaConfigPath() != null) {
    ResourceId configResource =
        FileSystems.matchSingleFileSpec(spec.getTikaConfigPath().get()).resourceId();
    // Closing the stream also closes the channel it wraps.
    try (java.io.InputStream configStream =
             Channels.newInputStream(FileSystems.open(configResource))) {
      tikaConfig = new TikaConfig(configStream);
    }
  }
}
Example #7
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
/**
 * Loads {@code tika-config.xml} from the classpath into {@code config} before each test.
 * The stream is closed once the configuration has been read. Failures are logged
 * rather than rethrown, in which case {@code config} is left unassigned by this method.
 */
@Before
public void initConfig() {
    try (InputStream is = getClass().getClassLoader().getResourceAsStream("tika-config.xml")) {
        config = new TikaConfig(is);
    } catch (TikaException | IOException | SAXException ex) {
        Logger.getLogger(PDFPreprocessorParserTest.class.getName())
              .log(Level.SEVERE, "Unable to load tika-config.xml from the classpath", ex);
    }
}
Example #8
Source File: DefaultMimeSupport.java From nexus-public with Eclipse Public License 1.0 | 5 votes |
@VisibleForTesting public DefaultMimeSupport(final NexusMimeTypes nexusMimeTypes) { this.tikaConfig = TikaConfig.getDefaultConfig(); this.detector = tikaConfig.getDetector(); // create the cache extensionToMimeTypeCache = CacheBuilder.newBuilder().maximumSize(500).build(new CacheLoader<String, List<String>>() { @Override public List<String> load(final String key) throws Exception { final List<String> detected = Lists.newArrayList(); final MimeRule mimeType = nexusMimeTypes.getMimeRuleForExtension(key); if (mimeType != null) { // add Nexus matches first detected.addAll(mimeType.getMimetypes()); if (mimeType.isOverride()) { return detected; } } // ask Tika too final Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, "dummy." + key); MediaType mediaType = detector.detect(null, metadata); // unravel to least specific unravel(detected, mediaType); return detected; } }); }
Example #9
Source File: UploadAuditSetUpFormValidator.java From Asqatasun with GNU Affero General Public License v3.0 | 5 votes |
/** * Control whether the uploaded files are of HTML type and whether their * size is under the maxFileSize limit. * * @param uploadAuditSetUpCommand * @param errors */ private void validateFiles(AuditSetUpCommand uploadAuditSetUpCommand, Errors errors) { boolean emptyFile = true; Metadata metadata = new Metadata(); MimeTypes mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository(); String mime; for (int i=0;i<uploadAuditSetUpCommand.getFileInputList().length;i++ ) { try { CommonsMultipartFile cmf = uploadAuditSetUpCommand.getFileInputList()[i]; if (cmf.getSize() > maxFileSize) { Long maxFileSizeInMega = maxFileSize / 1000000; String[] arg = {maxFileSizeInMega.toString()}; errors.rejectValue(ID_INPUT_FILE_PREFIX + "[" + i + "]", FILE_SIZE_EXCEEDED_MSG_BUNDLE_KEY, arg, "{0}"); } if (cmf.getSize() > 0) { emptyFile = false; mime = mimeTypes.detect(new BufferedInputStream(cmf.getInputStream()), metadata).toString(); LOGGER.debug("mime " + mime + " " +cmf.getOriginalFilename()); if (!authorizedMimeType.contains(mime)) { errors.rejectValue(ID_INPUT_FILE_PREFIX + "[" + i + "]", NOT_HTML_MSG_BUNDLE_KEY); } } } catch (IOException ex) { LOGGER.warn(ex); errors.rejectValue(ID_INPUT_FILE_PREFIX + "[" + i + "]", NOT_HTML_MSG_BUNDLE_KEY); } } if(emptyFile) { // if no file is uploaded LOGGER.debug("emptyFiles"); errors.rejectValue(GENERAL_ERROR_MSG_KEY, NO_FILE_UPLOADED_MSG_BUNDLE_KEY); } }
Example #10
Source File: TikaAutoContentTransformerTest.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Creates the transformer from the {@code tikaConfig} bean, wires its services,
 * and completes its initialisation.
 */
@Override
public void setUp() throws Exception {
    super.setUp();

    transformer = new TikaAutoContentTransformer((TikaConfig) ctx.getBean("tikaConfig"));

    transformer.setMimetypeService(mimetypeService);
    transformer.setTransformerDebug(transformerDebug);
    transformer.setTransformerConfig(transformerConfig);

    transformer.afterPropertiesSet();
}
Example #11
Source File: ArchiveContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
@Override protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options) { ParseContext context = super.buildParseContext(metadata, targetMimeType, options); boolean recurse = includeContents; if(options.getIncludeEmbedded() != null) { recurse = options.getIncludeEmbedded(); } if(recurse) { // Use an auto detect parser to handle the contents if(tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); } context.set(Parser.class, new AutoDetectParser(tikaConfig)); } else { // REPO-1066: an AutoDetectParser is the default in Tika after: https://issues.apache.org/jira/browse/TIKA-2096 // so we need to specify an empty one if we don't want the recurse parsing to happen context.set(Parser.class, new EmptyParser()); } return context; }
Example #12
Source File: TikaPoweredContainerExtractor.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
/** * Injects the TikaConfig to use * * @param tikaConfig The Tika Config to use */ public void setTikaConfig(TikaConfig tikaConfig) { this.config = tikaConfig; // Setup the detector and parser detector = new DefaultDetector(config.getMimeRepository()); parser = new AutoDetectParser(detector); }
Example #13
Source File: AddScenarioFormValidator.java From Asqatasun with GNU Affero General Public License v3.0 | 4 votes |
/** * * @param addScenarioCommand * @param errors * @return whether the scenario handled by the current AddScenarioCommand * has a correct type and size */ public boolean checkScenarioFileTypeAndSize( AddScenarioCommand addScenarioCommand, Errors errors) { if (addScenarioCommand.getScenarioFile() == null) { // if no file uploaded LOGGER.debug("empty Scenario File"); errors.rejectValue(GENERAL_ERROR_MSG_KEY, MANDATORY_FIELD_MSG_BUNDLE_KEY); errors.rejectValue(SCENARIO_FILE_KEY, NO_SCENARIO_UPLOADED_MSG_BUNDLE_KEY); return false; } Metadata metadata = new Metadata(); MimeTypes mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository(); String mime; try { CommonsMultipartFile cmf = addScenarioCommand.getScenarioFile(); if (cmf.getSize() > maxFileSize) { Long maxFileSizeInMega = maxFileSize / 1000000; String[] arg = {maxFileSizeInMega.toString()}; errors.rejectValue(GENERAL_ERROR_MSG_KEY, MANDATORY_FIELD_MSG_BUNDLE_KEY); errors.rejectValue(SCENARIO_FILE_KEY, FILE_SIZE_EXCEEDED_MSG_BUNDLE_KEY, arg, "{0}"); return false; } else if (cmf.getSize() > 0) { mime = mimeTypes.detect(new BufferedInputStream(cmf.getInputStream()), metadata).toString(); LOGGER.debug("mime " + mime + " " + cmf.getOriginalFilename()); if (!authorizedMimeType.contains(mime)) { errors.rejectValue(GENERAL_ERROR_MSG_KEY, MANDATORY_FIELD_MSG_BUNDLE_KEY); errors.rejectValue(SCENARIO_FILE_KEY, NOT_SCENARIO_MSG_BUNDLE_KEY); return false; } } else { LOGGER.debug("File with size null"); errors.rejectValue(GENERAL_ERROR_MSG_KEY, MANDATORY_FIELD_MSG_BUNDLE_KEY); errors.rejectValue(SCENARIO_FILE_KEY, NO_SCENARIO_UPLOADED_MSG_BUNDLE_KEY); return false; } } catch (IOException ex) { LOGGER.warn(ex); errors.rejectValue(SCENARIO_FILE_KEY, NOT_SCENARIO_MSG_BUNDLE_KEY); errors.rejectValue(GENERAL_ERROR_MSG_KEY, MANDATORY_FIELD_MSG_BUNDLE_KEY); return false; } return true; }
Example #14
Source File: TikaDetector.java From spring-boot-email-tools with Apache License 2.0 | 4 votes |
/**
 * Private constructor: takes the detector from Tika's default configuration.
 */
private TikaDetector() {
    detector = TikaConfig.getDefaultConfig().getDetector();
}
Example #15
Source File: IdentifyMimeType.java From nifi with Apache License 2.0 | 4 votes |
/**
 * Creates the processor and stores Tika's default configuration in the
 * {@code config} field.
 */
public IdentifyMimeType() {
    this.config = TikaConfig.getDefaultConfig();
}
Example #16
Source File: DirectoryScanner.java From importer-exporter with Apache License 2.0 | 4 votes |
public DirectoryScanner() throws TikaException, IOException { tikaConfig = new TikaConfig(); contentFile = Pattern.compile("(?i).+\\.((gml)|(xml)|(json)|(gz)|(gzip))$"); matcher = Pattern.compile("").matcher(""); }
Example #17
Source File: TikaIntegrationTest.java From wildfly-camel with Apache License 2.0 | 4 votes |
/**
 * Creates a JNDI-backed bean repository in which {@code testConfig} is bound to a
 * Tika configuration loaded from {@code src/test/resources/tika/tika-empty.xml}.
 *
 * @return the repository wrapping the populated JNDI context
 * @throws Exception if the context cannot be created or the configuration cannot be loaded
 */
private static BeanRepository createRegistryWithEmptyConfig() throws Exception {
    Context jndiContext = createJndiContext();

    File emptyConfigFile = new File("src/test/resources/tika/tika-empty.xml");
    jndiContext.bind("testConfig", new TikaConfig(emptyConfigFile));

    return new JndiBeanRepository(jndiContext);
}
Example #18
Source File: MimetypeMap.java From alfresco-data-model with GNU Lesser General Public License v3.0 | 4 votes |
/** * Initialises the map using the configuration service provided */ public void init() { PropertyCheck.mandatory(this, "configService", configService); PropertyCheck.mandatory(this, "contentCharsetFinder", contentCharsetFinder); // Do we have any properties that indicate we will read JSON? if (mimetypeJsonConfigDir != null || jsonObjectMapper != null || cronExpression != null || initialAndOnErrorCronExpression != null) { PropertyCheck.mandatory(this, "jsonObjectMapper", jsonObjectMapper); // If we have a cronExpression it indicates that we will schedule reading. if (cronExpression != null) { PropertyCheck.mandatory(this, "initialAndOnErrorCronExpression", initialAndOnErrorCronExpression); } jsonConfigFileFinder = new ConfigFileFinder(jsonObjectMapper) { @Override protected void readJson(JsonNode jsonNode, String readFromMessage, String baseUrl) throws IOException { try { JsonNode mediaTypes = jsonNode.get("mediaTypes"); if (mediaTypes != null && mediaTypes.isArray()) { List<ConfigElement> mimetypes = new ArrayList<>(); for (JsonNode mediaType : mediaTypes) { MediaTypeDef def = jsonObjectMapper.convertValue(mediaType, MediaTypeDef.class); GenericConfigElement mimetype = new GenericConfigElement(ATTR_MIMETYPE); mimetype.addAttribute(ATTR_DISPLAY, def.name); mimetype.addAttribute(ATTR_MIMETYPE, def.mediaType); if (def.text) { mimetype.addAttribute(ATTR_TEXT, Boolean.TRUE.toString()); } GenericConfigElement ext = null; int count = 0; for (ExtensionDef extension : def.extensions) { ext = new GenericConfigElement(ATTR_EXTENSION); ext.setValue(extension.extension); if (extension.name != null && !extension.name.isBlank()) { ext.addAttribute(ATTR_DISPLAY, extension.name); } if (extension.isDefault) { ext.addAttribute(ATTR_DEFAULT, Boolean.TRUE.toString()); } mimetype.addChild(ext); count++; } if (count == 1 && ext.getAttribute(ATTR_DEFAULT) == null) { ext.addAttribute(ATTR_DEFAULT, Boolean.TRUE.toString()); } mimetypes.add(mimetype); } registerMimetypes(mimetypes); Data 
data = getData(); data.fileCount++; } } catch (IllegalArgumentException e) { logger.error("Error reading "+readFromMessage+" "+e.getMessage()); } } }; } // TikaConfig should be given, but work around it if not if (tikaConfig == null) { logger.warn("TikaConfig spring parameter not supplied, using default config"); setTikaConfig(TikaConfig.getDefaultConfig()); } // Create our Tika mimetype detector up-front // We can then be sure we only have the one, so it's quick (ALF-10813) detector = new DefaultDetector(tikaConfig.getMimeRepository()); // Work out the mappings - only runs once and straight away if cronExpression is null configScheduler.run(true, logger, cronExpression, initialAndOnErrorCronExpression); }
Example #19
Source File: IdentifyMimeType.java From localization_nifi with Apache License 2.0 | 4 votes |
public IdentifyMimeType() { // Setup Tika this.config = TikaConfig.getDefaultConfig(); this.detector = config.getDetector(); }
Example #20
Source File: TikaAutoMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Creates the extracter, passing the list returned by
 * {@code buildMimeTypes(tikaConfig)} to the superclass constructor.
 *
 * @param tikaConfig the Tika configuration handed to {@code buildMimeTypes}
 */
public TikaAutoMetadataExtracter(TikaConfig tikaConfig)
{
    super( buildMimeTypes(tikaConfig) );
}
Example #21
Source File: TikaAudioMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Stores the given Tika configuration in the {@code tikaConfig} field.
 * The value is kept as-is; no validation is performed here.
 *
 * @param tikaConfig the Tika configuration to use
 */
public void setTikaConfig(TikaConfig tikaConfig)
{
    this.tikaConfig = tikaConfig;
}
Example #22
Source File: TikaAutoContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Creates the transformer for the MIME types computed by
 * {@code buildMimeTypes(tikaConfig)}, enables the timeout thread, and names the
 * transformer "TikaAuto".
 *
 * @param tikaConfig the Tika configuration handed to {@code buildMimeTypes}
 */
public TikaAutoContentTransformer(TikaConfig tikaConfig)
{
    super(buildMimeTypes(tikaConfig));

    this.setUseTimeoutThread(true);
    this.setTransformerName("TikaAuto");
}
Example #23
Source File: TikaAutoContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0 | 4 votes |
private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig) { config = tikaConfig; parser = new AutoDetectParser(config); SUPPORTED_MIMETYPES = new ArrayList<String>(); for(MediaType baseType : parser.getParsers().keySet()) { // Register both the canonical type, and any alias it may have // Alfresco sometimes uses the canonical type, and sometimes an alias ArrayList<MediaType> types = new ArrayList<MediaType>(); types.add(baseType); types.addAll( config.getMediaTypeRegistry().getAliases(baseType) ); for(MediaType mt : types) { if(mt.toString().startsWith("application/vnd.oasis.opendocument.formula")) { // TODO Tika support for quick.odf, mimetype=application/vnd.oasis.opendocument.formula // TODO Tika support for quick.otf, mimetype=application/vnd.oasis.opendocument.formula-template continue; } if(mt.toString().startsWith("application/vnd.oasis.opendocument.graphics")) { // TODO Tika support for quick.odg, mimetype=application/vnd.oasis.opendocument.graphics // TODO Tika support for quick.otg, mimetype=application/vnd.oasis.opendocument.graphics-template continue; } if(mt.getType().equals("image") || mt.getType().equals("audio") || mt.getType().equals("video")) { // Skip these, as Tika mostly just does // metadata rather than content } else if(mt.toString().equals("application/zip") || mt.toString().equals("application/tar") || mt.toString().equals("application/x-tar")) { // Skip these, as we handle container formats in a different // transformer to give the user control over recursion } else if(mt.toString().equals("message/rfc822") || mt.toString().equals("application/vnd.ms-outlook")) { // Skip these, as we want our textual representations to include // parts of the metadata (eg people, subjects, dates) too } else { // Tika can probably do some useful text SUPPORTED_MIMETYPES.add( mt.toString() ); } } } return SUPPORTED_MIMETYPES; }
Example #24
Source File: MimetypeMap.java From alfresco-data-model with GNU Lesser General Public License v3.0 | 2 votes |
/**
 * Injects the TikaConfig to use.
 * <p>
 * The value is stored as-is in the {@code tikaConfig} field; nothing else is
 * updated by this setter.
 *
 * @param tikaConfig The Tika Config to use
 */
public void setTikaConfig(TikaConfig tikaConfig)
{
    this.tikaConfig = tikaConfig;
}
Example #25
Source File: TikaProcessor.java From jesterj with Apache License 2.0 | 2 votes |
/**
 * Specify a tika configuration via an XML document you have loaded via filesystem/classpath or other method
 * of your choice.
 * <p>
 * A new {@code TikaConfig} is built from the document and stored on the object
 * returned by {@code getObj()}.
 *
 * @param config The configuration
 * @return This builder for further config
 * @throws TikaException if Tika doesn't like your config
 * @throws IOException if Tika can't find something it needed?
 *         (NOTE(review): exact trigger is not visible here; confirm against TikaConfig)
 */
public Builder configuredWith(org.w3c.dom.Document config) throws TikaException, IOException {
    getObj().tikaConfig = new TikaConfig(config);
    return this;
}
Example #26
Source File: ArchiveContentTransformer.java From alfresco-repository with GNU Lesser General Public License v3.0 | 2 votes |
/**
 * Injects the TikaConfig to use.
 * <p>
 * The value is stored as-is in the {@code tikaConfig} field; nothing else is
 * updated by this setter.
 *
 * @param tikaConfig The Tika Config to use
 */
public void setTikaConfig(TikaConfig tikaConfig)
{
    this.tikaConfig = tikaConfig;
}
Example #27
Source File: HTMLRenderingEngine.java From alfresco-repository with GNU Lesser General Public License v3.0 | 2 votes |
/**
 * Injects the TikaConfig to use.
 * <p>
 * The value is stored as-is in the {@code tikaConfig} field; nothing else is
 * updated by this setter.
 *
 * @param tikaConfig The Tika Config to use
 */
public void setTikaConfig(TikaConfig tikaConfig)
{
    this.tikaConfig = tikaConfig;
}