org.apache.tika.Tika Java Examples
The following examples show how to use
org.apache.tika.Tika.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HttpClient.java From AuTe-Framework with Apache License 2.0 | 6 votes |
/**
 * Builds a browser-compatible multipart entity from the given form fields.
 *
 * <p>A field whose type is {@code null} or {@code FieldType.TEXT} becomes a UTF-8
 * {@code text/plain} part after {@code ExecutorUtils.insertSavedValues} has been applied to its
 * value. Any other field is attached as a binary part read from {@code projectPath + filePath};
 * its content type is the field's own MIME type when that is non-empty, otherwise the type Tika
 * detects for the file.
 *
 * @param formDataList      form fields to add, in iteration order
 * @param projectPath       prefix prepended to each file path; {@code null} is treated as ""
 * @param scenarioVariables values handed to {@code ExecutorUtils.insertSavedValues}
 * @return the populated builder
 * @throws IOException if Tika cannot read a file while detecting its type
 */
private MultipartEntityBuilder setEntity(List<FormData> formDataList, String projectPath, Map<String, Object> scenarioVariables) throws IOException {
    MultipartEntityBuilder builder = MultipartEntityBuilder.create().setMode(HttpMultipartMode.BROWSER_COMPATIBLE);

    for (FormData field : formDataList) {
        boolean textField = field.getFieldType() == null || FieldType.TEXT.equals(field.getFieldType());
        if (textField) {
            String fieldName = field.getFieldName();
            String text = ExecutorUtils.insertSavedValues(field.getValue(), scenarioVariables);
            builder.addTextBody(fieldName, text, ContentType.create("text/plain", StandardCharsets.UTF_8));
            continue;
        }

        log.debug("Try to identify Mime type projectPath = {}, formData = {}, fromData.getFilePath = {}", projectPath, field, field.getFilePath());
        String prefix = projectPath == null ? "" : projectPath;
        File attachment = new File(prefix + field.getFilePath());
        String detectedMimeType = new Tika().detect(attachment);
        log.debug("Tika detection result = {}", detectedMimeType);
        log.debug("Try to get content type from formData.getMimeType = {}, tika detected mime type = {}", field.getMimeType(), detectedMimeType);

        String fieldName = field.getFieldName();
        // An explicitly configured MIME type wins over the detected one.
        String effectiveMimeType;
        if (StringUtils.isEmpty(field.getMimeType())) {
            effectiveMimeType = detectedMimeType;
        } else {
            effectiveMimeType = field.getMimeType();
        }
        builder.addBinaryBody(fieldName, attachment, ContentType.parse(effectiveMimeType), attachment.getName());
    }

    return builder;
}
Example #2
Source File: Base64Utils.java From NutzSite with Apache License 2.0 | 6 votes |
/** * 将图片文件转换成base64字符串,参数为该图片的路径 * * @param file * @return java.lang.String */ public static String fileBase64(File file) { try { // check content type of the file Tika tika = new Tika(); String contentType =tika.detect(file); // read data as byte[] byte[] data = Files.readAllBytes(file.toPath()); // convert byte[] to base64(java7) String base64str = DatatypeConverter.printBase64Binary(data); // convert byte[] to base64(java8) // String base64str = Base64.getEncoder().encodeToString(data); // cretate "data URI" StringBuilder sb = new StringBuilder(); sb.append("data:"); sb.append(contentType); sb.append(";base64,"); sb.append(base64str); System.out.println(sb.toString()); return sb.toString(); } catch (IOException e) { e.printStackTrace(); } return null; }
Example #3
Source File: ContentExtractor.java From FXDesktopSearch with Apache License 2.0 | 6 votes |
public ContentExtractor(final Configuration aConfiguration) { // TODO: auch korrekt dieses Muster verarbeitrn : Mon Feb 18 15:55:10 CET 2013 metaDataDatePattern = Pattern.compile("(\\d{4})-(\\d{2})-(\\d{2})T(\\d{2}):(\\d{2}):(\\d{2})Z"); configuration = aConfiguration; tika = new Tika(); tika.setMaxStringLength(1024 * 1024 * 5); final var theDetector = new OptimaizeLangDetector(); try { theDetector.loadModels(); languageDetector = theDetector; } catch (final Exception e) { throw new RuntimeException(e); } }
Example #4
Source File: ActionToHtml.java From o2oa with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Converts an uploaded Word document to the value carried by {@code Wo}.
 *
 * <p>The document type is detected from the bytes with Tika. {@code application/msword} is
 * handled by {@code doc(bytes)}, the OOXML word-processing type by {@code docx(bytes)}; any
 * other type is rejected.
 *
 * @param effectivePerson not read by this method
 * @param bytes           raw document content
 * @param disposition     not read by this method
 * @return a result whose data is the populated {@code Wo}
 * @throws ExceptionUnsupportType if the detected type is neither of the two supported ones
 * @throws Exception              propagated from the conversion helpers
 */
ActionResult<Wo> execute(EffectivePerson effectivePerson, byte[] bytes, FormDataContentDisposition disposition) throws Exception {
    ActionResult<Wo> result = new ActionResult<>();
    String type = new Tika().detect(bytes);
    Wo wo = new Wo();

    if (type.equals("application/msword")) {
        wo.setValue(this.doc(bytes));
    } else if (type.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
        wo.setValue(this.docx(bytes));
    } else {
        throw new ExceptionUnsupportType(type);
    }

    result.setData(wo);
    return result;
}
Example #5
Source File: ResourceServiceImpl.java From jwala with Apache License 2.0 | 6 votes |
/**
 * Creates the service by storing each collaborator in the field of the same name.
 * No other work is done here.
 *
 * @param fileTypeDetector the Tika facade kept for later use by this service
 */
public ResourceServiceImpl(
        final ResourcePersistenceService resourcePersistenceService,
        final GroupPersistenceService groupPersistenceService,
        final ApplicationPersistenceService applicationPersistenceService,
        final JvmPersistenceService jvmPersistenceService,
        final WebServerPersistenceService webServerPersistenceService,
        final ResourceDao resourceDao,
        final ResourceHandler resourceHandler,
        final ResourceContentGeneratorService resourceContentGeneratorService,
        final BinaryDistributionService binaryDistributionService,
        final Tika fileTypeDetector,
        final RepositoryService repositoryService) {
    // Persistence collaborators.
    this.resourcePersistenceService = resourcePersistenceService;
    this.groupPersistenceService = groupPersistenceService;
    this.applicationPersistenceService = applicationPersistenceService;
    this.jvmPersistenceService = jvmPersistenceService;
    this.webServerPersistenceService = webServerPersistenceService;
    this.resourceDao = resourceDao;

    // Resource handling and distribution collaborators.
    this.resourceHandler = resourceHandler;
    this.resourceContentGeneratorService = resourceContentGeneratorService;
    this.binaryDistributionService = binaryDistributionService;
    this.fileTypeDetector = fileTypeDetector;
    this.repositoryService = repositoryService;
}
Example #6
Source File: TikaProcessor.java From jesterj with Apache License 2.0 | 5 votes |
@Override public Document[] processDocument(Document document) { try { byte[] rawData = document.getRawData(); if (rawData == null) { log.debug("Skipping document without data in " + getName()); return new Document[]{document}; } Tika tika = new Tika(tikaConfig); tika.setMaxStringLength(document.getRawData().length); Metadata metadata = new Metadata(); try (ByteArrayInputStream bais = new ByteArrayInputStream(rawData)) { String textContent = tika.parseToString(bais, metadata, maxLength); if (replaceRaw) { document.setRawData(textContent.getBytes(StandardCharsets.UTF_8)); } if (destField != null) { document.put(destField,textContent); } for (String name : metadata.names()) { document.put(sanitize(name) + plusSuffix(), metadata.get(name)); } } catch (IOException | TikaException e) { log.debug("Tika processing failure!", e); // if tika can't parse it we certainly don't want random binary crap in the index document.setStatus(Status.ERROR); } } catch (Throwable t) { boolean isAccessControl = t instanceof AccessControlException; boolean isSecurity = t instanceof SecurityException; if (!isAccessControl && !isSecurity) { throw t; } } return new Document[]{document}; }
Example #7
Source File: DefaultMessagingConfigurer.java From ogham with Apache License 2.0 | 5 votes |
/**
 * Configures MIME type detection for image inlining on the given builder.
 *
 * <p>The Tika section is given a new {@code Tika} instance, and its fail-if-octet-stream
 * option gets the default {@code overrideIfNotSet(true)}. The allowed MIME types are read from
 * the property {@code ogham.email.image-inlining.mimetype.allowed-mimetypes}, with the
 * default {@code overrideIfNotSet(new String[] { "image/*" })}.
 *
 * @param builder the MIME type detection builder to configure
 */
protected void configureImageInliningMimetype(MimetypeDetectionBuilder<?> builder) {
    // @formatter:off
    builder
        .tika()
            .instance(new Tika())
            .failIfOctetStream().defaultValue(overrideIfNotSet(true)).and()
            .and()
        .allowed().properties("${ogham.email.image-inlining.mimetype.allowed-mimetypes}").defaultValue(overrideIfNotSet(new String[] { "image/*" }));
    // @formatter:on
}
Example #8
Source File: DefaultMessagingConfigurer.java From ogham with Apache License 2.0 | 5 votes |
/**
 * Configures general MIME type detection on the given builder.
 *
 * <p>The Tika section is given a new {@code Tika} instance; its fail-if-octet-stream option
 * is bound to the property {@code ogham.mimetype.tika.fail-if-octet-stream} with the default
 * {@code overrideIfNotSet(true)}. The default MIME type is bound to the property
 * {@code ogham.mimetype.default-mimetype} with the default
 * {@code overrideIfNotSet("application/octet-stream")}.
 *
 * @param builder the MIME type detection builder to configure
 */
@Override
public void configure(MimetypeDetectionBuilder<?> builder) {
    // @formatter:off
    builder
        .tika()
            .instance(new Tika())
            .failIfOctetStream().properties("${ogham.mimetype.tika.fail-if-octet-stream}").defaultValue(overrideIfNotSet(true)).and()
            .and()
        .defaultMimetype().properties("${ogham.mimetype.default-mimetype}").defaultValue(overrideIfNotSet("application/octet-stream"));
    // @formatter:on
}
Example #9
Source File: ImageViewPanel.java From Orienteer with Apache License 2.0 | 5 votes |
/**
 * Creates a panel that shows the model's bytes as an image.
 *
 * <p>When the model object is {@code null} an {@code EmptyPanel} is added under the id
 * {@code "image"}; otherwise the bytes are served through a {@code ByteArrayResource} whose
 * content type is the one Tika detects.
 *
 * @param id         component id
 * @param valueModel model whose object is cast to {@code byte[]}
 */
public ImageViewPanel(String id, IModel<V> valueModel) {
    super(id, valueModel);

    byte[] content = (byte[]) getModelObject();
    if (content == null) {
        add(new EmptyPanel("image"));
        return;
    }

    String detectedType = new Tika().detect(content);
    add(new Image("image", new ByteArrayResource(detectedType, content)));
}
Example #10
Source File: ImageEditPanel.java From Orienteer with Apache License 2.0 | 5 votes |
/**
 * Runs the inherited validation, then rejects an uploaded file whose Tika-detected type does
 * not start with {@code image/} by registering the {@code errors.wrong.image.uploaded} error.
 * Nothing further is checked when no file was uploaded.
 */
@Override
public void validate() {
    super.validate();

    FileUpload upload = fileUploadField.getFileUpload();
    if (upload == null) {
        return;
    }

    String detectedType = new Tika().detect(upload.getBytes());
    if (!detectedType.startsWith("image/")) {
        error(getString("errors.wrong.image.uploaded"));
    }
}
Example #11
Source File: DataURI.java From osiam with MIT License | 5 votes |
/** * @param inputStream a inputStream which will be transformed into an DataURI * @throws IOException if the stream can not be read or is closed * @throws SCIMDataValidationException if the inputStream can't be converted into an DataURI */ public DataURI(InputStream inputStream) throws IOException { if (inputStream == null) { throw new SCIMDataValidationException("The given inputStream can't be null."); } String mimeType = new Tika().detect(inputStream); dataUri = convertInputStreamToDataURI(inputStream, mimeType); }
Example #12
Source File: TikaLambdaHandler.java From tika-lambda with Apache License 2.0 | 5 votes |
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException { _logger.log("Extracting text with Tika"); String extractedText = ""; SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); StringWriter sw = new StringWriter(); handler.setResult(new StreamResult(sw)); AutoDetectParser parser = new AutoDetectParser(); ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, parser); Tika tika = new Tika(); Metadata tikaMetadata = new Metadata(); try { // for synthetic transactions if( key.toLowerCase().endsWith("tika.exception.testing.pdf")) { throw new TikaException("Test Tika Exception"); } parser.parse(objectData, handler, tikaMetadata, parseContext); extractedText = sw.toString(); } catch( TikaException e) { _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage()); return assembleExceptionResult(bucket, key, e); } _logger.log("Tika parsing success"); return assembleExtractionResult(bucket, key, extractedText, tikaMetadata); }
Example #13
Source File: FileRequest.java From Bastion with GNU General Public License v3.0 | 5 votes |
/**
 * Detects the MIME type for a resource name with Tika and sets it as the content type of
 * the underlying request. A warning is logged when the detected type is
 * {@code application/octet-stream}.
 *
 * @param resource the resource name handed to {@code Tika.detect(String)}
 */
private void guessResourceMimeType(String resource) {
    Tika detector = new Tika();
    String detectedType = detector.detect(resource);

    boolean isGenericBinary = detectedType.equals("application/octet-stream");
    if (isGenericBinary) {
        LOG.warning("Bastion might not have been able to determine the MIME type and is using"
                + " [application/octet-stream] for this request. Use setContentType() to change the MIME type.");
    }

    ContentType contentType = ContentType.create(detectedType);
    generalRequest.setContentType(contentType);
}
Example #14
Source File: SecureFileController.java From cerberus with Apache License 2.0 | 5 votes |
/**
 * Creates the controller with its injected collaborators and a new Tika facade.
 *
 * @param secureDataService        stored on this instance
 * @param secureDataVersionService stored on this instance
 * @param sdbAccessRequest         stored on this instance
 */
@Autowired
public SecureFileController(
        SecureDataService secureDataService,
        SecureDataVersionService secureDataVersionService,
        SdbAccessRequest sdbAccessRequest) {
    this.secureDataService = secureDataService;
    this.secureDataVersionService = secureDataVersionService;
    this.sdbAccessRequest = sdbAccessRequest;
    this.tika = new Tika();
}
Example #15
Source File: MyMimeTypeUtils.java From spring-boot with Apache License 2.0 | 5 votes |
/** * 利用 Tika 分析 Mime Type * 因为 Tika 要解析 File 、 URL 数据流,所以解析需要一定时间。不要用解析扩展名的方法,无法动态判断,不准。 * Parses the resource at the given URL and returns the extracted text content. * * @param url * @return */ public static String detect(URL url, int timeout) throws Exception { //网址不存在 if (!MyUrlUtils.isURLAvailable(url, timeout)) { throw new Exception("exception ! " + url.getAuthority() + " not available"); } Tika t = new Tika(); return t.detect(url); }
Example #16
Source File: MyMimeTypeUtils.java From spring-boot with Apache License 2.0 | 5 votes |
/** * 利用 Tika 分析 Mime Type * 因为 Tika 要解析 File 、 URL 数据流,所以解析需要一定时间。不要用解析扩展名的方法,无法动态判断,不准。 * <p> * Parses the given file and returns the extracted text content. * * @param file * @return */ public static String detect(File file) throws Exception { //文件不存在 if (!file.exists()) { throw new Exception("exception ! " + file.getAbsoluteFile() + " not existes."); } Tika t = new Tika(); return t.detect(file); }
Example #17
Source File: TestUtil.java From gplaymusic with MIT License | 5 votes |
/**
 * Downloads a track in low quality into the system temp directory and asserts that the
 * resulting file exists and is detected by Tika as {@code audio/mpeg}.
 *
 * @param fileName name of the file to create under {@code java.io.tmpdir}
 * @param track    the track to download
 * @throws IOException if the download or the type detection fails
 */
public static void testDownload(String fileName, Track track) throws IOException {
    Path path = FileSystems.getDefault().getPath(System.getProperty("java.io.tmpdir"), fileName);
    track.download(StreamQuality.LOW, path);
    File file = path.toFile();
    Assert.assertTrue("File does not exist", file.exists());
    // assertEquals takes (message, expected, actual); the arguments were previously swapped,
    // which made a failure report the detected type as the "expected" value.
    Assert.assertEquals("Is not an audio file", "audio/mpeg", new Tika().detect(file));
}
Example #18
Source File: ResourceServiceImplTest.java From jwala with Apache License 2.0 | 5 votes |
/**
 * Test bean: builds a {@code ResourceServiceImpl} from the mock collaborators of this test
 * configuration, a real {@code ResourceContentGeneratorServiceImpl}, a real {@code Tika}
 * facade and a Mockito mock of {@code BinaryDistributionService}.
 *
 * @return the resource service under test
 */
@Bean
public ResourceService getResourceService() {
    ResourceContentGeneratorService contentGenerator = new ResourceContentGeneratorServiceImpl(
            mockGroupPesistenceService,
            mockWebServerPersistenceService,
            mockJvmPersistenceService,
            mockAppPersistenceService,
            mockHistoryFacadeService);
    Tika fileTypeDetector = new Tika();
    BinaryDistributionService binaryDistribution = mock(BinaryDistributionService.class);

    return new ResourceServiceImpl(
            mockResourcePersistenceService,
            mockGroupPesistenceService,
            mockAppPersistenceService,
            mockJvmPersistenceService,
            mockWebServerPersistenceService,
            mockResourceDao,
            mockResourceHandler,
            contentGenerator,
            binaryDistribution,
            fileTypeDetector,
            mockRepositoryService);
}
Example #19
Source File: AemServiceConfiguration.java From jwala with Apache License 2.0 | 5 votes |
/**
 * Defines the {@code resourceService} bean.
 *
 * <p>The resource and group persistence services are obtained from
 * {@code aemPersistenceServiceConfiguration}; {@code binaryDistributionService} is read from
 * this configuration object; a new {@code Tika} facade is created for the service; all
 * remaining collaborators are the method parameters.
 *
 * @param repositoryService the bean qualified as {@code resourceRepositoryService}
 * @return a new {@code ResourceServiceImpl}
 */
@Bean(name = "resourceService")
public ResourceService getResourceService(final ApplicationPersistenceService applicationPersistenceService,
                                          final JvmPersistenceService jvmPersistenceService,
                                          final WebServerPersistenceService webServerPersistenceService,
                                          final ResourceDao resourceDao,
                                          final WebServerResourceHandler webServerResourceHandler,
                                          final ResourceContentGeneratorService resourceContentGeneratorService,
                                          @Qualifier("resourceRepositoryService") final RepositoryService repositoryService) {
    return new ResourceServiceImpl(aemPersistenceServiceConfiguration.getResourcePersistenceService(),
            aemPersistenceServiceConfiguration.getGroupPersistenceService(),
            applicationPersistenceService,
            jvmPersistenceService,
            webServerPersistenceService,
            resourceDao,
            webServerResourceHandler,
            resourceContentGeneratorService,
            binaryDistributionService,
            new Tika(),
            repositoryService);
}
Example #20
Source File: TikaParserTest.java From JQF with BSD 2-Clause "Simplified" License | 5 votes |
@Fuzz public void fuzz(@From(InputStreamGenerator.class) InputStream in) throws IOException { Tika tika = new Tika(); try(Reader reader = tika.parse(in)) { char[] buf = new char[1024]; while (reader.read(buf) != -1); // Keep reading until EOF } }
Example #21
Source File: Reference.java From oodt with Apache License 2.0 | 5 votes |
/**
 * <p>
 * Constructs a new Reference with the specified parameters. The MIME type is derived by
 * letting Tika detect a type name from {@code origRef} and looking that name up in the
 * MIME type repository; if the lookup fails, the failure is logged and the MIME type is
 * left unset by this constructor.
 * </p>
 *
 * @param origRef
 *          The item's original location.
 * @param dataRef
 *          The item's location within the data store.
 * @param size
 *          The size of the file that this reference refers to.
 */
public Reference(String origRef, String dataRef, long size) {
    origReference = origRef;
    dataStoreReference = dataRef;
    fileSize = size;
    // TODO: since no mimetype was specified, do the dirty work
    // ourselves to determine the which MimeType class to associate
    // with this reference.
    try {
        this.mimeType = mimeTypeRepository.forName(new Tika().detect(origRef));
    } catch (MimeTypeException e) {
        // Pass the exception itself so the stack trace is not lost from the log.
        LOG.log(Level.SEVERE, e.getMessage(), e);
    }
}
Example #22
Source File: MimeTypeUtils.java From oodt with Apache License 2.0 | 5 votes |
public MimeTypeUtils(InputStream mimeIs, boolean magic) { try { this.mimeTypes = MimeTypesFactory.create(mimeIs); this.mimeMagic = magic; this.tika = new Tika(new DefaultDetector(this.mimeTypes)); }catch (Exception e) { LOG.log(Level.SEVERE, "Failed to load MimeType Registry : " + e.getMessage(), e); } }
Example #23
Source File: TikaAnalysis.java From tutorials with MIT License | 5 votes |
/**
 * Collects document metadata through the Tika facade.
 *
 * <p>A fresh {@code Metadata} object is passed to {@code Tika.parse(stream, metadata)} and
 * then returned.
 * NOTE(review): the value returned by {@code parse} is discarded without being read or
 * closed — confirm that this is intended.
 *
 * @param stream the document content
 * @return the metadata object that was handed to Tika
 * @throws IOException   if the stream cannot be read
 * @throws TikaException declared by the signature
 */
public static Metadata extractMetadatatUsingFacade(InputStream stream) throws IOException, TikaException {
    final Tika facade = new Tika();
    final Metadata collected = new Metadata();

    facade.parse(stream, collected);
    return collected;
}
Example #24
Source File: MediaTypeValidator.java From iaf with Apache License 2.0 | 5 votes |
/**
 * Creates a validator for the conversion, storing the PDF output location and a single
 * Tika facade.
 * NOTE(review): the earlier comment described this as package-default access, but the
 * constructor is declared {@code public} — confirm which is intended.
 *
 * @param pdfOutputlocation value stored on this instance
 */
public MediaTypeValidator(String pdfOutputlocation) {
    this.pdfOutputlocation = pdfOutputlocation;

    // Create only once. Tika seems to be thread safe
    // (see
    // http://stackoverflow.com/questions/10190980/spring-tika-integration-is-my-approach-thread-safe)
    this.tika = new Tika();
}
Example #25
Source File: MimeTypeUnitTest.java From tutorials with MIT License | 5 votes |
/**
 * Test method demonstrating usage of Apache Tika: detects the MIME type of the file at
 * {@code FILE_LOC} and compares it with {@code PNG_EXT}.
 *
 * @throws IOException if the file cannot be read during detection
 */
@Test
public void whenUsingTika_thenSuccess() throws IOException {
    final File file = new File(FILE_LOC);
    final Tika tika = new Tika();
    final String mimeType = tika.detect(file);
    // assertEquals takes (expected, actual); the constant is the expected value, so it goes
    // first and a failure message reports the two values the right way round.
    assertEquals(PNG_EXT, mimeType);
}
Example #26
Source File: ParserBolt.java From storm-crawler with Apache License 2.0 | 4 votes |
@SuppressWarnings({ "rawtypes", "unchecked" }) @Override public void prepare(Map conf, TopologyContext context, OutputCollector collector) { emitOutlinks = ConfUtils.getBoolean(conf, "parser.emitOutlinks", true); urlFilters = URLFilters.fromConf(conf); parseFilters = ParseFilters.fromConf(conf); upperCaseElementNames = ConfUtils.getBoolean(conf, "parser.uppercase.element.names", true); extractEmbedded = ConfUtils.getBoolean(conf, "parser.extract.embedded", false); String htmlmapperClassName = ConfUtils.getString(conf, "parser.htmlmapper.classname", "org.apache.tika.parser.html.IdentityHtmlMapper"); try { HTMLMapperClass = Class.forName(htmlmapperClassName); boolean interfaceOK = HtmlMapper.class .isAssignableFrom(HTMLMapperClass); if (!interfaceOK) { throw new RuntimeException("Class " + htmlmapperClassName + " does not implement HtmlMapper"); } } catch (ClassNotFoundException e) { LOG.error("Can't load class {}", htmlmapperClassName); throw new RuntimeException("Can't load class " + htmlmapperClassName); } mimeTypeWhiteList = ConfUtils.loadListFromConf( "parser.mimetype.whitelist", conf); protocolMDprefix = ConfUtils.getString(conf, ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, ""); // instantiate Tika long start = System.currentTimeMillis(); tika = new Tika(); long end = System.currentTimeMillis(); LOG.debug("Tika loaded in {} msec", end - start); this.collector = collector; this.eventCounter = context.registerMetric(this.getClass() .getSimpleName(), new MultiCountMetric(), 10); this.metadataTransfer = MetadataTransfer.getInstance(conf); }
Example #27
Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Creates an extractor with a new {@code TXTParser} and a new {@code Tika} facade.
 */
public ContentExtractor() {
    // autoDetectParser = new AutoDetectParser();
    this.txtParser = new TXTParser();
    this.tika = new Tika();
}
Example #28
Source File: TikaAnalysis.java From tutorials with MIT License | 4 votes |
/**
 * Extracts the text content of a document through the Tika facade.
 *
 * @param stream the document content
 * @return the string produced by {@code Tika.parseToString}
 * @throws IOException   if the stream cannot be read
 * @throws TikaException if the document cannot be parsed
 */
public static String extractContentUsingFacade(InputStream stream) throws IOException, TikaException {
    final Tika facade = new Tika();
    return facade.parseToString(stream);
}
Example #29
Source File: TikaAnalysis.java From tutorials with MIT License | 4 votes |
/**
 * Detects the media type of a document through the Tika facade.
 *
 * @param stream the document content
 * @return the media type string produced by {@code Tika.detect}
 * @throws IOException if the stream cannot be read
 */
public static String detectDocTypeUsingFacade(InputStream stream) throws IOException {
    final Tika facade = new Tika();
    return facade.detect(stream);
}
Example #30
Source File: OMailAttachment.java From Orienteer with Apache License 2.0 | 4 votes |
/**
 * Exposes this attachment as a {@code DataSource}.
 *
 * <p>The source wraps the attachment bytes, uses the MIME type Tika detects from those
 * bytes, and is named after {@code getName()}.
 *
 * @return a named {@code ByteArrayDataSource} over the attachment data
 */
public DataSource toDataSource() {
    byte[] content = getData();
    String detectedType = new Tika().detect(content);

    ByteArrayDataSource source = new ByteArrayDataSource(content, detectedType);
    source.setName(getName());
    return source;
}