Java Code Examples for org.apache.tika.Tika#detect()
The following examples show how to use org.apache.tika.Tika#detect().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Base64Utils.java From NutzSite with Apache License 2.0 | 6 votes |
/** * 将图片文件转换成base64字符串,参数为该图片的路径 * * @param file * @return java.lang.String */ public static String fileBase64(File file) { try { // check content type of the file Tika tika = new Tika(); String contentType =tika.detect(file); // read data as byte[] byte[] data = Files.readAllBytes(file.toPath()); // convert byte[] to base64(java7) String base64str = DatatypeConverter.printBase64Binary(data); // convert byte[] to base64(java8) // String base64str = Base64.getEncoder().encodeToString(data); // cretate "data URI" StringBuilder sb = new StringBuilder(); sb.append("data:"); sb.append(contentType); sb.append(";base64,"); sb.append(base64str); System.out.println(sb.toString()); return sb.toString(); } catch (IOException e) { e.printStackTrace(); } return null; }
Example 2
Source File: ActionToHtml.java From o2oa with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Detects the media type of the supplied bytes with Tika and hands them to the
 * matching converter: {@code doc(bytes)} for {@code application/msword},
 * {@code docx(bytes)} for the OOXML word-processing type. The converter's result
 * is stored in a {@code Wo} and returned inside an {@code ActionResult}.
 *
 * @param effectivePerson not used by this method
 * @param bytes           raw content of the document
 * @param disposition     not used by this method
 * @return the result whose data is the populated {@code Wo}
 * @throws Exception {@code ExceptionUnsupportType} for any other detected type,
 *                   or whatever the converters throw
 */
ActionResult<Wo> execute(EffectivePerson effectivePerson, byte[] bytes, FormDataContentDisposition disposition)
        throws Exception {
    ActionResult<Wo> result = new ActionResult<>();
    String detectedType = new Tika().detect(bytes);
    Wo payload = new Wo();

    switch (detectedType) {
        case "application/msword":
            payload.setValue(this.doc(bytes));
            break;
        case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            payload.setValue(this.docx(bytes));
            break;
        default:
            throw new ExceptionUnsupportType(detectedType);
    }

    result.setData(payload);
    return result;
}
Example 3
Source File: MyMimeTypeUtils.java From spring-boot with Apache License 2.0 | 5 votes |
/** * 利用 Tika 分析 Mime Type * 因为 Tika 要解析 File 、 URL 数据流,所以解析需要一定时间。不要用解析扩展名的方法,无法动态判断,不准。 * <p> * Parses the given file and returns the extracted text content. * * @param file * @return */ public static String detect(File file) throws Exception { //文件不存在 if (!file.exists()) { throw new Exception("exception ! " + file.getAbsoluteFile() + " not existes."); } Tika t = new Tika(); return t.detect(file); }
Example 4
Source File: MyMimeTypeUtils.java From spring-boot with Apache License 2.0 | 5 votes |
/** * 利用 Tika 分析 Mime Type * 因为 Tika 要解析 File 、 URL 数据流,所以解析需要一定时间。不要用解析扩展名的方法,无法动态判断,不准。 * Parses the resource at the given URL and returns the extracted text content. * * @param url * @return */ public static String detect(URL url, int timeout) throws Exception { //网址不存在 if (!MyUrlUtils.isURLAvailable(url, timeout)) { throw new Exception("exception ! " + url.getAuthority() + " not available"); } Tika t = new Tika(); return t.detect(url); }
Example 5
Source File: MimeTypeUnitTest.java From tutorials with MIT License | 5 votes |
/**
 * Test method demonstrating usage of Apache Tika: detects the type of the file at
 * {@code FILE_LOC} and compares it with {@code PNG_EXT}.
 *
 * @throws IOException if Tika cannot read the file
 */
@Test
public void whenUsingTika_thenSuccess() throws IOException {
    final File file = new File(FILE_LOC);
    final Tika tika = new Tika();
    final String mimeType = tika.detect(file);
    // assertEquals takes (expected, actual); the original had them swapped, which
    // yields a misleading message when the assertion fails.
    assertEquals(PNG_EXT, mimeType);
}
Example 6
Source File: DocumentTools.java From o2oa with GNU Affero General Public License v3.0 | 4 votes |
/**
 * Uploads a document to the collect server's {@code /document/to/pdf} endpoint as a
 * {@code multipart/form-data} POST and returns the Base64-decoded value from the reply.
 *
 * <p>The media type is detected with Tika from the bytes and the file name; only the
 * types named by {@code MEDIATYPE_DOC} and {@code MEDIATYPE_DOCX} are accepted.
 *
 * @param fileName name sent as the multipart {@code filename}; a unique token is used
 *                 instead when it is empty
 * @param bytes    raw document content, sent as the {@code file} part
 * @param stamp    optional text sent as the {@code stamp} part; omitted when empty
 * @return the decoded bytes from the response, or {@code null} when the response body
 *         is empty
 * @throws Exception {@code ExceptionUnsupportedMediaType} for any other media type, or
 *                   on configuration, I/O or JSON failures
 */
public static byte[] toPdf(String fileName, byte[] bytes, String stamp) throws Exception {
    Config.collect().validate();
    Tika tika = new Tika();
    String type = tika.detect(bytes, fileName);
    switch (Objects.toString(type, "")) {
        case MEDIATYPE_DOC:
            break;
        case MEDIATYPE_DOCX:
            break;
        default:
            throw new ExceptionUnsupportedMediaType(type);
    }

    URL serverUrl = new URL(Config.collect().url() + "/o2_collect_assemble/jaxrs/document/to/pdf");
    HttpURLConnection connection = (HttpURLConnection) serverUrl.openConnection();
    String boundary = "----" + StringTools.uniqueToken();
    connection.setRequestMethod("POST");
    connection.setDoOutput(true);
    connection.setUseCaches(false);
    connection.addRequestProperty("Content-Type", "multipart/form-data; boundary=" + boundary);

    // Encode the textual parts explicitly as UTF-8. Without a charset the writer uses the
    // platform default, so non-ASCII file names or stamps would be sent in a
    // machine-dependent encoding while the response below is decoded as UTF-8.
    try (OutputStream out = connection.getOutputStream();
            BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(out, java.nio.charset.StandardCharsets.UTF_8))) {
        // "file" part: headers through the writer, payload straight to the stream.
        writer.write(twoHyphens + boundary);
        writer.write(CRLF);
        writer.write("Content-Disposition: form-data; name=\"file\"; filename=\""
                + (StringUtils.isEmpty(fileName) ? StringTools.uniqueToken() : fileName) + "\"");
        writer.write(CRLF);
        writer.write("Content-Type: " + HttpMediaType.APPLICATION_OCTET_STREAM);
        writer.write(CRLF);
        writer.write(CRLF);
        // Flush the buffered headers before the raw bytes so they arrive in order.
        writer.flush();
        out.write(bytes);
        out.flush();
        writer.write(CRLF);
        writer.write(twoHyphens + boundary);

        // Optional "stamp" part.
        if (StringUtils.isNotEmpty(stamp)) {
            writer.write(CRLF);
            writer.write("Content-Disposition: form-data; name=\"stamp\"");
            writer.write(CRLF);
            writer.write("Content-Type: " + HttpMediaType.TEXT_PLAIN);
            writer.write(CRLF);
            writer.write(CRLF);
            writer.write(stamp);
            writer.write(CRLF);
            writer.write(twoHyphens + boundary);
        }

        // Trailing hyphens turn the last boundary into the closing delimiter.
        writer.write(twoHyphens);
        writer.flush();
    }

    String respText = null;
    try (InputStream in = connection.getInputStream()) {
        respText = IOUtils.toString(in, DefaultCharset.charset_utf_8);
    }

    if (StringUtils.isNotEmpty(respText)) {
        ActionResponse response = XGsonBuilder.instance().fromJson(respText, ActionResponse.class);
        WrapString wrap = XGsonBuilder.instance().fromJson(response.getData(), WrapString.class);
        return Base64.decodeBase64(wrap.getValue());
    }
    return null;
}
Example 7
Source File: DocumentTools.java From o2oa with GNU Affero General Public License v3.0 | 4 votes |
/**
 * Uploads a document to the collect server's {@code /document/to/image} endpoint as a
 * {@code multipart/form-data} POST and returns the Base64-decoded value from the reply.
 *
 * <p>The media type is detected with Tika from the bytes and the file name; only the
 * types named by {@code MEDIATYPE_DOC} and {@code MEDIATYPE_DOCX} are accepted.
 *
 * @param fileName name sent as the multipart {@code filename}; a unique token is used
 *                 instead when it is empty
 * @param bytes    raw document content, sent as the {@code file} part
 * @param stamp    optional text sent as the {@code stamp} part; omitted when empty
 * @param page     value sent as the {@code page} part; {@code null} or negative values
 *                 are sent as {@code 0}
 * @return the decoded bytes from the response, or {@code null} when the response body
 *         is empty
 * @throws Exception {@code ExceptionUnsupportedMediaType} for any other media type, or
 *                   on configuration, I/O or JSON failures
 */
public static byte[] toImage(String fileName, byte[] bytes, String stamp, Integer page) throws Exception {
    Config.collect().validate();
    Tika tika = new Tika();
    String type = tika.detect(bytes, fileName);
    switch (Objects.toString(type, "")) {
        case MEDIATYPE_DOC:
            break;
        case MEDIATYPE_DOCX:
            break;
        default:
            throw new ExceptionUnsupportedMediaType(type);
    }

    URL serverUrl = new URL(Config.collect().url() + "/o2_collect_assemble/jaxrs/document/to/image");
    HttpURLConnection connection = (HttpURLConnection) serverUrl.openConnection();
    String boundary = "----" + StringTools.uniqueToken();
    connection.setRequestMethod("POST");
    connection.setDoOutput(true);
    connection.setUseCaches(false);
    connection.addRequestProperty("Content-Type", "multipart/form-data; boundary=" + boundary);

    // Encode the textual parts explicitly as UTF-8. Without a charset the writer uses the
    // platform default, so non-ASCII file names or stamps would be sent in a
    // machine-dependent encoding while the response below is decoded as UTF-8.
    try (OutputStream out = connection.getOutputStream();
            BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(out, java.nio.charset.StandardCharsets.UTF_8))) {
        // "file" part: headers through the writer, payload straight to the stream.
        writer.write(twoHyphens + boundary);
        writer.write(CRLF);
        writer.write("Content-Disposition: form-data; name=\"file\"; filename=\""
                + (StringUtils.isEmpty(fileName) ? StringTools.uniqueToken() : fileName) + "\"");
        writer.write(CRLF);
        writer.write("Content-Type: " + HttpMediaType.APPLICATION_OCTET_STREAM);
        writer.write(CRLF);
        writer.write(CRLF);
        // Flush the buffered headers before the raw bytes so they arrive in order.
        writer.flush();
        out.write(bytes);
        out.flush();
        writer.write(CRLF);
        writer.write(twoHyphens + boundary);

        // "page" part: always sent.
        writer.write(CRLF);
        writer.write("Content-Disposition: form-data; name=\"page\"");
        writer.write(CRLF);
        writer.write("Content-Type: " + HttpMediaType.TEXT_PLAIN);
        writer.write(CRLF);
        writer.write(CRLF);
        writer.write("" + ((page == null || page < 0) ? 0 : page));
        writer.write(CRLF);
        writer.write(twoHyphens + boundary);

        // Optional "stamp" part.
        if (StringUtils.isNotEmpty(stamp)) {
            writer.write(CRLF);
            writer.write("Content-Disposition: form-data; name=\"stamp\"");
            writer.write(CRLF);
            writer.write("Content-Type: " + HttpMediaType.TEXT_PLAIN);
            writer.write(CRLF);
            writer.write(CRLF);
            writer.write(stamp);
            writer.write(CRLF);
            writer.write(twoHyphens + boundary);
        }

        // Trailing hyphens turn the last boundary into the closing delimiter.
        writer.write(twoHyphens);
        writer.flush();
    }

    String respText = null;
    try (InputStream in = connection.getInputStream()) {
        respText = IOUtils.toString(in, DefaultCharset.charset_utf_8);
    }

    if (StringUtils.isNotEmpty(respText)) {
        ActionResponse response = XGsonBuilder.instance().fromJson(respText, ActionResponse.class);
        WrapString wrap = XGsonBuilder.instance().fromJson(response.getData(), WrapString.class);
        return Base64.decodeBase64(wrap.getValue());
    }
    return null;
}
Example 8
Source File: Helpers.java From jobson with Apache License 2.0 | 4 votes |
/**
 * Asks Tika for the MIME type of a stream, passing the file name along as well.
 *
 * @param s        the stream to inspect
 * @param fileName the file name handed to {@code Tika.detect(InputStream, String)}
 * @return the detected MIME type
 * @throws IOException if Tika fails to read from the stream
 */
public static String getMimeType(InputStream s, String fileName) throws IOException {
    return new Tika().detect(s, fileName);
}
Example 9
Source File: DataResourceWorker.java From scipio-erp with Apache License 2.0 | 4 votes |
/**
 * Detects the MIME type of the bytes backing a {@link java.nio.ByteBuffer}.
 *
 * <p>The buffer's complete backing array is passed to Tika; the buffer's position
 * and limit are not consulted. {@code ByteBuffer.array()} throws for buffers that
 * do not expose an accessible array (for example read-only buffers).
 *
 * @param buffer an array-backed buffer
 * @return the MIME type reported by {@code Tika.detect(byte[])}
 * @throws IOException declared by the signature
 */
public static String getMimeTypeWithByteBuffer(java.nio.ByteBuffer buffer) throws IOException {
    final byte[] backing = buffer.array();
    return new Tika().detect(backing);
}
Example 10
Source File: ZipTextExtractor.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException { String resultText = ""; ZipInputStream zin = new ZipInputStream(input); ZipEntry entry; while ((entry = zin.getNextEntry()) != null) { if (!entry.isDirectory()) { int size = (int) entry.getSize(); byte[] b = new byte[size]; for(int x = 0; x < size; x++) { int err = zin.read(); if(err != -1) { b[x] = (byte)err; } } String newurl = url + "/"; String fname = entry.getName(); newurl += fname; URL aURL = new URL(newurl); String base = aURL.toString(); int i = fname.lastIndexOf('.'); if (i != -1) { // Trying to resolve the Mime-Type Tika tika = new Tika(); String contentType = tika.detect(fname); try { Metadata metadata = new Metadata(); metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize())); metadata.set(Response.CONTENT_TYPE, contentType); Content content = new Content(newurl, base, b, contentType, metadata, this.conf); Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl()); ParseData theParseData = parse.getData(); Outlink[] theOutlinks = theParseData.getOutlinks(); for(int count = 0; count < theOutlinks.length; count++) { outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor())); } resultText += entry.getName() + " " + parse.getText() + " "; } catch (ParseException e) { if (LOG.isInfoEnabled()) { LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage()); } } } } } return resultText; }
Example 11
Source File: TikaAnalysis.java From tutorials with MIT License | 4 votes |
/**
 * Detects the media type of a stream through the {@code Tika} facade class.
 *
 * @param stream the stream to inspect
 * @return the media type reported by {@code Tika.detect(InputStream)}
 * @throws IOException if Tika fails to read from the stream
 */
public static String detectDocTypeUsingFacade(InputStream stream) throws IOException {
    return new Tika().detect(stream);
}