org.apache.tika.mime.MediaType#toString

Source File: DetectMimeTypeBuilder.java From kite with Apache License 2.0

7 votes

/**
 * Runs the configured detector over the given stream and metadata and returns
 * the detected media type in its string form.
 * <p>
 * Per the detector contract this yields <code>application/octet-stream</code>
 * when the type cannot be determined. The detector may read bytes from the
 * start of the stream while detecting.
 *
 * @param in the content to inspect; passed to the detector as-is
 * @param metadata additional hints for the detector; passed as-is
 * @param excludeParameters if <code>true</code>, everything from the first
 *        <code>';'</code> onwards (the media type parameters) is cut off
 * @return detected media type, optionally without its parameters
 * @throws MorphlineRuntimeException wrapping any {@link IOException} raised
 *         by the detector
 */
private String getMediaType(InputStream in, Metadata metadata, boolean excludeParameters) {
  final MediaType detected;
  try {
    detected = getDetector().detect(in, metadata);
  } catch (IOException e) {
    throw new MorphlineRuntimeException(e);
  }

  final String fullType = detected.toString();
  if (!excludeParameters) {
    return fullType;
  }

  // Parameters, when present, follow the first semicolon.
  final int separator = fullType.indexOf(';');
  return separator < 0 ? fullType : fullType.substring(0, separator);
}

Source File: XlsDetector.java From data-prep with Apache License 2.0

6 votes

/**
 * Inspects an input stream and reports whether its content is in XLS format.
 * <p>
 * Detection is first delegated to the parent detector. When that yields no
 * media type, or the {@link FormatUtils#UNKNOWN_MEDIA_TYPE unknown} one, the
 * zip container detector is tried with a fresh {@link Metadata}.
 * <p>
 * The general contract of a detector is not to close the specified stream
 * before returning; closing it is the caller's responsibility. The detector
 * should rely on the mark/reset feature of the {@link TikaInputStream} so the
 * stream keeps returning the same bytes.
 *
 * @param metadata the specified TIKA {@link Metadata}
 * @param inputStream the specified input stream
 * @return an XLS {@link Format} using the default encoding, or {@code null}
 *         when the detected media type is not an XLS one
 * @throws IOException if one of the underlying detectors fails to read the stream
 */
@Override
public Format detect(Metadata metadata, TikaInputStream inputStream) throws IOException {

    MediaType detected = super.detect(inputStream, metadata);

    // Fall back to the zip container detector when the first pass is inconclusive.
    final boolean inconclusive = detected == null
            || StringUtils.equals(detected.toString(), FormatUtils.UNKNOWN_MEDIA_TYPE);
    if (inconclusive) {
        detected = zipContainerDetector.detect(inputStream, new Metadata());
    }

    if (detected == null) {
        return null;
    }

    final String detectedName = detected.toString();
    final boolean isXls = StringUtils.startsWith(detectedName, XlsFormatFamily.MEDIA_TYPE)
            || StringUtils.equals(detectedName, OLD_XLS_MEDIA_TYPE);

    return isXls ? new Format(xlsFormatFamily, FormatUtils.DEFAULT_ENCODING) : null;
}

Source File: CSVDetector.java From data-prep with Apache License 2.0

6 votes

/**
 * Checks whether the stream content is plain text and, if so, reports it as
 * CSV together with its character encoding.
 * <p>
 * The encoding is taken from the encoding detector; when that detector
 * returns nothing or fails with an {@link IOException} (which is logged at
 * debug level), {@link FormatUtils#DEFAULT_ENCODING} is used instead.
 *
 * @param metadata the specified TIKA {@link Metadata}
 * @param inputStream the specified input stream
 * @return a CSV {@link Format}, or {@code null} when the detected media type
 *         is missing or does not start with {@code TEXT_PLAIN}
 * @throws IOException if the media type detection itself fails
 */
private Format detectText(Metadata metadata, InputStream inputStream) throws IOException {
    final MediaType detected = mimeTypes.detect(inputStream, metadata);
    if (detected == null) {
        return null;
    }
    if (!StringUtils.startsWith(detected.toString(), TEXT_PLAIN)) {
        return null;
    }

    Charset detectedCharset = null;
    try {
        detectedCharset = encodingDetector.detect(inputStream, metadata);
    } catch (IOException e) {
        // Best effort only: an undetectable encoding falls back to the default below.
        LOGGER.debug("Unable to detect the encoding for a data set in CSV format", e);
    }

    final String encoding = detectedCharset != null ? detectedCharset.name() : FormatUtils.DEFAULT_ENCODING;
    return new Format(csvFormatFamily, encoding);
}

Source File: JSoupParserBolt.java From storm-crawler with Apache License 2.0

6 votes

/**
 * Asks the Tika detector for the media type of a fetched document.
 * <p>
 * The detector is given the raw bytes plus three clues in the metadata: the
 * content type announced by the server (only when not blank), the full URL as
 * resource name, and the content length.
 *
 * @param URL the URL the content was fetched from
 * @param httpCT the content type reported over HTTP; ignored when blank
 * @param content the document bytes
 * @return the detected media type in string form
 * @throws IllegalStateException if reading the in-memory stream raises an
 *         {@link IOException}
 */
public String guessMimeType(String URL, String httpCT, byte[] content) {

    final org.apache.tika.metadata.Metadata clues = new org.apache.tika.metadata.Metadata();

    // pass content type from server as a clue
    if (StringUtils.isNotBlank(httpCT)) {
        clues.set(org.apache.tika.metadata.Metadata.CONTENT_TYPE, httpCT);
    }

    // use full URL as a clue
    clues.set(org.apache.tika.metadata.Metadata.RESOURCE_NAME_KEY, URL);

    final String length = Integer.toString(content.length);
    clues.set(org.apache.tika.metadata.Metadata.CONTENT_LENGTH, length);

    try (InputStream bytes = new ByteArrayInputStream(content)) {
        return detector.detect(bytes, clues).toString();
    } catch (IOException e) {
        throw new IllegalStateException("Unexpected IOException", e);
    }
}

Source File: MimetypeMap.java From alfresco-data-model with GNU Lesser General Public License v3.0

5 votes

/**
 * Use Apache Tika to check if the mime type of the document really matches
 * what it claims to be. This is typically used when a transformation or
 * metadata extraction fails, and you want to know if someone has renamed a
 * file and consequently it has the wrong mime type.
 * <p>
 * The claimed type (from {@code reader.getMimetype()}) is accepted when it
 * equals the detected type, when either one is a specialization of the other
 * according to the Tika media type registry, or when it equals one of the
 * registry's aliases of the detected type.
 * <p>
 * A claimed mimetype that is {@code null} or cannot be parsed by
 * {@link MediaType#parse(String)} can match neither a specialization nor an
 * alias, so the detected type is reported for it.
 *
 * @param reader the content whose claimed mimetype is to be verified
 * @return Null if the mime type seems ok (or Tika cannot tell), otherwise
 *         the mime type it probably is
 */
public String getMimetypeIfNotMatches(ContentReader reader)
{
    MediaType type = detectType(null, reader);
    if (type == null)
    {
        // Tika doesn't know so we can't help, sorry...
        return null;
    }

    String detectedMimetype = type.toString();
    String claimedMimetype = reader.getMimetype();

    // Is it a good match?
    if (detectedMimetype.equals(claimedMimetype))
    {
        return null;
    }

    // MediaType.parse yields null for a missing or malformed mimetype; such a
    // claim cannot be "close" to anything, and dereferencing it would throw.
    MediaType claimed = MediaType.parse(claimedMimetype);
    if (claimed != null)
    {
        // Is it close?
        if (tikaConfig.getMediaTypeRegistry().isSpecializationOf(claimed, type)
                || tikaConfig.getMediaTypeRegistry().isSpecializationOf(type, claimed))
        {
            // Probably close enough
            return null;
        }

        // Check through known aliases of the type
        String claimedName = claimed.toString();
        for (MediaType alias : tikaConfig.getMediaTypeRegistry().getAliases(type))
        {
            if (alias.toString().equals(claimedName))
            {
                return null;
            }
        }
    }

    // If we get here, then most likely the type is wrong
    return detectedMimetype;
}

Source File: TikaAnalysis.java From tutorials with MIT License

5 votes

/**
 * Detects the media type of a stream with a freshly created
 * {@link DefaultDetector} and an empty {@link Metadata}.
 *
 * @param stream the content to inspect
 * @return the detected media type in string form
 * @throws IOException if the detector fails to read the stream
 */
public static String detectDocTypeUsingDetector(InputStream stream) throws IOException {
    final Detector defaultDetector = new DefaultDetector();
    return defaultDetector.detect(stream, new Metadata()).toString();
}

Source File: AssetMimeHandler.java From usergrid with Apache License 2.0

5 votes

/**
 * Get the Mime type of an Asset. If the Asset's file metadata already has the "content-type" property set, that
 * value is returned. Otherwise the Apache Tika library is used to do file type detection on the supplied content,
 * and a successfully detected type is stored back into the file metadata.
 * <p>
 * Only {@code byte[]} and {@link File} content is inspected; for any other content, or when detection fails with
 * an {@link IOException} (which is logged), the octet-stream type is returned.
 *
 * @param entity the entity whose file metadata is consulted and updated
 * @param type the asset content, either a {@code byte[]} or a {@link File}
 *
 * @return A string representation of the content type suitable for use in an HTTP header. Eg. "image/jpeg" for a
 *         jpeg image.
 */
public <T> String getMimeType( Entity entity, T type ) {

    final Map<String, Object> fileMetadata = AssetUtils.getFileMetadata( entity );
    final Object storedContentType = fileMetadata.get( AssetUtils.CONTENT_TYPE );
    if ( storedContentType != null ) {
        return ( String ) storedContentType;
    }

    final boolean isBytes = type instanceof byte[];
    if ( !isBytes && !( type instanceof File ) ) {
        // Nothing we know how to inspect.
        return MediaType.OCTET_STREAM.toString();
    }

    MediaType detected = MediaType.OCTET_STREAM;
    try {
        if ( isBytes ) {
            detected = detector.detect( new ByteArrayInputStream( ( byte[] ) type ), new Metadata() );
        }
        else {
            final InputStream fileStream = new BufferedInputStream( new FileInputStream( ( File ) type ) );
            try {
                detected = detector.detect( fileStream, new Metadata() );
            }
            finally {
                fileStream.close();
            }
        }

        // Remember the result so later calls can skip detection.
        fileMetadata.put( AssetUtils.CONTENT_TYPE, detected.toString() );
    }
    catch ( IOException e ) {
        logger.error( "error detecting mime type", e );
    }

    return detected.toString();
}

Source File: TikaAutoMetadataExtracterTest.java From alfresco-repository with GNU Lesser General Public License v3.0

4 votes

/**
 * Test several different files
 * Note - doesn't use extractFromMimetype
 * <p>
 * For each "quick" sample file the mimetype is obtained from Tika's detector,
 * the file is run through {@code extractFromFile}, and the resulting
 * properties are checked to be non-empty and to pass the common and
 * file-type specific metadata checks.
 */
public void testSupportedMimetypes() throws Exception
{
    String[] testFiles = new String[] {
          ".doc", ".docx", ".xls", ".xlsx",
          ".ppt", ".pptx", 
          //".vsd", // Our sample file lacks suitable metadata
          "2010.dwg",
          "2003.mpp", "2007.mpp",
          ".pdf",
          ".odt",
    };
       
    AutoDetectParser ap = new AutoDetectParser();
    for (String fileBase : testFiles)
    {
       String filename = "quick" + fileBase;
       URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
       File file = new File(url.getFile());
       
       // Cheat and ask Tika for the mime type!
       Metadata metadata = new Metadata();
       metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
       String mimetype;
       // Close the stream once detection is done so no file handle is leaked per sample file
       TikaInputStream detectionStream = TikaInputStream.get(file);
       try
       {
          MediaType mt = ap.getDetector().detect(detectionStream, metadata);
          mimetype = mt.toString();
       }
       finally
       {
          detectionStream.close();
       }
       
       if (logger.isDebugEnabled())
       {
          logger.debug("Detected mimetype " + mimetype + " for quick test file " + filename);
       }

       // Have it processed
       Map<QName, Serializable> properties = extractFromFile(file, mimetype);
       
       // check we got something
       assertFalse("extractFromMimetype should return at least some properties, " +
             "none found for " + mimetype + " - " + filename,
          properties.isEmpty());
       
       // check common metadata
       testCommonMetadata(mimetype, properties);
       // check file-type specific metadata
       testFileSpecificMetadata(mimetype, properties);
    }
}

Source File: TikaAutoMetadataExtracterTest.java From alfresco-repository with GNU Lesser General Public License v3.0

4 votes

/**
 * Test MNT-15219 Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may
 * cause OutOfMemory in Tika Note - doesn't use extractFromMimetype
 * <p>
 * The sample file's mimetype is obtained from Tika's detector, the file is run through
 * {@code extractFromFile}, and the author and Tika mimetype test properties are verified.
 */
public void testParsingOfShapesInXLSXFiles() throws Exception
{
    AutoDetectParser ap = new AutoDetectParser();

    String filename = "dmsu1332-reproduced.xlsx";
    URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
    File file = new File(url.getFile());

    // Cheat and ask Tika for the mime type!
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    String mimetype;
    // Close the stream once detection is done so the file handle is not leaked
    TikaInputStream detectionStream = TikaInputStream.get(file);
    try
    {
        MediaType mt = ap.getDetector().detect(detectionStream, metadata);
        mimetype = mt.toString();
    }
    finally
    {
        detectionStream.close();
    }

    if (logger.isDebugEnabled())
    {
        logger.debug("Detected mimetype " + mimetype + " for quick test file " + filename);
    }

    // Have it processed
    // see MNT-15219 and REPO-3251
    Map<QName, Serializable> properties = extractFromFile(file, mimetype);

    // check we got something
    assertFalse("extractFromMimetype should return at least some properties, none found for " + mimetype + " - " + filename, 
            properties.isEmpty());

    if (properties.containsKey(ContentModel.PROP_AUTHOR))
    {
        assertEquals("Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype, 
                "Udintsev, Anton (external - Project)",
                DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
    }
    else
    {
        // NOTE(review): the message mentions PROP_CREATOR, but only PROP_AUTHOR is checked above - confirm intent
        fail("Expected one property out of " + ContentModel.PROP_CREATOR + " and " + ContentModel.PROP_AUTHOR + " but found neither of them for "
                + mimetype);
    }

    // Ensure that we can also get things which are standard
    // Tika metadata properties, if we so choose to
    assertTrue("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype, 
            properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY));
    assertEquals("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype, 
            mimetype,
            DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY)));
}

Source File: MimetypeMap.java From alfresco-data-model with GNU Lesser General Public License v3.0

4 votes

/**
 * Uses Tika to try to identify the mimetype of the file, falling back on
 * {@link #guessMimetype(String)} for an extension based one if Tika can't
 * help.
 * <p>
 * The filename based guess wins when Tika reports nothing, octet-stream or
 * the encrypted office type, and also when Tika reports plain text or XML
 * while the filename guess is something other than the binary mimetype.
 * Otherwise the Tika type (without parameters) is returned if it is a
 * configured mimetype, else the first of its aliases that is configured,
 * else the Tika type itself.
 *
 * @param filename the name of the file, used for the extension based guess
 * @param input the file content handed to the Tika detection
 * @return the best mimetype guess for the file
 */
public String guessMimetype(String filename, InputStream input)
{
    MediaType detected = detectType(filename, input);
    final String byFilename = guessMimetype(filename);

    // If Tika doesn't know what the type is, or file is password protected, go with the filename one
    if (detected == null
            || MediaType.OCTET_STREAM.equals(detected)
            || MIMETYPE_ENCRYPTED_OFFICE.equals(detected.toString()))
    {
        return byFilename;
    }

    // If Tika has supplied a very generic type, go with the filename one,
    // as it's probably a custom Text or XML format known only to Alfresco
    final boolean genericType = MediaType.TEXT_PLAIN.equals(detected) || MediaType.APPLICATION_XML.equals(detected);
    if (genericType && !byFilename.equals(MIMETYPE_BINARY))
    {
        return byFilename;
    }

    // Alfresco doesn't support mimetype parameters
    // Use the form of the mimetype without any
    if (detected.hasParameters())
    {
        detected = detected.getBaseType();
    }

    // Not all the mimetypes we use are the Tika Canonical one.
    // So, detect when this happens and use ours in preference
    final String tikaName = detected.toString();
    final Data known = getData();
    if (known.mimetypes.contains(tikaName))
    {
        // Alfresco and Tika agree!
        return tikaName;
    }

    // Check the aliases
    for (MediaType alias : tikaConfig.getMediaTypeRegistry().getAliases(detected))
    {
        final String aliasName = alias.toString();
        if (known.mimetypes.contains(aliasName))
        {
            return aliasName;
        }
    }

    // If we get here, then Tika has identified something that
    // Alfresco doesn't really know about. Just trust Tika on it
    logger.info("Tika detected a type of " + tikaName + " for file " + filename
            + " which Alfresco doesn't know about. Consider " + " adding that type to your configuration");
    return tikaName;
}

Java Code Examples for org.apache.tika.mime.MediaType#toString()