org.apache.tika.metadata.Metadata#names

Source File: TikaExtractor.java From ache with Apache License 2.0

6 votes

/**
 * Runs the configured Tika parser over {@code stream} and returns the extracted body text
 * together with the first value of every metadata property.
 *
 * @param stream      content to parse
 * @param fileName    file name handed to {@code createMetadata}
 * @param contentType content type handed to {@code createMetadata}
 * @return the extracted text and metadata, or {@code null} when parsing fails
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    final BodyContentHandler bodyHandler = new BodyContentHandler(MAX_CHARACTERS);
    final BoilerpipeContentHandler boilerpipeHandler =
            new BoilerpipeContentHandler(bodyHandler, KeepEverythingExtractor.INSTANCE);
    final Metadata tikaMetadata = createMetadata(fileName, contentType);
    final ParseContext parseContext = new ParseContext();
    try {
        parser.parse(stream, boilerpipeHandler, tikaMetadata, parseContext);

        // Flatten the Tika metadata into a plain map (first value per property).
        final String[] names = tikaMetadata.names();
        final Map<String, String> properties = new HashMap<String, String>();
        for (int i = 0; i < names.length; i++) {
            properties.put(names[i], tikaMetadata.get(names[i]));
        }

        // The text comes from the wrapped body handler, not the Boilerpipe wrapper.
        return new ParsedData(bodyHandler.toString(), properties);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }
}

Source File: NodeTika.java From node-tika with MIT License

6 votes

/**
 * Parses the resource at {@code uri} and returns every metadata property, with all of its
 * values, serialized as a JSON object.
 *
 * @param uri         location of the resource, passed to {@code fillMetadata} and {@code createInputStream}
 * @param contentType content type passed to {@code fillMetadata}
 * @return JSON object mapping each metadata name to the array of its values
 * @throws Exception if the stream cannot be created, parsed or closed
 */
public static String extractMeta(String uri, String contentType) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();

	fillMetadata(parser, metadata, contentType, uri);

	final Map<String, String[]> meta = new HashMap<String, String[]>();
	final TikaInputStream inputStream = createInputStream(uri, metadata);
	try {
		// DefaultHandler discards the body; only the metadata side effect is wanted.
		parser.parse(inputStream, new DefaultHandler(), metadata);

		for (String name : metadata.names()) {
			meta.put(name, metadata.getValues(name));
		}
	} finally {
		// Close even when parsing throws, so the stream is never leaked.
		inputStream.close();
	}

	return new Gson().toJson(meta);
}

Source File: TikaTest.java From tika-server with Apache License 2.0

6 votes

/**
 * Test that in at least one item in metadataList, all keys and values
 * in minExpected are contained.
 * <p>
 * The values in minExpected are tested for whether they are contained
 * within a value in the target.  If minExpected=&quot;text/vbasic&quot;  and
 * what was actually found in the target within metadatalist is
 * &quot;text/vbasic; charset=windows-1252&quot;,
 * that is counted as a hit.
 * <p>
 * Every expected value must be matched by at least one found value under the
 * same key; a value matched several times does not compensate for one that is
 * missing.
 *
 * @param minExpected keys and values that must all be present in a single item
 * @param metadataList candidate items to search
 */
public static void assertContainsAtLeast(Metadata minExpected, List<Metadata> metadataList) {

    final String[] expectedNames = minExpected.names();
    for (Metadata m : metadataList) {
        boolean foundEverything = true;
        for (String n : expectedNames) {
            if (!containsAllExpected(m.getValues(n), minExpected.getValues(n))) {
                foundEverything = false;
                break;
            }
        }
        if (foundEverything) {
            return;
        }
    }
    //TODO: figure out how to have more informative error message
    fail("Couldn't find everything within a single metadata item");
}

/** Returns true when each expected value is a substring of at least one found value. */
private static boolean containsAllExpected(String[] foundVals, String[] expectedVals) {
    for (String expectedVal : expectedVals) {
        boolean matched = false;
        for (String foundVal : foundVals) {
            if (foundVal.contains(expectedVal)) {
                matched = true;
                break;
            }
        }
        if (!matched) {
            return false;
        }
    }
    return true;
}

Source File: TikaContentExtractor.java From baleen with Apache License 2.0

6 votes

/**
 * Parses {@code stream} with Tika's auto-detecting parser, stores the extracted text as the
 * document text of {@code jCas} and records the first value of each metadata property.
 * If parsing fails, a warning is logged and, when no document text has been set, the
 * corrupt-file placeholder text is used instead.
 *
 * @param stream content to parse
 * @param source name of the source, used in the warning message
 * @param jCas   CAS receiving the text and metadata
 * @throws IOException if reading the stream fails
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    final BodyContentHandler bodyHandler = new BodyContentHandler(Integer.MAX_VALUE);
    final Metadata tikaMetadata = new Metadata();
    final ParseContext parseContext = new ParseContext();

    final AutoDetectParser detectingParser = new AutoDetectParser();
    detectingParser.parse(stream, bodyHandler, tikaMetadata, parseContext);

    final String extractedText = bodyHandler.toString();
    jCas.setDocumentText(extractedText);

    final String[] keys = tikaMetadata.names();
    for (int i = 0; i < keys.length; i++) {
      addMetadata(jCas, keys[i], tikaMetadata.get(keys[i]));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    // Fall back to the placeholder only when nothing was set before the failure.
    final boolean noTextYet = Strings.isNullOrEmpty(jCas.getDocumentText());
    if (noTextYet) {
      jCas.setDocumentText(CORRUPT_FILE_TEXT);
    }
  }
}

Source File: FTConnector.java From openprodoc with GNU Affero General Public License v3.0

6 votes

/**
 * Extracts the text of a document with Tika's auto-detecting parser. The text is stored in
 * {@code FullText} and the metadata, rendered as one {@code key=value} line per property,
 * in {@code FileMetadata}.
 *
 * @param Bytes stream with the content of the document
 * @return the current value of {@code FullText}
 * @throws PDException declared for extraction failures; presumably raised by
 *         {@code PDException.GenPDException} - confirm against that helper
 */
protected String Convert(InputStream Bytes) throws PDException
{
try {
    // -1 disables the write limit of the body handler.
    ContentHandler textHandler=new BodyContentHandler(-1);
    Metadata metadata=new Metadata();
    Parser parser=new AutoDetectParser();
    ParseContext context=new ParseContext();
    parser.parse(Bytes, textHandler, metadata, context);
    // Build the metadata listing in a buffer instead of re-copying a String per property.
    StringBuilder metadataLines=new StringBuilder();
    for (String key : metadata.names())
        {
        metadataLines.append(key).append("=").append(metadata.get(key)).append("\n");
        }
    FileMetadata=metadataLines.toString();
    FullText=textHandler.toString();
} catch (Exception ex)
    {
    PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }

return(FullText);
}

Source File: TikaIO.java From beam with Apache License 2.0

5 votes

/**
 * Sets the input metadata for {@link Parser#parse}.
 *
 * <p>When input metadata is already configured, the returned transform carries a merged copy:
 * all existing values, with each property named in {@code metadata} overridden by that
 * property's first value. The metadata object held by this instance is left untouched.
 */
public ParseFiles withInputMetadata(Metadata metadata) {
  Metadata current = this.getInputMetadata();
  if (current == null) {
    return toBuilder().setInputMetadata(metadata).build();
  }

  // Merge into a fresh object: writing into `current` would also change the metadata
  // visible through this (already built) instance.
  Metadata merged = new Metadata();
  for (String name : current.names()) {
    for (String value : current.getValues(name)) {
      merged.add(name, value);
    }
  }
  for (String name : metadata.names()) {
    merged.set(name, metadata.get(name));
  }
  return toBuilder().setInputMetadata(merged).build();
}

Source File: TikaProcessor.java From jesterj with Apache License 2.0

5 votes

/**
 * Extracts text and metadata from the raw bytes of {@code document} with Tika.
 *
 * <p>A document without raw data is returned unchanged. Otherwise the extracted text
 * optionally replaces the raw data and/or is stored in the destination field, and the first
 * value of every metadata property is added under its sanitized name plus suffix. If Tika
 * fails with an {@code IOException} or {@code TikaException}, the document status is set to
 * {@code Status.ERROR}. Security-related exceptions are logged and otherwise ignored; any
 * other throwable is rethrown.
 *
 * @param document the document to process
 * @return a single-element array holding {@code document}
 */
@Override
public Document[] processDocument(Document document) {
  try {
    byte[] rawData = document.getRawData();
    if (rawData == null) {
      log.debug("Skipping document without data in " + getName());
      return new Document[]{document};
    }
    Tika tika = new Tika(tikaConfig);
    // Reuse the array fetched above instead of asking the document again.
    tika.setMaxStringLength(rawData.length);
    Metadata metadata = new Metadata();
    try (ByteArrayInputStream bais = new ByteArrayInputStream(rawData)) {
      String textContent = tika.parseToString(bais, metadata, maxLength);
      if (replaceRaw) {
        document.setRawData(textContent.getBytes(StandardCharsets.UTF_8));
      }
      if (destField != null) {
        document.put(destField, textContent);
      }
      for (String name : metadata.names()) {
        document.put(sanitize(name) + plusSuffix(), metadata.get(name));
      }
    } catch (IOException | TikaException e) {
      log.debug("Tika processing failure!", e);
      // if tika can't parse it we certainly don't want random binary crap in the index
      document.setStatus(Status.ERROR);
    }
  } catch (Throwable t) {
    boolean isAccessControl = t instanceof AccessControlException;
    boolean isSecurity = t instanceof SecurityException;
    if (!isAccessControl && !isSecurity) {
      throw t;
    }
    // Still deliberately not propagated, but leave a trace instead of dropping it silently.
    log.debug("Ignoring security exception raised during Tika processing", t);
  }
  return new Document[]{document};
}

Source File: TikaContentExtractor.java From cxf with Apache License 2.0

5 votes

/**
 * Extracts only the metadata from the input stream and copies the first value of every
 * metadata property into a new {@link SearchBean}.
 *
 * @param in input stream to extract the metadata from
 * @return the extracted metadata as a SearchBean, or null if {@code extractMetadata}
 *         returned no content
 */
public SearchBean extractMetadataToSearchBean(final InputStream in) {
    final TikaContent extracted = extractMetadata(in);
    if (extracted != null) {
        final Metadata tikaMetadata = extracted.getMetadata();
        final SearchBean result = new SearchBean();
        final String[] keys = tikaMetadata.names();
        for (int i = 0; i < keys.length; i++) {
            result.set(keys[i], tikaMetadata.get(keys[i]));
        }
        return result;
    }
    return null;
}

Source File: TearlineContentExtractor.java From baleen with Apache License 2.0

5 votes

/**
 * Parses {@code stream} with Tika's auto-detecting parser and sets the document text of
 * {@code jCas} to the content that precedes the first tearline match (or to the whole
 * content when there is no match), after boilerplate removal and trimming. The first value
 * of each metadata property is recorded as well. Parse failures are logged as warnings.
 *
 * @param stream content to parse
 * @param source name of the source, used in the warning message
 * @param jCas   CAS receiving the text and metadata
 * @throws IOException if reading the stream fails
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    final BodyContentHandler bodyHandler = new BodyContentHandler(Integer.MAX_VALUE);
    final Metadata tikaMetadata = new Metadata();
    final ParseContext parseContext = new ParseContext();

    final AutoDetectParser detectingParser = new AutoDetectParser();
    detectingParser.parse(stream, bodyHandler, tikaMetadata, parseContext);

    final String extracted = bodyHandler.toString();
    final Matcher tearline = tearlinePattern.matcher(extracted);
    // Keep only what comes before the tearline, when one is present.
    final String relevant =
        tearline.find() ? extracted.substring(0, tearline.start()) : extracted;
    jCas.setDocumentText(removeBoilerplate(relevant).trim());

    final String[] keys = tikaMetadata.names();
    for (int i = 0; i < keys.length; i++) {
      addMetadata(jCas, keys[i], tikaMetadata.get(keys[i]));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
  }
}

Source File: TikaLuceneContentExtractor.java From cxf with Apache License 2.0

5 votes

/**
 * Builds a Lucene document from the content and/or metadata extracted from {@code in}.
 *
 * @param in               stream to extract from
 * @param documentMetadata field configuration; the default configuration is used when null
 * @param extractContent   whether to extract the text content
 * @param extractMetadata  whether to add one field per metadata property
 * @return the populated document, or null when the extractor returned no content
 */
private Document extractAll(final InputStream in,
                            LuceneDocumentMetadata documentMetadata,
                            boolean extractContent,
                            boolean extractMetadata) {

    final TikaContent extracted =
        extractor.extract(in, extractContent ? new ToTextContentHandler() : null);
    if (extracted == null) {
        return null;
    }

    final Document result = new Document();
    final LuceneDocumentMetadata effectiveMetadata =
        documentMetadata != null ? documentMetadata : defaultDocumentMetadata;

    if (extracted.getContent() != null) {
        result.add(getContentField(effectiveMetadata, extracted.getContent()));
    }

    if (extractMetadata) {
        final Metadata tikaMetadata = extracted.getMetadata();
        final String[] keys = tikaMetadata.names();
        for (int i = 0; i < keys.length; i++) {
            addField(result, effectiveMetadata, keys[i], tikaMetadata.get(keys[i]));
        }
    }

    // Record the source, when configured, as a stored string field.
    if (!StringUtils.isEmpty(effectiveMetadata.getSource())) {
        final StringField sourceField = new StringField(
            effectiveMetadata.getSourceFieldName(), effectiveMetadata.getSource(), Store.YES);
        result.add(sourceField);
    }

    return result;
}

Source File: ParseResult.java From beam with Apache License 2.0

5 votes

private ParseResult(String fileLocation, String content, Metadata metadata, Throwable error) {
  checkArgument(fileLocation != null, "fileLocation can not be null");
  checkArgument(content != null, "content can not be null");
  checkArgument(metadata != null, "metadata can not be null");
  this.fileLocation = fileLocation;
  this.content = content;
  this.metadata = metadata;
  this.metadataNames = metadata.names();
  this.error = (error == null) ? null : new SerializableThrowable(error);
}

Source File: ElasticsearchSpewerTest.java From datashare with GNU Affero General Public License v3.0

5 votes

/** Copies the first value of every property in {@code metadata} into a new map. */
private Map<String, Object> convert(Metadata metadata) {
    final Map<String, Object> converted = new HashMap<>();
    final String[] keys = metadata.names();
    for (int i = 0; i < keys.length; i++) {
        converted.put(keys[i], metadata.get(keys[i]));
    }
    return converted;
}

Source File: TikaCallable.java From flink-crawler with Apache License 2.0

5 votes

/** Builds a map from each metadata name to the first value stored under that name. */
private static Map<String, String> makeMap(Metadata metadata) {
    final String[] names = metadata.names();
    final Map<String, String> firstValues = new HashMap<String, String>();

    for (int i = 0; i < names.length; i++) {
        final String name = names[i];
        firstValues.put(name, metadata.get(name));
    }

    return firstValues;
}

Source File: TikaAutoInterpreter.java From db with GNU Affero General Public License v3.0

5 votes

/**
 * Reads the file at {@code filePath} with Tika's auto-detecting parser and returns a JSON
 * object holding the extracted text under {@code "_txt"} plus the first value of every
 * metadata property.
 *
 * @param filePath path of the file to read
 * @return JSON object with the text and metadata
 * @throws OperationException if the file cannot be read or parsed, or yields no text
 */
@Override
public JSONObject toJson(String filePath) throws OperationException {

    final AutoDetectParser detectingParser = new AutoDetectParser();
    final BodyContentHandler bodyHandler = new BodyContentHandler();
    final Metadata tikaMetadata = new Metadata();
    try (InputStream in = new FileInputStream(new File(filePath))) {
        detectingParser.parse(in, bodyHandler, tikaMetadata);
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }

    final String text = bodyHandler.toString();
    final boolean hasText = text != null && !text.isEmpty();
    if (!hasText) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Attempting to import an empty document");
    }

    final JSONObject result = new JSONObject();
    result.put("_txt", text);

    for (String key : tikaMetadata.names()) {
        result.put(key, tikaMetadata.get(key));
    }

    return result;
}

Source File: TikaTest.java From tika-server with Apache License 2.0

5 votes

/** Prints every value of every metadata property to standard output as "name : value". */
public static void debug(Metadata metadata) {
    final String[] names = metadata.names();
    for (int i = 0; i < names.length; i++) {
        final String name = names[i];
        final String[] values = metadata.getValues(name);
        for (int j = 0; j < values.length; j++) {
            System.out.println(name + " : " + values[j]);
        }
    }
}

Source File: TikaTest.java From tika-server with Apache License 2.0

5 votes

/**
 * Prints every value of every property of each metadata item to standard output as
 * "index: name : value", where index is the zero-based position of the item in the list.
 */
public static void debug(List<Metadata> list) {
    int position = 0;
    for (Metadata item : list) {
        final String prefix = position + ": ";
        for (String name : item.names()) {
            final String[] values = item.getValues(name);
            for (int j = 0; j < values.length; j++) {
                System.out.println(prefix + name + " : " + values[j]);
            }
        }
        position++;
    }
}

Source File: ExtractMediaMetadata.java From nifi with Apache License 2.0

4 votes

/**
 * Parses {@code sourceStream} with the auto-detecting parser and returns its metadata as a
 * map of attribute name to value.
 *
 * <p>Keys that do not match the configured key filter (when one is set) are skipped.
 * Multi-valued properties are joined with ", "; joining stops with "..." once adding the next
 * value would reach {@code maxAttribLen}. Collection stops as soon as {@code maxAttribs}
 * entries have been gathered, when that limit is provided.
 *
 * @param sourceStream content to parse; it is wrapped in a TikaInputStream that is closed
 *                     after parsing
 * @param prefix       text prepended to every key, or null for none
 * @param maxAttribs   maximum number of entries to return, or null for no limit
 * @param maxAttribLen length limit applied when joining multi-valued properties
 * @return the collected attributes
 * @throws IOException   if reading or closing the stream fails
 * @throws TikaException if Tika cannot parse the content
 * @throws SAXException  if the content handler reports an error
 */
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs,
                                       Integer maxAttribLen) throws IOException, TikaException, SAXException {
    final Metadata metadata = new Metadata();
    try (TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream)) {
        // DefaultHandler discards the body; only the metadata is of interest here.
        autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata);
    }

    final Map<String, String> results = new HashMap<>();
    final Pattern metadataKeyFilter = metadataKeyFilterRef.get();
    final StringBuilder dataBuilder = new StringBuilder();
    for (final String key : metadata.names()) {
        if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) {
            continue;
        }
        dataBuilder.setLength(0);
        if (metadata.isMultiValued(key)) {
            for (String val : metadata.getValues(key)) {
                // Separate from whatever was collected so far; testing for "> 0" (not "> 1")
                // keeps the separator when the first value is a single character.
                if (dataBuilder.length() > 0) {
                    dataBuilder.append(", ");
                }
                if (dataBuilder.length() + val.length() < maxAttribLen) {
                    dataBuilder.append(val);
                } else {
                    dataBuilder.append("...");
                    break;
                }
            }
        } else {
            dataBuilder.append(metadata.get(key));
        }
        final String attributeName = (prefix == null) ? key : prefix + key;
        results.put(attributeName, dataBuilder.toString().trim());

        // cutoff at max if provided
        if (maxAttribs != null && results.size() >= maxAttribs) {
            break;
        }
    }
    return results;
}

Source File: MP3Reader.java From red5-io with Apache License 2.0

4 votes

/**
 * Creates reader from file input stream.
 * <p>
 * The file is first run through Tika's {@code Mp3Parser} and the recognised properties
 * (artist, album, title, genre, comment, track, year, duration, sample rate, channels) are
 * copied into the metadata holder. A failure in that step is logged and does not abort
 * construction. Afterwards the header is validated, key frames are analysed, the file
 * metadata tag is queued as the first tag and the read channel is obtained from the stream.
 * 
 * @param file
 *            file input
 * @throws IOException
 *             on IO error
 */
public MP3Reader(File file) throws IOException {
    this.file = file;
    fis = new FileInputStream(file);
    try {
        // parse the ID3 info
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        // MP3 parser
        Mp3Parser parser = new Mp3Parser();
        parser.parse(fis, handler, metadata, null);
        log.debug("Contents of the document: {}", handler.toString());
        // create meta data holder
        metaData = new MetaData();
        for (String name : metadata.names()) {
            final String val = metadata.get(name);
            log.debug("Meta name: {} value: {}", name, val);
            if ("xmpDM:artist".equals(name)) {
                metaData.setArtist(val);
            } else if ("xmpDM:album".equals(name)) {
                metaData.setAlbum(val);
            } else if ("title".equals(name)) {
                metaData.setSongName(val);
            } else if ("xmpDM:genre".equals(name)) {
                metaData.setGenre(val);
            } else if ("xmpDM:logComment".equals(name)) {
                metaData.setComment(val);
            } else if ("xmpDM:trackNumber".equals(name)) {
                metaData.setTrack(val);
            } else if ("xmpDM:releaseDate".equals(name)) {
                metaData.setYear(val);
            } else if ("xmpDM:duration".equals(name) || "duration".equals(name)) {
                metaData.setDuration(val);
            } else if ("xmpDM:audioSampleRate".equals(name) || "samplerate".equals(name)) {
                metaData.setSampleRate(val);
            } else if ("channels".equals(name)) {
                metaData.setChannels(val);
            }
        }
        // NOTE: cover art is not extracted here. An earlier, disabled implementation read the
        // ID3 artwork frame and queued it as an "onImageData" metadata tag in firstTags.
    } catch (Exception e) {
        // Pass the exception as the throwable argument; a "{}" placeholder would stay unfilled.
        log.error("MP3Reader metadata parsing failed", e);
    }
    // ensure we have a valid sample rate
    checkValidHeader();
    // get the total bytes / file size
    fileSize = file.length();
    log.debug("File size: {}", fileSize);
    // analyze keyframes data
    analyzeKeyFrames();
    // create file metadata object
    firstTags.addFirst(createFileMeta());
    log.trace("File input stream - open: {} position: {}", fis.getChannel().isOpen(), fis.getChannel().position());
    // create a channel for reading
    fileChannel = fis.getChannel();
}

Source File: S3River.java From es-amazon-s3-river with Apache License 2.0

4 votes

/**
 * Indexes one Amazon S3 object.
 * <p>
 * With JSON support enabled the raw content is indexed under the S3 key. Otherwise the
 * content is parsed with Tika and indexed, under an id derived from the S3 key, as a JSON
 * document carrying title, modification date, source URL, S3 user metadata and a "file"
 * object with the parsed text and Tika metadata.
 *
 * @return the index id when a Tika-parsed document was indexed; null in every other case
 *         (JSON support enabled, no content, or an exception, which is logged as a warning)
 */
private String indexFile(S3ObjectSummary summary){
   if (logger.isDebugEnabled()){
      logger.debug("Trying to index '{}'", summary.getKey());
   }

   try{
      // Build a unique id from S3 unique summary key.
      final String fileId = buildIndexIdFromS3Key(summary.getKey());

      if (feedDefinition.isJsonSupport()){
         esIndex(indexName, typeName, summary.getKey(), s3.getContent(summary));
         return null;
      }

      final byte[] fileContent = s3.getContent(summary);
      if (fileContent == null) {
         return null;
      }

      // Compute number of chars to index.
      // see https://github.com/lbroudoux/es-amazon-s3-river/issues/36
      final int indexedChars = feedDefinition.getIndexedCharsRatio() > 0
            ? (int) Math.round(fileContent.length * feedDefinition.getIndexedCharsRatio())
            : 100000;

      // Parse content using Tika directly.
      final Metadata tikaMetadata = new Metadata();
      final String parsedContent = TikaHolder.tika().parseToString(
            new BytesStreamInput(fileContent), tikaMetadata, indexedChars);

      // Store Tika metadatas into a map.
      final Map<String, Object> metadataMap = new HashMap<String, Object>();
      final String[] names = tikaMetadata.names();
      for (int i = 0; i < names.length; i++) {
         metadataMap.put(names[i], tikaMetadata.get(names[i]));
      }

      // Last path segment of the key, used as title and file name.
      final String fileName = summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1);

      esIndex(indexName, typeName, fileId,
            jsonBuilder()
                  .startObject()
                     .field(S3RiverUtil.DOC_FIELD_TITLE, fileName)
                     .field(S3RiverUtil.DOC_FIELD_MODIFIED_DATE, summary.getLastModified().getTime())
                     .field(S3RiverUtil.DOC_FIELD_SOURCE_URL, s3.getDownloadUrl(summary, feedDefinition))
                     .field(S3RiverUtil.DOC_FIELD_METADATA, s3.getS3UserMetadata(summary.getKey()))
                     .startObject("file")
                        .field("_name", fileName)
                        .field("title", fileName)
                        .field("file", parsedContent)
                        .field("metadata", metadataMap)
                     .endObject()
                  .endObject()
      );
      return fileId;
   } catch (Exception e) {
      logger.warn("Can not index " + summary.getKey() + " : " + e.getMessage());
   }
   return null;
}

Source File: DocUtils.java From geoportal-server-harvester with Apache License 2.0

4 votes

/**
 * Parses {@code file_bytes} with Tika and renders the extracted metadata as an XML document
 * encoded in UTF-8.
 * <p>
 * All non-empty metadata values are collected as properties. The description, modified and
 * title entries expected by the harvester are then set from the matching Tika core
 * properties, defaulting to a dump of all collected properties, a "TIKA_" label with
 * today's date, and {@code file_name} respectively.
 *
 * @param file_bytes content of the file
 * @param file_name  name used as the title when Tika provides none
 * @return the XML bytes, or null when any step fails (the error is logged)
 * @throws IOException if closing the input stream fails
 */
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException {

    // Input & output
    ByteArrayInputStream input  = new ByteArrayInputStream(file_bytes);
    byte[]               result = null;

    // Tika parser objects
    Parser             tikaParser   = new AutoDetectParser();
    BodyContentHandler bodyHandler  = new BodyContentHandler();
    Metadata           tikaMetadata = new Metadata();
    ParseContext       parseContext = new ParseContext();

    try {
        // Populate the metadata object
        tikaParser.parse(input, bodyHandler, tikaMetadata, parseContext);

        // Copy every non-empty metadata value into the properties
        Properties props = new Properties();
        for (String key : tikaMetadata.names()) {
            String value = tikaMetadata.get(key);
            if (!value.isEmpty()) {
                props.put(key, value);
            }
        }

        // Textual dump of the properties, used as the fallback description
        StringWriter dump = new StringWriter();
        props.store(dump, "Tika Values");

        // Properties the harvester expects
        String description = tikaMetadata.get(TikaCoreProperties.DESCRIPTION);
        String modified    = tikaMetadata.get(TikaCoreProperties.MODIFIED);
        String title       = tikaMetadata.get(TikaCoreProperties.TITLE);

        // Default label for an undefined modification date
        DateFormat dayFormat    = new SimpleDateFormat("yyyy/MM/dd");
        String     today        = dayFormat.format(new Date());
        String     defaultLabel = String.format("TIKA_%s", today);

        // Fall back to the defaults for whatever Tika did not provide
        props.put(WKAConstants.WKA_DESCRIPTION, description == null ? dump.toString() : description);
        props.put(WKAConstants.WKA_MODIFIED, modified == null ? defaultLabel : modified);
        props.put(WKAConstants.WKA_TITLE, title == null ? file_name : title);

        // Build XML as bytes
        MapAttribute attributes = AttributeUtils.fromProperties(props);
        Document xmlDocument = new SimpleDcMetaBuilder().create(attributes);
        result = XmlUtils.toString(xmlDocument).getBytes("UTF-8");

    } catch (Exception ex) {
        LOG.error("Error reading data.", ex);
    } finally {
        input.close();
    }

    return result;
}

Java Code Examples for org.apache.tika.metadata.Metadata#names()