org.apache.tika.parser.html.HtmlMapper Java Examples

The following examples show how to use org.apache.tika.parser.html.HtmlMapper. Each example is taken from an open-source project; the source file, project, and license are listed above the code.
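All of the examples below share the same core idiom: register an HtmlMapper in a ParseContext and hand that context to the parser. Here is a minimal standalone sketch of that idiom (the class name HtmlMapperBasics and the inline HTML string are illustrative; it assumes a Tika distribution with the HTML parser on the classpath). The default mapping keeps only a "safe" subset of tags and drops elements such as <script>:

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.DefaultHtmlMapper;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.ToHTMLContentHandler;

public class HtmlMapperBasics {
    public static void main(String[] args) throws Exception {
        ParseContext context = new ParseContext();
        // Register the mapper that decides which HTML elements survive parsing.
        context.set(HtmlMapper.class, DefaultHtmlMapper.INSTANCE);

        String html = "<html><body><p>Hello <b>world</b></p><script>alert(1)</script></body></html>";
        ToHTMLContentHandler handler = new ToHTMLContentHandler();
        try (InputStream in = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8))) {
            new HtmlParser().parse(in, handler, new Metadata(), context);
        }
        // The <p> content is kept; the <script> element is discarded by the default mapping.
        System.out.println(handler.toString());
    }
}

Swapping in IdentityHtmlMapper.INSTANCE instead would pass every element through unchanged, which is what Examples #3 and #4 below rely on.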
Example #1
Source File: TikaCallable.java    From flink-crawler with Apache License 2.0
/**
 * Decide whether we need to set up our own HtmlMapper, because the link extractor uses tags that aren't part
 * of the default set.
 * 
 * @return a ParseContext, with a custom HtmlMapper registered if the link extractor needs non-default tags
 */
private ParseContext makeParseContext() {
    ParseContext result = new ParseContext();

    Set<String> validTags = _linkExtractor.getLinkTags();
    HtmlMapper defaultMapper = DefaultHtmlMapper.INSTANCE;
    for (String tag : validTags) {
        if (defaultMapper.mapSafeElement(tag) == null) {
            result.set(HtmlMapper.class,
                    new CustomHtmlMapper(validTags, _linkExtractor.getLinkAttributeTypes()));
            break;
        }
    }

    return result;
}
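The CustomHtmlMapper used above is part of flink-crawler itself. As a rough, hypothetical stand-in, a mapper that widens the default safe set can delegate to DefaultHtmlMapper and pass extra tags through; the class name ExtraTagsHtmlMapper and its constructor below are assumptions for illustration, not part of Tika or flink-crawler.

import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

import org.apache.tika.parser.html.DefaultHtmlMapper;
import org.apache.tika.parser.html.HtmlMapper;

/** Hypothetical mapper: Tika's default "safe" mapping plus a caller-supplied set of extra tags. */
public class ExtraTagsHtmlMapper implements HtmlMapper {

    private final HtmlMapper delegate = DefaultHtmlMapper.INSTANCE;
    private final Set<String> extraTags = new HashSet<>();

    public ExtraTagsHtmlMapper(Set<String> tags) {
        // Normalize case so matching doesn't depend on how element names are reported.
        for (String tag : tags) {
            extraTags.add(tag.toUpperCase(Locale.ROOT));
        }
    }

    @Override
    public String mapSafeElement(String name) {
        String mapped = delegate.mapSafeElement(name);
        if (mapped == null && extraTags.contains(name.toUpperCase(Locale.ROOT))) {
            return name.toLowerCase(Locale.ROOT); // pass the extra tag through
        }
        return mapped;
    }

    @Override
    public boolean isDiscardElement(String name) {
        // Never discard a tag the caller explicitly asked for.
        return !extraTags.contains(name.toUpperCase(Locale.ROOT))
                && delegate.isDiscardElement(name);
    }

    @Override
    public String mapSafeAttribute(String elementName, String attributeName) {
        return delegate.mapSafeAttribute(elementName, attributeName);
    }
}

It would then be registered exactly as in the example: result.set(HtmlMapper.class, new ExtraTagsHtmlMapper(validTags)).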
 
Example #2
Source File: Extractor.java    From extract with MIT License
/**
 * Create a {@link TikaDocument} for the file at the given path and attach a pull-parsing reader to it.
 *
 * @param path the path of the file to extract from
 * @return a TikaDocument whose reader pull-parses the extracted content
 * @throws IOException if the file cannot be read
 */
public TikaDocument extract(final Path path) throws IOException {
	final TikaDocument rootDocument = documentFactory.create(path);
	TikaInputStream tikaInputStream = TikaInputStream.get(path, rootDocument.getMetadata());
	final ParseContext context = new ParseContext();
	final AutoDetectParser autoDetectParser = new AutoDetectParser(defaultParser);

	// Set a fallback parser that outputs an empty tikaDocument for empty files;
	// without it, parsing an empty file would throw an exception.
	autoDetectParser.setFallback(FallbackParser.INSTANCE);
	final Parser parser;

	if (null != digester) {
		parser = new DigestingParser(autoDetectParser, digester);
	} else {
		parser = autoDetectParser;
	}

	if (!ocrDisabled) {
		context.set(TesseractOCRConfig.class, ocrConfig);
	}

	context.set(PDFParserConfig.class, pdfConfig);

	// Only include "safe" tags in the HTML output from Tika's HTML parser.
	// This excludes script tags and objects.
	context.set(HtmlMapper.class, DefaultHtmlMapper.INSTANCE);

	final Reader reader;
	final Function<Writer, ContentHandler> handler;

	if (OutputFormat.HTML == outputFormat) {
		handler = (writer) -> new ExpandedTitleContentHandler(new HTML5Serializer(writer));
	} else {

		// The default BodyContentHandler is used when constructing the ParsingReader for text output, but
		// because only the body of embeds is pushed to the content handler further down the line, we can't
		// expect a body tag.
		handler = WriteOutContentHandler::new;
	}

	if (EmbedHandling.SPAWN == embedHandling) {
		context.set(Parser.class, parser);
		context.set(EmbeddedDocumentExtractor.class, new EmbedSpawner(rootDocument, context, embedOutput, handler));
	} else if (EmbedHandling.CONCATENATE == embedHandling) {
		context.set(Parser.class, parser);
		context.set(EmbeddedDocumentExtractor.class, new EmbedParser(rootDocument, context));
	} else {
		context.set(Parser.class, EmptyParser.INSTANCE);
		context.set(EmbeddedDocumentExtractor.class, new EmbedBlocker());
	}

	// The ParsingReader constructor starts parsing the document on a background thread.
	if (OutputFormat.HTML == outputFormat) {
		reader = new ParsingReader(parser, tikaInputStream, rootDocument.getMetadata(), context, handler);
	} else {
		reader = new org.apache.tika.parser.ParsingReader(parser, tikaInputStream, rootDocument.getMetadata(), context);
	}
	rootDocument.setReader(reader);

	return rootDocument;
}
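The method above builds a pull-parsing pipeline around the mapper. A stripped-down sketch of the same idea using only standard Tika classes (the file name page.html is a placeholder and error handling is omitted):

import java.io.BufferedReader;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ParsingReader;
import org.apache.tika.parser.html.DefaultHtmlMapper;
import org.apache.tika.parser.html.HtmlMapper;

public class PullParsingSketch {
    public static void main(String[] args) throws Exception {
        ParseContext context = new ParseContext();
        // Only "safe" tags reach the handler; script tags and objects are excluded,
        // as noted in the extract() method above.
        context.set(HtmlMapper.class, DefaultHtmlMapper.INSTANCE);

        InputStream stream = Files.newInputStream(Paths.get("page.html"));
        // The ParsingReader constructor kicks off parsing on a background thread;
        // reading from it pulls the extracted text as it becomes available.
        try (BufferedReader reader = new BufferedReader(
                new ParsingReader(new AutoDetectParser(), stream, new Metadata(), context))) {
            reader.lines().forEach(System.out::println);
        }
    }
}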
 
Example #3
Source File: ParserBolt.java    From storm-crawler with Apache License 2.0
@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
public void prepare(Map conf, TopologyContext context,
        OutputCollector collector) {

    emitOutlinks = ConfUtils.getBoolean(conf, "parser.emitOutlinks", true);

    urlFilters = URLFilters.fromConf(conf);

    parseFilters = ParseFilters.fromConf(conf);

    upperCaseElementNames = ConfUtils.getBoolean(conf,
            "parser.uppercase.element.names", true);

    extractEmbedded = ConfUtils.getBoolean(conf, "parser.extract.embedded",
            false);

    String htmlmapperClassName = ConfUtils.getString(conf,
            "parser.htmlmapper.classname",
            "org.apache.tika.parser.html.IdentityHtmlMapper");

    try {
        HTMLMapperClass = Class.forName(htmlmapperClassName);
        boolean interfaceOK = HtmlMapper.class
                .isAssignableFrom(HTMLMapperClass);
        if (!interfaceOK) {
            throw new RuntimeException("Class " + htmlmapperClassName
                    + " does not implement HtmlMapper");
        }
    } catch (ClassNotFoundException e) {
        LOG.error("Can't load class {}", htmlmapperClassName);
        throw new RuntimeException("Can't load class "
                + htmlmapperClassName);
    }

    mimeTypeWhiteList = ConfUtils.loadListFromConf(
            "parser.mimetype.whitelist", conf);

    protocolMDprefix = ConfUtils.getString(conf,
            ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, "");

    // instantiate Tika
    long start = System.currentTimeMillis();
    tika = new Tika();
    long end = System.currentTimeMillis();

    LOG.debug("Tika loaded in {} msec", end - start);

    this.collector = collector;

    this.eventCounter = context.registerMetric(this.getClass()
            .getSimpleName(), new MultiCountMetric(), 10);

    this.metadataTransfer = MetadataTransfer.getInstance(conf);
}
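prepare() only validates that the configured class implements HtmlMapper; the mapper still has to be instantiated and placed into a ParseContext when a page is actually parsed. A hedged sketch of that step, assuming the configured class has a public no-arg constructor (the helper class and method names below are illustrative, not from storm-crawler):

import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlMapper;

final class HtmlMapperContexts {

    /** Builds a ParseContext carrying a fresh instance of the validated mapper class. */
    static ParseContext contextFor(Class<?> htmlMapperClass) {
        ParseContext context = new ParseContext();
        try {
            context.set(HtmlMapper.class,
                    (HtmlMapper) htmlMapperClass.getDeclaredConstructor().newInstance());
        } catch (ReflectiveOperationException e) {
            throw new RuntimeException("Can't instantiate " + htmlMapperClass.getName(), e);
        }
        return context;
    }
}

With the default parser.htmlmapper.classname this yields an IdentityHtmlMapper, which keeps every element rather than only Tika's safe subset.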
 
Example #4
Source File: SimplePageParser.java    From flink-crawler with Apache License 2.0
/**
 * @param contentExtractor
 *            to extract content from the page
 * @param linkExtractor
 *            to extract outlinks from the page
 * @param parserPolicy
 *            to customize operation of the parser
 * @param pageScorer
 *            to score importance of page (priority of its outlinks)
 * @param includeMarkup
 *            true if output should be raw HTML, versus extracted text <BR>
 * <BR>
 *            <B>Note:</B> There is no need to construct your own {@link SimpleLinkExtractor}
 *            simply to control the set of link tags and attributes it processes. Instead, use
 *            {@link ParserPolicy#setLinkTags} and {@link ParserPolicy#setLinkAttributeTypes},
 *            and then pass that policy to {@link SimplePageParser#SimplePageParser(ParserPolicy)}.
 */
public SimplePageParser(BaseContentExtractor contentExtractor, BaseLinkExtractor linkExtractor,
        ParserPolicy parserPolicy, BasePageScorer pageScorer, boolean includeMarkup) {
    super(parserPolicy, pageScorer);

    _contentExtractor = contentExtractor;
    _linkExtractor = linkExtractor;

    if (includeMarkup) {
        _parseContext = new ParseContext();
        _parseContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
    }
}
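The constructor only sets IdentityHtmlMapper when markup is requested; otherwise _parseContext stays null and Tika's default safe mapping applies. A small illustrative sketch of that toggle in isolation (the class MarkupToggle, its parse method, and the inline HTML are assumptions for illustration, not flink-crawler code):

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.parser.html.IdentityHtmlMapper;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ToHTMLContentHandler;
import org.xml.sax.ContentHandler;

public class MarkupToggle {

    /** Parses an HTML string; keeps raw markup only when includeMarkup is true. */
    static String parse(String html, boolean includeMarkup) throws Exception {
        ParseContext context = new ParseContext();
        ContentHandler handler;
        if (includeMarkup) {
            // Same idea as SimplePageParser: identity mapping preserves every tag.
            context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
            handler = new ToHTMLContentHandler();
        } else {
            handler = new BodyContentHandler(); // extracted text only
        }
        try (InputStream in = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8))) {
            new HtmlParser().parse(in, handler, new Metadata(), context);
        }
        return handler.toString();
    }

    public static void main(String[] args) throws Exception {
        String html = "<p>Hello <b>world</b></p>";
        System.out.println(parse(html, true));  // markup preserved
        System.out.println(parse(html, false)); // plain text
    }
}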