org.apache.tika.parser.html.HtmlParser Java Examples

The following examples show how to use org.apache.tika.parser.html.HtmlParser. Each example is taken from an open-source project; the source file and license are noted above the code.
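Before the project examples, here is a minimal, self-contained sketch of the basic call they all build on: HtmlParser.parse(...) driving a Tika BodyContentHandler to turn an HTML stream into plain text. The HTML string and class name are illustrative only.

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;

public class HtmlParserQuickStart {
    public static void main(String[] args) throws Exception {
        String html = "<html><body><h1>Hello</h1><p>HtmlParser demo</p></body></html>";
        InputStream in = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));

        HtmlParser parser = new HtmlParser();
        BodyContentHandler handler = new BodyContentHandler(); // collects the text content of <body>
        Metadata metadata = new Metadata();                    // the parser records detected metadata here

        parser.parse(in, handler, metadata, new ParseContext());

        System.out.println(handler.toString()); // extracted plain text
    }
}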
Example #1
Source File: ExcelHtmlParser.java    From components with Apache License 2.0
public static List<List<String>> getRows(InputStream rawContent, String encoding, long limit) {
  SimpleValuesContentHandler valuesContentHandler = new SimpleValuesContentHandler(-1, limit);

  HtmlParser htmlParser = new HtmlParser();
  Metadata metadata = new Metadata();
  metadata.add(Metadata.CONTENT_ENCODING, encoding);
  try {
    htmlParser.parse(rawContent, valuesContentHandler, metadata, new ParseContext());
  } catch (Exception e) {
    LOGGER.debug("Failed to parse the excel html format document.", e);
  }

  return valuesContentHandler.getValues();
}
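A hedged usage sketch of the helper above. The calling class, file name, and row limit are hypothetical; SimpleValuesContentHandler and LOGGER belong to the surrounding ExcelHtmlParser class and are not shown here.

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.List;

public class ExcelHtmlParserDemo {
    public static void main(String[] args) throws Exception {
        // Read an Excel "Save as Web Page" style export, keeping at most 10 rows.
        try (InputStream in = new FileInputStream("excel-export.html")) {
            List<List<String>> rows = ExcelHtmlParser.getRows(in, "UTF-8", 10);
            for (List<String> row : rows) {
                System.out.println(String.join(" | ", row)); // print each row, pipe-separated
            }
        }
    }
}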
 
Example #2
Source File: HtmlSchemaParser.java    From data-prep with Apache License 2.0
/**
 * @see SchemaParser#parse(Request)
 */
@Override
public Schema parse(Request request) {

    try {
        SimpleHeadersContentHandler headersContentHandler = new SimpleHeadersContentHandler();

        InputStream inputStream = request.getContent();
        HtmlParser htmlParser = new HtmlParser();

        Metadata metadata = new Metadata();

        htmlParser.parse(inputStream, headersContentHandler, metadata, new ParseContext());

        List<ColumnMetadata> columns = new ArrayList<>(headersContentHandler.getHeaderValues().size());

        for (String headerValue : headersContentHandler.getHeaderValues()) {
            columns.add(ColumnMetadata.Builder
                    .column() //
                    .type(Type.STRING) // ATM not doing any complicated type calculation
                    .name(headerValue) //
                    .id(columns.size()) //
                    .build());
        }

        Schema.SheetContent sheetContent = new Schema.SheetContent();
        sheetContent.setColumnMetadatas(columns);

        return Schema.Builder
                .parserResult() //
                .sheetContents(Collections.singletonList(sheetContent)) //
                .draft(false) //
                .build();

    } catch (Exception e) {
        LOGGER.debug("Exception during parsing html request :" + e.getMessage(), e);
        throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
    }

}
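Here HtmlParser does the HTML-to-SAX work while the project's SimpleHeadersContentHandler (not shown) receives the events and collects header values. As a rough illustration of that pattern, the following is a minimal stand-in, assuming the headers live in <th> cells; it is not the data-prep class, just a sketch that could be passed to htmlParser.parse(...) in its place.

import java.util.ArrayList;
import java.util.List;

import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;

public class HeaderCollectingHandler extends DefaultHandler {

    private final List<String> headers = new ArrayList<>();
    private StringBuilder current;

    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts) {
        if ("th".equalsIgnoreCase(localName)) {
            current = new StringBuilder(); // start buffering a header cell
        }
    }

    @Override
    public void characters(char[] ch, int start, int length) {
        if (current != null) {
            current.append(ch, start, length); // accumulate the cell text
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) {
        if ("th".equalsIgnoreCase(localName) && current != null) {
            headers.add(current.toString().trim());
            current = null;
        }
    }

    public List<String> getHeaders() {
        return headers;
    }
}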
 
Example #3
Source File: CrawlTool.java    From flink-crawler with Apache License 2.0
public static void run(StreamExecutionEnvironment env, CrawlToolOptions options)
        throws Exception {

    // TODO Complain if -cachedir is specified when not running locally?

    SimpleUrlValidator urlValidator = (options.isSingleDomain()
            ? new SingleDomainUrlValidator(options.getSingleDomain())
            : new SimpleUrlValidator());

    UserAgent userAgent = (options.isCommonCrawl()
            ? new UserAgent("unused-common-crawl-user-agent", "", "") : options.getUserAgent());

    BaseUrlLengthener urlLengthener = CrawlToolUtils.getUrlLengthener(options, userAgent);
    BaseHttpFetcherBuilder siteMapFetcherBuilder = CrawlToolUtils.getSitemapFetcherBuilder(options, userAgent);
    BaseHttpFetcherBuilder robotsFetcherBuilder = CrawlToolUtils.getRobotsFetcherBuilder(options, userAgent);
    BaseHttpFetcherBuilder pageFetcherBuilder = CrawlToolUtils.getPageFetcherBuilder(options, userAgent);

    // See if we need to restrict what mime types we download.
    if (options.isHtmlOnly()) {
        Set<String> validMimeTypes = new HashSet<>();
        for (MediaType mediaType : new HtmlParser().getSupportedTypes(new ParseContext())) {
            validMimeTypes.add(mediaType.toString());
        }

        pageFetcherBuilder.setValidMimeTypes(validMimeTypes);
    }

    CrawlTopologyBuilder builder = new CrawlTopologyBuilder(env)
            .setUserAgent(userAgent)
            .setUrlLengthener(urlLengthener)
            .setUrlSource(new SeedUrlSource(options.getSeedUrlsFilename(), RawUrl.DEFAULT_SCORE))
            .setCrawlTerminator(new DurationCrawlTerminator(options.getMaxCrawlDurationSec()))
            .setRobotsFetcherBuilder(robotsFetcherBuilder).setUrlFilter(urlValidator)
            .setSiteMapFetcherBuilder(siteMapFetcherBuilder)
            .setPageFetcherBuilder(pageFetcherBuilder)
            .setForceCrawlDelay(options.getForceCrawlDelay())
            .setDefaultCrawlDelay(options.getDefaultCrawlDelay())
            .setParallelism(options.getParallelism())
            .setIterationTimeout(options.getIterationTimeoutSec() * 1000L)
            .setMaxOutlinksPerPage(options.getMaxOutlinksPerPage());

    if (options.getTextContentPathString() != null) {
        builder.setTextContentPath(options.getTextContentPathString());
    }

    builder.build().execute();
}
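The only HtmlParser-specific piece of this example is the getSupportedTypes(...) call used to build the MIME-type whitelist for -htmlonly crawls. A small standalone sketch of that call (the class name is illustrative):

import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;

public class ListHtmlMimeTypes {
    public static void main(String[] args) {
        // Print every MIME type Tika's HtmlParser claims to handle,
        // e.g. text/html and application/xhtml+xml.
        for (MediaType mediaType : new HtmlParser().getSupportedTypes(new ParseContext())) {
            System.out.println(mediaType.toString());
        }
    }
}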