com.gargoylesoftware.htmlunit.html.HtmlPage#asXml

Source File: WebClient8Test.java From htmlunit with Apache License 2.0

6 votes

/**
 * @throws Exception if something goes wrong
 */
@Test
public void cloneNode() throws Exception {
    final String html = "<html>\n"
            + "<head><title>foo</title></head>\n"
            + "<body>\n"
            + "<p>hello world</p>\n"
            + "</body></html>";

    try (WebClient webClient = new WebClient(getBrowserVersion(), false, null, -1)) {
        final HtmlPage page = loadPage(webClient, html, null, URL_FIRST);

        final String org = page.asXml();

        final HtmlPage clonedPage = page.cloneNode(true);
        final String clone = clonedPage.asXml();

        assertEquals(org, clone);
    }
}

Source File: EpgCrawler.java From MyTv with Apache License 2.0

6 votes

@Override
public List<TvStation> crawlAllTvStation() {
	String epgFile = getCrawlFilePath();
	File file = new File(epgFile);
	String html = null;
	if (file.exists()) {
		try {
			html = MyTvUtils.readAsHtml(epgFile);
			return parseTvStation(html);
		} catch (IOException e) {
			// do nothing
		}
		return null;
	}
	HtmlPage htmlPage = (HtmlPage) WebCrawler.crawl(getUrl());
	html = htmlPage.asXml();
	MyTvUtils.outputCrawlData(getCrawlerName(), html, getCrawlFileName());
	List<TvStation> stationList = parseTvStation(html);
	for (CrawlEventListener listener : listeners) {
		listener.crawlEnd(new AllTvStationCrawlEndEvent(this, stationList));
	}
	return stationList;
}

Source File: HTMLTableElement2Test.java From htmlunit with Apache License 2.0

5 votes

/**
 * @throws Exception if the test fails
 */
@Test
public void width() throws Exception {
    final String content
        = "<html><head></head><body>\n"
            + "<table id='tableID' style='background:blue'><tr><td></td></tr></table>\n"
            + "<script language='javascript'>\n"
            + "    var table = document.getElementById('tableID');\n"
            + "    table.width = '200';\n"
            + "</script></body></html>";

    final HtmlPage page = loadPage(content);
    final String xml = page.asXml();
    assertTrue(xml.contains("width=\"200\""));
}

Source File: Downloader.java From MMDownloader with Apache License 2.0

5 votes

/**
 * HtmlUnit을 이용한 HTML 코드 파싱.
 *
 * @param eachArchiveAddress 실제 만화가 담긴 아카이브 주소
 * @return 성공 시 html 코드를 리턴
 */
private String getHtmlPageHtmlUnit(String eachArchiveAddress) throws Exception {
	/* 필수! 로그 메세지 출력 안함 -> HtmlUnit 이용시 Verbose한 로그들이 너무 많아서 다 끔 */
	java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
	System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");

	print.info("일반 연결 시도중...\n");

	WebClient webClient = new WebClient();
	webClient.getOptions().setRedirectEnabled(true);

	WebRequest req = new WebRequest(new URL(eachArchiveAddress));
	req.setHttpMethod(HttpMethod.POST);
	req.setAdditionalHeader("User-Agent", UserAgent.getUserAgent());
	req.setAdditionalHeader("Accept-Encoding", "gzip"); //20171126 gzip 추가
	req.getRequestParameters().add(new NameValuePair("pass", PASSWORD)); //비밀번호 post 방식 전송

	HtmlPage page = webClient.getPage(req);

	//Html코드를 포함한 페이지 소스코드가 담길 스트링
	String pageSource = page.asXml();

	/** 여기도 페이지 파싱 실패 시 검증하는 코드 들어가야 됨 **/

	webClient.close();
	print.info("일반 연결 성공\n");
	return pageSource;
}

Source File: htmlunitTest.java From crawler-jsoup-maven with Apache License 2.0

5 votes

public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
    
    // 屏蔽HtmlUnit等系统 log
    LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log","org.apache.commons.logging.impl.NoOpLog");
    java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
    java.util.logging.Logger.getLogger("org.apache.http.client").setLevel(Level.OFF);
    
    String url = "https://www.newsmth.net/nForum/#!section/Estate";
    System.out.println("Loading page now-----------------------------------------------: "+url);
    
    /* HtmlUnit 模拟浏览器 */
    WebClient webClient = new WebClient(BrowserVersion.CHROME);
    webClient.getOptions().setJavaScriptEnabled(true);              // 启用JS解释器，默认为true  
    webClient.getOptions().setCssEnabled(false);                    // 禁用css支持  
    webClient.getOptions().setThrowExceptionOnScriptError(false);   // js运行错误时，是否抛出异常
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setTimeout(10 * 1000);                   // 设置连接超时时间
    HtmlPage page = webClient.getPage(url);
    webClient.waitForBackgroundJavaScript(30 * 1000);               // 等待js后台执行30秒

    String pageAsXml = page.asXml();
    
    /* Jsoup解析处理 */
    // Document doc = Jsoup.parse(pageAsXml, "https://bluetata.com/");
    Document doc = Jsoup.parse(pageAsXml);  
    //Elements pngs = doc.select("img[src$=.png]");                   // 获取所有图片元素集
    
    Elements eles = doc.select("td.title_1");
    // 其他操作
    System.out.println(eles.toString());
}

Source File: TvMaoCrawler.java From MyTv with Apache License 2.0

5 votes

/**
 * 解析指定城市下的电视台
 * 
 * @param htmlPage
 * @param city
 *            所属城市
 * @return
 */
private List<TvStation> getTvStations(HtmlPage htmlPage, String city) {
	String html = htmlPage.asXml();
	List<?> elements = htmlPage
			.getByXPath("//div[@class='chlsnav']/div[@class='pbar']/b");
	HtmlBold hb = (HtmlBold) elements.get(0);
	String classify = hb.getTextContent().trim();
	MyTvUtils.outputCrawlData(getCrawlerName(), html,
			getCrawlFileName(city, classify));
	List<TvStation> stationList = parseTvStation(city, html);
	logger.debug("tv station crawled." + stationList);
	return stationList;
}

Source File: SentenceExtractor.java From superword with Apache License 2.0

5 votes

public static String getContent2(String url) {
    try{
        LOGGER.debug("url:"+url);
        HtmlPage htmlPage = WEB_CLIENT.getPage(url);
        String html = htmlPage.asXml();
        //LOGGER.debug("html:"+html);
        return html;
    }catch (Exception e) {
        e.printStackTrace();
        LOGGER.error("获取URL："+url+"页面出错", e);
    }
    return "";
}

Source File: JsSupporedUrlFetcher.java From seldon-server with Apache License 2.0

5 votes

@Override
public String getUrl(String url) throws Exception {
    long timing_start = System.currentTimeMillis();

    BrowserVersion browserVersion = BrowserVersion.getDefault();
    logger.info("Using user-agent: " + browserVersion.getUserAgent());
    final WebClient webClient = new WebClient(browserVersion);
    webClient.setTimeout(httpGetTimeout);
    final HtmlPage page = webClient.getPage(url);
    long timing_end = System.currentTimeMillis();
    logger.info(String.format("fetched page[%s] in ms[%d]", url, (timing_end - timing_start)));
    return page.asXml();
}

Source File: HtmlUnitPageLoader.java From xxl-crawler with GNU General Public License v3.0

4 votes

@Override
public Document load(PageRequest pageRequest) {
    if (!UrlUtil.isUrl(pageRequest.getUrl())) {
        return null;
    }

    WebClient webClient = new WebClient();
    try {
        WebRequest webRequest = new WebRequest(new URL(pageRequest.getUrl()));

        // 请求设置
        webClient.getOptions().setUseInsecureSSL(true);
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setDoNotTrackEnabled(false);
        webClient.getOptions().setUseInsecureSSL(!pageRequest.isValidateTLSCertificates());

        if (pageRequest.getParamMap() != null && !pageRequest.getParamMap().isEmpty()) {
            for (Map.Entry<String, String> paramItem : pageRequest.getParamMap().entrySet()) {
                webRequest.getRequestParameters().add(new NameValuePair(paramItem.getKey(), paramItem.getValue()));
            }
        }
        if (pageRequest.getCookieMap() != null && !pageRequest.getCookieMap().isEmpty()) {
            webClient.getCookieManager().setCookiesEnabled(true);
            for (Map.Entry<String, String> cookieItem : pageRequest.getCookieMap().entrySet()) {
                webClient.getCookieManager().addCookie(new Cookie("", cookieItem.getKey(), cookieItem.getValue()));
            }
        }
        if (pageRequest.getHeaderMap() != null && !pageRequest.getHeaderMap().isEmpty()) {
            webRequest.setAdditionalHeaders(pageRequest.getHeaderMap());
        }
        if (pageRequest.getUserAgent() != null) {
            webRequest.setAdditionalHeader("User-Agent", pageRequest.getUserAgent());
        }
        if (pageRequest.getReferrer() != null) {
            webRequest.setAdditionalHeader("Referer", pageRequest.getReferrer());
        }

        webClient.getOptions().setTimeout(pageRequest.getTimeoutMillis());
        webClient.setJavaScriptTimeout(pageRequest.getTimeoutMillis());
        webClient.waitForBackgroundJavaScript(pageRequest.getTimeoutMillis());

        // 代理
        if (pageRequest.getProxy() != null) {
            InetSocketAddress address = (InetSocketAddress) pageRequest.getProxy().address();
            boolean isSocks = pageRequest.getProxy().type() == Proxy.Type.SOCKS;
            webClient.getOptions().setProxyConfig(new ProxyConfig(address.getHostName(), address.getPort(), isSocks));
        }

        // 发出请求
        if (pageRequest.isIfPost()) {
            webRequest.setHttpMethod(HttpMethod.POST);
        } else {
            webRequest.setHttpMethod(HttpMethod.GET);
        }
        HtmlPage page = webClient.getPage(webRequest);

        String pageAsXml = page.asXml();
        if (pageAsXml != null) {
            Document html = Jsoup.parse(pageAsXml);
            return html;
        }
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
    } finally {
        if (webClient != null) {
            webClient.close();
        }
    }
    return null;
}

Source File: HtmlUnitDownloder.java From gecco-htmlunit with MIT License

4 votes

public HttpResponse download(HttpRequest request, int timeout) throws DownloadException {
	try {
		URL url = new URL(request.getUrl());
		WebRequest webRequest = new WebRequest(url);
		webRequest.setHttpMethod(HttpMethod.GET);
		if(request instanceof HttpPostRequest) {//post
			HttpPostRequest post = (HttpPostRequest)request;
			webRequest.setHttpMethod(HttpMethod.POST);
			List<NameValuePair> requestParameters = new ArrayList<NameValuePair>();
			for(Map.Entry<String, Object> entry : post.getFields().entrySet()) {
				NameValuePair nvp = new NameValuePair(entry.getKey(), entry.getValue().toString());
				requestParameters.add(nvp);
			}
			webRequest.setRequestParameters(requestParameters);	
		}
		//header
		boolean isMobile = SpiderThreadLocal.get().getEngine().isMobile();
		webRequest.setAdditionalHeader("User-Agent", UserAgent.getUserAgent(isMobile));
		webRequest.setAdditionalHeaders(request.getHeaders());
		//proxy
		HttpHost proxy = Proxys.getProxy();
		if(proxy != null) {
			webRequest.setProxyHost(proxy.getHostName());
			webRequest.setProxyPort(proxy.getPort());
		}
		//timeout
		this.webClient.getOptions().setTimeout(timeout);
		//request,response
		webClient.getPage(webRequest);
		HtmlPage page = webClient.getPage(request.getUrl());
		HttpResponse resp = new HttpResponse();
		WebResponse webResponse = page.getWebResponse();
		int status = webResponse.getStatusCode();
		resp.setStatus(status);
		if(status == 302 || status == 301) {
			String redirectUrl = webResponse.getResponseHeaderValue("Location");
			resp.setContent(UrlUtils.relative2Absolute(request.getUrl(), redirectUrl));
		} else if(status == 200) {
			String content = page.asXml();
			resp.setContent(content);
			resp.setRaw(webResponse.getContentAsStream());
			String contentType = webResponse.getContentType();
			resp.setContentType(contentType);
			String charset = getCharset(request.getCharset(), contentType);
			resp.setCharset(charset);
		} else {
			throw new DownloadException("ERROR : " + status);
		}
		return resp;
	} catch(Exception ex) {
		throw new DownloadException(ex);
	}
}

Java Code Examples for com.gargoylesoftware.htmlunit.html.HtmlPage#asXml()