Java Code Examples for com.geccocrawler.gecco.request.HttpRequest#getUrl()

The following examples show how to use com.geccocrawler.gecco.request.HttpRequest#getUrl(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SpiderBeanFactory.java    From gecco with MIT License 6 votes vote down vote up
/**
 * Looks up the spider bean class registered for the URL of the given request.
 *
 * <p>The registered URL patterns are tried in the iteration order of
 * {@code spiderBeans}. For the first pattern where {@code UrlMatcher.match}
 * returns a non-null parameter map, that map is stored on the request and the
 * associated class is returned. A pattern equal to {@code "*"} that did not
 * match is remembered as the common (catch-all) spider and used as fallback.
 *
 * @param request the request whose URL is matched; its parameters are set on a match
 * @return the matching spider class, otherwise the common spider if one was
 *         seen, otherwise {@code null}
 */
public Class<? extends SpiderBean> matchSpider(HttpRequest request) {
	final String requestUrl = request.getUrl();
	// Common (catch-all) spider, used only when no pattern matches.
	Class<? extends SpiderBean> fallbackSpider = null;
	for (Map.Entry<String, Class<? extends SpiderBean>> registration : spiderBeans.entrySet()) {
		final String pattern = registration.getKey();
		final Map<String, String> matchedParams = UrlMatcher.match(requestUrl, pattern);
		if (matchedParams != null) {
			request.setParameters(matchedParams);
			return registration.getValue();
		}
		if (pattern.equals("*")) {
			fallbackSpider = registration.getValue();
		}
	}
	// Either the common spider, or null when none was registered.
	return fallbackSpider;
}
 
Example 2
Source File: ProductListPipeline.java    From gecco with MIT License 6 votes vote down vote up
/**
 * Schedules the next page of a product list for crawling when one remains.
 *
 * <p>The next URL is derived from the current request's URL: if it already
 * contains {@code page=}, the first occurrence of {@code page=<currPage>} is
 * replaced with {@code page=<nextPage>}; otherwise a {@code page} parameter is
 * appended. Nothing is scheduled once the next page would exceed the total.
 *
 * @param productList the extracted product list, carrying its originating
 *                    request, current page number and total page count
 */
@Override
public void process(ProductList productList) {
	HttpRequest currRequest = productList.getRequest();
	// Continue crawling with the next page.
	int currPage = productList.getCurrPage();
	int nextPage = currPage + 1;
	int totalPage = productList.getTotalPage();
	if(nextPage <= totalPage) {
		String currUrl = currRequest.getUrl();
		String nextUrl;
		if(currUrl.contains("page=")) {
			nextUrl = StringUtils.replaceOnce(currUrl, "page=" + currPage, "page=" + nextPage);
		} else {
			// '&' only continues an existing query string; a URL without one
			// must start it with '?', otherwise the result is malformed.
			String separator = currUrl.contains("?") ? "&" : "?";
			nextUrl = currUrl + separator + "page=" + nextPage;
		}
		SchedulerContext.into(currRequest.subRequest(nextUrl));
	}
}
 
Example 3
Source File: ProductListJsonPipeline.java    From gecco with MIT License 6 votes vote down vote up
/**
 * Schedules the next page of a JSON-encoded product list for crawling when one
 * remains.
 *
 * <p>The originating request is rebuilt from the {@code "request"} object, and
 * the page numbers are read from {@code "currPage"} and {@code "totalPage"}.
 * If the current URL already contains {@code page=}, the first occurrence of
 * {@code page=<currPage>} is replaced with {@code page=<nextPage>}; otherwise
 * a {@code page} parameter is appended. Nothing is scheduled once the next
 * page would exceed the total.
 *
 * @param productList the product list as JSON, with {@code "request"},
 *                    {@code "currPage"} and {@code "totalPage"} entries
 */
@Override
public void process(JSONObject productList) {
	HttpRequest currRequest = HttpGetRequest.fromJson(productList.getJSONObject("request"));
	// Continue crawling with the next page.
	int currPage = productList.getIntValue("currPage");
	int nextPage = currPage + 1;
	int totalPage = productList.getIntValue("totalPage");
	if(nextPage <= totalPage) {
		String currUrl = currRequest.getUrl();
		String nextUrl;
		if(currUrl.contains("page=")) {
			nextUrl = StringUtils.replaceOnce(currUrl, "page=" + currPage, "page=" + nextPage);
		} else {
			// '&' only continues an existing query string; a URL without one
			// must start it with '?', otherwise the result is malformed.
			String separator = currUrl.contains("?") ? "&" : "?";
			nextUrl = currUrl + separator + "page=" + nextPage;
		}
		SchedulerContext.into(currRequest.subRequest(nextUrl));
	}
}
 
Example 4
Source File: JSVarFieldRender.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Evaluates the inline scripts of the downloaded page in a Rhino context and
 * copies the values of the bean's {@code @JSVar}-annotated fields into the
 * bean map.
 *
 * <p>Each non-empty {@code <script>} element is evaluated in a shared scope
 * that is pre-seeded with empty {@code window} and {@code document} objects.
 * Scripts that fail to evaluate are skipped. Afterwards every field annotated
 * with {@code JSVar} is resolved through {@code injectJsVarField}, and the
 * non-null results are put into {@code beanMap} under the field name.
 *
 * <p>The Rhino context entered here is always exited, including when parsing
 * or field injection throws.
 *
 * @param request  the request that produced the response; supplies the base URL
 * @param response the downloaded response whose content is parsed for scripts
 * @param beanMap  the bean map that receives the resolved field values
 * @param bean     the spider bean whose class is scanned for {@code JSVar} fields
 */
@Override
@SuppressWarnings({ "unchecked" })
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
	Context cx = Context.enter();
	try {
		ScriptableObject scope = cx.initSafeStandardObjects();
		// Define empty window and document objects before running the page scripts.
		String windowScript = "var window = {};var document = {};";
		cx.evaluateString(scope, windowScript, "window", 1, null);
		HtmlParser parser = new HtmlParser(request.getUrl(), response.getContent());
		for (Element ele : parser.$("script")) {
			String sc = ele.html();
			if (StringUtils.isNotEmpty(sc)) {
				try {
					cx.evaluateString(scope, sc, "", 1, null);
				} catch (Exception ignored) {
					// Best effort: a script that cannot be evaluated is skipped so the
					// remaining scripts still get a chance to define their variables.
				}
			}
		}
		Map<String, Object> fieldMap = new HashMap<String, Object>();
		Set<Field> jsVarFields = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(JSVar.class));
		for (Field jsVarField : jsVarFields) {
			Object value = injectJsVarField(request, beanMap, jsVarField, cx, scope);
			if(value != null) {
				fieldMap.put(jsVarField.getName(), value);
			}
		}
		beanMap.putAll(fieldMap);
	} finally {
		// Every Context.enter() must be paired with Context.exit(), even on failure.
		Context.exit();
	}
}
 
Example 5
Source File: HtmlUnitDownloder.java    From gecco-htmlunit with MIT License 4 votes vote down vote up
/**
 * Downloads the request's URL through the HtmlUnit web client and converts the
 * result into an {@code HttpResponse}.
 *
 * <p>The request is sent as GET, or as POST with the request's fields as
 * parameters when it is an {@code HttpPostRequest}. A User-Agent header is
 * set first and the request's own headers are then applied on top of it. A
 * proxy is used when {@code Proxys.getProxy()} supplies one.
 *
 * <p>Status handling:
 * <ul>
 *   <li>301/302: the response content is the absolute redirect URL built from
 *       the {@code Location} header;</li>
 *   <li>200: the response carries the page as XML, the raw stream, the content
 *       type and the resolved charset;</li>
 *   <li>anything else: a {@code DownloadException} is thrown.</li>
 * </ul>
 *
 * @param request the request to download
 * @param timeout the timeout passed to the web client options
 * @return the populated response
 * @throws DownloadException if the status is unexpected or any other failure
 *                           occurs while downloading (the cause is preserved)
 */
public HttpResponse download(HttpRequest request, int timeout) throws DownloadException {
	try {
		URL url = new URL(request.getUrl());
		WebRequest webRequest = new WebRequest(url);
		webRequest.setHttpMethod(HttpMethod.GET);
		if(request instanceof HttpPostRequest) {//post
			HttpPostRequest post = (HttpPostRequest)request;
			webRequest.setHttpMethod(HttpMethod.POST);
			List<NameValuePair> requestParameters = new ArrayList<NameValuePair>();
			for(Map.Entry<String, Object> entry : post.getFields().entrySet()) {
				NameValuePair nvp = new NameValuePair(entry.getKey(), entry.getValue().toString());
				requestParameters.add(nvp);
			}
			webRequest.setRequestParameters(requestParameters);
		}
		//header
		boolean isMobile = SpiderThreadLocal.get().getEngine().isMobile();
		webRequest.setAdditionalHeader("User-Agent", UserAgent.getUserAgent(isMobile));
		// Apply the request's headers one at a time so the User-Agent set above
		// survives unless the request provides its own.
		// NOTE(review): WebRequest.setAdditionalHeaders(Map) appears to replace the
		// whole header map - confirm against the HtmlUnit version in use.
		Map<String, String> headers = request.getHeaders();
		if(headers != null) {
			for(Map.Entry<String, String> header : headers.entrySet()) {
				webRequest.setAdditionalHeader(header.getKey(), header.getValue());
			}
		}
		//proxy
		HttpHost proxy = Proxys.getProxy();
		if(proxy != null) {
			webRequest.setProxyHost(proxy.getHostName());
			webRequest.setProxyPort(proxy.getPort());
		}
		//timeout
		webClient.getOptions().setTimeout(timeout);
		//request,response
		// Fetch once, using the configured WebRequest, so that the method, fields,
		// headers and proxy set above apply to the page that is actually returned.
		HtmlPage page = webClient.getPage(webRequest);
		HttpResponse resp = new HttpResponse();
		WebResponse webResponse = page.getWebResponse();
		int status = webResponse.getStatusCode();
		resp.setStatus(status);
		if(status == 302 || status == 301) {
			String redirectUrl = webResponse.getResponseHeaderValue("Location");
			resp.setContent(UrlUtils.relative2Absolute(request.getUrl(), redirectUrl));
		} else if(status == 200) {
			String content = page.asXml();
			resp.setContent(content);
			resp.setRaw(webResponse.getContentAsStream());
			String contentType = webResponse.getContentType();
			resp.setContentType(contentType);
			String charset = getCharset(request.getCharset(), contentType);
			resp.setCharset(charset);
		} else {
			throw new DownloadException("ERROR : " + status);
		}
		return resp;
	} catch(DownloadException ex) {
		// Already the declared failure type (the unexpected-status case); do not wrap it again.
		throw ex;
	} catch(Exception ex) {
		throw new DownloadException(ex);
	}
}