com.geccocrawler.gecco.request.HttpRequest Java Examples

The following examples show how to use com.geccocrawler.gecco.request.HttpRequest. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JsonFieldRender.java    From gecco with MIT License 6 votes vote down vote up
/**
 * Resolves every {@link JSONPath}-annotated field of the bean against the
 * response body and copies the non-null results into {@code beanMap}.
 * <p>
 * The body is first passed through {@code jsonp2Json}; when that yields
 * {@code null} nothing is rendered. A {@link JSONException} is logged rather
 * than rethrown, and values collected before the failure are still copied.
 *
 * @param request  request that produced the response
 * @param response response whose content is parsed as JSON
 * @param beanMap  target map receiving the rendered values
 * @param bean     bean whose class is scanned for annotated fields
 */
@Override
@SuppressWarnings({ "unchecked" })
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
	Map<String, Object> rendered = new HashMap<String, Object>();
	Set<Field> annotated = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(JSONPath.class));
	String body = jsonp2Json(response.getContent());
	if (body == null) {
		return;
	}
	try {
		Object root = JSON.parse(body);
		for (Field target : annotated) {
			Object resolved = injectJsonField(request, target, root);
			if (resolved == null) {
				continue;
			}
			rendered.put(target.getName(), resolved);
		}
	} catch (JSONException parseFailure) {
		// Logged instead of thrown so the remaining renderers can still run.
		RenderException.log("json parse error : " + request.getUrl(), bean.getClass(), parseFailure);
	}
	beanMap.putAll(rendered);
}
 
Example #2
Source File: ProductListPipeline.java    From gecco with MIT License 6 votes vote down vote up
/**
 * Schedules the next page of a product list while pages remain.
 * <p>
 * When the current URL already contains a {@code page=} parameter its value
 * is replaced; otherwise the parameter is appended, using {@code ?} when the
 * URL has no query string yet and {@code &} when it does.
 *
 * @param productList the rendered list page, carrying its request, current
 *                    page number and total page count
 */
@Override
public void process(ProductList productList) {
	HttpRequest currRequest = productList.getRequest();
	// Continue crawling with the next page.
	int currPage = productList.getCurrPage();
	int nextPage = currPage + 1;
	int totalPage = productList.getTotalPage();
	if (nextPage > totalPage) {
		return;
	}
	String currUrl = currRequest.getUrl();
	String nextUrl;
	if (currUrl.indexOf("page=") != -1) {
		nextUrl = StringUtils.replaceOnce(currUrl, "page=" + currPage, "page=" + nextPage);
	} else {
		// The first query parameter must be introduced with '?', later ones with '&'.
		String separator = currUrl.indexOf('?') != -1 ? "&" : "?";
		nextUrl = currUrl + separator + "page=" + nextPage;
	}
	SchedulerContext.into(currRequest.subRequest(nextUrl));
}
 
Example #3
Source File: Spider.java    From gecco with MIT License 6 votes vote down vote up
/**
 * Downloads {@code request}, running the optional before/after hooks around it.
 * <p>
 * With a context, its downloader, hooks and timeout are used. Without one,
 * the engine's default downloader is used with no hooks and a timeout of 1000.
 *
 * @param context per-bean settings, or {@code null} to use the defaults
 * @param request request to download
 * @return the response produced by the downloader
 * @throws DownloadException declared by this method's signature
 */
private HttpResponse download(SpiderBeanContext context, HttpRequest request) throws DownloadException {
	Downloader downloader;
	BeforeDownload preHook = null;
	AfterDownload postHook = null;
	int timeoutValue = 1000;
	if (context == null) {
		downloader = engine.getSpiderBeanFactory().getDownloaderFactory().defaultDownloader();
	} else {
		downloader = context.getDownloader();
		preHook = context.getBeforeDownload();
		postHook = context.getAfterDownload();
		timeoutValue = context.getTimeout();
	}
	if (preHook != null) {
		preHook.process(request);
	}
	HttpResponse result = downloader.download(request, timeoutValue);
	if (postHook != null) {
		postHook.process(request, result);
	}
	return result;
}
 
Example #4
Source File: SpiderBeanFactory.java    From gecco with MIT License 6 votes vote down vote up
/**
 * Finds the spider bean class registered for the request's URL.
 * <p>
 * The first pattern that {@code UrlMatcher.match} accepts wins, and the
 * matched parameters are stored on the request. If no pattern matches, the
 * class registered under the literal pattern {@code "*"} is returned when
 * one was seen during iteration.
 *
 * @param request request whose URL is matched; its parameters are set on a match
 * @return the matching class, the {@code "*"} class, or {@code null}
 */
public Class<? extends SpiderBean> matchSpider(HttpRequest request) {
	String url = request.getUrl();
	Class<? extends SpiderBean> fallback = null; // catch-all spider
	for (Map.Entry<String, Class<? extends SpiderBean>> entry : spiderBeans.entrySet()) {
		Class<? extends SpiderBean> candidate = entry.getValue();
		String pattern = entry.getKey();
		Map<String, String> matched = UrlMatcher.match(url, pattern);
		if (matched != null) {
			request.setParameters(matched);
			return candidate;
		}
		if (pattern.equals("*")) {
			fallback = candidate;
		}
	}
	// Either the catch-all spider or null when none was registered.
	return fallback;
}
 
Example #5
Source File: JdPricesFieldRender.java    From gecco with MIT License 6 votes vote down vote up
/**
 * Downloads the price list from the JD price endpoint and stores the parsed
 * {@code JDPrice} list in {@code beanMap} under the field's name.
 * <p>
 * The sub-response is always closed, including when parsing fails. Failures
 * are printed and otherwise ignored, leaving the field unset.
 *
 * @param request  current request; the price request is derived from it
 * @param response not used by this renderer
 * @param beanMap  target map receiving the price list
 * @param bean     bean being rendered; must be a {@code ProductList}
 * @param field    field that receives the prices
 */
@Override
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean, Field field) {
	// Only referenced by the disabled loop below; the cast still checks the bean type.
	ProductList jd = (ProductList)bean;
	StringBuilder sb = new StringBuilder();
	/*for(String code : jd.getCodes()) {
		sb.append("J_").append(code).append(",");
	}*/
	String skuIds = sb.toString();
	try {
		skuIds = URLEncoder.encode(skuIds, "UTF-8");
	} catch (UnsupportedEncodingException e) {
		e.printStackTrace();
	}
	String url = "http://p.3.cn/prices/mgets?skuIds="+skuIds;
	HttpRequest subRequest = request.subRequest(url);
	HttpResponse subReponse = null;
	try {
		subReponse = DownloaderContext.download(subRequest);
		String json = subReponse.getContent();
		List<JDPrice> prices = JSON.parseArray(json, JDPrice.class);
		beanMap.put(field.getName(), prices);
	} catch(Exception ex) {
		ex.printStackTrace();
	} finally {
		// Release the sub-response on every path, as the other field renderers do.
		if(subReponse != null) {
			subReponse.close();
		}
	}
}
 
Example #6
Source File: ImageFieldRender.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Downloads the image at {@code imgUrl} into the directory named by the
 * field's {@link Image} annotation.
 * <p>
 * The URL is returned unchanged when it is empty or when the annotation
 * names no download directory. Any query string is URL-encoded before the
 * request is made. On failure the error is logged and the (possibly
 * re-encoded) URL is returned.
 *
 * @param request current request; the image request is derived from it
 * @param field   field carrying the {@link Image} annotation
 * @param imgUrl  image address
 * @return the result of {@code DownloadImage.download}, or the URL as described above
 */
private String downloadImage(HttpRequest request, Field field, String imgUrl) {
	if (StringUtils.isEmpty(imgUrl)) {
		return imgUrl;
	}
	String targetDir = field.getAnnotation(Image.class).download();
	if (StringUtils.isEmpty(targetDir)) {
		return imgUrl;
	}
	HttpResponse imageResponse = null;
	try {
		String path = StringUtils.substringBefore(imgUrl, "?");
		String query = StringUtils.substringAfter(imgUrl, "?");
		String fileName = StringUtils.substringAfterLast(path, "/");
		if (StringUtils.isNotEmpty(query)) {
			imgUrl = path + "?" + URLEncoder.encode(query, "UTF-8");
		}
		imageResponse = DownloaderContext.defaultDownload(request.subRequest(imgUrl));
		return DownloadImage.download(targetDir, fileName, imageResponse.getRaw());
	} catch (Exception failure) {
		// Logged instead of thrown; the caller gets the URL back.
		FieldRenderException.log(field, "download image error : " + imgUrl, failure);
		return imgUrl;
	} finally {
		if (imageResponse != null) {
			imageResponse.close();
		}
	}
}
 
Example #7
Source File: AjaxFieldRender.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Renders every {@link Ajax}-annotated field of the bean through
 * {@code injectAjaxField} and copies the non-null results into {@code beanMap}.
 *
 * @param request  current request
 * @param response not used by this renderer
 * @param beanMap  map read by the injection and updated with the results
 * @param bean     bean whose class is scanned for annotated fields
 */
@Override
@SuppressWarnings("unchecked")
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
	Map<String, Object> rendered = new HashMap<String, Object>();
	Set<Field> annotated = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(Ajax.class));
	for (Field target : annotated) {
		Object injected = injectAjaxField(request, beanMap, target);
		if (injected == null) {
			continue;
		}
		rendered.put(target.getName(), injected);
	}
	beanMap.putAll(rendered);
}
 
Example #8
Source File: UnirestDownloader.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Performs the request through Unirest and converts the result into the
 * crawler's own {@code HttpResponse}.
 * <p>
 * The proxy from {@code Proxys.getProxy()} is applied on every call (a
 * {@code null} value is passed through as-is) and a User-Agent header is
 * added to the request. {@code HttpPostRequest} instances are sent as POST
 * with their fields; everything else is sent as GET.
 *
 * @param request request to execute
 * @return response populated with status, raw body, content, content type and charset
 * @throws DownloaderException wrapping any {@code UnirestException}
 */
@Override
public HttpResponse download(HttpRequest request) throws DownloaderException {
	if (log.isDebugEnabled()) {
		log.debug("downloading..." + request.getUrl());
	}
	try {
		Unirest.setProxy(Proxys.getProxy());
		request.addHeader("User-Agent", UserAgent.getUserAgent());
		com.mashape.unirest.http.HttpResponse<String> raw;
		if (!(request instanceof HttpPostRequest)) {
			raw = Unirest.get(request.getUrl()).headers(request.getHeaders()).asString();
		} else {
			HttpPostRequest postRequest = (HttpPostRequest) request;
			HttpRequestWithBody call = Unirest.post(postRequest.getUrl());
			call.headers(postRequest.getHeaders());
			call.fields(postRequest.getFields());
			raw = call.asString();
		}
		String contentType = raw.getHeaders().getFirst("Content-Type");
		HttpResponse converted = new HttpResponse();
		converted.setStatus(raw.getStatus());
		converted.setRaw(raw.getRawBody());
		converted.setContent(raw.getBody());
		converted.setContentType(contentType);
		converted.setCharset(getCharset(request, contentType));
		return converted;
	} catch (UnirestException cause) {
		throw new DownloaderException(cause);
	}
}
 
Example #9
Source File: ImageFieldRender.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Renders every {@link Image}-annotated field of the bean through
 * {@code injectImageField} and copies the non-null results into {@code beanMap}.
 *
 * @param request  current request
 * @param response not used by this renderer
 * @param beanMap  map read by the injection and updated with the results
 * @param bean     bean whose class is scanned for annotated fields
 */
@Override
@SuppressWarnings("unchecked")
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
	Map<String, Object> rendered = new HashMap<String, Object>();
	Set<Field> annotated = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(Image.class));
	for (Field target : annotated) {
		Object injected = injectImageField(request, beanMap, bean, target);
		if (injected == null) {
			continue;
		}
		rendered.put(target.getName(), injected);
	}
	beanMap.putAll(rendered);
}
 
Example #10
Source File: HtmlFieldRender.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Renders every {@link HtmlField}-annotated field of the bean through
 * {@code injectHtmlField} and copies the non-null results into {@code beanMap}.
 *
 * @param request  current request
 * @param response response handed to the field injection
 * @param beanMap  target map receiving the rendered values
 * @param bean     bean whose class is scanned for annotated fields
 */
@Override
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
	Map<String, Object> rendered = new HashMap<String, Object>();
	Set<Field> annotated = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(HtmlField.class));
	for (Field target : annotated) {
		Object injected = injectHtmlField(request, response, target, bean.getClass());
		if (injected == null) {
			continue;
		}
		rendered.put(target.getName(), injected);
	}
	beanMap.putAll(rendered);
}
 
Example #11
Source File: HtmlRender.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Renders the bean's fields by delegating, in order, to the HTML, Ajax,
 * JS-variable and image field renderers, all writing into the same
 * {@code beanMap}.
 *
 * @param request  current request
 * @param response response being rendered
 * @param beanMap  map the renderers populate
 * @param bean     bean being rendered
 */
@Override
public void fieldRender(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
	// NOTE(review): the later renderers receive the beanMap already filled by
	// the earlier ones, so the call order is presumably significant — confirm
	// before reordering.
	htmlFieldRender.render(request, response, beanMap, bean);
	ajaxFieldRender.render(request, response, beanMap, bean);
	jsVarFieldRender.render(request, response, beanMap, bean);
	imageFieldRender.render(request, response, beanMap, bean);
}
 
Example #12
Source File: HtmlParser.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Renders the HTML selected by {@code selector} into a bean of type {@code clazz}.
 * <p>
 * The selected markup is wrapped in a simple response and passed to the
 * HTML renderer together with the given request.
 *
 * @param selector selector passed to {@code $html}
 * @param request  request handed to the renderer
 * @param clazz    bean type to produce
 * @return the bean returned by the renderer
 */
public SpiderBean $bean(String selector, HttpRequest request, Class<? extends SpiderBean> clazz) {
	HttpResponse fragment = HttpResponse.createSimple($html(selector));
	return RenderContext.getRender(RenderType.HTML).inject(clazz, request, fragment);
}
 
Example #13
Source File: AllSortJsonPipeline.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Schedules a sub-request for every URL found under each category's
 * {@code "categorys"} array, after appending a fixed query suffix.
 *
 * @param currRequest request the sub-requests are derived from
 * @param categorys   category objects; nothing happens when {@code null}
 */
private void process(HttpRequest currRequest, JSONArray categorys) {
	if (categorys == null) {
		return;
	}
	final String querySuffix = "&delivery=1&page=1&JL=4_10_0&go=0";
	for (int categoryIdx = 0; categoryIdx < categorys.size(); categoryIdx++) {
		JSONArray links = categorys.getJSONObject(categoryIdx).getJSONArray("categorys");
		for (int linkIdx = 0; linkIdx < links.size(); linkIdx++) {
			String target = links.getJSONObject(linkIdx).getString("url") + querySuffix;
			SchedulerContext.into(currRequest.subRequest(target));
		}
	}
}
 
Example #14
Source File: JDDetail.java    From gecco-htmlunit with MIT License 5 votes vote down vote up
/**
 * Starts a single-threaded crawl of one JD product page, scanning the
 * {@code com.geccocrawler.gecco.htmlunit} package.
 *
 * @param args not used
 * @throws Exception declared by this method's signature
 */
public static void main(String[] args) throws Exception {
	HttpRequest startRequest = new HttpGetRequest("http://item.jd.com/1455427.html");
	startRequest.setCharset("GBK");
	GeccoEngine.create()
		.classpath("com.geccocrawler.gecco.htmlunit")
		// address of the page the crawl starts from
		.start(startRequest)
		// number of crawler threads to start
		.thread(1)
		.run();
}
 
Example #15
Source File: AllSortPipeline.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Builds a product-list request for every link of every category and stores
 * it in {@code sortRequests} instead of scheduling it immediately.
 *
 * @param allSort   bean whose request the sub-requests are derived from
 * @param categorys categories to walk; nothing happens when {@code null}
 */
private void process(AllSort allSort, List<Category> categorys) {
	if (categorys == null) {
		return;
	}
	for (Category current : categorys) {
		for (HrefBean link : current.getCategorys()) {
			String target = link.getUrl() + "&delivery=1&page=1&JL=4_10_0&go=0";
			HttpRequest parent = allSort.getRequest();
			// Keep the category's product-list address for later instead of scheduling it now.
			sortRequests.add(parent.subRequest(target));
		}
	}
}
 
Example #16
Source File: JsonFieldRender.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Renders each non-null element of {@code src} into a bean via
 * {@code spiderBeanRender}.
 *
 * @param src          value that is cast to {@link Iterable}
 * @param genericClass bean type for the elements
 * @param request      request handed to the renderer
 * @return the rendered beans in iteration order; null elements are skipped
 */
@SuppressWarnings({ "rawtypes" })
private List<SpiderBean> spiderBeanListRender(Object src, Class genericClass, HttpRequest request) {
	List<SpiderBean> rendered = new ArrayList<SpiderBean>();
	Iterable elements = (Iterable) src;
	for (Object element : elements) {
		if (element == null) {
			continue;
		}
		rendered.add(spiderBeanRender(element, genericClass, request));
	}
	return rendered;
}
 
Example #17
Source File: JsonFieldRender.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Renders {@code src} into a bean of type {@code genericClass}.
 * <p>
 * The string form of {@code src} is wrapped in a simple response. The JSON
 * renderer is used when {@code genericClass} has {@code JsonBean} as a
 * super type, otherwise the HTML renderer.
 *
 * @param src          source value; its {@code toString()} is the content
 * @param genericClass bean type to produce
 * @param request      request handed to the renderer
 * @return the bean returned by the renderer
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private SpiderBean spiderBeanRender(Object src, Class genericClass, HttpRequest request) {
	HttpResponse wrapped = HttpResponse.createSimple(src.toString());
	Render renderer = ReflectUtils.haveSuperType(genericClass, JsonBean.class)
			? RenderContext.getRender(RenderType.JSON)
			: RenderContext.getRender(RenderType.HTML);
	SpiderBean produced = renderer.inject(genericClass, request, wrapped);
	return produced;
}
 
Example #18
Source File: RequestFieldRender.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Stores the current request in {@code beanMap} under the name of every
 * {@link Request}-annotated field of the bean.
 *
 * @param request  request to inject
 * @param response not used by this renderer
 * @param beanMap  target map
 * @param bean     bean whose class is scanned for annotated fields
 */
@Override
@SuppressWarnings({"unchecked" })
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
	Set<Field> annotated = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(Request.class));
	for (Field target : annotated) {
		String fieldName = target.getName();
		beanMap.put(fieldName, request);
	}
}
 
Example #19
Source File: JSVarFieldRender.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Evaluates the page's inline scripts in a fresh script scope and renders
 * every {@link JSVar}-annotated field of the bean from that scope.
 * <p>
 * Empty {@code window} and {@code document} objects are defined first. Each
 * non-empty {@code <script>} body is then evaluated, and scripts that fail
 * are skipped. The script {@code Context} entered here is always exited,
 * including when rendering throws.
 *
 * @param request  current request; its URL is given to the HTML parser
 * @param response response whose content is parsed for scripts
 * @param beanMap  map read by the injection and updated with the results
 * @param bean     bean whose class is scanned for annotated fields
 */
@Override
@SuppressWarnings({ "unchecked" })
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
	Context cx = Context.enter();
	try {
		ScriptableObject scope = cx.initSafeStandardObjects();
		String windowScript = "var window = {};var document = {};";
		cx.evaluateString(scope, windowScript, "window", 1, null);
		HtmlParser parser = new HtmlParser(request.getUrl(), response.getContent());
		for (Element ele : parser.$("script")) {
			String sc = ele.html();
			if (StringUtils.isNotEmpty(sc)) {
				try {
					cx.evaluateString(scope, sc, "", 1, null);
				} catch (Exception ignored) {
					// Best effort: a script that cannot be evaluated here is skipped
					// so the remaining scripts can still define variables.
				}
			}
		}
		Map<String, Object> fieldMap = new HashMap<String, Object>();
		Set<Field> jsVarFields = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(JSVar.class));
		for (Field jsVarField : jsVarFields) {
			Object value = injectJsVarField(request, beanMap, jsVarField, cx, scope);
			if(value != null) {
				fieldMap.put(jsVarField.getName(), value);
			}
		}
		beanMap.putAll(fieldMap);
	} finally {
		// Every Context.enter() must be paired with Context.exit(), even on failure.
		Context.exit();
	}
}
 
Example #20
Source File: HtmlParser.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Renders each fragment selected by {@code selector} into a bean of type
 * {@code clazz} using the HTML renderer.
 *
 * @param selector selector passed to {@code $list}
 * @param request  request handed to the renderer
 * @param clazz    bean type to produce
 * @return one bean per selected fragment, in the order {@code $list} returned them
 */
public List<SpiderBean> $beanList(String selector, HttpRequest request, Class<? extends SpiderBean> clazz) {
	List<SpiderBean> beans = new ArrayList<SpiderBean>();
	for (String fragmentHtml : $list(selector)) {
		HttpResponse fragment = HttpResponse.createSimple(fragmentHtml);
		SpiderBean produced = RenderContext.getRender(RenderType.HTML).inject(clazz, request, fragment);
		beans.add(produced);
	}
	return beans;
}
 
Example #21
Source File: AjaxFieldRender.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Downloads the URL named by the field's {@link Ajax} annotation and renders
 * the response into a bean of the field's type.
 * <p>
 * Placeholders in the URL are filled from the request parameters first and
 * from {@code beanMap} second. The JSON renderer is used when the field type
 * has {@code JsonBean} as a super type, otherwise the HTML renderer. A
 * {@code DownloadException} is logged and yields {@code null}. The
 * sub-response is always closed.
 *
 * @param request current request; the sub-request is derived from it
 * @param beanMap values already rendered for the bean
 * @param field   field carrying the {@link Ajax} annotation
 * @return the rendered bean, or {@code null} when the download failed
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private Object injectAjaxField(HttpRequest request, BeanMap beanMap, Field field) {
	// The type of an Ajax field must be a spider bean.
	Class beanType = field.getType();
	String template = field.getAnnotation(Ajax.class).url();
	String resolvedUrl = UrlMatcher.replaceFields(UrlMatcher.replaceParams(template, request.getParameters()), beanMap);
	HttpRequest ajaxRequest = request.subRequest(resolvedUrl);
	HttpResponse ajaxResponse = null;
	try {
		ajaxResponse = DownloaderContext.download(ajaxRequest);
		RenderType renderType = ReflectUtils.haveSuperType(beanType, JsonBean.class) ? RenderType.JSON : RenderType.HTML;
		return RenderContext.getRender(renderType).inject(beanType, ajaxRequest, ajaxResponse);
	} catch (DownloadException failure) {
		// Logged instead of thrown; the field is simply left unset.
		FieldRenderException.log(field, failure.getMessage(), failure);
		return null;
	} finally {
		if (ajaxResponse != null) {
			ajaxResponse.close();
		}
	}
}
 
Example #22
Source File: SpiderScheduler.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Offers the request to the queue and, when debug logging is enabled, logs
 * its URL and Referer header. The result of {@code offer} is not checked.
 *
 * @param request request to enqueue
 */
@Override
public void into(HttpRequest request) {
	queue.offer(request);
	if (!log.isDebugEnabled()) {
		return;
	}
	log.debug("INTO:"+request.getUrl()+"(Referer:"+request.getHeaders().get("Referer")+")");
}
 
Example #23
Source File: SpiderScheduler.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Polls the next request from the queue and, when one is present and debug
 * logging is enabled, logs its URL and Referer header.
 *
 * @return the polled request, or {@code null} when {@code poll} returned none
 */
@Override
public HttpRequest out() {
	HttpRequest next = queue.poll();
	if (next != null && log.isDebugEnabled()) {
		log.debug("OUT:"+next.getUrl()+"(Referer:"+next.getHeaders().get("Referer")+")");
	}
	return next;
}
 
Example #24
Source File: StartScheduler.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Puts the request on the start queue.
 * <p>
 * If the thread is interrupted while waiting, the stack trace is printed and
 * the interrupt flag is restored so callers can still observe it. The
 * request is not enqueued in that case.
 *
 * @param request request to enqueue
 */
@Override
public void into(HttpRequest request) {
	try {
		startQueue.put(request);
	} catch (InterruptedException e) {
		e.printStackTrace();
		// Catching InterruptedException clears the flag; set it again for the caller.
		Thread.currentThread().interrupt();
	}
}
 
Example #25
Source File: AllSortJsonPipeline.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Rebuilds the originating request from the {@code "request"} object and
 * schedules the links found under the {@code "mobile"} categories.
 *
 * @param allSort JSON form of the rendered category page
 */
@Override
public void process(JSONObject allSort) {
	// Only the "mobile" categories are processed here.
	process(HttpGetRequest.fromJson(allSort.getJSONObject("request")), allSort.getJSONArray("mobile"));
}
 
Example #26
Source File: UnirestDownloader.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Chooses the charset for a response: the one parsed from the content type,
 * else the request's charset, else {@code "UTF-8"}.
 *
 * @param request     request supplying the second choice
 * @param contentType value of the response's Content-Type header
 * @return the first non-null candidate in the order above
 */
private String getCharset(HttpRequest request, String contentType) {
	String fromContentType = ResponseUtils.getCharsetFromContentType(contentType);
	if (fromContentType != null) {
		return fromContentType;
	}
	String fromRequest = request.getCharset();
	return fromRequest != null ? fromRequest : "UTF-8";
}
 
Example #27
Source File: FIFOScheduler.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Enqueues a request. A {@code null} request is ignored.
 * <p>
 * When {@code offer} rejects the request, its URL is logged at error level
 * and the request is dropped.
 * NOTE(review): the original comment said this call blocks when the bound is
 * exceeded, but a rejected {@code offer} is handled here by logging — confirm
 * against the queue's type.
 */
@Override
public void into(HttpRequest request) {
	if (request == null) {
		return;
	}
	boolean accepted = queue.offer(request);
	if (!accepted) {
		log.error(request.getUrl());
		return;
	}
	if (log.isDebugEnabled()) {
		log.debug("<==="+request.getUrl());
	}
}
 
Example #28
Source File: FIFOScheduler.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Dequeues the next request while holding {@code outLock}.
 * <p>
 * A request polled from the main queue is returned directly. When the main
 * queue has none, a request is taken from the start queue, put back onto it,
 * and returned.
 * NOTE(review): {@code take()} presumably blocks while the start queue is
 * empty — confirm against the queue's type.
 * <p>
 * If the thread is interrupted, the stack trace is printed, the interrupt
 * flag is restored and {@code null} is returned.
 *
 * @return the next request, or {@code null} when interrupted
 */
@Override
public HttpRequest out() {
	outLock.lock();
	try {
		HttpRequest request = queue.poll();
		if(request != null) {
			if(log.isDebugEnabled()) {
				log.debug("===>"+request.getUrl());
			}
			return request;
		}
		// Main queue is empty: take a start request and put it back on the start queue.
		request = starQueue.take();
		if(log.isDebugEnabled()) {
			log.debug("[start]===>"+request.getUrl());
		}
		starQueue.put(request);
		if(log.isDebugEnabled()) {
			log.debug("<===[start]"+request.getUrl());
		}
		return request;
	} catch (InterruptedException e) {
		e.printStackTrace();
		// Catching InterruptedException clears the flag; set it again for the caller.
		Thread.currentThread().interrupt();
		return null;
	} finally {
		outLock.unlock();
	}
}
 
Example #29
Source File: UniqueSpiderScheduler.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Removes the first entry of the set and returns the request it wraps.
 * When debug logging is enabled and the request is non-null, its priority,
 * URL and Referer header are logged.
 *
 * @return the wrapped request, or {@code null} when the set had no entry
 */
@Override
public HttpRequest out() {
	SortHttpRequest head = set.pollFirst();
	if (head == null) {
		return null;
	}
	long priority = head.getPriority();
	HttpRequest next = head.getHttpRequest();
	if (next == null) {
		return next;
	}
	if (log.isDebugEnabled()) {
		log.debug("OUT("+priority+"):"+next.getUrl()+"(Referer:"+next.getHeaders().get("Referer")+")");
	}
	return next;
}
 
Example #30
Source File: UniqueSpiderScheduler.java    From gecco with MIT License 5 votes vote down vote up
/**
 * Adds the request to the set with {@code System.nanoTime()} as its priority.
 * <p>
 * Nothing is logged unless debug logging is enabled. With debug enabled, a
 * successful add is logged at debug level and a rejected add at error level.
 * NOTE(review): the rejection is logged at error level but only while debug
 * is enabled — confirm whether that gating is intended.
 *
 * @param request request to enqueue
 */
@Override
public void into(HttpRequest request) {
	long priority = System.nanoTime();
	boolean added = set.add(new SortHttpRequest(priority, request));
	if (!log.isDebugEnabled()) {
		return;
	}
	if (added) {
		log.debug("INTO("+priority+"):"+request.getUrl()+"(Referer:"+request.getHeaders().get("Referer")+")");
	} else {
		log.error("not unique request : " + request.getUrl());
	}
}