com.geccocrawler.gecco.request.HttpRequest Java Examples
The following examples show how to use
com.geccocrawler.gecco.request.HttpRequest.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JsonFieldRender.java From gecco with MIT License | 6 votes |
@Override @SuppressWarnings({ "unchecked" }) public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) { Map<String, Object> fieldMap = new HashMap<String, Object>(); Set<Field> jsonPathFields = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(JSONPath.class)); String jsonStr = response.getContent(); jsonStr = jsonp2Json(jsonStr); if (jsonStr == null) { return; } try { Object json = JSON.parse(jsonStr); for (Field field : jsonPathFields) { Object value = injectJsonField(request, field, json); if(value != null) { fieldMap.put(field.getName(), value); } } } catch(JSONException ex) { //throw new RenderException(ex.getMessage(), bean.getClass()); RenderException.log("json parse error : " + request.getUrl(), bean.getClass(), ex); } beanMap.putAll(fieldMap); }
Example #2
Source File: ProductListPipeline.java From gecco with MIT License | 6 votes |
@Override public void process(ProductList productList) { HttpRequest currRequest = productList.getRequest(); //下一页继续抓取 int currPage = productList.getCurrPage(); int nextPage = currPage + 1; int totalPage = productList.getTotalPage(); if(nextPage <= totalPage) { String nextUrl = ""; String currUrl = currRequest.getUrl(); if(currUrl.indexOf("page=") != -1) { nextUrl = StringUtils.replaceOnce(currUrl, "page=" + currPage, "page=" + nextPage); } else { nextUrl = currUrl + "&" + "page=" + nextPage; } SchedulerContext.into(currRequest.subRequest(nextUrl)); } }
Example #3
Source File: Spider.java From gecco with MIT License | 6 votes |
/**
 * Downloads {@code request}, running the optional before/after hooks of the given context.
 *
 * <p>Without a context the engine's default downloader is used with a timeout of 1000 and
 * no hooks are invoked.
 *
 * @param context the spider bean context supplying downloader, hooks and timeout; may be {@code null}
 * @param request the request to download
 * @return the response produced by the downloader
 * @throws DownloadException propagated from the downloader
 */
private HttpResponse download(SpiderBeanContext context, HttpRequest request) throws DownloadException {
    if (context == null) {
        Downloader fallback = engine.getSpiderBeanFactory().getDownloaderFactory().defaultDownloader();
        return fallback.download(request, 1000);
    }

    Downloader downloader = context.getDownloader();
    BeforeDownload preHook = context.getBeforeDownload();
    AfterDownload postHook = context.getAfterDownload();
    int timeout = context.getTimeout();

    if (preHook != null) {
        preHook.process(request);
    }
    HttpResponse response = downloader.download(request, timeout);
    if (postHook != null) {
        postHook.process(request, response);
    }
    return response;
}
Example #4
Source File: SpiderBeanFactory.java From gecco with MIT License | 6 votes |
public Class<? extends SpiderBean> matchSpider(HttpRequest request) { String url = request.getUrl(); Class<? extends SpiderBean> commonSpider = null;// 通用爬虫 for (Map.Entry<String, Class<? extends SpiderBean>> entrys : spiderBeans.entrySet()) { Class<? extends SpiderBean> spider = entrys.getValue(); String urlPattern = entrys.getKey(); Map<String, String> params = UrlMatcher.match(url, urlPattern); if (params != null) { request.setParameters(params); return spider; } else { if (urlPattern.equals("*")) { commonSpider = spider; } } } if (commonSpider != null) {// 如果包含通用爬虫,返回通用爬虫 return commonSpider; } return null; }
Example #5
Source File: JdPricesFieldRender.java From gecco with MIT License | 6 votes |
/**
 * Fetches JD prices through a sub-request and stores the parsed list in {@code beanMap}
 * under the name of {@code field}.
 *
 * <p>The SKU id list is URL-encoded as UTF-8 and appended to the price endpoint. Any failure
 * while downloading or parsing is printed and otherwise ignored, leaving {@code beanMap}
 * untouched. The sub-response is always closed once it has been read.
 *
 * @param request  the parent request from which the sub-request is derived
 * @param response the parent response (not used here)
 * @param beanMap  target map receiving the price list
 * @param bean     the bean being rendered; must be a {@code ProductList}
 * @param field    the field whose name is used as the key in {@code beanMap}
 */
@Override
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean, Field field) {
    // Kept for the (currently disabled) SKU collection below; the cast also rejects other bean types.
    ProductList jd = (ProductList) bean;
    StringBuilder sb = new StringBuilder();
    /*for(String code : jd.getCodes()) {
        sb.append("J_").append(code).append(",");
    }*/
    String skuIds = sb.toString();
    try {
        skuIds = URLEncoder.encode(skuIds, "UTF-8");
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    }
    String url = "http://p.3.cn/prices/mgets?skuIds=" + skuIds;
    HttpRequest subRequest = request.subRequest(url);
    HttpResponse subReponse = null;
    try {
        subReponse = DownloaderContext.download(subRequest);
        String json = subReponse.getContent();
        List<JDPrice> prices = JSON.parseArray(json, JDPrice.class);
        beanMap.put(field.getName(), prices);
    } catch (Exception ex) {
        ex.printStackTrace();
    } finally {
        // Release the sub-response even when parsing fails, as the other field renders do.
        if (subReponse != null) {
            subReponse.close();
        }
    }
}
Example #6
Source File: ImageFieldRender.java From gecco with MIT License | 5 votes |
private String downloadImage(HttpRequest request, Field field, String imgUrl) { if(StringUtils.isEmpty(imgUrl)) { return imgUrl; } Image image = field.getAnnotation(Image.class); String parentPath = image.download(); if(StringUtils.isEmpty(parentPath)) { return imgUrl; } HttpResponse subReponse = null; try { String before = StringUtils.substringBefore(imgUrl, "?"); String last = StringUtils.substringAfter(imgUrl, "?"); String fileName = StringUtils.substringAfterLast(before, "/"); if(StringUtils.isNotEmpty(last)) { last = URLEncoder.encode(last, "UTF-8"); imgUrl = before + "?" + last; } HttpRequest subRequest = request.subRequest(imgUrl); subReponse = DownloaderContext.defaultDownload(subRequest); return DownloadImage.download(parentPath, fileName, subReponse.getRaw()); } catch (Exception ex) { //throw new FieldRenderException(field, ex.getMessage(), ex); FieldRenderException.log(field, "download image error : " + imgUrl, ex); return imgUrl; } finally { if(subReponse != null) { subReponse.close(); } } }
Example #7
Source File: AjaxFieldRender.java From gecco with MIT License | 5 votes |
/**
 * Renders every field of the bean's class annotated with {@code Ajax} and copies the
 * non-null results into {@code beanMap}, keyed by field name.
 *
 * @param request  the current request, forwarded to {@code injectAjaxField}
 * @param response the current response (not used here)
 * @param beanMap  source of already rendered values and target of the new ones
 * @param bean     the bean whose annotated fields are rendered
 */
@Override
@SuppressWarnings("unchecked")
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
    Set<Field> annotated = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(Ajax.class));
    Map<String, Object> rendered = new HashMap<String, Object>();
    for (Field target : annotated) {
        Object result = injectAjaxField(request, beanMap, target);
        if (result == null) {
            continue;
        }
        rendered.put(target.getName(), result);
    }
    beanMap.putAll(rendered);
}
Example #8
Source File: UnirestDownloader.java From gecco with MIT License | 5 votes |
/**
 * Executes {@code request} through Unirest and converts the result into an
 * {@code HttpResponse}.
 *
 * <p>The proxy returned by {@code Proxys.getProxy()} (possibly {@code null}) is installed on
 * Unirest and a {@code User-Agent} header is added to the request. {@code HttpPostRequest}
 * instances are sent as POST with their fields; everything else is sent as GET.
 *
 * @param request the request to execute
 * @return a response carrying status, raw body, string body, content type and charset
 * @throws DownloaderException wrapping any {@code UnirestException}
 */
@Override
public HttpResponse download(HttpRequest request) throws DownloaderException {
    if (log.isDebugEnabled()) {
        log.debug("downloading..." + request.getUrl());
    }
    try {
        // A null proxy clears any previously configured one.
        HttpHost proxy = Proxys.getProxy();
        Unirest.setProxy(proxy);

        request.addHeader("User-Agent", UserAgent.getUserAgent());

        com.mashape.unirest.http.HttpResponse<String> raw;
        if (request instanceof HttpPostRequest) {
            HttpPostRequest post = (HttpPostRequest) request;
            HttpRequestWithBody call = Unirest.post(post.getUrl());
            call.headers(post.getHeaders());
            call.fields(post.getFields());
            raw = call.asString();
        } else {
            raw = Unirest.get(request.getUrl()).headers(request.getHeaders()).asString();
        }

        String contentType = raw.getHeaders().getFirst("Content-Type");
        HttpResponse result = new HttpResponse();
        result.setStatus(raw.getStatus());
        result.setRaw(raw.getRawBody());
        result.setContent(raw.getBody());
        result.setContentType(contentType);
        result.setCharset(getCharset(request, contentType));
        return result;
    } catch (UnirestException e) {
        throw new DownloaderException(e);
    }
}
Example #9
Source File: ImageFieldRender.java From gecco with MIT License | 5 votes |
/**
 * Renders every field of the bean's class annotated with {@code Image} and copies the
 * non-null results into {@code beanMap}, keyed by field name.
 *
 * @param request  the current request, forwarded to {@code injectImageField}
 * @param response the current response (not used here)
 * @param beanMap  source of already rendered values and target of the new ones
 * @param bean     the bean whose annotated fields are rendered
 */
@Override
@SuppressWarnings("unchecked")
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
    Set<Field> annotated = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(Image.class));
    Map<String, Object> rendered = new HashMap<String, Object>();
    for (Field target : annotated) {
        Object result = injectImageField(request, beanMap, bean, target);
        if (result == null) {
            continue;
        }
        rendered.put(target.getName(), result);
    }
    beanMap.putAll(rendered);
}
Example #10
Source File: HtmlFieldRender.java From gecco with MIT License | 5 votes |
/**
 * Renders every field of the bean's class annotated with {@code HtmlField} and copies the
 * non-null results into {@code beanMap}, keyed by field name.
 *
 * @param request  the current request, forwarded to {@code injectHtmlField}
 * @param response the current response, forwarded to {@code injectHtmlField}
 * @param beanMap  target map receiving the rendered values
 * @param bean     the bean whose annotated fields are rendered
 */
@Override
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
    Set<Field> annotated = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(HtmlField.class));
    Map<String, Object> rendered = new HashMap<String, Object>();
    for (Field target : annotated) {
        Object result = injectHtmlField(request, response, target, bean.getClass());
        if (result == null) {
            continue;
        }
        rendered.put(target.getName(), result);
    }
    beanMap.putAll(rendered);
}
Example #11
Source File: HtmlRender.java From gecco with MIT License | 5 votes |
/**
 * Renders the fields of an HTML-backed bean by delegating, in this order, to
 * {@code htmlFieldRender}, {@code ajaxFieldRender}, {@code jsVarFieldRender} and
 * {@code imageFieldRender}. Each delegate receives the same arguments.
 *
 * @param request  the current request
 * @param response the current response
 * @param beanMap  the map the delegates write rendered values into
 * @param bean     the bean being rendered
 */
@Override
public void fieldRender(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
    // Stage 1: values taken directly from the HTML document.
    htmlFieldRender.render(request, response, beanMap, bean);

    // Stage 2: values fetched through follow-up ajax requests.
    ajaxFieldRender.render(request, response, beanMap, bean);

    // Stage 3: values read from JavaScript variables.
    jsVarFieldRender.render(request, response, beanMap, bean);

    // Stage 4: image fields.
    imageFieldRender.render(request, response, beanMap, bean);
}
Example #12
Source File: HtmlParser.java From gecco with MIT License | 5 votes |
public SpiderBean $bean(String selector, HttpRequest request, Class<? extends SpiderBean> clazz) { String subHtml = $html(selector); // table HttpResponse subResponse = HttpResponse.createSimple(subHtml); Render render = RenderContext.getRender(RenderType.HTML); return render.inject(clazz, request, subResponse); }
Example #13
Source File: AllSortJsonPipeline.java From gecco with MIT License | 5 votes |
/**
 * Schedules one sub-request per link found in the {@code "categorys"} array of every entry
 * of {@code categorys}. A fixed query suffix is appended to each link's {@code "url"}.
 *
 * @param currRequest the request from which sub-requests are derived
 * @param categorys   the category entries; nothing happens when {@code null}
 */
private void process(HttpRequest currRequest, JSONArray categorys) {
    if (categorys == null) {
        return;
    }
    for (int categoryIdx = 0; categoryIdx < categorys.size(); categoryIdx++) {
        JSONArray links = categorys.getJSONObject(categoryIdx).getJSONArray("categorys");
        for (int linkIdx = 0; linkIdx < links.size(); linkIdx++) {
            JSONObject link = links.getJSONObject(linkIdx);
            String listUrl = link.getString("url") + "&delivery=1&page=1&JL=4_10_0&go=0";
            HttpRequest listRequest = currRequest.subRequest(listUrl);
            SchedulerContext.into(listRequest);
        }
    }
}
Example #14
Source File: JDDetail.java From gecco-htmlunit with MIT License | 5 votes |
public static void main(String[] args) throws Exception { HttpRequest request = new HttpGetRequest("http://item.jd.com/1455427.html"); request.setCharset("GBK"); GeccoEngine.create() .classpath("com.geccocrawler.gecco.htmlunit") //开始抓取的页面地址 .start(request) //开启几个爬虫线程 .thread(1) .run(); }
Example #15
Source File: AllSortPipeline.java From gecco with MIT License | 5 votes |
private void process(AllSort allSort, List<Category> categorys) { if(categorys == null) { return; } for(Category category : categorys) { List<HrefBean> hrefs = category.getCategorys(); for(HrefBean href : hrefs) { String url = href.getUrl()+"&delivery=1&page=1&JL=4_10_0&go=0"; HttpRequest currRequest = allSort.getRequest(); //SchedulerContext.into(currRequest.subRequest(url)); //将分类的商品列表地址暂存起来 sortRequests.add(currRequest.subRequest(url)); } } }
Example #16
Source File: JsonFieldRender.java From gecco with MIT License | 5 votes |
/**
 * Renders each non-null element of {@code src} into a bean via {@code spiderBeanRender}.
 *
 * @param src          the source collection; must be an {@code Iterable}
 * @param genericClass the bean class to render each element as
 * @param request      the request passed on to the render
 * @return the rendered beans in iteration order; {@code null} elements are skipped
 */
@SuppressWarnings({ "rawtypes" })
private List<SpiderBean> spiderBeanListRender(Object src, Class genericClass, HttpRequest request) {
    List<SpiderBean> beans = new ArrayList<SpiderBean>();
    for (Object element : (Iterable) src) {
        if (element == null) {
            continue;
        }
        beans.add(this.spiderBeanRender(element, genericClass, request));
    }
    return beans;
}
Example #17
Source File: JsonFieldRender.java From gecco with MIT License | 5 votes |
/**
 * Renders {@code src} into a bean of {@code genericClass}.
 *
 * <p>The string form of {@code src} is wrapped in a simple response. The JSON render is
 * used when {@code genericClass} has {@code JsonBean} as a super type, otherwise the HTML
 * render is used.
 *
 * @param src          the source object; its {@code toString()} becomes the response content
 * @param genericClass the bean class to render
 * @param request      the request passed on to the render
 * @return the rendered bean
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private SpiderBean spiderBeanRender(Object src, Class genericClass, HttpRequest request) {
    HttpResponse wrapped = HttpResponse.createSimple(src.toString());
    RenderType type = ReflectUtils.haveSuperType(genericClass, JsonBean.class)
            ? RenderType.JSON
            : RenderType.HTML;
    Render render = RenderContext.getRender(type);
    SpiderBean rendered = render.inject(genericClass, request, wrapped);
    return rendered;
}
Example #18
Source File: RequestFieldRender.java From gecco with MIT License | 5 votes |
/**
 * Stores the current request in {@code beanMap} under the name of every field of the
 * bean's class annotated with {@code Request}.
 *
 * @param request  the request to inject
 * @param response the current response (not used here)
 * @param beanMap  target map receiving the request
 * @param bean     the bean whose annotated fields are looked up
 */
@Override
@SuppressWarnings({"unchecked" })
public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) {
    Class<?> beanType = bean.getClass();
    Set<Field> targets = ReflectionUtils.getAllFields(beanType, ReflectionUtils.withAnnotation(Request.class));
    for (Field target : targets) {
        String key = target.getName();
        beanMap.put(key, request);
    }
}
Example #19
Source File: JSVarFieldRender.java From gecco with MIT License | 5 votes |
@Override @SuppressWarnings({ "unchecked" }) public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) { Context cx = Context.enter(); ScriptableObject scope = cx.initSafeStandardObjects(); String windowScript = "var window = {};var document = {};"; cx.evaluateString(scope, windowScript, "window", 1, null); HtmlParser parser = new HtmlParser(request.getUrl(), response.getContent()); for (Element ele : parser.$("script")) { String sc = ele.html(); if (StringUtils.isNotEmpty(sc)) { try { cx.evaluateString(scope, sc, "", 1, null); } catch (Exception ex) { // ex.printStackTrace(); } } } Map<String, Object> fieldMap = new HashMap<String, Object>(); Set<Field> jsVarFields = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(JSVar.class)); for (Field jsVarField : jsVarFields) { Object value = injectJsVarField(request, beanMap, jsVarField, cx, scope); if(value != null) { fieldMap.put(jsVarField.getName(), value); } } beanMap.putAll(fieldMap); Context.exit(); }
Example #20
Source File: HtmlParser.java From gecco with MIT License | 5 votes |
public List<SpiderBean> $beanList(String selector, HttpRequest request, Class<? extends SpiderBean> clazz) { List<SpiderBean> list = new ArrayList<SpiderBean>(); List<String> els = $list(selector); for (String el : els) { // table HttpResponse subResponse = HttpResponse.createSimple(el); Render render = RenderContext.getRender(RenderType.HTML); SpiderBean subBean = render.inject(clazz, request, subResponse); list.add(subBean); } return list; }
Example #21
Source File: AjaxFieldRender.java From gecco with MIT License | 5 votes |
@SuppressWarnings({ "rawtypes", "unchecked" }) private Object injectAjaxField(HttpRequest request, BeanMap beanMap, Field field) { Class clazz = field.getType(); // ajax的属性类型必须是spiderBean Ajax ajax = field.getAnnotation(Ajax.class); String url = ajax.url(); url = UrlMatcher.replaceParams(url, request.getParameters()); url = UrlMatcher.replaceFields(url, beanMap); HttpRequest subRequest = request.subRequest(url); HttpResponse subReponse = null; try { subReponse = DownloaderContext.download(subRequest); RenderType type = RenderType.HTML; if (ReflectUtils.haveSuperType(clazz, JsonBean.class)) { type = RenderType.JSON; } Render render = RenderContext.getRender(type); return render.inject(clazz, subRequest, subReponse); } catch (DownloadException ex) { //throw new FieldRenderException(field, ex.getMessage(), ex); FieldRenderException.log(field, ex.getMessage(), ex); return null; } finally { if(subReponse != null) { subReponse.close(); } } }
Example #22
Source File: SpiderScheduler.java From gecco with MIT License | 5 votes |
/**
 * Offers {@code request} to the queue and, when debug logging is enabled, logs its URL and
 * {@code Referer} header.
 *
 * @param request the request to enqueue
 */
@Override
public void into(HttpRequest request) {
    queue.offer(request);
    if (!log.isDebugEnabled()) {
        return;
    }
    Object referer = request.getHeaders().get("Referer");
    log.debug("INTO:" + request.getUrl() + "(Referer:" + referer + ")");
}
Example #23
Source File: SpiderScheduler.java From gecco with MIT License | 5 votes |
/**
 * Polls the next request from the queue and, when one is present and debug logging is
 * enabled, logs its URL and {@code Referer} header.
 *
 * @return the polled request, or {@code null} when the queue yields none
 */
@Override
public HttpRequest out() {
    HttpRequest next = queue.poll();
    if (next != null && log.isDebugEnabled()) {
        Object referer = next.getHeaders().get("Referer");
        log.debug("OUT:" + next.getUrl() + "(Referer:" + referer + ")");
    }
    return next;
}
Example #24
Source File: StartScheduler.java From gecco with MIT License | 5 votes |
/**
 * Puts {@code request} into the start queue.
 *
 * <p>If the thread is interrupted while waiting, the stack trace is printed, the request
 * is not enqueued, and the thread's interrupt status is restored so callers can still
 * observe the interruption.
 *
 * @param request the request to enqueue
 */
@Override
public void into(HttpRequest request) {
    try {
        startQueue.put(request);
    } catch (InterruptedException e) {
        e.printStackTrace();
        // Catching InterruptedException clears the flag; set it again for the caller.
        Thread.currentThread().interrupt();
    }
}
Example #25
Source File: AllSortJsonPipeline.java From gecco with MIT License | 5 votes |
/**
 * Rebuilds the originating request from the {@code "request"} object of {@code allSort}
 * and schedules the product lists of the {@code "mobile"} categories.
 *
 * @param allSort the JSON form of the all-sort bean
 */
@Override
public void process(JSONObject allSort) {
    JSONObject requestJson = allSort.getJSONObject("request");
    HttpRequest currRequest = HttpGetRequest.fromJson(requestJson);
    JSONArray mobileCategories = allSort.getJSONArray("mobile");
    process(currRequest, mobileCategories);
    // Other category groups (domestic, baby) are currently not processed:
    /*List<Category> domestics = allSort.getDomestic();
    process(allSort, domestics);
    List<Category> bodys = allSort.getBaby();
    process(allSort, bodys);*/
}
Example #26
Source File: UnirestDownloader.java From gecco with MIT License | 5 votes |
/**
 * Determines the charset for a response.
 *
 * <p>Precedence: the charset parsed from {@code contentType}, then the charset configured
 * on the request, then {@code "UTF-8"}.
 *
 * @param request     the request that may carry an explicit charset
 * @param contentType the response's {@code Content-Type} header value
 * @return the first non-null charset in the precedence order above
 */
private String getCharset(HttpRequest request, String contentType) {
    String fromHeader = ResponseUtils.getCharsetFromContentType(contentType);
    if (fromHeader != null) {
        return fromHeader;
    }
    String fromRequest = request.getCharset();
    return fromRequest != null ? fromRequest : "UTF-8";
}
Example #27
Source File: FIFOScheduler.java From gecco with MIT License | 5 votes |
/**
 * Enqueues a request.
 *
 * <p>{@code null} requests are ignored. The request is handed to {@code queue.offer}; when
 * the offer is accepted the URL is logged at debug level, and when it is rejected the URL
 * is logged at error level. NOTE(review): the original comment said this blocks when the
 * queue bound is exceeded, but {@code offer} is used rather than a blocking put — confirm
 * which behaviour is intended.
 *
 * @param request the request to enqueue; may be {@code null}
 */
@Override
public void into(HttpRequest request) {
    if (request == null) {
        return;
    }
    boolean accepted = queue.offer(request);
    if (!accepted) {
        log.error(request.getUrl());
        return;
    }
    if (log.isDebugEnabled()) {
        log.debug("<===" + request.getUrl());
    }
}
Example #28
Source File: FIFOScheduler.java From gecco with MIT License | 5 votes |
/**
 * Dequeues the next request, waiting on the start queue when the main queue is empty.
 *
 * <p>While holding {@code outLock}, the main queue is polled first. If it yields nothing, a
 * request is taken from {@code starQueue} and immediately put back before being returned,
 * so start requests are served repeatedly.
 *
 * <p>If the thread is interrupted while waiting, the stack trace is printed, the thread's
 * interrupt status is restored, and {@code null} is returned.
 *
 * @return the next request, or {@code null} when interrupted while waiting
 */
@Override
public HttpRequest out() {
    outLock.lock();
    try {
        HttpRequest request = queue.poll();
        if (request == null) {
            request = starQueue.take();
            if (log.isDebugEnabled()) {
                log.debug("[start]===>" + request.getUrl());
            }
            // Put the start request back so it is available again on a later call.
            starQueue.put(request);
            if (log.isDebugEnabled()) {
                log.debug("<===[start]" + request.getUrl());
            }
            return request;
        } else {
            if (log.isDebugEnabled()) {
                log.debug("===>" + request.getUrl());
            }
            return request;
        }
    } catch (InterruptedException e) {
        e.printStackTrace();
        // Catching InterruptedException clears the flag; set it again for the caller.
        Thread.currentThread().interrupt();
        return null;
    } finally {
        outLock.unlock();
    }
}
Example #29
Source File: UniqueSpiderScheduler.java From gecco with MIT License | 5 votes |
/**
 * Removes and returns the request at the head of the sorted set.
 *
 * <p>When debug logging is enabled and the wrapped request is non-null, its priority, URL
 * and {@code Referer} header are logged.
 *
 * @return the wrapped request of the first entry, or {@code null} when the set is empty
 */
@Override
public HttpRequest out() {
    SortHttpRequest head = set.pollFirst();
    if (head == null) {
        return null;
    }
    long priority = head.getPriority();
    HttpRequest next = head.getHttpRequest();
    if (next == null || !log.isDebugEnabled()) {
        return next;
    }
    log.debug("OUT(" + priority + "):" + next.getUrl() + "(Referer:" + next.getHeaders().get("Referer") + ")");
    return next;
}
Example #30
Source File: UniqueSpiderScheduler.java From gecco with MIT License | 5 votes |
/**
 * Adds {@code request} to the sorted set, using the current {@code System.nanoTime()} as
 * its priority.
 *
 * <p>A successful add is logged at debug level. A rejected add (the set reported the entry
 * as already present) is logged at error level regardless of whether debug logging is
 * enabled; previously that error was only emitted when debug logging was on.
 *
 * @param request the request to enqueue
 */
@Override
public void into(HttpRequest request) {
    long priority = System.nanoTime();
    boolean success = set.add(new SortHttpRequest(priority, request));
    if (success) {
        if (log.isDebugEnabled()) {
            log.debug("INTO(" + priority + "):" + request.getUrl() + "(Referer:" + request.getHeaders().get("Referer") + ")");
        }
    } else {
        // An error-level message must not depend on the debug level being enabled.
        log.error("not unique request : " + request.getUrl());
    }
}