org.apache.nutch.net.protocols.HttpDateFormat Java Examples
The following examples show how to use
org.apache.nutch.net.protocols.HttpDateFormat.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: FileResponse.java From anthelion with Apache License 2.0 | 6 votes |
private void getDirAsHttpResponse(java.io.File f) throws IOException { String path = f.toString(); if (this.file.crawlParents) this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : true); else this.content = list2html(f.listFiles(), path, false); // set headers headers.set(Response.CONTENT_LENGTH, new Integer(this.content.length).toString()); headers.set(Response.CONTENT_TYPE, "text/html"); headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f.lastModified())); // response code this.code = 200; // http OK }
Example #2
Source File: FileResponse.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/** * get dir list as http response * @param f * @throws IOException */ private void getDirAsHttpResponse(java.io.File f) throws IOException { String path = f.toString(); if (this.file.crawlParents) this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : true); else this.content = list2html(f.listFiles(), path, false); // set headers headers.set(Response.CONTENT_LENGTH, new Integer(this.content.length).toString()); headers.set(Response.CONTENT_TYPE, "text/html"); headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f.lastModified())); // response code this.code = 200; // http OK }
Example #3
Source File: FtpResponse.java From anthelion with Apache License 2.0 | 5 votes |
private byte[] list2html(List list, String path, boolean includeDotDot) { //StringBuffer x = new StringBuffer("<!doctype html public \"-//ietf//dtd html//en\"><html><head>"); StringBuffer x = new StringBuffer("<html><head>"); x.append("<title>Index of "+path+"</title></head>\n"); x.append("<body><h1>Index of "+path+"</h1><pre>\n"); if (includeDotDot) { x.append("<a href='../'>../</a>\t-\t-\t-\n"); } for (int i=0; i<list.size(); i++) { FTPFile f = (FTPFile) list.get(i); String name = f.getName(); String time = HttpDateFormat.toString(f.getTimestamp()); if (f.isDirectory()) { // some ftp server LIST "." and "..", we skip them here if (name.equals(".") || name.equals("..")) continue; x.append("<a href='"+name+"/"+"'>"+name+"/</a>\t"); x.append(time+"\t-\n"); } else if (f.isFile()) { x.append("<a href='"+name+ "'>"+name+"</a>\t"); x.append(time+"\t"+f.getSize()+"\n"); } else { // ignore isSymbolicLink() // ignore isUnknown() } } x.append("</pre></body></html>\n"); return new String(x).getBytes(); }
Example #4
Source File: FileResponse.java From anthelion with Apache License 2.0 | 5 votes |
private byte[] list2html(java.io.File[] list, String path, boolean includeDotDot) { StringBuffer x = new StringBuffer("<html><head>"); x.append("<title>Index of " + path + "</title></head>\n"); x.append("<body><h1>Index of " + path + "</h1><pre>\n"); if (includeDotDot) { x.append("<a href='../'>../</a>\t-\t-\t-\n"); } // fix me: we might want to sort list here! but not now. java.io.File f; for (int i = 0; i < list.length; i++) { f = list[i]; String name = f.getName(); String time = HttpDateFormat.toString(f.lastModified()); if (f.isDirectory()) { // java 1.4.2 api says dir itself and parent dir are not listed // so the following is not needed. // if (name.equals(".") || name.equals("..")) // continue; x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t"); x.append(time + "\t-\n"); } else if (f.isFile()) { x.append("<a href='" + name + "'>" + name + "</a>\t"); x.append(time + "\t" + f.length() + "\n"); } else { // ignore any other } } x.append("</pre></body></html>\n"); return new String(x).getBytes(); }
Example #5
Source File: FtpResponse.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
private byte[] list2html(List<FTPFile> list, String path, boolean includeDotDot) { //StringBuffer x = new StringBuffer("<!doctype html public \"-//ietf//dtd html//en\"><html><head>"); StringBuffer x = new StringBuffer("<html><head>"); x.append("<title>Index of "+path+"</title></head>\n"); x.append("<body><h1>Index of "+path+"</h1><pre>\n"); if (includeDotDot) { x.append("<a href='../'>../</a>\t-\t-\t-\n"); } for (int i=0; i<list.size(); i++) { FTPFile f = (FTPFile) list.get(i); String name = f.getName(); String time = HttpDateFormat.toString(f.getTimestamp()); if (f.isDirectory()) { // some ftp server LIST "." and "..", we skip them here if (name.equals(".") || name.equals("..")) continue; x.append("<a href='"+name+"/"+"'>"+name+"/</a>\t"); x.append(time+"\t-\n"); } else if (f.isFile()) { x.append("<a href='"+name+ "'>"+name+"</a>\t"); x.append(time+"\t"+f.getSize()+"\n"); } else { // ignore isSymbolicLink() // ignore isUnknown() } } x.append("</pre></body></html>\n"); return new String(x).getBytes(); }
Example #6
Source File: FileResponse.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/** * generate html page from dir list * @param list * @param path * @param includeDotDot * @return */ private byte[] list2html(java.io.File[] list, String path, boolean includeDotDot) { StringBuffer x = new StringBuffer("<html><head>"); x.append("<title>Index of " + path + "</title></head>\n"); x.append("<body><h1>Index of " + path + "</h1><pre>\n"); if (includeDotDot) { x.append("<a href='../'>../</a>\t-\t-\t-\n"); } // fix me: we might want to sort list here! but not now. java.io.File f; for (int i = 0; i < list.length; i++) { f = list[i]; String name = f.getName(); String time = HttpDateFormat.toString(f.lastModified()); if (f.isDirectory()) { // java 1.4.2 api says dir itself and parent dir are not listed // so the following is not needed. // if (name.equals(".") || name.equals("..")) // continue; x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t"); x.append(time + "\t-\n"); } else if (f.isFile()) { x.append("<a href='" + name + "'>" + name + "</a>\t"); x.append(time + "\t" + f.length() + "\n"); } else { // ignore any other } } x.append("</pre></body></html>\n"); return new String(x).getBytes(); }
Example #7
Source File: FileResponse.java From anthelion with Apache License 2.0 | 4 votes |
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf) throws FileException, IOException { this.orig = url.toString(); this.base = url.toString(); this.file = file; this.conf = conf; MIME = new MimeUtil(conf); tika = new Tika(); if (!"file".equals(url.getProtocol())) throw new FileException("Not a file url:" + url); if (File.LOG.isTraceEnabled()) { File.LOG.trace("fetching " + url); } if (url.getPath() != url.getFile()) { if (File.LOG.isWarnEnabled()) { File.LOG.warn("url.getPath() != url.getFile(): " + url); } } String path = "".equals(url.getPath()) ? "/" : url.getPath(); try { // specify the encoding via the config later? path = java.net.URLDecoder.decode(path, "UTF-8"); } catch (UnsupportedEncodingException ex) { } try { this.content = null; // url.toURI() is only in j2se 1.5.0 //java.io.File f = new java.io.File(url.toURI()); java.io.File f = new java.io.File(path); if (!f.exists()) { this.code = 404; // http Not Found return; } if (!f.canRead()) { this.code = 401; // http Unauthorized return; } // symbolic link or relative path on unix // fix me: what's the consequence on windows platform // where case is insensitive if (!f.equals(f.getCanonicalFile())) { // set headers //hdrs.put("Location", f.getCanonicalFile().toURI()); // // we want to automatically escape characters that are illegal in URLs. // It is recommended that new code convert an abstract pathname into a URL // by first converting it into a URI, via the toURI method, and then // converting the URI into a URL via the URI.toURL method. headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString()); this.code = 300; // http redirect return; } if (f.lastModified() <= datum.getModifiedTime()) { this.code = 304; this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified())); return; } if (f.isDirectory()) { getDirAsHttpResponse(f); } else if (f.isFile()) { getFileAsHttpResponse(f); } else { this.code = 500; // http Internal Server Error return; } } catch (IOException e) { throw e; } }
Example #8
Source File: FileResponse.java From anthelion with Apache License 2.0 | 4 votes |
private void getFileAsHttpResponse(java.io.File f) throws FileException, IOException { // ignore file of size larger than // Integer.MAX_VALUE = 2^31-1 = 2147483647 long size = f.length(); if (size > Integer.MAX_VALUE) { throw new FileException("file is too large, size: " + size); // or we can do this? // this.code = 400; // http Bad request // return; } // capture content int len = (int) size; if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength) len = this.file.maxContentLength; this.content = new byte[len]; java.io.InputStream is = new java.io.FileInputStream(f); int offset = 0; int n = 0; while (offset < len && (n = is.read(this.content, offset, len - offset)) >= 0) { offset += n; } if (offset < len) { // keep whatever already have, but issue a warning if (File.LOG.isWarnEnabled()) { File.LOG.warn("not enough bytes read from file: " + f.getPath()); } } is.close(); // set headers headers.set(Response.CONTENT_LENGTH, new Long(size).toString()); headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f.lastModified())); String mimeType = MIME.getMimeType(f); headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : ""); // response code this.code = 200; // http OK }
Example #9
Source File: MoreIndexingFilter.java From anthelion with Apache License 2.0 | 4 votes |
private long getTime(String date, String url) { long time = -1; try { time = HttpDateFormat.toLong(date); } catch (ParseException e) { // try to parse it as date in alternative format try { Date parsedDate = DateUtils.parseDate(date, new String [] { "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz", "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz", "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz", "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss", "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz", "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm", "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz", "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz", "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz", "yyyy-MM-dd'T'HH:mm:ss'Z'" }); time = parsedDate.getTime(); // if (LOG.isWarnEnabled()) { // LOG.warn(url + ": parsed date: " + date +" to:"+time); // } } catch (Exception e2) { if (LOG.isWarnEnabled()) { LOG.warn(url + ": can't parse erroneous date: " + date); } } } return time; }
Example #10
Source File: FileResponse.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
/** * Default public constructor * @param url * @param datum * @param file * @param conf * @throws FileException * @throws IOException */ public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf) throws FileException, IOException { this.orig = url.toString(); this.base = url.toString(); this.file = file; this.conf = conf; MIME = new MimeUtil(conf); tika = new Tika(); if (!"file".equals(url.getProtocol())) throw new FileException("Not a file url:" + url); if (File.LOG.isTraceEnabled()) { File.LOG.trace("fetching " + url); } if (url.getPath() != url.getFile()) { if (File.LOG.isWarnEnabled()) { File.LOG.warn("url.getPath() != url.getFile(): " + url); } } String path = "".equals(url.getPath()) ? "/" : url.getPath(); try { // specify the encoding via the config later? path = java.net.URLDecoder.decode(path, "UTF-8"); } catch (UnsupportedEncodingException ex) { } try { this.content = null; // url.toURI() is only in j2se 1.5.0 //java.io.File f = new java.io.File(url.toURI()); java.io.File f = new java.io.File(path); if (!f.exists()) { this.code = 404; // http Not Found return; } if (!f.canRead()) { this.code = 401; // http Unauthorized return; } // symbolic link or relative path on unix // fix me: what's the consequence on windows platform // where case is insensitive if (!f.equals(f.getCanonicalFile())) { // set headers //hdrs.put("Location", f.getCanonicalFile().toURI()); // // we want to automatically escape characters that are illegal in URLs. // It is recommended that new code convert an abstract pathname into a URL // by first converting it into a URI, via the toURI method, and then // converting the URI into a URL via the URI.toURL method. headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString()); this.code = 300; // http redirect return; } if (f.lastModified() <= datum.getModifiedTime()) { this.code = 304; this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified())); return; } if (f.isDirectory()) { getDirAsHttpResponse(f); } else if (f.isFile()) { getFileAsHttpResponse(f); } else { this.code = 500; // http Internal Server Error return; } } catch (IOException e) { throw e; } }
Example #11
Source File: FileResponse.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
private void getFileAsHttpResponse(java.io.File f) throws FileException, IOException { // ignore file of size larger than // Integer.MAX_VALUE = 2^31-1 = 2147483647 long size = f.length(); if (size > Integer.MAX_VALUE) { throw new FileException("file is too large, size: " + size); // or we can do this? // this.code = 400; // http Bad request // return; } // capture content int len = (int) size; if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength) len = this.file.maxContentLength; this.content = new byte[len]; java.io.InputStream is = new java.io.FileInputStream(f); int offset = 0; int n = 0; while (offset < len && (n = is.read(this.content, offset, len - offset)) >= 0) { offset += n; } if (offset < len) { // keep whatever already have, but issue a warning if (File.LOG.isWarnEnabled()) { File.LOG.warn("not enough bytes read from file: " + f.getPath()); } } is.close(); // set headers headers.set(Response.CONTENT_LENGTH, new Long(size).toString()); headers.set(Response.LAST_MODIFIED, HttpDateFormat.toString(f.lastModified())); String mimeType = tika.detect(f); headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : ""); // response code this.code = 200; // http OK }
Example #12
Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
private long getTime(String date, String url) { long time = -1; try { time = HttpDateFormat.toLong(date); } catch (ParseException e) { // try to parse it as date in alternative format try { Date parsedDate = DateUtils.parseDate(date, new String [] { "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz", "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz", "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz", "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss", "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz", "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm", "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz", "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz", "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz", "yyyy-MM-dd'T'HH:mm:ss'Z'" }); time = parsedDate.getTime(); // if (LOG.isWarnEnabled()) { // LOG.warn(url + ": parsed date: " + date +" to:"+time); // } } catch (Exception e2) { if (LOG.isWarnEnabled()) { LOG.warn(url + ": can't parse erroneous date: " + date); } } } return time; }