Java Code Examples for org.jsoup.Jsoup#parse()
The following examples show how to use
org.jsoup.Jsoup#parse().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Nanrentu.java From PicKing with Apache License 2.0 | 6 votes |
/**
 * Builds the URL of the next detail page from a gb2312-encoded HTML response.
 *
 * @param baseUrl    not used by this implementation
 * @param currentUrl URL of the page that produced {@code result}; its prefix up to the
 *                   last '/' is prepended to the link found in the page
 * @param result     raw response bytes, decoded as gb2312
 * @return the next-page URL, or "" when the page has no "下一页" link, the link has no
 *         href, or {@code currentUrl} does not match {@code http.*}/
 * @throws UnsupportedEncodingException if the gb2312 charset is unavailable
 */
@Override
public String getDetailNext(String baseUrl, String currentUrl, byte[] result) throws UnsupportedEncodingException {
    Document page = Jsoup.parse(new String(result, "gb2312"));
    Elements nextLinks = page.select("div.pagelist a:contains(下一页)");
    if (nextLinks.isEmpty()) {
        return "";
    }

    String href = nextLinks.get(0).attr("href");
    if (href == null || "".equals(href)) {
        return "";
    }

    // Greedy match: everything from the first "http" through the last '/'.
    Matcher prefix = Pattern.compile("http.*/").matcher(currentUrl);
    return prefix.find() ? prefix.group() + href : "";
}
Example 2
Source File: ElementTest.java From astor with GNU General Public License v2.0 | 6 votes |
/** Checks that getElementsByClass finds elements by any one of their class names. */
@Test
public void testGetElementsWithClass() {
    String html = "<div class='mellow yellow'><span class=mellow>Hello <b class='yellow'>Yellow!</b></span><p>Empty</p></div>";
    Document doc = Jsoup.parse(html);

    // "mellow" is on the div and the span
    List<Element> mellow = doc.getElementsByClass("mellow");
    assertEquals(2, mellow.size());
    assertEquals("div", mellow.get(0).tagName());
    assertEquals("span", mellow.get(1).tagName());

    // "yellow" is on the div and the b
    List<Element> yellow = doc.getElementsByClass("yellow");
    assertEquals(2, yellow.size());
    assertEquals("div", yellow.get(0).tagName());
    assertEquals("b", yellow.get(1).tagName());

    // a class that appears nowhere yields an empty list
    List<Element> solo = doc.getElementsByClass("solo");
    assertEquals(0, solo.size());
}
Example 3
Source File: Job51ResumeParser.java From job with MIT License | 6 votes |
/**
 * Reads {@code file} as a MIME mail message and parses the first body part for which
 * {@code parseHtml} returns a non-null result.
 *
 * @param file the mail file to read
 * @return the parsed document, or {@code null} when no body part yields HTML
 * @throws Exception if the file cannot be read or the message cannot be decoded
 */
protected Document parse2HtmlAsMail(File file) throws Exception {
    String html = null;
    // try-with-resources: the stream is closed even when message parsing throws
    // (the original only closed it on the success path).
    try (InputStream in = new FileInputStream(file)) {
        Session mailSession = Session.getDefaultInstance(System.getProperties(), null);
        MimeMessage msg = new MimeMessage(mailSession, in);
        Multipart part = (Multipart) msg.getContent();
        for (int i = 0; i < part.getCount(); i++) {
            html = parseHtml(part.getBodyPart(i));
            if (html != null) {
                break;
            }
        }
    }
    return html == null ? null : Jsoup.parse(html);
}
Example 4
Source File: ElementTest.java From astor with GNU General Public License v2.0 | 6 votes |
@Test public void testAppendTo() { String parentHtml = "<div class='a'></div>"; String childHtml = "<div class='b'></div><p>Two</p>"; Document parentDoc = Jsoup.parse(parentHtml); Element parent = parentDoc.body(); Document childDoc = Jsoup.parse(childHtml); Element div = childDoc.select("div").first(); Element p = childDoc.select("p").first(); Element appendTo1 = div.appendTo(parent); assertEquals(div, appendTo1); Element appendTo2 = p.appendTo(div); assertEquals(p, appendTo2); assertEquals("<div class=\"a\"></div>\n<div class=\"b\">\n <p>Two</p>\n</div>", parentDoc.body().html()); assertEquals("", childDoc.body().html()); // got moved out }
Example 5
Source File: Demo.java From java-Crawler with MIT License | 6 votes |
/**
 * Downloads {@code url}, decodes the body as gb2312, and queues every anchor href
 * that does not contain "webPlay", prefixed with "http://www.dy2018.com".
 *
 * @param url the page to fetch
 * @return a queue holding the collected absolute URLs
 * @throws Exception if the request fails or the body cannot be read
 */
public static Queue getUrlQueue(String url) throws Exception {
    Queue queue = new Queue();
    // Both the client and the response hold connections; close them on every path
    // (the original never closed either).
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
        HttpEntity entity = response.getEntity();
        String html = EntityUtils.toString(entity, "gb2312");
        Document doc = Jsoup.parse(html);
        for (Element anchor : doc.select("a")) {
            String href = anchor.attr("href");
            // Links containing "webPlay" are skipped.
            if (href.indexOf("webPlay") == -1) {
                queue.enQueue("http://www.dy2018.com" + href);
            }
        }
    }
    return queue;
}
Example 6
Source File: NodeTest.java From astor with GNU General Public License v2.0 | 5 votes |
/** Checks that unwrap removes the span, keeps its children in place, and returns the first child. */
@Test
public void unwrap() {
    Document doc = Jsoup.parse("<div>One <span>Two <b>Three</b></span> Four</div>");
    Element span = doc.select("span").first();

    // Capture the first child before unwrapping so it can be compared afterwards.
    Node firstChildOfSpan = span.childNode(0);
    Node unwrapped = span.unwrap();

    String bodyHtml = TextUtil.stripNewlines(doc.body().html());
    assertEquals("<div>One Two <b>Three</b> Four</div>", bodyHtml);

    assertTrue(unwrapped instanceof TextNode);
    assertEquals("Two ", ((TextNode) unwrapped).text());
    assertEquals(unwrapped, firstChildOfSpan);
    assertEquals(unwrapped.parent(), doc.select("div").first());
}
Example 7
Source File: LyricsChart.java From QuickLyric with GNU General Public License v3.0 | 5 votes |
/**
 * Builds a {@code Lyrics} object from a ChartLyrics "GetLyricResult" XML response.
 *
 * @param xmlString        the XML response body
 * @param originalMetadata optional; index 0 is the original artist, index 1 the original
 *                         title. Missing entries are treated as {@code null}.
 * @return the parsed lyrics, or a {@code Lyrics.ERROR} result when the input is empty or
 *         contains no {@code GetLyricResult} element
 */
public static Lyrics fromXml(String xmlString, String... originalMetadata) {
    if (TextUtils.isEmpty(xmlString))
        return new Lyrics(Lyrics.ERROR);
    Document doc = Jsoup.parse(xmlString);
    Element element = doc.getElementsByTag("GetLyricResult").first();
    // first() yields null when the tag is absent; report an error instead of an NPE.
    if (element == null)
        return new Lyrics(Lyrics.ERROR);

    String id = element.getElementsByTag("TrackId").get(0).text();
    String checksum = element.getElementsByTag("LyricChecksum").get(0).text();
    Lyrics lyrics = new Lyrics(Lyrics.POSITIVE_RESULT);
    lyrics.setArtist(element.getElementsByTag("LyricArtist").get(0).text());
    lyrics.setTitle(element.getElementsByTag("LyricSong").get(0).text());
    lyrics.setURL("http://api.chartlyrics.com/apiv1.asmx/GetLyric?lyricId=" + id + "&lyricCheckSum=" + checksum);

    // Check each index on its own: the original read originalMetadata[1] whenever the
    // array was non-empty, which threw for a single-element array.
    int metadataCount = originalMetadata == null ? 0 : originalMetadata.length;
    String originalArtist = metadataCount > 0 ? originalMetadata[0] : null;
    String originalTitle = metadataCount > 1 ? originalMetadata[1] : null;

    if (TextUtils.isEmpty(lyrics.getArtist()))
        lyrics.setArtist(originalArtist);
    else
        lyrics.setOriginalArtist(originalArtist);
    if (TextUtils.isEmpty(lyrics.getTitle()))
        lyrics.setTitle(originalTitle);
    else
        lyrics.setOriginalTitle(originalTitle);

    lyrics.setText(element.getElementsByTag("Lyric").get(0).html());
    lyrics.setSource(domain);
    return lyrics;
}
Example 8
Source File: HtmlParserTest.java From astor with GNU General Public License v2.0 | 5 votes |
/** Checks that an upper-case FONT/B nest inside an anchor is parsed with lower-case tag names. */
@Test
public void handlesJavadocFont() {
    Document doc = Jsoup.parse("<TD BGCOLOR=\"#EEEEFF\" CLASS=\"NavBarCell1\"> <A HREF=\"deprecated-list.html\"><FONT CLASS=\"NavBarFont1\"><B>Deprecated</B></FONT></A> </TD>");
    Element anchor = doc.select("a").first();

    assertEquals("Deprecated", anchor.text());

    // anchor > font > b
    Element font = anchor.child(0);
    assertEquals("font", font.tagName());
    assertEquals("b", anchor.child(0).child(0).tagName());
}
Example 9
Source File: ElementTest.java From astor with GNU General Public License v2.0 | 5 votes |
/** Checks that parents() lists ancestors from the closest one outwards. */
@Test
public void testGetParents() {
    Document doc = Jsoup.parse("<div><p>Hello <span>there</span></div>");
    Elements ancestors = doc.select("span").first().parents();

    // Expected ancestor chain, innermost first.
    String[] expectedTags = {"p", "div", "body", "html"};
    assertEquals(4, ancestors.size());
    for (int i = 0; i < expectedTags.length; i++) {
        assertEquals(expectedTags[i], ancestors.get(i).tagName());
    }
}
Example 10
Source File: PhotoContentPresenter.java From Toutiao with Apache License 2.0 | 5 votes |
private Boolean parseHTML(String HTML) { boolean flag = false; Document doc = Jsoup.parse(HTML); // 取得所有的script tag Elements scripts = doc.getElementsByTag("script"); for (Element e : scripts) { // 过滤字符串 String script = e.toString(); if (script.contains("BASE_DATA.galleryInfo")) { // 只取得script的內容 script = e.childNode(0).toString(); Matcher matcher = Pattern.compile("(JSON.parse\\(\\\".+\\))").matcher(script); while (matcher.find()) { int count = matcher.groupCount(); if (count >= 1) { int start = script.indexOf("("); int end = script.indexOf("),"); String json = script.substring(start + 2, end - 1); // 处理特殊符号 json = ChineseUtil.UnicodeToChs(json); json = json.replace("\\", ""); JsonReader reader = new JsonReader(new StringReader(json)); reader.setLenient(true); bean = new Gson().fromJson(reader, PhotoGalleryBean.class); Log.d(TAG, "parseHTML: " + bean.toString()); flag = true; break; } } } } return flag; }
Example 11
Source File: MultiUploadDotBiz.java From neembuu-uploader with GNU General Public License v3.0 | 5 votes |
private void initialize() throws Exception { responseString = NUHttpClientUtils.getData("http://multiupload.biz/", httpContext); doc = Jsoup.parse(responseString); uploadURL = doc.select("form#F1").attr("action"); uploadId = doc.select("input[name=upload_id]").val(); //Get all the services Elements elements = doc.select(".srvtbl input[checked]"); ListIterator<Element> listIterator = elements.listIterator(); while(listIterator.hasNext()){ //NULogger.getLogger().log(Level.INFO, "Value of name: {0}", listIterator.next().attr("name")); services.add(listIterator.next().val()); } }
Example 12
Source File: Yande.java From PicKing with Apache License 2.0 | 5 votes |
/**
 * Finds the "next page" link in a UTF-8 encoded listing page.
 *
 * @param baseUrl    prefix prepended to the link's href
 * @param currentUrl not used by this implementation
 * @param result     raw response bytes, decoded as utf-8
 * @return {@code baseUrl} plus the href of the first paginator "next_page" link, or ""
 *         when there is none
 * @throws UnsupportedEncodingException if the utf-8 charset is unavailable
 */
@Override
public String getContentNext(String baseUrl, String currentUrl, byte[] result) throws UnsupportedEncodingException {
    String html = new String(result, "utf-8");
    Document document = Jsoup.parse(html);
    Log.e("Yande", "getSinglePicContent: " + html);

    Elements nextLinks = document.select("div#paginator a.next_page");
    if (nextLinks.isEmpty()) {
        return "";
    }

    String nextUrl = baseUrl + nextLinks.get(0).attr("href");
    Log.e("Yande", "getContentNext: " + nextUrl);
    return nextUrl;
}
Example 13
Source File: RelinkImagesTask.java From coolreader with MIT License | 4 votes |
private void processImageInContents() { // get all contents ArrayList<PageModel> pages = NovelsDao.getInstance().getAllContentPageModel(); updated = 0; int count = 1; for (PageModel page : pages) { String message = LNReaderApplication.getInstance().getApplicationContext().getResources().getString(R.string.relink_task_progress, page.getPage(), count, pages.size()); publishProgress(new CallbackEventData(message)); try { // get the contents NovelContentModel content = NovelsDao.getInstance().getNovelContent(new BookModel(), false, callback); if (content != null) { // replace the rootpath based on /project/ // for now just replace the thumbs // file:///mnt/sdcard/test/project/images/thumb/c/c7/Accel_World_v01_262.jpg/84px-Accel_World_v01_262.jpg // file:///sdcard-ext/.bakareaderex/project/images/thumb/c/c7/Accel_World_v01_262.jpg/84px-Accel_World_v01_262.jpg Document doc = Jsoup.parse(content.getContent()); Elements imageElements = doc.select("img"); for (Element image : imageElements) { String imgUrl = image.attr("src"); if (imgUrl.startsWith("file:///") && imgUrl.contains("/project/images/thumb/")) { String mntImgUrl = imgUrl.replace("file:///", ""); Log.d(TAG, "Found image : " + imgUrl); if (!new File(mntImgUrl).exists()) { Log.d(TAG, "Old image doesn't exists/moved: " + mntImgUrl); String newUrl = imgUrl.replaceAll("file:///[\\w/\\./!$%^&*()_+|~\\={}\\[\\]:\";'<>?,-]+/project/images/thumb/", "file:///" + rootPath + "/project/images/thumb/"); String mntNewUrl = newUrl.replace("file:///", ""); Log.d(TAG, "Trying to replace with " + mntNewUrl); if (new File(mntNewUrl).exists()) { Log.d(TAG, "Replace image: " + imgUrl + " ==> " + newUrl); image.attr("src", newUrl); ++updated; } } } } content.setContent(doc.html()); NovelsDao.getInstance().updateNovelContent(content); } } catch (Exception e) { message = LNReaderApplication.getInstance().getApplicationContext().getResources().getString(R.string.relink_task_error, page.getPage()); Log.e(TAG, message, e); 
publishProgress(new CallbackEventData(message)); } ++count; } }
Example 14
Source File: TheVideoAccount.java From neembuu-uploader with GNU General Public License v3.0 | 4 votes |
@Override public void login() { loginsuccessful = false; try { initialize(); NULogger.getLogger().info("Trying to log in to TheVideo.me"); httpPost = new NUHttpPost("http://www.thevideo.me/"); List<NameValuePair> formparams = new ArrayList<NameValuePair>(); formparams.add(new BasicNameValuePair("op", "login")); formparams.add(new BasicNameValuePair("login", getUsername())); formparams.add(new BasicNameValuePair("password", getPassword())); UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formparams, "UTF-8"); httpPost.setEntity(entity); httpResponse = httpclient.execute(httpPost, httpContext); NULogger.getLogger().info(httpResponse.getStatusLine().toString()); if (!CookieUtils.getCookieValue(httpContext, "xfsts").isEmpty() && !CookieUtils.getCookieValue(httpContext, "login").isEmpty()) { EntityUtils.consume(httpResponse.getEntity()); loginsuccessful = true; username = getUsername(); password = getPassword(); hostsAccountUI().hostUI(HOSTNAME).setEnabled(true); NULogger.getLogger().info("TheVideo.me login successful!"); } else { //Get error message responseString = EntityUtils.toString(httpResponse.getEntity()); //FileUtils.saveInFile("TheVideoAccount.html", responseString); Document doc = Jsoup.parse(responseString); String error = doc.select(".err").first().text(); if("Incorrect Login or Password".equals(error)){ throw new NUInvalidLoginException(getUsername(), HOSTNAME); } //Generic exception throw new Exception("Login error: " + error); } } catch(NUException ex){ resetLogin(); ex.printError(); accountUIShow().setVisible(true); } catch (Exception e) { resetLogin(); NULogger.getLogger().log(Level.SEVERE, "{0}: {1}", new Object[]{getClass().getName(), e}); showWarningMessage( Translation.T().loginerror(), HOSTNAME); accountUIShow().setVisible(true); } }
Example 15
Source File: ContentExtractor.java From WebCollector with GNU General Public License v3.0 | 4 votes |
/**
 * Parses {@code html} with {@code url} as the base URI and returns the text of the
 * element chosen by {@code getContentElementByDoc}.
 *
 * @param html the page markup
 * @param url  base URI passed to the parser
 * @return the text of the selected content element
 * @throws Exception propagated from {@code getContentElementByDoc}
 */
public static String getContentByHtml(String html, String url) throws Exception {
    return getContentElementByDoc(Jsoup.parse(html, url)).text();
}
Example 16
Source File: SendSpace.java From neembuu-uploader with GNU General Public License v3.0 | 4 votes |
@Override public void run() { try { if (sendSpaceAccount.loginsuccessful) { userType = "reg"; httpContext = sendSpaceAccount.getHttpContext(); maxFileSizeLimit = 314572800; // 300 MB } else { userType = "anon"; cookieStore = new BasicCookieStore(); httpContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore); maxFileSizeLimit = 314572800; // 300 MB } if (file.length() > maxFileSizeLimit) { throw new NUMaxFileSizeException(maxFileSizeLimit, file.getName(), host); } uploadInitialising(); initialize(); // https://fs08u.sendspace.com/upload?SPEED_LIMIT=0&MAX_FILE_SIZE=314572800&UPLOAD_IDENTIFIER=x.x.x.x.0&DESTINATION_DIR=xx // fs08u.sendspace.com/upload?SPEED_LIMIT=0&MAX_FILE_SIZE=314572800&UPLOAD_IDENTIFIER=x.x.x.x.0&DESTINATION_DIR=xx hostName = StringUtils.stringStartingFromString(uploadURL, "https://"); // fs08u.sendspace.com hostName = StringUtils.stringUntilString(hostName, "sendspace.com") + "sendspace.com"; // https://fs08u.sendspace.com/upload?SPEED_LIMIT=0&MAX_FILE_SIZE=314572800&UPLOAD_IDENTIFIER=910609187.1440099567.3BB289C9.22.0&DESTINATION_DIR=22 httpPost = new NUHttpPost(uploadURL); httpPost.setHeader("Host", hostName); httpPost.setHeader("Referer", "https://www.sendspace.com/"); MultipartEntity mpEntity = new MultipartEntity(HttpMultipartMode.BROWSER_COMPATIBLE); mpEntity.addPart("PROGRESS_URL", new StringBody(progressURL)); mpEntity.addPart("js_enabled", new StringBody("1")); mpEntity.addPart("signature", new StringBody(signature)); mpEntity.addPart("upload_files", new StringBody("")); if (userType.equals("reg")) { mpEntity.addPart("userid", new StringBody(userID)); mpEntity.addPart("folder_id", new StringBody("0")); } mpEntity.addPart("terms", new StringBody("1")); mpEntity.addPart("file[]", new StringBody("")); mpEntity.addPart("description[]", new StringBody("")); mpEntity.addPart("upload_file[]", createMonitoredFileBody()); httpPost.setEntity(mpEntity); NULogger.getLogger().log(Level.INFO, "executing request {0}", httpPost.getRequestLine()); 
NULogger.getLogger().info("Now uploading your file into SendSpace.com"); uploading(); httpResponse = httpclient.execute(httpPost, httpContext); responseString = EntityUtils.toString(httpResponse.getEntity()); doc = Jsoup.parse(responseString); //Read the links gettingLink(); downloadlink = doc.select("div[class=file_description]").select("a").first().attr("href"); deletelink = doc.select("a[class=link]").attr("href"); NULogger.getLogger().log(Level.INFO, "Delete link : {0}", deletelink); NULogger.getLogger().log(Level.INFO, "Download link : {0}", downloadlink); downURL = downloadlink; delURL = deletelink; uploadFinished(); } catch(NUException ex){ ex.printError(); uploadInvalid(); } catch (Exception e) { Logger.getLogger(getClass().getName()).log(Level.SEVERE, null, e); uploadFailed(); } }
Example 17
Source File: HtmlParserTest.java From astor with GNU General Public License v2.0 | 4 votes |
/** Checks how control characters inside attribute names are treated by the parser. */
@Test
public void handlesControlCodeInAttributeName() {
    String html = "<p><a \06=foo>One</a><a/\06=bar><a foo\06=bar>Two</a></p>";
    String expected = "<p><a>One</a><a></a><a foo=\"bar\">Two</a></p>";

    Document doc = Jsoup.parse(html);

    assertEquals(expected, doc.body().html());
}
Example 18
Source File: UpLeaAccount.java From neembuu-uploader with GNU General Public License v3.0 | 4 votes |
@Override public void login() { loginsuccessful = false; try { initialize(); NULogger.getLogger().info("Trying to log in to UpLea.com"); httpPost = new NUHttpPost("http://api.uplea.com/api/get-my-api-key"); List<NameValuePair> formparams = new ArrayList<NameValuePair>(); formparams.add(new BasicNameValuePair("username", getUsername())); formparams.add(new BasicNameValuePair("password", getPassword())); UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formparams, "UTF-8"); httpPost.setEntity(entity); httpResponse = httpclient.execute(httpPost, httpContext); NULogger.getLogger().info(httpResponse.getStatusLine().toString()); responseString = EntityUtils.toString(httpResponse.getEntity()); // {"error":[],"result":{"api_key":"alphanumericstring"},"status":true} api_key = StringUtils.stringBetweenTwoStrings(responseString, "\"api_key\":\"", "\""); if (responseString.contains("true") && !api_key.isEmpty()) { EntityUtils.consume(httpResponse.getEntity()); loginsuccessful = true; username = getUsername(); password = getPassword(); NULogger.getLogger().info("UpLea.com login successful!"); } else { //Get error message responseString = EntityUtils.toString(httpResponse.getEntity()); Document doc = Jsoup.parse(responseString); String error = doc.select(".err").first().text(); if("Incorrect Login or Password".equals(error)){ throw new NUInvalidLoginException(getUsername(), HOSTNAME); } //Generic exception throw new Exception("Login error: " + error); } } catch(NUException ex){ resetLogin(); ex.printError(); accountUIShow().setVisible(true); } catch (Exception e) { resetLogin(); NULogger.getLogger().log(Level.SEVERE, "{0}: {1}", new Object[]{getClass().getName(), e}); showWarningMessage( Translation.T().loginerror(), HOSTNAME); accountUIShow().setVisible(true); } }
Example 19
Source File: HtmlParserTest.java From astor with GNU General Public License v2.0 | 4 votes |
/** Checks that missing tbody/tr tags are supplied for a table nested inside another table. */
@Test
public void handlesNestedImplicitTable() {
    String html = "<table><td>1</td></tr> <td>2</td></tr> <td> <table><td>3</td> <td>4</td></table> <tr><td>5</table>";
    String expected = "<table><tbody><tr><td>1</td></tr> <tr><td>2</td></tr> <tr><td> <table><tbody><tr><td>3</td> <td>4</td></tr></tbody></table> </td></tr><tr><td>5</td></tr></tbody></table>";

    Document doc = Jsoup.parse(html);
    String actual = TextUtil.stripNewlines(doc.body().html());

    assertEquals(expected, actual);
}
Example 20
Source File: PolymerServerEventHandlersTest.java From flow with Apache License 2.0 | 4 votes |
/**
 * Creates the test template with a parser that ignores its arguments and always returns
 * an empty "polymer" dom-module.
 */
CorrectAnnotationUsage() {
    super((clazz, tag, service) -> {
        return new TemplateData("", Jsoup.parse("<dom-module id='polymer'></dom-module>"));
    });
}