org.jsoup.nodes.Document#title

Source File: DynamicIp.java From rank with Apache License 2.0

6 votes

/**
 * Probes connectivity by requesting a Baidu search result page and checking
 * whether the page title contains the search term.
 *
 * @return {@code true} if the result page title contains the search term;
 *         {@code false} if it does not, or if the request fails for any reason
 */
public static boolean isConnected(){
    try {
        // The timestamp parameter makes every probe URL unique.
        final String pageTitle = Jsoup.connect("http://www.baidu.com/s?wd=杨尚川&t=" + System.currentTimeMillis())
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("Referer", "https://www.baidu.com")
                .header("Host", "www.baidu.com")
                .header("User-Agent", USER_AGENT)
                .ignoreContentType(true)
                .timeout(30000)
                .get()
                .title();
        LOGGER.info("搜索结果页面标题："+pageTitle);
        return pageTitle != null && pageTitle.contains("杨尚川");
    } catch (Exception e) {
        // "Network is unreachable" is the expected offline outcome and is not logged.
        if (!"Network is unreachable".equals(e.getMessage())) {
            LOGGER.error("状态检查失败:"+e.getMessage());
        }
        return false;
    }
}

Source File: UrlTitleAnnouncer.java From VileBot with MIT License

6 votes

/**
 * Fetches the page at the given URL and returns its HTML title.
 *
 * @param url http URI String
 * @return the document title reported by the parser, or an empty string if
 *         fetching the page fails with an {@link IOException}
 */
private String scrapeURLHTMLTitle( String url )
{
    try
    {
        return Jsoup.connect( url ).get().title();
    }
    catch ( IOException x )
    {
        // Best effort: report the failure on stderr and fall back to an empty title.
        System.err.format( "scrapeURLHTMLTitle BufferedReader error: %s%n", x );
        return "";
    }
}

Source File: DynamicIp.java From superword with Apache License 2.0

6 votes

/**
 * Reports whether a Baidu search can be completed, which is used as the
 * indicator that the network connection is up.
 *
 * @return {@code true} when the title of the search result page contains the
 *         search term; {@code false} otherwise, including on any exception
 */
public static boolean isConnected(){
    try {
        // Current time is appended so that each probe uses a distinct URL.
        final String searchUrl = "http://www.baidu.com/s?wd=杨尚川&t=" + System.currentTimeMillis();
        final String resultTitle = Jsoup.connect(searchUrl)
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("Referer", "https://www.baidu.com")
                .header("Host", "www.baidu.com")
                .header("User-Agent", USER_AGENT)
                .ignoreContentType(true)
                .timeout(30000)
                .get()
                .title();
        LOGGER.info("搜索结果页面标题："+resultTitle);
        if (resultTitle == null) {
            return false;
        }
        return resultTitle.contains("杨尚川");
    } catch (Exception e) {
        final boolean unreachable = "Network is unreachable".equals(e.getMessage());
        if (!unreachable) {
            // Only unexpected failures are logged; an unreachable network is a normal result.
            LOGGER.error("状态检查失败:"+e.getMessage());
        }
        return false;
    }
}

Source File: ArticalRemoteDataSource.java From KotlinMVPRxJava2Dagger2GreenDaoRetrofitDemo with Apache License 2.0

6 votes

/**
 * Parses an HTML page and collects its title followed by the long link texts
 * found inside the item lists.
 *
 * <p>The first entry of the result is always the document title. After it come
 * the own-text of every {@code <a>} inside a {@code <ul>} that carries both the
 * {@code panel_body} and {@code itemlist} classes, provided that text is longer
 * than 20 characters.
 *
 * @param html the HTML source to parse
 * @return the title followed by the matching link texts, in document order
 */
private List<String> parseData(String html) {
    // Parse the page with jsoup.
    Document document = Jsoup.parse(html);
    List<String> strings = new ArrayList<>();
    strings.add(document.title());

    for (Element list : document.getElementsByTag("ul")) {
        // Test the list being visited (not the whole collection), one class name
        // per call: hasClass expects a single class name, not a space-separated pair.
        if (!(list.hasClass("panel_body") && list.hasClass("itemlist"))) {
            continue;
        }
        for (Element anchor : list.getElementsByTag("a")) {
            String text = anchor.ownText();
            // Short texts are skipped by the length threshold.
            if (text.length() > 20) {
                strings.add(text);
            }
        }
    }
    return strings;
}

Source File: TwitchVideoRipper.java From ripme with MIT License

6 votes

/**
 * Downloads the video(s) referenced by the page at {@code url}.
 *
 * <p>Every {@code <script>} element is scanned for the first
 * {@code "source":"..."} entry; each match is queued for download under a name
 * built from {@code HOST} and the page title.
 *
 * @throws IOException if the page contains no script elements, or if fetching
 *         the page or building a download URL fails
 */
@Override
public void rip() throws IOException {
    LOGGER.info("Retrieving " + this.url);
    Document page = Http.url(url).get();

    // The page title is used as the user-friendly part of the file name.
    String pageTitle = page.title();

    Elements scriptTags = page.select("script");
    if (scriptTags.isEmpty()) {
        throw new IOException("Could not find script code at " + url);
    }

    // Only the first match per script is taken; the regex assumes the
    // highest quality source is listed first.
    Pattern sourcePattern = Pattern.compile("\"source\":\"(.*?)\"");
    for (Element tag : scriptTags) {
        Matcher match = sourcePattern.matcher(tag.data());
        if (!match.find()) {
            continue;
        }
        addURLToDownload(new URL(match.group(1)), HOST + "_" + pageTitle);
    }
    waitForThreads();
}

Source File: JsoupTesting.java From Java-Data-Science-Cookbook with MIT License

6 votes

/**
 * Demonstrates the basic jsoup accessors: fetches a page and reads its title,
 * its body text, and the href, text, outer HTML and inner HTML of every link.
 * The extracted values are not used or returned.
 *
 * @param href address of the page to fetch
 */
public void extractDataWithJsoup(String href){
	final Document page;
	try {
		page = Jsoup.connect(href).timeout(10*1000).userAgent("Mozilla").ignoreHttpErrors(true).get();
	} catch (IOException e) {
		// Placeholder for real exception handling; a failed fetch ends the method quietly.
		return;
	}
	if (page == null) {
		return;
	}

	String title = page.title();
	String text = page.body().text();
	for (Element anchor : page.select("a[href]")) {
		String linkHref = anchor.attr("href");
		String linkText = anchor.text();
		String linkOuterHtml = anchor.outerHtml();
		String linkInnerHtml = anchor.html();
	}
}

Source File: ParserDemo.java From crawler4j with Apache License 2.0

5 votes

/**
 * Parses the fetched HTML and logs the thread name, page title and URL.
 * Any exception raised while doing so is printed and otherwise ignored.
 *
 * @param result fetch result providing the HTML
 * @param url address the HTML was fetched from (logged only)
 * @param threadName name of the worker thread (logged only)
 * @param isUpdate not read by this implementation
 */
@Override
public void parse(HttpFetchResult result, String url, String threadName, boolean isUpdate) {
	try {
		String pageTitle = Jsoup.parse(result.getHtml()).title();
		logger.info(threadName +" " + pageTitle + " " + url + " ");
	} catch (Exception e) {
		e.printStackTrace();
	}
}

Source File: UtilsDemoActivity.java From UltimateAndroid with Apache License 2.0

5 votes

/**
 * Fetches the page at {@code url} and stores its HTML title in {@code title}.
 * On an {@link IOException} the stack trace is printed and {@code title} is
 * left unchanged.
 *
 * @param params not used
 * @return always {@code null}
 */
@Override
protected Void doInBackground(Void... params) {
    try {
        // Connect to the web site and read the document title in one step.
        title = Jsoup.connect(url).get().title();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}

Source File: UtilsDemoActivity.java From UltimateAndroid with Apache License 2.0

5 votes

/**
 * Reads the {@code "url"} extra from the intent, fetches that page, and
 * broadcasts its title locally under the {@code TITLE_FILTER} action.
 * If the fetch fails, the stack trace is printed and whatever value
 * {@code title} currently holds is broadcast.
 *
 * @param intent the intent carrying the {@code "url"} string extra
 */
public void onHandleIntent(Intent intent) {
    this.url = intent.getStringExtra("url");
    try {
        // Connect to the web site and keep the html document title.
        title = Jsoup.connect(url).get().title();
    } catch (IOException e) {
        e.printStackTrace();
    }
    Intent titleBroadcast = new Intent(TITLE_FILTER).putExtra("title", title);
    LocalBroadcastManager.getInstance(this).sendBroadcast(titleBroadcast);
}

Source File: DynamicIp.java From superword with Apache License 2.0

5 votes

/**
 * Posts a WAN connection command to the router's status handler and verifies
 * the outcome.
 *
 * <p>The call succeeds only if the response page is titled
 * {@code "LAN | LAN Settings"} and the connectivity probe afterwards matches
 * the action: {@link #isConnected()} must be {@code true} for action
 * {@code "3"} and {@code false} for action {@code "4"}. Any other action
 * yields {@code false}.
 *
 * @param cookies cookies to send with the request (name to value)
 * @param action value of the {@code action} form field
 * @return {@code true} if the command was accepted and the resulting
 *         connectivity state matches the action; {@code false} otherwise
 */
public static boolean execute(Map<String, String> cookies, String action){
    String url = "http://192.168.0.1/goform/SysStatusHandle";
    Map<String, String> map = new HashMap<>();
    map.put("action", action);
    map.put("CMD", "WAN_CON");
    map.put("GO", "system_status.asp");
    Connection conn = Jsoup.connect(url)
            .header("Accept", ACCEPT)
            .header("Accept-Encoding", ENCODING)
            .header("Accept-Language", LANGUAGE)
            .header("Connection", CONNECTION)
            .header("Host", HOST)
            .header("Referer", REFERER)
            .header("User-Agent", USER_AGENT)
            .ignoreContentType(true)
            .timeout(30000);
    for(Map.Entry<String, String> cookie : cookies.entrySet()){
        conn.cookie(cookie.getKey(), cookie.getValue());
    }

    String title = null;
    try {
        Connection.Response response = conn.method(Connection.Method.POST).data(map).execute();
        String html = response.body();
        Document doc = Jsoup.parse(html);
        title = doc.title();
        LOGGER.info("操作连接页面标题："+title);
        // Pause before the connectivity probe below is run.
        Thread.sleep(10000);
    }catch (InterruptedException e){
        // Restore the interrupt flag so callers can still observe the interruption.
        Thread.currentThread().interrupt();
        LOGGER.error(e.getMessage());
    }catch (Exception e){
        LOGGER.error(e.getMessage());
    }
    if(!"LAN | LAN Settings".equals(title)){
        return false;
    }
    return ("3".equals(action) && isConnected())
            || ("4".equals(action) && !isConnected());
}

Source File: WxCrawlServiceImpl.java From wx-crawl with Apache License 2.0

5 votes

/**
 * Resolves the article title from the document, trying three sources in order:
 * the backup-title attribute on {@code <head>}, the text of the first element
 * matching the title selector, and finally the document's own title.
 *
 * @param sourceDoc the parsed article page
 * @return the first title found by the lookup order above
 */
private String getArticleTitle(Document sourceDoc) {
    // 1. Attribute stored on the <head> element, when present and non-empty.
    if (sourceDoc.head() != null) {
        String backupTitle = sourceDoc.head().attr(WxCrawlerConstant.BackupArticle.ARTICLE_TITLE);
        if (StringUtils.isNotEmpty(backupTitle)) {
            return backupTitle;
        }
    }
    // 2. Dedicated title element in the page.
    if (sourceDoc.select(WxCrawlerConstant.HTMLElementSelector.TITLE).first() != null) {
        return sourceDoc.select(WxCrawlerConstant.HTMLElementSelector.TITLE).first().text();
    }
    // 3. Fall back to the document title.
    return sourceDoc.title();
}

Source File: JSoupExamples.java From Java-for-Data-Science with MIT License

5 votes

/**
 * Prints the document title followed by the combined text of its body.
 *
 * @param document the parsed document to display
 */
public void displayBodyText(Document document) {
    out.println("Title: " + document.title());

    out.println("---Body---");
    // Text of everything matched by the "body" selector.
    out.println("Text: " + document.select("body").text());
}

Source File: InternetBrowser.java From petscii-bbs with Mozilla Public License 2.0

5 votes

/**
 * Collects every link of the page as an {@code Entry} of absolute URL and label.
 * The label is the link text, or the raw {@code href} value when the text is blank.
 *
 * @param webpage the parsed page to scan
 * @return one entry per {@code a[href]} element, in document order
 * @throws Exception declared by the signature; nothing in this body throws it explicitly
 */
public static List<Entry> getAllLinks(Document webpage) throws Exception {
    List<Entry> entries = new ArrayList<>();
    // NOTE(review): the title is read but never used below.
    String pageTitle = webpage.title();

    for (Element anchor : webpage.select("a[href]")) {
        final String label = defaultIfBlank(anchor.text(), anchor.attr("href"));
        entries.add(new Entry(anchor.absUrl("href"), label));
    }
    return entries;
}

Source File: JsoupHCalendarExtractor.java From wandora with GNU General Public License v3.0

5 votes

/**
 * Creates (or reuses) a topic named after the document title, types it as
 * {@code vcalendar}, and parses the document body into that topic.
 *
 * @param document the parsed page containing the calendar
 * @throws TopicMapException if a topic map operation fails
 */
private void parseCalendar(Document document) throws TopicMapException {
    final String calendarName = document.title();
    final Topic calendarType = getType("vcalendar");
    final Topic calendarTopic = getOrCreateTopic(tm, null, calendarName);
    calendarTopic.addType(calendarType);

    // Delegate the actual parsing to the (topic, element) overload.
    parseCalendar(calendarTopic, document.body());
}

Source File: ApiCatalogEndpointIntegrationTest.java From api-layer with Eclipse Public License 2.0

5 votes

/**
 * Requests an invalid containers endpoint and verifies that the 404 page has
 * the expected title, heading and dashboard link text.
 */
@Test
public void whenMisSpeltContainersEndpoint_thenNotFoundResponseWithAPIMessage() throws Exception {
    HttpResponse notFound = getResponse(INVALID_CONTAINER_ENDPOINT, HttpStatus.SC_NOT_FOUND);
    Document page = Jsoup.parse(EntityUtils.toString(notFound.getEntity()));

    String pageTitle = page.title();
    Elements heading = page.select("h1:first-child");
    Elements links = page.select("a");

    assertNotNull(pageTitle);
    assertEquals("404 Not Found", pageTitle);
    assertEquals("404 Page Not Found", heading.text());
    assertEquals("Go to Dashboard", links.text());
}

Source File: ShadowSocksCrawlerService.java From ShadowSocks-Share with Apache License 2.0

5 votes

/**
 * Crawls ShadowSocks accounts from the target page.
 *
 * @return an entity built from the target URL, the page title and the parsed
 *         accounts, flagged {@code true}; if an {@link IOException} occurs, an
 *         entity with an empty title flagged {@code false}
 */
public ShadowSocksEntity getShadowSocks() {
	try {
		Document page = getDocument();
		ShadowSocksEntity crawled = new ShadowSocksEntity(getTargetURL(), page.title(), true, new Date());
		crawled.setShadowSocksSet(parse(page));
		return crawled;
	} catch (IOException e) {
		log.error(e.getMessage());
		// Fall back to an empty entity flagged false.
		return new ShadowSocksEntity(getTargetURL(), "", false, new Date());
	}
}

Source File: AutoGetHtml.java From danyuan-application with Apache License 2.0

4 votes

/**
 * Sends a POST request with a fixed query to {@code http://www.oschina.net/}
 * and returns the title of the response page.
 *
 * <p>NOTE(review): neither {@code url} nor {@code key} is read by this method;
 * the target address and the query value are hard-coded. Confirm whether that
 * is intended.
 *
 * @param url not used by the current implementation
 * @param key not used by the current implementation
 * @return the title of the page returned by the request
 * @throws IOException if the request fails
 * @author Tenghui.Wang
 */
public static String getBody(String url, String key) throws IOException {
	return Jsoup.connect("http://www.oschina.net/")
	        .data("query", "Java") // request parameter
	        .userAgent("Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2") // set the User-Agent
	        .cookie("auth", "token") // set a cookie
	        .timeout(3000) // set the connection timeout
	        .post() // access the URL using the POST method
	        .title();
}

Source File: AbstractHtmlConsumer.java From baleen with Apache License 2.0

4 votes

/**
 * Renders the current document as an HTML file.
 *
 * <p>Builds an HTML skeleton whose {@code lang} attribute comes from the
 * document annotation, adds an optional stylesheet link, a UTF-8 charset
 * declaration and one {@code meta} entry per document property and metadata
 * annotation, sets the title from the {@code documentTitle} metadata if
 * present, delegates body generation to {@code writeBody}, and writes the
 * result to the file returned by {@code getFileName}.
 *
 * @param jCas the CAS holding the document to export
 * @throws AnalysisEngineProcessException if the file cannot be written
 */
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
  final File f = getFileName(jCas);
  final DocumentAnnotation da = getDocumentAnnotation(jCas);

  final Document doc =
      Jsoup.parse("<!DOCTYPE html>\n<html lang=\"" + da.getLanguage() + "\"></html>");
  doc.outputSettings(new Document.OutputSettings().prettyPrint(false));
  final Element head = doc.head();

  if (!Strings.isNullOrEmpty(css)) {
    final Element cssLink = head.appendElement("link");
    cssLink.attr("rel", "stylesheet");
    cssLink.attr("href", css);
  }

  final Element charset = head.appendElement("meta");
  charset.attr("charset", "utf-8");

  appendMeta(head, "document.type", da.getDocType());
  appendMeta(head, "document.sourceUri", da.getSourceUri());
  appendMeta(head, "externalId", da.getHash());

  appendMeta(head, "document.classification", da.getDocumentClassification());
  appendMeta(
      head,
      "document.caveats",
      String.join(",", UimaTypesUtils.toArray(da.getDocumentCaveats())));
  appendMeta(
      head,
      "document.releasability",
      String.join(",", UimaTypesUtils.toArray(da.getDocumentReleasability())));

  // Every metadata entry becomes a meta tag; the last "documentTitle" entry
  // (matched case-insensitively) also supplies the page title.
  String title = null;
  for (final Metadata md : JCasUtil.select(jCas, Metadata.class)) {
    appendMeta(head, md.getKey(), md.getValue());
    if ("documentTitle".equalsIgnoreCase(md.getKey())) {
      title = md.getValue();
    }
  }

  if (!Strings.isNullOrEmpty(title)) {
    doc.title(title);
  }

  final Element body = doc.body();

  writeBody(jCas, body);

  try {
    // The page declares <meta charset="utf-8">, so the bytes on disk must be
    // UTF-8 regardless of the platform default charset.
    FileUtils.writeStringToFile(f, doc.html(), java.nio.charset.StandardCharsets.UTF_8);
  } catch (final IOException e) {
    throw new AnalysisEngineProcessException(e);
  }
}

Source File: Utils.java From SteamGifts with MIT License

4 votes

/**
 * Returns the document title without a trailing page marker.
 *
 * <p>On paginated pages such as /giveaways/id/name/search?page=X the title has
 * the form "Game Title - Page X"; the " - Page X" suffix is removed. Titles
 * without that suffix are returned unchanged.
 *
 * @param document the parsed page
 * @return the title with any trailing " - Page N" suffix stripped
 */
public static String getPageTitle(Document document) {
    return document.title().replaceAll(" - Page ([\\d,]+)$", "");
}

Source File: HtmlUtils.java From ogham with Apache License 2.0

2 votes

/**
 * Extracts the title from HTML content. When the parsed document has no
 * <code>title</code> element directly under <code>head</code>, the result is
 * null.
 * 
 * @param htmlContent
 *            the HTML content that may contain a title
 * @return the title of the HTML or null if none
 */
public static String getTitle(String htmlContent) {
	Document parsed = Jsoup.parse(htmlContent);
	if (parsed.select("head > title").isEmpty()) {
		// No title element: report absence rather than an empty string.
		return null;
	}
	return parsed.title();
}

Java Code Examples for org.jsoup.nodes.Document#title()