org.jsoup.nodes.Document#setBaseUri

Source File: XHTMLDocumentHandler.java From docx4j-template with Apache License 2.0

6 votes

/**
 * Posts the request data and cookies carried by {@code dataMap} to {@code url} through
 * {@code Jsoup.connect(url)} ... {@code post()} and returns the parsed HTML document.
 *
 * <p>Only web URLs (http and https) are supported by this jsoup entry point.
 *
 * <p>The base URI configured under {@code Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI} is applied
 * only when it is non-empty. The property defaults to the empty string, and unconditionally
 * assigning that value would wipe the base URI jsoup derived from the fetched location, so
 * relative links could no longer be resolved to absolute URLs.
 *
 * @param url     the http/https address to post to
 * @param dataMap supplies the two request data sets ({@code getData1()}, {@code getData2()})
 *                and the cookies sent with the request
 * @return the parsed document
 * @throws IOException if fetching the HTML from {@code url} fails
 */
@Override
public Document handle(String url, DataMap dataMap) throws IOException{
	// Read the jsoup settings.
	String baseUri = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI,"");
	String userAgent = "Mozilla/5.0 (jsoup)";
	int timeout = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_TIMEOUTMILLIS, Docx4jConstants.DEFAULT_TIMEOUTMILLIS);
	// Fetch the specified URL and parse the response into an HTML DOM.
	Document doc = Jsoup.connect(url)
			  .data(dataMap.getData1())
			  .data(dataMap.getData2())
			  .userAgent(userAgent)
			  .cookies(dataMap.getCookies())
			  .timeout(timeout)
			  .post();
	// Override the base URI only when one was actually configured; an empty value would
	// discard the location-derived base URI.
	if (baseUri != null && !baseUri.isEmpty()) {
		doc.setBaseUri(baseUri);
	}
	return doc;
}

Source File: HtmlParse.java From ChipHellClient with Apache License 2.0

6 votes

/**
 * Extracts the user profile from an HTML response.
 *
 * <p>Reads the avatar {@code src}, the name and the {@code user_box} markup from the
 * {@code userinfo} element, then takes the {@code formhash} query parameter from the link
 * inside the {@code btn_exit} element. Any exception raised while extracting is logged as a
 * warning, and the user is returned with whatever fields were set up to that point.
 *
 * @param responseBody HTML of the page to parse
 * @return the (possibly partially populated) user; never {@code null}
 */
public static User parseUserInfo(String responseBody) {
    final User result = new User();
    try {
        final Document page = Jsoup.parse(responseBody);
        page.setBaseUri(Constants.BASE_URL);

        final Element userInfo = page.getElementsByClass("userinfo").first();
        final Element avatar = userInfo.getElementsByTag("img").first();
        result.setAvatarUrl(avatar.attr("src"));
        result.setName(userInfo.getElementsByClass("name").first().text());
        result.setInfo(userInfo.getElementsByClass("user_box").html());

        // The first child of the exit button carries the link that holds the form hash.
        final Element exitButton = page.getElementsByClass("btn_exit").first();
        final String exitHref = exitButton.child(0).attr("href");
        final String formHash = new UrlParamsMap(exitHref).get("formhash");

        result.setFormHash(formHash);
        LogMessage.i("formHash", formHash);
    } catch (Exception e) {
        // Best effort: keep what was parsed so far and record the failure.
        LogMessage.w(TAG + "#parseUserInfo", e);
    }
    return result;
}

Source File: HtmlParse.java From ChipHellClient with Apache License 2.0

6 votes

/**
 * Parses an album page.
 *
 * <p>Collects the absolute {@code orig} URL of every {@code postalbum_i} element and reads the
 * current picture number from the element with id {@code curpic}; the stored position is that
 * number minus one.
 *
 * @param responseBody HTML of the album page
 * @return the album URLs together with the current position
 */
public static AlbumWrap parseAubum(String responseBody) {
    final AlbumWrap wrap = new AlbumWrap();
    final List<String> imageUrls = new ArrayList<String>();

    final Document page = Jsoup.parse(responseBody);
    page.setBaseUri(Constants.BASE_URL);
    for (Element picture : page.getElementsByClass("postalbum_i")) {
        imageUrls.add(picture.absUrl("orig"));
    }
    wrap.setUrls(imageUrls);

    // The page shows a picture number; the stored position is one less.
    final String currentText = page.getElementById("curpic").text();
    wrap.setCurPosition(Integer.parseInt(currentText) - 1);
    return wrap;
}

Source File: XHTMLDocumentHandler.java From docx4j-template with Apache License 2.0

5 votes

/**
 * Fetches {@code url} with {@code Jsoup.parse(URL, int)} and returns the parsed HTML document.
 *
 * <p>Only web URLs (http and https) are supported by this jsoup entry point.
 *
 * <p>The base URI configured under {@code Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI} is applied
 * only when it is non-empty. The property defaults to the empty string, and unconditionally
 * assigning that value would wipe the base URI jsoup derived from the fetched URL, so relative
 * links could no longer be resolved to absolute URLs.
 *
 * @param url the http/https address to fetch
 * @return the parsed document
 * @throws IOException if fetching the HTML from {@code url} fails
 */
@Override
public Document handle(URL url) throws IOException{
	// Read the jsoup settings.
	String baseUri = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI,"");
	int timeout = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_TIMEOUTMILLIS, Docx4jConstants.DEFAULT_TIMEOUTMILLIS);
	// Fetch the specified URL and parse the response into an HTML DOM.
	Document doc = Jsoup.parse(url,timeout);
	// Override the base URI only when one was actually configured; an empty value would
	// discard the URL-derived base URI.
	if (baseUri != null && !baseUri.isEmpty()) {
		doc.setBaseUri(baseUri);
	}
	return doc;
}

Source File: HtmlBot.java From ContentExtractor with GNU General Public License v2.0

5 votes

/**
 * Parses {@code html} and wraps the resulting document in a {@code DomPage}.
 *
 * @param html markup to parse
 * @param url  base URI to assign to the parsed document; skipped when {@code null}
 * @return a new {@code DomPage} built from the parsed document
 */
public static DomPage getDomPageByHtml(String html,String url){
        final Document parsed = Jsoup.parse(html);
        if (url == null) {
            // No base URI supplied: hand the document over as parsed.
            return new DomPage(parsed);
        }
        parsed.setBaseUri(url);
        return new DomPage(parsed);
    }

Source File: HtmlBot.java From WordCount with GNU General Public License v2.0

5 votes

/**
 * Builds a {@code DomPage} from raw markup.
 *
 * @param html markup to parse
 * @param url  base URI for the parsed document, or {@code null} to leave it untouched
 * @return the {@code DomPage} wrapping the parsed document
 */
public static DomPage getDomPageByHtml(String html,String url){
        final Document document = Jsoup.parse(html);
        final boolean hasBaseUri = url != null;
        if (hasBaseUri) {
            document.setBaseUri(url);
        }
        return new DomPage(document);
    }

Source File: HtmlParse.java From ChipHellClient with Apache License 2.0

5 votes

/**
 * 解析引用回复的准备数据
 *
 * @param responseBody
 * @return
 */
public static PrepareQuoteReply parsePrepareQuoteReply(String responseBody) {
    final PrepareQuoteReply reply = new PrepareQuoteReply();
    try {
        final Document page = Jsoup.parse(responseBody);
        page.setBaseUri(Constants.BASE_URL);

        // Read the absolute action URL, the named input values and the first
        // blockquote markup from the element with id "postform".
        final Element form = page.getElementById("postform");
        final String actionUrl = form.absUrl("action");

        final String formhash = namedInputValue(form, "formhash");
        final String posttime = namedInputValue(form, "posttime");
        final String noticeauthor = namedInputValue(form, "noticeauthor");
        final String noticetrimstr = namedInputValue(form, "noticetrimstr");
        final String noticeauthormsg = namedInputValue(form, "noticeauthormsg");
        final String reppid = namedInputValue(form, "reppid");
        final String reppost = namedInputValue(form, "reppost");
        final String quoteBody = form.getElementsByTag("blockquote").first().toString();

        // Populate the result only after every lookup above has succeeded, so a
        // failure part-way through leaves the object untouched.
        reply.setNoticeauthor(noticeauthor);
        reply.setNoticeauthormsg(noticeauthormsg);
        reply.setNoticetrimstr(noticetrimstr);
        reply.setPosttime(posttime);
        reply.setQuoteBody(quoteBody);
        reply.setReppid(reppid);
        reply.setUrl(actionUrl);
        reply.setFormhash(formhash);
        reply.setReppost(reppost);
    } catch (Exception e) {
        // Any failure leaves the result unpopulated; it is still returned.
        e.printStackTrace();
    }

    return reply;
}

/** Returns the {@code value} attribute of the first element under {@code form} whose {@code name} attribute equals {@code name}. */
private static String namedInputValue(Element form, String name) {
    return form.getElementsByAttributeValue("name", name).first().attr("value");
}

Source File: AbstractJsoupExtractor.java From wandora with GNU General Public License v3.0

5 votes

/**
 * Parses the given file as UTF-8 HTML, uses the file's absolute path as the document's
 * base URI, and delegates to {@code extractTopicsFrom(Document, String, TopicMap)}.
 *
 * @param f the file to parse; must not be a directory
 * @param t the topic map passed on to the extraction
 * @return the result of the delegated extraction
 * @throws Exception if {@code f} is a directory, or if parsing or extraction fails
 */
@Override
public boolean _extractTopicsFrom(File f, TopicMap t) throws Exception {
    if (f.isDirectory()) {
        throw new Exception("Directories are not supported.");
    }

    final Document parsed = Jsoup.parse(f, "UTF-8");
    final String location = f.getAbsolutePath();
    parsed.setBaseUri(location);

    return extractTopicsFrom(parsed, location, t);
}

Source File: Item.java From KaellyBot with GNU General Public License v3.0

4 votes

/**
 * Builds an {@link Item} from the page at {@code url}.
 *
 * <p>Name, level, type and illustration URL are read from fixed CSS classes. The remaining
 * fields are filled by matching each {@code ak-panel-title} heading against the translated
 * labels for {@code lg}; fields whose heading is absent stay {@code null}.
 *
 * @param lg  language used to look up the section labels
 * @param url address of the page; also used as the document's base URI so that
 *            {@code abs:href} attributes resolve to absolute links
 * @return the item assembled from the page
 * @throws IOException if the page cannot be retrieved
 */
public static Item getItem(Language lg, String url) throws IOException {
    Document doc = JSoupManager.getDocument(url);
    doc.setBaseUri(url);
    String name = doc.getElementsByClass("ak-return-link").first().text();
    // The label is plain text, so strip it literally; replaceAll would treat it as a regex.
    String level = doc.getElementsByClass("ak-encyclo-detail-level").first().text()
            .replace(Translator.getLabel(lg, "item.extract.level") + " ", "");
    String type = doc.getElementsByClass("ak-encyclo-detail-type").last().children().last().text();

    String skinURL = doc.getElementsByClass("ak-encyclo-detail-illu").first()
            .getElementsByTag("img").first().attr("src");

    String description = null;
    String effects = null;
    String caracteristics = null;
    String conditions = null;
    String set = null;
    String setURL = null;
    String recipe = null;

    Elements titles = doc.getElementsByClass("ak-panel-title");
    for (Element title : titles) {
        String heading = title.text();
        if (heading.equals(Translator.getLabel(lg, "item.extract.description"))) {
            description = title.parent().getElementsByClass("ak-panel-content").first().text();
        } else if (heading.equals(Translator.getLabel(lg, "item.extract.effets"))) {
            effects = extractStatsFromTitle(lg, title);
        } else if (heading.equals(Translator.getLabel(lg, "item.extract.caracteristiques"))) {
            caracteristics = extractLinesFromTitle(title);
        } else if (heading.equals(Translator.getLabel(lg, "item.extract.evolution_effects"))) {
            effects = extractEvolutionEffectsFromTitle(lg, url);
        } else if (heading.equals(Translator.getLabel(lg, "item.extract.conditions"))) {
            conditions = extractLinesFromTitle(title);
        } else if (heading.contains(Translator.getLabel(lg, "item.extract.panoplie"))) {
            Element setLink = title.getElementsByTag("a").first();
            set = setLink.text();
            setURL = setLink.attr("abs:href");
        } else if (heading.equals(Translator.getLabel(lg, "item.extract.recette"))) {
            // One markdown line per ingredient: "<quantity> [<name>](<absolute link>)".
            StringBuilder tmp = new StringBuilder();
            for (Element line : title.parent().getElementsByClass("ak-column")) {
                Element ingredient = line.getElementsByClass("ak-title").first();
                tmp.append(line.getElementsByClass("ak-front").text()).append(" [")
                        .append(ingredient.text()).append("](")
                        .append(ingredient.children().first().attr("abs:href")).append(")\n");
            }
            recipe = tmp.toString();
        }
    }

    return new Item(name, type, level, description, effects, URLManager.abs(skinURL), url,
            caracteristics, conditions, set, setURL, recipe);
}

Source File: Resource.java From KaellyBot with GNU General Public License v3.0

4 votes

/**
 * Builds a {@link Resource} from the page at {@code url}.
 *
 * <p>Name, optional level, type and illustration URL are read from fixed CSS classes. The
 * remaining fields are filled by matching each {@code ak-panel-title} heading against the
 * translated labels for {@code lg}; fields whose heading is absent keep their initial value
 * ({@code null}, or an empty list for the monster drops).
 *
 * @param lg  language used to look up the section labels
 * @param url address of the page; also used as the document's base URI so that
 *            {@code abs:href} attributes resolve to absolute links
 * @return the resource assembled from the page
 * @throws IOException if the page cannot be retrieved
 */
public static Resource getResource(Language lg, String url) throws IOException {
    Document doc = JSoupManager.getDocument(url);
    doc.setBaseUri(url);
    String name = doc.getElementsByClass("ak-return-link").first().text();
    String level = null;
    Elements levelElements = doc.getElementsByClass("ak-encyclo-detail-level");
    if (! levelElements.isEmpty()) {
        // The label is plain text, so strip it literally; replaceAll would treat it as a regex.
        level = levelElements.first().text()
            .replace(Translator.getLabel(lg, "resource.extract.level") + " ", "");
    }
    String type = doc.getElementsByClass("ak-encyclo-detail-type").last().children().last().text();

    String skinURL = doc.getElementsByClass("ak-encyclo-detail-illu").first()
            .getElementsByTag("img").first().attr("src");

    String description = null;
    String effects = null;
    String bonus = null;
    String sorts = null;
    String recipe = null;
    List<String> monsterDrops = new ArrayList<>();

    Elements titles = doc.getElementsByClass("ak-panel-title");
    for (Element title : titles) {
        String heading = title.text();
        if (heading.equals(Translator.getLabel(lg, "resource.extract.description"))) {
            description = title.parent().getElementsByClass("ak-panel-content").first().text();
        } else if (heading.equals(Translator.getLabel(lg, "resource.extract.effets"))) {
            effects = extractStatsFromTitle(lg, title);
        } else if (heading.equals(Translator.getLabel(lg, "resource.extract.bonus"))) {
            bonus = extractLinesFromTitle(title);
        } else if (heading.equals(Translator.getLabel(lg, "resource.extract.sorts"))) {
            sorts = title.parent().getElementsByClass("ak-panel-content").first().text();
        } else if (heading.equals(Translator.getLabel(lg, "resource.extract.monsterDrop"))) {
            monsterDrops = extractDrops(title.parent());
        } else if (heading.equals(Translator.getLabel(lg, "resource.extract.recette"))) {
            // One markdown line per ingredient: "<quantity> [<name>](<absolute link>)".
            StringBuilder tmp = new StringBuilder();
            for (Element line : title.parent().getElementsByClass("ak-column")) {
                Element ingredient = line.getElementsByClass("ak-title").first();
                tmp.append(line.getElementsByClass("ak-front").text()).append(" [")
                        .append(ingredient.text()).append("](")
                        .append(ingredient.children().first().attr("abs:href")).append(")\n");
            }
            recipe = tmp.toString();
        }
    }

    return new Resource(name, type, level, description, effects, URLManager.abs(skinURL), url,
            bonus, sorts, recipe, monsterDrops);
}

Source File: UntisInfoHeadlessParser.java From substitution-schedule-parser with Mozilla Public License 2.0

4 votes

/**
 * Logs in, downloads the schedule page at {@code url} and converts it into a
 * {@code SubstitutionSchedule}.
 *
 * <p>Day headings are looked up under {@code #vertretung}. If none are found and the page
 * contains a matching frame, the frame's document is fetched and searched instead; if there is
 * no such frame, a fallback selector is tried. When headings exist, each one is parsed as a
 * day; otherwise, if the document contains a matching table row, the document is parsed as a
 * substitution table.
 *
 * @return the schedule, with classes and teachers set
 * @throws IOException                if a page cannot be fetched
 * @throws JSONException              declared by the signature
 * @throws CredentialInvalidException declared by the signature
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
		throws IOException, JSONException, CredentialInvalidException {
	new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);

	final SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);

	Document page = Jsoup.parse(httpGet(url, data.optString(PARAM_ENCODING, null)));
	page.setBaseUri(url);
	Elements dayHeadings = page.select("#vertretung > p > b, #vertretung > b");
	final Elements frames = page.select("frame[src*=w00]");

	if (dayHeadings.isEmpty()) {
		if (!frames.isEmpty()) {
			// doc is embedded in frame
			final String frameUrl = frames.get(0).absUrl("src");
			page = Jsoup.parse(httpGet(frameUrl, data.optString(PARAM_ENCODING, null)));
			dayHeadings = page.select("#vertretung > p > b, #vertretung > b");
		} else {
			// seen at GHS Berlin, different kinds of center > font > center ... stacked (sometimes within #vertretung)
			dayHeadings = page.select("center > font > p > b");
		}
	}

	final List<String> allClasses = getAllClasses();
	if (!dayHeadings.isEmpty()) {
		// untis-info days
		for (Element heading : dayHeadings) {
			final SubstitutionScheduleDay day = new SubstitutionScheduleDay();
			day.setLastChangeString("");

			final String dateText = heading.text();
			day.setDateString(dateText);
			day.setDate(ParserUtils.parseDate(dateText));

			// The element handed to parseDay depends on whether the heading sits inside a <p>.
			final Element container = heading.parent();
			final Element start = "p".equals(container.tagName())
					? container.nextElementSibling().nextElementSibling()
					: container.select("p").first().nextElementSibling();
			parseDay(day, start, schedule, null, allClasses);
		}
	} else if (!page.select("tr:has(td[align=center]):gt(0)").isEmpty()) {
		// untis-subst table
		parseSubstitutionTable(schedule, null, page);
	}

	schedule.setClasses(allClasses);
	schedule.setTeachers(getAllTeachers());
	return schedule;
}

Source File: HtmlParse.java From ChipHellClient with Apache License 2.0

4 votes

/**
 * Parses the list of plate (board) groups.
 *
 * <p>Every {@code bm} element becomes one group whose title comes from its {@code bm_h}
 * child; every {@code bm_c} element inside it becomes one plate.
 *
 * @param content HTML of the page listing the groups
 * @return the parsed groups, in document order
 */
public static List<PlateGroup> parsePlateGroupList(String content) {
    final List<PlateGroup> result = new ArrayList<PlateGroup>();
    final Document page = Jsoup.parse(content);
    page.setBaseUri(Constants.BASE_URL);

    for (Element groupElement : page.getElementsByClass("bm")) {
        final PlateGroup group = new PlateGroup();
        group.setTitle(groupElement.getElementsByClass("bm_h").first().text());

        final List<Plate> plates = new ArrayList<Plate>();
        for (Element row : groupElement.getElementsByClass("bm_c")) {
            final Plate plate = new Plate();

            // Links: the first is the plate link; a second one, if present, is the
            // "remove from favorites" link.
            final Elements links = row.getElementsByTag("a");
            final Element plateLink = links.first();
            final String plateTitle = plateLink.text();
            final String plateUrl = plateLink.absUrl("href");

            // Fall back to "(0)" when the row has no xg1 element.
            final Elements counters = row.getElementsByClass("xg1");
            final String countText = counters.isEmpty() ? "(0)" : counters.first().text();

            // A second link means the plate is a favorite; its id is the favid parameter.
            final String favoriteId = links.size() > 1
                    ? new UrlParamsMap(links.get(1).absUrl("href")).get("favid")
                    : null;

            plate.setTitle(plateTitle);
            plate.setUrl(plateUrl);
            plate.setXg1(countText);
            plate.setFavoriteId(favoriteId);
            plates.add(plate);
        }

        group.setPlates(plates);
        result.add(group);
    }

    return result;
}

Java Code Examples for org.jsoup.nodes.Document#setBaseUri()