use of org.jsoup.nodes.Document in project cucumber-jvm by cucumber.
the class HTMLFormatterTest method writes_index_html.
/**
 * Parses the {@code index.html} resolved against {@code outputDir} and checks that the
 * first element carrying the {@code cucumber-report} class has no text content.
 */
@Test
public void writes_index_html() throws IOException {
    File indexFile = new File(new URL(outputDir, "index.html").getFile());
    Document report = Jsoup.parse(indexFile, "UTF-8");

    // Only the first matching container is inspected.
    Element container = report.body().getElementsByClass("cucumber-report").first();
    assertEquals("", container.text());
}
use of org.jsoup.nodes.Document in project webmagic by code4craft.
the class CharsetUtils method detectCharset.
/**
 * Detects the charset of a fetched page.
 *
 * <p>Resolution order:
 * <ol>
 *   <li>the charset extracted from the HTTP {@code Content-Type} header value;</li>
 *   <li>a {@code <meta>} declaration inside the document, either the HTML 4.01 form
 *       ({@code <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">})
 *       or the HTML5 form ({@code <meta charset="UTF-8">}).</li>
 * </ol>
 *
 * @param contentType  value of the {@code Content-Type} header
 * @param contentBytes raw response body
 * @return the detected charset name, or whatever {@code UrlUtils.getCharset} produced
 *         (possibly null/empty) when neither the header nor a meta tag declares one
 * @throws IOException declared for API compatibility with existing callers
 */
public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
    // 1. encoding in http header Content-Type
    String charset = UrlUtils.getCharset(contentType);
    // Only short-circuit when the header actually yielded a charset; a Content-Type such as
    // "text/html" carries none and must fall through to the meta-tag inspection below.
    if (StringUtils.isNotBlank(contentType) && StringUtils.isNotBlank(charset)) {
        logger.debug("Auto get charset: {}", charset);
        return charset;
    }
    // Decode once with the platform default charset; this is only used to locate the
    // meta tags, whose markup is plain ASCII.
    Charset defaultCharset = Charset.defaultCharset();
    String content = new String(contentBytes, defaultCharset);
    // 2. charset in meta
    if (StringUtils.isNotEmpty(content)) {
        Document document = Jsoup.parse(content);
        Elements metas = document.select("meta");
        for (Element meta : metas) {
            // 2.1 html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
            String metaContent = meta.attr("content");
            int charsetIndex = metaContent.indexOf("charset");
            if (charsetIndex != -1) {
                String[] parts = metaContent.substring(charsetIndex).split("=");
                // Guard against a malformed attribute such as content="charset" (no '=' or
                // no value), which previously raised ArrayIndexOutOfBoundsException.
                if (parts.length > 1 && StringUtils.isNotBlank(parts[1])) {
                    charset = parts[1].trim();
                    break;
                }
            }
            // 2.2 html5 <meta charset="UTF-8" />
            String metaCharset = meta.attr("charset");
            if (StringUtils.isNotEmpty(metaCharset)) {
                charset = metaCharset;
                break;
            }
        }
    }
    logger.debug("Auto get charset: {}", charset);
    // 3. todo use tools as cpdetector for content decode
    return charset;
}
use of org.jsoup.nodes.Document in project ZhihuDailyPurify by izzyleung.
the class NewsListFromZhihuObservable method convertToDailyNews.
/**
 * Builds a {@link DailyNews} from a story and its parsed document.
 *
 * @param pair the story ({@code first}) together with its document ({@code second})
 * @return the populated news item when every extracted question satisfies
 *         {@code Question::isValidZhihuQuestion}; an empty Optional otherwise
 */
private static Optional<DailyNews> convertToDailyNews(Pair<Story, Document> pair) {
    final Story story = pair.first;
    final String dailyTitle = story.getDailyTitle();
    final List<Question> questions = getQuestions(pair.second, dailyTitle);

    // A single invalid question discards the whole item.
    boolean everyQuestionValid = Stream.of(questions).allMatch(Question::isValidZhihuQuestion);
    if (!everyQuestionValid) {
        return Optional.empty();
    }

    DailyNews news = new DailyNews();
    news.setDailyTitle(dailyTitle);
    news.setThumbnailUrl(story.getThumbnailUrl());
    news.setQuestions(questions);
    return Optional.ofNullable(news);
}
use of org.jsoup.nodes.Document in project jsoup by jhy.
the class ParseTest method testYahooJp.
@Test
public void testYahooJp() throws IOException {
File in = getFile("/htmltests/yahoo-jp.html");
// http charset is utf-8.
Document doc = Jsoup.parse(in, "UTF-8", "http://www.yahoo.co.jp/index.html");
assertEquals("Yahoo! JAPAN", doc.title());
Element a = doc.select("a[href=t/2322m2]").first();
assertEquals("http://www.yahoo.co.jp/_ylh=X3oDMTB0NWxnaGxsBF9TAzIwNzcyOTYyNjUEdGlkAzEyBHRtcGwDZ2Ex/t/2322m2", // session put into <base>
a.attr("abs:href"));
assertEquals("全国、人気の駅ランキング", a.text());
}
use of org.jsoup.nodes.Document in project jsoup by jhy.
the class ParseTest method testHtml5Charset.
@Test
public void testHtml5Charset() throws IOException {
// test that <meta charset="gb2312"> works
File in = getFile("/htmltests/meta-charset-1.html");
//gb2312, has html5 <meta charset>
Document doc = Jsoup.parse(in, null, "http://example.com/");
assertEquals("新", doc.text());
assertEquals("GB2312", doc.outputSettings().charset().displayName());
// double check, no charset, falls back to utf8 which is incorrect
//
in = getFile("/htmltests/meta-charset-2.html");
// gb2312, no charset
doc = Jsoup.parse(in, null, "http://example.com");
assertEquals("UTF-8", doc.outputSettings().charset().displayName());
assertFalse("新".equals(doc.text()));
// confirm fallback to utf8
in = getFile("/htmltests/meta-charset-3.html");
// utf8, no charset
doc = Jsoup.parse(in, null, "http://example.com/");
assertEquals("UTF-8", doc.outputSettings().charset().displayName());
assertEquals("新", doc.text());
}
Aggregations