use of org.jsoup.select.Elements in project SpiderForUESTC by ClaudiusGitHub.
the class FindComent method findCommentByUid.
/**
 * Fetches one page of comments and outputs every comment entry that mentions
 * the {@code uid} field.
 *
 * <p>The request is retried up to 10 times while the server answers with a
 * non-200 status. If the status is still not 200 afterwards, the body is not
 * inspected and {@code false} is returned.
 *
 * @param commentId id inserted into the comment URL between the URL prefix and middle part
 * @param tid identifier appended to {@code url} in each emitted line (presumably the thread id)
 * @param page which page of comments to fetch
 * @return {@code true} if the fetched page contains the "下一页" (next page) marker,
 *         i.e. there is another page of comments; {@code false} otherwise or on I/O failure
 */
public boolean findCommentByUid(String commentId, int tid, int page) {
    boolean result = false;
    String commentUrl = commentUrlPre + commentId + commentUrlMid + page + commentUrlSuf;
    HttpGet httpGet = new HttpGet(commentUrl);
    httpGet.setConfig(requestConfig);
    try {
        CloseableHttpResponse httpResponse = httpClient.execute(httpGet);
        int times = 10;
        while (httpResponse.getStatusLine().getStatusCode() != 200 && times > 0) {
            // Release the failed response before issuing the retry.
            httpResponse.close();
            httpResponse = httpClient.execute(httpGet);
            times--;
        }
        try {
            // Retries exhausted: do not treat an error page as comment data.
            if (httpResponse.getStatusLine().getStatusCode() != 200) {
                return result;
            }
            HttpEntity httpEntity = httpResponse.getEntity();
            // A response without a body carries no comments (and would make
            // EntityUtils.toString throw).
            if (httpEntity == null) {
                return result;
            }
            String response = EntityUtils.toString(httpEntity);
            if (response.contains("下一页")) {
                result = true;
            }
            if (!response.contains("" + uid)) {
                return result;
            }
            // Fixed-length wrapper stripped from both ends of the body before parsing
            // (presumably the envelope around the HTML fragment - TODO confirm).
            final int prefixLength = 55;
            final int suffixLength = 10;
            int length = response.length();
            if (length < prefixLength + suffixLength) {
                // Too short to contain the wrapper; substring would throw.
                return result;
            }
            response = response.substring(prefixLength, length - suffixLength);
            Document document = Jsoup.parse(response);
            Elements elements = document.getElementsByClass("psti");
            for (Element element : elements) {
                if (element.toString().contains(uid + "")) {
                    System.out.print(element.text());
                    output(url + tid + " " + element.text());
                }
            }
        } finally {
            httpResponse.close();
        }
    } catch (IOException e) {
        httpGet.releaseConnection();
        e.printStackTrace();
    }
    return result;
}
use of org.jsoup.select.Elements in project jabref by JabRef.
the class GoogleScholar method findFullText.
/**
 * Looks up a full-text PDF link for the given entry by searching for its title.
 *
 * <p>Entries without a title yield an empty result. Otherwise the result page is
 * scanned for up to {@code NUM_RESULTS} result slots and the first non-empty
 * {@code href} found is returned.
 *
 * @param entry the entry to search for; must not be {@code null}
 * @return the first PDF link found, or an empty {@code Optional} if none
 * @throws IOException if fetching the result page fails or the link is not a valid URL
 * @throws FetcherException if the search URI cannot be built
 */
@Override
public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherException {
    Objects.requireNonNull(entry);
    // Search in title
    if (!entry.hasField(FieldName.TITLE)) {
        return Optional.empty();
    }

    final Document resultPage;
    try {
        URIBuilder query = new URIBuilder(SEARCH_IN_TITLE_URL);
        query.addParameter("as_q", "");
        query.addParameter("as_epq", entry.getField(FieldName.TITLE).orElse(null));
        query.addParameter("as_occt", "title");
        resultPage = Jsoup.connect(query.toString()).userAgent(URLDownload.USER_AGENT).get();
    } catch (URISyntaxException e) {
        throw new FetcherException("Building URI failed.", e);
    }

    // TODO: link always on first result or none?
    for (int slot = 0; slot < NUM_RESULTS; slot++) {
        Elements anchors = resultPage.select(String.format("#gs_ggsW%s a", slot));
        if (anchors.first() == null) {
            continue;
        }
        String href = anchors.first().attr("href");
        // link present?
        if ("".equals(href)) {
            continue;
        }
        // TODO: check title inside pdf + length?
        // TODO: report error function needed?! query -> result
        LOGGER.info("Fulltext PDF found @ Google: " + href);
        return Optional.of(new URL(href));
    }
    return Optional.empty();
}
use of org.jsoup.select.Elements in project Gargoyle by callakrsos.
the class ArticleExtractorComposite method findAllLink.
/**
 * Collects the distinct absolute links found in the {@code <a href>} attributes
 * of the given HTML.
 *
 * <p>Only trimmed {@code href} values starting with {@code "http"} (which also
 * covers {@code "https"}) that pass the filter supplied by
 * {@code filterRepository} are kept.
 *
 * <p>Author: KYJ, written 2016-12-05.
 *
 * @param content HTML text to scan; {@code null} yields an empty result
 * @return the set of matching link strings, never {@code null}
 */
private Collection<String> findAllLink(String content) {
    if (content == null) {
        return Collections.emptySet();
    }
    // Parse the string directly: round-tripping through content.getBytes() (platform
    // charset) and decoding as UTF-8 corrupts non-ASCII text on non-UTF-8 platforms.
    Document parse = Jsoup.parse(content, "https");
    // NOTE: these results may need to be stored and managed through a repository.
    /* Extract only the a tags. */
    Elements elementsByTag = parse.getElementsByTag("a");
    return elementsByTag.stream()
            .filter(e -> e.hasAttr("href"))
            .map(e -> e.attr("href").trim())
            .filter(href -> href.startsWith("http"))
            .filter(filterRepository.getFilter())
            .collect(Collectors.toSet());
}
use of org.jsoup.select.Elements in project Gargoyle by callakrsos.
the class NaverRealtimeSearchFactory method parse.
/**
 * Parses the given HTML into realtime-search groups.
 *
 * <p>Every element with class {@code lst_realtime_srch} found under an element
 * whose class contains {@code realtime_srch} becomes one group, provided it has
 * a preceding sibling element (whose text is used as the group title). Each
 * {@code <li>} inside the list becomes one item; malformed items are skipped
 * instead of aborting the whole parse.
 *
 * @param htmlBody HTML text to parse
 * @return the parsed groups, possibly empty, never {@code null}
 */
public List<RealtimeSearchVO> parse(String htmlBody) {
    Document doc = Jsoup.parse(htmlBody);
    Elements lists = doc.select("[class*='realtime_srch']").select(".lst_realtime_srch");
    return lists.stream()
            .map(this::parseRealtimeList)
            .filter(v -> v != null)
            .collect(Collectors.toList());
}

/** Builds one group from a list element, or returns {@code null} if it has no preceding sibling. */
private RealtimeSearchVO parseRealtimeList(Element listElement) {
    Element previousElementSibling = listElement.previousElementSibling();
    if (previousElementSibling == null) {
        return null;
    }
    RealtimeSearchVO realtimeSearchVO = new RealtimeSearchVO();
    String text = previousElementSibling.text();
    // Titles of 15 characters or more are cut to 15 characters and suffixed with "...".
    if (text.length() >= 15) {
        realtimeSearchVO.setTitle(text.substring(0, 15) + "...");
    } else {
        realtimeSearchVO.setTitle(text);
    }
    List<RealtimeSearchItemVO> items = listElement.getElementsByTag("li").stream()
            .map(this::parseRealtimeItem)
            .filter(item -> item != null)
            .collect(Collectors.toList());
    realtimeSearchVO.setItems(items);
    return realtimeSearchVO;
}

/** Builds one item from an {@code <li>}, or returns {@code null} if a required part is missing or invalid. */
private RealtimeSearchItemVO parseRealtimeItem(Element li) {
    Element aTag = li.getElementsByTag("a").first();
    Element numElement = li.getElementsByClass("num").first();
    Element titElement = li.getElementsByClass("tit").first();
    // first() returns null when nothing matched; skip such entries rather than throwing NPE.
    if (aTag == null || numElement == null || titElement == null) {
        LOGGER.debug("skipping incomplete realtime search entry : {}", li.toString());
        return null;
    }
    String url = aTag.getElementsByAttribute("href").attr("href");
    String num = numElement.text();
    String title = titElement.text();
    int rank;
    try {
        rank = Integer.parseInt(num, 10);
    } catch (NumberFormatException e) {
        LOGGER.debug("skipping realtime search entry with non-numeric rank [{}] : {}", num, li.toString());
        return null;
    }
    RealtimeSearchItemVO item = new RealtimeSearchItemVO();
    item.setRank(rank);
    item.setKeyword(title);
    item.setLink(url);
    LOGGER.debug("title [{}] num [{}] url : [{}] , toString : {}", title, num, url, li.toString());
    return item;
}
use of org.jsoup.select.Elements in project CodeUtils by boredream.
the class Main method getAllAppLinks.
/**
 * Fetches the page at {@code hostUrl} and returns the {@code href} value of every
 * {@code <a>} element inside the element with id {@code apps-dropdown}.
 *
 * @return the link values in document order; empty if the page has no
 *         {@code apps-dropdown} element
 * @throws Exception if fetching the page fails
 */
public static List<String> getAllAppLinks() throws Exception {
    List<String> links = new ArrayList<>();
    String response = HttpUtils.getString(hostUrl);
    Document page = Jsoup.parse(response);
    Element appsElement = page.getElementById("apps-dropdown");
    // getElementById returns null when the id is absent; avoid the NPE below.
    if (appsElement == null) {
        return links;
    }
    for (Element anchor : appsElement.getElementsByTag("a")) {
        links.add(anchor.attr("href"));
    }
    return links;
}
Aggregations