Use of org.jsoup.nodes.Element in project Java-Tutorial by gpcodervn.
The class JSoupExample, method getAllPostImages.
// Get all featured images of the posts
public static void getAllPostImages() throws IOException {
    Document doc = Jsoup.connect("https://gpcoder.com").get();
    // select <img> tags inside <article> whose src ends in a common image extension (case-insensitive)
    Elements images = doc.select("article img[src~=(?i)\\.(png|jpe?g|gif)]");
    for (Element image : images) {
        System.out.println("\nsrc : " + image.attr("src"));
        System.out.println("height : " + image.attr("height"));
        System.out.println("width : " + image.attr("width"));
        System.out.println("alt : " + image.attr("alt"));
    }
}
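jsoup's [attr~=regex] selector matches an attribute value against a regular expression. A minimal, self-contained sketch of the same selector applied to an inline HTML fragment (the markup below is invented for illustration, so nothing depends on the live site):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorDemo {
    public static void main(String[] args) {
        String html = "<article>"
                + "<img src='/a/photo.PNG' alt='logo'>"
                + "<img src='/b/icon.svg' alt='vector'>" // filtered out: .svg is not in the regex
                + "</article>";
        Document doc = Jsoup.parse(html);
        // (?i) makes the extension match case-insensitive, so photo.PNG matches too
        for (Element img : doc.select("article img[src~=(?i)\\.(png|jpe?g|gif)]")) {
            System.out.println(img.attr("src") + " / " + img.attr("alt"));
        }
    }
}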
Use of org.jsoup.nodes.Element in project lavaplayer by sedmelluq.
The class YoutubeAudioSourceManager, method extractPlaylistTracks.
private String extractPlaylistTracks(Element videoContainer, Element loadMoreContainer, List<AudioTrack> tracks) {
    for (Element video : videoContainer.select(".pl-video")) {
        Elements lengthElements = video.select(".timestamp span");
        // If the timestamp element does not exist, the video is private
        if (!lengthElements.isEmpty()) {
            String videoId = video.attr("data-video-id").trim();
            String title = video.attr("data-title").trim();
            String author = video.select(".pl-video-owner a").text().trim();
            long duration = DataFormatTools.durationTextToMillis(lengthElements.first().text());
            tracks.add(buildTrackObject(videoId, title, author, false, duration));
        }
    }
    if (loadMoreContainer != null) {
        Elements more = loadMoreContainer.select(".load-more-button");
        if (!more.isEmpty()) {
            return more.first().attr("data-uix-load-more-href");
        }
    }
    return null;
}
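DataFormatTools.durationTextToMillis is a lavaplayer helper that converts a timestamp such as "3:07" or "1:02:45" into milliseconds. A hypothetical sketch of what such a parser might look like (for orientation only; not lavaplayer's actual implementation):

// Hypothetical "h:mm:ss" / "m:ss" parser; the real DataFormatTools may behave differently.
static long durationTextToMillis(String duration) {
    long seconds = 0;
    for (String part : duration.split(":")) {
        // each colon-separated field shifts the accumulated value by one base-60 place
        seconds = seconds * 60 + Long.parseLong(part.trim());
    }
    return seconds * 1000;
}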
Use of org.jsoup.nodes.Element in project lavaplayer by sedmelluq.
The class YoutubeSearchProvider, method extractTrackFromResultEntry.
private void extractTrackFromResultEntry(List<AudioTrack> tracks, Element element) {
    Element durationElement = element.select("[class^=video-time]").first();
    Element contentElement = element.select(".yt-lockup-content").first();
    String videoId = element.attr("data-context-item-id");
    if (durationElement == null || contentElement == null || videoId.isEmpty()) {
        return;
    }
    long duration = DataFormatTools.durationTextToMillis(durationElement.text());
    String title = contentElement.select(".yt-lockup-title > a").text();
    String author = contentElement.select(".yt-lockup-byline > a").text();
    tracks.add(sourceManager.buildTrackObject(videoId, title, author, false, duration));
}
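The [class^=video-time] selector matches any element whose class attribute starts with that prefix, so it tolerates suffixed class names. A self-contained sketch (the markup below is invented, not real YouTube HTML):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

public class PrefixSelectorDemo {
    public static void main(String[] args) {
        Element entry = Jsoup.parse(
                "<div data-context-item-id='abc123'>"
                + "<span class='video-time-overlay'>3:07</span>"
                + "<div class='yt-lockup-content'>"
                + "<h3 class='yt-lockup-title'><a href='#'>Some title</a></h3>"
                + "</div></div>").body().child(0);
        // matches video-time-overlay because its class starts with "video-time"
        System.out.println(entry.select("[class^=video-time]").first().text()); // prints 3:07
        System.out.println(entry.attr("data-context-item-id"));                 // prints abc123
    }
}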
Use of org.jsoup.nodes.Element in project epadd by ePADD.
The class Highlighter, method getHTMLAnnotatedDocumentContents.
/**
 * A convenience method that annotates all the terms in termsToHighlight, termsToHyperlink and entitiesWithId,
 * and also hyperlinks any URLs found in the content.
 * @param contents the content to be annotated, typically the text in an email body
 * @param regexToHighlight the output will highlight all strings matching this regex
 * @param showDebugInfo when set, appends debug info about the entities present in the content (passed through entitiesWithId) to the output
 *
 * Note: do NOT modify any of the objects passed as parameters;
 * if one needs to be modified, clone it and modify the local copy.
 */
// TODO: can also get rid of termsToHyperlink
public static String getHTMLAnnotatedDocumentContents(String contents, Date d, String docId, String regexToHighlight, Set<String> termsToHighlight, Map<String, EmailRenderer.Entity> entitiesWithId, Set<String> termsToHyperlink, boolean showDebugInfo) {
    Set<String> highlightTerms = new LinkedHashSet<>(), hyperlinkTerms = new LinkedHashSet<>();
    if (termsToHighlight != null)
        highlightTerms.addAll(termsToHighlight);
    if (termsToHyperlink != null)
        hyperlinkTerms.addAll(termsToHyperlink);
    if (log.isDebugEnabled())
        log.debug("DocId: " + docId + "; Highlight terms: " + highlightTerms + "; Entities: " + entitiesWithId + "; Hyperlink terms: " + hyperlinkTerms);
    short HIGHLIGHT = 0, HYPERLINK = 1;
    // pp stands for post-process, as we cannot add complex tags while highlighting
    String preHighlightTag = "<span class='hilitedTerm rounded' >", postHighlightTag = "</span>";
    String preHyperlinkTag = "<span data-process='pp'>", postHyperlinkTag = "</span>";
    // since URLs are not tokenized as one token, they cannot be highlighted with the Lucene highlighter
    Pattern p = Pattern.compile("https?://[^\\s\\n]*");
    Matcher m = p.matcher(contents);
    StringBuffer sb = new StringBuffer();
    while (m.find()) {
        String link = m.group();
        String url = link;
        if (d != null) {
            Calendar c = new GregorianCalendar();
            c.setTime(d);
            // Calendar.MONTH is zero-based, hence the +1 when building the yyyyMMdd archive date
            String archiveDate = c.get(Calendar.YEAR) + String.format("%02d", c.get(Calendar.MONTH) + 1) + String.format("%02d", c.get(Calendar.DATE)) + "120000";
            url = "http://web.archive.org/web/" + archiveDate + "/" + link;
        }
        m.appendReplacement(sb, Matcher.quoteReplacement("<a target=\"_blank\" href=\"" + url + "\">" + link + "</a> "));
    }
    m.appendTail(sb);
    contents = sb.toString();
    if (!Util.nullOrEmpty(regexToHighlight)) {
        contents = annotateRegex(contents, regexToHighlight, preHighlightTag, postHighlightTag);
    }
    List<String> catchTerms = Arrays.asList("class", "span", "data", "ignore");
    Set<String> ignoreTermsForHyperlinking = catchTerms.stream().map(String::toLowerCase).collect(Collectors.toSet());
    // the entitiesWithId keys are already canonicalized by the tokenizer/analyzer
    if (entitiesWithId != null)
        hyperlinkTerms.addAll(entitiesWithId.keySet().stream().filter(term -> !ignoreTermsForHyperlinking.contains(term.trim().toLowerCase())).map(term -> "\"" + term + "\"").collect(Collectors.toSet()));
    // If there are overlapping annotations, they need to be serialized.
    // "order" holds that serialized order: it maps each string to be annotated -> whether to highlight or hyperlink it.
    List<Pair<String, Short>> order = new ArrayList<>();
    // use a LinkedHashSet to preserve order, so that highlight terms are seen before hyperlink terms
    Set<String> allTerms = new LinkedHashSet<>();
    allTerms.addAll(highlightTerms);
    /*
     * We want to assign the order in which terms are highlighted or hyperlinked.
     * For example: if we want to annotate both "Robert" and "Robert Creeley", and we annotate "Robert" first, we may miss "Robert Creeley",
     * so we assign an order over strings that share any common words, as done in the loop below.
     * This test can still miss cases, e.g. when a regular expression eventually matches a word that is already annotated,
     * or when two terms like "Robert Creeley" and "Mr Robert" both match a text like "Mr Robert Creeley";
     * in such cases one of the terms may not be annotated.
     * Terms added to o are those that share at least one word.
     * TODO: give preference to highlighting over hyperlinking
     * TODO: remove order and simplify
     */
    // LinkedHashMap preserves order, so highlight terms that are added first stay first
    Map<Pair<String, Short>, Integer> o = new LinkedHashMap<>();
    // prioritised terms; note that a term can be marked both for highlight and hyperlink
    Set<String> consTermsHighlight = new HashSet<>(), consTermsHyperlink = new HashSet<>();
    for (String at : allTerms) {
        // Catch: if we are trying to highlight terms like "class", "span", etc.,
        // we had better annotate them first, as they may otherwise match inside span tags and break the highlighter
        Set<String> substrs = IndexUtils.computeAllSubstrings(at);
        for (String substr : substrs) {
            if (at.equals(substr) || at.equals("\"" + substr + "\""))
                continue;
            boolean match = catchTerms.contains(substr.toLowerCase());
            int val = match ? Integer.MAX_VALUE : substr.length();
            // The highlight or hyperlink terms may be quoted; the special handling below is for that. Is there a better way?
            if (highlightTerms.contains(substr) || highlightTerms.contains("\"" + substr + "\"")) {
                highlightTerms.remove(substr);
                highlightTerms.remove("\"" + substr + "\"");
                // there should be no repetitions in the order array, else it leads to multiple annotations, i.e. two spans around one element
                if (!consTermsHighlight.contains(substr)) {
                    o.put(new Pair<>(substr, HIGHLIGHT), val);
                    consTermsHighlight.add(substr);
                }
            }
            if (hyperlinkTerms.contains(substr) || hyperlinkTerms.contains("\"" + substr + "\"")) {
                hyperlinkTerms.remove(substr);
                hyperlinkTerms.remove("\"" + substr + "\"");
                if (!consTermsHyperlink.contains(substr)) {
                    o.put(new Pair<>(substr, HYPERLINK), val);
                    consTermsHyperlink.add(substr);
                }
            }
        }
    }
    // now sort the phrases from longest to shortest
    List<Pair<Pair<String, Short>, Integer>> os = Util.sortMapByValue(o);
    // collect to a List, not a Set, so the sorted order just computed is preserved
    order.addAll(os.stream().map(pair -> pair.first).collect(Collectors.toList()));
    // annotate whatever is left in highlightTerms and hyperlinkTerms
    String result = highlightBatch(contents, highlightTerms.toArray(new String[highlightTerms.size()]), preHighlightTag, postHighlightTag);
    result = highlightBatch(result, hyperlinkTerms.toArray(new String[hyperlinkTerms.size()]), preHyperlinkTag, postHyperlinkTag);
    // now annotate the remaining terms in order
    for (Pair<String, Short> ann : order) {
        short type = ann.second;
        String term = ann.first;
        String preTag = null, postTag = null;
        if (type == HYPERLINK) {
            preTag = preHyperlinkTag;
            postTag = postHyperlinkTag;
        } else if (type == HIGHLIGHT) {
            preTag = preHighlightTag;
            postTag = postHighlightTag;
        }
        try {
            result = highlight(result, term, preTag, postTag);
        } catch (IOException | InvalidTokenOffsetsException | ParseException e) {
            Util.print_exception("Exception while adding html annotation: " + ann.first, e, log);
            e.printStackTrace();
        }
    }
    // do some line breaking and show overflow
    String[] lines = result.split("\\n");
    StringBuilder htmlResult = new StringBuilder();
    // note: overflow is never set to true in this excerpt, so the block below is effectively dead code here
    boolean overflow = false;
    for (String line : lines) {
        htmlResult.append(line);
        htmlResult.append("\n<br/>");
    }
    if (overflow) {
        htmlResult.append("</div>\n");
        // the nojog class ensures that the jog doesn't pop up when the More button is clicked
        htmlResult.append("<span class=\"nojog\" style=\"color:#500050;text-decoration:underline;font-size:12px\" onclick=\"muse.reveal(this, false);\">More</span><br/>\n");
    }
    // Now post-process to add complex tags that depend on the text inside: title, link and css class
    org.jsoup.nodes.Document doc = Jsoup.parse(htmlResult.toString());
    Elements elts = doc.select("[data-process]");
    for (int j = 0; j < elts.size(); j++) {
        Element elt = elts.get(j);
        Element par = elt.parent();
        // Do not touch nested entities. (jsoup's attr() returns "" for a missing attribute, never null, so hasAttr() is the right check here)
        if (par != null && par.hasAttr("data-process"))
            continue;
        String entity = elt.text();
        int span_j = j;
        // &quot; is used because the quotes have to survive through the html page and reflect back in the URL
        String link = "browse?adv-search=1&termBody=on&termSubject=on&termAttachments=on&termOriginalBody=on&term=\"" + Util.escapeHTML(entity) + "\"";
        // may need to URI-escape docId?
        link += "&initDocId=" + docId;
        String title = "";
        try {
            String cssclass = "";
            EmailRenderer.Entity info = entitiesWithId.get(entity);
            if (info != null) {
                if (info.ids != null) {
                    title += "<div id=\"fast_" + info.ids + "\"></div>";
                    title += "<script>getFastData(\"" + info.ids + "\");</script>";
                    cssclass = "resolved";
                } else {
                    // the last three types are OpenNLP's
                    // overlapping sub-classes could have been defined to reduce repetition in the css file, but this way gives more flexibility
                    String[] types = new String[] { "cp", "cl", "co", "person", "org", "place", "acr" };
                    String[] cssclasses = new String[] { "custom-people", "custom-loc", "custom-org", "opennlp-person", "opennlp-org", "opennlp-place", "acronym" };
                    outer: for (String et : info.types) {
                        for (int t = 0; t < types.length; t++) {
                            String type = types[t];
                            if (type.equals(et)) {
                                if (t < 3) {
                                    cssclass += cssclasses[t] + " ";
                                    // consider no other class
                                    continue outer;
                                } else {
                                    cssclass += cssclasses[t] + " ";
                                }
                            }
                        }
                    }
                }
            } else {
                cssclass += " unresolved";
            }
            // enables completion (expansion) of words while browsing messages
            if (entity != null) {
                // enable for only a few types
                if (cssclass.contains("custom-people") || cssclass.contains("acronym") || cssclass.contains("custom-org") || cssclass.contains("custom-loc")) {
                    // TODO: remove regexes
                    entity = entity.replaceAll("(^\\s+|\\s+$)", "");
                    if (!entity.contains(" ")) {
                        cssclass += " expand";
                    }
                }
            }
            for (int k = j; k <= span_j; k++) {
                elt = elts.get(k);
                // don't annotate nested tags -- double check whether the parent tag is a highlight-related tag or an entity-related annotation
                if (elt.parent().tag().getName().toLowerCase().equals("span") && elt.parent().classNames().toString().contains("custom")) {
                    continue;
                }
                String cc = elt.attr("class");
                elt.attr("class", cc + " " + cssclass);
                elt.attr("title", title);
                elt.attr("onclick", "window.location='" + link + "'");
                // A tag may have nested tags in it; elt.text() gathers the text inside all of them
                elt.attr("data-text", entity);
                elt.attr("data-docId", StringEscapeUtils.escapeHtml(docId));
            }
        } catch (Exception e) {
            Util.print_exception("Some unknown error while highlighting", e, log);
        }
    }
    // The output of Jsoup's .html() will put each tag on a separate line
    String html = doc.html();
    if (showDebugInfo) {
        String debug_html = html + "<br>";
        debug_html += "<div class='debug' style='display:none'>";
        debug_html += "docId: " + docId;
        debug_html += "<br>-------------------------------------------------<br>";
        for (String str : entitiesWithId.keySet()) debug_html += str + ":" + entitiesWithId.get(str).types + ";;; ";
        debug_html += "<br>-------------------------------------------------<br>";
        String[] opennlp = new String[] { "person", "place", "org" };
        String[] custom = new String[] { "cp", "cl", "co" };
        for (int j = 0; j < opennlp.length; j++) {
            String t1 = opennlp[j];
            String t2 = custom[j];
            Set<String> e1 = new HashSet<>();
            Set<String> e2 = new HashSet<>();
            for (String str : entitiesWithId.keySet()) {
                Set<String> types = entitiesWithId.get(str).types;
                if (types.contains(t1) && !types.contains(t2))
                    e1.add(entitiesWithId.get(str).name);
                else if (types.contains(t2) && !types.contains(t1))
                    e2.add(entitiesWithId.get(str).name);
            }
            debug_html += opennlp[j] + " entities recognised only by opennlp: " + e1;
            debug_html += "<br>";
            debug_html += opennlp[j] + " entities recognised only by custom: " + e2;
            debug_html += "<br><br>";
        }
        debug_html += "-------------------------------------------------<br>";
        lines = contents.split("\\n");
        for (String line : lines) debug_html += line + "<br>";
        debug_html += "</div>";
        // jQuery sets styles with .css(), not .style()
        debug_html += "<button onclick='$(\".debug\").css(\"display\",\"block\");'>Show Debug Info</button>";
        return debug_html;
    }
    return html;
}
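A minimal, hypothetical call site for this method; every argument below is invented for illustration and would normally come from ePADD's indexing pipeline:

// Hypothetical usage sketch; all values are made up.
String body = "Met Robert Creeley in Boston. See https://example.org/notes for details.";
Map<String, EmailRenderer.Entity> entities = new LinkedHashMap<>(); // normally filled by the entity extractor
String annotated = Highlighter.getHTMLAnnotatedDocumentContents(
        body,                                                  // contents: the email body text
        new Date(),                                            // d: used to build the web.archive.org link for URLs
        "doc-42",                                              // docId: invented id
        null,                                                  // regexToHighlight: none
        new LinkedHashSet<>(Arrays.asList("Robert Creeley")),  // termsToHighlight
        entities,                                              // entitiesWithId
        null,                                                  // termsToHyperlink
        false);                                                // showDebugInfo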
Use of org.jsoup.nodes.Element in project ripme by RipMeApp.
The class VkRipper, method ripImages.
private void ripImages() throws IOException {
    Map<String, String> photoIDsToURLs = new HashMap<>();
    int offset = 0;
    while (true) {
        logger.info(" Retrieving " + this.url);
        // POST body looks like: al=1&offset=80&part=1
        Map<String, String> postData = new HashMap<>();
        postData.put("al", "1");
        postData.put("offset", Integer.toString(offset));
        postData.put("part", "1");
        Document doc = Http.url(this.url).referrer(this.url).ignoreContentType().data(postData).post();
        String body = doc.toString();
        if (!body.contains("<div")) {
            break;
        }
        body = body.substring(body.indexOf("<div"));
        doc = Jsoup.parseBodyFragment(body);
        List<Element> elements = doc.select("a");
        Set<String> photoIDsToGet = new HashSet<>();
        for (Element a : elements) {
            if (!a.attr("onclick").contains("showPhoto('")) {
                logger.error("a: " + a);
                continue;
            }
            // pull the photo id out of onclick="showPhoto('<id>', ...)"
            String photoID = a.attr("onclick");
            photoID = photoID.substring(photoID.indexOf("showPhoto('") + "showPhoto('".length());
            photoID = photoID.substring(0, photoID.indexOf("'"));
            photoIDsToGet.add(photoID);
        }
        for (String photoID : photoIDsToGet) {
            if (!photoIDsToURLs.containsKey(photoID)) {
                try {
                    photoIDsToURLs.putAll(getPhotoIDsToURLs(photoID));
                } catch (IOException e) {
                    logger.error("Exception while retrieving photo id " + photoID, e);
                    continue;
                }
            }
            if (!photoIDsToURLs.containsKey(photoID)) {
                logger.error("Could not find URL for photo ID: " + photoID);
                continue;
            }
            String url = photoIDsToURLs.get(photoID);
            addURLToDownload(new URL(url));
            if (isStopped() || isThisATest()) {
                break;
            }
        }
        // fewer than a full page of results means we have reached the end
        if (elements.size() < 40 || isStopped() || isThisATest()) {
            break;
        }
        offset += elements.size();
    }
    waitForThreads();
}
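The substring-based id extraction above breaks if the onclick format shifts slightly; a regex-based alternative (an illustrative sketch, not VkRipper's actual code) is more tolerant:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class OnclickIdDemo {
    // Matches the first single-quoted argument of showPhoto('...', ...)
    private static final Pattern SHOW_PHOTO = Pattern.compile("showPhoto\\('([^']+)'");

    static String extractPhotoId(String onclick) {
        Matcher m = SHOW_PHOTO.matcher(onclick);
        return m.find() ? m.group(1) : null;
    }

    public static void main(String[] args) {
        // prints -12345_678
        System.out.println(extractPhotoId("return showPhoto('-12345_678', 'album', {}, event)"));
    }
}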