Search in sources :

Example 26 with Node

use of org.jsoup.nodes.Node in project opacclient by opacapp.

the class SISIS method parseDetail.

/**
 * Builds a {@link DetailedItem} from the HTML fragments of a catalogue detail view.
 *
 * <p>{@code html} is queried for the id, cover, title, the copies table and the volume-search
 * link; {@code html2} for the labelled detail fields; {@code html3} for reservation links,
 * download links and the red status text.
 *
 * @param html           markup parsed into the first document
 * @param html2          markup parsed into the second document
 * @param html3          markup parsed into the third document
 * @param coverJs        may be {@code null}; when non-null, the first
 *                       {@code <img ... src="...">} match in it supplies the cover URL
 * @param data           configuration object; only {@code "baseurl"} is read here and it is
 *                       used as the base URI of all three documents
 * @param stringProvider source of the labels used for generated details
 * @return the populated item (never {@code null})
 * @throws IOException declared by the signature; no statement in this body throws it directly
 */
static DetailedItem parseDetail(String html, String html2, String html3, String coverJs, JSONObject data, StringProvider stringProvider) throws IOException {
    Document doc = Jsoup.parse(html);
    String opac_url = data.optString("baseurl", "");
    doc.setBaseUri(opac_url);
    Document doc2 = Jsoup.parse(html2);
    doc2.setBaseUri(opac_url);
    Document doc3 = Jsoup.parse(html3);
    doc3.setBaseUri(opac_url);
    DetailedItem result = new DetailedItem();
    // Best effort: a failure to read the id must not abort the whole parse.
    try {
        result.setId(doc.select("#bibtip_id").text().trim());
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    // Collect reservation/order links (only the query string is kept).
    List<String> reservationlinks = new ArrayList<>();
    for (Element link : doc3.select("#vormerkung a, #tab-content a")) {
        String href = link.absUrl("href");
        Map<String, String> hrefq = getQueryParamsFirst(href);
        if (result.getId() == null) {
            // ID retrieval
            String key = hrefq.get("katkey");
            if (key != null) {
                result.setId(key);
                // NOTE(review): this also stops collecting reservation links - confirm intended.
                break;
            }
        }
        // Reservation ("Vormerken") or order links
        if (hrefq.get("methodToCall") != null) {
            if (hrefq.get("methodToCall").equals("doVormerkung") || hrefq.get("methodToCall").equals("doBestellung")) {
                reservationlinks.add(href.split("\\?")[1]);
            }
        }
    }
    if (reservationlinks.size() == 1) {
        result.setReservable(true);
        result.setReservation_info(reservationlinks.get(0));
    } else if (reservationlinks.size() == 0) {
        result.setReservable(false);
    } else {
        // TODO: Multiple options - handle this case!
    }
    // Fall back to the permalink text when no id has been found so far.
    if (result.getId() == null && doc.select("#permalink_link").size() > 0) {
        result.setId(doc.select("#permalink_link").text());
    }
    // Cover: prefer the image referenced by the cover script, otherwise a single image in the data table.
    if (coverJs != null) {
        Pattern srcPattern = Pattern.compile("<img .* src=\"([^\"]+)\">");
        Matcher matcher = srcPattern.matcher(coverJs);
        if (matcher.find()) {
            result.setCover(matcher.group(1));
        }
    } else if (doc.select(".data td img").size() == 1) {
        result.setCover(doc.select(".data td img").first().attr("abs:src"));
    }
    // Title: teaser title first, then the first bold cell, else empty.
    if (doc.select(".aw_teaser_title").size() == 1) {
        result.setTitle(doc.select(".aw_teaser_title").first().text().trim());
    } else if (doc.select(".data td strong").size() > 0) {
        result.setTitle(doc.select(".data td strong").first().text().trim());
    } else {
        result.setTitle("");
    }
    if (doc.select(".aw_teaser_title_zusatz").size() > 0) {
        result.addDetail(new Detail("Titelzusatz", doc.select(".aw_teaser_title_zusatz").text().trim()));
    }
    // First pass over the box container: "strong.c2" elements start a new label, everything
    // else is accumulated as that label's value.
    String title = "";
    String text = "";
    boolean takeover = false;
    Element detailtrs = doc2.select(".box-container .data td").first();
    // first() yields null when nothing matches; skip the scan instead of throwing a
    // NullPointerException (the second lookup below is guarded the same way).
    if (detailtrs != null) {
        for (Node node : detailtrs.childNodes()) {
            if (node instanceof Element) {
                Element element = (Element) node;
                if (element.tagName().equals("strong")) {
                    if (element.hasClass("c2")) {
                        if (!title.equals("")) {
                            result.addDetail(new Detail(title, text.trim()));
                        }
                        title = element.text().trim();
                        text = "";
                    } else {
                        text = text + element.text();
                    }
                } else {
                    if (element.tagName().equals("a")) {
                        if (element.text().trim().contains("hier klicken") || title.contains("Link")) {
                            text = text + node.attr("href");
                            // Carry the pending label/value over into the second pass.
                            takeover = true;
                            break;
                        } else {
                            text = text + element.text();
                        }
                    }
                }
            } else if (node instanceof TextNode) {
                text = text + ((TextNode) node).text();
            }
        }
    }
    if (!takeover) {
        // Nothing to carry over: the last pending pair of the first pass is discarded.
        text = "";
        title = "";
    }
    // Second pass: the tab content, either as one mixed cell or as a two-column table.
    detailtrs = doc2.select("#tab-content .data td").first();
    if (detailtrs != null) {
        for (Node node : detailtrs.childNodes()) {
            if (node instanceof Element) {
                if (((Element) node).tagName().equals("strong")) {
                    // A new label begins: flush the previous label/value pair first.
                    if (!text.equals("") && !title.equals("")) {
                        result.addDetail(new Detail(title.trim(), text.trim()));
                        if (title.equals("Titel:")) {
                            result.setTitle(text.trim());
                        }
                        text = "";
                    }
                    title = ((Element) node).text().trim();
                } else {
                    if (((Element) node).tagName().equals("a") && (((Element) node).text().trim().contains("hier klicken") || title.equals("Link:"))) {
                        text = text + node.attr("href");
                    } else {
                        text = text + ((Element) node).text();
                    }
                }
            } else if (node instanceof TextNode) {
                text = text + ((TextNode) node).text();
            }
        }
    } else {
        if (doc2.select("#tab-content .fulltitle tr").size() > 0) {
            Elements rows = doc2.select("#tab-content .fulltitle tr");
            for (Element tr : rows) {
                if (tr.children().size() == 2) {
                    Element valcell = tr.child(1);
                    String value = valcell.text().trim();
                    // A cell holding exactly one link is reported as that link's absolute URL.
                    if (valcell.select("a").size() == 1) {
                        value = valcell.select("a").first().absUrl("href");
                    }
                    result.addDetail(new Detail(tr.child(0).text().trim(), value));
                }
            }
        } else {
            result.addDetail(new Detail(stringProvider.getString(StringProvider.ERROR), stringProvider.getString(StringProvider.COULD_NOT_LOAD_DETAIL)));
        }
    }
    // Flush the last pending label/value pair.
    if (!text.equals("") && !title.equals("")) {
        result.addDetail(new Detail(title.trim(), text.trim()));
        if (title.equals("Titel:")) {
            result.setTitle(text.trim());
        }
    }
    for (Element link : doc3.select("#tab-content a")) {
        Map<String, String> hrefq = getQueryParamsFirst(link.absUrl("href"));
        if (result.getId() == null) {
            // ID retrieval
            String key = hrefq.get("katkey");
            if (key != null) {
                result.setId(key);
                break;
            }
        }
    }
    for (Element link : doc3.select(".box-container a")) {
        if (link.text().trim().equals("Download")) {
            result.addDetail(new Detail(stringProvider.getString(StringProvider.DOWNLOAD), link.absUrl("href")));
        }
    }
    if (doc3.select("#tab-content .textrot").size() > 0) {
        result.addDetail(new Detail(stringProvider.getString(StringProvider.STATUS), doc3.select("#tab-content .textrot").text()));
    }
    // Column positions within a copy row, overridden below by the table header when present.
    Map<String, Integer> copy_columnmap = new HashMap<>();
    // Default values
    copy_columnmap.put("barcode", 1);
    copy_columnmap.put("branch", 3);
    copy_columnmap.put("status", 4);
    Element table = doc.select("#tab-content .data").first();
    Elements copy_columns = table != null ? table.select("tr#bg2 th") : new Elements();
    for (int i = 0; i < copy_columns.size(); i++) {
        Element th = copy_columns.get(i);
        String head = th.text().trim();
        if (head.contains("Status")) {
            copy_columnmap.put("status", i);
        }
        if (head.contains("Zweigstelle")) {
            copy_columnmap.put("branch", i);
        }
        if (head.contains("Mediennummer")) {
            copy_columnmap.put("barcode", i);
        }
        if (head.contains("Standort")) {
            copy_columnmap.put("location", i);
        }
        if (head.contains("Signatur")) {
            copy_columnmap.put("signature", i);
        }
    }
    Pattern status_lent = Pattern.compile("^(entliehen) bis ([0-9]{1,2}.[0-9]{1,2}.[0-9]{2," + "4}) \\(gesamte Vormerkungen: ([0-9]+)\\)$");
    Pattern status_and_barcode = Pattern.compile("^(.*) ([0-9A-Za-z]+)$");
    Elements exemplartrs = table != null ? table.select("tr").not("#bg2") : new Elements();
    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);
    for (Element tr : exemplartrs) {
        // Each row is parsed independently; a malformed row is logged and skipped.
        try {
            Copy copy = new Copy();
            Element status = tr.child(copy_columnmap.get("status"));
            Element barcode = tr.child(copy_columnmap.get("barcode"));
            String barcodetext = barcode.text().trim().replace(" Wegweiser", "");
            // STATUS
            String statustext;
            if (status.getElementsByTag("b").size() > 0) {
                statustext = status.getElementsByTag("b").text().trim();
            } else {
                statustext = status.text().trim();
            }
            // Status and barcode share one cell: split "<status> <barcode>" apart.
            if (copy_columnmap.get("status").equals(copy_columnmap.get("barcode"))) {
                Matcher matcher1 = status_and_barcode.matcher(statustext);
                if (matcher1.matches()) {
                    statustext = matcher1.group(1);
                    barcodetext = matcher1.group(2);
                }
            }
            Matcher matcher = status_lent.matcher(statustext);
            if (matcher.matches()) {
                // Lent copy: group 1 = status, group 2 = return date, group 3 = reservation count.
                copy.setStatus(matcher.group(1));
                copy.setReservations(matcher.group(3));
                copy.setReturnDate(fmt.parseLocalDate(matcher.group(2)));
            } else {
                copy.setStatus(statustext.trim().replace(" Wegweiser", ""));
            }
            copy.setBarcode(barcodetext);
            if (status.select("a[href*=doVormerkung]").size() == 1) {
                copy.setResInfo(status.select("a[href*=doVormerkung]").attr("href").split("\\?")[1]);
            }
            String branchtext = tr.child(copy_columnmap.get("branch")).text().trim().replace(" Wegweiser", "");
            copy.setBranch(branchtext);
            if (copy_columnmap.containsKey("location")) {
                copy.setLocation(tr.child(copy_columnmap.get("location")).text().trim().replace(" Wegweiser", ""));
            }
            if (copy_columnmap.containsKey("signature")) {
                copy.setShelfmark(tr.child(copy_columnmap.get("signature")).text().trim().replace(" Wegweiser", ""));
            }
            result.addCopy(copy);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
    // Look for a "volumeSearch" link and record its catKey/dbIdentifier parameters.
    try {
        Element isvolume = null;
        Map<String, String> volume = new HashMap<>();
        Elements links = doc.select(".data td a");
        int elcount = links.size();
        for (int eli = 0; eli < elcount; eli++) {
            List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(links.get(eli).attr("href")), "UTF-8");
            for (NameValuePair nv : anyurl) {
                if (nv.getName().equals("methodToCall") && nv.getValue().equals("volumeSearch")) {
                    isvolume = links.get(eli);
                } else if (nv.getName().equals("catKey")) {
                    volume.put("catKey", nv.getValue());
                } else if (nv.getName().equals("dbIdentifier")) {
                    volume.put("dbIdentifier", nv.getValue());
                }
            }
            if (isvolume != null) {
                volume.put("volume", "true");
                result.setVolumesearch(volume);
                break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result;
}
Also used : Pattern(java.util.regex.Pattern) NameValuePair(org.apache.http.NameValuePair) BasicNameValuePair(org.apache.http.message.BasicNameValuePair) Matcher(java.util.regex.Matcher) HashMap(java.util.HashMap) Element(org.jsoup.nodes.Element) Node(org.jsoup.nodes.Node) TextNode(org.jsoup.nodes.TextNode) ArrayList(java.util.ArrayList) TextNode(org.jsoup.nodes.TextNode) Document(org.jsoup.nodes.Document) Elements(org.jsoup.select.Elements) URI(java.net.URI) URISyntaxException(java.net.URISyntaxException) JSONException(org.json.JSONException) NotReachableException(de.geeksfactory.opacclient.networking.NotReachableException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) ClientProtocolException(org.apache.http.client.ClientProtocolException) IOException(java.io.IOException) Copy(de.geeksfactory.opacclient.objects.Copy) DetailedItem(de.geeksfactory.opacclient.objects.DetailedItem) DateTimeFormatter(org.joda.time.format.DateTimeFormatter) Detail(de.geeksfactory.opacclient.objects.Detail)

Example 27 with Node

use of org.jsoup.nodes.Node in project k-9 by k9mail.

the class AdvancedNodeTraversor method filter.

/**
 * Start a depth-first filtering of the root and all of its descendants.
 *
 * <p>For every visited node {@code filter.head} is called first; unless it answers
 * {@code REMOVE}, {@code filter.tail} is called once the node's subtree is done. A node is
 * removed when either callback answers {@code REMOVE}; a {@code STOP} from either callback
 * aborts the traversal immediately.
 *
 * @param root
 *         the root node point to traverse.
 *
 * @return {@code STOPPED} if a callback asked to stop, {@code ROOT_REMOVED} if the root itself
 *         was removed, {@code ENDED} otherwise.
 */
public FilterResult filter(Node root) {
    Node node = root;
    int depth = 0;
    while (node != null) {
        HeadFilterDecision headResult = filter.head(node, depth);
        if (headResult == HeadFilterDecision.STOP) {
            return FilterResult.STOPPED;
        }
        // Descend into child nodes.
        if (headResult == HeadFilterDecision.CONTINUE && node.childNodeSize() > 0) {
            node = node.childNode(0);
            ++depth;
            continue;
        }
        // No more siblings: move upwards, finishing ('tail') each node on the way.
        TailFilterDecision tailResult = TailFilterDecision.CONTINUE;
        while (node.nextSibling() == null && depth > 0) {
            if (headResult == HeadFilterDecision.CONTINUE || headResult == HeadFilterDecision.SKIP_CHILDREN) {
                tailResult = filter.tail(node, depth);
                if (tailResult == TailFilterDecision.STOP) {
                    return FilterResult.STOPPED;
                }
            }
            Node prev = node;
            node = node.parentNode();
            depth--;
            // Remove only AFTER the parent has been looked up.
            if (headResult == HeadFilterDecision.REMOVE || tailResult == TailFilterDecision.REMOVE) {
                prev.remove();
            }
            // The parent's head() answered CONTINUE, otherwise we would not have descended.
            headResult = HeadFilterDecision.CONTINUE;
        }
        // 'tail' the current node, then proceed with its siblings.
        if (headResult == HeadFilterDecision.CONTINUE || headResult == HeadFilterDecision.SKIP_CHILDREN) {
            tailResult = filter.tail(node, depth);
            if (tailResult == TailFilterDecision.STOP) {
                return FilterResult.STOPPED;
            }
        }
        Node prev = node;
        node = node.nextSibling();
        // Honour a REMOVE from tail() here as well, exactly like the upward walk above does;
        // otherwise a tail REMOVE would only take effect on last children.
        boolean removePrev = headResult == HeadFilterDecision.REMOVE || tailResult == TailFilterDecision.REMOVE;
        if (removePrev) {
            // Remove only AFTER the sibling has been looked up.
            prev.remove();
        }
        if (prev == root) {
            return removePrev ? FilterResult.ROOT_REMOVED : FilterResult.ENDED;
        }
    }
    // Only reached when root is null.
    return FilterResult.ENDED;
}
Also used : HeadFilterDecision(com.fsck.k9.helper.jsoup.NodeFilter.HeadFilterDecision) Node(org.jsoup.nodes.Node) TailFilterDecision(com.fsck.k9.helper.jsoup.NodeFilter.TailFilterDecision)

Example 28 with Node

use of org.jsoup.nodes.Node in project ez-vcard by mangstadt.

the class HCardElement method visitForValue.

/**
 * Recursively appends the text found beneath an element to a buffer.
 *
 * <p>Text nodes are appended as-is. Child elements carrying the class name "type" and
 * {@code <del>} elements are skipped together with their content, {@code <br>} elements
 * contribute {@code NEWLINE}, and every other child element is descended into.
 *
 * @param element the element whose child nodes are visited
 * @param value   the buffer receiving the collected text
 */
private void visitForValue(Element element, StringBuilder value) {
    for (Node child : element.childNodes()) {
        if (child instanceof TextNode) {
            value.append(((TextNode) child).text());
        } else if (child instanceof Element) {
            Element childElement = (Element) child;

            // "type" elements are ignored regardless of their tag
            boolean isTypeElement = childElement.classNames().contains("type");
            if (isTypeElement) {
                continue;
            }

            String tag = childElement.tagName();
            if ("br".equals(tag)) {
                // "<br>" becomes a newline
                value.append(NEWLINE);
            } else if (!"del".equals(tag)) {
                // anything except "<del>" is searched for nested text
                visitForValue(childElement, value);
            }
        }
    }
}
Also used : Node(org.jsoup.nodes.Node) TextNode(org.jsoup.nodes.TextNode) Element(org.jsoup.nodes.Element) TextNode(org.jsoup.nodes.TextNode)

Example 29 with Node

use of org.jsoup.nodes.Node in project Java-readability by basis-technology-corp.

the class XmlDataMap method recurse.

/**
 * Walks an element's subtree in document order, feeding its text to {@code append}.
 *
 * <p>The element's classification decides what surrounds its content: a space before it for
 * {@code Whitespace} and {@code Sentence}, and afterwards a space for {@code Whitespace} or a
 * period for {@code Sentence}. Direct text children of a {@code Banned} element are not
 * appended, but its child elements are still visited.
 *
 * @param element the element to process
 */
private void recurse(Element element) {
    final ElementAction action = classifyElement(element);
    final boolean isWhitespace = action == ElementAction.Whitespace;
    final boolean isSentence = action == ElementAction.Sentence;
    final boolean textAllowed = action != ElementAction.Banned;

    if (isWhitespace || isSentence) {
        appendSpace();
    }

    for (Node child : element.childNodes()) {
        if (textAllowed && child instanceof TextNode) {
            TextNode textNode = (TextNode) child;
            append(textNode, textNode.text());
        } else if (child instanceof Element) {
            recurse((Element) child);
        }
    }

    if (isWhitespace) {
        appendSpace();
        return;
    }
    if (isSentence) {
        appendPeriod();
        return;
    }
    if (action == ElementAction.Mark) {
        // NOTE(review): the mark is filled in but never stored or returned within this
        // method - confirm whether it is supposed to be added to a collection.
        Mark marker = new Mark();
        marker.setOffset(pcDataOffset);
        marker.setTag(element.tagName());
    }
}
Also used : Node(org.jsoup.nodes.Node) TextNode(org.jsoup.nodes.TextNode) Element(org.jsoup.nodes.Element) TextNode(org.jsoup.nodes.TextNode)

Example 30 with Node

use of org.jsoup.nodes.Node in project jsoup by jhy.

the class Parser method parseBodyFragment.

/**
 * Parses an HTML fragment and places the resulting nodes into the {@code body} of a freshly
 * created shell Document.
 *
 * @param bodyHtml the HTML fragment to parse
 * @param baseUri base URI of the document (i.e. original fetch location), for resolving relative URLs.
 *
 * @return a Document with an empty head and the parsed nodes appended to its body
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    Document shell = Document.createShell(baseUri);
    Element bodyElement = shell.body();

    // Work on an array snapshot: the parsed list gets modified when nodes are re-parented.
    List<Node> parsed = parseFragment(bodyHtml, bodyElement, baseUri);
    Node[] snapshot = parsed.toArray(new Node[parsed.size()]);

    // Detach from the back, down to (but excluding) the first node.
    int index = snapshot.length;
    while (--index > 0) {
        snapshot[index].remove();
    }

    for (int k = 0; k < snapshot.length; k++) {
        bodyElement.appendChild(snapshot[k]);
    }
    return shell;
}
Also used : Element(org.jsoup.nodes.Element) Node(org.jsoup.nodes.Node) Document(org.jsoup.nodes.Document)

Aggregations

Node (org.jsoup.nodes.Node)55 Element (org.jsoup.nodes.Element)39 TextNode (org.jsoup.nodes.TextNode)39 Document (org.jsoup.nodes.Document)19 ArrayList (java.util.ArrayList)17 Elements (org.jsoup.select.Elements)11 IOException (java.io.IOException)7 HashMap (java.util.HashMap)6 Copy (de.geeksfactory.opacclient.objects.Copy)5 DetailedItem (de.geeksfactory.opacclient.objects.DetailedItem)5 NameValuePair (org.apache.http.NameValuePair)5 BasicNameValuePair (org.apache.http.message.BasicNameValuePair)5 DateTimeFormatter (org.joda.time.format.DateTimeFormatter)5 JSONException (org.json.JSONException)5 NotReachableException (de.geeksfactory.opacclient.networking.NotReachableException)4 Detail (de.geeksfactory.opacclient.objects.Detail)4 UnsupportedEncodingException (java.io.UnsupportedEncodingException)4 URI (java.net.URI)4 Matcher (java.util.regex.Matcher)4 URISyntaxException (java.net.URISyntaxException)3