Search in sources :

Example 1 with MediaType

use of de.geeksfactory.opacclient.objects.SearchResult.MediaType in project opacclient by opacapp.

In the class Pica, method getMediaTypeInSingleResult.

/**
 * Determines the media type of a single-result page from the file name of the
 * first image inside the "presentation switch" table.
 *
 * @param html raw HTML of the single-result page
 * @return {@code MediaType.UNKNOWN} when the page has no such image; otherwise the
 *         type configured under the "mediatypes" key of {@code data} for the image's
 *         file name, or, if that lookup is unavailable or fails, whatever
 *         {@code defaulttypes.get(...)} yields for the lower-cased file name with
 *         ".jpg", ".gif" and ".png" removed
 */
public MediaType getMediaTypeInSingleResult(String html) {
    final String iconQuery = "table[summary=presentation switch] img";
    Document page = Jsoup.parse(html);
    if (page.select(iconQuery).size() == 0) {
        return MediaType.UNKNOWN;
    }

    // The last path segment of the image URL is the lookup key.
    String[] pathSegments = page.select(iconQuery).get(0).attr("src").split("/");
    String iconFile = pathSegments[pathSegments.length - 1];

    if (data.has("mediatypes")) {
        try {
            return MediaType.valueOf(data.getJSONObject("mediatypes").getString(iconFile));
        } catch (JSONException | IllegalArgumentException e) {
            // No configured mapping, or it names no MediaType constant: use the defaults below.
        }
    }

    String defaultKey = iconFile.toLowerCase(Locale.GERMAN)
            .replace(".jpg", "")
            .replace(".gif", "")
            .replace(".png", "");
    return defaulttypes.get(defaultKey);
}
Also used : MediaType(de.geeksfactory.opacclient.objects.SearchResult.MediaType) JSONException(org.json.JSONException) Document(org.jsoup.nodes.Document)

Example 2 with MediaType

use of de.geeksfactory.opacclient.objects.SearchResult.MediaType in project opacclient by opacapp.

In the class SISIS, method parse_search.

/**
 * Parses a result-list page into a {@link SearchRequestResult}.
 *
 * <p>Side effects visible in this method: the enclosing {@code identifier} variable is reset to
 * {@code null} and, if the first {@code singleHit.do} link in the result table carries an
 * {@code identifier} query parameter, set to that value; {@code resultcount} is set to the
 * number of parsed rows; {@code downloadCover} is called for covers served by
 * {@code showCover.do}.
 *
 * @param html raw HTML of the result list
 * @param page page number, used to compute each result's running number
 * @return the parsed results; the total is {@code -1} when it cannot be read from the heading
 * @throws OpacErrorException if the page contains an {@code .error} or {@code .nohits} element
 * @throws SingleResultFound  if the heading contains "(1/1)" or " 1/1"
 */
public SearchRequestResult parse_search(String html, int page) throws OpacErrorException, SingleResultFound {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url + "/searchfoo");
    if (doc.select(".error").size() > 0) {
        throw new OpacErrorException(doc.select(".error").text().trim());
    } else if (doc.select(".nohits").size() > 0) {
        throw new OpacErrorException(doc.select(".nohits").text().trim());
    } else if (doc.select(".box-header h2, #nohits").text().contains("keine Treffer")) {
        return new SearchRequestResult(new ArrayList<SearchResult>(), 0, 1, 1);
    }

    // Read the total hit count from the heading. A missing heading or an unexpected
    // format leaves the total at -1 instead of aborting the whole parse.
    int results_total = -1;
    Element resultHeader = doc.select(".box-header h2").first();
    String resultnumstr = resultHeader != null ? resultHeader.text() : "";
    if (resultnumstr.contains("(1/1)") || resultnumstr.contains(" 1/1")) {
        throw new SingleResultFound();
    }
    try {
        if (resultnumstr.contains("(")) {
            results_total = Integer.parseInt(resultnumstr.replaceAll(".*\\(([0-9]+)\\).*", "$1"));
        } else if (resultnumstr.contains(": ")) {
            results_total = Integer.parseInt(resultnumstr.replaceAll(".*: ([0-9]+)$", "$1"));
        }
    } catch (NumberFormatException ignored) {
        // Heading had no number in the expected position; total stays unknown.
        results_total = -1;
    }

    Elements table = doc.select("table.data tbody tr");

    // Only the first singleHit.do link is inspected for the "identifier" parameter.
    identifier = null;
    for (Element link : doc.select("table.data a")) {
        if (link.hasAttr("href") && link.attr("href").contains("singleHit.do")) {
            try {
                List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(link.attr("href").replace(" ", "%20").replace("&amp;", "&")), ENCODING);
                for (NameValuePair nv : anyurl) {
                    if (nv.getName().equals("identifier")) {
                        identifier = nv.getValue();
                        break;
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            break;
        }
    }

    List<SearchResult> results = new ArrayList<>();
    for (int i = 0; i < table.size(); i++) {
        Element tr = table.get(i);
        SearchResult sr = new SearchResult();

        // Media type from the row icon: configured mapping by file name first,
        // then the default table keyed by the icon title, then by the bare file name.
        if (tr.select("td img[title]").size() > 0) {
            String title = tr.select("td img").get(0).attr("title");
            String[] fparts = tr.select("td img").get(0).attr("src").split("/");
            String fname = fparts[fparts.length - 1];
            MediaType default_by_fname = defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "").replace(".gif", "").replace(".png", ""));
            MediaType default_by_title = defaulttypes.get(title);
            MediaType default_name = default_by_title != null ? default_by_title : default_by_fname;
            if (data.has("mediatypes")) {
                try {
                    sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname)));
                } catch (JSONException | IllegalArgumentException e) {
                    sr.setType(default_name);
                }
            } else {
                sr.setType(default_name);
            }
        }

        // Keywords in the row text override whatever the icon suggested.
        String alltext = tr.text();
        if (alltext.contains("eAudio") || alltext.contains("eMusic")) {
            sr.setType(MediaType.MP3);
        } else if (alltext.contains("eVideo")) {
            sr.setType(MediaType.EVIDEO);
        } else if (alltext.contains("eBook")) {
            sr.setType(MediaType.EBOOK);
        } else if (alltext.contains("Munzinger")) {
            sr.setType(MediaType.EDOC);
        }

        if (tr.children().size() > 3 && tr.child(3).select("img[title*=cover]").size() == 1) {
            sr.setCover(tr.child(3).select("img[title*=cover]").attr("abs:src"));
            if (sr.getCover().contains("showCover.do")) {
                downloadCover(sr);
            }
        }

        // Pick the cell holding the title data: the third cell if it contains a link,
        // otherwise the second.
        Element middlething;
        if (tr.children().size() > 2 && tr.child(2).select("a").size() > 0) {
            middlething = tr.child(2);
        } else {
            middlething = tr.child(1);
        }
        List<Node> children = middlething.childNodes();
        if (middlething.select("div").not("#hlrightblock,.bestellfunktionen").size() == 1) {
            Element indiv = middlething.select("div").not("#hlrightblock,.bestellfunktionen").first();
            if (indiv.select("a").size() > 0 && indiv.children().size() > 1) {
                children = indiv.childNodes();
            }
        } else if (middlething.select("span.titleData").size() == 1) {
            children = middlething.select("span.titleData").first().childNodes();
        }

        // Flatten the cell into text fragments. Entries are either
        //   { "text", "", text }                                   for direct text nodes, or
        //   { parentTag, childTag-or-"text", text, class, style }  for content of child elements.
        // Fragments of three characters or fewer are dropped.
        List<String[]> strings = new ArrayList<>();
        for (Node node : children) {
            if (node instanceof TextNode) {
                String text = ((TextNode) node).text().trim();
                if (text.length() > 3) {
                    strings.add(new String[] { "text", "", text });
                }
            } else if (node instanceof Element) {
                Element parent = (Element) node;
                for (Node subnode : node.childNodes()) {
                    if (subnode instanceof TextNode) {
                        String text = ((TextNode) subnode).text().trim();
                        if (text.length() > 3) {
                            strings.add(new String[] { parent.tag().getName(), "text", text, parent.className(), node.attr("style") });
                        }
                    } else if (subnode instanceof Element) {
                        String text = ((Element) subnode).text().trim();
                        if (text.length() > 3) {
                            strings.add(new String[] { parent.tag().getName(), ((Element) subnode).tag().getName(), text, parent.className(), node.attr("style") });
                        }
                    }
                }
            }
        }

        StringBuilder description = null;
        if (tr.select("span.Z3988").size() == 1) {
            // Sometimes there is a <span class="Z3988"> item which provides
            // data in a standardized format.
            boolean hastitle = false;
            try {
                description = new StringBuilder();
                List<NameValuePair> z3988data = URLEncodedUtils.parse(new URI("http://dummy/?" + tr.select("span.Z3988").attr("title")), "UTF-8");
                for (NameValuePair nv : z3988data) {
                    if (nv.getValue() == null || nv.getValue().trim().equals("")) {
                        continue;
                    }
                    String name = nv.getName();
                    if ((name.equals("rft.btitle") || name.equals("rft.atitle")) && !hastitle) {
                        description.append("<b>").append(nv.getValue()).append("</b>");
                        hastitle = true;
                    } else if (name.equals("rft.au") || name.equals("rft.date")) {
                        description.append("<br />").append(nv.getValue());
                    }
                }
            } catch (URISyntaxException e) {
                description = null;
            }
        }
        boolean described = false;
        if (description != null && description.length() > 0) {
            sr.setInnerhtml(description.toString());
            described = true;
        } else {
            description = new StringBuilder();
        }

        int k = 0;
        boolean yearfound = false;
        boolean titlefound = false;
        for (String[] part : strings) {
            if (!described) {
                if (part[0].equals("a") && (k == 0 || !titlefound)) {
                    if (k != 0) {
                        description.append("<br />");
                    }
                    description.append("<b>").append(part[2]).append("</b>");
                    titlefound = true;
                } else if (part[2].matches("\\D*[0-9]{4}\\D*") && part[2].length() <= 10) {
                    yearfound = true;
                    if (k != 0) {
                        description.append("<br />");
                    }
                    description.append(part[2]);
                } else if (k == 1 && !yearfound) {
                    // Second fragment, no year seen yet. This also covers a "(YYYY)"
                    // fragment, which the original handled in separate branches with
                    // the same output.
                    description.append("<br />");
                    description.append(part[2]);
                } else if (k > 1 && k < 4 && part[0].equals("text") && part[2].matches("^[A-Za-z0-9,\\- ]+$")) {
                    description.append("<br />");
                    description.append(part[2]);
                }
            }

            // NOTE(review): every entry added to `strings` above has length 3 or 5, so the
            // length == 4 branch below never runs and the textgruen/textrot classes are
            // currently ignored. Enabling it would change which statuses are reported, so it
            // is left untouched here - confirm the intended behaviour before changing it.
            if (part.length == 4) {
                if (part[0].equals("span") && part[3].equals("textgruen")) {
                    sr.setStatus(SearchResult.Status.GREEN);
                } else if (part[0].equals("span") && part[3].equals("textrot")) {
                    sr.setStatus(SearchResult.Status.RED);
                }
            } else if (part.length == 5) {
                if (part[4].contains("purple")) {
                    sr.setStatus(SearchResult.Status.YELLOW);
                }
            }

            if (sr.getStatus() == null) {
                String s = part[2];
                if ((s.contains("entliehen") && s.startsWith("Vormerkung ist leider nicht möglich")) || s.contains("Alle Exemplare des gewählten Titels sind entliehen") || s.contains("nur in anderer Zweigstelle ausleihbar und nicht bestellbar")) {
                    sr.setStatus(SearchResult.Status.RED);
                } else if (s.startsWith("entliehen") || s.contains("Ein Exemplar finden Sie in einer anderen Zweigstelle")) {
                    sr.setStatus(SearchResult.Status.YELLOW);
                } else if ((s.startsWith("bestellbar") && !s.contains("nicht bestellbar")) || (s.startsWith("vorbestellbar") && !s.contains("nicht vorbestellbar")) || (s.startsWith("vormerkbar") && !s.contains("nicht vormerkbar")) || s.contains("heute zurückgebucht") || (s.contains("ausleihbar") && !s.contains("nicht ausleihbar"))) {
                    sr.setStatus(SearchResult.Status.GREEN);
                }
                if (sr.getType() != null) {
                    // Especially Onleihe.de ebooks are often marked
                    // green though they are not available.
                    if (sr.getType().equals(MediaType.EBOOK) || sr.getType().equals(MediaType.EVIDEO) || sr.getType().equals(MediaType.MP3)) {
                        sr.setStatus(SearchResult.Status.UNKNOWN);
                    }
                }
            }
            k++;
        }
        if (!described) {
            sr.setInnerhtml(description.toString());
        }
        // Running number; presumably assumes ten results per page - TODO confirm.
        sr.setNr(10 * (page - 1) + i);
        sr.setId(null);
        results.add(sr);
    }
    resultcount = results.size();
    return new SearchRequestResult(results, results_total, page);
}
Also used : Element(org.jsoup.nodes.Element) Node(org.jsoup.nodes.Node) TextNode(org.jsoup.nodes.TextNode) ArrayList(java.util.ArrayList) URISyntaxException(java.net.URISyntaxException) Document(org.jsoup.nodes.Document) Elements(org.jsoup.select.Elements) URI(java.net.URI) MediaType(de.geeksfactory.opacclient.objects.SearchResult.MediaType) List(java.util.List) ArrayList(java.util.ArrayList) NameValuePair(org.apache.http.NameValuePair) BasicNameValuePair(org.apache.http.message.BasicNameValuePair) JSONException(org.json.JSONException) SearchResult(de.geeksfactory.opacclient.objects.SearchResult) TextNode(org.jsoup.nodes.TextNode) URISyntaxException(java.net.URISyntaxException) JSONException(org.json.JSONException) NotReachableException(de.geeksfactory.opacclient.networking.NotReachableException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) ClientProtocolException(org.apache.http.client.ClientProtocolException) IOException(java.io.IOException) SearchRequestResult(de.geeksfactory.opacclient.objects.SearchRequestResult)

Example 3 with MediaType

use of de.geeksfactory.opacclient.objects.SearchResult.MediaType in project opacclient by opacapp.

In the class TouchPoint, method parse_search.

/**
 * Parses a result-list page into a {@link SearchRequestResult}.
 *
 * <p>If the page contains {@code #RefineHitListForm}, the actual hit list is fetched with an
 * extra HTTP request. For every row that carries a loan-status script, one further HTTP
 * request is made to retrieve the loan status.
 *
 * <p>Side effects visible in this method: the enclosing {@code identifier} variable is reset to
 * {@code null} and, if the first {@code singleHit.do} link in the result table carries an
 * {@code identifier} query parameter, set to that value; {@code resultcount} is set to the
 * number of parsed rows.
 *
 * @param html raw HTML of the search response
 * @param page page number, used to compute each result's running number
 * @return the parsed results; the total is {@code -1} when it cannot be read from the heading
 * @throws OpacErrorException declared by the signature; not thrown directly in this method
 * @throws IOException        if one of the additional HTTP requests or URL constructions fails
 * @throws SingleResultFound  if the heading contains "(1/1)" or " 1/1"
 */
protected SearchRequestResult parse_search(String html, int page) throws OpacErrorException, IOException, SingleResultFound {
    Document doc = Jsoup.parse(html);
    if (doc.select("#RefineHitListForm").size() > 0) {
        // the results are located on a different page loaded via AJAX
        html = httpGet(opac_url + "/speedHitList.do?_=" + (System.currentTimeMillis() / 1000) + "&hitlistindex=0&exclusionList=", ENCODING);
        doc = Jsoup.parse(html);
    }
    if (doc.select(".nodata").size() > 0) {
        return new SearchRequestResult(new ArrayList<SearchResult>(), 0, 1, 1);
    }
    doc.setBaseUri(opac_url + "/searchfoo");

    // Read the total hit count from the heading. A missing heading or an unexpected
    // format leaves the total at -1 instead of aborting the whole parse.
    int results_total = -1;
    Element resultHeader = doc.select(".box-header h2, .box-header h1").first();
    String resultnumstr = resultHeader != null ? resultHeader.text() : "";
    if (resultnumstr.contains("(1/1)") || resultnumstr.contains(" 1/1")) {
        throw new SingleResultFound();
    }
    try {
        if (resultnumstr.contains("(")) {
            results_total = Integer.parseInt(resultnumstr.replaceAll(".*\\(([0-9]+)\\).*", "$1"));
        } else if (resultnumstr.contains(": ")) {
            results_total = Integer.parseInt(resultnumstr.replaceAll(".*: ([0-9]+)$", "$1"));
        } else if (resultnumstr.contains("Treffer")) {
            results_total = Integer.parseInt(resultnumstr.replaceAll(".* ([0-9]+)$", "$1"));
        }
    } catch (NumberFormatException ignored) {
        // Heading had no number in the expected position; total stays unknown.
        results_total = -1;
    }

    Elements table = doc.select("table.data > tbody > tr");

    // Only the first singleHit.do link is inspected for the "identifier" parameter.
    identifier = null;
    for (Element link : doc.select("table.data a")) {
        if (link.hasAttr("href") && link.attr("href").contains("singleHit.do")) {
            try {
                List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(link.attr("href").replace(" ", "%20").replace("&amp;", "&")), ENCODING);
                for (NameValuePair nv : anyurl) {
                    if (nv.getName().equals("identifier")) {
                        identifier = nv.getValue();
                        break;
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            break;
        }
    }

    // File names can look like this: "20_DVD_Video.gif"
    // Compiled once here rather than once per row.
    Pattern numericPrefix = Pattern.compile("(\\d+)_.*");
    String[] variables = new String[] { "loanstateDBId", "itemIdentifier", "hitlistIdentifier", "hitlistPosition", "duplicateHitlistIdentifier", "itemType", "titleStatus", "typeofHit", "context" };

    List<SearchResult> results = new ArrayList<>();
    for (int i = 0; i < table.size(); i++) {
        Element tr = table.get(i);
        SearchResult sr = new SearchResult();

        // Media type from the row icon: configured mapping by full file name first,
        // then the default table keyed by the numeric prefix (if any) or the bare name.
        if (tr.select(".icn, img[width=32]").size() > 0) {
            String[] fparts = tr.select(".icn, img[width=32]").first().attr("src").split("/");
            String fname = fparts[fparts.length - 1];
            String changedFname = fname.toLowerCase(Locale.GERMAN).replace(".jpg", "").replace(".gif", "").replace(".png", "");
            Matcher matcher = numericPrefix.matcher(changedFname);
            if (matcher.find()) {
                changedFname = matcher.group(1);
            }
            MediaType defaulttype = defaulttypes.get(changedFname);
            if (data.has("mediatypes")) {
                try {
                    sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname)));
                } catch (JSONException | IllegalArgumentException e) {
                    sr.setType(defaulttype);
                }
            } else {
                sr.setType(defaulttype);
            }
        }

        String title;
        String text;
        if (tr.select(".results table").size() > 0) {
            // e.g. RWTH Aachen
            title = tr.select(".title a").text();
            text = tr.select(".title div").text();
        } else {
            // e.g. Schaffhausen, BSB München
            title = tr.select(".title, .hitlistTitle").text();
            // Rows without a metadata element get an empty description instead of an NPE.
            Element metadata = tr.select(".results, .hitlistMetadata").first();
            text = metadata != null ? metadata.ownText() : "";
        }

        // we need to do some evil javascript parsing here to get the cover
        // and loan status of the item
        // get cover
        sr.setCover(findCoverUrl(tr, true));

        // get loan status and media ID
        if (tr.select("div[id^=loanstatus] + script").size() > 0) {
            String js = tr.select("div[id^=loanstatus] + script").first().html();
            String ajaxUrl = matchJSVariable(js, "ajaxUrl");
            if (!"".equals(ajaxUrl)) {
                JSONObject id = new JSONObject();
                List<NameValuePair> map = new ArrayList<>();
                for (String variable : variables) {
                    String value = matchJSVariable(js, variable);
                    if (!"".equals(value)) {
                        map.add(new BasicNameValuePair(variable, value));
                    }
                    try {
                        if (variable.equals("itemIdentifier")) {
                            id.put("id", value);
                        } else if (variable.equals("loanstateDBId")) {
                            id.put("db", value);
                        }
                    } catch (JSONException e) {
                        e.printStackTrace();
                    }
                }
                sr.setId(id.toString());

                String url = new URL(new URL(opac_url + "/"), ajaxUrl).toString();
                String loanStatusHtml = httpGet(url + "?" + URLEncodedUtils.format(map, "UTF-8"), ENCODING).replace("\r\n", "").trim();
                Document loanStatusDoc = Jsoup.parse(loanStatusHtml);
                String loanstatus = loanStatusDoc.text().replace("\u00bb", "").trim();
                if ((loanstatus.startsWith("entliehen") && loanstatus.contains("keine Vormerkung möglich")) || loanstatus.contains("Keine Exemplare verfügbar")) {
                    sr.setStatus(SearchResult.Status.RED);
                } else if (loanstatus.startsWith("entliehen") || loanstatus.contains("andere Zweigstelle")) {
                    sr.setStatus(SearchResult.Status.YELLOW);
                } else if ((loanstatus.startsWith("bestellbar") && !loanstatus.contains("nicht bestellbar")) || (loanstatus.startsWith("vorbestellbar") && !loanstatus.contains("nicht vorbestellbar")) || (loanstatus.startsWith("vormerkbar") && !loanstatus.contains("nicht vormerkbar")) || loanstatus.contains("heute zurückgebucht") || (loanstatus.contains("ausleihbar") && !loanstatus.contains("nicht ausleihbar"))) {
                    sr.setStatus(SearchResult.Status.GREEN);
                } else if (loanstatus.equals("")) {
                    // In special databases (like "Handschriften" in Winterthur) ID lookup is
                    // not possible, which we try to detect this way. We therefore also cannot
                    // use getResultById when accessing the results.
                    sr.setId(null);
                }
                if (sr.getType() != null) {
                    // Especially Onleihe.de ebooks are often marked
                    // green though they are not available.
                    if (sr.getType().equals(MediaType.EBOOK) || sr.getType().equals(MediaType.EVIDEO) || sr.getType().equals(MediaType.MP3)) {
                        sr.setStatus(SearchResult.Status.UNKNOWN);
                    }
                }
            }
        }
        sr.setInnerhtml(("<b>" + title + "</b><br/>") + text);
        // Running number, 1-based; presumably assumes ten results per page - TODO confirm.
        sr.setNr(10 * (page - 1) + i + 1);
        results.add(sr);
    }
    resultcount = results.size();
    return new SearchRequestResult(results, results_total, page);
}
Also used : Matcher(java.util.regex.Matcher) Element(org.jsoup.nodes.Element) ArrayList(java.util.ArrayList) Document(org.jsoup.nodes.Document) Elements(org.jsoup.select.Elements) URI(java.net.URI) URL(java.net.URL) BasicNameValuePair(org.apache.http.message.BasicNameValuePair) MediaType(de.geeksfactory.opacclient.objects.SearchResult.MediaType) NameValuePair(org.apache.http.NameValuePair) BasicNameValuePair(org.apache.http.message.BasicNameValuePair) Pattern(java.util.regex.Pattern) JSONException(org.json.JSONException) SearchResult(de.geeksfactory.opacclient.objects.SearchResult) URISyntaxException(java.net.URISyntaxException) JSONException(org.json.JSONException) NotReachableException(de.geeksfactory.opacclient.networking.NotReachableException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) SearchRequestResult(de.geeksfactory.opacclient.objects.SearchRequestResult) JSONObject(org.json.JSONObject)

Aggregations

MediaType (de.geeksfactory.opacclient.objects.SearchResult.MediaType)3 JSONException (org.json.JSONException)3 Document (org.jsoup.nodes.Document)3 NotReachableException (de.geeksfactory.opacclient.networking.NotReachableException)2 SearchRequestResult (de.geeksfactory.opacclient.objects.SearchRequestResult)2 SearchResult (de.geeksfactory.opacclient.objects.SearchResult)2 IOException (java.io.IOException)2 UnsupportedEncodingException (java.io.UnsupportedEncodingException)2 URI (java.net.URI)2 URISyntaxException (java.net.URISyntaxException)2 ArrayList (java.util.ArrayList)2 NameValuePair (org.apache.http.NameValuePair)2 BasicNameValuePair (org.apache.http.message.BasicNameValuePair)2 Element (org.jsoup.nodes.Element)2 Elements (org.jsoup.select.Elements)2 URL (java.net.URL)1 List (java.util.List)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1 ClientProtocolException (org.apache.http.client.ClientProtocolException)1