Search in sources :

Example 26 with TextNode

use of org.jsoup.nodes.TextNode in project jsoup by jhy.

the class TraversorTest method canRemoveDuringHead.

/**
 * Checks that nodes can be removed from inside the traversal callback:
 * every node whose {@code id} attribute is "1" and the text node "Three"
 * are dropped, and the remaining body markup is compared afterwards.
 */
@Test
public void canRemoveDuringHead() {
    String html = "<div><p id=1>Zero<p id=1>One<p id=2>Two<p>Three</div>";
    Document doc = Jsoup.parse(html);

    NodeTraversor.traverse((node, depth) -> {
        boolean hasIdOne = "1".equals(node.attr("id"));
        boolean isThreeText = node instanceof TextNode && "Three".equals(((TextNode) node).text());
        if (hasIdOne || isThreeText) {
            node.remove();
        }
    }, doc);

    String remaining = TextUtil.stripNewlines(doc.body().html());
    assertEquals("<div><p id=\"2\">Two</p><p></p></div>", remaining);
}
Also used : TextNode(org.jsoup.nodes.TextNode) Document(org.jsoup.nodes.Document) Test(org.junit.jupiter.api.Test)

Example 27 with TextNode

use of org.jsoup.nodes.TextNode in project jsoup by jhy.

the class TokeniserStateTest method testCommentEndCoverage.

/**
 * Parses markup whose comment body contains "--!" and "---" sequences and
 * checks both the comment data and the text of the paragraph that follows it.
 */
@Test
public void testCommentEndCoverage() {
    Document doc = Jsoup.parse(
        "<html><head></head><body><img src=foo><!-- <table><tr><td></table> --! --- --><p>Hello</p></body></html>");
    Element body = doc.body();

    // Child node 1 of the body is expected to be the comment (cast fails otherwise).
    Comment parsedComment = (Comment) body.childNode(1);
    assertEquals(" <table><tr><td></table> --! --- ", parsedComment.getData());

    // Element child 1 of the body is the <p>; its first child node is the text.
    TextNode paragraphText = (TextNode) body.child(1).childNode(0);
    assertEquals("Hello", paragraphText.getWholeText());
}
Also used : Comment(org.jsoup.nodes.Comment) Element(org.jsoup.nodes.Element) TextNode(org.jsoup.nodes.TextNode) Document(org.jsoup.nodes.Document) Test(org.junit.jupiter.api.Test)

Example 28 with TextNode

use of org.jsoup.nodes.TextNode in project webmagic by code4craft.

the class CssSelector method getText.

/**
 * Concatenates the text of the element's direct {@link TextNode} children.
 * Child nodes of any other type are skipped, so text nested inside child
 * elements is not included.
 *
 * @param element the element whose immediate text nodes are collected
 * @return the joined text of the direct text nodes, in document order
 */
protected String getText(Element element) {
    StringBuilder ownText = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        ownText.append(((TextNode) child).text());
    }
    return ownText.toString();
}
Also used : Node(org.jsoup.nodes.Node) TextNode(org.jsoup.nodes.TextNode)

Example 29 with TextNode

use of org.jsoup.nodes.TextNode in project flow by vaadin.

the class TemplateParser method splitInclude.

/**
 * Replaces every include statement found in a text node with the element
 * returned by {@code loadInclude}.
 * <p>
 * An include statement starts with {@code INCLUDE_PREFIX} and ends at the
 * next '@' character. The text node is split so that the statement becomes
 * its own node, which is then replaced; any text after the statement is
 * processed the same way. Processing stops as soon as no complete statement
 * (prefix followed by a closing '@') is found.
 *
 * @param nodeToSplit the text node to scan; {@code null} means nothing to do
 * @param resolver passed through to {@code loadInclude} for each include file name
 */
private static void splitInclude(TextNode nodeToSplit, TemplateResolver resolver) {
    TextNode current = nodeToSplit;
    while (current != null) {
        String wholeText = current.getWholeText();

        int statementStart = wholeText.indexOf(INCLUDE_PREFIX);
        if (statementStart < 0) {
            return;
        }
        int nameStart = statementStart + INCLUDE_PREFIX.length();
        int closingAt = wholeText.indexOf('@', nameStart);
        if (closingAt < 0) {
            return;
        }

        String includeFileName = wholeText.substring(nameStart, closingAt).trim();
        Element replacement = loadInclude(includeFileName, resolver);

        // The statement runs from the prefix up to and including the closing '@'.
        int statementLength = closingAt - statementStart + 1;

        // Cut off the untouched prefix; "statement" now begins with the include.
        TextNode statement = current.splitText(statementStart);

        // Only split again when there is text left after the closing '@'.
        TextNode rest = null;
        if (statement.getWholeText().length() > statementLength) {
            rest = statement.splitText(statementLength);
        }

        statement.replaceWith(replacement);

        // Keep scanning whatever followed the statement.
        current = rest;
    }
}
Also used : Element(org.jsoup.nodes.Element) TextNode(org.jsoup.nodes.TextNode)

Example 30 with TextNode

use of org.jsoup.nodes.TextNode in project opacclient by opacapp.

the class Pica method parse_search.

/**
 * Parses a search result page into a {@link SearchRequestResult}.
 * <p>
 * If the page contains a {@code .error} element, one of the known
 * "nothing found" messages yields an empty result; any other message is
 * thrown as an {@link OpacErrorException}. Otherwise the total hit count is
 * read from the {@code .pages} element and every row of the hit list table
 * is turned into a {@link SearchResult}.
 *
 * @param html the HTML of the result page
 * @param page the page number, used to compute each result's number as
 *             {@code 10 * (page - 1) + rowIndex}
 * @return the parsed results together with the total hit count and the page
 * @throws OpacErrorException if the page shows an error message other than
 *                            one of the known "nothing found" messages
 */
protected SearchRequestResult parse_search(String html, int page) throws OpacErrorException {
    Document doc = Jsoup.parse(html);
    updateSearchSetValue(doc);
    if (doc.select(".error").size() > 0) {
        String error = doc.select(".error").first().text().trim();
        if (error.equals("Es wurde nichts gefunden.") || error.equals("Nothing has been found") || error.equals("Er is niets gevonden.") || error.equals("Rien n'a été trouvé.")) {
            // nothing found
            return new SearchRequestResult(new ArrayList<SearchResult>(), 0, 1, 1);
        } else {
            // error
            throw new OpacErrorException(error);
        }
    }
    // Keep the raw page around in the reusehtml field.
    reusehtml = html;
    int results_total;
    // NOTE(review): first() yields null when no ".pages" element exists, which
    // would throw a NullPointerException here - confirm result pages always have one.
    String resultnumstr = doc.select(".pages").first().text();
    // Prefer a trailing run of digits; if there is one, it replaces the whole string.
    Pattern p = Pattern.compile("[0-9]+$");
    Matcher m = p.matcher(resultnumstr);
    if (m.find()) {
        resultnumstr = m.group();
    }
    // Fallbacks for texts without trailing digits: "(123)" or ": 123" forms.
    if (resultnumstr.contains("(")) {
        results_total = Integer.parseInt(resultnumstr.replaceAll(".*\\(([0-9]+)\\).*", "$1"));
    } else if (resultnumstr.contains(": ")) {
        results_total = Integer.parseInt(resultnumstr.replaceAll(".*: ([0-9]+)$", "$1"));
    } else {
        results_total = Integer.parseInt(resultnumstr);
    }
    List<SearchResult> results = new ArrayList<>();
    if (results_total == 1) {
        // Only one result: the page is parsed as a detail page and summarised
        // from its title and first detail entry.
        DetailedItem singleResult = parse_result(html);
        SearchResult sr = new SearchResult();
        sr.setType(getMediaTypeInSingleResult(html));
        sr.setInnerhtml("<b>" + singleResult.getTitle() + "</b><br>" + singleResult.getDetails().get(0).getContent());
        results.add(sr);
    }
    Elements table = doc.select("table[summary=hitlist] tbody tr[valign=top]");
    // identifier = null;
    Elements links = doc.select("table[summary=hitlist] a");
    boolean haslink = false;
    // NOTE(review): with the identifier assignments commented out, this scan
    // has no visible effect beyond parsing the first "SHW?" link - confirm
    // whether it can be removed.
    for (int i = 0; i < links.size(); i++) {
        Element node = links.get(i);
        // Logical && (not bitwise &): skip the href lookup when there is no href.
        if (node.hasAttr("href") && node.attr("href").contains("SHW?") && !haslink) {
            haslink = true;
            try {
                List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(node.attr("href")), getDefaultEncoding());
                for (NameValuePair nv : anyurl) {
                    if (nv.getName().equals("identifier")) {
                        // identifier = nv.getValue();
                        break;
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    for (int i = 0; i < table.size(); i++) {
        Element tr = table.get(i);
        SearchResult sr = new SearchResult();
        if (tr.select("td.hit img").size() > 0) {
            // The media type is looked up by the icon's file name (last path segment of src).
            String[] fparts = tr.select("td img").get(0).attr("src").split("/");
            String fname = fparts[fparts.length - 1];
            if (data.has("mediatypes")) {
                try {
                    sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname)));
                } catch (JSONException | IllegalArgumentException e) {
                    // No usable configured mapping for this icon: fall back to the defaults,
                    // keyed by the lower-cased file name without its image extension.
                    sr.setType(defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "").replace(".gif", "").replace(".png", "")));
                }
            } else {
                sr.setType(defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "").replace(".gif", "").replace(".png", "")));
            }
        }
        // The third cell of the row holds the descriptive content.
        Element middlething = tr.child(2);
        List<Node> children = middlething.childNodes();
        int childrennum = children.size();
        // Each entry: { outer tag or "text", inner tag / "text" / "", text[, class, style] }.
        List<String[]> strings = new ArrayList<>();
        for (int ch = 0; ch < childrennum; ch++) {
            Node node = children.get(ch);
            if (node instanceof TextNode) {
                String text = ((TextNode) node).text().trim();
                // Texts of three characters or fewer are ignored.
                if (text.length() > 3) {
                    strings.add(new String[] { "text", "", text });
                }
            } else if (node instanceof Element) {
                // Look one level deeper: text nodes and elements directly inside this element.
                List<Node> subchildren = node.childNodes();
                for (int j = 0; j < subchildren.size(); j++) {
                    Node subnode = subchildren.get(j);
                    if (subnode instanceof TextNode) {
                        String text = ((TextNode) subnode).text().trim();
                        if (text.length() > 3) {
                            strings.add(new String[] { ((Element) node).tag().getName(), "text", text, ((Element) node).className(), node.attr("style") });
                        }
                    } else if (subnode instanceof Element) {
                        String text = ((Element) subnode).text().trim();
                        if (text.length() > 3) {
                            strings.add(new String[] { ((Element) node).tag().getName(), ((Element) subnode).tag().getName(), text, ((Element) node).className(), node.attr("style") });
                        }
                    }
                }
            }
        }
        // Build the description from the first three collected parts: a leading
        // link text is shown in bold, every other part goes on its own line.
        StringBuilder description = new StringBuilder();
        int k = 0;
        for (String[] part : strings) {
            if (part[0].equals("a") && k == 0) {
                description.append("<b>").append(part[2]).append("</b>");
            } else if (k < 3) {
                description.append("<br />").append(part[2]);
            }
            k++;
        }
        sr.setInnerhtml(description.toString());
        sr.setNr(10 * (page - 1) + i);
        sr.setId(null);
        results.add(sr);
    }
    resultcount = results.size();
    return new SearchRequestResult(results, results_total, page);
}
Also used : Matcher(java.util.regex.Matcher) Element(org.jsoup.nodes.Element) TextNode(org.jsoup.nodes.TextNode) Node(org.jsoup.nodes.Node) ArrayList(java.util.ArrayList) Document(org.jsoup.nodes.Document) Elements(org.jsoup.select.Elements) URI(java.net.URI) DetailedItem(de.geeksfactory.opacclient.objects.DetailedItem) List(java.util.List) Pattern(java.util.regex.Pattern) BasicNameValuePair(org.apache.http.message.BasicNameValuePair) NameValuePair(org.apache.http.NameValuePair) JSONException(org.json.JSONException) SearchResult(de.geeksfactory.opacclient.objects.SearchResult) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) SearchRequestResult(de.geeksfactory.opacclient.objects.SearchRequestResult)

Aggregations

TextNode (org.jsoup.nodes.TextNode)52 Element (org.jsoup.nodes.Element)41 Node (org.jsoup.nodes.Node)37 Document (org.jsoup.nodes.Document)19 ArrayList (java.util.ArrayList)16 Elements (org.jsoup.select.Elements)14 IOException (java.io.IOException)6 DateTimeFormatter (org.joda.time.format.DateTimeFormatter)6 JSONException (org.json.JSONException)6 Copy (de.geeksfactory.opacclient.objects.Copy)5 DetailedItem (de.geeksfactory.opacclient.objects.DetailedItem)5 HashMap (java.util.HashMap)5 NameValuePair (org.apache.http.NameValuePair)5 BasicNameValuePair (org.apache.http.message.BasicNameValuePair)5 Test (org.junit.jupiter.api.Test)5 NotReachableException (de.geeksfactory.opacclient.networking.NotReachableException)4 Detail (de.geeksfactory.opacclient.objects.Detail)4 UnsupportedEncodingException (java.io.UnsupportedEncodingException)4 URI (java.net.URI)4 Matcher (java.util.regex.Matcher)4