Search in sources :

Example 51 with Node

use of org.jsoup.nodes.Node in project structr by structr.

the class Importer method createChildNodes.

/**
 * Recursively converts the children of the given jsoup node into Structr DOM nodes.
 *
 * For every child node this method normalizes the tag name, skips ignored element names,
 * optionally downloads referenced files (not during deployment), creates the matching
 * content / comment / template / shared component / element node, copies the attributes
 * into a property map, applies pending comment instructions, attaches the new node to
 * {@code parent} and finally descends into the child's own children.
 *
 * @param startNode the jsoup node whose children are imported
 * @param parent the DOM node new nodes are appended to; may be {@code null}, in which case
 *        the new nodes are created but not attached
 * @param page the page used to create nodes and set as owner document; when {@code null},
 *        text and comment nodes are not created
 * @param removeHashAttribute whether the {@code data-structr-hash} attribute is stripped
 *        from element nodes before their attributes are imported
 * @param depth current recursion depth; only incremented and passed on to recursive calls
 * @return the first node created on this level that is not a structr {@code Comment},
 *         or {@code null} if no such node was created
 * @throws FrameworkException propagated from the node creation / property calls
 */
private DOMNode createChildNodes(final Node startNode, final DOMNode parent, final Page page, final boolean removeHashAttribute, final int depth) throws FrameworkException {
    DOMNode rootElement = null;
    Linkable res = null;
    String instructions = null;
    final List<Node> children = startNode.childNodes();
    for (Node node : children) {
        String tag = node.nodeName();
        // clean tag: keep only letters, digits and the characters # : . _ -
        // (the hyphen must be last in the character class, otherwise ".-_" is parsed as a
        // range that excludes '-' and lets characters such as '/', '<' or '=' through)
        if (tag != null) {
            tag = tag.replaceAll("[^a-zA-Z0-9#:._-]+", "");
        }
        final StringBuilder classString = new StringBuilder();
        final String type = CaseHelper.toUpperCamelCase(tag);
        String comment = null;
        String content = null;
        String id = null;
        boolean isNewTemplateOrComponent = false;
        if (ignoreElementNames.contains(type)) {
            continue;
        }
        if (node instanceof Element) {
            final Element el = ((Element) node);
            final Set<String> classes = el.classNames();
            for (String cls : classes) {
                classString.append(cls).append(" ");
            }
            id = el.id();
            // do not download files when called from DeployCommand!
            if (!isDeployment) {
                String downloadAddressAttr = srcElements.contains(tag) ? "src" : hrefElements.contains(tag) ? "href" : null;
                if (downloadAddressAttr != null && StringUtils.isNotBlank(node.attr(downloadAddressAttr))) {
                    String downloadAddress = node.attr(downloadAddressAttr);
                    res = downloadFile(downloadAddress, originalUrl);
                } else {
                    // make sure a linkable from a previous sibling is not reused
                    res = null;
                }
            }
            if (removeHashAttribute) {
                // Remove data-structr-hash attribute
                node.removeAttr("data-structr-hash");
            }
        }
        // Comment, data and text nodes: clear the tag and keep their text in "comment" / "content";
        // blank nodes are skipped entirely
        if (type.equals("#comment")) {
            comment = ((Comment) node).getData();
            tag = "";
            // Don't add content node for whitespace
            if (StringUtils.isBlank(comment)) {
                continue;
            }
            // store for later use
            commentSource.append(comment).append("\n");
            // check if comment contains instructions
            if (commentHandler != null && commentHandler.containsInstructions(comment)) {
                if (instructions != null) {
                    // unhandled instructions from previous iteration => empty content element
                    createEmptyContentNode(page, parent, commentHandler, instructions);
                }
                // remember the instructions; they are applied to the next created node
                instructions = comment;
                continue;
            }
        } else if (type.equals("#data")) {
            tag = "";
            content = ((DataNode) node).getWholeData();
            // Don't add content node for whitespace
            if (StringUtils.isBlank(content)) {
                continue;
            }
        } else if (type.equals("#text")) {
            tag = "";
            if (isDeployment) {
                // deployment keeps the whole text (including whitespace-only text) minus a trailing newline
                content = trimTrailingNewline(((TextNode) node).getWholeText());
                if (content == null || content.length() == 0) {
                    continue;
                }
            } else {
                content = trimTrailingNewline(((TextNode) node).text());
                if (StringUtils.isBlank(content)) {
                    continue;
                }
            }
        }
        org.structr.web.entity.dom.DOMNode newNode = null;
        // create node
        if (StringUtils.isBlank(tag)) {
            if (page != null) {
                // create comment or content node
                if (!StringUtils.isBlank(comment)) {
                    final PropertyKey<String> contentTypeKey = StructrApp.key(Content.class, "contentType");
                    newNode = (DOMNode) page.createComment(comment);
                    newNode.setProperty(contentTypeKey, "text/html");
                } else {
                    newNode = (Content) page.createTextNode(content);
                }
            }
        } else if ("structr:template".equals(tag)) {
            final String src = node.attr("src");
            if (src != null) {
                DOMNode template = null;
                if (DeployCommand.isUuid(src)) {
                    template = (DOMNode) StructrApp.getInstance().nodeQuery(NodeInterface.class).and(GraphObject.id, src).getFirst();
                    if (template == null) {
                        logger.warn("Template with UUID {} not found, this is a known bug", src);
                    }
                } else if (DeployCommand.endsWithUuid(src)) {
                    // the last 32 characters of src are taken as the UUID
                    final String uuid = src.substring(src.length() - 32);
                    template = (DOMNode) StructrApp.getInstance().nodeQuery(NodeInterface.class).and(GraphObject.id, uuid).getFirst();
                    if (template == null) {
                        logger.warn("Template with UUID {} not found, this is a known bug", uuid);
                    }
                } else {
                    // lookup by name: shared component first, then template, otherwise create a new template
                    template = Importer.findSharedComponentByName(src);
                    if (template == null) {
                        template = Importer.findTemplateByName(src);
                        if (template == null) {
                            template = createNewTemplateNode(parent, node.childNodes());
                            isNewTemplateOrComponent = true;
                        }
                    }
                }
                if (template != null) {
                    newNode = template;
                    if (template.isSharedComponent()) {
                        // shared components are never attached directly, a shallow clone is linked to them instead
                        newNode = (DOMNode) template.cloneNode(false);
                        newNode.setSharedComponent(template);
                        newNode.setOwnerDocument(page);
                    } else if (page != null) {
                        newNode.setOwnerDocument(page);
                    }
                } else {
                    logger.warn("Unable to find template or shared component {}, template ignored!", src);
                }
            } else {
                logger.warn("Invalid template definition, missing src attribute!");
            }
        } else if ("structr:component".equals(tag)) {
            final String src = node.attr("src");
            if (src != null) {
                DOMNode component = null;
                if (DeployCommand.isUuid(src)) {
                    component = app.nodeQuery(DOMNode.class).and(GraphObject.id, src).getFirst();
                } else {
                    component = Importer.findSharedComponentByName(src);
                }
                if (component == null) {
                    component = createSharedComponent(node);
                }
                // children of a component are never imported below this node
                isNewTemplateOrComponent = true;
                if (component != null) {
                    newNode = (DOMNode) component.cloneNode(false);
                    newNode.setSharedComponent(component);
                    newNode.setOwnerDocument(page);
                } else {
                    logger.warn("Unable to find shared component {} - ignored!", src);
                }
            } else {
                logger.warn("Invalid component definition, missing src attribute!");
            }
        } else {
            if (page != null) {
                newNode = (org.structr.web.entity.dom.DOMElement) page.createElement(tag, true);
            }
            if (newNode == null) {
                final PropertyKey<Boolean> hideOnDetailKey = StructrApp.key(DOMNode.class, "hideOnDetail");
                final PropertyKey<Boolean> hideOnIndexKey = StructrApp.key(DOMNode.class, "hideOnIndex");
                final PropertyKey<String> tagKey = StructrApp.key(DOMElement.class, "tag");
                // experimental: create DOM element with literal tag
                // (replaces the former createNewHTMLTemplateNodeForUnsupportedTag approach)
                newNode = (DOMElement) app.create(DOMElement.class, new NodeAttribute(tagKey, node.nodeName()), new NodeAttribute(hideOnDetailKey, false), new NodeAttribute(hideOnIndexKey, false));
                if (newNode != null && page != null) {
                    newNode.doAdopt(page);
                }
            }
        }
        if (newNode != null) {
            // save root element for later use
            if (rootElement == null && !(newNode instanceof org.structr.web.entity.dom.Comment)) {
                rootElement = newNode;
            }
            // set linkable
            if (res != null && newNode instanceof LinkSource) {
                ((LinkSource) newNode).setLinkable(res);
            }
            // container for bulk setProperties()
            final PropertyMap newNodeProperties = new PropertyMap();
            final Class newNodeType = newNode.getClass();
            newNodeProperties.put(AbstractNode.visibleToPublicUsers, publicVisible);
            newNodeProperties.put(AbstractNode.visibleToAuthenticatedUsers, authVisible);
            // "id" attribute: Put it into the "_html_id" field
            if (StringUtils.isNotBlank(id)) {
                newNodeProperties.put(StructrApp.key(DOMElement.class, "_html_id"), id);
            }
            // "class" attribute: space-separated class names go into the "_html_class" field
            if (StringUtils.isNotBlank(classString.toString())) {
                newNodeProperties.put(StructrApp.key(DOMElement.class, "_html_class"), StringUtils.trim(classString.toString()));
            }
            for (Attribute nodeAttr : node.attributes()) {
                final String key = nodeAttr.getKey();
                if (!key.equals("text")) {
                    // Don't add text attribute as _html_text because the text is already contained in the 'content' attribute
                    final String value = nodeAttr.getValue();
                    if (key.startsWith("data-")) {
                        if (key.startsWith(DATA_META_PREFIX)) {
                            // convert data-structr-meta-* attributes to local camel case properties on the node,
                            int l = DATA_META_PREFIX.length();
                            String upperCaseKey = WordUtils.capitalize(key.substring(l), new char[] { '-' }).replaceAll("-", "");
                            String camelCaseKey = key.substring(l, l + 1).concat(upperCaseKey.substring(1));
                            if (value != null) {
                                // store value using actual input converter
                                final PropertyKey actualKey = StructrApp.getConfiguration().getPropertyKeyForJSONName(newNodeType, camelCaseKey, false);
                                if (actualKey != null) {
                                    final PropertyConverter converter = actualKey.inputConverter(securityContext);
                                    if (converter != null) {
                                        final Object convertedValue = converter.convert(value);
                                        newNodeProperties.put(actualKey, convertedValue);
                                    } else {
                                        newNodeProperties.put(actualKey, value);
                                    }
                                } else {
                                    logger.warn("Unknown meta property key {}, ignoring.", camelCaseKey);
                                }
                            }
                        } else if (key.startsWith(DATA_STRUCTR_PREFIX)) {
                            // don't convert data-structr-* attributes as they are internal
                            final PropertyKey propertyKey = StructrApp.getConfiguration().getPropertyKeyForJSONName(newNodeType, key);
                            if (propertyKey != null) {
                                final PropertyConverter inputConverter = propertyKey.inputConverter(securityContext);
                                if (value != null && inputConverter != null) {
                                    // reuse the converter obtained above instead of looking it up a second time
                                    newNodeProperties.put(propertyKey, inputConverter.convert(value));
                                } else {
                                    newNodeProperties.put(propertyKey, value);
                                }
                            }
                        } else {
                            // store data-* attributes in node
                            final PropertyKey propertyKey = new StringProperty(key);
                            if (value != null) {
                                newNodeProperties.put(propertyKey, value);
                            }
                        }
                    } else {
                        boolean notBlank = StringUtils.isNotBlank(value);
                        boolean isAnchor = notBlank && value.startsWith("#");
                        boolean isLocal = notBlank && !value.startsWith("http");
                        boolean isActive = notBlank && value.contains("${");
                        boolean isStructrLib = notBlank && value.startsWith("/structr/js/");
                        // outside of deployment, local static href/src values are replaced by link expressions
                        if ("link".equals(tag) && "href".equals(key) && isLocal && !isActive && !isDeployment) {
                            newNodeProperties.put(new StringProperty(PropertyView.Html.concat(key)), "${link.path}?${link.version}");
                        } else if (("href".equals(key) || "src".equals(key)) && isLocal && !isActive && !isAnchor && !isStructrLib && !isDeployment) {
                            newNodeProperties.put(new StringProperty(PropertyView.Html.concat(key)), "${link.path}");
                        } else {
                            newNodeProperties.put(new StringProperty(PropertyView.Html.concat(key)), value);
                        }
                    }
                }
            }
            // bulk set properties on new node
            newNode.setProperties(securityContext, newNodeProperties);
            if ("script".equals(tag)) {
                final PropertyKey<String> typeKey = StructrApp.key(Input.class, "_html_type");
                final String contentType = newNode.getProperty(typeKey);
                if (contentType == null) {
                    // Set default type of script tag to "text/javascript" to ensure inline JS gets imported properly
                    newNode.setProperty(typeKey, "text/javascript");
                } else if (contentType.equals("application/schema+json")) {
                    for (final Node scriptContentNode : node.childNodes()) {
                        final String source = scriptContentNode.toString();
                        // Import schema JSON
                        SchemaJsonImporter.importSchemaJson(source);
                    }
                } else if (contentType.equals("application/x-structr-script") || contentType.equals("application/x-structr-javascript")) {
                    // Structr scripts are executed right away; the node is neither attached to the
                    // parent nor are its children imported
                    for (final Node scriptContentNode : node.childNodes()) {
                        final String source = scriptContentNode.toString();
                        try {
                            Actions.execute(securityContext, null, source, null);
                        } catch (UnlicensedException ex) {
                            ex.log(logger);
                        }
                    }
                    continue;
                }
            } else if ("style".equals(tag)) {
                final PropertyKey<String> typeKey = StructrApp.key(Input.class, "_html_type");
                final String contentType = newNode.getProperty(typeKey);
                if ("text/css".equals(contentType)) {
                    // parse content of style elements and add referenced files to list of resources to be downloaded
                    for (final Node styleContentNode : node.childNodes()) {
                        final String source = styleContentNode.toString();
                        try {
                            // Import referenced resources
                            processCss(source, originalUrl);
                        } catch (IOException ex) {
                            logger.warn("Couldn't process CSS source", ex);
                        }
                    }
                }
            }
            if (instructions != null) {
                if (instructions.contains("@structr:content") && !(newNode instanceof Content)) {
                    // unhandled instructions from previous iteration => empty content element
                    createEmptyContentNode(page, parent, commentHandler, instructions);
                } else {
                    // apply instructions to new DOM element
                    if (commentHandler != null) {
                        commentHandler.handleComment(page, newNode, instructions, true);
                    }
                }
                instructions = null;
            }
            // allow parent to be null to prevent direct child relationship
            if (parent != null) {
                // special handling for <head> elements: a head found inside a body is inserted before that body
                if (newNode instanceof Head && parent instanceof Body) {
                    final org.w3c.dom.Node html = parent.getParentNode();
                    html.insertBefore(newNode, parent);
                } else {
                    parent.appendChild(newNode);
                }
            }
            // Step down and process child nodes except for newly created templates
            if (!isNewTemplateOrComponent) {
                createChildNodes(node, newNode, page, removeHashAttribute, depth + 1);
            }
        }
    }
    // reset instructions when leaving a level
    if (instructions != null) {
        createEmptyContentNode(page, parent, commentHandler, instructions);
        instructions = null;
    }
    return rootElement;
}
Also used : LinkSource(org.structr.web.entity.LinkSource) NodeAttribute(org.structr.core.graph.NodeAttribute) Attribute(org.jsoup.nodes.Attribute) Node(org.jsoup.nodes.Node) DOMNode(org.structr.web.entity.dom.DOMNode) DataNode(org.jsoup.nodes.DataNode) TextNode(org.jsoup.nodes.TextNode) AbstractNode(org.structr.core.entity.AbstractNode) DOMElement(org.structr.web.entity.dom.DOMElement) Element(org.jsoup.nodes.Element) StringProperty(org.structr.core.property.StringProperty) DOMElement(org.structr.web.entity.dom.DOMElement) Input(org.structr.web.entity.html.Input) UnlicensedException(org.structr.common.error.UnlicensedException) DataNode(org.jsoup.nodes.DataNode) PropertyConverter(org.structr.core.converter.PropertyConverter) DOMNode(org.structr.web.entity.dom.DOMNode) Body(org.structr.web.entity.html.Body) NodeInterface(org.structr.core.graph.NodeInterface) NodeAttribute(org.structr.core.graph.NodeAttribute) Head(org.structr.web.entity.html.Head) IOException(java.io.IOException) PropertyMap(org.structr.core.property.PropertyMap) Content(org.structr.web.entity.dom.Content) DOMNode(org.structr.web.entity.dom.DOMNode) Linkable(org.structr.web.entity.Linkable) GraphObject(org.structr.core.GraphObject) PropertyKey(org.structr.core.property.PropertyKey)

Example 52 with Node

use of org.jsoup.nodes.Node in project structr by structr.

the class Importer method parse.

/**
 * Parses the page source into {@code parsedDocument}.
 *
 * If {@code code} is not blank it is parsed directly: as a body fragment when
 * {@code fragment} is set, as a whole document otherwise. During deployment the XML
 * parser is used, after void tags have been rewritten to self-closing form. If
 * {@code code} is blank, the source is fetched from {@code address} first and parsed
 * as a regular HTML document.
 *
 * @param fragment whether the code is to be treated as a page fragment
 * @return always {@code true}
 * @throws FrameworkException declared for callers; presumably raised by {@code init()}
 *         or the HTTP fetch (not visible from this method)
 */
public boolean parse(final boolean fragment) throws FrameworkException {
    init();
    // nothing to parse locally: fetch the source from the remote address
    if (StringUtils.isBlank(code)) {
        if (!isDeployment) {
            logger.info("##### Start fetching {} for page {} #####", new Object[] { address, name });
        }
        code = HttpHelper.get(address);
        parsedDocument = Jsoup.parse(code);
        return true;
    }
    if (isDeployment) {
        // add a trailing slash to all void/self-closing tags so the XML parser can parse it correctly
        code = code.replaceAll("<(area|base|br|col|command|embed|hr|img|input|keygen|link|meta|param|source|track|wbr)([^>]*)>", "<$1$2/>");
    } else {
        logger.info("##### Start parsing code for page {} #####", new Object[] { name });
    }
    if (!fragment) {
        parsedDocument = isDeployment ? Jsoup.parse(code, "", Parser.xmlParser()) : Jsoup.parse(code);
        return true;
    }
    if (!isDeployment) {
        parsedDocument = Jsoup.parseBodyFragment(code);
        return true;
    }
    // deployment fragment: parse as XML and move the resulting nodes into the body of an empty shell document
    final List<Node> parsedNodes = Parser.parseXmlFragment(code, "");
    parsedDocument = Document.createShell("");
    final Element shellBody = parsedDocument.body();
    // work on an array copy of the parsed node list
    final Node[] detached = parsedNodes.toArray(new Node[parsedNodes.size()]);
    // detach every node except the first one, walking backwards
    int index = detached.length - 1;
    while (index > 0) {
        detached[index].remove();
        index--;
    }
    for (final Node fragmentNode : detached) {
        shellBody.appendChild(fragmentNode);
    }
    return true;
}
Also used : Node(org.jsoup.nodes.Node) DOMNode(org.structr.web.entity.dom.DOMNode) DataNode(org.jsoup.nodes.DataNode) TextNode(org.jsoup.nodes.TextNode) AbstractNode(org.structr.core.entity.AbstractNode) DOMElement(org.structr.web.entity.dom.DOMElement) Element(org.jsoup.nodes.Element)

Example 53 with Node

use of org.jsoup.nodes.Node in project Lightning-Browser by anthonycr.

the class OutputFormatter method appendTextSkipHidden.

/**
 * Appends the text of the given element's subtree to {@code accum}, depth first.
 * Children for which {@code unlikely(child)} returns true are skipped together with
 * their subtree. A single space is inserted before a block element (unless the
 * buffer is empty or already ends in whitespace) and for every {@code br} element.
 *
 * @param e element whose child nodes are traversed
 * @param accum buffer receiving the text
 * @param indent nesting depth; only incremented and passed to the recursive call
 */
private void appendTextSkipHidden(@NonNull Element e, @NonNull StringBuilder accum, int indent) {
    for (Node current : e.childNodes()) {
        if (unlikely(current)) {
            continue;
        }
        if (current instanceof TextNode) {
            accum.append(((TextNode) current).text());
            continue;
        }
        if (!(current instanceof Element)) {
            // neither text nor element: nothing to append
            continue;
        }
        Element nested = (Element) current;
        boolean needsBlockSeparator = accum.length() > 0 && nested.isBlock() && !lastCharIsWhitespace(accum);
        if (needsBlockSeparator || nested.tagName().equals("br")) {
            accum.append(' ');
        }
        appendTextSkipHidden(nested, accum, indent + 1);
    }
}
Also used : Node(org.jsoup.nodes.Node) TextNode(org.jsoup.nodes.TextNode) Element(org.jsoup.nodes.Element) TextNode(org.jsoup.nodes.TextNode)

Example 54 with Node

use of org.jsoup.nodes.Node in project Java-readability by basis-technology-corp.

the class Readability method changeElementTag.

/**
 * Replaces the given element with a newly created element of another tag name,
 * moving all child nodes of the old element into the new one.
 *
 * @param e the element to replace
 * @param newTag tag name of the replacement element
 * @return the replacement element, now in the position {@code e} occupied
 */
private Element changeElementTag(Element e, String newTag) {
    Element replacement = document.createElement(newTag);
    /* The original author notes that JSoup hands out the live child list, hence the snapshot before moving nodes. */
    List<Node> snapshot = new ArrayList<Node>(e.childNodes());
    for (Node child : snapshot) {
        child.remove();
        replacement.appendChild(child);
    }
    e.replaceWith(replacement);
    return replacement;
}
Also used : Element(org.jsoup.nodes.Element) TextNode(org.jsoup.nodes.TextNode) Node(org.jsoup.nodes.Node) ArrayList(java.util.ArrayList)

Example 55 with Node

use of org.jsoup.nodes.Node in project Java-readability by basis-technology-corp.

the class Readability method grabArticle.

// CHECKSTYLE:OFF
private Element grabArticle(Element pageElement) {
    boolean isPaging = pageElement != null;
    if (pageElement == null) {
        pageElement = body;
    }
    String pageCacheHtml = pageElement.html();
    Elements allElements = pageElement.getAllElements();
    /*
         * Note: in Javascript, this list would be *live*. If you deleted a node from the tree, it and its
         * children would remove themselves. To get the same effect, we make a linked list and we remove
         * things from it. This won't win prizes for speed, but, then again, the code in Javascript has to be
         * doing something nearly as awful.
         */
    LinkedList<Element> allElementsList = new LinkedList<Element>();
    allElementsList.addAll(allElements);
    /**
     * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc),
     * and turn divs into P tags where they have been used inappropriately (as in, where they contain no
     * other block level elements.) Note: Assignment from index for performance. See
     * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 TODO: Shouldn't this be a reverse
     * traversal?
     */
    List<Element> nodesToScore = new ArrayList<Element>();
    ListIterator<Element> elIterator = allElementsList.listIterator();
    Set<Element> goodAsDead = new HashSet<Element>();
    while (elIterator.hasNext()) {
        Element node = elIterator.next();
        if (goodAsDead.contains(node)) {
            continue;
        }
        /* Remove unlikely candidates */
        if (stripUnlikelyCandidates) {
            String unlikelyMatchString = node.className() + node.id();
            if (Patterns.exists(Patterns.UNLIKELY_CANDIDATES, unlikelyMatchString) && !Patterns.exists(Patterns.OK_MAYBE_ITS_A_CANDIDATE, unlikelyMatchString) && !"body".equals(node.tagName())) {
                LOG.debug("Removing unlikely candidate - " + unlikelyMatchString);
                List<Element> toRemoveAndBelow = node.getAllElements();
                elIterator.remove();
                /*
                     * adding 'node' to that set is harmless and reduces the code complexity here.
                     */
                goodAsDead.addAll(toRemoveAndBelow);
                continue;
            }
        }
        if ("p".equals(node.tagName()) || "td".equals(node.tagName()) || "pre".equals(node.tagName())) {
            nodesToScore.add(node);
        }
        /*
             * Turn all divs that don't have children block level elements into p's
             */
        if ("div".equals(node.tagName())) {
            boolean hasBlock = false;
            for (Element divChild : node.getAllElements()) {
                if (divChild != node) {
                    if (DIV_TO_P_ELEMENTS.contains(divChild.tagName())) {
                        hasBlock = true;
                        break;
                    }
                }
            }
            if (!hasBlock) {
                Element newElement = changeElementTag(node, "p");
                nodesToScore.remove(node);
                nodesToScore.add(newElement);
            } else {
                /* EXPERIMENTAL */
                /*
                                       * grab just child text and wrap each chunk in a p
                                       */
                int limit = node.childNodes().size();
                for (int i = 0; i < limit; i++) {
                    Node childNode = node.childNodes().get(i);
                    if (childNode instanceof TextNode) {
                        Element p = document.createElement("p");
                        p.attr("basisInline", "true");
                        p.html(((TextNode) childNode).text());
                        childNode.replaceWith(p);
                    }
                }
            }
        }
    }
    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look. Then add
     * their score to their parent node. A score is determined by things like number of commas, class
     * names, etc. Maybe eventually link density.
     */
    List<Element> candidates = new ArrayList<Element>();
    for (Element nodeToScore : nodesToScore) {
        Element parentNode = nodeToScore.parent();
        if (null == parentNode) {
            // dropped previously.
            continue;
        }
        Element grandParentNode = parentNode.parent();
        if (grandParentNode == null) {
            // ditto
            continue;
        }
        String innerText = nodeToScore.text();
        /*
             * If this paragraph is less than 25 characters, don't even count it.
             */
        if (innerText.length() < 25) {
            continue;
        }
        /* Initialize readability data for the parent. */
        if ("".equals(parentNode.attr("readability"))) {
            initializeNode(parentNode);
            candidates.add(parentNode);
        }
        /*
             * If the grandparent has no parent, we don't want it as a candidate. It's probably a symptom that
             * we're operating in an orphan.
             */
        if (grandParentNode.parent() != null && "".equals(grandParentNode.attr("readability"))) {
            initializeNode(grandParentNode);
            candidates.add(grandParentNode);
        }
        double contentScore = 0;
        /* Add a point for the paragraph itself as a base. */
        contentScore++;
        /* Add points for any commas within this paragraph */
        contentScore += innerText.split(",").length;
        /*
             * For every 100 characters in this paragraph, add another point. Up to 3 points.
             */
        contentScore += Math.min(Math.floor(innerText.length() / 100.0), 3.0);
        /* Add the score to the parent. The grandparent gets half. */
        incrementContentScore(parentNode, contentScore);
        if (grandParentNode != null) {
            incrementContentScore(grandParentNode, contentScore / 2.0);
        }
    }
    /**
     * After we've calculated scores, loop through all of the possible candidate nodes we found and find
     * the one with the highest score.
     */
    Element topCandidate = null;
    for (Element candidate : candidates) {
        /**
         * Scale the final candidates score based on link density. Good content should have a relatively
         * small link density (5% or less) and be mostly unaffected by this operation.
         */
        double score = getContentScore(candidate);
        double newScore = score * (1.0 - getLinkDensity(candidate));
        setContentScore(candidate, newScore);
        LOG.debug("Candidate [" + candidate.getClass() + "] (" + candidate.className() + ":" + candidate.id() + ") with score " + newScore);
        if (null == topCandidate || newScore > getContentScore(topCandidate)) {
            topCandidate = candidate;
        }
    }
    /**
     * If we still have no top candidate, just use the body as a last resort. We also have to copy the
     * body node so it is something we can modify.
     */
    if (topCandidate == null || topCandidate == body) {
        topCandidate = document.createElement("div");
        // not efficient but not likely.
        topCandidate.html(pageElement.html());
        pageElement.html("");
        pageElement.appendChild(topCandidate);
        initializeNode(topCandidate);
    }
    /**
     * Now that we have the top candidate, look through its siblings for content that might also be
     * related. Things like preambles, content split by ads that we removed, etc.
     */
    Element articleContent = document.createElement("div");
    if (isPaging) {
        articleContent.attr("id", "readability-content");
    }
    double siblingScoreThreshold = Math.max(10, getContentScore(topCandidate) * 0.2);
    List<Element> siblingNodes = topCandidate.parent().children();
    for (Element siblingNode : siblingNodes) {
        boolean scored = isElementScored(siblingNode);
        boolean append = false;
        LOG.debug("Looking at sibling node: [" + siblingNode.getClass() + "] (" + siblingNode.className() + ":" + siblingNode.id() + ")");
        if (scored) {
            LOG.debug("Sibling has score " + getContentScore(siblingNode));
        } else {
            LOG.debug("Sibling has score unknown");
        }
        if (siblingNode == topCandidate) {
            append = true;
        }
        double contentBonus = 0;
        /*
             * Give a bonus if the sibling node and the top candidate have the exact same class name
             */
        if (siblingNode.className().equals(topCandidate.className()) && !"".equals(topCandidate.className())) {
            contentBonus += getContentScore(topCandidate) * 0.2;
        }
        if (scored && (getContentScore(siblingNode) + contentBonus >= siblingScoreThreshold)) {
            append = true;
        }
        if ("p".equals(siblingNode.tagName())) {
            double linkDensity = getLinkDensity(siblingNode);
            String nodeContent = siblingNode.text();
            int nodeLength = nodeContent.length();
            if (nodeLength > 80 && linkDensity < 0.25) {
                append = true;
            } else if (nodeLength < 80 && linkDensity == 0 && Patterns.exists(Patterns.ENDS_WITH_DOT, nodeContent)) {
                append = true;
            }
        }
        if (append) {
            LOG.debug("Appending node: [" + siblingNode.getClass() + "]");
            Element nodeToAppend = null;
            if (!"div".equals(siblingNode.tagName()) && !"p".equals(siblingNode.tagName())) {
                /*
                     * We have a node that isn't a common block level element, like a form or td tag. Turn it
                     * into a div so it doesn't get filtered out later by accident.
                     */
                LOG.debug("Altering siblingNode of " + siblingNode.tagName() + " to div.");
                nodeToAppend = changeElementTag(siblingNode, "div");
            } else {
                nodeToAppend = siblingNode;
            }
            /*
                 * To ensure a node does not interfere with readability styles, remove its classnames
                 */
            nodeToAppend.removeAttr("class");
            /*
                 * Append sibling and subtract from our list because it removes the node when you append to
                 * another node
                 */
            articleContent.appendChild(nodeToAppend);
        }
    }
    document.body().empty();
    document.body().appendChild(articleContent);
    /**
     * So we have all of the content that we need. Now we clean it up for presentation.
     */
    prepArticle(articleContent);
    /**
     * Now that we've gone through the full algorithm, check to see if we got any meaningful content. If
     * we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
     * likelihood of finding the content, and the sieve approach gives us a higher likelihood of finding
     * the -right- content.
     */
    if (articleContent.text().length() < 250) {
        pageElement.html(pageCacheHtml);
        if (stripUnlikelyCandidates) {
            try {
                stripUnlikelyCandidates = false;
                return grabArticle(pageElement);
            } finally {
                stripUnlikelyCandidates = true;
            }
        } else if (classWeight) {
            try {
                classWeight = false;
                return grabArticle(pageElement);
            } finally {
                classWeight = true;
            }
        } else if (cleanConditionally) {
            try {
                cleanConditionally = false;
                return grabArticle(pageElement);
            } finally {
                cleanConditionally = true;
            }
        } else {
            return null;
        }
    }
    return articleContent;
}
Also used : Element(org.jsoup.nodes.Element) TextNode(org.jsoup.nodes.TextNode) Node(org.jsoup.nodes.Node) ArrayList(java.util.ArrayList) TextNode(org.jsoup.nodes.TextNode) Elements(org.jsoup.select.Elements) LinkedList(java.util.LinkedList) HashSet(java.util.HashSet)

Aggregations

Node (org.jsoup.nodes.Node)55 Element (org.jsoup.nodes.Element)39 TextNode (org.jsoup.nodes.TextNode)39 Document (org.jsoup.nodes.Document)19 ArrayList (java.util.ArrayList)17 Elements (org.jsoup.select.Elements)11 IOException (java.io.IOException)7 HashMap (java.util.HashMap)6 Copy (de.geeksfactory.opacclient.objects.Copy)5 DetailedItem (de.geeksfactory.opacclient.objects.DetailedItem)5 NameValuePair (org.apache.http.NameValuePair)5 BasicNameValuePair (org.apache.http.message.BasicNameValuePair)5 DateTimeFormatter (org.joda.time.format.DateTimeFormatter)5 JSONException (org.json.JSONException)5 NotReachableException (de.geeksfactory.opacclient.networking.NotReachableException)4 Detail (de.geeksfactory.opacclient.objects.Detail)4 UnsupportedEncodingException (java.io.UnsupportedEncodingException)4 URI (java.net.URI)4 Matcher (java.util.regex.Matcher)4 URISyntaxException (java.net.URISyntaxException)3