Search in sources:

Example 41 with TextNode

use of org.jsoup.nodes.TextNode in project AozoraEpub3 by hmdev.

the class WebAozoraConverter method _printNode.

/**
 * Writes the child nodes of {@code parent} to {@code bw}, recursing into elements.
 *
 * <p>While {@code startElement} is set, nothing is written: the tree is only walked until
 * a node equal to {@code startElement} is found, at which point the field is cleared and
 * output begins with the following node. When a node equal to {@code endElement} is met,
 * the current invocation returns immediately.
 *
 * @param bw     writer receiving the converted text
 * @param parent node whose children are written
 * @throws IOException if writing to {@code bw} fails
 */
private void _printNode(BufferedWriter bw, Node parent) throws IOException {
    for (Node child : parent.childNodes()) {
        // Start marker not reached yet: emit nothing, but keep descending to look for it.
        if (startElement != null) {
            if (child.equals(startElement)) {
                startElement = null;
            } else if (child instanceof Element) {
                _printNode(bw, child);
            }
            continue;
        }
        if (endElement != null && child.equals(endElement)) {
            return;
        }
        if (child instanceof TextNode) {
            printText(bw, ((TextNode) child).getWholeText());
            continue;
        }
        if (!(child instanceof Element)) {
            // Neither text nor element: only report the node's class name.
            System.out.println(child.getClass().getName());
            continue;
        }
        Element elem = (Element) child;
        String tag = elem.tagName();
        if ("br".equals(tag)) {
            // A trailing <br> produces no line break.
            if (elem.nextSibling() != null) {
                bw.append('\n');
            }
        } else if ("div".equals(tag) || "p".equals(tag)) {
            _printBlockElement(bw, elem);
        } else if ("ruby".equals(tag)) {
            // Emit the ruby annotation.
            printRuby(bw, elem);
        } else if ("img".equals(tag)) {
            // Cache the image and emit its annotation.
            printImage(bw, elem);
        } else if ("hr".equals(tag) && !this.noHr) {
            bw.append("[#区切り線]\n");
        } else if ("b".equals(tag)) {
            _printAnnotated(bw, elem, "[#ここから太字]", "[#ここで太字終わり]");
        } else if ("sup".equals(tag)) {
            _printAnnotated(bw, elem, "[#上付き小文字]", "[#上付き小文字終わり]");
        } else if ("sub".equals(tag)) {
            _printAnnotated(bw, elem, "[#下付き小文字]", "[#下付き小文字終わり]");
        } else if ("strike".equals(tag) || "s".equals(tag)) {
            _printAnnotated(bw, elem, "[#取消線]", "[#取消線終わり]");
        } else if ("tr".equals(tag)) {
            // Children first, then one line break per table row.
            _printNode(bw, elem);
            bw.append('\n');
        } else {
            // Any other element: just write its children.
            _printNode(bw, elem);
        }
    }
}

/**
 * Writes a {@code div}/{@code p} element: a line break before it when the previous sibling
 * exists and is not a block node, then its children, then a line break when a next sibling exists.
 */
private void _printBlockElement(BufferedWriter bw, Element block) throws IOException {
    Node before = block.previousSibling();
    if (before != null && !isBlockNode(before)) {
        bw.append('\n');
    }
    _printNode(bw, block);
    if (block.nextSibling() != null) {
        bw.append('\n');
    }
}

/** Writes {@code open}, then the children of {@code elem}, then {@code close}. */
private void _printAnnotated(BufferedWriter bw, Element elem, String open, String close) throws IOException {
    bw.append(open);
    _printNode(bw, elem);
    bw.append(close);
}
Also used : TextNode(org.jsoup.nodes.TextNode) Node(org.jsoup.nodes.Node) Element(org.jsoup.nodes.Element) TextNode(org.jsoup.nodes.TextNode)

Example 42 with TextNode

use of org.jsoup.nodes.TextNode in project solr-cmd-utils by tblsoft.

the class HtmlJsoupFilter method mapAllElements.

/**
 * For every element matching {@code selector}, joins the text of all text nodes found in
 * that element and its descendants (each followed by a single space), trims the result and
 * adds it to {@code document} under {@code fieldName}.
 *
 * @param selector  selector evaluated against {@code jsoupDocument}
 * @param fieldName name of the field each collected value is added to
 */
public void mapAllElements(String selector, String fieldName) {
    for (Element match : jsoupDocument.select(selector)) {
        StringBuilder joined = new StringBuilder();
        // getAllElements() is called on the match, so its own text nodes are collected too.
        for (Element nested : match.getAllElements()) {
            for (TextNode piece : nested.textNodes()) {
                joined.append(piece.text()).append(" ");
            }
        }
        document.addField(fieldName, joined.toString().trim());
    }
}
Also used : Element(org.jsoup.nodes.Element) TextNode(org.jsoup.nodes.TextNode) Elements(org.jsoup.select.Elements)

Example 43 with TextNode

use of org.jsoup.nodes.TextNode in project Lightning-Browser by anthonycr.

the class OutputFormatter method appendTextSkipHidden.

/**
 * Appends the text below {@code e} to {@code accum}, depth first. Children for which
 * {@code unlikely(...)} returns true are skipped together with their subtree.
 *
 * @param e      element whose children are visited
 * @param accum  buffer receiving the text
 * @param indent current recursion depth; only passed on (incremented) to nested calls
 */
private void appendTextSkipHidden(@NonNull Element e, @NonNull StringBuilder accum, int indent) {
    for (Node current : e.childNodes()) {
        if (unlikely(current)) {
            continue;
        }
        if (current instanceof TextNode) {
            accum.append(((TextNode) current).text());
            continue;
        }
        if (!(current instanceof Element)) {
            continue;
        }
        Element nested = (Element) current;
        // One space before a block element that follows non-whitespace text, or for a <br>.
        boolean needsSpace = (accum.length() > 0 && nested.isBlock() && !lastCharIsWhitespace(accum))
                || nested.tagName().equals("br");
        if (needsSpace) {
            accum.append(' ');
        }
        appendTextSkipHidden(nested, accum, indent + 1);
    }
}
Also used : Node(org.jsoup.nodes.Node) TextNode(org.jsoup.nodes.TextNode) Element(org.jsoup.nodes.Element) TextNode(org.jsoup.nodes.TextNode)

Example 44 with TextNode

use of org.jsoup.nodes.TextNode in project onebusaway-application-modules by camsys.

the class GtfsFullValidationTaskJob method checkOutputForErrors.

/**
 * Parses the validation output HTML file and looks at the first {@code <ul>} that follows
 * an {@code .issueHeader} element whose own text contains "Errors:". For every {@code <li>}
 * in that list one entry of the form {@code message,detail} is passed to
 * {@code result.addError}.
 *
 * @param agencyId - the agency id of the HTML file being checked (not read by this method)
 * @param outputFile - the name of the HTML file to be checked.
 * @throws IOException if the file cannot be read or parsed
 */
private void checkOutputForErrors(String agencyId, String outputFile) throws IOException {
    Document doc = Jsoup.parse(new File(outputFile), "UTF-8");
    Elements errorLists = doc.select(".issueHeader:containsOwn(Errors:) ~ ul");
    Element firstList = (errorLists == null) ? null : errorLists.first();
    if (firstList == null) {
        return;
    }
    Elements items = firstList.select("li");
    if (items == null || !items.hasText()) {
        return;
    }
    for (Node item : items) {
        // One <li> per reported error.
        StringBuilder message = new StringBuilder();
        String detail = "";
        for (Node part : item.childNodes()) {
            if (part instanceof TextNode) {
                message.append(((TextNode) part).text());
                continue;
            }
            if (!(part instanceof Element)) {
                continue;
            }
            String tagName = ((Element) part).tagName();
            if (tagName.equals("br")) {
                // Line breaks inside the message become single spaces.
                message.append(" ");
            } else if (tagName.equals("div")) {
                message.append(parseDivData(part));
            } else if (tagName.equals("table")) {
                // A table replaces (does not extend) the detail text.
                detail = parseTableData(part);
            } else {
                message.append(((Element) part).text());
            }
        }
        result.addError(message.toString() + "," + detail);
    }
}
Also used : Element(org.jsoup.nodes.Element) TextNode(org.jsoup.nodes.TextNode) Node(org.jsoup.nodes.Node) TextNode(org.jsoup.nodes.TextNode) Document(org.jsoup.nodes.Document) Elements(org.jsoup.select.Elements) File(java.io.File)

Example 45 with TextNode

use of org.jsoup.nodes.TextNode in project structr by structr.

the class Importer method createChildNodes.

/**
 * Walks the direct children of {@code startNode} and creates a corresponding node for each
 * of them: a comment or content node for {@code #comment}, {@code #data} and {@code #text}
 * nodes, a template or shared-component clone for {@code structr:template} and
 * {@code structr:component} tags, and a DOM element for everything else. HTML attributes are
 * copied into properties of the new node, the node is attached to {@code parent} (when
 * given), and the method recurses into the source node's children unless a new template or
 * component was created for it.
 *
 * @param startNode           source node whose children are imported
 * @param parent              node the created nodes are appended to; may be {@code null},
 *                            in which case no child relationship is created
 * @param page                owner page used to create nodes; may be {@code null}
 * @param removeHashAttribute whether to remove the {@code data-structr-hash} attribute from
 *                            source elements before their attributes are copied
 * @param depth               current recursion depth; only passed on (incremented)
 * @return the first non-comment node created at this level, or {@code null} if none was created
 * @throws FrameworkException declared by this method; presumably raised by the node creation
 *                            and property calls it makes (not verifiable from this block)
 */
private DOMNode createChildNodes(final Node startNode, final DOMNode parent, final Page page, final boolean removeHashAttribute, final int depth) throws FrameworkException {
    DOMNode rootElement = null;
    Linkable res = null;
    String instructions = null;
    final List<Node> children = startNode.childNodes();
    for (Node node : children) {
        String tag = node.nodeName();
        // clean tag, remove non-word characters except '#', ':', '.', '_' and '-'
        if (tag != null) {
            // The hyphen must be last in the character class: written as ".-_" it forms a
            // range from '.' to '_', which strips '-' and lets characters like '<' or '/' through.
            tag = tag.replaceAll("[^a-zA-Z0-9#:._-]+", "");
        }
        final StringBuilder classString = new StringBuilder();
        final String type = CaseHelper.toUpperCamelCase(tag);
        String comment = null;
        String content = null;
        String id = null;
        boolean isNewTemplateOrComponent = false;
        if (ignoreElementNames.contains(type)) {
            continue;
        }
        if (node instanceof Element) {
            final Element el = ((Element) node);
            final Set<String> classes = el.classNames();
            for (String cls : classes) {
                classString.append(cls).append(" ");
            }
            id = el.id();
            // do not download files when called from DeployCommand!
            if (!isDeployment) {
                String downloadAddressAttr = srcElements.contains(tag) ? "src" : hrefElements.contains(tag) ? "href" : null;
                if (downloadAddressAttr != null && StringUtils.isNotBlank(node.attr(downloadAddressAttr))) {
                    String downloadAddress = node.attr(downloadAddressAttr);
                    res = downloadFile(downloadAddress, originalUrl);
                } else {
                    res = null;
                }
            }
            if (removeHashAttribute) {
                // Remove data-structr-hash attribute
                node.removeAttr("data-structr-hash");
            }
        }
        // Data and comment nodes: put the text into the "comment"/"content" field without changes
        if (type.equals("#comment")) {
            comment = ((Comment) node).getData();
            tag = "";
            // Don't add content node for whitespace
            if (StringUtils.isBlank(comment)) {
                continue;
            }
            // store for later use
            commentSource.append(comment).append("\n");
            // check if comment contains instructions
            if (commentHandler != null && commentHandler.containsInstructions(comment)) {
                if (instructions != null) {
                    // unhandled instructions from previous iteration => empty content element
                    createEmptyContentNode(page, parent, commentHandler, instructions);
                }
                instructions = comment;
                continue;
            }
        } else if (type.equals("#data")) {
            tag = "";
            content = ((DataNode) node).getWholeData();
            // Don't add content node for whitespace
            if (StringUtils.isBlank(content)) {
                continue;
            }
        } else if (type.equals("#text")) {
            // Text-only nodes: trim the trailing newline and put the text into the "content" field
            tag = "";
            if (isDeployment) {
                // deployment keeps the whole text; only completely empty content is skipped
                content = trimTrailingNewline(((TextNode) node).getWholeText());
                if (content == null || content.length() == 0) {
                    continue;
                }
            } else {
                content = trimTrailingNewline(((TextNode) node).text());
                if (StringUtils.isBlank(content)) {
                    continue;
                }
            }
        }
        org.structr.web.entity.dom.DOMNode newNode = null;
        // create node
        if (StringUtils.isBlank(tag)) {
            if (page != null) {
                // create comment or content node
                if (!StringUtils.isBlank(comment)) {
                    final PropertyKey<String> contentTypeKey = StructrApp.key(Content.class, "contentType");
                    newNode = (DOMNode) page.createComment(comment);
                    newNode.setProperty(contentTypeKey, "text/html");
                } else {
                    newNode = (Content) page.createTextNode(content);
                }
            }
        } else if ("structr:template".equals(tag)) {
            final String src = node.attr("src");
            if (src != null) {
                DOMNode template = null;
                if (DeployCommand.isUuid(src)) {
                    template = (DOMNode) StructrApp.getInstance().nodeQuery(NodeInterface.class).and(GraphObject.id, src).getFirst();
                    if (template == null) {
                        logger.warn("Template with UUID {} not found, this is a known bug", src);
                    }
                } else if (DeployCommand.endsWithUuid(src)) {
                    // the UUID is taken from the last 32 characters of the src attribute
                    final String uuid = src.substring(src.length() - 32);
                    template = (DOMNode) StructrApp.getInstance().nodeQuery(NodeInterface.class).and(GraphObject.id, uuid).getFirst();
                    if (template == null) {
                        logger.warn("Template with UUID {} not found, this is a known bug", uuid);
                    }
                } else {
                    // lookup order: shared component by name, template by name, new template
                    template = Importer.findSharedComponentByName(src);
                    if (template == null) {
                        template = Importer.findTemplateByName(src);
                        if (template == null) {
                            template = createNewTemplateNode(parent, node.childNodes());
                            isNewTemplateOrComponent = true;
                        }
                    }
                }
                if (template != null) {
                    newNode = template;
                    if (template.isSharedComponent()) {
                        newNode = (DOMNode) template.cloneNode(false);
                        newNode.setSharedComponent(template);
                        newNode.setOwnerDocument(page);
                    } else if (page != null) {
                        newNode.setOwnerDocument(page);
                    }
                } else {
                    logger.warn("Unable to find template or shared component {}, template ignored!", src);
                }
            } else {
                logger.warn("Invalid template definition, missing src attribute!");
            }
        } else if ("structr:component".equals(tag)) {
            final String src = node.attr("src");
            if (src != null) {
                DOMNode component = null;
                if (DeployCommand.isUuid(src)) {
                    component = app.nodeQuery(DOMNode.class).and(GraphObject.id, src).getFirst();
                } else {
                    component = Importer.findSharedComponentByName(src);
                }
                if (component == null) {
                    component = createSharedComponent(node);
                }
                isNewTemplateOrComponent = true;
                if (component != null) {
                    newNode = (DOMNode) component.cloneNode(false);
                    newNode.setSharedComponent(component);
                    newNode.setOwnerDocument(page);
                } else {
                    logger.warn("Unable to find shared component {} - ignored!", src);
                }
            } else {
                logger.warn("Invalid component definition, missing src attribute!");
            }
        } else {
            if (page != null) {
                newNode = (org.structr.web.entity.dom.DOMElement) page.createElement(tag, true);
            }
            if (newNode == null) {
                final PropertyKey<Boolean> hideOnDetailKey = StructrApp.key(DOMNode.class, "hideOnDetail");
                final PropertyKey<Boolean> hideOnIndexKey = StructrApp.key(DOMNode.class, "hideOnIndex");
                final PropertyKey<String> tagKey = StructrApp.key(DOMElement.class, "tag");
                // experimental: create DOM element with literal (uncleaned) tag name
                newNode = (DOMElement) app.create(DOMElement.class, new NodeAttribute(tagKey, node.nodeName()), new NodeAttribute(hideOnDetailKey, false), new NodeAttribute(hideOnIndexKey, false));
                if (newNode != null && page != null) {
                    newNode.doAdopt(page);
                }
            }
        }
        if (newNode != null) {
            // save root element for later use
            if (rootElement == null && !(newNode instanceof org.structr.web.entity.dom.Comment)) {
                rootElement = newNode;
            }
            // set linkable
            if (res != null && newNode instanceof LinkSource) {
                ((LinkSource) newNode).setLinkable(res);
            }
            // container for bulk setProperties()
            final PropertyMap newNodeProperties = new PropertyMap();
            final Class newNodeType = newNode.getClass();
            newNodeProperties.put(AbstractNode.visibleToPublicUsers, publicVisible);
            newNodeProperties.put(AbstractNode.visibleToAuthenticatedUsers, authVisible);
            // "id" attribute: Put it into the "_html_id" field
            if (StringUtils.isNotBlank(id)) {
                newNodeProperties.put(StructrApp.key(DOMElement.class, "_html_id"), id);
            }
            if (StringUtils.isNotBlank(classString.toString())) {
                newNodeProperties.put(StructrApp.key(DOMElement.class, "_html_class"), StringUtils.trim(classString.toString()));
            }
            for (Attribute nodeAttr : node.attributes()) {
                final String key = nodeAttr.getKey();
                if (!key.equals("text")) {
                    // Don't add text attribute as _html_text because the text is already contained in the 'content' attribute
                    final String value = nodeAttr.getValue();
                    if (key.startsWith("data-")) {
                        if (key.startsWith(DATA_META_PREFIX)) {
                            // convert data-structr-meta-* attributes to local camel case properties on the node,
                            int l = DATA_META_PREFIX.length();
                            String upperCaseKey = WordUtils.capitalize(key.substring(l), new char[] { '-' }).replaceAll("-", "");
                            // keep the first character as written, take the rest from the capitalized form
                            String camelCaseKey = key.substring(l, l + 1).concat(upperCaseKey.substring(1));
                            if (value != null) {
                                // store value using actual input converter
                                final PropertyKey actualKey = StructrApp.getConfiguration().getPropertyKeyForJSONName(newNodeType, camelCaseKey, false);
                                if (actualKey != null) {
                                    final PropertyConverter converter = actualKey.inputConverter(securityContext);
                                    if (converter != null) {
                                        final Object convertedValue = converter.convert(value);
                                        newNodeProperties.put(actualKey, convertedValue);
                                    } else {
                                        newNodeProperties.put(actualKey, value);
                                    }
                                } else {
                                    logger.warn("Unknown meta property key {}, ignoring.", camelCaseKey);
                                }
                            }
                        } else if (key.startsWith(DATA_STRUCTR_PREFIX)) {
                            // don't convert data-structr-* attributes as they are internal
                            final PropertyKey propertyKey = StructrApp.getConfiguration().getPropertyKeyForJSONName(newNodeType, key);
                            if (propertyKey != null) {
                                final PropertyConverter inputConverter = propertyKey.inputConverter(securityContext);
                                if (value != null && inputConverter != null) {
                                    // reuse the converter looked up above instead of fetching it again
                                    newNodeProperties.put(propertyKey, inputConverter.convert(value));
                                } else {
                                    newNodeProperties.put(propertyKey, value);
                                }
                            }
                        } else {
                            // store data-* attributes in node
                            final PropertyKey propertyKey = new StringProperty(key);
                            if (value != null) {
                                newNodeProperties.put(propertyKey, value);
                            }
                        }
                    } else {
                        boolean notBlank = StringUtils.isNotBlank(value);
                        boolean isAnchor = notBlank && value.startsWith("#");
                        boolean isLocal = notBlank && !value.startsWith("http");
                        boolean isActive = notBlank && value.contains("${");
                        boolean isStructrLib = notBlank && value.startsWith("/structr/js/");
                        // outside of deployment, local static href/src values are replaced by link expressions
                        if ("link".equals(tag) && "href".equals(key) && isLocal && !isActive && !isDeployment) {
                            newNodeProperties.put(new StringProperty(PropertyView.Html.concat(key)), "${link.path}?${link.version}");
                        } else if (("href".equals(key) || "src".equals(key)) && isLocal && !isActive && !isAnchor && !isStructrLib && !isDeployment) {
                            newNodeProperties.put(new StringProperty(PropertyView.Html.concat(key)), "${link.path}");
                        } else {
                            newNodeProperties.put(new StringProperty(PropertyView.Html.concat(key)), value);
                        }
                    }
                }
            }
            // bulk set properties on new node
            newNode.setProperties(securityContext, newNodeProperties);
            if ("script".equals(tag)) {
                final PropertyKey<String> typeKey = StructrApp.key(Input.class, "_html_type");
                final String contentType = newNode.getProperty(typeKey);
                if (contentType == null) {
                    // Set default type of script tag to "text/javascript" to ensure inline JS gets imported properly
                    newNode.setProperty(typeKey, "text/javascript");
                } else if (contentType.equals("application/schema+json")) {
                    for (final Node scriptContentNode : node.childNodes()) {
                        final String source = scriptContentNode.toString();
                        // Import schema JSON
                        SchemaJsonImporter.importSchemaJson(source);
                    }
                } else if (contentType.equals("application/x-structr-script")) {
                    for (final Node scriptContentNode : node.childNodes()) {
                        final String source = scriptContentNode.toString();
                        try {
                            Actions.execute(securityContext, null, source, null);
                        } catch (UnlicensedException ex) {
                            ex.log(logger);
                        }
                    }
                    // executed scripts are neither attached to the parent nor descended into
                    continue;
                } else if (contentType.equals("application/x-structr-javascript")) {
                    for (final Node scriptContentNode : node.childNodes()) {
                        final String source = scriptContentNode.toString();
                        try {
                            Actions.execute(securityContext, null, source, null);
                        } catch (UnlicensedException ex) {
                            ex.log(logger);
                        }
                    }
                    // executed scripts are neither attached to the parent nor descended into
                    continue;
                }
            } else if ("style".equals(tag)) {
                final PropertyKey<String> typeKey = StructrApp.key(Input.class, "_html_type");
                final String contentType = newNode.getProperty(typeKey);
                if ("text/css".equals(contentType)) {
                    // parse content of style elements and add referenced files to list of resources to be downloaded
                    for (final Node styleContentNode : node.childNodes()) {
                        final String source = styleContentNode.toString();
                        try {
                            // Import referenced resources
                            processCss(source, originalUrl);
                        } catch (IOException ex) {
                            logger.warn("Couldn't process CSS source", ex);
                        }
                    }
                }
            }
            if (instructions != null) {
                if (instructions.contains("@structr:content") && !(newNode instanceof Content)) {
                    // unhandled instructions from previous iteration => empty content element
                    createEmptyContentNode(page, parent, commentHandler, instructions);
                } else {
                    // apply instructions to new DOM element
                    if (commentHandler != null) {
                        commentHandler.handleComment(page, newNode, instructions, true);
                    }
                }
                instructions = null;
            }
            // allow parent to be null to prevent direct child relationship
            if (parent != null) {
                // special handling for <head> elements
                if (newNode instanceof Head && parent instanceof Body) {
                    final org.w3c.dom.Node html = parent.getParentNode();
                    html.insertBefore(newNode, parent);
                } else {
                    parent.appendChild(newNode);
                }
            }
            // Step down and process child nodes except for newly created templates
            if (!isNewTemplateOrComponent) {
                createChildNodes(node, newNode, page, removeHashAttribute, depth + 1);
            }
        }
    }
    // reset instructions when leaving a level
    if (instructions != null) {
        createEmptyContentNode(page, parent, commentHandler, instructions);
        instructions = null;
    }
    return rootElement;
}
Also used : LinkSource(org.structr.web.entity.LinkSource) NodeAttribute(org.structr.core.graph.NodeAttribute) Attribute(org.jsoup.nodes.Attribute) Node(org.jsoup.nodes.Node) DOMNode(org.structr.web.entity.dom.DOMNode) DataNode(org.jsoup.nodes.DataNode) TextNode(org.jsoup.nodes.TextNode) AbstractNode(org.structr.core.entity.AbstractNode) DOMElement(org.structr.web.entity.dom.DOMElement) Element(org.jsoup.nodes.Element) StringProperty(org.structr.core.property.StringProperty) DOMElement(org.structr.web.entity.dom.DOMElement) Input(org.structr.web.entity.html.Input) UnlicensedException(org.structr.common.error.UnlicensedException) DataNode(org.jsoup.nodes.DataNode) PropertyConverter(org.structr.core.converter.PropertyConverter) DOMNode(org.structr.web.entity.dom.DOMNode) Body(org.structr.web.entity.html.Body) NodeInterface(org.structr.core.graph.NodeInterface) NodeAttribute(org.structr.core.graph.NodeAttribute) Head(org.structr.web.entity.html.Head) IOException(java.io.IOException) PropertyMap(org.structr.core.property.PropertyMap) Content(org.structr.web.entity.dom.Content) DOMNode(org.structr.web.entity.dom.DOMNode) Linkable(org.structr.web.entity.Linkable) GraphObject(org.structr.core.GraphObject) PropertyKey(org.structr.core.property.PropertyKey)

Aggregations

TextNode (org.jsoup.nodes.TextNode): 52, Element (org.jsoup.nodes.Element): 41, Node (org.jsoup.nodes.Node): 37, Document (org.jsoup.nodes.Document): 19, ArrayList (java.util.ArrayList): 16, Elements (org.jsoup.select.Elements): 14, IOException (java.io.IOException): 6, DateTimeFormatter (org.joda.time.format.DateTimeFormatter): 6, JSONException (org.json.JSONException): 6, Copy (de.geeksfactory.opacclient.objects.Copy): 5, DetailedItem (de.geeksfactory.opacclient.objects.DetailedItem): 5, HashMap (java.util.HashMap): 5, NameValuePair (org.apache.http.NameValuePair): 5, BasicNameValuePair (org.apache.http.message.BasicNameValuePair): 5, Test (org.junit.jupiter.api.Test): 5, NotReachableException (de.geeksfactory.opacclient.networking.NotReachableException): 4, Detail (de.geeksfactory.opacclient.objects.Detail): 4, UnsupportedEncodingException (java.io.UnsupportedEncodingException): 4, URI (java.net.URI): 4, Matcher (java.util.regex.Matcher): 4