Search in sources :

Example 1 with DocumentTypeImpl

Use of org.loboevolution.html.dom.domimpl.DocumentTypeImpl in project LoboEvolution by LoboEvolution.

The class HtmlParser, method parseToken.

/**
 * Parses leading text followed by at most one markup construct (comment,
 * DOCTYPE, end tag, processing instruction or element) and attaches the
 * resulting nodes to {@code parent}.
 *
 * <p>When the construct is an element whose end tag is not forbidden and which
 * was not read as an empty element, its content is parsed as well, either by
 * calling this method recursively or through {@code parseForEndTag}.</p>
 *
 * <p>Side effects visible in this method: {@code this.normalLastTag} is set to
 * the upper-cased name of the tag that was read (or to {@code null} when no
 * tag begin followed the text); {@code ancestors} is pushed to while an
 * element's content is parsed and popped again afterwards; for a DOCTYPE the
 * document's doctype is set and {@code needRoot} is cleared.</p>
 *
 * @param parent      node that receives the text, comment, processing
 *                    instruction or element nodes created here
 * @param reader      source the markup is read from
 * @param stopTags    If tags in this set are encountered, the method throws
 *                    StopException. May be {@code null}.
 * @param ancestors   upper-cased tag names of the elements whose content is
 *                    currently being parsed, innermost first
 * @return one of the {@code TOKEN_*} constants:
 *         {@code TOKEN_EOD} when the input ends;
 *         {@code TOKEN_TEXT} when no tag begin was read after the text;
 *         {@code TOKEN_COMMENT} for a comment;
 *         {@code TOKEN_BAD} for a DOCTYPE or any other {@code !} declaration;
 *         {@code TOKEN_END_ELEMENT} for an end tag, or when an element's
 *         content was terminated by the end tag of an outer ancestor;
 *         {@code TOKEN_FULL_ELEMENT} for a processing instruction or an
 *         element closed by its own end tag;
 *         {@code TOKEN_BEGIN_ELEMENT} for an element whose content was not
 *         parsed (empty element or end tag forbidden)
 * @throws IOException   declared; not thrown directly here, propagated from
 *                       the read/parse helpers that consume {@code reader}
 * @throws StopException if the element just read, or an element handed up by
 *                       a nested call, has a tag contained in {@code stopTags};
 *                       the exception carries the not-yet-attached element
 * @throws SAXException  declared; not thrown directly here, propagated from
 *                       helper methods
 */
private int parseToken(final Node parent, final LineNumberReader reader, final Set<HTMLTag> stopTags, final LinkedList<String> ancestors) throws IOException, StopException, SAXException {
    final Document doc = this.document;
    // NOTE(review): the cast is performed on every call although htmlDoc is
    // only used in the DOCTYPE branch below — confirm this.document is always
    // an HTMLDocumentImpl.
    final HTMLDocumentImpl htmlDoc = (HTMLDocumentImpl) doc;
    final StringBuilder textSb = this.readUpToTagBegin(reader);
    if (textSb == null) {
        // Nothing more to read.
        return TOKEN_EOD;
    }
    if (textSb.length() != 0) {
        // int textLine = reader.getLineNumber();
        final StringBuilder decText = entityDecode(textSb);
        final Node textNode = doc.createTextNode(decText.toString());
        try {
            safeAppendChild(parent, textNode);
        } catch (final DOMException de) {
            // A HIERARCHY_REQUEST_ERR raised while appending text to the
            // document node itself is silently ignored; anything else is
            // logged. Parsing continues in both cases.
            if ((parent.getNodeType() != NodeType.DOCUMENT_NODE) || (de.getCode() != DOMException.HIERARCHY_REQUEST_ERR)) {
                logger.log(Level.WARNING, "parseToken(): Unable to append child to " + parent + ".", de);
            }
        }
    }
    if (this.justReadTagBegin) {
        String tag = this.readTag(parent, reader);
        if (tag == null) {
            return TOKEN_EOD;
        }
        // NOTE(review): toUpperCase() without an explicit Locale depends on
        // the default locale — confirm whether Locale.ROOT should be used
        // consistently across the parser.
        String normalTag = tag.toUpperCase();
        // The matching finally records normalTag in this.normalLastTag on
        // every exit path of this branch, including exceptions.
        try {
            if (tag.startsWith("!")) {
                if ("!--".equals(tag)) {
                    // Comment: read up to its end and attach a comment node.
                    final StringBuilder comment = this.passEndOfComment(reader);
                    final StringBuilder decText = entityDecode(comment);
                    safeAppendChild(parent, doc.createComment(decText.toString()));
                    return TOKEN_COMMENT;
                } else if ("!DOCTYPE".equals(tag)) {
                    // Note: the comparison is case-sensitive on the raw tag,
                    // so only the upper-case spelling "!DOCTYPE" lands here.
                    final String doctypeStr = this.parseEndOfTag(reader);
                    String qName = null;
                    String publicId = null;
                    String systemId = null;
                    if (doctypeStr.contains("PUBLIC")) {
                        final Matcher doctypeMatcher = doctypePattern.matcher(doctypeStr);
                        // NOTE(review): when the text contains "PUBLIC" but
                        // does not match doctypePattern, qName, publicId and
                        // systemId all remain null.
                        if (doctypeMatcher.matches()) {
                            qName = doctypeMatcher.group(1);
                            publicId = doctypeMatcher.group(2);
                            systemId = doctypeMatcher.group(3);
                        }
                    } else {
                        // No public identifier: use the remaining tag text,
                        // minus any '>' characters, as the name.
                        qName = doctypeStr.replace(">", "");
                    }
                    final DocumentTypeImpl doctype = new DocumentTypeImpl(qName, publicId, systemId);
                    htmlDoc.setDoctype(doctype);
                    needRoot = false;
                    return TOKEN_BAD;
                } else {
                    // Any other "<!...>" declaration is skipped.
                    passEndOfTag(reader);
                    return TOKEN_BAD;
                }
            } else if (tag.startsWith("/")) {
                // End tag: strip the slash so that normalLastTag (set in the
                // outer finally) holds the bare upper-cased element name.
                // NOTE(review): the new value of tag is not read afterwards
                // in this branch.
                tag = tag.substring(1);
                normalTag = normalTag.substring(1);
                this.passEndOfTag(reader);
                return TOKEN_END_ELEMENT;
            } else if (tag.startsWith("?")) {
                // Processing instruction: target is the tag without '?'.
                tag = tag.substring(1);
                final StringBuilder data = readProcessingInstruction(reader);
                safeAppendChild(parent, doc.createProcessingInstruction(tag, data.toString()));
                return TOKEN_FULL_ELEMENT;
            } else {
                // Start tag. A "prefix:" part, if present, is dropped from the
                // name used to create the element; normalTag keeps the prefix.
                final int localIndex = normalTag.indexOf(':');
                final boolean tagHasPrefix = localIndex > 0;
                final String localName = tagHasPrefix ? normalTag.substring(localIndex + 1) : normalTag;
                Element element = doc.createElement(localName);
                // Mark the element as being modified; reset to FALSE in the
                // finally block at the end of this branch.
                element.setUserData(MODIFYING_KEY, Boolean.TRUE, null);
                try {
                    if (!this.justReadTagEnd) {
                        // Consume attributes until readAttribute reports none left.
                        while (this.readAttribute(reader, element)) {
                        // EMPTY LOOP
                        }
                    }
                    if (stopTags != null && stopTags.contains(HTMLTag.get(normalTag))) {
                        // After MODIFYING_KEY is set.
                        // The element is handed to a caller further up, which
                        // attaches it itself (see the StopException handler below).
                        throw new StopException(element);
                    }
                    // Add element to parent before children are added.
                    // This is necessary for incremental rendering.
                    safeAppendChild(parent, element);
                    if (!this.justReadEmptyElement) {
                        // NOTE(review): localName is derived from the already
                        // upper-cased normalTag, so this toUpperCase() looks redundant.
                        ElementInfo einfo = HTMLEntities.ELEMENT_INFOS.get(HTMLTag.get(localName.toUpperCase()));
                        // Elements without an ElementInfo entry are treated as
                        // requiring an end tag and allowing child elements.
                        int endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED : einfo.getEndElementType();
                        if (endTagType != ElementInfo.END_ELEMENT_FORBIDDEN) {
                            boolean childrenOk = einfo == null || einfo.isChildElementOk();
                            Set<HTMLTag> newStopSet = einfo == null ? null : einfo.getStopTags();
                            if (newStopSet == null) {
                                // With an optional end tag and no explicit stop
                                // tags, the element's own tag becomes its stop tag.
                                if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
                                    newStopSet = Collections.singleton(HTMLTag.get(normalTag));
                                }
                            }
                            if (stopTags != null) {
                                if (newStopSet != null) {
                                    // Union of the caller's stop tags and this element's.
                                    final Set<HTMLTag> newStopSet2 = new HashSet<>();
                                    newStopSet2.addAll(stopTags);
                                    newStopSet2.addAll(newStopSet);
                                    newStopSet = newStopSet2;
                                } else {
                                    // No own stop tags: a required end tag clears
                                    // the set, otherwise the caller's set is inherited.
                                    newStopSet = endTagType == ElementInfo.END_ELEMENT_REQUIRED ? null : stopTags;
                                }
                            }
                            // Pushed here, popped in the finally below.
                            ancestors.addFirst(normalTag);
                            try {
                                // Parse content tokens until this element is closed,
                                // an ancestor is closed, or the input ends.
                                for (; ; ) {
                                    try {
                                        int token;
                                        if ((einfo != null) && einfo.isNoScriptElement()) {
                                            final UserAgentContext ucontext = this.ucontext;
                                            if ((ucontext == null) || ucontext.isScriptingEnabled()) {
                                                // NOTE(review): this call passes parent and false,
                                                // whereas the call further down passes element and
                                                // true — presumably so the content is not added
                                                // when scripting is enabled; confirm against
                                                // parseForEndTag.
                                                token = this.parseForEndTag(parent, reader, tag, false, shouldDecodeEntities(einfo));
                                            } else {
                                                token = this.parseToken(element, reader, newStopSet, ancestors);
                                            }
                                        } else {
                                            token = childrenOk ? this.parseToken(element, reader, newStopSet, ancestors) : this.parseForEndTag(element, reader, tag, true, shouldDecodeEntities(einfo));
                                        }
                                        if (token == TOKEN_END_ELEMENT) {
                                            // NOTE(review): normalLastTag is dereferenced below
                                            // without a null check — confirm parseForEndTag
                                            // always sets it before returning TOKEN_END_ELEMENT.
                                            final String normalLastTag = this.normalLastTag;
                                            if (normalTag.equalsIgnoreCase(normalLastTag)) {
                                                // The end tag closes this element.
                                                return TOKEN_FULL_ELEMENT;
                                            } else {
                                                final ElementInfo closeTagInfo = HTMLEntities.ELEMENT_INFOS.get(HTMLTag.get(normalLastTag.toUpperCase()));
                                                if ((closeTagInfo == null) || (closeTagInfo.getEndElementType() != ElementInfo.END_ELEMENT_FORBIDDEN)) {
                                                    // TODO: Rather inefficient algorithm, but it's
                                                    // probably executed infrequently?
                                                    final Iterator<String> i = ancestors.iterator();
                                                    if (i.hasNext()) {
                                                        // Skip the first entry, which was pushed for the
                                                        // current element, then look for an outer
                                                        // ancestor with the same name as the end tag.
                                                        i.next();
                                                        while (i.hasNext()) {
                                                            final String normalAncestorTag = i.next();
                                                            if (normalLastTag.equals(normalAncestorTag)) {
                                                                // Keep normalLastTag unchanged when the
                                                                // outer finally stores normalTag, and let
                                                                // the caller handle the end tag.
                                                                normalTag = normalLastTag;
                                                                return TOKEN_END_ELEMENT;
                                                            }
                                                        }
                                                    }
                                                }
                                            // TODO: Working here
                                            // An end tag matching neither this element nor an
                                            // ancestor falls through and the loop continues.
                                            }
                                        } else if (token == TOKEN_EOD) {
                                            return TOKEN_EOD;
                                        }
                                    } catch (final StopException se) {
                                        // newElement does not have a parent.
                                        final Element newElement = se.getElement();
                                        tag = newElement.getTagName();
                                        normalTag = tag.toUpperCase();
                                        // If the caller also stops on this tag, pass the
                                        // exception further up.
                                        if (stopTags != null && stopTags.contains(HTMLTag.get(normalTag))) {
                                            throw se;
                                        }
                                        // Otherwise the new element takes over from the current
                                        // one: recompute the element info and stop set for it.
                                        // NOTE(review): normalTag is already upper-cased above.
                                        einfo = HTMLEntities.ELEMENT_INFOS.get(HTMLTag.get(normalTag.toUpperCase()));
                                        endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED : einfo.getEndElementType();
                                        childrenOk = einfo == null || einfo.isChildElementOk();
                                        newStopSet = einfo == null ? null : einfo.getStopTags();
                                        if (newStopSet == null) {
                                            if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
                                                newStopSet = Collections.singleton(HTMLTag.get(normalTag));
                                            }
                                        }
                                        if (stopTags != null && newStopSet != null) {
                                            final Set<HTMLTag> newStopSet2 = new HashSet<>();
                                            newStopSet2.addAll(stopTags);
                                            newStopSet2.addAll(newStopSet);
                                            newStopSet = newStopSet2;
                                        }
                                        // Replace the innermost ancestor entry with the new tag.
                                        ancestors.removeFirst();
                                        ancestors.addFirst(normalTag);
                                        // Switch element
                                        element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
                                        // newElement should have been suspended.
                                        element = newElement;
                                        // Add to parent
                                        // (the same parent as the element it replaces).
                                        safeAppendChild(parent, element);
                                        if (this.justReadEmptyElement) {
                                            return TOKEN_BEGIN_ELEMENT;
                                        }
                                    }
                                }
                            } finally {
                                // Pop the entry pushed (or swapped in) for this element.
                                ancestors.removeFirst();
                            }
                        }
                    }
                    // Reached for empty elements and for elements whose end
                    // tag is forbidden: no content was parsed.
                    return TOKEN_BEGIN_ELEMENT;
                } finally {
                    // This can inform elements to continue with notifications.
                    // It can also cause Javascript to be loaded / processed.
                    // Update: Elements now use Document.addJob() to delay processing
                    element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
                }
            }
        } finally {
            // Record the tag just processed; callers compare against it.
            this.normalLastTag = normalTag;
        }
    } else {
        // No tag begin followed the text.
        this.normalLastTag = null;
        return TOKEN_TEXT;
    }
}
Also used : HashSet(java.util.HashSet) Set(java.util.Set) UserAgentContext(org.loboevolution.http.UserAgentContext) Matcher(java.util.regex.Matcher) ElementInfo(org.loboevolution.info.ElementInfo) Node(org.loboevolution.html.node.Node) Element(org.loboevolution.html.node.Element) DocumentTypeImpl(org.loboevolution.html.dom.domimpl.DocumentTypeImpl) Document(org.loboevolution.html.node.Document) HTMLDocumentImpl(org.loboevolution.html.dom.domimpl.HTMLDocumentImpl) DOMException(com.gargoylesoftware.css.dom.DOMException) HTMLTag(org.loboevolution.html.HTMLTag) HashSet(java.util.HashSet)

Aggregations

DOMException (com.gargoylesoftware.css.dom.DOMException)1 HashSet (java.util.HashSet)1 Set (java.util.Set)1 Matcher (java.util.regex.Matcher)1 HTMLTag (org.loboevolution.html.HTMLTag)1 DocumentTypeImpl (org.loboevolution.html.dom.domimpl.DocumentTypeImpl)1 HTMLDocumentImpl (org.loboevolution.html.dom.domimpl.HTMLDocumentImpl)1 Document (org.loboevolution.html.node.Document)1 Element (org.loboevolution.html.node.Element)1 Node (org.loboevolution.html.node.Node)1 UserAgentContext (org.loboevolution.http.UserAgentContext)1 ElementInfo (org.loboevolution.info.ElementInfo)1