Examples with HTMLConfiguration - org.cyberneko.html.HTMLConfiguration

Example 1 with HTMLConfiguration

Use of org.cyberneko.html.HTMLConfiguration in project gate-core by GateNLP.

From the class NekoHtmlDocumentFormat, method unpackMarkup.

/**
 * Unpacks the markup in the document, converting it from the native
 * HTML format into annotations in GATE format.
 * <p>
 * If the document was created from a String, it is recommended to set
 * the document's source URL to <b>null</b>. If the document has a valid
 * URL, the parser will try to parse the HTML document the URL points
 * to. If the URL is not valid, or is null, the document's content is
 * parsed instead.
 * <p>
 * Parse failures other than I/O errors are recorded on the document by
 * setting its <code>parsingError</code> feature to <code>TRUE</code>.
 * They are rethrown only when the document's
 * <code>GateConstants.THROWEX_FORMAT_PROPERTY_NAME</code> feature is
 * <code>TRUE</code>; otherwise a warning and the stack trace are
 * printed and the document remains unparsed.
 *
 * @param doc The GATE document you want to parse. If
 *          <code>doc.getSourceUrl()</code> returns <b>null</b>
 *          then the content of doc will be parsed. Using a URL is
 *          recommended because the parser will report errors
 *          correctly if the document is not well formed.
 * @param repInfo repositioning information handed to the document
 *          handler.
 * @param ampCodingInfo object with ampersand coding positions, handed
 *          to the document handler.
 * @throws DocumentFormatException if <code>doc</code> is null or has
 *           neither a source URL nor content, if an I/O error occurs
 *           while reading the document, or if parsing fails and the
 *           document asks for format exceptions to be thrown.
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    // Relay status messages from the handler to this format's own listeners
    StatusListener statusListener = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            // This is implemented in DocumentFormat.java and inherited here
            fireStatusChanged(text);
        }
    };
    boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
    NekoHtmlDocumentHandler handler = null;
    // Stream opened on the source URL (if any); released in the finally block
    InputStream urlStream = null;
    try {
        HTMLConfiguration parser = new HTMLConfiguration();
        // convert element and attribute names to lower case
        parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
        // make parser augment infoset with location information
        parser.setFeature(NekoHtmlDocumentHandler.AUGMENTATIONS, true);
        // Create a new document handler
        handler = new NekoHtmlDocumentHandler(doc, null, ignorableTags);
        // Register a status listener with it
        handler.addStatusListener(statusListener);
        // set repositioning object
        handler.setRepositioningInfo(repInfo);
        // set the object with ampersand coding positions
        handler.setAmpCodingInfo(ampCodingInfo);
        // construct the list of offsets for each line of the document
        int[] lineOffsets = buildLineOffsets(doc.getContent().toString());
        handler.setLineOffsets(lineOffsets);
        // set the handlers
        parser.setDocumentHandler(handler);
        parser.setErrorHandler(handler);
        // Choose the input source and encoding for the parse
        XMLInputSource is;
        if (docHasContentButNoValidURL) {
            // no URL, so parse from string
            is = new XMLInputSource(null, null, null, new StringReader(doc.getContent().toString()), null);
        } else if (doc instanceof TextualDocument) {
            // textual document - load with user specified encoding
            String docEncoding = ((TextualDocument) doc).getEncoding();
            // no BOM stripping is done here
            URLConnection conn = doc.getSourceUrl().openConnection();
            urlStream = conn.getInputStream();
            if ("gzip".equals(conn.getContentEncoding())) {
                urlStream = new GZIPInputStream(urlStream);
            }
            Reader docReader = new InputStreamReader(urlStream, docEncoding);
            is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString(), docReader, docEncoding);
            // since we control the encoding, tell the parser to ignore any
            // meta http-equiv hints
            parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
        } else {
            // let the parser decide the encoding
            is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString());
        }
        /* The following line can forward an
         * ArrayIndexOutOfBoundsException from
         * org.cyberneko.html.HTMLConfiguration.parse; it is handled by the
         * generic catch clause below. */
        parser.parse(is);
        ((DocumentImpl) doc).setNextAnnotationId(handler.getCustomObjectsId());
    } catch (IOException e) {
        /* Handle IOException specially. String concatenation is used instead of
         * toString() so that a null source URL cannot mask the I/O failure with
         * a NullPointerException. */
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
    } catch (Exception e) {
        /* Handle XNIException and ArrayIndexOutOfBoundsException:
         * flag the parsing error and keep going. */
        doc.getFeatures().put("parsingError", Boolean.TRUE);
        Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
        if (bThrow != null && bThrow.booleanValue()) {
            // the document asked for format errors to be propagated
            throw new DocumentFormatException(e);
        } else {
            Out.println("Warning: Document remains unparsed. \n" + "\n  Stack Dump: ");
            e.printStackTrace(Out.getPrintWriter());
        }
    } finally {
        if (handler != null) {
            handler.removeStatusListener(statusListener);
        }
        if (urlStream != null) {
            try {
                urlStream.close();
            } catch (IOException ignored) {
                // Best effort: parsing has already finished or failed, so a
                // failure to close must not replace that outcome.
            }
        }
    }
}

Also used : InputStreamReader(java.io.InputStreamReader) XMLInputSource(org.apache.xerces.xni.parser.XMLInputSource) GZIPInputStream(java.util.zip.GZIPInputStream) InputStream(java.io.InputStream) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) StringReader(java.io.StringReader) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) IOException(java.io.IOException) URLConnection(java.net.URLConnection) ResourceInstantiationException(gate.creole.ResourceInstantiationException) IOException(java.io.IOException) DocumentFormatException(gate.util.DocumentFormatException) DocumentFormatException(gate.util.DocumentFormatException) NekoHtmlDocumentHandler(gate.html.NekoHtmlDocumentHandler) GZIPInputStream(java.util.zip.GZIPInputStream) TextualDocument(gate.TextualDocument) StringReader(java.io.StringReader) StatusListener(gate.event.StatusListener)

Example 2 with HTMLConfiguration

Use of org.cyberneko.html.HTMLConfiguration in project muikku by otavanopisto.

From the class WorkspaceMaterialController, method createContentNode.

/**
 * Builds the content node for the given workspace node.
 * <p>
 * A folder yields a folder node whose children are built recursively
 * (flattened once <code>level</code> reaches FLATTENING_LEVEL). A
 * material yields a node carrying its HTML, or a "view restricted"
 * message when the user is not logged in and the material requires
 * login. Any other node type yields <code>null</code>.
 *
 * @param rootMaterialNode the workspace node to convert
 * @param level nesting level recorded on the created node
 * @param processHtml whether the material HTML should be produced
 * @param includeHidden whether hidden children of folders are listed
 * @return the created content node, or <code>null</code> for
 *         unsupported node types
 * @throws WorkspaceMaterialException if the HTML parser or the
 *           transformer cannot be configured
 */
private ContentNode createContentNode(WorkspaceNode rootMaterialNode, int level, boolean processHtml, boolean includeHidden) throws WorkspaceMaterialException {
    try {
        switch(rootMaterialNode.getType()) {
            case FOLDER: {
                WorkspaceFolder folder = (WorkspaceFolder) rootMaterialNode;
                boolean folderRestricted = !sessionController.isLoggedIn() && folder.getViewRestrict() == MaterialViewRestrict.LOGGED_IN;
                ContentNode folderNode = new ContentNode(folder.getTitle(), "folder", rootMaterialNode.getId(), null, level, null, null, rootMaterialNode.getParent().getId(), rootMaterialNode.getHidden(), null, 0L, 0L, folder.getPath(), null, null, folder.getViewRestrict(), folderRestricted);

                // Direct children, with or without the hidden ones
                List<WorkspaceNode> childNodes;
                if (includeHidden) {
                    childNodes = workspaceNodeDAO.listByParentSortByOrderNumber(folder);
                } else {
                    childNodes = workspaceNodeDAO.listByParentAndHiddenSortByOrderNumber(folder, Boolean.FALSE);
                }

                // Below the flattening level children are wrapped one-to-one;
                // at or beyond it the subtree is flattened
                List<FlattenedWorkspaceNode> entries;
                if (level < FLATTENING_LEVEL) {
                    entries = new ArrayList<>();
                    for (WorkspaceNode childNode : childNodes) {
                        entries.add(new FlattenedWorkspaceNode(false, null, childNode, level, childNode.getParent().getId(), childNode.getHidden()));
                    }
                } else {
                    entries = flattenWorkspaceNodes(childNodes, level, includeHidden);
                }

                for (FlattenedWorkspaceNode entry : entries) {
                    if (entry.isEmptyFolder) {
                        folderNode.addChild(new ContentNode(entry.emptyFolderTitle, "folder", rootMaterialNode.getId(), null, entry.level, null, null, entry.parentId, entry.hidden, null, 0L, 0L, entry.node.getPath(), null, null, MaterialViewRestrict.NONE, false));
                    } else {
                        folderNode.addChild(createContentNode(entry.node, entry.level, processHtml, includeHidden));
                    }
                }
                return folderNode;
            }
            case MATERIAL: {
                // The parser and transformer are only needed when HTML is processed
                DOMParser htmlParser = null;
                Transformer serializer = null;
                if (processHtml) {
                    htmlParser = new DOMParser(new HTMLConfiguration());
                    htmlParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
                    serializer = TransformerFactory.newInstance().newTransformer();
                    serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
                    serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
                    serializer.setOutputProperty(OutputKeys.METHOD, "xml");
                    serializer.setOutputProperty(OutputKeys.INDENT, "no");
                }

                WorkspaceMaterial workspaceMaterial = (WorkspaceMaterial) rootMaterialNode;
                Material material = materialController.findMaterialById(workspaceMaterial.getMaterialId());
                Long currentRevision = material instanceof HtmlMaterial ? htmlMaterialController.lastHtmlMaterialRevision((HtmlMaterial) material) : 0L;
                Long publishedRevision = material instanceof HtmlMaterial ? ((HtmlMaterial) material).getRevisionNumber() : 0L;

                // Producer names are HTML-escaped and their commas encoded, since
                // the names are later joined with a comma
                List<MaterialProducer> producers = materialController.listMaterialProducers(material);
                List<String> producerNames = null;
                if (producers != null && !producers.isEmpty()) {
                    producerNames = new ArrayList<>();
                    for (MaterialProducer producer : producers) {
                        String escapedName = StringEscapeUtils.escapeHtml4(producer.getName());
                        producerNames.add(StringUtils.replace(escapedName, ",", "&#44;"));
                    }
                }

                boolean materialRestricted = !sessionController.isLoggedIn() && material.getViewRestrict() == MaterialViewRestrict.LOGGED_IN;
                String html;
                if (materialRestricted) {
                    html = String.format("<p class=\"content-view-restricted-message\">%s</p>", localeController.getText(sessionController.getLocale(), "plugin.workspace.materialViewRestricted"));
                } else if (processHtml) {
                    html = getMaterialHtml(material, htmlParser, serializer);
                } else {
                    html = null;
                }
                return new ContentNode(workspaceMaterial.getTitle(), material.getType(), rootMaterialNode.getId(), material.getId(), level, workspaceMaterial.getAssignmentType(), workspaceMaterial.getCorrectAnswers(), workspaceMaterial.getParent().getId(), workspaceMaterial.getHidden(), html, currentRevision, publishedRevision, workspaceMaterial.getPath(), material.getLicense(), StringUtils.join(producerNames, ','), material.getViewRestrict(), materialRestricted);
            }
            default:
                return null;
        }
    } catch (SAXNotRecognizedException | SAXNotSupportedException | TransformerConfigurationException e) {
        throw new WorkspaceMaterialException(e);
    }
}

Also used : Transformer(javax.xml.transform.Transformer) TransformerConfigurationException(javax.xml.transform.TransformerConfigurationException) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) Material(fi.otavanopisto.muikku.plugins.material.model.Material) HtmlMaterial(fi.otavanopisto.muikku.plugins.material.model.HtmlMaterial) WorkspaceMaterial(fi.otavanopisto.muikku.plugins.workspace.model.WorkspaceMaterial) SAXNotRecognizedException(org.xml.sax.SAXNotRecognizedException) MaterialProducer(fi.otavanopisto.muikku.plugins.material.model.MaterialProducer) WorkspaceNode(fi.otavanopisto.muikku.plugins.workspace.model.WorkspaceNode) WorkspaceMaterial(fi.otavanopisto.muikku.plugins.workspace.model.WorkspaceMaterial) WorkspaceFolder(fi.otavanopisto.muikku.plugins.workspace.model.WorkspaceFolder) SAXNotSupportedException(org.xml.sax.SAXNotSupportedException) DOMParser(org.apache.xerces.parsers.DOMParser) HtmlMaterial(fi.otavanopisto.muikku.plugins.material.model.HtmlMaterial)

Example 3 with HTMLConfiguration

Use of org.cyberneko.html.HTMLConfiguration in project muikku by otavanopisto.

From the class DeusNexMachinaController, method postProcessHtml.

/**
 * Post-processes the HTML of an imported material and stores the result
 * if anything changed.
 * <p>
 * Two adjustments are made: the "http:" scheme is stripped from iframe
 * sources pointing at www.youtube.com, and iframes marked as embedded
 * documents receive data attributes identifying the referenced
 * material. When a header element directly precedes such an iframe and
 * has non-blank text, that text becomes the embedded material's title
 * and the header is removed from this document.
 *
 * @param material the HTML material to post-process
 */
private void postProcessHtml(HtmlMaterial material) throws ParserConfigurationException, SAXException, IOException, XPathExpressionException, TransformerException {
    StringReader source = new StringReader(material.getHtml());
    DOMParser htmlParser = new DOMParser(new HTMLConfiguration());
    htmlParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    htmlParser.parse(new InputSource(source));
    org.w3c.dom.Document dom = htmlParser.getDocument();
    boolean changed = false;

    // Embedded YouTube clips; strip protocol
    for (Element iframe : DeusNexXmlUtils.getElementsByXPath(dom.getDocumentElement(), "//iframe")) {
        String src = iframe.getAttribute("src");
        if (src == null || !src.startsWith("http://www.youtube.com/")) {
            continue;
        }
        // dropping the first five characters removes "http:" and leaves "//www.youtube.com/..."
        iframe.setAttribute("src", src.substring(5));
        changed = true;
    }

    // Embedded documents; add data attributes and determine correct material title
    List<Element> embeddedDocuments = DeusNexXmlUtils.getElementsByXPath(dom.getDocumentElement(), "//iframe[@data-type=\"embedded-document\"]");
    if (!embeddedDocuments.isEmpty()) {
        changed = true;
    }
    for (Element embedded : embeddedDocuments) {
        Integer resourceNo = Integer.valueOf(embedded.getAttribute("data-resource-no"));
        WorkspaceMaterial workspaceMaterial = workspaceMaterialController.findWorkspaceMaterialById(getResourceWorkspaceNodeId(resourceNo));
        HtmlMaterial embeddedMaterial = htmlMaterialController.findHtmlMaterialById(workspaceMaterial.getMaterialId());

        // If a header precedes an embedded document, use its text as the embedded
        // document's title and remove it from the parent document altogether
        Node precedingNode = getPreviousSiblingElement(embedded);
        if (isHeader(precedingNode)) {
            String title = StringUtils.trim(precedingNode.getTextContent());
            if (!StringUtils.isBlank(title)) {
                htmlMaterialController.updateHtmlMaterialTitle(embeddedMaterial, title);
                precedingNode.getParentNode().removeChild(precedingNode);
            }
        }

        embedded.setAttribute("data-material-id", String.valueOf(embeddedMaterial.getId()));
        embedded.setAttribute("data-material-type", embeddedMaterial.getType());
        embedded.setAttribute("data-workspace-material-id", String.valueOf(workspaceMaterial.getId()));
    }

    // Nothing to store when the document was left untouched
    if (!changed) {
        return;
    }

    // Update to post-processed version
    Transformer serializer = TransformerFactory.newInstance().newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
    serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "xml");
    serializer.setOutputProperty(OutputKeys.INDENT, "no");
    StringWriter serialized = new StringWriter();
    serializer.transform(new DOMSource(dom), new StreamResult(serialized));
    htmlMaterialController.updateHtmlMaterialHtml(material, serialized.toString());
}

Also used : InputSource(org.xml.sax.InputSource) DOMSource(javax.xml.transform.dom.DOMSource) TransformerFactory(javax.xml.transform.TransformerFactory) Transformer(javax.xml.transform.Transformer) StreamResult(javax.xml.transform.stream.StreamResult) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) Element(org.w3c.dom.Element) WorkspaceNode(fi.otavanopisto.muikku.plugins.workspace.model.WorkspaceNode) Node(org.w3c.dom.Node) WorkspaceMaterial(fi.otavanopisto.muikku.plugins.workspace.model.WorkspaceMaterial) StringWriter(java.io.StringWriter) StringReader(java.io.StringReader) DOMParser(org.apache.xerces.parsers.DOMParser) HtmlMaterial(fi.otavanopisto.muikku.plugins.material.model.HtmlMaterial)

Example 4 with HTMLConfiguration

Use of org.cyberneko.html.HTMLConfiguration in project muikku by otavanopisto.

From the class HtmlMaterialCleaner, method cleanMaterial.

/**
 * Runs every cleaner task, in ascending priority order, against the
 * HTML of the material's revision returned by getMaterialRevision.
 * <p>
 * Each time a task reports that it processed the document, the document
 * is serialized and handed to <code>patch</code>. Any exception raised
 * while loading, parsing or cleaning is logged at SEVERE level and not
 * propagated.
 *
 * @param htmlMaterial the material whose HTML is cleaned
 * @param ownerMaterial the workspace material passed on to each task
 */
public void cleanMaterial(HtmlMaterial htmlMaterial, WorkspaceMaterial ownerMaterial) {
    Long maxRevision = getMaterialRevision(htmlMaterial);
    try {
        // Parse the revision's HTML into a DOM document
        String html = htmlMaterialController.getRevisionHtml(htmlMaterial, maxRevision);
        DOMParser htmlParser = new DOMParser(new HTMLConfiguration());
        htmlParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        htmlParser.parse(new InputSource(new StringReader(html)));
        Document document = htmlParser.getDocument();

        // Collect the tasks into a list so they can be ordered by priority
        List<HtmlMaterialCleanerTask> orderedTasks = new ArrayList<HtmlMaterialCleanerTask>();
        for (Iterator<HtmlMaterialCleanerTask> it = analyzerTasks.iterator(); it.hasNext();) {
            orderedTasks.add(it.next());
        }
        Comparator<HtmlMaterialCleanerTask> byPriority = new Comparator<HtmlMaterialCleanerTask>() {

            @Override
            public int compare(HtmlMaterialCleanerTask first, HtmlMaterialCleanerTask second) {
                return first.getPriority().compareTo(second.getPriority());
            }
        };
        Collections.sort(orderedTasks, byPriority);

        // Patch the material after every task that reports a change
        for (HtmlMaterialCleanerTask task : orderedTasks) {
            if (!task.process(document, ownerMaterial)) {
                continue;
            }
            String serialized = DeusNexXmlUtils.serializeElement(document.getDocumentElement(), true, false, "html");
            patch(htmlMaterial, serialized);
        }
    } catch (Exception e) {
        logger.log(Level.SEVERE, "Failed to clean material " + htmlMaterial.getId(), e);
    }
}

Also used : InputSource(org.xml.sax.InputSource) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) ArrayList(java.util.ArrayList) Document(org.w3c.dom.Document) CoOpsUsageException(fi.foyt.coops.CoOpsUsageException) CoOpsInternalErrorException(fi.foyt.coops.CoOpsInternalErrorException) StringReader(java.io.StringReader) DOMParser(org.apache.xerces.parsers.DOMParser)

Example 5 with HTMLConfiguration

Use of org.cyberneko.html.HTMLConfiguration in project zm-mailbox by Zimbra.

From the class HtmlDetag, method detag.

/**
 * Runs the given HTML through a NekoHTML parser whose filter chain is
 * this object followed by an UnescapeWriter, and returns the contents
 * of the StringWriter handed to that writer.
 * <p>
 * The parser is configured to keep element names as written ("match"),
 * and with tag balancing and namespace processing switched off.
 * Parsing is best effort: if it fails, the failure is logged as a
 * warning and whatever output was produced up to that point is
 * returned.
 *
 * @param html the HTML text to process
 * @return the text collected by the output writer, possibly partial if
 *         parsing failed
 */
public String detag(String html) {
    StringWriter out = new StringWriter();
    UnescapeWriter writer = new UnescapeWriter(out, "utf-8");
    XMLDocumentFilter[] filters = { this, writer };
    XMLParserConfiguration parser = new HTMLConfiguration();
    parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
    parser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
    parser.setFeature("http://cyberneko.org/html/features/balance-tags", false);
    parser.setFeature("http://xml.org/sax/features/namespaces", false);
    XMLInputSource source = new XMLInputSource(null, null, null, new StringReader(html), null);
    try {
        parser.parse(source);
    } catch (Exception x) {
        // Deliberately not rethrown, but the cause is logged so the
        // failure can be diagnosed.
        ZimbraLog.misc.warn("Can't detag HTML [" + html + "]", x);
    }
    // return whatever has been done
    return out.toString();
}

Also used : StringWriter(java.io.StringWriter) XMLInputSource(org.apache.xerces.xni.parser.XMLInputSource) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) StringReader(java.io.StringReader) XMLDocumentFilter(org.apache.xerces.xni.parser.XMLDocumentFilter) XMLParserConfiguration(org.apache.xerces.xni.parser.XMLParserConfiguration) IOException(java.io.IOException) XNIException(org.apache.xerces.xni.XNIException)

Aggregations

HTMLConfiguration (org.cyberneko.html.HTMLConfiguration)9 StringReader (java.io.StringReader)5 DOMParser (org.apache.xerces.parsers.DOMParser)4 XMLDocumentFilter (org.apache.xerces.xni.parser.XMLDocumentFilter)4 XMLParserConfiguration (org.apache.xerces.xni.parser.XMLParserConfiguration)4 InputSource (org.xml.sax.InputSource)3 HtmlMaterial (fi.otavanopisto.muikku.plugins.material.model.HtmlMaterial)2 WorkspaceMaterial (fi.otavanopisto.muikku.plugins.workspace.model.WorkspaceMaterial)2 WorkspaceNode (fi.otavanopisto.muikku.plugins.workspace.model.WorkspaceNode)2 ResourceInstantiationException (gate.creole.ResourceInstantiationException)2 IOException (java.io.IOException)2 StringWriter (java.io.StringWriter)2 Transformer (javax.xml.transform.Transformer)2 XMLInputSource (org.apache.xerces.xni.parser.XMLInputSource)2 Document (org.w3c.dom.Document)2 Node (org.w3c.dom.Node)2 CoOpsInternalErrorException (fi.foyt.coops.CoOpsInternalErrorException)1 CoOpsUsageException (fi.foyt.coops.CoOpsUsageException)1 Material (fi.otavanopisto.muikku.plugins.material.model.Material)1 MaterialProducer (fi.otavanopisto.muikku.plugins.material.model.MaterialProducer)1