
Example 6 with DOMParser

Use of org.cyberneko.html.parsers.DOMParser in the fess-crawler project by codelibs.

From the class XpathTransformer, the method storeData:

@Override
protected void storeData(final ResponseData responseData, final ResultData resultData) {
    final DOMParser parser = getDomParser();
    try (final InputStream in = responseData.getResponseBody()) {
        final InputSource is = new InputSource(in);
        if (responseData.getCharSet() != null) {
            is.setEncoding(responseData.getCharSet());
        }
        parser.parse(is);
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
    }
    final Document document = parser.getDocument();
    final StringBuilder buf = new StringBuilder(1000);
    buf.append(getResultDataHeader());
    for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
        final String path = entry.getValue();
        try {
            final XObject xObj = getXPathAPI().eval(document, path);
            final int type = xObj.getType();
            switch(type) {
                case XObject.CLASS_BOOLEAN:
                    final boolean b = xObj.bool();
                    buf.append(getResultDataBody(entry.getKey(), Boolean.toString(b)));
                    break;
                case XObject.CLASS_NUMBER:
                    final double d = xObj.num();
                    buf.append(getResultDataBody(entry.getKey(), Double.toString(d)));
                    break;
                case XObject.CLASS_STRING:
                    final String str = xObj.str();
                    buf.append(getResultDataBody(entry.getKey(), str.trim()));
                    break;
                case XObject.CLASS_NODESET:
                    final NodeList nodeList = xObj.nodelist();
                    final List<String> strList = new ArrayList<>();
                    for (int i = 0; i < nodeList.getLength(); i++) {
                        final Node node = nodeList.item(i);
                        strList.add(node.getTextContent());
                    }
                    buf.append(getResultDataBody(entry.getKey(), strList));
                    break;
                case XObject.CLASS_RTREEFRAG:
                    final int rtf = xObj.rtf();
                    buf.append(getResultDataBody(entry.getKey(), Integer.toString(rtf)));
                    break;
                case XObject.CLASS_NULL:
                case XObject.CLASS_UNKNOWN:
                case XObject.CLASS_UNRESOLVEDVARIABLE:
                default:
                    Object obj = xObj.object();
                    if (obj == null) {
                        obj = "";
                    }
                    buf.append(getResultDataBody(entry.getKey(), obj.toString()));
                    break;
            }
        } catch (final TransformerException e) {
            logger.warn("Could not parse a value of " + entry.getKey() + ":" + entry.getValue());
        }
    }
    buf.append(getAdditionalData(responseData, document));
    buf.append(getResultDataFooter());
    final String data = buf.toString().trim();
    try {
        resultData.setData(data.getBytes(charsetName));
    } catch (final UnsupportedEncodingException e) {
        if (logger.isInfoEnabled()) {
            logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
        }
        charsetName = Constants.UTF_8_CHARSET.name();
        resultData.setData(data.getBytes(Constants.UTF_8_CHARSET));
    }
    resultData.setEncoding(charsetName);
}
Also used : InputSource(org.xml.sax.InputSource) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) InputStream(java.io.InputStream) NodeList(org.w3c.dom.NodeList) Node(org.w3c.dom.Node) ArrayList(java.util.ArrayList) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Document(org.w3c.dom.Document) TransformerException(javax.xml.transform.TransformerException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) XObject(org.apache.xpath.objects.XObject) DOMParser(org.cyberneko.html.parsers.DOMParser) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)
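
The getDomParser() call above is a helper defined elsewhere in fess-crawler and is not shown in this listing. A minimal sketch of such a factory, assuming the standard NekoHTML property names and a (message, cause) constructor on CrawlerSystemException; the chosen settings are illustrative, not the project's actual configuration:

// Hypothetical getDomParser()-style factory; the settings below are assumptions.
protected DOMParser getDomParser() {
    final DOMParser parser = new DOMParser();
    try {
        // Report element and attribute names in lower case so XPath expressions stay predictable.
        parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
        // Fall back to UTF-8 when the response declares no charset.
        parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "UTF-8");
    } catch (final SAXException e) {
        throw new CrawlerSystemException("Invalid HTML parser configuration.", e);
    }
    return parser;
}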

Example 7 with DOMParser

Use of org.cyberneko.html.parsers.DOMParser in the fess-crawler project by codelibs.

From the class HtmlXpathExtractor, the method getText:

/*
     * (non-Javadoc)
     *
     * @see org.codelibs.fess.crawler.extractor.Extractor#getText(java.io.InputStream,
     * java.util.Map)
     */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    try {
        final BufferedInputStream bis = new BufferedInputStream(in);
        final String enc = getEncoding(bis);
        final DOMParser parser = getDomParser();
        final InputSource inputSource = new InputSource(bis);
        inputSource.setEncoding(enc);
        parser.parse(inputSource);
        final Document document = parser.getDocument();
        final StringBuilder buf = new StringBuilder(255);
        final NodeList nodeList = getXPathAPI().selectNodeList(document, targetNodePath);
        for (int i = 0; i < nodeList.getLength(); i++) {
            final Node node = nodeList.item(i);
            buf.append(node.getTextContent()).append(' ');
        }
        return new ExtractData(buf.toString().replaceAll("\\s+", " ").trim());
    } catch (final Exception e) {
        throw new ExtractException(e);
    }
}
Also used : InputSource(org.xml.sax.InputSource) ExtractException(org.codelibs.fess.crawler.exception.ExtractException) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) NodeList(org.w3c.dom.NodeList) Node(org.w3c.dom.Node) Document(org.w3c.dom.Document) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) BufferedInputStream(java.io.BufferedInputStream) DOMParser(org.cyberneko.html.parsers.DOMParser)
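
Both getXPathAPI() and targetNodePath come from the surrounding HtmlXpathExtractor class and are not shown here. As a rough, self-contained sketch of the same extraction pattern, assuming Xalan's CachedXPathAPI is on the classpath (the class name HtmlTextDump and the //text() expression are only examples, not the project's defaults):

// Illustrative standalone sketch, not the fess-crawler implementation.
import java.io.StringReader;
import org.apache.xpath.CachedXPathAPI;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

public final class HtmlTextDump {
    public static String dump(final String html) throws Exception {
        final DOMParser parser = new DOMParser();
        parser.parse(new InputSource(new StringReader(html)));
        final Document document = parser.getDocument();
        // CachedXPathAPI reuses its internal model, which is cheaper than the static
        // XPathAPI when several expressions are evaluated against the same document.
        final CachedXPathAPI xpathAPI = new CachedXPathAPI();
        final NodeList nodeList = xpathAPI.selectNodeList(document, "//text()");
        final StringBuilder buf = new StringBuilder();
        for (int i = 0; i < nodeList.getLength(); i++) {
            buf.append(nodeList.item(i).getTextContent()).append(' ');
        }
        return buf.toString().replaceAll("\\s+", " ").trim();
    }
}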

Example 8 with DOMParser

Use of org.cyberneko.html.parsers.DOMParser in the zm-mailbox project by Zimbra.

From the class QuotedTextUtil, the method getOriginalHtmlContent:

/**
 * Using the DOM structure of the message content, traverse node by node and
 * if we find a node that is recognized as a separator, remove all
 * subsequent elements
 *
 * @param text the message content
 * @return original content if the quoted content was found otherwise the
 *         complete message content
 */
private String getOriginalHtmlContent(String text) {
    ArrayList<Node> nodeList = new ArrayList<Node>();
    Node previousNode = null, sepNode = null;
    LineType previousType = null;
    boolean done = false;
    DOMParser parser = new DOMParser();
    Document document;
    Node htmlNode = null;
    try {
        parser.parse(new InputSource(new StringReader(text)));
        document = parser.getDocument();
        htmlNode = document.getFirstChild();
        flatten(htmlNode, nodeList);
        for (int i = 0; i < nodeList.size(); i++) {
            Node currentNode = nodeList.get(i);
            if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
                currentNode.normalize();
            }
            String nodeName = currentNode.getNodeName() != null ? currentNode.getNodeName() : "";
            String nodeValue = currentNode.getNodeValue() != null ? currentNode.getNodeValue() : "";
            LineType type = checkNode(currentNode);
            /*
                 * Check for a multi-element "wrote:" attribution (usually a
                 * combo of #text and A nodes), for example:
                 * 
                 * On Feb 28, 2014, at 3:42 PM, Joe Smith &lt;<a
                 * href="mailto:jsmith@zimbra.com"
                 * target="_blank">jsmith@zimbra.com</a>&gt; wrote:
                 * 
                 * If the current node is a #text with a date or "On ...", find
                 * #text nodes within the next ten nodes, concatenate them, and
                 * check the result.
                 */
            if (type == LineType.UNKNOWN && nodeName.equals("#text") && (MATCHER_ORIG_DATE.reset(nodeValue).matches() || MATCHER_ORIG_INTRO.reset(nodeValue).matches())) {
                String value = nodeValue;
                // Guard against running past the end of the flattened node list.
                for (int j = 1; j < 10 && i + j < nodeList.size(); j++) {
                    Node tempNode = nodeList.get(i + j);
                    if (tempNode != null && tempNode.getNodeName() != null && tempNode.getNodeName().equals("#text")) {
                        value += tempNode.getNodeValue();
                        if ("/:$/".matches(value)) {
                            type = getLineType(value.trim());
                            if (type == LineType.SEP_STRONG) {
                                i = i + j;
                                break;
                            }
                        }
                    }
                }
            }
            if (type != null) {
                // definite separator
                if (type == LineType.SEP_STRONG || type == LineType.WROTE_STRONG) {
                    sepNode = currentNode;
                    done = true;
                    break;
                }
                // some sort of line followed by a header
                if (type == LineType.HEADER && previousType == LineType.LINE) {
                    sepNode = previousNode;
                    done = true;
                    break;
                }
                previousNode = currentNode;
                previousType = type;
            }
        }
        if (sepNode != null) {
            prune(sepNode, true);
        }
        if (done) {
            String originalText = getHtml(document);
            return (originalText == null || originalText.isEmpty()) ? text : originalText;
        }
    } catch (SAXException | IOException e) {
        ZimbraLog.soap.warn("Exception while removing quoted text from html message", e);
    }
    return text;
}
Also used : InputSource(org.xml.sax.InputSource) Node(org.w3c.dom.Node) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Document(org.w3c.dom.Document) SAXException(org.xml.sax.SAXException) StringReader(java.io.StringReader) DOMParser(org.cyberneko.html.parsers.DOMParser)
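
flatten(htmlNode, nodeList), checkNode(...), getLineType(...), prune(...) and getHtml(...) are private helpers of QuotedTextUtil that this listing omits. The simplest of them, flatten, just turns the DOM subtree into a pre-order node list; a sketch of what such a traversal could look like (a reconstruction for illustration, not Zimbra's actual code):

// Hypothetical reconstruction of the flatten helper: pre-order DOM traversal.
private void flatten(final Node node, final List<Node> out) {
    if (node == null) {
        return;
    }
    out.add(node);
    for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
        flatten(child, out);
    }
}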

Example 9 with DOMParser

Use of org.cyberneko.html.parsers.DOMParser in the openolat project by klemens.

From the class QuoteAndTagFilter, the method filter:

/**
 * @see org.olat.core.util.filter.Filter#filter(java.lang.String)
 */
@Override
public String filter(String original) {
    try {
        DOMParser parser = new DOMParser();
        parser.parse(new InputSource(new StringReader(original)));
        Document document = parser.getDocument();
        StringBuilder sb = new StringBuilder();
        scanNode(document, sb);
        return sb.toString();
    } catch (SAXException e) {
        log.error("", e);
        return null;
    } catch (IOException e) {
        log.error("", e);
        return null;
    }
}
Also used : InputSource(org.xml.sax.InputSource) StringReader(java.io.StringReader) DOMParser(org.cyberneko.html.parsers.DOMParser) IOException(java.io.IOException) Document(org.w3c.dom.Document) SAXException(org.xml.sax.SAXException)
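
scanNode(document, sb) is the class's own recursive walk and is not part of this snippet. Judging only by the filter's name, a plausible sketch is a traversal that drops quoted blocks and keeps the remaining text; the blockquote handling below is an assumption about the filter's intent, not the OpenOLAT source:

// Hypothetical scanNode-style walk: skip quoted blocks, collect the remaining text.
private void scanNode(final Node node, final StringBuilder sb) {
    if (node.getNodeType() == Node.ELEMENT_NODE
            && "blockquote".equalsIgnoreCase(node.getNodeName())) {
        return; // drop quoted content entirely
    }
    if (node.getNodeType() == Node.TEXT_NODE) {
        sb.append(node.getNodeValue()).append(' ');
    }
    for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
        scanNode(child, sb);
    }
}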

Example 10 with DOMParser

Use of org.cyberneko.html.parsers.DOMParser in the intellij-community project by JetBrains.

From the class FindJarFix, the method initiateDownload:

private void initiateDownload(String url, String jarName) {
    DOMParser parser = new DOMParser();
    try {
        parser.parse(url);
        final Document doc = parser.getDocument();
        if (doc != null) {
            final NodeList links = doc.getElementsByTagName(LINK_TAG_NAME);
            if (links != null) {
                for (int i = 0; i < links.getLength(); i++) {
                    final Node item = links.item(i);
                    if (item != null) {
                        final NamedNodeMap attributes = item.getAttributes();
                        if (attributes != null) {
                            final Node link = attributes.getNamedItem(LINK_ATTR_NAME);
                            if (link != null) {
                                final String jarUrl = link.getTextContent();
                                if (jarUrl != null && jarUrl.endsWith(jarName)) {
                                    downloadJar(jarUrl, jarName);
                                }
                            }
                        }
                    }
                }
            }
        }
    } catch (SAXException | IOException e) {
        // Ignore failures: the quick fix simply does nothing if the page cannot be fetched or parsed.
    }
}
Also used : NamedNodeMap(org.w3c.dom.NamedNodeMap) NodeList(org.w3c.dom.NodeList) Node(org.w3c.dom.Node) DOMParser(org.cyberneko.html.parsers.DOMParser) IOException(java.io.IOException) Document(org.w3c.dom.Document) SAXException(org.xml.sax.SAXException)
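
LINK_TAG_NAME, LINK_ATTR_NAME and downloadJar(...) come from the surrounding FindJarFix class and are not shown. A self-contained sketch of the same scraping pattern, assuming the values "a" and "href" for the two constants; note that NekoHTML reports element names in upper case unless told otherwise, hence the names/elems property below:

// Standalone sketch; the tag and attribute names are assumptions, not IntelliJ's constants.
import java.io.IOException;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

public final class LinkScraper {
    public static void printLinksEndingWith(final String url, final String suffix)
            throws SAXException, IOException {
        final DOMParser parser = new DOMParser();
        // Lower-case element names so getElementsByTagName("a") matches.
        parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        parser.parse(url);
        final Document doc = parser.getDocument();
        final NodeList links = doc.getElementsByTagName("a");
        for (int i = 0; i < links.getLength(); i++) {
            final NamedNodeMap attributes = links.item(i).getAttributes();
            final Node href = attributes == null ? null : attributes.getNamedItem("href");
            if (href != null && href.getTextContent() != null
                    && href.getTextContent().endsWith(suffix)) {
                System.out.println(href.getTextContent());
            }
        }
    }
}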

Aggregations

DOMParser (org.cyberneko.html.parsers.DOMParser): 12
Document (org.w3c.dom.Document): 10
IOException (java.io.IOException): 8
InputSource (org.xml.sax.InputSource): 8
SAXException (org.xml.sax.SAXException): 7
Node (org.w3c.dom.Node): 6
NodeList (org.w3c.dom.NodeList): 5
StringReader (java.io.StringReader): 4
ArrayList (java.util.ArrayList): 3
CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException): 3
BufferedInputStream (java.io.BufferedInputStream): 2
InputStream (java.io.InputStream): 2
UnsupportedEncodingException (java.io.UnsupportedEncodingException): 2
URL (java.net.URL): 2
LinkedHashMap (java.util.LinkedHashMap): 2
Map (java.util.Map): 2
TransformerException (javax.xml.transform.TransformerException): 2
CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException): 2
NamedNodeMap (org.w3c.dom.NamedNodeMap): 2
ProgressIndicator (com.intellij.openapi.progress.ProgressIndicator): 1