Search in sources :

Example 1 with SimpleHtmlSerializer

use of org.htmlcleaner.SimpleHtmlSerializer in project k-9 by k9mail.

the class HtmlSignatureRemover method stripSignature.

public static String stripSignature(String content) {
    Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
    if (dashSignatureHtml.find()) {
        Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
        Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
        List<Integer> start = new ArrayList<>();
        List<Integer> end = new ArrayList<>();
        while (blockquoteStart.find()) {
            start.add(blockquoteStart.start());
        }
        while (blockquoteEnd.find()) {
            end.add(blockquoteEnd.start());
        }
        if (start.size() != end.size()) {
            Timber.d("There are %d <blockquote> tags, but %d </blockquote> tags. Refusing to strip.", start.size(), end.size());
        } else if (start.size() > 0) {
            // Ignore quoted signatures in blockquotes.
            dashSignatureHtml.region(0, start.get(0));
            if (dashSignatureHtml.find()) {
                // before first <blockquote>.
                content = content.substring(0, dashSignatureHtml.start());
            } else {
                for (int i = 0; i < start.size() - 1; i++) {
                    // within blockquotes.
                    if (end.get(i) < start.get(i + 1)) {
                        dashSignatureHtml.region(end.get(i), start.get(i + 1));
                        if (dashSignatureHtml.find()) {
                            content = content.substring(0, dashSignatureHtml.start());
                            break;
                        }
                    }
                }
                if (end.get(end.size() - 1) < content.length()) {
                    // after last </blockquote>.
                    dashSignatureHtml.region(end.get(end.size() - 1), content.length());
                    if (dashSignatureHtml.find()) {
                        content = content.substring(0, dashSignatureHtml.start());
                    }
                }
            }
        } else {
            // No blockquotes found.
            content = content.substring(0, dashSignatureHtml.start());
        }
    }
    // Fix the stripping off of closing tags if a signature was stripped,
    // as well as clean up the HTML of the quoted message.
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties properties = cleaner.getProperties();
    // see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
    properties.setNamespacesAware(false);
    properties.setAdvancedXmlEscape(false);
    properties.setOmitXmlDeclaration(true);
    properties.setOmitDoctypeDeclaration(false);
    properties.setTranslateSpecialEntities(false);
    properties.setRecognizeUnicodeChars(false);
    TagNode node = cleaner.clean(content);
    SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
    content = htmlSerialized.getAsString(node, "UTF8");
    return content;
}
Also used : SimpleHtmlSerializer(org.htmlcleaner.SimpleHtmlSerializer) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) CleanerProperties(org.htmlcleaner.CleanerProperties) HtmlCleaner(org.htmlcleaner.HtmlCleaner) TagNode(org.htmlcleaner.TagNode)

Aggregations

ArrayList (java.util.ArrayList)1 Matcher (java.util.regex.Matcher)1 CleanerProperties (org.htmlcleaner.CleanerProperties)1 HtmlCleaner (org.htmlcleaner.HtmlCleaner)1 SimpleHtmlSerializer (org.htmlcleaner.SimpleHtmlSerializer)1 TagNode (org.htmlcleaner.TagNode)1