Search in sources :

Example 1 with TextBlock

Use of de.l3s.boilerpipe.document.TextBlock in project tika by apache.

From the class BoilerpipeContentHandler, method endDocument.

/**
 * Finishes the document by running the boilerpipe extractor over the text document
 * returned by {@code toTextDocument()} and then emitting the surviving content to the
 * delegate.
 *
 * <p>When {@code includeMarkup} is set, the recorded elements are replayed and only the
 * character runs belonging to content blocks are forwarded. Otherwise each content block
 * is emitted as a single XHTML {@code <p>} element followed by a newline. In both cases
 * the {@code body} and {@code html} elements, the default prefix mapping and the delegate
 * document are closed at the end.
 *
 * @throws SAXException if boilerpipe processing fails (the original exception is kept as
 *         the cause), or if it is raised while forwarding events
 */
@Override
public void endDocument() throws SAXException {
    super.endDocument();
    td = toTextDocument();
    try {
        extractor.process(td);
    } catch (BoilerpipeProcessingException e) {
        throw new SAXException(e);
    }
    Attributes emptyAttrs = new AttributesImpl();
    // Either replay the recorded elements, emitting only the character runs that passed
    // the boilerpipe filters (markup mode), or emit each content block as a plain paragraph.
    if (includeMarkup) {
        // Union of the text-element bits of every block classified as content.
        BitSet validCharacterRuns = new BitSet();
        for (TextBlock block : td.getTextBlocks()) {
            if (block.isContent()) {
                BitSet bs = block.getContainedTextElements();
                if (bs != null) {
                    validCharacterRuns.or(bs);
                }
            }
        }
        // Now have bits set for all valid character runs. Replay our recorded elements,
        // but only emit character runs flagged as valid.
        int curCharsIndex = headerCharOffset;
        for (RecordedElement element : elements) {
            switch (element.getElementType()) {
                case START:
                    delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
                    // Intentional fall through: a START element's recorded characters
                    // are replayed exactly like those of a CONTINUE element.
                case CONTINUE:
                    for (char[] chars : element.getCharacters()) {
                        // The index is advanced before the lookup; presumably this mirrors
                        // how boilerpipe numbers its text elements - TODO confirm.
                        curCharsIndex++;
                        if (validCharacterRuns.get(curCharsIndex)) {
                            delegate.characters(chars, 0, chars.length);
                            // https://issues.apache.org/jira/browse/TIKA-961
                            // Only add whitespace for certain elements, and only when the
                            // run does not already end in whitespace. The length check keeps
                            // an empty run from indexing chars[-1].
                            if (chars.length > 0
                                    && !Character.isWhitespace(chars[chars.length - 1])
                                    && XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
                                delegate.ignorableWhitespace(NL, 0, NL.length);
                            }
                        }
                    }
                    break;
                case END:
                    delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
                    break;
                default:
                    throw new RuntimeException("Unhandled element type: " + element.getElementType());
            }
        }
    } else {
        // No markup requested: one <p> per content block, each followed by a newline.
        for (TextBlock block : td.getTextBlocks()) {
            if (block.isContent()) {
                delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
                char[] chars = block.getText().toCharArray();
                delegate.characters(chars, 0, chars.length);
                delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
                delegate.ignorableWhitespace(NL, 0, NL.length);
            }
        }
    }
    delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
    delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
    // We defer ending any prefix mapping until here, which is why we don't pass this
    // through to the delegate in an overridden method.
    delegate.endPrefixMapping("");
    delegate.endDocument();
}
Also used : AttributesImpl(org.xml.sax.helpers.AttributesImpl) Attributes(org.xml.sax.Attributes) BitSet(java.util.BitSet) TextBlock(de.l3s.boilerpipe.document.TextBlock) BoilerpipeProcessingException(de.l3s.boilerpipe.BoilerpipeProcessingException) SAXException(org.xml.sax.SAXException)

Aggregations

BoilerpipeProcessingException (de.l3s.boilerpipe.BoilerpipeProcessingException)1 TextBlock (de.l3s.boilerpipe.document.TextBlock)1 BitSet (java.util.BitSet)1 Attributes (org.xml.sax.Attributes)1 SAXException (org.xml.sax.SAXException)1 AttributesImpl (org.xml.sax.helpers.AttributesImpl)1