Use of de.l3s.boilerpipe.BoilerpipeProcessingException in project tika by apache.
The class BoilerpipeContentHandler, method endDocument.
/**
 * Finishes the document: runs the boilerpipe extractor over the collected text
 * document, forwards the surviving content to the delegate handler, and then
 * closes the {@code body}/{@code html} elements, the default prefix mapping and
 * the delegate document.
 *
 * <p>When {@code includeMarkup} is set, the recorded elements are replayed and
 * only the character runs belonging to content blocks are emitted. Otherwise
 * each content block is emitted as one {@code <p>} element followed by a
 * newline.
 *
 * @throws SAXException if the extractor throws a
 *         {@code BoilerpipeProcessingException} (wrapped as the cause), or if a
 *         call on the superclass or the delegate handler throws
 */
@Override
public void endDocument() throws SAXException {
    super.endDocument();

    td = toTextDocument();
    try {
        extractor.process(td);
    } catch (BoilerpipeProcessingException e) {
        throw new SAXException(e);
    }

    // At this point we either emit one paragraph of plain text per content block
    // (no markup), or we replay the recorded elements and only emit character
    // runs that passed the boilerpipe filters.
    if (includeMarkup) {
        // Union of the text-element indices of every block classified as content.
        BitSet validCharacterRuns = new BitSet();
        for (TextBlock block : td.getTextBlocks()) {
            if (block.isContent()) {
                BitSet bs = block.getContainedTextElements();
                if (bs != null) {
                    validCharacterRuns.or(bs);
                }
            }
        }

        // Now have bits set for all valid character runs. Replay our recorded elements,
        // but only emit character runs flagged as valid.
        int curCharsIndex = headerCharOffset;
        for (RecordedElement element : elements) {
            switch (element.getElementType()) {
                case START:
                    delegate.startElement(element.getUri(), element.getLocalName(),
                            element.getQName(), element.getAttrs());
                    // Intentional fall through: a START element also carries character runs.

                case CONTINUE:
                    // The index is advanced BEFORE the lookup, so the first recorded run
                    // is tested at headerCharOffset + 1.
                    // NOTE(review): presumably this mirrors how boilerpipe numbers its
                    // text elements; we have to follow suit - confirm against boilerpipe.
                    for (char[] chars : element.getCharacters()) {
                        curCharsIndex++;
                        if (validCharacterRuns.get(curCharsIndex)) {
                            delegate.characters(chars, 0, chars.length);

                            // https://issues.apache.org/jira/browse/TIKA-961
                            // Only add whitespace for certain elements, and only when the
                            // run does not already end in whitespace. The length check
                            // keeps an empty run from indexing chars[-1].
                            if (chars.length > 0
                                    && !Character.isWhitespace(chars[chars.length - 1])
                                    && XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
                                delegate.ignorableWhitespace(NL, 0, NL.length);
                            }
                        }
                    }
                    break;

                case END:
                    delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
                    break;

                default:
                    throw new RuntimeException("Unhandled element type: " + element.getElementType());
            }
        }
    } else {
        // Only this branch needs an (empty) attribute set for the synthetic <p> elements.
        Attributes emptyAttrs = new AttributesImpl();
        for (TextBlock block : td.getTextBlocks()) {
            if (block.isContent()) {
                delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
                char[] chars = block.getText().toCharArray();
                delegate.characters(chars, 0, chars.length);
                delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
                delegate.ignorableWhitespace(NL, 0, NL.length);
            }
        }
    }

    delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
    delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");

    // We defer ending any prefix mapping until here, which is why we don't pass this
    // through to the delegate in an overridden method.
    delegate.endPrefixMapping("");
    delegate.endDocument();
}
Aggregations