use of org.apache.tika.sax.XHTMLContentHandler in project jackrabbit-oak by apache.
the class HtmlRepresentation method render.
/**
 * Writes the given property to the response as an XHTML document.
 * A single-valued property is emitted as one {@code <p>} element; a
 * multi-valued property is emitted as an {@code <ol>} with one
 * {@code <li>} per value.
 *
 * @param property the property whose name and value(s) are rendered
 * @param response the servlet response handed to {@code startResponse}
 * @throws IOException if a {@link SAXException} is raised while emitting
 *         the document (the SAX exception is wrapped as the cause)
 */
public void render(PropertyState property, HttpServletResponse response) throws IOException {
    try {
        XHTMLContentHandler document = startResponse(response, property.getName());
        document.startDocument();
        if (!property.isArray()) {
            // Single value: one paragraph is enough.
            document.element("p", property.getValue(STRING));
        } else {
            // Multiple values: ordered list, one item per value.
            document.startElement("ol");
            for (String item : property.getValue(STRINGS)) {
                document.element("li", item);
            }
            document.endElement("ol");
        }
        document.endDocument();
    } catch (SAXException saxFailure) {
        throw new IOException(saxFailure);
    }
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class ExternalParser method parse.
/**
 * Runs the configured external command on the given document stream and
 * reports the result as a simple XHTML document to the given SAX content
 * handler. Metadata is only extracted if
 * {@link #setMetadataExtractionPatterns(Map)} has been called to set
 * patterns.
 * <p>
 * Any temporary resources created while parsing are disposed before this
 * method returns, whether parsing succeeds or fails.
 *
 * @param stream   the document stream
 * @param handler  the SAX handler that receives the XHTML output
 * @param metadata the document metadata
 * @param context  the parse context (not used directly by this method)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
    final TemporaryResources resources = new TemporaryResources();
    try {
        // Wrap the raw stream so the delegate can use the shared temporary resources.
        final TikaInputStream tikaStream = TikaInputStream.get(stream, resources);
        parse(tikaStream, xhtmlHandler, metadata, resources);
    } finally {
        resources.dispose();
    }
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class EmptyParser method parse.
/**
 * Produces an empty XHTML document: the document is started and
 * immediately ended. The input stream is never read.
 *
 * @param stream   the document stream (ignored)
 * @param handler  the SAX handler that receives the empty document
 * @param metadata the metadata passed to the XHTML handler
 * @param context  the parse context (ignored)
 * @throws SAXException if the handler fails to process the document events
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws SAXException {
    final XHTMLContentHandler emptyDocument = new XHTMLContentHandler(handler, metadata);
    emptyDocument.startDocument();
    emptyDocument.endDocument();
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class RTFParser method parse.
/**
 * Parses an RTF document, sending its text to the given handler as XHTML.
 * The content type in {@code metadata} is set to {@code application/rtf}
 * before extraction starts.
 * <p>
 * The input is read through a {@link TaggedInputStream} so that an
 * {@link IOException} raised by the stream itself is rethrown unchanged,
 * while any other {@link IOException} raised during extraction is
 * reported as a {@link TikaException}.
 *
 * @param stream   the RTF document stream
 * @param handler  the SAX handler that receives the XHTML output
 * @param metadata the document metadata; its content type is overwritten
 * @param context  the parse context, passed to the embedded-object handler
 * @throws IOException   if reading from {@code stream} fails
 * @throws SAXException  if the handler fails to process an event
 * @throws TikaException if extraction fails for a reason other than a
 *         read error on {@code stream}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    metadata.set(Metadata.CONTENT_TYPE, "application/rtf");
    TaggedInputStream tagged = new TaggedInputStream(stream);
    try {
        XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
        RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context, getMemoryLimitInKb());
        final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
        // Read through the tagged wrapper, not the raw stream: only exceptions
        // thrown via the wrapper are tagged, and throwIfCauseOf() below relies
        // on that tag to tell stream failures apart from parsing failures.
        ert.extract(tagged);
    } catch (IOException e) {
        // Rethrows e as-is when it originated from the tagged stream.
        tagged.throwIfCauseOf(e);
        throw new TikaException("Error parsing an RTF document", e);
    }
}
use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.
the class Latin1StringsParser method doParse.
/**
* Does a best effort to extract Latin1 strings encoded with ISO-8859-1,
* UTF-8 or UTF-16. Valid chars are saved into the output buffer and the
* temporary buffer position is incremented. When an invalid char is read,
* the difference of the temporary and current buffer position is checked.
* If it is greater than or equal to the minimum string size, a line feed is
* appended and the current buffer position is updated to the temp position.
* If it is not, the temp position is reset to the current position.
* <p>
* The method works on instance state declared outside this block: the
* {@code input} and {@code output} byte buffers, the positions
* {@code inPos}, {@code inSize}, {@code tmpPos} and {@code outPos}, the
* {@code minSize} threshold and the {@code xhtml} handler. Whatever is left
* in {@code output[0..outPos)} at the end is emitted as a single
* {@code characters} event, decoded as windows-1252.
*
* @param stream
* the input stream.
* @param handler
* the output content handler
* @param metadata
* the metadata of the file
* @param context
* the parsing context
* @throws IOException
* if an io error occurs
* @throws SAXException
* if a sax error occurs
*/
private void doParse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException {
tmpPos = 0;
outPos = 0;
xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
int i = 0;
// Outer loop: one iteration per refill of the input buffer. It stops once
// read() has reported end of stream (-1) or the thread is interrupted.
do {
inSize = 0;
// Fill the input buffer until it holds BUF_SIZE bytes or read() returns
// a non-positive value (0 when no room is left, -1 at end of stream).
while ((i = stream.read(input, inSize, BUF_SIZE - inSize)) > 0) {
inSize += i;
}
inPos = 0;
while (inPos < inSize) {
byte c = input[inPos++];
boolean utf8 = false;
/*
* Test for a possible UTF8 encoded char (lead byte 0xC3)
*/
if (c == (byte) 0xC3) {
// Take the following byte from the buffer, or straight from the stream
// when the lead byte was the last one buffered.
// NOTE(review): a -1 (end of stream) from read() is cast to a byte here
// and is not distinguished from a real 0xFF byte - confirm intended.
byte c_ = inPos < inSize ? input[inPos++] : (byte) stream.read();
/*
* Test if the next byte is in the valid UTF8 range
*/
if (c_ >= (byte) 0x80 && c_ <= (byte) 0xBF) {
utf8 = true;
// 0xC3 0x80..0xBF becomes the single byte 0xC0..0xFF.
output[tmpPos++] = (byte) (c_ + 0x40);
} else {
// Not a two-byte sequence: keep the lead byte and let the following
// byte go through the generic char test below.
output[tmpPos++] = c;
c = c_;
}
if (tmpPos == BUF_SIZE)
flushBuffer();
/*
* Test for a possible UTF8 encoded char (lead byte 0xC2, checked by
* the else-if that follows)
*/
} else if (c == (byte) 0xC2) {
// Same look-ahead as in the 0xC3 branch, including the end-of-stream
// caveat noted there.
byte c_ = inPos < inSize ? input[inPos++] : (byte) stream.read();
/*
* Test if the next byte is in the valid UTF8 range
*/
if (c_ >= (byte) 0xA0 && c_ <= (byte) 0xBF) {
utf8 = true;
// 0xC2 0xA0..0xBF is stored as the second byte unchanged.
output[tmpPos++] = c_;
} else {
output[tmpPos++] = c;
c = c_;
}
if (tmpPos == BUF_SIZE)
flushBuffer();
}
// Only bytes not already consumed as part of a two-byte sequence are
// classified here; the if/else below is the body of this unbraced if.
if (!utf8)
/*
* Test if the byte is a valid char.
*/
if (isChar(c)) {
output[tmpPos++] = c;
if (tmpPos == BUF_SIZE)
flushBuffer();
} else {
/*
* Test if the byte is an invalid char, marking a string
* end. If it is a zero, test 2 positions before or
* ahead for a valid char, meaning it marks the
* transition between ISO-8859-1 and UTF16 sequences.
* A zero with no valid char 2 positions before or ahead
* is skipped without ending the current string.
*/
if (c != 0 || (inPos >= 3 && isChar(input[inPos - 3])) || (inPos + 1 < inSize && isChar(input[inPos + 1]))) {
if (tmpPos - outPos >= minSize) {
// Candidate is long enough: terminate it with a line feed and commit it.
output[tmpPos++] = 0x0A;
outPos = tmpPos;
if (tmpPos == BUF_SIZE)
flushBuffer();
} else
// Candidate is too short: discard it by rewinding to the last commit.
tmpPos = outPos;
}
}
}
} while (i != -1 && !Thread.currentThread().isInterrupted());
// Commit a trailing candidate string that was not followed by a terminator.
if (tmpPos - outPos >= minSize) {
output[tmpPos++] = 0x0A;
outPos = tmpPos;
}
// Emit whatever committed bytes remain in the output buffer.
xhtml.characters(new String(output, 0, outPos, "windows-1252"));
xhtml.endDocument();
}
Aggregations