use of org.apache.tika.sax.WriteOutContentHandler in project tika by apache.
the class TXTParserTest method testEBCDIC_CP500.
/**
 * Checks the content type reported for plain-text input in two cases:
 * the EBCDIC test document {@code english.cp500.txt} must be reported as
 * {@code text/plain; charset=IBM500}, and a short ISO-8859-1 snippet must
 * be reported as {@code text/plain; charset=ISO-8859-1}.
 *
 * @throws Exception if parsing fails or the test resource cannot be read
 */
@Test
public void testEBCDIC_CP500() throws Exception {
    Metadata metadata = new Metadata();
    StringWriter writer = new StringWriter();
    // The caller owns the stream handed to parse(), so it is closed here
    // explicitly instead of being left for garbage collection.
    java.io.InputStream stream = TXTParserTest.class.getResourceAsStream("/test-documents/english.cp500.txt");
    try {
        parser.parse(stream, new WriteOutContentHandler(writer), metadata, new ParseContext());
    } finally {
        // Guard against a missing resource so that close() does not mask
        // the failure raised by parse().
        if (stream != null) {
            stream.close();
        }
    }
    assertEquals("text/plain; charset=IBM500", metadata.get(Metadata.CONTENT_TYPE));

    // Additional check that it isn't too eager on short blocks of text
    metadata = new Metadata();
    writer = new StringWriter();
    // A ByteArrayInputStream holds no external resources, so it needs no close().
    parser.parse(new ByteArrayInputStream("<html><body>hello world</body></html>".getBytes(ISO_8859_1)), new WriteOutContentHandler(writer), metadata, new ParseContext());
    assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
}
use of org.apache.tika.sax.WriteOutContentHandler in project tika by apache.
the class RTFParserTest method getResult.
/**
 * Parses the test document with the given file name (resolved under
 * {@code /test-documents/}) and returns the text written to the content
 * handler together with the metadata populated during the parse.
 *
 * @param filename name of the test document, relative to {@code /test-documents/}
 * @return the extracted text and metadata wrapped in a {@code Result}
 * @throws Exception if the file cannot be opened or parsing fails
 */
private Result getResult(String filename) throws Exception {
    File file = getResourceAsFile("/test-documents/" + filename);
    Metadata metadata = new Metadata();
    StringWriter writer = new StringWriter();
    // The stream is opened here, so it must also be closed here; otherwise
    // every call leaks an open file handle.
    FileInputStream stream = new FileInputStream(file);
    try {
        tika.getParser().parse(stream, new WriteOutContentHandler(writer), metadata, new ParseContext());
    } finally {
        stream.close();
    }
    return new Result(writer.toString(), metadata);
}
use of org.apache.tika.sax.WriteOutContentHandler in project tika by apache.
the class Tika method parseToString.
/**
 * Parses the given document and returns its extracted text, capped at a
 * caller-supplied length. The input stream is always closed by this
 * method, whether parsing succeeds or fails.
 * <p>
 * To keep memory use predictable, at most {@code maxLength} characters
 * of extracted text are collected; reaching that limit is treated as
 * normal completion and the text gathered so far is returned.
 * <p>
 * <strong>NOTE:</strong> Unlike most other Tika methods that take an
 * {@link InputStream}, this method closes the given stream for you as a
 * convenience. With other methods you remain responsible for closing
 * the stream or a wrapper instance returned by Tika.
 *
 * @param stream the document to be parsed; closed before this method returns
 * @param metadata document metadata
 * @param maxLength maximum length of the returned string
 * @return extracted text content
 * @throws IOException if the document can not be read
 * @throws TikaException if the document can not be parsed
 */
public String parseToString(InputStream stream, Metadata metadata, int maxLength) throws IOException, TikaException {
    final WriteOutContentHandler limitedOutput = new WriteOutContentHandler(maxLength);
    try {
        final ParseContext parseContext = new ParseContext();
        // Register this instance's parser in the context under Parser.class.
        parseContext.set(Parser.class, parser);
        final BodyContentHandler bodyHandler = new BodyContentHandler(limitedOutput);
        parser.parse(stream, bodyHandler, metadata, parseContext);
    } catch (SAXException e) {
        final boolean stoppedAtLimit = limitedOutput.isWriteLimitReached(e);
        if (!stoppedAtLimit) {
            // This should never happen with BodyContentHandler...
            throw new TikaException("Unexpected SAX processing failure", e);
        }
        // Otherwise the write limit was hit: fall through and return
        // whatever text was collected before the limit.
    } finally {
        stream.close();
    }
    return limitedOutput.toString();
}
use of org.apache.tika.sax.WriteOutContentHandler in project jackrabbit-oak by apache.
the class BinaryTextExtractor method parseStringValue0.
/**
 * Extracts text from the given blob, records the outcome in
 * {@code extractedTextCache}, and returns the extracted text.
 * <p>
 * For blobs longer than {@code SMALL_BINARY} the current thread is
 * temporarily renamed to include the path and size being extracted; the
 * original name is restored before returning.
 *
 * @param v the blob to extract text from
 * @param metadata metadata passed to the parser
 * @param path path of the property, used in log messages and the thread name
 * @return the extracted text, or {@code TEXT_EXTRACTION_ERROR} if parsing
 *         failed for a reason other than reaching the write limit
 */
private String parseStringValue0(Blob v, Metadata metadata, String path) {
    final WriteOutContentHandler textHandler = new WriteOutContentHandler(definition.getMaxExtractLength());
    final long startedAt = System.currentTimeMillis();
    final long blobLength = v.length();
    long bytesRead = 0;
    if (log.isDebugEnabled()) {
        log.debug("Extracting {}, {} bytes, id {}", path, blobLength, v.getContentIdentity());
    }

    // Tag the thread name while a large binary is being processed;
    // a null saved name means the thread was not renamed.
    final Thread worker = Thread.currentThread();
    final String originalThreadName;
    if (blobLength > SMALL_BINARY) {
        originalThreadName = worker.getName();
        worker.setName(originalThreadName + ": Extracting " + path + ", " + blobLength + " bytes");
    } else {
        originalThreadName = null;
    }

    try {
        final CountingInputStream counted = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)));
        try {
            getParser().parse(counted, textHandler, metadata, new ParseContext());
        } finally {
            // Capture the byte count even when parsing fails, then release the stream.
            bytesRead = counted.getCount();
            counted.close();
        }
    } catch (LinkageError e) {
        // Capture and ignore errors caused by extraction libraries
        // not being present. This is equivalent to disabling
        // selected media types in configuration, so we can simply
        // ignore these errors.
    } catch (Throwable t) {
        // The special STOP exception is used for normal termination;
        // anything else is a genuine extraction failure.
        if (!textHandler.isWriteLimitReached(t)) {
            log.debug("[{}] Failed to extract text from a binary property: {}."
                    + " This is a fairly common case, and nothing to"
                    + " worry about. The stack trace is included to"
                    + " help improve the text extraction feature.",
                    getIndexName(), path, t);
            extractedTextCache.put(v, ExtractedText.ERROR);
            return TEXT_EXTRACTION_ERROR;
        }
    } finally {
        if (originalThreadName != null) {
            worker.setName(originalThreadName);
        }
    }

    final String extracted = textHandler.toString();
    // Statistics are only recorded when at least one byte was read.
    if (bytesRead > 0) {
        final long elapsed = System.currentTimeMillis() - startedAt;
        final int textLength = extracted.length();
        recordTextExtractionStats(elapsed, bytesRead, textLength);
        if (log.isDebugEnabled()) {
            log.debug("Extracting {} took {} ms, {} bytes read, {} text size", path, elapsed, bytesRead, textLength);
        }
    }
    extractedTextCache.put(v, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS, extracted));
    return extracted;
}
use of org.apache.tika.sax.WriteOutContentHandler in project jackrabbit-oak by apache.
the class TextExtractor method parseStringValue.
//~--------------------------------------< Tika >
/**
 * Extracts text from the given byte source and updates the extraction
 * counters ({@code timeTaken}, {@code extractedTextSize},
 * {@code extractionCount}, {@code totalSizeRead}, {@code parserErrorCount}).
 *
 * @param byteSource source of the binary content
 * @param metadata metadata passed to the parser
 * @param path path of the property, used in the error log message
 * @return the extracted text when at least one byte was read;
 *         {@code ERROR_TEXT} if parsing failed for a reason other than
 *         reaching the write limit; {@code null} if no bytes were read
 */
private String parseStringValue(ByteSource byteSource, Metadata metadata, String path) {
    final WriteOutContentHandler textHandler = new WriteOutContentHandler(maxExtractedLength);
    final long startedAt = System.currentTimeMillis();
    long bytesRead = 0;
    try {
        final CountingInputStream counted = new CountingInputStream(new LazyInputStream(byteSource));
        try {
            tika.getParser().parse(counted, textHandler, metadata, new ParseContext());
        } finally {
            // Capture the byte count even when parsing fails, then release the stream.
            bytesRead = counted.getCount();
            counted.close();
        }
    } catch (LinkageError e) {
        // Capture and ignore errors caused by extraction libraries
        // not being present. This is equivalent to disabling
        // selected media types in configuration, so we can simply
        // ignore these errors.
    } catch (Throwable t) {
        // The special STOP exception is used for normal termination;
        // anything else is counted and reported as a parser error.
        if (!textHandler.isWriteLimitReached(t)) {
            parserErrorCount.incrementAndGet();
            parserError.debug("Failed to extract text from a binary property: " + path
                    + " This is a fairly common case, and nothing to"
                    + " worry about. The stack trace is included to"
                    + " help improve the text extraction feature.", t);
            return ERROR_TEXT;
        }
    }

    final String extracted = textHandler.toString();
    timeTaken.addAndGet(System.currentTimeMillis() - startedAt);
    // Nothing was read: report no result and leave the size/count statistics alone.
    if (bytesRead <= 0) {
        return null;
    }
    extractedTextSize.addAndGet(extracted.length());
    extractionCount.incrementAndGet();
    totalSizeRead.addAndGet(bytesRead);
    return extracted;
}
Aggregations