Use of org.apache.jackrabbit.oak.commons.io.LazyInputStream in project jackrabbit-oak by Apache.
The class TextExtractor, method parseStringValue.
// ~--------------------------------------< Tika >
/**
 * Runs the Tika parser over the given binary content and returns the text
 * collected by the content handler.
 *
 * <p>The content is read through a {@code LazyInputStream} wrapped in a
 * {@code CountingInputStream}, so the number of bytes actually consumed by
 * the parser can be recorded in the extraction statistics.
 *
 * @param byteSource source of the binary content to parse
 * @param metadata   metadata handed to the parser
 * @param path       path of the binary property; used only in log messages
 * @return the text held by the handler; {@code ERROR_TEXT} if parsing failed
 *         for any reason other than the write limit being reached; or
 *         {@code null} if no bytes were read from the stream
 */
private String parseStringValue(ByteSource byteSource, Metadata metadata, String path) {
    WriteOutContentHandler handler = new WriteOutContentHandler(maxExtractedLength);
    long start = System.currentTimeMillis();
    long size = 0;
    // try-with-resources: if close() fails after parse() has already thrown,
    // the close failure is attached as a suppressed exception instead of
    // replacing the parse exception. This keeps the write-limit signal
    // recognisable in the catch block below.
    try (CountingInputStream stream = new CountingInputStream(new LazyInputStream(byteSource))) {
        try {
            tika.getParser().parse(stream, handler, metadata, new ParseContext());
        } finally {
            // Read the byte count before the stream is closed.
            size = stream.getCount();
        }
    } catch (LinkageError e) {
        // Capture errors caused by extraction libraries
        // not being present. This is equivalent to disabling
        // selected media types in configuration, so we can simply
        // ignore these errors.
        log.debug("Failed to extract text from a binary property: {}." + " This often happens when some media types are disabled by configuration." + " The stack trace is included to flag some 'unintended' failures", path, e);
        parserErrorCount.incrementAndGet();
        return ERROR_TEXT;
    } catch (Throwable t) {
        // The special STOP exception is used for normal termination.
        if (!handler.isWriteLimitReached(t)) {
            parserErrorCount.incrementAndGet();
            parserError.debug("Failed to extract text from a binary property: " + path + " This is a fairly common case, and nothing to" + " worry about. The stack trace is included to" + " help improve the text extraction feature.", t);
            return ERROR_TEXT;
        } else {
            parserError.debug("Extracted text size exceeded configured limit({})", maxExtractedLength);
        }
    }
    String result = handler.toString();
    // Elapsed time is accumulated even when nothing was read.
    timeTaken.addAndGet(System.currentTimeMillis() - start);
    if (size > 0) {
        extractedTextSize.addAndGet(result.length());
        extractionCount.incrementAndGet();
        totalSizeRead.addAndGet(size);
        return result;
    }
    // No bytes were consumed from the stream: report "no text" rather than
    // whatever the handler holds.
    return null;
}
Use of org.apache.jackrabbit.oak.commons.io.LazyInputStream in project jackrabbit-oak by Apache.
The class BinaryTextExtractor, method parseStringValue0.
/**
 * Extracts text from the given blob with the configured parser and records
 * the outcome in {@code extractedTextCache}.
 *
 * <p>Blobs longer than {@code SMALL_BINARY} are parsed through
 * {@code extractedTextCache.process(...)}; a {@code TimeoutException} from
 * this method is handled as a timed-out extraction. Smaller blobs are
 * parsed inline on the calling thread.
 *
 * <p>Outcomes:
 * <ul>
 *   <li>{@code LinkageError}: logged at debug, {@code ExtractedText.ERROR}
 *       cached, {@code TEXT_EXTRACTION_ERROR} returned.</li>
 *   <li>{@code TimeoutException}: logged at warn,
 *       {@code ExtractedText.ERROR} stored via both {@code put} and
 *       {@code putTimeout}, {@code TEXT_EXTRACTION_ERROR} returned.</li>
 *   <li>Any other throwable that is not the handler's write-limit signal:
 *       logged at debug, {@code ExtractedText.ERROR} cached,
 *       {@code TEXT_EXTRACTION_ERROR} returned.</li>
 *   <li>Otherwise (including the write limit being reached): the handler's
 *       text is cached as a {@code SUCCESS} result and returned.</li>
 * </ul>
 *
 * @param v        blob whose content is parsed
 * @param metadata metadata handed to the parser
 * @param path     path of the binary property; used only in log messages
 *                 and in the task name
 * @return the extracted text, or {@code TEXT_EXTRACTION_ERROR} on failure
 */
private String parseStringValue0(Blob v, Metadata metadata, String path) {
    WriteOutContentHandler handler = new WriteOutContentHandler(definition.getMaxExtractLength());
    long start = System.currentTimeMillis();
    long bytesRead = 0;
    long length = v.length();
    if (log.isDebugEnabled()) {
        log.debug("Extracting {}, {} bytes, id {}", path, length, v.getContentIdentity());
    }
    // try-with-resources: if close() fails after the parse has already
    // thrown, the close failure is attached as a suppressed exception instead
    // of replacing the parse exception. This keeps the timeout and
    // write-limit signals recognisable in the catch blocks below.
    try (CountingInputStream stream = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)))) {
        // Single definition of the parse step, used by both branches.
        Callable<Void> parseTask = () -> {
            getParser().parse(stream, handler, metadata, new ParseContext());
            return null;
        };
        try {
            if (length > SMALL_BINARY) {
                String name = "Extracting " + path + ", " + length + " bytes";
                extractedTextCache.process(name, parseTask);
            } else {
                parseTask.call();
            }
        } finally {
            // Read the byte count before the stream is closed.
            bytesRead = stream.getCount();
        }
    } catch (LinkageError e) {
        // Capture errors caused by extraction libraries
        // not being present. This is equivalent to disabling
        // selected media types in configuration, so we can simply
        // ignore these errors.
        log.debug("[{}] Failed to extract text from a binary property: {}." + " This often happens when some media types are disabled by configuration." + " The stack trace is included to flag some 'unintended' failures", getIndexName(), path, e);
        extractedTextCache.put(v, ExtractedText.ERROR);
        return TEXT_EXTRACTION_ERROR;
    } catch (TimeoutException t) {
        log.warn("[{}] Failed to extract text from a binary property due to timeout: {}.", getIndexName(), path);
        extractedTextCache.put(v, ExtractedText.ERROR);
        extractedTextCache.putTimeout(v, ExtractedText.ERROR);
        return TEXT_EXTRACTION_ERROR;
    } catch (Throwable t) {
        // The special STOP exception is used for normal termination.
        if (!handler.isWriteLimitReached(t)) {
            log.debug("[{}] Failed to extract text from a binary property: {}." + " This is a fairly common case, and nothing to" + " worry about. The stack trace is included to" + " help improve the text extraction feature.", getIndexName(), path, t);
            extractedTextCache.put(v, ExtractedText.ERROR);
            return TEXT_EXTRACTION_ERROR;
        } else {
            log.debug("Extracted text size exceeded configured limit({})", definition.getMaxExtractLength());
        }
    }
    String result = handler.toString();
    // Statistics are recorded only when the parser actually consumed bytes.
    if (bytesRead > 0) {
        long time = System.currentTimeMillis() - start;
        int len = result.length();
        recordTextExtractionStats(time, bytesRead, len);
        if (log.isDebugEnabled()) {
            log.debug("Extracting {} took {} ms, {} bytes read, {} text size", path, time, bytesRead, len);
        }
    }
    extractedTextCache.put(v, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS, result));
    return result;
}
Aggregations