Search in sources :

Example 1 with LazyInputStream

use of org.apache.jackrabbit.oak.commons.io.LazyInputStream in project jackrabbit-oak by apache.

The method parseStringValue of the class TextExtractor.

// ~--------------------------------------< Tika >
/**
 * Extracts text from a binary with the configured Tika parser and updates
 * the extraction statistics.
 *
 * <p>The binary is read through a {@code CountingInputStream} so that the
 * number of bytes consumed by the parser is known even when parsing stops
 * early. Output is capped by a {@code WriteOutContentHandler} created with
 * {@code maxExtractedLength}; hitting that cap is treated as a normal
 * termination and the text collected so far is returned.
 *
 * @param byteSource source of the binary content to parse
 * @param metadata   Tika metadata handed to the parser
 * @param path       path of the binary property; used only in log messages
 * @return the extracted text, {@code ERROR_TEXT} when extraction failed,
 *         or {@code null} when no bytes were read from the stream
 */
private String parseStringValue(ByteSource byteSource, Metadata metadata, String path) {
    WriteOutContentHandler handler = new WriteOutContentHandler(maxExtractedLength);
    long start = System.currentTimeMillis();
    long size = 0;
    // try-with-resources keeps the parser's exception as the primary one when
    // close() fails as well (the close failure is attached as suppressed), so
    // the write-limit check below always inspects the real cause.
    try (CountingInputStream stream = new CountingInputStream(new LazyInputStream(byteSource))) {
        try {
            tika.getParser().parse(stream, handler, metadata, new ParseContext());
        } finally {
            // Record the byte count whether or not parsing succeeded.
            size = stream.getCount();
        }
    } catch (LinkageError e) {
        // Capture errors caused by extraction libraries
        // not being present. This is equivalent to disabling
        // selected media types in configuration, so we can simply
        // ignore these errors.
        log.debug("Failed to extract text from a binary property: {}." + " This often happens when some media types are disabled by configuration." + " The stack trace is included to flag some 'unintended' failures", path, e);
        parserErrorCount.incrementAndGet();
        return ERROR_TEXT;
    } catch (Throwable t) {
        // The special STOP exception is used for normal termination.
        if (!handler.isWriteLimitReached(t)) {
            parserErrorCount.incrementAndGet();
            // Parameterized so the message is only assembled when debug is
            // enabled; the trailing Throwable is logged with its stack trace.
            parserError.debug("Failed to extract text from a binary property: {}" + " This is a fairly common case, and nothing to" + " worry about. The stack trace is included to" + " help improve the text extraction feature.", path, t);
            return ERROR_TEXT;
        } else {
            parserError.debug("Extracted text size exceeded configured limit({})", maxExtractedLength);
        }
    }
    String result = handler.toString();
    timeTaken.addAndGet(System.currentTimeMillis() - start);
    // Only count the extraction when the parser actually consumed input.
    if (size > 0) {
        extractedTextSize.addAndGet(result.length());
        extractionCount.incrementAndGet();
        totalSizeRead.addAndGet(size);
        return result;
    }
    return null;
}
Also used : WriteOutContentHandler(org.apache.tika.sax.WriteOutContentHandler) LazyInputStream(org.apache.jackrabbit.oak.commons.io.LazyInputStream) CountingInputStream(com.google.common.io.CountingInputStream) ParseContext(org.apache.tika.parser.ParseContext)

Example 2 with LazyInputStream

use of org.apache.jackrabbit.oak.commons.io.LazyInputStream in project jackrabbit-oak by apache.

The method parseStringValue0 of the class BinaryTextExtractor.

/**
 * Extracts text from a blob with the configured parser, records the outcome
 * in {@code extractedTextCache} and returns the extracted text.
 *
 * <p>Blobs longer than {@code SMALL_BINARY} are parsed through
 * {@code extractedTextCache.process(...)}; a {@code TimeoutException} coming
 * out of that call is recorded via {@code putTimeout}. Smaller blobs are
 * parsed inline. Output is capped by a {@code WriteOutContentHandler} created
 * with {@code definition.getMaxExtractLength()}; hitting that cap is treated
 * as a normal termination and the text collected so far is returned.
 *
 * @param v        blob whose content is parsed
 * @param metadata Tika metadata handed to the parser
 * @param path     path of the binary property; used only in log messages
 *                 and the task name
 * @return the extracted text, or {@code TEXT_EXTRACTION_ERROR} when
 *         extraction failed or timed out
 */
private String parseStringValue0(Blob v, Metadata metadata, String path) {
    WriteOutContentHandler handler = new WriteOutContentHandler(definition.getMaxExtractLength());
    long start = System.currentTimeMillis();
    long bytesRead = 0;
    long length = v.length();
    // Guarded so getContentIdentity() is only evaluated when debug is on.
    if (log.isDebugEnabled()) {
        log.debug("Extracting {}, {} bytes, id {}", path, length, v.getContentIdentity());
    }
    // try-with-resources keeps the parser's (or timeout) exception as the
    // primary one when close() fails as well, so the catch clauses below
    // classify the real cause instead of the close failure.
    try (CountingInputStream stream = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)))) {
        try {
            if (length > SMALL_BINARY) {
                String name = "Extracting " + path + ", " + length + " bytes";
                extractedTextCache.process(name, new Callable<Void>() {

                    @Override
                    public Void call() throws Exception {
                        getParser().parse(stream, handler, metadata, new ParseContext());
                        return null;
                    }
                });
            } else {
                getParser().parse(stream, handler, metadata, new ParseContext());
            }
        } finally {
            // Record the byte count whether or not parsing succeeded.
            bytesRead = stream.getCount();
        }
    } catch (LinkageError e) {
        // Capture errors caused by extraction libraries
        // not being present. This is equivalent to disabling
        // selected media types in configuration, so we can simply
        // ignore these errors.
        log.debug("[{}] Failed to extract text from a binary property: {}." + " This often happens when some media types are disabled by configuration." + " The stack trace is included to flag some 'unintended' failures", getIndexName(), path, e);
        extractedTextCache.put(v, ExtractedText.ERROR);
        return TEXT_EXTRACTION_ERROR;
    } catch (TimeoutException t) {
        log.warn("[{}] Failed to extract text from a binary property due to timeout: {}.", getIndexName(), path);
        extractedTextCache.put(v, ExtractedText.ERROR);
        extractedTextCache.putTimeout(v, ExtractedText.ERROR);
        return TEXT_EXTRACTION_ERROR;
    } catch (Throwable t) {
        if (t instanceof InterruptedException) {
            // NOTE(review): process(...) presumably can surface an
            // InterruptedException while waiting on the task - confirm.
            // Restore the flag so the interruption is not silently lost.
            Thread.currentThread().interrupt();
        }
        // The special STOP exception is used for normal termination.
        if (!handler.isWriteLimitReached(t)) {
            log.debug("[{}] Failed to extract text from a binary property: {}." + " This is a fairly common case, and nothing to" + " worry about. The stack trace is included to" + " help improve the text extraction feature.", getIndexName(), path, t);
            extractedTextCache.put(v, ExtractedText.ERROR);
            return TEXT_EXTRACTION_ERROR;
        } else {
            log.debug("Extracted text size exceeded configured limit({})", definition.getMaxExtractLength());
        }
    }
    String result = handler.toString();
    // Only record statistics when the parser actually consumed input.
    if (bytesRead > 0) {
        long time = System.currentTimeMillis() - start;
        int len = result.length();
        recordTextExtractionStats(time, bytesRead, len);
        if (log.isDebugEnabled()) {
            log.debug("Extracting {} took {} ms, {} bytes read, {} text size", path, time, bytesRead, len);
        }
    }
    extractedTextCache.put(v, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS, result));
    return result;
}
Also used : CountingInputStream(com.google.common.io.CountingInputStream) TimeoutException(java.util.concurrent.TimeoutException) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException) ExtractedText(org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText) WriteOutContentHandler(org.apache.tika.sax.WriteOutContentHandler) LazyInputStream(org.apache.jackrabbit.oak.commons.io.LazyInputStream) ParseContext(org.apache.tika.parser.ParseContext)

Aggregations

CountingInputStream (com.google.common.io.CountingInputStream): 2, LazyInputStream (org.apache.jackrabbit.oak.commons.io.LazyInputStream): 2, ParseContext (org.apache.tika.parser.ParseContext): 2, WriteOutContentHandler (org.apache.tika.sax.WriteOutContentHandler): 2, IOException (java.io.IOException): 1, TimeoutException (java.util.concurrent.TimeoutException): 1, ExtractedText (org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText): 1, TikaException (org.apache.tika.exception.TikaException): 1, SAXException (org.xml.sax.SAXException): 1