Search in sources :

Example 6 with ExtractedText

use of org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText in project jackrabbit-oak by apache.

In class DataStoreTextWriterTest, method nonExistingEntry.

@Test
public void nonExistingEntry() throws Exception {
    // Put a small binary into a file data store so that a real record identifier is available.
    File dataStoreDir = temporaryFolder.newFolder();
    FileDataStore dataStore = DataStoreUtils.createFDS(dataStoreDir, 0);
    DataRecord stored = dataStore.addRecord(new ByteArrayInputStream("hello".getBytes()));

    File textDir = temporaryFolder.newFolder();
    DataStoreTextWriter writer = new DataStoreTextWriter(textDir, false);
    String blobId = stored.getIdentifier().toString();

    // Nothing has been written for this id yet: not processed, and no text is returned.
    assertFalse(writer.isProcessed(blobId));
    assertNull(writer.getText("/a", new IdBlob("foo", blobId)));

    // After writing text for the id, it is reported as processed and the text is returned as SUCCESS.
    writer.write(blobId, "foo");
    assertTrue(writer.isProcessed(blobId));
    ExtractedText extracted = writer.getText("/a", new IdBlob("foo", blobId));
    assertEquals("foo", extracted.getExtractedText());
    assertEquals(ExtractionResult.SUCCESS, extracted.getExtractionResult());

    // An id that was only marked empty is reported as processed as well.
    writer.markEmpty("a");
    assertTrue(writer.isProcessed("a"));
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) DataRecord(org.apache.jackrabbit.core.data.DataRecord) File(java.io.File) FileDataStore(org.apache.jackrabbit.core.data.FileDataStore) ExtractedText(org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText) Test(org.junit.Test)

Example 7 with ExtractedText

use of org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText in project jackrabbit-oak by apache.

In class ExtractedTextCache, method get.

/**
 * Looks up already extracted text for the given blob.
 *
 * <p>The pre-extracted text provider, when one is set, is asked first, but only if
 * {@code reindexMode} is true or this cache is configured to always use pre-extracted
 * text. If that does not produce a result, the cache is queried using the blob's
 * content identity.
 *
 * @param nodePath     path of the node; joined with {@code propertyName} to form the
 *                     property path handed to the provider
 * @param propertyName name of the property; see {@code nodePath}
 * @param blob         blob whose extracted text is requested
 * @param reindexMode  whether the lookup happens during a reindex; enables the
 *                     provider lookup
 * @return the extracted text for a successful entry,
 *         {@code LuceneIndexEditor.TEXT_EXTRACTION_ERROR} for an entry recorded as
 *         error, {@code EMPTY_STRING} for an entry recorded as empty, or
 *         {@code null} if no entry was found
 */
@CheckForNull
public String get(String nodePath, String propertyName, Blob blob, boolean reindexMode) {
    String result = null;
    String propertyPath = concat(nodePath, propertyName);
    log.trace("Looking for extracted text for [{}] with blobId [{}]", propertyPath, blob.getContentIdentity());

    // The provider is normally consulted in reindex mode only: in incremental
    // indexing mode it would only contain older entries, and skipping it also
    // avoids loading its state (see DataStoreTextWriter). The "always use" switch
    // overrides that restriction.
    boolean consultProvider = extractedTextProvider != null
            && (reindexMode || alwaysUsePreExtractedCache);
    if (consultProvider) {
        try {
            ExtractedText preExtracted = extractedTextProvider.getText(propertyPath, blob);
            if (preExtracted != null) {
                preFetchedCount++;
                switch (preExtracted.getExtractionResult()) {
                    case SUCCESS:
                        result = preExtracted.getExtractedText().toString();
                        break;
                    case ERROR:
                        result = LuceneIndexEditor.TEXT_EXTRACTION_ERROR;
                        break;
                    case EMPTY:
                        result = EMPTY_STRING;
                        break;
                }
            }
        } catch (IOException e) {
            // A failing provider is not fatal; fall through to the cache lookup.
            log.warn("Error occurred while fetching pre extracted text for {}", propertyPath, e);
        }
    }

    String id = blob.getContentIdentity();
    if (result != null || cache == null || id == null) {
        // Either the provider answered, or there is nothing to look up in the cache.
        return result;
    }
    return cache.getIfPresent(id);
}
Also used : IOException(java.io.IOException) ExtractedText(org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText) CheckForNull(javax.annotation.CheckForNull)

Example 8 with ExtractedText

use of org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText in project jackrabbit-oak by apache.

In class BinaryTextExtractor, method parseStringValue0.

/**
 * Extracts text from the given binary by running the parser over its content and
 * records the outcome in {@code extractedTextCache}.
 *
 * <p>The collected text is capped by {@code definition.getMaxExtractLength()}; hitting
 * that cap is treated as normal termination, not as a failure. For binaries larger
 * than {@code SMALL_BINARY} the current thread is temporarily renamed to include the
 * path and size, and the old name is restored before returning.
 *
 * @param v        the binary value to extract text from
 * @param metadata metadata handed to the parser
 * @param path     path used in log messages and in the temporary thread name
 * @return the extracted text (possibly truncated or empty), or
 *         {@code TEXT_EXTRACTION_ERROR} if parsing failed for a reason other than
 *         the write limit being reached or a {@link LinkageError}
 */
private String parseStringValue0(Blob v, Metadata metadata, String path) {
    // Handler collects the text output, limited to the configured maximum length.
    WriteOutContentHandler handler = new WriteOutContentHandler(definition.getMaxExtractLength());
    long start = System.currentTimeMillis();
    long bytesRead = 0;
    long length = v.length();
    if (log.isDebugEnabled()) {
        log.debug("Extracting {}, {} bytes, id {}", path, length, v.getContentIdentity());
    }
    // Non-null only if the thread name was changed and has to be restored.
    String oldThreadName = null;
    if (length > SMALL_BINARY) {
        // Expose the extraction in the thread name while a larger binary is processed.
        Thread t = Thread.currentThread();
        oldThreadName = t.getName();
        t.setName(oldThreadName + ": Extracting " + path + ", " + length + " bytes");
    }
    try {
        CountingInputStream stream = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)));
        try {
            getParser().parse(stream, handler, metadata, new ParseContext());
        } finally {
            // Capture the byte count even if parsing failed, then release the stream.
            bytesRead = stream.getCount();
            stream.close();
        }
    } catch (LinkageError e) {
        // Capture and ignore errors caused by extraction libraries
        // not being present. This is equivalent to disabling
        // selected media types in configuration, so we can simply
        // ignore these errors.
        // Execution continues below: whatever the handler holds is returned
        // and cached as a SUCCESS entry.
    } catch (Throwable t) {
        // The special STOP exception is used for normal termination.
        if (!handler.isWriteLimitReached(t)) {
            log.debug("[{}] Failed to extract text from a binary property: {}." + " This is a fairly common case, and nothing to" + " worry about. The stack trace is included to" + " help improve the text extraction feature.", getIndexName(), path, t);
            // Remember the failure so the cache holds an ERROR entry for this blob.
            extractedTextCache.put(v, ExtractedText.ERROR);
            return TEXT_EXTRACTION_ERROR;
        }
        // Write limit reached: fall through and use the text collected so far.
    } finally {
        if (oldThreadName != null) {
            Thread.currentThread().setName(oldThreadName);
        }
    }
    String result = handler.toString();
    // Statistics are only recorded when at least one byte was read from the binary.
    if (bytesRead > 0) {
        long time = System.currentTimeMillis() - start;
        int len = result.length();
        recordTextExtractionStats(time, bytesRead, len);
        if (log.isDebugEnabled()) {
            log.debug("Extracting {} took {} ms, {} bytes read, {} text size", path, time, bytesRead, len);
        }
    }
    extractedTextCache.put(v, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS, result));
    return result;
}
Also used : WriteOutContentHandler(org.apache.tika.sax.WriteOutContentHandler) LazyInputStream(org.apache.jackrabbit.oak.commons.io.LazyInputStream) CountingInputStream(com.google.common.io.CountingInputStream) ParseContext(org.apache.tika.parser.ParseContext) ExtractedText(org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText)

Example 9 with ExtractedText

use of org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText in project jackrabbit-oak by apache.

In class ExtractedTextCacheTest, method preExtractionAlwaysUse.

@Test
public void preExtractionAlwaysUse() throws Exception {
    // Provider stub that answers every lookup with a successful "bar" entry.
    PreExtractedTextProvider textProvider = mock(PreExtractedTextProvider.class);
    when(textProvider.getText(anyString(), any(Blob.class))).thenReturn(new ExtractedText(ExtractionResult.SUCCESS, "bar"));

    // Third constructor argument is true - presumably the "always use pre-extracted
    // text" switch this test is named after; confirm against the constructor.
    ExtractedTextCache textCache = new ExtractedTextCache(10 * FileUtils.ONE_MB, 100, true);
    textCache.setExtractedTextProvider(textProvider);

    // reindexMode is false here, yet the provider's text is still expected back.
    Blob blob = new IdBlob("hello", "a");
    assertEquals("bar", textCache.get("/a", "foo", blob, false));
}
Also used : PreExtractedTextProvider(org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider) Blob(org.apache.jackrabbit.oak.api.Blob) ArrayBasedBlob(org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob) Matchers.anyString(org.mockito.Matchers.anyString) ExtractedText(org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText) Test(org.junit.Test)

Aggregations

ExtractedText (org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText)9 Test (org.junit.Test)6 Blob (org.apache.jackrabbit.oak.api.Blob)5 ArrayBasedBlob (org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob)5 Matchers.anyString (org.mockito.Matchers.anyString)5 File (java.io.File)2 PreExtractedTextProvider (org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider)2 CountingInputStream (com.google.common.io.CountingInputStream)1 ByteArrayInputStream (java.io.ByteArrayInputStream)1 IOException (java.io.IOException)1 CheckForNull (javax.annotation.CheckForNull)1 DataRecord (org.apache.jackrabbit.core.data.DataRecord)1 FileDataStore (org.apache.jackrabbit.core.data.FileDataStore)1 LazyInputStream (org.apache.jackrabbit.oak.commons.io.LazyInputStream)1 ParseContext (org.apache.tika.parser.ParseContext)1 WriteOutContentHandler (org.apache.tika.sax.WriteOutContentHandler)1