Use of org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText in project jackrabbit-oak by apache.
Class DataStoreTextWriterTest, method nonExistingEntry:
@Test
public void nonExistingEntry() throws Exception {
    File fdsDir = temporaryFolder.newFolder();
    FileDataStore fds = DataStoreUtils.createFDS(fdsDir, 0);
    ByteArrayInputStream is = new ByteArrayInputStream("hello".getBytes());
    DataRecord dr = fds.addRecord(is);
    File writerDir = temporaryFolder.newFolder();
    DataStoreTextWriter w = new DataStoreTextWriter(writerDir, false);
    String id = dr.getIdentifier().toString();
    assertFalse(w.isProcessed(id));
    assertNull(w.getText("/a", new IdBlob("foo", id)));
    w.write(id, "foo");
    assertTrue(w.isProcessed(id));
    ExtractedText et = w.getText("/a", new IdBlob("foo", id));
    assertEquals("foo", et.getExtractedText());
    assertEquals(ExtractionResult.SUCCESS, et.getExtractionResult());
    w.markEmpty("a");
    assertTrue(w.isProcessed("a"));
}
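IdBlob is a test helper that is not included in the snippet above. A minimal sketch of such a helper, assuming Oak's in-memory ArrayBasedBlob as the base class (the class and constructor only mirror the usage above and are otherwise illustrative), could look like this:

import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;

// Sketch of an IdBlob-style helper: an in-memory blob whose content identity is
// fixed by the caller, so tests can address it under a known id.
class IdBlob extends ArrayBasedBlob {

    private final String id;

    IdBlob(String value, String id) {
        super(value.getBytes());
        this.id = id;
    }

    @Override
    public String getContentIdentity() {
        return id;
    }
}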
Use of org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText in project jackrabbit-oak by apache.
Class ExtractedTextCache, method get:
/**
 * Get the pre-extracted text for the given blob.
 *
 * @return null if no pre-extracted text entry is found; otherwise the
 *         pre-extracted text
 */
@CheckForNull
public String get(String nodePath, String propertyName, Blob blob, boolean reindexMode) {
    String result = null;
    // Consult the PreExtractedTextProvider only in reindex mode and not in
    // incremental indexing mode, as it would only contain older entries.
    // This also avoids loading various state unnecessarily (see DataStoreTextWriter).
    String propertyPath = concat(nodePath, propertyName);
    log.trace("Looking for extracted text for [{}] with blobId [{}]", propertyPath, blob.getContentIdentity());
    if ((reindexMode || alwaysUsePreExtractedCache) && extractedTextProvider != null) {
        try {
            ExtractedText text = extractedTextProvider.getText(propertyPath, blob);
            if (text != null) {
                preFetchedCount++;
                switch (text.getExtractionResult()) {
                    case SUCCESS:
                        result = text.getExtractedText().toString();
                        break;
                    case ERROR:
                        result = LuceneIndexEditor.TEXT_EXTRACTION_ERROR;
                        break;
                    case EMPTY:
                        result = EMPTY_STRING;
                        break;
                }
            }
        } catch (IOException e) {
            log.warn("Error occurred while fetching pre extracted text for {}", propertyPath, e);
        }
    }
    String id = blob.getContentIdentity();
    if (cache != null && id != null && result == null) {
        result = cache.getIfPresent(id);
    }
    return result;
}
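As a usage illustration, here is a minimal wiring sketch, not Oak's own wiring code: package names follow Oak 1.x, the class name, node path and property name are illustrative, and a real deployment would more likely register a read-only DataStoreTextWriter as the provider instead of the stub shown here.

import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText.ExtractionResult;
import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
import org.apache.jackrabbit.oak.plugins.index.lucene.ExtractedTextCache;

public class PreExtractionWiringSketch {

    // Looks up the text the indexer would use for the given blob; the provider is
    // consulted here because reindexMode is passed as true.
    static String lookup(PreExtractedTextProvider provider, Blob blob) {
        ExtractedTextCache cache = new ExtractedTextCache(10 * FileUtils.ONE_MB, 100, false);
        cache.setExtractedTextProvider(provider);
        return cache.get("/content/doc", "jcr:data", blob, true);
    }

    // A stub provider answering every lookup with the same text.
    static PreExtractedTextProvider stubProvider() {
        return new PreExtractedTextProvider() {
            @Override
            public ExtractedText getText(String propertyPath, Blob blob) throws IOException {
                return new ExtractedText(ExtractionResult.SUCCESS, "pre-extracted text");
            }
        };
    }
}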
Use of org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText in project jackrabbit-oak by apache.
Class BinaryTextExtractor, method parseStringValue0:
private String parseStringValue0(Blob v, Metadata metadata, String path) {
    WriteOutContentHandler handler = new WriteOutContentHandler(definition.getMaxExtractLength());
    long start = System.currentTimeMillis();
    long bytesRead = 0;
    long length = v.length();
    if (log.isDebugEnabled()) {
        log.debug("Extracting {}, {} bytes, id {}", path, length, v.getContentIdentity());
    }
    String oldThreadName = null;
    if (length > SMALL_BINARY) {
        Thread t = Thread.currentThread();
        oldThreadName = t.getName();
        t.setName(oldThreadName + ": Extracting " + path + ", " + length + " bytes");
    }
    try {
        CountingInputStream stream = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)));
        try {
            getParser().parse(stream, handler, metadata, new ParseContext());
        } finally {
            bytesRead = stream.getCount();
            stream.close();
        }
    } catch (LinkageError e) {
        // Capture and ignore errors caused by extraction libraries
        // not being present. This is equivalent to disabling
        // selected media types in configuration, so we can simply
        // ignore these errors.
    } catch (Throwable t) {
        // The special STOP exception is used for normal termination.
        if (!handler.isWriteLimitReached(t)) {
            log.debug("[{}] Failed to extract text from a binary property: {}."
                    + " This is a fairly common case, and nothing to"
                    + " worry about. The stack trace is included to"
                    + " help improve the text extraction feature.", getIndexName(), path, t);
            extractedTextCache.put(v, ExtractedText.ERROR);
            return TEXT_EXTRACTION_ERROR;
        }
    } finally {
        if (oldThreadName != null) {
            Thread.currentThread().setName(oldThreadName);
        }
    }
    String result = handler.toString();
    if (bytesRead > 0) {
        long time = System.currentTimeMillis() - start;
        int len = result.length();
        recordTextExtractionStats(time, bytesRead, len);
        if (log.isDebugEnabled()) {
            log.debug("Extracting {} took {} ms, {} bytes read, {} text size", path, time, bytesRead, len);
        }
    }
    extractedTextCache.put(v, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS, result));
    return result;
}
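The catch (Throwable t) branch above relies on Tika's write-limit contract: once the handler has written definition.getMaxExtractLength() characters it aborts the parse, and isWriteLimitReached() distinguishes that normal termination from real failures. A standalone sketch of that contract, not Oak code; the tiny limit and the sample input are only for illustration:

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.WriteOutContentHandler;

public class WriteLimitSketch {
    public static void main(String[] args) throws Exception {
        // Deliberately tiny write limit so the abort path is hit immediately.
        WriteOutContentHandler handler = new WriteOutContentHandler(10);
        byte[] data = "a text document longer than ten characters".getBytes(StandardCharsets.UTF_8);
        try (InputStream in = new ByteArrayInputStream(data)) {
            new AutoDetectParser().parse(in, handler, new Metadata(), new ParseContext());
        } catch (Throwable t) {
            if (handler.isWriteLimitReached(t)) {
                // Normal termination: the truncated text collected so far is still usable.
                System.out.println("truncated: " + handler.toString());
            } else {
                throw new RuntimeException("genuine extraction failure", t);
            }
        }
    }
}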
Use of org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText in project jackrabbit-oak by apache.
Class ExtractedTextCacheTest, method preExtractionAlwaysUse:
@Test
public void preExtractionAlwaysUse() throws Exception {
    ExtractedTextCache cache = new ExtractedTextCache(10 * FileUtils.ONE_MB, 100, true);
    PreExtractedTextProvider provider = mock(PreExtractedTextProvider.class);
    cache.setExtractedTextProvider(provider);
    when(provider.getText(anyString(), any(Blob.class))).thenReturn(new ExtractedText(ExtractionResult.SUCCESS, "bar"));
    Blob b = new IdBlob("hello", "a");
    String text = cache.get("/a", "foo", b, false);
    assertEquals("bar", text);
}
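For contrast, a hedged sketch that is not from the Oak test suite but is consistent with the get() logic shown earlier, assuming the same static JUnit/Mockito imports and the IdBlob helper as above: with alwaysUsePreExtractedCache left at false, the provider is ignored outside reindexing and the lookup falls through to the in-memory cache, which is empty here.

@Test
public void preExtractionOnlyDuringReindex() throws Exception {
    // Third constructor argument: alwaysUsePreExtractedCache = false.
    ExtractedTextCache cache = new ExtractedTextCache(10 * FileUtils.ONE_MB, 100, false);
    PreExtractedTextProvider provider = mock(PreExtractedTextProvider.class);
    cache.setExtractedTextProvider(provider);
    when(provider.getText(anyString(), any(Blob.class))).thenReturn(new ExtractedText(ExtractionResult.SUCCESS, "bar"));
    // reindexMode = false, so the provider is never consulted and nothing has been cached yet.
    assertNull(cache.get("/a", "foo", new IdBlob("hello", "a"), false));
}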