Use of org.apache.tika.parser.AutoDetectParser in project jackrabbit by apache.
The class SearchIndex, method createParser.
/**
 * Creates the Tika parser used by this index.
 *
 * <p>The Tika configuration is looked up in this order:
 * <ol>
 *   <li>{@code tikaConfigPath} as an existing file on disk;</li>
 *   <li>{@code tikaConfigPath} as a resource of the {@code SearchIndex} class loader
 *       (only tried when no such file exists);</li>
 *   <li>the {@code tika-config.xml} resource resolved relative to {@code SearchIndex};</li>
 *   <li>{@link TikaConfig#getDefaultConfig()} when none of the above could be loaded.</li>
 * </ol>
 *
 * <p>When {@code forkJavaCommand} is set, the auto-detecting parser is wrapped in a
 * {@link ForkParser} configured with that command and {@code extractorPoolSize}.
 *
 * @return a {@link ForkParser} if a fork java command is configured, otherwise a plain
 *         {@link AutoDetectParser}
 */
private Parser createParser() {
    URL configLocation = null;
    if (tikaConfigPath != null) {
        File configFile = new File(tikaConfigPath);
        if (!configFile.exists()) {
            // Not a file on disk: treat the configured path as a classpath resource name.
            configLocation = SearchIndex.class.getClassLoader().getResource(tikaConfigPath);
        } else {
            try {
                configLocation = configFile.toURI().toURL();
            } catch (MalformedURLException e) {
                // Leave the location unset so the bundled configuration is tried next.
                log.warn("Invalid Tika configuration path: " + configFile, e);
            }
        }
    }
    if (configLocation == null) {
        configLocation = SearchIndex.class.getResource("tika-config.xml");
    }

    TikaConfig tikaConfig = null;
    if (configLocation != null) {
        try {
            tikaConfig = new TikaConfig(configLocation);
        } catch (Exception e) {
            log.warn("Tika configuration not available: " + configLocation, e);
        }
    }
    if (tikaConfig == null) {
        // Nothing usable was found or loading failed: use Tika's built-in defaults.
        tikaConfig = TikaConfig.getDefaultConfig();
    }

    if (forkJavaCommand == null) {
        return new AutoDetectParser(tikaConfig);
    }
    ForkParser forked =
            new ForkParser(SearchIndex.class.getClassLoader(), new AutoDetectParser(tikaConfig));
    forked.setJavaCommand(forkJavaCommand);
    forked.setPoolSize(extractorPoolSize);
    return forked;
}
Use of org.apache.tika.parser.AutoDetectParser in project jmeter by apache.
The class Document, method getTextFromDocument.
/**
 * Converts a binary document (for example odt, ods, odp, doc(x), xls(x),
 * ppt(x), pdf, mp3, mp4, ...) to plain text using Apache Tika.
 *
 * <p>The returned value is one of:
 * <ul>
 *   <li>the text collected by the content handler when parsing succeeds;</li>
 *   <li>the {@code toString()} of the exception when parsing fails with an
 *       {@link Exception};</li>
 *   <li>the {@code view_results_response_missing_tika} resource string when a
 *       {@link NoClassDefFoundError} is raised during parsing, or when no text
 *       was produced for a non-empty document.</li>
 * </ul>
 *
 * @param document
 *            binary representation of the document
 * @return text from the document without formatting, or one of the messages
 *         described above
 */
public static String getTextFromDocument(byte[] document) {
    final String missingTikaMessage =
            JMeterUtils.getResString("view_results_response_missing_tika"); // $NON-NLS-1$
    String text = missingTikaMessage;

    Parser tikaParser = new AutoDetectParser();
    ContentHandler textHandler;
    if (MAX_DOCUMENT_SIZE > 0) {
        textHandler = new BodyContentHandler(MAX_DOCUMENT_SIZE);
    } else {
        // -1 disables the write limit
        textHandler = new BodyContentHandler(-1);
    }
    Metadata tikaMetadata = new Metadata();
    ParseContext parseContext = new ParseContext();

    InputStream in = new ByteArrayInputStream(document);
    try {
        tikaParser.parse(in, textHandler, tikaMetadata, parseContext);
        text = textHandler.toString();
    } catch (NoClassDefFoundError e) {
        // Warn when tika-app.jar is missing (or only tika-core/parsers are present
        // and some of their dependencies cannot be loaded).
        String classPath = System.getProperty("java.class.path"); // $NON-NLS-1$
        if (classPath.contains("tika-app")) { // $NON-NLS-1$
            // tika-app is on the classpath, so include the error for diagnosis.
            log.warn(missingTikaMessage, e);
        } else {
            log.warn(missingTikaMessage);
        }
    } catch (Exception e) {
        text = e.toString();
        log.warn("Error document parsing.", e);
    } finally {
        try {
            in.close();
        } catch (IOException ioe) {
            log.warn("Error closing document stream", ioe); // $NON-NLS-1$
        }
    }

    // An empty result for a non-empty document is reported as a probable Tika problem.
    if (text.isEmpty() && document.length > 0) {
        log.warn("Probably: {}", missingTikaMessage); // $NON-NLS-1$
        text = missingTikaMessage;
    }
    return text;
}
Use of org.apache.tika.parser.AutoDetectParser in project jackrabbit-oak by apache.
The class BinaryTextExtractor, method createDefaultParser.
/**
 * Creates the default parser from the {@code tika-config.xml} resource located
 * relative to {@code LuceneIndexEditorContext}.
 *
 * <p>While the configuration is being loaded, the thread context class loader is
 * switched to the class loader of {@code LuceneIndexEditorContext}; the previous
 * loader is always restored afterwards.
 *
 * @return a parser built from the bundled configuration, or a parser with Tika's
 *         default settings when the resource is missing or cannot be loaded
 */
private static AutoDetectParser createDefaultParser() {
    final Thread thread = Thread.currentThread();
    final ClassLoader previousLoader = thread.getContextClassLoader();
    final URL bundledConfig = LuceneIndexEditorContext.class.getResource("tika-config.xml");

    if (bundledConfig == null) {
        log.warn("Default Tika configuration not found");
        return new AutoDetectParser();
    }

    InputStream configStream = null;
    try {
        thread.setContextClassLoader(LuceneIndexEditorContext.class.getClassLoader());
        configStream = bundledConfig.openStream();
        TikaConfig tikaConfig = new TikaConfig(configStream);
        log.info("Loaded default Tika Config from classpath {}", bundledConfig);
        return new AutoDetectParser(tikaConfig);
    } catch (Exception e) {
        // Fall through to the plain default parser below.
        log.warn("Tika configuration not available : " + bundledConfig, e);
    } finally {
        IOUtils.closeQuietly(configStream);
        thread.setContextClassLoader(previousLoader);
    }
    return new AutoDetectParser();
}
Use of org.apache.tika.parser.AutoDetectParser in project jackrabbit-oak by apache.
The class BinaryTextExtractor, method initializeTikaParser.
/**
 * Selects the parser for the given index definition.
 *
 * <p>If the definition reports a custom Tika configuration, a new
 * {@link AutoDetectParser} is built from it while the thread context class loader
 * is temporarily set to the class loader of {@code LuceneIndexEditorContext}.
 * Otherwise the shared {@code defaultParser} is returned. The original context
 * class loader is restored in every case.
 *
 * @param definition index definition that may carry a custom Tika configuration
 * @return a parser built from the custom configuration, or {@code defaultParser}
 */
private static Parser initializeTikaParser(IndexDefinition definition) {
    final Thread thread = Thread.currentThread();
    final ClassLoader previousLoader = thread.getContextClassLoader();
    try {
        if (!definition.hasCustomTikaConfig()) {
            return defaultParser;
        }
        log.debug("[{}] Using custom tika config", definition.getIndexName());
        thread.setContextClassLoader(LuceneIndexEditorContext.class.getClassLoader());
        InputStream configStream = definition.getTikaConfig();
        try {
            TikaConfig customConfig = getTikaConfig(configStream, definition);
            return new AutoDetectParser(customConfig);
        } finally {
            IOUtils.closeQuietly(configStream);
        }
    } finally {
        thread.setContextClassLoader(previousLoader);
    }
}
Use of org.apache.tika.parser.AutoDetectParser in project ddf by codice.
The class PdfInputTransformer, method transformWithExtractors.
/**
 * Transforms PDF content into a metacard, reading the input twice from a buffered copy.
 *
 * <p>The input is first copied into a {@code TemporaryFileBackedOutputStream}. One pass
 * over the copy runs the Tika metadata extractor to collect plain text; if that pass
 * fails with a {@code CatalogTransformerException}, a warning is logged and the text
 * stays {@code null}. A second pass builds the {@code PDDocument} and delegates to
 * {@code transformPdf}. If an {@code InvalidPasswordException} is raised during the
 * second pass, a metacard from {@code initializeMetacard(id)} is returned instead.
 *
 * @param input stream holding the PDF content
 * @param id identifier passed on to the created metacard
 * @return the metacard produced from the PDF, or the one from {@code initializeMetacard}
 *         when the PDF could not be opened because of its password protection
 * @throws IOException if reading the buffered copy fails
 * @throws CatalogTransformerException if the input cannot be copied into the buffer
 */
private Metacard transformWithExtractors(InputStream input, String id) throws IOException, CatalogTransformerException {
    try (TemporaryFileBackedOutputStream buffered = new TemporaryFileBackedOutputStream()) {
        try {
            IOUtils.copy(input, buffered);
        } catch (IOException e) {
            throw new CatalogTransformerException("Could not copy bytes of content message.", e);
        }

        // First pass: best-effort plain-text extraction through Tika.
        String extractedText = null;
        try (InputStream textPass = buffered.asByteSource().openStream()) {
            Parser tikaParser = new AutoDetectParser();
            ContentHandler textHandler = new ToTextContentHandler();
            TikaMetadataExtractor extractor = new TikaMetadataExtractor(tikaParser, textHandler);
            extractor.parseMetadata(textPass, new ParseContext());
            extractedText = textHandler.toString();
        } catch (CatalogTransformerException e) {
            LOGGER.warn("Cannot extract metadata from pdf", e);
        }

        // Second pass: build the PDF document and create the metacard from it.
        Metacard result;
        try (InputStream pdfPass = buffered.asByteSource().openStream();
                PDDocument pdf = pdDocumentGenerator.apply(pdfPass)) {
            result = transformPdf(id, pdf, extractedText);
        } catch (InvalidPasswordException e) {
            LOGGER.debug("Cannot transform encrypted pdf", e);
            result = initializeMetacard(id);
        }
        return result;
    }
}
Aggregations