use of org.apache.tika.config.TikaConfig in project tika by apache.
the class EmbeddedDocumentUtil method getExtension.
/**
 * Resolves a file extension for the given stream.
 * <p>
 * The content type already recorded in {@code metadata} is tried first. If it
 * is absent or cannot be resolved by the mime repository, the configured
 * detector is run against the stream and, when that yields a type, the
 * detected type is written back to {@code metadata} under
 * {@link Metadata#CONTENT_TYPE}.
 *
 * @param is       stream to run detection on when the declared type is unusable;
 *                 {@code reset()} is called on it after a successful detection
 * @param metadata metadata that is read for the declared content type and
 *                 updated when detection supplies the type
 * @return the extension reported by the resolved mime type, or {@code ".bin"}
 *         when no mime type could be resolved
 */
public String getExtension(TikaInputStream is, Metadata metadata) {
    final String declaredType = metadata.get(Metadata.CONTENT_TYPE);
    final TikaConfig tikaConfig = getConfig();
    final MimeTypes registry = tikaConfig.getMimeRepository();

    MimeType resolved = null;
    if (declaredType != null) {
        try {
            resolved = registry.forName(declaredType);
        } catch (MimeTypeException ignored) {
            // declared type is not a valid mime name; fall back to detection
        }
    }
    if (resolved != null) {
        // the declared type was usable, so metadata is left untouched
        return resolved.getExtension();
    }

    final Detector detector = tikaConfig.getDetector();
    try {
        MediaType sniffed = detector.detect(is, metadata);
        resolved = registry.forName(sniffed.toString());
        is.reset();
    } catch (IOException | MimeTypeException ignored) {
        // best effort: whatever was resolved before the failure (if anything) is kept
    }

    if (resolved == null) {
        return ".bin";
    }
    // set or correct the mime type; on this path it can only have come from detection
    metadata.set(Metadata.CONTENT_TYPE, resolved.toString());
    return resolved.getExtension();
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class EmbeddedDocumentUtil method getEmbeddedDocumentExtractor.
/**
 * Uniform entry point for obtaining an {@link EmbeddedDocumentExtractor}
 * from a {@link ParseContext}.
 * <p>
 * If the context already carries an extractor, that instance is returned
 * as is. Otherwise a {@link ParsingEmbeddedDocumentExtractor} is built on
 * the context. As of Tika 1.15, when no {@code Parser.class} entry is
 * present an {@link AutoDetectParser} is registered in the context first
 * (built from the context's {@code TikaConfig} if there is one) so that
 * embedded documents get parsed (TIKA-2096).
 * <p>
 * To skip parsing of embedded documents, set {@code Parser.class} to
 * {@link org.apache.tika.parser.EmptyParser} in the ParseContext.
 *
 * @param context the parse context to read from; a {@code Parser.class}
 *                entry may be added to it as a side effect
 * @return the extractor found in, or newly created for, {@code context}
 */
public static EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
    EmbeddedDocumentExtractor configured = context.get(EmbeddedDocumentExtractor.class);
    if (configured != null) {
        return configured;
    }

    // make sure a parser is available for embedded docs (TIKA-2096)
    if (context.get(Parser.class) == null) {
        TikaConfig config = context.get(TikaConfig.class);
        AutoDetectParser fallback =
                (config == null) ? new AutoDetectParser() : new AutoDetectParser(config);
        context.set(Parser.class, fallback);
    }
    return new ParsingEmbeddedDocumentExtractor(context);
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class EnviHeaderParser method parse.
/**
 * Parses an ENVI header as plain text.
 * <p>
 * The content type in {@code metadata} is set to {@code ENVI_MIME_TYPE}, the
 * character encoding is auto-detected, and every line of the input is
 * emitted as its own {@code <p>} element on the XHTML handler.
 *
 * @param stream   the input to read; it is wrapped in a
 *                 {@code CloseShieldInputStream} before being handed to the reader
 * @param handler  receives the generated XHTML SAX events
 * @param metadata receives the content type and the detected content encoding
 * @param context  parse context, passed to {@code getEncodingDetector}
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the content handler rejects an event
 * @throws TikaException if the encoding cannot be determined
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only outputting the MIME type as metadata
    metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE);

    // The following code was taken from the TXTParser
    // Automatically detect the character encoding
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset charset = reader.getCharset();
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        // text contents of the xhtml: one paragraph per input line
        String line;
        while ((line = reader.readLine()) != null) {
            xhtml.startElement("p");
            xhtml.characters(line);
            xhtml.endElement("p");
        }
        xhtml.endDocument();
    }
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class OOXMLParserTest method testMacroinXlsm.
/**
 * Verifies macro extraction for an .xlsm file: macros are not extracted by
 * default, are extracted when enabled through an {@code OfficeParserConfig}
 * in the {@code ParseContext}, and are extracted when enabled through a
 * Tika config file.
 */
@Test
public void testMacroinXlsm() throws Exception {
    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testEXCEL_macro.xlsm")) {
        // constant-first equals: a missing content type must not surface as an NPE
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }

    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);

    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Dirty()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "dirty dirt dirt");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xlsm", context));

    //test configuring via config file
    // try-with-resources so the config stream is closed even if an assertion fails
    try (InputStream configStream = this.getClass().getResourceAsStream("tika-config-dom-macros.xml")) {
        TikaConfig tikaConfig = new TikaConfig(configStream);
        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
        assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xlsm", parser));
    }
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class OOXMLParserTest method testInitializationViaConfig.
/**
 * Verifies that an {@code AutoDetectParser} built from the SAX docx config
 * file produces output containing the document title.
 */
@Test
public void testInitializationViaConfig() throws Exception {
    //NOTE: this test relies on a bug in the DOM extractor that
    //is passing over the title information.
    //once we fix that, this test will no longer be meaningful!
    // try-with-resources so the config stream is closed even if an assertion fails
    try (InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/microsoft/tika-config-sax-docx.xml")) {
        assertNotNull(is);
        TikaConfig tikaConfig = new TikaConfig(is);
        AutoDetectParser p = new AutoDetectParser(tikaConfig);
        XMLResult xml = getXML("testWORD_2006ml.docx", p, new Metadata());
        assertContains("engaging title", xml.xml);
    }
}
Aggregations