use of org.apache.tika.metadata.Metadata in project tika by apache.
the class TikaTest method getXML.
/**
 * Convenience overload: builds a fresh {@link Metadata} whose
 * {@link Metadata#RESOURCE_NAME_KEY} entry is set to {@code filePath} and
 * delegates to the three-argument {@code getXML}.
 *
 * @param filePath path of the document to parse; also recorded as the resource name
 * @param parser   parser handed through to the three-argument overload
 * @return whatever the three-argument overload returns
 * @throws Exception propagated unchanged from the delegate
 */
protected XMLResult getXML(String filePath, Parser parser) throws Exception {
    final Metadata seeded = new Metadata();
    seeded.set(Metadata.RESOURCE_NAME_KEY, filePath);
    XMLResult result = getXML(filePath, parser, seeded);
    return result;
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class TestParsers method testEXCELExtraction.
/**
 * Checks the Excel sample two ways: the text returned by
 * {@code tika.parseToString} must contain a known phrase, and parsing the
 * same file through {@code tika.getParser()} must populate the title metadata.
 */
@Test
public void testEXCELExtraction() throws Exception {
    final String expected = "Numbers and their Squares";
    File xls = getResourceAsFile("/test-documents/testEXCEL.xls");

    // Part 1: plain text extraction.
    String extracted = tika.parseToString(xls);
    boolean phraseFound = extracted.contains(expected);
    assertTrue("Text does not contain '" + expected + "'", phraseFound);

    // Part 2: metadata extraction through the configured parser.
    Metadata collected = new Metadata();
    Parser configuredParser = tika.getParser();
    try (InputStream in = new FileInputStream(xls)) {
        configuredParser.parse(in, new DefaultHandler(), collected, new ParseContext());
    }
    String title = collected.get(TikaCoreProperties.TITLE);
    assertEquals("Simple Excel document", title);
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class TikaDetectorConfigTest method testPSTDetectionWithoutZipDetector.
/**
 * TIKA-1708 - If the Zip detector is disabled, either explicitly,
 * or via giving a list of detectors that it isn't part of, ensure
 * that detection of PST files still works.
 *
 * <p>Two configurations are loaded: one that excludes a detector from the
 * default set, and one that lists its detectors explicitly. Both are checked
 * with {@code assertDetectors(..., true, false)}, both must expose more than
 * 100 media types, and both must detect the PST sample file.
 */
@Test
public void testPSTDetectionWithoutZipDetector() throws Exception {
    // Check the one with an exclude
    TikaConfig configWX = getConfig("TIKA-1708-detector-default.xml");
    assertNotNull(configWX.getParser());
    assertNotNull(configWX.getDetector());
    CompositeDetector detectorWX = (CompositeDetector) configWX.getDetector();
    // Check it has the POIFS one, but not the zip one
    assertDetectors(detectorWX, true, false);

    // Check the one with an explicit list
    TikaConfig configCL = getConfig("TIKA-1708-detector-composite.xml");
    assertNotNull(configCL.getParser());
    assertNotNull(configCL.getDetector());
    CompositeDetector detectorCL = (CompositeDetector) configCL.getDetector();
    assertEquals(2, detectorCL.getDetectors().size());
    // Check it also has the POIFS one, but not the zip one
    assertDetectors(detectorCL, true, false);

    // Check that both configs have a media type registry with entries
    int typesWX = configWX.getMediaTypeRegistry().getTypes().size();
    int typesCL = configCL.getMediaTypeRegistry().getTypes().size();
    assertTrue("Not enough mime types: " + typesWX, typesWX > 100);
    assertTrue("Not enough mime types: " + typesCL, typesCL > 100);

    // Now check they detect PST files correctly. The stream is opened in
    // try-with-resources so it is closed even when an assertion fails.
    try (TikaInputStream stream = TikaInputStream.get(getResourceAsFile("/test-documents/testPST.pst"))) {
        assertEquals(OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE, detectorWX.detect(stream, new Metadata()));
        assertEquals(OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE, detectorCL.detect(stream, new Metadata()));
    }
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class TikaEncodingDetectorTest method testConfigurabilityOfUserSpecified.
/**
 * Loads the TIKA-2273 "outside static init" configuration and verifies that
 * every encoding-detecting parser reachable from the {@link AutoDetectParser}
 * uses a {@link CompositeEncodingDetector} with exactly two children, none of
 * whose class names contain "cu4j". Finally confirms that parsing the cp500
 * sample fails with a "Failed to detect" {@link TikaException} under this
 * configuration.
 */
@Test
public void testConfigurabilityOfUserSpecified() throws Exception {
    TikaConfig tikaConfig = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2273-encoding-detector-outside-static-init.xml"));
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    //make sure that all static and non-static parsers are using the same encoding detector!
    List<Parser> parsers = new ArrayList<>();
    findEncodingDetectionParsers(p, parsers);
    assertEquals(3, parsers.size());
    for (Parser encodingDetectingParser : parsers) {
        EncodingDetector encodingDetector = ((AbstractEncodingDetectorParser) encodingDetectingParser).getEncodingDetector();
        assertTrue(encodingDetector instanceof CompositeEncodingDetector);
        // Cast once, after the instanceof assertion has guaranteed it is safe.
        CompositeEncodingDetector composite = (CompositeEncodingDetector) encodingDetector;
        assertEquals(2, composite.getDetectors().size());
        for (EncodingDetector child : composite.getDetectors()) {
            assertNotContained("cu4j", child.getClass().getCanonicalName());
        }
    }
    //also just make sure this is still true
    try {
        // The result is irrelevant: the call itself is expected to throw.
        getXML("english.cp500.txt", p);
        fail("can't detect w/out ICU");
    } catch (TikaException e) {
        assertContains("Failed to detect", e.getMessage());
    }
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class AutoDetectParserTest method testOggFlacAudio.
/**
 * Test to ensure that the Ogg Audio parsers (Vorbis, Opus, Flac etc)
 * have been correctly included, and are available.
 *
 * <p>Each entry of {@code testFiles} is paired by index with the entry of
 * {@code mediaTypes} it is expected to be detected as. Every per-file
 * assertion names the file so a failure identifies which sample broke.
 */
@SuppressWarnings("deprecation")
@Test
public void testOggFlacAudio() throws Exception {
    // The four test files should all have similar test data
    String[] testFiles = new String[] { "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga", "testOPUS.opus" };
    MediaType[] mediaTypes = new MediaType[] { MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE), MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS) };

    // Check we can load the parsers, and they claim to do the right things
    VorbisParser vParser = new VorbisParser();
    assertNotNull("Parser not found for " + mediaTypes[0], vParser.getSupportedTypes(new ParseContext()));
    FlacParser fParser = new FlacParser();
    assertNotNull("Parser not found for " + mediaTypes[1], fParser.getSupportedTypes(new ParseContext()));
    assertNotNull("Parser not found for " + mediaTypes[2], fParser.getSupportedTypes(new ParseContext()));
    OpusParser oParser = new OpusParser();
    assertNotNull("Parser not found for " + mediaTypes[3], oParser.getSupportedTypes(new ParseContext()));

    // Check we found the parser
    CompositeParser parser = (CompositeParser) tika.getParser();
    for (MediaType mt : mediaTypes) {
        assertNotNull("Parser not found for " + mt, parser.getParsers().get(mt));
    }

    // Have each file parsed, and check
    for (int i = 0; i < testFiles.length; i++) {
        String file = testFiles[i];
        try (InputStream input = AutoDetectParserTest.class.getResourceAsStream("/test-documents/" + file)) {
            if (input == null) {
                fail("Could not find test file " + file);
            }
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new AutoDetectParser(tika).parse(input, handler, metadata);
            assertEquals("Incorrect content type for " + file, mediaTypes[i].toString(), metadata.get(Metadata.CONTENT_TYPE));

            // Check some of the common metadata
            // Old style metadata (deprecated keys, hence the @SuppressWarnings above)
            assertEquals("Incorrect author for " + file, "Test Artist", metadata.get(Metadata.AUTHOR));
            assertEquals("Incorrect title for " + file, "Test Title", metadata.get(Metadata.TITLE));
            // New style metadata
            assertEquals("Incorrect creator for " + file, "Test Artist", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Incorrect title for " + file, "Test Title", metadata.get(TikaCoreProperties.TITLE));

            // Check some of the XMPDM metadata; the album check is skipped for the Opus sample
            if (!file.endsWith(".opus")) {
                assertEquals("Incorrect album for " + file, "Test Album", metadata.get(XMPDM.ALBUM));
            }
            assertEquals("Incorrect artist for " + file, "Test Artist", metadata.get(XMPDM.ARTIST));
            assertEquals("Incorrect channel type for " + file, "Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
            assertEquals("Incorrect sample rate for " + file, "44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));

            // Check some of the text
            String content = handler.toString();
            assertTrue("Title missing from text of " + file, content.contains("Test Title"));
            assertTrue("Artist missing from text of " + file, content.contains("Test Artist"));
        }
    }
}
Aggregations