use of org.apache.tika.config.TikaConfig in project tika by apache.
the class CompositeParserTest method testDefaultParser.
/**
 * Checks that the parser supplied by the default {@link TikaConfig} reports
 * a media type registry equal to the one held by the configuration itself.
 */
@Test
public void testDefaultParser() throws Exception {
    final TikaConfig defaultConfig = TikaConfig.getDefaultConfig();
    // The test relies on the default parser being castable to CompositeParser.
    final CompositeParser compositeParser = (CompositeParser) defaultConfig.getParser();

    // Expected value: the config's registry; actual value: the parser's registry.
    assertEquals(
            defaultConfig.getMediaTypeRegistry(),
            compositeParser.getMediaTypeRegistry());
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class InitializableParserTest method testInitializableParser.
/**
 * Builds a {@link Tika} facade from the classpath resource named by
 * {@code TIKA_CFG_FILE}, parses a short ISO-8859-1 byte payload with it, and
 * checks that the {@code DummyInitializableParser.SUM_FIELD} metadata entry
 * equals {@code "5"}.
 *
 * @throws Exception if the configuration cannot be loaded or parsing fails
 */
@Test
public void testInitializableParser() throws Exception {
    URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
    // Use an unconditional check instead of the `assert` keyword: `assert` is
    // skipped when the JVM runs without -ea, which would let a missing resource
    // surface later as an unrelated failure inside the TikaConfig constructor.
    if (configFileUrl == null) {
        throw new AssertionError("config resource not found on classpath: " + TIKA_CFG_FILE);
    }
    TikaConfig config = new TikaConfig(configFileUrl);
    Tika tika = new Tika(config);
    Metadata md = new Metadata();
    tika.parse(TikaInputStream.get("someString".getBytes(StandardCharsets.ISO_8859_1)), md);
    assertEquals("5", md.get(DummyInitializableParser.SUM_FIELD));
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class ParameterizedParserTest method getMetadata.
//TODO later -- add a test for a parser that isn't configurable
//but that has params in the config file
/**
 * Loads the resource {@code /org/apache/tika/config/<name>} as a Tika
 * configuration, then parses that same resource with a {@link Tika} built
 * from it and returns the metadata object handed to the parse call.
 *
 * @param name file name of the configuration resource under
 *             {@code /org/apache/tika/config/}
 * @return the {@link Metadata} instance passed to {@code Tika.parse}
 */
private Metadata getMetadata(String name) throws TikaException, IOException, SAXException {
    final URL configUrl = this.getClass().getResource("/org/apache/tika/config/" + name);
    // Fail with a readable message when the resource is not on the classpath.
    assertNotNull("couldn't find: " + name, configUrl);

    final Tika facade = new Tika(new TikaConfig(configUrl));
    final Metadata collected = new Metadata();
    // The configuration file itself doubles as the document being parsed.
    facade.parse(configUrl.openStream(), collected);
    return collected;
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class RecursiveParserWrapperFSConsumerTest method testEmbeddedWithNPE.
/**
 * Runs a {@link RecursiveParserWrapperFSConsumer} over a single queued
 * resource ({@code /test-documents/embedded_with_npe.xml}) followed by a
 * {@link PoisonFileResource}, then reads the JSON written to the first mock
 * output stream and checks the resulting metadata list: four entries, an
 * embedded-exception message on the third, and the expected author/content
 * values on the container and its three embedded documents.
 */
@Test
public void testEmbeddedWithNPE() throws Exception {
    final Metadata metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "embedded_with_npe.xml");
    final String path = "/test-documents/embedded_with_npe.xml";

    // Capacity 2: one real resource plus the poison resource queued after it.
    ArrayBlockingQueue<FileResource> resources = new ArrayBlockingQueue<FileResource>(2);
    FileResource testResource = new FileResource() {
        @Override
        public String getResourceId() {
            return "testFile";
        }

        @Override
        public Metadata getMetadata() {
            return metadata;
        }

        @Override
        public InputStream openInputStream() throws IOException {
            return getClass().getResourceAsStream(path);
        }
    };
    resources.add(testResource);
    resources.add(new PoisonFileResource());

    MockOSFactory outputFactory = new MockOSFactory();
    RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer(
            resources,
            new AutoDetectParserFactory(),
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
            outputFactory,
            new TikaConfig());
    // The returned future result is not inspected by this test.
    consumer.call();

    outputFactory.getStreams().get(0).flush();
    byte[] jsonBytes = outputFactory.getStreams().get(0).toByteArray();
    List<Metadata> parsed = JsonMetadataList.fromJson(
            new InputStreamReader(new ByteArrayInputStream(jsonBytes), UTF_8));

    assertEquals(4, parsed.size());
    assertContains("another null pointer", parsed.get(2).get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
    assertEquals("Nikolai Lobachevsky", parsed.get(0).get("author"));
    // Entries 1..3 are checked against their index-suffixed author and content.
    for (int idx = 1; idx <= 3; idx++) {
        Metadata embedded = parsed.get(idx);
        assertEquals("embeddedAuthor" + idx, embedded.get("author"));
        assertContains("some_embedded_content" + idx, embedded.get(RecursiveParserWrapper.TIKA_CONTENT));
    }
}
use of org.apache.tika.config.TikaConfig in project tika by apache.
the class NLTKNERecogniserTest method testGetEntityTypes.
/**
 * Points {@code NamedEntityParser.SYS_PROP_NER_IMPL} at
 * {@link NLTKNERecogniser}, parses a short sentence through a {@link Tika}
 * configured from {@code tika-config.xml}, and, when any {@code NER_NAMES}
 * values were produced, checks that the only one is {@code "America"}.
 *
 * <p>The system property is JVM-wide, so its previous value is restored (or
 * the property cleared) when the test finishes, keeping this test from
 * influencing others that run in the same JVM.
 *
 * @throws Exception if the configuration cannot be loaded or parsing fails
 */
@Test
public void testGetEntityTypes() throws Exception {
    String text = "America is a big country.";
    final String previousImpl = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
    System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
    try {
        Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
        Metadata md = new Metadata();
        tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
        Set<String> names = new HashSet<>(Arrays.asList(md.getValues("NER_NAMES")));
        // Assertions are skipped when no names were extracted.
        // NOTE(review): presumably this tolerates an unavailable NLTK backend -- confirm.
        if (!names.isEmpty()) {
            assertTrue(names.contains("America"));
            assertTrue(names.size() == 1);
        }
    } finally {
        // Undo the global property change made above.
        if (previousImpl == null) {
            System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
        } else {
            System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previousImpl);
        }
    }
}
Aggregations