Search in sources :

Example 16 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class CompositeParserTest method testDefaultParser.

/**
 * Checks that the parser obtained from the default {@link TikaConfig} reports the same
 * media type registry as the configuration it came from.
 */
@Test
public void testDefaultParser() throws Exception {
    final TikaConfig defaultConfig = TikaConfig.getDefaultConfig();
    // The cast fails the test with a ClassCastException if the parser is not a CompositeParser.
    final CompositeParser composite = (CompositeParser) defaultConfig.getParser();
    // The parser is expected to carry the full registry of its configuration.
    assertEquals(defaultConfig.getMediaTypeRegistry(), composite.getMediaTypeRegistry());
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Test(org.junit.Test)

Example 17 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class InitializableParserTest method testInitializableParser.

/**
 * Builds a {@link Tika} facade from the configuration found at the {@code TIKA_CFG_FILE}
 * classpath resource, parses a short in-memory document with it, and checks the value
 * recorded in the metadata under {@code DummyInitializableParser.SUM_FIELD}.
 *
 * @throws Exception if the configuration cannot be loaded or parsing fails
 */
@Test
public void testInitializableParser() throws Exception {
    URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
    // Fail explicitly: the `assert` keyword is silently skipped when the JVM runs without
    // -ea, which would let a missing resource surface later as an unrelated error.
    if (configFileUrl == null) {
        throw new AssertionError("couldn't find: " + TIKA_CFG_FILE);
    }
    TikaConfig config = new TikaConfig(configFileUrl);
    Tika tika = new Tika(config);
    Metadata md = new Metadata();
    tika.parse(TikaInputStream.get("someString".getBytes(StandardCharsets.ISO_8859_1)), md);
    assertEquals("5", md.get(DummyInitializableParser.SUM_FIELD));
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) URL(java.net.URL) Test(org.junit.Test)

Example 18 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class ParameterizedParserTest method getMetadata.

//TODO later -- add a test for a parser that isn't configurable
//but that has params in the config file
/**
 * Loads the named resource from {@code /org/apache/tika/config/} as a Tika configuration,
 * then parses that same resource with a {@link Tika} facade built from it.
 *
 * @param name file name of the resource, relative to {@code /org/apache/tika/config/}
 * @return the metadata object that was handed to the parse call
 * @throws TikaException declared by the signature; presumably raised for an invalid configuration
 * @throws IOException declared by the signature; opening or reading the resource can fail
 * @throws SAXException declared by the signature; presumably raised for malformed configuration XML
 */
private Metadata getMetadata(String name) throws TikaException, IOException, SAXException {
    final URL configUrl = getClass().getResource("/org/apache/tika/config/" + name);
    assertNotNull("couldn't find: " + name, configUrl);
    final Metadata extracted = new Metadata();
    // The same resource serves both as the configuration and as the document to parse.
    new Tika(new TikaConfig(configUrl)).parse(configUrl.openStream(), extracted);
    return extracted;
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) URL(java.net.URL)

Example 19 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class RecursiveParserWrapperFSConsumerTest method testEmbeddedWithNPE.

/**
 * Feeds {@code /test-documents/embedded_with_npe.xml} through a
 * {@link RecursiveParserWrapperFSConsumer} and inspects the JSON metadata list it writes
 * to the mock output stream: four entries are expected, the third of which carries an
 * embedded-exception message containing "another null pointer".
 *
 * @throws Exception if the consumer or the JSON deserialization fails
 */
@Test
public void testEmbeddedWithNPE() throws Exception {
    final String path = "/test-documents/embedded_with_npe.xml";
    final Metadata metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "embedded_with_npe.xml");
    // Capacity 2: one real resource followed by the poison entry.
    ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<FileResource>(2);
    queue.add(new FileResource() {

        @Override
        public String getResourceId() {
            return "testFile";
        }

        @Override
        public Metadata getMetadata() {
            return metadata;
        }

        @Override
        public InputStream openInputStream() throws IOException {
            return this.getClass().getResourceAsStream(path);
        }
    });
    queue.add(new PoisonFileResource());
    MockOSFactory mockOSFactory = new MockOSFactory();
    RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer(queue, new AutoDetectParserFactory(), new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), mockOSFactory, new TikaConfig());
    // The returned future result is not inspected; only the written output is checked.
    consumer.call();
    mockOSFactory.getStreams().get(0).flush();
    byte[] bytes = mockOSFactory.getStreams().get(0).toByteArray();
    List<Metadata> results = JsonMetadataList.fromJson(new InputStreamReader(new ByteArrayInputStream(bytes), UTF_8));
    assertEquals(4, results.size());
    assertContains("another null pointer", results.get(2).get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
    assertEquals("Nikolai Lobachevsky", results.get(0).get("author"));
    // Entries after the first are numbered by their position in the list.
    // The size was asserted above, so this bound is equivalent to the literal 4.
    for (int i = 1; i < results.size(); i++) {
        assertEquals("embeddedAuthor" + i, results.get(i).get("author"));
        assertContains("some_embedded_content" + i, results.get(i).get(RecursiveParserWrapper.TIKA_CONTENT));
    }
}
Also used : RecursiveParserWrapperFSConsumer(org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaConfig(org.apache.tika.config.TikaConfig) InputStreamReader(java.io.InputStreamReader) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 20 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class NLTKNERecogniserTest method testGetEntityTypes.

/**
 * Selects {@link NLTKNERecogniser} through the {@code NamedEntityParser.SYS_PROP_NER_IMPL}
 * system property, parses a one-sentence text, and checks the values stored under the
 * {@code NER_NAMES} metadata key.
 *
 * <p>The assertions run only when at least one name was produced; an empty result is
 * tolerated (presumably because the recogniser's backend may be unavailable — confirm).
 *
 * @throws Exception if the configuration cannot be loaded or parsing fails
 */
@Test
public void testGetEntityTypes() throws Exception {
    String text = "America is a big country.";
    // System properties are JVM-wide: remember the previous value so this test does not
    // leak its recogniser selection into tests that run afterwards.
    String previousImpl = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
    System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName());
    try {
        Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
        Metadata md = new Metadata();
        tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
        Set<String> names = new HashSet<>(Arrays.asList(md.getValues("NER_NAMES")));
        if (!names.isEmpty()) {
            assertTrue(names.contains("America"));
            assertTrue(names.size() == 1);
        }
    } finally {
        if (previousImpl == null) {
            System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
        } else {
            System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previousImpl);
        }
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

TikaConfig (org.apache.tika.config.TikaConfig): 62, Test (org.junit.Test): 32, Metadata (org.apache.tika.metadata.Metadata): 26, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 20, TikaTest (org.apache.tika.TikaTest): 16, InputStream (java.io.InputStream): 12, Tika (org.apache.tika.Tika): 12, IOException (java.io.IOException): 10, URL (java.net.URL): 10, TikaException (org.apache.tika.exception.TikaException): 9, TikaInputStream (org.apache.tika.io.TikaInputStream): 9, ParseContext (org.apache.tika.parser.ParseContext): 9, Parser (org.apache.tika.parser.Parser): 9, MediaType (org.apache.tika.mime.MediaType): 8, CompositeParser (org.apache.tika.parser.CompositeParser): 8, ByteArrayInputStream (java.io.ByteArrayInputStream): 7, File (java.io.File): 6, TikaConfigTest (org.apache.tika.config.TikaConfigTest): 6, HashSet (java.util.HashSet): 5, SAXException (org.xml.sax.SAXException): 5