Search in sources :

Example 36 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class AutoDetectParserTest method testSpecificParserList.

/**
 * Regression test for TIKA-514: {@link AutoDetectParser} offers a constructor that takes
 * an explicit detector followed by an explicit list of supported parsers.
 *
 * <p>The test parses a tiny in-memory payload through a parser built that way and then
 * asserts that the metadata key {@code "MyParser"} holds {@code "value"}.
 *
 * @throws Exception if parsing the payload fails
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-514">TIKA-514</a>
 */
@Test
public void testSpecificParserList() throws Exception {
    AutoDetectParser restrictedParser = new AutoDetectParser(new MyDetector(), new MyParser());
    byte[] payload = "test".getBytes(UTF_8);
    Metadata parsedMetadata = new Metadata();

    restrictedParser.parse(
            new ByteArrayInputStream(payload),
            new BodyContentHandler(),
            parsedMetadata,
            new ParseContext());

    String recorded = parsedMetadata.get("MyParser");
    assertEquals("value", recorded);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Test(org.junit.Test)

Example 37 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TestMimeTypes method assertMagic.

/**
 * Asserts that running detection over the given leading bytes, with empty metadata,
 * produces a non-null media type whose string form equals {@code expected}.
 *
 * @param expected the expected string form of the detected media type
 * @param prefix   the bytes handed to the detector as the stream content
 * @throws IOException if detection fails while reading the bytes
 */
private void assertMagic(String expected, byte[] prefix) throws IOException {
    ByteArrayInputStream magicBytes = new ByteArrayInputStream(prefix);
    // Empty metadata: no name or content-type hints are supplied alongside the bytes.
    Metadata emptyMetadata = new Metadata();

    MediaType detected = repo.detect(magicBytes, emptyMetadata);

    assertNotNull(detected);
    String actual = detected.toString();
    assertEquals(expected, actual);
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata)

Example 38 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class MetadataAwareLuceneIndexer method indexWithDublinCore.

/**
 * Indexes a file together with a set of hand-assigned metadata values.
 *
 * <p>A {@link Metadata} instance is pre-populated with creator, date, format, source,
 * subject and rights values, then passed to {@code tika.parse} along with the file's
 * content. Every value of every resulting metadata name is added as a stored, analyzed
 * field on a single {@link Document}, which is then added to the index once.
 *
 * @param file the file to parse and index
 * @throws Exception if detection, parsing, or adding the document to the index fails
 */
public void indexWithDublinCore(File file) throws Exception {
    Metadata met = new Metadata();
    met.add(Metadata.CREATOR, "Manning");
    met.add(Metadata.CREATOR, "Tika in Action");
    met.set(Metadata.DATE, new Date());
    met.set(Metadata.FORMAT, tika.detect(file));
    met.set(DublinCore.SOURCE, file.toURI().toURL().toString());
    met.add(Metadata.SUBJECT, "File");
    met.add(Metadata.SUBJECT, "Indexing");
    met.add(Metadata.SUBJECT, "Metadata");
    met.set(Property.externalClosedChoise(Metadata.RIGHTS, "public", "private"), "public");
    try (InputStream is = new FileInputStream(file)) {
        tika.parse(is, met);
        Document document = new Document();
        for (String key : met.names()) {
            // A metadata name may carry several values; each becomes its own field.
            for (String val : met.getValues(key)) {
                document.add(new Field(key, val, Store.YES, Index.ANALYZED));
            }
        }
        // Add the document once, after all fields are collected. Adding it inside the
        // loop above would index the same file once per metadata name, with each copy
        // holding only the fields gathered up to that point.
        writer.addDocument(document);
    }
}
Also used : Field(org.apache.lucene.document.Field) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Document(org.apache.lucene.document.Document) Date(java.util.Date)

Example 39 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class MyFirstTika method main.

/**
 * Command-line entry point. Processes the file named by the first argument twice, once
 * through {@code parseUsingComponents} and once through {@code parseUsingAutoDetect},
 * printing the collected metadata and the returned text after each pass. Each pass gets
 * its own fresh {@link Metadata} instance.
 *
 * @param args command-line arguments; {@code args[0]} is read as the file name
 * @throws Exception if either parsing pass fails
 */
public static void main(String[] args) throws Exception {
    final String filename = args[0];
    final TikaConfig tikaConfig = TikaConfig.getDefaultConfig();

    // Pass 1: parseUsingComponents.
    Metadata componentMetadata = new Metadata();
    String componentText = parseUsingComponents(filename, tikaConfig, componentMetadata);
    System.out.println("Parsed Metadata: ");
    System.out.println(componentMetadata);
    System.out.println("Parsed Text: ");
    System.out.println(componentText);

    System.out.println("-------------------------");

    // Pass 2: parseUsingAutoDetect, with a new Metadata so pass 1 values do not carry over.
    Metadata autoDetectMetadata = new Metadata();
    String autoDetectText = parseUsingAutoDetect(filename, tikaConfig, autoDetectMetadata);
    System.out.println("Parsed Metadata: ");
    System.out.println(autoDetectMetadata);
    System.out.println("Parsed Text: ");
    System.out.println(autoDetectText);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata)

Example 40 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class ParsingExample method parseEmbeddedExample.

/**
 * Demonstrates extracting content from an outer document together with all of its
 * embedded documents. The essential step is registering a {@link Parser} in the
 * {@link ParseContext}; here the {@link AutoDetectParser} itself is registered.
 *
 * <p>The input is the {@code test_recursive_embedded.docx} resource, looked up relative
 * to {@code ParsingExample.class}.
 *
 * @return the handler's text content, including content from embedded documents
 * @throws IOException   if the resource stream cannot be read or closed
 * @throws SAXException  if the content handler fails while receiving parse events
 * @throws TikaException if the document cannot be parsed
 */
public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
    AutoDetectParser autoDetect = new AutoDetectParser();
    BodyContentHandler textCollector = new BodyContentHandler();
    Metadata docMetadata = new Metadata();

    // Register the parser in the context under Parser.class.
    ParseContext embeddedAwareContext = new ParseContext();
    embeddedAwareContext.set(Parser.class, autoDetect);

    String extracted;
    try (InputStream docx =
            ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        autoDetect.parse(docx, textCollector, docMetadata, embeddedAwareContext);
        extracted = textCollector.toString();
    }
    return extracted;
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Aggregations

Metadata (org.apache.tika.metadata.Metadata)651 Test (org.junit.Test)467 InputStream (java.io.InputStream)320 ParseContext (org.apache.tika.parser.ParseContext)283 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)269 TikaTest (org.apache.tika.TikaTest)257 ContentHandler (org.xml.sax.ContentHandler)229 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)154 ByteArrayInputStream (java.io.ByteArrayInputStream)143 Parser (org.apache.tika.parser.Parser)136 TikaInputStream (org.apache.tika.io.TikaInputStream)133 IOException (java.io.IOException)66 DefaultHandler (org.xml.sax.helpers.DefaultHandler)59 TikaException (org.apache.tika.exception.TikaException)48 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)36 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)36 StringWriter (java.io.StringWriter)33 Tika (org.apache.tika.Tika)29 MediaType (org.apache.tika.mime.MediaType)29 SAXException (org.xml.sax.SAXException)29