Search in sources :

Example 26 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TestContainerAwareDetector method testTruncatedFiles.

/**
 * Runs the detector against truncated OOXML and OLE2 spreadsheets, once with
 * empty metadata and once with the resource name supplied, and checks the
 * media type reported in each case.
 */
@Test
public void testTruncatedFiles() throws Exception {
    // Truncated OOXML (zip) with no name hint: only the container type is expected
    Metadata metadata = new Metadata();
    try (TikaInputStream truncated = getTruncatedFile("testEXCEL.xlsx", 300)) {
        MediaType expected = MediaType.application("x-tika-ooxml");
        assertEquals(expected, detector.detect(truncated, metadata));
    }

    // Same truncated OOXML data, but the filename allows the type to be specialised
    metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
    try (TikaInputStream truncated = getTruncatedFile("testEXCEL.xlsx", 300)) {
        MediaType expected = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
        assertEquals(expected, detector.detect(truncated, metadata));
    }

    // Truncated OLE2 with no name hint
    metadata = new Metadata();
    try (TikaInputStream truncated = getTruncatedFile("testEXCEL.xls", 400)) {
        MediaType expected = MediaType.application("x-tika-msoffice");
        assertEquals(expected, detector.detect(truncated, metadata));
    }

    // Truncated OLE2 again, this time with the filename available
    metadata = new Metadata();
    metadata.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xls");
    try (TikaInputStream truncated = getTruncatedFile("testEXCEL.xls", 400)) {
        MediaType expected = MediaType.application("vnd.ms-excel");
        assertEquals(expected, detector.detect(truncated, metadata));
    }
}
Also used : Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) Test(org.junit.Test)

Example 27 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class ExternalEmbedderTest method getMetadataToEmbed.

/**
 * Builds the tika <code>Metadata</code> object holding the data to be embedded.
 * A single {@code TikaCoreProperties.DESCRIPTION} entry is added, whose value
 * comes from {@code getExpectedMetadataValueString}.
 *
 * @param timestamp passed through to {@code getExpectedMetadataValueString}
 *                  when building the description value
 * @return the populated tika metadata object
 */
protected Metadata getMetadataToEmbed(Date timestamp) {
    Metadata toEmbed = new Metadata();
    String descriptionName = TikaCoreProperties.DESCRIPTION.toString();
    String descriptionValue = getExpectedMetadataValueString(descriptionName, timestamp);
    toEmbed.add(TikaCoreProperties.DESCRIPTION, descriptionValue);
    return toEmbed;
}
Also used : Metadata(org.apache.tika.metadata.Metadata)

Example 28 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TestContainerAwareDetector method assertTypeByNameAndData.

/**
 * Opens a document from <code>/test-documents/</code> and asserts the media
 * type reported for it by the mime magic (optional) and by the detector.
 *
 * @param dataFile         name of the test document to load
 * @param name             resource name to place in the metadata, or
 *                         <code>null</code> to supply no name
 * @param typeFromDetector media type string the detector is expected to return
 * @param typeFromMagic    media type string the mime magic is expected to
 *                         return, or <code>null</code> to skip that check
 */
private void assertTypeByNameAndData(String dataFile, String name, String typeFromDetector, String typeFromMagic) throws Exception {
    String resourcePath = "/test-documents/" + dataFile;
    try (TikaInputStream input = TikaInputStream.get(TestContainerAwareDetector.class.getResource(resourcePath))) {
        Metadata metadata = new Metadata();
        if (name != null) {
            metadata.add(Metadata.RESOURCE_NAME_KEY, name);
        }

        // The mime magic result may be less precise, so it has its own expected type
        if (typeFromMagic != null) {
            MediaType expectedFromMagic = MediaType.parse(typeFromMagic);
            assertEquals(expectedFromMagic, mimeTypes.detect(input, metadata));
        }

        // The detector is expected to report the exact type
        MediaType expectedFromDetector = MediaType.parse(typeFromDetector);
        assertEquals(expectedFromDetector, detector.detect(input, metadata));
    }
}
Also used : Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream)

Example 29 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TestParsers method testWORDxtraction.

/**
 * Parses the sample Word document with the parser obtained from
 * <code>tika</code> and checks the title stored in the resulting metadata.
 */
@Test
public void testWORDxtraction() throws Exception {
    File wordDocument = getResourceAsFile("/test-documents/testWORD.doc");
    Parser wordParser = tika.getParser();
    Metadata extracted = new Metadata();
    try (InputStream input = new FileInputStream(wordDocument)) {
        // Only the metadata is inspected below, so a plain DefaultHandler is passed
        DefaultHandler handler = new DefaultHandler();
        ParseContext parseContext = new ParseContext();
        wordParser.parse(input, handler, extracted, parseContext);
    }
    String title = extracted.get(TikaCoreProperties.TITLE);
    assertEquals("Sample Word Document", title);
}
Also used : FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) File(java.io.File) Parser(org.apache.tika.parser.Parser) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Example 30 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class TensorflowImageRecParser method recognise.

/**
 * Parses the stream into a fresh <code>Metadata</code> object and turns every
 * entry of it into a <code>RecognisedObject</code>. The entry name is passed
 * as both the first and third constructor argument, and the entry value is
 * parsed as the confidence.
 *
 * @param stream   the input handed to <code>parse</code>
 * @param handler  the content handler handed to <code>parse</code>
 * @param metadata not read or written by this method
 * @param context  the parse context handed to <code>parse</code>
 * @return one recognised object per name in the parsed metadata
 */
@Override
public List<RecognisedObject> recognise(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Results are collected in a separate Metadata instance, not the caller's
    Metadata parsed = new Metadata();
    parse(stream, handler, parsed, context);

    List<RecognisedObject> recognised = new ArrayList<>();
    for (String label : parsed.names()) {
        // NOTE(review): assumes every value written by parse is a numeric string — confirm
        String rawConfidence = parsed.get(label);
        recognised.add(new RecognisedObject(label, "eng", label, Double.parseDouble(rawConfidence)));
    }
    return recognised;
}
Also used : Metadata(org.apache.tika.metadata.Metadata) ArrayList(java.util.ArrayList) RecognisedObject(org.apache.tika.parser.recognition.RecognisedObject)

Aggregations

Metadata (org.apache.tika.metadata.Metadata)643 Test (org.junit.Test)467 InputStream (java.io.InputStream)318 ParseContext (org.apache.tika.parser.ParseContext)281 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)268 TikaTest (org.apache.tika.TikaTest)257 ContentHandler (org.xml.sax.ContentHandler)228 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)151 ByteArrayInputStream (java.io.ByteArrayInputStream)141 Parser (org.apache.tika.parser.Parser)134 TikaInputStream (org.apache.tika.io.TikaInputStream)131 IOException (java.io.IOException)62 DefaultHandler (org.xml.sax.helpers.DefaultHandler)59 TikaException (org.apache.tika.exception.TikaException)46 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)36 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)36 StringWriter (java.io.StringWriter)33 Tika (org.apache.tika.Tika)28 FileInputStream (java.io.FileInputStream)27 MediaType (org.apache.tika.mime.MediaType)27