Search in sources :

Example 51 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class TikaTest method getXML.

/**
 * Runs {@code parser} over {@code input} with a {@link ToXMLContentHandler}
 * and returns the handler's string output bundled with {@code metadata}.
 *
 * <p>The stream is closed before this method returns, whether parsing
 * succeeds or fails.
 *
 * @param input    stream to parse; closed by this method
 * @param parser   parser to invoke
 * @param metadata metadata object passed to the parser and placed in the result
 * @param context  parse context; a new {@link ParseContext} is used when {@code null}
 * @return the handler output and {@code metadata} wrapped in an {@code XMLResult}
 * @throws Exception if parsing fails or the stream cannot be closed
 */
protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, ParseContext context) throws Exception {
    ParseContext effectiveContext = (context != null) ? context : new ParseContext();
    // try-with-resources instead of try/finally: a failure in close() is
    // attached as a suppressed exception rather than replacing the parse
    // failure, and a null stream is simply not closed (no masking NPE).
    try (InputStream in = input) {
        ContentHandler handler = new ToXMLContentHandler();
        parser.parse(in, handler, metadata, effectiveContext);
        return new XMLResult(handler.toString(), metadata);
    }
}
Also used : ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler)

Example 52 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class AutoDetectParserTest method testOggFlacAudio.

/**
 * Checks that the Ogg-family audio parsers (Vorbis, FLAC, Opus) can be
 * instantiated, are registered with the composite parser, and yield the
 * expected content type, metadata and text for each sample file.
 */
@SuppressWarnings("deprecation")
@Test
public void testOggFlacAudio() throws Exception {
    // Four sample files; mediaTypes[i] is the type expected for testFiles[i]
    String[] testFiles = { "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga", "testOPUS.opus" };
    MediaType[] mediaTypes = { MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE), MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS) };

    // Each parser class must load and report a (non-null) set of supported types
    VorbisParser vorbisParser = new VorbisParser();
    assertNotNull("Parser not found for " + mediaTypes[0], vorbisParser.getSupportedTypes(new ParseContext()));
    FlacParser flacParser = new FlacParser();
    assertNotNull("Parser not found for " + mediaTypes[1], flacParser.getSupportedTypes(new ParseContext()));
    assertNotNull("Parser not found for " + mediaTypes[2], flacParser.getSupportedTypes(new ParseContext()));
    OpusParser opusParser = new OpusParser();
    assertNotNull("Parser not found for " + mediaTypes[3], opusParser.getSupportedTypes(new ParseContext()));

    // The composite parser must have an entry for every media type
    CompositeParser composite = (CompositeParser) tika.getParser();
    for (MediaType expectedType : mediaTypes) {
        assertNotNull("Parser not found for " + expectedType, composite.getParsers().get(expectedType));
    }

    // Parse every sample and verify type, metadata and extracted text
    for (int idx = 0; idx < testFiles.length; idx++) {
        String fileName = testFiles[idx];
        String resourcePath = "/test-documents/" + fileName;
        try (InputStream in = AutoDetectParserTest.class.getResourceAsStream(resourcePath)) {
            if (in == null) {
                fail("Could not find test file " + fileName);
            }
            Metadata meta = new Metadata();
            ContentHandler bodyHandler = new BodyContentHandler();
            new AutoDetectParser(tika).parse(in, bodyHandler, meta);

            assertEquals("Incorrect content type for " + fileName, mediaTypes[idx].toString(), meta.get(Metadata.CONTENT_TYPE));

            // Legacy metadata keys
            assertEquals("Test Artist", meta.get(Metadata.AUTHOR));
            assertEquals("Test Title", meta.get(Metadata.TITLE));

            // Current core properties
            assertEquals("Test Artist", meta.get(TikaCoreProperties.CREATOR));
            assertEquals("Test Title", meta.get(TikaCoreProperties.TITLE));

            // XMPDM properties; the album check is skipped for the .opus sample
            boolean isOpus = fileName.endsWith(".opus");
            if (!isOpus) {
                assertEquals("Test Album", meta.get(XMPDM.ALBUM));
            }
            assertEquals("Test Artist", meta.get(XMPDM.ARTIST));
            assertEquals("Stereo", meta.get(XMPDM.AUDIO_CHANNEL_TYPE));
            assertEquals("44100", meta.get(XMPDM.AUDIO_SAMPLE_RATE));

            // Extracted body text must mention both title and artist
            String text = bodyHandler.toString();
            assertTrue(text.contains("Test Title"));
            assertTrue(text.contains("Test Artist"));
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) VorbisParser(org.gagravarr.tika.VorbisParser) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) MediaType(org.apache.tika.mime.MediaType) FlacParser(org.gagravarr.tika.FlacParser) OpusParser(org.gagravarr.tika.OpusParser) Test(org.junit.Test)

Example 53 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class MyFirstTika method parseUsingAutoDetect.

/**
 * Parses the named file with an {@link AutoDetectParser} built from the
 * given configuration and returns the text collected by a
 * {@link BodyContentHandler}.
 *
 * @param filename   path of the file to parse
 * @param tikaConfig configuration used to construct the parser
 * @param metadata   metadata object passed to the stream factory and to the parser
 * @return the string form of the body content handler after parsing
 * @throws Exception if the file cannot be opened or parsing fails
 */
public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, Metadata metadata) throws Exception {
    System.out.println("Handling using AutoDetectParser: [" + filename + "]");
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    ContentHandler handler = new BodyContentHandler();
    // The stream is opened here, so it must be closed here; previously it
    // was left open after parsing.
    try (TikaInputStream stream = TikaInputStream.get(new File(filename), metadata)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    return handler.toString();
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TikaInputStream(org.apache.tika.io.TikaInputStream) File(java.io.File) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 54 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class ExtractEmbeddedFiles method extract.

/**
 * Parses {@code is} with this object's {@code parser}, using a parse
 * context in which a {@code MyEmbeddedDocumentExtractor} bound to
 * {@code outputDir} is registered as the embedded-document extractor.
 *
 * @param is        stream to parse; not closed by this method
 * @param outputDir directory handed to the embedded-document extractor
 * @throws SAXException  propagated from the parser
 * @throws TikaException propagated from the parser
 * @throws IOException   propagated from the parser
 */
public void extract(InputStream is, Path outputDir) throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    // Register the parser first: the extractor is constructed with this
    // context, so the registration order of the original is kept.
    context.set(Parser.class, parser);
    EmbeddedDocumentExtractor extractor = new MyEmbeddedDocumentExtractor(outputDir, context);
    context.set(EmbeddedDocumentExtractor.class, extractor);

    // NOTE(review): -1 presumably disables the handler's write limit — confirm
    ContentHandler bodyHandler = new BodyContentHandler(-1);
    Metadata metadata = new Metadata();
    parser.parse(is, bodyHandler, metadata, context);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ParsingEmbeddedDocumentExtractor(org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 55 with ContentHandler

use of org.xml.sax.ContentHandler in project tika by apache.

the class ContentHandlerExample method parseBodyToHTML.

/**
 * Parses the {@code test.doc} resource and returns only its body,
 * serialised through a {@link ToXMLContentHandler}, as a string; the
 * head section is left out.
 *
 * @return the body markup as a string
 */
public String parseBodyToHTML() throws IOException, SAXException, TikaException {
    Metadata docMetadata = new Metadata();
    AutoDetectParser autoParser = new AutoDetectParser();
    // BodyContentHandler forwards just the body events to the XML serialiser
    ContentHandler bodyAsXml = new BodyContentHandler(new ToXMLContentHandler());
    try (InputStream docStream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoParser.parse(docStream, bodyAsXml, docMetadata);
        String html = bodyAsXml.toString();
        return html;
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Aggregations

ContentHandler (org.xml.sax.ContentHandler)354 Metadata (org.apache.tika.metadata.Metadata)229 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)229 InputStream (java.io.InputStream)210 Test (org.junit.Test)208 ParseContext (org.apache.tika.parser.ParseContext)164 Parser (org.apache.tika.parser.Parser)106 TikaTest (org.apache.tika.TikaTest)103 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)102 TikaInputStream (org.apache.tika.io.TikaInputStream)75 ByteArrayInputStream (java.io.ByteArrayInputStream)64 SAXException (org.xml.sax.SAXException)40 IOException (java.io.IOException)34 TeeContentHandler (org.apache.tika.sax.TeeContentHandler)28 TikaException (org.apache.tika.exception.TikaException)24 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)24 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)24 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)21 AttributesImpl (org.xml.sax.helpers.AttributesImpl)21 InputSource (org.xml.sax.InputSource)20