Search in sources :

Example 1 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ParserPostProcessor method parse.

/**
 * Forwards the call to the superclass {@code parse} implementation and then
 * post-processes the extracted body text into additional metadata:
 * the complete text is stored under "fulltext", at most its first 500
 * characters under "summary", and every link returned by
 * {@code RegexUtils.extractLinks} is added under "outlinks".
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Collect the body text on the side while still feeding the caller's handler.
    ContentHandler textCollector = new BodyContentHandler();
    super.parse(stream, new TeeContentHandler(handler, textCollector), metadata, context);

    String text = textCollector.toString();
    metadata.set("fulltext", text);

    // The summary is the leading 500 characters, or the whole text when shorter.
    int summaryEnd = text.length() < 500 ? text.length() : 500;
    metadata.set("summary", text.substring(0, summaryEnd));

    for (String outlink : RegexUtils.extractLinks(text)) {
        metadata.add("outlinks", outlink);
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 2 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class AutoDetectParserTest method testOggFlacAudio.

/**
 * Test to ensure that the Ogg Audio parsers (Vorbis, Opus, Flac etc)
 * have been correctly included, are registered with the composite parser
 * for their media types, and produce the expected metadata and text for
 * each sample file.
 */
@SuppressWarnings("deprecation")
@Test
public void testOggFlacAudio() throws Exception {
    // Four sample files with similar tags; expectedTypes[i] belongs to sampleFiles[i]
    String[] sampleFiles = { "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga", "testOPUS.opus" };
    MediaType[] expectedTypes = { MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE), MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS) };

    // Each parser class must load and return non-null supported types
    VorbisParser vorbis = new VorbisParser();
    assertNotNull("Parser not found for " + expectedTypes[0], vorbis.getSupportedTypes(new ParseContext()));
    FlacParser flac = new FlacParser();
    assertNotNull("Parser not found for " + expectedTypes[1], flac.getSupportedTypes(new ParseContext()));
    assertNotNull("Parser not found for " + expectedTypes[2], flac.getSupportedTypes(new ParseContext()));
    OpusParser opus = new OpusParser();
    assertNotNull("Parser not found for " + expectedTypes[3], opus.getSupportedTypes(new ParseContext()));

    // The composite parser behind tika must have an entry for every media type
    CompositeParser composite = (CompositeParser) tika.getParser();
    for (MediaType type : expectedTypes) {
        assertNotNull("Parser not found for " + type, composite.getParsers().get(type));
    }

    // Parse every sample file through auto-detection and verify the output
    for (int idx = 0; idx < sampleFiles.length; idx++) {
        String name = sampleFiles[idx];
        boolean isOpus = name.endsWith(".opus");
        try (InputStream in = AutoDetectParserTest.class.getResourceAsStream("/test-documents/" + name)) {
            if (in == null) {
                fail("Could not find test file " + name);
            }
            Metadata meta = new Metadata();
            ContentHandler textHandler = new BodyContentHandler();
            AutoDetectParser autoParser = new AutoDetectParser(tika);
            autoParser.parse(in, textHandler, meta);

            assertEquals("Incorrect content type for " + name, expectedTypes[idx].toString(), meta.get(Metadata.CONTENT_TYPE));

            // Common metadata, old-style keys
            assertEquals("Test Artist", meta.get(Metadata.AUTHOR));
            assertEquals("Test Title", meta.get(Metadata.TITLE));

            // Common metadata, new-style keys
            assertEquals("Test Artist", meta.get(TikaCoreProperties.CREATOR));
            assertEquals("Test Title", meta.get(TikaCoreProperties.TITLE));

            // XMPDM metadata; the album assertion is skipped for the .opus sample
            if (!isOpus) {
                assertEquals("Test Album", meta.get(XMPDM.ALBUM));
            }
            assertEquals("Test Artist", meta.get(XMPDM.ARTIST));
            assertEquals("Stereo", meta.get(XMPDM.AUDIO_CHANNEL_TYPE));
            assertEquals("44100", meta.get(XMPDM.AUDIO_SAMPLE_RATE));

            // The extracted text must mention the title and the artist
            String text = textHandler.toString();
            assertTrue(text.contains("Test Title"));
            assertTrue(text.contains("Test Artist"));
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) VorbisParser(org.gagravarr.tika.VorbisParser) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ContentHandler(org.xml.sax.ContentHandler) MediaType(org.apache.tika.mime.MediaType) FlacParser(org.gagravarr.tika.FlacParser) OpusParser(org.gagravarr.tika.OpusParser) Test(org.junit.Test)

Example 3 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class AutoDetectParserTest method testNoBombDetectedForInvalidXml.

/**
 * Make sure XML parse errors don't trigger ZIP bomb detection.
 * <p>
 * Builds an in-memory ZIP archive containing ten empty (and therefore
 * invalid) XML entries and runs it through the auto-detecting parser.
 * The test has no explicit assertion: it passes when parsing completes
 * without throwing.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-1322">TIKA-1322</a>
 */
@Test
public void testNoBombDetectedForInvalidXml() throws Exception {
    // create zip with ten empty / invalid XML files, 1.xml .. 10.xml
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    // try-with-resources closes the ZIP stream even if writing an entry fails;
    // close() also finishes the archive, so no explicit finish() is needed.
    try (ZipOutputStream zos = new ZipOutputStream(baos)) {
        for (int i = 1; i <= 10; i++) {
            zos.putNextEntry(new ZipEntry(i + ".xml"));
            zos.closeEntry();
        }
    }
    // The archive is complete once the stream is closed, so the bytes can be read now.
    // NOTE(review): -1 is documented by Tika as "no write limit" for BodyContentHandler.
    new AutoDetectParser(tika).parse(new ByteArrayInputStream(baos.toByteArray()), new BodyContentHandler(-1), new Metadata());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) ZipOutputStream(java.util.zip.ZipOutputStream) ZipEntry(java.util.zip.ZipEntry) Metadata(org.apache.tika.metadata.Metadata) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Test(org.junit.Test)

Example 4 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class AutoDetectParserTest method testSpecificParserList.

/**
 * Test case for TIKA-514: AutoDetectParser can be constructed with an
 * explicit detector and an explicit list of supported parsers. After
 * parsing, the metadata is expected to carry "value" under the key
 * "MyParser".
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-514">TIKA-514</a>
 */
@Test
public void testSpecificParserList() throws Exception {
    Metadata parsedMetadata = new Metadata();
    AutoDetectParser restrictedParser = new AutoDetectParser(new MyDetector(), new MyParser());

    // Any small payload will do; only the resulting metadata is checked.
    restrictedParser.parse(new ByteArrayInputStream("test".getBytes(UTF_8)), new BodyContentHandler(), parsedMetadata, new ParseContext());

    assertEquals("value", parsedMetadata.get("MyParser"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) Test(org.junit.Test)

Example 5 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class MyFirstTika method parseUsingAutoDetect.

/**
 * Parses a file with an {@code AutoDetectParser} built from the given
 * configuration and returns the text collected by a {@code BodyContentHandler}.
 *
 * @param filename   path of the file to parse
 * @param tikaConfig configuration used to construct the parser
 * @param metadata   metadata object handed both to {@code TikaInputStream.get}
 *                   and to the parser
 * @return the string form of the content handler after parsing
 * @throws Exception if the file cannot be opened or parsing fails
 */
public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig, Metadata metadata) throws Exception {
    System.out.println("Handling using AutoDetectParser: [" + filename + "]");
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    ContentHandler handler = new BodyContentHandler();
    // The parser does not close the stream it is given, so close it here to
    // release the underlying file handle even when parsing throws.
    try (TikaInputStream stream = TikaInputStream.get(new File(filename), metadata)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    return handler.toString();
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TikaInputStream(org.apache.tika.io.TikaInputStream) File(java.io.File) ContentHandler(org.xml.sax.ContentHandler)

Aggregations

BodyContentHandler (org.apache.tika.sax.BodyContentHandler)261 Metadata (org.apache.tika.metadata.Metadata)252 Test (org.junit.Test)213 ContentHandler (org.xml.sax.ContentHandler)206 InputStream (java.io.InputStream)194 ParseContext (org.apache.tika.parser.ParseContext)176 TikaTest (org.apache.tika.TikaTest)117 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)92 Parser (org.apache.tika.parser.Parser)84 ByteArrayInputStream (java.io.ByteArrayInputStream)66 TikaInputStream (org.apache.tika.io.TikaInputStream)66 TikaException (org.apache.tika.exception.TikaException)25 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)24 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)24 IOException (java.io.IOException)23 EmptyParser (org.apache.tika.parser.EmptyParser)15 OfficeParser (org.apache.tika.parser.microsoft.OfficeParser)15 SAXException (org.xml.sax.SAXException)15 MediaType (org.apache.tika.mime.MediaType)11 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)10