Search in sources :

Example 56 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ParserDecoratorTest method withFallback.

/**
 * Exercises one proposed implementation for TIKA-1509: a composite
 * parser, built with {@code ParserDecorator.withFallbacks}, that is
 * handed a list of child parsers and an explicit set of media types.
 */
@Test
public void withFallback() throws Exception {
    // Media types: what the working child is given, and what the composite is told to advertise
    Set<MediaType> octetOnly = Collections.singleton(MediaType.OCTET_STREAM);
    Set<MediaType> advertised = new HashSet<MediaType>(Arrays.asList(MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
    ParseContext context = new ParseContext();

    // The three child parsers combined below
    ErrorParser failing = new ErrorParser();
    DummyParser working = new DummyParser(octetOnly, new HashMap<String, String>(), "Fell back!");
    EmptyParser silent = new EmptyParser();

    // Every parse in this test is fed the same five bytes
    byte[] payload = new byte[] { 0, 1, 2, 3, 4 };

    // Composite whose first child fails
    @SuppressWarnings("deprecation") Parser failingFirst = ParserDecorator.withFallbacks(Arrays.asList(failing, working), advertised);

    // The composite reports the types it was given, not those of its children
    Set<MediaType> reported = failingFirst.getSupportedTypes(context);
    String reportedText = reported.toString();
    assertEquals(2, reported.size());
    assertEquals(reportedText, true, reported.contains(MediaType.TEXT_PLAIN));
    assertEquals(reportedText, true, reported.contains(MediaType.OCTET_STREAM));

    // The parse reaches the second child, whose output ends up in the handler
    Metadata fallbackMetadata = new Metadata();
    BodyContentHandler fallbackOutput = new BodyContentHandler();
    failingFirst.parse(new ByteArrayInputStream(payload), fallbackOutput, fallbackMetadata, context);
    assertEquals("Fell back!", fallbackOutput.toString());

    // Composite whose first child succeeds without emitting anything: no text at all
    @SuppressWarnings("deprecation") Parser silentFirst = ParserDecorator.withFallbacks(Arrays.asList(silent, working), advertised);
    Metadata silentMetadata = new Metadata();
    BodyContentHandler silentOutput = new BodyContentHandler();
    silentFirst.parse(new ByteArrayInputStream(payload), silentOutput, silentMetadata, context);
    assertEquals("", silentOutput.toString());
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ByteArrayInputStream(java.io.ByteArrayInputStream) MediaType(org.apache.tika.mime.MediaType) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 57 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ForkParserTest method testParallelParsing.

/**
 * Runs ten parses concurrently through a single {@code ForkParser},
 * each on its own thread with its own handler, and checks that every
 * handler ends up holding the expected greeting.
 */
@Test
public void testParallelParsing() throws Exception {
    final ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser());
    try {
        final ParseContext context = new ParseContext();
        final int workerCount = 10;
        Thread[] workers = new Thread[workerCount];
        ContentHandler[] results = new ContentHandler[workerCount];

        // Launch each worker as soon as it has been created
        int next = 0;
        while (next < workerCount) {
            final ContentHandler sink = new BodyContentHandler();
            results[next] = sink;
            Thread worker = new Thread() {

                @Override
                public void run() {
                    try {
                        // Each worker parses an empty document into its own handler
                        InputStream empty = new ByteArrayInputStream(new byte[0]);
                        parser.parse(empty, sink, new Metadata(), context);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            };
            workers[next] = worker;
            worker.start();
            next++;
        }

        // Wait for each worker in turn, then check what it produced
        for (int done = 0; done < workerCount; done++) {
            workers[done].join();
            String text = results[done].toString();
            assertEquals("Hello, World!", text.trim());
        }
    } finally {
        parser.close();
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) PipedInputStream(java.io.PipedInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) IOException(java.io.IOException) TikaException(org.apache.tika.exception.TikaException) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test)

Example 58 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class ForkParserTest method testPoolSizeReached.

/**
 * Checks how a ForkParser behaves when one more parse is requested than
 * its pool size: {@code getPoolSize()} parses are started on piped
 * streams that the test holds open, then an additional parse is started
 * and is expected to produce no output until the first ones are allowed
 * to finish.
 */
@Test
public void testPoolSizeReached() throws Exception {
    final ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser());
    try {
        // Starts with zero permits; permits are released as the background work gets going
        final Semaphore barrier = new Semaphore(0);
        Thread[] threads = new Thread[parser.getPoolSize()];
        PipedOutputStream[] pipes = new PipedOutputStream[threads.length];
        final ParseContext context = new ParseContext();
        for (int i = 0; i < threads.length; i++) {
            // Input whose read() releases one permit on every call before delegating,
            // so the main thread can tell that reading of this stream has begun
            final PipedInputStream input = new PipedInputStream() {

                @Override
                public synchronized int read() throws IOException {
                    barrier.release();
                    return super.read();
                }
            };
            // Keep the write end; nothing is written to it in this test, it is only closed later
            pipes[i] = new PipedOutputStream(input);
            threads[i] = new Thread() {

                public void run() {
                    try {
                        // Output of these parses is not inspected, so a no-op handler is used
                        ContentHandler o = new DefaultHandler();
                        parser.parse(input, o, new Metadata(), context);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            };
            threads[i].start();
        }
        // Wait until all the background parsers have been started
        barrier.acquire(parser.getPoolSize());
        // Handler for the one extra parse whose output the assertions below examine
        final ContentHandler o = new BodyContentHandler();
        Thread blocked = new Thread() {

            public void run() {
                try {
                    // Signal that this thread is running before attempting the parse
                    barrier.release();
                    InputStream stream = new ByteArrayInputStream(new byte[0]);
                    // NOTE(review): presumably this call waits for a free pooled
                    // process while the parses above are still pending -- confirm in ForkParser
                    parser.parse(stream, o, new Metadata(), context);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        };
        blocked.start();
        // Wait until the last thread is started, and then some to
        // make sure that it would have had a chance to start processing
        // data had it not been blocked.
        barrier.acquire();
        Thread.sleep(1000);
        // The extra parse must not have produced any output yet
        assertEquals("", o.toString());
        // Close each write end and wait for the corresponding background thread to finish
        for (int i = 0; i < threads.length; i++) {
            pipes[i].close();
            threads[i].join();
        }
        // Once the extra thread has finished, its handler must hold the expected text
        blocked.join();
        assertEquals("Hello, World!", o.toString().trim());
    } finally {
        parser.close();
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) PipedInputStream(java.io.PipedInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) PipedOutputStream(java.io.PipedOutputStream) Semaphore(java.util.concurrent.Semaphore) PipedInputStream(java.io.PipedInputStream) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) IOException(java.io.IOException) TikaException(org.apache.tika.exception.TikaException) DefaultHandler(org.xml.sax.helpers.DefaultHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseContext(org.apache.tika.parser.ParseContext) Test(org.junit.Test)

Example 59 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class Tika method parseToString.

/**
 * Extracts the text content of the given document, with the maximum
 * length of the result chosen per call.
 * <p>
 * At most {@code maxLength} characters from the start of the extracted
 * text are returned, which keeps memory use bounded for large documents.
 * <p>
 * <strong>NOTE:</strong> this method always closes the given stream,
 * whether or not parsing succeeds. Most other Tika methods that accept
 * an {@link InputStream} leave closing the stream (or the wrapper
 * instance returned by Tika) to the caller.
 *
 * @param stream the document to be parsed; closed by this method
 * @param metadata document metadata
 * @param maxLength maximum length of the returned string
 * @return extracted text content
 * @throws IOException if the document can not be read
 * @throws TikaException if the document can not be parsed
 */
public String parseToString(InputStream stream, Metadata metadata, int maxLength) throws IOException, TikaException {
    WriteOutContentHandler limitedOutput = new WriteOutContentHandler(maxLength);
    try {
        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);
        BodyContentHandler bodyOnly = new BodyContentHandler(limitedOutput);
        parser.parse(stream, bodyOnly, metadata, parseContext);
    } catch (SAXException e) {
        // A SAXException raised because the write limit was hit simply ends
        // extraction early; any other one is not expected from BodyContentHandler.
        boolean limitReached = limitedOutput.isWriteLimitReached(e);
        if (!limitReached) {
            throw new TikaException("Unexpected SAX processing failure", e);
        }
    } finally {
        stream.close();
    }
    return limitedOutput.toString();
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaException(org.apache.tika.exception.TikaException) WriteOutContentHandler(org.apache.tika.sax.WriteOutContentHandler) ParseContext(org.apache.tika.parser.ParseContext) SAXException(org.xml.sax.SAXException)

Example 60 with BodyContentHandler

use of org.apache.tika.sax.BodyContentHandler in project tika by apache.

the class BundleIT method testForkParser.

/**
 * Parses a small HTML document through a {@code ForkParser} built on the
 * bundle's class loader and default parser, and checks that the body
 * text comes back through the content handler.
 * <p>
 * The parser is closed in a {@code finally} block so that whatever it
 * holds is released even when an assertion or the parse itself fails.
 */
@Test
public void testForkParser() throws Exception {
    ForkParser parser = new ForkParser(Activator.class.getClassLoader(), defaultParser);
    try {
        String data = "<!DOCTYPE html>\n<html><body><p>test <span>content</span></p></body></html>";
        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
        Writer writer = new StringWriter();
        ContentHandler contentHandler = new BodyContentHandler(writer);
        Metadata metadata = new Metadata();

        // Detect the type first and record it in the metadata handed to the parser
        MediaType type = contentTypeDetector.detect(stream, metadata);
        // Expected value goes first so a failure message reads correctly
        assertEquals("text/html", type.toString());
        metadata.add(Metadata.CONTENT_TYPE, type.toString());

        ParseContext parseCtx = new ParseContext();
        parser.parse(stream, contentHandler, metadata, parseCtx);
        writer.flush();

        String content = writer.toString();
        assertTrue(content.length() > 0);
        assertEquals("test content", content.trim());
    } finally {
        parser.close();
    }
}
Also used : ForkParser(org.apache.tika.fork.ForkParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) StringWriter(java.io.StringWriter) Activator(org.apache.tika.parser.internal.Activator) ByteArrayInputStream(java.io.ByteArrayInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) JarInputStream(java.util.jar.JarInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) MediaType(org.apache.tika.mime.MediaType) StringWriter(java.io.StringWriter) Writer(java.io.Writer) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Aggregations

BodyContentHandler (org.apache.tika.sax.BodyContentHandler)261 Metadata (org.apache.tika.metadata.Metadata)252 Test (org.junit.Test)213 ContentHandler (org.xml.sax.ContentHandler)206 InputStream (java.io.InputStream)194 ParseContext (org.apache.tika.parser.ParseContext)176 TikaTest (org.apache.tika.TikaTest)117 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)92 Parser (org.apache.tika.parser.Parser)84 ByteArrayInputStream (java.io.ByteArrayInputStream)66 TikaInputStream (org.apache.tika.io.TikaInputStream)66 TikaException (org.apache.tika.exception.TikaException)25 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)24 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)24 IOException (java.io.IOException)23 EmptyParser (org.apache.tika.parser.EmptyParser)15 OfficeParser (org.apache.tika.parser.microsoft.OfficeParser)15 SAXException (org.xml.sax.SAXException)15 MediaType (org.apache.tika.mime.MediaType)11 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)10